Update DataClass.py, GetArc_Ehentai.py, and 3 more files...

E-hentai Page Parse
DataClass added
This commit is contained in:
2024-04-05 21:51:36 +09:00
parent dbe5377d6f
commit 809748a73a
5 changed files with 202 additions and 192 deletions

13
DataClass.py Normal file
View File

@@ -0,0 +1,13 @@
class MangaMetaInfo:
    """Metadata for a single manga entry scraped from a gallery site.

    Attributes:
        title: Display title of the entry.
        url: Link to the entry's detail page.
        lang: Language of the release.
        manType: Gallery category/type (e.g. doujinshi, manga).
        tags: Tuple of tag strings attached to the entry (captured via *tags).
    """

    def __init__(self, title, url, lang, manType, *tags):
        self.title = title
        self.url = url
        self.lang = lang
        self.manType = manType
        self.tags = tags

View File

@@ -6,113 +6,110 @@ from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
class MangaMetaInfo: listResult = []
def __init__(self, title, url, lang, manType, *tags):
self.title = title
self.url = url
self.lang = lang
self.manType = manType
self.tags = tags
pass
#series def GetSearchResult(searchWord):
#type print("E-hentai start")
#languages #url = getSiteUrl() + searchWord
#tags url = "https://e-hentai.org/"
driver = webdriver.Chrome()
driver.get(url)
class MangaInfo:
listResult = [] # 웹페이지가 로드될 때까지 기다리기
try:
def GetSearchResult(self, searchWord): WebDriverWait(driver, 30).until(
url = self.getSiteUrl() + searchWord EC.presence_of_element_located((By.CLASS_NAME, 'dp'))
driver = webdriver.Chrome() )
driver.get(url) except TimeoutException:
print("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.")
# 웹페이지가 로드될 때까지 기다리기
try:
WebDriverWait(driver, 30).until(
EC.presence_of_element_located((By.CLASS_NAME, 'lillie'))
)
except TimeoutException:
print("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.")
driver.quit()
return
strContent = driver.page_source
driver.quit() driver.quit()
return
self.parseMangaInfos(strContent) strContent = driver.page_source
driver.quit()
pass
#
def getSiteUrl(self):
strRet = "https://e-hentai.org/search.html?"
return strRet
# parseMangaInfos(strContent)
def parseMangaInfos(self, html_doc):
# BeautifulSoup 객체 생성
soup = BeautifulSoup(html_doc, 'html.parser')
gallery_elements = soup.find_all(class_='gallery-content')
for element in gallery_elements:
self.djParse(element)
def djParse(self, soup_element): pass
childs = soup_element.find_all(class_='dj')
for child in childs:
self.djtitleParse(child)
self.artistlistParse(child)
self.djDescParse(child)
print("\r\n")
def djtitleParse(self, soup_element):
element = soup_element.find('h1', class_='lillie')
title = element.text
a_tag = element.find('a')
a_url = a_tag.get('href')
print("title : " + title)
print("URl : " + a_url)
def artistlistParse(self, soup_element):
element = soup_element.find('div', class_='artist-list')
print("artists")
a_tags = element.find_all('a')
for tag in a_tags:
artist = tag.text
a_url = tag.get('href')
print(" " + artist + " " + a_url)
def djDescParse(self, soup_element): #
element = soup_element.find('table', class_='dj-desc') def getSiteUrl():
tb_rows = element.find_all('tr') strRet = "https://e-hentai.org/?f_search="
for row in tb_rows:
tds = row.find_all('td') return strRet
if 2 != len(tds):
print("td get failed")
continue #
def parseMangaInfos(html_doc):
print(tds[0].text + " : ") # BeautifulSoup 객체 생성
soup = BeautifulSoup(html_doc, 'html.parser')
a_tags = tds[1].find_all('a') gallery_table = soup.find('table', class_='itg gltc')
for tag in a_tags: gls = gallery_table.find_all('tr')
tag_name = tag.text
tag_url = tag.get('href') idx = 1;
print(" " + tag_name + " " + tag_url) for gl in gls:
gl1cParse(gl)
pass gl2cParse(gl)
gl3cParse(gl)
print("\r\n")
# type
def gl1cParse(soup_element):
    """Print the gallery category ("type" column) of one result-table row.

    soup_element: a bs4 Tag for one <tr> of the e-hentai result table.
    Rows without a 'gl1c glcat' cell (e.g. header rows) are skipped silently.
    """
    element = soup_element.find('td', class_='gl1c glcat')
    if element is None:
        return
    man_type = element.find('div')
    # Guard: a cell without the inner <div> would raise AttributeError on .text.
    if man_type is None:
        return
    print("type : " + man_type.text)
# torrent
def gl2cParse(soup_element):
    """Print the torrent link of one result-table row, or 'none' if absent."""
    cell = soup_element.find('td', class_='gl2c')
    if cell is None:
        return
    # The torrent anchor lives inside the 'gldown' download widget.
    anchor = cell.find('div', class_='gldown').find('a')
    if anchor:
        print("torrent : " + anchor.get('href'))
    else:
        print("torrent : none")
#
def gl3cParse(soup_element):
    """Print title, detail-page URL and tag list of one result-table row."""
    cell = soup_element.find('td', class_='gl3c glname')
    if cell is None:
        return
    link_tag = cell.find('a')
    title_div = cell.find('div', class_='glink')
    print("title : " + title_div.text)
    print("Url : " + link_tag.get('href'))
    print("tags : ")
    # Tag names are carried in the 'title' attribute of each 'gt' div.
    collected = [div.get('title') for div in cell.find_all('div', class_='gt')]
    for name in collected:
        print(" " + name)
    print(len(collected))

View File

@@ -21,98 +21,96 @@ class MangaMetaInfo:
#tags #tags
class MangaInfo: listResult = []
listResult = [] def GetSearchResult(searchWord):
url = getSiteUrl() + searchWord
def GetSearchResult(self, searchWord): driver = webdriver.Chrome()
url = self.getSiteUrl() + searchWord driver.get(url)
driver = webdriver.Chrome()
driver.get(url) # 웹페이지가 로드될 때까지 기다리기
try:
# 웹페이지가 로드될 때까지 기다리기 WebDriverWait(driver, 30).until(
try: EC.presence_of_element_located((By.CLASS_NAME, 'lillie'))
WebDriverWait(driver, 30).until( )
EC.presence_of_element_located((By.CLASS_NAME, 'lillie')) except TimeoutException:
) print("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.")
except TimeoutException:
print("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.")
driver.quit()
return
strContent = driver.page_source
driver.quit() driver.quit()
return
self.parseMangaInfos(strContent) strContent = driver.page_source
driver.quit()
pass
#
def getSiteUrl(self):
strRet = "https://hitomi.la/search.html?"
return strRet
# parseMangaInfos(strContent)
def parseMangaInfos(self, html_doc):
# BeautifulSoup 객체 생성
soup = BeautifulSoup(html_doc, 'html.parser')
gallery_elements = soup.find_all(class_='gallery-content')
for element in gallery_elements:
self.djParse(element)
def djParse(self, soup_element): pass
childs = soup_element.find_all(class_='dj')
#
for child in childs: def getSiteUrl():
self.djtitleParse(child) strRet = "https://hitomi.la/search.html?"
self.artistlistParse(child)
self.djDescParse(child)
print("\r\n")
def djtitleParse(self, soup_element):
element = soup_element.find('h1', class_='lillie')
title = element.text
a_tag = element.find('a') return strRet
a_url = a_tag.get('href')
print("title : " + title) #
print("URl : " + a_url) def parseMangaInfos(html_doc):
# BeautifulSoup 객체 생성
soup = BeautifulSoup(html_doc, 'html.parser')
gallery_elements = soup.find_all(class_='gallery-content')
for element in gallery_elements:
djParse(element)
def artistlistParse(self, soup_element):
element = soup_element.find('div', class_='artist-list') def djParse(soup_element):
childs = soup_element.find_all(class_='dj')
for child in childs:
djtitleParse(child)
artistlistParse(child)
djDescParse(child)
print("artists") print("\r\n")
a_tags = element.find_all('a')
def djtitleParse(soup_element):
    """Print the gallery title and its detail-page link from the <h1 class="lillie">."""
    heading = soup_element.find('h1', class_='lillie')
    link = heading.find('a')
    print("title : " + heading.text)
    print("URl : " + link.get('href'))  # NOTE(review): "URl" casing kept byte-identical
def artistlistParse(soup_element):
    """Print every artist name and link found in the 'artist-list' block."""
    artist_block = soup_element.find('div', class_='artist-list')
    print("artists")
    for anchor in artist_block.find_all('a'):
        print(" " + anchor.text + " " + anchor.get('href'))
def djDescParse(soup_element):
element = soup_element.find('table', class_='dj-desc')
tb_rows = element.find_all('tr')
for row in tb_rows:
tds = row.find_all('td')
if 2 != len(tds):
print("td get failed")
continue
print(tds[0].text + " : ")
a_tags = tds[1].find_all('a')
for tag in a_tags: for tag in a_tags:
artist = tag.text tag_name = tag.text
a_url = tag.get('href') tag_url = tag.get('href')
print(" " + artist + " " + a_url) print(" " + tag_name + " " + tag_url)
pass
def djDescParse(self, soup_element):
element = soup_element.find('table', class_='dj-desc')
tb_rows = element.find_all('tr')
for row in tb_rows:
tds = row.find_all('td')
if 2 != len(tds):
print("td get failed")
continue
print(tds[0].text + " : ")
a_tags = tds[1].find_all('a')
for tag in a_tags:
tag_name = tag.text
tag_url = tag.get('href')
print(" " + tag_name + " " + tag_url)
pass

View File

@@ -9,7 +9,6 @@ xls_name = "mangaDB.xlsx"
list_MetaInfo = [] list_MetaInfo = []
# #
def GetCurrentTime(): def GetCurrentTime():
# 현재 시간을 구하고 구조체로 변환 # 현재 시간을 구하고 구조체로 변환
@@ -37,8 +36,7 @@ def GetCurrentTime():
# except FileNotFoundError: # except FileNotFoundError:
# wb = Workbook() # wb = Workbook()
# print("xls Created") # print("xls Created")
# ws = wb.active # ws = wb.active
# # time, title, url, tags (comma) # # time, title, url, tags (comma)
# ws['A1'] = "Modified Time" # ws['A1'] = "Modified Time"

10
main.py
View File

@@ -1,8 +1,12 @@
import GetArc_Hitomi as gethitomi import GetArc_Hitomi as getHitomi
import GetArc_Ehentai as getEhentai
def main(): def main():
obj = gethitomi.MangaInfo() #getHitomi.GetSearchResult("trouble sweets")
obj.GetSearchResult("trouble sweets") getEhentai.GetSearchResult("artist%3A%22kotomi+yo-ji%24%22")
#artist:"kotomi yo-ji$"
#"artist%3A%22kotomi+yo-ji%24%22"
# For Main Loop # For Main Loop