diff --git a/DataClass.py b/DataClass.py
new file mode 100644
index 0000000..8d589ca
--- /dev/null
+++ b/DataClass.py
@@ -0,0 +1,13 @@
+class MangaMetaInfo:
+    def __init__(self, title, url, lang, manType, *tags):
+        self.title = title
+        self.url = url
+        self.lang = lang
+        self.manType = manType
+        self.tags = tags
+        pass
+
+    #series
+    #type
+    #languages
+    #tags
\ No newline at end of file
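Note: DataClass.py pulls MangaMetaInfo out into its own module as a hand-written holder; the trailing #series/#type/#languages/#tags comments mark the intended fields. Not part of the patch, but the standard-library dataclasses module would generate the same __init__ (plus __repr__ and __eq__) automatically. A minimal sketch, assuming the fields keep their current names:

    from dataclasses import dataclass, field

    @dataclass
    class MangaMetaInfo:
        title: str
        url: str
        lang: str
        manType: str  # e.g. "Doujinshi" or "Manga"
        tags: list = field(default_factory=list)  # mutable default needs a factory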
diff --git a/GetArc_Ehentai.py b/GetArc_Ehentai.py
index 3c7b9a4..61e8337 100644
--- a/GetArc_Ehentai.py
+++ b/GetArc_Ehentai.py
@@ -6,113 +6,110 @@ from selenium.common.exceptions import TimeoutException
 from bs4 import BeautifulSoup
 
-class MangaMetaInfo:
-    def __init__(self, title, url, lang, manType, *tags):
-        self.title = title
-        self.url = url
-        self.lang = lang
-        self.manType = manType
-        self.tags = tags
-        pass
-
-    #series
-    #type
-    #languages
-    #tags
-
-
-class MangaInfo:
-
-    listResult = []
-
-    def GetSearchResult(self, searchWord):
-        url = self.getSiteUrl() + searchWord
-        driver = webdriver.Chrome()
-        driver.get(url)
-
-        # Wait until the web page has loaded
-        try:
-            WebDriverWait(driver, 30).until(
-                EC.presence_of_element_located((By.CLASS_NAME, 'lillie'))
-            )
-        except TimeoutException:
-            print("The page did not load or the element could not be found.")
-            driver.quit()
-            return
-
-        strContent = driver.page_source
-        driver.quit()
-
-        self.parseMangaInfos(strContent)
-
-        pass
-
-    #
-    def getSiteUrl(self):
-        strRet = "https://e-hentai.org/search.html?"
-
-        return strRet
-
-    #
-    def parseMangaInfos(self, html_doc):
-        # Create a BeautifulSoup object
-        soup = BeautifulSoup(html_doc, 'html.parser')
-        gallery_elements = soup.find_all(class_='gallery-content')
-
-        for element in gallery_elements:
-            self.djParse(element)
-
-    def djParse(self, soup_element):
-        childs = soup_element.find_all(class_='dj')
-
-        for child in childs:
-            self.djtitleParse(child)
-            self.artistlistParse(child)
-            self.djDescParse(child)
-
-        print("\r\n")
-
-    def djtitleParse(self, soup_element):
-        element = soup_element.find('h1', class_='lillie')
-        title = element.text
-
-        a_tag = element.find('a')
-        a_url = a_tag.get('href')
-
-        print("title : " + title)
-        print("URl : " + a_url)
-
-    def artistlistParse(self, soup_element):
-        element = soup_element.find('div', class_='artist-list')
-
-        print("artists")
-
-        a_tags = element.find_all('a')
-        for tag in a_tags:
-            artist = tag.text
-            a_url = tag.get('href')
-            print("    " + artist + " " + a_url)
-
-    def djDescParse(self, soup_element):
-        element = soup_element.find('table', class_='dj-desc')
-        tb_rows = element.find_all('tr')
-        for row in tb_rows:
-            tds = row.find_all('td')
-            if 2 != len(tds):
-                print("td get failed")
-                continue
-
-            print(tds[0].text + " : ")
-
-            a_tags = tds[1].find_all('a')
-            for tag in a_tags:
-                tag_name = tag.text
-                tag_url = tag.get('href')
-                print("    " + tag_name + " " + tag_url)
-
-        pass
+listResult = []
+
+def GetSearchResult(searchWord):
+    print("E-hentai start")
+    #url = getSiteUrl() + searchWord
+    url = "https://e-hentai.org/"
+    driver = webdriver.Chrome()
+    driver.get(url)
+
+    # Wait until the web page has loaded
+    try:
+        WebDriverWait(driver, 30).until(
+            EC.presence_of_element_located((By.CLASS_NAME, 'dp'))
+        )
+    except TimeoutException:
+        print("The page did not load or the element could not be found.")
+        driver.quit()
+        return
+
+    strContent = driver.page_source
+    driver.quit()
+
+    parseMangaInfos(strContent)
+
+    pass
+
+# Base search URL; the caller appends the URL-encoded search word
+def getSiteUrl():
+    strRet = "https://e-hentai.org/?f_search="
+
+    return strRet
+
+
+# Walk every row of the search-result gallery table
+def parseMangaInfos(html_doc):
+    # Create a BeautifulSoup object
+    soup = BeautifulSoup(html_doc, 'html.parser')
+    gallery_table = soup.find('table', class_='itg gltc')
+    gls = gallery_table.find_all('tr')
+
+    for gl in gls:
+        gl1cParse(gl)
+        gl2cParse(gl)
+        gl3cParse(gl)
+        print("\r\n")
+
+
+# gl1c cell: gallery type
+def gl1cParse(soup_element):
+    element = soup_element.find('td', class_='gl1c glcat')
+
+    if element is None:
+        return
+
+    man_type = element.find('div')
+
+    print("type : " + man_type.text)
+
+# gl2c cell: torrent download link
+def gl2cParse(soup_element):
+    element = soup_element.find('td', class_='gl2c')
+
+    if element is None:
+        return
+
+    trt_btn = element.find('div', class_='gldown')
+    trt_url = trt_btn.find('a')
+
+    if trt_url:
+        url = trt_url.get('href')
+        print("torrent : " + url)
+    else:
+        print("torrent : none")
+
+
+# gl3c cell: title, gallery URL, and tags
+def gl3cParse(soup_element):
+    element = soup_element.find('td', class_='gl3c glname')
+
+    if element is None:
+        return
+
+    element_url = element.find('a')
+    man_url = element_url.get('href')
+
+    element_title = element.find('div', class_='glink')
+    man_title = element_title.text
+
+    print("title : " + man_title)
+    print("URL : " + man_url)
+
+    print("tags : ")
+    tags = element.find_all('div', class_='gt')
+    man_tags = []
+    for tag in tags:
+        man_tag = tag.get('title')
+        print("    " + man_tag)
+        man_tags.append(man_tag)
+
+    print(len(man_tags))
+
+    pass
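Note: in the rewritten GetArc_Ehentai.py the module-level listResult is still never populated; the gl1c/gl2c/gl3c helpers only print what they parse. If the parsed fields are meant to end up in the new DataClass.MangaMetaInfo (an assumption about intent, not something the patch does), the wiring could look like this sketch; storeResult is a hypothetical name, and lang is passed empty because no parsed cell carries a language yet:

    from DataClass import MangaMetaInfo

    def storeResult(man_title, man_url, man_type, man_tags):
        # lang is not extracted by any gl*Parse helper yet; placeholder value
        info = MangaMetaInfo(man_title, man_url, "", man_type, *man_tags)
        listResult.append(info)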
diff --git a/GetArc_Hitomi.py b/GetArc_Hitomi.py
index a2913a6..4fa7387 100644
--- a/GetArc_Hitomi.py
+++ b/GetArc_Hitomi.py
@@ -21,98 +21,96 @@ class MangaMetaInfo:
     #tags
 
 
-class MangaInfo:
-
-    listResult = []
-
-    def GetSearchResult(self, searchWord):
-        url = self.getSiteUrl() + searchWord
-        driver = webdriver.Chrome()
-        driver.get(url)
-
-        # Wait until the web page has loaded
-        try:
-            WebDriverWait(driver, 30).until(
-                EC.presence_of_element_located((By.CLASS_NAME, 'lillie'))
-            )
-        except TimeoutException:
-            print("The page did not load or the element could not be found.")
-            driver.quit()
-            return
-
-        strContent = driver.page_source
-        driver.quit()
-
-        self.parseMangaInfos(strContent)
-
-        pass
-
-    #
-    def getSiteUrl(self):
-        strRet = "https://hitomi.la/search.html?"
-
-        return strRet
-
-    #
-    def parseMangaInfos(self, html_doc):
-        # Create a BeautifulSoup object
-        soup = BeautifulSoup(html_doc, 'html.parser')
-        gallery_elements = soup.find_all(class_='gallery-content')
-
-        for element in gallery_elements:
-            self.djParse(element)
-
-    def djParse(self, soup_element):
-        childs = soup_element.find_all(class_='dj')
-
-        for child in childs:
-            self.djtitleParse(child)
-            self.artistlistParse(child)
-            self.djDescParse(child)
-
-        print("\r\n")
-
-    def djtitleParse(self, soup_element):
-        element = soup_element.find('h1', class_='lillie')
-        title = element.text
-
-        a_tag = element.find('a')
-        a_url = a_tag.get('href')
-
-        print("title : " + title)
-        print("URl : " + a_url)
-
-    def artistlistParse(self, soup_element):
-        element = soup_element.find('div', class_='artist-list')
-
-        print("artists")
-
-        a_tags = element.find_all('a')
-        for tag in a_tags:
-            artist = tag.text
-            a_url = tag.get('href')
-            print("    " + artist + " " + a_url)
-
-    def djDescParse(self, soup_element):
-        element = soup_element.find('table', class_='dj-desc')
-        tb_rows = element.find_all('tr')
-        for row in tb_rows:
-            tds = row.find_all('td')
-            if 2 != len(tds):
-                print("td get failed")
-                continue
-
-            print(tds[0].text + " : ")
-
-            a_tags = tds[1].find_all('a')
-            for tag in a_tags:
-                tag_name = tag.text
-                tag_url = tag.get('href')
-                print("    " + tag_name + " " + tag_url)
-
-        pass
+listResult = []
+
+def GetSearchResult(searchWord):
+    url = getSiteUrl() + searchWord
+    driver = webdriver.Chrome()
+    driver.get(url)
+
+    # Wait until the web page has loaded
+    try:
+        WebDriverWait(driver, 30).until(
+            EC.presence_of_element_located((By.CLASS_NAME, 'lillie'))
+        )
+    except TimeoutException:
+        print("The page did not load or the element could not be found.")
+        driver.quit()
+        return
+
+    strContent = driver.page_source
+    driver.quit()
+
+    parseMangaInfos(strContent)
+
+    pass
+
+# Base search URL; the caller appends the search word
+def getSiteUrl():
+    strRet = "https://hitomi.la/search.html?"
+
+    return strRet
+
+
+# Parse each gallery block in the search results
+def parseMangaInfos(html_doc):
+    # Create a BeautifulSoup object
+    soup = BeautifulSoup(html_doc, 'html.parser')
+    gallery_elements = soup.find_all(class_='gallery-content')
+
+    for element in gallery_elements:
+        djParse(element)
+
+
+def djParse(soup_element):
+    childs = soup_element.find_all(class_='dj')
+
+    for child in childs:
+        djtitleParse(child)
+        artistlistParse(child)
+        djDescParse(child)
+
+    print("\r\n")
+
+
+# Gallery title and link
+def djtitleParse(soup_element):
+    element = soup_element.find('h1', class_='lillie')
+    title = element.text
+
+    a_tag = element.find('a')
+    a_url = a_tag.get('href')
+
+    print("title : " + title)
+    print("URL : " + a_url)
+
+# Artist names and links
+def artistlistParse(soup_element):
+    element = soup_element.find('div', class_='artist-list')
+
+    print("artists")
+
+    a_tags = element.find_all('a')
+    for tag in a_tags:
+        artist = tag.text
+        a_url = tag.get('href')
+        print("    " + artist + " " + a_url)
+
+
+# Description table: each row is a label cell plus a cell of linked values
+def djDescParse(soup_element):
+    element = soup_element.find('table', class_='dj-desc')
+    tb_rows = element.find_all('tr')
+    for row in tb_rows:
+        tds = row.find_all('td')
+        if 2 != len(tds):
+            print("td get failed")
+            continue
+
+        print(tds[0].text + " : ")
+
+        a_tags = tds[1].find_all('a')
+        for tag in a_tags:
+            tag_name = tag.text
+            tag_url = tag.get('href')
+            print("    " + tag_name + " " + tag_url)
+
+    pass
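Note: after this change GetSearchResult is almost line-for-line identical in GetArc_Ehentai.py and GetArc_Hitomi.py; only the URL and the CSS class that signals a loaded page differ ('dp' versus 'lillie'). The Selenium boilerplate could live in one shared helper. A sketch under that assumption; fetchPageSource is a new name, not in the patch:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException

    def fetchPageSource(url, waitClass, timeout=30):
        # Load the page, wait for a marker element, and return the rendered HTML.
        # Returns None when the element never appears within the timeout.
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, waitClass))
            )
            return driver.page_source
        except TimeoutException:
            print("The page did not load or the element could not be found.")
            return None
        finally:
            driver.quit()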
diff --git a/StoreXLS.py b/StoreXLS.py
index bf22999..d92d132 100644
--- a/StoreXLS.py
+++ b/StoreXLS.py
@@ -9,7 +9,6 @@
 xls_name = "mangaDB.xlsx"
 
 list_MetaInfo = []
-
 #
 def GetCurrentTime():
     # Get the current time and convert it to a time struct
@@ -37,8 +36,7 @@ def GetCurrentTime():
 #     except FileNotFoundError:
 #         wb = Workbook()
 #         print("xls Created")
-
-
+
 # ws = wb.active
 # # time, title, url, tags (comma)
 # ws['A1'] = "Modified Time"
diff --git a/main.py b/main.py
index dc5e315..af11b9e 100644
--- a/main.py
+++ b/main.py
@@ -1,8 +1,12 @@
-import GetArc_Hitomi as gethitomi
+import GetArc_Hitomi as getHitomi
+import GetArc_Ehentai as getEhentai
 
 
 def main():
-    obj = gethitomi.MangaInfo()
-    obj.GetSearchResult("trouble sweets")
+    #getHitomi.GetSearchResult("trouble sweets")
+    getEhentai.GetSearchResult("artist%3A%22kotomi+yo-ji%24%22")
+
+    #artist:"kotomi yo-ji$"
+    #"artist%3A%22kotomi+yo-ji%24%22"
 
 # For Main Loop
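Note: the literal passed to getEhentai.GetSearchResult is the URL-encoded form of the query kept in the comments, artist:"kotomi yo-ji$". The standard library's urllib.parse.quote_plus produces exactly that encoding (spaces become +, and the characters :, ", $ become %3A, %22, %24), so the call site could build the string instead of hard-coding it:

    from urllib.parse import quote_plus

    query = quote_plus('artist:"kotomi yo-ji$"')
    # query == 'artist%3A%22kotomi+yo-ji%24%22'
    getEhentai.GetSearchResult(query)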