# Reconstructed from a git format-patch ("first added", Lee Young Hoon,
# 2024-04-05) whose text was collapsed into a single blob.  The patch creates
# four files; each is reproduced below behind a "--- file: NAME ---" marker.

# --- file: GetArc_Ehentai.py ---
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup


class MangaMetaInfo:
    """Plain value object for the metadata of one scraped gallery entry."""

    def __init__(self, title, url, lang, manType, *tags):
        self.title = title        # gallery title text
        self.url = url            # link to the gallery page
        self.lang = lang          # language label
        self.manType = manType    # series / type classification
        self.tags = tags          # any remaining tag strings


class MangaInfo:
    """Fetches a search-result page with Selenium and prints what it finds.

    NOTE(review): the URL path ('search.html?') and every CSS class used
    below ('lillie', 'gallery-content', 'dj', 'artist-list', 'dj-desc') are
    hitomi.la markup -- this module appears to be an unadapted copy of
    GetArc_Hitomi.py and will not match real e-hentai pages; confirm before
    relying on it.
    """

    def __init__(self):
        # BUG FIX: 'listResult' was a class attribute ([]) in the original,
        # i.e. shared by every instance; made per-instance state here.
        self.listResult = []

    def GetSearchResult(self, searchWord):
        """Load the search page for *searchWord*, then parse and print it."""
        url = self.getSiteUrl() + searchWord
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            # Wait until the JS-rendered results exist (titles carry
            # class 'lillie').
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'lillie'))
            )
            strContent = driver.page_source
        except TimeoutException:
            print("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.")
            return
        finally:
            # BUG FIX: the original leaked the browser if anything other
            # than a timeout raised; always shut it down.
            driver.quit()

        self.parseMangaInfos(strContent)

    def getSiteUrl(self):
        """Return the search-URL prefix the query string is appended to."""
        # NOTE(review): hitomi-style path; e-hentai's real search endpoint
        # looks like 'https://e-hentai.org/?f_search=' -- verify before use.
        return "https://e-hentai.org/search.html?"

    def parseMangaInfos(self, html_doc):
        """Parse the rendered page and print every gallery block in it."""
        soup = BeautifulSoup(html_doc, 'html.parser')
        for gallery in soup.find_all(class_='gallery-content'):
            self.djParse(gallery)

    def djParse(self, soup_element):
        """Print title, artist list and tag table for each '.dj' child."""
        for child in soup_element.find_all(class_='dj'):
            self.djtitleParse(child)
            self.artistlistParse(child)
            self.djDescParse(child)
            # Blank separator between entries (original indent was ambiguous
            # in the collapsed patch -- assumed per-entry; TODO confirm).
            print("\r\n")

    def djtitleParse(self, soup_element):
        """Print the gallery title and the link inside its <h1>."""
        element = soup_element.find('h1', class_='lillie')
        title = element.text
        a_url = element.find('a').get('href')
        print("title : " + title)
        print("URl : " + a_url)

    def artistlistParse(self, soup_element):
        """Print every artist name/link found in the '.artist-list' div."""
        element = soup_element.find('div', class_='artist-list')
        print("artists")
        for tag in element.find_all('a'):
            print(" " + tag.text + " " + tag.get('href'))

    def djDescParse(self, soup_element):
        """Print the description table: '<label> :' rows plus their tags."""
        element = soup_element.find('table', class_='dj-desc')
        for row in element.find_all('tr'):
            tds = row.find_all('td')
            if 2 != len(tds):
                print("td get failed")
                continue
            print(tds[0].text + " : ")
            for tag in tds[1].find_all('a'):
                print(" " + tag.text + " " + tag.get('href'))


# --- file: GetArc_Hitomi.py ---
# In the patch this file is byte-identical to GetArc_Ehentai.py except for
# getSiteUrl() -- the two are obvious candidates for a shared base class.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup


class MangaMetaInfo:
    """Plain value object for the metadata of one scraped gallery entry."""

    def __init__(self, title, url, lang, manType, *tags):
        self.title = title        # gallery title text
        self.url = url            # link to the gallery page
        self.lang = lang          # language label
        self.manType = manType    # series / type classification
        self.tags = tags          # any remaining tag strings


class MangaInfo:
    """Fetches a hitomi.la search page with Selenium and prints the results."""

    def __init__(self):
        # BUG FIX: 'listResult' was a shared class attribute; per-instance now.
        self.listResult = []

    def GetSearchResult(self, searchWord):
        """Load the search page for *searchWord*, then parse and print it."""
        url = self.getSiteUrl() + searchWord
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            # Wait until the JS-rendered results exist.
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'lillie'))
            )
            strContent = driver.page_source
        except TimeoutException:
            print("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.")
            return
        finally:
            # BUG FIX: always release the browser, even if parsing raises.
            driver.quit()

        self.parseMangaInfos(strContent)

    def getSiteUrl(self):
        """Return the search-URL prefix the query string is appended to."""
        return "https://hitomi.la/search.html?"

    def parseMangaInfos(self, html_doc):
        """Parse the rendered page and print every gallery block in it."""
        soup = BeautifulSoup(html_doc, 'html.parser')
        for gallery in soup.find_all(class_='gallery-content'):
            self.djParse(gallery)

    def djParse(self, soup_element):
        """Print title, artist list and tag table for each '.dj' child."""
        for child in soup_element.find_all(class_='dj'):
            self.djtitleParse(child)
            self.artistlistParse(child)
            self.djDescParse(child)
            # Separator between entries (indent ambiguous in the patch;
            # assumed per-entry -- TODO confirm).
            print("\r\n")

    def djtitleParse(self, soup_element):
        """Print the gallery title and the link inside its <h1>."""
        element = soup_element.find('h1', class_='lillie')
        title = element.text
        a_url = element.find('a').get('href')
        print("title : " + title)
        print("URl : " + a_url)

    def artistlistParse(self, soup_element):
        """Print every artist name/link found in the '.artist-list' div."""
        element = soup_element.find('div', class_='artist-list')
        print("artists")
        for tag in element.find_all('a'):
            print(" " + tag.text + " " + tag.get('href'))

    def djDescParse(self, soup_element):
        """Print the description table: '<label> :' rows plus their tags."""
        element = soup_element.find('table', class_='dj-desc')
        for row in element.find_all('tr'):
            tds = row.find_all('td')
            if 2 != len(tds):
                print("td get failed")
                continue
            print(tds[0].text + " : ")
            for tag in tds[1].find_all('a'):
                print(" " + tag.text + " " + tag.get('href'))


# --- file: StoreXLS.py ---
import os
import time
from openpyxl import Workbook
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter


xls_name = "mangaDB.xlsx"   # target workbook file name
list_MetaInfo = []          # accumulated MangaMetaInfo entries (not used yet)


def GetCurrentTime():
    """Return the local time formatted as 'YYYY/M/D_H:M:S'.

    Deliberately NOT time.strftime(): the original format is unpadded
    (e.g. "2024/4/5_2:25:55") and %m/%d/%H/... would zero-pad, changing
    the produced string.
    """
    t = time.localtime()
    return f"{t.tm_year}/{t.tm_mon}/{t.tm_mday}_{t.tm_hour}:{t.tm_min}:{t.tm_sec}"


# NOTE(review): the patch also carried a fully commented-out draft of
# XLSWriteMangainfo(title, url, *tags): open/create the workbook, stamp a
# modification time in A1/B1, ensure a 'list' sheet exists, and dump a
# hard-coded folder listing ('/media/gerd/test/hiyobi_temp/') into column A.
# It referenced an undefined 'xls_path' (the module constant is 'xls_name'),
# so it is omitted here as dead code; reimplement against 'xls_name'.


# --- file: main.py ---
import GetArc_Hitomi as gethitomi


def main():
    # Smoke test: search hitomi for a fixed keyword and print the results.
    obj = gethitomi.MangaInfo()
    obj.GetSearchResult("trouble sweets")


# For Main Loop
if __name__ == '__main__':
    main()