from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

import UtilPack as util
import DataClass as info


def GetSearchResult(searchWord):
    """Load the Hitomi page for *searchWord* in Chrome and parse its galleries.

    The page is considered loaded once an element with class ``lillie`` is
    present; the wait is capped at 30 seconds.  On timeout a debug message is
    emitted and nothing is parsed.

    Args:
        searchWord: Search expression or numeric gallery id; passed to
            ``getSiteUrl`` to build the address.

    Returns:
        None.  The parsed result is only reported by ``parseMangaInfos``.
    """
    url = getSiteUrl(searchWord)
    util.DbgOut("Hitomi : " + url)

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Wait until the web page has been loaded.
        try:
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'lillie'))
            )
        except TimeoutException:
            util.DbgOut("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.")
            return

        strContent = driver.page_source
    finally:
        # Always release the browser, even when get()/page_source raises.
        driver.quit()

    parseMangaInfos(strContent)


def getSiteUrl(searchWord):
    """Build the Hitomi URL that corresponds to *searchWord*.

    Args:
        searchWord: Empty value, a string of digits, or any other text.

    Returns:
        The site root for an empty value, ``galleries/<digits>.html`` for an
        all-digit value, otherwise ``search.html?<searchWord>``.
    """
    base = "https://hitomi.la/"

    # NOTE(review): assumes util.IsEmptyStr returns a bool - confirm.
    if util.IsEmptyStr(searchWord):
        return base
    if searchWord.isdigit():
        return base + "galleries/" + searchWord + ".html"
    return base + "search.html?" + searchWord


def parseMangaInfos(html_doc):
    """Parse every ``gallery-content`` container in *html_doc*.

    Prints the number of entries collected from all containers.

    Args:
        html_doc: HTML markup of the page as a string.
    """
    # Create the BeautifulSoup object.
    soup = BeautifulSoup(html_doc, 'html.parser')

    listDJs = []
    for element in soup.find_all(class_='gallery-content'):
        listDJs.extend(djParse(element))

    print(len(listDJs))


def djParse(soup_element):
    """Build one info object per ``dj`` element found under *soup_element*.

    Args:
        soup_element: Parsed element that contains the ``dj`` entries.

    Returns:
        List of the objects returned by ``djTitleParse``, each one filled in
        by ``djArtistParse`` and ``djDescParse``.
    """
    listInfos = []
    for child in soup_element.find_all(class_='dj'):
        # Named cbzInfo so the DataClass module alias `info` is not shadowed.
        cbzInfo = djTitleParse(child)
        djArtistParse(child, cbzInfo)
        djDescParse(child, cbzInfo)
        listInfos.append(cbzInfo)
    return listInfos


def djTitleParse(input_element):
    """Read title and link from the ``h1.lillie`` heading of an entry.

    Args:
        input_element: Parsed ``dj`` element.

    Returns:
        ``info.CBZInfo(title, url)`` built from the heading text and the
        ``href`` of its first anchor.

    Raises:
        AttributeError: If the heading or its anchor is missing.
    """
    element = input_element.find('h1', class_='lillie')
    title = element.text
    url = element.find('a').get('href')

    util.DbgOut("title : " + title)
    util.DbgOut("URl : " + url)
    return info.CBZInfo(title, url)


def djArtistParse(input_element, retPtr):
    """Collect the artists listed in the ``div.artist-list`` of an entry.

    Args:
        input_element: Parsed ``dj`` element.
        retPtr: Object receiving each artist name through ``AddArtist``.

    Returns:
        List of ``info.TagInfo(artist, url)``, one per anchor; empty when the
        artist list is absent.
    """
    element = input_element.find('div', class_='artist-list')
    if element is None:
        util.DbgOut("Warning : artist-list get failed")
        return []

    listArtists = []
    for tag in element.find_all('a'):
        artist = tag.text
        a_url = tag.get('href')
        retPtr.AddArtist(artist)
        listArtists.append(info.TagInfo(artist, a_url))
    return listArtists


def djDescParse(input_element, retPtr):
    """Collect the tags listed in the ``table.dj-desc`` of an entry.

    Every anchor in the second cell of a row is registered through
    ``retPtr.AddTag``.  For the rows labelled ``Series``, ``Type`` and
    ``Language`` the name of the row's last tag is also stored on *retPtr*.

    Args:
        input_element: Parsed ``dj`` element.
        retPtr: Object receiving the tags and the series/type/language values.

    Returns:
        List of ``info.TagInfo(name, url)`` for all rows; empty when the
        table is absent.
    """
    element = input_element.find('table', class_='dj-desc')
    if element is None:
        util.DbgOut("Warning : dj-desc get failed")
        return []

    listTags = []
    for row in element.find_all('tr'):
        tds = row.find_all('td')
        if 2 != len(tds):
            util.DbgOut("Warning : td get failed")
            continue

        outMsg = f"{tds[0].text} : \r\n"
        rowTags = []
        for tag in tds[1].find_all('a'):
            tag_name = tag.text
            tag_url = tag.get('href')
            retPtr.AddTag(tag_name)
            rowTags.append(info.TagInfo(tag_name, tag_url))
            outMsg += f" {tag_name} {tag_url}\r\n"
        util.DbgOut(outMsg)
        listTags.extend(rowTags)

        # A row without links has no value to store; using the global list
        # here would pick up a tag that belongs to an earlier row.
        if not rowTags:
            continue

        # Compare the cell text: a Tag object never equals a plain string.
        label = tds[0].text.strip()
        lastName = rowTags[-1].name
        if "Series" == label:
            # NOTE(review): spelling kept from the original; looks like a
            # typo for "series" - confirm against DataClass.CBZInfo.
            retPtr.serires = lastName
        elif "Type" == label:
            retPtr.type = lastName
        elif "Language" == label:
            retPtr.language = lastName

    return listTags