Update .gitignore, DataClass.py, and 19 more files...
오랜만에 서버 정리하고 커밋. 파일 위치를 정리했다. 캘리버 DB 를 열고 정보를 열람. Pupil 을 통해 다운받은 정보를 관리하기 위해 새로운 클래스 추가
This commit is contained in:
369
GetArc_Hitomi.py
369
GetArc_Hitomi.py
@@ -6,180 +6,209 @@ from selenium.common.exceptions import TimeoutException
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import time
|
||||
|
||||
import UtilPack as util
|
||||
import DataClass as info
|
||||
|
||||
|
||||
#
|
||||
def GetSearchResult(searchWord):
|
||||
url = getSiteUrl(searchWord)
|
||||
class GetArc_Hitomi:
|
||||
m_strBaseURL = "https://hitomi.la/"
|
||||
|
||||
m_listTagsTemp = list[info.TagInfo]()
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def GetSearchResult(self, strWord: str, bSaveHTML: bool = False):
|
||||
if util.IsEmptyStr(strWord):
|
||||
util.DbgOut("Error : SearchWord is empty", True)
|
||||
return
|
||||
|
||||
util.DbgOut("Hitomi : " + url, True)
|
||||
|
||||
driver = webdriver.Chrome()
|
||||
driver.get(url)
|
||||
|
||||
# 웹페이지가 로드될 때까지 기다리기
|
||||
try:
|
||||
WebDriverWait(driver, 10).until(
|
||||
#EC.presence_of_element_located((By.CLASS_NAME, 'lillie'))
|
||||
lambda d: d.execute_script("return document.readyState") == "complete"
|
||||
)
|
||||
except TimeoutException:
|
||||
util.DbgOut("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.", True)
|
||||
driver.quit()
|
||||
return
|
||||
|
||||
strContent = driver.page_source
|
||||
|
||||
listRet = parseMangaInfos(strContent)
|
||||
|
||||
for Idx in range(len(listRet)):
|
||||
print(f"{Idx} : {listRet[Idx]}")
|
||||
|
||||
driver.quit()
|
||||
|
||||
def GetListSearchResult(list_ID):
    """Load the page for every ID in one browser session and dump the parsed entries.

    For each element of ``list_ID`` the URL is built with ``getSiteUrl``, the
    page is loaded in a shared Chrome session, parsed with
    ``parseMangaInfos``, and the parsed entries are printed and written to
    ``<id>.txt`` (UTF-8, one entry per line).

    Args:
        list_ID: Iterable of gallery IDs or search words. Each element is
            converted with ``str`` before the URL is built, so ints work too.

    Returns:
        None. A failure while loading or parsing is logged through
        ``util.DbgOut`` and stops the remaining IDs; a failure while writing
        a file is logged and only skips that file.
    """
    driver = webdriver.Chrome()

    try:
        for item_id in list_ID:
            # getSiteUrl calls .isdigit(), so it needs a string.
            url = getSiteUrl(str(item_id))
            util.DbgOut("Hitomi : " + url, True)

            driver.get(url)

            # Wait until the document reports that it has finished loading.
            WebDriverWait(driver, 10).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )

            # Fixed 2 s pause kept from the original; presumably it lets
            # script-rendered content appear - TODO confirm.
            time.sleep(2)

            listRet = parseMangaInfos(driver.page_source)

            for idx, entry in enumerate(listRet):
                print(f"{idx} : {entry}")

            # Open the file once per ID. The original reopened it in 'w' mode
            # for every entry and then failed on `file.write( + "\n")`.
            strFileName = f"{item_id}.txt"
            try:
                with open(strFileName, "w", encoding="utf-8") as file:
                    file.writelines(f"{entry}\n" for entry in listRet)
            except IOError:
                util.DbgOut(f"Error: Could not write to the file at {strFileName}.", True)

    except Exception as e:
        # The exception text belongs in the message; the second argument of
        # DbgOut is used as a flag everywhere else in this file.
        util.DbgOut(f"Hitomi Loading Error : {e}", True)
    finally:
        driver.quit()
|
||||
|
||||
|
||||
def getSiteUrl(searchWord):
    """Return the hitomi.la URL for a search word or a numeric gallery ID.

    A word made only of digits maps to the gallery page
    ``galleries/<word>.html``; anything else maps to ``search.html?<word>``.
    """
    base = "https://hitomi.la/"

    if searchWord.isdigit():
        return f"{base}galleries/{searchWord}.html"

    return f"{base}search.html?{searchWord}"
|
||||
|
||||
#
|
||||
def parseMangaInfos(html_doc):
    """Parse an HTML document and return the entries found in its gallery blocks.

    Every element with class ``gallery-content`` is handed to ``djParse``;
    the lists it returns are concatenated in document order.
    """
    # Build the BeautifulSoup tree with the built-in HTML parser.
    document = BeautifulSoup(html_doc, 'html.parser')

    return [
        entry
        for gallery in document.find_all(class_='gallery-content')
        for entry in djParse(gallery)
    ]
|
||||
|
||||
|
||||
def djParse(soup_element):
    """Build one info object for every ``dj`` element below ``soup_element``.

    Each entry is created by ``djTitleParse`` and then filled in place by
    ``djArtistParse`` and ``djDescParse``; the lists those two return are
    not used here.
    """
    listInfos = []

    for entry in soup_element.find_all(class_='dj'):
        parsed = djTitleParse(entry)

        djArtistParse(entry, parsed)
        djDescParse(entry, parsed)

        listInfos.append(parsed)

    return listInfos
|
||||
|
||||
|
||||
def djTitleParse(input_element):
    """Create an ``info.CBZInfo`` from the entry's ``h1.lillie`` heading.

    The heading text becomes the title and the ``href`` of the first link
    inside the heading becomes the URL.
    """
    heading = input_element.find('h1', class_='lillie')

    return info.CBZInfo(heading.text, heading.find('a').get('href'))
|
||||
|
||||
|
||||
def djArtistParse(input_element, retPtr):
    """Collect the artists linked inside the entry's ``artist-list`` div.

    Each link text is registered on ``retPtr`` through ``AddArtist``, and a
    matching ``info.TagInfo(text, href)`` is returned in document order.
    """
    artist_box = input_element.find('div', class_='artist-list')

    listArtists = []
    for link in artist_box.find_all('a'):
        name = link.text
        retPtr.AddArtist(name)
        listArtists.append(info.TagInfo(name, link.get('href')))

    return listArtists
|
||||
|
||||
|
||||
def djDescParse(input_element, retPtr):
|
||||
element = input_element.find('table', class_='dj-desc')
|
||||
tb_rows = element.find_all('tr')
|
||||
listTags = []
|
||||
for row in tb_rows:
|
||||
tds = row.find_all('td')
|
||||
if 2 != len(tds):
|
||||
util.DbgOut("Warning : td get failed")
|
||||
continue
|
||||
|
||||
|
||||
outMsg = f"{tds[0].text} : \r\n"
|
||||
|
||||
a_tags = tds[1].find_all('a')
|
||||
for tag in a_tags:
|
||||
tag_name = tag.text
|
||||
tag_url = tag.get('href')
|
||||
|
||||
retPtr.AddTag(tag_name)
|
||||
|
||||
listTags.append(info.TagInfo(tag_name, tag_url))
|
||||
|
||||
outMsg += f" {tag_name} {tag_url}\r\n"
|
||||
|
||||
#util.DbgOut(outMsg)
|
||||
|
||||
#
|
||||
if "Series" == tds[0]:
|
||||
retPtr.serires = listTags[-1].name
|
||||
elif "Type" == tds[0]:
|
||||
retPtr.type = listTags[-1].name
|
||||
elif "Language" == tds[0]:
|
||||
retPtr.language = listTags[-1].name
|
||||
strURL = ""
|
||||
if strWord.isdigit():
|
||||
strURL = self.getSiteUrlForGallery(int(strWord))
|
||||
else:
|
||||
pass
|
||||
|
||||
return listTags
|
||||
strURL = self.getSiteUrlForSearch(strWord)
|
||||
|
||||
util.DbgOut(f"Hitomi : {strURL}", True)
|
||||
|
||||
driver = webdriver.Chrome()
|
||||
driver.get(strURL)
|
||||
|
||||
# 웹페이지가 로드될 때까지 기다리기
|
||||
try:
|
||||
WebDriverWait(driver, 10).until(
|
||||
#EC.presence_of_element_located((By.CLASS_NAME, 'lillie'))
|
||||
lambda d: d.execute_script("return document.readyState") == "complete"
|
||||
)
|
||||
except TimeoutException:
|
||||
util.DbgOut("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.", True)
|
||||
driver.quit()
|
||||
return
|
||||
|
||||
strContent = driver.page_source
|
||||
driver.quit()
|
||||
|
||||
if True == bSaveHTML:
|
||||
strFileName = f"{strWord}_result.html"
|
||||
with open(strFileName, "w", encoding="utf-8") as file:
|
||||
file.write(strContent)
|
||||
|
||||
util.DbgOut(f"HTML content saved to {strFileName}", True)
|
||||
|
||||
listRet = self.parseMangaInfos(strContent)
|
||||
|
||||
for Idx in range(len(listRet)):
|
||||
util.DbgOut(f"{Idx} : {listRet[Idx]}", True)
|
||||
|
||||
#
|
||||
def GetListSearchResult(self, listID: list[int], bSave: bool = False):
    """Load each gallery page in one browser session and report the parsed entries.

    Args:
        listID: Gallery IDs; each is turned into a URL with
            ``getSiteUrlForGallery``.
        bSave: When True, the parsed entries of each gallery are also written
            to ``<ID>.txt`` (UTF-8, one entry per line).

    Returns:
        None. A failure while loading or parsing is logged through
        ``util.DbgOut`` and stops the remaining IDs; a failure while writing
        a file is logged and only skips that file.
    """
    driver = webdriver.Chrome()

    try:
        for nID in listID:
            strURL = self.getSiteUrlForGallery(nID)
            util.DbgOut(f"Hitomi : {strURL}", True)

            driver.get(strURL)

            # Wait until the document reports that it has finished loading.
            WebDriverWait(driver, 10).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )

            # Fixed 2 s pause kept from the original; presumably it lets
            # script-rendered content appear - TODO confirm.
            time.sleep(2)

            listRet = self.parseMangaInfos(driver.page_source)

            for nIdx, entry in enumerate(listRet):
                print(f"{nIdx} : {entry}")

            if not bSave:
                continue

            # Name the file after the loop variable. The original formatted
            # the builtin `id` and then failed on `file.write( + "\n")`,
            # which aborted every remaining ID via the outer handler.
            strFileName = f"{nID}.txt"
            try:
                with open(strFileName, "w", encoding="utf-8") as file:
                    file.writelines(f"{entry}\n" for entry in listRet)
            except IOError:
                util.DbgOut(f"Error: Could not write to the file at {strFileName}.", True)

    except Exception as e:
        util.DbgOut(f"Hitomi Loading Error : {e}", True)
    finally:
        driver.quit()
|
||||
|
||||
|
||||
def getSiteUrlForSearch(self, searchWord: str) -> str:
    """Return the search-page URL: base URL + ``search.html?`` + the search word."""
    return "{}search.html?{}".format(self.m_strBaseURL, searchWord)
|
||||
|
||||
def getSiteUrlForGallery(self, nHitomiID: int) -> str:
    """Return the gallery-page URL: base URL + ``galleries/<ID>.html``."""
    return "{}galleries/{}.html".format(self.m_strBaseURL, nHitomiID)
|
||||
|
||||
#
|
||||
def parseMangaInfos(self, html_doc : str) -> list[info.CBZInfo]:
    """Parse an HTML document and return the entries found in its gallery blocks.

    Every element with class ``gallery-content`` is handed to
    ``self.djParse``; the lists it returns are concatenated in document order.
    """
    # Build the BeautifulSoup tree with the built-in HTML parser.
    document = BeautifulSoup(html_doc, 'html.parser')

    return [
        entry
        for gallery in document.find_all(class_='gallery-content')
        for entry in self.djParse(gallery)
    ]
|
||||
|
||||
#
|
||||
def djParse(self, soup_element) -> list[info.CBZInfo]:
    """Build one info object for every ``dj`` element below ``soup_element``.

    Each entry is created by ``djTitleParse`` and then filled in place by
    ``djArtistParse`` and ``djDescParse``.
    """
    listInfos = []

    for entry in soup_element.find_all(class_='dj'):
        parsed = self.djTitleParse(entry)
        self.djArtistParse(entry, parsed)
        self.djDescParse(entry, parsed)

        listInfos.append(parsed)

    return listInfos
|
||||
|
||||
#
|
||||
def djTitleParse(self, input_element):
    """Create an ``info.CBZInfo`` from the entry's ``h1.lillie`` heading.

    The heading text becomes the title and the ``href`` of the first link
    inside the heading becomes the URL.
    """
    heading = input_element.find('h1', class_='lillie')

    return info.CBZInfo(heading.text, heading.find('a').get('href'))
|
||||
|
||||
#
|
||||
def djArtistParse(self, input_element, retPtr):
    """Register every artist linked in the entry's ``artist-list`` div.

    The text of each link is passed to ``retPtr.AddArtist`` in document
    order; nothing is returned.
    """
    artist_box = input_element.find('div', class_='artist-list')

    for link in artist_box.find_all('a'):
        retPtr.AddArtist(link.text)
|
||||
|
||||
#
|
||||
def djDescParse(self, input_element, retPtr):
    """Collect the tag links from the entry's ``dj-desc`` table.

    For every table row with exactly two cells, each link in the second cell
    is registered on ``retPtr`` through ``AddTag`` and recorded as
    ``info.TagInfo(text, href)``. Rows whose first cell reads ``Series``,
    ``Type`` or ``Language`` additionally store the name of that row's last
    tag on ``retPtr``.

    Args:
        input_element: Parsed element of one entry containing the table.
        retPtr: Info object that is updated in place.

    Returns:
        List of all ``info.TagInfo`` objects created, in document order.
    """
    element = input_element.find('table', class_='dj-desc')

    listTags = []
    for row in element.find_all('tr'):
        tds = row.find_all('td')
        if 2 != len(tds):
            util.DbgOut("Warning : td get failed")
            continue

        # Keep this row's tags separate so the label handling below never
        # picks up a tag that belongs to an earlier row.
        rowTags = []
        for tag in tds[1].find_all('a'):
            tag_name = tag.text
            retPtr.AddTag(tag_name)
            rowTags.append(info.TagInfo(tag_name, tag.get('href')))

        listTags.extend(rowTags)

        if not rowTags:
            # No link in this row, so there is nothing to store for the label.
            continue

        # Compare the cell's text. The original compared the element object
        # itself with a string, which never matched.
        strLabel = tds[0].text.strip()
        if "Series" == strLabel:
            # NOTE(review): attribute name "serires" is kept from the original;
            # it looks like a typo for "series" - confirm against DataClass.
            retPtr.serires = rowTags[-1].name
        elif "Type" == strLabel:
            retPtr.type = rowTags[-1].name
        elif "Language" == strLabel:
            retPtr.language = rowTags[-1].name

    return listTags
|
||||
|
||||
|
||||
|
||||
def main():
    """Manual smoke test: look up "11107" through ``GetSearchResult`` with HTML saving enabled."""
    # Hitomi search test
    scraper = GetArc_Hitomi()

    # Search by keyword (disabled):
    #   scraper.GetSearchResult("test")

    # Search by ID
    scraper.GetSearchResult("11107", True)

    # Search by a list of IDs (disabled):
    #   scraper.GetListSearchResult([1234567, 2345678, 3456789], True)


# Script entry point
if __name__ == '__main__':
    main()
|
||||
|
||||
Reference in New Issue
Block a user