# CollectMangaInfo/GetArc_Hitomi.py

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

import time
import UtilPack as util
import DataClass as info

#
class GetArc_Hitomi:
    """Collect gallery metadata from hitomi.la using a Selenium-driven Chrome.

    Pages are loaded in a real browser, the rendered HTML is handed to
    BeautifulSoup, and every ``dj`` entry found inside a ``gallery-content``
    container is converted into an ``info.CBZInfo`` object.
    """

    m_strBaseURL = "https://hitomi.la/"

    # Class-level (shared) list kept for backward compatibility; nothing in
    # this class reads or writes it.
    m_listTagsTemp: "list[info.TagInfo]" = []

    def __init__(self):
        pass

    def GetSearchResult(self, strWord: str, bSaveHTML: bool = False):
        """Load one search/gallery page and log every entry parsed from it.

        Args:
            strWord: A gallery ID (decimal digits only) or a free-text search
                expression.
            bSaveHTML: When true, the rendered page source is also written to
                ``"<strWord>_result.html"`` in the current directory.

        Returns:
            None. Results are reported through ``util.DbgOut``.
        """
        if util.IsEmptyStr(strWord):
            util.DbgOut("Error : SearchWord is empty", True)
            return

        # isdecimal() (unlike isdigit()) only accepts characters int() can
        # actually parse, so inputs such as "²" fall through to a text search
        # instead of raising ValueError.
        if strWord.isdecimal():
            strURL = self.getSiteUrlForGallery(int(strWord))
        else:
            strURL = self.getSiteUrlForSearch(strWord)

        util.DbgOut(f"Hitomi : {strURL}", True)

        driver = webdriver.Chrome()
        try:
            driver.get(strURL)

            # Wait until the document reports that it has finished loading.
            try:
                WebDriverWait(driver, 10).until(
                    lambda d: d.execute_script("return document.readyState") == "complete"
                )
            except TimeoutException:
                util.DbgOut("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.", True)
                return

            strContent = driver.page_source
        finally:
            # Always release the browser, including when navigation or the
            # page-source read raises.
            driver.quit()

        if bSaveHTML:
            strFileName = f"{strWord}_result.html"
            with open(strFileName, "w", encoding="utf-8") as file:
                file.write(strContent)

            util.DbgOut(f"HTML content saved to {strFileName}", True)

        listRet = self.parseMangaInfos(strContent)

        for Idx, cbzInfo in enumerate(listRet):
            util.DbgOut(f"{Idx} : {cbzInfo}", True)

    def GetListSearchResult(self, listID: list[int], bSave: bool = False):
        """Load the gallery page of every ID and print the parsed entries.

        A single browser session is reused for all IDs. Any exception raised
        while loading or parsing is logged and stops the remaining IDs; the
        browser is closed in every case.

        Args:
            listID: Gallery IDs to visit. An empty (or ``None``) list returns
                immediately without starting a browser.
            bSave: When true, the entries parsed for each ID are also written
                to ``"<ID>.txt"``, one entry per line.

        Returns:
            None.
        """
        if not listID:
            return

        driver = webdriver.Chrome()

        try:
            for nID in listID:
                strURL = self.getSiteUrlForGallery(nID)
                util.DbgOut(f"Hitomi : {strURL}", True)

                driver.get(strURL)

                # Wait until the document reports that it has finished loading.
                WebDriverWait(driver, 10).until(
                    lambda d: d.execute_script("return document.readyState") == "complete"
                )

                # Extra fixed delay after readyState is "complete"
                # (presumably to let scripts fill in the page; TODO confirm).
                time.sleep(2)

                listRet = self.parseMangaInfos(driver.page_source)

                for Idx, cbzInfo in enumerate(listRet):
                    print(f"{Idx} : {cbzInfo}")

                if bSave:
                    self._saveInfos(nID, listRet)

        except Exception as e:
            util.DbgOut(f"Hitomi Loading Error : {e}", True)
        finally:
            driver.quit()

    def _saveInfos(self, nID: int, listInfos) -> None:
        """Write the string form of each parsed entry to ``"<nID>.txt"``.

        The file is opened once per ID so that later entries do not overwrite
        earlier ones. Write failures are logged instead of raised, so one bad
        file does not abort the remaining IDs.
        """
        strFileName = f"{nID}.txt"
        try:
            with open(strFileName, "w", encoding="utf-8") as file:
                file.writelines(f"{cbzInfo}\n" for cbzInfo in listInfos)
        except OSError:
            util.DbgOut(f"Error: Could not write to the file at {strFileName}.", True)

    def getSiteUrlForSearch(self, searchWord: str) -> str:
        """Return the search page URL for ``searchWord`` (inserted verbatim)."""
        return f"{self.m_strBaseURL}search.html?{searchWord}"

    def getSiteUrlForGallery(self, nHitomiID: int) -> str:
        """Return the gallery page URL for the given gallery ID."""
        return f"{self.m_strBaseURL}galleries/{nHitomiID}.html"

    def parseMangaInfos(self, html_doc: str) -> "list[info.CBZInfo]":
        """Parse rendered HTML and return every entry found in it.

        Args:
            html_doc: Full page source.

        Returns:
            One ``info.CBZInfo`` per ``dj`` element located inside any element
            with class ``gallery-content``, in document order.
        """
        soup = BeautifulSoup(html_doc, 'html.parser')

        listDJs = []
        for element in soup.find_all(class_='gallery-content'):
            listDJs.extend(self.djParse(element))

        return listDJs

    def djParse(self, soup_element) -> "list[info.CBZInfo]":
        """Build one ``info.CBZInfo`` for each ``dj`` element under ``soup_element``."""
        listInfos = []
        for child in soup_element.find_all(class_='dj'):
            # Named cbzInfo (not "info") so the DataClass module alias is not
            # shadowed inside this method.
            cbzInfo = self.djTitleParse(child)
            self.djArtistParse(child, cbzInfo)
            self.djDescParse(child, cbzInfo)

            listInfos.append(cbzInfo)

        return listInfos

    def djTitleParse(self, input_element):
        """Create an ``info.CBZInfo`` from the entry's ``h1.lillie`` heading.

        The title is the heading's text and the URL is the ``href`` of the
        first link inside it. Raises ``AttributeError`` when the heading or
        its link is missing.
        """
        element = input_element.find('h1', class_='lillie')
        strTitle: str = element.text

        a_tag = element.find('a')
        strURL: str = a_tag.get('href')

        return info.CBZInfo(strTitle, strURL)

    def djArtistParse(self, input_element, retPtr):
        """Add the text of every link in ``div.artist-list`` to ``retPtr``.

        Does nothing when the entry has no artist list.
        """
        element = input_element.find('div', class_='artist-list')
        if element is None:
            return

        for tag in element.find_all('a'):
            retPtr.AddArtist(tag.text)

    def djDescParse(self, input_element, retPtr):
        """Read the ``table.dj-desc`` rows of an entry into ``retPtr``.

        Every link in the value cell of a two-cell row is added to ``retPtr``
        as a tag. For rows labelled ``Series``, ``Type`` or ``Language`` the
        text of the row's last link is additionally stored on ``retPtr``.

        Args:
            input_element: The ``dj`` element being parsed.
            retPtr: The entry object to fill in (uses ``AddTag`` and the
                ``serires`` / ``type`` / ``language`` attributes).

        Returns:
            A list of ``info.TagInfo(name, url)`` for every link found; empty
            when the table is missing.
        """
        element = input_element.find('table', class_='dj-desc')
        if element is None:
            util.DbgOut("Warning : dj-desc table not found")
            return []

        listTags = []
        for row in element.find_all('tr'):
            tds = row.find_all('td')
            if 2 != len(tds):
                util.DbgOut("Warning : td get failed")
                continue

            # Compare against the label cell's text; comparing the cell
            # object itself with a string is never equal.
            strLabel = tds[0].text.strip()

            # Track the last link of THIS row only, so a row without links
            # never picks up a tag left over from an earlier row.
            strLastName = None
            for tag in tds[1].find_all('a'):
                tag_name = tag.text

                retPtr.AddTag(tag_name)
                listTags.append(info.TagInfo(tag_name, tag.get('href')))
                strLastName = tag_name

            if strLastName is None:
                continue

            if "Series" == strLabel:
                # NOTE(review): attribute spelling "serires" is kept from the
                # original code; confirm it matches the field declared on
                # CBZInfo.
                retPtr.serires = strLastName
            elif "Type" == strLabel:
                retPtr.type = strLastName
            elif "Language" == strLabel:
                retPtr.language = strLastName

        return listTags


def main():
    """Manual smoke test: look up one gallery by ID and save its HTML."""
    scraper = GetArc_Hitomi()

    # Alternative: search by keyword
    #scraper.GetSearchResult("test")

    # Search by gallery ID; the rendered page is also saved to disk
    scraper.GetSearchResult("11107", bSaveHTML=True)

    # Alternative: look up a list of gallery IDs
    #listID = [1234567, 2345678, 3456789]
    #scraper.GetListSearchResult(listID, True)

# Script entry point: run the manual test only when executed directly.
if __name__ == "__main__":
    main()