# CollectMangaInfo/GetArc_Ehentai.py

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup


class MangaMetaInfo:
    """Simple record holding the metadata of a single manga entry.

    Every constructor argument is stored unchanged on the instance under
    the attribute of the same name.
    """

    def __init__(self, title, url, lang, manType, *tags):
        """Store the given values on the new instance.

        Args:
            title: Value kept as ``self.title``.
            url: Value kept as ``self.url``.
            lang: Value kept as ``self.lang``.
            manType: Value kept as ``self.manType``.
            *tags: Any further positional arguments; kept together as the
                tuple ``self.tags`` (empty tuple when none are given).
        """
        self.title, self.url = title, url
        self.lang, self.manType = lang, manType
        self.tags = tags

            #series
            #type
            #languages
            #tags


class MangaInfo:
    """Load a search-result page in Chrome and print the entries it lists.

    ``GetSearchResult`` is the entry point. The remaining methods walk the
    fetched HTML with BeautifulSoup and print the title, artists and
    description table of every entry found. Nothing is returned or stored.

    NOTE(review): the CSS class names used below ('lillie',
    'gallery-content', 'dj', 'artist-list', 'dj-desc') are site specific;
    confirm they match the markup served by the URL in ``getSiteUrl``.
    """

    # Class-level list, therefore shared by every instance. No method of
    # this class reads or writes it.
    listResult = []

    def GetSearchResult(self, searchWord):
        """Fetch the search page for ``searchWord`` and print its entries.

        Args:
            searchWord: Text appended unchanged to ``getSiteUrl()``; it is
                not URL-encoded here.

        Returns:
            None. If no element with class ``lillie`` appears within
            30 seconds, a message is printed and nothing is parsed.
        """
        url = self.getSiteUrl() + searchWord
        driver = webdriver.Chrome()
        # The finally clause guarantees the browser is shut down even when
        # navigation or the wait raises something other than a timeout.
        try:
            driver.get(url)

            # Wait until the web page has loaded.
            try:
                WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'lillie'))
                )
            except TimeoutException:
                print("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.")
                return

            strContent = driver.page_source
        finally:
            driver.quit()

        self.parseMangaInfos(strContent)

    def getSiteUrl(self):
        """Return the search URL prefix to which the search word is appended."""
        return "https://e-hentai.org/search.html?"

    def parseMangaInfos(self, html_doc):
        """Parse ``html_doc`` and print every entry in each gallery container.

        Args:
            html_doc: HTML source of the search-result page.
        """
        # Create the BeautifulSoup object.
        soup = BeautifulSoup(html_doc, 'html.parser')

        for element in soup.find_all(class_='gallery-content'):
            self.djParse(element)

    def djParse(self, soup_element):
        """Print title, artists and description of each ``dj`` child.

        Args:
            soup_element: Element whose descendants with class ``dj`` are
                processed one by one; a blank separator is printed after
                each of them.
        """
        for child in soup_element.find_all(class_='dj'):
            self.djtitleParse(child)
            self.artistlistParse(child)
            self.djDescParse(child)

            print("\r\n")

    @staticmethod
    def _getHref(tag):
        """Return the ``href`` of ``tag``, or '' if the tag or attribute is missing."""
        if tag is None:
            return ""
        return tag.get('href') or ""

    def djtitleParse(self, soup_element):
        """Print the title text and link of one entry.

        Args:
            soup_element: Entry element expected to contain an
                ``<h1 class="lillie">`` wrapping an ``<a>``.

        Prints "title get failed" instead of raising when the heading is
        absent; a missing link or ``href`` is printed as an empty URL.
        """
        element = soup_element.find('h1', class_='lillie')
        if element is None:
            print("title get failed")
            return

        title = element.text
        a_url = self._getHref(element.find('a'))

        print("title : " + title)
        print("URl : " + a_url)

    def artistlistParse(self, soup_element):
        """Print the name and link of every artist of one entry.

        Args:
            soup_element: Entry element expected to contain a
                ``<div class="artist-list">`` with one ``<a>`` per artist.

        Prints "artist-list get failed" instead of raising when the
        container is absent; a missing ``href`` is printed as empty.
        """
        element = soup_element.find('div', class_='artist-list')
        if element is None:
            print("artist-list get failed")
            return

        print("artists")

        for tag in element.find_all('a'):
            print("    " + tag.text + " " + self._getHref(tag))

    def djDescParse(self, soup_element):
        """Print the rows of the description table of one entry.

        Each row is expected to have exactly two cells: a label and a cell
        holding ``<a>`` links. Rows with any other cell count are reported
        with "td get failed" and skipped.

        Args:
            soup_element: Entry element expected to contain a
                ``<table class="dj-desc">``.

        Prints "dj-desc get failed" instead of raising when the table is
        absent; a missing ``href`` is printed as empty.
        """
        element = soup_element.find('table', class_='dj-desc')
        if element is None:
            print("dj-desc get failed")
            return

        for row in element.find_all('tr'):
            tds = row.find_all('td')
            if len(tds) != 2:
                print("td get failed")
                continue

            print(tds[0].text + " : ")

            for tag in tds[1].find_all('a'):
                print("        " + tag.text + " " + self._getHref(tag))