# CollectMangaInfo/GetArc_Hitomi.py

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

import UtilPack as util

# Module-level list. None of the functions visible in this file append to it
# or read from it.
# NOTE(review): presumably meant to collect parsed search results - confirm
# against callers before relying on it.
listResult = []

def GetSearchResult(searchWord):
    """Load the Hitomi page for *searchWord* in Chrome and parse its galleries.

    The URL is built with getSiteUrl(), opened in a fresh Chrome WebDriver,
    and once an element with class 'lillie' is present the page source is
    handed to parseMangaInfos(), which prints what it finds.

    Args:
        searchWord: Search query appended to the search URL; an empty value
            (as judged by util.IsEmptyStr) loads the site's front page.

    Returns:
        None. If the 'lillie' element does not appear within 30 seconds a
        debug message is emitted and nothing is parsed.
    """
    url = getSiteUrl(searchWord)

    util.DbgOut("Hitomi : " + url)

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Wait until the web page has loaded (a 'lillie' element is present).
        try:
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'lillie'))
            )
        except TimeoutException:
            util.DbgOut("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.")
            return

        strContent = driver.page_source
    finally:
        # Always shut the browser down, including when get()/page_source
        # raise something other than TimeoutException; otherwise the Chrome
        # process would be leaked.
        driver.quit()

    parseMangaInfos(strContent)

#
def getSiteUrl(searchWord):
    """Return the Hitomi URL to open for *searchWord*.

    When util.IsEmptyStr(searchWord) compares equal to False, the result is
    the search page URL with *searchWord* appended verbatim after '?'.
    Otherwise the bare site root is returned.
    """
    base_url = "https://hitomi.la/"
    has_query = (False == util.IsEmptyStr(searchWord))
    return base_url + "search.html?" + searchWord if has_query else base_url


#
def parseMangaInfos(html_doc):
    """Parse *html_doc* and run djParse() on every 'gallery-content' element.

    Args:
        html_doc: HTML markup of the loaded page.
    """
    # Create the BeautifulSoup object and walk its gallery containers.
    document = BeautifulSoup(html_doc, 'html.parser')
    for gallery in document.find_all(class_='gallery-content'):
        djParse(gallery)


def djParse(soup_element):
    """Print title, artists and description for each 'dj' entry.

    Every descendant of *soup_element* carrying the class 'dj' is passed, in
    order, to djtitleParse(), artistlistParse() and djDescParse(); a blank
    separator is printed after each entry.
    """
    for entry in soup_element.find_all(class_='dj'):
        for parse_section in (djtitleParse, artistlistParse, djDescParse):
            parse_section(entry)

        print("\r\n")


def djtitleParse(soup_element):
    """Print the title and link of one gallery entry.

    Looks up the 'h1.lillie' heading inside *soup_element* and prints its
    text, followed by the href of the first anchor inside that heading.

    Args:
        soup_element: BeautifulSoup element for a single entry.

    Returns:
        None. Nothing is printed when the heading is missing; an empty URL
        is printed when the heading has no anchor or the anchor has no href.
    """
    element = soup_element.find('h1', class_='lillie')
    if element is None:
        # find() yields None when nothing matches; skip the entry instead of
        # failing on attribute access and aborting the whole parse.
        return

    title = element.text

    a_tag = element.find('a')
    # Tag.get() would return None for a missing href, which cannot be
    # concatenated to a str; fall back to an empty string.
    a_url = a_tag.get('href', '') if a_tag is not None else ''

    print("title : " + title)
    print("URl : " + a_url)

def artistlistParse(soup_element):
    """Print the artists listed for one gallery entry.

    Prints an "artists" header, then one indented line per anchor found in
    the 'div.artist-list' element of *soup_element*, showing the anchor text
    and its href.

    Args:
        soup_element: BeautifulSoup element for a single entry.

    Returns:
        None. Only the header is printed when the artist list is missing.
    """
    element = soup_element.find('div', class_='artist-list')

    print("artists")

    if element is None:
        # find() yields None when nothing matches; there are no artists to
        # list, so stop here rather than failing on find_all().
        return

    for tag in element.find_all('a'):
        # Default to '' so an anchor without href does not break the
        # string concatenation.
        print("    " + tag.text + " " + tag.get('href', ''))


def djDescParse(soup_element):
    """Print the description table of one gallery entry.

    For every row of the 'table.dj-desc' element inside *soup_element* that
    has exactly two cells, prints the first cell's text as a label and then
    one indented line per anchor in the second cell (text and href). Rows
    with any other number of cells are reported with "td get failed".

    Args:
        soup_element: BeautifulSoup element for a single entry.

    Returns:
        None. Nothing is printed when the description table is missing.
    """
    element = soup_element.find('table', class_='dj-desc')
    if element is None:
        # find() yields None when nothing matches; skip instead of failing
        # on find_all() and aborting the whole parse.
        return

    for row in element.find_all('tr'):
        tds = row.find_all('td')
        if len(tds) != 2:
            print("td get failed")
            continue

        print(tds[0].text + " : ")

        for tag in tds[1].find_all('a'):
            # Default to '' so an anchor without href does not break the
            # string concatenation.
            print("        " + tag.text + " " + tag.get('href', ''))