Update DataClass.py, GetArc_Ehentai.py, and 3 more files...

E-hentai Page Parse
DataClass added
This commit is contained in:
2024-04-05 21:51:36 +09:00
parent dbe5377d6f
commit 809748a73a
5 changed files with 202 additions and 192 deletions

13
DataClass.py Normal file
View File

@@ -0,0 +1,13 @@
class MangaMetaInfo:
    """Metadata for a single manga entry scraped from a gallery site.

    Attributes:
        title: Display title of the entry.
        url: Link to the entry's detail page.
        lang: Language of the release.
        manType: Gallery category/type (e.g. doujinshi, manga).
        tags: Tuple of tag strings attached to the entry (captured via *tags).
    """

    def __init__(self, title, url, lang, manType, *tags):
        self.title = title
        self.url = url
        self.lang = lang
        self.manType = manType
        self.tags = tags

View File

@@ -6,113 +6,110 @@ from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
class MangaMetaInfo: listResult = []
def __init__(self, title, url, lang, manType, *tags):
self.title = title
self.url = url
self.lang = lang
self.manType = manType
self.tags = tags
pass
#series def GetSearchResult(searchWord):
#type print("E-hentai start")
#languages #url = getSiteUrl() + searchWord
#tags url = "https://e-hentai.org/"
driver = webdriver.Chrome()
driver.get(url)
class MangaInfo:
listResult = [] # 웹페이지가 로드될 때까지 기다리기
try:
def GetSearchResult(self, searchWord): WebDriverWait(driver, 30).until(
url = self.getSiteUrl() + searchWord EC.presence_of_element_located((By.CLASS_NAME, 'dp'))
driver = webdriver.Chrome() )
driver.get(url) except TimeoutException:
print("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.")
# 웹페이지가 로드될 때까지 기다리기
try:
WebDriverWait(driver, 30).until(
EC.presence_of_element_located((By.CLASS_NAME, 'lillie'))
)
except TimeoutException:
print("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.")
driver.quit()
return
strContent = driver.page_source
driver.quit() driver.quit()
return
self.parseMangaInfos(strContent) strContent = driver.page_source
driver.quit()
pass
#
def getSiteUrl(self):
strRet = "https://e-hentai.org/search.html?"
return strRet
# parseMangaInfos(strContent)
def parseMangaInfos(self, html_doc):
# BeautifulSoup 객체 생성
soup = BeautifulSoup(html_doc, 'html.parser')
gallery_elements = soup.find_all(class_='gallery-content')
for element in gallery_elements:
self.djParse(element)
def djParse(self, soup_element): pass
childs = soup_element.find_all(class_='dj')
for child in childs:
self.djtitleParse(child)
self.artistlistParse(child)
self.djDescParse(child)
print("\r\n")
def djtitleParse(self, soup_element):
element = soup_element.find('h1', class_='lillie')
title = element.text
a_tag = element.find('a')
a_url = a_tag.get('href')
print("title : " + title)
print("URl : " + a_url)
def artistlistParse(self, soup_element):
element = soup_element.find('div', class_='artist-list')
print("artists")
a_tags = element.find_all('a')
for tag in a_tags:
artist = tag.text
a_url = tag.get('href')
print(" " + artist + " " + a_url)
def djDescParse(self, soup_element): #
element = soup_element.find('table', class_='dj-desc') def getSiteUrl():
tb_rows = element.find_all('tr') strRet = "https://e-hentai.org/?f_search="
for row in tb_rows:
tds = row.find_all('td') return strRet
if 2 != len(tds):
print("td get failed")
continue #
def parseMangaInfos(html_doc):
print(tds[0].text + " : ") # BeautifulSoup 객체 생성
soup = BeautifulSoup(html_doc, 'html.parser')
a_tags = tds[1].find_all('a') gallery_table = soup.find('table', class_='itg gltc')
for tag in a_tags: gls = gallery_table.find_all('tr')
tag_name = tag.text
tag_url = tag.get('href') idx = 1;
print(" " + tag_name + " " + tag_url) for gl in gls:
gl1cParse(gl)
pass gl2cParse(gl)
gl3cParse(gl)
print("\r\n")
# type
def gl1cParse(soup_element):
    """Print the gallery category ("type" column) of one result-table row.

    soup_element: a bs4 Tag for one <tr> of the e-hentai result table.
    Rows without a 'gl1c glcat' cell (e.g. header rows) are skipped silently.
    """
    element = soup_element.find('td', class_='gl1c glcat')
    if element is None:
        return
    man_type = element.find('div')
    # Guard: a cell without the inner <div> would raise AttributeError on .text.
    if man_type is None:
        return
    print("type : " + man_type.text)
# torrent
def gl2cParse(soup_element):
    """Print the torrent link of one result-table row, or 'none' if absent."""
    cell = soup_element.find('td', class_='gl2c')
    if cell is None:
        return
    # The torrent anchor lives inside the 'gldown' download widget.
    anchor = cell.find('div', class_='gldown').find('a')
    if anchor:
        print("torrent : " + anchor.get('href'))
    else:
        print("torrent : none")
#
def gl3cParse(soup_element):
    """Print title, detail-page URL and tag list of one result-table row."""
    cell = soup_element.find('td', class_='gl3c glname')
    if cell is None:
        return
    link_tag = cell.find('a')
    title_div = cell.find('div', class_='glink')
    print("title : " + title_div.text)
    print("Url : " + link_tag.get('href'))
    print("tags : ")
    # Tag names are carried in the 'title' attribute of each 'gt' div.
    collected = [div.get('title') for div in cell.find_all('div', class_='gt')]
    for name in collected:
        print(" " + name)
    print(len(collected))

View File

@@ -21,98 +21,96 @@ class MangaMetaInfo:
#tags #tags
class MangaInfo: listResult = []
listResult = [] def GetSearchResult(searchWord):
url = getSiteUrl() + searchWord
def GetSearchResult(self, searchWord): driver = webdriver.Chrome()
url = self.getSiteUrl() + searchWord driver.get(url)
driver = webdriver.Chrome()
driver.get(url) # 웹페이지가 로드될 때까지 기다리기
try:
# 웹페이지가 로드될 때까지 기다리기 WebDriverWait(driver, 30).until(
try: EC.presence_of_element_located((By.CLASS_NAME, 'lillie'))
WebDriverWait(driver, 30).until( )
EC.presence_of_element_located((By.CLASS_NAME, 'lillie')) except TimeoutException:
) print("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.")
except TimeoutException:
print("페이지가 로드되지 않았거나 요소를 찾을 수 없습니다.")
driver.quit()
return
strContent = driver.page_source
driver.quit() driver.quit()
return
self.parseMangaInfos(strContent) strContent = driver.page_source
driver.quit()
pass
#
def getSiteUrl(self):
strRet = "https://hitomi.la/search.html?"
return strRet
# parseMangaInfos(strContent)
def parseMangaInfos(self, html_doc):
# BeautifulSoup 객체 생성
soup = BeautifulSoup(html_doc, 'html.parser')
gallery_elements = soup.find_all(class_='gallery-content')
for element in gallery_elements:
self.djParse(element)
def djParse(self, soup_element): pass
childs = soup_element.find_all(class_='dj')
#
for child in childs: def getSiteUrl():
self.djtitleParse(child) strRet = "https://hitomi.la/search.html?"
self.artistlistParse(child)
self.djDescParse(child)
print("\r\n")
def djtitleParse(self, soup_element):
element = soup_element.find('h1', class_='lillie')
title = element.text
a_tag = element.find('a') return strRet
a_url = a_tag.get('href')
print("title : " + title) #
print("URl : " + a_url) def parseMangaInfos(html_doc):
# BeautifulSoup 객체 생성
soup = BeautifulSoup(html_doc, 'html.parser')
gallery_elements = soup.find_all(class_='gallery-content')
for element in gallery_elements:
djParse(element)
def artistlistParse(self, soup_element):
element = soup_element.find('div', class_='artist-list') def djParse(soup_element):
childs = soup_element.find_all(class_='dj')
for child in childs:
djtitleParse(child)
artistlistParse(child)
djDescParse(child)
print("artists") print("\r\n")
a_tags = element.find_all('a')
def djtitleParse(soup_element):
    """Print the gallery title and its detail-page link from the <h1 class="lillie">."""
    heading = soup_element.find('h1', class_='lillie')
    link = heading.find('a')
    print("title : " + heading.text)
    print("URl : " + link.get('href'))  # NOTE(review): "URl" casing kept byte-identical
def artistlistParse(soup_element):
    """Print every artist name and link found in the 'artist-list' block."""
    artist_block = soup_element.find('div', class_='artist-list')
    print("artists")
    for anchor in artist_block.find_all('a'):
        print(" " + anchor.text + " " + anchor.get('href'))
def djDescParse(soup_element):
element = soup_element.find('table', class_='dj-desc')
tb_rows = element.find_all('tr')
for row in tb_rows:
tds = row.find_all('td')
if 2 != len(tds):
print("td get failed")
continue
print(tds[0].text + " : ")
a_tags = tds[1].find_all('a')
for tag in a_tags: for tag in a_tags:
artist = tag.text tag_name = tag.text
a_url = tag.get('href') tag_url = tag.get('href')
print(" " + artist + " " + a_url) print(" " + tag_name + " " + tag_url)
pass
def djDescParse(self, soup_element):
element = soup_element.find('table', class_='dj-desc')
tb_rows = element.find_all('tr')
for row in tb_rows:
tds = row.find_all('td')
if 2 != len(tds):
print("td get failed")
continue
print(tds[0].text + " : ")
a_tags = tds[1].find_all('a')
for tag in a_tags:
tag_name = tag.text
tag_url = tag.get('href')
print(" " + tag_name + " " + tag_url)
pass

View File

@@ -9,7 +9,6 @@ xls_name = "mangaDB.xlsx"
list_MetaInfo = [] list_MetaInfo = []
# #
def GetCurrentTime(): def GetCurrentTime():
# 현재 시간을 구하고 구조체로 변환 # 현재 시간을 구하고 구조체로 변환
@@ -37,8 +36,7 @@ def GetCurrentTime():
# except FileNotFoundError: # except FileNotFoundError:
# wb = Workbook() # wb = Workbook()
# print("xls Created") # print("xls Created")
# ws = wb.active # ws = wb.active
# # time, title, url, tags (comma) # # time, title, url, tags (comma)
# ws['A1'] = "Modified Time" # ws['A1'] = "Modified Time"

10
main.py
View File

@@ -1,8 +1,12 @@
import GetArc_Hitomi as gethitomi import GetArc_Hitomi as getHitomi
import GetArc_Ehentai as getEhentai
def main(): def main():
obj = gethitomi.MangaInfo() #getHitomi.GetSearchResult("trouble sweets")
obj.GetSearchResult("trouble sweets") getEhentai.GetSearchResult("artist%3A%22kotomi+yo-ji%24%22")
#artist:"kotomi yo-ji$"
#"artist%3A%22kotomi+yo-ji%24%22"
# For Main Loop # For Main Loop