From 97e060fa0cf21d3be385b386f355702dc0a765ae Mon Sep 17 00:00:00 2001
From: tom5079
Date: Mon, 5 Oct 2020 18:06:49 +0900
Subject: [PATCH] (WIP) Added update script

---
 .gitignore               |  1 +
 en/tags.json             |  3 ---
 ko.json                  |  0
 scripts/requirements.txt |  2 ++
 scripts/tags.py          | 70 ++++++++++++++++++++++++++++++++++++++++
 scripts/update.py        |  0
 scripts/update.sh        |  0
 template.json            |  0
 8 files changed, 73 insertions(+), 3 deletions(-)
 create mode 100644 .gitignore
 delete mode 100644 en/tags.json
 create mode 100644 ko.json
 create mode 100644 scripts/requirements.txt
 create mode 100644 scripts/tags.py
 create mode 100644 scripts/update.py
 create mode 100644 scripts/update.sh
 create mode 100644 template.json

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..dbe9c82b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.vscode/
\ No newline at end of file
diff --git a/en/tags.json b/en/tags.json
deleted file mode 100644
index c5e02bfb..00000000
--- a/en/tags.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-    "test": "test"
-}
\ No newline at end of file
diff --git a/ko.json b/ko.json
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
new file mode 100644
index 00000000..a98ae430
--- /dev/null
+++ b/scripts/requirements.txt
@@ -0,0 +1,2 @@
+requests
+beautifulsoup4
\ No newline at end of file
diff --git a/scripts/tags.py b/scripts/tags.py
new file mode 100644
index 00000000..5d9d626e
--- /dev/null
+++ b/scripts/tags.py
@@ -0,0 +1,70 @@
+# Prints a JSON list of every tag name, most used first.
+#
+# Tags are scraped from the hitomi.la "alltags" pages. Anything up to the last
+# ':' in a tag is dropped; when several tags end up with the same name, the
+# highest count is kept.
+
+import re
+import sys
+import json
+
+import requests
+from bs4 import BeautifulSoup
+
+# Page suffixes: '123' plus one page per letter a-z
+indices = ['123'] + [chr(ord('a')+i) for i in range(26)]
+tags = dict()
+
+# Captures the number inside a trailing "(...)"
+count_regex = re.compile(r".+\((\d+)\)$")
+
+for index in indices:
+    url = f'https://hitomi.la/alltags-{index}.html'
+
+    # Give up on a stalled connection and stop on HTTP errors rather than
+    # scraping an error page
+    response = requests.get(url, timeout=30)
+    response.raise_for_status()
+
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    for item in soup.select('.content li'):
+        match = count_regex.match(item.text)
+
+        # Report and skip entries without a link or a trailing "(count)"
+        if item.a is None or match is None:
+            print(f'Skipping unrecognized entry on {url}: {item.text!r}',
+                  file=sys.stderr)
+            continue
+
+        tag = item.a.text
+        count = int(match.group(1))
+
+        tags[tag] = count
+
+# Greedy ".+:" consumes up to the last ':' that still leaves text after it
+tag_regex = re.compile(r".+:(.+)$")
+
+
+def clean(tag):
+    """Return what follows the last ':' in `tag`, or `tag` if nothing matches."""
+    match = tag_regex.match(tag)
+
+    if match:
+        return match.group(1)
+
+    return tag
+
+
+# Highest count seen for each cleaned tag name
+temp = dict()
+for k, v in tags.items():
+    tag = clean(k)
+
+    if tag not in temp or v > temp[tag]:
+        temp[tag] = v
+
+# Tag names only, highest count first
+tags = sorted(temp, key=temp.get, reverse=True)
+
+print(json.dumps(tags, indent=4))
diff --git a/scripts/update.py b/scripts/update.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/update.sh b/scripts/update.sh
new file mode 100644
index 00000000..e69de29b
diff --git a/template.json b/template.json
new file mode 100644
index 00000000..e69de29b