"""Crawl the Tencent Comics (ac.qq.com) listing pages into ``./qq.csv``.

Each listing page is fetched on its own thread (at most ``MAX_WORKERS``
in flight), split into per-comic items with regular expressions, and
appended to the output file as one comma-separated row per comic:
title, url, sign flag, exclusive flag, author, tags joined by ``#``, and
the double-quoted popularity figure.
"""

# Provenance: recovered from git patch
# fb2f791356c134d8a26817e0a69b3b67ba7fc62d (Tue, 29 Jun 2021), which added
# this script as a new 75-line file under NO10/.

import re
import threading

MAX_WORKERS = 5          # maximum number of pages fetched concurrently
FIRST_PAGE = 1           # first listing page to fetch
LAST_PAGE = 461          # last listing page to fetch (inclusive)
REQUEST_TIMEOUT = 10     # seconds before a stalled HTTP request is abandoned
OUTPUT_PATH = "./qq.csv"

# NOTE(review): in the copy of this script under review every HTML tag had
# been stripped out of the pattern literals (e.g. the title pattern was just
# '(.*?)' - one group - although the code reads group(2)).  The markup below
# is a reconstruction consistent with the surviving fragments and with how
# the groups are used; confirm each pattern against a live listing page.
_LI_PATTERN = re.compile(
    r'<li\sclass="ret-search-item clearfix">[\s\S]+?</li>')
_TITLE_URL_PATTERN = re.compile(
    r'<a href="(.*?)" target="_blank" title=".*?">(.*?)</a>')
_SIGN_PATTERN = re.compile(r'<i class="ui-icon-sign">签约</i>')
_EXCLUSIVE_PATTERN = re.compile(r'<i class="ui-icon-exclusive">独家</i>')
_AUTHOR_PATTERN = re.compile(
    r'<p class="ret-works-author" title=".*?">(.*?)</p>')
_TAGS_PATTERN = re.compile(r'<span href=".*?" target="_blank">(.*?)</span>')
_SCORE_PATTERN = re.compile(r'<span>人气:<em>(.*?)</em></span>')

# The reviewed copy replaced "," with an identical ",", which is a no-op and
# lets commas inside a field split the row.  Presumably a full-width comma
# was intended, mirroring the quote -> curly-quote replacement.
_CSV_LOOKALIKES = str.maketrans({",": ",", '"': "“"})

# Serialises appends to OUTPUT_PATH across worker threads.
lock = threading.Lock()
# Caps the number of concurrently running ``run`` workers; acquired by the
# driver before starting a thread and released by ``run`` when it finishes.
semaphore = threading.BoundedSemaphore(MAX_WORKERS)


def replace_mark(my_str: str) -> str:
    """Return *my_str* with ``,`` and ``"`` swapped for look-alike characters.

    The output rows are assembled by hand rather than with the ``csv``
    module, so the two characters that would break a row are replaced by a
    full-width comma and a left curly quote respectively.
    """
    return my_str.translate(_CSV_LOOKALIKES)


def _first_group(pattern: "re.Pattern[str]", text: str, default: str = "") -> str:
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    match = pattern.search(text)
    return match.group(1) if match else default


def format_html(html: str) -> None:
    """Extract every comic entry from *html* and append it to ``OUTPUT_PATH``.

    Entries without a recognisable title link are skipped; a missing author
    or popularity figure is written as an empty field instead of raising.
    Nothing is written (and the file is not created) when no entry is found.
    """
    rows = []
    for item in _LI_PATTERN.findall(html):
        title_url = _TITLE_URL_PATTERN.search(item)
        if title_url is None:
            # Without a title link there is nothing useful to record.
            continue
        url, title = title_url.groups()

        sign = 1 if _SIGN_PATTERN.search(item) is not None else 0
        exclusive = 1 if _EXCLUSIVE_PATTERN.search(item) is not None else 0
        author = _first_group(_AUTHOR_PATTERN, item)
        tags = _TAGS_PATTERN.findall(item)
        score = _first_group(_SCORE_PATTERN, item)

        rows.append(
            f'{replace_mark(title)},{url},{sign},{exclusive},{replace_mark(author)},{"#".join(tags)},"{replace_mark(score)}" \n')

    if not rows:
        return

    # ``with`` guarantees the lock is released even if the write fails, so a
    # single I/O error cannot deadlock the remaining workers.
    with lock, open(OUTPUT_PATH, "a+", encoding="utf-8") as f:
        f.writelines(rows)


def run(index: int) -> None:
    """Fetch listing page *index*, store its entries, and free a worker slot.

    The caller is expected to have acquired ``semaphore`` before starting
    this function; the slot is released here whether or not the page could
    be fetched and parsed.  Errors are not swallowed: they propagate out of
    the function after the slot has been released.
    """
    # Imported here rather than at module level so that the parsing helpers
    # above stay importable without the third-party network dependencies.
    import requests
    from fake_useragent import UserAgent

    try:
        ua = UserAgent(use_cache_server=False)
        response = requests.get(
            f"https://ac.qq.com/Comic/index/page/{index}",
            headers={'User-Agent': ua.random},
            # Without a timeout a stalled connection would hold its worker
            # slot forever.
            timeout=REQUEST_TIMEOUT,
        )
        format_html(response.text)
    finally:
        # Always hand the slot back; otherwise MAX_WORKERS failed pages
        # would block the driver loop in ``semaphore.acquire()`` for good.
        semaphore.release()


def main() -> None:
    """Fetch pages ``FIRST_PAGE``..``LAST_PAGE`` using bounded worker threads."""
    workers = []
    for index in range(FIRST_PAGE, LAST_PAGE + 1):
        print(f"正在抓取{index}")
        # Blocks until one of the MAX_WORKERS slots is free.
        semaphore.acquire()
        worker = threading.Thread(target=run, args=(index, ))
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()

    print("数据爬取完毕")


if __name__ == "__main__":
    main()