From fb2f791356c134d8a26817e0a69b3b67ba7fc62d Mon Sep 17 00:00:00 2001
From: hjCodeCloud <7482185+hjcodecloud@user.noreply.gitee.com>
Date: Tue, 29 Jun 2021 17:38:21 +0800
Subject: [PATCH] =?UTF-8?q?=E8=85=BE=E8=AE=AF=E5=8A=A8=E6=BC=AB=E7=88=AC?=
=?UTF-8?q?=E8=99=AB=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
...54\350\231\253\344\273\243\347\240\201.py" | 75 +++++++++++++++++++
1 file changed, 75 insertions(+)
create mode 100644 "NO10/\350\205\276\350\256\257\345\212\250\346\274\253\347\210\254\350\231\253\344\273\243\347\240\201.py"
diff --git "a/NO10/\350\205\276\350\256\257\345\212\250\346\274\253\347\210\254\350\231\253\344\273\243\347\240\201.py" "b/NO10/\350\205\276\350\256\257\345\212\250\346\274\253\347\210\254\350\231\253\344\273\243\347\240\201.py"
new file mode 100644
index 0000000..fa94a2a
--- /dev/null
+++ "b/NO10/\350\205\276\350\256\257\345\212\250\346\274\253\347\210\254\350\231\253\344\273\243\347\240\201.py"
@@ -0,0 +1,75 @@
+import requests
+from fake_useragent import UserAgent
+import re
+import threading
+
+
+def replace_mark(my_str):
+    """Return *my_str* with CSV-significant punctuation swapped out.
+
+    The rows written by ``format_html`` are assembled by hand with ``,`` as
+    the field separator and ``"`` as a quote character, so either character
+    inside a field would corrupt the row.
+
+    Args:
+        my_str: Field text to sanitise.
+
+    Returns:
+        A copy of the text in which every ASCII comma is replaced by the
+        full-width comma (U+FF0C) and every ASCII double quote by ``“``.
+    """
+    # The comma target is written as an escape so that width-normalising
+    # tools cannot silently turn it back into an ASCII comma, which would
+    # make this replacement a no-op.
+    return my_str.replace(",", "\uff0c").replace('"', "“")
+
+
+def format_html(html):
+ """Extract comic records from one listing page and append them to ./qq.csv.
+
+ Every chunk of *html* matched by ``li_pattern`` is treated as one record.
+ A row is written per record with the fields, in order: title, url,
+ sign flag, exclusive flag, author, tags joined with "#", and the score
+ wrapped in double quotes. Title, author and score are passed through
+ ``replace_mark`` before being written.
+
+ Args:
+ html: Text of a listing page.
+ """
+ # NOTE(review): the pattern literals below look like they lost their HTML
+ # markup in this copy of the patch (two of them are even split across
+ # lines), and title_url_pattern shows a single capture group although
+ # group(2) is read further down. Restore them from the original file
+ # before relying on this function.
+ li_pattern = re.compile(
+ '
[\s\S]+?')
+ title_url_pattern = re.compile(
+ '(.*?)')
+ sign_pattern = re.compile('签约')
+ exclusive_pattern = re.compile('独家')
+ author_pattern = re.compile(
+ '(.*?)
')
+ tags_pattern = re.compile('(.*?)')
+ score_pattern = re.compile('人气:(.*?)')
+ items = li_pattern.findall(html)
+ for item in items:
+ # NOTE(review): search() returns None when nothing matches, so the
+ # .group() calls here and below raise AttributeError on an item that
+ # lacks the expected markup.
+ title_url = title_url_pattern.search(item)
+ title = title_url.group(2)
+ url = title_url.group(1)
+ # Flags are 1 when the corresponding marker text is found in the item,
+ # otherwise 0.
+ sign = 0
+ exclusive = 0
+ if sign_pattern.search(item) is not None:
+ sign = 1
+ if exclusive_pattern.search(item) is not None:
+ exclusive = 1
+
+ author = author_pattern.search(item).group(1)
+
+ tags = tags_pattern.findall(item)
+
+ score = score_pattern.search(item).group(1)
+ # The module-level lock is held around the append to the CSV file.
+ # NOTE(review): acquire()/release() are not wrapped in try/finally (or
+ # "with lock:"), so an exception while opening or writing the file
+ # leaves the lock held.
+ lock.acquire()
+ with open("./qq.csv", "a+", encoding="utf-8") as f:
+ f.write(
+ f'{replace_mark(title)},{url},{sign},{exclusive},{replace_mark(author)},{"#".join(tags)},"{replace_mark(score)}" \n')
+
+ lock.release()
+
+
+def run(index):
+    """Download one listing page and pass its HTML to ``format_html``.
+
+    The main loop acquires one slot of the global ``semaphore`` before
+    starting this function in a thread; the slot is handed back here.
+
+    Args:
+        index: Page number substituted into the listing URL.
+
+    Raises:
+        requests.RequestException: If the request fails or times out. The
+            semaphore slot is still released in that case.
+    """
+    try:
+        ua = UserAgent(use_cache_server=False)
+        response = requests.get(
+            f"https://ac.qq.com/Comic/index/page/{index}",
+            headers={'User-Agent': ua.random},
+            # Without a timeout a stalled connection would keep this
+            # worker, and its semaphore slot, blocked indefinitely.
+            timeout=30,
+        )
+        format_html(response.text)
+    finally:
+        # Release even on failure: a leaked slot is never recovered, and
+        # once all slots leak the main loop blocks forever in acquire().
+        semaphore.release()
+
+
+# Held by format_html around each append to the CSV file.
+lock = threading.Lock()
+# Limits the number of pages being fetched at once to five. Defined at module
+# level, like ``lock``, because ``run`` releases it as a module global; if it
+# only existed under the __main__ guard, importing this module and calling
+# ``run`` would fail with NameError.
+semaphore = threading.BoundedSemaphore(5)
+
+if __name__ == "__main__":
+    lst_record_threads = []
+    for index in range(1, 462):
+        print(f"正在抓取{index}")
+        # Blocks until a slot is free; the worker gives the slot back when
+        # it finishes.
+        semaphore.acquire()
+        t = threading.Thread(target=run, args=(index, ))
+        t.start()
+        lst_record_threads.append(t)
+
+    # Wait for every worker so the completion message is printed only after
+    # all pages have been processed.
+    for rt in lst_record_threads:
+        rt.join()
+
+    print("数据爬取完毕")
--
GitLab