import requests
from fake_useragent import UserAgent
import re
import threading


def replace_mark(my_str):
    """Replace ASCII comma / double quote with full-width equivalents.

    Keeps scraped values from breaking the comma-separated qq.csv row
    format written by format_html().
    """
    return my_str.replace(",", ",").replace('"', "“")


def format_html(html):
    """Parse one comic-list page and append one CSV row per comic to ./qq.csv.

    NOTE(review): the regex patterns below appear to have lost their HTML
    tag text at some point (e.g. title_url_pattern contains a single group
    but group(2) is read) — verify them against the live page markup
    before relying on the output.
    """
    # One list item per comic (pattern presumably lost its HTML anchors).
    li_pattern = re.compile(
        '[\s\S]+?')
    title_url_pattern = re.compile(
        '(.*?)')
    sign_pattern = re.compile('签约')
    exclusive_pattern = re.compile('独家')
    # The original literal spanned several physical lines; written here
    # with explicit \n escapes so it is a valid Python string literal
    # carrying the same runtime value.
    author_pattern = re.compile('\n\n(.*?)\n\n')
    tags_pattern = re.compile('(.*?)')
    score_pattern = re.compile('人气:(.*?)')

    for item in li_pattern.findall(html):
        title_url = title_url_pattern.search(item)
        title = title_url.group(2)
        url = title_url.group(1)
        # 1 when the item carries the 签约 (signed) / 独家 (exclusive) badge.
        sign = 1 if sign_pattern.search(item) is not None else 0
        exclusive = 1 if exclusive_pattern.search(item) is not None else 0

        author = author_pattern.search(item).group(1)
        tags = tags_pattern.findall(item)
        score = score_pattern.search(item).group(1)

        # `with lock` guarantees release even if the write raises; the
        # original acquire()/release() pair would keep the lock held on
        # an exception and deadlock every other worker thread.
        with lock:
            with open("./qq.csv", "a+", encoding="utf-8") as f:
                f.write(
                    f'{replace_mark(title)},{url},{sign},{exclusive},{replace_mark(author)},{"#".join(tags)},"{replace_mark(score)}" \n')


def run(index):
    """Worker thread: fetch list page *index* and parse it into qq.csv."""
    # try/finally: without it, a failed request would leak the semaphore
    # slot and eventually stall the spawning loop in __main__ forever.
    try:
        ua = UserAgent(use_cache_server=False)
        response = requests.get(
            f"https://ac.qq.com/Comic/index/page/{index}",
            headers={'User-Agent': ua.random})
        format_html(response.text)
    finally:
        semaphore.release()


# Serializes appends to qq.csv across worker threads.
lock = threading.Lock()

if __name__ == "__main__":

    # Allow at most 5 page fetches in flight at any time.
    semaphore = threading.BoundedSemaphore(5)
    lst_record_threads = []
    for index in range(1, 462):
        print(f"正在抓取{index}")
        semaphore.acquire()
        t = threading.Thread(target=run, args=(index,))
        t.start()
        lst_record_threads.append(t)

    # Wait for every worker before declaring completion.
    for rt in lst_record_threads:
        rt.join()

    print("数据爬取完毕")