"""Scrape the site ranking list of www.zhongjie.com with worker threads.

Producer threads download the paginated ranking pages and queue every detail
page link found on them; consumer threads download those detail pages and
print the title, link text and description extracted from each one.
"""
# Provenance: this module was delivered as a Git patch (commit 1548bd3d,
# author "hihell", dated 2021-09-03, subject roughly "intermediary website
# data") that creates a single new script under the NO28/ directory.

import random
import re
import threading
import time
from queue import Queue

import requests
from lxml import etree

BASE_URL = "https://www.zhongjie.com"
REQUEST_TIMEOUT = 5  # seconds, applied to every HTTP request
PRODUCER_COUNT = 2
CONSUMER_COUNT = 2

# Pool of crawler-style User-Agent strings; one is picked at random per request.
_USER_AGENTS = (
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
    "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
    "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
    "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
    "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
    "Sosospider+(+http://help.soso.com/webspider.htm)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
)

# First run of digits in a string; used to read the page number out of a link.
_PAGE_NUMBER_RE = re.compile(r"(\d+)")


def get_headers():
    """Return request headers carrying a randomly chosen User-Agent."""
    return {"user-agent": random.choice(_USER_AGENTS)}


def _fetch_tree(url):
    """Download *url* and return its parsed HTML tree, or None on failure.

    Request errors are reported on stdout instead of being raised so that one
    bad page does not terminate the worker thread that asked for it.
    """
    try:
        res = requests.get(url=url, headers=get_headers(), timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"request failed for {url}: {exc}")
        return None
    return etree.HTML(res.text)


def get_total_page():
    """Return the number of ranking pages, read from the first page.

    The count is the first run of digits in the href of the ``a.weiye``
    (last page) link.

    Raises:
        requests.RequestException: if the first ranking page cannot be fetched.
        ValueError: if the link is missing or contains no digits.
    """
    res = requests.get(
        f"{BASE_URL}/top/rank_all_1.html", headers=get_headers(), timeout=REQUEST_TIMEOUT)
    element = etree.HTML(res.text)
    hrefs = element.xpath("//a[@class='weiye']/@href")
    if not hrefs:
        raise ValueError("last-page link (a.weiye) not found on the first ranking page")
    match = _PAGE_NUMBER_RE.search(hrefs[0])
    if match is None:
        raise ValueError(f"no page number found in last-page link {hrefs[0]!r}")
    return int(match.group(1))


# Producer
def producer():
    """Take ranking page URLs from ``urls`` and queue the detail links they list.

    Runs until it receives ``None`` from the module-level ``urls`` queue. Every
    ``a.copyright_title`` href on a page is prefixed with the site root and put
    on the module-level ``wait_list_urls`` queue.
    """
    while True:
        # Take one ranking page URL (or the shutdown sentinel).
        url = urls.get()
        try:
            if url is None:
                break
            element = _fetch_tree(url)
            if element is None:
                # Download failed or nothing could be parsed; skip this page.
                continue
            for href in element.xpath('//a[@class="copyright_title"]/@href'):
                wait_list_urls.put(BASE_URL + href)
        finally:
            # Mark the item done only once it has actually been handled.
            urls.task_done()


# Consumer
def consumer():
    """Take detail page URLs from ``wait_list_urls`` and print their fields.

    Runs until it receives ``None`` from the module-level ``wait_list_urls``
    queue. For each page the title, link text and description found under
    ``div.info-head-l`` are printed as three lists.
    """
    while True:
        url = wait_list_urls.get()
        try:
            if url is None:
                break
            element = _fetch_tree(url)
            if element is None:
                # Download failed or nothing could be parsed; skip this page.
                continue
            title = element.xpath('//div[@class="info-head-l"]/h1/text()')
            link = element.xpath('//div[@class="info-head-l"]/p[1]/a/text()')
            description = element.xpath('//div[@class="info-head-l"]/p[2]/text()')
            print(title, link, description)
        finally:
            wait_list_urls.task_done()


if __name__ == "__main__":

    # Queue of ranking page URLs for the producers.
    urls = Queue(maxsize=0)
    last_page = get_total_page()
    for page_no in range(1, last_page + 1):
        urls.put(f"{BASE_URL}/top/rank_all_{page_no}.html")
    # One sentinel per producer so each of them leaves its loop once the
    # real work has been drained.
    for _ in range(PRODUCER_COUNT):
        urls.put(None)

    # Queue of detail page URLs handed from producers to consumers.
    wait_list_urls = Queue(maxsize=0)

    # Start the producer threads.
    producers = [threading.Thread(target=producer) for _ in range(PRODUCER_COUNT)]
    for worker in producers:
        worker.start()

    # Start the consumer threads.
    consumers = [threading.Thread(target=consumer) for _ in range(CONSUMER_COUNT)]
    for worker in consumers:
        worker.start()

    # Consumers may only be told to stop after every producer has finished,
    # otherwise detail links queued late would never be processed.
    for worker in producers:
        worker.join()
    for _ in range(CONSUMER_COUNT):
        wait_list_urls.put(None)
    for worker in consumers:
        worker.join()