"""Sentence crawler for www.qunzou.com.

Walks the paginated listing under ``/xuexi/``, collects the article links
found on each listing page, then downloads every article and prints the
text of its paragraphs.
"""
import random
from itertools import islice


class Spider16:
    """Three-stage crawler: listing URLs -> article URLs -> article text.

    Attributes are plain lists so that each stage can be run (or seeded)
    independently:

    * ``wait_urls``    - listing pages still to be visited.
    * ``url_template`` - pattern used to build the URL of listing page *num*.
    * ``details``      - absolute URLs of the articles discovered so far.
    """

    # Search-engine style user agents; one is chosen at random per request.
    # Kept on the class so the collection is built once, not on every call.
    USER_AGENTS = (
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
    )

    def __init__(self):
        self.wait_urls = ["https://www.qunzou.com/xuexi/list_1_1.html"]
        self.url_template = "https://www.qunzou.com/xuexi/list_1_{num}.html"
        self.details = []

    def get_headers(self):
        """Return request headers with a randomly chosen user agent."""
        return {
            "user-agent": random.choice(self.USER_AGENTS),
            "referer": "https://www.baidu.com",
        }

    def _fetch_tree(self, url, encoding=None):
        """Download *url* and return its parsed HTML tree, or ``None``.

        ``None`` is returned when the response status is not OK or when the
        parser yields no document.  Network errors raised by ``requests``
        (timeouts, connection failures) are left to propagate.

        Args:
            url: Absolute URL to download.
            encoding: If given, overrides the encoding used to decode the
                response body.
        """
        # Imported here so the module (and the pure helpers such as
        # get_headers) stays importable without the HTTP/parsing stack.
        import requests
        from lxml import etree

        res = requests.get(url=url, headers=self.get_headers(), timeout=5)
        if encoding is not None:
            res.encoding = encoding
        if not res.ok:
            return None
        # NOTE(review): etree.HTML appears to return None for bodies without
        # any element; callers treat that the same as a failed request.
        return etree.HTML(res.text)

    def create_urls(self, last_page=None):
        """Fill ``wait_urls`` with every listing page from 1 to *last_page*.

        URLs already present in ``wait_urls`` (notably the seed page) are not
        added a second time, so the method is safe to call repeatedly.

        Args:
            last_page: Number of the final listing page.  When omitted it is
                read from the pagination block of the first listing page; if
                that block cannot be found, ``wait_urls`` is left unchanged.
        """
        if last_page is None:
            tree = self._fetch_tree(self.wait_urls[0])
            found = (
                tree.xpath("//span[@class='pageinfo']/strong[1]/text()")
                if tree is not None
                else []
            )
            if not found:
                # Page count unknown: keep crawling just the seed page.
                return
            last_page = found[0]

        known = set(self.wait_urls)
        for num in range(1, int(last_page) + 1):
            page_url = self.url_template.format(num=num)
            if page_url not in known:
                known.add(page_url)
                self.wait_urls.append(page_url)

    def get_html(self, max_pages=1):
        """Collect article URLs from the listing pages into ``details``.

        Args:
            max_pages: Maximum number of listing pages to visit.  The default
                of 1 keeps the test-mode behaviour of stopping after the
                first listing page; pass ``None`` to visit every page in
                ``wait_urls``.
        """
        for url in islice(self.wait_urls, max_pages):
            tree = self._fetch_tree(url)
            if tree is None:
                continue
            for href in tree.xpath("//div[@class='list']//h6/a/@href"):
                self.details.append(f"https://www.qunzou.com{href}")

    def get_detail(self):
        """Download every article in ``details`` and print its paragraphs."""
        for url in self.details:
            # Article pages are decoded as gb2312.
            tree = self._fetch_tree(url, encoding="gb2312")
            if tree is None:
                continue
            sentences = tree.xpath("//div[@id='content']//p/text()")
            print("\n".join(sentences))

    def run(self, max_pages=1):
        """Run the three stages in order.

        Args:
            max_pages: Forwarded to :meth:`get_html`.
        """
        self.create_urls()
        self.get_html(max_pages=max_pages)
        self.get_detail()


if __name__ == '__main__':
    s = Spider16()
    s.run()