# NO29/index.py — multi-threaded scraper for sucai.gaoding.com design templates.
#
# Reconstructed from a line-mangled git patch: the email/diff headers and the
# leading "+" diff markers were stripped so the module is runnable again.
#
# Fixes over the patched original:
#   * Producer now pushes a None sentinel when it finishes, and Consumer breaks
#     on it — previously Consumer looped forever on a blocking get(), so
#     consumer.join() in main() never returned and the process hung.
#   * Network errors in either thread are caught and logged instead of killing
#     the thread; file-write errors are logged instead of silently swallowed.
#   * Deprecated Thread.getName() replaced by the .name property (same value).

import random
import threading
import time
from queue import Queue

import requests  # third-party: pip install requests


def get_headers():
    """Return request headers with a randomly chosen crawler User-Agent.

    Rotating among well-known spider UA strings lowers the chance of being
    rate-limited; the referer matches the target site.
    """
    user_agent_list = [
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
    ]
    return {
        'User-Agent': random.choice(user_agent_list),
        'referer': 'https://sucai.gaoding.com/',
    }


# Producer thread: fetches search-result pages and enqueues (title, img_url).
class Producer(threading.Thread):
    def __init__(self, t_name, queue):
        threading.Thread.__init__(self, name=t_name)
        self.data = queue  # shared work queue of (title, img_url) tuples

    def run(self):
        """Crawl pages 1..100 of the search API and enqueue image jobs.

        Shrink the range (e.g. to 3 pages) for a quick test run.  A None
        sentinel is enqueued at the end so the consumer knows to stop.
        """
        for i in range(1, 101):
            print("线程名: %s,序号:%d, 正在向队列写入数据 " % (self.name, i))
            url = 'https://api-sucai.gaoding.com/api/search-api/sucai/templates/search?q=&sort=&colors=&styles=&filter_id=1617130&page_size=100&page_num={}'.format(
                i)
            try:
                res = requests.get(url=url, headers=get_headers(), timeout=5)
            except requests.RequestException as e:
                # Log and keep crawling the remaining pages.
                print("请求失败: %s" % e)
                continue
            if res:  # falsy on 4xx/5xx status codes
                data = res.json()
                # NOTE(review): assumes the endpoint returns a JSON list of
                # objects each carrying "title" and "preview"."url" — confirm
                # against the live API response.
                for item in data:
                    title = item["title"]
                    img_url = item["preview"]["url"]
                    self.data.put((title, img_url))
        # Sentinel: tells the consumer there is no more work.
        self.data.put(None)
        print("%s: %s 写入完成!" % (time.ctime(), self.name))


# Consumer thread: downloads each queued image and writes it to ./imgs/.
class Consumer(threading.Thread):
    def __init__(self, t_name, queue):
        threading.Thread.__init__(self, name=t_name)
        self.data = queue  # shared work queue fed by Producer

    def run(self):
        """Drain the queue until the None sentinel arrives."""
        while True:
            val = self.data.get()
            if val is None:
                break  # producer finished — exit so main()'s join() returns
            print("线程名:%s,正在读取数据:%s" % (self.name, val))
            title, url = val
            try:
                res = requests.get(url=url, headers=get_headers(), timeout=5)
            except requests.RequestException as e:
                print("下载失败: %s" % e)
                continue
            if res:
                try:
                    # NOTE(review): title is used verbatim as a filename; titles
                    # containing path separators or other illegal characters
                    # will raise here — consider sanitizing.
                    with open(f"./imgs/{title}.png", "wb") as f:
                        f.write(res.content)
                    print(f"{val}", "写入完毕")
                except OSError as e:
                    # Log instead of silently swallowing the failure.
                    print("保存失败: %s" % e)


# Entry point: one producer, one consumer, wait for both.
def main():
    queue = Queue()
    producer = Producer('生产者', queue)
    consumer = Consumer('消费者', queue)
    producer.start()
    consumer.start()
    producer.join()
    consumer.join()
    print('所有线程执行完毕')


if __name__ == '__main__':
    main()
[全国美容大夫数据采集数据(花容网 huaroo 公开数据),爬虫120例之26例](https://dream.blog.csdn.net/article/details/119914401) 27. [一个站点不够学?那就再用Python增加一个采集目标,一派话题广场+某金融论坛话题广场爬虫](https://dream.blog.csdn.net/article/details/119914560) -28. [域名中介数据采集,待发布] -29. [稿定设计数据采集,待发布] +28. [Python爬虫采集,中介网互联网网站排行榜, 样本数量:58341](https://dream.blog.csdn.net/article/details/119941727) +29. [用Python保住“设计大哥”的头发,直接甩给他10000张参考图,爬虫采集【稿定设计】平面模板素材](https://dream.blog.csdn.net/article/details/120010272) + +### requests-html 库学习 + +30. [外网站点排行榜数据采集] + -- GitLab