# index.py
# Author: 梦想橡皮擦
import os
import random
import threading
import time
from queue import Queue

import requests


def get_headers():
    """Build HTTP request headers with a randomly selected User-Agent.

    Returns:
        dict: Two entries -- ``User-Agent``, picked at random from a fixed
        set of search-engine crawler strings, and ``referer``, always
        ``https://sucai.gaoding.com/``.
    """
    spider_agents = (
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
    )
    return {
        'User-Agent': random.choice(spider_agents),
        'referer': 'https://sucai.gaoding.com/',
    }


# Producer thread
class Producer(threading.Thread):
    """Thread that pages through the template search API and queues image jobs.

    For every item found in a page's JSON response, a ``(title, img_url)``
    tuple is put on the queue.

    Args:
        t_name: Name assigned to the thread.
        queue: Queue that receives the ``(title, img_url)`` tuples.
        pages: Number of result pages to request, starting at page 1.
            Defaults to 100; pass a small value (e.g. 3) for a test run.
    """

    def __init__(self, t_name, queue, pages=100):
        super().__init__(name=t_name)
        self.data = queue
        self.pages = pages

    def run(self):
        """Fetch pages 1..``self.pages`` and enqueue every image found.

        A page that cannot be fetched or parsed is reported and skipped so
        that one bad response does not kill the thread.
        """
        for i in range(1, self.pages + 1):
            print("线程名: %s,序号:%d, 正在向队列写入数据 " % (self.name, i))
            url = 'https://api-sucai.gaoding.com/api/search-api/sucai/templates/search?q=&sort=&colors=&styles=&filter_id=1617130&page_size=100&page_num={}'.format(
                i)
            try:
                res = requests.get(url=url, headers=get_headers(), timeout=5)
            except requests.RequestException as exc:
                # Timeouts / connection errors: give up on this page only.
                print("线程名: %s,序号:%d, 请求失败: %s" % (self.name, i, exc))
                continue
            # A Response is falsy when the HTTP status is an error (4xx/5xx).
            if not res:
                continue
            try:
                data = res.json()
            except ValueError as exc:
                # Every JSON decode error raised by requests is a ValueError.
                print("线程名: %s,序号:%d, 响应不是合法的 JSON: %s" % (self.name, i, exc))
                continue
            if not isinstance(data, list):
                # The loop below expects a list of item mappings.
                print("线程名: %s,序号:%d, 响应格式不符合预期,已跳过" % (self.name, i))
                continue
            for item in data:
                try:
                    title = item["title"]
                    img_url = item["preview"]["url"]
                except (KeyError, TypeError, IndexError):
                    # Item lacks a title or preview URL; nothing to download.
                    continue
                self.data.put((title, img_url))
        print("%s: %s 写入完成!" % (time.ctime(), self.name))


# Consumer thread
class Consumer(threading.Thread):
    """Thread that downloads queued images and stores them under ``./imgs``.

    Queue items are ``(title, url)`` tuples. A ``None`` item is the stop
    signal: the thread exits its loop as soon as it dequeues one.

    Args:
        t_name: Name assigned to the thread.
        queue: Queue to read ``(title, url)`` tuples (or ``None``) from.
    """

    IMG_DIR = "./imgs"

    # Path separators, characters rejected in Windows file names, and ASCII
    # control characters (including NUL) are replaced with "_" so that a
    # remote-supplied title can neither escape IMG_DIR nor break open().
    _UNSAFE_CHARS = {ord(c): "_" for c in '\\/:*?"<>|'}
    _UNSAFE_CHARS.update({code: "_" for code in range(32)})

    def __init__(self, t_name, queue):
        super().__init__(name=t_name)
        self.data = queue

    def run(self):
        """Process queue items until a ``None`` sentinel is received."""
        while True:
            val = self.data.get()
            if val is None:
                # Stop signal: without it the blocking get() would wait forever.
                break
            print("线程名:%s,正在读取数据:%s" % (self.name, val))
            self._download(val)

    def _download(self, val):
        """Fetch one ``(title, url)`` item and write it to ``IMG_DIR/<title>.png``."""
        title, url = val
        try:
            res = requests.get(url=url, headers=get_headers(), timeout=5)
        except requests.RequestException as exc:
            # A failed download must not kill the thread; skip this image.
            print(f"{val}", "下载失败:", exc)
            return
        # A Response is falsy when the HTTP status is an error (4xx/5xx).
        if not res:
            return
        file_name = str(title).translate(self._UNSAFE_CHARS)
        try:
            # Create the target directory on demand so writes do not fail
            # just because it is missing.
            os.makedirs(self.IMG_DIR, exist_ok=True)
            with open(f"{self.IMG_DIR}/{file_name}.png", "wb") as f:
                f.write(res.content)
        except OSError as exc:
            print(f"{val}", "写入失败:", exc)
        else:
            print(f"{val}", "写入完毕")


# Main function
def main():
    """Run one producer and one consumer thread to completion.

    After the producer has finished, a ``None`` sentinel is queued so the
    consumer drains the remaining items and then exits, letting
    ``consumer.join()`` return.
    """
    queue = Queue()
    producer = Producer('生产者', queue)
    consumer = Consumer('消费者', queue)
    producer.start()
    consumer.start()
    producer.join()
    # Nothing more will be produced; tell the consumer to stop.
    queue.put(None)
    consumer.join()
    print('所有线程执行完毕')


# Call main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()