话题广场.py
from queue import Queue, Empty
import time
import threading
import requests
from lxml import etree
import random


def get_headers():
    # Pool of search-engine crawler User-Agent strings; one is picked
    # at random per request to vary the client fingerprint
    uas = [
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
    ]
    ua = random.choice(uas)
    headers = {
        "user-agent": ua
    }
    return headers
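
# Each requests.get below raises on timeout or connection errors, which
# would silently kill its worker thread. A minimal retry wrapper, as a
# sketch (fetch() is a hypothetical helper, not wired into the code):
def fetch(url, timeout=5, retries=3):
    for attempt in range(retries):
        try:
            return requests.get(url, headers=get_headers(), timeout=timeout)
        except requests.RequestException:
            time.sleep(2 ** attempt)  # back off 1s, 2s, 4s between tries
    return None
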


# Seed the queue with the 10 hot-topic list pages to be scraped
hot_subjects = Queue(maxsize=0)
for i in range(1, 11):
    url = f'https://www.jisilu.cn/topic/square/id-hot__feature_id-__page-{i}'
    hot_subjects.put(url)


# Queue of topic ids handed from producers to consumers
q_data_ids = Queue(maxsize=0)

# Producer: take one list-page URL and extract every topic's data-id


def producer():
    while True:
        # get_nowait() avoids the qsize()/get() race between threads
        try:
            list_url = hot_subjects.get_nowait()
        except Empty:
            break  # all list pages have been claimed

        print("Parsing:", list_url)
        res = requests.get(list_url, headers=get_headers(), timeout=3)
        element = etree.HTML(res.text)
        # Each topic link exposes its numeric id via the data-id attribute
        data_ids = element.xpath('//a[@class="aw-topic-name"]/@data-id')
        for data_id in data_ids:
            q_data_ids.put(data_id)


# Consumer: page through one topic's discussion feed until the ajax
# endpoint returns an empty body, appending title/author/url rows to a CSV
def consumer():

    while True:
        # Take one topic id; a None sentinel signals shutdown
        data_id = q_data_ids.get()
        if data_id is None:
            break

        page = 1
        while True:
            url = f'https://www.jisilu.cn/question/ajax/discuss/sort_type-new__topic_id-{data_id}__page-{page}'
            res = requests.get(url=url, headers=get_headers(), timeout=5)
            print(res.url)
            # An empty response body marks the page past the last one
            if len(res.text) == 0:
                break

            element = etree.HTML(res.text)
            titles = element.xpath('//h4/a/text()')
            urls = element.xpath('//h4/a/@href')
            names = element.xpath('//a[@class="aw-user-name"]/text()')
            rows = [f"{title},{name},{href}\n"
                    for title, name, href in zip(titles, names, urls)]
            with open("./data.csv", "a+", encoding="utf-8") as f:
                f.write("".join(rows))
            page += 1
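
# The comma-joined rows above break if a title itself contains a comma.
# A quoting-safe variant of the save step, sketched with the standard
# csv module (save_rows is a hypothetical helper, not used above):
def save_rows(rows, path="./data.csv"):
    import csv  # would normally sit with the other top-level imports
    with open(path, "a+", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(rows)  # fields containing commas get quoted

# e.g. save_rows(list(zip(titles, names, urls)))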


# Start 2 producer threads
producers = [threading.Thread(target=producer) for _ in range(2)]
for p in producers:
    p.start()


# Start 2 consumer threads
consumers = [threading.Thread(target=consumer) for _ in range(2)]
for c in consumers:
    c.start()


# Once the producers finish, push one None sentinel per consumer so
# every consumer's `data_id is None` check fires and the script exits
for p in producers:
    p.join()
for _ in consumers:
    q_data_ids.put(None)
for c in consumers:
    c.join()
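
# For reference, the same fan-out/fan-in can be written with the standard
# concurrent.futures pool; a commented-out sketch of that alternative
# (kept inert so the script above still runs exactly once):
#
# from concurrent.futures import ThreadPoolExecutor, wait
# with ThreadPoolExecutor(max_workers=4) as pool:
#     prod_futures = [pool.submit(producer) for _ in range(2)]
#     cons_futures = [pool.submit(consumer) for _ in range(2)]
#     wait(prod_futures)           # all list pages parsed
#     for _ in cons_futures:
#         q_data_ids.put(None)     # one sentinel per consumer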