# 句子网.py — sentence crawler for qunzou.com
# (removed copy-paste residue from the git web UI: file size, "Newer Older",
#  author avatar line, and the gutter line-number run — none of it is Python)
import requests
from lxml import etree
import random


class Spider16:
    """Crawler for https://www.qunzou.com/xuexi.

    Workflow (see ``run``): discover how many listing pages exist, collect
    the detail-page links from every listing page, then fetch each detail
    page and print the sentences found in its ``#content`` block.
    """

    # Crawler/search-engine user-agent strings; one is picked at random per
    # request to vary the fingerprint. Hoisted to a class attribute so the
    # list is built once instead of on every get_headers() call.
    UAS = [
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
    ]

    def __init__(self):
        # Seed URL: first listing page, used by create_urls() to read the
        # total page count.
        self.wait_urls = ["https://www.qunzou.com/xuexi/list_1_1.html"]
        self.url_template = "https://www.qunzou.com/xuexi/list_1_{num}.html"
        # Absolute detail-page URLs discovered on the listing pages.
        self.details = []

    def get_headers(self):
        """Return request headers with a randomly chosen user-agent."""
        return {
            "user-agent": random.choice(self.UAS),
            "referer": "https://www.baidu.com",
        }

    def create_urls(self):
        """Read the total page count from the seed page and build all listing URLs."""
        res = requests.get(url=self.wait_urls[0], headers=self.get_headers(), timeout=5)
        html = etree.HTML(res.text)
        # Total page count, taken from <span class="pageinfo"><strong>N</strong>…
        nodes = html.xpath("//span[@class='pageinfo']/strong[1]/text()")
        last_page = int(nodes[0]) if nodes else 1  # fall back to the seed page only

        # Rebuild the list from the template instead of appending to the seed:
        # the original appended pages 1..N on top of the seed URL, so page 1
        # was crawled twice.
        self.wait_urls = [
            self.url_template.format(num=i) for i in range(1, last_page + 1)
        ]

    def get_html(self):
        """Visit every listing page and collect absolute detail-page URLs."""
        for url in self.wait_urls:
            try:
                res = requests.get(url, headers=self.get_headers(), timeout=5)
            except requests.RequestException:
                continue  # best-effort crawl: skip unreachable listing pages
            if res:
                html = etree.HTML(res.text)
                for link in html.xpath("//div[@class='list']//h6/a/@href"):
                    self.details.append(f"https://www.qunzou.com{link}")
                # NOTE: removed a leftover debug `return` that stopped the
                # crawl after the very first link.

    def get_detail(self):
        """Fetch every detail page and print the sentences in its #content div."""
        for url in self.details:
            try:
                res = requests.get(url, headers=self.get_headers(), timeout=5)
            except requests.RequestException:
                continue  # skip unreachable detail pages
            # "gbk" is a superset of gb2312: same decoding for gb2312 pages,
            # but characters outside the gb2312 range no longer mojibake.
            res.encoding = "gbk"
            if res:
                html = etree.HTML(res.text)
                sentences = html.xpath("//div[@id='content']//p/text()")
                print("\n".join(sentences))

    def run(self):
        """Entry point: build the URL list, collect detail links, print sentences."""
        self.create_urls()
        self.get_html()
        self.get_detail()


if __name__ == "__main__":
    # Script entry point: instantiate the crawler and run the full pipeline.
    spider = Spider16()
    spider.run()