# Source file: 孔夫子旧书网.py (4.7 KB), committed by 梦想橡皮擦.
import os
import random
import time

import requests
from lxml.html import etree


class SSS:
    """Downloader for kongfz.com category listing pages.

    Fetches the listing pages of every category in ``self.types`` and stores
    the raw HTML of each page as ``./孔夫子/{type}_{page}.html``.
    """

    def __init__(self):
        # Listing URL template: first slot is the category, second the page.
        self.url_format = 'https://book.kongfz.com/C{}/v6w{}/'
        # Categories to crawl; extend as needed.
        self.types = ["wenxue", "xiaoshuo"]
        self.session = requests.Session()
        self.headers = self.get_headers()
        # Category codes collected by get_categorys().
        self.categorys = []

    def get_categorys(self):
        """Collect category codes from the category index page.

        For every matching link, the href is sliced from just after its
        first ``C`` up to (not including) its last character, and the result
        is appended to ``self.categorys``. Links without an href, or whose
        href contains no ``C``, are skipped.

        Returns:
            The ``self.categorys`` list (unchanged when the response is
            falsy or the page cannot be parsed).
        """
        with self.session.get(url='https://book.kongfz.com/Cfalv/', headers=self.headers, timeout=5) as res:
            # A requests Response is falsy when its status is not OK.
            if not res:
                return self.categorys
            html = etree.HTML(res.text)

        if html is None:
            return self.categorys

        for link in html.cssselect('.tushu div.link-item a'):
            href = link.get("href")
            marker = href.find('C') if href else -1
            if marker == -1:
                continue
            self.categorys.append(href[marker + 1:-1])
        return self.categorys

    def get_headers(self):
        """Build request headers with a randomly chosen crawler user agent.

        Returns:
            A dict with ``user-agent`` and ``referer`` entries.
        """
        uas = [
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
        ]
        return {
            "user-agent": random.choice(uas),
            "referer": "https://www.baidu.com"
        }

    def get_detail(self, type, page, max_retries=3):
        """Download one listing page and save its HTML to disk.

        An empty response body is retried immediately, up to ``max_retries``
        extra attempts. The output directory is created when missing.

        Args:
            type: Category code inserted into the listing URL.
            page: Page number inserted into the listing URL.
            max_retries: Extra attempts allowed after an empty response.

        Returns:
            True when the page was saved, False when every attempt came
            back empty.
        """
        url = self.url_format.format(type, page)
        target = f"./孔夫子/{type}_{page}.html"

        for attempt in range(max_retries + 1):
            with self.session.get(url=url, headers=self.headers, timeout=5) as res:
                text = res.text
            if text:
                os.makedirs(os.path.dirname(target), exist_ok=True)
                with open(target, "w+", encoding="utf-8") as f:
                    f.write(text)
                return True
            if attempt < max_retries:
                # No data came back: request the same page again.
                print(f"页码{page}请求异常,重新请求")

        print(f"页码{page}重试{max_retries}次后仍无数据,已跳过")
        return False

    def run(self):
        """Download pages 1 through ``pagesize - 1`` of every category.

        Sleeps two seconds after each request and reports each page that was
        saved successfully.
        """
        pagesize = 5
        for category in self.types:
            for page in range(1, pagesize):
                saved = self.get_detail(category, page)
                time.sleep(2)
                if saved:
                    print(f"分类:{category},页码:{page}页面储存完毕!")


# 数据提取类
class Analysis:
    """Extract title, author and publisher from saved listing pages.

    Reads the HTML files named ``./孔夫子/{type}_{page}.html`` for every
    category in ``self.types`` and prints one line per book item found.
    """

    def __init__(self):
        # Categories to parse; extend as needed.
        self.types = ["wenxue", "xiaoshuo"]

    def remove_character(self, origin_str):
        """Strip newline characters from ``origin_str``.

        Args:
            origin_str: Text to clean, or None.

        Returns:
            The cleaned string, or None when ``origin_str`` is None.
        """
        if origin_str is None:
            return None
        origin_str = origin_str.replace('\n', '')
        # NOTE(review): as written this replaces a comma with the identical
        # comma, so it changes nothing. One side was presumably meant to be
        # a different character (e.g. a full-width comma) - confirm intent.
        origin_str = origin_str.replace(',', ',')
        return origin_str

    def format(self, text):
        """Parse one listing page and print each book's fields.

        Fields that cannot be found are reported as None instead of raising.

        Args:
            text: HTML source of a listing page.

        Returns:
            A list of ``(title, author, publisher)`` tuples, one per item;
            empty when ``text`` is empty or cannot be parsed.
        """
        if not text or not text.strip():
            return []
        html = etree.HTML(text)
        if html is None:
            return []

        rows = []
        # Every book sits in its own item div inside the list box.
        for book in html.cssselect('div#listBox>div.item'):
            # The title is read from the ``title`` attribute of the title div.
            title_nodes = book.cssselect('div.item-info>div.title')
            title = title_nodes[0].get('title') if title_nodes else None

            # Author: text of the first span, None when the span is absent.
            author = None
            author_nodes = book.cssselect('div.item-info>div.zl-isbn-info>span:nth-child(1)')
            if author_nodes:
                author = author_nodes[0].text

            # Publisher: second space-separated token of the second span.
            publisher = None
            publisher_nodes = book.cssselect('div.item-info>div.zl-isbn-info>span:nth-child(2)')
            if publisher_nodes:
                tokens = (publisher_nodes[0].text or '').split(' ')
                if len(tokens) > 1:
                    publisher = tokens[1]

            print(title, author, publisher)
            rows.append((title, author, publisher))
        return rows

    def run(self):
        """Parse pages 1 through ``pagesize - 1`` of every category.

        A page file that does not exist is reported and skipped so the
        remaining pages are still processed.
        """
        pagesize = 5
        for category in self.types:
            for page in range(1, pagesize):
                path = f"./孔夫子/{category}_{page}.html"
                try:
                    with open(path, "r", encoding="utf-8") as f:
                        text = f.read()
                except FileNotFoundError:
                    print(f"文件{path}不存在,已跳过")
                    continue
                self.format(text)


if __name__ == '__main__':
    # Scraping step: uncomment the lines for the part you want to run.
    # s = SSS()
    # s.run()
    # s.get_categorys()

    # Extraction step: parse the pages that were saved to disk.
    Analysis().run()