Commit b0f61b78 authored by 梦想橡皮擦

孔夫子旧书网 (Kongfuzi used-book site)

Parent e945c38a
import os
import random
import time

import requests
from lxml import etree


# Page collection class
class SSS:
    def __init__(self):
        self.url_format = 'https://book.kongfz.com/C{}/v6w{}/'
        # Categories to crawl; extend as needed
        self.types = ["wenxue", "xiaoshuo"]
        self.session = requests.Session()
        self.headers = self.get_headers()
        self.categorys = []

    def get_categorys(self):
        with self.session.get(url='https://book.kongfz.com/Cfalv/', headers=self.headers) as res:
            if res:
                html = etree.HTML(res.text)
                # cssselect() requires the cssselect package to be installed
                items = html.cssselect('.tushu div.link-item a')
                # Pull the category slug out of each link's URL
                for item in items:
                    href = item.get("href")
                    book_type = href[href.find('C') + 1:-1]
                    self.categorys.append(book_type)

    def get_headers(self):
        # Pose as a search-engine spider via a randomly chosen User-Agent
        uas = [
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
        ]
        ua = random.choice(uas)
        headers = {
            "user-agent": ua,
            "referer": "https://www.baidu.com"
        }
        return headers

    def get_detail(self, book_type, page):
        with self.session.get(url=self.url_format.format(book_type, page), headers=self.headers, timeout=5) as res:
            if res.text:
                with open(f"./孔夫子/{book_type}_{page}.html", "w+", encoding="utf-8") as f:
                    f.write(res.text)
            else:
                # Empty body: request the page again (note: retries are unbounded)
                print(f"Page {page} returned no data, retrying")
                self.get_detail(book_type, page)

    def run(self):
        os.makedirs("./孔夫子", exist_ok=True)  # make sure the output directory exists
        pagesize = 5
        for book_type in self.types:
            for page in range(1, pagesize + 1):
                self.get_detail(book_type, page)
                time.sleep(2)
                print(f"Category {book_type}, page {page} saved!")
# Data extraction class
class Analysis:
    def __init__(self):
        # Categories to parse; extend as needed
        self.types = ["wenxue", "xiaoshuo"]

    # Strip special characters
    def remove_character(self, origin_str):
        if origin_str is None:
            return None
        origin_str = origin_str.replace('\n', '')
        # Swap ASCII commas for full-width ones
        origin_str = origin_str.replace(',', ',')
        return origin_str

    def format(self, text):
        html = etree.HTML(text)
        # Every item block in the listing
        div_books = html.cssselect('div#listBox>div.item')
        for book in div_books:
            # The title sits in the title attribute
            title = book.cssselect('div.item-info>div.title')[0].get('title')
            # Author defaults to None
            author = None
            author_div = book.cssselect('div.item-info>div.zl-isbn-info>span:nth-child(1)')
            if len(author_div) > 0:
                author = author_div[0].text
            # Same treatment for the publisher
            publisher = None
            publisher_div = book.cssselect('div.item-info>div.zl-isbn-info>span:nth-child(2)')
            if len(publisher_div) > 0:
                # Extract the field and slice off its label
                publisher = publisher_div[0].text.split(' ')[1]
            # Combined record
            print(title, author, publisher)

    def run(self):
        pagesize = 5
        for book_type in self.types:
            for page in range(1, pagesize + 1):
                with open(f"./孔夫子/{book_type}_{page}.html", "r", encoding="utf-8") as f:
                    text = f.read()
                self.format(text)


if __name__ == '__main__':
    # Uncomment whichever stage you want to run
    # Collect the raw pages:
    # s = SSS()
    # s.run()
    # s.get_categorys()
    # Extract the data:
    a = Analysis()
    a.run()
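A side note on the extraction step: remove_character swaps ASCII commas for full-width ones, which suggests the parsed rows were meant for CSV output, yet format() only prints them. A minimal sketch of such a sink, assuming the same (title, author, publisher) triples; the save_rows name and books.csv path are hypothetical:

import csv

def save_rows(rows, path="books.csv"):
    # Append (title, author, publisher) triples to a CSV file;
    # newline="" avoids blank lines between rows on Windows.
    with open(path, "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(rows)

Collecting each triple into a list inside format() and handing it to save_rows would replace the bare print call.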
@@ -22,4 +22,11 @@ The 120 Python crawler examples officially begin
12. [Earn 100 yuan in an hour: a certain group member scraped 20,000+ comic-convention history records over a weekend, with zero technical difficulty](https://blog.csdn.net/hihell/article/details/118485941)
13. [A programmer who doesn't know hanfu? Can't let others look down on us, so let's first eyeball "big data" across ten thousand outfit photos](https://dream.blog.csdn.net/article/details/118541741)
14. [After an old friend (a developer) was laid off and wanted to franchise a snack shop, I scraped some data with Python as a small gesture](https://dream.blog.csdn.net/article/details/118706925)
15. [A big job: scraping 8 proxy-IP sites to pave the way for a Python proxy pool, example 15 of the 120 crawler examples](https://dream.blog.csdn.net/article/details/119137580)
16. [Extremely tricky encoding: downloading HD Genshin Impact character art and lossless Chinese/Japanese voice lines, crawler example 16 / 120](https://dream.blog.csdn.net/article/details/111028288)
17. [Example 17 of the 120 crawler examples: collecting all sorts of brilliant sentences with object-oriented Python](https://dream.blog.csdn.net/article/details/119632820)
18. [Common operations of the requests and lxml libraries, organized and summarized: a recap installment of the 120 crawler examples](https://dream.blog.csdn.net/article/details/119633672)
19. [Regular expressions and XPath syntax explained in detail: how should a beginner study them?](https://dream.blog.csdn.net/article/details/119633700)
20. [Python crawler example 20 of 120: full-site franchise data collection from the 1637 一路商机网 site](https://dream.blog.csdn.net/article/details/119850647)
21. [Data collection from the Kongfuzi used-book site (孔夫子旧书网): learn crawlers by analogy, Python crawler example 21 of 120](https://dream.blog.csdn.net/article/details/119878744)