import os
import random
import time

import requests
from lxml import etree  # element.cssselect() additionally requires the cssselect package
class SSS:
    def __init__(self):
        self.url_format = 'https://book.kongfz.com/C{}/v6w{}/'
        # Categories to scrape; extend as needed
        self.types = ["wenxue", "xiaoshuo"]
        self.session = requests.Session()
        self.headers = self.get_headers()
        self.categorys = []
        # Make sure the output directory exists before pages are saved
        os.makedirs("./孔夫子", exist_ok=True)
    def get_categorys(self):
        with self.session.get(url='https://book.kongfz.com/Cfalv/', headers=self.headers) as res:
            if res:
                html = etree.HTML(res.text)
                items = html.cssselect('.tushu div.link-item a')
                # Extract the category slug from each link's URL
                for item in items:
                    href = item.get("href")
                    type = href[href.find('C') + 1:-1]
                    self.categorys.append(type)
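    # get_categorys() collects every category slug from the /Cfalv/ landing
    # page into self.categorys; wiring that list into run() in place of the
    # hard-coded self.types would cover all categories (left as-is here).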
    def get_headers(self):
        # Pool of well-known search-engine crawler User-Agents
        uas = [
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
        ]
        ua = random.choice(uas)
        headers = {
            "user-agent": ua,
            "referer": "https://www.baidu.com"
        }
        return headers
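    # Note: get_headers() is only called once, in __init__, so a single UA is
    # reused for the whole session. To rotate the UA per request (a common
    # anti-blocking tactic), refresh the headers before each call:
    #   self.headers = self.get_headers()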
    def get_detail(self, type, page):
        with self.session.get(url=self.url_format.format(type, page), headers=self.headers, timeout=5) as res:
            if res.text:
                with open(f"./孔夫子/{type}_{page}.html", "w", encoding="utf-8") as f:
                    f.write(res.text)
            else:
                # Empty response: retry the request
                print(f"Page {page} returned no data, retrying")
                self.get_detail(type, page)
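    # Caveat: the retry above recurses without a cap, so a permanently empty
    # page would loop forever. A bounded variant (sketch, not in the original)
    # would thread a counter, e.g. get_detail(self, type, page, retries=3),
    # and give up once it reaches zero.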
    def run(self):
        pagesize = 5
        for type in self.types:
            # range(1, pagesize) fetches pages 1 through pagesize - 1
            for page in range(1, pagesize):
                self.get_detail(type, page)
                time.sleep(2)
                print(f"Category {type}, page {page} saved!")
# Data extraction class
class Analysis:
    def __init__(self):
        # Categories to parse; keep in sync with SSS.types
        self.types = ["wenxue", "xiaoshuo"]

    # Strip newlines and normalize full-width commas
    def remove_character(self, origin_str):
        if origin_str is None:
            return None
        origin_str = origin_str.replace('\n', '')
        origin_str = origin_str.replace(',', ',')
        return origin_str
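    # Example: remove_character('Tolstoy,\n War and Peace') returns
    # 'Tolstoy, War and Peace'. Note the helper is defined but never called in
    # format() below; applying it to title, author, and publisher there would
    # normalize the output.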
    def format(self, text):
        html = etree.HTML(text)
        # Each book sits in its own item div under the listing box
        div_books = html.cssselect('div#listBox>div.item')
        for book in div_books:
            # The full title is stored in the title attribute
            title = book.cssselect('div.item-info>div.title')[0].get('title')
            # Author defaults to None when its span is missing
            author = None
            author_div = book.cssselect('div.item-info>div.zl-isbn-info>span:nth-child(1)')
            if len(author_div) > 0:
                author = author_div[0].text
            # Same pattern for the publisher
            publisher = None
            publisher_div = book.cssselect('div.item-info>div.zl-isbn-info>span:nth-child(2)')
            if len(publisher_div) > 0:
                # The span text holds a label and the value separated by a
                # space; keep the value
                publisher = publisher_div[0].text.split(' ')[1]
            # Consolidate the record (currently just printed)
            print(title, author, publisher)
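    # Sketch (not in the original): to consolidate instead of printing,
    # format() could append (title, author, publisher) tuples to a list and
    # return it; see the CSV sketch after the main block below.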
    def run(self):
        pagesize = 5
        for type in self.types:
            # Must match the page range used by SSS.run()
            for page in range(1, pagesize):
                with open(f"./孔夫子/{type}_{page}.html", "r", encoding="utf-8") as f:
                    text = f.read()
                    self.format(text)
if __name__ == '__main__':
    # Uncomment the stage you want to run
    # Collect the pages:
    # s = SSS()
    # s.get_categorys()
    # s.run()
    # Extract the data:
    a = Analysis()
    a.run()
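
# A minimal persistence sketch (assumption, not part of the original script):
# if format() returned (title, author, publisher) tuples instead of printing,
# they could be written out with the standard csv module. The path and the
# field names here are hypothetical.
#
# import csv
#
# def save_rows(rows, path="./孔夫子/books.csv"):
#     with open(path, "w", newline="", encoding="utf-8") as f:
#         writer = csv.writer(f)
#         writer.writerow(["title", "author", "publisher"])
#         writer.writerows(rows)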