"""Download listing pages into ./孔夫子/ and extract book data from them.

``SSS`` fetches pages over HTTP and stores the raw HTML on disk;
``Analysis`` reads the stored files back and prints title, author and
publisher for every book entry it finds.
"""
import os
import random
import time

import requests
from lxml.html import etree


class SSS:
    """Fetch listing pages for each configured category and save them to disk."""

    def __init__(self):
        # NOTE(review): as written this template formats to a path with no
        # scheme or host (e.g. "wenxue/v6w1/"), which requests cannot fetch.
        # Confirm the intended base URL.
        self.url_format = '{}/v6w{}/'
        # Categories to crawl; extend as needed.
        self.types = ["wenxue", "xiaoshuo"]
        self.session = requests.Session()
        # Headers are picked once, so every request of this instance shares
        # the same randomly chosen user-agent.
        self.headers = self.get_headers()
        self.categorys = []

    def get_categorys(self):
        """Append the category id of every ``.tushu a`` link to ``self.categorys``.

        The id is the part of the link's ``href`` after the first ``'C'``,
        with the last character dropped. Nothing is collected when the
        response status is not OK.
        """
        # NOTE(review): the URL is an empty string in the source, which
        # requests rejects. Confirm the index page URL.
        # The timeout matches get_detail so a stalled server cannot hang the run.
        with self.session.get(url='', headers=self.headers, timeout=5) as res:
            if not res:
                return
            html = etree.HTML(res.text)
            if html is None:
                return
            for item in html.cssselect('.tushu a'):
                href = item.get("href")
                if not href:
                    # Anchors without an href carry no category id.
                    continue
                self.categorys.append(href[href.find('C') + 1:-1])

    def get_headers(self):
        """Return request headers using a randomly chosen crawler user-agent."""
        # NOTE(review): several entries end in "+" or "(+" and look truncated;
        # they are kept exactly as found. Confirm the full strings.
        uas = [
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +",
            "Baiduspider-image+(+",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +",
            "Sogou web spider/4.0(+",
            "Sogou News Spider/4.0(+",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +",
            "Sosospider+(+",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China;",
        ]
        return {
            "user-agent": random.choice(uas),
            "referer": "",
        }

    def get_detail(self, type, page, retries=3):
        """Download one listing page and save it as ``./孔夫子/{type}_{page}.html``.

        Args:
            type: Category name substituted into the URL and the file name.
                (The name shadows the builtin but is kept so existing
                keyword callers continue to work.)
            page: Page number substituted into the URL and the file name.
            retries: How many extra attempts to make when the response body
                is empty.

        Returns:
            True when the page was written to disk, False when every attempt
            returned an empty body.
        """
        url = self.url_format.format(type, page)
        for attempt in range(retries + 1):
            with self.session.get(url=url, headers=self.headers, timeout=5) as res:
                body = res.text
            if body:
                # Create the output directory on demand so the first write
                # does not fail on a fresh checkout.
                os.makedirs("./孔夫子", exist_ok=True)
                with open(f"./孔夫子/{type}_{page}.html", "w+", encoding="utf-8") as f:
                    f.write(body)
                return True
            if attempt < retries:
                # No data received: request the page again.
                print(f"页码{page}请求异常,重新请求")
        print(f"页码{page}请求失败,已达到最大重试次数")
        return False

    def run(self):
        """Download pages 1 to ``pagesize - 1`` for every category in ``self.types``."""
        pagesize = 5
        for category in self.types:
            for page in range(1, pagesize):
                saved = self.get_detail(category, page)
                # Pause between requests.
                time.sleep(2)
                if saved:
                    print(f"分类:{category},页码:{page}页面储存完毕!")


# Data extraction class
class Analysis:
    """Read the stored HTML pages and print the book data they contain."""

    def __init__(self):
        # Categories to process; extend as needed.
        self.types = ["wenxue", "xiaoshuo"]

    # Remove special characters
    def remove_character(self, origin_str):
        """Return ``origin_str`` with newlines removed, or None when given None."""
        if origin_str is None:
            return None
        origin_str = origin_str.replace('\n', '')
        # NOTE(review): both arguments are the same character as written, so
        # this call changes nothing. Presumably one side was meant to be a
        # different comma variant; confirm.
        origin_str = origin_str.replace(',', ',')
        return origin_str

    def format(self, text):
        """Print title, author and publisher for every book entry in ``text``.

        Args:
            text: HTML source of one stored listing page.

        Fields that cannot be located in an entry are printed as None.
        """
        if not text:
            return
        html = etree.HTML(text)
        if html is None:
            return
        # Every book sits in its own item div inside the list box.
        for book in html.cssselect('div#listBox>div.item'):
            # The title is read from the "title" attribute of the title div.
            title_nodes = book.cssselect('div.item-info>div.title')
            title = title_nodes[0].get('title') if title_nodes else None

            # Author defaults to None when the span is absent.
            author = None
            author_nodes = book.cssselect(
                'div.item-info>div.zl-isbn-info>span:nth-child(1)')
            if author_nodes:
                author = author_nodes[0].text

            # Publisher: the second space-separated token of the span text.
            publisher = None
            publisher_nodes = book.cssselect(
                'div.item-info>div.zl-isbn-info>span:nth-child(2)')
            if publisher_nodes:
                parts = (publisher_nodes[0].text or '').split(' ')
                if len(parts) > 1:
                    publisher = parts[1]

            # Combined record
            print(title, author, publisher)

    def run(self):
        """Parse stored pages 1 to ``pagesize - 1`` for every category.

        Raises:
            FileNotFoundError: if an expected page file has not been downloaded.
        """
        pagesize = 5
        for category in self.types:
            for page in range(1, pagesize):
                with open(f"./孔夫子/{category}_{page}.html", "r", encoding="utf-8") as f:
                    text = f.read()
                self.format(text)


if __name__ == '__main__':
    # Collect data; uncomment whichever part you want to run.
    # s = SSS()
    # # Extract data
    # s.get_categorys()
    # NOTE(review): the visible source only creates the instance and never
    # calls run(); confirm whether a call was intended here.
    a = Analysis()