书伴网数据爬虫

adddc2e5 · 梦想橡皮擦 · d7345539 · adddc2e5
隐藏空白更改
内联并排

Showing with 56 additions and 0 deletions

案例11/书伴网数据采集.py 案例11/书伴网数据采集.py +56 -0

未找到文件。
--- a/案例11/书伴网数据采集.py
+++ b/案例11/书伴网数据采集.py
+import requests
+from lxml import etree
+# 导入协程模块
+import asyncio
+import aiohttp
+
+
+headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
+           "Host": "www.shuban.net",
+           "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"}
+
+
+async def get_content(url):
+    print("正在操作:{}".format(url))
+    # 创建一个session 去获取数据 
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url,headers=headers) as res:
+            if res.status == 200:
+                source = await res.text()  # 等待获取文本
+                tree =etree.HTML(source)
+               	await async_content(tree)
+
+
+async def async_content(tree):
+    
+    title = tree.xpath("//h1[@class='title']/a/text()")[0]
+    print(title)
+	# 如果页面没有信息，直接返回即可
+    # if title == '':
+    #     return
+    # else:
+    #     try:
+    #         description = tree.xpath("//div[@class='hanghang-shu-content-font']")
+    #         author = description[0].xpath("p[1]/text()")[0].replace("作者：","") if description[0].xpath("p[1]/text()")[0] is not None else None
+    #         cate = description[0].xpath("p[2]/text()")[0].replace("分类：","") if description[0].xpath("p[2]/text()")[0] is not None else None
+    #         douban = description[0].xpath("p[3]/text()")[0].replace("豆瓣评分：","") if description[0].xpath("p[3]/text()")[0] is not None else None
+    #         # 这部分内容不明确，不做记录
+    #         #des = description[0].xpath("p[5]/text()")[0] if description[0].xpath("p[5]/text()")[0] is not None else None
+    #         download = tree.xpath("//a[@class='downloads']")
+    #     except Exception as e:
+    #         print(title)
+    #         return
+
+    # ls = [
+    #     title,author,cate,douban,download[0].get('href')
+    # ]
+    # return ls
+
+if __name__ == '__main__':
+    url_format = "https://www.shuban.net/read-{}.html"
+    full_urllist = [url_format.format(i) for i in range(50773,50783)]  # 控制到第3页，更多数据自行获取
+    loop = asyncio.get_event_loop()
+    tasks = [asyncio.ensure_future(get_content(url)) for url in full_urllist]
+    results = loop.run_until_complete(asyncio.wait(tasks))
+
+