"""Async scraper for shuban.net book-detail pages.

Fetches a range of book pages concurrently with aiohttp and prints each
page's title.  The original sketched (but left commented out) extraction
of author / category / douban rating / download link; see the TODO in
``async_content``.
"""

import requests  # kept from original file; not used in the code below
from lxml import etree
# Coroutine modules driving the concurrent downloads
import asyncio
import aiohttp


# Browser-like request headers so shuban.net serves normal HTML.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Host": "www.shuban.net",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
}


async def get_content(url):
    """Download one book page and hand the parsed DOM to ``async_content``.

    Non-200 responses are skipped silently (best-effort scraping).

    :param url: absolute URL of a shuban.net read-NNNNN.html page.
    """
    print("正在操作:{}".format(url))
    # One short-lived session per request, as in the original.
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as res:
            if res.status == 200:
                source = await res.text()  # await the response body
                tree = etree.HTML(source)
                await async_content(tree)


async def async_content(tree):
    """Extract and print the book title from a parsed page.

    :param tree: an ``lxml.etree`` element for the whole HTML document.

    Guards against pages that lack the expected ``<h1 class="title">``
    node — the original indexed ``[0]`` unconditionally, raising
    ``IndexError`` on missing/changed pages.
    """
    titles = tree.xpath("//h1[@class='title']/a/text()")
    if not titles:
        # Page absent or layout changed — nothing to report.
        return
    print(titles[0])
    # TODO(review): the original contained commented-out extraction of
    # author / category / douban rating from
    # div.hanghang-shu-content-font p[1..3] and the download link from
    # a.downloads, returning [title, author, cate, douban, href].
    # Re-enable once the page structure is confirmed.


async def _main(urls):
    """Run all page fetches concurrently."""
    await asyncio.gather(*(get_content(url) for url in urls))


if __name__ == '__main__':
    url_format = "https://www.shuban.net/read-{}.html"
    # Pages 50773..50782 — widen the range for more data.
    full_urllist = [url_format.format(i) for i in range(50773, 50783)]
    # asyncio.run replaces the deprecated get_event_loop() /
    # ensure_future / run_until_complete(asyncio.wait(...)) pattern
    # (get_event_loop is deprecated outside a running loop since 3.10,
    # and asyncio.wait no longer accepts bare coroutines since 3.12).
    asyncio.run(_main(full_urllist))