茵可露露 / Python 爬虫120例 (forked from 梦想橡皮擦 / Python 爬虫120例, in sync with the upstream project)
Commit bf1520ba
Authored Aug 31, 2021 by 梦想橡皮擦

懒人畅听案例补充 (adds the 懒人畅听 audiobook-site case)

Parent: 737404a1
Showing 4 changed files with 64 additions and 72 deletions (+64, −72):
NO23/懒人畅听网数据.py   +52  −0
NO23/虎牙/0.json         +0   −0
NO23/虎牙直播数据.py     +0   −72
README.md                +12  −0
NO23/懒人畅听网数据.py (new file, mode 0 → 100644)
import threading
import random
import requests
from lxml import etree


def get_headers():
    # Pick a random UA; this script keeps only one spider UA in the pool
    uas = [
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    ]
    ua = random.choice(uas)
    headers = {"user-agent": ua, "referer": "https://www.baidu.com/"}
    return headers


def run(url, semaphore):
    headers = get_headers()
    semaphore.acquire()  # take a slot: at most 5 workers run concurrently
    try:
        res = requests.get(url, headers=headers, timeout=5)
        if res:
            text = res.text
            element = etree.HTML(text)
            titles = element.xpath('//a[@class="book-item-name"]/text()')
            authors = element.xpath('//a[@class="author"]/text()')
            weakens = element.xpath('//a[@class="g-user-shutdown"]/text()')
            save(url, titles, authors, weakens)
    finally:
        semaphore.release()  # always release, even if the request fails


def save(url, titles, authors, weakens):
    data_list = zip(titles, authors, weakens)
    with open("./data.csv", "a+", encoding="utf-8") as f:
        for item in data_list:
            f.write(f"{item[0]},{item[1]},{item[2]}\n")
    print(url, "该URL地址数据写入完毕")  # data for this URL written


if __name__ == '__main__':
    url_format = 'https://www.lrts.me/book/category/1/recommend/{}/20'
    # Build the URL list once; it is shared read-only by all threads
    urls = [url_format.format(i) for i in range(1, 1372)]
    semaphore = threading.BoundedSemaphore(5)  # allow at most 5 threads at a time
    for url in urls:
        t = threading.Thread(target=run, args=(url, semaphore))
        t.start()
    # Busy-wait until only the main thread is left
    while threading.active_count() != 1:
        pass
    else:
        print('所有线程运行完毕')  # all threads finished
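Both scripts in this commit share the same throttling pattern: `threading.BoundedSemaphore(5)` caps the number of concurrent workers, and the main thread spins on `threading.active_count()` until every worker exits. As a minimal sketch (not part of the commit; the URLs below are placeholders), the same pattern reads more safely with the semaphore as a context manager, which releases the slot on any exit path, and with `join()` instead of the busy-wait:

import threading

semaphore = threading.BoundedSemaphore(5)  # at most 5 concurrent workers


def worker(url):
    with semaphore:  # acquired on entry, released even if an exception is raised
        print("would fetch", url)  # placeholder for the real request/parse


urls = [f"https://example.com/page/{i}" for i in range(1, 11)]  # placeholder URLs
threads = [threading.Thread(target=worker, args=(u,)) for u in urls]
for t in threads:
    t.start()
for t in threads:
    t.join()  # block until each worker finishes, without burning CPU
print("all threads done")

The `while ... pass` loop in the original keeps one CPU core spinning for the whole crawl, while `join()` blocks idly. Note also that `save()` above writes raw commas, so a title containing a comma would break the CSV columns; the standard-library `csv.writer` would quote such fields.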
NO23/虎牙/0.json (deleted, mode 100644 → 0; file content not shown in this diff)
NO23/虎牙直播数据.py (deleted, mode 100644 → 0)
import threading
import requests
import random


class Common:
    def __init__(self):
        pass

    def get_headers(self):
        # Pool of search-engine spider UAs to rotate through
        uas = [
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
        ]
        ua = random.choice(uas)
        headers = {"user-agent": ua, "referer": "https://www.baidu.com"}
        return headers


def run(index, url, semaphore, headers):
    semaphore.acquire()  # take a slot: at most 5 workers run concurrently
    res = requests.get(url, headers=headers, timeout=5)
    res.encoding = 'utf-8'
    text = res.text
    # Strip the JSONP wrapper: remove the callback name and the trailing ')'
    text = text.replace('getLiveListJsonpCallback(', '')
    text = text[:-1]
    # print(text)
    # json_data = json.loads(text)
    # print(json_data)
    save(index, text)
    semaphore.release()  # release the slot


def save(index, text):
    with open(f"./虎牙/{index}.json", "w", encoding="utf-8") as f:
        f.write(f"{text}")
    print("该URL地址数据写入完毕")  # data for this URL written


if __name__ == '__main__':
    # Fetch page 1 without a callback to read the total page count
    first_url = 'https://www.huya.com/cache.php?m=LiveList&do=getLiveListByPage&tagAll=0&callback=&page=1'
    c = Common()
    res = requests.get(url=first_url, headers=c.get_headers())
    data = res.json()
    if data['status'] == 200:
        total_page = data['data']['totalPage']
        url_format = 'https://www.huya.com/cache.php?m=LiveList&do=getLiveListByPage&tagAll=0&callback=getLiveListJsonpCallback&page={}'
        # Build the URL list once; it is shared read-only by all threads
        urls = [url_format.format(i) for i in range(1, total_page)]
        semaphore = threading.BoundedSemaphore(5)  # at most 5 threads at a time
        for i, url in enumerate(urls):
            t = threading.Thread(target=run, args=(i, url, semaphore, c.get_headers()))
            t.start()
        # Busy-wait until only the main thread remains
        while threading.active_count() != 1:
            pass
        else:
            print('所有线程运行完毕')  # all threads finished
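The deleted script unwraps Huya's JSONP response by string surgery: `replace()` drops the `getLiveListJsonpCallback(` prefix and the final slice drops the trailing `)`, after which the text is written to disk unparsed. A hedged sketch of the same unwrapping (the sample payload is made up, shaped like the response this script expects) validates the result with `json.loads`, so a malformed page fails loudly instead of silently producing a broken `.json` file:

import json


def unwrap_jsonp(payload: str, callback: str = "getLiveListJsonpCallback") -> dict:
    """Strip a 'callback(...)' wrapper and parse the JSON inside."""
    prefix = callback + "("
    if payload.startswith(prefix) and payload.endswith(")"):
        payload = payload[len(prefix):-1]
    return json.loads(payload)  # raises ValueError if the body is not valid JSON


# Made-up payload shaped like the Huya live-list response:
sample = 'getLiveListJsonpCallback({"status": 200, "data": {"totalPage": 3}})'
data = unwrap_jsonp(sample)
print(data["data"]["totalPage"])  # -> 3

With the parse step in place, `save()` could store `json.dumps(data, ensure_ascii=False)` instead of the raw text, guaranteeing that every file on disk is parseable JSON.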
README.md
@@ -8,16 +8,20 @@ Python爬虫120例正式开始

## Python 爬虫 120 例,已完成文章清单

### requests 库 + re 模块

1. [10 行代码集 2000 张美女图,Python 爬虫 120 例,再上征途](https://dream.blog.csdn.net/article/details/117024328)
2. [通过 Python 爬虫,发现 60%女装大佬游走在 cosplay 领域](https://dream.blog.csdn.net/article/details/117221667)
3. [Python 千猫图,简单技术满足你的收集控](https://dream.blog.csdn.net/article/details/117458947)
4. [熊孩子说“你没看过奥特曼”,赶紧用 Python 学习一下,没想到](https://dream.blog.csdn.net/article/details/117458985)
5. [技术圈的【多肉小达人】,一篇文章你就能做到](https://blog.csdn.net/hihell/article/details/117661488)
6. [我用 Python 连夜离线了 100G 图片,只为了防止网站被消失](https://dream.blog.csdn.net/article/details/117918309)

### requests 库 + re 模块 + threading 模块

7. [对 Python 爬虫编写者充满诱惑的网站,《可爱图片网》,瞧人这网站名字起的](https://dream.blog.csdn.net/article/details/118035208)
8. [5000张高清壁纸大图(手机用),用Python在法律的边缘又试探了一把](https://dream.blog.csdn.net/article/details/118145504)
9. [10994部漫画信息,用Python实施大采集,因为反爬差一点就翻车了](https://blog.csdn.net/hihell/article/details/118222271)
10. [爬动漫“上瘾”之后,放弃午休,迫不及待的用Python薅了腾讯动漫的数据,啧啧啧](https://blog.csdn.net/hihell/article/details/118340372)

### requests 库 + lxml 库

11. [他说:“只是单纯的想用Python收集一些素颜照,做机器学习使用”,“我信你个鬼!”](https://blog.csdn.net/hihell/article/details/118385640)
12. [1小时赚100元,某群X友,周末采集了20000+漫展历史数据,毫无技术难度](https://blog.csdn.net/hihell/article/details/118485941)
13. [程序员(媛)不懂汉服?岂能让别人小看,咱先靠肉眼大数据识别万张穿搭照](https://dream.blog.csdn.net/article/details/118541741)

@@ -25,8 +29,16 @@ Python爬虫120例正式开始

15. [整个大活,采集8个代理IP站点,为Python代理池铺路,爬虫120例之第15例](https://dream.blog.csdn.net/article/details/119137580)
16. [极复杂编码,下载《原神》角色高清图、中日无损配音,爬虫 16 / 120 例](https://dream.blog.csdn.net/article/details/111028288)
17. [爬虫120例之第17例,用Python面向对象的思路,采集各种精彩句子](https://dream.blog.csdn.net/article/details/119632820)

### 技术阶段整理

18. [requests库与 lxml 库常用操作整理+总结,爬虫120例阶段整理篇](https://dream.blog.csdn.net/article/details/119633672)
19. [正则表达式 与 XPath 语法领域细解,初学阶段的你,该怎么学?](https://dream.blog.csdn.net/article/details/119633700)

### requests 库 + lxml 库 + cssselect 库

20. [Python爬虫120例之第20例,1637、一路商机网全站加盟数据采集](https://dream.blog.csdn.net/article/details/119850647)
21. [孔夫子旧书网数据采集,举一反三学爬虫,Python爬虫120例第21例](https://dream.blog.csdn.net/article/details/119878744)

### 多线程爬虫之 threading 模块

22. [谁有粉?就爬谁!他粉多,就爬他!Python 多线程采集 260000+ 粉丝数据](https://dream.blog.csdn.net/article/details/119931364)
23. [懒人畅听网,有声小说类目数据采集,多线程速采案例,Python爬虫120例之23例](https://dream.blog.csdn.net/article/details/119914203)