梦想橡皮擦 / Python 爬虫120例
Commit 22fe92d3
Authored Aug 31, 2021 by 梦想橡皮擦
Commit message: 花容网 (huaroo.net), 一派网 (sspai.com), and hot-topic square (话题广场) examples uploaded
Parent commit: a1dbe14f
Showing 4 changed files with 41232 additions and 0 deletions (+41232, -0):

NO26/医美.py              +94     -0
NO27/data.csv             +40985  -0
NO27/一派数据采集.py        +52     -0
NO27/话题广场.py            +101    -0
NO26/医美.py (new file, mode 100644)
import requests
import threading
from queue import Queue
from lxml import etree
import random

# Initialize the task queue
q = Queue(maxsize=0)
# Enqueue the listing pages in batch
for page in range(1, 4):
    q.put('https://www.huaroo.net/d/pg_{}/'.format(page))


# Build request headers with a randomly chosen User-Agent
def get_headers():
    uas = [
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
    ]
    ua = random.choice(uas)
    headers = {
        "user-agent": ua,
        "referer": "https://www.baidu.com"
    }
    return headers


# Parse a listing page into CSV rows
# (note: this shadows the built-in format(), which the script never uses)
def format(text):
    element = etree.HTML(text)
    article_list = element.xpath('//div[contains(@class,"article_list")]')
    wait_save_str = ""
    for article in article_list:
        title = article.xpath("./a/div/div[@class='article_title']/text()")[0].strip()
        hospital = article.xpath("./a/div/div[@class='hospital_list_content mt10 oh']/div[1]/text()")[0].strip()
        duties = article.xpath("./a/div/div[@class='hospital_list_content mt10 oh']/div[2]/text()")[0].strip()
        practice = article.xpath("./a/div/div[@class='hospital_list_content mt10 oh']/div[3]/text()")[0].strip()
        project = article.xpath("./a/div/div[@class='hospital_list_content mt10 oh']/div[4]/text()")[0].strip()
        wait_save_str += f"{title},{hospital},{duties},{practice},{project}\n"
    save(wait_save_str)


# Append the accumulated rows to the CSV file
def save(wait_save_str):
    with open('./医美2.csv', 'a+', encoding='utf-8') as f:
        f.write(wait_save_str)
    print(wait_save_str, "--- saved")


# Crawler entry point: request and parse until the queue is drained
def run():
    while q.qsize() > 0:
        url = q.get()
        q.task_done()
        res = requests.get(url=url, headers=get_headers(), timeout=10)
        format(res.text)


l = []
for i in range(2):
    t = threading.Thread(target=run)
    l.append(t)
    t.start()

for p in l:
    p.join()
print("All worker threads finished")

q.join()
print("All queued tasks done")
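A note on the threading model: the script above hand-rolls its worker pool with Queue plus threading.Thread. The same two-worker fan-out can be expressed with the standard library's concurrent.futures. A minimal sketch, reusing the get_headers() and format() helpers defined above:

from concurrent.futures import ThreadPoolExecutor
import requests

urls = ['https://www.huaroo.net/d/pg_{}/'.format(page) for page in range(1, 4)]

def fetch(url):
    # Same request-and-parse step as run(), minus the manual queue bookkeeping
    res = requests.get(url=url, headers=get_headers(), timeout=10)
    format(res.text)

with ThreadPoolExecutor(max_workers=2) as pool:
    pool.map(fetch, urls)  # the with-block waits for every page to finish

The executor replaces the q.qsize() polling loop and the explicit join() calls; the worker count becomes the single max_workers parameter.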
NO27/data.csv (new file, mode 100644)
This diff is collapsed.
NO27/一派数据采集.py (new file, mode 100644)
import requests
from queue import LifoQueue
import time
import random

# Initialize a LIFO queue
q = LifoQueue(maxsize=0)
# Enqueue the API URLs in batch, 10 items per page
for page in range(1, 7):
    # https://sspai.com/api/v1/bullet/search/page/get?type=0&limit=10&offset=0&created_at=0
    q.put('https://sspai.com/api/v1/bullet/search/page/get?type=0&limit=10&offset={}&created_at=0'.format((page - 1) * 10))


# Build request headers with a randomly chosen User-Agent
def get_headers():
    uas = [
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
    ]
    ua = random.choice(uas)
    headers = {
        "user-agent": ua
    }
    return headers


# Save the raw response body to a timestamp-named JSON file
def save(text):
    with open(f'{time.time()}.json', 'a+', encoding='utf-8') as f:
        f.write(text)
    print(text, "--- saved")


if __name__ == "__main__":
    while q.qsize() > 0:
        url = q.get()
        q.task_done()
        res = requests.get(url=url, headers=get_headers(), timeout=10)
        save(res.text)
    q.join()
    print("All tasks completed")
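One possible refinement: save() writes the raw response body to disk without checking it. Since the sspai endpoint is an API, a variant could validate and pretty-print the payload first; the sketch below is illustrative and assumes nothing about the response's fields:

import json
import time

def save_json(text):
    # Parse first so a malformed body is caught instead of silently saved
    try:
        payload = json.loads(text)
    except json.JSONDecodeError:
        print("--- response is not valid JSON, skipping")
        return
    filename = f'{time.time()}.json'
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    print(filename, "--- saved")

Swapping save(res.text) for save_json(res.text) in the main loop is the only call-site change.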
NO27/话题广场.py (new file, mode 100644)
from queue import Queue
import threading
import requests
from lxml import etree
import random


# Build request headers with a randomly chosen User-Agent
def get_headers():
    uas = [
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
        "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
        "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Sosospider+(+http://help.soso.com/webspider.htm)",
        "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
    ]
    ua = random.choice(uas)
    headers = {
        "user-agent": ua
    }
    return headers


# Hot-topic list pages waiting to be crawled
hot_subjects = Queue(maxsize=0)
for i in range(1, 11):
    url = f'https://www.jisilu.cn/topic/square/id-hot__feature_id-__page-{i}'
    hot_subjects.put(url)

# Queue of topic IDs extracted from the list pages
q_data_ids = Queue(maxsize=0)


# Producer: pull list pages and collect topic IDs
def producer():
    while hot_subjects.qsize() > 0:
        list_url = hot_subjects.get()
        hot_subjects.task_done()
        print("Parsing:", list_url)
        # Fetch one paginated list page
        res = requests.get(list_url, headers=get_headers(), timeout=3)
        element = etree.HTML(res.text)
        data_ids = element.xpath('//a[@class="aw-topic-name"]/@data-id')
        for data_id in data_ids:
            q_data_ids.put(data_id)


# Consumer: page through each topic until the server returns an empty body
def consumer():
    while True:
        # Take one topic ID; None is the shutdown sentinel
        data_id = q_data_ids.get()
        q_data_ids.task_done()
        if data_id is None:
            break
        start_page = 1
        while True:
            url = f'https://www.jisilu.cn/question/ajax/discuss/sort_type-new__topic_id-{data_id}__page-{start_page}'
            res = requests.get(url=url, headers=get_headers(), timeout=5)
            print(res.url)
            text = res.text
            start_page += 1
            if len(text) == 0:
                break  # an empty body means there are no more pages
            element = etree.HTML(text)
            titles = element.xpath('//h4/a/text()')
            urls = element.xpath('//h4/a/@href')
            names = element.xpath('//a[@class="aw-user-name"]/text()')
            data = zip(titles, names, urls)
            save_list = [f"{item[0]},{item[1]},{item[2]}\n" for item in data]
            long_str = "".join(save_list)
            with open("./data.csv", "a+", encoding="utf-8") as f:
                f.write(long_str)


# Start 2 producer threads
producer_threads = []
for _ in range(2):
    p = threading.Thread(target=producer)
    p.start()
    producer_threads.append(p)

# Start 2 consumer threads
consumer_threads = []
for _ in range(2):
    c = threading.Thread(target=consumer)
    c.start()
    consumer_threads.append(c)

# When the producers finish, enqueue one sentinel per consumer so the
# consumer loops can exit instead of blocking on an empty queue forever
for p in producer_threads:
    p.join()
for _ in consumer_threads:
    q_data_ids.put(None)
for c in consumer_threads:
    c.join()
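A final note: all three scripts rotate User-Agents but fire their requests back-to-back as fast as the threads allow. A small throttled wrapper is one way to be gentler on the target sites; a sketch assuming the get_headers() helper above, with illustrative delay bounds:

import time
import random
import requests

def polite_get(url, timeout=5, min_delay=0.5, max_delay=2.0):
    # Randomized pause before each request to avoid hammering the server
    time.sleep(random.uniform(min_delay, max_delay))
    return requests.get(url, headers=get_headers(), timeout=timeout)

producer() and consumer() could then call polite_get(...) wherever they currently call requests.get(...).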