梦想橡皮擦 / Python 爬虫120例
Commit bbb6e2e1
Authored December 20, 2022 by 梦想橡皮擦
Commit message: 复盘案例 (case reviews)
Parent: 705acb4c
Showing 5 changed files with 335 additions and 0 deletions (+335, −0)
复盘案例/句子网.py  +83  −0
复盘案例/可爱女人.py  +100  −0
复盘案例/站酷.py  +11  −0
复盘案例/黄鹤楼.py  +58  −0
案例17更新,群走网/句子网.py  +83  −0

复盘案例/句子网.py  (new file, 0 → 100644)
import requests
from lxml import etree
import random


class Spider16:
    def __init__(self):
        self.wait_urls = ["https://www.qunzou.com/xuexi/list_1_1.html"]
        self.url_template = "https://www.qunzou.com/xuexi/list_1_{num}.html"
        self.details = []

    def get_headers(self):
        uas = [
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
        ]
        ua = random.choice(uas)
        headers = {"user-agent": ua, "referer": "https://www.baidu.com"}
        return headers

    # Build the list of pages waiting to be crawled
    def create_urls(self):
        headers = self.get_headers()
        page_url = self.wait_urls[0]
        res = requests.get(url=page_url, headers=headers, timeout=5)
        html = etree.HTML(res.text)
        # Extract the total number of pages
        last_page = html.xpath("//span[@class='pageinfo']/strong[1]/text()")[0]
        # Generate the pages waiting to be crawled
        for i in range(1, int(last_page) + 1):
            self.wait_urls.append(self.url_template.format(num=i))

    def get_html(self):
        for url in self.wait_urls:
            headers = self.get_headers()
            res = requests.get(url, headers=headers, timeout=5)
            if res:
                html = etree.HTML(res.text)
                detail_link_list = html.xpath("//div[@class='list']//h6/a/@href")
                for d in detail_link_list:
                    self.details.append(f"https://www.qunzou.com{d}")
            # For testing only: stop after the first page
            return

    def get_detail(self):
        for url in self.details:
            headers = self.get_headers()
            res = requests.get(url, headers=headers, timeout=5)
            res.encoding = "gb2312"
            if res:
                html = etree.HTML(res.text)
                sentences = html.xpath("//div[@id='content']//p/text()")
                # Print the sentences
                long_str = "\n".join(sentences)
                print(long_str)
                # with open("sentences.txt", "a+", encoding="utf-8") as f:
                #     f.write(long_str)

    def run(self):
        self.create_urls()
        self.get_html()
        self.get_detail()


if __name__ == '__main__':
    s = Spider16()
    s.run()
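
A side note on the "if res:" guards in get_html and get_detail: a requests.Response is truthy only when its status code is below 400 (Response.__bool__ delegates to Response.ok), so these branches silently skip error pages. A tiny standalone check, using the same list page the spider starts from, makes the equivalent explicit form visible:

import requests

# `if res:` on a Response object is shorthand for `if res.ok:`, i.e. status_code < 400.
res = requests.get("https://www.qunzou.com/xuexi/list_1_1.html", timeout=5)
if res.status_code < 400:
    print("page fetched:", res.status_code)
else:
    print("error page skipped:", res.status_code)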

复盘案例/可爱女人.py  (new file, 0 → 100644)
import requests
import re
import threading
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
}
# Detail-page URLs (the pages that contain the images)
detail_urls = []
mutex = threading.Lock()


# Collect detail-page URLs from one list page
def get_detail_urls(url):
    res = requests.get(url=url, headers=headers)
    res.encoding = 'gb2312'
    if res is not None:
        # Read the page source
        html = res.text
        # Crop the target source: keep the block of ul class="g-gxlist-imgbox",
        # which sits between <ul class="g-gxlist-imgbox"> and <div class="pagelist">
        html = html[html.find('<ul class="g-gxlist-imgbox">'):html.find('<div class="pagelist">')]
        # After cropping, the data can be extracted with a regular expression
        pattern = re.compile('<a href="(.*?)" target="_blank" title=".*?">')
        # Extract the detail-page addresses
        find_urls = pattern.findall(html)
        if find_urls:
            # Acquire the lock
            mutex.acquire()
            # Append to the global list
            detail_urls.extend(find_urls)
            # Release the lock
            mutex.release()


# Image-saving worker (run in a thread)
def save_image():
    global detail_urls
    while True:
        # Acquire the lock
        mutex.acquire()
        if len(detail_urls) > 0:
            # Take the first item of the list
            img_url = detail_urls[0]
            # Delete the first item of the list
            del detail_urls[0]
            # Release the lock
            mutex.release()
            res = requests.get(url=img_url, headers=headers)
            if res is not None:
                html = res.text
                # Crop the target source so the image block can be extracted as a whole
                html = html[html.find('<div class="img-list3">'):html.find('<div class="m_ssxx">')]
                pattern = re.compile('<img alt=".*?" src="(.*?)" />')
                img_list = pattern.findall(html)
                if img_list:
                    for img in img_list:
                        print(f"Thread {threading.currentThread().name}", "fetching image:", img)
                        try:
                            res = requests.get(img)
                            with open(f"images/{threading.currentThread().name + str(time.time())}.png", "wb+") as f:
                                f.write(res.content)
                        except Exception as e:
                            print(e)
        else:
            # Release the lock before idling, otherwise neither worker can ever acquire it again
            mutex.release()
            print("Waiting... if this drags on for long, you can simply close the program")
            time.sleep(1)


if __name__ == '__main__':
    # Generate the paginated list addresses
    origin_url = ['http://www.imeitou.com/nvsheng/']
    for i in range(2, 11):
        origin_url.append(f'http://www.imeitou.com/nvsheng/index_{i}.html')

    # Collect the image detail-page addresses
    for d_url in origin_url:
        get_detail_urls(d_url)

    # Check the collected detail-page addresses
    # (a test run returned 160 addresses, which is the expected amount)
    print(len(detail_urls))

    # Configure and start the image-saving threads; two threads are used here
    save1 = threading.Thread(target=save_image)
    save1.start()

    save2 = threading.Thread(target=save_image)
    save2.start()
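
The two save_image threads above never exit on their own: once detail_urls is drained they just keep printing the waiting message and the script has to be closed by hand. A minimal alternative sketch, not the author's code, that swaps the manual lock for the standard-library queue.Queue plus a sentinel so each worker terminates cleanly; the URLs here are placeholders standing in for what get_detail_urls collects:

import queue
import threading

task_queue = queue.Queue()
STOP = object()  # sentinel telling a worker to exit

def worker():
    while True:
        item = task_queue.get()
        if item is STOP:
            break
        # A real worker would download the image here, as save_image does above.
        print(threading.current_thread().name, "would fetch", item)

# Placeholder detail-page URLs; in the original these come from get_detail_urls().
for url in ["http://www.imeitou.com/placeholder_1.html", "http://www.imeitou.com/placeholder_2.html"]:
    task_queue.put(url)

workers = [threading.Thread(target=worker) for _ in range(2)]
for w in workers:
    w.start()
for _ in workers:
    task_queue.put(STOP)  # one sentinel per worker
for w in workers:
    w.join()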

复盘案例/站酷.py  (new file, 0 → 100644)
# import requests
#
# response = requests.get("https://www.uisdc.com/archives")
# content = response.text
#
# with open("ca_demo.html", "w") as file:
#     file.write(content)

import urllib.parse

decoded = urllib.parse.unquote(
    "%3Ci+class%3D%22uname%22+title%3D%22%E4%BC%98%E7%A7%80%E7%BD%91%E9%A1%B5%E8%AE%BE%E8%AE%A1%22%3E%E4%BC%98%E7%A7%80%E7%BD%91%E9%A1%B5%E8%AE%BE%E8%AE%A1%3C%2Fi%3E"
)
print(decoded)
\ No newline at end of file
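
One detail about the decode above: urllib.parse.unquote only expands the %XX escapes and leaves the plus signs (which stand for spaces in form/query encoding) untouched, whereas unquote_plus converts them to spaces as well. A short comparison on a trimmed version of the same encoded string:

import urllib.parse

s = "%3Ci+class%3D%22uname%22%3E"
print(urllib.parse.unquote(s))       # <i+class="uname">   (plus sign kept)
print(urllib.parse.unquote_plus(s))  # <i class="uname">   (plus sign becomes a space)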

复盘案例/黄鹤楼.py  (new file, 0 → 100644)
import threading
import asyncio
import time
import requests
import lxml
from bs4 import BeautifulSoup


async def get(url):
    return requests.get(url)


async def get_html(url):
    print("Preparing to fetch:", url)
    res = await get(url)
    return res.text


async def save_img(img_url):
    print("Downloading image:", img_url)
    res = await get(img_url)
    if res is not None:
        with open(f'./imgs/{time.time()}.jpg', 'wb') as f:
            f.write(res.content)
        return img_url, "ok"


async def main(url_list):
    # Create 5 tasks, one per URL in this batch
    tasks = [asyncio.ensure_future(get_html(url_list[_])) for _ in range(len(url_list))]
    dones, pending = await asyncio.wait(tasks)
    for task in dones:
        html = task.result()
        soup = BeautifulSoup(html, 'lxml')
        div_tag = soup.find(attrs={'class': 'lbox'})
        imgs = div_tag.find_all('img')
        for img in imgs:
            ret = await save_img(img["data-original"])
            print(ret)


if __name__ == '__main__':
    # Switched to the huanghelou.cc site; for easy testing only pages 1-9 are used
    urls = [f"https://www.huanghelou.cc/category-44_{page}.html" for page in range(1, 10)]

    total_page = len(urls) // 5 if len(urls) % 5 == 0 else len(urls) // 5 + 1

    # Slice the urls list into batches of 5 to make collection easier
    for page in range(0, total_page):
        start_page = 0 if page == 0 else page * 5
        end_page = (page + 1) * 5
        # Event loop object
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main(urls[start_page:end_page]))

案例17更新,群走网/句子网.py  (new file, 0 → 100644)
(Content identical to 复盘案例/句子网.py above: the same 83-line Spider16 crawler, duplicated into this second path.)