溧阳摄影圈

69d0fb02 · 梦想橡皮擦 · 934e2741 · 69d0fb02 · 69d0fb02 · 69d0fb02
6 changed files
--- a/NO35/imgs/束手无策_td_att3067312.jpg
+++ b/NO35/imgs/束手无策_td_att3067312.jpg
--- a/NO35/imgs/白茶神韵_td_att3067306.jpg
+++ b/NO35/imgs/白茶神韵_td_att3067306.jpg
--- a/NO35/imgs/神乎其技！用50的焦距打了一组鸟，黄臀鵯！_td_att2584333.jpg
+++ b/NO35/imgs/神乎其技！用50的焦距打了一组鸟，黄臀鵯！_td_att2584333.jpg
--- a/NO35/imgs/神乎其技！用50的焦距打了一组鸟，黄臀鵯！_td_att2584334.jpg
+++ b/NO35/imgs/神乎其技！用50的焦距打了一组鸟，黄臀鵯！_td_att2584334.jpg
--- a/NO35/imgs/秋日晨韵_td_att3065566.jpg
+++ b/NO35/imgs/秋日晨韵_td_att3065566.jpg
--- a/NO35/溧阳摄影圈.py
+++ b/NO35/溧阳摄影圈.py
+import random
+import threading
+import logging
+from bs4 import BeautifulSoup
+import requests
+import lxml
+logging.basicConfig(level=logging.NOTSET) # configure the root logger's output level (NOTSET)
+# Declare the LiYangThread class, which inherits from threading.Thread
+class LiYangThread(threading.Thread):
+    """Thread that scrapes topic images from the jsly001.com forum board.
+
+    ``run`` downloads the first listing page of board 45, follows every
+    topic link found on it and stores the images of each topic under
+    ``./imgs/``. Network and file errors are logged and skipped so that a
+    single bad page or image does not abort the whole crawl.
+    """
+    def __init__(self):
+        super().__init__() # initialise the underlying Thread object
+        self._headers = self._get_headers() # one randomly chosen user-agent per thread
+        self._timeout = 5 # timeout passed to every request
+    def _fetch(self, url):
+        """Return the response of a GET on *url*, or None when the request fails.
+
+        Args:
+            url: Address to download.
+
+        Returns:
+            The ``requests`` response object, or ``None`` if the request
+            raised; the error is logged in that case.
+        """
+        try:
+            return requests.get(url=url, headers=self._headers, timeout=self._timeout)
+        except requests.RequestException as e:
+            logging.error(e)
+            return None
+    def run(self):
+        """Fetch the first listing page and process every topic on it."""
+        # Single pass for now; the original note marks this spot as the place
+        # for a `while True:` loop once several threads are used.
+        res = self._fetch("http://www.jsly001.com/thread-htm-fid-45-page-1.html")
+        if res is not None:
+            self._format_html(res.text) # hand the page over to the HTML parser
+    def _format_html(self, html):
+        """Extract the topic links from a listing page and visit each topic.
+
+        Args:
+            html: HTML text of a board listing page.
+        """
+        # Parse with the lxml backend
+        soup = BeautifulSoup(html, 'lxml')
+        # Locate the row that separates the board topics, mainly so that
+        # pinned topics above it are not collected
+        part_tr = soup.find(attrs={'class': 'bbs_tr4'})
+        if part_tr is not None:
+            items = part_tr.find_all_next(attrs={"name": "readlink"}) # detail-page links
+        else:
+            items = soup.find_all(attrs={"name": "readlink"})
+        # Visit every topic by title and absolute URL
+        for item in items:
+            href = item.get("href")
+            if not href:
+                continue # element without a target: nothing to follow
+            self._get_imgs(item.text, f'http://www.jsly001.com/{href}')
+    def _get_imgs(self, name, url):
+        """Download a topic page and save the images found in its content area.
+
+        Args:
+            name: Topic title, used as the prefix of the saved file names.
+            url: Absolute URL of the topic page.
+        """
+        res = self._fetch(url)
+        if res is None:
+            return
+        soup = BeautifulSoup(res.text, 'lxml')
+        # Prefer the 'imgList' container and fall back to 'tpc_content'
+        content = soup.find(attrs={'class': 'imgList'})
+        if content is None:
+            content = soup.find(attrs={'class': 'tpc_content'})
+        if content is not None:
+            self._save_img(name, content.find_all('img')) # save the images
+    def _save_img(self, name, imgs):
+        """Download every image with an absolute http(s) URL into ``./imgs/``.
+
+        Args:
+            name: Topic title; any '/' is replaced by '_' in the file name.
+            imgs: Iterable of ``<img>`` elements taken from the topic page.
+
+        Each file is named ``<name>_<id>.jpg`` where ``<id>`` is the ``id`` of
+        the enclosing ``<span>``. When there is no such span or id, the
+        position of the image in *imgs* is used instead, so files do not
+        overwrite each other.
+        """
+        safe_name = name.replace("/", "_") # '/' would be read as a path separator
+        for index, img in enumerate(imgs):
+            url = (img.get("src") or "").strip()
+            if not url.startswith(("http://", "https://")):
+                continue # missing or relative src: skip it
+            span = img.find_parent('span')
+            id_ = span.get("id") if span is not None else None
+            if id_ is None:
+                id_ = f"img{index}"
+            res = self._fetch(url)
+            if res is None:
+                continue # never reuse the response of a previous image
+            try:
+                # The imgs folder must already exist in the working directory;
+                # if it does not, the error is logged and the image skipped
+                with open(f'./imgs/{safe_name}_{id_}.jpg', "wb") as f:
+                    f.write(res.content)
+            except OSError as e:
+                logging.error(e)
+    def _get_headers(self):
+        """Return request headers carrying a randomly chosen crawler user-agent."""
+        uas = [
+            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
+            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
+            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
+            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
+            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
+            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
+            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
+            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
+            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
+            "Sosospider+(+http://help.soso.com/webspider.htm)",
+            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
+        ]
+        return {
+            "user-agent": random.choice(uas)
+        }
+if __name__ == '__main__':
+    # start() executes run() on the new thread; calling run() directly would
+    # execute it on the main thread instead.
+    my_thread = LiYangThread()
+    my_thread.start()
+    my_thread.join() # wait for the crawl to finish before the script exits