diff --git a/NO36/imgs/Thread E_vaccine_PNG70.png b/NO36/imgs/Thread E_vaccine_PNG70.png new file mode 100644 index 0000000000000000000000000000000000000000..88e885f5bb38f3668691d75961c7bd72e5de5bc9 Binary files /dev/null and b/NO36/imgs/Thread E_vaccine_PNG70.png differ diff --git "a/NO36/\345\205\215\346\212\240\345\233\276\347\211\207\344\270\213\350\275\275.py" "b/NO36/\345\205\215\346\212\240\345\233\276\347\211\207\344\270\213\350\275\275.py" new file mode 100644 index 0000000000000000000000000000000000000000..0465d69918d56bd9adb521760e308bd765d25b71 --- /dev/null +++ "b/NO36/\345\205\215\346\212\240\345\233\276\347\211\207\344\270\213\350\275\275.py" @@ -0,0 +1,139 @@ +import random +import logging +import threading +from typing import Optional, Text +import requests +from bs4 import BeautifulSoup +import lxml + +logging.basicConfig(level=logging.WARNING) +thread_lock = threading.Lock() + + +class PngImg(threading.Thread): + # 构造函数 + def __init__(self, thread_name, headers_func, requests_func) -> None: + threading.Thread.__init__(self) + self._headers = headers_func() + self._timeout = 5 + self.requests_func = requests_func + self._thread_name = thread_name + + def run(self) -> None: + bast_host = "http://pngimg.com" + while True: + # 全局锁,获取地址 + thread_lock.acquire() + global all_links + if all_links is None: + break + + list_url = bast_host + all_links.pop().get('href') + thread_lock.release() + print(self._thread_name + " 正在运行,采集的地址是 " + list_url) + + list_html_str = self.requests_func(url=list_url, headers=self._headers, timeout=self._timeout) + ret_imgs = self._get_imgs(list_html_str) + + self._save(ret_imgs) + + def _get_imgs(self, html) -> list: + """获取所有的图片地址 + :return: 图片 list + """ + soup = BeautifulSoup(html, 'lxml') + # 获取图片所在 div 标签 + div_imgs = soup.find_all(attrs={'class': 'png_imgs'}) + # 图片地址为空,用来保存图片 tag + + imgs_src = [] + for div_img in div_imgs: + # 遍历 div 标签,检索后代标签中的 img 图片标签 + imgs_src.append(div_img.a.img.get("src")) + + return imgs_src + + def _save(self, imgs): + """保存图片 """ + for img in imgs: + img = img.replace('small/', '') # 去除 small 标记,获取大图 + img_url = "https://pngimg.com{}".format(img) # 拼接完整图片访问地址 + name = img[img.rfind('/') + 1:] + # print(img_url) + # print(name) + + try: + res = requests.get(url=img_url, headers=self._headers, timeout=self._timeout) + except Exception as e: + logging.error(e) + + if res is not None: + name = name.replace("/", "_") + with open(f'./imgs/{self._thread_name}_{name}', "wb+") as f: + f.write(res.content) + + +def get_headers(): + uas = [ + "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)", + "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)", + "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36", + "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)", + "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)", + "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)", + "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);", + "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)", + "Sosospider+(+http://help.soso.com/webspider.htm)", + "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)" + ] + ua = random.choice(uas) + headers = { + "user-agent": ua + } + return headers + + +# 通用的 requests get 请求方法 +def get_html(url: Text, headers: dict, timeout: int) -> Optional[Text]: + try: + res = requests.get(url=url, headers=headers, timeout=timeout) + except Exception as e: + logging.error(e) + + if res is not None: + return res.text + else: + return None + + +if __name__ == '__main__': + url = "http://pngimg.com/" + headers = get_headers() + # 获取首页的 HTML 数据 + html_str = get_html(url, headers, 5) + # 解析首页的HTML数据,获取所有列表页链接 + soup = BeautifulSoup(html_str, 'lxml') + + + div_parents = soup.find_all(attrs={'class': 'sub_category'}) + + # 获取到所有的详情页地址 + all_links = [] + for div in div_parents: + all_links.extend(div.find_all('a')) + + print("累计获取到",len(all_links),"个列表页数据") + + # 通过第一个地址进行测试 + # first_url = all_links[0] + # + # list_url = first_url.get('href') + # bast_host = "http://pngimg.com" + # real_url = bast_host + list_url + + threads = ["Thread A", "Thread B", "Thread C", "Thread D", "Thread E"] + for t in threads: + my_thread = PngImg(t, get_headers, get_html) + my_thread.start() diff --git a/README.md b/README.md index 6182ce9fb848e197d497d0f562bb9609d7c39867..78598a1753ce3d1d8c5d0c0e4b8cfd93f966d7f6 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ 34. [在120篇系列专栏中,才能学会 python beautifulsoup4 模块,7000字博客+爬第九工场网](https://dream.blog.csdn.net/article/details/120384794) 35. [都说python是万能的,这次用python看溧阳摄影圈,真不错](https://dream.blog.csdn.net/article/details/120407050) -36. pngimg.com 透明 PNG 图片站采集 +36. [全程干货,用 python 下载某站全部【免抠图片】,图片背景透明,格式PNG](https://dream.blog.csdn.net/article/details/120414397) ### 📙 协程学习