diff --git a/NO38/new_imgs/1632486604.0140002.jpg b/NO38/new_imgs/1632486604.0140002.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6b8a8d9025e962df78baf1c30c0dbc33bc80d85a
Binary files /dev/null and b/NO38/new_imgs/1632486604.0140002.jpg differ
diff --git "a/NO38/\351\272\246\346\266\251\347\275\221 MyCoser\345\215\217\347\250\213\347\210\254\350\231\253.py" "b/NO38/\351\272\246\346\266\251\347\275\221 MyCoser\345\215\217\347\250\213\347\210\254\350\231\253.py"
new file mode 100644
index 0000000000000000000000000000000000000000..ebe138e0a30131f6b269faa108dd1999fe94602e
--- /dev/null
+++ "b/NO38/\351\272\246\346\266\251\347\275\221 MyCoser\345\215\217\347\250\213\347\210\254\350\231\253.py"
@@ -0,0 +1,58 @@
+import asyncio
+import time
+
+import requests
+from bs4 import BeautifulSoup  # the 'lxml' parser backend must be installed
+
+
+async def get(url):
+    # NOTE: requests is blocking, so each call stalls the event loop until it returns
+    return requests.get(url)
+
+
+async def get_html(url):
+    print("Preparing to fetch:", url)
+    res = await get(url)
+    return res.text
+
+
+async def save_img(img_url):
+    # thumbMid_5ae3e05fd3945 -- swap the thumbnail path for the mid-size image
+    img_url = img_url.replace('thumb', 'thumbMid')
+    img_url = "http://mycoser.com/" + img_url
+    print("Downloading image:", img_url)
+    res = await get(img_url)
+    if res is not None:
+        with open(f'./new_imgs/{time.time()}.jpg', 'wb') as f:
+            f.write(res.content)
+        return img_url, "ok"
+
+
+async def main(url_list):
+    # Create a task for each URL in this batch (5 per batch)
+    tasks = [asyncio.ensure_future(get_html(url)) for url in url_list]
+
+    dones, pending = await asyncio.wait(tasks)
+    for task in dones:
+        html = task.result()
+        soup = BeautifulSoup(html, 'lxml')
+        divimg_tags = soup.find_all(attrs={'class': 'workimage'})
+
+        for div in divimg_tags:
+            ret = await save_img(div.a.img["data-original"])
+            print(ret)
+
+
+if __name__ == '__main__':
+    urls = [f"http://mycoser.com/picture/lists/p/{page}" for page in range(1, 17)]
+    total_page = len(urls) // 5 if len(urls) % 5 == 0 else len(urls) // 5 + 1
+    # Slice the urls list into batches so they can be crawled in rounds
+    for page in range(0, total_page):
+        start_page = 0 if page == 0 else page * 5
+        end_page = (page + 1) * 5
+
+        # Alternative: create and drive the event loop manually
+        # loop = asyncio.get_event_loop()
+        #
+        # loop.run_until_complete(main(urls[start_page:end_page]))
+        asyncio.run(main(urls[start_page:end_page]))
diff --git a/README.md b/README.md
index baed5383373a33fde3fdf96a7c79aefebbb32356..214f717db877623dc9989e2e346b311ab6f7ee36 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,7 @@
 ### 📙 协程学习
 
 37. [python 爬虫爱好者必须掌握的知识点“ 协程爬虫”,看一下如何用 gevent 采集女生用头像](https://dream.blog.csdn.net/article/details/120421824)
-38. 麦涩网 MyCoser|cosplay 采集,asyncio 库学习
+38. [python协程总学不会?不可能的,边学协程边采集Coser图吧!](https://dream.blog.csdn.net/article/details/120445004)
 39. 中少绘本 MP4 视频采集,asyncio 协程第3篇
 40. Bensound 站 MP3 采集,asyncio + aiohttp 协程第4篇
 41. 历史剧网采集,协程并发控制
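Review note: `get()` wraps the blocking `requests.get` call, so the five downloads in each batch still run one after another even though they are scheduled as coroutines. Below is a minimal sketch of one way to keep `requests` but stop it from stalling the event loop, assuming Python 3.9+ for `asyncio.to_thread`; the `fetch`/`demo` names and the `timeout=10` value are illustrative and not part of this commit.

```python
import asyncio

import requests


async def fetch(url):
    # Run the blocking requests.get call in a worker thread so the event
    # loop stays free to schedule the other downloads in the batch.
    return await asyncio.to_thread(requests.get, url, timeout=10)


async def demo():
    urls = [f"http://mycoser.com/picture/lists/p/{page}" for page in range(1, 6)]
    # gather() drives all five requests concurrently instead of one by one
    responses = await asyncio.gather(*(fetch(url) for url in urls))
    for res in responses:
        print(res.status_code, len(res.text))


if __name__ == '__main__':
    asyncio.run(demo())
```

The same structure could later be swapped for aiohttp (item 40 in the README) without touching the parsing code.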
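The `total_page` ceiling division plus the `start_page`/`end_page` bookkeeping in `__main__` can also be written as a stepped `range`, where slicing handles the short final batch automatically. A standalone sketch of that equivalent batching, shown only for comparison:

```python
urls = [f"http://mycoser.com/picture/lists/p/{page}" for page in range(1, 17)]

# range() with a step of 5 yields the start indices 0, 5, 10, 15;
# slicing past the end of the list simply returns the shorter tail batch.
for start in range(0, len(urls), 5):
    batch = urls[start:start + 5]
    print(start, batch)  # in the script each batch would be passed to main()
```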