From 8690fb957df4980e70dbc709798aaf1691bd10f1 Mon Sep 17 00:00:00 2001
From: hihell
Date: Thu, 21 Oct 2021 10:11:43 +0800
Subject: [PATCH] Case 41, the last article on coroutines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...55\347\232\204\345\272\224\347\224\250.py" | 38 +++++++++++++++++
 ...66\344\277\241\345\217\267\351\207\217.py" | 42 +++++++++++++++++++
 ...66\350\277\236\346\216\245\346\225\260.py" | 31 ++++++++++++++
 ...32\345\244\232\347\272\277\347\250\213.py" | 38 +++++++++++++++++
 README.md                                     |  8 ++--
 5 files changed, 153 insertions(+), 4 deletions(-)
 create mode 100644 "NO41/Semaphore \345\234\250\345\215\217\347\250\213\344\270\255\347\232\204\345\272\224\347\224\250.py"
 create mode 100644 "NO41/Semaphore \346\216\247\345\210\266\344\277\241\345\217\267\351\207\217.py"
 create mode 100644 "NO41/TCPConnector \351\231\220\345\210\266\350\277\236\346\216\245\346\225\260.py"
 create mode 100644 "NO41/\346\231\256\351\200\232\345\244\232\347\272\277\347\250\213.py"

diff --git "a/NO41/Semaphore \345\234\250\345\215\217\347\250\213\344\270\255\347\232\204\345\272\224\347\224\250.py" "b/NO41/Semaphore \345\234\250\345\215\217\347\250\213\344\270\255\347\232\204\345\272\224\347\224\250.py"
new file mode 100644
index 0000000..64291ad
--- /dev/null
+++ "b/NO41/Semaphore \345\234\250\345\215\217\347\250\213\344\270\255\347\232\204\345\272\224\347\224\250.py"
@@ -0,0 +1,38 @@
+import time
+
+import asyncio
+import aiohttp
+from bs4 import BeautifulSoup
+
+
+async def get_title(semaphore, url):
+    async with semaphore:
+        print("Crawling:", url)
+        async with aiohttp.request('GET', url) as res:
+            html = await res.text()
+            soup = BeautifulSoup(html, 'html.parser')
+            title_tags = soup.find_all(attrs={'class': 'item-title'})
+            event_names = [item.a.text for item in title_tags]
+            print(event_names)
+
+
+async def main():
+    semaphore = asyncio.Semaphore(10)  # allow at most 10 coroutines to run concurrently
+    tasks = [asyncio.ensure_future(get_title(semaphore, "http://www.lishiju.net/hotevents/p{}".format(i))) for i in
+             range(111)]
+    dones, pendings = await asyncio.wait(tasks)
+    # for task in dones:
+    #     print(len(task.result()))
+
+
+if __name__ == '__main__':
+
+    start_time = time.perf_counter()
+    asyncio.run(main())
+    print("Code run time:", time.perf_counter() - start_time)
+
+    # # Create the event loop.
+    # event_loop = asyncio.get_event_loop()
+    # # Start the event loop and wait for the coroutine main() to finish.
+    # event_loop.run_until_complete(main())
+    # # Code run time: 2.227831242
diff --git "a/NO41/Semaphore \346\216\247\345\210\266\344\277\241\345\217\267\351\207\217.py" "b/NO41/Semaphore \346\216\247\345\210\266\344\277\241\345\217\267\351\207\217.py"
new file mode 100644
index 0000000..d1dc944
--- /dev/null
+++ "b/NO41/Semaphore \346\216\247\345\210\266\344\277\241\345\217\267\351\207\217.py"
@@ -0,0 +1,42 @@
+import threading
+import time
+
+import requests
+from bs4 import BeautifulSoup
+
+
+class MyThread(threading.Thread):
+    def __init__(self, url):
+        threading.Thread.__init__(self)
+        self.__url = url
+
+    def run(self):
+        if semaphore.acquire():  # counter -1; blocks while the counter is 0
+            print("Crawling:", self.__url)
+            res = requests.get(url=self.__url)
+            soup = BeautifulSoup(res.text, 'html.parser')
+            title_tags = soup.find_all(attrs={'class': 'item-title'})
+            event_names = [item.a.text for item in title_tags]
+            print(event_names)
+            print("")
+            semaphore.release()  # counter +1
+
+
+if __name__ == "__main__":
+    semaphore = threading.Semaphore(5)  # allow at most 5 threads to run concurrently
+    start_time = time.perf_counter()
+    threads = []
+    for i in range(111):  # create 111 threads (pages 0 through 110)
+        threads.append(MyThread(url="http://www.lishiju.net/hotevents/p{}".format(i)))
+    for t in threads:
+        t.start()  # start all 111 threads
+
+    for t in threads:
+        t.join()  # wait for every thread to finish
+
+    print("Total time elapsed:", time.perf_counter() - start_time)
+    # Total time elapsed: 2.8005530640000003
+
+
+
+
diff --git "a/NO41/TCPConnector \351\231\220\345\210\266\350\277\236\346\216\245\346\225\260.py" "b/NO41/TCPConnector \351\231\220\345\210\266\350\277\236\346\216\245\346\225\260.py"
new file mode 100644
index 0000000..a1146e5
--- /dev/null
+++ "b/NO41/TCPConnector \351\231\220\345\210\266\350\277\236\346\216\245\346\225\260.py"
@@ -0,0 +1,31 @@
+import time
+
+import asyncio
+import aiohttp
+from bs4 import BeautifulSoup
+
+
+async def get_title(session, url):
+    async with session.get(url) as res:
+        print("Crawling:", url)
+        html = await res.text()
+        soup = BeautifulSoup(html, 'html.parser')
+        title_tags = soup.find_all(attrs={'class': 'item-title'})
+        event_names = [item.a.text for item in title_tags]
+        print(event_names)
+
+
+
+async def main():
+    connector = aiohttp.TCPConnector(limit=1)  # limit the number of simultaneous connections
+    async with aiohttp.ClientSession(connector=connector) as session:
+        tasks = [asyncio.ensure_future(get_title(session, "http://www.lishiju.net/hotevents/p{}".format(i))) for i in
+                 range(111)]
+        await asyncio.wait(tasks)
+
+
+
+if __name__ == '__main__':
+    start_time = time.perf_counter()
+    asyncio.run(main())
+    print("Code run time:", time.perf_counter() - start_time)
diff --git "a/NO41/\346\231\256\351\200\232\345\244\232\347\272\277\347\250\213.py" "b/NO41/\346\231\256\351\200\232\345\244\232\347\272\277\347\250\213.py"
new file mode 100644
index 0000000..6ad3027
--- /dev/null
+++ "b/NO41/\346\231\256\351\200\232\345\244\232\347\272\277\347\250\213.py"
@@ -0,0 +1,38 @@
+import threading
+import time
+
+import requests
+from bs4 import BeautifulSoup
+
+
+class MyThread(threading.Thread):
+    def __init__(self, url):
+        threading.Thread.__init__(self)
+        self.__url = url
+
+    def run(self):
+
+        print("Crawling:", self.__url)
+        res = requests.get(url=self.__url)
+        soup = BeautifulSoup(res.text, 'html.parser')
+        title_tags = soup.find_all(attrs={'class': 'item-title'})
+        event_names = [item.a.text for item in title_tags]
+        print(event_names)
+        print("")
+
+
+
+if __name__ == "__main__":
+
+    start_time = time.perf_counter()
+    threads = []
+    for i in range(111):  # create 111 threads (pages 0 through 110)
+        threads.append(MyThread(url="http://www.lishiju.net/hotevents/p{}".format(i)))
+    for t in threads:
+        t.start()  # start all 111 threads
+
+    for t in threads:
+        t.join()  # wait for every thread to finish
+
+    print("Total time elapsed:", time.perf_counter() - start_time)
+    # Total time elapsed: 1.537718624
diff --git a/README.md b/README.md
index 214f717..169ec4e 100644
--- a/README.md
+++ b/README.md
@@ -79,11 +79,11 @@
 37. [Must-know for Python crawler fans, "coroutine crawlers": see how to collect avatar images with gevent](https://dream.blog.csdn.net/article/details/120421824)
 38. [Still can't master Python coroutines? Impossible! Learn coroutines while collecting Coser images](https://dream.blog.csdn.net/article/details/120445004)
-39. Zhongshao picture-book MP4 video crawl, asyncio coroutines part 3
-40. Bensound MP3 crawl, asyncio + aiohttp coroutines part 4
-41. Historical-drama site crawl, coroutine concurrency control
-
+39. [Have you become a "dad programmer" yet? Use Python to download 200+ picture-book animations for your kid, coroutines round 3](https://dream.blog.csdn.net/article/details/120463479)
+40. [Python coroutines lesson 4: the target data is MP3, the target site is bensound.com](https://dream.blog.csdn.net/article/details/120507981)
+41. [One more Python coroutine knowledge point: limiting concurrency, a must-have data collection skill](https://dream.blog.csdn.net/article/details/120879805)
+### 📘 scrapy library study
--
GitLab
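
A note on combining the techniques in this patch: the asyncio Semaphore script throttles concurrency but opens a fresh connection for every request through aiohttp.request(), while the TCPConnector script reuses connections but throttles only at the connector. Below is a minimal sketch of the combined pattern, one shared ClientSession plus an asyncio.Semaphore. The limit of 10, the lishiju.net URL template, and the item-title class are taken from the patch; the helper name fetch_titles and the switch from asyncio.wait to asyncio.gather are illustrative assumptions, not part of the patch.

    import asyncio
    import time

    import aiohttp
    from bs4 import BeautifulSoup


    async def fetch_titles(semaphore, session, url):
        # Hypothetical helper: the semaphore caps in-flight requests,
        # while the shared session reuses TCP connections between them.
        async with semaphore:
            async with session.get(url) as res:
                html = await res.text()
        # Parse outside the semaphore block so a slot is freed as soon
        # as the response body has been read.
        soup = BeautifulSoup(html, 'html.parser')
        title_tags = soup.find_all(attrs={'class': 'item-title'})
        return [item.a.text for item in title_tags]


    async def main():
        semaphore = asyncio.Semaphore(10)  # at most 10 requests in flight
        async with aiohttp.ClientSession() as session:
            tasks = [fetch_titles(semaphore, session,
                                  "http://www.lishiju.net/hotevents/p{}".format(i))
                     for i in range(111)]
            results = await asyncio.gather(*tasks)
        print("Pages collected:", len(results))


    if __name__ == '__main__':
        start_time = time.perf_counter()
        asyncio.run(main())
        print("Code run time:", time.perf_counter() - start_time)

Unlike asyncio.wait, asyncio.gather returns results in the order the coroutines were submitted, so the per-page title lists can be consumed directly without unwrapping Task objects.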