...
 
Commits (11)

- Update LICENSE.txt (lx <45314745+lixi5338619@users.noreply.github.com>, 2021-02-26T09:14:40+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/033f5189dc73af23aa81c6bf5e195e01fedf706f
- Update README.md (lx <45314745+lixi5338619@users.noreply.github.com>, 2021-08-25T11:13:53+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/384444919ee2f2b2700b382ade7ff73eac35482b
- Update request.py (lx <45314745+lixi5338619@users.noreply.github.com>, 2021-09-03T11:34:26+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/0ce006528896257c19c39827a0ad70f7b6e4438d
- Update spider.py: fix the exception raised after the spider finishes (RuntimeError: Event loop stopped before Future completed) (李玺 <45314745+lixi5338619@users.noreply.github.com>, 2022-04-29T09:53:11+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/32e3a8e1dc7a048c0557339852983522aa0b9082
- Update setup.py (李玺 <45314745+lixi5338619@users.noreply.github.com>, 2022-06-08T18:12:33+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/56a1e2f5c7bacac1f3990cd491eba00c41ab807f
- Fix the all_tasks issue (李玺 <45314745+lixi5338619@users.noreply.github.com>, 2022-06-08T18:13:29+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/c00ae6cb02756d242c68f676b8f053d6a2d8606a
- Update README.md (李玺 <45314745+lixi5338619@users.noreply.github.com>, 2022-09-21T15:41:14+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/a8c0c79d45c83ee88ffcee6a4b9f6e52916b7007
- Update README.md (李玺 <45314745+lixi5338619@users.noreply.github.com>, 2022-09-21T17:34:20+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/88a7053664b5e05c0ecf2c1d9fde4af95c0d37a7
- update (李玺 <125066648@qq.com>, 2022-09-21T17:41:33+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/67927f726646da12ddc5fddd665bf863715d8650
- collections (李玺 <45314745+lixi5338619@users.noreply.github.com>, 2022-10-23T13:02:21+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/74223d4748d8c90397828d65aa51d6e0e9ad7851
- Update __init__.py (李玺 <45314745+lixi5338619@users.noreply.github.com>, 2022-10-23T13:11:11+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/b47a0c7ba8e0ddad7021635907f1e33f889a021b
# asyncpy
An asynchronous web-crawler framework built on asyncio and aiohttp.
<img src="https://img-blog.csdnimg.cn/20200523121741871.png?x-oss-process=image/resize,m_fixed,h_224,w_224"/>
Asyncpy is a lightweight, efficient crawler framework I built on top of asyncio and aiohttp. It adopts Scrapy's design patterns and borrows processing logic from several open-source frameworks on GitHub.
---
## Changelog
- 1.1.7: fixed the error raised when the event loop shuts down
- 1.1.8: settings_attr no longer needs to be imported manually in the spider file
- - -
Documentation: [https://blog.csdn.net/weixin_43582101/article/details/106320674](https://blog.csdn.net/weixin_43582101/article/details/106320674)
......@@ -69,7 +76,8 @@ asyncpy genspider demo
|PIPELINES|Pipelines|
|MIDDLEWARE|Middleware|
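For reference, here is a sketch of how these keys appear in a generated settings.py; the values mirror the settings.py diff further below and are only illustrative defaults:

```python
# settings.py -- project-level configuration (illustrative values)
CONCURRENT_REQUESTS = 20   # concurrency limit used by the spider's semaphore

MIDDLEWARE = [
    # 'middlewares.middleware',
]

# PIPELINES = []
```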
Before version 1.1.8, to enable the global settings you had to pass settings into the spider file via settings_attr:
```python
import settings

class DemoSpider(Spider):
    ...
    start_urls = []
    settings_attr = settings
```
**In the new version there is no need to pass in settings manually.**
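As a minimal sketch, a 1.1.8+ spider can therefore look like the project template with the settings import removed (the spider name here is a placeholder):

```python
from asyncpy.spider import Spider

class DemoSpider(Spider):
    name = 'demo'        # placeholder name
    start_urls = []

    async def parse(self, response):
        # settings.py from the project directory is discovered automatically,
        # so no settings_attr assignment is needed.
        pass
```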
- - -
### Custom settings
If you need to configure settings for a single spider file, you can declare **custom_settings** in the spider file, just as in Scrapy.
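A hedged sketch of what that can look like; the keys mirror DEFAULT_REQUEST_CONFIG from settings.py, how they are forwarded to requests is version-dependent, and the values are only examples:

```python
from asyncpy.spider import Spider

class DemoSpider(Spider):
    name = 'demo'
    start_urls = []

    # per-spider overrides of the request config (keys follow DEFAULT_REQUEST_CONFIG)
    custom_settings = {
        "RETRIES": 2,
        "DOWNLOAD_DELAY": 1,
        "RETRY_DELAY": 1,
        "DOWNLOAD_TIMEOUT": 30,
    }

    async def parse(self, response):
        pass
```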
......
Copyright (c) 2020 [Lixi]
Author: Lx
QQ: 993066119
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
......@@ -17,4 +17,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
......@@ -20,7 +20,7 @@ from docopt import docopt
__all__ = ["Middleware","Request","Response","Spider","IgnoreThisItem"]
VERSION = '1.1.5'
VERSION = '1.2.1'
DEFAULT_ENCODING = 'utf-8'
......@@ -58,4 +58,4 @@ def cli():
    argv = docopt(__doc__, version=VERSION)
    if argv.get('genspider'):
        name = argv['<name>']
        create_base(name=name)
......@@ -19,7 +19,6 @@ class Request(object):
name = "Request"
REQUEST_CONFIG = settings.DEFAULT_REQUEST_CONFIG
REQUEST_CONFIG["RETRY_FUNC"] = Coroutine
REQUEST_CONFIG["VALID"] = Coroutine
......@@ -31,6 +30,7 @@ class Request(object):
settings_attr=None,
callback=None, encoding: Optional[str] = None,
headers: dict = None,
cookies: dict = None,
meta: dict = None, custom_settings: dict = None,
request_session=None, **aiohttp_kwargs):
self.url = url
......@@ -43,16 +43,18 @@ class Request(object):
self.callback = callback
self.encoding = encoding
self.headers = headers or {}
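# cookies (added alongside headers) are forwarded to aiohttp's get/post calls in _make_request()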
self.cookies = cookies or {}
self.meta = meta or {}
self.request_session = request_session
self.settings_attr = settings_attr or {}
if self.settings_attr and settings_attr.get('USER_AGENT'):
    self.headers['User-Agent'] = settings_attr.get('USER_AGENT')
self.request_settings = self.REQUEST_CONFIG
if self.settings_attr and settings_attr.get('DEFAULT_REQUEST_CONFIG'):
    self.request_settings = settings_attr.get('DEFAULT_REQUEST_CONFIG')
if custom_settings:
    self.request_settings = custom_settings
......@@ -77,7 +79,6 @@ class Request(object):
return self.request_session
async def fetch(self, delay=True) -> Response:
    """Fetch all the information by using aiohttp"""
    if delay and self.request_settings.get("DOWNLOAD_DELAY", 0) > 0:
        await asyncio.sleep(self.request_settings["DOWNLOAD_DELAY"])
......@@ -150,15 +151,15 @@ class Request(object):
await self.request_session.close()
async def _make_request(self):
    """Send the request via aiohttp."""
    self.logger.info(f"<{self.method}: {self.url}>")
    if self.method == "GET":
        request_func = self.current_request_session.get(
            self.url, headers=self.headers, ssl=self.ssl, **self.aiohttp_kwargs
            self.url, headers=self.headers, cookies=self.cookies, ssl=self.ssl, **self.aiohttp_kwargs
        )
    else:
        request_func = self.current_request_session.post(
            self.url, headers=self.headers, ssl=self.ssl, **self.aiohttp_kwargs
            self.url, headers=self.headers, cookies=self.cookies, ssl=self.ssl, **self.aiohttp_kwargs
        )
    resp = await request_func
    return resp
......@@ -196,4 +197,4 @@ class Request(object):
return response
def __repr__(self):
    return f"<{self.method} {self.url}>"
# -*- coding: utf-8 -*-
import json
from typing import Any, Callable, Optional
from http.cookies import SimpleCookie
......
......@@ -6,7 +6,7 @@
CREATE YOUR DEFAULT_CONFIG !
Some configuration:
CONCURRENT_REQUESTS    concurrency (number of simultaneous requests)
CONCURRENT_REQUESTS    number of worker threads
RETRIES                number of retries
DOWNLOAD_DELAY         download delay
RETRY_DELAY            retry delay
......
# -*- coding: utf-8 -*-
import os
import asyncio
import collections
from collections.abc import Iterable
import typing
import weakref
import traceback
......@@ -22,10 +22,6 @@ import importlib
import sys
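# uvloop (optional): when installed, it swaps in a faster drop-in replacement for the default asyncio event loop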
try:
    import uvloop
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
......@@ -37,7 +33,6 @@ class SpiderHook:
"""
SpiderHook is used to extend the spider
"""
callback_result_map: dict = None
async def _run_spider_hook(self, hook_func):
......@@ -100,7 +95,7 @@ class Spider(SpiderHook):
"""
Spider is used to control requests better
"""
name = None
name = ''
custom_settings = None
settings_attr = None
# Default values passing to each request object. Not implemented yet.
......@@ -142,7 +137,7 @@ class Spider(SpiderHook):
"""
if not isinstance(self.start_urls, collections.Iterable):
if not isinstance(self.start_urls, Iterable):
    raise ValueError(
        "start_urls must be a Iterable object"
    )
......@@ -171,19 +166,27 @@ class Spider(SpiderHook):
else:
    self.middleware = middleware or Middleware()
# async queue as a producer
self.request_queue = asyncio.Queue()
if not self.settings_attr:
    from asyncpy import settings
    try:
        sys.path.append(os.path.dirname(os.getcwd()))
        import settings
        self.settings_attr = get_attrs(settings)
    except:
        from asyncpy import settings
        self.settings_attr = get_attrs(settings)
        lg = get_logger()
        lg.warning("【import settings Not Found,Please check path or project】")
else:
    self.settings_attr = get_attrs(self.settings_attr)

self.concurrency = self.settings_attr.get('CONCURRENT_REQUESTS')
if not self.concurrency:
    self.concurrency = settings.CONCURRENT_REQUESTS

# set logger
if isinstance(self.settings_attr, dict) and self.settings_attr.get('LOG_FILE'):
    LOG_FILE, LOG_LEVEL = self.settings_attr.get('LOG_FILE'), self.settings_attr.get('LOG_LEVEL', 'INFO')
    self.logger = get_logger(name=self.name, filename=LOG_FILE, level=LOG_LEVEL)
......@@ -194,6 +197,9 @@ class Spider(SpiderHook):
self.logger = check_logger(name=self.name)
if not self.concurrency:
    self.concurrency = settings.CONCURRENT_REQUESTS
# semaphore, used for concurrency control
self.sem = asyncio.Semaphore(self.concurrency)
......@@ -203,8 +209,10 @@ class Spider(SpiderHook):
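# note: asyncio.Task.all_tasks() / Task.current_task() were removed in newer Python releases, hence the switch to the module-level asyncio.all_tasks() / asyncio.current_task() below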
async def _cancel_tasks(self):
    tasks = []
    # for task in asyncio.Task.all_tasks():
    #     if task is not asyncio.tasks.Task.current_task():
    for task in asyncio.all_tasks():
        if task is not asyncio.current_task():
            tasks.append(task)
            task.cancel()
    await asyncio.gather(*tasks, return_exceptions=True)
......@@ -301,10 +309,11 @@ class Spider(SpiderHook):
signal, lambda: asyncio.ensure_future(self.stop(signal))
)
except NotImplementedError:
    # self.logger.warning(
    #     f"{self.name} tried to use loop.add_signal_handler "
    #     "but it is not implemented on this platform."
    # )
    ...
# Run hook before spider start crawling
await self._run_spider_hook(after_start)
......@@ -401,8 +410,6 @@ class Spider(SpiderHook):
# pipelines_list.append(pipelines)
loop = loop or asyncio.new_event_loop()
spider_ins = cls(middleware=middleware, loop=loop, **spider_kwargs,pipelines=pipelines)
......@@ -544,8 +551,8 @@ class Spider(SpiderHook):
for i in range(self.worker_numbers)
]
self.logger.info(f"Worker started")
# for worker in workers:
#     self.logger.info(f"ensure_future started_worker: {id(worker)}")
await self.request_queue.join()
......@@ -586,4 +593,4 @@ class Spider(SpiderHook):
"""
self.logger.info(f"Asyncpy finished spider: {self.name}")
await self._cancel_tasks()
# self.loop.stop()
# -*- coding: utf-8 -*-
"""
CREATE YOUR DEFAULT_CONFIG !
Some configuration:
CONCURRENT_REQUESTS    concurrency (number of simultaneous requests)
CONCURRENT_REQUESTS    number of worker threads
RETRIES                number of retries
DOWNLOAD_DELAY         download delay
RETRY_DELAY            retry delay
......@@ -23,13 +21,9 @@ CONCURRENT_REQUESTS = 20
MIDDLEWARE = [
    # 'middlewares.middleware',
]

# PIPELINES = []
......@@ -38,7 +32,7 @@ DEFAULT_REQUEST_CONFIG = {
"RETRIES": 0,
"DOWNLOAD_DELAY": 0,
"RETRY_DELAY": 0,
"DOWNLOAD_TIMEOUT": 10,
"DOWNLOAD_TIMEOUT": 30,
}
......@@ -48,9 +42,5 @@ DEFAULT_REQUEST_CONFIG = {
# CLOSESPIDER_TIMEOUT = 10
#USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"
#USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"
# -*- coding: utf-8 -*-
from asyncpy.spider import Spider
import settings

class DemoSpider(Spider):
    name = 'templates'
    settings_attr = settings
    start_urls = []

    async def parse(self, response):
        pass
......
......@@ -10,7 +10,7 @@ with open("README.md", "r",encoding='utf-8') as fh:
setup(
name="asyncpy",
url="https://github.com/lixi5338619/asyncpy.git",
version= '1.1.5',
version= '1.1.9',
description="Use asyncio and aiohttp's concatenated web crawler framework",
long_description=long_description,
author="lx",
......@@ -37,4 +37,4 @@ setup(
## python setup.py sdist bdist_wheel
## twine upload dist/*