...
 
Commits (11)

- Update LICENSE.txt (lx <45314745+lixi5338619@users.noreply.github.com>, 2021-02-26T09:14:40+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/033f5189dc73af23aa81c6bf5e195e01fedf706f
- Update README.md (lx <45314745+lixi5338619@users.noreply.github.com>, 2021-08-25T11:13:53+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/384444919ee2f2b2700b382ade7ff73eac35482b
- Update request.py (lx <45314745+lixi5338619@users.noreply.github.com>, 2021-09-03T11:34:26+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/0ce006528896257c19c39827a0ad70f7b6e4438d
- Update spider.py: fix the exception raised after the spider finishes (RuntimeError: Event loop stopped before Future completed) (李玺 <45314745+lixi5338619@users.noreply.github.com>, 2022-04-29T09:53:11+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/32e3a8e1dc7a048c0557339852983522aa0b9082
- Update setup.py (李玺 <45314745+lixi5338619@users.noreply.github.com>, 2022-06-08T18:12:33+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/56a1e2f5c7bacac1f3990cd491eba00c41ab807f
- Fix the all_tasks issue (李玺 <45314745+lixi5338619@users.noreply.github.com>, 2022-06-08T18:13:29+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/c00ae6cb02756d242c68f676b8f053d6a2d8606a
- Update README.md (李玺 <45314745+lixi5338619@users.noreply.github.com>, 2022-09-21T15:41:14+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/a8c0c79d45c83ee88ffcee6a4b9f6e52916b7007
- Update README.md (李玺 <45314745+lixi5338619@users.noreply.github.com>, 2022-09-21T17:34:20+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/88a7053664b5e05c0ecf2c1d9fde4af95c0d37a7
- update (李玺 <125066648@qq.com>, 2022-09-21T17:41:33+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/67927f726646da12ddc5fddd665bf863715d8650
- collections (李玺 <45314745+lixi5338619@users.noreply.github.com>, 2022-10-23T13:02:21+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/74223d4748d8c90397828d65aa51d6e0e9ad7851
- Update __init__.py (李玺 <45314745+lixi5338619@users.noreply.github.com>, 2022-10-23T13:11:11+08:00)
  https://gitcode.net/weixin_43582101/asyncpy/-/commit/b47a0c7ba8e0ddad7021635907f1e33f889a021b
# asyncpy
An asynchronous web-crawler framework built on asyncio and aiohttp.
<img src="https://img-blog.csdnimg.cn/20200523121741871.png?x-oss-process=image/resize,m_fixed,h_224,w_224"/>
Asyncpy is a lightweight, efficient crawler framework I built on top of asyncio and aiohttp. It adopts Scrapy's design patterns and borrows processing logic from several open-source frameworks on GitHub.
---
## Changelog
- 1.1.7: fixed the error raised when the event loop shuts down
- 1.1.8: settings_attr no longer needs to be imported manually in the spider file
- - -
Documentation: [https://blog.csdn.net/weixin_43582101/article/details/106320674](https://blog.csdn.net/weixin_43582101/article/details/106320674)
......@@ -69,7 +76,8 @@ asyncpy genspider demo
|PIPELINES|Pipelines|
|MIDDLEWARE|Middleware|
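For reference, here is a sketch of how these keys appear in a generated settings.py; the values mirror the settings.py diff further below and are only illustrative defaults:

```python
# settings.py -- project-level configuration (illustrative values)
CONCURRENT_REQUESTS = 20   # concurrency limit used by the spider's semaphore

MIDDLEWARE = [
    # 'middlewares.middleware',
]

# PIPELINES = []
```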
Before version 1.1.8, to enable the global settings you had to pass settings into the spider file via settings_attr:
```python
import settings

class DemoSpider(Spider):
    ...
    start_urls = []
    settings_attr = settings
```
**In the new version there is no need to pass in settings manually.**
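As a minimal sketch, a 1.1.8+ spider can therefore look like the project template with the settings import removed (the spider name here is a placeholder):

```python
from asyncpy.spider import Spider

class DemoSpider(Spider):
    name = 'demo'        # placeholder name
    start_urls = []

    async def parse(self, response):
        # settings.py from the project directory is discovered automatically,
        # so no settings_attr assignment is needed.
        pass
```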
- - -
### Custom settings
If you need to configure settings for a single spider file, you can declare **custom_settings** in the spider file, just as in Scrapy.
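A hedged sketch of what that can look like; the keys mirror DEFAULT_REQUEST_CONFIG from settings.py, how they are forwarded to requests is version-dependent, and the values are only examples:

```python
from asyncpy.spider import Spider

class DemoSpider(Spider):
    name = 'demo'
    start_urls = []

    # per-spider overrides of the request config (keys follow DEFAULT_REQUEST_CONFIG)
    custom_settings = {
        "RETRIES": 2,
        "DOWNLOAD_DELAY": 1,
        "RETRY_DELAY": 1,
        "DOWNLOAD_TIMEOUT": 30,
    }

    async def parse(self, response):
        pass
```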
......
Copyright (c) 2020 [Lixi]
Author: Lx
QQ: 993066119
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
......@@ -17,4 +17,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
......@@ -20,7 +20,7 @@ from docopt import docopt
__all__ = ["Middleware","Request","Response","Spider","IgnoreThisItem"]
VERSION = '1.1.5'
VERSION = '1.2.1'
DEFAULT_ENCODING = 'utf-8'
......@@ -58,4 +58,4 @@ def cli():
    argv = docopt(__doc__, version=VERSION)
    if argv.get('genspider'):
        name = argv['<name>']
        create_base(name=name)
......@@ -19,7 +19,6 @@ class Request(object):
name = "Request"
REQUEST_CONFIG = settings.DEFAULT_REQUEST_CONFIG
REQUEST_CONFIG["RETRY_FUNC"] = Coroutine
REQUEST_CONFIG["VALID"] = Coroutine
......@@ -31,6 +30,7 @@ class Request(object):
settings_attr=None,
callback=None, encoding: Optional[str] = None,
headers: dict = None,
cookies: dict = None,
meta: dict = None, custom_settings: dict = None,
request_session=None, **aiohttp_kwargs):
self.url = url
......@@ -43,16 +43,18 @@ class Request(object):
self.callback = callback
self.encoding = encoding
self.headers = headers or {}
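# cookies (added alongside headers) are forwarded to aiohttp's get/post calls in _make_request()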
self.cookies = cookies or {}
self.meta = meta or {}
self.request_session = request_session
self.settings_attr = settings_attr or {}
if self.settings_attr and settings_attr.get('USER_AGENT'):
    self.headers['User-Agent'] = settings_attr.get('USER_AGENT')
self.request_settings = self.REQUEST_CONFIG
if self.settings_attr and settings_attr.get('DEFAULT_REQUEST_CONFIG'):
    self.request_settings = settings_attr.get('DEFAULT_REQUEST_CONFIG')
if custom_settings:
    self.request_settings = custom_settings
......@@ -77,7 +79,6 @@ class Request(object):
return self.request_session
async def fetch(self, delay=True) -> Response:
    """Fetch all the information by using aiohttp"""
    if delay and self.request_settings.get("DOWNLOAD_DELAY", 0) > 0:
        await asyncio.sleep(self.request_settings["DOWNLOAD_DELAY"])
......@@ -150,15 +151,15 @@ class Request(object):
await self.request_session.close()
async def _make_request(self):
    """Send the request via aiohttp."""
    self.logger.info(f"<{self.method}: {self.url}>")
    if self.method == "GET":
        request_func = self.current_request_session.get(
            self.url, headers=self.headers, ssl=self.ssl, **self.aiohttp_kwargs
            self.url, headers=self.headers, cookies=self.cookies, ssl=self.ssl, **self.aiohttp_kwargs
        )
    else:
        request_func = self.current_request_session.post(
            self.url, headers=self.headers, ssl=self.ssl, **self.aiohttp_kwargs
            self.url, headers=self.headers, cookies=self.cookies, ssl=self.ssl, **self.aiohttp_kwargs
        )
    resp = await request_func
    return resp
......@@ -196,4 +197,4 @@ class Request(object):
return response
def __repr__(self):
    return f"<{self.method} {self.url}>"
# -*- coding: utf-8 -*-
import json
from typing import Any, Callable, Optional
from http.cookies import SimpleCookie
......
......@@ -6,7 +6,7 @@
CREATE YOUR DEFAULT_CONFIG !
Some configuration:
CONCURRENT_REQUESTS    concurrency (number of simultaneous requests)
CONCURRENT_REQUESTS    number of worker threads
RETRIES                number of retries
DOWNLOAD_DELAY         download delay
RETRY_DELAY            retry delay
......
# -*- coding: utf-8 -*-
import os
import asyncio
import collections
from collections.abc import Iterable
import typing
import weakref
import traceback
......@@ -22,10 +22,6 @@ import importlib
import sys
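# uvloop (optional): when installed, it swaps in a faster drop-in replacement for the default asyncio event loop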
try:
    import uvloop
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
......@@ -37,7 +33,6 @@ class SpiderHook:
"""
SpiderHook is used to extend the spider
"""
callback_result_map: dict = None
async def _run_spider_hook(self, hook_func):
......@@ -100,7 +95,7 @@ class Spider(SpiderHook):
"""
Spider is used to control requests better
"""
name = None
name = ''
custom_settings = None
settings_attr = None
# Default values passing to each request object. Not implemented yet.
......@@ -142,7 +137,7 @@ class Spider(SpiderHook):
"""
if not isinstance(self.start_urls, collections.Iterable):
if not isinstance(self.start_urls, Iterable):
    raise ValueError(
        "start_urls must be a Iterable object"
    )
......@@ -171,19 +166,27 @@ class Spider(SpiderHook):
else:
    self.middleware = middleware or Middleware()
# async queue as a producer
self.request_queue = asyncio.Queue()
if not self.settings_attr:
    from asyncpy import settings
    try:
        sys.path.append(os.path.dirname(os.getcwd()))
        import settings
        self.settings_attr = get_attrs(settings)
    except:
        from asyncpy import settings
        self.settings_attr = get_attrs(settings)
        lg = get_logger()
        lg.warning("【import settings Not Found,Please check path or project】")
else:
    self.settings_attr = get_attrs(self.settings_attr)

self.concurrency = self.settings_attr.get('CONCURRENT_REQUESTS')
if not self.concurrency:
    self.concurrency = settings.CONCURRENT_REQUESTS

# set logger
if isinstance(self.settings_attr, dict) and self.settings_attr.get('LOG_FILE'):
    LOG_FILE, LOG_LEVEL = self.settings_attr.get('LOG_FILE'), self.settings_attr.get('LOG_LEVEL', 'INFO')
    self.logger = get_logger(name=self.name, filename=LOG_FILE, level=LOG_LEVEL)
......@@ -194,6 +197,9 @@ class Spider(SpiderHook):
self.logger = check_logger(name=self.name)
if not self.concurrency:
    self.concurrency = settings.CONCURRENT_REQUESTS
# semaphore, used for concurrency control
self.sem = asyncio.Semaphore(self.concurrency)
......@@ -203,8 +209,10 @@ class Spider(SpiderHook):
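# note: asyncio.Task.all_tasks() / Task.current_task() were removed in newer Python releases, hence the switch to the module-level asyncio.all_tasks() / asyncio.current_task() below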
async def _cancel_tasks(self):
    tasks = []
    # for task in asyncio.Task.all_tasks():
    #     if task is not asyncio.tasks.Task.current_task():
    for task in asyncio.all_tasks():
        if task is not asyncio.current_task():
            tasks.append(task)
            task.cancel()
    await asyncio.gather(*tasks, return_exceptions=True)
......@@ -301,10 +309,11 @@ class Spider(SpiderHook):
signal, lambda: asyncio.ensure_future(self.stop(signal))
)
except NotImplementedError:
    # self.logger.warning(
    #     f"{self.name} tried to use loop.add_signal_handler "
    #     "but it is not implemented on this platform."
    # )
    ...
# Run hook before spider start crawling
await self._run_spider_hook(after_start)
......@@ -401,8 +410,6 @@ class Spider(SpiderHook):
# pipelines_list.append(pipelines)
loop = loop or asyncio.new_event_loop()
spider_ins = cls(middleware=middleware, loop=loop, **spider_kwargs,pipelines=pipelines)
......@@ -544,8 +551,8 @@ class Spider(SpiderHook):
for i in range(self.worker_numbers)
]
self.logger.info(f"Worker started")
# for worker in workers:
#     self.logger.info(f"ensure_future started_worker: {id(worker)}")
await self.request_queue.join()
......@@ -586,4 +593,4 @@ class Spider(SpiderHook):
"""
self.logger.info(f"Asyncpy finished spider: {self.name}")
await self._cancel_tasks()
# self.loop.stop()
# -*- coding: utf-8 -*-
"""
CREATE YOUR DEFAULT_CONFIG !
Some configuration:
CONCURRENT_REQUESTS    concurrency (number of simultaneous requests)
CONCURRENT_REQUESTS    number of worker threads
RETRIES                number of retries
DOWNLOAD_DELAY         download delay
RETRY_DELAY            retry delay
......@@ -23,13 +21,9 @@ CONCURRENT_REQUESTS = 20
MIDDLEWARE = [
    # 'middlewares.middleware',
]

# PIPELINES = []
......@@ -38,7 +32,7 @@ DEFAULT_REQUEST_CONFIG = {
"RETRIES": 0,
"DOWNLOAD_DELAY": 0,
"RETRY_DELAY": 0,
"DOWNLOAD_TIMEOUT": 10,
"DOWNLOAD_TIMEOUT": 30,
}
......@@ -48,9 +42,5 @@ DEFAULT_REQUEST_CONFIG = {
# CLOSESPIDER_TIMEOUT = 10
#USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"
#USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"
# -*- coding: utf-8 -*-
from asyncpy.spider import Spider
import settings

class DemoSpider(Spider):
    name = 'templates'
    settings_attr = settings
    start_urls = []

    async def parse(self, response):
        pass
......
......@@ -10,7 +10,7 @@ with open("README.md", "r",encoding='utf-8') as fh:
setup(
name="asyncpy",
url="https://github.com/lixi5338619/asyncpy.git",
version= '1.1.5',
version= '1.1.9',
description="Use asyncio and aiohttp's concatenated web crawler framework",
long_description=long_description,
author="lx",
......@@ -37,4 +37,4 @@ setup(
## python setup.py sdist bdist_wheel
## twine upload dist/*