Commit 3f5a1956 authored by Adrián Chaves

Merge remote-tracking branch 'upstream/master' into retry-request

name: Checks
on: [push, pull_request]
jobs:
checks:
runs-on: ubuntu-18.04
strategy:
matrix:
include:
- python-version: 3.8
env:
TOXENV: security
- python-version: 3.8
env:
TOXENV: flake8
- python-version: 3.8
env:
TOXENV: pylint
- python-version: 3.8
env:
TOXENV: typing
- python-version: 3.7 # Keep in sync with .readthedocs.yml
env:
TOXENV: docs
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Run check
env: ${{ matrix.env }}
run: |
pip install -U tox
tox
name: Publish
on: [push]
jobs:
publish:
runs-on: ubuntu-18.04
if: startsWith(github.event.ref, 'refs/tags/')
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.8
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: Check Tag
id: check-release-tag
run: |
if [[ ${{ github.event.ref }} =~ ^refs/tags/[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$ ]]; then
echo ::set-output name=release_tag::true
fi
- name: Publish to PyPI
if: steps.check-release-tag.outputs.release_tag == 'true'
run: |
pip install --upgrade setuptools wheel twine
python setup.py sdist bdist_wheel
export TWINE_USERNAME=__token__
export TWINE_PASSWORD=${{ secrets.PYPI_TOKEN }}
twine upload dist/*
name: macOS
on: [push, pull_request]
jobs:
tests:
runs-on: macos-10.15
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Run tests
run: |
pip install -U tox
tox -e py
- name: Upload coverage report
run: bash <(curl -s https://codecov.io/bash)
name: Ubuntu
on: [push, pull_request]
jobs:
tests:
runs-on: ubuntu-18.04
strategy:
matrix:
include:
- python-version: 3.7
env:
TOXENV: py
- python-version: 3.8
env:
TOXENV: py
- python-version: pypy3
env:
TOXENV: pypy3
PYPY_VERSION: 3.6-v7.3.1
# pinned deps
- python-version: 3.6.12
env:
TOXENV: pinned
- python-version: 3.6.12
env:
TOXENV: asyncio-pinned
- python-version: pypy3
env:
TOXENV: pypy3-pinned
PYPY_VERSION: 3.6-v7.2.0
# extras
- python-version: 3.8
env:
TOXENV: extra-deps
- python-version: 3.8
env:
TOXENV: asyncio
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install system libraries
if: matrix.python-version == 'pypy3' || contains(matrix.env.TOXENV, 'pinned')
run: |
sudo apt-get update
sudo apt-get install libxml2-dev libxslt-dev
- name: Run tests
env: ${{ matrix.env }}
run: |
if [[ ! -z "$PYPY_VERSION" ]]; then
export PYPY_VERSION="pypy$PYPY_VERSION-linux64"
wget "https://downloads.python.org/pypy/${PYPY_VERSION}.tar.bz2"
tar -jxf ${PYPY_VERSION}.tar.bz2
$PYPY_VERSION/bin/pypy3 -m venv "$HOME/virtualenvs/$PYPY_VERSION"
source "$HOME/virtualenvs/$PYPY_VERSION/bin/activate"
fi
pip install -U tox
tox
- name: Upload coverage report
run: bash <(curl -s https://codecov.io/bash)
name: Run test suite
name: Windows
on: [push, pull_request]
jobs:
test-windows:
name: "Windows Tests"
runs-on: ${{ matrix.os }}
tests:
runs-on: windows-latest
strategy:
matrix:
os: [windows-latest]
python-version: [3.7, 3.8]
env: [TOXENV: py]
include:
- os: windows-latest
python-version: 3.6
- python-version: 3.6
env:
TOXENV: windows-pinned
- python-version: 3.7
env:
TOXENV: py
- python-version: 3.8
env:
TOXENV: py
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v1
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Run test suite
- name: Run tests
env: ${{ matrix.env }}
run: |
pip install -U tox twine wheel codecov
pip install -U tox
tox
language: python
dist: xenial
branches:
only:
- master
- /^\d\.\d+$/
- /^\d\.\d+\.\d+(rc\d+|\.dev\d+)?$/
matrix:
include:
- env: TOXENV=security
python: 3.8
- env: TOXENV=flake8
python: 3.8
- env: TOXENV=pylint
python: 3.8
- env: TOXENV=docs
python: 3.7 # Keep in sync with .readthedocs.yml
- env: TOXENV=typing
python: 3.8
- env: TOXENV=pinned
python: 3.6.1
- env: TOXENV=asyncio-pinned
python: 3.6.1
- env: TOXENV=pypy3-pinned PYPY_VERSION=3.6-v7.2.0
- env: TOXENV=py
python: 3.6
- env: TOXENV=pypy3 PYPY_VERSION=3.6-v7.3.1
- env: TOXENV=py
python: 3.7
- env: TOXENV=py PYPI_RELEASE_JOB=true
python: 3.8
dist: bionic
- env: TOXENV=extra-deps
python: 3.8
dist: bionic
- env: TOXENV=asyncio
python: 3.8
dist: bionic
install:
- |
if [[ ! -z "$PYPY_VERSION" ]]; then
export PYPY_VERSION="pypy$PYPY_VERSION-linux64"
wget "https://downloads.python.org/pypy/${PYPY_VERSION}.tar.bz2"
tar -jxf ${PYPY_VERSION}.tar.bz2
virtualenv --python="$PYPY_VERSION/bin/pypy3" "$HOME/virtualenvs/$PYPY_VERSION"
source "$HOME/virtualenvs/$PYPY_VERSION/bin/activate"
fi
- pip install -U tox twine wheel codecov
script: tox
after_success:
- codecov
notifications:
irc:
use_notice: true
skip_join: true
channels:
- irc.freenode.org#scrapy
cache:
directories:
- $HOME/.cache/pip
deploy:
provider: pypi
distributions: "sdist bdist_wheel"
user: scrapy
password:
secure: JaAKcy1AXWXDK3LXdjOtKyaVPCSFoCGCnW15g4f65E/8Fsi9ZzDfmBa4Equs3IQb/vs/if2SVrzJSr7arN7r9Z38Iv1mUXHkFAyA3Ym8mThfABBzzcUWEQhIHrCX0Tdlx9wQkkhs+PZhorlmRS4gg5s6DzPaeA2g8SCgmlRmFfA=
on:
tags: true
repo: scrapy/scrapy
condition: "$PYPI_RELEASE_JOB == true && $TRAVIS_TAG =~ ^[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$"
Scrapy was brought to life by Shane Evans while hacking a scraping framework
prototype for Mydeco (mydeco.com). It soon became maintained, extended and
improved by Insophia (insophia.com), with the initial sponsorship of Mydeco to
bootstrap the project. In mid-2011, Scrapinghub became the new official
maintainer.
bootstrap the project. In mid-2011, Scrapinghub (now Zyte) became the new
official maintainer.
Here is the list of the primary authors & contributors:
......
......@@ -55,7 +55,7 @@ further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at opensource@scrapinghub.com. All
reported by contacting the project team at opensource@zyte.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
......
......@@ -10,9 +10,17 @@ Scrapy
:target: https://pypi.python.org/pypi/Scrapy
:alt: Supported Python Versions
.. image:: https://img.shields.io/travis/scrapy/scrapy/master.svg
:target: https://travis-ci.org/scrapy/scrapy
:alt: Build Status
.. image:: https://github.com/scrapy/scrapy/workflows/Ubuntu/badge.svg
:target: https://github.com/scrapy/scrapy/actions?query=workflow%3AUbuntu
:alt: Ubuntu
.. image:: https://github.com/scrapy/scrapy/workflows/macOS/badge.svg
:target: https://github.com/scrapy/scrapy/actions?query=workflow%3AmacOS
:alt: macOS
.. image:: https://github.com/scrapy/scrapy/workflows/Windows/badge.svg
:target: https://github.com/scrapy/scrapy/actions?query=workflow%3AWindows
:alt: Windows
.. image:: https://img.shields.io/badge/wheel-yes-brightgreen.svg
:target: https://pypi.python.org/pypi/Scrapy
......@@ -34,9 +42,16 @@ Scrapy is a fast high-level web crawling and web scraping framework, used to
crawl websites and extract structured data from their pages. It can be used for
a wide range of purposes, from data mining to monitoring and automated testing.
Scrapy is maintained by Zyte_ (formerly Scrapinghub) and `many other
contributors`_.
.. _many other contributors: https://github.com/scrapy/scrapy/graphs/contributors
.. _Zyte: https://www.zyte.com/
Check the Scrapy homepage at https://scrapy.org for more information,
including a list of features.
Requirements
============
......@@ -81,7 +96,7 @@ Please note that this project is released with a Contributor Code of Conduct
(see https://github.com/scrapy/scrapy/blob/master/CODE_OF_CONDUCT.md).
By participating in this project you agree to abide by its terms.
Please report unacceptable behavior to opensource@scrapinghub.com.
Please report unacceptable behavior to opensource@zyte.com.
Companies using Scrapy
======================
......
......@@ -2,6 +2,8 @@ from pathlib import Path
import pytest
from scrapy.utils.reactor import install_reactor
from tests.keys import generate_keys
......@@ -40,6 +42,14 @@ def pytest_collection_modifyitems(session, config, items):
pass
def pytest_addoption(parser):
parser.addoption(
"--reactor",
default="default",
choices=["default", "asyncio"],
)
@pytest.fixture(scope='class')
def reactor_pytest(request):
if not request.cls:
......@@ -55,5 +65,10 @@ def only_asyncio(request, reactor_pytest):
pytest.skip('This test is only run with --reactor=asyncio')
def pytest_configure(config):
if config.getoption("--reactor") == "asyncio":
install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
# Generate localhost certificate files, needed by some tests
generate_keys()
......@@ -283,6 +283,7 @@ coverage_ignore_pyobjects = [
intersphinx_mapping = {
'attrs': ('https://www.attrs.org/en/stable/', None),
'coverage': ('https://coverage.readthedocs.io/en/stable', None),
'cryptography' : ('https://cryptography.io/en/latest/', None),
'cssselect': ('https://cssselect.readthedocs.io/en/latest', None),
'itemloaders': ('https://itemloaders.readthedocs.io/en/latest/', None),
'pytest': ('https://docs.pytest.org/en/latest', None),
......
......@@ -69,10 +69,9 @@ In case of any trouble related to these dependencies,
please refer to their respective installation instructions:
* `lxml installation`_
* `cryptography installation`_
* :doc:`cryptography installation <cryptography:installation>`
.. _lxml installation: https://lxml.de/installation.html
.. _cryptography installation: https://cryptography.io/en/latest/installation/
.. _intro-using-virtualenv:
......@@ -265,10 +264,8 @@ For details, see `Issue #2473 <https://github.com/scrapy/scrapy/issues/2473>`_.
.. _cryptography: https://cryptography.io/en/latest/
.. _pyOpenSSL: https://pypi.org/project/pyOpenSSL/
.. _setuptools: https://pypi.python.org/pypi/setuptools
.. _AUR Scrapy package: https://aur.archlinux.org/packages/scrapy/
.. _homebrew: https://brew.sh/
.. _zsh: https://www.zsh.org/
.. _Scrapinghub: https://scrapinghub.com
.. _Anaconda: https://docs.anaconda.com/anaconda/
.. _Miniconda: https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html
.. _conda-forge: https://conda-forge.org/
......@@ -2428,7 +2428,7 @@ Bug fixes
- Fix compatibility with Twisted 17+ (:issue:`2496`, :issue:`2528`).
- Fix ``scrapy.Item`` inheritance on Python 3.6 (:issue:`2511`).
- Enforce numeric values for components order in ``SPIDER_MIDDLEWARES``,
``DOWNLOADER_MIDDLEWARES``, ``EXTENIONS`` and ``SPIDER_CONTRACTS`` (:issue:`2420`).
``DOWNLOADER_MIDDLEWARES``, ``EXTENSIONS`` and ``SPIDER_CONTRACTS`` (:issue:`2420`).
Documentation
~~~~~~~~~~~~~
......
......@@ -14,7 +14,7 @@ spiders come in.
Popular choices for deploying Scrapy spiders are:
* :ref:`Scrapyd <deploy-scrapyd>` (open source)
* :ref:`Scrapy Cloud <deploy-scrapy-cloud>` (cloud-based)
* :ref:`Zyte Scrapy Cloud <deploy-scrapy-cloud>` (cloud-based)
.. _deploy-scrapyd:
......@@ -32,28 +32,28 @@ Scrapyd is maintained by some of the Scrapy developers.
.. _deploy-scrapy-cloud:
Deploying to Scrapy Cloud
=========================
Deploying to Zyte Scrapy Cloud
==============================
`Scrapy Cloud`_ is a hosted, cloud-based service by `Scrapinghub`_,
the company behind Scrapy.
`Zyte Scrapy Cloud`_ is a hosted, cloud-based service by Zyte_, the company
behind Scrapy.
Scrapy Cloud removes the need to setup and monitor servers
and provides a nice UI to manage spiders and review scraped items,
logs and stats.
Zyte Scrapy Cloud removes the need to setup and monitor servers and provides a
nice UI to manage spiders and review scraped items, logs and stats.
To deploy spiders to Scrapy Cloud you can use the `shub`_ command line tool.
Please refer to the `Scrapy Cloud documentation`_ for more information.
To deploy spiders to Zyte Scrapy Cloud you can use the `shub`_ command line
tool.
Please refer to the `Zyte Scrapy Cloud documentation`_ for more information.
Scrapy Cloud is compatible with Scrapyd and one can switch between
Zyte Scrapy Cloud is compatible with Scrapyd and one can switch between
them as needed - the configuration is read from the ``scrapy.cfg`` file
just like ``scrapyd-deploy``.
.. _Scrapyd: https://github.com/scrapy/scrapyd
.. _Deploying your project: https://scrapyd.readthedocs.io/en/latest/deploy.html
.. _Scrapy Cloud: https://scrapinghub.com/scrapy-cloud
.. _Scrapyd: https://github.com/scrapy/scrapyd
.. _scrapyd-client: https://github.com/scrapy/scrapyd-client
.. _shub: https://doc.scrapinghub.com/shub.html
.. _scrapyd-deploy documentation: https://scrapyd.readthedocs.io/en/latest/deploy.html
.. _Scrapy Cloud documentation: https://doc.scrapinghub.com/scrapy-cloud.html
.. _Scrapinghub: https://scrapinghub.com/
.. _shub: https://shub.readthedocs.io/en/latest/
.. _Zyte: https://zyte.com/
.. _Zyte Scrapy Cloud: https://www.zyte.com/scrapy-cloud/
.. _Zyte Scrapy Cloud documentation: https://docs.zyte.com/scrapy-cloud.html
......@@ -123,7 +123,7 @@ Example::
def serialize_field(self, field, name, value):
if field == 'price':
return f'$ {str(value)}'
return super(Product, self).serialize_field(field, name, value)
return super().serialize_field(field, name, value)
.. _topics-exporters-reference:
......
......@@ -101,7 +101,7 @@ instance, which can be accessed and used like this::
class MySpider(scrapy.Spider):
name = 'myspider'
start_urls = ['https://scrapinghub.com']
start_urls = ['https://scrapy.org']
def parse(self, response):
self.logger.info('Parse function called on %s', response.url)
......@@ -117,7 +117,7 @@ Python logger you want. For example::
class MySpider(scrapy.Spider):
name = 'myspider'
start_urls = ['https://scrapinghub.com']
start_urls = ['https://scrapy.org']
def parse(self, response):
logger.info('Parse function called on %s', response.url)
......
......@@ -63,7 +63,7 @@ project as example.
process = CrawlerProcess(get_project_settings())
# 'followall' is the name of one of the spiders of the project.
process.crawl('followall', domain='scrapinghub.com')
process.crawl('followall', domain='scrapy.org')
process.start() # the script will block here until the crawling is finished
There's another Scrapy utility that provides more control over the crawling
......@@ -244,7 +244,7 @@ Here are some tips to keep in mind when dealing with these kinds of sites:
super proxy that you can attach your own proxies to.
* use a highly distributed downloader that circumvents bans internally, so you
can just focus on parsing clean pages. One example of such downloaders is
`Crawlera`_
`Zyte Smart Proxy Manager`_
If you are still unable to prevent your bot getting banned, consider contacting
`commercial support`_.
......@@ -254,5 +254,5 @@ If you are still unable to prevent your bot getting banned, consider contacting
.. _ProxyMesh: https://proxymesh.com/
.. _Google cache: http://www.googleguide.com/cached_pages.html
.. _testspiders: https://github.com/scrapinghub/testspiders
.. _Crawlera: https://scrapinghub.com/crawlera
.. _scrapoxy: https://scrapoxy.io/
.. _Zyte Smart Proxy Manager: https://www.zyte.com/smart-proxy-manager/
......@@ -693,9 +693,19 @@ Response objects
:param ip_address: The IP address of the server from which the Response originated.
:type ip_address: :class:`ipaddress.IPv4Address` or :class:`ipaddress.IPv6Address`
:param protocol: The protocol that was used to download the response.
For instance: "HTTP/1.0", "HTTP/1.1"
:type protocol: :class:`str`
.. versionadded:: 2.0.0
The ``certificate`` parameter.
.. versionadded:: 2.1.0
The ``ip_address`` parameter.
.. versionadded:: VERSION
The ``protocol`` parameter.
.. attribute:: Response.url
A string containing the URL of the response.
......@@ -780,6 +790,8 @@ Response objects
.. attribute:: Response.certificate
.. versionadded:: 2.0.0
A :class:`twisted.internet.ssl.Certificate` object representing
the server's SSL certificate.
......@@ -795,6 +807,17 @@ Response objects
handler, i.e. for ``http(s)`` responses. For other handlers,
:attr:`ip_address` is always ``None``.
.. attribute:: Response.protocol
.. versionadded:: VERSION
The protocol that was used to download the response.
For instance: "HTTP/1.0", "HTTP/1.1"
This attribute is currently only populated by the HTTP download
handlers, i.e. for ``http(s)`` responses. For other handlers,
:attr:`protocol` is always ``None``.
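As a rough illustration (a sketch; the spider name and URL are placeholders), the
attribute can be read from a callback like any other response attribute::

    import scrapy

    class ProtocolLoggingSpider(scrapy.Spider):
        name = "protocol-logging-example"  # placeholder name, for illustration only
        start_urls = ["https://scrapy.org"]

        def parse(self, response):
            # "HTTP/1.0" or "HTTP/1.1" for http(s) responses, None for other handlers
            self.logger.info("Downloaded %s over %s", response.url, response.protocol)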
.. method:: Response.copy()
Returns a new Response which is a copy of this Response.
......
......@@ -464,10 +464,10 @@ effectively. If you are not much familiar with XPath yet,
you may want to take a look first at this `XPath tutorial`_.
.. note::
Some of the tips are based on `this post from ScrapingHub's blog`_.
Some of the tips are based on `this post from Zyte's blog`_.
.. _`XPath tutorial`: http://www.zvon.org/comp/r/tut-XPath_1.html
.. _`this post from ScrapingHub's blog`: https://blog.scrapinghub.com/2014/07/17/xpath-tips-from-the-web-scraping-trenches/
.. _this post from Zyte's blog: https://www.zyte.com/blog/xpath-tips-from-the-web-scraping-trenches/
.. _topics-selectors-relative-xpaths:
......
......@@ -18,7 +18,6 @@ addopts =
--ignore=docs/topics/stats.rst
--ignore=docs/topics/telnetconsole.rst
--ignore=docs/utils
twisted = 1
markers =
only_asyncio: marks tests as only enabled when --reactor=asyncio is passed
flake8-max-line-length = 119
......@@ -36,8 +35,5 @@ flake8-ignore =
scrapy/spiders/__init__.py E402 F401
# Issues pending a review:
scrapy/utils/http.py F403
scrapy/utils/markup.py F403
scrapy/utils/multipart.py F403
scrapy/utils/url.py F403 F405
tests/test_loader.py E741
......@@ -303,11 +303,14 @@ class ScrapyAgent:
proxyHost = to_unicode(proxyHost)
omitConnectTunnel = b'noconnect' in proxyParams
if omitConnectTunnel:
warnings.warn("Using HTTPS proxies in the noconnect mode is deprecated. "
"If you use Crawlera, it doesn't require this mode anymore, "
"so you should update scrapy-crawlera to 1.3.0+ "
"and remove '?noconnect' from the Crawlera URL.",
ScrapyDeprecationWarning)
warnings.warn(
"Using HTTPS proxies in the noconnect mode is deprecated. "
"If you use Zyte Smart Proxy Manager (formerly Crawlera), "
"it doesn't require this mode anymore, so you should "
"update scrapy-crawlera to 1.3.0+ and remove '?noconnect' "
"from the Zyte Smart Proxy Manager URL.",
ScrapyDeprecationWarning,
)
if scheme == b'https' and not omitConnectTunnel:
proxyAuth = request.headers.get(b'Proxy-Authorization', None)
proxyConf = (proxyHost, proxyPort, proxyAuth)
......@@ -434,6 +437,11 @@ class ScrapyAgent:
def _cb_bodydone(self, result, request, url):
headers = Headers(result["txresponse"].headers.getAllRawHeaders())
respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
try:
version = result["txresponse"].version
protocol = f"{to_unicode(version[0])}/{version[1]}.{version[2]}"
except (AttributeError, TypeError, IndexError):
protocol = None
response = respcls(
url=url,
status=int(result["txresponse"].code),
......@@ -442,6 +450,7 @@ class ScrapyAgent:
flags=result["flags"],
certificate=result["certificate"],
ip_address=result["ip_address"],
protocol=protocol,
)
if result.get("failure"):
result["failure"].value.response = response
......
......@@ -7,7 +7,7 @@ from twisted.internet.protocol import ClientFactory
from scrapy.http import Headers
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes
from scrapy.utils.python import to_bytes, to_unicode
from scrapy.responsetypes import responsetypes
......@@ -110,7 +110,7 @@ class ScrapyHTTPClientFactory(ClientFactory):
status = int(self.status)
headers = Headers(self.response_headers)
respcls = responsetypes.from_args(headers=headers, url=self._url)
return respcls(url=self._url, status=status, headers=headers, body=body)
return respcls(url=self._url, status=status, headers=headers, body=body, protocol=to_unicode(self.version))
def _set_connection_attributes(self, request):
parsed = urlparse_cached(request)
......
import os
import json
import logging
import warnings
from os.path import join, exists
from queuelib import PriorityQueue
from scrapy.utils.misc import load_object, create_instance
from scrapy.utils.job import job_dir
from scrapy.utils.deprecate import ScrapyDeprecationWarning
logger = logging.getLogger(__name__)
......@@ -56,14 +52,6 @@ class Scheduler:
dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
dupefilter = create_instance(dupefilter_cls, settings, crawler)
pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
if pqclass is PriorityQueue:
warnings.warn("SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
" is no longer supported because of API changes; "
"please use 'scrapy.pqueues.ScrapyPriorityQueue'",
ScrapyDeprecationWarning)
from scrapy.pqueues import ScrapyPriorityQueue
pqclass = ScrapyPriorityQueue
dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
logunser = settings.getbool('SCHEDULER_DEBUG')
......
......@@ -41,86 +41,92 @@ class SpiderMiddlewareManager(MiddlewareManager):
process_spider_exception = getattr(mw, 'process_spider_exception', None)
self.methods['process_spider_exception'].appendleft(process_spider_exception)
def scrape_response(self, scrape_func, response, request, spider):
def process_spider_input(response):
for method in self.methods['process_spider_input']:
try:
result = method(response=response, spider=spider)
if result is not None:
msg = (f"Middleware {_fname(method)} must return None "
f"or raise an exception, got {type(result)}")
raise _InvalidOutput(msg)
except _InvalidOutput:
raise
except Exception:
return scrape_func(Failure(), request, spider)
return scrape_func(response, request, spider)
def _evaluate_iterable(iterable, exception_processor_index, recover_to):
def _process_spider_input(self, scrape_func, response, request, spider):
for method in self.methods['process_spider_input']:
try:
for r in iterable:
yield r
except Exception as ex:
exception_result = process_spider_exception(Failure(ex), exception_processor_index)
if isinstance(exception_result, Failure):
raise
recover_to.extend(exception_result)
def process_spider_exception(_failure, start_index=0):
exception = _failure.value
# don't handle _InvalidOutput exception
if isinstance(exception, _InvalidOutput):
return _failure
method_list = islice(self.methods['process_spider_exception'], start_index, None)
for method_index, method in enumerate(method_list, start=start_index):
if method is None:
continue
result = method(response=response, exception=exception, spider=spider)
if _isiterable(result):
# stop exception handling by handing control over to the
# process_spider_output chain if an iterable has been returned
return process_spider_output(result, method_index + 1)
elif result is None:
continue
else:
result = method(response=response, spider=spider)
if result is not None:
msg = (f"Middleware {_fname(method)} must return None "
f"or an iterable, got {type(result)}")
f"or raise an exception, got {type(result)}")
raise _InvalidOutput(msg)
except _InvalidOutput:
raise
except Exception:
return scrape_func(Failure(), request, spider)
return scrape_func(response, request, spider)
def _evaluate_iterable(self, response, spider, iterable, exception_processor_index, recover_to):
try:
for r in iterable:
yield r
except Exception as ex:
exception_result = self._process_spider_exception(response, spider, Failure(ex),
exception_processor_index)
if isinstance(exception_result, Failure):
raise
recover_to.extend(exception_result)
def _process_spider_exception(self, response, spider, _failure, start_index=0):
exception = _failure.value
# don't handle _InvalidOutput exception
if isinstance(exception, _InvalidOutput):
return _failure
method_list = islice(self.methods['process_spider_exception'], start_index, None)
for method_index, method in enumerate(method_list, start=start_index):
if method is None:
continue
result = method(response=response, exception=exception, spider=spider)
if _isiterable(result):
# stop exception handling by handing control over to the
# process_spider_output chain if an iterable has been returned
return self._process_spider_output(response, spider, result, method_index + 1)
elif result is None:
continue
else:
msg = (f"Middleware {_fname(method)} must return None "
f"or an iterable, got {type(result)}")
raise _InvalidOutput(msg)
return _failure
def _process_spider_output(self, response, spider, result, start_index=0):
# items in this iterable do not need to go through the process_spider_output
# chain, they went through it already from the process_spider_exception method
recovered = MutableChain()
method_list = islice(self.methods['process_spider_output'], start_index, None)
for method_index, method in enumerate(method_list, start=start_index):
if method is None:
continue
try:
# might fail directly if the output value is not a generator
result = method(response=response, result=result, spider=spider)
except Exception as ex:
exception_result = self._process_spider_exception(response, spider, Failure(ex), method_index + 1)
if isinstance(exception_result, Failure):
raise
return exception_result
if _isiterable(result):
result = self._evaluate_iterable(response, spider, result, method_index + 1, recovered)
else:
msg = (f"Middleware {_fname(method)} must return an "
f"iterable, got {type(result)}")
raise _InvalidOutput(msg)
def process_spider_output(result, start_index=0):
# items in this iterable do not need to go through the process_spider_output
# chain, they went through it already from the process_spider_exception method
recovered = MutableChain()
method_list = islice(self.methods['process_spider_output'], start_index, None)
for method_index, method in enumerate(method_list, start=start_index):
if method is None:
continue
try:
# might fail directly if the output value is not a generator
result = method(response=response, result=result, spider=spider)
except Exception as ex:
exception_result = process_spider_exception(Failure(ex), method_index + 1)
if isinstance(exception_result, Failure):
raise
return exception_result
if _isiterable(result):
result = _evaluate_iterable(result, method_index + 1, recovered)
else:
msg = (f"Middleware {_fname(method)} must return an "
f"iterable, got {type(result)}")
raise _InvalidOutput(msg)
return MutableChain(result, recovered)
return MutableChain(result, recovered)
def _process_callback_output(self, response, spider, result):
recovered = MutableChain()
result = self._evaluate_iterable(response, spider, result, 0, recovered)
return MutableChain(self._process_spider_output(response, spider, result), recovered)
def scrape_response(self, scrape_func, response, request, spider):
def process_callback_output(result):
recovered = MutableChain()
result = _evaluate_iterable(result, 0, recovered)
return MutableChain(process_spider_output(result), recovered)
return self._process_callback_output(response, spider, result)
def process_spider_exception(_failure):
return self._process_spider_exception(response, spider, _failure)
dfd = mustbe_deferred(process_spider_input, response)
dfd = mustbe_deferred(self._process_spider_input, scrape_func, response, request, spider)
dfd.addCallbacks(callback=process_callback_output, errback=process_spider_exception)
return dfd
......
......@@ -160,7 +160,7 @@ def _select_value(ele, n, v):
multiple = ele.multiple
if v is None and not multiple:
# Match browser behaviour on simple select tag without options selected
# And for select tags wihout options
# And for select tags without options
o = ele.value_options
return (n, o[0]) if o else (None, None)
elif v is not None and multiple:
......
......@@ -17,8 +17,18 @@ from scrapy.utils.trackref import object_ref
class Response(object_ref):
def __init__(self, url, status=200, headers=None, body=b'', flags=None,
request=None, certificate=None, ip_address=None):
def __init__(
self,
url,
status=200,
headers=None,
body=b"",
flags=None,
request=None,
certificate=None,
ip_address=None,
protocol=None,
):
self.headers = Headers(headers or {})
self.status = int(status)
self._set_body(body)
......@@ -27,6 +37,7 @@ class Response(object_ref):
self.flags = [] if flags is None else list(flags)
self.certificate = certificate
self.ip_address = ip_address
self.protocol = protocol
@property
def cb_kwargs(self):
......@@ -89,8 +100,9 @@ class Response(object_ref):
"""Create a new Response with the same attributes except for those
given new values.
"""
for x in ['url', 'status', 'headers', 'body',
'request', 'flags', 'certificate', 'ip_address']:
for x in [
"url", "status", "headers", "body", "request", "flags", "certificate", "ip_address", "protocol",
]:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
return cls(*args, **kwargs)
......
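A minimal sketch of how the attribute added above behaves from user code (illustrative
values only; ``protocol`` is normally filled in by the download handler): it is accepted
by the constructor, stored on the instance, and carried over by ``replace()`` unless
explicitly overridden::

    from scrapy.http import Response

    response = Response(
        url="https://example.com",   # placeholder URL
        status=200,
        protocol="HTTP/1.1",         # normally set by the HTTP download handler
    )

    copy = response.replace(status=404)
    assert copy.protocol == "HTTP/1.1"                        # preserved by replace()
    assert response.replace(protocol=None).protocol is None   # unless overridden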
......@@ -86,7 +86,7 @@ class MediaPipeline:
info = self.spiderinfo
requests = arg_to_iter(self.get_media_requests(item, info))
dlist = [self._process_request(r, info, item) for r in requests]
dfd = DeferredList(dlist, consumeErrors=1)
dfd = DeferredList(dlist, consumeErrors=True)
return dfd.addCallback(self.item_completed, item, info)
def _process_request(self, request, info, item):
......
async def collect_asyncgen(result):
results = []
async for x in result:
results.append(x)
return results
......@@ -105,7 +105,7 @@ def process_parallel(callbacks, input, *a, **kw):
callbacks
"""
dfds = [defer.succeed(input).addCallback(x, *a, **kw) for x in callbacks]
d = defer.DeferredList(dfds, fireOnOneErrback=1, consumeErrors=1)
d = defer.DeferredList(dfds, fireOnOneErrback=True, consumeErrors=True)
d.addCallbacks(lambda r: [x[1] for x in r], lambda f: f.value.subFailure)
return d
......
import struct
from gzip import GzipFile
from io import BytesIO
import re
import struct
from scrapy.utils.decorators import deprecated
......@@ -42,17 +41,5 @@ def gunzip(data):
return b''.join(output_list)
_is_gzipped = re.compile(br'^application/(x-)?gzip\b', re.I).search
_is_octetstream = re.compile(br'^(application|binary)/octet-stream\b', re.I).search
@deprecated
def is_gzipped(response):
"""Return True if the response is gzipped, or False otherwise"""
ctype = response.headers.get('Content-Type', b'')
cenc = response.headers.get('Content-Encoding', b'').lower()
return _is_gzipped(ctype) or _is_octetstream(ctype) and cenc in (b'gzip', b'x-gzip')
def gzip_magic_number(response):
return response.body[:3] == b'\x1f\x8b\x08'
"""
Transitional module for moving to the w3lib library.
For new code, always import from w3lib.http instead of this module
"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.decorators import deprecated
from w3lib.http import * # noqa: F401
warnings.warn("Module `scrapy.utils.http` is deprecated, "
"Please import from `w3lib.http` instead.",
ScrapyDeprecationWarning, stacklevel=2)
@deprecated
def decode_chunked_transfer(chunked_body):
"""Parsed body received with chunked transfer encoding, and return the
decoded body.
For more info see:
https://en.wikipedia.org/wiki/Chunked_transfer_encoding
"""
body, h, t = '', '', chunked_body
while t:
h, t = t.split('\r\n', 1)
if h == '0':
break
size = int(h, 16)
body += t[:size]
t = t[size + 2:]
return body
"""
Transitional module for moving to the w3lib library.
For new code, always import from w3lib.html instead of this module
"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
from w3lib.html import * # noqa: F401
warnings.warn("Module `scrapy.utils.markup` is deprecated. "
"Please import from `w3lib.html` instead.",
ScrapyDeprecationWarning, stacklevel=2)
"""
Transitional module for moving to the w3lib library.
For new code, always import from w3lib.form instead of this module
"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
from w3lib.form import * # noqa: F401
warnings.warn("Module `scrapy.utils.multipart` is deprecated. "
"If you're using `encode_multipart` function, please use "
"`urllib3.filepost.encode_multipart_formdata` instead",
ScrapyDeprecationWarning, stacklevel=2)
import os
import pickle
import warnings
from importlib import import_module
......@@ -68,18 +67,10 @@ def get_project_settings():
if settings_module_path:
settings.setmodule(settings_module_path, priority='project')
pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
if pickled_settings:
warnings.warn("Use of environment variable "
"'SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE' "
"is deprecated.", ScrapyDeprecationWarning)
settings.setdict(pickle.loads(pickled_settings), priority='project')
scrapy_envvars = {k[7:]: v for k, v in os.environ.items() if
k.startswith('SCRAPY_')}
valid_envvars = {
'CHECK',
'PICKLED_SETTINGS_TO_OVERRIDE',
'PROJECT',
'PYTHON_SHELL',
'SETTINGS_MODULE',
......
"""
Helpers using Python 3.6+ syntax (ignore SyntaxError on import).
"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.asyncgen import collect_asyncgen # noqa: F401
async def collect_asyncgen(result):
results = []
async for x in result:
results.append(x)
return results
warnings.warn(
"Module `scrapy.utils.py36` is deprecated, please import from `scrapy.utils.asyncgen` instead.",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
......@@ -4,17 +4,14 @@ import logging
from scrapy.spiders import Spider
from scrapy.utils.defer import deferred_from_coro
from scrapy.utils.misc import arg_to_iter
try:
from scrapy.utils.py36 import collect_asyncgen
except SyntaxError:
collect_asyncgen = None
from scrapy.utils.asyncgen import collect_asyncgen
logger = logging.getLogger(__name__)
def iterate_spider_output(result):
if collect_asyncgen and hasattr(inspect, 'isasyncgen') and inspect.isasyncgen(result):
if inspect.isasyncgen(result):
d = deferred_from_coro(collect_asyncgen(result))
d.addCallback(iterate_spider_output)
return d
......
......@@ -55,9 +55,6 @@ ignore_errors = True
[mypy-scrapy.utils.response]
ignore_errors = True
[mypy-scrapy.utils.spider]
ignore_errors = True
[mypy-scrapy.utils.trackref]
ignore_errors = True
......
......@@ -24,7 +24,6 @@ install_requires = [
'cssselect>=0.9.1',
'itemloaders>=1.0.1',
'parsel>=1.5.0',
'PyDispatcher>=2.0.5',
'pyOpenSSL>=16.2.0',
'queuelib>=1.4.2',
'service_identity>=16.0.0',
......@@ -34,11 +33,12 @@ install_requires = [
'itemadapter>=0.1.0',
]
extras_require = {}
cpython_dependencies = [
'lxml>=3.5.0',
'PyDispatcher>=2.0.5',
]
if has_environment_marker_platform_impl_support():
extras_require[':platform_python_implementation == "CPython"'] = [
'lxml>=3.5.0',
]
extras_require[':platform_python_implementation == "CPython"'] = cpython_dependencies
extras_require[':platform_python_implementation == "PyPy"'] = [
# Earlier lxml versions are affected by
# https://foss.heptapod.net/pypy/pypy/-/issues/2498,
......@@ -49,14 +49,14 @@ if has_environment_marker_platform_impl_support():
'PyPyDispatcher>=2.1.0',
]
else:
install_requires.append('lxml>=3.5.0')
install_requires.extend(cpython_dependencies)
setup(
name='Scrapy',
version=version,
url='https://scrapy.org',
project_urls = {
project_urls={
'Documentation': 'https://docs.scrapy.org/',
'Source': 'https://github.com/scrapy/scrapy',
'Tracker': 'https://github.com/scrapy/scrapy/issues',
......
# Tests requirements
attrs
dataclasses; python_version == '3.6'
mitmproxy; python_version >= '3.7'
mitmproxy >= 4.0.4, < 5; python_version >= '3.6' and python_version < '3.7'
pyftpdlib
# https://github.com/pytest-dev/pytest-twisted/issues/93
pytest != 5.4, != 5.4.1
pytest
pytest-cov
pytest-twisted >= 1.11
pytest-xdist
sybil >= 1.3.0 # https://github.com/cjw296/sybil/issues/20#issuecomment-605433422
testfixtures
uvloop; platform_system != "Windows"
uvloop < 0.15.0; platform_system != "Windows" and python_version == '3.6'
uvloop; platform_system != "Windows" and python_version > '3.6'
# optional for shell wrapper tests
bpython
brotlipy # optional for HTTP compress downloader middleware tests
zstandard # optional for HTTP compress downloader middleware tests
zstandard; implementation_name != 'pypy' # optional for HTTP compress downloader middleware tests
ipython
pywin32; sys_platform == "win32"
import os
import re
from configparser import ConfigParser
from importlib import import_module
from twisted import version as twisted_version
from twisted.trial import unittest
class ScrapyUtilsTest(unittest.TestCase):
def test_required_openssl_version(self):
try:
module = import_module('OpenSSL')
......@@ -13,6 +19,32 @@ class ScrapyUtilsTest(unittest.TestCase):
installed_version = [int(x) for x in module.__version__.split('.')[:2]]
assert installed_version >= [0, 6], "OpenSSL >= 0.6 required"
def test_pinned_twisted_version(self):
"""When running tests within a Tox environment with pinned
dependencies, make sure that the version of Twisted is the pinned
version.
See https://github.com/scrapy/scrapy/pull/4814#issuecomment-706230011
"""
if not os.environ.get('_SCRAPY_PINNED', None):
self.skipTest('Not in a pinned environment')
tox_config_file_path = os.path.join(
os.path.dirname(__file__),
'..',
'tox.ini',
)
config_parser = ConfigParser()
config_parser.read(tox_config_file_path)
pattern = r'Twisted==([\d.]+)'
match = re.search(pattern, config_parser['pinned']['deps'])
pinned_twisted_version_string = match[1]
self.assertEqual(
twisted_version.short(),
pinned_twisted_version_string
)
if __name__ == "__main__":
unittest.main()
......@@ -115,6 +115,7 @@ class FileTestCase(unittest.TestCase):
self.assertEqual(response.url, request.url)
self.assertEqual(response.status, 200)
self.assertEqual(response.body, b'0123456789')
self.assertEqual(response.protocol, None)
request = Request(path_to_file_uri(self.tmpname + '^'))
assert request.url.upper().endswith('%5E')
......@@ -360,6 +361,13 @@ class Http10TestCase(HttpTestCase):
"""HTTP 1.0 test case"""
download_handler_cls = HTTP10DownloadHandler
def test_protocol(self):
request = Request(self.getURL("host"), method="GET")
d = self.download_request(request, Spider("foo"))
d.addCallback(lambda r: r.protocol)
d.addCallback(self.assertEqual, "HTTP/1.0")
return d
class Https10TestCase(Http10TestCase):
scheme = 'https'
......@@ -489,6 +497,13 @@ class Http11TestCase(HttpTestCase):
def test_download_broken_chunked_content_allow_data_loss_via_setting(self):
return self.test_download_broken_content_allow_data_loss_via_setting('broken-chunked')
def test_protocol(self):
request = Request(self.getURL("host"), method="GET")
d = self.download_request(request, Spider("foo"))
d.addCallback(lambda r: r.protocol)
d.addCallback(self.assertEqual, "HTTP/1.1")
return d
class Https11TestCase(Http11TestCase):
scheme = 'https'
......@@ -962,6 +977,7 @@ class BaseFTPTestCase(unittest.TestCase):
self.assertEqual(r.status, 200)
self.assertEqual(r.body, b'I have the power!')
self.assertEqual(r.headers, {b'Local Filename': [b''], b'Size': [b'17']})
self.assertIsNone(r.protocol)
return self._add_test_callbacks(d, _test)
def test_ftp_download_path_with_spaces(self):
......@@ -1120,3 +1136,10 @@ class DataURITestCase(unittest.TestCase):
request = Request('data:text/plain;base64,SGVsbG8sIHdvcmxkLg%3D%3D')
return self.download_request(request, self.spider).addCallback(_test)
def test_protocol(self):
def _test(response):
self.assertIsNone(response.protocol)
request = Request("data:,")
return self.download_request(request, self.spider).addCallback(_test)
import asyncio
from unittest import mock
from unittest import mock, SkipTest
from pytest import mark
from twisted import version as twisted_version
from twisted.internet import defer
from twisted.internet.defer import Deferred
from twisted.trial.unittest import TestCase
from twisted.python.failure import Failure
from twisted.python.versions import Version
from scrapy.http import Request, Response
from scrapy.spiders import Spider
......@@ -211,10 +213,21 @@ class MiddlewareUsingDeferreds(ManagerTestCase):
self.assertFalse(download_func.called)
@mark.usefixtures('reactor_pytest')
class MiddlewareUsingCoro(ManagerTestCase):
"""Middlewares using asyncio coroutines should work"""
def test_asyncdef(self):
if (
self.reactor_pytest == 'asyncio'
and twisted_version < Version('twisted', 18, 4, 0)
):
raise SkipTest(
'Due to https://twistedmatrix.com/trac/ticket/9390, this test '
'hangs when using AsyncIO and Twisted versions lower than '
'18.4.0'
)
resp = Response('http://example.com/index.html')
class CoroMiddleware:
......@@ -235,6 +248,12 @@ class MiddlewareUsingCoro(ManagerTestCase):
@mark.only_asyncio()
def test_asyncdef_asyncio(self):
if twisted_version < Version('twisted', 18, 4, 0):
raise SkipTest(
'Due to https://twistedmatrix.com/trac/ticket/9390, this test '
'hangs when using Twisted versions lower than 18.4.0'
)
resp = Response('http://example.com/index.html')
class CoroMiddleware:
......
import json
import os
import platform
import re
import sys
from subprocess import Popen, PIPE
from urllib.parse import urlsplit, urlunsplit
from unittest import skipIf
from testfixtures import LogCapture
from twisted.internet import defer
from twisted.trial.unittest import TestCase
......@@ -57,13 +54,14 @@ def _wrong_credentials(proxy_url):
return urlunsplit(bad_auth_proxy)
@skipIf("pypy" in sys.executable,
"mitmproxy does not support PyPy")
@skipIf(platform.system() == 'Windows' and sys.version_info < (3, 7),
"mitmproxy does not support Windows when running Python < 3.7")
class ProxyConnectTestCase(TestCase):
def setUp(self):
try:
import mitmproxy # noqa: F401
except ImportError:
self.skipTest('mitmproxy is not installed')
self.mockserver = MockServer()
self.mockserver.__enter__()
self._oldenv = os.environ.copy()
......
from pytest import mark
from twisted.trial import unittest
from twisted.internet import reactor, defer
from twisted.python.failure import Failure
from scrapy.utils.defer import (
deferred_f_from_coro_f,
iter_errback,
mustbe_deferred,
process_chain,
......@@ -117,3 +119,18 @@ class IterErrbackTest(unittest.TestCase):
self.assertEqual(out, [0, 1, 2, 3, 4])
self.assertEqual(len(errors), 1)
self.assertIsInstance(errors[0].value, ZeroDivisionError)
class AsyncDefTestsuiteTest(unittest.TestCase):
@deferred_f_from_coro_f
async def test_deferred_f_from_coro_f(self):
pass
@deferred_f_from_coro_f
async def test_deferred_f_from_coro_f_generator(self):
yield
@mark.xfail(reason="Checks that the test is actually executed", strict=True)
@deferred_f_from_coro_f
async def test_deferred_f_from_coro_f_xfail(self):
raise Exception("This is expected to be raised")
......@@ -3,10 +3,11 @@ from os.path import join
from w3lib.encoding import html_to_unicode
from scrapy.utils.gz import gunzip, is_gzipped
from scrapy.http import Response, Headers
from scrapy.utils.gz import gunzip, gzip_magic_number
from scrapy.http import Response
from tests import tests_datadir
SAMPLEDIR = join(tests_datadir, 'compressed')
......@@ -14,8 +15,12 @@ class GunzipTest(unittest.TestCase):
def test_gunzip_basic(self):
with open(join(SAMPLEDIR, 'feed-sample1.xml.gz'), 'rb') as f:
text = gunzip(f.read())
self.assertEqual(len(text), 9950)
r1 = Response("http://www.example.com", body=f.read())
self.assertTrue(gzip_magic_number(r1))
r2 = Response("http://www.example.com", body=gunzip(r1.body))
self.assertFalse(gzip_magic_number(r2))
self.assertEqual(len(r2.body), 9950)
def test_gunzip_truncated(self):
with open(join(SAMPLEDIR, 'truncated-crc-error.gz'), 'rb') as f:
......@@ -28,46 +33,16 @@ class GunzipTest(unittest.TestCase):
def test_gunzip_truncated_short(self):
with open(join(SAMPLEDIR, 'truncated-crc-error-short.gz'), 'rb') as f:
text = gunzip(f.read())
assert text.endswith(b'</html>')
def test_is_x_gzipped_right(self):
hdrs = Headers({"Content-Type": "application/x-gzip"})
r1 = Response("http://www.example.com", headers=hdrs)
self.assertTrue(is_gzipped(r1))
r1 = Response("http://www.example.com", body=f.read())
self.assertTrue(gzip_magic_number(r1))
def test_is_gzipped_right(self):
hdrs = Headers({"Content-Type": "application/gzip"})
r1 = Response("http://www.example.com", headers=hdrs)
self.assertTrue(is_gzipped(r1))
def test_is_gzipped_not_quite(self):
hdrs = Headers({"Content-Type": "application/gzippppp"})
r1 = Response("http://www.example.com", headers=hdrs)
self.assertFalse(is_gzipped(r1))
def test_is_gzipped_case_insensitive(self):
hdrs = Headers({"Content-Type": "Application/X-Gzip"})
r1 = Response("http://www.example.com", headers=hdrs)
self.assertTrue(is_gzipped(r1))
hdrs = Headers({"Content-Type": "application/X-GZIP ; charset=utf-8"})
r1 = Response("http://www.example.com", headers=hdrs)
self.assertTrue(is_gzipped(r1))
r2 = Response("http://www.example.com", body=gunzip(r1.body))
assert r2.body.endswith(b'</html>')
self.assertFalse(gzip_magic_number(r2))
def test_is_gzipped_empty(self):
r1 = Response("http://www.example.com")
self.assertFalse(is_gzipped(r1))
def test_is_gzipped_wrong(self):
hdrs = Headers({"Content-Type": "application/javascript"})
r1 = Response("http://www.example.com", headers=hdrs)
self.assertFalse(is_gzipped(r1))
def test_is_gzipped_with_charset(self):
hdrs = Headers({"Content-Type": "application/x-gzip;charset=utf-8"})
r1 = Response("http://www.example.com", headers=hdrs)
self.assertTrue(is_gzipped(r1))
self.assertFalse(gzip_magic_number(r1))
def test_gunzip_illegal_eof(self):
with open(join(SAMPLEDIR, 'unexpected-eof.gz'), 'rb') as f:
......
import unittest
from scrapy.utils.http import decode_chunked_transfer
class ChunkedTest(unittest.TestCase):
def test_decode_chunked_transfer(self):
"""Example taken from: http://en.wikipedia.org/wiki/Chunked_transfer_encoding"""
chunked_body = "25\r\n" + "This is the data in the first chunk\r\n\r\n"
chunked_body += "1C\r\n" + "and this is the second one\r\n\r\n"
chunked_body += "3\r\n" + "con\r\n"
chunked_body += "8\r\n" + "sequence\r\n"
chunked_body += "0\r\n\r\n"
body = decode_chunked_transfer(chunked_body)
self.assertEqual(
body,
"This is the data in the first chunk\r\nand this is the second one\r\nconsequence"
)
import asyncio
from unittest import SkipTest
from pydispatch import dispatcher
from pytest import mark
from testfixtures import LogCapture
from twisted.trial import unittest
from twisted.python.failure import Failure
from twisted import version as twisted_version
from twisted.internet import defer, reactor
from pydispatch import dispatcher
from twisted.python.failure import Failure
from twisted.python.versions import Version
from twisted.trial import unittest
from scrapy.utils.signal import send_catch_log, send_catch_log_deferred
from scrapy.utils.test import get_from_asyncio_queue
......@@ -68,6 +71,7 @@ class SendCatchLogDeferredTest2(SendCatchLogDeferredTest):
return d
@mark.usefixtures('reactor_pytest')
class SendCatchLogDeferredAsyncDefTest(SendCatchLogDeferredTest):
async def ok_handler(self, arg, handlers_called):
......@@ -76,6 +80,19 @@ class SendCatchLogDeferredAsyncDefTest(SendCatchLogDeferredTest):
await defer.succeed(42)
return "OK"
def test_send_catch_log(self):
if (
self.reactor_pytest == 'asyncio'
and twisted_version < Version('twisted', 18, 4, 0)
):
raise SkipTest(
'Due to https://twistedmatrix.com/trac/ticket/9390, this test '
'fails due to a timeout when using AsyncIO and Twisted '
'versions lower than 18.4.0'
)
return super().test_send_catch_log()
@mark.only_asyncio()
class SendCatchLogDeferredAsyncioTest(SendCatchLogDeferredTest):
......@@ -86,6 +103,16 @@ class SendCatchLogDeferredAsyncioTest(SendCatchLogDeferredTest):
await asyncio.sleep(0.2)
return await get_from_asyncio_queue("OK")
def test_send_catch_log(self):
if twisted_version < Version('twisted', 18, 4, 0):
raise SkipTest(
'Due to https://twistedmatrix.com/trac/ticket/9390, this test '
'fails due to a timeout when using Twisted versions lower '
'than 18.4.0'
)
return super().test_send_catch_log()
class SendCatchLogTest2(unittest.TestCase):
......
......@@ -4,7 +4,10 @@ Tests borrowed from the twisted.web.client tests.
"""
import os
import shutil
import sys
from pkg_resources import parse_version
import cryptography
import OpenSSL.SSL
from twisted.trial import unittest
from twisted.web import server, static, util, resource
......@@ -414,6 +417,8 @@ class WebClientCustomCiphersSSLTestCase(WebClientSSLTestCase):
).addCallback(self.assertEqual, to_bytes(s))
def testPayloadDisabledCipher(self):
if sys.implementation.name == "pypy" and parse_version(cryptography.__version__) <= parse_version("2.3.1"):
self.skipTest("This does work in PyPy with cryptography<=2.3.1")
s = "0123456789" * 10
settings = Settings({'DOWNLOADER_CLIENT_TLS_CIPHERS': 'ECDHE-RSA-AES256-GCM-SHA384'})
client_context_factory = create_instance(ScrapyClientContextFactory, settings=settings, crawler=None)
......
......@@ -11,6 +11,10 @@ minversion = 1.7.0
deps =
-ctests/constraints.txt
-rtests/requirements-py3.txt
# mitmproxy does not support PyPy
# mitmproxy does not support Windows when running Python < 3.7
mitmproxy >= 4.0.4; python_version >= '3.7' and implementation_name != 'pypy'
mitmproxy >= 4.0.4, < 5; python_version >= '3.6' and python_version < '3.7' and platform_system != 'Windows' and implementation_name != 'pypy'
# Extras
botocore>=1.4.87
Pillow>=4.0.0
......@@ -20,8 +24,10 @@ passenv =
AWS_SECRET_ACCESS_KEY
GCS_TEST_FILE_URI
GCS_PROJECT_ID
#allow tox virtualenv to upgrade pip/wheel/setuptools
download = true
commands =
py.test --cov=scrapy --cov-report= {posargs:--durations=10 docs scrapy tests}
py.test --cov=scrapy --cov-report=xml --cov-report= {posargs:--durations=10 docs scrapy tests}
[testenv:typing]
basepython = python3
......@@ -66,7 +72,6 @@ deps =
itemadapter==0.1.0
parsel==1.5.0
Protego==0.1.15
PyDispatcher==2.0.5
pyOpenSSL==16.2.0
queuelib==1.4.2
service_identity==16.0.0
......@@ -74,15 +79,24 @@ deps =
w3lib==1.17.0
zope.interface==4.1.3
-rtests/requirements-py3.txt
# mitmproxy 4.0.4+ requires upgrading some of the pinned dependencies
# above, hence we do not install it in pinned environments at the moment
# Extras
botocore==1.4.87
google-cloud-storage==1.29.0
Pillow==4.0.0
setenv =
_SCRAPY_PINNED=true
[testenv:pinned]
deps =
{[pinned]deps}
lxml==3.5.0
PyDispatcher==2.0.5
setenv =
{[pinned]setenv}
[testenv:windows-pinned]
basepython = python3
......@@ -91,6 +105,9 @@ deps =
# First lxml version that includes a Windows wheel for Python 3.6, so we do
# not need to build lxml from sources in a CI Windows job:
lxml==3.8.0
PyDispatcher==2.0.5
setenv =
{[pinned]setenv}
[testenv:extra-deps]
deps =
......@@ -103,8 +120,10 @@ commands =
{[testenv]commands} --reactor=asyncio
[testenv:asyncio-pinned]
commands = {[testenv:asyncio]commands}
deps = {[testenv:pinned]deps}
commands = {[testenv:asyncio]commands}
setenv =
{[pinned]setenv}
[testenv:pypy3]
basepython = pypy3
......@@ -113,11 +132,13 @@ commands =
[testenv:pypy3-pinned]
basepython = {[testenv:pypy3]basepython}
commands = {[testenv:pypy3]commands}
deps =
{[pinned]deps}
lxml==4.0.0
PyPyDispatcher==2.1.0
commands = {[testenv:pypy3]commands}
setenv =
{[pinned]setenv}
[docs]
changedir = docs
......