Commit dbef7e2b authored by Mikhail Korobov

Merge pull request #1947 from scrapy/canonicalize-url

[MRG+1] Fix canonicalize_url() on Python 3 and re-enable tests
......@@ -7,15 +7,18 @@ to the w3lib.url module. Always import those from there instead.
"""
import posixpath
import re
import six
from six.moves.urllib.parse import (ParseResult, urlunparse, urldefrag,
urlparse, parse_qsl, urlencode,
unquote)
quote, unquote)
if not six.PY2:
from urllib.parse import unquote_to_bytes
# scrapy.utils.url was moved to w3lib.url and import * ensures this
# move doesn't break old code
from w3lib.url import *
from w3lib.url import _safe_chars
from scrapy.utils.python import to_native_str
from scrapy.utils.python import to_bytes, to_native_str, to_unicode
def url_is_from_any_domain(url, domains):
......@@ -37,42 +40,112 @@ def url_has_any_extension(url, extensions):
return posixpath.splitext(parse_url(url).path)[1].lower() in extensions
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    """Return a 6-tuple of percent-encoded URL components built from *parts*
    (a ``ParseResult``), suitable for passing to ``urlunparse()``.

    - the netloc is IDNA-encoded so non-ASCII domains become xn-- punycode
    - path and params are encoded with *path_encoding* (UTF-8 by default)
    - query and fragment are encoded with *encoding* (page/form charset)

    Raises UnicodeEncodeError if a component cannot be represented in the
    requested encoding (callers may catch it and retry with UTF-8).
    """
    return (
        to_native_str(parts.scheme),
        to_native_str(parts.netloc.encode('idna')),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars)
    )
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
                     encoding=None):
    """Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
    - percent encode paths ; non-ASCII characters are percent-encoded
      using UTF-8 (RFC-3986)
    - percent encode query arguments ; non-ASCII characters are percent-encoded
      using passed `encoding` (UTF-8 by default)
    - normalize all spaces (in query arguments) '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless `keep_blank_values` is True)
    - remove fragments (unless `keep_fragments` is True)

    The url passed can be bytes or unicode, while the url returned is
    always a native str (bytes in Python 2, unicode in Python 3).

    For examples see the tests in tests/test_utils_url.py
    """
    # If supplied `encoding` is not compatible with all characters in `url`,
    # fallback to UTF-8 as safety net.
    # UTF-8 can handle all Unicode characters,
    # so we should be covered regarding URL normalization,
    # if not for proper URL expected by remote website.
    try:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding=encoding)
    except UnicodeEncodeError:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding='utf8')

    # 1. decode query-string as UTF-8 (or keep raw bytes),
    #    sort values,
    #    and percent-encode them back
    if six.PY2:
        keyvals = parse_qsl(query, keep_blank_values)
    else:
        # Python3's urllib.parse.parse_qsl does not work as wanted
        # for percent-encoded characters that do not match passed encoding:
        # they get lost.
        #
        # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
        # (i.e. with 'REPLACEMENT CHARACTER' (U+FFFD),
        # instead of the \xa3 that you get with Python2's parse_qsl)
        #
        # what we want here is to keep raw bytes, and percent-encode them,
        # so as to preserve whatever encoding was originally used.
        #
        # See https://tools.ietf.org/html/rfc3987#section-6.4 on why the
        # query part may legitimately not be UTF-8 encoded.
        keyvals = parse_qsl_to_bytes(query, keep_blank_values)
    keyvals.sort()
    query = urlencode(keyvals)

    # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
    #    and percent-encode path again (this normalizes to upper-case %XX)
    uqp = _unquotepath(path)
    path = quote(uqp, _safe_chars) or '/'

    fragment = '' if not keep_fragments else fragment

    # every part should be safe already
    return urlunparse((scheme, netloc.lower(), path, params, query, fragment))
def _unquotepath(path):
    """Decode percent-escapes in a URL path, keeping %2F/%3F escaped.

    '/' and '?' are re-escaped as %252F / %253F first so that decoding
    does not introduce new path separators or query delimiters.
    """
    for reserved in ('2f', '2F', '3f', '3F'):
        path = path.replace('%' + reserved, '%25' + reserved.upper())

    if six.PY2:
        # in Python 2, '%a3' becomes '\xa3', which is what we want
        return unquote(path)
    else:
        # in Python 3,
        # standard lib's unquote() does not work for non-UTF-8
        # percent-escaped characters, they get lost:
        # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
        #
        # unquote_to_bytes() returns raw bytes instead
        return unquote_to_bytes(path)
def parse_url(url, encoding=None):
    """Return a urlparse'd ParseResult for the given url.

    *url* may be a str, bytes (decoded with *encoding*), or an already
    parsed ParseResult, which is returned unchanged.
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_unicode(url, encoding))
if not six.PY2:
from urllib.parse import _coerce_args, unquote_to_bytes
def parse_qsl_to_bytes(qs, keep_blank_values=False, strict_parsing=False):
    """Parse a query given as a string argument.

    Data are returned as a list of (name, value) pairs as bytes.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings. A
        true value indicates that blanks should be retained as blank
        strings. The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.
    """
    # This mirrors Python 3's parse_qsl()
    # (https://hg.python.org/cpython/rev/c38ac7ab8d9a)
    # except that unquote(s, encoding, errors) calls are replaced
    # with unquote_to_bytes(s), so raw bytes survive any encoding.
    qs, _coerce_result = _coerce_args(qs)
    parsed = []
    # fields may be separated by '&' or ';'
    fields = (f for chunk in qs.split('&') for f in chunk.split(';'))
    for field in fields:
        if not field and not strict_parsing:
            continue
        name, sep, value = field.partition('=')
        if not sep:
            # control-name with no equal sign
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            if not keep_blank_values:
                continue
            value = ''
        if value or keep_blank_values:
            # '+' means space; percent-escapes decode to raw bytes
            parsed.append((
                _coerce_result(unquote_to_bytes(name.replace('+', ' '))),
                _coerce_result(unquote_to_bytes(value.replace('+', ' '))),
            ))
    return parsed
def escape_ajax(url):
......
......@@ -2,10 +2,12 @@
import unittest
import six
from six.moves.urllib.parse import urlparse
from scrapy.spiders import Spider
from scrapy.utils.url import (url_is_from_any_domain, url_is_from_spider,
canonicalize_url, add_http_if_no_scheme,
guess_scheme)
guess_scheme, parse_url)
__doctests__ = ['scrapy.utils.url']
......@@ -123,16 +125,55 @@ class CanonicalizeUrlTest(unittest.TestCase):
self.assertEqual(canonicalize_url("http://www.example.com/do?q=a%20space&a=1"),
"http://www.example.com/do?a=1&q=a+space")
def test_canonicalize_url_unicode_path(self):
    # non-ASCII path characters are percent-encoded as UTF-8 (RFC 3986)
    self.assertEqual(canonicalize_url(u"http://www.example.com/résumé"),
                     "http://www.example.com/r%C3%A9sum%C3%A9")
def test_canonicalize_url_unicode_query_string(self):
    # default encoding for path and query is UTF-8
    self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé"),
                     "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")

    # passed encoding will affect query string
    self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé", encoding='latin1'),
                     "http://www.example.com/r%C3%A9sum%C3%A9?q=r%E9sum%E9")

    self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='cp1251'),
                     "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%EE%F1%F1%E8%FF")
def test_canonicalize_url_unicode_query_string_wrong_encoding(self):
    # trying to encode with wrong encoding
    # fallback to UTF-8
    self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?currency=€", encoding='latin1'),
                     "http://www.example.com/r%C3%A9sum%C3%A9?currency=%E2%82%AC")

    self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='latin1'),
                     "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F")
def test_normalize_percent_encoding_in_paths(self):
    self.assertEqual(canonicalize_url("http://www.example.com/r%c3%a9sum%c3%a9"),
                     "http://www.example.com/r%C3%A9sum%C3%A9")

    # non-UTF8 encoded sequences: they should be kept untouched, only upper-cased
    # 'latin1'-encoded sequence in path
    self.assertEqual(canonicalize_url("http://www.example.com/a%a3do"),
                     "http://www.example.com/a%A3do")
    # 'latin1'-encoded path, UTF-8 encoded query string
    self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9"),
                     "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
    # 'latin1'-encoded path and query string
    self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%e9sum%e9"),
                     "http://www.example.com/a%A3do?q=r%E9sum%E9")
def test_normalize_percent_encoding_in_query_arguments(self):
    # percent-escapes in query values are normalized to upper-case %XX
    self.assertEqual(canonicalize_url("http://www.example.com/do?k=b%a3"),
                     "http://www.example.com/do?k=b%A3")

    self.assertEqual(canonicalize_url("http://www.example.com/do?k=r%c3%a9sum%c3%a9"),
                     "http://www.example.com/do?k=r%C3%A9sum%C3%A9")
def test_non_ascii_percent_encoding_in_paths(self):
self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
"http://www.example.com/a%20do?a=1"),
......@@ -144,7 +185,7 @@ class CanonicalizeUrlTest(unittest.TestCase):
"http://www.example.com/a%20do%C2%A3.html?a=1")
def test_non_ascii_percent_encoding_in_query_arguments(self):
    # both unicode and (UTF-8) bytes input should canonicalize identically
    self.assertEqual(canonicalize_url(u"http://www.example.com/do?price=£500&a=5&z=3"),
                     u"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
    self.assertEqual(canonicalize_url(b"http://www.example.com/do?price=\xc2\xa3500&a=5&z=3"),
                     "http://www.example.com/do?a=5&price=%C2%A3500&z=3")
......@@ -167,7 +208,6 @@ class CanonicalizeUrlTest(unittest.TestCase):
"http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
"http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")
@unittest.skipUnless(six.PY2, "TODO")
def test_safe_characters_unicode(self):
# urllib.quote uses a mapping cache of encoded characters. when parsing
# an already percent-encoded url, it will fail if that url was not
......@@ -181,12 +221,50 @@ class CanonicalizeUrlTest(unittest.TestCase):
self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
"http://www.example.com/")
def test_canonicalize_idns(self):
    # non-ASCII netlocs are IDNA-encoded to xn-- punycode
    self.assertEqual(canonicalize_url(u'http://www.bücher.de?q=bücher'),
                     'http://www.xn--bcher-kva.de/?q=b%C3%BCcher')
    # Japanese (+ reordering query parameters)
    self.assertEqual(canonicalize_url(u'http://はじめよう.みんな/?query=サ&maxResults=5'),
                     'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5')
def test_quoted_slash_and_question_sign(self):
    # %2F and %3F must stay escaped (unquoting them would change the
    # path/query structure), only the hex case is normalized
    self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
                     "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
    self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
                     "http://foo.com/AC%2FDC/")
def test_canonicalize_urlparsed(self):
    # canonicalize_url() can be passed an already urlparse'd URL
    self.assertEqual(canonicalize_url(urlparse(u"http://www.example.com/résumé?q=résumé")),
                     "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
    self.assertEqual(canonicalize_url(urlparse('http://www.example.com/caf%e9-con-leche.htm')),
                     'http://www.example.com/caf%E9-con-leche.htm')
    self.assertEqual(canonicalize_url(urlparse("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
                     "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
def test_canonicalize_parse_url(self):
    # parse_url() wraps urlparse and is used in link extractors
    self.assertEqual(canonicalize_url(parse_url(u"http://www.example.com/résumé?q=résumé")),
                     "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
    self.assertEqual(canonicalize_url(parse_url('http://www.example.com/caf%e9-con-leche.htm')),
                     'http://www.example.com/caf%E9-con-leche.htm')
    self.assertEqual(canonicalize_url(parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
                     "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
def test_canonicalize_url_idempotence(self):
    # canonicalizing an already canonical URL must be a no-op,
    # with or without the original encoding hint
    for url, enc in [(u'http://www.bücher.de/résumé?q=résumé', 'utf8'),
                     (u'http://www.example.com/résumé?q=résumé', 'latin1'),
                     (u'http://www.example.com/résumé?country=Россия', 'cp1251'),
                     (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'iso2022jp')]:
        canonicalized = canonicalize_url(url, encoding=enc)

        # if we canonicalize again, we get the same result
        self.assertEqual(canonicalize_url(canonicalized, encoding=enc), canonicalized)

        # without encoding, already canonicalized URL is canonicalized identically
        self.assertEqual(canonicalize_url(canonicalized), canonicalized)
class AddHttpIfNoScheme(unittest.TestCase):
......
Markdown is supported.
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register.