提交 8a101321 编写于 作者: P Paul Moore 提交者: GitHub

Another attempt to fix encoding issues (#4486)

When reading bytes from subprocesses, use the locale encoding. Don't fail if the data is encoded incorrectly - instead, use the backslashreplace error handler (and warn the user).
上级 a3ce000b
......@@ -90,6 +90,25 @@ before invoking ``setup.py``. The injection should be transparent to
``setup.py`` emulating the commands pip requires may need to be aware that it
takes place.
Build System Output
~~~~~~~~~~~~~~~~~~~
Any output produced by the build system will be read by pip (for display to the
user if requested). In order to correctly read the build system output, pip
requires that the output is written in a well-defined encoding, specifically
the encoding the user has configured for text output (which can be obtained in
Python using ``locale.getpreferredencoding``). If the configured encoding is
ASCII, pip assumes UTF-8 (to account for the behaviour of some Unix systems).
Build systems should ensure that any tools they invoke (compilers, etc.) produce
output in the correct encoding. In practice - and in particular on Windows,
where tools are inconsistent in their use of the "OEM" and "ANSI" codepages -
this may not always be possible. Pip will therefore attempt to recover cleanly
if presented with incorrectly encoded build tool output, by translating
unexpected byte sequences to Python-style hexadecimal escape sequences
(``"\x80\xff"``, etc.). However, it is still possible for output to be displayed
using an incorrect encoding (mojibake).
Future Developments
~~~~~~~~~~~~~~~~~~~
......
Improve handling of text output from build tools (avoid Unicode errors)
......@@ -4,6 +4,9 @@ from __future__ import absolute_import, division
import os
import sys
import codecs
import locale
import logging
from pip._vendor.six import text_type
......@@ -24,6 +27,8 @@ __all__ = [
]
logger = logging.getLogger(__name__)
if sys.version_info >= (3, 4):
uses_pycache = True
from importlib.util import cache_from_source
......@@ -36,22 +41,76 @@ else:
cache_from_source = None
if sys.version_info >= (3,):
    def console_to_str(s):
        """Decode subprocess bytes ``s`` into text.

        The encoding reported by the original standard output stream is
        tried first; if the bytes are not valid in that encoding, they are
        decoded as UTF-8 instead (which may itself raise).
        """
        stdout_encoding = sys.__stdout__.encoding
        try:
            decoded = s.decode(stdout_encoding)
        except UnicodeDecodeError:
            decoded = s.decode('utf_8')
        return decoded
def backslashreplace_decode_fn(err):
    """Codec error handler that backslash-escapes undecodable bytes.

    Intended for use as a *decoding* error handler: given the
    ``UnicodeDecodeError`` ``err``, return a ``(replacement, resume_pos)``
    tuple where ``replacement`` is the offending bytes rendered as
    ``\\xNN`` escapes and ``resume_pos`` is the index at which decoding
    should continue (``err.end``).
    """
    # bytearray iterates as integers on both Python 2 and Python 3, so no
    # version-specific ord() conversion is required.
    raw_bytes = bytearray(err.object[err.start:err.end])
    # Always emit two hex digits so that e.g. byte 0x09 becomes "\x09"
    # rather than the ambiguous "\x9".
    return u"".join(u"\\x%02x" % c for c in raw_bytes), err.end


if sys.version_info >= (3, 5):
    backslashreplace_decode = "backslashreplace"
else:
    # In version 3.4 and older, backslashreplace exists
    # but does not support use for decoding.
    # We register our own replace handler for this
    # situation, so that we can consistently use
    # backslash replacement for all versions.
    codecs.register_error(
        "backslashreplace_decode",
        backslashreplace_decode_fn)
    backslashreplace_decode = "backslashreplace_decode"
def console_to_str(data):
    """Return a string, safe for output, of subprocess output.

    We assume the data is in the locale preferred encoding.
    If it won't decode properly, we warn the user but decode as
    best we can.

    We also ensure that the output can be safely written to
    standard output without encoding errors.

    :param data: bytes read from a subprocess.
    :return: the decoded text, with undecodable input bytes and
        unencodable output characters replaced by backslash escapes.
    """

    # First, get the encoding we assume. This is the preferred
    # encoding for the locale, unless that is not found, or
    # it is ASCII, in which case assume UTF-8
    encoding = locale.getpreferredencoding()
    if (not encoding) or codecs.lookup(encoding).name == "ascii":
        encoding = "utf-8"

    # Now try to decode the data - if we fail, warn the user and
    # decode with replacement.
    try:
        s = data.decode(encoding)
    except UnicodeDecodeError:
        # The message is formatted eagerly and passed as a single
        # argument, which is the call shape existing callers of
        # logger.warning replacements rely on.
        logger.warning(
            "Subprocess output does not appear to be encoded as %s" %
            encoding)
        s = data.decode(encoding, errors=backslashreplace_decode)

    # Make sure we can print the output, by encoding it to the output
    # encoding with replacement of unencodable characters, and then
    # decoding again.
    # We use stderr's encoding because it's less likely to be
    # redirected and if we don't find an encoding we skip this
    # step (on the assumption that output is wrapped by something
    # that won't fail).
    # sys.__stderr__ itself can be None (or lack an encoding attribute),
    # so look the encoding up defensively instead of assuming a stream.
    output_encoding = getattr(
        getattr(sys, "__stderr__", None), "encoding", None)
    if output_encoding:
        s = s.encode(output_encoding, errors="backslashreplace")
        s = s.decode(output_encoding)

    return s
if sys.version_info >= (3,):
def native_str(s, replace=False):
    """Return ``s`` as text, decoding it from UTF-8 if it is ``bytes``.

    Non-bytes values are returned unchanged. When ``replace`` is true,
    undecodable bytes are substituted using the ``'replace'`` error
    handler; otherwise decoding is strict and may raise
    ``UnicodeDecodeError``.
    """
    if not isinstance(s, bytes):
        return s
    error_policy = 'replace' if replace else 'strict'
    return s.decode('utf-8', error_policy)
else:
def console_to_str(s):
return s
def native_str(s, replace=False):
# Replace is ignored -- unicode to UTF-8 can't fail
if isinstance(s, text_type):
......
import locale
import os
import pip.compat
import pytest
from pip.compat import expanduser, get_path_uid, native_str
from pip.compat import expanduser, get_path_uid, native_str, console_to_str
def test_get_path_uid():
......@@ -40,6 +41,29 @@ def test_get_path_uid_symlink_without_NOFOLLOW(tmpdir, monkeypatch):
get_path_uid(fs)
def test_console_to_str(monkeypatch):
    """console_to_str must not fail, whatever the locale encoding is.

    The same byte string is decoded under several preferred encodings;
    in every case the leading and trailing ASCII characters must survive.
    """
    payload = b"a\xE9\xC3\xE9b"
    candidate_encodings = (
        'ascii', 'utf-8', 'iso-8859-1', 'iso-8859-5', 'koi8_r', 'cp850',
    )

    for name in candidate_encodings:
        def fake_preferred_encoding():
            return name

        monkeypatch.setattr(
            locale, 'getpreferredencoding', fake_preferred_encoding)
        decoded = console_to_str(payload)
        assert decoded.startswith("a")
        assert decoded.endswith("b")
def test_console_to_str_warning(monkeypatch):
    """console_to_str must warn when the data is not in the locale encoding.

    ``b"a\\xE9b"`` is not valid UTF-8, so with a UTF-8 locale exactly one
    warning is expected.
    """
    some_bytes = b"a\xE9b"
    warnings_seen = []

    def check_warning(msg, *args, **kwargs):
        # Only record the call here; asserting after the fact ensures the
        # test fails if no warning is emitted at all.
        warnings_seen.append(msg)

    monkeypatch.setattr(locale, 'getpreferredencoding', lambda: 'utf-8')
    monkeypatch.setattr(pip.compat.logger, 'warning', check_warning)
    console_to_str(some_bytes)

    assert len(warnings_seen) == 1
    assert warnings_seen[0].startswith(
        "Subprocess output does not appear to be encoded as")
def test_to_native_str_type():
some_bytes = b"test\xE9 et approuv\xC3\xE9"
some_unicode = b"test\xE9 et approuv\xE9".decode('iso-8859-15')
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册