提交 20b2f324 编写于 作者: D Donald Stufft

Upgrade requests to 2.14.2

上级 065b9de2
Upgraded requests to 2.13.0.
Upgraded requests to 2.14.2.
......@@ -41,8 +41,8 @@ is at <http://python-requests.org>.
"""
__title__ = 'requests'
__version__ = '2.13.0'
__build__ = 0x021300
__version__ = '2.14.2'
__build__ = 0x021402
__author__ = 'Kenneth Reitz'
__license__ = 'Apache 2.0'
__copyright__ = 'Copyright 2016 Kenneth Reitz'
......
......@@ -19,7 +19,7 @@ from .packages.urllib3.util.retry import Retry
from .compat import urlparse, basestring
from .utils import (DEFAULT_CA_BUNDLE_PATH, get_encoding_from_headers,
prepend_scheme_if_needed, get_auth_from_url, urldefragauth,
select_proxy, to_native_string)
select_proxy)
from .structures import CaseInsensitiveDict
from .packages.urllib3.exceptions import ClosedPoolError
from .packages.urllib3.exceptions import ConnectTimeoutError
......@@ -64,7 +64,9 @@ class BaseAdapter(object):
data before giving up, as a float, or a :ref:`(connect timeout,
read timeout) <timeouts>` tuple.
:type timeout: float or tuple
:param verify: (optional) Whether to verify SSL certificates.
:param verify: (optional) Either a boolean, in which case it controls whether we verify
the server's TLS certificate, or a string, in which case it must be a path
to a CA bundle to use
:param cert: (optional) Any user-provided SSL certificate to be trusted.
:param proxies: (optional) The proxies dictionary to apply to the request.
"""
......@@ -202,7 +204,9 @@ class HTTPAdapter(BaseAdapter):
:param conn: The urllib3 connection object associated with the cert.
:param url: The requested URL.
:param verify: Whether we should actually verify the certificate.
:param verify: Either a boolean, in which case it controls whether we verify
the server's TLS certificate, or a string, in which case it must be a path
to a CA bundle to use
:param cert: The SSL certificate to verify.
"""
if url.lower().startswith('https') and verify:
......@@ -216,8 +220,9 @@ class HTTPAdapter(BaseAdapter):
if not cert_loc:
cert_loc = DEFAULT_CA_BUNDLE_PATH
if not cert_loc:
raise Exception("Could not find a suitable SSL CA certificate bundle.")
if not cert_loc or not os.path.exists(cert_loc):
raise IOError("Could not find a suitable TLS CA certificate bundle, "
"invalid path: {0}".format(cert_loc))
conn.cert_reqs = 'CERT_REQUIRED'
......@@ -236,6 +241,12 @@ class HTTPAdapter(BaseAdapter):
conn.key_file = cert[1]
else:
conn.cert_file = cert
if conn.cert_file and not os.path.exists(conn.cert_file):
raise IOError("Could not find the TLS certificate file, "
"invalid path: {0}".format(conn.cert_file))
if conn.key_file and not os.path.exists(conn.key_file):
raise IOError("Could not find the TLS key file, "
"invalid path: {0}".format(conn.key_file))
def build_response(self, req, resp):
"""Builds a :class:`Response <requests.Response>` object from a urllib3
......@@ -380,8 +391,10 @@ class HTTPAdapter(BaseAdapter):
:param timeout: (optional) How long to wait for the server to send
data before giving up, as a float, or a :ref:`(connect timeout,
read timeout) <timeouts>` tuple.
:type timeout: float or tuple
:param verify: (optional) Whether to verify SSL certificates.
:type timeout: float or tuple or urllib3 Timeout object
:param verify: (optional) Either a boolean, in which case it controls whether
we verify the server's TLS certificate, or a string, in which case it
must be a path to a CA bundle to use
:param cert: (optional) Any user-provided SSL certificate to be trusted.
:param proxies: (optional) The proxies dictionary to apply to the request.
:rtype: requests.Response
......@@ -405,6 +418,8 @@ class HTTPAdapter(BaseAdapter):
"timeout tuple, or a single float to set "
"both timeouts to the same value".format(timeout))
raise ValueError(err)
elif isinstance(timeout, TimeoutSauce):
pass
else:
timeout = TimeoutSauce(connect=timeout, read=timeout)
......
......@@ -19,7 +19,7 @@ def request(method, url, **kwargs):
:param method: method for the new :class:`Request` object.
:param url: URL for the new :class:`Request` object.
:param params: (optional) Dictionary or bytes to be sent in the query string for the :class:`Request`.
:param data: (optional) Dictionary, bytes, or file-like object to send in the body of the :class:`Request`.
:param data: (optional) Dictionary or list of tuples ``[(key, value)]`` (will be form-encoded), bytes, or file-like object to send in the body of the :class:`Request`.
:param json: (optional) json data to send in the body of the :class:`Request`.
:param headers: (optional) Dictionary of HTTP Headers to send with the :class:`Request`.
:param cookies: (optional) Dict or CookieJar object to send with the :class:`Request`.
......@@ -29,14 +29,16 @@ def request(method, url, **kwargs):
defining the content type of the given file and ``custom_headers`` a dict-like object containing additional headers
to add for the file.
:param auth: (optional) Auth tuple to enable Basic/Digest/Custom HTTP Auth.
:param timeout: (optional) How long to wait for the server to send data
:param timeout: (optional) How many seconds to wait for the server to send data
before giving up, as a float, or a :ref:`(connect timeout, read
timeout) <timeouts>` tuple.
:type timeout: float or tuple
:param allow_redirects: (optional) Boolean. Enable/disable GET/OPTIONS/POST/PUT/PATCH/DELETE/HEAD redirection. Defaults to ``True``.
:type allow_redirects: bool
:param proxies: (optional) Dictionary mapping protocol to the URL of the proxy.
:param verify: (optional) whether the SSL cert will be verified. A CA_BUNDLE path can also be provided. Defaults to ``True``.
:param verify: (optional) Either a boolean, in which case it controls whether we verify
the server's TLS certificate, or a string, in which case it must be a path
to a CA bundle to use. Defaults to ``True``.
:param stream: (optional) if ``False``, the response content will be immediately downloaded.
:param cert: (optional) if String, path to ssl client cert file (.pem). If Tuple, ('cert', 'key') pair.
:return: :class:`Response <Response>` object
......@@ -57,7 +59,7 @@ def request(method, url, **kwargs):
def get(url, params=None, **kwargs):
"""Sends a GET request.
r"""Sends a GET request.
:param url: URL for the new :class:`Request` object.
:param params: (optional) Dictionary or bytes to be sent in the query string for the :class:`Request`.
......@@ -71,7 +73,7 @@ def get(url, params=None, **kwargs):
def options(url, **kwargs):
"""Sends a OPTIONS request.
r"""Sends a OPTIONS request.
:param url: URL for the new :class:`Request` object.
:param \*\*kwargs: Optional arguments that ``request`` takes.
......@@ -84,7 +86,7 @@ def options(url, **kwargs):
def head(url, **kwargs):
"""Sends a HEAD request.
r"""Sends a HEAD request.
:param url: URL for the new :class:`Request` object.
:param \*\*kwargs: Optional arguments that ``request`` takes.
......@@ -97,10 +99,10 @@ def head(url, **kwargs):
def post(url, data=None, json=None, **kwargs):
"""Sends a POST request.
r"""Sends a POST request.
:param url: URL for the new :class:`Request` object.
:param data: (optional) Dictionary, bytes, or file-like object to send in the body of the :class:`Request`.
:param data: (optional) Dictionary (will be form-encoded), bytes, or file-like object to send in the body of the :class:`Request`.
:param json: (optional) json data to send in the body of the :class:`Request`.
:param \*\*kwargs: Optional arguments that ``request`` takes.
:return: :class:`Response <Response>` object
......@@ -111,10 +113,10 @@ def post(url, data=None, json=None, **kwargs):
def put(url, data=None, **kwargs):
"""Sends a PUT request.
r"""Sends a PUT request.
:param url: URL for the new :class:`Request` object.
:param data: (optional) Dictionary, bytes, or file-like object to send in the body of the :class:`Request`.
:param data: (optional) Dictionary (will be form-encoded), bytes, or file-like object to send in the body of the :class:`Request`.
:param json: (optional) json data to send in the body of the :class:`Request`.
:param \*\*kwargs: Optional arguments that ``request`` takes.
:return: :class:`Response <Response>` object
......@@ -125,10 +127,10 @@ def put(url, data=None, **kwargs):
def patch(url, data=None, **kwargs):
"""Sends a PATCH request.
r"""Sends a PATCH request.
:param url: URL for the new :class:`Request` object.
:param data: (optional) Dictionary, bytes, or file-like object to send in the body of the :class:`Request`.
:param data: (optional) Dictionary (will be form-encoded), bytes, or file-like object to send in the body of the :class:`Request`.
:param json: (optional) json data to send in the body of the :class:`Request`.
:param \*\*kwargs: Optional arguments that ``request`` takes.
:return: :class:`Response <Response>` object
......@@ -139,7 +141,7 @@ def patch(url, data=None, **kwargs):
def delete(url, **kwargs):
"""Sends a DELETE request.
r"""Sends a DELETE request.
:param url: URL for the new :class:`Request` object.
:param \*\*kwargs: Optional arguments that ``request`` takes.
......
......@@ -20,7 +20,6 @@ from .compat import urlparse, str, basestring
from .cookies import extract_cookies_to_jar
from ._internal_utils import to_native_string
from .utils import parse_dict_header
from .status_codes import codes
CONTENT_TYPE_FORM_URLENCODED = 'application/x-www-form-urlencoded'
CONTENT_TYPE_MULTI_PART = 'multipart/form-data'
......@@ -227,6 +226,12 @@ class HTTPDigestAuth(AuthBase):
:rtype: requests.Response
"""
# If response is not 4xx, do not auth
# See https://github.com/kennethreitz/requests/issues/3772
if not 400 <= r.status_code < 500:
self._thread_local.num_401_calls = 1
return r
if self._thread_local.pos is not None:
# Rewind the file position indicator of the body to where
# it was to resend the request.
......
此差异已折叠。
......@@ -39,7 +39,9 @@ import json
# ---------
if is_py2:
from urllib import quote, unquote, quote_plus, unquote_plus, urlencode, getproxies, proxy_bypass
from urllib import (
quote, unquote, quote_plus, unquote_plus, urlencode, getproxies,
proxy_bypass, proxy_bypass_environment, getproxies_environment)
from urlparse import urlparse, urlunparse, urljoin, urlsplit, urldefrag
from urllib2 import parse_http_list
import cookielib
......@@ -56,7 +58,7 @@ if is_py2:
elif is_py3:
from urllib.parse import urlparse, urlunparse, urljoin, urlsplit, urlencode, quote, unquote, quote_plus, unquote_plus, urldefrag
from urllib.request import parse_http_list, getproxies, proxy_bypass
from urllib.request import parse_http_list, getproxies, proxy_bypass, proxy_bypass_environment, getproxies_environment
from http import cookiejar as cookielib
from http.cookies import Morsel
from io import StringIO
......
......@@ -19,8 +19,6 @@ from .compat import cookielib, urlparse, urlunparse, Morsel
try:
import threading
# grr, pyflakes: this fixes "redefinition of unused 'threading'"
threading
except ImportError:
import dummy_threading as threading
......
......@@ -16,7 +16,7 @@ import sys
# such as in Embedded Python. See https://github.com/kennethreitz/requests/issues/3578.
import encodings.idna
from io import BytesIO, UnsupportedOperation
from io import UnsupportedOperation
from .hooks import default_hooks
from .structures import CaseInsensitiveDict
......@@ -36,7 +36,7 @@ from .utils import (
stream_decode_response_unicode, to_key_val_list, parse_header_links,
iter_slices, guess_json_utf, super_len, check_header_validity)
from .compat import (
cookielib, urlunparse, urlsplit, urlencode, str, bytes, StringIO,
cookielib, urlunparse, urlsplit, urlencode, str, bytes,
is_py2, chardet, builtin_str, basestring)
from .compat import json as complexjson
from .status_codes import codes
......@@ -659,11 +659,23 @@ class Response(object):
return '<Response [%s]>' % (self.status_code)
def __bool__(self):
"""Returns true if :attr:`status_code` is 'OK'."""
"""Returns True if :attr:`status_code` is less than 400.
This attribute checks if the status code of the response is between
400 and 600 to see if there was a client error or a server error. If
the status code, is between 200 and 400, this will return True. This
is **not** a check to see if the response code is ``200 OK``.
"""
return self.ok
def __nonzero__(self):
"""Returns true if :attr:`status_code` is 'OK'."""
"""Returns True if :attr:`status_code` is less than 400.
This attribute checks if the status code of the response is between
400 and 600 to see if there was a client error or a server error. If
the status code, is between 200 and 400, this will return True. This
is **not** a check to see if the response code is ``200 OK``.
"""
return self.ok
def __iter__(self):
......@@ -672,6 +684,13 @@ class Response(object):
@property
def ok(self):
"""Returns True if :attr:`status_code` is less than 400.
This attribute checks if the status code of the response is between
400 and 600 to see if there was a client error or a server error. If
the status code, is between 200 and 400, this will return True. This
is **not** a check to see if the response code is ``200 OK``.
"""
try:
self.raise_for_status()
except HTTPError:
......@@ -840,7 +859,7 @@ class Response(object):
return content
def json(self, **kwargs):
"""Returns the json-encoded content of a response, if any.
r"""Returns the json-encoded content of a response, if any.
:param \*\*kwargs: Optional arguments that ``json.loads`` takes.
:raises ValueError: If the response body does not contain valid json.
......
......@@ -15,18 +15,25 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
__version__ = "2.3.0"
from sys import version_info
from .compat import PY2, PY3
from .universaldetector import UniversalDetector
from .version import __version__, VERSION
def detect(aBuf):
if ((version_info < (3, 0) and isinstance(aBuf, unicode)) or
(version_info >= (3, 0) and not isinstance(aBuf, bytes))):
raise ValueError('Expected a bytes object, not a unicode object')
from . import universaldetector
u = universaldetector.UniversalDetector()
u.reset()
u.feed(aBuf)
u.close()
return u.result
def detect(byte_str):
"""
Detect the encoding of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError('Expected object of type bytes or bytearray, got: '
'{0}'.format(type(byte_str)))
else:
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
return detector.close()
......@@ -28,15 +28,20 @@
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import Big5DistributionAnalysis
from .mbcssm import Big5SMModel
from .mbcssm import BIG5_SM_MODEL
class Big5Prober(MultiByteCharSetProber):
def __init__(self):
MultiByteCharSetProber.__init__(self)
self._mCodingSM = CodingStateMachine(Big5SMModel)
self._mDistributionAnalyzer = Big5DistributionAnalysis()
super(Big5Prober, self).__init__()
self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
self.distribution_analyzer = Big5DistributionAnalysis()
self.reset()
def get_charset_name(self):
@property
def charset_name(self):
return "Big5"
@property
def language(self):
return "Chinese"
......@@ -25,82 +25,84 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .euctwfreq import (EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE,
from .euctwfreq import (EUCTW_CHAR_TO_FREQ_ORDER, EUCTW_TABLE_SIZE,
EUCTW_TYPICAL_DISTRIBUTION_RATIO)
from .euckrfreq import (EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE,
from .euckrfreq import (EUCKR_CHAR_TO_FREQ_ORDER, EUCKR_TABLE_SIZE,
EUCKR_TYPICAL_DISTRIBUTION_RATIO)
from .gb2312freq import (GB2312CharToFreqOrder, GB2312_TABLE_SIZE,
from .gb2312freq import (GB2312_CHAR_TO_FREQ_ORDER, GB2312_TABLE_SIZE,
GB2312_TYPICAL_DISTRIBUTION_RATIO)
from .big5freq import (Big5CharToFreqOrder, BIG5_TABLE_SIZE,
from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE,
BIG5_TYPICAL_DISTRIBUTION_RATIO)
from .jisfreq import (JISCharToFreqOrder, JIS_TABLE_SIZE,
from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
JIS_TYPICAL_DISTRIBUTION_RATIO)
from .compat import wrap_ord
ENOUGH_DATA_THRESHOLD = 1024
SURE_YES = 0.99
SURE_NO = 0.01
MINIMUM_DATA_THRESHOLD = 3
class CharDistributionAnalysis(object):
ENOUGH_DATA_THRESHOLD = 1024
SURE_YES = 0.99
SURE_NO = 0.01
MINIMUM_DATA_THRESHOLD = 3
class CharDistributionAnalysis:
def __init__(self):
# Mapping table to get frequency order from char order (get from
# GetOrder())
self._mCharToFreqOrder = None
self._mTableSize = None # Size of above table
self._char_to_freq_order = None
self._table_size = None # Size of above table
# This is a constant value which varies from language to language,
# used in calculating confidence. See
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
# for further detail.
self._mTypicalDistributionRatio = None
self.typical_distribution_ratio = None
self._done = None
self._total_chars = None
self._freq_chars = None
self.reset()
def reset(self):
"""reset analyser, clear any state"""
# If this flag is set to True, detection is done and conclusion has
# been made
self._mDone = False
self._mTotalChars = 0 # Total characters encountered
self._done = False
self._total_chars = 0 # Total characters encountered
# The number of characters whose frequency order is less than 512
self._mFreqChars = 0
self._freq_chars = 0
def feed(self, aBuf, aCharLen):
def feed(self, char, char_len):
"""feed a character with known length"""
if aCharLen == 2:
if char_len == 2:
# we only care about 2-bytes character in our distribution analysis
order = self.get_order(aBuf)
order = self.get_order(char)
else:
order = -1
if order >= 0:
self._mTotalChars += 1
self._total_chars += 1
# order is valid
if order < self._mTableSize:
if 512 > self._mCharToFreqOrder[order]:
self._mFreqChars += 1
if order < self._table_size:
if 512 > self._char_to_freq_order[order]:
self._freq_chars += 1
def get_confidence(self):
"""return confidence based on existing data"""
# if we didn't receive any character in our consideration range,
# return negative answer
if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD:
return SURE_NO
if self._total_chars <= 0 or self._freq_chars <= self.MINIMUM_DATA_THRESHOLD:
return self.SURE_NO
if self._mTotalChars != self._mFreqChars:
r = (self._mFreqChars / ((self._mTotalChars - self._mFreqChars)
* self._mTypicalDistributionRatio))
if r < SURE_YES:
if self._total_chars != self._freq_chars:
r = (self._freq_chars / ((self._total_chars - self._freq_chars)
* self.typical_distribution_ratio))
if r < self.SURE_YES:
return r
# normalize confidence (we don't want to be 100% sure)
return SURE_YES
return self.SURE_YES
def got_enough_data(self):
# It is not necessary to receive all data to draw conclusion.
# For charset detection, certain amount of data is enough
return self._mTotalChars > ENOUGH_DATA_THRESHOLD
return self._total_chars > self.ENOUGH_DATA_THRESHOLD
def get_order(self, aBuf):
def get_order(self, byte_str):
# We do not handle characters based on the original encoding string,
# but convert this encoding string to a number, here called order.
# This allows multiple encodings of a language to share one frequency
......@@ -110,55 +112,55 @@ class CharDistributionAnalysis:
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
self._mCharToFreqOrder = EUCTWCharToFreqOrder
self._mTableSize = EUCTW_TABLE_SIZE
self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
super(EUCTWDistributionAnalysis, self).__init__()
self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
self._table_size = EUCTW_TABLE_SIZE
self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aBuf):
def get_order(self, byte_str):
# for euc-TW encoding, we are interested
# first byte range: 0xc4 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char = wrap_ord(aBuf[0])
first_char = byte_str[0]
if first_char >= 0xC4:
return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1
return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
else:
return -1
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
self._mCharToFreqOrder = EUCKRCharToFreqOrder
self._mTableSize = EUCKR_TABLE_SIZE
self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
super(EUCKRDistributionAnalysis, self).__init__()
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
self._table_size = EUCKR_TABLE_SIZE
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aBuf):
def get_order(self, byte_str):
# for euc-KR encoding, we are interested
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char = wrap_ord(aBuf[0])
first_char = byte_str[0]
if first_char >= 0xB0:
return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1
return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
else:
return -1
class GB2312DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
self._mCharToFreqOrder = GB2312CharToFreqOrder
self._mTableSize = GB2312_TABLE_SIZE
self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
super(GB2312DistributionAnalysis, self).__init__()
self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
self._table_size = GB2312_TABLE_SIZE
self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aBuf):
def get_order(self, byte_str):
# for GB2312 encoding, we are interested
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
first_char, second_char = byte_str[0], byte_str[1]
if (first_char >= 0xB0) and (second_char >= 0xA1):
return 94 * (first_char - 0xB0) + second_char - 0xA1
else:
......@@ -167,17 +169,17 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
class Big5DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
self._mCharToFreqOrder = Big5CharToFreqOrder
self._mTableSize = BIG5_TABLE_SIZE
self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
super(Big5DistributionAnalysis, self).__init__()
self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
self._table_size = BIG5_TABLE_SIZE
self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aBuf):
def get_order(self, byte_str):
# for big5 encoding, we are interested
# first byte range: 0xa4 -- 0xfe
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
first_char, second_char = byte_str[0], byte_str[1]
if first_char >= 0xA4:
if second_char >= 0xA1:
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
......@@ -189,17 +191,17 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
class SJISDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
self._mCharToFreqOrder = JISCharToFreqOrder
self._mTableSize = JIS_TABLE_SIZE
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
super(SJISDistributionAnalysis, self).__init__()
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
self._table_size = JIS_TABLE_SIZE
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aBuf):
def get_order(self, byte_str):
# for sjis encoding, we are interested
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
# second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
# no validation needed here. State machine has done that
first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
first_char, second_char = byte_str[0], byte_str[1]
if (first_char >= 0x81) and (first_char <= 0x9F):
order = 188 * (first_char - 0x81)
elif (first_char >= 0xE0) and (first_char <= 0xEF):
......@@ -214,18 +216,18 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
CharDistributionAnalysis.__init__(self)
self._mCharToFreqOrder = JISCharToFreqOrder
self._mTableSize = JIS_TABLE_SIZE
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
super(EUCJPDistributionAnalysis, self).__init__()
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
self._table_size = JIS_TABLE_SIZE
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, aBuf):
def get_order(self, byte_str):
# for euc-JP encoding, we are interested
# first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
char = wrap_ord(aBuf[0])
char = byte_str[0]
if char >= 0xA0:
return 94 * (char - 0xA1) + wrap_ord(aBuf[1]) - 0xa1
return 94 * (char - 0xA1) + byte_str[1] - 0xa1
else:
return -1
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
......@@ -13,94 +13,94 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from . import constants
import sys
from .enums import ProbingState
from .charsetprober import CharSetProber
class CharSetGroupProber(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
self._mActiveNum = 0
self._mProbers = []
self._mBestGuessProber = None
def __init__(self, lang_filter=None):
super(CharSetGroupProber, self).__init__(lang_filter=lang_filter)
self._active_num = 0
self.probers = []
self._best_guess_prober = None
def reset(self):
CharSetProber.reset(self)
self._mActiveNum = 0
for prober in self._mProbers:
super(CharSetGroupProber, self).reset()
self._active_num = 0
for prober in self.probers:
if prober:
prober.reset()
prober.active = True
self._mActiveNum += 1
self._mBestGuessProber = None
self._active_num += 1
self._best_guess_prober = None
@property
def charset_name(self):
if not self._best_guess_prober:
self.get_confidence()
if not self._best_guess_prober:
return None
return self._best_guess_prober.charset_name
def get_charset_name(self):
if not self._mBestGuessProber:
@property
def language(self):
if not self._best_guess_prober:
self.get_confidence()
if not self._mBestGuessProber:
if not self._best_guess_prober:
return None
# self._mBestGuessProber = self._mProbers[0]
return self._mBestGuessProber.get_charset_name()
return self._best_guess_prober.language
def feed(self, aBuf):
for prober in self._mProbers:
def feed(self, byte_str):
for prober in self.probers:
if not prober:
continue
if not prober.active:
continue
st = prober.feed(aBuf)
if not st:
state = prober.feed(byte_str)
if not state:
continue
if st == constants.eFoundIt:
self._mBestGuessProber = prober
return self.get_state()
elif st == constants.eNotMe:
if state == ProbingState.FOUND_IT:
self._best_guess_prober = prober
return self.state
elif state == ProbingState.NOT_ME:
prober.active = False
self._mActiveNum -= 1
if self._mActiveNum <= 0:
self._mState = constants.eNotMe
return self.get_state()
return self.get_state()
self._active_num -= 1
if self._active_num <= 0:
self._state = ProbingState.NOT_ME
return self.state
return self.state
def get_confidence(self):
st = self.get_state()
if st == constants.eFoundIt:
state = self.state
if state == ProbingState.FOUND_IT:
return 0.99
elif st == constants.eNotMe:
elif state == ProbingState.NOT_ME:
return 0.01
bestConf = 0.0
self._mBestGuessProber = None
for prober in self._mProbers:
best_conf = 0.0
self._best_guess_prober = None
for prober in self.probers:
if not prober:
continue
if not prober.active:
if constants._debug:
sys.stderr.write(prober.get_charset_name()
+ ' not active\n')
self.logger.debug('%s not active', prober.charset_name)
continue
cf = prober.get_confidence()
if constants._debug:
sys.stderr.write('%s confidence = %s\n' %
(prober.get_charset_name(), cf))
if bestConf < cf:
bestConf = cf
self._mBestGuessProber = prober
if not self._mBestGuessProber:
conf = prober.get_confidence()
self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf)
if best_conf < conf:
best_conf = conf
self._best_guess_prober = prober
if not self._best_guess_prober:
return 0.0
return bestConf
# else:
# self._mBestGuessProber = self._mProbers[0]
# return self._mBestGuessProber.get_confidence()
return best_conf
......@@ -26,37 +26,120 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from . import constants
import logging
import re
from .enums import ProbingState
class CharSetProber:
def __init__(self):
pass
class CharSetProber(object):
SHORTCUT_THRESHOLD = 0.95
def __init__(self, lang_filter=None):
self._state = None
self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__)
def reset(self):
self._mState = constants.eDetecting
self._state = ProbingState.DETECTING
def get_charset_name(self):
@property
def charset_name(self):
return None
def feed(self, aBuf):
def feed(self, buf):
pass
def get_state(self):
return self._mState
@property
def state(self):
return self._state
def get_confidence(self):
return 0.0
def filter_high_bit_only(self, aBuf):
aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf)
return aBuf
@staticmethod
def filter_high_byte_only(buf):
buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
return buf
@staticmethod
def filter_international_words(buf):
"""
We define three types of bytes:
alphabet: english alphabets [a-zA-Z]
international: international characters [\x80-\xFF]
marker: everything else [^a-zA-Z\x80-\xFF]
The input buffer can be thought to contain a series of words delimited
by markers. This function works to filter all words that contain at
least one international character. All contiguous sequences of markers
are replaced by a single space ascii character.
This filter applies to all scripts which do not use English characters.
"""
filtered = bytearray()
# This regex expression filters out only words that have at-least one
# international character. The word may include one marker character at
# the end.
words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
buf)
for word in words:
filtered.extend(word[:-1])
# If the last character in the word is a marker, replace it with a
# space as markers shouldn't affect our analysis (they are used
# similarly across all languages and may thus have similar
# frequencies).
last_char = word[-1:]
if not last_char.isalpha() and last_char < b'\x80':
last_char = b' '
filtered.extend(last_char)
return filtered
@staticmethod
def filter_with_english_letters(buf):
"""
Returns a copy of ``buf`` that retains only the sequences of English
alphabet and high byte characters that are not between <> characters.
Also retains English alphabet and high byte characters immediately
before occurrences of >.
This filter can be applied to all scripts which contain both English
characters and extended ASCII characters, but is currently only used by
``Latin1Prober``.
"""
filtered = bytearray()
in_tag = False
prev = 0
for curr in range(len(buf)):
# Slice here to get bytes instead of an int with Python 3
buf_char = buf[curr:curr + 1]
# Check if we're coming out of or entering an HTML tag
if buf_char == b'>':
in_tag = False
elif buf_char == b'<':
in_tag = True
# If current character is not extended-ASCII and not alphabetic...
if buf_char < b'\x80' and not buf_char.isalpha():
# ...and we're not in a tag
if curr > prev and not in_tag:
# Keep everything after last non-extended-ASCII,
# non-alphabetic character
filtered.extend(buf[prev:curr])
# Output a space to delimit stretch we kept
filtered.extend(b' ')
prev = curr + 1
def filter_without_english_letters(self, aBuf):
aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf)
return aBuf
# If we're not in a tag...
if not in_tag:
# Keep everything after last non-extended-ASCII, non-alphabetic
# character
filtered.extend(buf[prev:])
def filter_with_english_letters(self, aBuf):
# TODO
return aBuf
return filtered
#!/usr/bin/env python
"""
Script which takes one or more file paths and reports on their detected
encodings
Example::
% chardetect somefile someotherfile
somefile: windows-1252 with confidence 0.5
someotherfile: ascii with confidence 1.0
If no paths are provided, it takes its input from stdin.
"""
from __future__ import absolute_import, print_function, unicode_literals
import argparse
import sys
from chardet import __version__
from chardet.compat import PY2
from chardet.universaldetector import UniversalDetector
def description_of(lines, name='stdin'):
"""
Return a string describing the probable encoding of a file or
list of strings.
:param lines: The lines to get the encoding of.
:type lines: Iterable of bytes
:param name: Name of file or collection of lines
:type name: str
"""
u = UniversalDetector()
for line in lines:
line = bytearray(line)
u.feed(line)
# shortcut out of the loop to save reading further - particularly useful if we read a BOM.
if u.done:
break
u.close()
result = u.result
if PY2:
name = name.decode(sys.getfilesystemencoding(), 'ignore')
if result['encoding']:
return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
result['confidence'])
else:
return '{0}: no result'.format(name)
def main(argv=None):
"""
Handles command line arguments and gets things started.
:param argv: List of arguments, as if specified on the command-line.
If None, ``sys.argv[1:]`` is used instead.
:type argv: list of str
"""
# Get command line arguments
parser = argparse.ArgumentParser(
description="Takes one or more file paths and reports their detected \
encodings")
parser.add_argument('input',
help='File whose encoding we would like to determine. \
(default: stdin)',
type=argparse.FileType('rb'), nargs='*',
default=[sys.stdin if PY2 else sys.stdin.buffer])
parser.add_argument('--version', action='version',
version='%(prog)s {0}'.format(__version__))
args = parser.parse_args(argv)
for f in args.input:
if f.isatty():
print("You are running chardetect interactively. Press " +
"CTRL-D twice at the start of a blank line to signal the " +
"end of your input. If you want help, run chardetect " +
"--help\n", file=sys.stderr)
print(description_of(f, f.name))
if __name__ == '__main__':
main()
......@@ -25,37 +25,64 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .constants import eStart
from .compat import wrap_ord
import logging
from .enums import MachineState
class CodingStateMachine:
class CodingStateMachine(object):
"""
A state machine to verify a byte sequence for a particular encoding. For
each byte the detector receives, it will feed that byte to every active
state machine available, one byte at a time. The state machine changes its
state based on its previous state and the byte it receives. There are 3
states in a state machine that are of interest to an auto-detector:
START state: This is the state to start with, or a legal byte sequence
(i.e. a valid code point) for character has been identified.
ME state: This indicates that the state machine identified a byte sequence
that is specific to the charset it is designed for and that
there is no other possible encoding which can contain this byte
sequence. This will to lead to an immediate positive answer for
the detector.
ERROR state: This indicates the state machine identified an illegal byte
sequence for that encoding. This will lead to an immediate
negative answer for this encoding. Detector will exclude this
encoding from consideration from here on.
"""
def __init__(self, sm):
self._mModel = sm
self._mCurrentBytePos = 0
self._mCurrentCharLen = 0
self._model = sm
self._curr_byte_pos = 0
self._curr_char_len = 0
self._curr_state = None
self.logger = logging.getLogger(__name__)
self.reset()
def reset(self):
self._mCurrentState = eStart
self._curr_state = MachineState.START
def next_state(self, c):
# for each byte we get its class
# if it is first byte, we also get byte length
# PY3K: aBuf is a byte stream, so c is an int, not a byte
byteCls = self._mModel['classTable'][wrap_ord(c)]
if self._mCurrentState == eStart:
self._mCurrentBytePos = 0
self._mCurrentCharLen = self._mModel['charLenTable'][byteCls]
# from byte's class and stateTable, we get its next state
curr_state = (self._mCurrentState * self._mModel['classFactor']
+ byteCls)
self._mCurrentState = self._mModel['stateTable'][curr_state]
self._mCurrentBytePos += 1
return self._mCurrentState
byte_class = self._model['class_table'][c]
if self._curr_state == MachineState.START:
self._curr_byte_pos = 0
self._curr_char_len = self._model['char_len_table'][byte_class]
# from byte's class and state_table, we get its next state
curr_state = (self._curr_state * self._model['class_factor']
+ byte_class)
self._curr_state = self._model['state_table'][curr_state]
self._curr_byte_pos += 1
return self._curr_state
def get_current_charlen(self):
return self._mCurrentCharLen
return self._curr_char_len
def get_coding_state_machine(self):
return self._mModel['name']
return self._model['name']
@property
def language(self):
return self._model['language']
######################## BEGIN LICENSE BLOCK ########################
# Contributor(s):
# Ian Cordasco - port to Python
# Dan Blanchard
# Ian Cordasco
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
......@@ -22,13 +23,12 @@ import sys
if sys.version_info < (3, 0):
PY2 = True
PY3 = False
base_str = (str, unicode)
text_type = unicode
else:
PY2 = False
PY3 = True
base_str = (bytes, str)
def wrap_ord(a):
if sys.version_info < (3, 0) and isinstance(a, base_str):
return ord(a)
else:
return a
text_type = str
......@@ -25,20 +25,25 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCKRDistributionAnalysis
from .mbcssm import CP949SMModel
from .codingstatemachine import CodingStateMachine
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import CP949_SM_MODEL
class CP949Prober(MultiByteCharSetProber):
def __init__(self):
MultiByteCharSetProber.__init__(self)
self._mCodingSM = CodingStateMachine(CP949SMModel)
super(CP949Prober, self).__init__()
self.coding_sm = CodingStateMachine(CP949_SM_MODEL)
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
# not different.
self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
self.distribution_analyzer = EUCKRDistributionAnalysis()
self.reset()
def get_charset_name(self):
@property
def charset_name(self):
return "CP949"
@property
def language(self):
return "Korean"
"""
All of the Enums that are used throughout the chardet package.
:author: Dan Blanchard (dan.blanchard@gmail.com)
"""
class InputState(object):
"""
This enum represents the different states a universal detector can be in.
"""
PURE_ASCII = 0
ESC_ASCII = 1
HIGH_BYTE = 2
class LanguageFilter(object):
"""
This enum represents the different language filters we can apply to a
``UniversalDetector``.
"""
CHINESE_SIMPLIFIED = 0x01
CHINESE_TRADITIONAL = 0x02
JAPANESE = 0x04
KOREAN = 0x08
NON_CJK = 0x10
ALL = 0x1F
CHINESE = CHINESE_SIMPLIFIED | CHINESE_TRADITIONAL
CJK = CHINESE | JAPANESE | KOREAN
class ProbingState(object):
"""
This enum represents the different states a prober can be in.
"""
DETECTING = 0
FOUND_IT = 1
NOT_ME = 2
class MachineState(object):
"""
This enum represents the different states a state machine can be in.
"""
START = 0
ERROR = 1
ITS_ME = 2
class SequenceLikelihood(object):
"""
This enum represents the likelihood of a character following the previous one.
"""
NEGATIVE = 0
UNLIKELY = 1
LIKELY = 2
POSITIVE = 3
@classmethod
def get_num_categories(cls):
""":returns: The number of likelihood categories in the enum."""
return 4
class CharacterCategory(object):
"""
This enum represents the different categories language models for
``SingleByteCharsetProber`` put characters into.
Anything less than CONTROL is considered a letter.
"""
UNDEFINED = 255
LINE_BREAK = 254
SYMBOL = 253
DIGIT = 252
CONTROL = 251
......@@ -25,62 +25,77 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from . import constants
from .escsm import (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
ISO2022KRSMModel)
from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .compat import wrap_ord
from .enums import LanguageFilter, ProbingState, MachineState
from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
ISO2022KR_SM_MODEL)
class EscCharSetProber(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
self._mCodingSM = [
CodingStateMachine(HZSMModel),
CodingStateMachine(ISO2022CNSMModel),
CodingStateMachine(ISO2022JPSMModel),
CodingStateMachine(ISO2022KRSMModel)
]
"""
This CharSetProber uses a "code scheme" approach for detecting encodings,
whereby easily recognizable escape or shift sequences are relied on to
identify these encodings.
"""
def __init__(self, lang_filter=None):
super(EscCharSetProber, self).__init__(lang_filter=lang_filter)
self.coding_sm = []
if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
self.coding_sm.append(CodingStateMachine(ISO2022CN_SM_MODEL))
if self.lang_filter & LanguageFilter.JAPANESE:
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
if self.lang_filter & LanguageFilter.KOREAN:
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
self.active_sm_count = None
self._detected_charset = None
self._detected_language = None
self._state = None
self.reset()
def reset(self):
CharSetProber.reset(self)
for codingSM in self._mCodingSM:
if not codingSM:
super(EscCharSetProber, self).reset()
for coding_sm in self.coding_sm:
if not coding_sm:
continue
codingSM.active = True
codingSM.reset()
self._mActiveSM = len(self._mCodingSM)
self._mDetectedCharset = None
coding_sm.active = True
coding_sm.reset()
self.active_sm_count = len(self.coding_sm)
self._detected_charset = None
self._detected_language = None
@property
def charset_name(self):
return self._detected_charset
def get_charset_name(self):
return self._mDetectedCharset
@property
def language(self):
return self._detected_language
def get_confidence(self):
if self._mDetectedCharset:
if self._detected_charset:
return 0.99
else:
return 0.00
def feed(self, aBuf):
for c in aBuf:
# PY3K: aBuf is a byte array, so c is an int, not a byte
for codingSM in self._mCodingSM:
if not codingSM:
continue
if not codingSM.active:
def feed(self, byte_str):
for c in byte_str:
for coding_sm in self.coding_sm:
if not coding_sm or not coding_sm.active:
continue
codingState = codingSM.next_state(wrap_ord(c))
if codingState == constants.eError:
codingSM.active = False
self._mActiveSM -= 1
if self._mActiveSM <= 0:
self._mState = constants.eNotMe
return self.get_state()
elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt
self._mDetectedCharset = codingSM.get_coding_state_machine() # nopep8
return self.get_state()
coding_state = coding_sm.next_state(c)
if coding_state == MachineState.ERROR:
coding_sm.active = False
self.active_sm_count -= 1
if self.active_sm_count <= 0:
self._state = ProbingState.NOT_ME
return self.state
elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
self._detected_charset = coding_sm.get_coding_state_machine()
self._detected_language = coding_sm.language
return self.state
return self.get_state()
return self.state
......@@ -25,9 +25,9 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .constants import eStart, eError, eItsMe
from .enums import MachineState
HZ_cls = (
HZ_CLS = (
1,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
......@@ -62,24 +62,25 @@ HZ_cls = (
1,1,1,1,1,1,1,1, # f8 - ff
)
HZ_st = (
eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17
5,eError, 6,eError, 5, 5, 4,eError,# 18-1f
4,eError, 4, 4, 4,eError, 4,eError,# 20-27
4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f
HZ_ST = (
MachineState.START,MachineState.ERROR, 3,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START, 4,MachineState.ERROR,# 10-17
5,MachineState.ERROR, 6,MachineState.ERROR, 5, 5, 4,MachineState.ERROR,# 18-1f
4,MachineState.ERROR, 4, 4, 4,MachineState.ERROR, 4,MachineState.ERROR,# 20-27
4,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 28-2f
)
HZCharLenTable = (0, 0, 0, 0, 0, 0)
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
HZSMModel = {'classTable': HZ_cls,
'classFactor': 6,
'stateTable': HZ_st,
'charLenTable': HZCharLenTable,
'name': "HZ-GB-2312"}
HZ_SM_MODEL = {'class_table': HZ_CLS,
'class_factor': 6,
'state_table': HZ_ST,
'char_len_table': HZ_CHAR_LEN_TABLE,
'name': "HZ-GB-2312",
'language': 'Chinese'}
ISO2022CN_cls = (
ISO2022CN_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
......@@ -114,26 +115,27 @@ ISO2022CN_cls = (
2,2,2,2,2,2,2,2, # f8 - ff
)
ISO2022CN_st = (
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError,# 18-1f
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27
5, 6,eError,eError,eError,eError,eError,eError,# 28-2f
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37
eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f
ISO2022CN_ST = (
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,# 18-1f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 20-27
5, 6,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 28-2f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 30-37
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,# 38-3f
)
ISO2022CNCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0)
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
ISO2022CNSMModel = {'classTable': ISO2022CN_cls,
'classFactor': 9,
'stateTable': ISO2022CN_st,
'charLenTable': ISO2022CNCharLenTable,
'name': "ISO-2022-CN"}
ISO2022CN_SM_MODEL = {'class_table': ISO2022CN_CLS,
'class_factor': 9,
'state_table': ISO2022CN_ST,
'char_len_table': ISO2022CN_CHAR_LEN_TABLE,
'name': "ISO-2022-CN",
'language': 'Chinese'}
ISO2022JP_cls = (
ISO2022JP_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,2,2, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
......@@ -168,27 +170,28 @@ ISO2022JP_cls = (
2,2,2,2,2,2,2,2, # f8 - ff
)
ISO2022JP_st = (
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f
eError, 5,eError,eError,eError, 4,eError,eError,# 20-27
eError,eError,eError, 6,eItsMe,eError,eItsMe,eError,# 28-2f
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f
eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47
ISO2022JP_ST = (
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,# 18-1f
MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 20-27
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 6,MachineState.ITS_ME,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,# 28-2f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,# 30-37
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 38-3f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.START,# 40-47
)
ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
ISO2022JPSMModel = {'classTable': ISO2022JP_cls,
'classFactor': 10,
'stateTable': ISO2022JP_st,
'charLenTable': ISO2022JPCharLenTable,
'name': "ISO-2022-JP"}
ISO2022JP_SM_MODEL = {'class_table': ISO2022JP_CLS,
'class_factor': 10,
'state_table': ISO2022JP_ST,
'char_len_table': ISO2022JP_CHAR_LEN_TABLE,
'name': "ISO-2022-JP",
'language': 'Japanese'}
ISO2022KR_cls = (
ISO2022KR_CLS = (
2,0,0,0,0,0,0,0, # 00 - 07
0,0,0,0,0,0,0,0, # 08 - 0f
0,0,0,0,0,0,0,0, # 10 - 17
......@@ -223,20 +226,21 @@ ISO2022KR_cls = (
2,2,2,2,2,2,2,2, # f8 - ff
)
ISO2022KR_st = (
eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17
eError,eError,eError,eError, 5,eError,eError,eError,# 18-1f
eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27
ISO2022KR_ST = (
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 10-17
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 18-1f
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 20-27
)
ISO2022KRCharLenTable = (0, 0, 0, 0, 0, 0)
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
ISO2022KR_SM_MODEL = {'class_table': ISO2022KR_CLS,
'class_factor': 6,
'state_table': ISO2022KR_ST,
'char_len_table': ISO2022KR_CHAR_LEN_TABLE,
'name': "ISO-2022-KR",
'language': 'Korean'}
ISO2022KRSMModel = {'classTable': ISO2022KR_cls,
'classFactor': 6,
'stateTable': ISO2022KR_st,
'charLenTable': ISO2022KRCharLenTable,
'name': "ISO-2022-KR"}
# flake8: noqa
......@@ -25,66 +25,68 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
import sys
from . import constants
from .enums import ProbingState, MachineState
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCJPDistributionAnalysis
from .jpcntx import EUCJPContextAnalysis
from .mbcssm import EUCJPSMModel
from .mbcssm import EUCJP_SM_MODEL
class EUCJPProber(MultiByteCharSetProber):
def __init__(self):
MultiByteCharSetProber.__init__(self)
self._mCodingSM = CodingStateMachine(EUCJPSMModel)
self._mDistributionAnalyzer = EUCJPDistributionAnalysis()
self._mContextAnalyzer = EUCJPContextAnalysis()
super(EUCJPProber, self).__init__()
self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
self.distribution_analyzer = EUCJPDistributionAnalysis()
self.context_analyzer = EUCJPContextAnalysis()
self.reset()
def reset(self):
MultiByteCharSetProber.reset(self)
self._mContextAnalyzer.reset()
super(EUCJPProber, self).reset()
self.context_analyzer.reset()
def get_charset_name(self):
@property
def charset_name(self):
return "EUC-JP"
def feed(self, aBuf):
aLen = len(aBuf)
for i in range(0, aLen):
# PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte
codingState = self._mCodingSM.next_state(aBuf[i])
if codingState == constants.eError:
if constants._debug:
sys.stderr.write(self.get_charset_name()
+ ' prober hit error at byte ' + str(i)
+ '\n')
self._mState = constants.eNotMe
@property
def language(self):
return "Japanese"
def feed(self, byte_str):
for i in range(len(byte_str)):
# PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte
coding_state = self.coding_sm.next_state(byte_str[i])
if coding_state == MachineState.ERROR:
self.logger.debug('%s %s prober hit error at byte %s',
self.charset_name, self.language, i)
self._state = ProbingState.NOT_ME
break
elif codingState == constants.eItsMe:
self._mState = constants.eFoundIt
elif coding_state == MachineState.ITS_ME:
self._state = ProbingState.FOUND_IT
break
elif codingState == constants.eStart:
charLen = self._mCodingSM.get_current_charlen()
elif coding_state == MachineState.START:
char_len = self.coding_sm.get_current_charlen()
if i == 0:
self._mLastChar[1] = aBuf[0]
self._mContextAnalyzer.feed(self._mLastChar, charLen)
self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
self._last_char[1] = byte_str[0]
self.context_analyzer.feed(self._last_char, char_len)
self.distribution_analyzer.feed(self._last_char, char_len)
else:
self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen)
self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
charLen)
self.context_analyzer.feed(byte_str[i - 1:i + 1],
char_len)
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
char_len)
self._mLastChar[0] = aBuf[aLen - 1]
self._last_char[0] = byte_str[-1]
if self.get_state() == constants.eDetecting:
if (self._mContextAnalyzer.got_enough_data() and
(self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
self._mState = constants.eFoundIt
if self.state == ProbingState.DETECTING:
if (self.context_analyzer.got_enough_data() and
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
self._state = ProbingState.FOUND_IT
return self.get_state()
return self.state
def get_confidence(self):
contxtCf = self._mContextAnalyzer.get_confidence()
distribCf = self._mDistributionAnalyzer.get_confidence()
return max(contxtCf, distribCf)
context_conf = self.context_analyzer.get_confidence()
distrib_conf = self.distribution_analyzer.get_confidence()
return max(context_conf, distrib_conf)
......@@ -28,15 +28,20 @@
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCKRDistributionAnalysis
from .mbcssm import EUCKRSMModel
from .mbcssm import EUCKR_SM_MODEL
class EUCKRProber(MultiByteCharSetProber):
def __init__(self):
MultiByteCharSetProber.__init__(self)
self._mCodingSM = CodingStateMachine(EUCKRSMModel)
self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
super(EUCKRProber, self).__init__()
self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
self.distribution_analyzer = EUCKRDistributionAnalysis()
self.reset()
def get_charset_name(self):
@property
def charset_name(self):
return "EUC-KR"
@property
def language(self):
return "Korean"
......@@ -13,12 +13,12 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
......@@ -28,14 +28,19 @@
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCTWDistributionAnalysis
from .mbcssm import EUCTWSMModel
from .mbcssm import EUCTW_SM_MODEL
class EUCTWProber(MultiByteCharSetProber):
def __init__(self):
MultiByteCharSetProber.__init__(self)
self._mCodingSM = CodingStateMachine(EUCTWSMModel)
self._mDistributionAnalyzer = EUCTWDistributionAnalysis()
super(EUCTWProber, self).__init__()
self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
self.distribution_analyzer = EUCTWDistributionAnalysis()
self.reset()
def get_charset_name(self):
@property
def charset_name(self):
return "EUC-TW"
@property
def language(self):
return "Taiwan"
......@@ -43,7 +43,7 @@ GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9
GB2312_TABLE_SIZE = 3760
GB2312CharToFreqOrder = (
GB2312_CHAR_TO_FREQ_ORDER = (
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
2204, 869,4207, 970,2678,5626,2944,2956,1479,4048, 514,3595, 588,1346,2820,3409,
......@@ -278,195 +278,6 @@ GB2312CharToFreqOrder = (
1718,1717,2655,3453,3143,4465, 161,2889,2980,2009,1421, 56,1908,1640,2387,2232,
1917,1874,2477,4921, 148, 83,3438, 592,4245,2882,1822,1055, 741, 115,1496,1624,
381,1638,4592,1020, 516,3214, 458, 947,4575,1432, 211,1514,2926,1865,2142, 189,
852,1221,1400,1486, 882,2299,4036, 351, 28,1122, 700,6479,6480,6481,6482,6483, # last 512
#Everything below is of no interest for detection purpose
5508,6484,3900,3414,3974,4441,4024,3537,4037,5628,5099,3633,6485,3148,6486,3636,
5509,3257,5510,5973,5445,5872,4941,4403,3174,4627,5873,6276,2286,4230,5446,5874,
5122,6102,6103,4162,5447,5123,5323,4849,6277,3980,3851,5066,4246,5774,5067,6278,
3001,2807,5695,3346,5775,5974,5158,5448,6487,5975,5976,5776,3598,6279,5696,4806,
4211,4154,6280,6488,6489,6490,6281,4212,5037,3374,4171,6491,4562,4807,4722,4827,
5977,6104,4532,4079,5159,5324,5160,4404,3858,5359,5875,3975,4288,4610,3486,4512,
5325,3893,5360,6282,6283,5560,2522,4231,5978,5186,5449,2569,3878,6284,5401,3578,
4415,6285,4656,5124,5979,2506,4247,4449,3219,3417,4334,4969,4329,6492,4576,4828,
4172,4416,4829,5402,6286,3927,3852,5361,4369,4830,4477,4867,5876,4173,6493,6105,
4657,6287,6106,5877,5450,6494,4155,4868,5451,3700,5629,4384,6288,6289,5878,3189,
4881,6107,6290,6495,4513,6496,4692,4515,4723,5100,3356,6497,6291,3810,4080,5561,
3570,4430,5980,6498,4355,5697,6499,4724,6108,6109,3764,4050,5038,5879,4093,3226,
6292,5068,5217,4693,3342,5630,3504,4831,4377,4466,4309,5698,4431,5777,6293,5778,
4272,3706,6110,5326,3752,4676,5327,4273,5403,4767,5631,6500,5699,5880,3475,5039,
6294,5562,5125,4348,4301,4482,4068,5126,4593,5700,3380,3462,5981,5563,3824,5404,
4970,5511,3825,4738,6295,6501,5452,4516,6111,5881,5564,6502,6296,5982,6503,4213,
4163,3454,6504,6112,4009,4450,6113,4658,6297,6114,3035,6505,6115,3995,4904,4739,
4563,4942,4110,5040,3661,3928,5362,3674,6506,5292,3612,4791,5565,4149,5983,5328,
5259,5021,4725,4577,4564,4517,4364,6298,5405,4578,5260,4594,4156,4157,5453,3592,
3491,6507,5127,5512,4709,4922,5984,5701,4726,4289,6508,4015,6116,5128,4628,3424,
4241,5779,6299,4905,6509,6510,5454,5702,5780,6300,4365,4923,3971,6511,5161,3270,
3158,5985,4100, 867,5129,5703,6117,5363,3695,3301,5513,4467,6118,6512,5455,4232,
4242,4629,6513,3959,4478,6514,5514,5329,5986,4850,5162,5566,3846,4694,6119,5456,
4869,5781,3779,6301,5704,5987,5515,4710,6302,5882,6120,4392,5364,5705,6515,6121,
6516,6517,3736,5988,5457,5989,4695,2457,5883,4551,5782,6303,6304,6305,5130,4971,
6122,5163,6123,4870,3263,5365,3150,4871,6518,6306,5783,5069,5706,3513,3498,4409,
5330,5632,5366,5458,5459,3991,5990,4502,3324,5991,5784,3696,4518,5633,4119,6519,
4630,5634,4417,5707,4832,5992,3418,6124,5993,5567,4768,5218,6520,4595,3458,5367,
6125,5635,6126,4202,6521,4740,4924,6307,3981,4069,4385,6308,3883,2675,4051,3834,
4302,4483,5568,5994,4972,4101,5368,6309,5164,5884,3922,6127,6522,6523,5261,5460,
5187,4164,5219,3538,5516,4111,3524,5995,6310,6311,5369,3181,3386,2484,5188,3464,
5569,3627,5708,6524,5406,5165,4677,4492,6312,4872,4851,5885,4468,5996,6313,5709,
5710,6128,2470,5886,6314,5293,4882,5785,3325,5461,5101,6129,5711,5786,6525,4906,
6526,6527,4418,5887,5712,4808,2907,3701,5713,5888,6528,3765,5636,5331,6529,6530,
3593,5889,3637,4943,3692,5714,5787,4925,6315,6130,5462,4405,6131,6132,6316,5262,
6531,6532,5715,3859,5716,5070,4696,5102,3929,5788,3987,4792,5997,6533,6534,3920,
4809,5000,5998,6535,2974,5370,6317,5189,5263,5717,3826,6536,3953,5001,4883,3190,
5463,5890,4973,5999,4741,6133,6134,3607,5570,6000,4711,3362,3630,4552,5041,6318,
6001,2950,2953,5637,4646,5371,4944,6002,2044,4120,3429,6319,6537,5103,4833,6538,
6539,4884,4647,3884,6003,6004,4758,3835,5220,5789,4565,5407,6540,6135,5294,4697,
4852,6320,6321,3206,4907,6541,6322,4945,6542,6136,6543,6323,6005,4631,3519,6544,
5891,6545,5464,3784,5221,6546,5571,4659,6547,6324,6137,5190,6548,3853,6549,4016,
4834,3954,6138,5332,3827,4017,3210,3546,4469,5408,5718,3505,4648,5790,5131,5638,
5791,5465,4727,4318,6325,6326,5792,4553,4010,4698,3439,4974,3638,4335,3085,6006,
5104,5042,5166,5892,5572,6327,4356,4519,5222,5573,5333,5793,5043,6550,5639,5071,
4503,6328,6139,6551,6140,3914,3901,5372,6007,5640,4728,4793,3976,3836,4885,6552,
4127,6553,4451,4102,5002,6554,3686,5105,6555,5191,5072,5295,4611,5794,5296,6556,
5893,5264,5894,4975,5466,5265,4699,4976,4370,4056,3492,5044,4886,6557,5795,4432,
4769,4357,5467,3940,4660,4290,6141,4484,4770,4661,3992,6329,4025,4662,5022,4632,
4835,4070,5297,4663,4596,5574,5132,5409,5895,6142,4504,5192,4664,5796,5896,3885,
5575,5797,5023,4810,5798,3732,5223,4712,5298,4084,5334,5468,6143,4052,4053,4336,
4977,4794,6558,5335,4908,5576,5224,4233,5024,4128,5469,5225,4873,6008,5045,4729,
4742,4633,3675,4597,6559,5897,5133,5577,5003,5641,5719,6330,6560,3017,2382,3854,
4406,4811,6331,4393,3964,4946,6561,2420,3722,6562,4926,4378,3247,1736,4442,6332,
5134,6333,5226,3996,2918,5470,4319,4003,4598,4743,4744,4485,3785,3902,5167,5004,
5373,4394,5898,6144,4874,1793,3997,6334,4085,4214,5106,5642,4909,5799,6009,4419,
4189,3330,5899,4165,4420,5299,5720,5227,3347,6145,4081,6335,2876,3930,6146,3293,
3786,3910,3998,5900,5300,5578,2840,6563,5901,5579,6147,3531,5374,6564,6565,5580,
4759,5375,6566,6148,3559,5643,6336,6010,5517,6337,6338,5721,5902,3873,6011,6339,
6567,5518,3868,3649,5722,6568,4771,4947,6569,6149,4812,6570,2853,5471,6340,6341,
5644,4795,6342,6012,5723,6343,5724,6013,4349,6344,3160,6150,5193,4599,4514,4493,
5168,4320,6345,4927,3666,4745,5169,5903,5005,4928,6346,5725,6014,4730,4203,5046,
4948,3395,5170,6015,4150,6016,5726,5519,6347,5047,3550,6151,6348,4197,4310,5904,
6571,5581,2965,6152,4978,3960,4291,5135,6572,5301,5727,4129,4026,5905,4853,5728,
5472,6153,6349,4533,2700,4505,5336,4678,3583,5073,2994,4486,3043,4554,5520,6350,
6017,5800,4487,6351,3931,4103,5376,6352,4011,4321,4311,4190,5136,6018,3988,3233,
4350,5906,5645,4198,6573,5107,3432,4191,3435,5582,6574,4139,5410,6353,5411,3944,
5583,5074,3198,6575,6354,4358,6576,5302,4600,5584,5194,5412,6577,6578,5585,5413,
5303,4248,5414,3879,4433,6579,4479,5025,4854,5415,6355,4760,4772,3683,2978,4700,
3797,4452,3965,3932,3721,4910,5801,6580,5195,3551,5907,3221,3471,3029,6019,3999,
5908,5909,5266,5267,3444,3023,3828,3170,4796,5646,4979,4259,6356,5647,5337,3694,
6357,5648,5338,4520,4322,5802,3031,3759,4071,6020,5586,4836,4386,5048,6581,3571,
4679,4174,4949,6154,4813,3787,3402,3822,3958,3215,3552,5268,4387,3933,4950,4359,
6021,5910,5075,3579,6358,4234,4566,5521,6359,3613,5049,6022,5911,3375,3702,3178,
4911,5339,4521,6582,6583,4395,3087,3811,5377,6023,6360,6155,4027,5171,5649,4421,
4249,2804,6584,2270,6585,4000,4235,3045,6156,5137,5729,4140,4312,3886,6361,4330,
6157,4215,6158,3500,3676,4929,4331,3713,4930,5912,4265,3776,3368,5587,4470,4855,
3038,4980,3631,6159,6160,4132,4680,6161,6362,3923,4379,5588,4255,6586,4121,6587,
6363,4649,6364,3288,4773,4774,6162,6024,6365,3543,6588,4274,3107,3737,5050,5803,
4797,4522,5589,5051,5730,3714,4887,5378,4001,4523,6163,5026,5522,4701,4175,2791,
3760,6589,5473,4224,4133,3847,4814,4815,4775,3259,5416,6590,2738,6164,6025,5304,
3733,5076,5650,4816,5590,6591,6165,6592,3934,5269,6593,3396,5340,6594,5804,3445,
3602,4042,4488,5731,5732,3525,5591,4601,5196,6166,6026,5172,3642,4612,3202,4506,
4798,6366,3818,5108,4303,5138,5139,4776,3332,4304,2915,3415,4434,5077,5109,4856,
2879,5305,4817,6595,5913,3104,3144,3903,4634,5341,3133,5110,5651,5805,6167,4057,
5592,2945,4371,5593,6596,3474,4182,6367,6597,6168,4507,4279,6598,2822,6599,4777,
4713,5594,3829,6169,3887,5417,6170,3653,5474,6368,4216,2971,5228,3790,4579,6369,
5733,6600,6601,4951,4746,4555,6602,5418,5475,6027,3400,4665,5806,6171,4799,6028,
5052,6172,3343,4800,4747,5006,6370,4556,4217,5476,4396,5229,5379,5477,3839,5914,
5652,5807,4714,3068,4635,5808,6173,5342,4192,5078,5419,5523,5734,6174,4557,6175,
4602,6371,6176,6603,5809,6372,5735,4260,3869,5111,5230,6029,5112,6177,3126,4681,
5524,5915,2706,3563,4748,3130,6178,4018,5525,6604,6605,5478,4012,4837,6606,4534,
4193,5810,4857,3615,5479,6030,4082,3697,3539,4086,5270,3662,4508,4931,5916,4912,
5811,5027,3888,6607,4397,3527,3302,3798,2775,2921,2637,3966,4122,4388,4028,4054,
1633,4858,5079,3024,5007,3982,3412,5736,6608,3426,3236,5595,3030,6179,3427,3336,
3279,3110,6373,3874,3039,5080,5917,5140,4489,3119,6374,5812,3405,4494,6031,4666,
4141,6180,4166,6032,5813,4981,6609,5081,4422,4982,4112,3915,5653,3296,3983,6375,
4266,4410,5654,6610,6181,3436,5082,6611,5380,6033,3819,5596,4535,5231,5306,5113,
6612,4952,5918,4275,3113,6613,6376,6182,6183,5814,3073,4731,4838,5008,3831,6614,
4888,3090,3848,4280,5526,5232,3014,5655,5009,5737,5420,5527,6615,5815,5343,5173,
5381,4818,6616,3151,4953,6617,5738,2796,3204,4360,2989,4281,5739,5174,5421,5197,
3132,5141,3849,5142,5528,5083,3799,3904,4839,5480,2880,4495,3448,6377,6184,5271,
5919,3771,3193,6034,6035,5920,5010,6036,5597,6037,6378,6038,3106,5422,6618,5423,
5424,4142,6619,4889,5084,4890,4313,5740,6620,3437,5175,5307,5816,4199,5198,5529,
5817,5199,5656,4913,5028,5344,3850,6185,2955,5272,5011,5818,4567,4580,5029,5921,
3616,5233,6621,6622,6186,4176,6039,6379,6380,3352,5200,5273,2908,5598,5234,3837,
5308,6623,6624,5819,4496,4323,5309,5201,6625,6626,4983,3194,3838,4167,5530,5922,
5274,6381,6382,3860,3861,5599,3333,4292,4509,6383,3553,5481,5820,5531,4778,6187,
3955,3956,4324,4389,4218,3945,4325,3397,2681,5923,4779,5085,4019,5482,4891,5382,
5383,6040,4682,3425,5275,4094,6627,5310,3015,5483,5657,4398,5924,3168,4819,6628,
5925,6629,5532,4932,4613,6041,6630,4636,6384,4780,4204,5658,4423,5821,3989,4683,
5822,6385,4954,6631,5345,6188,5425,5012,5384,3894,6386,4490,4104,6632,5741,5053,
6633,5823,5926,5659,5660,5927,6634,5235,5742,5824,4840,4933,4820,6387,4859,5928,
4955,6388,4143,3584,5825,5346,5013,6635,5661,6389,5014,5484,5743,4337,5176,5662,
6390,2836,6391,3268,6392,6636,6042,5236,6637,4158,6638,5744,5663,4471,5347,3663,
4123,5143,4293,3895,6639,6640,5311,5929,5826,3800,6189,6393,6190,5664,5348,3554,
3594,4749,4603,6641,5385,4801,6043,5827,4183,6642,5312,5426,4761,6394,5665,6191,
4715,2669,6643,6644,5533,3185,5427,5086,5930,5931,5386,6192,6044,6645,4781,4013,
5745,4282,4435,5534,4390,4267,6045,5746,4984,6046,2743,6193,3501,4087,5485,5932,
5428,4184,4095,5747,4061,5054,3058,3862,5933,5600,6646,5144,3618,6395,3131,5055,
5313,6396,4650,4956,3855,6194,3896,5202,4985,4029,4225,6195,6647,5828,5486,5829,
3589,3002,6648,6397,4782,5276,6649,6196,6650,4105,3803,4043,5237,5830,6398,4096,
3643,6399,3528,6651,4453,3315,4637,6652,3984,6197,5535,3182,3339,6653,3096,2660,
6400,6654,3449,5934,4250,4236,6047,6401,5831,6655,5487,3753,4062,5832,6198,6199,
6656,3766,6657,3403,4667,6048,6658,4338,2897,5833,3880,2797,3780,4326,6659,5748,
5015,6660,5387,4351,5601,4411,6661,3654,4424,5935,4339,4072,5277,4568,5536,6402,
6662,5238,6663,5349,5203,6200,5204,6201,5145,4536,5016,5056,4762,5834,4399,4957,
6202,6403,5666,5749,6664,4340,6665,5936,5177,5667,6666,6667,3459,4668,6404,6668,
6669,4543,6203,6670,4276,6405,4480,5537,6671,4614,5205,5668,6672,3348,2193,4763,
6406,6204,5937,5602,4177,5669,3419,6673,4020,6205,4443,4569,5388,3715,3639,6407,
6049,4058,6206,6674,5938,4544,6050,4185,4294,4841,4651,4615,5488,6207,6408,6051,
5178,3241,3509,5835,6208,4958,5836,4341,5489,5278,6209,2823,5538,5350,5206,5429,
6675,4638,4875,4073,3516,4684,4914,4860,5939,5603,5389,6052,5057,3237,5490,3791,
6676,6409,6677,4821,4915,4106,5351,5058,4243,5539,4244,5604,4842,4916,5239,3028,
3716,5837,5114,5605,5390,5940,5430,6210,4332,6678,5540,4732,3667,3840,6053,4305,
3408,5670,5541,6410,2744,5240,5750,6679,3234,5606,6680,5607,5671,3608,4283,4159,
4400,5352,4783,6681,6411,6682,4491,4802,6211,6412,5941,6413,6414,5542,5751,6683,
4669,3734,5942,6684,6415,5943,5059,3328,4670,4144,4268,6685,6686,6687,6688,4372,
3603,6689,5944,5491,4373,3440,6416,5543,4784,4822,5608,3792,4616,5838,5672,3514,
5391,6417,4892,6690,4639,6691,6054,5673,5839,6055,6692,6056,5392,6212,4038,5544,
5674,4497,6057,6693,5840,4284,5675,4021,4545,5609,6418,4454,6419,6213,4113,4472,
5314,3738,5087,5279,4074,5610,4959,4063,3179,4750,6058,6420,6214,3476,4498,4716,
5431,4960,4685,6215,5241,6694,6421,6216,6695,5841,5945,6422,3748,5946,5179,3905,
5752,5545,5947,4374,6217,4455,6423,4412,6218,4803,5353,6696,3832,5280,6219,4327,
4702,6220,6221,6059,4652,5432,6424,3749,4751,6425,5753,4986,5393,4917,5948,5030,
5754,4861,4733,6426,4703,6697,6222,4671,5949,4546,4961,5180,6223,5031,3316,5281,
6698,4862,4295,4934,5207,3644,6427,5842,5950,6428,6429,4570,5843,5282,6430,6224,
5088,3239,6060,6699,5844,5755,6061,6431,2701,5546,6432,5115,5676,4039,3993,3327,
4752,4425,5315,6433,3941,6434,5677,4617,4604,3074,4581,6225,5433,6435,6226,6062,
4823,5756,5116,6227,3717,5678,4717,5845,6436,5679,5846,6063,5847,6064,3977,3354,
6437,3863,5117,6228,5547,5394,4499,4524,6229,4605,6230,4306,4500,6700,5951,6065,
3693,5952,5089,4366,4918,6701,6231,5548,6232,6702,6438,4704,5434,6703,6704,5953,
4168,6705,5680,3420,6706,5242,4407,6066,3812,5757,5090,5954,4672,4525,3481,5681,
4618,5395,5354,5316,5955,6439,4962,6707,4526,6440,3465,4673,6067,6441,5682,6708,
5435,5492,5758,5683,4619,4571,4674,4804,4893,4686,5493,4753,6233,6068,4269,6442,
6234,5032,4705,5146,5243,5208,5848,6235,6443,4963,5033,4640,4226,6236,5849,3387,
6444,6445,4436,4437,5850,4843,5494,4785,4894,6709,4361,6710,5091,5956,3331,6237,
4987,5549,6069,6711,4342,3517,4473,5317,6070,6712,6071,4706,6446,5017,5355,6713,
6714,4988,5436,6447,4734,5759,6715,4735,4547,4456,4754,6448,5851,6449,6450,3547,
5852,5318,6451,6452,5092,4205,6716,6238,4620,4219,5611,6239,6072,4481,5760,5957,
5958,4059,6240,6453,4227,4537,6241,5761,4030,4186,5244,5209,3761,4457,4876,3337,
5495,5181,6242,5959,5319,5612,5684,5853,3493,5854,6073,4169,5613,5147,4895,6074,
5210,6717,5182,6718,3830,6243,2798,3841,6075,6244,5855,5614,3604,4606,5496,5685,
5118,5356,6719,6454,5960,5357,5961,6720,4145,3935,4621,5119,5962,4261,6721,6455,
4786,5963,4375,4582,6245,6246,6247,6076,5437,4877,5856,3376,4380,6248,4160,6722,
5148,6456,5211,6457,6723,4718,6458,6724,6249,5358,4044,3297,6459,6250,5857,5615,
5497,5245,6460,5498,6725,6251,6252,5550,3793,5499,2959,5396,6461,6462,4572,5093,
5500,5964,3806,4146,6463,4426,5762,5858,6077,6253,4755,3967,4220,5965,6254,4989,
5501,6464,4352,6726,6078,4764,2290,5246,3906,5438,5283,3767,4964,2861,5763,5094,
6255,6256,4622,5616,5859,5860,4707,6727,4285,4708,4824,5617,6257,5551,4787,5212,
4965,4935,4687,6465,6728,6466,5686,6079,3494,4413,2995,5247,5966,5618,6729,5967,
5764,5765,5687,5502,6730,6731,6080,5397,6467,4990,6258,6732,4538,5060,5619,6733,
4719,5688,5439,5018,5149,5284,5503,6734,6081,4607,6259,5120,3645,5861,4583,6260,
4584,4675,5620,4098,5440,6261,4863,2379,3306,4585,5552,5689,4586,5285,6735,4864,
6736,5286,6082,6737,4623,3010,4788,4381,4558,5621,4587,4896,3698,3161,5248,4353,
4045,6262,3754,5183,4588,6738,6263,6739,6740,5622,3936,6741,6468,6742,6264,5095,
6469,4991,5968,6743,4992,6744,6083,4897,6745,4256,5766,4307,3108,3968,4444,5287,
3889,4343,6084,4510,6085,4559,6086,4898,5969,6746,5623,5061,4919,5249,5250,5504,
5441,6265,5320,4878,3242,5862,5251,3428,6087,6747,4237,5624,5442,6266,5553,4539,
6748,2585,3533,5398,4262,6088,5150,4736,4438,6089,6267,5505,4966,6749,6268,6750,
6269,5288,5554,3650,6090,6091,4624,6092,5690,6751,5863,4270,5691,4277,5555,5864,
6752,5692,4720,4865,6470,5151,4688,4825,6753,3094,6754,6471,3235,4653,6755,5213,
5399,6756,3201,4589,5865,4967,6472,5866,6473,5019,3016,6757,5321,4756,3957,4573,
6093,4993,5767,4721,6474,6758,5625,6759,4458,6475,6270,6760,5556,4994,5214,5252,
6271,3875,5768,6094,5034,5506,4376,5769,6761,2120,6476,5253,5770,6762,5771,5970,
3990,5971,5557,5558,5772,6477,6095,2787,4641,5972,5121,6096,6097,6272,6763,3703,
5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978,
4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767)
852,1221,1400,1486, 882,2299,4036, 351, 28,1122, 700,6479,6480,6481,6482,6483, #last 512
)
# flake8: noqa
......@@ -13,12 +13,12 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
......@@ -28,14 +28,19 @@
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import GB2312DistributionAnalysis
from .mbcssm import GB2312SMModel
from .mbcssm import GB2312_SM_MODEL
class GB2312Prober(MultiByteCharSetProber):
def __init__(self):
MultiByteCharSetProber.__init__(self)
self._mCodingSM = CodingStateMachine(GB2312SMModel)
self._mDistributionAnalyzer = GB2312DistributionAnalysis()
super(GB2312Prober, self).__init__()
self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
self.distribution_analyzer = GB2312DistributionAnalysis()
self.reset()
def get_charset_name(self):
@property
def charset_name(self):
return "GB2312"
@property
def language(self):
return "Chinese"
......@@ -26,8 +26,7 @@
######################### END LICENSE BLOCK #########################
from .charsetprober import CharSetProber
from .constants import eNotMe, eDetecting
from .compat import wrap_ord
from .enums import ProbingState
# This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers
......@@ -126,56 +125,59 @@ from .compat import wrap_ord
# model probers scores. The answer is returned in the form of the name of the
# charset identified, either "windows-1255" or "ISO-8859-8".
# windows-1255 / ISO-8859-8 code points of interest
FINAL_KAF = 0xea
NORMAL_KAF = 0xeb
FINAL_MEM = 0xed
NORMAL_MEM = 0xee
FINAL_NUN = 0xef
NORMAL_NUN = 0xf0
FINAL_PE = 0xf3
NORMAL_PE = 0xf4
FINAL_TSADI = 0xf5
NORMAL_TSADI = 0xf6
# Minimum Visual vs Logical final letter score difference.
# If the difference is below this, don't rely solely on the final letter score
# distance.
MIN_FINAL_CHAR_DISTANCE = 5
class HebrewProber(CharSetProber):
# windows-1255 / ISO-8859-8 code points of interest
FINAL_KAF = 0xea
NORMAL_KAF = 0xeb
FINAL_MEM = 0xed
NORMAL_MEM = 0xee
FINAL_NUN = 0xef
NORMAL_NUN = 0xf0
FINAL_PE = 0xf3
NORMAL_PE = 0xf4
FINAL_TSADI = 0xf5
NORMAL_TSADI = 0xf6
# Minimum Visual vs Logical model score difference.
# If the difference is below this, don't rely at all on the model score
# distance.
MIN_MODEL_DISTANCE = 0.01
# Minimum Visual vs Logical final letter score difference.
# If the difference is below this, don't rely solely on the final letter score
# distance.
MIN_FINAL_CHAR_DISTANCE = 5
VISUAL_HEBREW_NAME = "ISO-8859-8"
LOGICAL_HEBREW_NAME = "windows-1255"
# Minimum Visual vs Logical model score difference.
# If the difference is below this, don't rely at all on the model score
# distance.
MIN_MODEL_DISTANCE = 0.01
VISUAL_HEBREW_NAME = "ISO-8859-8"
LOGICAL_HEBREW_NAME = "windows-1255"
class HebrewProber(CharSetProber):
def __init__(self):
CharSetProber.__init__(self)
self._mLogicalProber = None
self._mVisualProber = None
super(HebrewProber, self).__init__()
self._final_char_logical_score = None
self._final_char_visual_score = None
self._prev = None
self._before_prev = None
self._logical_prober = None
self._visual_prober = None
self.reset()
def reset(self):
self._mFinalCharLogicalScore = 0
self._mFinalCharVisualScore = 0
self._final_char_logical_score = 0
self._final_char_visual_score = 0
# The two last characters seen in the previous buffer,
# mPrev and mBeforePrev are initialized to space in order to simulate
# a word delimiter at the beginning of the data
self._mPrev = ' '
self._mBeforePrev = ' '
self._prev = ' '
self._before_prev = ' '
# These probers are owned by the group prober.
def set_model_probers(self, logicalProber, visualProber):
self._mLogicalProber = logicalProber
self._mVisualProber = visualProber
self._logical_prober = logicalProber
self._visual_prober = visualProber
def is_final(self, c):
return wrap_ord(c) in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE,
FINAL_TSADI]
return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
self.FINAL_PE, self.FINAL_TSADI]
def is_non_final(self, c):
# The normal Tsadi is not a good Non-Final letter due to words like
......@@ -188,9 +190,10 @@ class HebrewProber(CharSetProber):
# for example legally end with a Non-Final Pe or Kaf. However, the
# benefit of these letters as Non-Final letters outweighs the damage
# since these words are quite rare.
return wrap_ord(c) in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
return c in [self.NORMAL_KAF, self.NORMAL_MEM,
self.NORMAL_NUN, self.NORMAL_PE]
def feed(self, aBuf):
def feed(self, byte_str):
# Final letter analysis for logical-visual decision.
# Look for evidence that the received buffer is either logical Hebrew
# or visual Hebrew.
......@@ -217,67 +220,73 @@ class HebrewProber(CharSetProber):
# We automatically filter out all 7-bit characters (replace them with
# spaces) so the word boundary detection works properly. [MAP]
if self.get_state() == eNotMe:
if self.state == ProbingState.NOT_ME:
# Both model probers say it's not them. No reason to continue.
return eNotMe
return ProbingState.NOT_ME
aBuf = self.filter_high_bit_only(aBuf)
byte_str = self.filter_high_byte_only(byte_str)
for cur in aBuf:
for cur in byte_str:
if cur == ' ':
# We stand on a space - a word just ended
if self._mBeforePrev != ' ':
# next-to-last char was not a space so self._mPrev is not a
if self._before_prev != ' ':
# next-to-last char was not a space so self._prev is not a
# 1 letter word
if self.is_final(self._mPrev):
if self.is_final(self._prev):
# case (1) [-2:not space][-1:final letter][cur:space]
self._mFinalCharLogicalScore += 1
elif self.is_non_final(self._mPrev):
self._final_char_logical_score += 1
elif self.is_non_final(self._prev):
# case (2) [-2:not space][-1:Non-Final letter][
# cur:space]
self._mFinalCharVisualScore += 1
self._final_char_visual_score += 1
else:
# Not standing on a space
if ((self._mBeforePrev == ' ') and
(self.is_final(self._mPrev)) and (cur != ' ')):
if ((self._before_prev == ' ') and
(self.is_final(self._prev)) and (cur != ' ')):
# case (3) [-2:space][-1:final letter][cur:not space]
self._mFinalCharVisualScore += 1
self._mBeforePrev = self._mPrev
self._mPrev = cur
self._final_char_visual_score += 1
self._before_prev = self._prev
self._prev = cur
# Forever detecting, till the end or until both model probers return
# eNotMe (handled above)
return eDetecting
# ProbingState.NOT_ME (handled above)
return ProbingState.DETECTING
def get_charset_name(self):
@property
def charset_name(self):
# Make the decision: is it Logical or Visual?
# If the final letter score distance is dominant enough, rely on it.
finalsub = self._mFinalCharLogicalScore - self._mFinalCharVisualScore
if finalsub >= MIN_FINAL_CHAR_DISTANCE:
return LOGICAL_HEBREW_NAME
if finalsub <= -MIN_FINAL_CHAR_DISTANCE:
return VISUAL_HEBREW_NAME
finalsub = self._final_char_logical_score - self._final_char_visual_score
if finalsub >= self.MIN_FINAL_CHAR_DISTANCE:
return self.LOGICAL_HEBREW_NAME
if finalsub <= -self.MIN_FINAL_CHAR_DISTANCE:
return self.VISUAL_HEBREW_NAME
# It's not dominant enough, try to rely on the model scores instead.
modelsub = (self._mLogicalProber.get_confidence()
- self._mVisualProber.get_confidence())
if modelsub > MIN_MODEL_DISTANCE:
return LOGICAL_HEBREW_NAME
if modelsub < -MIN_MODEL_DISTANCE:
return VISUAL_HEBREW_NAME
modelsub = (self._logical_prober.get_confidence()
- self._visual_prober.get_confidence())
if modelsub > self.MIN_MODEL_DISTANCE:
return self.LOGICAL_HEBREW_NAME
if modelsub < -self.MIN_MODEL_DISTANCE:
return self.VISUAL_HEBREW_NAME
# Still no good, back to final letter distance, maybe it'll save the
# day.
if finalsub < 0.0:
return VISUAL_HEBREW_NAME
return self.VISUAL_HEBREW_NAME
# (finalsub > 0 - Logical) or (don't know what to do) default to
# Logical.
return LOGICAL_HEBREW_NAME
return self.LOGICAL_HEBREW_NAME
@property
def language(self):
return 'Hebrew'
def get_state(self):
@property
def state(self):
# Remain active as long as any of the model probers are active.
if (self._mLogicalProber.get_state() == eNotMe) and \
(self._mVisualProber.get_state() == eNotMe):
return eNotMe
return eDetecting
if (self._logical_prober.state == ProbingState.NOT_ME) and \
(self._visual_prober.state == ProbingState.NOT_ME):
return ProbingState.NOT_ME
return ProbingState.DETECTING
......@@ -25,13 +25,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .compat import wrap_ord
NUM_OF_CATEGORY = 6
DONT_KNOW = -1
ENOUGH_REL_THRESHOLD = 100
MAX_REL_THRESHOLD = 1000
MINIMUM_DATA_THRESHOLD = 4
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
jp2CharContext = (
......@@ -120,24 +113,35 @@ jp2CharContext = (
(0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1),
)
class JapaneseContextAnalysis:
class JapaneseContextAnalysis(object):
NUM_OF_CATEGORY = 6
DONT_KNOW = -1
ENOUGH_REL_THRESHOLD = 100
MAX_REL_THRESHOLD = 1000
MINIMUM_DATA_THRESHOLD = 4
def __init__(self):
self._total_rel = None
self._rel_sample = None
self._need_to_skip_char_num = None
self._last_char_order = None
self._done = None
self.reset()
def reset(self):
self._mTotalRel = 0 # total sequence received
# category counters, each interger counts sequence in its category
self._mRelSample = [0] * NUM_OF_CATEGORY
self._total_rel = 0 # total sequence received
# category counters, each integer counts sequence in its category
self._rel_sample = [0] * self.NUM_OF_CATEGORY
# if last byte in current buffer is not the last byte of a character,
# we need to know how many bytes to skip in next buffer
self._mNeedToSkipCharNum = 0
self._mLastCharOrder = -1 # The order of previous char
self._need_to_skip_char_num = 0
self._last_char_order = -1 # The order of previous char
# If this flag is set to True, detection is done and conclusion has
# been made
self._mDone = False
self._done = False
def feed(self, aBuf, aLen):
if self._mDone:
def feed(self, byte_str, num_bytes):
if self._done:
return
# The buffer we got is byte oriented, and a character may span in more than one
......@@ -147,81 +151,83 @@ class JapaneseContextAnalysis:
# well and analyse the character once it is complete, but since a
# character will not make much difference, by simply skipping
# this character will simply our logic and improve performance.
i = self._mNeedToSkipCharNum
while i < aLen:
order, charLen = self.get_order(aBuf[i:i + 2])
i += charLen
if i > aLen:
self._mNeedToSkipCharNum = i - aLen
self._mLastCharOrder = -1
i = self._need_to_skip_char_num
while i < num_bytes:
order, char_len = self.get_order(byte_str[i:i + 2])
i += char_len
if i > num_bytes:
self._need_to_skip_char_num = i - num_bytes
self._last_char_order = -1
else:
if (order != -1) and (self._mLastCharOrder != -1):
self._mTotalRel += 1
if self._mTotalRel > MAX_REL_THRESHOLD:
self._mDone = True
if (order != -1) and (self._last_char_order != -1):
self._total_rel += 1
if self._total_rel > self.MAX_REL_THRESHOLD:
self._done = True
break
self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1
self._mLastCharOrder = order
self._rel_sample[jp2CharContext[self._last_char_order][order]] += 1
self._last_char_order = order
def got_enough_data(self):
return self._mTotalRel > ENOUGH_REL_THRESHOLD
return self._total_rel > self.ENOUGH_REL_THRESHOLD
def get_confidence(self):
# This is just one way to calculate confidence. It works well for me.
if self._mTotalRel > MINIMUM_DATA_THRESHOLD:
return (self._mTotalRel - self._mRelSample[0]) / self._mTotalRel
if self._total_rel > self.MINIMUM_DATA_THRESHOLD:
return (self._total_rel - self._rel_sample[0]) / self._total_rel
else:
return DONT_KNOW
return self.DONT_KNOW
def get_order(self, aBuf):
def get_order(self, byte_str):
return -1, 1
class SJISContextAnalysis(JapaneseContextAnalysis):
def __init__(self):
self.charset_name = "SHIFT_JIS"
super(SJISContextAnalysis, self).__init__()
self._charset_name = "SHIFT_JIS"
def get_charset_name(self):
return self.charset_name
@property
def charset_name(self):
return self._charset_name
def get_order(self, aBuf):
if not aBuf:
def get_order(self, byte_str):
if not byte_str:
return -1, 1
# find out current char's byte length
first_char = wrap_ord(aBuf[0])
if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)):
charLen = 2
first_char = byte_str[0]
if (0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC):
char_len = 2
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
self.charset_name = "CP932"
self._charset_name = "CP932"
else:
charLen = 1
char_len = 1
# return its order if it is hiragana
if len(aBuf) > 1:
second_char = wrap_ord(aBuf[1])
if len(byte_str) > 1:
second_char = byte_str[1]
if (first_char == 202) and (0x9F <= second_char <= 0xF1):
return second_char - 0x9F, charLen
return second_char - 0x9F, char_len
return -1, charLen
return -1, char_len
class EUCJPContextAnalysis(JapaneseContextAnalysis):
def get_order(self, aBuf):
if not aBuf:
def get_order(self, byte_str):
if not byte_str:
return -1, 1
# find out current char's byte length
first_char = wrap_ord(aBuf[0])
first_char = byte_str[0]
if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
charLen = 2
char_len = 2
elif first_char == 0x8F:
charLen = 3
char_len = 3
else:
charLen = 1
char_len = 1
# return its order if it is hiragana
if len(aBuf) > 1:
second_char = wrap_ord(aBuf[1])
if len(byte_str) > 1:
second_char = byte_str[1]
if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
return second_char - 0xA1, charLen
return second_char - 0xA1, char_len
return -1, char_len
return -1, charLen
# flake8: noqa
......@@ -210,20 +210,19 @@ BulgarianLangModel = (
)
Latin5BulgarianModel = {
'charToOrderMap': Latin5_BulgarianCharToOrderMap,
'precedenceMatrix': BulgarianLangModel,
'mTypicalPositiveRatio': 0.969392,
'keepEnglishLetter': False,
'charsetName': "ISO-8859-5"
'char_to_order_map': Latin5_BulgarianCharToOrderMap,
'precedence_matrix': BulgarianLangModel,
'typical_positive_ratio': 0.969392,
'keep_english_letter': False,
'charset_name': "ISO-8859-5",
'language': 'Bulgairan',
}
Win1251BulgarianModel = {
'charToOrderMap': win1251BulgarianCharToOrderMap,
'precedenceMatrix': BulgarianLangModel,
'mTypicalPositiveRatio': 0.969392,
'keepEnglishLetter': False,
'charsetName': "windows-1251"
'char_to_order_map': win1251BulgarianCharToOrderMap,
'precedence_matrix': BulgarianLangModel,
'typical_positive_ratio': 0.969392,
'keep_english_letter': False,
'charset_name': "windows-1251",
'language': 'Bulgarian',
}
# flake8: noqa
......@@ -31,7 +31,7 @@
# 252: 0 - 9
# Character Mapping Table:
Latin7_CharToOrderMap = (
Latin7_char_to_order_map = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
......@@ -50,7 +50,7 @@ Latin7_CharToOrderMap = (
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0
)
win1253_CharToOrderMap = (
win1253_char_to_order_map = (
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20
......@@ -207,19 +207,19 @@ GreekLangModel = (
)
Latin7GreekModel = {
'charToOrderMap': Latin7_CharToOrderMap,
'precedenceMatrix': GreekLangModel,
'mTypicalPositiveRatio': 0.982851,
'keepEnglishLetter': False,
'charsetName': "ISO-8859-7"
'char_to_order_map': Latin7_char_to_order_map,
'precedence_matrix': GreekLangModel,
'typical_positive_ratio': 0.982851,
'keep_english_letter': False,
'charset_name': "ISO-8859-7",
'language': 'Greek',
}
Win1253GreekModel = {
'charToOrderMap': win1253_CharToOrderMap,
'precedenceMatrix': GreekLangModel,
'mTypicalPositiveRatio': 0.982851,
'keepEnglishLetter': False,
'charsetName': "windows-1253"
'char_to_order_map': win1253_char_to_order_map,
'precedence_matrix': GreekLangModel,
'typical_positive_ratio': 0.982851,
'keep_english_letter': False,
'charset_name': "windows-1253",
'language': 'Greek',
}
# flake8: noqa
......@@ -190,11 +190,10 @@ ThaiLangModel = (
)
TIS620ThaiModel = {
'charToOrderMap': TIS620CharToOrderMap,
'precedenceMatrix': ThaiLangModel,
'mTypicalPositiveRatio': 0.926386,
'keepEnglishLetter': False,
'charsetName': "TIS-620"
'char_to_order_map': TIS620CharToOrderMap,
'precedence_matrix': ThaiLangModel,
'typical_positive_ratio': 0.926386,
'keep_english_letter': False,
'charset_name': "TIS-620",
'language': 'Thai',
}
# flake8: noqa
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册