extract_chinese_and_punct.py 4.2 KB
Newer Older
K
kgresearch 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2019 Baidu.com, Inc. All Rights Reserved
#
"""
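Build a regular expression that recognises Han (Chinese) characters and
Chinese/ASCII punctuation, exposed through ChineseAndPunctuationExtractor.

requirements: Python standard library only.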
Authors: daisongtai(daisongtai@baidu.com)
Date:    2019/5/29 6:38 PM
"""
from __future__ import print_function
import sys
import re
import io

# Code points belonging to the Han script.
#
# Each entry is either a two-element list ``[first, last]`` describing an
# inclusive range, or a bare integer for a single code point. Ranges must stay
# ``list`` objects: the regex builder tells the two shapes apart with
# ``isinstance(entry, list)``.
#
# Trailing comments give: general category, number of code points, and the
# names of the first and last character of the range.
LHan = [
    [0x2E80, 0x2E99],    # So    [26] CJK RADICAL REPEAT .. CJK RADICAL RAP
    [0x2E9B, 0x2EF3],    # So    [89] CJK RADICAL CHOKE .. CJK RADICAL C-SIMPLIFIED TURTLE
    [0x2F00, 0x2FD5],    # So   [214] KANGXI RADICAL ONE .. KANGXI RADICAL FLUTE
    0x3005,              # Lm         IDEOGRAPHIC ITERATION MARK
    0x3007,              # Nl         IDEOGRAPHIC NUMBER ZERO
    [0x3021, 0x3029],    # Nl     [9] HANGZHOU NUMERAL ONE .. HANGZHOU NUMERAL NINE
    [0x3038, 0x303A],    # Nl     [3] HANGZHOU NUMERAL TEN .. HANGZHOU NUMERAL THIRTY
    0x303B,              # Lm         VERTICAL IDEOGRAPHIC ITERATION MARK
    [0x3400, 0x4DB5],    # Lo  [6582] CJK UNIFIED IDEOGRAPH-3400 .. CJK UNIFIED IDEOGRAPH-4DB5
    [0x4E00, 0x9FC3],    # Lo [20932] CJK UNIFIED IDEOGRAPH-4E00 .. CJK UNIFIED IDEOGRAPH-9FC3
    [0xF900, 0xFA2D],    # Lo   [302] CJK COMPATIBILITY IDEOGRAPH-F900 .. CJK COMPATIBILITY IDEOGRAPH-FA2D
    [0xFA30, 0xFA6A],    # Lo    [59] CJK COMPATIBILITY IDEOGRAPH-FA30 .. CJK COMPATIBILITY IDEOGRAPH-FA6A
    [0xFA70, 0xFAD9],    # Lo   [106] CJK COMPATIBILITY IDEOGRAPH-FA70 .. CJK COMPATIBILITY IDEOGRAPH-FAD9
    [0x20000, 0x2A6D6],  # Lo [42711] CJK UNIFIED IDEOGRAPH-20000 .. CJK UNIFIED IDEOGRAPH-2A6D6
    [0x2F800, 0x2FA1D],  # Lo   [542] CJK COMPATIBILITY IDEOGRAPH-2F800 .. CJK COMPATIBILITY IDEOGRAPH-2FA1D
]

# Chinese / full-width punctuation accepted by the extractor, as
# ``(code point, glyph)`` pairs. The regex builder reads only the code point;
# the glyph string is kept alongside it as a human-readable reference.
#
# (0xFF5E, FULLWIDTH TILDE) is not part of the set.
CN_PUNCTS = [
    (0x3002, "。"),  # IDEOGRAPHIC FULL STOP
    (0xFF1F, "?"),  # FULLWIDTH QUESTION MARK
    (0xFF01, "!"),  # FULLWIDTH EXCLAMATION MARK
    (0xFF0C, ","),  # FULLWIDTH COMMA
    (0x3001, "、"),  # IDEOGRAPHIC COMMA
    (0xFF1B, ";"),  # FULLWIDTH SEMICOLON
    (0xFF1A, ":"),  # FULLWIDTH COLON
    (0x300C, "「"),  # LEFT CORNER BRACKET
    (0x300D, "」"),  # RIGHT CORNER BRACKET
    (0x300E, "『"),  # LEFT WHITE CORNER BRACKET
    (0x300F, "』"),  # RIGHT WHITE CORNER BRACKET
    (0x2018, "‘"),  # LEFT SINGLE QUOTATION MARK
    (0x2019, "’"),  # RIGHT SINGLE QUOTATION MARK
    (0x201C, "“"),  # LEFT DOUBLE QUOTATION MARK
    (0x201D, "”"),  # RIGHT DOUBLE QUOTATION MARK
    (0xFF08, "("),  # FULLWIDTH LEFT PARENTHESIS
    (0xFF09, ")"),  # FULLWIDTH RIGHT PARENTHESIS
    (0x3014, "〔"),  # LEFT TORTOISE SHELL BRACKET
    (0x3015, "〕"),  # RIGHT TORTOISE SHELL BRACKET
    (0x3010, "【"),  # LEFT BLACK LENTICULAR BRACKET
    (0x3011, "】"),  # RIGHT BLACK LENTICULAR BRACKET
    (0x2014, "—"),  # EM DASH
    (0x2026, "…"),  # HORIZONTAL ELLIPSIS
    (0x2013, "–"),  # EN DASH
    (0xFF0E, "."),  # FULLWIDTH FULL STOP
    (0x300A, "《"),  # LEFT DOUBLE ANGLE BRACKET
    (0x300B, "》"),  # RIGHT DOUBLE ANGLE BRACKET
    (0x3008, "〈"),  # LEFT ANGLE BRACKET
    (0x3009, "〉"),  # RIGHT ANGLE BRACKET
    (0x2015, "―"),  # HORIZONTAL BAR
    (0xff0d, "-"),  # FULLWIDTH HYPHEN-MINUS
    (0x0020, " "),  # SPACE
]

# ASCII punctuation, as inclusive ``[first, last]`` code-point ranges.
EN_PUNCTS = [
    [0x0021, 0x002F],  # ! " # $ % & ' ( ) * + , - . /
    [0x003A, 0x0040],  # : ; < = > ? @
    [0x005B, 0x0060],  # [ \ ] ^ _ `
    [0x007B, 0x007E],  # { | } ~
]


class ChineseAndPunctuationExtractor(object):
    """Tell whether a character is a Han character or a punctuation mark.

    On construction, one regular-expression character class is compiled from
    the module-level tables ``LHan`` (Han code points and ranges),
    ``CN_PUNCTS`` (Chinese punctuation code points) and ``EN_PUNCTS`` (ASCII
    punctuation ranges). The compiled pattern is stored in ``chinese_re``.
    """

    def __init__(self):
        self.chinese_re = self.build_re()

    def is_chinese_or_punct(self, c):
        """Return True if ``c`` begins with a Han character or punctuation.

        The pattern is applied with ``match``, which anchors at the start of
        the string only: for input longer than one character just the first
        character decides the result. An empty string yields False.
        """
        return bool(self.chinese_re.match(c))

    @staticmethod
    def _to_char(code_point):
        """Return the text character for ``code_point``.

        Python 2's ``chr`` only accepts 0-255, so ``unichr`` is used when it
        exists; Python 3's ``chr`` already covers all of Unicode.

        Raises:
            ValueError: the interpreter cannot represent the code point
                (e.g. > 0xFFFF on a narrow Python 2 build).
        """
        try:
            convert = unichr  # Python 2
        except NameError:
            convert = chr  # Python 3
        return convert(code_point)

    @classmethod
    def _char_class_item(cls, code_point):
        """Return ``code_point`` as an escaped literal for use inside ``[...]``."""
        return re.escape(cls._to_char(code_point))

    @classmethod
    def _char_range(cls, first, last):
        """Return the inclusive range ``first-last`` for use inside ``[...]``.

        Both endpoints are escaped so that characters such as ``]``, ``\\`` or
        ``^`` cannot change the structure of the character class.
        """
        return '%s-%s' % (cls._char_class_item(first),
                          cls._char_class_item(last))

    def build_re(self):
        """Compile and return the character-class pattern.

        Han and Chinese-punctuation entries that the running interpreter
        cannot represent are skipped (best effort); any other problem with
        those tables is allowed to surface instead of being silently ignored.

        Raises:
            ValueError: an ``EN_PUNCTS`` range cannot be converted to
                characters.
        """
        parts = []

        for entry in LHan:
            if isinstance(entry, list):
                first, last = entry
                try:
                    parts.append(self._char_range(first, last))
                except ValueError:
                    # Narrow Python build: code points > 0xFFFF would need
                    # surrogate pairs, so this range is left out.
                    continue
            else:
                try:
                    parts.append(self._char_class_item(entry))
                except ValueError:
                    continue

        for code_point, _ in CN_PUNCTS:
            try:
                parts.append(self._char_class_item(code_point))
            except ValueError:
                continue

        for first, last in EN_PUNCTS:
            try:
                parts.append(self._char_range(first, last))
            except (TypeError, ValueError, OverflowError) as err:
                # ASCII ranges are representable everywhere, so a failure
                # here means the table itself is wrong.
                raise ValueError('invalid EN_PUNCTS range %r-%r: %s'
                                 % (first, last, err))

        pattern = '[%s]' % ''.join(parts)
        return re.compile(pattern, re.UNICODE)


if __name__ == '__main__':
    # Small manual demo: classify every character of a sample sentence, then
    # probe a few individual punctuation marks.
    extractor = ChineseAndPunctuationExtractor()

    sample = "韩邦庆(1856~1894)曾用名寄,字子云,别署太仙、大一山人、花也怜侬、三庆"
    for ch in sample:
        print(ch, 'yes' if extractor.is_chinese_or_punct(ch) else "no")

    # Each probe prints the character followed by the boolean verdict.
    for probe in ("~", "~", "―", "-"):
        print(probe, extractor.is_chinese_or_punct(probe))