# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""

import collections
import unicodedata

import sentencepiece as spm

from .args import str2bool


def clean_text(text):
    """Performs invalid character removal and whitespace cleanup on text."""
    text = text.replace(u"“", u'"')\
        .replace(u'”', u'"')\
        .replace(u'‘', u"'")\
        .replace(u'’', u"'")\
        .replace(u'—', u'-')
    output = []
    for char in text:
        if _is_control(char):
            continue
        if _is_whitespace(char):
            output.append(" ")
        else:
            output.append(char)
    return "".join(output)


def preprocess_text(inputs, remove_space=True, lower=False):
    """Preprocess data by removing extra spaces and normalizing to NFKD."""
    outputs = inputs
    if remove_space:
        outputs = " ".join(inputs.strip().split())
    outputs = unicodedata.normalize("NFKD", outputs)
    outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
    if lower:
        outputs = outputs.lower()
    return outputs


def encode_pieces(spm_model, text, return_unicode=True, sample=False):
    """Turn a sentence into word pieces."""
    # Clean quote-like characters (“ ” ‘ ’ —) first; otherwise they tend to
    # produce too many <unk> pieces (added for ERNIE/ALBERT compatibility).
    text = clean_text(text)

    if not sample:
        pieces = spm_model.EncodeAsPieces(text)
    else:
        pieces = spm_model.SampleEncodeAsPieces(text, 64, 0.1)

    return pieces


def encode_ids(spm_model, text, sample=False):
    """Turn a sentence into word piece ids."""
    pieces = encode_pieces(spm_model, text, return_unicode=False, sample=sample)
    ids = [spm_model.PieceToId(piece) for piece in pieces]
    return ids


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    else:
        raise ValueError("Unsupported string type: %s" % (type(text)))


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="UTF-8") as fin:
        for num, line in enumerate(fin):
            items = convert_to_unicode(line.rstrip()).split("\t")
            if len(items) > 2:
                break
            token = items[0]
            index = items[1] if len(items) == 2 else num
            token = token.strip()
            vocab[token] = int(index)
    return vocab


def convert_by_vocab(vocab, items):
    """Converts a sequence of [tokens|ids] using the vocab."""
    output = []
    for item in items:
        output.append(vocab[item])
    return output
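# Usage sketch for the module-level helpers above. Illustrative only: it
# assumes a trained SentencePiece model at "./spm.model" (a placeholder path,
# not a file shipped with this module).
#
#     spm_model = spm.SentencePieceProcessor()
#     spm_model.Load("./spm.model")
#     text = preprocess_text(u"  Hello,   World!  ", lower=True)
#     pieces = encode_pieces(spm_model, text)  # e.g. [u"▁hello", u",", ...]
#     ids = encode_ids(spm_model, text)        # the matching vocabulary ids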
class SentencePieceTokenizer(object):
    """Runs end-to-end tokenization."""

    @classmethod
    def add_cmdline_args(cls, parser):
        """Add cmdline arguments."""
        group = parser.add_argument_group("Tokenizer")
        group.add_argument("--vocab_path", type=str, required=True)
        group.add_argument("--do_lower_case", type=str2bool, default=False)
        group.add_argument("--spm_model_file", type=str, required=True)
        return group

    def __init__(self, args):
        self.spm_model = spm.SentencePieceProcessor()
        self.spm_model.Load(args.spm_model_file)
        self.vocab = load_vocab(args.vocab_path)
        self.do_lower_case = args.do_lower_case
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = preprocess_text(text, lower=self.do_lower_case)
        return encode_pieces(self.spm_model, text, return_unicode=True)

    def convert_tokens_to_ids(self, tokens):
        """Convert tokens to ids."""
        ret = []
        unk_id = self.vocab["<unk>"]
        for token in tokens:
            if token in self.vocab:
                ret.append(self.vocab[token])
            else:
                ret.append(unk_id)
        return ret

    def convert_ids_to_tokens(self, ids):
        """Convert ids to tokens."""
        return convert_by_vocab(self.inv_vocab, ids)

    def merge_subword(self, tokens):
        """Merge word pieces back into whitespace-separated words."""
        ret = []
        for token in tokens:
            if token.startswith(u"▁"):
                # "▁" marks the start of a new word in SentencePiece output.
                ret.append(token[1:])
            else:
                if len(ret):
                    ret[-1] += token
                else:
                    ret.append(token)

        ret = [token for token in ret if token]
        return ret

    def convert_ids_to_str(self, ids):
        """Convert ids to string."""
        tokens = self.convert_ids_to_tokens(ids)
        tokens = self.merge_subword(tokens)
        res = " ".join(tokens).replace("<s>", "")
        res = res.replace("</s>", "\n").replace("\n ", "\n").strip()
        return res


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False
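
if __name__ == "__main__":
    # Minimal end-to-end sketch, assuming a matching model/vocab pair exists
    # at the placeholder paths below ("./spm.model" and "./vocab.txt" are
    # illustrative names, not files shipped with this module).
    import argparse

    demo_args = argparse.Namespace(
        spm_model_file="./spm.model",
        vocab_path="./vocab.txt",
        do_lower_case=False)
    tokenizer = SentencePieceTokenizer(demo_args)

    tokens = tokenizer.tokenize(u"Hello world")
    ids = tokenizer.convert_tokens_to_ids(tokens)
    print(tokens)                             # word pieces, e.g. [u"▁Hello", u"▁world"]
    print(tokenizer.convert_ids_to_str(ids))  # merged back: "Hello world"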