# coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import copy import io import json import os import six import unicodedata from paddle.utils.download import get_path_from_url from paddlenlp.utils.env import MODEL_HOME from ..data.vocab import Vocab from .utils import InitTrackerMeta, fn_args_to_dict __all__ = ['PretrainedTokenizer'] def convert_to_unicode(text): """ Converts `text` to Unicode (if it's not already), assuming utf-8 input. Args: text (str|bytes): Text to be converted to unicode. Returns: str: converted text. """ if isinstance(text, str): return text elif isinstance(text, bytes): return text.decode("utf-8", "ignore") else: raise ValueError("Unsupported string type: %s" % (type(text))) def whitespace_tokenize(text): """ Runs basic whitespace cleaning and splitting on a peice of text. Args: text (str): Text to be tokened. Returns: list(str): Token list. """ text = text.strip() if not text: return [] tokens = text.split() return tokens def _is_whitespace(char): """ Checks whether `chars` is a whitespace character. """ # \t, \n, and \r are technically contorl characters but we treat them # as whitespace since they are generally considered as such. if char == " " or char == "\t" or char == "\n" or char == "\r": return True cat = unicodedata.category(char) if cat == "Zs": return True return False def _is_control(char): """Checks whether `chars` is a control character.""" # These are technically control characters but we count them as whitespace # characters. if char == "\t" or char == "\n" or char == "\r": return False cat = unicodedata.category(char) if cat.startswith("C"): return True return False def _is_punctuation(char): """Checks whether `chars` is a punctuation character.""" cp = ord(char) # We treat all non-letter/number ASCII as punctuation. # Characters such as "^", "$", and "`" are not in the Unicode # Punctuation class but we treat them as punctuation anyways, for # consistency. if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): return True cat = unicodedata.category(char) if cat.startswith("P"): return True return False @six.add_metaclass(InitTrackerMeta) class PretrainedTokenizer(object): """ The base class for all pretrained tokenizers. It provides some attributes and common methods for all pretrained tokenizers, including attributes for and special tokens (arguments of `__init__` whose name ends with `_token`) and methods for saving and loading. It also includes some class attributes (should be set by derived classes): - `tokenizer_config_file` (str): represents the file name for saving and loading tokenizer configuration, it's value is `tokenizer_config.json`. - `resource_files_names` (dict): use this to map resource related arguments of `__init__` to specific file names for saving and loading. - `pretrained_resource_files_map` (dict): The dict has the same keys as `resource_files_names`, the values are also dict mapping specific pretrained model name to URL linking to vocabulary or other resources. - `pretrained_init_configuration` (dict): The dict has pretrained model names as keys, and the values are also dict preserving corresponding configuration for tokenizer initialization. """ tokenizer_config_file = "tokenizer_config.json" pretrained_init_configuration = {} resource_files_names = {} # keys are arguments of __init__ pretrained_resource_files_map = {} def _wrap_init(self, original_init, *args, **kwargs): """ It would be hooked after `__init__` to add specials tokens (arguments of `__init__` whose name ends with `_token`) as attributes of the tokenizer instance. """ # expose tokens as attributes init_dict = fn_args_to_dict(original_init, *args, **kwargs) special_tokens_map = {} for identifier, token in init_dict.items(): if identifier.endswith('_token'): # setattr(self, identifier, token) special_tokens_map[identifier] = token self.special_tokens_map = special_tokens_map @property def all_special_tokens(self): """ List all the special tokens ('', ''...) mapped to class attributes (cls_token, unk_token...). """ all_toks = [] set_attr = self.special_tokens_map for attr_value in set_attr.values(): all_toks = all_toks + (list(attr_value) if isinstance(attr_value, ( list, tuple)) else [attr_value]) all_toks = list(set(all_toks)) return all_toks @property def all_special_ids(self): """ List the vocabulary indices of the special tokens ('', ''...) mapped to class attributes (cls_token, unk_token...). """ all_toks = self.all_special_tokens all_ids = self.convert_tokens_to_ids(all_toks) return all_ids def convert_tokens_to_ids(self, tokens): """ Converts a sequence of tokens into ids using the vocab. The tokenizer should has the `vocab` attribute. Args: tokens (list(str)): List of tokens. Returns: list: Converted id list. """ return self.vocab.to_indices(tokens) def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (list of string) to a single string by using :code:`' '.join(tokens)` . Args: tokens (list(str)): List of tokens. Returns: str: Converted string. """ return " ".join(tokens) def convert_ids_to_tokens(self, ids, skip_special_tokens=False): """ Converts a single index or a sequence of indices (integers) in a token or a sequence of tokens (str) by using the vocabulary. Args: skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False """ tokens = self.vocab.to_tokens(ids) if skip_special_tokens and isinstance(tokens, list): tokens = [ token for token in tokens if token not in self.all_special_tokens ] return tokens @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): """ Instantiate an instance of `PretrainedTokenizer` from a predefined tokenizer specified by name or path., and it always corresponds to a pretrained model. Args: pretrained_model_name_or_path (str): A name of or a file path to a pretrained model. *args (tuple): position arguments for `__init__`. If provide, use this as position argument values for tokenizer initialization. **kwargs (dict): keyword arguments for `__init__`. If provide, use this to update pre-defined keyword argument values for tokenizer initialization. Returns: PretrainedTokenizer: An instance of PretrainedTokenizer. """ pretrained_models = list(cls.pretrained_init_configuration.keys()) vocab_files = {} init_configuration = {} if pretrained_model_name_or_path in pretrained_models: for file_id, map_list in cls.pretrained_resource_files_map.items(): vocab_files[file_id] = map_list[pretrained_model_name_or_path] init_configuration = copy.deepcopy( cls.pretrained_init_configuration[ pretrained_model_name_or_path]) else: if os.path.isdir(pretrained_model_name_or_path): for file_id, file_name in cls.resource_files_names.items(): full_file_name = os.path.join(pretrained_model_name_or_path, file_name) vocab_files[file_id] = full_file_name vocab_files["tokenizer_config_file"] = os.path.join( pretrained_model_name_or_path, cls.tokenizer_config_file) else: raise ValueError( "Calling {}.from_pretrained() with a model identifier or the " "path to a directory instead. The supported model " "identifiers are as follows: {}".format( cls.__name__, cls.pretrained_init_configuration.keys())) default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path) resolved_vocab_files = {} for file_id, file_path in vocab_files.items(): resolved_vocab_files[ file_id] = file_path if file_path is None or os.path.isfile( file_path) else get_path_from_url(file_path, default_root, None) # Prepare tokenizer initialization kwargs # Did we saved some inputs and kwargs to reload ? tokenizer_config_file = resolved_vocab_files.pop( "tokenizer_config_file", None) if tokenizer_config_file is not None: with io.open(tokenizer_config_file, encoding="utf-8") as f: init_kwargs = json.load(f) else: init_kwargs = init_configuration # position args are stored in kwargs, maybe better not include init_args = init_kwargs.pop("init_args", ()) init_kwargs.pop("init_class", None) # Update with newly provided args and kwargs init_args = init_args if not args else args init_kwargs.update(kwargs) # Merge resolved_vocab_files arguments in init_kwargs if not including. # Maybe need more ways to load resources. for args_name, file_path in resolved_vocab_files.items(): if args_name not in init_kwargs: init_kwargs[args_name] = file_path # TODO(guosheng): avoid reduplication of position args and key word args tokenizer = cls(*init_args, **init_kwargs) return tokenizer def save_pretrained(self, save_directory): """ Save tokenizer configuration and related resources to files under `save_directory`. Args: save_directory (str): Directory to save files into. """ assert os.path.isdir( save_directory ), "Saving directory ({}) should be a directory".format(save_directory) tokenizer_config_file = os.path.join(save_directory, self.tokenizer_config_file) # init_config is set in metaclass created `__init__`, tokenizer_config = self.init_config with io.open(tokenizer_config_file, "w", encoding="utf-8") as f: f.write(json.dumps(tokenizer_config, ensure_ascii=False)) self.save_resources(save_directory) def save_resources(self, save_directory): """ Save tokenizer related resources to files under `save_directory`. Args: save_directory (str): Directory to save files into. """ assert hasattr(self, 'vocab') and len( self.resource_files_names) == 1, "Must overwrite `save_resources`" file_name = os.path.join(save_directory, list(self.resource_files_names.values())[0]) self.save_vocabulary(file_name, self.vocab) @staticmethod def load_vocabulary(filepath, unk_token=None, pad_token=None, bos_token=None, eos_token=None, **kwargs): """ Instantiate an instance of `Vocab` from a file reserving all tokens by using `Vocab.from_dict`. The file contains a token per line, and the line number would be the index of corresponding token. Args: filepath (str): path of file to construct vocabulary. unk_token (str): special token for unknown token. If no need, it also could be None. Default: None. pad_token (str): special token for padding token. If no need, it also could be None. Default: None. bos_token (str): special token for bos token. If no need, it also could be None. Default: None. eos_token (str): special token for eos token. If no need, it also could be None. Default: None. **kwargs (dict): keyword arguments for `Vocab.from_dict`. Returns: Vocab: An instance of `Vocab`. """ token_to_idx = {} with io.open(filepath, 'r', encoding='utf-8') as f: for index, line in enumerate(f): token = line.rstrip('\n') token_to_idx[token] = int(index) vocab = Vocab.from_dict( token_to_idx, unk_token=unk_token, pad_token=pad_token, bos_token=bos_token, eos_token=eos_token, **kwargs) return vocab @staticmethod def save_vocabulary(filepath, vocab): """ Save all tokens to a vocabulary file. The file contains a token per line, and the line number would be the index of corresponding token. Agrs: filepath (str): File path to be saved to. vocab (Vocab|dict): the Vocab or dict instance to be saved. """ if isinstance(vocab, Vocab): tokens = vocab.idx_to_token else: tokens = sorted(vocab.keys(), key=lambda token: vocab[token]) with io.open(filepath, 'w', encoding='utf-8') as f: for token in tokens: f.write(token + '\n') def __getattr__(self, name): if name.endswith('_token'): return self.special_tokens_map[name] elif name.endswith('_token_id'): return self.convert_tokens_to_ids(self.special_tokens_map[name[: -3]]) raise AttributeError("'{}' object has no attribute '{}'".format( type(self).__name__, name)) def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0): """ Truncates a sequence pair in place to the maximum length. Args: ids: list of tokenized input ids. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``): number of tokens to remove using the truncation strategy truncation_strategy: string selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_seq_len starting from the longest one at each token (when there is a pair of input sequences). Overflowing tokens only contains overflow from the first sequence. - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_seq_len) stride (:obj:`int`, `optional`, defaults to ``0``): If set to a number along with max_seq_len, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. """ if num_tokens_to_remove <= 0: return ids, pair_ids, [] if truncation_strategy == 'longest_first': overflowing_tokens = [] for _ in range(num_tokens_to_remove): if pair_ids is None or len(ids) > len(pair_ids): overflowing_tokens = [ids[-1]] + overflowing_tokens ids = ids[:-1] else: pair_ids = pair_ids[:-1] window_len = min(len(ids), stride) if window_len > 0: overflowing_tokens = ids[-window_len:] + overflowing_tokens elif truncation_strategy == 'only_first': assert len(ids) > num_tokens_to_remove window_len = min(len(ids), stride + num_tokens_to_remove) overflowing_tokens = ids[-window_len:] ids = ids[:-num_tokens_to_remove] elif truncation_strategy == 'only_second': assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove window_len = min(len(pair_ids), stride + num_tokens_to_remove) overflowing_tokens = pair_ids[-window_len:] pair_ids = pair_ids[:-num_tokens_to_remove] elif truncation_strategy == 'do_not_truncate': raise ValueError( "Input sequence are too long for max_length. Please select a truncation strategy." ) else: raise ValueError( "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" ) return (ids, pair_ids, overflowing_tokens)