# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ text preprocess """ import random import sys import os import base64 import numpy as np reload(sys) sys.setdefaultencoding("utf-8") from preprocess import tokenization class PreprocessorBasic(object): """ Main class for text preprocess """ def __init__(self, tokenizer_name, vocab_path, tagger_path="", nltk_data_path="", do_lower_case=True): self.do_lower_case = do_lower_case self.tokenizer = getattr(tokenization, tokenizer_name)(vocab_file=vocab_path, do_lower_case=do_lower_case) self.vocab = self.tokenizer.vocab def convert_sentence_to_ids_without_cls(self, sentence): """ Convert sentence to ids without cls """ tokens = self.tokenizer.tokenize(sentence) ids = self.tokenizer.convert_tokens_to_ids(tokens) return ids