From f4d9b46bf09ef35dd3fc65e609bd11a43ab66ec4 Mon Sep 17 00:00:00 2001
From: Steffy-zxf <48793257+Steffy-zxf@users.noreply.github.com>
Date: Tue, 2 Mar 2021 14:28:58 +0800
Subject: [PATCH] Fix the compatibility error caused by the upgrade of
 PretrainedTokenizer

---
 paddlehub/datasets/base_nlp_dataset.py |  84 +++++++++++++-----
 paddlehub/module/nlp_module.py         | 115 +++++++++++++------------
 paddlehub/utils/utils.py               |  12 ++-
 requirements.txt                       |   2 +-
 4 files changed, 130 insertions(+), 83 deletions(-)

diff --git a/paddlehub/datasets/base_nlp_dataset.py b/paddlehub/datasets/base_nlp_dataset.py
index 504a7d16..c4cebdda 100644
--- a/paddlehub/datasets/base_nlp_dataset.py
+++ b/paddlehub/datasets/base_nlp_dataset.py
@@ -11,13 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, List, Optional, Union, Tuple
 import csv
 import io
 import os
+from typing import Dict, List, Optional, Union, Tuple

 import numpy as np
 import paddle
+import paddlenlp
+from packaging.version import Version

 from paddlehub.env import DATA_HOME
 from paddlenlp.transformers import PretrainedTokenizer
@@ -27,7 +29,6 @@ from paddlehub.utils.utils import download, reseg_token_label, pad_sequence, tru
 from paddlehub.utils.xarfile import is_xarfile, unarchive


-
 class InputExample(object):
     """
     The input data structure of Transformer modules (BERT, ERNIE and so on).
@@ -233,7 +234,16 @@ class TextClassificationDataset(BaseNLPDataset, paddle.io.Dataset):
         records = []
         for example in examples:
             if isinstance(self.tokenizer, PretrainedTokenizer):
-                record = self.tokenizer.encode(text=example.text_a, text_pair=example.text_b, max_seq_len=self.max_seq_len)
+                if Version(paddlenlp.__version__) <= Version('2.0.0rc2'):
+                    record = self.tokenizer.encode(
+                        text=example.text_a, text_pair=example.text_b, max_seq_len=self.max_seq_len)
+                else:
+                    record = self.tokenizer(
+                        text=example.text_a,
+                        text_pair=example.text_b,
+                        max_seq_len=self.max_seq_len,
+                        pad_to_max_seq_len=True,
+                        return_length=True)
             elif isinstance(self.tokenizer, JiebaTokenizer):
                 pad_token = self.tokenizer.vocab.pad_token

@@ -246,7 +256,9 @@ class TextClassificationDataset(BaseNLPDataset, paddle.io.Dataset):
                 ids = pad_sequence(ids, self.max_seq_len, pad_token_id)
                 record = {'text': ids, 'seq_len': seq_len}
             else:
-                raise RuntimeError("Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer".format(type(self.tokenizer)))
+                raise RuntimeError(
+                    "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer"
+                    .format(type(self.tokenizer)))

             if not record:
                 logger.info(
@@ -260,17 +272,26 @@ class TextClassificationDataset(BaseNLPDataset, paddle.io.Dataset):
     def __getitem__(self, idx):
         record = self.records[idx]
         if isinstance(self.tokenizer, PretrainedTokenizer):
+            input_ids = np.array(record['input_ids'])
+            if Version(paddlenlp.__version__) >= Version('2.0.0rc5'):
+                token_type_ids = np.array(record['token_type_ids'])
+            else:
+                token_type_ids = record['segment_ids']
+
             if 'label' in record.keys():
-                return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['label'], dtype=np.int64)
+                return input_ids, token_type_ids, np.array(record['label'], dtype=np.int64)
             else:
-                return np.array(record['input_ids']), np.array(record['segment_ids'])
+                return input_ids, token_type_ids
+
         elif isinstance(self.tokenizer, JiebaTokenizer):
             if 'label' in record.keys():
                 return np.array(record['text']), np.array(record['label'], dtype=np.int64)
             else:
                 return np.array(record['text'])
         else:
-            raise RuntimeError("Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer".format(type(self.tokenizer)))
+            raise RuntimeError(
+                "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer".
+                format(type(self.tokenizer)))

     def __len__(self):
         return len(self.records)
@@ -303,6 +324,7 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset):
         is_file_with_header(:obj:bool, `optional`, default to :obj: False) :
             Whether or not the file is with the header introduction.
     """
+
     def __init__(self,
                  base_path: str,
                  tokenizer: Union[PretrainedTokenizer, JiebaTokenizer],
@@ -311,7 +333,7 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset):
                  data_file: str = None,
                  label_file: str = None,
                  label_list: list = None,
-                 split_char: str ="\002",
+                 split_char: str = "\002",
                  no_entity_label: str = "O",
                  ignore_label: int = -100,
                  is_file_with_header: bool = False):
@@ -365,7 +387,15 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset):
                 pad_token = self.tokenizer.pad_token

                 tokens, labels = reseg_token_label(tokenizer=self.tokenizer, tokens=tokens, labels=labels)
-                record = self.tokenizer.encode(text=tokens, max_seq_len=self.max_seq_len)
+                if Version(paddlenlp.__version__) <= Version('2.0.0rc2'):
+                    record = self.tokenizer.encode(text=tokens, max_seq_len=self.max_seq_len)
+                else:
+                    record = self.tokenizer(
+                        text=tokens,
+                        max_seq_len=self.max_seq_len,
+                        pad_to_max_seq_len=True,
+                        is_split_into_words=True,
+                        return_length=True)
             elif isinstance(self.tokenizer, JiebaTokenizer):
                 pad_token = self.tokenizer.vocab.pad_token

@@ -379,12 +409,13 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset):
                 record = {'text': ids, 'seq_len': seq_len}

             else:
-                raise RuntimeError("Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer".format(type(self.tokenizer)))
+                raise RuntimeError(
+                    "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer"
+                    .format(type(self.tokenizer)))

             if not record:
                 logger.info(
-                    "The text %s has been dropped as it has no words in the vocab after tokenization."
-                    % example.text_a)
+                    "The text %s has been dropped as it has no words in the vocab after tokenization." % example.text_a)
                 continue

             # convert labels into record
@@ -395,37 +426,46 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset):
             elif isinstance(self.tokenizer, JiebaTokenizer):
                 tokens_with_specical_token = [self.tokenizer.vocab.to_tokens(id_) for id_ in record['text']]
             else:
-                raise RuntimeError("Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer".format(type(self.tokenizer)))
+                raise RuntimeError(
+                    "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer"
+                    .format(type(self.tokenizer)))

             tokens_index = 0
             for token in tokens_with_specical_token:
-                if tokens_index < len(
-                        tokens) and token == tokens[tokens_index]:
-                    record["label"].append(
-                        self.label_list.index(labels[tokens_index]))
+                if tokens_index < len(tokens) and token == tokens[tokens_index]:
+                    record["label"].append(self.label_list.index(labels[tokens_index]))
                     tokens_index += 1
                 elif token in [pad_token]:
                     record["label"].append(self.ignore_label)  # label of special token
                 else:
-                    record["label"].append(
-                        self.label_list.index(self.no_entity_label))
+                    record["label"].append(self.label_list.index(self.no_entity_label))
             records.append(record)
         return records

     def __getitem__(self, idx):
         record = self.records[idx]
         if isinstance(self.tokenizer, PretrainedTokenizer):
+            input_ids = np.array(record['input_ids'])
+            seq_lens = np.array(record['seq_len'])
+            if Version(paddlenlp.__version__) >= Version('2.0.0rc5'):
+                token_type_ids = np.array(record['token_type_ids'])
+            else:
+                token_type_ids = np.array(record['segment_ids'])
+
             if 'label' in record.keys():
-                return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['seq_len']), np.array(record['label'], dtype=np.int64)
+                return input_ids, token_type_ids, seq_lens, np.array(record['label'], dtype=np.int64)
             else:
-                return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['seq_len'])
+                return input_ids, token_type_ids, seq_lens
+
         elif isinstance(self.tokenizer, JiebaTokenizer):
             if 'label' in record.keys():
                 return np.array(record['text']), np.array(record['seq_len']), np.array(record['label'], dtype=np.int64)
             else:
                 return np.array(record['text']), np.array(record['seq_len'])
         else:
-            raise RuntimeError("Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer".format(type(self.tokenizer)))
+            raise RuntimeError(
+                "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer".
+                format(type(self.tokenizer)))

     def __len__(self):
         return len(self.records)
diff --git a/paddlehub/module/nlp_module.py b/paddlehub/module/nlp_module.py
index 49784320..5973076e 100644
--- a/paddlehub/module/nlp_module.py
+++ b/paddlehub/module/nlp_module.py
@@ -11,9 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-# FIXME(zhangxuefei): remove this file after paddlenlp is released.
-
 import copy
 import functools
 import inspect
@@ -25,6 +22,7 @@ from typing import List, Tuple

 import paddle
 import paddle.nn as nn
+from packaging.version import Version
 from paddle.dataset.common import DATA_HOME
 from paddle.utils.download import get_path_from_url
 from paddlehub.module.module import serving, RunModule, runnable
@@ -32,11 +30,11 @@ from paddlehub.module.module import serving, RunModule, runnable
 from paddlehub.utils.log import logger
 from paddlehub.utils.utils import reseg_token_label

+import paddlenlp
 from paddlenlp.embeddings.token_embedding import EMBEDDING_HOME, EMBEDDING_URL_ROOT
 from paddlenlp.data import JiebaTokenizer
 from paddlehub.compat.module.nlp_module import DataFormatError
-

 __all__ = [
     'PretrainedModel',
     'register_base_model',
@@ -357,14 +355,9 @@ class TextServing(object):
     """
     A base class for text model which supports serving.
     """
+
     @serving
-    def predict_method(
-        self,
-        data: List[List[str]],
-        max_seq_len: int = 128,
-        batch_size: int = 1,
-        use_gpu: bool = False
-    ):
+    def predict_method(self, data: List[List[str]], max_seq_len: int = 128, batch_size: int = 1, use_gpu: bool = False):
         """
         Run predict method as a service. Serving as a task which is specified from serving config.
@@ -391,20 +384,16 @@ class TextServing(object):

         if self.task == 'token-cls':
             # remove labels of [CLS] token and pad tokens
-            results = [
-                token_labels[1:len(data[i][0])+1] for i, token_labels in enumerate(results)
-            ]
+            results = [token_labels[1:len(data[i][0]) + 1] for i, token_labels in enumerate(results)]
             return results
-        elif self.task is None: # embedding service
+        elif self.task is None:  # embedding service
             results = self.get_embedding(data, use_gpu)
             return results
-        else: # unknown service
-            logger.error(
-                f'Unknown task {self.task}, current tasks supported:\n'
-                '1. seq-cls: sequence classification service;\n'
-                '2. token-cls: sequence labeling service;\n'
-                '3. None: embedding service'
-            )
+        else:  # unknown service
+            logger.error(f'Unknown task {self.task}, current tasks supported:\n'
+                         '1. seq-cls: sequence classification service;\n'
+                         '2. token-cls: sequence labeling service;\n'
+                         '3. None: embedding service')
             return

@@ -422,11 +411,33 @@ class TransformerModule(RunModule, TextServing):
         if self.task == 'token-cls':
             # Extra processing of token-cls task
             tokens = text[0].split(split_char)
             text[0], _ = reseg_token_label(tokenizer=tokenizer, tokens=tokens)
+            is_split_into_words = True
+        else:
+            is_split_into_words = False
         if len(text) == 1:
-            encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len, pad_to_max_seq_len=pad_to_max_seq_len)
+            if Version(paddlenlp.__version__) <= Version('2.0.0rc2'):
+                encoded_inputs = tokenizer.encode(
+                    text[0], text_pair=None, max_seq_len=max_seq_len, pad_to_max_seq_len=pad_to_max_seq_len)
+            else:
+                encoded_inputs = tokenizer(
+                    text=text[0],
+                    max_seq_len=max_seq_len,
+                    pad_to_max_seq_len=True,
+                    is_split_into_words=is_split_into_words,
+                    return_length=True)
         elif len(text) == 2:
-            encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len, pad_to_max_seq_len=pad_to_max_seq_len)
+            if Version(paddlenlp.__version__) <= Version('2.0.0rc2'):
+                encoded_inputs = tokenizer.encode(
+                    text[0], text_pair=text[1], max_seq_len=max_seq_len, pad_to_max_seq_len=pad_to_max_seq_len)
+            else:
+                encoded_inputs = tokenizer(
+                    text=text[0],
+                    text_pair=text[1],
+                    max_seq_len=max_seq_len,
+                    pad_to_max_seq_len=True,
+                    is_split_into_words=is_split_into_words,
+                    return_length=True)
         else:
             raise RuntimeError(
                 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text))
@@ -442,7 +453,14 @@ class TransformerModule(RunModule, TextServing):
         examples = []
         for text in data:
             encoded_inputs = self._convert_text_to_input(tokenizer, text, max_seq_len, split_char)
-            examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids']))
+            input_ids = encoded_inputs['input_ids']
+
+            if Version(paddlenlp.__version__) >= Version('2.0.0rc5'):
+                token_type_ids = encoded_inputs['token_type_ids']
+            else:
+                token_type_ids = encoded_inputs['segment_ids']
+
+            examples.append((input_ids, token_type_ids))

         # Separates data into some batches.
         one_batch = []
@@ -468,7 +486,8 @@ class TransformerModule(RunModule, TextServing):

         if self.task == 'seq-cls':
             predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2])
         elif self.task == 'token-cls':
-            predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], seq_lengths=batch[2], labels=batch[3])
+            predictions, avg_loss, metric = self(
+                input_ids=batch[0], token_type_ids=batch[1], seq_lengths=batch[2], labels=batch[3])
         self.metric.reset()
         return {'loss': avg_loss, 'metrics': metric}
@@ -485,7 +504,8 @@ class TransformerModule(RunModule, TextServing):

         if self.task == 'seq-cls':
             predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2])
         elif self.task == 'token-cls':
-            predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], seq_lengths=batch[2], labels=batch[3])
+            predictions, avg_loss, metric = self(
+                input_ids=batch[0], token_type_ids=batch[1], seq_lengths=batch[2], labels=batch[3])
         self.metric.reset()
         return {'metrics': metric}
@@ -502,20 +522,14 @@ class TransformerModule(RunModule, TextServing):
         if self.task is not None:
             raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task)

-        return self.predict(
-            data=data,
-            use_gpu=use_gpu
-        )
-
-    def predict(
-        self,
-        data: List[List[str]],
-        max_seq_len: int = 128,
-        split_char: str = '\002',
-        batch_size: int = 1,
-        use_gpu: bool = False
-    ):
+        return self.predict(data=data, use_gpu=use_gpu)
+
+    def predict(self,
+                data: List[List[str]],
+                max_seq_len: int = 128,
+                split_char: str = '\002',
+                batch_size: int = 1,
+                use_gpu: bool = False):
         """
         Predicts the data labels.
@@ -532,12 +546,10 @@ class TransformerModule(RunModule, TextServing):
         """
         if self.task not in self._tasks_supported \
                 and self.task is not None: # None for getting embedding
-            raise RuntimeError(
-                f'Unknown task {self.task}, current tasks supported:\n'
-                '1. seq-cls: sequence classification;\n'
-                '2. token-cls: sequence labeling;\n'
-                '3. None: embedding'
-            )
+            raise RuntimeError(f'Unknown task {self.task}, current tasks supported:\n'
+                               '1. seq-cls: sequence classification;\n'
+                               '2. token-cls: sequence labeling;\n'
+                               '3. None: embedding')

         paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu')
@@ -563,10 +575,7 @@ class TransformerModule(RunModule, TextServing):
                 results.extend(token_labels)
             elif self.task == None:
                 sequence_output, pooled_output = self(input_ids, segment_ids)
-                results.append([
-                    pooled_output.squeeze(0).numpy().tolist(),
-                    sequence_output.squeeze(0).numpy().tolist()
-                ])
+                results.append([pooled_output.squeeze(0).numpy().tolist(), sequence_output.squeeze(0).numpy().tolist()])

         return results
@@ -575,6 +584,7 @@ class EmbeddingServing(object):
     """
     A base class for embedding model which supports serving.
     """
+
     @serving
     def calc_similarity(self, data: List[List[str]]):
         """
@@ -593,8 +603,7 @@ class EmbeddingServing(object):
             for word in word_pair:
                 if self.get_idx_from_word(word) == \
                     self.get_idx_from_word(self.vocab.unk_token):
-                    raise RuntimeError(
-                        f'Word "{word}" is not in vocab. Please check your inputs.')
+                    raise RuntimeError(f'Word "{word}" is not in vocab. Please check your inputs.')
             results.append(str(self.cosine_sim(*word_pair)))
         return results
@@ -627,5 +636,5 @@ class EmbeddingModule(RunModule, EmbeddingServing):
         """
         if self.embedding_name.endswith('.en'): # English
             raise NotImplementedError # TODO: (chenxiaojie) add tokenizer of English embedding
-        else: # Chinese
+        else:  # Chinese
             return JiebaTokenizer(self.vocab)
diff --git a/paddlehub/utils/utils.py b/paddlehub/utils/utils.py
index 4152057d..f17f39f3 100644
--- a/paddlehub/utils/utils.py
+++ b/paddlehub/utils/utils.py
@@ -336,12 +336,11 @@ def reseg_token_label(tokenizer, tokens: List[str], labels: List[str] = None):
     '''
     if labels:
         if len(tokens) != len(labels):
-            raise ValueError(
-                "The length of tokens must be same with labels")
+            raise ValueError("The length of tokens must be same with labels")
         ret_tokens = []
         ret_labels = []
         for token, label in zip(tokens, labels):
-            sub_token = tokenizer(token)
+            sub_token = tokenizer._tokenize(token)
             if len(sub_token) == 0:
                 continue
             ret_tokens.extend(sub_token)
@@ -354,13 +353,12 @@ def reseg_token_label(tokenizer, tokens: List[str], labels: List[str] = None):
             ret_labels.extend([sub_label] * (len(sub_token) - 1))

         if len(ret_tokens) != len(ret_labels):
-            raise ValueError(
-                "The length of ret_tokens can't match with labels")
+            raise ValueError("The length of ret_tokens can't match with labels")
         return ret_tokens, ret_labels
     else:
         ret_tokens = []
         for token in tokens:
-            sub_token = tokenizer(token)
+            sub_token = tokenizer._tokenize(token)
             if len(sub_token) == 0:
                 continue
             ret_tokens.extend(sub_token)
@@ -376,7 +374,7 @@ def pad_sequence(ids: List[int], max_seq_len: int, pad_token_id: int):
     assert len(ids) <= max_seq_len, \
         f'The input length {len(ids)} is greater than max_seq_len {max_seq_len}. '\
         'Please check the input list and max_seq_len if you really want to pad a sequence.'
-    return ids[:] + [pad_token_id]*(max_seq_len-len(ids))
+    return ids[:] + [pad_token_id] * (max_seq_len - len(ids))


 def trunc_sequence(ids: List[int], max_seq_len: int):
diff --git a/requirements.txt b/requirements.txt
index 8102079e..2721ee4b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,4 +16,4 @@ tqdm
 visualdl >= 2.0.0
 # gunicorn not support windows
 gunicorn >= 19.10.0; sys_platform != "win32"
-paddlenlp >= 2.0.0b2
\ No newline at end of file
+paddlenlp >= 2.0.0rc5
-- 
GitLab
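
Note for readers porting similar code: every call site patched above applies the same two-part shim. First, it dispatches on the paddlenlp version between the old explicit `tokenizer.encode(...)` API (<= 2.0.0rc2) and the newer callable-tokenizer API; second, it normalizes the segment-id key, which paddlenlp renames from 'segment_ids' to 'token_type_ids' as of 2.0.0rc5. The sketch below distills that pattern into a single helper. It is illustrative, not part of the patch: the name `encode_compat` is invented here, and the keyword arguments and dict keys are assumed to behave exactly as the diff shows.

import paddlenlp
from packaging.version import Version


def encode_compat(tokenizer, text, text_pair=None, max_seq_len=128):
    """Tokenize under either paddlenlp API and normalize key names (hypothetical helper)."""
    if Version(paddlenlp.__version__) <= Version('2.0.0rc2'):
        # Old API: explicit encode() call, as in the pre-patch code paths.
        record = tokenizer.encode(text=text, text_pair=text_pair, max_seq_len=max_seq_len)
    else:
        # New API: the tokenizer itself is callable; return_length keeps the
        # 'seq_len' entry that the sequence-labeling code path relies on.
        record = tokenizer(
            text=text,
            text_pair=text_pair,
            max_seq_len=max_seq_len,
            pad_to_max_seq_len=True,
            return_length=True)
    # paddlenlp >= 2.0.0rc5 names the segment ids 'token_type_ids'; older
    # releases use 'segment_ids'. Expose one key so callers need not branch.
    if 'token_type_ids' not in record:
        record['token_type_ids'] = record['segment_ids']
    return record

Keying off the returned dict (the last three lines) is slightly more defensive than the explicit `>= Version('2.0.0rc5')` check the patch itself uses, but on the versions the patch targets both resolve to the same behavior.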