# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os

import numpy as np
import paddle
import paddle.nn.functional as F

from paddlenlp.data import Pad, Tuple
import paddlenlp as ppnlp

MODEL_CLASSES = {
    "bert": (ppnlp.transformers.BertForSequenceClassification,
             ppnlp.transformers.BertTokenizer),
    "ernie": (ppnlp.transformers.ErnieForSequenceClassification,
              ppnlp.transformers.ErnieTokenizer),
    "roberta": (ppnlp.transformers.RobertaForSequenceClassification,
                ppnlp.transformers.RobertaTokenizer),
}


# yapf: disable
def parse_args():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_type", default="ernie", required=True, type=str,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default="ernie-tiny", required=True, type=str,
                        help="Path to pre-trained model or shortcut name selected in the list: " +
                             ", ".join(sum([list(classes[-1].pretrained_init_configuration.keys())
                                            for classes in MODEL_CLASSES.values()], [])))
    parser.add_argument("--params_path", type=str, required=True,
                        help="The path to model parameters to be loaded.")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. "
                             "Sequences longer than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--batch_size", default=32, type=int,
                        help="Batch size per GPU/CPU for prediction.")
    parser.add_argument("--n_gpu", type=int, default=1,
                        help="Number of GPUs to use, 0 for CPU.")
    args = parser.parse_args()
    return args
# yapf: enable


def convert_example(example,
                    tokenizer,
                    label_list,
                    max_seq_length=512,
                    is_test=False):
    """
    Builds model inputs from a sequence or a pair of sequences for sequence
    classification tasks by concatenating and adding special tokens, and
    creates a mask from the two sequences passed for a sequence-pair
    classification task.

    A BERT sequence has the following format:

    - single sequence: ``[CLS] X [SEP]``
    - pair of sequences: ``[CLS] A [SEP] B [SEP]``

    A BERT sequence-pair mask has the following format:
    ::

        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |

    If only one sequence is given, only the first portion of the mask (0's)
    is returned.

    Args:
        example(obj:`list[str]`): List of input data, containing the text and
            the label if it has one.
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer` which
            contains most of the methods. Users should refer to the
            superclass for more information regarding those methods.
        label_list(obj:`list[str]`): All the labels that the data has.
        max_seq_length(obj:`int`): The maximum total input sequence length
            after tokenization. Sequences longer than this will be truncated,
            sequences shorter will be padded.
        is_test(obj:`bool`, defaults to `False`): Whether the example
            contains a label or not.
    Returns:
        input_ids(obj:`list[int]`): The list of token ids.
        segment_ids(obj:`list[int]`): The list of the sequence-pair mask.
        label(obj:`numpy.ndarray`, data type of int64, optional): The input
            label if `is_test` is False.
    """
    if is_test:
        text = example
    else:
        # The example carries both the text and its label.
        text, label = example
    encoded_inputs = tokenizer.encode(text=text, max_seq_len=max_seq_length)
    input_ids = encoded_inputs["input_ids"]
    segment_ids = encoded_inputs["segment_ids"]

    if not is_test:
        # Map label strings to integer ids.
        label_map = {l: i for (i, l) in enumerate(label_list)}
        label = label_map[label]
        label = np.array([label], dtype="int64")
        return input_ids, segment_ids, label
    else:
        return input_ids, segment_ids


def predict(model, data, tokenizer, label_map, batch_size=1):
    """
    Predicts the data labels.

    Args:
        model (obj:`paddle.nn.Layer`): A model to classify texts.
        data (obj:`list[str]`): The raw texts to be classified.
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from
            :class:`~paddlenlp.transformers.PretrainedTokenizer` which
            contains most of the methods. Users should refer to the
            superclass for more information regarding those methods.
        label_map(obj:`dict`): The label id (key) to label str (value) map.
        batch_size(obj:`int`, defaults to 1): The number of examples per batch.

    Returns:
        results(obj:`list`): All the predicted labels.
    """
    examples = []
    for text in data:
        input_ids, segment_ids = convert_example(
            text,
            tokenizer,
            label_list=label_map.values(),
            max_seq_length=args.max_seq_length,
            is_test=True)
        examples.append((input_ids, segment_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment_ids
    ): fn(samples)

    # Separate the examples into batches.
    batches = []
    one_batch = []
    for example in examples:
        one_batch.append(example)
        if len(one_batch) == batch_size:
            batches.append(one_batch)
            one_batch = []
    if one_batch:
        # The last batch whose size is less than the configured batch_size.
        batches.append(one_batch)

    results = []
    model.eval()
    for batch in batches:
        input_ids, segment_ids = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        segment_ids = paddle.to_tensor(segment_ids)
        logits = model(input_ids, segment_ids)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
    return results


if __name__ == "__main__":
    args = parse_args()
    paddle.set_device("gpu" if args.n_gpu else "cpu")

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    if args.model_name_or_path == "ernie-tiny":
        # ErnieTinyTokenizer is special for the ernie-tiny pretrained model.
        tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained(
            args.model_name_or_path)
    else:
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    data = [
        # "This hotel is rather old; even the discounted rooms are mediocre.
        # Average overall."
        '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般',
        # "I started watching with great excitement, but partway through I
        # found that after the feature ended, a Mickey Mouse cartoon came on."
        '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片',
        # "For an old four-star hotel, the rooms are still very tidy; quite
        # good. The airport pickup service is great: you can check in on the
        # way, which saves time."
        '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。',
    ]
    label_map = {0: 'negative', 1: 'positive'}

    model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=len(label_map))
    if args.params_path and os.path.isfile(args.params_path):
        state_dict = paddle.load(args.params_path)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.params_path)

    results = predict(
        model, data, tokenizer, label_map, batch_size=args.batch_size)
    for idx, text in enumerate(data):
        print('Data: {} \t Label: {}'.format(text, results[idx]))
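
# A minimal usage sketch. The script filename (predict.py) and the checkpoint
# path below are illustrative assumptions, not part of this script; the
# checkpoint is whatever `paddle.save` produced during fine-tuning:
#
#   python predict.py --model_type ernie \
#       --model_name_or_path ernie-tiny \
#       --params_path ./checkpoints/model_state.pdparams \
#       --n_gpu 1
#
# Expected output is one "Data: ... \t Label: ..." line per input text, with
# the label drawn from `label_map` ('negative' or 'positive').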