From f67ad5be678278c67e8714bd08dacb38bdb0ebb2 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 30 Dec 2020 23:57:48 +0800 Subject: [PATCH] Update transformer modules (#1147) * Add token-cls task for transformer modules * Fix numpy dtype mismatch in windows * Update README.md * Update token-cls task for ernie_tiny * Update token-cls task for ernie_tiny * Update token-cls task for other Transformer modules * Update README.md of modules and token-cls demo * Add chunk_scheme arg check in SeqLabelingDataset * Update ChunkEvaluator and paddlenlp requirement * Update README.md * Update token-cls demo --- demo/sequence_labeling/README.md | 41 ++- demo/sequence_labeling/train.py | 52 ++- .../language_model/bert-base-cased/README.md | 52 +-- .../language_model/bert-base-cased/module.py | 215 ++++--------- .../bert-base-chinese/README.md | 55 ++-- .../bert-base-chinese/module.py | 221 ++++--------- .../bert-base-multilingual-cased/README.md | 55 ++-- .../bert-base-multilingual-cased/module.py | 221 ++++--------- .../bert-base-multilingual-uncased/README.md | 55 ++-- .../bert-base-multilingual-uncased/module.py | 222 ++++--------- .../bert-base-uncased/README.md | 55 ++-- .../bert-base-uncased/module.py | 215 ++++--------- .../language_model/bert-large-cased/README.md | 55 ++-- .../language_model/bert-large-cased/module.py | 214 ++++--------- .../bert-large-uncased/README.md | 57 ++-- .../bert-large-uncased/module.py | 215 ++++--------- .../language_model/chinese_bert_wwm/README.md | 250 +++++---------- .../chinese_bert_wwm/model/__init__.py | 0 .../chinese_bert_wwm/model/bert.py | 197 ------------ .../model/transformer_encoder.py | 295 ------------------ .../language_model/chinese_bert_wwm/module.py | 148 ++++++--- .../chinese_bert_wwm_ext/README.md | 152 +++++---- .../chinese_bert_wwm_ext/model/__init__.py | 0 .../chinese_bert_wwm_ext/model/bert.py | 197 ------------ .../model/transformer_encoder.py | 295 ------------------ .../chinese_bert_wwm_ext/module.py | 148 ++++++--- modules/text/language_model/ernie/README.md | 53 ++-- modules/text/language_model/ernie/module.py | 214 ++++--------- .../text/language_model/ernie_tiny/README.md | 34 +- .../text/language_model/ernie_tiny/module.py | 73 ++--- .../ernie_v2_eng_base/README.md | 52 +-- .../ernie_v2_eng_base/module.py | 221 ++++--------- .../ernie_v2_eng_large/README.md | 54 ++-- .../ernie_v2_eng_large/module.py | 221 ++++--------- modules/text/language_model/rbt3/README.md | 152 +++++---- .../language_model/rbt3/model/__init__.py | 0 .../text/language_model/rbt3/model/bert.py | 197 ------------ .../rbt3/model/transformer_encoder.py | 295 ------------------ modules/text/language_model/rbt3/module.py | 145 ++++++--- modules/text/language_model/rbtl3/README.md | 152 +++++---- .../language_model/rbtl3/model/__init__.py | 0 .../text/language_model/rbtl3/model/bert.py | 197 ------------ .../rbtl3/model/transformer_encoder.py | 295 ------------------ modules/text/language_model/rbtl3/module.py | 145 ++++++--- .../roberta-wwm-ext-large/README.md | 56 ++-- .../roberta-wwm-ext-large/module.py | 219 ++++--------- .../language_model/roberta-wwm-ext/README.md | 56 ++-- .../language_model/roberta-wwm-ext/module.py | 219 ++++--------- paddlehub/datasets/base_nlp_dataset.py | 35 +-- paddlehub/datasets/msra_ner.py | 12 +- paddlehub/module/modeling_bert.py | 289 ----------------- paddlehub/module/modeling_ernie.py | 243 --------------- paddlehub/module/modeling_roberta.py | 215 ------------- paddlehub/module/nlp_module.py | 14 +- requirements.txt | 2 
+- 55 files changed, 2111 insertions(+), 5431 deletions(-) delete mode 100644 modules/text/language_model/chinese_bert_wwm/model/__init__.py delete mode 100644 modules/text/language_model/chinese_bert_wwm/model/bert.py delete mode 100644 modules/text/language_model/chinese_bert_wwm/model/transformer_encoder.py delete mode 100644 modules/text/language_model/chinese_bert_wwm_ext/model/__init__.py delete mode 100644 modules/text/language_model/chinese_bert_wwm_ext/model/bert.py delete mode 100644 modules/text/language_model/chinese_bert_wwm_ext/model/transformer_encoder.py delete mode 100644 modules/text/language_model/rbt3/model/__init__.py delete mode 100644 modules/text/language_model/rbt3/model/bert.py delete mode 100644 modules/text/language_model/rbt3/model/transformer_encoder.py delete mode 100644 modules/text/language_model/rbtl3/model/__init__.py delete mode 100644 modules/text/language_model/rbtl3/model/bert.py delete mode 100644 modules/text/language_model/rbtl3/model/transformer_encoder.py delete mode 100644 paddlehub/module/modeling_bert.py delete mode 100644 paddlehub/module/modeling_ernie.py delete mode 100644 paddlehub/module/modeling_roberta.py diff --git a/demo/sequence_labeling/README.md b/demo/sequence_labeling/README.md index fda17c32..04c3450a 100644 --- a/demo/sequence_labeling/README.md +++ b/demo/sequence_labeling/README.md @@ -28,10 +28,21 @@ python train.py 使用PaddleHub Fine-tune API进行Fine-tune可以分为4个步骤。 ### Step1: 选择模型 + +在命名实体识别的任务中,因不同的数据集标识实体的标签不同,评测的方式也有所差异。因此,在初始化模型之前,需要先确定实际标签的形式,下方的`label_list`则是MSRA-NER数据集中使用的标签类别。 +如果用户使用的实体识别数据集的标签方式与MSRA-NER不同,则需要根据实际数据集自行确定。 +```python +label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"] +label_map = { + idx: label for idx, label in enumerate(label_list) +} +``` + +接下来创建任务所使用的`model`: ```python import paddlehub as hub -model = hub.Module(name='ernie_tiny', version='2.0.1', task='token-cls') +model = hub.Module(name='ernie_tiny', version='2.0.1', task='token-cls', label_map=label_map) ``` 其中,参数: @@ -40,7 +51,29 @@ model = hub.Module(name='ernie_tiny', version='2.0.1', task='token-cls') * `version`:module版本号 * `task`:fine-tune任务。此处为`token-cls`,表示序列标注任务。 -通过以上的一行代码,`model`初始化为一个适用于序列标注任务的模型,为ERNIE Tiny的预训练模型后拼接上一个输出token共享的全连接网络(Full Connected)。 +PaddleHub还提供BERT等模型可供选择,当前支持序列标注任务的模型对应的加载示例如下: + +模型名 | PaddleHub Module ---------------------------------- | :------: +ERNIE, Chinese | `hub.Module(name='ernie')` +ERNIE tiny, Chinese | `hub.Module(name='ernie_tiny')` +ERNIE 2.0 Base, English | `hub.Module(name='ernie_v2_eng_base')` +ERNIE 2.0 Large, English | `hub.Module(name='ernie_v2_eng_large')` +BERT-Base, Cased | `hub.Module(name='bert-base-cased')` +BERT-Base, Uncased | `hub.Module(name='bert-base-uncased')` +BERT-Large, Cased | `hub.Module(name='bert-large-cased')` +BERT-Large, Uncased | `hub.Module(name='bert-large-uncased')` +BERT-Base, Multilingual Cased | `hub.Module(name='bert-base-multilingual-cased')` +BERT-Base, Multilingual Uncased | `hub.Module(name='bert-base-multilingual-uncased')` +BERT-Base, Chinese | `hub.Module(name='bert-base-chinese')` +BERT-wwm, Chinese | `hub.Module(name='chinese-bert-wwm')` +BERT-wwm-ext, Chinese | `hub.Module(name='chinese-bert-wwm-ext')` +RoBERTa-wwm-ext, Chinese | `hub.Module(name='roberta-wwm-ext')` +RoBERTa-wwm-ext-large, Chinese | `hub.Module(name='roberta-wwm-ext-large')` +RBT3, Chinese | `hub.Module(name='rbt3')` +RBTL3, Chinese | `hub.Module(name='rbtl3')` + +通过以上的一行代码,`model`初始化为一个适用于序列标注任务的模型,为ERNIE Tiny的预训练模型后拼接上一个输出token共享的全连接网络(Fully Connected)。
![](https://ss1.bdstatic.com/70cFuXSh_Q1YnxGkpoWK1HF6hhy/it/u=224484727,3049769188&fm=15&gp=0.jpg) 以上图片来自于:https://arxiv.org/pdf/1810.04805.pdf @@ -49,9 +82,9 @@ model = hub.Module(name='ernie_tiny', version='2.0.1', task='token-cls') ```python train_dataset = hub.datasets.MSRA_NER( - tokenizer=model.get_tokenizer(tokenize_chinese_chars=True), max_seq_len=50, mode='train') + tokenizer=model.get_tokenizer(), max_seq_len=128, mode='train') dev_dataset = hub.datasets.MSRA_NER( - tokenizer=model.get_tokenizer(tokenize_chinese_chars=True), max_seq_len=50, mode='dev') + tokenizer=model.get_tokenizer(), max_seq_len=128, mode='dev') ``` * `tokenizer`:表示该module所需用到的tokenizer,其将对输入文本完成切词,并转化成module运行所需模型输入格式。 diff --git a/demo/sequence_labeling/train.py b/demo/sequence_labeling/train.py index 43a81fb4..3e26d20b 100644 --- a/demo/sequence_labeling/train.py +++ b/demo/sequence_labeling/train.py @@ -14,32 +14,60 @@ import paddle import paddlehub as hub +from paddlehub.datasets import MSRA_NER + +import ast +import argparse + +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--num_epoch", type=int, default=3, help="Number of epochs for fine-tuning.") +parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether to use GPU for fine-tuning; input should be True or False.") +parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") +parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest sequence.") +parser.add_argument("--batch_size", type=int, default=32, help="Total number of examples in one training batch.") +parser.add_argument("--checkpoint_dir", type=str, default='./checkpoint', help="Directory to save model checkpoints.") +parser.add_argument("--save_interval", type=int, default=1, help="Save a checkpoint every n epochs.") + +args = parser.parse_args() + if __name__ == '__main__': label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"] label_map = { idx: label for idx, label in enumerate(label_list) } + model = hub.Module( name='ernie_tiny', version='2.0.1', task='token-cls', - label_map=label_map, + label_map=label_map, # Required for token classification task ) - train_dataset = hub.datasets.MSRA_NER( - tokenizer=model.get_tokenizer(), - max_seq_len=128, + tokenizer = model.get_tokenizer() + train_dataset = MSRA_NER( + tokenizer=tokenizer, + max_seq_len=args.max_seq_len, mode='train' ) - - dev_dataset = hub.datasets.MSRA_NER( - tokenizer=model.get_tokenizer(), - max_seq_len=50, + dev_dataset = MSRA_NER( + tokenizer=tokenizer, + max_seq_len=args.max_seq_len, mode='dev' ) + test_dataset = MSRA_NER( + tokenizer=tokenizer, + max_seq_len=args.max_seq_len, + mode='test' + ) - optimizer = paddle.optimizer.AdamW(learning_rate=5e-5, parameters=model.parameters()) - trainer = hub.Trainer(model, optimizer, checkpoint_dir='token_cls_save_dir', use_gpu=True) - - trainer.train(train_dataset, epochs=3, batch_size=32, eval_dataset=dev_dataset, save_interval=1) + optimizer = paddle.optimizer.AdamW(learning_rate=args.learning_rate, parameters=model.parameters()) + trainer = hub.Trainer(model, optimizer, checkpoint_dir=args.checkpoint_dir, use_gpu=args.use_gpu) + trainer.train( + train_dataset, + epochs=args.num_epoch, + batch_size=args.batch_size, + eval_dataset=dev_dataset, + save_interval=args.save_interval, + ) + trainer.evaluate(test_dataset, batch_size=args.batch_size) diff --git a/modules/text/language_model/bert-base-cased/README.md
b/modules/text/language_model/bert-base-cased/README.md index 2d6aac86..f75cfd72 100644 --- a/modules/text/language_model/bert-base-cased/README.md +++ b/modules/text/language_model/bert-base-cased/README.md @@ -1,5 +1,5 @@ ```shell -$ hub install bert-base-cased==2.0.0 +$ hub install bert-base-cased==2.0.1 ```

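For orientation before the API reference below, here is a minimal fine-tuning sketch exercising the new `token-cls` task on this module. It simply transplants the demo `train.py` flow above onto `bert-base-cased`; the MSRA-NER labels and all hyperparameters are illustrative rather than values prescribed by this patch (MSRA-NER is a Chinese dataset, so for this cased English model you would normally substitute an English token-level dataset).

```python
import paddle
import paddlehub as hub

# BIO label scheme of the MSRA-NER dataset used in the demo above.
label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
label_map = {idx: label for idx, label in enumerate(label_list)}

# task='token-cls' attaches a token-level classification head sized by label_map.
model = hub.Module(name='bert-base-cased', version='2.0.1', task='token-cls', label_map=label_map)

tokenizer = model.get_tokenizer()
train_dataset = hub.datasets.MSRA_NER(tokenizer=tokenizer, max_seq_len=128, mode='train')
dev_dataset = hub.datasets.MSRA_NER(tokenizer=tokenizer, max_seq_len=128, mode='dev')

optimizer = paddle.optimizer.AdamW(learning_rate=5e-5, parameters=model.parameters())
trainer = hub.Trainer(model, optimizer, checkpoint_dir='./checkpoint', use_gpu=False)
trainer.train(train_dataset, epochs=3, batch_size=32, eval_dataset=dev_dataset, save_interval=1)
```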
@@ -14,23 +14,29 @@ $ hub install bert-base-cased==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** @@ -45,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -54,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -68,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='bert-base-cased', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -85,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -111,12 +123,12 @@ $ hub serving start -m bert-base-cased import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ... ] +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-base-cased" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -149,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-base-cased/module.py b/modules/text/language_model/bert-base-cased/module.py index 92a8b7d2..8b7b75d5 100644 --- a/modules/text/language_model/bert-base-cased/module.py +++ b/modules/text/language_model/bert-base-cased/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-base-cased", - version="2.0.0", + version="2.0.1", summary= "bert_cased_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ BERT model @@ -41,181 +43,80 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': - self.model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-base-cased') + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-base-cased', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained(pretrained_model_name_or_path='bert-base-cased', num_classes=self.num_classes, **kwargs) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-cased') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-cased', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, 
token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-base-cased', 'bert-base-cased-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-cased-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-base-cased')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. 
- max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - - encoded_inputs = tokenizer.encode(text, pad_to_max_seq_len=False) - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-base-cased', *args, **kwargs) diff --git a/modules/text/language_model/bert-base-chinese/README.md b/modules/text/language_model/bert-base-chinese/README.md index d13c35db..3d9d31de 100644 --- a/modules/text/language_model/bert-base-chinese/README.md +++ b/modules/text/language_model/bert-base-chinese/README.md @@ -1,6 +1,7 @@ ```shell -$ hub install bert-base-chinese==2.0.0 +$ hub install bert-base-chinese==2.0.1 ``` +


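The `token-cls` modules predict one BIO label per token. As a purely illustrative post-processing step (this `decode_bio` helper is hypothetical and not part of any module's API), the predicted labels can be folded back into entity spans:

```python
from typing import List, Tuple

def decode_bio(tokens: List[str], labels: List[str]) -> List[Tuple[str, str]]:
    """Fold per-token BIO labels into (entity_text, entity_type) spans."""
    spans, current, current_type = [], [], None
    for token, label in zip(tokens, labels):
        if label.startswith('B-'):  # a new entity opens
            if current:
                spans.append((''.join(current), current_type))
            current, current_type = [token], label[2:]
        elif label.startswith('I-') and current_type == label[2:]:  # entity continues
            current.append(token)
        else:  # 'O', or an I- tag that does not continue the open entity
            if current:
                spans.append((''.join(current), current_type))
            current, current_type = [], None
    if current:
        spans.append((''.join(current), current_type))
    return spans

# [('琦玉', 'PER'), ('埼玉', 'LOC')]
print(decode_bio(list("琦玉住在埼玉"), ["B-PER", "I-PER", "O", "O", "B-LOC", "I-LOC"]))
```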
@@ -13,29 +14,35 @@ $ hub install bert-base-chinese==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -44,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -53,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -67,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='bert-base-chinese', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -84,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -110,12 +123,12 @@ $ hub serving start -m bert-base-chinese import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ...
] +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-base-chinese" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -148,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-base-chinese/module.py b/modules/text/language_model/bert-base-chinese/module.py index f6c63c35..bb8cca19 100644 --- a/modules/text/language_model/bert-base-chinese/module.py +++ b/modules/text/language_model/bert-base-chinese/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-base-chinese", - version="2.0.0", + version="2.0.1", summary= "bert_chinese_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters.
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ Bert model @@ -41,181 +43,88 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': self.model = BertForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='bert-base-chinese') + pretrained_model_name_or_path='bert-base-chinese', + num_classes=self.num_classes, + **kwargs + ) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained( + pretrained_model_name_or_path='bert-base-chinese', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-chinese') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-chinese', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, pooled_output = 
result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-base-chinese', 'bert-base-chinese-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-base-chinese')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-base-chinese', *args, **kwargs) diff --git a/modules/text/language_model/bert-base-multilingual-cased/README.md b/modules/text/language_model/bert-base-multilingual-cased/README.md index cfeccad4..a6881ca2 100644 --- a/modules/text/language_model/bert-base-multilingual-cased/README.md +++ b/modules/text/language_model/bert-base-multilingual-cased/README.md @@ -1,6 +1,7 @@ ```shell -$ hub install bert-base-multilingual-cased==2.0.0 +$ hub install bert-base-multilingual-cased==2.0.1 ``` +


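Every module.py in this patch routes `token-cls` evaluation through `paddlenlp.metrics.ChunkEvaluator` inside `forward()`. The standalone sketch below replays that exact call sequence (`compute(None, seq_lengths, preds, labels)`, then `update(...)`, then `accumulate()`) on a toy batch; the argument layout is mirrored from this patch and may differ in other paddlenlp releases.

```python
import paddle
from paddlenlp.metrics import ChunkEvaluator

label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
metric = ChunkEvaluator(label_list=label_list)

seq_lengths = paddle.to_tensor([5])           # number of valid tokens per example
preds = paddle.to_tensor([[0, 1, 6, 4, 5]])   # argmax over the token-level logits
labels = paddle.to_tensor([[0, 1, 6, 4, 5]])  # gold tag ids, indexed into label_list

num_infer, num_label, num_correct = metric.compute(None, seq_lengths, preds, labels)
metric.update(num_infer.numpy(), num_label.numpy(), num_correct.numpy())
precision, recall, f1_score = map(float, metric.accumulate())
print(precision, recall, f1_score)  # 1.0 1.0 1.0 on this toy batch
```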
@@ -13,29 +14,35 @@ $ hub install bert-base-multilingual-cased==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -44,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -53,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -67,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='bert-base-multilingual-cased', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -84,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -110,12 +123,12 @@ $ hub serving start -m bert-base-multilingual-cased import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ...
] +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-base-multilingual-cased" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -148,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-base-multilingual-cased/module.py b/modules/text/language_model/bert-base-multilingual-cased/module.py index d164ba53..124a0ce4 100644 --- a/modules/text/language_model/bert-base-multilingual-cased/module.py +++ b/modules/text/language_model/bert-base-multilingual-cased/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-base-multilingual-cased", - version="2.0.0", + version="2.0.1", summary= "bert_multi_cased_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters.
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ BERT model @@ -41,181 +43,88 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': self.model = BertForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='bert-base-multilingual-cased') + pretrained_model_name_or_path='bert-base-multilingual-cased', + num_classes=self.num_classes, + **kwargs + ) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained( + pretrained_model_name_or_path='bert-base-multilingual-cased', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-multilingual-cased') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-multilingual-cased', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return 
token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-base-multilingual-cased', 'bert-base-multilingual-cased-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-cased-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-base-multilingual-cased')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-base-multilingual-cased', *args, **kwargs) diff --git a/modules/text/language_model/bert-base-multilingual-uncased/README.md b/modules/text/language_model/bert-base-multilingual-uncased/README.md index 4f083b93..079b2a2b 100644 --- a/modules/text/language_model/bert-base-multilingual-uncased/README.md +++ b/modules/text/language_model/bert-base-multilingual-uncased/README.md @@ -1,6 +1,7 @@ ```shell -$ hub install bert-base-multilingual-uncased==2.0.0 +$ hub install bert-base-multilingual-uncased==2.0.1 ``` +


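The README examples in this patch only demonstrate `predict` for `seq-cls`. A corresponding `token-cls` prediction sketch is shown below, following the same documented `predict` interface; `/path/to/parameters` is a placeholder for a checkpoint produced by `hub.Trainer`, the `label_map` is the demo's MSRA-NER scheme, and the per-token layout of the returned labels is an assumption rather than something this patch pins down.

```python
import paddlehub as hub

# Class id -> BIO tag, mirroring the demo's MSRA-NER label_list.
label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
label_map = {idx: label for idx, label in enumerate(label_list)}

model = hub.Module(
    name='bert-base-multilingual-uncased',
    version='2.0.1',
    task='token-cls',
    load_checkpoint='/path/to/parameters',  # placeholder checkpoint path
    label_map=label_map)

# One text per sample, matching the single-text format used during fine-tuning.
data = [['今天是个好日子'], ['天气预报说今天要下雨']]
results = model.predict(data, max_seq_len=128, batch_size=1, use_gpu=False)
for text, labels in zip(data, results):
    print(text[0], labels)  # per-token BIO labels (assumed output layout)
```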
@@ -13,29 +14,35 @@ $ hub install bert-base-multilingual-uncased==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -44,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -53,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -67,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='bert-base-multilingual-uncased', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -84,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -110,12 +123,12 @@ $ hub serving start -m bert-base-multilingual-uncased import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ...
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-base-multilingual-uncased" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -148,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-base-multilingual-uncased/module.py b/modules/text/language_model/bert-base-multilingual-uncased/module.py index 410a8129..c957d7e3 100644 --- a/modules/text/language_model/bert-base-multilingual-uncased/module.py +++ b/modules/text/language_model/bert-base-multilingual-uncased/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-base-multilingual-uncased", - version="2.0.0", + version="2.0.1", summary= "bert_multi_uncased_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters. 
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ BERT model @@ -41,182 +43,88 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': self.model = BertForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='bert-base-multilingual-uncased') + pretrained_model_name_or_path='bert-base-multilingual-uncased', + num_classes=self.num_classes, + **kwargs + ) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained( + pretrained_model_name_or_path='bert-base-multilingual-uncased', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-multilingual-uncased') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-multilingual-uncased', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + 
return token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-base-multilingual-uncased', - 'bert-base-multilingual-uncased-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-uncased-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-base-multilingual-uncased')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-base-multilingual-uncased', *args, **kwargs) diff --git a/modules/text/language_model/bert-base-uncased/README.md b/modules/text/language_model/bert-base-uncased/README.md index 2b2aa202..dfb5e864 100644 --- a/modules/text/language_model/bert-base-uncased/README.md +++ b/modules/text/language_model/bert-base-uncased/README.md @@ -1,6 +1,7 @@ ```shell -$ hub install bert-base-uncased==2.0.0 +$ hub install bert-base-uncased==2.0.1 ``` +
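The `token-cls` task added in this release pairs `BertForTokenClassification` with a `ChunkEvaluator`, so its `label_map` should use a chunkable tagging scheme. A sketch with a hypothetical BIO tag set follows; per-token post-processing lives in the shared `TransformerModule` rather than in this module.py, so the exact output layout of `predict` is assumed here.

```python
import paddlehub as hub

# Hypothetical BIO tag set; ChunkEvaluator derives entity chunks from it.
label_map = {0: 'B-PER', 1: 'I-PER', 2: 'B-ORG', 3: 'I-ORG', 4: 'O'}

model = hub.Module(
    name='bert-base-uncased',
    version='2.0.1',
    task='token-cls',     # sequence labeling; num_classes is len(label_map)
    label_map=label_map)

# Expected to yield one tag per tokenized position of each sample.
results = model.predict([['Barack Obama visited Paris']], max_seq_len=128, batch_size=1)
```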


@@ -13,29 +14,35 @@ $ hub install bert-base-uncased==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -44,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -53,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -67,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='bert-base-uncased', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -84,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -110,12 +123,12 @@ $ hub serving start -m bert-base-uncased import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... 
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-base-uncased" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -148,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-base-uncased/module.py b/modules/text/language_model/bert-base-uncased/module.py index 98c49b03..8c06ad34 100644 --- a/modules/text/language_model/bert-base-uncased/module.py +++ b/modules/text/language_model/bert-base-uncased/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-base-uncased", - version="2.0.0", + version="2.0.1", summary= "bert_uncased_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters. 
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ BERT model @@ -41,181 +43,80 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': - self.model = BertForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='bert-base-uncased') + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-base-uncased', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained(pretrained_model_name_or_path='bert-base-uncased', num_classes=self.num_classes, **kwargs) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-uncased') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-uncased', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return 
token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-base-uncased', 'bert-base-uncased-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-uncased-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-base-uncased')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-base-uncased', *args, **kwargs) diff --git a/modules/text/language_model/bert-large-cased/README.md b/modules/text/language_model/bert-large-cased/README.md index 54219b12..344d5441 100644 --- a/modules/text/language_model/bert-large-cased/README.md +++ b/modules/text/language_model/bert-large-cased/README.md @@ -1,6 +1,7 @@ ```shell -$ hub install bert-large-cased==2.0.0 +$ hub install bert-large-cased==2.0.1 ``` +
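The token-cls branch of `forward()` below drives the metric through `compute` → `update` → `accumulate`. The same call pattern can be exercised standalone; this toy sketch assumes the four-argument `ChunkEvaluator.compute(None, lengths, preds, labels)` form that the patched code relies on in the pinned paddlenlp version.

```python
import paddle
from paddlenlp.metrics import ChunkEvaluator

metric = ChunkEvaluator(label_list=['B-PER', 'I-PER', 'O'])

# Toy batch: one sequence of length 4; ids index into label_list above.
seq_lengths = paddle.to_tensor([4])
preds = paddle.to_tensor([[0, 1, 2, 2]])    # predicted tag ids
labels = paddle.to_tensor([[0, 1, 2, 2]])   # gold tag ids

num_infer, num_label, num_correct = metric.compute(None, seq_lengths, preds, labels)
metric.update(num_infer.numpy(), num_label.numpy(), num_correct.numpy())
precision, recall, f1 = map(float, metric.accumulate())
print(f1)  # 1.0: the prediction recovers the single gold PER chunk exactly
```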


@@ -13,29 +14,35 @@ $ hub install bert-large-cased==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -44,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -53,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -67,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='bert-large-cased', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -84,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -110,12 +123,12 @@ $ hub serving start -m bert-large-cased import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... 
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-large-cased" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -148,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-large-cased/module.py b/modules/text/language_model/bert-large-cased/module.py index 58b03010..d456b78f 100644 --- a/modules/text/language_model/bert-large-cased/module.py +++ b/modules/text/language_model/bert-large-cased/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-large-cased", - version="2.0.0", + version="2.0.1", summary= "bert_cased_L-24_H-1024_A-16, 24-layer, 1024-hidden, 16-heads, 340M parameters. 
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ BERT model @@ -41,180 +43,80 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': - self.model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-large-cased') + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-large-cased', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained(pretrained_model_name_or_path='bert-large-cased', num_classes=self.num_classes, **kwargs) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-large-cased') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-large-cased', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: 
sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-large-cased', 'bert-large-cased-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-cased-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-large-cased')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-large-cased', *args, **kwargs) diff --git a/modules/text/language_model/bert-large-uncased/README.md b/modules/text/language_model/bert-large-uncased/README.md index 4520f37b..e2964f85 100644 --- a/modules/text/language_model/bert-large-uncased/README.md +++ b/modules/text/language_model/bert-large-uncased/README.md @@ -1,6 +1,7 @@ ```shell -$ hub install bert-large-uncased==2.0.0 +$ hub install bert-large-uncased==2.0.1 ``` +
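When `task=None` the module wraps the bare `BertModel`, and `get_embedding` becomes available. A sketch following the `[pooled_feature, seq_feature]` per-sample layout documented in this README (the reworked implementation lives in the shared `TransformerModule`):

```python
import paddlehub as hub

module = hub.Module(name='bert-large-uncased', version='2.0.1', task=None)

# One- and two-text samples may be mixed, as with predict().
data = [['今天是个好日子'], ['今天是个好日子', '天气预报说今天要下雨']]
results = module.get_embedding(data, max_seq_len=128, batch_size=2, use_gpu=False)

for pooled_feature, seq_feature in results:
    # pooled_feature: one sentence-level vector (hidden size 1024 for bert-large)
    # seq_feature: one vector per token position
    print(len(pooled_feature), len(seq_feature))
```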


@@ -13,29 +14,35 @@ $ hub install bert-large-uncased==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -44,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -53,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -67,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( - name='bert-large-画丶cased', - version='2.0.0', - task='sequence_classification', + name='bert-large-uncased', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -84,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -110,12 +123,12 @@ $ hub serving start -m bert-large-uncased import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... 
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-large-uncased" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -148,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-large-uncased/module.py b/modules/text/language_model/bert-large-uncased/module.py index 57020f07..cedcba1d 100644 --- a/modules/text/language_model/bert-large-uncased/module.py +++ b/modules/text/language_model/bert-large-uncased/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-large-uncased", - version="2.0.0", + version="2.0.1", summary= "bert_uncased_L-24_H-1024_A-16, 24-layer, 1024-hidden, 16-heads, 340M parameters. 
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ BERT model @@ -41,181 +43,80 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': - self.model = BertForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='bert-large-uncased') + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-large-uncased', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained(pretrained_model_name_or_path='bert-large-uncased', num_classes=self.num_classes, **kwargs) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-large-uncased') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-large-uncased', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return 
token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-large-uncased', 'bert-large-uncased-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-uncased-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-large-uncased')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-large-uncased', *args, **kwargs) diff --git a/modules/text/language_model/chinese_bert_wwm/README.md b/modules/text/language_model/chinese_bert_wwm/README.md index 96ae17ac..6f0460c5 100644 --- a/modules/text/language_model/chinese_bert_wwm/README.md +++ b/modules/text/language_model/chinese_bert_wwm/README.md @@ -1,5 +1,5 @@ ```shell -$ hub install chinese-bert-wwm==1.0.0 +$ hub install chinese-bert-wwm==2.0.1 ```
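This README drops the static-graph `context()` workflow in favour of the dynamic-graph constructor, so fine-tuning now goes through `hub.Trainer` rather than strategies such as ULMFiTStrategy. Below is a sequence-labeling sketch patterned on the demo/sequence_labeling scripts this patch updates; the dataset and trainer signatures are assumed from that demo.

```python
import paddle
import paddlehub as hub
from paddlehub.datasets import MSRA_NER

# MSRA-NER BIO label set, as used by the updated demo.
label_list = ['B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'O']
label_map = {i: label for i, label in enumerate(label_list)}

model = hub.Module(name='chinese-bert-wwm', version='2.0.1', task='token-cls', label_map=label_map)

train_dataset = MSRA_NER(tokenizer=model.get_tokenizer(), max_seq_len=128, mode='train')

optimizer = paddle.optimizer.AdamW(learning_rate=5e-5, parameters=model.parameters())
trainer = hub.Trainer(model, optimizer, checkpoint_dir='ckpt_chinese_bert_wwm', use_gpu=False)
trainer.train(train_dataset, epochs=3, batch_size=32, save_interval=1)
```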


@@ -9,56 +9,50 @@ $ hub install chinese-bert-wwm==1.0.0 ## API ```python -def context( - trainable=True, - max_seq_len=128 +def __init__( + task=None, + load_checkpoint=None, + label_map=None, + num_classes=2, + **kwargs, ) ``` -用于获取Module的上下文信息,得到输入、输出以及预训练的Paddle Program副本 -**参数** -```shell -$ hub install chinese-bert-wwm==1.0.0 -``` -

-
-

+创建Module对象(动态图组网版本)。 -更多详情请参考[BERT论文](https://arxiv.org/abs/1810.04805), [Chinese-BERT-wwm技术报告](https://arxiv.org/abs/1906.08101) +**参数** + +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 +* `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 +* `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 -## API ```python -def context( - trainable=True, - max_seq_len=128 +def predict( + data, + max_seq_len=128, + batch_size=1, + use_gpu=False ) ``` -用于获取Module的上下文信息,得到输入、输出以及预训练的Paddle Program副本 **参数** -> trainable:设置为True时,Module中的参数在Fine-tune时也会随之训练,否则保持不变。 -> max_seq_len:BERT模型的最大序列长度,若序列长度不足,会通过padding方式补到**max_seq_len**, 若序列长度大于该值,则会以截断方式让序列长度为**max_seq_len**,max_seq_len可取值范围为0~512; +* `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 +* `max_seq_len`:模型处理文本的最大长度 +* `batch_size`:模型批处理大小 +* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** -> inputs:dict类型,有以下字段: -> >**position_ids**存放输入文本tokenize后各token所在该文本的位置,shape为\[batch_size, max_seq_len\],int64类型; -> >**segment_ids**存放各token所在文本的标识(token属于文本1或者文本2),shape为\[batch_size, max_seq_len\],int64类型; -> >**input_mask**存放token是否为padding的标识,shape为\[batch_size, max_seq_len\],int64类型; -> -> outputs:dict类型,Module的输出特征,有以下字段: -> >**pooled_output**字段存放句子粒度的特征,可用于文本分类等任务,shape为 \[batch_size, 768\],int64类型; -> >**sequence_output**字段存放字粒度的特征,可用于序列标注等任务,shape为 \[batch_size, seq_len, 768\],int64类型; -> -> program:包含该Module计算图的Program。 - - ```python def get_embedding( - texts, - use_gpu=False, - batch_size=1 + data, + max_seq_len=128, + batch_size=1, + use_gpu=False ) ``` @@ -66,159 +60,79 @@ def get_embedding( **参数** -> texts:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 -> use_gpu:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 +* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** -> results:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。 -> - -```python -def get_params_layer() -``` - -用于获取参数层信息,该方法与ULMFiTStrategy联用可以严格按照层数设置分层学习率与逐层解冻。 +* `results`:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。 -**参数** - -> 无 - -**返回** - -> params_layer:dict类型,key为参数名,值为参数所在层数 **代码示例** ```python import paddlehub as hub -# Load $ hub install chinese-bert-wwm pretrained model -module = hub.Module(name="chinese-bert-wwm") -inputs, outputs, program = module.context(trainable=True, max_seq_len=128) - -# Must feed all the tensor of chinese-bert-wwm's module need -input_ids = inputs["input_ids"] -position_ids = inputs["position_ids"] -segment_ids = inputs["segment_ids"] -input_mask = inputs["input_mask"] - -# Use "pooled_output" for sentence-level output. -pooled_output = outputs["pooled_output"] - -# Use "sequence_output" for token-level output. -sequence_output = outputs["sequence_output"] - -# Use "get_embedding" to get embedding result. 
-embedding_result = module.get_embedding(texts=[["Sample1_text_a"],["Sample2_text_a","Sample2_text_b"]], use_gpu=True)
-
-# Use "get_params_layer" to get params layer and used to ULMFiTStrategy.
-params_layer = module.get_params_layer()
-strategy = hub.finetune.strategy.ULMFiTStrategy(frz_params_layer=params_layer, dis_params_layer=params_layer)
+data = [
+    ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'],
+    ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'],
+    ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'],
+]
+label_map = {0: 'negative', 1: 'positive'}
+
+model = hub.Module(
+    name='chinese-bert-wwm',
+    version='2.0.1',
+    task='seq-cls',
+    load_checkpoint='/path/to/parameters',
+    label_map=label_map)
+results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
+for idx, text in enumerate(data):
+    print('Data: {} \t Label: {}'.format(text, results[idx]))
 ```
 
-## 查看代码
-https://github.com/ymcui/Chinese-BERT-wwm
-
+详情可参考PaddleHub示例:
+- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)
 
-## 贡献者
+## 服务部署
 
-[ymcui](https://github.com/ymcui)
+PaddleHub Serving可以部署一个在线获取预训练词向量的服务。
 
-## 依赖
+### Step1: 启动PaddleHub Serving
 
-paddlepaddle >= 1.6.2
+运行启动命令:
 
-paddlehub >= 1.6.0
-
-## 更新历史
-
-* 1.0.0
-
-  初始发布
-
-* 1.0.0
-
-  支持get_embedding与get_params_layer
-
-> trainable:设置为True时,Module中的参数在Fine-tune时也会随之训练,否则保持不变。
-> max_seq_len:BERT模型的最大序列长度,若序列长度不足,会通过padding方式补到**max_seq_len**, 若序列长度大于该值,则会以截断方式让序列长度为**max_seq_len**,max_seq_len可取值范围为0~512;
-
-**返回**
-> inputs:dict类型,有以下字段:
-> >**position_ids**存放输入文本tokenize后各token所在该文本的位置,shape为\[batch_size, max_seq_len\],int64类型;
-> >**segment_ids**存放各token所在文本的标识(token属于文本1或者文本2),shape为\[batch_size, max_seq_len\],int64类型;
-> >**input_mask**存放token是否为padding的标识,shape为\[batch_size, max_seq_len\],int64类型;
->
-> outputs:dict类型,Module的输出特征,有以下字段:
-> >**pooled_output**字段存放句子粒度的特征,可用于文本分类等任务,shape为 \[batch_size, 768\],int64类型;
-> >**sequence_output**字段存放字粒度的特征,可用于序列标注等任务,shape为 \[batch_size, seq_len, 768\],int64类型;
->
-> program:包含该Module计算图的Program。
-
-
-
-```python
-def get_embedding(
-    texts,
-    use_gpu=False,
-    batch_size=1
-)
-```
-
-用于获取输入文本的句子粒度特征与字粒度特征
-
-**参数**
-
-> texts:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
-> use_gpu:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
-
-**返回**
-
-> results:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。
->
-
-```python
-def get_params_layer()
+```shell
+$ hub serving start -m chinese-bert-wwm
 ```
 
-用于获取参数层信息,该方法与ULMFiTStrategy联用可以严格按照层数设置分层学习率与逐层解冻。
+这样就完成了一个获取预训练词向量服务化API的部署,默认端口号为8866。
 
-**参数**
-
-> 无
-
-**返回**
+**NOTE:** 如使用GPU预测,则需要在启动服务之前设置CUDA_VISIBLE_DEVICES环境变量,否则无需设置。
 
-> params_layer:dict类型,key为参数名,值为参数所在层数
+### Step2: 发送预测请求
 
-**代码示例**
+配置好服务端后,以下几行代码即可发送预测请求,获取预测结果
 
 ```python
-import paddlehub as hub
-
-# Load $ hub install chinese-bert-wwm pretrained model
-module = hub.Module(name="chinese-bert-wwm")
-inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
-
-# Must feed all the tensor of chinese-bert-wwm's module need
-input_ids = inputs["input_ids"]
-position_ids = inputs["position_ids"]
-segment_ids = inputs["segment_ids"]
-input_mask = inputs["input_mask"]
-
-# Use "pooled_output" for sentence-level output.
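# The example above covers `seq-cls`; `token-cls` prediction follows the same
# pattern but returns one tag per input token. A sketch assuming a checkpoint
# fine-tuned on a BIO-tagged NER set; the tag map and path are placeholders.
import paddlehub as hub

label_map = {0: 'B-PER', 1: 'I-PER', 2: 'B-ORG', 3: 'I-ORG', 4: 'B-LOC', 5: 'I-LOC', 6: 'O'}

model = hub.Module(
    name='chinese-bert-wwm',
    version='2.0.1',
    task='token-cls',
    load_checkpoint='/path/to/token_cls_parameters',  # placeholder path
    label_map=label_map)

data = [['今天是个好日子'], ['天气预报说今天要下雨']]
results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
for (text,), tags in zip(data, results):
    print(text, tags)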
-pooled_output = outputs["pooled_output"]
-
-# Use "sequence_output" for token-level output.
-sequence_output = outputs["sequence_output"]
-
-# Use "get_embedding" to get embedding result.
-embedding_result = module.get_embedding(texts=[["Sample1_text_a"],["Sample2_text_a","Sample2_text_b"]], use_gpu=True)
-
-# Use "get_params_layer" to get params layer and used to ULMFiTStrategy.
-params_layer = module.get_params_layer()
-strategy = hub.finetune.strategy.ULMFiTStrategy(frz_params_layer=params_layer, dis_params_layer=params_layer)
+import requests
+import json
+
+# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ... ]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# 以key的方式指定text传入预测方法时的参数,此例中为"data"
+# 对应本地部署,则为module.get_embedding(data=text)
+data = {"data": text}
+# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip
+url = "http://10.12.121.132:8866/predict/chinese-bert-wwm"
+# 指定post请求的headers为application/json方式
+headers = {"Content-Type": "application/json"}
+
+r = requests.post(url=url, headers=headers, data=json.dumps(data))
+print(r.json())
 ```
 
 ## 查看代码
@@ -231,12 +145,16 @@ https://github.com/ymcui/Chinese-BERT-wwm
 
 ## 依赖
 
-paddlepaddle >= 1.6.2
+paddlepaddle >= 2.0.0
 
-paddlehub >= 1.6.0
+paddlehub >= 2.0.0
 
 ## 更新历史
 
 * 1.0.0
 
   初始发布
+
+* 2.0.1
+
+  全面升级动态图,接口有所变化。任务名称调整,增加序列标注任务`token-cls`
diff --git a/modules/text/language_model/chinese_bert_wwm/model/__init__.py b/modules/text/language_model/chinese_bert_wwm/model/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modules/text/language_model/chinese_bert_wwm/model/bert.py b/modules/text/language_model/chinese_bert_wwm/model/bert.py
deleted file mode 100644
index 819bdbad..00000000
--- a/modules/text/language_model/chinese_bert_wwm/model/bert.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
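# For comparison, the embeddings served above can also be computed locally by
# calling get_embedding() directly. A minimal sketch based on the signature
# documented in this README; per the docs, each result pairs a pooled
# sentence-level vector with one vector per token.
import paddlehub as hub

module = hub.Module(name='chinese-bert-wwm', version='2.0.1', task=None)  # bare encoder
results = module.get_embedding(
    data=[['今天是个好日子'], ['天气预报说今天要下雨']],
    max_seq_len=128,
    batch_size=1,
    use_gpu=False)

for pooled_feature, seq_feature in results:
    print(len(pooled_feature), len(seq_feature))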
-"""BERT model.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import six -import json - -import paddle.fluid as fluid - -from chinese_bert_wwm.model.transformer_encoder import encoder, pre_process_layer - - -class BertConfig(object): - def __init__(self, config_path): - self._config_dict = self._parse(config_path) - - def _parse(self, config_path): - try: - with open(config_path) as json_file: - config_dict = json.load(json_file) - except Exception: - raise IOError("Error in parsing bert model config file '%s'" % config_path) - else: - return config_dict - - def __getitem__(self, key): - return self._config_dict[key] - - def print_config(self): - for arg, value in sorted(six.iteritems(self._config_dict)): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -class BertModel(object): - def __init__(self, src_ids, position_ids, sentence_ids, input_mask, config, weight_sharing=True, use_fp16=False): - - self._emb_size = config['hidden_size'] - self._n_layer = config['num_hidden_layers'] - self._n_head = config['num_attention_heads'] - self._voc_size = config['vocab_size'] - self._max_position_seq_len = config['max_position_embeddings'] - self._sent_types = config['type_vocab_size'] - self._hidden_act = config['hidden_act'] - self._prepostprocess_dropout = config['hidden_dropout_prob'] - self._attention_dropout = config['attention_probs_dropout_prob'] - self._weight_sharing = weight_sharing - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - self._dtype = "float16" if use_fp16 else "float32" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. 
- self._param_initializer = fluid.initializer.TruncatedNormal(scale=config['initializer_range']) - - self._build_model(src_ids, position_ids, sentence_ids, input_mask) - - def _build_model(self, src_ids, position_ids, sentence_ids, input_mask): - # padding id in vocabulary must be set to 0 - emb_out = fluid.layers.embedding(input=src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._word_emb_name, - initializer=self._param_initializer), - is_sparse=False) - position_emb_out = fluid.layers.embedding(input=position_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._pos_emb_name, - initializer=self._param_initializer)) - - sent_emb_out = fluid.layers.embedding(sentence_ids, - size=[self._sent_types, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._sent_emb_name, - initializer=self._param_initializer)) - - emb_out = emb_out + position_emb_out - emb_out = emb_out + sent_emb_out - - emb_out = pre_process_layer(emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder') - - if self._dtype == "float16": - input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype) - - self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True) - self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) - n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1) - n_head_self_attn_mask.stop_gradient = True - - self._enc_out = encoder(enc_input=emb_out, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._emb_size // self._n_head, - d_value=self._emb_size // self._n_head, - d_model=self._emb_size, - d_inner_hid=self._emb_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - hidden_act=self._hidden_act, - preprocess_cmd="", - postprocess_cmd="dan", - param_initializer=self._param_initializer, - name='encoder') - - def get_sequence_output(self): - return self._enc_out - - def get_pooled_output(self): - """Get the first feature of each sequence for classification""" - - next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1]) - next_sent_feat = fluid.layers.fc(input=next_sent_feat, - size=self._emb_size, - act="tanh", - param_attr=fluid.ParamAttr(name="pooled_fc.w_0", - initializer=self._param_initializer), - bias_attr="pooled_fc.b_0") - return next_sent_feat - - def get_pretraining_output(self, mask_label, mask_pos, labels): - """Get the loss & accuracy for pretraining""" - - mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') - - # extract the first token feature in each sentence - next_sent_feat = self.get_pooled_output() - reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size]) - # extract masked tokens' feature - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) - - # transform: fc - mask_trans_feat = fluid.layers.fc(input=mask_feat, - size=self._emb_size, - act=self._hidden_act, - param_attr=fluid.ParamAttr(name='mask_lm_trans_fc.w_0', - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) - # transform: layer norm - mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name='mask_lm_trans') - - mask_lm_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0", - 
initializer=fluid.initializer.Constant(value=0.0)) - if self._weight_sharing: - fc_out = fluid.layers.matmul(x=mask_trans_feat, - y=fluid.default_main_program().global_block().var(self._word_emb_name), - transpose_y=True) - fc_out += fluid.layers.create_parameter(shape=[self._voc_size], - dtype=self._dtype, - attr=mask_lm_out_bias_attr, - is_bias=True) - - else: - fc_out = fluid.layers.fc(input=mask_trans_feat, - size=self._voc_size, - param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0", - initializer=self._param_initializer), - bias_attr=mask_lm_out_bias_attr) - - mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label) - mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss) - - next_sent_fc_out = fluid.layers.fc(input=next_sent_feat, - size=2, - param_attr=fluid.ParamAttr(name="next_sent_fc.w_0", - initializer=self._param_initializer), - bias_attr="next_sent_fc.b_0") - - next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(logits=next_sent_fc_out, - label=labels, - return_softmax=True) - - next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels) - - mean_next_sent_loss = fluid.layers.mean(next_sent_loss) - - loss = mean_next_sent_loss + mean_mask_lm_loss - return next_sent_acc, mean_mask_lm_loss, loss diff --git a/modules/text/language_model/chinese_bert_wwm/model/transformer_encoder.py b/modules/text/language_model/chinese_bert_wwm/model/transformer_encoder.py deleted file mode 100644 index b15d8388..00000000 --- a/modules/text/language_model/chinese_bert_wwm/model/transformer_encoder.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Transformer encoder.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from functools import partial - -import paddle.fluid as fluid -import paddle.fluid.layers as layers - - -def multi_head_attention(queries, - keys, - values, - attn_bias, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - param_initializer=None, - name='multi_head_att'): - """ - Multi-Head Attention. Note that attn_bias is added to the logit before - computing softmax activiation to mask certain selected positions so that - they will not considered in attention weights. - """ - keys = queries if keys is None else keys - values = keys if values is None else values - - if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): - raise ValueError("Inputs: quries, keys and values should all be 3-D tensors.") - - def __compute_qkv(queries, keys, values, n_head, d_key, d_value): - """ - Add linear projection to queries, keys, and values. 
- """ - q = layers.fc(input=queries, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_query_fc.w_0', initializer=param_initializer), - bias_attr=name + '_query_fc.b_0') - k = layers.fc(input=keys, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_key_fc.w_0', initializer=param_initializer), - bias_attr=name + '_key_fc.b_0') - v = layers.fc(input=values, - size=d_value * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_value_fc.w_0', initializer=param_initializer), - bias_attr=name + '_value_fc.b_0') - return q, k, v - - def __split_heads(x, n_head): - """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor - with shape [bs, n_head, max_sequence_length, hidden_dim]. - """ - hidden_size = x.shape[-1] - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) - - # permuate the dimensions into: - # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) - - def __combine_heads(x): - """ - Transpose and then reshape the last two dimensions of inpunt tensor x - so that it becomes one dimension, which is reverse to __split_heads. - """ - if len(x.shape) == 3: return x - if len(x.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout(weights, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. - k = cache["k"] = layers.concat([layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1) - v = cache["v"] = layers.concat([layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. 
- proj_out = layers.fc(input=out, - size=d_model, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_output_fc.w_0', initializer=param_initializer), - bias_attr=name + '_output_fc.b_0') - return proj_out - - -def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name='ffn'): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. - """ - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act=hidden_act, - param_attr=fluid.ParamAttr(name=name + '_fc_0.w_0', initializer=param_initializer), - bias_attr=name + '_fc_0.b_0') - if dropout_rate: - hidden = layers.dropout(hidden, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.fc(input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_fc_1.w_0', initializer=param_initializer), - bias_attr=name + '_fc_1.b_0') - return out - - -def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., name=''): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out_dtype = out.dtype - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float32") - out = layers.layer_norm(out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.ParamAttr(name=name + '_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr(name=name + '_layer_norm_bias', - initializer=fluid.initializer.Constant(0.))) - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float16") - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout(out, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. 
- """ - attn_output = multi_head_attention(pre_process_layer(enc_input, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_att'), - None, - None, - attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - param_initializer=param_initializer, - name=name + '_multi_head_att') - attn_output = post_process_layer(enc_input, - attn_output, - postprocess_cmd, - prepostprocess_dropout, - name=name + '_post_att') - ffd_output = positionwise_feed_forward(pre_process_layer(attn_output, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_ffn'), - d_inner_hid, - d_model, - relu_dropout, - hidden_act, - param_initializer=param_initializer, - name=name + '_ffn') - return post_process_layer(attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + '_post_ffn') - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer. - """ - for i in range(n_layer): - enc_output = encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - param_initializer=param_initializer, - name=name + '_layer_' + str(i)) - enc_input = enc_output - enc_output = pre_process_layer(enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder") - - return enc_output diff --git a/modules/text/language_model/chinese_bert_wwm/module.py b/modules/text/language_model/chinese_bert_wwm/module.py index 70ea366d..3ee03088 100644 --- a/modules/text/language_model/chinese_bert_wwm/module.py +++ b/modules/text/language_model/chinese_bert_wwm/module.py @@ -1,7 +1,6 @@ -# coding:utf-8 -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the "License" +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -12,62 +11,121 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
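# The rewritten dygraph module below restores fine-tuned weights through a
# plain paddle.load()/set_state_dict() pair in its `load_checkpoint` branch.
# A sketch of the full save/restore round trip; the file name is a placeholder.
import paddle
import paddlehub as hub

model = hub.Module(name='chinese-bert-wwm', version='2.0.1', task='seq-cls')
# ... fine-tune the model here ...

paddle.save(model.state_dict(), 'bert_wwm_seq_cls.pdparams')  # placeholder name

restored = hub.Module(
    name='chinese-bert-wwm',
    version='2.0.1',
    task='seq-cls',
    load_checkpoint='bert_wwm_seq_cls.pdparams')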
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - +from typing import Dict import os +import math -from paddlehub import TransformerModule -from paddlehub.module.module import moduleinfo +import paddle +import paddle.nn as nn +import paddle.nn.functional as F -from chinese_bert_wwm.model.bert import BertConfig, BertModel +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule +from paddlehub.utils.log import logger @moduleinfo( name="chinese-bert-wwm", - version="1.0.0", - summary="chinese-bert-wwm, 12-layer, 768-hidden, 12-heads, 110M parameters ", + version="2.0.1", + summary= + "chinese-bert-wwm, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.", author="ymcui", author_email="ymcui@ir.hit.edu.cn", type="nlp/semantic_model", + meta=TransformerModule ) -class BertWwm(TransformerModule): - def _initialize(self): - self.MAX_SEQ_LEN = 512 - self.params_path = os.path.join(self.directory, "assets", "params") - self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt") +class BertWwm(nn.Layer): + """ + BertWwm model + """ - bert_config_path = os.path.join(self.directory, "assets", "bert_config.json") - self.bert_config = BertConfig(bert_config_path) + def __init__( + self, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, + ): + super(BertWwm, self).__init__() + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes - def net(self, input_ids, position_ids, segment_ids, input_mask): - """ - create neural network. + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = BertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path='bert-wwm-chinese', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained( + pretrained_model_name_or_path='bert-wwm-chinese', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) + elif task is None: + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-wwm-chinese', **kwargs) + else: + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) - Args: - input_ids (tensor): the word ids. - position_ids (tensor): the position ids. - segment_ids (tensor): the segment ids. - input_mask (tensor): the padding mask. + self.task = task - Returns: - pooled_output (tensor): sentence-level output for classification task. - sequence_output (tensor): token-level output for sequence task. 
- """ - bert = BertModel(src_ids=input_ids, - position_ids=position_ids, - sentence_ids=segment_ids, - input_mask=input_mask, - config=self.bert_config, - use_fp16=False) - pooled_output = bert.get_pooled_output() - sequence_output = bert.get_sequence_output() - return pooled_output, sequence_output + if load_checkpoint is not None and os.path.isfile(load_checkpoint): + state_dict = paddle.load(load_checkpoint) + self.set_state_dict(state_dict) + logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + if self.task == 'seq-cls': + logits = result + probs = F.softmax(logits, axis=1) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs + else: + sequence_output, pooled_output = result + return sequence_output, pooled_output -if __name__ == '__main__': - test_module = BertWwm() + @staticmethod + def get_tokenizer(*args, **kwargs): + """ + Gets the tokenizer that is customized for this module. + """ + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-wwm-chinese', *args, **kwargs) diff --git a/modules/text/language_model/chinese_bert_wwm_ext/README.md b/modules/text/language_model/chinese_bert_wwm_ext/README.md index 79d742a5..03357e3c 100644 --- a/modules/text/language_model/chinese_bert_wwm_ext/README.md +++ b/modules/text/language_model/chinese_bert_wwm_ext/README.md @@ -1,5 +1,5 @@ ```shell -$ hub install chinese-bert-wwm-ext==1.0.0 +$ hub install chinese-bert-wwm-ext==2.0.1 ```
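Since the rewritten modules are plain `paddle.nn.Layer`s, they plug directly into the PaddleHub 2.0 dygraph Fine-tune API. The sketch below follows the text-classification demo linked in these READMEs; it assumes the `hub.datasets.ChnSentiCorp` wrapper and `hub.Trainer` behave as in that demo, and the hyper-parameters are illustrative.

```python
import paddle
import paddlehub as hub

model = hub.Module(name='chinese-bert-wwm-ext', version='2.0.1', task='seq-cls', num_classes=2)

# Dataset wrappers as used in the PaddleHub text-classification demo.
train_dataset = hub.datasets.ChnSentiCorp(tokenizer=model.get_tokenizer(), max_seq_len=128, mode='train')
dev_dataset = hub.datasets.ChnSentiCorp(tokenizer=model.get_tokenizer(), max_seq_len=128, mode='dev')

optimizer = paddle.optimizer.AdamW(learning_rate=5e-5, parameters=model.parameters())
trainer = hub.Trainer(model, optimizer, checkpoint_dir='bert_wwm_ext_seq_cls', use_gpu=False)
trainer.train(train_dataset, epochs=3, batch_size=32, eval_dataset=dev_dataset, save_interval=1)
```

Checkpoints written by the trainer can then be handed back to the module through `load_checkpoint` for prediction.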


@@ -9,94 +9,130 @@ $ hub install chinese-bert-wwm-ext==1.0.0 ## API ```python -def context( - trainable=True, - max_seq_len=128 +def __init__( + task=None, + load_checkpoint=None, + label_map=None, + num_classes=2, + **kwargs, ) ``` -用于获取Module的上下文信息,得到输入、输出以及预训练的Paddle Program副本 -**参数** - -> trainable:设置为True时,Module中的参数在Fine-tune时也会随之训练,否则保持不变。 -> max_seq_len:BERT模型的最大序列长度,若序列长度不足,会通过padding方式补到**max_seq_len**, 若序列长度大于该值,则会以截断方式让序列长度为**max_seq_len**,max_seq_len可取值范围为0~512; - -**返回** -> inputs:dict类型,有以下字段: -> >**input_ids**存放输入文本tokenize后各token对应BERT词汇表的word ids, shape为\[batch_size, max_seq_len\],int64类型; -> >**position_ids**存放输入文本tokenize后各token所在该文本的位置,shape为\[batch_size, max_seq_len\],int64类型; -> >**segment_ids**存放各token所在文本的标识(token属于文本1或者文本2),shape为\[batch_size, max_seq_len\],int64类型; -> >**input_mask**存放token是否为padding的标识,shape为\[batch_size, max_seq_len\],int64类型; -> -> outputs:dict类型,Module的输出特征,有以下字段: -> >**pooled_output**字段存放句子粒度的特征,可用于文本分类等任务,shape为 \[batch_size, 768\],int64类型; -> >**sequence_output**字段存放字粒度的特征,可用于序列标注等任务,shape为 \[batch_size, seq_len, 768\],int64类型; -> -> program:包含该Module计算图的Program。 +创建Module对象(动态图组网版本)。 +**参数** +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 +* `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 +* `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python -def get_embedding( - texts, - use_gpu=False, - batch_size=1 +def predict( + data, + max_seq_len=128, + batch_size=1, + use_gpu=False ) ``` -用于获取输入文本的句子粒度特征与字粒度特征 - **参数** -> texts:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 -> use_gpu:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 +* `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 +* `max_seq_len`:模型处理文本的最大长度 +* `batch_size`:模型批处理大小 +* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** -> results:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。 -> - ```python -def get_params_layer() +def get_embedding( + data, + max_seq_len=128, + batch_size=1, + use_gpu=False +) ``` -用于获取参数层信息,该方法与ULMFiTStrategy联用可以严格按照层数设置分层学习率与逐层解冻。 +用于获取输入文本的句子粒度特征与字粒度特征 **参数** -> 无 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 +* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** -> params_layer:dict类型,key为参数名,值为参数所在层数 +* `results`:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。 + **代码示例** ```python import paddlehub as hub -# Load $ hub install chinese-bert-wwm-ext pretrained model -module = hub.Module(name="chinese-bert-wwm-ext") -inputs, outputs, program = module.context(trainable=True, max_seq_len=128) +data = [ + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], +] +label_map = {0: 'negative', 1: 'positive'} + +model = hub.Module( + name='chinese-bert-wwm-ext', + version='2.0.1', + task='seq-cls', + 
load_checkpoint='/path/to/parameters',
+    label_map=label_map)
+results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
+for idx, text in enumerate(data):
+    print('Data: {} \t Label: {}'.format(text, results[idx]))
+```
 
-# Must feed all the tensor of chinese-bert-wwm-ext's module need
-input_ids = inputs["input_ids"]
-position_ids = inputs["position_ids"]
-segment_ids = inputs["segment_ids"]
-input_mask = inputs["input_mask"]
+详情可参考PaddleHub示例:
+- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)
 
-# Use "pooled_output" for sentence-level output.
-pooled_output = outputs["pooled_output"]
+## 服务部署
 
-# Use "sequence_output" for token-level output.
-sequence_output = outputs["sequence_output"]
+PaddleHub Serving可以部署一个在线获取预训练词向量的服务。
 
-# Use "get_embedding" to get embedding result.
-embedding_result = module.get_embedding(texts=[["Sample1_text_a"],["Sample2_text_a","Sample2_text_b"]], use_gpu=True)
+### Step1: 启动PaddleHub Serving
 
-# Use "get_params_layer" to get params layer and used to ULMFiTStrategy.
-params_layer = module.get_params_layer()
-strategy = hub.finetune.strategy.ULMFiTStrategy(frz_params_layer=params_layer, dis_params_layer=params_layer)
+运行启动命令:
+
+```shell
+$ hub serving start -m chinese-bert-wwm-ext
+```
+
+这样就完成了一个获取预训练词向量服务化API的部署,默认端口号为8866。
+
+**NOTE:** 如使用GPU预测,则需要在启动服务之前设置CUDA_VISIBLE_DEVICES环境变量,否则无需设置。
+
+### Step2: 发送预测请求
+
+配置好服务端后,以下几行代码即可发送预测请求,获取预测结果
+
+```python
+import requests
+import json
+
+# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ... ]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# 以key的方式指定text传入预测方法时的参数,此例中为"data"
+# 对应本地部署,则为module.get_embedding(data=text)
+data = {"data": text}
+# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip
+url = "http://10.12.121.132:8866/predict/chinese-bert-wwm-ext"
+# 指定post请求的headers为application/json方式
+headers = {"Content-Type": "application/json"}
+
+r = requests.post(url=url, headers=headers, data=json.dumps(data))
+print(r.json())
 ```
 
 ## 查看代码
@@ -109,12 +145,16 @@ https://github.com/ymcui/Chinese-BERT-wwm
 
 ## 依赖
 
-paddlepaddle >= 1.6.2
+paddlepaddle >= 2.0.0
 
-paddlehub >= 1.6.0
+paddlehub >= 2.0.0
 
 ## 更新历史
 
 * 1.0.0
 
   初始发布
+
+* 2.0.1
+
+  全面升级动态图,接口有所变化。任务名称调整,增加序列标注任务`token-cls`
diff --git a/modules/text/language_model/chinese_bert_wwm_ext/model/__init__.py b/modules/text/language_model/chinese_bert_wwm_ext/model/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modules/text/language_model/chinese_bert_wwm_ext/model/bert.py b/modules/text/language_model/chinese_bert_wwm_ext/model/bert.py
deleted file mode 100644
index cf2a32c1..00000000
--- a/modules/text/language_model/chinese_bert_wwm_ext/model/bert.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT model.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import six -import json - -import paddle.fluid as fluid - -from chinese_bert_wwm_ext.model.transformer_encoder import encoder, pre_process_layer - - -class BertConfig(object): - def __init__(self, config_path): - self._config_dict = self._parse(config_path) - - def _parse(self, config_path): - try: - with open(config_path) as json_file: - config_dict = json.load(json_file) - except Exception: - raise IOError("Error in parsing bert model config file '%s'" % config_path) - else: - return config_dict - - def __getitem__(self, key): - return self._config_dict[key] - - def print_config(self): - for arg, value in sorted(six.iteritems(self._config_dict)): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -class BertModel(object): - def __init__(self, src_ids, position_ids, sentence_ids, input_mask, config, weight_sharing=True, use_fp16=False): - - self._emb_size = config['hidden_size'] - self._n_layer = config['num_hidden_layers'] - self._n_head = config['num_attention_heads'] - self._voc_size = config['vocab_size'] - self._max_position_seq_len = config['max_position_embeddings'] - self._sent_types = config['type_vocab_size'] - self._hidden_act = config['hidden_act'] - self._prepostprocess_dropout = config['hidden_dropout_prob'] - self._attention_dropout = config['attention_probs_dropout_prob'] - self._weight_sharing = weight_sharing - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - self._dtype = "float16" if use_fp16 else "float32" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. 
- self._param_initializer = fluid.initializer.TruncatedNormal(scale=config['initializer_range']) - - self._build_model(src_ids, position_ids, sentence_ids, input_mask) - - def _build_model(self, src_ids, position_ids, sentence_ids, input_mask): - # padding id in vocabulary must be set to 0 - emb_out = fluid.layers.embedding(input=src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._word_emb_name, - initializer=self._param_initializer), - is_sparse=False) - position_emb_out = fluid.layers.embedding(input=position_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._pos_emb_name, - initializer=self._param_initializer)) - - sent_emb_out = fluid.layers.embedding(sentence_ids, - size=[self._sent_types, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._sent_emb_name, - initializer=self._param_initializer)) - - emb_out = emb_out + position_emb_out - emb_out = emb_out + sent_emb_out - - emb_out = pre_process_layer(emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder') - - if self._dtype == "float16": - input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype) - - self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True) - self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) - n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1) - n_head_self_attn_mask.stop_gradient = True - - self._enc_out = encoder(enc_input=emb_out, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._emb_size // self._n_head, - d_value=self._emb_size // self._n_head, - d_model=self._emb_size, - d_inner_hid=self._emb_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - hidden_act=self._hidden_act, - preprocess_cmd="", - postprocess_cmd="dan", - param_initializer=self._param_initializer, - name='encoder') - - def get_sequence_output(self): - return self._enc_out - - def get_pooled_output(self): - """Get the first feature of each sequence for classification""" - - next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1]) - next_sent_feat = fluid.layers.fc(input=next_sent_feat, - size=self._emb_size, - act="tanh", - param_attr=fluid.ParamAttr(name="pooled_fc.w_0", - initializer=self._param_initializer), - bias_attr="pooled_fc.b_0") - return next_sent_feat - - def get_pretraining_output(self, mask_label, mask_pos, labels): - """Get the loss & accuracy for pretraining""" - - mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') - - # extract the first token feature in each sentence - next_sent_feat = self.get_pooled_output() - reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size]) - # extract masked tokens' feature - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) - - # transform: fc - mask_trans_feat = fluid.layers.fc(input=mask_feat, - size=self._emb_size, - act=self._hidden_act, - param_attr=fluid.ParamAttr(name='mask_lm_trans_fc.w_0', - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) - # transform: layer norm - mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name='mask_lm_trans') - - mask_lm_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0", - 
initializer=fluid.initializer.Constant(value=0.0)) - if self._weight_sharing: - fc_out = fluid.layers.matmul(x=mask_trans_feat, - y=fluid.default_main_program().global_block().var(self._word_emb_name), - transpose_y=True) - fc_out += fluid.layers.create_parameter(shape=[self._voc_size], - dtype=self._dtype, - attr=mask_lm_out_bias_attr, - is_bias=True) - - else: - fc_out = fluid.layers.fc(input=mask_trans_feat, - size=self._voc_size, - param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0", - initializer=self._param_initializer), - bias_attr=mask_lm_out_bias_attr) - - mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label) - mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss) - - next_sent_fc_out = fluid.layers.fc(input=next_sent_feat, - size=2, - param_attr=fluid.ParamAttr(name="next_sent_fc.w_0", - initializer=self._param_initializer), - bias_attr="next_sent_fc.b_0") - - next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(logits=next_sent_fc_out, - label=labels, - return_softmax=True) - - next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels) - - mean_next_sent_loss = fluid.layers.mean(next_sent_loss) - - loss = mean_next_sent_loss + mean_mask_lm_loss - return next_sent_acc, mean_mask_lm_loss, loss diff --git a/modules/text/language_model/chinese_bert_wwm_ext/model/transformer_encoder.py b/modules/text/language_model/chinese_bert_wwm_ext/model/transformer_encoder.py deleted file mode 100644 index b15d8388..00000000 --- a/modules/text/language_model/chinese_bert_wwm_ext/model/transformer_encoder.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Transformer encoder.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from functools import partial - -import paddle.fluid as fluid -import paddle.fluid.layers as layers - - -def multi_head_attention(queries, - keys, - values, - attn_bias, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - param_initializer=None, - name='multi_head_att'): - """ - Multi-Head Attention. Note that attn_bias is added to the logit before - computing softmax activiation to mask certain selected positions so that - they will not considered in attention weights. - """ - keys = queries if keys is None else keys - values = keys if values is None else values - - if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): - raise ValueError("Inputs: quries, keys and values should all be 3-D tensors.") - - def __compute_qkv(queries, keys, values, n_head, d_key, d_value): - """ - Add linear projection to queries, keys, and values. 
- """ - q = layers.fc(input=queries, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_query_fc.w_0', initializer=param_initializer), - bias_attr=name + '_query_fc.b_0') - k = layers.fc(input=keys, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_key_fc.w_0', initializer=param_initializer), - bias_attr=name + '_key_fc.b_0') - v = layers.fc(input=values, - size=d_value * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_value_fc.w_0', initializer=param_initializer), - bias_attr=name + '_value_fc.b_0') - return q, k, v - - def __split_heads(x, n_head): - """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor - with shape [bs, n_head, max_sequence_length, hidden_dim]. - """ - hidden_size = x.shape[-1] - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) - - # permuate the dimensions into: - # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) - - def __combine_heads(x): - """ - Transpose and then reshape the last two dimensions of inpunt tensor x - so that it becomes one dimension, which is reverse to __split_heads. - """ - if len(x.shape) == 3: return x - if len(x.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout(weights, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. - k = cache["k"] = layers.concat([layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1) - v = cache["v"] = layers.concat([layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. 
- proj_out = layers.fc(input=out, - size=d_model, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_output_fc.w_0', initializer=param_initializer), - bias_attr=name + '_output_fc.b_0') - return proj_out - - -def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name='ffn'): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. - """ - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act=hidden_act, - param_attr=fluid.ParamAttr(name=name + '_fc_0.w_0', initializer=param_initializer), - bias_attr=name + '_fc_0.b_0') - if dropout_rate: - hidden = layers.dropout(hidden, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.fc(input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_fc_1.w_0', initializer=param_initializer), - bias_attr=name + '_fc_1.b_0') - return out - - -def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., name=''): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out_dtype = out.dtype - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float32") - out = layers.layer_norm(out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.ParamAttr(name=name + '_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr(name=name + '_layer_norm_bias', - initializer=fluid.initializer.Constant(0.))) - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float16") - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout(out, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. 
- """ - attn_output = multi_head_attention(pre_process_layer(enc_input, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_att'), - None, - None, - attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - param_initializer=param_initializer, - name=name + '_multi_head_att') - attn_output = post_process_layer(enc_input, - attn_output, - postprocess_cmd, - prepostprocess_dropout, - name=name + '_post_att') - ffd_output = positionwise_feed_forward(pre_process_layer(attn_output, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_ffn'), - d_inner_hid, - d_model, - relu_dropout, - hidden_act, - param_initializer=param_initializer, - name=name + '_ffn') - return post_process_layer(attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + '_post_ffn') - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer. - """ - for i in range(n_layer): - enc_output = encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - param_initializer=param_initializer, - name=name + '_layer_' + str(i)) - enc_input = enc_output - enc_output = pre_process_layer(enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder") - - return enc_output diff --git a/modules/text/language_model/chinese_bert_wwm_ext/module.py b/modules/text/language_model/chinese_bert_wwm_ext/module.py index 273b2f02..6ff6803f 100644 --- a/modules/text/language_model/chinese_bert_wwm_ext/module.py +++ b/modules/text/language_model/chinese_bert_wwm_ext/module.py @@ -1,7 +1,6 @@ -# coding:utf-8 -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the "License" +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -12,62 +11,121 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
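# The token-cls branch of the rewritten module below scores predictions with
# paddlenlp's ChunkEvaluator, which counts whole entity chunks rather than
# single tags. A standalone sketch mirroring the call pattern used in
# forward(): the leading None fills an unused `inputs` slot in this paddlenlp
# version, and the tag set is illustrative.
import paddle
from paddlenlp.metrics import ChunkEvaluator

metric = ChunkEvaluator(label_list=['B-LOC', 'I-LOC', 'O'])

lengths = paddle.to_tensor([3])         # valid tokens per sequence
preds = paddle.to_tensor([[0, 1, 2]])   # predicted tag ids
labels = paddle.to_tensor([[0, 1, 2]])  # gold tag ids

num_infer, num_label, num_correct = metric.compute(None, lengths, preds, labels)
metric.update(num_infer.numpy(), num_label.numpy(), num_correct.numpy())
precision, recall, f1 = metric.accumulate()
print(precision, recall, f1)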
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - +from typing import Dict import os +import math -from paddlehub import TransformerModule -from paddlehub.module.module import moduleinfo +import paddle +import paddle.nn as nn +import paddle.nn.functional as F -from chinese_bert_wwm_ext.model.bert import BertConfig, BertModel +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule +from paddlehub.utils.log import logger @moduleinfo( name="chinese-bert-wwm-ext", - version="1.0.0", - summary="chinese-bert-wwm-ext, 12-layer, 768-hidden, 12-heads, 110M parameters ", + version="2.0.1", + summary= + "chinese-bert-wwm-ext, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.", author="ymcui", author_email="ymcui@ir.hit.edu.cn", type="nlp/semantic_model", + meta=TransformerModule ) -class BertWwm(TransformerModule): - def _initialize(self): - self.MAX_SEQ_LEN = 512 - self.params_path = os.path.join(self.directory, "assets", "params") - self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt") +class BertWwm(nn.Layer): + """ + BertWwm model + """ - bert_config_path = os.path.join(self.directory, "assets", "bert_config.json") - self.bert_config = BertConfig(bert_config_path) + def __init__( + self, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, + ): + super(BertWwm, self).__init__() + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes - def net(self, input_ids, position_ids, segment_ids, input_mask): - """ - create neural network. + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = BertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path='bert-wwm-ext-chinese', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained( + pretrained_model_name_or_path='bert-wwm-ext-chinese', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) + elif task is None: + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-wwm-ext-chinese', **kwargs) + else: + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) - Args: - input_ids (tensor): the word ids. - position_ids (tensor): the position ids. - segment_ids (tensor): the segment ids. - input_mask (tensor): the padding mask. + self.task = task - Returns: - pooled_output (tensor): sentence-level output for classification task. - sequence_output (tensor): token-level output for sequence task. 
- """ - bert = BertModel(src_ids=input_ids, - position_ids=position_ids, - sentence_ids=segment_ids, - input_mask=input_mask, - config=self.bert_config, - use_fp16=False) - pooled_output = bert.get_pooled_output() - sequence_output = bert.get_sequence_output() - return pooled_output, sequence_output + if load_checkpoint is not None and os.path.isfile(load_checkpoint): + state_dict = paddle.load(load_checkpoint) + self.set_state_dict(state_dict) + logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + if self.task == 'seq-cls': + logits = result + probs = F.softmax(logits, axis=1) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs + else: + sequence_output, pooled_output = result + return sequence_output, pooled_output -if __name__ == '__main__': - test_module = BertWwm() + @staticmethod + def get_tokenizer(*args, **kwargs): + """ + Gets the tokenizer that is customized for this module. + """ + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-wwm-ext-chinese', *args, **kwargs) diff --git a/modules/text/language_model/ernie/README.md b/modules/text/language_model/ernie/README.md index 4aebcebf..4ee91755 100644 --- a/modules/text/language_model/ernie/README.md +++ b/modules/text/language_model/ernie/README.md @@ -1,5 +1,5 @@ ```shell -$ hub install ernie==2.0.0 +$ hub install ernie==2.0.1 ``` ## 在线体验 AI Studio 快速体验 @@ -15,7 +15,6 @@ $ hub install ernie==2.0.0

- 更多详情请参考[ERNIE论文](https://arxiv.org/abs/1904.09223) ## API @@ -24,23 +23,29 @@ $ hub install ernie==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** @@ -55,7 +60,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -64,7 +71,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -78,16 +87,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='ernie', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -95,7 +104,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -121,12 +132,12 @@ $ hub serving start -m ernie import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... ]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/ernie" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -170,3 +181,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图版本,接口有所变化 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/ernie/module.py b/modules/text/language_model/ernie/module.py index 752a33a4..37a99500 100644 --- a/modules/text/language_model/ernie/module.py +++ b/modules/text/language_model/ernie/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_ernie import ErnieModel, ErnieForSequenceClassification -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.ernie.modeling import ErnieModel, ErnieForSequenceClassification, ErnieForTokenClassification +from paddlenlp.transformers.ernie.tokenizer import ErnieTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="ernie", - version="2.0.0", + version="2.0.1", summary= "Baidu's ERNIE, Enhanced Representation through kNowledge IntEgration, max_seq_len=512 when predtrained. The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Ernie(nn.Layer): """ Ernie model @@ -41,180 +43,80 @@ class Ernie(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Ernie, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': - self.model = ErnieForSequenceClassification.from_pretrained(pretrained_model_name_or_path='ernie') + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = ErnieForSequenceClassification.from_pretrained(pretrained_model_name_or_path='ernie-1.0', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = ErnieForTokenClassification.from_pretrained(pretrained_model_name_or_path='ernie-1.0', num_classes=self.num_classes, **kwargs) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie') + self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-1.0', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, 
token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'ernie', 'vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie/vocab.txt" - download(url, os.path.join(DATA_HOME, 'ernie')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. 
- batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return ErnieTokenizer.from_pretrained( + pretrained_model_name_or_path='ernie-1.0', *args, **kwargs) diff --git a/modules/text/language_model/ernie_tiny/README.md b/modules/text/language_model/ernie_tiny/README.md index 899c500e..f02419a6 100644 --- a/modules/text/language_model/ernie_tiny/README.md +++ b/modules/text/language_model/ernie_tiny/README.md @@ -23,7 +23,10 @@ $ hub install ernie_tiny==2.0.1 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 @@ -33,13 +36,16 @@ def __init__( * `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** @@ -54,7 +60,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -63,7 +71,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -94,7 +104,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -120,12 +132,12 @@ $ hub serving start -m ernie_tiny import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... ]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/ernie_tiny" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} diff --git a/modules/text/language_model/ernie_tiny/module.py b/modules/text/language_model/ernie_tiny/module.py index 77b954c8..d309ac47 100644 --- a/modules/text/language_model/ernie_tiny/module.py +++ b/modules/text/language_model/ernie_tiny/module.py @@ -11,19 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import ErnieTinyTokenizer from paddlenlp.transformers.ernie.modeling import ErnieModel, ErnieForSequenceClassification, ErnieForTokenClassification +from paddlenlp.transformers.ernie.tokenizer import ErnieTinyTokenizer +from paddlenlp.metrics import ChunkEvaluator from paddlehub.module.module import moduleinfo from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( @@ -41,14 +42,15 @@ class ErnieTiny(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, - num_classes=2, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, **kwargs, ): super(ErnieTiny, self).__init__() if label_map: + self.label_map = label_map self.num_classes = len(label_map) else: self.num_classes = num_classes @@ -57,7 +59,7 @@ class ErnieTiny(nn.Layer): task = 'seq-cls' logger.warning( "current task name 'sequence_classification' was renamed to 'seq-cls', " - "'sequence_classification' has been deprecated and will be removed the future.", + "'sequence_classification' has been deprecated and will be removed in the future.", ) if task == 'seq-cls': self.model = ErnieForSequenceClassification.from_pretrained(pretrained_model_name_or_path='ernie-tiny', num_classes=self.num_classes, **kwargs) @@ -66,7 +68,9 @@ class ErnieTiny(nn.Layer): elif task == 'token-cls': self.model = ErnieForTokenClassification.from_pretrained(pretrained_model_name_or_path='ernie-tiny', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-tiny', **kwargs) else: @@ -74,14 +78,13 @@ class ErnieTiny(nn.Layer): task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) if self.task == 'seq-cls': logits = result @@ -90,49 +93,29 @@ class ErnieTiny(nn.Layer): loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs elif self.task == 'token-cls': logits = result - token_level_probs = F.softmax(logits, axis=2) + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) if labels is not None: - labels = paddle.to_tensor(labels).unsqueeze(-1) - loss = self.criterion(logits, labels) - correct = self.metric.compute(token_level_probs, labels) - acc = self.metric.update(correct) - return token_level_probs, loss, acc + loss = 
self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'ernie_tiny', 'vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/vocab.txt" - download(url, os.path.join(DATA_HOME, 'ernie_tiny')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. """ - spm_path = os.path.join(DATA_HOME, 'ernie_tiny', 'spm_cased_simp_sampled.model') - if not os.path.exists(spm_path) or not os.path.isfile(spm_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/spm_cased_simp_sampled.model" - download(url, os.path.join(DATA_HOME, 'ernie_tiny')) - - word_dict_path = os.path.join(DATA_HOME, 'ernie_tiny', 'dict.wordseg.pickle') - if not os.path.exists(word_dict_path) or not os.path.isfile(word_dict_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/dict.wordseg.pickle" - download(url, os.path.join(DATA_HOME, 'ernie_tiny')) - - return ErnieTinyTokenizer(self.get_vocab_path(), spm_path, word_dict_path) + return ErnieTinyTokenizer.from_pretrained( + pretrained_model_name_or_path='ernie-tiny', *args, **kwargs) diff --git a/modules/text/language_model/ernie_v2_eng_base/README.md b/modules/text/language_model/ernie_v2_eng_base/README.md index cd203a4a..d5ece7a9 100644 --- a/modules/text/language_model/ernie_v2_eng_base/README.md +++ b/modules/text/language_model/ernie_v2_eng_base/README.md @@ -1,6 +1,6 @@ ```shell -$ hub install ernie_v2_eng_base==2.0.0 +$ hub install ernie_v2_eng_base==2.0.1 ```

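+安装完成后,可按如下方式快速创建一个序列标注(token-cls)Module。以下仅为示意代码,其中`label_map`中的BIO风格标签为假设的示例,实际取值需与训练数据的标签体系保持一致:
+
+```python
+import paddlehub as hub
+
+# 序列标注任务的类别映射表(BIO风格标签,仅为示例)
+label_map = {0: 'B-PER', 1: 'I-PER', 2: 'O'}
+
+model = hub.Module(
+    name='ernie_v2_eng_base',
+    version='2.0.1',
+    task='token-cls',
+    label_map=label_map)
+```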
@@ -19,23 +19,29 @@ $ hub install ernie_v2_eng_base==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** @@ -50,7 +56,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -59,7 +67,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -73,16 +83,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='ernie_v2_eng_base', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -90,7 +100,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -116,12 +128,12 @@ $ hub serving start -m ernie_v2_eng_base import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... 
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/ernie_v2_eng_base" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -157,3 +169,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图版本,接口有所变化 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/ernie_v2_eng_base/module.py b/modules/text/language_model/ernie_v2_eng_base/module.py index b9b74ba3..59ea31b7 100644 --- a/modules/text/language_model/ernie_v2_eng_base/module.py +++ b/modules/text/language_model/ernie_v2_eng_base/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_ernie import ErnieModel, ErnieForSequenceClassification -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.ernie.modeling import ErnieModel, ErnieForSequenceClassification, ErnieForTokenClassification +from paddlenlp.transformers.ernie.tokenizer import ErnieTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="ernie_v2_eng_base", - version="2.0.0", + version="2.0.1", summary= "Baidu's ERNIE 2.0, Enhanced Representation through kNowledge IntEgration, max_seq_len=512 when predtrained. 
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class ErnieV2(nn.Layer): """ Ernie model @@ -41,181 +43,88 @@ class ErnieV2(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(ErnieV2, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': self.model = ErnieForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='ernie_v2_eng_base') + pretrained_model_name_or_path='ernie-2.0-en', + num_classes=self.num_classes, + **kwargs + ) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = ErnieForTokenClassification.from_pretrained( + pretrained_model_name_or_path='ernie-2.0-en', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie_v2_eng_base') + self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-2.0-en', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, pooled_output = 
result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'ernie', 'vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_eng_base/vocab.txt" - download(url, os.path.join(DATA_HOME, 'ernie')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return ErnieTokenizer.from_pretrained( + pretrained_model_name_or_path='ernie-2.0-en', *args, **kwargs) \ No newline at end of file diff --git a/modules/text/language_model/ernie_v2_eng_large/README.md b/modules/text/language_model/ernie_v2_eng_large/README.md index 149d6012..680bc1be 100644 --- a/modules/text/language_model/ernie_v2_eng_large/README.md +++ b/modules/text/language_model/ernie_v2_eng_large/README.md @@ -1,6 +1,6 @@ ```shell -$ hub install ernie_v2_eng_large==2.0.0 +$ hub install ernie_v2_eng_large==2.0.1 ```

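+若只需提取文本特征而无需Fine-tune,可不指定`task`,直接调用`get_embedding`。以下为示意代码,输入文本仅为示例:
+
+```python
+import paddlehub as hub
+
+# task为None时返回原始模型,可用于提取句子粒度与字粒度特征
+model = hub.Module(name='ernie_v2_eng_large', version='2.0.1', task=None)
+
+data = [['今天是个好日子'], ['天气预报说今天要下雨']]
+# 返回格式为[[pooled_feature, seq_feature], ...]
+embeddings = model.get_embedding(data, max_seq_len=128, batch_size=1, use_gpu=False)
+```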
@@ -19,29 +19,35 @@ $ hub install ernie_v2_eng_large==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -50,7 +56,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -59,7 +67,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -73,16 +83,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='ernie_v2_eng_large', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -90,7 +100,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -116,12 +128,12 @@ $ hub serving start -m ernie_v2_eng_large import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... 
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/ernie_v2_eng_large" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -157,3 +169,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图版本,接口有所变化 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/ernie_v2_eng_large/module.py b/modules/text/language_model/ernie_v2_eng_large/module.py index 8d3ae55f..0d54a670 100644 --- a/modules/text/language_model/ernie_v2_eng_large/module.py +++ b/modules/text/language_model/ernie_v2_eng_large/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_ernie import ErnieModel, ErnieForSequenceClassification -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.ernie.modeling import ErnieModel, ErnieForSequenceClassification, ErnieForTokenClassification +from paddlenlp.transformers.ernie.tokenizer import ErnieTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="ernie_v2_eng_large", - version="2.0.0", + version="2.0.1", summary= "Baidu's ERNIE 2.0, Enhanced Representation through kNowledge IntEgration, max_seq_len=512 when predtrained. 
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class ErnieV2(nn.Layer): """ Ernie model @@ -41,181 +43,88 @@ class ErnieV2(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(ErnieV2, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': self.model = ErnieForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='ernie_v2_eng_large') + pretrained_model_name_or_path='ernie-2.0-large-en', + num_classes=self.num_classes, + **kwargs + ) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = ErnieForTokenClassification.from_pretrained( + pretrained_model_name_or_path='ernie-2.0-large-en', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie_v2_eng_large') + self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-2.0-large-en', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, 
pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'ernie', 'vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_eng_large/vocab.txt" - download(url, os.path.join(DATA_HOME, 'ernie')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text))
-            examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids']))
-
-        def _batchify_fn(batch):
-            input_ids = [entry[0] for entry in batch]
-            segment_ids = [entry[1] for entry in batch]
-            return input_ids, segment_ids
-
-        # Seperates data into some batches.
-        batches = []
-        one_batch = []
-        for example in examples:
-            one_batch.append(example)
-            if len(one_batch) == batch_size:
-                batches.append(one_batch)
-                one_batch = []
-        if one_batch:
-            # The last batch whose size is less than the config batch_size setting.
-            batches.append(one_batch)
-
-        results = []
-        self.eval()
-        for batch in batches:
-            input_ids, segment_ids = _batchify_fn(batch)
-            input_ids = paddle.to_tensor(input_ids)
-            segment_ids = paddle.to_tensor(segment_ids)
-
-            # TODO(zhangxuefei): add task token_classification postprocess after prediction.
-            if self.task == 'sequence_classification':
-                probs = self(input_ids, segment_ids)
-                idx = paddle.argmax(probs, axis=1).numpy()
-                idx = idx.tolist()
-                labels = [self.label_map[i] for i in idx]
-                results.extend(labels)
-
-        return results
-
-    @serving
-    def get_embedding(self, texts, use_gpu=False):
-        if self.task is not None:
-            raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task)
-
-        paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu')
-
-        tokenizer = self.get_tokenizer()
-        results = []
-        for text in texts:
-            if len(text) == 1:
-                encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False)
-            elif len(text) == 2:
-                encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False)
-            else:
-                raise RuntimeError(
-                    'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text))
-
-            input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0)
-            segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0)
-            sequence_output, pooled_output = self(input_ids, segment_ids)
-
-            sequence_output = sequence_output.squeeze(0)
-            pooled_output = pooled_output.squeeze(0)
-            results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist()))
-        return results
+        return ErnieTokenizer.from_pretrained(
+            pretrained_model_name_or_path='ernie-2.0-large-en', *args, **kwargs)
diff --git a/modules/text/language_model/rbt3/README.md b/modules/text/language_model/rbt3/README.md
index a9a001d8..baf02dd1 100644
--- a/modules/text/language_model/rbt3/README.md
+++ b/modules/text/language_model/rbt3/README.md
@@ -1,5 +1,5 @@
 ```shell
-$ hub install rbt3==1.0.0
+$ hub install rbt3==2.0.1
 ```


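+除文本分类外,rbt3同样支持序列标注任务(token-cls)。以下为创建序列标注Module的示意代码,`label_map`以MSRA-NER风格的BIO标签为例,仅作参考:
+
+```python
+import paddlehub as hub
+
+# 序列标注任务的标签映射表(BIO标签,仅为示例)
+label_map = {
+    0: 'B-PER', 1: 'I-PER',
+    2: 'B-ORG', 3: 'I-ORG',
+    4: 'B-LOC', 5: 'I-LOC',
+    6: 'O',
+}
+
+model = hub.Module(
+    name='rbt3',
+    version='2.0.1',
+    task='token-cls',
+    label_map=label_map)
+```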
@@ -9,94 +9,130 @@ $ hub install rbt3==1.0.0
 
 ## API
 
 ```python
-def context(
-    trainable=True,
-    max_seq_len=128
+def __init__(
+    task=None,
+    load_checkpoint=None,
+    label_map=None,
+    num_classes=2,
+    **kwargs,
 )
 ```
-用于获取Module的上下文信息,得到输入、输出以及预训练的Paddle Program副本
-**参数**
-
-> trainable:设置为True时,Module中的参数在Fine-tune时也会随之训练,否则保持不变。
-> max_seq_len:BERT模型的最大序列长度,若序列长度不足,会通过padding方式补到**max_seq_len**, 若序列长度大于该值,则会以截断方式让序列长度为**max_seq_len**,max_seq_len可取值范围为0~512;
-
-**返回**
-> inputs:dict类型,有以下字段:
-> >**input_ids**存放输入文本tokenize后各token对应BERT词汇表的word ids, shape为\[batch_size, max_seq_len\],int64类型;
-> >**position_ids**存放输入文本tokenize后各token所在该文本的位置,shape为\[batch_size, max_seq_len\],int64类型;
-> >**segment_ids**存放各token所在文本的标识(token属于文本1或者文本2),shape为\[batch_size, max_seq_len\],int64类型;
-> >**input_mask**存放token是否为padding的标识,shape为\[batch_size, max_seq_len\],int64类型;
->
-> outputs:dict类型,Module的输出特征,有以下字段:
-> >**pooled_output**字段存放句子粒度的特征,可用于文本分类等任务,shape为 \[batch_size, 768\],int64类型;
-> >**sequence_output**字段存放字粒度的特征,可用于序列标注等任务,shape为 \[batch_size, seq_len, 768\],int64类型;
->
-> program:包含该Module计算图的Program。
+创建Module对象(动态图组网版本)。
+
+**参数**
+
+* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。
+* `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。
+* `label_map`:预测时的类别映射表。
+* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。
+* `**kwargs`:用户额外指定的关键字字典类型的参数。
 
 ```python
-def get_embedding(
-    texts,
-    use_gpu=False,
-    batch_size=1
+def predict(
+    data,
+    max_seq_len=128,
+    batch_size=1,
+    use_gpu=False
 )
 ```
-用于获取输入文本的句子粒度特征与字粒度特征
-
 **参数**
-> texts:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
-> use_gpu:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
+* `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,
+  每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。
+* `max_seq_len`:模型处理文本的最大长度
+* `batch_size`:模型批处理大小
+* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
 
 **返回**
-> results:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。
->
-
 ```python
-def get_params_layer()
+def get_embedding(
+    data,
+    max_seq_len=128,
+    batch_size=1,
+    use_gpu=False
+)
 ```
-用于获取参数层信息,该方法与ULMFiTStrategy联用可以严格按照层数设置分层学习率与逐层解冻。
+用于获取输入文本的句子粒度特征与字粒度特征
 
 **参数**
-> 无
+* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
+* `max_seq_len`:模型处理文本的最大长度。
+* `batch_size`:模型批处理大小。
+* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
 
 **返回**
-> params_layer:dict类型,key为参数名,值为参数所在层数
+* `results`:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。
+
 
 **代码示例**
 
 ```python
 import paddlehub as hub
 
-# Load $ hub install rbt3 pretrained model
-module = hub.Module(name="rbt3")
-inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
+data = [
+    ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'],
+    ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'],
+    ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'],
+]
+label_map = {0: 'negative', 1: 'positive'}
+
+model = hub.Module(
+    name='rbt3',
+    version='2.0.1',
+    task='seq-cls',
+    load_checkpoint='/path/to/parameters',
+    label_map=label_map)
+results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
+for idx, text in enumerate(data):
+    print('Data: {} \t Label: {}'.format(text, results[idx]))
+```

-# Must feed all the tensor of rbt3's module need
-input_ids = inputs["input_ids"]
-position_ids = inputs["position_ids"]
-segment_ids = inputs["segment_ids"]
-input_mask = inputs["input_mask"]
+For details, see the PaddleHub demos:
+- [Text classification](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [Sequence labeling](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)

-# Use "pooled_output" for sentence-level output.
-pooled_output = outputs["pooled_output"]
+## Serving

-# Use "sequence_output" for token-level output.
-sequence_output = outputs["sequence_output"]
+PaddleHub Serving can deploy an online service for extracting pretrained embeddings.

-# Use "get_embedding" to get embedding result.
-embedding_result = module.get_embedding(texts=[["Sample1_text_a"],["Sample2_text_a","Sample2_text_b"]], use_gpu=True)
+### Step1: start PaddleHub Serving

-# Use "get_params_layer" to get params layer and used to ULMFiTStrategy.
-params_layer = module.get_params_layer()
-strategy = hub.finetune.strategy.ULMFiTStrategy(frz_params_layer=params_layer, dis_params_layer=params_layer)
+Run the start command:
+
+```shell
+$ hub serving start -m rbt3
+```
+
+This deploys an online API for extracting pretrained embeddings; the default port is 8866.
+
+**NOTE:** to predict on GPU, set the CUDA_VISIBLE_DEVICES environment variable before starting the service; otherwise no setting is needed.
+
+### Step2: send prediction requests
+
+With the server configured, the following few lines of code send a prediction request and fetch the result:
+
+```python
+import requests
+import json
+
+# Texts for embedding extraction, formatted as [[text_1], [text_2], ...]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# Pass the texts to the prediction method via a key, "data" in this example;
+# the local equivalent is module.get_embedding(data=text)
+data = {"data": text}
+# Send a POST request; the content type must be JSON, and the IP in the URL
+# must be changed to that of the serving machine
+url = "http://10.12.121.132:8866/predict/rbt3"
+# Set the request headers to application/json
+headers = {"Content-Type": "application/json"}
+
+r = requests.post(url=url, headers=headers, data=json.dumps(data))
+print(r.json())
 ```

 ## Code
@@ -109,12 +145,16 @@ https://github.com/ymcui/Chinese-BERT-wwm

 ## Dependencies

-paddlepaddle >= 1.6.2
+paddlepaddle >= 2.0.0

-paddlehub >= 1.6.0
+paddlehub >= 2.0.0

 ## Release history

 * 1.0.0

   Initial release
+
+* 2.0.1
+
+  Fully upgraded to dygraph, with API changes; task names adjusted and the sequence labeling task `token-cls` added
diff --git a/modules/text/language_model/rbt3/model/__init__.py b/modules/text/language_model/rbt3/model/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modules/text/language_model/rbt3/model/bert.py b/modules/text/language_model/rbt3/model/bert.py
deleted file mode 100644
index 4d37cb02..00000000
--- a/modules/text/language_model/rbt3/model/bert.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT model.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import six -import json - -import paddle.fluid as fluid - -from rbt3.model.transformer_encoder import encoder, pre_process_layer - - -class BertConfig(object): - def __init__(self, config_path): - self._config_dict = self._parse(config_path) - - def _parse(self, config_path): - try: - with open(config_path) as json_file: - config_dict = json.load(json_file) - except Exception: - raise IOError("Error in parsing bert model config file '%s'" % config_path) - else: - return config_dict - - def __getitem__(self, key): - return self._config_dict[key] - - def print_config(self): - for arg, value in sorted(six.iteritems(self._config_dict)): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -class BertModel(object): - def __init__(self, src_ids, position_ids, sentence_ids, input_mask, config, weight_sharing=True, use_fp16=False): - - self._emb_size = config['hidden_size'] - self._n_layer = config['num_hidden_layers'] - self._n_head = config['num_attention_heads'] - self._voc_size = config['vocab_size'] - self._max_position_seq_len = config['max_position_embeddings'] - self._sent_types = config['type_vocab_size'] - self._hidden_act = config['hidden_act'] - self._prepostprocess_dropout = config['hidden_dropout_prob'] - self._attention_dropout = config['attention_probs_dropout_prob'] - self._weight_sharing = weight_sharing - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - self._dtype = "float16" if use_fp16 else "float32" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. 
- self._param_initializer = fluid.initializer.TruncatedNormal(scale=config['initializer_range']) - - self._build_model(src_ids, position_ids, sentence_ids, input_mask) - - def _build_model(self, src_ids, position_ids, sentence_ids, input_mask): - # padding id in vocabulary must be set to 0 - emb_out = fluid.layers.embedding(input=src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._word_emb_name, - initializer=self._param_initializer), - is_sparse=False) - position_emb_out = fluid.layers.embedding(input=position_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._pos_emb_name, - initializer=self._param_initializer)) - - sent_emb_out = fluid.layers.embedding(sentence_ids, - size=[self._sent_types, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._sent_emb_name, - initializer=self._param_initializer)) - - emb_out = emb_out + position_emb_out - emb_out = emb_out + sent_emb_out - - emb_out = pre_process_layer(emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder') - - if self._dtype == "float16": - input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype) - - self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True) - self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) - n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1) - n_head_self_attn_mask.stop_gradient = True - - self._enc_out = encoder(enc_input=emb_out, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._emb_size // self._n_head, - d_value=self._emb_size // self._n_head, - d_model=self._emb_size, - d_inner_hid=self._emb_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - hidden_act=self._hidden_act, - preprocess_cmd="", - postprocess_cmd="dan", - param_initializer=self._param_initializer, - name='encoder') - - def get_sequence_output(self): - return self._enc_out - - def get_pooled_output(self): - """Get the first feature of each sequence for classification""" - - next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1]) - next_sent_feat = fluid.layers.fc(input=next_sent_feat, - size=self._emb_size, - act="tanh", - param_attr=fluid.ParamAttr(name="pooled_fc.w_0", - initializer=self._param_initializer), - bias_attr="pooled_fc.b_0") - return next_sent_feat - - def get_pretraining_output(self, mask_label, mask_pos, labels): - """Get the loss & accuracy for pretraining""" - - mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') - - # extract the first token feature in each sentence - next_sent_feat = self.get_pooled_output() - reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size]) - # extract masked tokens' feature - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) - - # transform: fc - mask_trans_feat = fluid.layers.fc(input=mask_feat, - size=self._emb_size, - act=self._hidden_act, - param_attr=fluid.ParamAttr(name='mask_lm_trans_fc.w_0', - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) - # transform: layer norm - mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name='mask_lm_trans') - - mask_lm_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0", - 
initializer=fluid.initializer.Constant(value=0.0)) - if self._weight_sharing: - fc_out = fluid.layers.matmul(x=mask_trans_feat, - y=fluid.default_main_program().global_block().var(self._word_emb_name), - transpose_y=True) - fc_out += fluid.layers.create_parameter(shape=[self._voc_size], - dtype=self._dtype, - attr=mask_lm_out_bias_attr, - is_bias=True) - - else: - fc_out = fluid.layers.fc(input=mask_trans_feat, - size=self._voc_size, - param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0", - initializer=self._param_initializer), - bias_attr=mask_lm_out_bias_attr) - - mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label) - mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss) - - next_sent_fc_out = fluid.layers.fc(input=next_sent_feat, - size=2, - param_attr=fluid.ParamAttr(name="next_sent_fc.w_0", - initializer=self._param_initializer), - bias_attr="next_sent_fc.b_0") - - next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(logits=next_sent_fc_out, - label=labels, - return_softmax=True) - - next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels) - - mean_next_sent_loss = fluid.layers.mean(next_sent_loss) - - loss = mean_next_sent_loss + mean_mask_lm_loss - return next_sent_acc, mean_mask_lm_loss, loss diff --git a/modules/text/language_model/rbt3/model/transformer_encoder.py b/modules/text/language_model/rbt3/model/transformer_encoder.py deleted file mode 100644 index b15d8388..00000000 --- a/modules/text/language_model/rbt3/model/transformer_encoder.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Transformer encoder.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from functools import partial - -import paddle.fluid as fluid -import paddle.fluid.layers as layers - - -def multi_head_attention(queries, - keys, - values, - attn_bias, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - param_initializer=None, - name='multi_head_att'): - """ - Multi-Head Attention. Note that attn_bias is added to the logit before - computing softmax activiation to mask certain selected positions so that - they will not considered in attention weights. - """ - keys = queries if keys is None else keys - values = keys if values is None else values - - if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): - raise ValueError("Inputs: quries, keys and values should all be 3-D tensors.") - - def __compute_qkv(queries, keys, values, n_head, d_key, d_value): - """ - Add linear projection to queries, keys, and values. 
- """ - q = layers.fc(input=queries, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_query_fc.w_0', initializer=param_initializer), - bias_attr=name + '_query_fc.b_0') - k = layers.fc(input=keys, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_key_fc.w_0', initializer=param_initializer), - bias_attr=name + '_key_fc.b_0') - v = layers.fc(input=values, - size=d_value * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_value_fc.w_0', initializer=param_initializer), - bias_attr=name + '_value_fc.b_0') - return q, k, v - - def __split_heads(x, n_head): - """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor - with shape [bs, n_head, max_sequence_length, hidden_dim]. - """ - hidden_size = x.shape[-1] - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) - - # permuate the dimensions into: - # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) - - def __combine_heads(x): - """ - Transpose and then reshape the last two dimensions of inpunt tensor x - so that it becomes one dimension, which is reverse to __split_heads. - """ - if len(x.shape) == 3: return x - if len(x.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout(weights, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. - k = cache["k"] = layers.concat([layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1) - v = cache["v"] = layers.concat([layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. 
- proj_out = layers.fc(input=out, - size=d_model, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_output_fc.w_0', initializer=param_initializer), - bias_attr=name + '_output_fc.b_0') - return proj_out - - -def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name='ffn'): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. - """ - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act=hidden_act, - param_attr=fluid.ParamAttr(name=name + '_fc_0.w_0', initializer=param_initializer), - bias_attr=name + '_fc_0.b_0') - if dropout_rate: - hidden = layers.dropout(hidden, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.fc(input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_fc_1.w_0', initializer=param_initializer), - bias_attr=name + '_fc_1.b_0') - return out - - -def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., name=''): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out_dtype = out.dtype - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float32") - out = layers.layer_norm(out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.ParamAttr(name=name + '_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr(name=name + '_layer_norm_bias', - initializer=fluid.initializer.Constant(0.))) - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float16") - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout(out, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. 
- """ - attn_output = multi_head_attention(pre_process_layer(enc_input, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_att'), - None, - None, - attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - param_initializer=param_initializer, - name=name + '_multi_head_att') - attn_output = post_process_layer(enc_input, - attn_output, - postprocess_cmd, - prepostprocess_dropout, - name=name + '_post_att') - ffd_output = positionwise_feed_forward(pre_process_layer(attn_output, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_ffn'), - d_inner_hid, - d_model, - relu_dropout, - hidden_act, - param_initializer=param_initializer, - name=name + '_ffn') - return post_process_layer(attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + '_post_ffn') - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer. - """ - for i in range(n_layer): - enc_output = encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - param_initializer=param_initializer, - name=name + '_layer_' + str(i)) - enc_input = enc_output - enc_output = pre_process_layer(enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder") - - return enc_output diff --git a/modules/text/language_model/rbt3/module.py b/modules/text/language_model/rbt3/module.py index b35e0cd8..3833c987 100644 --- a/modules/text/language_model/rbt3/module.py +++ b/modules/text/language_model/rbt3/module.py @@ -1,7 +1,6 @@ -# coding:utf-8 -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the "License" +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -12,62 +11,120 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+from typing import Dict
 import os
+import math

-from paddlehub import TransformerModule
-from paddlehub.module.module import moduleinfo
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F

-from rbt3.model.bert import BertConfig, BertModel
+from paddlenlp.transformers.roberta.modeling import RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel
+from paddlenlp.transformers.roberta.tokenizer import RobertaTokenizer
+from paddlenlp.metrics import ChunkEvaluator
+from paddlehub.module.module import moduleinfo
+from paddlehub.module.nlp_module import TransformerModule
+from paddlehub.utils.log import logger


 @moduleinfo(
     name="rbt3",
-    version="1.0.0",
+    version="2.0.1",
     summary="rbt3, 3-layer, 768-hidden, 12-heads, 38M parameters ",
     author="ymcui",
     author_email="ymcui@ir.hit.edu.cn",
     type="nlp/semantic_model",
+    meta=TransformerModule,
 )
-class BertWwm(TransformerModule):
-    def _initialize(self):
-        self.MAX_SEQ_LEN = 512
-        self.params_path = os.path.join(self.directory, "assets", "params")
-        self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt")
+class Roberta(nn.Layer):
+    """
+    RoBERTa model
+    """

-        bert_config_path = os.path.join(self.directory, "assets", "bert_config_rbt3.json")
-        self.bert_config = BertConfig(bert_config_path)
+    def __init__(
+            self,
+            task: str = None,
+            load_checkpoint: str = None,
+            label_map: Dict = None,
+            num_classes: int = 2,
+            **kwargs,
+    ):
+        super(Roberta, self).__init__()
+        if label_map:
+            self.label_map = label_map
+            self.num_classes = len(label_map)
+        else:
+            self.num_classes = num_classes

-    def net(self, input_ids, position_ids, segment_ids, input_mask):
-        """
-        create neural network.
+        if task == 'sequence_classification':
+            task = 'seq-cls'
+            logger.warning(
+                "current task name 'sequence_classification' was renamed to 'seq-cls', "
+                "'sequence_classification' has been deprecated and will be removed in the future.",
+            )
+        if task == 'seq-cls':
+            self.model = RobertaForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path='rbt3',
+                num_classes=self.num_classes,
+                **kwargs
+            )
+            self.criterion = paddle.nn.loss.CrossEntropyLoss()
+            self.metric = paddle.metric.Accuracy()
+        elif task == 'token-cls':
+            self.model = RobertaForTokenClassification.from_pretrained(
+                pretrained_model_name_or_path='rbt3',
+                num_classes=self.num_classes,
+                **kwargs
+            )
+            self.criterion = paddle.nn.loss.CrossEntropyLoss()
+            self.metric = ChunkEvaluator(
+                label_list=[self.label_map[i] for i in sorted(self.label_map.keys())]
+            )
+        elif task is None:
+            self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='rbt3', **kwargs)
+        else:
+            raise RuntimeError("Unknown task {}, task should be one in {}".format(
+                task, self._tasks_supported))

-        Args:
-            input_ids (tensor): the word ids.
-            position_ids (tensor): the position ids.
-            segment_ids (tensor): the segment ids.
-            input_mask (tensor): the padding mask.
+        self.task = task

-        Returns:
-            pooled_output (tensor): sentence-level output for classification task.
-            sequence_output (tensor): token-level output for sequence task.
-        """
-        bert = BertModel(src_ids=input_ids,
-                         position_ids=position_ids,
-                         sentence_ids=segment_ids,
-                         input_mask=input_mask,
-                         config=self.bert_config,
-                         use_fp16=False)
-        pooled_output = bert.get_pooled_output()
-        sequence_output = bert.get_sequence_output()
-        return pooled_output, sequence_output
+        if load_checkpoint is not None and os.path.isfile(load_checkpoint):
+            state_dict = paddle.load(load_checkpoint)
+            self.set_state_dict(state_dict)
+            logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))

+    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None):
+        result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
+        if self.task == 'seq-cls':
+            logits = result
+            probs = F.softmax(logits, axis=1)
+            if labels is not None:
+                loss = self.criterion(logits, labels)
+                correct = self.metric.compute(probs, labels)
+                acc = self.metric.update(correct)
+                return probs, loss, {'acc': acc}
+            return probs
+        elif self.task == 'token-cls':
+            logits = result
+            token_level_probs = F.softmax(logits, axis=-1)
+            preds = token_level_probs.argmax(axis=-1)
+            if labels is not None:
+                loss = self.criterion(logits, labels.unsqueeze(-1))
+                num_infer_chunks, num_label_chunks, num_correct_chunks = \
+                    self.metric.compute(None, seq_lengths, preds, labels)
+                self.metric.update(
+                    num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
+                _, _, f1_score = map(float, self.metric.accumulate())
+                return token_level_probs, loss, {'f1_score': f1_score}
+            return token_level_probs
+        else:
+            sequence_output, pooled_output = result
+            return sequence_output, pooled_output

-if __name__ == '__main__':
-    test_module = BertWwm()
+    @staticmethod
+    def get_tokenizer(*args, **kwargs):
+        """
+        Gets the tokenizer that is customized for this module.
+        """
+        return RobertaTokenizer.from_pretrained(
+            pretrained_model_name_or_path='rbt3', *args, **kwargs)
diff --git a/modules/text/language_model/rbtl3/README.md b/modules/text/language_model/rbtl3/README.md
index 53107271..f1dd9c43 100644
--- a/modules/text/language_model/rbtl3/README.md
+++ b/modules/text/language_model/rbtl3/README.md
@@ -1,5 +1,5 @@
 ```shell
-$ hub install rbtl3==1.0.0
+$ hub install rbtl3==2.0.1
 ```


@@ -9,94 +9,130 @@ $ hub install rbtl3==1.0.0

 ## API
 ```python
-def context(
-    trainable=True,
-    max_seq_len=128
+def __init__(
+    task=None,
+    load_checkpoint=None,
+    label_map=None,
+    num_classes=2,
+    **kwargs,
 )
 ```
-Gets the Module's context: its inputs, its outputs, and a copy of the pretrained Paddle Program.
-
-**Parameters**
-
-> trainable: if True, the Module's parameters are updated during fine-tuning; otherwise they stay fixed.
-> max_seq_len: maximum sequence length of the BERT model. Shorter sequences are padded to **max_seq_len**, and longer ones are truncated to **max_seq_len**; valid values range from 0 to 512.
-
-**Returns**
-> inputs: dict with the following fields:
-> >**input_ids** holds each token's word id in the BERT vocabulary after tokenization, shape \[batch_size, max_seq_len\], dtype int64;
-> >**position_ids** holds each token's position in the input text, shape \[batch_size, max_seq_len\], dtype int64;
-> >**segment_ids** marks which text a token belongs to (text 1 or text 2), shape \[batch_size, max_seq_len\], dtype int64;
-> >**input_mask** marks whether a token is padding, shape \[batch_size, max_seq_len\], dtype int64;
->
-> outputs: dict of the Module's output features:
-> >**pooled_output** holds sentence-level features for tasks such as text classification, shape \[batch_size, 768\];
-> >**sequence_output** holds token-level features for tasks such as sequence labeling, shape \[batch_size, seq_len, 768\];
->
-> program: the Program containing the Module's computation graph.
+Creates a Module object (dygraph version).

+**Parameters**

+* `task`: task name; either `seq-cls` (text classification; the former name `sequence_classification` is deprecated and will be removed) or `token-cls` (sequence labeling).
+* `load_checkpoint`: path to model parameters saved with the PaddleHub fine-tune API.
+* `label_map`: label map used at prediction time.
+* `num_classes`: number of classes for classification; can be omitted when `label_map` is given. Defaults to 2.
+* `**kwargs`: extra keyword arguments supplied by the user.

 ```python
-def get_embedding(
-    texts,
-    use_gpu=False,
-    batch_size=1
+def predict(
+    data,
+    max_seq_len=128,
+    batch_size=1,
+    use_gpu=False
 )
 ```
-Gets sentence-level and token-level features for the input texts.
-
 **Parameters**
-> texts: list of input texts in the format \[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\]; each element is one sample, and a sample may contain text\_a and text\_b.
-> use_gpu: whether to use the GPU; defaults to False. GPU users are advised to enable use_gpu.
+* `data`: data to predict, in the format \[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\]; each element is one sample,
+  and a sample may contain text\_a and text\_b. The number of texts per sample (1 or 2) must match the one used during training.
+* `max_seq_len`: maximum text length the model processes.
+* `batch_size`: batch size.
+* `use_gpu`: whether to use the GPU; defaults to False. GPU users are advised to enable use_gpu.

 **Returns**
-> results: list in the format \[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\]; each element is the feature output of the corresponding sample, with a sentence-level pooled\_feature and a token-level seq\_feature.
->
-
 ```python
-def get_params_layer()
+def get_embedding(
+    data,
+    max_seq_len=128,
+    batch_size=1,
+    use_gpu=False
+)
 ```
-Gets per-parameter layer information; used together with ULMFiTStrategy to set layer-wise learning rates and gradual unfreezing strictly by layer.
+Gets sentence-level and token-level features for the input texts.

 **Parameters**
-> none
+* `data`: list of input texts in the format \[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\]; each element is one sample, and a sample may contain text\_a and text\_b.
+* `max_seq_len`: maximum text length the model processes.
+* `batch_size`: batch size.
+* `use_gpu`: whether to use the GPU; defaults to False. GPU users are advised to enable use_gpu.

 **Returns**
-> params_layer: dict mapping each parameter name to the layer the parameter belongs to
+* `results`: list in the format \[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\]; each element is the feature output of the corresponding sample, with a sentence-level pooled\_feature and a token-level seq\_feature.


 **Code example**

 ```python
 import paddlehub as hub

-# Load $ hub install rbtl3 pretrained model
-module = hub.Module(name="rbtl3")
-inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
+data = [
+    ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'],
+    ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'],
+    ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'],
+]
+label_map = {0: 'negative', 1: 'positive'}
+
+model = hub.Module(
+    name='rbtl3',
+    version='2.0.1',
+    task='seq-cls',
+    load_checkpoint='/path/to/parameters',
+    label_map=label_map)
+results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
+for idx, text in enumerate(data):
+    print('Data: {} \t Label: {}'.format(text, results[idx]))
+```

-# Must feed all the tensor of rbtl3's module need
-input_ids = inputs["input_ids"]
-position_ids = inputs["position_ids"]
-segment_ids = inputs["segment_ids"]
-input_mask = inputs["input_mask"]
+For details, see the PaddleHub demos:
+- [Text classification](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [Sequence labeling](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)

-# Use "pooled_output" for sentence-level output.
-pooled_output = outputs["pooled_output"]
+## Serving

-# Use "sequence_output" for token-level output.
-sequence_output = outputs["sequence_output"]
+PaddleHub Serving can deploy an online service for extracting pretrained embeddings.

-# Use "get_embedding" to get embedding result.
-embedding_result = module.get_embedding(texts=[["Sample1_text_a"],["Sample2_text_a","Sample2_text_b"]], use_gpu=True)
+### Step1: start PaddleHub Serving

-# Use "get_params_layer" to get params layer and used to ULMFiTStrategy.
-params_layer = module.get_params_layer()
-strategy = hub.finetune.strategy.ULMFiTStrategy(frz_params_layer=params_layer, dis_params_layer=params_layer)
+Run the start command:
+
+```shell
+$ hub serving start -m rbtl3
+```
+
+This deploys an online API for extracting pretrained embeddings; the default port is 8866.
+
+**NOTE:** to predict on GPU, set the CUDA_VISIBLE_DEVICES environment variable before starting the service; otherwise no setting is needed.
+
+### Step2: send prediction requests
+
+With the server configured, the following few lines of code send a prediction request and fetch the result:
+
+```python
+import requests
+import json
+
+# Texts for embedding extraction, formatted as [[text_1], [text_2], ...]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# Pass the texts to the prediction method via a key, "data" in this example;
+# the local equivalent is module.get_embedding(data=text)
+data = {"data": text}
+# Send a POST request; the content type must be JSON, and the IP in the URL
+# must be changed to that of the serving machine
+url = "http://10.12.121.132:8866/predict/rbtl3"
+# Set the request headers to application/json
+headers = {"Content-Type": "application/json"}
+
+r = requests.post(url=url, headers=headers, data=json.dumps(data))
+print(r.json())
 ```

 ## Code
@@ -109,12 +145,16 @@ https://github.com/ymcui/Chinese-BERT-wwm

 ## Dependencies

-paddlepaddle >= 1.6.2
+paddlepaddle >= 2.0.0

-paddlehub >= 1.6.0
+paddlehub >= 2.0.0

 ## Release history

 * 1.0.0

   Initial release
+
+* 2.0.1
+
+  Fully upgraded to dygraph, with API changes; task names adjusted and the sequence labeling task `token-cls` added
diff --git a/modules/text/language_model/rbtl3/model/__init__.py b/modules/text/language_model/rbtl3/model/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modules/text/language_model/rbtl3/model/bert.py b/modules/text/language_model/rbtl3/model/bert.py
deleted file mode 100644
index 8c27ad34..00000000
--- a/modules/text/language_model/rbtl3/model/bert.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT model.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import six -import json - -import paddle.fluid as fluid - -from rbtl3.model.transformer_encoder import encoder, pre_process_layer - - -class BertConfig(object): - def __init__(self, config_path): - self._config_dict = self._parse(config_path) - - def _parse(self, config_path): - try: - with open(config_path) as json_file: - config_dict = json.load(json_file) - except Exception: - raise IOError("Error in parsing bert model config file '%s'" % config_path) - else: - return config_dict - - def __getitem__(self, key): - return self._config_dict[key] - - def print_config(self): - for arg, value in sorted(six.iteritems(self._config_dict)): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -class BertModel(object): - def __init__(self, src_ids, position_ids, sentence_ids, input_mask, config, weight_sharing=True, use_fp16=False): - - self._emb_size = config['hidden_size'] - self._n_layer = config['num_hidden_layers'] - self._n_head = config['num_attention_heads'] - self._voc_size = config['vocab_size'] - self._max_position_seq_len = config['max_position_embeddings'] - self._sent_types = config['type_vocab_size'] - self._hidden_act = config['hidden_act'] - self._prepostprocess_dropout = config['hidden_dropout_prob'] - self._attention_dropout = config['attention_probs_dropout_prob'] - self._weight_sharing = weight_sharing - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - self._dtype = "float16" if use_fp16 else "float32" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. 
- self._param_initializer = fluid.initializer.TruncatedNormal(scale=config['initializer_range']) - - self._build_model(src_ids, position_ids, sentence_ids, input_mask) - - def _build_model(self, src_ids, position_ids, sentence_ids, input_mask): - # padding id in vocabulary must be set to 0 - emb_out = fluid.layers.embedding(input=src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._word_emb_name, - initializer=self._param_initializer), - is_sparse=False) - position_emb_out = fluid.layers.embedding(input=position_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._pos_emb_name, - initializer=self._param_initializer)) - - sent_emb_out = fluid.layers.embedding(sentence_ids, - size=[self._sent_types, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._sent_emb_name, - initializer=self._param_initializer)) - - emb_out = emb_out + position_emb_out - emb_out = emb_out + sent_emb_out - - emb_out = pre_process_layer(emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder') - - if self._dtype == "float16": - input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype) - - self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True) - self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) - n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1) - n_head_self_attn_mask.stop_gradient = True - - self._enc_out = encoder(enc_input=emb_out, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._emb_size // self._n_head, - d_value=self._emb_size // self._n_head, - d_model=self._emb_size, - d_inner_hid=self._emb_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - hidden_act=self._hidden_act, - preprocess_cmd="", - postprocess_cmd="dan", - param_initializer=self._param_initializer, - name='encoder') - - def get_sequence_output(self): - return self._enc_out - - def get_pooled_output(self): - """Get the first feature of each sequence for classification""" - - next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1]) - next_sent_feat = fluid.layers.fc(input=next_sent_feat, - size=self._emb_size, - act="tanh", - param_attr=fluid.ParamAttr(name="pooled_fc.w_0", - initializer=self._param_initializer), - bias_attr="pooled_fc.b_0") - return next_sent_feat - - def get_pretraining_output(self, mask_label, mask_pos, labels): - """Get the loss & accuracy for pretraining""" - - mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') - - # extract the first token feature in each sentence - next_sent_feat = self.get_pooled_output() - reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size]) - # extract masked tokens' feature - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) - - # transform: fc - mask_trans_feat = fluid.layers.fc(input=mask_feat, - size=self._emb_size, - act=self._hidden_act, - param_attr=fluid.ParamAttr(name='mask_lm_trans_fc.w_0', - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) - # transform: layer norm - mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name='mask_lm_trans') - - mask_lm_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0", - 
initializer=fluid.initializer.Constant(value=0.0)) - if self._weight_sharing: - fc_out = fluid.layers.matmul(x=mask_trans_feat, - y=fluid.default_main_program().global_block().var(self._word_emb_name), - transpose_y=True) - fc_out += fluid.layers.create_parameter(shape=[self._voc_size], - dtype=self._dtype, - attr=mask_lm_out_bias_attr, - is_bias=True) - - else: - fc_out = fluid.layers.fc(input=mask_trans_feat, - size=self._voc_size, - param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0", - initializer=self._param_initializer), - bias_attr=mask_lm_out_bias_attr) - - mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label) - mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss) - - next_sent_fc_out = fluid.layers.fc(input=next_sent_feat, - size=2, - param_attr=fluid.ParamAttr(name="next_sent_fc.w_0", - initializer=self._param_initializer), - bias_attr="next_sent_fc.b_0") - - next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(logits=next_sent_fc_out, - label=labels, - return_softmax=True) - - next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels) - - mean_next_sent_loss = fluid.layers.mean(next_sent_loss) - - loss = mean_next_sent_loss + mean_mask_lm_loss - return next_sent_acc, mean_mask_lm_loss, loss diff --git a/modules/text/language_model/rbtl3/model/transformer_encoder.py b/modules/text/language_model/rbtl3/model/transformer_encoder.py deleted file mode 100644 index b15d8388..00000000 --- a/modules/text/language_model/rbtl3/model/transformer_encoder.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Transformer encoder.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from functools import partial - -import paddle.fluid as fluid -import paddle.fluid.layers as layers - - -def multi_head_attention(queries, - keys, - values, - attn_bias, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - param_initializer=None, - name='multi_head_att'): - """ - Multi-Head Attention. Note that attn_bias is added to the logit before - computing softmax activiation to mask certain selected positions so that - they will not considered in attention weights. - """ - keys = queries if keys is None else keys - values = keys if values is None else values - - if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): - raise ValueError("Inputs: quries, keys and values should all be 3-D tensors.") - - def __compute_qkv(queries, keys, values, n_head, d_key, d_value): - """ - Add linear projection to queries, keys, and values. 
- """ - q = layers.fc(input=queries, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_query_fc.w_0', initializer=param_initializer), - bias_attr=name + '_query_fc.b_0') - k = layers.fc(input=keys, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_key_fc.w_0', initializer=param_initializer), - bias_attr=name + '_key_fc.b_0') - v = layers.fc(input=values, - size=d_value * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_value_fc.w_0', initializer=param_initializer), - bias_attr=name + '_value_fc.b_0') - return q, k, v - - def __split_heads(x, n_head): - """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor - with shape [bs, n_head, max_sequence_length, hidden_dim]. - """ - hidden_size = x.shape[-1] - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) - - # permuate the dimensions into: - # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) - - def __combine_heads(x): - """ - Transpose and then reshape the last two dimensions of inpunt tensor x - so that it becomes one dimension, which is reverse to __split_heads. - """ - if len(x.shape) == 3: return x - if len(x.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout(weights, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. - k = cache["k"] = layers.concat([layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1) - v = cache["v"] = layers.concat([layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. 
- proj_out = layers.fc(input=out, - size=d_model, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_output_fc.w_0', initializer=param_initializer), - bias_attr=name + '_output_fc.b_0') - return proj_out - - -def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name='ffn'): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. - """ - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act=hidden_act, - param_attr=fluid.ParamAttr(name=name + '_fc_0.w_0', initializer=param_initializer), - bias_attr=name + '_fc_0.b_0') - if dropout_rate: - hidden = layers.dropout(hidden, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.fc(input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_fc_1.w_0', initializer=param_initializer), - bias_attr=name + '_fc_1.b_0') - return out - - -def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., name=''): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out_dtype = out.dtype - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float32") - out = layers.layer_norm(out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.ParamAttr(name=name + '_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr(name=name + '_layer_norm_bias', - initializer=fluid.initializer.Constant(0.))) - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float16") - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout(out, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. 
- """ - attn_output = multi_head_attention(pre_process_layer(enc_input, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_att'), - None, - None, - attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - param_initializer=param_initializer, - name=name + '_multi_head_att') - attn_output = post_process_layer(enc_input, - attn_output, - postprocess_cmd, - prepostprocess_dropout, - name=name + '_post_att') - ffd_output = positionwise_feed_forward(pre_process_layer(attn_output, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_ffn'), - d_inner_hid, - d_model, - relu_dropout, - hidden_act, - param_initializer=param_initializer, - name=name + '_ffn') - return post_process_layer(attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + '_post_ffn') - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer. - """ - for i in range(n_layer): - enc_output = encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - param_initializer=param_initializer, - name=name + '_layer_' + str(i)) - enc_input = enc_output - enc_output = pre_process_layer(enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder") - - return enc_output diff --git a/modules/text/language_model/rbtl3/module.py b/modules/text/language_model/rbtl3/module.py index a60c30a4..500fc42c 100644 --- a/modules/text/language_model/rbtl3/module.py +++ b/modules/text/language_model/rbtl3/module.py @@ -1,7 +1,6 @@ -# coding:utf-8 -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the "License" +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -12,62 +11,120 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+from typing import Dict
 import os
+import math

-from paddlehub import TransformerModule
-from paddlehub.module.module import moduleinfo
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F

-from rbtl3.model.bert import BertConfig, BertModel
+from paddlenlp.transformers.roberta.modeling import RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel
+from paddlenlp.transformers.roberta.tokenizer import RobertaTokenizer
+from paddlenlp.metrics import ChunkEvaluator
+from paddlehub.module.module import moduleinfo
+from paddlehub.module.nlp_module import TransformerModule
+from paddlehub.utils.log import logger


 @moduleinfo(
     name="rbtl3",
-    version="1.0.0",
+    version="2.0.1",
     summary="rbtl3, 3-layer, 1024-hidden, 16-heads, 61M parameters ",
     author="ymcui",
     author_email="ymcui@ir.hit.edu.cn",
     type="nlp/semantic_model",
+    meta=TransformerModule,
 )
-class BertWwm(TransformerModule):
-    def _initialize(self):
-        self.MAX_SEQ_LEN = 512
-        self.params_path = os.path.join(self.directory, "assets", "params")
-        self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt")
+class Roberta(nn.Layer):
+    """
+    RoBERTa model
+    """

-        bert_config_path = os.path.join(self.directory, "assets", "bert_config_rbtl3.json")
-        self.bert_config = BertConfig(bert_config_path)
+    def __init__(
+            self,
+            task: str = None,
+            load_checkpoint: str = None,
+            label_map: Dict = None,
+            num_classes: int = 2,
+            **kwargs,
+    ):
+        super(Roberta, self).__init__()
+        if label_map:
+            self.label_map = label_map
+            self.num_classes = len(label_map)
+        else:
+            self.num_classes = num_classes

-    def net(self, input_ids, position_ids, segment_ids, input_mask):
-        """
-        create neural network.
+        if task == 'sequence_classification':
+            task = 'seq-cls'
+            logger.warning(
+                "current task name 'sequence_classification' was renamed to 'seq-cls', "
+                "'sequence_classification' has been deprecated and will be removed in the future.",
+            )
+        if task == 'seq-cls':
+            self.model = RobertaForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path='rbtl3',
+                num_classes=self.num_classes,
+                **kwargs
+            )
+            self.criterion = paddle.nn.loss.CrossEntropyLoss()
+            self.metric = paddle.metric.Accuracy()
+        elif task == 'token-cls':
+            self.model = RobertaForTokenClassification.from_pretrained(
+                pretrained_model_name_or_path='rbtl3',
+                num_classes=self.num_classes,
+                **kwargs
+            )
+            self.criterion = paddle.nn.loss.CrossEntropyLoss()
+            self.metric = ChunkEvaluator(
+                label_list=[self.label_map[i] for i in sorted(self.label_map.keys())]
+            )
+        elif task is None:
+            self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='rbtl3', **kwargs)
+        else:
+            raise RuntimeError("Unknown task {}, task should be one in {}".format(
+                task, self._tasks_supported))

-        Args:
-            input_ids (tensor): the word ids.
-            position_ids (tensor): the position ids.
-            segment_ids (tensor): the segment ids.
-            input_mask (tensor): the padding mask.
+        self.task = task

-        Returns:
-            pooled_output (tensor): sentence-level output for classification task.
-            sequence_output (tensor): token-level output for sequence task.
-        """
-        bert = BertModel(src_ids=input_ids,
-                         position_ids=position_ids,
-                         sentence_ids=segment_ids,
-                         input_mask=input_mask,
-                         config=self.bert_config,
-                         use_fp16=False)
-        pooled_output = bert.get_pooled_output()
-        sequence_output = bert.get_sequence_output()
-        return pooled_output, sequence_output
+        if load_checkpoint is not None and os.path.isfile(load_checkpoint):
+            state_dict = paddle.load(load_checkpoint)
+            self.set_state_dict(state_dict)
+            logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))

+    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None):
+        result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
+        if self.task == 'seq-cls':
+            logits = result
+            probs = F.softmax(logits, axis=1)
+            if labels is not None:
+                loss = self.criterion(logits, labels)
+                correct = self.metric.compute(probs, labels)
+                acc = self.metric.update(correct)
+                return probs, loss, {'acc': acc}
+            return probs
+        elif self.task == 'token-cls':
+            logits = result
+            token_level_probs = F.softmax(logits, axis=-1)
+            preds = token_level_probs.argmax(axis=-1)
+            if labels is not None:
+                loss = self.criterion(logits, labels.unsqueeze(-1))
+                num_infer_chunks, num_label_chunks, num_correct_chunks = \
+                    self.metric.compute(None, seq_lengths, preds, labels)
+                self.metric.update(
+                    num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
+                _, _, f1_score = map(float, self.metric.accumulate())
+                return token_level_probs, loss, {'f1_score': f1_score}
+            return token_level_probs
+        else:
+            sequence_output, pooled_output = result
+            return sequence_output, pooled_output

-if __name__ == '__main__':
-    test_module = BertWwm()
+    @staticmethod
+    def get_tokenizer(*args, **kwargs):
+        """
+        Gets the tokenizer that is customized for this module.
+        """
+        return RobertaTokenizer.from_pretrained(
+            pretrained_model_name_or_path='rbtl3', *args, **kwargs)
diff --git a/modules/text/language_model/roberta-wwm-ext-large/README.md b/modules/text/language_model/roberta-wwm-ext-large/README.md
index 77d1b02c..4d19bf2b 100644
--- a/modules/text/language_model/roberta-wwm-ext-large/README.md
+++ b/modules/text/language_model/roberta-wwm-ext-large/README.md
@@ -1,5 +1,5 @@
 ```shell
-$ hub install roberta-wwm-ext-large==2.0.0
+$ hub install roberta-wwm-ext-large==2.0.1
 ```


@@ -13,29 +13,35 @@ def __init__(
     task=None,
     load_checkpoint=None,
-    label_map=None)
+    label_map=None,
+    num_classes=2,
+    **kwargs,
+)
 ```

 Creates a Module object (dygraph version).

 **Parameters**

-* `task`: task name; may be `sequence_classification`.
+* `task`: task name; either `seq-cls` (text classification; the former name `sequence_classification` is deprecated and will be removed) or `token-cls` (sequence labeling).
 * `load_checkpoint`: path to model parameters saved with the PaddleHub fine-tune API.
 * `label_map`: label map used at prediction time.
+* `num_classes`: number of classes for classification; can be omitted when `label_map` is given. Defaults to 2.
+* `**kwargs`: extra keyword arguments supplied by the user.

 ```python
 def predict(
     data,
     max_seq_len=128,
     batch_size=1,
-    use_gpu=False)
+    use_gpu=False
+)
 ```

 **Parameters**

 * `data`: data to predict, in the format \[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\]; each element is one sample,
-    and a sample may contain text\_a and text\_b. The number of texts per sample (1 or 2) must match the one used during training.
+  and a sample may contain text\_a and text\_b. The number of texts per sample (1 or 2) must match the one used during training.
 * `max_seq_len`: maximum text length the model processes
 * `batch_size`: batch size
 * `use_gpu`: whether to use the GPU; defaults to False. GPU users are advised to enable use_gpu.
@@ -44,7 +50,9 @@ def predict(
 ```python
 def get_embedding(
-    texts,
+    data,
+    max_seq_len=128,
+    batch_size=1,
     use_gpu=False
 )
 ```
@@ -53,7 +61,9 @@ def get_embedding(
 **Parameters**

-* `texts`: list of input texts in the format \[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\]; each element is one sample, and a sample may contain text\_a and text\_b.
+* `data`: list of input texts in the format \[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\]; each element is one sample, and a sample may contain text\_a and text\_b.
+* `max_seq_len`: maximum text length the model processes.
+* `batch_size`: batch size.
 * `use_gpu`: whether to use the GPU; defaults to False. GPU users are advised to enable use_gpu.

 **Returns**
@@ -67,16 +77,16 @@ def get_embedding(
 import paddlehub as hub

 data = [
-    '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般',
-    '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片',
-    '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。',
+    ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'],
+    ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'],
+    ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'],
 ]
 label_map = {0: 'negative', 1: 'positive'}

 model = hub.Module(
     name='roberta-wwm-ext-large',
-    version='2.0.0',
-    task='sequence_classification',
+    version='2.0.1',
+    task='seq-cls',
     load_checkpoint='/path/to/parameters',
     label_map=label_map)
 results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
@@ -84,7 +94,9 @@ for idx, text in enumerate(data):
     print('Data: {} \t Lable: {}'.format(text, results[idx]))
 ```

-See the PaddleHub text classification demo: https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation
+For details, see the PaddleHub demos:
+- [Text classification](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [Sequence labeling](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)

 ## Serving

@@ -110,12 +122,12 @@ $ hub serving start -m roberta-wwm-ext-large
 import requests
 import json

-# Texts for prediction, wrapped in a dict {"text": [text_1, text_2, ... ]}
-text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]]
-# Pass the texts to the prediction method via a key, "texts" in this example
-# the local equivalent is module.get_embedding(texts=text)
-data = {"texts": text}
-# Send a POST request; the content type must be JSON
+# Texts for embedding extraction, formatted as [[text_1], [text_2], ...]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# Pass the texts to the prediction method via a key, "data" in this example;
+# the local equivalent is module.get_embedding(data=text)
+data = {"data": text}
+# Send a POST request; the content type must be JSON, and the IP in the URL
+# must be changed to that of the serving machine
 url = "http://10.12.121.132:8866/predict/roberta-wwm-ext-large"
 # Set the request headers to application/json
 headers = {"Content-Type": "application/json"}
@@ -126,7 +138,7 @@ print(r.json())

 ## Code

-https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/pretrain_langauge_models/BERT
+https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/paddlenlp/transformers/roberta

 ## Dependencies

@@ -144,3 +156,7 @@ paddlehub >= 2.0.0

 * 2.0.0

   Fully upgraded to dygraph, with API changes.
+
+* 2.0.1
+
+  Task names adjusted; sequence labeling task `token-cls` added
diff --git a/modules/text/language_model/roberta-wwm-ext-large/module.py b/modules/text/language_model/roberta-wwm-ext-large/module.py
index 7785bf1e..aa45811d 100644
--- a/modules/text/language_model/roberta-wwm-ext-large/module.py
+++ b/modules/text/language_model/roberta-wwm-ext-large/module.py
@@ -11,29 +11,31 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, List, Optional, Union, Tuple
+from typing import Dict
 import os
+import math

-from paddle.dataset.common import DATA_HOME
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlehub import BertTokenizer
-from paddlehub.module.modeling_roberta import RobertaModel, RobertaForSequenceClassification
-from paddlehub.module.module import moduleinfo, serving
+from paddlenlp.transformers.roberta.modeling import RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel
+from paddlenlp.transformers.roberta.tokenizer import RobertaTokenizer
+from paddlenlp.metrics import ChunkEvaluator
+from paddlehub.module.module import moduleinfo
+from paddlehub.module.nlp_module import TransformerModule
 from paddlehub.utils.log import logger
-from paddlehub.utils.utils import download


 @moduleinfo(
     name="roberta-wwm-ext-large",
-    version="2.0.0",
+    version="2.0.1",
     summary=
     "chinese-roberta-wwm-ext-large, 24-layer, 1024-hidden, 16-heads, 340M parameters. The module is executed as paddle.dygraph.",
     author="ymcui",
     author_email="ymcui@ir.hit.edu.cn",
     type="nlp/semantic_model",
+    meta=TransformerModule,
 )
 class Roberta(nn.Layer):
     """
@@ -42,181 +44,88 @@ class Roberta(nn.Layer):

     def __init__(
         self,
-        task=None,
-        load_checkpoint=None,
-        label_map=None,
+        task: str = None,
+        load_checkpoint: str = None,
+        label_map: Dict = None,
+        num_classes: int = 2,
+        **kwargs,
     ):
         super(Roberta, self).__init__()
-        # TODO(zhangxuefei): add token_classification task
+        if label_map:
+            self.label_map = label_map
+            self.num_classes = len(label_map)
+        else:
+            self.num_classes = num_classes
+
         if task == 'sequence_classification':
+            task = 'seq-cls'
+            logger.warning(
+                "current task name 'sequence_classification' was renamed to 'seq-cls', "
+                "'sequence_classification' has been deprecated and will be removed in the future.",
+            )
+        if task == 'seq-cls':
             self.model = RobertaForSequenceClassification.from_pretrained(
-                pretrained_model_name_or_path='roberta-wwm-ext-large')
+                pretrained_model_name_or_path='roberta-wwm-ext-large',
+                num_classes=self.num_classes,
+                **kwargs
+            )
             self.criterion = paddle.nn.loss.CrossEntropyLoss()
-            self.metric = paddle.metric.Accuracy(name='acc_accumulation')
+            self.metric = paddle.metric.Accuracy()
+        elif task == 'token-cls':
+            self.model = RobertaForTokenClassification.from_pretrained(
+                pretrained_model_name_or_path='roberta-wwm-ext-large',
+                num_classes=self.num_classes,
+                **kwargs
+            )
+            self.criterion = paddle.nn.loss.CrossEntropyLoss()
+            self.metric = ChunkEvaluator(
+                label_list=[self.label_map[i] for i in sorted(self.label_map.keys())]
+            )
         elif task is None:
-            self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext-large')
+            self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext-large', **kwargs)
         else:
-            raise RuntimeError("Unknown task %s, task should be sequence_classification" % task)
+            raise RuntimeError("Unknown task {}, task should be one in {}".format(
+                task, self._tasks_supported))

         self.task = task
-        self.label_map = label_map

         if load_checkpoint is not None and os.path.isfile(load_checkpoint):
             state_dict = paddle.load(load_checkpoint)
             self.set_state_dict(state_dict)
             logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))

-    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None):
+    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None):
         result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
-        if self.task is not None:
+        if self.task == 'seq-cls':
             logits = result
             probs = F.softmax(logits, axis=1)
             if labels is not None:
                 loss = self.criterion(logits, labels)
                 correct = self.metric.compute(probs, labels)
                 acc = self.metric.update(correct)
-                return probs, loss, acc
+                return probs, loss, {'acc': acc}
             return probs
+        elif self.task == 'token-cls':
+            logits = result
+            token_level_probs = F.softmax(logits, axis=-1)
+            preds = token_level_probs.argmax(axis=-1)
+            if labels is not None:
+                loss = self.criterion(logits, labels.unsqueeze(-1))
+                num_infer_chunks, num_label_chunks, num_correct_chunks = \
+                    self.metric.compute(None, seq_lengths, preds, labels)
+                self.metric.update(
+                    num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
+                _, _, f1_score = map(float, self.metric.accumulate())
+                return token_level_probs, loss, {'f1_score': f1_score}
+            return token_level_probs
         else:
             sequence_output,
pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'roberta-wwm-ext-large', 'vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/roberta_large/vocab.txt" - download(url, os.path.join(DATA_HOME, 'roberta-wwm-ext-large')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return RobertaTokenizer.from_pretrained( + pretrained_model_name_or_path='roberta-wwm-ext-large', *args, **kwargs) diff --git a/modules/text/language_model/roberta-wwm-ext/README.md b/modules/text/language_model/roberta-wwm-ext/README.md index 8f4eeb80..9ee71b85 100644 --- a/modules/text/language_model/roberta-wwm-ext/README.md +++ b/modules/text/language_model/roberta-wwm-ext/README.md @@ -1,5 +1,5 @@ ```shell -$ hub install roberta-wwm-ext==2.0.0 +$ hub install roberta-wwm-ext==2.0.1 ```
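With `task=None` the module keeps the bare `RobertaModel`, so `forward` returns the `(sequence_output, pooled_output)` pair and `get_embedding` (now provided by `TransformerModule`) batches it for callers. A sketch of local use, following the `get_embedding` signature documented in the README above, with sample texts reused from its serving example:

```python
import paddlehub as hub

# task=None loads the backbone only, which is what get_embedding requires.
model = hub.Module(name='roberta-wwm-ext-large', version='2.0.1', task=None)

results = model.get_embedding(
    data=[['今天是个好日子'], ['天气预报说今天要下雨']],
    max_seq_len=128,
    batch_size=1,
    use_gpu=False,
)

# Each result pairs token-level vectors with the pooled [CLS] vector.
sequence_output, pooled_output = results[0]
```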


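For completeness, a hedged end-to-end fine-tuning sketch for the new task, combining the pieces this patch touches (the module, the static `get_tokenizer`, and the `MSRA_NER` dataset updated below). The optimizer and trainer settings are illustrative assumptions, not values taken from the patch:

```python
import paddle
import paddlehub as hub
from paddlehub.datasets import MSRA_NER

# Label list as hard-coded in paddlehub/datasets/msra_ner.py below.
label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
label_map = {idx: label for idx, label in enumerate(label_list)}

model = hub.Module(name='roberta-wwm-ext', version='2.0.1',
                   task='token-cls', label_map=label_map)
tokenizer = model.get_tokenizer()

train_dataset = MSRA_NER(tokenizer=tokenizer, max_seq_len=128, mode='train')
test_dataset = MSRA_NER(tokenizer=tokenizer, max_seq_len=128, mode='test')

# Assumed hyperparameters, for illustration only.
optimizer = paddle.optimizer.AdamW(learning_rate=5e-5, parameters=model.parameters())
trainer = hub.Trainer(model, optimizer, checkpoint_dir='token_cls_ckpt')
trainer.train(train_dataset, epochs=3, batch_size=32, eval_dataset=test_dataset)
```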
@@ -13,29 +13,35 @@ $ hub install roberta-wwm-ext==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -44,7 +50,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -53,7 +61,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -67,16 +77,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='roberta-wwm-ext', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -84,7 +94,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -110,12 +122,12 @@ $ hub serving start -m roberta-wwm-ext import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... 
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/roberta-wwm-ext" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -126,7 +138,7 @@ print(r.json()) ## 查看代码 -https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/pretrain_langauge_models/BERT +https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/paddlenlp/transformers/roberta ## 依赖 @@ -144,3 +156,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/roberta-wwm-ext/module.py b/modules/text/language_model/roberta-wwm-ext/module.py index a4df8146..8fa2bbe7 100644 --- a/modules/text/language_model/roberta-wwm-ext/module.py +++ b/modules/text/language_model/roberta-wwm-ext/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_roberta import RobertaModel, RobertaForSequenceClassification -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.roberta.modeling import RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel +from paddlenlp.transformers.roberta.tokenizer import RobertaTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="roberta-wwm-ext", - version="2.0.0", + version="2.0.1", summary= "chinese-roberta-wwm-ext, 12-layer, 768-hidden, 12-heads, 110M parameters. 
The module is executed as paddle.dygraph.", author="ymcui", author_email="ymcui@ir.hit.edu.cn", type="nlp/semantic_model", + meta=TransformerModule, ) class Roberta(nn.Layer): """ @@ -42,181 +44,88 @@ class Roberta(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Roberta, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': self.model = RobertaForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='roberta-wwm-ext') + pretrained_model_name_or_path='roberta-wwm-ext', + num_classes=self.num_classes, + **kwargs + ) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = RobertaForTokenClassification.from_pretrained( + pretrained_model_name_or_path='roberta-wwm-ext', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext') + self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, pooled_output = result return 
sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'roberta-wwm-ext', 'vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/roberta_base/vocab.txt" - download(url, os.path.join(DATA_HOME, 'roberta-wwm-ext')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(Union(str))`): The processed data (the one sequence or sequence pair) whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return RobertaTokenizer.from_pretrained( + pretrained_model_name_or_path='roberta-wwm-ext', *args, **kwargs) diff --git a/paddlehub/datasets/base_nlp_dataset.py b/paddlehub/datasets/base_nlp_dataset.py index acca7b8c..1c9ae13a 100644 --- a/paddlehub/datasets/base_nlp_dataset.py +++ b/paddlehub/datasets/base_nlp_dataset.py @@ -246,15 +246,9 @@ class TextClassificationDataset(BaseNLPDataset, paddle.io.Dataset): def __getitem__(self, idx): record = self.records[idx] if 'label' in record.keys(): - if isinstance(self.tokenizer, BertTokenizer): - return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['label']) - elif isinstance(self.tokenizer, CustomTokenizer): - return np.array(record['text']), np.array(record['seq_len']), np.array(record['label']) + return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['label'], dtype=np.int64) else: - if isinstance(self.tokenizer, BertTokenizer): - return np.array(record['input_ids']), np.array(record['segment_ids']) - elif isinstance(self.tokenizer, CustomTokenizer): - return np.array(record['text']), np.array(record['seq_len']) + return np.array(record['input_ids']), np.array(record['segment_ids']) def __len__(self): return len(self.records) @@ -269,8 +263,9 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset): data_file: str = None, 
label_file: str = None, label_list: list = None, - split_char="\002", - no_entity_label="O", + split_char: str ="\002", + no_entity_label: str = "O", + ignore_label: int = -100, is_file_with_header: bool = False): super(SeqLabelingDataset, self).__init__( base_path=base_path, @@ -283,6 +278,7 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset): self.no_entity_label = no_entity_label self.split_char = split_char + self.ignore_label = ignore_label self.examples = self._read_file(self.data_file, is_file_with_header) self.records = self._convert_examples_to_records(self.examples) @@ -327,8 +323,7 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset): continue if labels: record["label"] = [] - tokens_with_specical_token = self.tokenizer.decode( - record, only_convert_to_tokens=True) + tokens_with_specical_token = self.tokenizer.convert_ids_to_tokens(record['input_ids']) tokens_index = 0 for token in tokens_with_specical_token: if tokens_index < len( @@ -336,6 +331,8 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset): record["label"].append( self.label_list.index(labels[tokens_index])) tokens_index += 1 + elif token in [self.tokenizer.pad_token]: + record["label"].append(self.ignore_label) # label of special token else: record["label"].append( self.label_list.index(self.no_entity_label)) @@ -351,7 +348,7 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset): ret_tokens = [] ret_labels = [] for token, label in zip(tokens, labels): - sub_token = self.tokenizer.tokenize(token) + sub_token = self.tokenizer(token) if len(sub_token) == 0: continue ret_tokens.extend(sub_token) @@ -370,7 +367,7 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset): else: ret_tokens = [] for token in tokens: - sub_token = self.tokenizer.tokenize(token) + sub_token = self.tokenizer(token) if len(sub_token) == 0: continue ret_tokens.extend(sub_token) @@ -381,15 +378,9 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset): def __getitem__(self, idx): record = self.records[idx] if 'label' in record.keys(): - if isinstance(self.tokenizer, BertTokenizer): - return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['label']) - else: # TODO(chenxiaojie): add CustomTokenizer supported - raise NotImplementedError + return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['seq_len']), np.array(record['label'], dtype=np.int64) else: - if isinstance(self.tokenizer, BertTokenizer): - return np.array(record['input_ids']), np.array(record['segment_ids']) - else: # TODO(chenxiaojie): add CustomTokenizer supported - raise NotImplementedError + return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['seq_len']) def __len__(self): return len(self.records) diff --git a/paddlehub/datasets/msra_ner.py b/paddlehub/datasets/msra_ner.py index c9f4b2e3..8440e7c0 100644 --- a/paddlehub/datasets/msra_ner.py +++ b/paddlehub/datasets/msra_ner.py @@ -31,8 +31,16 @@ class MSRA_NER(SeqLabelingDataset): for research purposes. 
For more information please refer to https://www.microsoft.com/en-us/download/details.aspx?id=52531 """ - def __init__(self, tokenizer: Union[BertTokenizer, CustomTokenizer], max_seq_len: int = 128, mode: str = 'train'): + + def __init__( + self, + tokenizer: Union[BertTokenizer, CustomTokenizer], + max_seq_len: int = 128, + mode: str = 'train', + ): base_path = os.path.join(DATA_HOME, "msra_ner") + label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"] + if mode == 'train': data_file = 'train.tsv' elif mode == 'test': @@ -46,6 +54,6 @@ class MSRA_NER(SeqLabelingDataset): mode=mode, data_file=data_file, label_file=None, - label_list=["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"], + label_list=label_list, is_file_with_header=True, ) diff --git a/paddlehub/module/modeling_bert.py b/paddlehub/module/modeling_bert.py deleted file mode 100644 index 5ab602b8..00000000 --- a/paddlehub/module/modeling_bert.py +++ /dev/null @@ -1,289 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# FIXME(zhangxuefei): remove this file after paddlenlp is released. - -import paddle -import paddle.nn as nn - -from paddlehub.module.nlp_module import PretrainedModel, register_base_model - - -class BertEmbeddings(nn.Layer): - """ - Include embeddings from word, position and token_type embeddings - """ - - def __init__(self, - vocab_size, - hidden_size=768, - hidden_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16): - super(BertEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=0) - self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) - self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size) - self.layer_norm = nn.LayerNorm(hidden_size) - self.dropout = nn.Dropout(hidden_dropout_prob) - - def forward(self, input_ids, token_type_ids=None, position_ids=None): - if position_ids is None: - # maybe need use shape op to unify static graph and dynamic graph - seq_length = input_ids.shape[1] - position_ids = paddle.arange(0, seq_length, dtype="int64") - if token_type_ids is None: - token_type_ids = paddle.zeros_like(input_ids, dtype="int64") - - input_embedings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = input_embedings + position_embeddings + token_type_embeddings - embeddings = self.layer_norm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class BertPooler(nn.Layer): - """ - """ - - def __init__(self, hidden_size): - super(BertPooler, self).__init__() - self.dense = nn.Linear(hidden_size, hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class BertPretrainedModel(PretrainedModel): - """ - An abstract class for pretrained BERT models. It provides BERT related - `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, - `pretrained_init_configuration`, `base_model_prefix` for downloading and - loading pretrained models. See `PretrainedModel` for more details. - """ - - model_config_file = "model_config.json" - pretrained_init_configuration = { - "bert-base-uncased": { - "vocab_size": 30522, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-large-uncased": { - "vocab_size": 30522, - "hidden_size": 1024, - "num_hidden_layers": 24, - "num_attention_heads": 16, - "intermediate_size": 4096, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-multilingual-uncased": { - "vocab_size": 105879, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-cased": { - "vocab_size": 30522, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-chinese": { - "vocab_size": 21128, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-multilingual-cased": { - "vocab_size": 119547, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-large-cased": { - "vocab_size": 28996, - "hidden_size": 1024, - "num_hidden_layers": 24, - "num_attention_heads": 16, - "intermediate_size": 4096, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - } - resource_files_names = {"model_state": "model_state.pdparams"} - pretrained_resource_files_map = { - "model_state": { - "bert-base-uncased": "https://paddlenlp.bj.bcebos.com/models/transformers/bert-base-uncased.pdparams", - "bert-large-uncased": "https://paddlenlp.bj.bcebos.com/models/transformers/bert-large-uncased.pdparams", - "bert-base-multilingual-uncased": - 
"http://paddlenlp.bj.bcebos.com/models/transformers/bert-base-multilingual-uncased.pdparams", - "bert-base-cased": "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-base-cased.pdparams", - "bert-base-chinese": "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-base-chinese.pdparams", - "bert-base-multilingual-cased": - "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-base-multilingual-cased.pdparamss", - "bert-large-cased": "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-large-cased.pdparams" - } - } - base_model_prefix = "bert" - - def init_weights(self, layer): - """ Initialization hook """ - if isinstance(layer, (nn.Linear, nn.Embedding)): - # only support dygraph, use truncated_normal and make it inplace - # and configurable later - layer.weight.set_value( - paddle.tensor.normal( - mean=0.0, - std=self.initializer_range - if hasattr(self, "initializer_range") else self.bert.config["initializer_range"], - shape=layer.weight.shape)) - elif isinstance(layer, nn.LayerNorm): - layer._epsilon = 1e-12 - - -@register_base_model -class BertModel(BertPretrainedModel): - """ - """ - - def __init__(self, - vocab_size, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - initializer_range=0.02, - pad_token_id=0): - super(BertModel, self).__init__() - self.pad_token_id = pad_token_id - self.initializer_range = initializer_range - self.embeddings = BertEmbeddings(vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, - type_vocab_size) - encoder_layer = nn.TransformerEncoderLayer( - hidden_size, - num_attention_heads, - intermediate_size, - dropout=hidden_dropout_prob, - activation=hidden_act, - attn_dropout=attention_probs_dropout_prob, - act_dropout=0) - self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers) - self.pooler = BertPooler(hidden_size) - self.apply(self.init_weights) - - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): - if attention_mask is None: - attention_mask = paddle.unsqueeze( - (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e9, axis=[1, 2]) - embedding_output = self.embeddings( - input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids) - encoder_outputs = self.encoder(embedding_output, attention_mask) - sequence_output = encoder_outputs - pooled_output = self.pooler(sequence_output) - return sequence_output, pooled_output - - -class BertForSequenceClassification(BertPretrainedModel): - """ - Model for sentence (pair) classification task with BERT. - Args: - bert (BertModel): An instance of BertModel. - num_classes (int, optional): The number of classes. Default 2 - dropout (float, optional): The dropout probability for output of BERT. - If None, use the same value as `hidden_dropout_prob` of `BertModel` - instance `bert`. 
Default None - """ - - def __init__(self, bert, num_classes=2, dropout=None): - super(BertForSequenceClassification, self).__init__() - self.num_classes = num_classes - self.bert = bert # allow bert to be config - self.dropout = nn.Dropout(dropout if dropout is not None else self.bert.config["hidden_dropout_prob"]) - self.classifier = nn.Linear(self.bert.config["hidden_size"], num_classes) - self.apply(self.init_weights) - - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): - _, pooled_output = self.bert( - input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask) - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - return logits diff --git a/paddlehub/module/modeling_ernie.py b/paddlehub/module/modeling_ernie.py deleted file mode 100644 index ef43785b..00000000 --- a/paddlehub/module/modeling_ernie.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# FIXME(zhangxuefei): remove this file after paddlenlp is released. - -import paddle -import paddle.nn as nn - -from paddlehub.module.nlp_module import PretrainedModel, register_base_model - - -class ErnieEmbeddings(nn.Layer): - """ - Include embeddings from word, position and token_type embeddings - """ - - def __init__(self, - vocab_size, - hidden_size=768, - hidden_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - pad_token_id=0): - super(ErnieEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=pad_token_id) - self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) - self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size) - self.layer_norm = nn.LayerNorm(hidden_size) - self.dropout = nn.Dropout(hidden_dropout_prob) - - def forward(self, input_ids, token_type_ids=None, position_ids=None): - if position_ids is None: - # maybe need use shape op to unify static graph and dynamic graph - seq_length = input_ids.shape[1] - position_ids = paddle.arange(0, seq_length, dtype="int64") - if token_type_ids is None: - token_type_ids = paddle.zeros_like(input_ids, dtype="int64") - - input_embedings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = input_embedings + position_embeddings + token_type_embeddings - embeddings = self.layer_norm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class ErniePooler(nn.Layer): - """ - """ - - def __init__(self, hidden_size): - super(ErniePooler, self).__init__() - self.dense = nn.Linear(hidden_size, hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class ErniePretrainedModel(PretrainedModel): - """ - An abstract class for pretrained ERNIE models. It provides ERNIE related - `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, - `pretrained_init_configuration`, `base_model_prefix` for downloading and - loading pretrained models. See `PretrainedModel` for more details. - """ - - model_config_file = "model_config.json" - pretrained_init_configuration = { - "ernie": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "relu", - "hidden_dropout_prob": 0.1, - "hidden_size": 768, - "initializer_range": 0.02, - "max_position_embeddings": 513, - "num_attention_heads": 12, - "num_hidden_layers": 12, - "type_vocab_size": 2, - "vocab_size": 18000, - "pad_token_id": 0, - }, - "ernie_tiny": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "relu", - "hidden_dropout_prob": 0.1, - "hidden_size": 1024, - "initializer_range": 0.02, - "intermediate_size": 4096, - "max_position_embeddings": 600, - "num_attention_heads": 16, - "num_hidden_layers": 3, - "type_vocab_size": 2, - "vocab_size": 50006, - "pad_token_id": 0, - }, - "ernie_v2_eng_base": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 768, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "num_attention_heads": 12, - "num_hidden_layers": 12, - "type_vocab_size": 4, - "vocab_size": 30522, - "pad_token_id": 0, - }, - "ernie_v2_eng_large": { - "attention_probs_dropout_prob": 0.1, - "intermediate_size": 4096, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 1024, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "num_attention_heads": 16, - "num_hidden_layers": 24, - "type_vocab_size": 4, - "vocab_size": 30522, - "pad_token_id": 0, - }, - } - resource_files_names = {"model_state": "model_state.pdparams"} - pretrained_resource_files_map = { - "model_state": { - "ernie": - "https://paddlenlp.bj.bcebos.com/models/transformers/ernie/ernie_v1_chn_base.pdparams", - "ernie_tiny": - "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/ernie_tiny.pdparams", - "ernie_v2_eng_base": - "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_base/ernie_v2_eng_base.pdparams", - "ernie_v2_eng_large": - "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_large/ernie_v2_eng_large.pdparams", - } - } - base_model_prefix = "ernie" - - def init_weights(self, layer): - """ Initialization hook """ - if isinstance(layer, (nn.Linear, nn.Embedding)): - # only support dygraph, use truncated_normal and make it inplace - # and configurable later - layer.weight.set_value( - paddle.tensor.normal( - mean=0.0, - std=self.initializer_range - if hasattr(self, "initializer_range") else self.ernie.config["initializer_range"], - shape=layer.weight.shape)) - - -@register_base_model -class ErnieModel(ErniePretrainedModel): - """ - """ - - def __init__(self, - vocab_size, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - initializer_range=0.02, - pad_token_id=0): - super(ErnieModel, self).__init__() - self.pad_token_id = pad_token_id - self.initializer_range = initializer_range - self.embeddings = 
ErnieEmbeddings(vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, - type_vocab_size, pad_token_id) - encoder_layer = nn.TransformerEncoderLayer( - hidden_size, - num_attention_heads, - intermediate_size, - dropout=hidden_dropout_prob, - activation=hidden_act, - attn_dropout=attention_probs_dropout_prob, - act_dropout=0) - self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers) - self.pooler = ErniePooler(hidden_size) - self.apply(self.init_weights) - - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): - if attention_mask is None: - attention_mask = paddle.unsqueeze( - (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e9, axis=[1, 2]) - embedding_output = self.embeddings( - input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids) - encoder_outputs = self.encoder(embedding_output, attention_mask) - sequence_output = encoder_outputs - pooled_output = self.pooler(sequence_output) - return sequence_output, pooled_output - - -class ErnieForSequenceClassification(ErniePretrainedModel): - """ - Model for sentence (pair) classification task with ERNIE. - Args: - ernie (ErnieModel): An instance of `ErnieModel`. - num_classes (int, optional): The number of classes. Default 2 - dropout (float, optional): The dropout probability for output of ERNIE. - If None, use the same value as `hidden_dropout_prob` of `ErnieModel` - instance `Ernie`. Default None - """ - - def __init__(self, ernie, num_classes=2, dropout=None): - super(ErnieForSequenceClassification, self).__init__() - self.num_classes = num_classes - self.ernie = ernie # allow ernie to be config - self.dropout = nn.Dropout(dropout if dropout is not None else self.ernie.config["hidden_dropout_prob"]) - self.classifier = nn.Linear(self.ernie.config["hidden_size"], num_classes) - self.apply(self.init_weights) - - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): - _, pooled_output = self.ernie( - input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask) - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - return logits diff --git a/paddlehub/module/modeling_roberta.py b/paddlehub/module/modeling_roberta.py deleted file mode 100644 index 62d75539..00000000 --- a/paddlehub/module/modeling_roberta.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# FIXME(zhangxuefei): remove this file after paddlenlp is released. 
- -import paddle -import paddle.nn as nn - -from paddlehub.module.nlp_module import PretrainedModel, register_base_model - - -class RobertaEmbeddings(nn.Layer): - """ - Include embeddings from word, position and token_type embeddings - """ - - def __init__(self, - vocab_size, - hidden_size=768, - hidden_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - pad_token_id=0): - super(RobertaEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=pad_token_id) - self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) - self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size) - self.layer_norm = nn.LayerNorm(hidden_size) - self.dropout = nn.Dropout(hidden_dropout_prob) - - def forward(self, input_ids, token_type_ids=None, position_ids=None): - if position_ids is None: - # maybe need use shape op to unify static graph and dynamic graph - seq_length = input_ids.shape[1] - position_ids = paddle.arange(0, seq_length, dtype="int64") - if token_type_ids is None: - token_type_ids = paddle.zeros_like(input_ids, dtype="int64") - - input_embedings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = input_embedings + position_embeddings + token_type_embeddings - embeddings = self.layer_norm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class RobertaPooler(nn.Layer): - """ - """ - - def __init__(self, hidden_size): - super(RobertaPooler, self).__init__() - self.dense = nn.Linear(hidden_size, hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class RobertaPretrainedModel(PretrainedModel): - """ - An abstract class for pretrained RoBERTa models. It provides RoBERTa related - `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, - `pretrained_init_configuration`, `base_model_prefix` for downloading and - loading pretrained models. See `PretrainedModel` for more details. 
- """ - - model_config_file = "model_config.json" - pretrained_init_configuration = { - "roberta-wwm-ext": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 768, - "initializer_range": 0.02, - "intermediate_size": 3072, - "max_position_embeddings": 512, - "num_attention_heads": 12, - "num_hidden_layers": 12, - "type_vocab_size": 2, - "vocab_size": 21128, - "pad_token_id": 0 - }, - "roberta-wwm-ext-large": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 1024, - "initializer_range": 0.02, - "intermediate_size": 4096, - "max_position_embeddings": 512, - "num_attention_heads": 16, - "num_hidden_layers": 24, - "type_vocab_size": 2, - "vocab_size": 21128, - "pad_token_id": 0 - } - } - resource_files_names = {"model_state": "model_state.pdparams"} - pretrained_resource_files_map = { - "model_state": { - "roberta-wwm-ext": - "https://paddlenlp.bj.bcebos.com/models/transformers/roberta_base/roberta_chn_base.pdparams", - "roberta-wwm-ext-large": - "https://paddlenlp.bj.bcebos.com/models/transformers/roberta_large/roberta_chn_large.pdparams", - } - } - base_model_prefix = "roberta" - - def init_weights(self, layer): - """ Initialization hook """ - if isinstance(layer, (nn.Linear, nn.Embedding)): - # only support dygraph, use truncated_normal and make it inplace - # and configurable later - layer.weight.set_value( - paddle.tensor.normal( - mean=0.0, - std=self.initializer_range - if hasattr(self, "initializer_range") else self.roberta.config["initializer_range"], - shape=layer.weight.shape)) - elif isinstance(layer, nn.LayerNorm): - layer._epsilon = 1e-12 - - -@register_base_model -class RobertaModel(RobertaPretrainedModel): - """ - """ - - def __init__(self, - vocab_size, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - initializer_range=0.02, - pad_token_id=0): - super(RobertaModel, self).__init__() - self.pad_token_id = pad_token_id - self.initializer_range = initializer_range - self.embeddings = RobertaEmbeddings(vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, - type_vocab_size, pad_token_id) - encoder_layer = nn.TransformerEncoderLayer( - hidden_size, - num_attention_heads, - intermediate_size, - dropout=hidden_dropout_prob, - activation=hidden_act, - attn_dropout=attention_probs_dropout_prob, - act_dropout=0) - self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers) - self.pooler = RobertaPooler(hidden_size) - self.apply(self.init_weights) - - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): - if attention_mask is None: - attention_mask = paddle.unsqueeze( - (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e9, axis=[1, 2]) - embedding_output = self.embeddings( - input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids) - encoder_outputs = self.encoder(embedding_output, attention_mask) - sequence_output = encoder_outputs - pooled_output = self.pooler(sequence_output) - return sequence_output, pooled_output - - -class RobertaForSequenceClassification(RobertaPretrainedModel): - """ - Model for sentence (pair) classification task with RoBERTa. - Args: - roberta (RobertaModel): An instance of `RobertaModel`. - num_classes (int, optional): The number of classes. 
Default 2 - dropout (float, optional): The dropout probability for output of RoBERTa. - If None, use the same value as `hidden_dropout_prob` of `RobertaModel` - instance `Roberta`. Default None - """ - - def __init__(self, roberta, num_classes=2, dropout=None): - super(RobertaForSequenceClassification, self).__init__() - self.num_classes = num_classes - self.roberta = roberta # allow roberta to be config - self.dropout = nn.Dropout(dropout if dropout is not None else self.roberta.config["hidden_dropout_prob"]) - self.classifier = nn.Linear(self.roberta.config["hidden_size"], num_classes) - self.apply(self.init_weights) - - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): - _, pooled_output = self.roberta( - input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask) - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - return logits diff --git a/paddlehub/module/nlp_module.py b/paddlehub/module/nlp_module.py index 37c86f3a..ddfd546c 100644 --- a/paddlehub/module/nlp_module.py +++ b/paddlehub/module/nlp_module.py @@ -453,8 +453,11 @@ class TransformerModule(RunModule, TextServing): Returns: results(:obj: Dict) : The model outputs, such as loss and metrics. """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} + if self.task == 'seq-cls': + predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) + elif self.task == 'token-cls': + predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], seq_lengths=batch[2], labels=batch[3]) + return {'loss': avg_loss, 'metrics': metric} def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): """ @@ -466,8 +469,11 @@ class TransformerModule(RunModule, TextServing): Returns: results(:obj: Dict) : The model outputs, such as metrics. """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} + if self.task == 'seq-cls': + predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) + elif self.task == 'token-cls': + predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], seq_lengths=batch[2], labels=batch[3]) + return {'metrics': metric} def get_embedding(self, data: List[List[str]], max_seq_len=128, batch_size=1, use_gpu=False): """ diff --git a/requirements.txt b/requirements.txt index fe998426..8102079e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,4 @@ tqdm visualdl >= 2.0.0 # gunicorn not support windows gunicorn >= 19.10.0; sys_platform != "win32" -paddlenlp >= 2.0.0b \ No newline at end of file +paddlenlp >= 2.0.0b2 \ No newline at end of file -- GitLab
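Taken together, the dataset and runner changes define a simple contract: `SeqLabelingDataset.__getitem__` now emits a four-tuple whose third element carries `seq_len` for `ChunkEvaluator`, and `training_step`/`validation_step` dispatch on `self.task`. A small sketch of that contract with toy values (the ids are not real vocabulary entries):

```python
import numpy as np

# Shape of one record after SeqLabelingDataset._convert_examples_to_records.
record = {
    'input_ids': [101, 2769, 102, 0],   # toy ids; the trailing 0 is padding
    'segment_ids': [0, 0, 0, 0],
    'seq_len': 3,
    'label': [6, 0, 6, -100],           # ignore_label=-100 marks the pad token
}

# __getitem__ returns, in order: input_ids, segment_ids, seq_len, label,
# with labels cast to int64 as in the diff above.
batch_item = (
    np.array(record['input_ids']),
    np.array(record['segment_ids']),
    np.array(record['seq_len']),
    np.array(record['label'], dtype=np.int64),
)

# For task='token-cls', training_step then unpacks a collated batch as
#   self(input_ids=batch[0], token_type_ids=batch[1],
#        seq_lengths=batch[2], labels=batch[3])
```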