From f67ad5be678278c67e8714bd08dacb38bdb0ebb2 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 30 Dec 2020 23:57:48 +0800 Subject: [PATCH] Update transformer modules (#1147) * Add token-cls task for transformer modules * Fix numpy dtype mismatch in windows * Update README.md * Update token-cls task for ernie_tiny * Update token-cls task for ernie_tiny * Update token-cls task for other Transformer modules * Update README.md of modules and token-cls demo * Add chunk_scheme arg check in SeqLabelingDataset * Update ChunkEvaluator and paddlenlp requirement * Update README.md * Update token-cls demo --- demo/sequence_labeling/README.md | 41 ++- demo/sequence_labeling/train.py | 52 ++- .../language_model/bert-base-cased/README.md | 52 +-- .../language_model/bert-base-cased/module.py | 215 ++++--------- .../bert-base-chinese/README.md | 55 ++-- .../bert-base-chinese/module.py | 221 ++++--------- .../bert-base-multilingual-cased/README.md | 55 ++-- .../bert-base-multilingual-cased/module.py | 221 ++++--------- .../bert-base-multilingual-uncased/README.md | 55 ++-- .../bert-base-multilingual-uncased/module.py | 222 ++++--------- .../bert-base-uncased/README.md | 55 ++-- .../bert-base-uncased/module.py | 215 ++++--------- .../language_model/bert-large-cased/README.md | 55 ++-- .../language_model/bert-large-cased/module.py | 214 ++++--------- .../bert-large-uncased/README.md | 57 ++-- .../bert-large-uncased/module.py | 215 ++++--------- .../language_model/chinese_bert_wwm/README.md | 250 +++++---------- .../chinese_bert_wwm/model/__init__.py | 0 .../chinese_bert_wwm/model/bert.py | 197 ------------ .../model/transformer_encoder.py | 295 ------------------ .../language_model/chinese_bert_wwm/module.py | 148 ++++++--- .../chinese_bert_wwm_ext/README.md | 152 +++++---- .../chinese_bert_wwm_ext/model/__init__.py | 0 .../chinese_bert_wwm_ext/model/bert.py | 197 ------------ .../model/transformer_encoder.py | 295 ------------------ .../chinese_bert_wwm_ext/module.py | 148 ++++++--- modules/text/language_model/ernie/README.md | 53 ++-- modules/text/language_model/ernie/module.py | 214 ++++--------- .../text/language_model/ernie_tiny/README.md | 34 +- .../text/language_model/ernie_tiny/module.py | 73 ++--- .../ernie_v2_eng_base/README.md | 52 +-- .../ernie_v2_eng_base/module.py | 221 ++++--------- .../ernie_v2_eng_large/README.md | 54 ++-- .../ernie_v2_eng_large/module.py | 221 ++++--------- modules/text/language_model/rbt3/README.md | 152 +++++---- .../language_model/rbt3/model/__init__.py | 0 .../text/language_model/rbt3/model/bert.py | 197 ------------ .../rbt3/model/transformer_encoder.py | 295 ------------------ modules/text/language_model/rbt3/module.py | 145 ++++++--- modules/text/language_model/rbtl3/README.md | 152 +++++---- .../language_model/rbtl3/model/__init__.py | 0 .../text/language_model/rbtl3/model/bert.py | 197 ------------ .../rbtl3/model/transformer_encoder.py | 295 ------------------ modules/text/language_model/rbtl3/module.py | 145 ++++++--- .../roberta-wwm-ext-large/README.md | 56 ++-- .../roberta-wwm-ext-large/module.py | 219 ++++--------- .../language_model/roberta-wwm-ext/README.md | 56 ++-- .../language_model/roberta-wwm-ext/module.py | 219 ++++--------- paddlehub/datasets/base_nlp_dataset.py | 35 +-- paddlehub/datasets/msra_ner.py | 12 +- paddlehub/module/modeling_bert.py | 289 ----------------- paddlehub/module/modeling_ernie.py | 243 --------------- paddlehub/module/modeling_roberta.py | 215 ------------- paddlehub/module/nlp_module.py | 14 +- requirements.txt | 2 
+- 55 files changed, 2111 insertions(+), 5431 deletions(-) delete mode 100644 modules/text/language_model/chinese_bert_wwm/model/__init__.py delete mode 100644 modules/text/language_model/chinese_bert_wwm/model/bert.py delete mode 100644 modules/text/language_model/chinese_bert_wwm/model/transformer_encoder.py delete mode 100644 modules/text/language_model/chinese_bert_wwm_ext/model/__init__.py delete mode 100644 modules/text/language_model/chinese_bert_wwm_ext/model/bert.py delete mode 100644 modules/text/language_model/chinese_bert_wwm_ext/model/transformer_encoder.py delete mode 100644 modules/text/language_model/rbt3/model/__init__.py delete mode 100644 modules/text/language_model/rbt3/model/bert.py delete mode 100644 modules/text/language_model/rbt3/model/transformer_encoder.py delete mode 100644 modules/text/language_model/rbtl3/model/__init__.py delete mode 100644 modules/text/language_model/rbtl3/model/bert.py delete mode 100644 modules/text/language_model/rbtl3/model/transformer_encoder.py delete mode 100644 paddlehub/module/modeling_bert.py delete mode 100644 paddlehub/module/modeling_ernie.py delete mode 100644 paddlehub/module/modeling_roberta.py diff --git a/demo/sequence_labeling/README.md b/demo/sequence_labeling/README.md index fda17c32..04c3450a 100644 --- a/demo/sequence_labeling/README.md +++ b/demo/sequence_labeling/README.md @@ -28,10 +28,21 @@ python train.py 使用PaddleHub Fine-tune API进行Fine-tune可以分为4个步骤。 ### Step1: 选择模型 + +在命名实体识别的任务中,因不同的数据集标识实体的标签不同,评测的方式也有所差异。因此,在初始化模型的之前,需要先确定实际标签的形式,下方的`label_list`则是MSRA-NER数据集中使用的标签类别。 +如果用户使用的实体识别的数据集的标签方式与MSRA-NER不同,则需要自行根据数据集确定。 +```python +label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"] +label_map = { + idx: label for idx, label in enumerate(label_list) +} +``` + +接下来创建任务所使用的`model` ```python import paddlehub as hub -model = hub.Module(name='ernie_tiny', version='2.0.1', task='token-cls') +model = hub.Module(name='ernie_tiny', version='2.0.1', task='token-cls', label_map=label_map) ``` 其中,参数: @@ -40,7 +51,29 @@ model = hub.Module(name='ernie_tiny', version='2.0.1', task='token-cls') * `version`:module版本号 * `task`:fine-tune任务。此处为`token-cls`,表示序列标注任务。 -通过以上的一行代码,`model`初始化为一个适用于序列标注任务的模型,为ERNIE Tiny的预训练模型后拼接上一个输出token共享的全连接网络(Full Connected)。 +PaddleHub还提供BERT等模型可供选择, 当前支持序列标注任务的模型对应的加载示例如下: + +模型名 | PaddleHub Module +---------------------------------- | :------: +ERNIE, Chinese | `hub.Module(name='ernie')` +ERNIE tiny, Chinese | `hub.Module(name='ernie_tiny')` +ERNIE 2.0 Base, English | `hub.Module(name='ernie_v2_eng_base')` +ERNIE 2.0 Large, English | `hub.Module(name='ernie_v2_eng_large')` +BERT-Base, Cased | `hub.Module(name='bert-base-cased')` +BERT-Base, Uncased | `hub.Module(name='bert-base-uncased')` +BERT-Large, Cased | `hub.Module(name='bert-large-cased')` +BERT-Large, Uncased | `hub.Module(name='bert-large-uncased')` +BERT-Base, Multilingual Cased | `hub.Module(nane='bert-base-multilingual-cased')` +BERT-Base, Multilingual Uncased | `hub.Module(nane='bert-base-multilingual-uncased')` +BERT-Base, Chinese | `hub.Module(name='bert-base-chinese')` +BERT-wwm, Chinese | `hub.Module(name='chinese-bert-wwm')` +BERT-wwm-ext, Chinese | `hub.Module(name='chinese-bert-wwm-ext')` +RoBERTa-wwm-ext, Chinese | `hub.Module(name='roberta-wwm-ext')` +RoBERTa-wwm-ext-large, Chinese | `hub.Module(name='roberta-wwm-ext-large')` +RBT3, Chinese | `hub.Module(name='rbt3')` +RBTL3, Chinese | `hub.Module(name='rbtl3')` + +通过以上的一行代码,`model`初始化为一个适用于序列标注任务的模型,为ERNIE Tiny的预训练模型后拼接上一个输出token共享的全连接网络(Full Connected)。 
![](https://ss1.bdstatic.com/70cFuXSh_Q1YnxGkpoWK1HF6hhy/it/u=224484727,3049769188&fm=15&gp=0.jpg) 以上图片来自于:https://arxiv.org/pdf/1810.04805.pdf @@ -49,9 +82,9 @@ model = hub.Module(name='ernie_tiny', version='2.0.1', task='token-cls') ```python train_dataset = hub.datasets.MSRA_NER( - tokenizer=model.get_tokenizer(tokenize_chinese_chars=True), max_seq_len=50, mode='train') + tokenizer=model.get_tokenizer(), max_seq_len=128, mode='train') dev_dataset = hub.datasets.MSRA_NER( - tokenizer=model.get_tokenizer(tokenize_chinese_chars=True), max_seq_len=50, mode='dev') + tokenizer=model.get_tokenizer(), max_seq_len=128, mode='dev') ``` * `tokenizer`:表示该module所需用到的tokenizer,其将对输入文本完成切词,并转化成module运行所需模型输入格式。 diff --git a/demo/sequence_labeling/train.py b/demo/sequence_labeling/train.py index 43a81fb4..3e26d20b 100644 --- a/demo/sequence_labeling/train.py +++ b/demo/sequence_labeling/train.py @@ -14,32 +14,60 @@ import paddle import paddlehub as hub +from paddlehub.datasets import MSRA_NER + +import ast +import argparse + +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.") +parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for fine-tuning, input should be True or False") +parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") +parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.") +parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") +parser.add_argument("--checkpoint_dir", type=str, default='./checkpoint', help="Directory to model checkpoint") +parser.add_argument("--save_interval", type=int, default=1, help="Save checkpoint every n epoch.") + +args = parser.parse_args() + if __name__ == '__main__': label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"] label_map = { idx: label for idx, label in enumerate(label_list) } + model = hub.Module( name='ernie_tiny', version='2.0.1', task='token-cls', - label_map=label_map, + label_map=label_map, # Required for token classification task ) - train_dataset = hub.datasets.MSRA_NER( - tokenizer=model.get_tokenizer(), - max_seq_len=128, + tokenizer = model.get_tokenizer() + train_dataset = MSRA_NER( + tokenizer=tokenizer, + max_seq_len=args.max_seq_len, mode='train' ) - - dev_dataset = hub.datasets.MSRA_NER( - tokenizer=model.get_tokenizer(), - max_seq_len=50, + dev_dataset = MSRA_NER( + tokenizer=tokenizer, + max_seq_len=args.max_seq_len, mode='dev' ) + test_dataset = MSRA_NER( + tokenizer=tokenizer, + max_seq_len=args.max_seq_len, + mode='test' + ) - optimizer = paddle.optimizer.AdamW(learning_rate=5e-5, parameters=model.parameters()) - trainer = hub.Trainer(model, optimizer, checkpoint_dir='token_cls_save_dir', use_gpu=True) - - trainer.train(train_dataset, epochs=3, batch_size=32, eval_dataset=dev_dataset, save_interval=1) + optimizer = paddle.optimizer.AdamW(learning_rate=args.learning_rate, parameters=model.parameters()) + trainer = hub.Trainer(model, optimizer, checkpoint_dir=args.checkpoint_dir, use_gpu=args.use_gpu) + trainer.train( + train_dataset, + epochs=args.num_epoch, + batch_size=args.batch_size, + eval_dataset=dev_dataset, + save_interval=args.save_interval, + ) + trainer.evaluate(test_dataset, batch_size=args.batch_size) diff --git a/modules/text/language_model/bert-base-cased/README.md 
b/modules/text/language_model/bert-base-cased/README.md index 2d6aac86..f75cfd72 100644 --- a/modules/text/language_model/bert-base-cased/README.md +++ b/modules/text/language_model/bert-base-cased/README.md @@ -1,5 +1,5 @@ ```shell -$ hub install bert-base-cased==2.0.0 +$ hub install bert-base-cased==2.0.1 ```
@@ -14,23 +14,29 @@ $ hub install bert-base-cased==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** @@ -45,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -54,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -68,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='bert-base-cased', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -85,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -111,12 +123,12 @@ $ hub serving start -m bert-base-cased import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... ]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-base-cased" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -149,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-base-cased/module.py b/modules/text/language_model/bert-base-cased/module.py index 92a8b7d2..8b7b75d5 100644 --- a/modules/text/language_model/bert-base-cased/module.py +++ b/modules/text/language_model/bert-base-cased/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-base-cased", - version="2.0.0", + version="2.0.1", summary= "bert_cased_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ BERT model @@ -41,181 +43,80 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': - self.model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-base-cased') + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-base-cased', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained(pretrained_model_name_or_path='bert-base-cased', num_classes=self.num_classes, **kwargs) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-cased') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-cased', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, 
token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-base-cased', 'bert-base-cased-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-cased-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-base-cased')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. 
- max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - - encoded_inputs = tokenizer.encode(text, pad_to_max_seq_len=False) - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-base-cased', *args, **kwargs) diff --git a/modules/text/language_model/bert-base-chinese/README.md b/modules/text/language_model/bert-base-chinese/README.md index d13c35db..3d9d31de 100644 --- a/modules/text/language_model/bert-base-chinese/README.md +++ b/modules/text/language_model/bert-base-chinese/README.md @@ -1,6 +1,7 @@ ```shell -$ hub install bert-base-chinese==2.0.0 +$ hub install bert-base-chinese==2.0.1 ``` +
@@ -9,56 +9,50 @@ $ hub install chinese-bert-wwm==1.0.0
## API
```python
-def context(
- trainable=True,
- max_seq_len=128
+def __init__(
+ task=None,
+ load_checkpoint=None,
+ label_map=None,
+ num_classes=2,
+ **kwargs,
)
```
-用于获取Module的上下文信息,得到输入、输出以及预训练的Paddle Program副本
-**参数**
-```shell
-$ hub install chinese-bert-wwm==1.0.0
-```
-
-
-
@@ -9,94 +9,130 @@ $ hub install chinese-bert-wwm-ext==1.0.0
## API
```python
-def context(
- trainable=True,
- max_seq_len=128
+def __init__(
+ task=None,
+ load_checkpoint=None,
+ label_map=None,
+ num_classes=2,
+ **kwargs,
)
```
-用于获取Module的上下文信息,得到输入、输出以及预训练的Paddle Program副本
-**参数**
-
-> trainable:设置为True时,Module中的参数在Fine-tune时也会随之训练,否则保持不变。
-> max_seq_len:BERT模型的最大序列长度,若序列长度不足,会通过padding方式补到**max_seq_len**, 若序列长度大于该值,则会以截断方式让序列长度为**max_seq_len**,max_seq_len可取值范围为0~512;
-
-**返回**
-> inputs:dict类型,有以下字段:
-> >**input_ids**存放输入文本tokenize后各token对应BERT词汇表的word ids, shape为\[batch_size, max_seq_len\],int64类型;
-> >**position_ids**存放输入文本tokenize后各token所在该文本的位置,shape为\[batch_size, max_seq_len\],int64类型;
-> >**segment_ids**存放各token所在文本的标识(token属于文本1或者文本2),shape为\[batch_size, max_seq_len\],int64类型;
-> >**input_mask**存放token是否为padding的标识,shape为\[batch_size, max_seq_len\],int64类型;
->
-> outputs:dict类型,Module的输出特征,有以下字段:
-> >**pooled_output**字段存放句子粒度的特征,可用于文本分类等任务,shape为 \[batch_size, 768\],int64类型;
-> >**sequence_output**字段存放字粒度的特征,可用于序列标注等任务,shape为 \[batch_size, seq_len, 768\],int64类型;
->
-> program:包含该Module计算图的Program。
+创建Module对象(动态图组网版本)。
+**参数**
+* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。
+* `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。
+* `label_map`:预测时的类别映射表。
+* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。
+* `**kwargs`:用户额外指定的关键字字典类型的参数。
```python
-def get_embedding(
- texts,
- use_gpu=False,
- batch_size=1
+def predict(
+ data,
+ max_seq_len=128,
+ batch_size=1,
+ use_gpu=False
)
```
-用于获取输入文本的句子粒度特征与字粒度特征
-
**参数**
-> texts:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
-> use_gpu:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
+* `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,
+ 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。
+* `max_seq_len`:模型处理文本的最大长度。
+* `batch_size`:模型批处理大小。
+* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
**返回**
-> results:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。
->
-
```python
-def get_params_layer()
+def get_embedding(
+ data,
+ max_seq_len=128,
+ batch_size=1,
+ use_gpu=False
+)
```
-用于获取参数层信息,该方法与ULMFiTStrategy联用可以严格按照层数设置分层学习率与逐层解冻。
+用于获取输入文本的句子粒度特征与字粒度特征
**参数**
-> 无
+* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
+* `max_seq_len`:模型处理文本的最大长度。
+* `batch_size`:模型批处理大小。
+* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
**返回**
-> params_layer:dict类型,key为参数名,值为参数所在层数
+* `results`:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。
+
**代码示例**
```python
import paddlehub as hub
-# Load $ hub install chinese-bert-wwm-ext pretrained model
-module = hub.Module(name="chinese-bert-wwm-ext")
-inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
+data = [
+ ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'],
+ ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'],
+ ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'],
+]
+label_map = {0: 'negative', 1: 'positive'}
+
+model = hub.Module(
+ name='chinese-bert-wwm-ext',
+ version='2.0.1',
+ task='seq-cls',
+ load_checkpoint='/path/to/parameters',
+ label_map=label_map)
+results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
+for idx, text in enumerate(data):
+    print('Data: {} \t Label: {}'.format(text, results[idx]))
+```
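The example above only exercises `predict`; local embedding extraction goes through the `get_embedding` interface documented earlier. A minimal sketch, assuming (as in previous versions of these modules) that embeddings are only available when the module is constructed without a `task`:

```python
import paddlehub as hub

# Construct the module without a task so it exposes raw features
# (assumption based on the earlier module behaviour; get_embedding is documented above).
embedding_model = hub.Module(name='chinese-bert-wwm-ext', version='2.0.1')
embeddings = embedding_model.get_embedding(
    data=[['今天是个好日子'], ['天气预报说今天要下雨']],
    max_seq_len=50,
    batch_size=1,
    use_gpu=False)
# Each result pairs the sentence-level pooled feature with the token-level sequence feature.
```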
-# Must feed all the tensor of chinese-bert-wwm-ext's module need
-input_ids = inputs["input_ids"]
-position_ids = inputs["position_ids"]
-segment_ids = inputs["segment_ids"]
-input_mask = inputs["input_mask"]
+详情可参考PaddleHub示例:
+- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)
-# Use "pooled_output" for sentence-level output.
-pooled_output = outputs["pooled_output"]
+## 服务部署
-# Use "sequence_output" for token-level output.
-sequence_output = outputs["sequence_output"]
+PaddleHub Serving可以部署一个在线获取预训练词向量的服务。
-# Use "get_embedding" to get embedding result.
-embedding_result = module.get_embedding(texts=[["Sample1_text_a"],["Sample2_text_a","Sample2_text_b"]], use_gpu=True)
+### Step1: 启动PaddleHub Serving
-# Use "get_params_layer" to get params layer and used to ULMFiTStrategy.
-params_layer = module.get_params_layer()
-strategy = hub.finetune.strategy.ULMFiTStrategy(frz_params_layer=params_layer, dis_params_layer=params_layer)
+运行启动命令:
+
+```shell
+$ hub serving start -m chinese-bert-wwm-ext
+```
+
+这样就完成了一个获取预训练词向量服务化API的部署,默认端口号为8866。
+
+**NOTE:** 如使用GPU预测,则需要在启动服务之前设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。
+
+### Step2: 发送预测请求
+
+配置好服务端后,以下几行代码即可发送预测请求,获取预测结果。
+
+```python
+import requests
+import json
+
+# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ... ]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# 以key的方式指定text传入预测方法时的参数,此例中为"data"
+# 对应本地部署,则为module.get_embedding(data=text)
+data = {"data": text}
+# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip
+url = "http://10.12.121.132:8866/predict/chinese-bert-wwm-ext"
+# 指定post请求的headers为application/json方式
+headers = {"Content-Type": "application/json"}
+
+r = requests.post(url=url, headers=headers, data=json.dumps(data))
+print(r.json())
```
## 查看代码
@@ -109,12 +145,16 @@ https://github.com/ymcui/Chinese-BERT-wwm
## 依赖
-paddlepaddle >= 1.6.2
+paddlepaddle >= 2.0.0
-paddlehub >= 1.6.0
+paddlehub >= 2.0.0
## 更新历史
* 1.0.0
初始发布
+
+* 2.0.1
+
+ 全面升级动态图,接口有所变化。任务名称调整,增加序列标注任务`token-cls`
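The README example covers `seq-cls`; the same interface applies to the newly added `token-cls` task. A minimal sketch, assuming an MSRA-NER style label scheme and a placeholder checkpoint path:

```python
import paddlehub as hub

# BIO labels in the MSRA-NER scheme; replace with the scheme used at fine-tune time.
label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
label_map = {idx: label for idx, label in enumerate(label_list)}

model = hub.Module(
    name='chinese-bert-wwm-ext',
    version='2.0.1',
    task='token-cls',
    load_checkpoint='/path/to/parameters',  # placeholder path to a fine-tuned checkpoint
    label_map=label_map)
# Each sample is a single text, matching the single-text format used during fine-tuning.
results = model.predict([['今天是个好日子'], ['天气预报说今天要下雨']], max_seq_len=50, use_gpu=False)
```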
diff --git a/modules/text/language_model/chinese_bert_wwm_ext/model/__init__.py b/modules/text/language_model/chinese_bert_wwm_ext/model/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modules/text/language_model/chinese_bert_wwm_ext/model/bert.py b/modules/text/language_model/chinese_bert_wwm_ext/model/bert.py
deleted file mode 100644
index cf2a32c1..00000000
--- a/modules/text/language_model/chinese_bert_wwm_ext/model/bert.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import six
-import json
-
-import paddle.fluid as fluid
-
-from chinese_bert_wwm_ext.model.transformer_encoder import encoder, pre_process_layer
-
-
-class BertConfig(object):
- def __init__(self, config_path):
- self._config_dict = self._parse(config_path)
-
- def _parse(self, config_path):
- try:
- with open(config_path) as json_file:
- config_dict = json.load(json_file)
- except Exception:
- raise IOError("Error in parsing bert model config file '%s'" % config_path)
- else:
- return config_dict
-
- def __getitem__(self, key):
- return self._config_dict[key]
-
- def print_config(self):
- for arg, value in sorted(six.iteritems(self._config_dict)):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-class BertModel(object):
- def __init__(self, src_ids, position_ids, sentence_ids, input_mask, config, weight_sharing=True, use_fp16=False):
-
- self._emb_size = config['hidden_size']
- self._n_layer = config['num_hidden_layers']
- self._n_head = config['num_attention_heads']
- self._voc_size = config['vocab_size']
- self._max_position_seq_len = config['max_position_embeddings']
- self._sent_types = config['type_vocab_size']
- self._hidden_act = config['hidden_act']
- self._prepostprocess_dropout = config['hidden_dropout_prob']
- self._attention_dropout = config['attention_probs_dropout_prob']
- self._weight_sharing = weight_sharing
-
- self._word_emb_name = "word_embedding"
- self._pos_emb_name = "pos_embedding"
- self._sent_emb_name = "sent_embedding"
- self._dtype = "float16" if use_fp16 else "float32"
-
- # Initialize all weigths by truncated normal initializer, and all biases
- # will be initialized by constant zero by default.
- self._param_initializer = fluid.initializer.TruncatedNormal(scale=config['initializer_range'])
-
- self._build_model(src_ids, position_ids, sentence_ids, input_mask)
-
- def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
- # padding id in vocabulary must be set to 0
- emb_out = fluid.layers.embedding(input=src_ids,
- size=[self._voc_size, self._emb_size],
- dtype=self._dtype,
- param_attr=fluid.ParamAttr(name=self._word_emb_name,
- initializer=self._param_initializer),
- is_sparse=False)
- position_emb_out = fluid.layers.embedding(input=position_ids,
- size=[self._max_position_seq_len, self._emb_size],
- dtype=self._dtype,
- param_attr=fluid.ParamAttr(name=self._pos_emb_name,
- initializer=self._param_initializer))
-
- sent_emb_out = fluid.layers.embedding(sentence_ids,
- size=[self._sent_types, self._emb_size],
- dtype=self._dtype,
- param_attr=fluid.ParamAttr(name=self._sent_emb_name,
- initializer=self._param_initializer))
-
- emb_out = emb_out + position_emb_out
- emb_out = emb_out + sent_emb_out
-
- emb_out = pre_process_layer(emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
-
- if self._dtype == "float16":
- input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
-
- self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True)
- self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
- n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1)
- n_head_self_attn_mask.stop_gradient = True
-
- self._enc_out = encoder(enc_input=emb_out,
- attn_bias=n_head_self_attn_mask,
- n_layer=self._n_layer,
- n_head=self._n_head,
- d_key=self._emb_size // self._n_head,
- d_value=self._emb_size // self._n_head,
- d_model=self._emb_size,
- d_inner_hid=self._emb_size * 4,
- prepostprocess_dropout=self._prepostprocess_dropout,
- attention_dropout=self._attention_dropout,
- relu_dropout=0,
- hidden_act=self._hidden_act,
- preprocess_cmd="",
- postprocess_cmd="dan",
- param_initializer=self._param_initializer,
- name='encoder')
-
- def get_sequence_output(self):
- return self._enc_out
-
- def get_pooled_output(self):
- """Get the first feature of each sequence for classification"""
-
- next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1])
- next_sent_feat = fluid.layers.fc(input=next_sent_feat,
- size=self._emb_size,
- act="tanh",
- param_attr=fluid.ParamAttr(name="pooled_fc.w_0",
- initializer=self._param_initializer),
- bias_attr="pooled_fc.b_0")
- return next_sent_feat
-
- def get_pretraining_output(self, mask_label, mask_pos, labels):
- """Get the loss & accuracy for pretraining"""
-
- mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
-
- # extract the first token feature in each sentence
- next_sent_feat = self.get_pooled_output()
- reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size])
- # extract masked tokens' feature
- mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
-
- # transform: fc
- mask_trans_feat = fluid.layers.fc(input=mask_feat,
- size=self._emb_size,
- act=self._hidden_act,
- param_attr=fluid.ParamAttr(name='mask_lm_trans_fc.w_0',
- initializer=self._param_initializer),
- bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
- # transform: layer norm
- mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name='mask_lm_trans')
-
- mask_lm_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0",
- initializer=fluid.initializer.Constant(value=0.0))
- if self._weight_sharing:
- fc_out = fluid.layers.matmul(x=mask_trans_feat,
- y=fluid.default_main_program().global_block().var(self._word_emb_name),
- transpose_y=True)
- fc_out += fluid.layers.create_parameter(shape=[self._voc_size],
- dtype=self._dtype,
- attr=mask_lm_out_bias_attr,
- is_bias=True)
-
- else:
- fc_out = fluid.layers.fc(input=mask_trans_feat,
- size=self._voc_size,
- param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0",
- initializer=self._param_initializer),
- bias_attr=mask_lm_out_bias_attr)
-
- mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label)
- mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
-
- next_sent_fc_out = fluid.layers.fc(input=next_sent_feat,
- size=2,
- param_attr=fluid.ParamAttr(name="next_sent_fc.w_0",
- initializer=self._param_initializer),
- bias_attr="next_sent_fc.b_0")
-
- next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(logits=next_sent_fc_out,
- label=labels,
- return_softmax=True)
-
- next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels)
-
- mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
-
- loss = mean_next_sent_loss + mean_mask_lm_loss
- return next_sent_acc, mean_mask_lm_loss, loss
diff --git a/modules/text/language_model/chinese_bert_wwm_ext/model/transformer_encoder.py b/modules/text/language_model/chinese_bert_wwm_ext/model/transformer_encoder.py
deleted file mode 100644
index b15d8388..00000000
--- a/modules/text/language_model/chinese_bert_wwm_ext/model/transformer_encoder.py
+++ /dev/null
@@ -1,295 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Transformer encoder."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from functools import partial
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-
-def multi_head_attention(queries,
- keys,
- values,
- attn_bias,
- d_key,
- d_value,
- d_model,
- n_head=1,
- dropout_rate=0.,
- cache=None,
- param_initializer=None,
- name='multi_head_att'):
- """
- Multi-Head Attention. Note that attn_bias is added to the logit before
- computing softmax activiation to mask certain selected positions so that
- they will not considered in attention weights.
- """
- keys = queries if keys is None else keys
- values = keys if values is None else values
-
- if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
- raise ValueError("Inputs: quries, keys and values should all be 3-D tensors.")
-
- def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
- """
- Add linear projection to queries, keys, and values.
- """
- q = layers.fc(input=queries,
- size=d_key * n_head,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_query_fc.w_0', initializer=param_initializer),
- bias_attr=name + '_query_fc.b_0')
- k = layers.fc(input=keys,
- size=d_key * n_head,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_key_fc.w_0', initializer=param_initializer),
- bias_attr=name + '_key_fc.b_0')
- v = layers.fc(input=values,
- size=d_value * n_head,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_value_fc.w_0', initializer=param_initializer),
- bias_attr=name + '_value_fc.b_0')
- return q, k, v
-
- def __split_heads(x, n_head):
- """
- Reshape the last dimension of inpunt tensor x so that it becomes two
- dimensions and then transpose. Specifically, input a tensor with shape
- [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
- with shape [bs, n_head, max_sequence_length, hidden_dim].
- """
- hidden_size = x.shape[-1]
- # The value 0 in shape attr means copying the corresponding dimension
- # size of the input as the output dimension size.
- reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
-
- # permuate the dimensions into:
- # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
- return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
-
- def __combine_heads(x):
- """
- Transpose and then reshape the last two dimensions of inpunt tensor x
- so that it becomes one dimension, which is reverse to __split_heads.
- """
- if len(x.shape) == 3: return x
- if len(x.shape) != 4:
- raise ValueError("Input(x) should be a 4-D Tensor.")
-
- trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
- # The value 0 in shape attr means copying the corresponding dimension
- # size of the input as the output dimension size.
- return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True)
-
- def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
- """
- Scaled Dot-Product Attention
- """
- scaled_q = layers.scale(x=q, scale=d_key**-0.5)
- product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
- if attn_bias:
- product += attn_bias
- weights = layers.softmax(product)
- if dropout_rate:
- weights = layers.dropout(weights,
- dropout_prob=dropout_rate,
- dropout_implementation="upscale_in_train",
- is_test=False)
- out = layers.matmul(weights, v)
- return out
-
- q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
-
- if cache is not None: # use cache and concat time steps
- # Since the inplace reshape in __split_heads changes the shape of k and
- # v, which is the cache input for next time step, reshape the cache
- # input from the previous time step first.
- k = cache["k"] = layers.concat([layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1)
- v = cache["v"] = layers.concat([layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1)
-
- q = __split_heads(q, n_head)
- k = __split_heads(k, n_head)
- v = __split_heads(v, n_head)
-
- ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate)
-
- out = __combine_heads(ctx_multiheads)
-
- # Project back to the model size.
- proj_out = layers.fc(input=out,
- size=d_model,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_output_fc.w_0', initializer=param_initializer),
- bias_attr=name + '_output_fc.b_0')
- return proj_out
-
-
-def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name='ffn'):
- """
- Position-wise Feed-Forward Networks.
- This module consists of two linear transformations with a ReLU activation
- in between, which is applied to each position separately and identically.
- """
- hidden = layers.fc(input=x,
- size=d_inner_hid,
- num_flatten_dims=2,
- act=hidden_act,
- param_attr=fluid.ParamAttr(name=name + '_fc_0.w_0', initializer=param_initializer),
- bias_attr=name + '_fc_0.b_0')
- if dropout_rate:
- hidden = layers.dropout(hidden,
- dropout_prob=dropout_rate,
- dropout_implementation="upscale_in_train",
- is_test=False)
- out = layers.fc(input=hidden,
- size=d_hid,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_fc_1.w_0', initializer=param_initializer),
- bias_attr=name + '_fc_1.b_0')
- return out
-
-
-def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., name=''):
- """
- Add residual connection, layer normalization and droput to the out tensor
- optionally according to the value of process_cmd.
- This will be used before or after multi-head attention and position-wise
- feed-forward networks.
- """
- for cmd in process_cmd:
- if cmd == "a": # add residual connection
- out = out + prev_out if prev_out else out
- elif cmd == "n": # add layer normalization
- out_dtype = out.dtype
- if out_dtype == fluid.core.VarDesc.VarType.FP16:
- out = layers.cast(x=out, dtype="float32")
- out = layers.layer_norm(out,
- begin_norm_axis=len(out.shape) - 1,
- param_attr=fluid.ParamAttr(name=name + '_layer_norm_scale',
- initializer=fluid.initializer.Constant(1.)),
- bias_attr=fluid.ParamAttr(name=name + '_layer_norm_bias',
- initializer=fluid.initializer.Constant(0.)))
- if out_dtype == fluid.core.VarDesc.VarType.FP16:
- out = layers.cast(x=out, dtype="float16")
- elif cmd == "d": # add dropout
- if dropout_rate:
- out = layers.dropout(out,
- dropout_prob=dropout_rate,
- dropout_implementation="upscale_in_train",
- is_test=False)
- return out
-
-
-pre_process_layer = partial(pre_post_process_layer, None)
-post_process_layer = pre_post_process_layer
-
-
-def encoder_layer(enc_input,
- attn_bias,
- n_head,
- d_key,
- d_value,
- d_model,
- d_inner_hid,
- prepostprocess_dropout,
- attention_dropout,
- relu_dropout,
- hidden_act,
- preprocess_cmd="n",
- postprocess_cmd="da",
- param_initializer=None,
- name=''):
- """The encoder layers that can be stacked to form a deep encoder.
- This module consits of a multi-head (self) attention followed by
- position-wise feed-forward networks and both the two components companied
- with the post_process_layer to add residual connection, layer normalization
- and droput.
- """
- attn_output = multi_head_attention(pre_process_layer(enc_input,
- preprocess_cmd,
- prepostprocess_dropout,
- name=name + '_pre_att'),
- None,
- None,
- attn_bias,
- d_key,
- d_value,
- d_model,
- n_head,
- attention_dropout,
- param_initializer=param_initializer,
- name=name + '_multi_head_att')
- attn_output = post_process_layer(enc_input,
- attn_output,
- postprocess_cmd,
- prepostprocess_dropout,
- name=name + '_post_att')
- ffd_output = positionwise_feed_forward(pre_process_layer(attn_output,
- preprocess_cmd,
- prepostprocess_dropout,
- name=name + '_pre_ffn'),
- d_inner_hid,
- d_model,
- relu_dropout,
- hidden_act,
- param_initializer=param_initializer,
- name=name + '_ffn')
- return post_process_layer(attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + '_post_ffn')
-
-
-def encoder(enc_input,
- attn_bias,
- n_layer,
- n_head,
- d_key,
- d_value,
- d_model,
- d_inner_hid,
- prepostprocess_dropout,
- attention_dropout,
- relu_dropout,
- hidden_act,
- preprocess_cmd="n",
- postprocess_cmd="da",
- param_initializer=None,
- name=''):
- """
- The encoder is composed of a stack of identical layers returned by calling
- encoder_layer.
- """
- for i in range(n_layer):
- enc_output = encoder_layer(enc_input,
- attn_bias,
- n_head,
- d_key,
- d_value,
- d_model,
- d_inner_hid,
- prepostprocess_dropout,
- attention_dropout,
- relu_dropout,
- hidden_act,
- preprocess_cmd,
- postprocess_cmd,
- param_initializer=param_initializer,
- name=name + '_layer_' + str(i))
- enc_input = enc_output
- enc_output = pre_process_layer(enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
-
- return enc_output
diff --git a/modules/text/language_model/chinese_bert_wwm_ext/module.py b/modules/text/language_model/chinese_bert_wwm_ext/module.py
index 273b2f02..6ff6803f 100644
--- a/modules/text/language_model/chinese_bert_wwm_ext/module.py
+++ b/modules/text/language_model/chinese_bert_wwm_ext/module.py
@@ -1,7 +1,6 @@
-# coding:utf-8
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
-# Licensed under the Apache License, Version 2.0 (the "License"
+# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
@@ -12,62 +11,121 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+from typing import Dict
import os
+import math
-from paddlehub import TransformerModule
-from paddlehub.module.module import moduleinfo
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
-from chinese_bert_wwm_ext.model.bert import BertConfig, BertModel
+from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification
+from paddlenlp.transformers.bert.tokenizer import BertTokenizer
+from paddlenlp.metrics import ChunkEvaluator
+from paddlehub.module.module import moduleinfo
+from paddlehub.module.nlp_module import TransformerModule
+from paddlehub.utils.log import logger
@moduleinfo(
name="chinese-bert-wwm-ext",
- version="1.0.0",
- summary="chinese-bert-wwm-ext, 12-layer, 768-hidden, 12-heads, 110M parameters ",
+ version="2.0.1",
+ summary=
+ "chinese-bert-wwm-ext, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.",
author="ymcui",
author_email="ymcui@ir.hit.edu.cn",
type="nlp/semantic_model",
+ meta=TransformerModule
)
-class BertWwm(TransformerModule):
- def _initialize(self):
- self.MAX_SEQ_LEN = 512
- self.params_path = os.path.join(self.directory, "assets", "params")
- self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt")
+class BertWwm(nn.Layer):
+ """
+ BertWwm model
+ """
- bert_config_path = os.path.join(self.directory, "assets", "bert_config.json")
- self.bert_config = BertConfig(bert_config_path)
+ def __init__(
+ self,
+ task: str = None,
+ load_checkpoint: str = None,
+ label_map: Dict = None,
+ num_classes: int = 2,
+ **kwargs,
+ ):
+ super(BertWwm, self).__init__()
+ if label_map:
+ self.label_map = label_map
+ self.num_classes = len(label_map)
+ else:
+ self.num_classes = num_classes
- def net(self, input_ids, position_ids, segment_ids, input_mask):
- """
- create neural network.
+ if task == 'sequence_classification':
+ task = 'seq-cls'
+ logger.warning(
+ "current task name 'sequence_classification' was renamed to 'seq-cls', "
+ "'sequence_classification' has been deprecated and will be removed in the future.",
+ )
+ if task == 'seq-cls':
+ self.model = BertForSequenceClassification.from_pretrained(
+ pretrained_model_name_or_path='bert-wwm-ext-chinese',
+ num_classes=self.num_classes,
+ **kwargs
+ )
+ self.criterion = paddle.nn.loss.CrossEntropyLoss()
+ self.metric = paddle.metric.Accuracy()
+ elif task == 'token-cls':
+ self.model = BertForTokenClassification.from_pretrained(
+ pretrained_model_name_or_path='bert-wwm-ext-chinese',
+ num_classes=self.num_classes,
+ **kwargs
+ )
+ self.criterion = paddle.nn.loss.CrossEntropyLoss()
+ self.metric = ChunkEvaluator(
+ label_list=[self.label_map[i] for i in sorted(self.label_map.keys())]
+ )
+ elif task is None:
+ self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-wwm-ext-chinese', **kwargs)
+ else:
+ raise RuntimeError("Unknown task {}, task should be one in {}".format(
+ task, self._tasks_supported))
- Args:
- input_ids (tensor): the word ids.
- position_ids (tensor): the position ids.
- segment_ids (tensor): the segment ids.
- input_mask (tensor): the padding mask.
+ self.task = task
- Returns:
- pooled_output (tensor): sentence-level output for classification task.
- sequence_output (tensor): token-level output for sequence task.
- """
- bert = BertModel(src_ids=input_ids,
- position_ids=position_ids,
- sentence_ids=segment_ids,
- input_mask=input_mask,
- config=self.bert_config,
- use_fp16=False)
- pooled_output = bert.get_pooled_output()
- sequence_output = bert.get_sequence_output()
- return pooled_output, sequence_output
+ if load_checkpoint is not None and os.path.isfile(load_checkpoint):
+ state_dict = paddle.load(load_checkpoint)
+ self.set_state_dict(state_dict)
+ logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))
+ def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None):
+ result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
+ if self.task == 'seq-cls':
+ logits = result
+ probs = F.softmax(logits, axis=1)
+ if labels is not None:
+ loss = self.criterion(logits, labels)
+ correct = self.metric.compute(probs, labels)
+ acc = self.metric.update(correct)
+ return probs, loss, {'acc': acc}
+ return probs
+ elif self.task == 'token-cls':
+ logits = result
+ token_level_probs = F.softmax(logits, axis=-1)
+ preds = token_level_probs.argmax(axis=-1)
+ if labels is not None:
+ loss = self.criterion(logits, labels.unsqueeze(-1))
+ num_infer_chunks, num_label_chunks, num_correct_chunks = \
+ self.metric.compute(None, seq_lengths, preds, labels)
+ self.metric.update(
+ num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
+ _, _, f1_score = map(float, self.metric.accumulate())
+ return token_level_probs, loss, {'f1_score': f1_score}
+ return token_level_probs
+ else:
+ sequence_output, pooled_output = result
+ return sequence_output, pooled_output
-if __name__ == '__main__':
- test_module = BertWwm()
+ @staticmethod
+ def get_tokenizer(*args, **kwargs):
+ """
+ Gets the tokenizer that is customized for this module.
+ """
+ return BertTokenizer.from_pretrained(
+ pretrained_model_name_or_path='bert-wwm-ext-chinese', *args, **kwargs)
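For fine-tuning, the rewritten module plugs into the PaddleHub Trainer in the same way as the sequence-labeling demo earlier in this patch; the following condensed sketch simply swaps `ernie_tiny` for `chinese-bert-wwm-ext` and keeps the demo's hyperparameters:

```python
import paddle
import paddlehub as hub
from paddlehub.datasets import MSRA_NER

# label_map is required for token-cls so that ChunkEvaluator can report chunk-level F1.
label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
label_map = {idx: label for idx, label in enumerate(label_list)}

model = hub.Module(name='chinese-bert-wwm-ext', version='2.0.1', task='token-cls', label_map=label_map)

tokenizer = model.get_tokenizer()
train_dataset = MSRA_NER(tokenizer=tokenizer, max_seq_len=128, mode='train')
dev_dataset = MSRA_NER(tokenizer=tokenizer, max_seq_len=128, mode='dev')

optimizer = paddle.optimizer.AdamW(learning_rate=5e-5, parameters=model.parameters())
trainer = hub.Trainer(model, optimizer, checkpoint_dir='./checkpoint', use_gpu=True)
trainer.train(train_dataset, epochs=3, batch_size=32, eval_dataset=dev_dataset, save_interval=1)
```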
diff --git a/modules/text/language_model/ernie/README.md b/modules/text/language_model/ernie/README.md
index 4aebcebf..4ee91755 100644
--- a/modules/text/language_model/ernie/README.md
+++ b/modules/text/language_model/ernie/README.md
@@ -1,5 +1,5 @@
```shell
-$ hub install ernie==2.0.0
+$ hub install ernie==2.0.1
```
## 在线体验
AI Studio 快速体验
@@ -15,7 +15,6 @@ $ hub install ernie==2.0.0
@@ -19,23 +19,29 @@ $ hub install ernie_v2_eng_base==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** @@ -50,7 +56,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -59,7 +67,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -73,16 +83,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='ernie_v2_eng_base', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -90,7 +100,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -116,12 +128,12 @@ $ hub serving start -m ernie_v2_eng_base import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... 
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/ernie_v2_eng_base" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -157,3 +169,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图版本,接口有所变化 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/ernie_v2_eng_base/module.py b/modules/text/language_model/ernie_v2_eng_base/module.py index b9b74ba3..59ea31b7 100644 --- a/modules/text/language_model/ernie_v2_eng_base/module.py +++ b/modules/text/language_model/ernie_v2_eng_base/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_ernie import ErnieModel, ErnieForSequenceClassification -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.ernie.modeling import ErnieModel, ErnieForSequenceClassification, ErnieForTokenClassification +from paddlenlp.transformers.ernie.tokenizer import ErnieTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="ernie_v2_eng_base", - version="2.0.0", + version="2.0.1", summary= "Baidu's ERNIE 2.0, Enhanced Representation through kNowledge IntEgration, max_seq_len=512 when predtrained. 
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class ErnieV2(nn.Layer): """ Ernie model @@ -41,181 +43,88 @@ class ErnieV2(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(ErnieV2, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': self.model = ErnieForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='ernie_v2_eng_base') + pretrained_model_name_or_path='ernie-2.0-en', + num_classes=self.num_classes, + **kwargs + ) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = ErnieForTokenClassification.from_pretrained( + pretrained_model_name_or_path='ernie-2.0-en', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie_v2_eng_base') + self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-2.0-en', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, pooled_output = 
result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'ernie', 'vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_eng_base/vocab.txt" - download(url, os.path.join(DATA_HOME, 'ernie')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return ErnieTokenizer.from_pretrained( + pretrained_model_name_or_path='ernie-2.0-en', *args, **kwargs) \ No newline at end of file diff --git a/modules/text/language_model/ernie_v2_eng_large/README.md b/modules/text/language_model/ernie_v2_eng_large/README.md index 149d6012..680bc1be 100644 --- a/modules/text/language_model/ernie_v2_eng_large/README.md +++ b/modules/text/language_model/ernie_v2_eng_large/README.md @@ -1,6 +1,6 @@ ```shell -$ hub install ernie_v2_eng_large==2.0.0 +$ hub install ernie_v2_eng_large==2.0.1 ```
@@ -19,29 +19,35 @@ $ hub install ernie_v2_eng_large==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -50,7 +56,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -59,7 +67,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -73,16 +83,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='ernie_v2_eng_large', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -90,7 +100,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -116,12 +128,12 @@ $ hub serving start -m ernie_v2_eng_large import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... 
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/ernie_v2_eng_large" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -157,3 +169,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图版本,接口有所变化 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/ernie_v2_eng_large/module.py b/modules/text/language_model/ernie_v2_eng_large/module.py index 8d3ae55f..0d54a670 100644 --- a/modules/text/language_model/ernie_v2_eng_large/module.py +++ b/modules/text/language_model/ernie_v2_eng_large/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_ernie import ErnieModel, ErnieForSequenceClassification -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.ernie.modeling import ErnieModel, ErnieForSequenceClassification, ErnieForTokenClassification +from paddlenlp.transformers.ernie.tokenizer import ErnieTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="ernie_v2_eng_large", - version="2.0.0", + version="2.0.1", summary= "Baidu's ERNIE 2.0, Enhanced Representation through kNowledge IntEgration, max_seq_len=512 when predtrained. 
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class ErnieV2(nn.Layer): """ Ernie model @@ -41,181 +43,88 @@ class ErnieV2(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(ErnieV2, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': self.model = ErnieForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='ernie_v2_eng_large') + pretrained_model_name_or_path='ernie-2.0-large-en', + num_classes=self.num_classes, + **kwargs + ) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = ErnieForTokenClassification.from_pretrained( + pretrained_model_name_or_path='ernie-2.0-large-en', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie_v2_eng_large') + self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-2.0-large-en', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, 
pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'ernie', 'vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_eng_large/vocab.txt" - download(url, os.path.join(DATA_HOME, 'ernie')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return ErnieTokenizer.from_pretrained( + pretrained_model_name_or_path='ernie-2.0-large-en', *args, **kwargs) diff --git a/modules/text/language_model/rbt3/README.md b/modules/text/language_model/rbt3/README.md index a9a001d8..baf02dd1 100644 --- a/modules/text/language_model/rbt3/README.md +++ b/modules/text/language_model/rbt3/README.md @@ -1,5 +1,5 @@ ```shell -$ hub install rbt3==1.0.0 +$ hub install rtb3==2.0.1 ```
@@ -9,94 +9,130 @@ $ hub install rbt3==1.0.0
## API
```python
-def context(
- trainable=True,
- max_seq_len=128
+def __init__(
+ task=None,
+ load_checkpoint=None,
+ label_map=None,
+ num_classes=2,
+ **kwargs,
)
```
-用于获取Module的上下文信息,得到输入、输出以及预训练的Paddle Program副本
-**参数**
-
-> trainable:设置为True时,Module中的参数在Fine-tune时也会随之训练,否则保持不变。
-> max_seq_len:BERT模型的最大序列长度,若序列长度不足,会通过padding方式补到**max_seq_len**, 若序列长度大于该值,则会以截断方式让序列长度为**max_seq_len**,max_seq_len可取值范围为0~512;
-
-**返回**
-> inputs:dict类型,有以下字段:
-> >**input_ids**存放输入文本tokenize后各token对应BERT词汇表的word ids, shape为\[batch_size, max_seq_len\],int64类型;
-> >**position_ids**存放输入文本tokenize后各token所在该文本的位置,shape为\[batch_size, max_seq_len\],int64类型;
-> >**segment_ids**存放各token所在文本的标识(token属于文本1或者文本2),shape为\[batch_size, max_seq_len\],int64类型;
-> >**input_mask**存放token是否为padding的标识,shape为\[batch_size, max_seq_len\],int64类型;
->
-> outputs:dict类型,Module的输出特征,有以下字段:
-> >**pooled_output**字段存放句子粒度的特征,可用于文本分类等任务,shape为 \[batch_size, 768\],int64类型;
-> >**sequence_output**字段存放字粒度的特征,可用于序列标注等任务,shape为 \[batch_size, seq_len, 768\],int64类型;
->
-> program:包含该Module计算图的Program。
+创建Module对象(动态图组网版本)。
+**参数**
+* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。
+* `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。
+* `label_map`:预测时的类别映射表。
+* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。
+* `**kwargs`:用户额外指定的关键字字典类型的参数。
```python
-def get_embedding(
- texts,
- use_gpu=False,
- batch_size=1
+def predict(
+ data,
+ max_seq_len=128,
+ batch_size=1,
+ use_gpu=False
)
```
-用于获取输入文本的句子粒度特征与字粒度特征
-
**参数**
-> texts:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
-> use_gpu:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
+* `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,
+ 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。
+* `max_seq_len`:模型处理文本的最大长度
+* `batch_size`:模型批处理大小
+* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
**返回**
-> results:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。
->
-
```python
-def get_params_layer()
+def get_embedding(
+ data,
+ max_seq_len=128,
+ batch_size=1,
+ use_gpu=False
+)
```
-用于获取参数层信息,该方法与ULMFiTStrategy联用可以严格按照层数设置分层学习率与逐层解冻。
+用于获取输入文本的句子粒度特征与字粒度特征
**参数**
-> 无
+* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
+* `max_seq_len`:模型处理文本的最大长度。
+* `batch_size`:模型批处理大小。
+* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
**返回**
-> params_layer:dict类型,key为参数名,值为参数所在层数
+* `results`:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。
+
**代码示例**
```python
import paddlehub as hub
-# Load $ hub install rbt3 pretrained model
-module = hub.Module(name="rbt3")
-inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
+data = [
+ ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'],
+ ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'],
+ ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'],
+]
+label_map = {0: 'negative', 1: 'positive'}
+
+model = hub.Module(
+ name='rbt3',
+ version='2.0.1',
+ task='seq-cls',
+ load_checkpoint='/path/to/parameters',
+ label_map=label_map)
+results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
+for idx, text in enumerate(data):
+ print('Data: {} \t Label: {}'.format(text, results[idx]))
+```
-# Must feed all the tensor of rbt3's module need
-input_ids = inputs["input_ids"]
-position_ids = inputs["position_ids"]
-segment_ids = inputs["segment_ids"]
-input_mask = inputs["input_mask"]
+详情可参考PaddleHub示例:
+- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)
-# Use "pooled_output" for sentence-level output.
-pooled_output = outputs["pooled_output"]
+## 服务部署
-# Use "sequence_output" for token-level output.
-sequence_output = outputs["sequence_output"]
+PaddleHub Serving可以部署一个在线获取预训练词向量的服务。
-# Use "get_embedding" to get embedding result.
-embedding_result = module.get_embedding(texts=[["Sample1_text_a"],["Sample2_text_a","Sample2_text_b"]], use_gpu=True)
+### Step1: 启动PaddleHub Serving
-# Use "get_params_layer" to get params layer and used to ULMFiTStrategy.
-params_layer = module.get_params_layer()
-strategy = hub.finetune.strategy.ULMFiTStrategy(frz_params_layer=params_layer, dis_params_layer=params_layer)
+运行启动命令:
+
+```shell
+$ hub serving start -m rbt3
+```
+
+这样就完成了一个获取预训练词向量服务化API的部署,默认端口号为8866。
+
+**NOTE:** 如使用GPU预测,则需要在启动服务之前设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。
+
+### Step2: 发送预测请求
+
+配置好服务端后,使用以下几行代码即可发送预测请求,获取预测结果。
+
+```python
+import requests
+import json
+
+# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ... ]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# 以key的方式指定text传入预测方法时的参数,此例中为"data"
+# 对应本地部署,则为module.get_embedding(data=text)
+data = {"data": text}
+# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip
+url = "http://10.12.121.132:8866/predict/rtb3"
+# 指定post请求的headers为application/json方式
+headers = {"Content-Type": "application/json"}
+
+r = requests.post(url=url, headers=headers, data=json.dumps(data))
+print(r.json())
```
## 查看代码
@@ -109,12 +145,16 @@ https://github.com/ymcui/Chinese-BERT-wwm
## 依赖
-paddlepaddle >= 1.6.2
+paddlepaddle >= 2.0.0
-paddlehub >= 1.6.0
+paddlehub >= 2.0.0
## 更新历史
* 1.0.0
初始发布
+
+* 2.0.1
+
+ 全面升级动态图,接口有所变化。任务名称调整,增加序列标注任务`token-cls`
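The serving snippet in this README notes that the HTTP request corresponds to a local `module.get_embedding(data=text)` call. A minimal sketch of that local path, following the `get_embedding` signature documented above; the texts are illustrative and the printed lengths are only a sanity check.

```python
import paddlehub as hub

# Loading without a task keeps the bare pretrained model, which is what get_embedding uses.
module = hub.Module(name='rbt3')
data = [['今天是个好日子'], ['天气预报说今天要下雨']]
results = module.get_embedding(data, max_seq_len=50, batch_size=1, use_gpu=False)

# Per the return format documented above, each element pairs a sentence-level
# pooled feature with a token-level sequence feature for the corresponding sample.
for pooled_feature, seq_feature in results:
    print(len(pooled_feature), len(seq_feature))
```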
diff --git a/modules/text/language_model/rbt3/model/__init__.py b/modules/text/language_model/rbt3/model/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modules/text/language_model/rbt3/model/bert.py b/modules/text/language_model/rbt3/model/bert.py
deleted file mode 100644
index 4d37cb02..00000000
--- a/modules/text/language_model/rbt3/model/bert.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import six
-import json
-
-import paddle.fluid as fluid
-
-from rbt3.model.transformer_encoder import encoder, pre_process_layer
-
-
-class BertConfig(object):
- def __init__(self, config_path):
- self._config_dict = self._parse(config_path)
-
- def _parse(self, config_path):
- try:
- with open(config_path) as json_file:
- config_dict = json.load(json_file)
- except Exception:
- raise IOError("Error in parsing bert model config file '%s'" % config_path)
- else:
- return config_dict
-
- def __getitem__(self, key):
- return self._config_dict[key]
-
- def print_config(self):
- for arg, value in sorted(six.iteritems(self._config_dict)):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-class BertModel(object):
- def __init__(self, src_ids, position_ids, sentence_ids, input_mask, config, weight_sharing=True, use_fp16=False):
-
- self._emb_size = config['hidden_size']
- self._n_layer = config['num_hidden_layers']
- self._n_head = config['num_attention_heads']
- self._voc_size = config['vocab_size']
- self._max_position_seq_len = config['max_position_embeddings']
- self._sent_types = config['type_vocab_size']
- self._hidden_act = config['hidden_act']
- self._prepostprocess_dropout = config['hidden_dropout_prob']
- self._attention_dropout = config['attention_probs_dropout_prob']
- self._weight_sharing = weight_sharing
-
- self._word_emb_name = "word_embedding"
- self._pos_emb_name = "pos_embedding"
- self._sent_emb_name = "sent_embedding"
- self._dtype = "float16" if use_fp16 else "float32"
-
- # Initialize all weigths by truncated normal initializer, and all biases
- # will be initialized by constant zero by default.
- self._param_initializer = fluid.initializer.TruncatedNormal(scale=config['initializer_range'])
-
- self._build_model(src_ids, position_ids, sentence_ids, input_mask)
-
- def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
- # padding id in vocabulary must be set to 0
- emb_out = fluid.layers.embedding(input=src_ids,
- size=[self._voc_size, self._emb_size],
- dtype=self._dtype,
- param_attr=fluid.ParamAttr(name=self._word_emb_name,
- initializer=self._param_initializer),
- is_sparse=False)
- position_emb_out = fluid.layers.embedding(input=position_ids,
- size=[self._max_position_seq_len, self._emb_size],
- dtype=self._dtype,
- param_attr=fluid.ParamAttr(name=self._pos_emb_name,
- initializer=self._param_initializer))
-
- sent_emb_out = fluid.layers.embedding(sentence_ids,
- size=[self._sent_types, self._emb_size],
- dtype=self._dtype,
- param_attr=fluid.ParamAttr(name=self._sent_emb_name,
- initializer=self._param_initializer))
-
- emb_out = emb_out + position_emb_out
- emb_out = emb_out + sent_emb_out
-
- emb_out = pre_process_layer(emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
-
- if self._dtype == "float16":
- input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
-
- self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True)
- self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
- n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1)
- n_head_self_attn_mask.stop_gradient = True
-
- self._enc_out = encoder(enc_input=emb_out,
- attn_bias=n_head_self_attn_mask,
- n_layer=self._n_layer,
- n_head=self._n_head,
- d_key=self._emb_size // self._n_head,
- d_value=self._emb_size // self._n_head,
- d_model=self._emb_size,
- d_inner_hid=self._emb_size * 4,
- prepostprocess_dropout=self._prepostprocess_dropout,
- attention_dropout=self._attention_dropout,
- relu_dropout=0,
- hidden_act=self._hidden_act,
- preprocess_cmd="",
- postprocess_cmd="dan",
- param_initializer=self._param_initializer,
- name='encoder')
-
- def get_sequence_output(self):
- return self._enc_out
-
- def get_pooled_output(self):
- """Get the first feature of each sequence for classification"""
-
- next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1])
- next_sent_feat = fluid.layers.fc(input=next_sent_feat,
- size=self._emb_size,
- act="tanh",
- param_attr=fluid.ParamAttr(name="pooled_fc.w_0",
- initializer=self._param_initializer),
- bias_attr="pooled_fc.b_0")
- return next_sent_feat
-
- def get_pretraining_output(self, mask_label, mask_pos, labels):
- """Get the loss & accuracy for pretraining"""
-
- mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
-
- # extract the first token feature in each sentence
- next_sent_feat = self.get_pooled_output()
- reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size])
- # extract masked tokens' feature
- mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
-
- # transform: fc
- mask_trans_feat = fluid.layers.fc(input=mask_feat,
- size=self._emb_size,
- act=self._hidden_act,
- param_attr=fluid.ParamAttr(name='mask_lm_trans_fc.w_0',
- initializer=self._param_initializer),
- bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
- # transform: layer norm
- mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name='mask_lm_trans')
-
- mask_lm_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0",
- initializer=fluid.initializer.Constant(value=0.0))
- if self._weight_sharing:
- fc_out = fluid.layers.matmul(x=mask_trans_feat,
- y=fluid.default_main_program().global_block().var(self._word_emb_name),
- transpose_y=True)
- fc_out += fluid.layers.create_parameter(shape=[self._voc_size],
- dtype=self._dtype,
- attr=mask_lm_out_bias_attr,
- is_bias=True)
-
- else:
- fc_out = fluid.layers.fc(input=mask_trans_feat,
- size=self._voc_size,
- param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0",
- initializer=self._param_initializer),
- bias_attr=mask_lm_out_bias_attr)
-
- mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label)
- mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
-
- next_sent_fc_out = fluid.layers.fc(input=next_sent_feat,
- size=2,
- param_attr=fluid.ParamAttr(name="next_sent_fc.w_0",
- initializer=self._param_initializer),
- bias_attr="next_sent_fc.b_0")
-
- next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(logits=next_sent_fc_out,
- label=labels,
- return_softmax=True)
-
- next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels)
-
- mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
-
- loss = mean_next_sent_loss + mean_mask_lm_loss
- return next_sent_acc, mean_mask_lm_loss, loss
diff --git a/modules/text/language_model/rbt3/model/transformer_encoder.py b/modules/text/language_model/rbt3/model/transformer_encoder.py
deleted file mode 100644
index b15d8388..00000000
--- a/modules/text/language_model/rbt3/model/transformer_encoder.py
+++ /dev/null
@@ -1,295 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Transformer encoder."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from functools import partial
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-
-def multi_head_attention(queries,
- keys,
- values,
- attn_bias,
- d_key,
- d_value,
- d_model,
- n_head=1,
- dropout_rate=0.,
- cache=None,
- param_initializer=None,
- name='multi_head_att'):
- """
- Multi-Head Attention. Note that attn_bias is added to the logit before
- computing softmax activiation to mask certain selected positions so that
- they will not considered in attention weights.
- """
- keys = queries if keys is None else keys
- values = keys if values is None else values
-
- if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
- raise ValueError("Inputs: quries, keys and values should all be 3-D tensors.")
-
- def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
- """
- Add linear projection to queries, keys, and values.
- """
- q = layers.fc(input=queries,
- size=d_key * n_head,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_query_fc.w_0', initializer=param_initializer),
- bias_attr=name + '_query_fc.b_0')
- k = layers.fc(input=keys,
- size=d_key * n_head,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_key_fc.w_0', initializer=param_initializer),
- bias_attr=name + '_key_fc.b_0')
- v = layers.fc(input=values,
- size=d_value * n_head,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_value_fc.w_0', initializer=param_initializer),
- bias_attr=name + '_value_fc.b_0')
- return q, k, v
-
- def __split_heads(x, n_head):
- """
- Reshape the last dimension of inpunt tensor x so that it becomes two
- dimensions and then transpose. Specifically, input a tensor with shape
- [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
- with shape [bs, n_head, max_sequence_length, hidden_dim].
- """
- hidden_size = x.shape[-1]
- # The value 0 in shape attr means copying the corresponding dimension
- # size of the input as the output dimension size.
- reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
-
- # permuate the dimensions into:
- # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
- return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
-
- def __combine_heads(x):
- """
- Transpose and then reshape the last two dimensions of inpunt tensor x
- so that it becomes one dimension, which is reverse to __split_heads.
- """
- if len(x.shape) == 3: return x
- if len(x.shape) != 4:
- raise ValueError("Input(x) should be a 4-D Tensor.")
-
- trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
- # The value 0 in shape attr means copying the corresponding dimension
- # size of the input as the output dimension size.
- return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True)
-
- def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
- """
- Scaled Dot-Product Attention
- """
- scaled_q = layers.scale(x=q, scale=d_key**-0.5)
- product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
- if attn_bias:
- product += attn_bias
- weights = layers.softmax(product)
- if dropout_rate:
- weights = layers.dropout(weights,
- dropout_prob=dropout_rate,
- dropout_implementation="upscale_in_train",
- is_test=False)
- out = layers.matmul(weights, v)
- return out
-
- q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
-
- if cache is not None: # use cache and concat time steps
- # Since the inplace reshape in __split_heads changes the shape of k and
- # v, which is the cache input for next time step, reshape the cache
- # input from the previous time step first.
- k = cache["k"] = layers.concat([layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1)
- v = cache["v"] = layers.concat([layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1)
-
- q = __split_heads(q, n_head)
- k = __split_heads(k, n_head)
- v = __split_heads(v, n_head)
-
- ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate)
-
- out = __combine_heads(ctx_multiheads)
-
- # Project back to the model size.
- proj_out = layers.fc(input=out,
- size=d_model,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_output_fc.w_0', initializer=param_initializer),
- bias_attr=name + '_output_fc.b_0')
- return proj_out
-
-
-def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name='ffn'):
- """
- Position-wise Feed-Forward Networks.
- This module consists of two linear transformations with a ReLU activation
- in between, which is applied to each position separately and identically.
- """
- hidden = layers.fc(input=x,
- size=d_inner_hid,
- num_flatten_dims=2,
- act=hidden_act,
- param_attr=fluid.ParamAttr(name=name + '_fc_0.w_0', initializer=param_initializer),
- bias_attr=name + '_fc_0.b_0')
- if dropout_rate:
- hidden = layers.dropout(hidden,
- dropout_prob=dropout_rate,
- dropout_implementation="upscale_in_train",
- is_test=False)
- out = layers.fc(input=hidden,
- size=d_hid,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_fc_1.w_0', initializer=param_initializer),
- bias_attr=name + '_fc_1.b_0')
- return out
-
-
-def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., name=''):
- """
- Add residual connection, layer normalization and droput to the out tensor
- optionally according to the value of process_cmd.
- This will be used before or after multi-head attention and position-wise
- feed-forward networks.
- """
- for cmd in process_cmd:
- if cmd == "a": # add residual connection
- out = out + prev_out if prev_out else out
- elif cmd == "n": # add layer normalization
- out_dtype = out.dtype
- if out_dtype == fluid.core.VarDesc.VarType.FP16:
- out = layers.cast(x=out, dtype="float32")
- out = layers.layer_norm(out,
- begin_norm_axis=len(out.shape) - 1,
- param_attr=fluid.ParamAttr(name=name + '_layer_norm_scale',
- initializer=fluid.initializer.Constant(1.)),
- bias_attr=fluid.ParamAttr(name=name + '_layer_norm_bias',
- initializer=fluid.initializer.Constant(0.)))
- if out_dtype == fluid.core.VarDesc.VarType.FP16:
- out = layers.cast(x=out, dtype="float16")
- elif cmd == "d": # add dropout
- if dropout_rate:
- out = layers.dropout(out,
- dropout_prob=dropout_rate,
- dropout_implementation="upscale_in_train",
- is_test=False)
- return out
-
-
-pre_process_layer = partial(pre_post_process_layer, None)
-post_process_layer = pre_post_process_layer
-
-
-def encoder_layer(enc_input,
- attn_bias,
- n_head,
- d_key,
- d_value,
- d_model,
- d_inner_hid,
- prepostprocess_dropout,
- attention_dropout,
- relu_dropout,
- hidden_act,
- preprocess_cmd="n",
- postprocess_cmd="da",
- param_initializer=None,
- name=''):
- """The encoder layers that can be stacked to form a deep encoder.
- This module consits of a multi-head (self) attention followed by
- position-wise feed-forward networks and both the two components companied
- with the post_process_layer to add residual connection, layer normalization
- and droput.
- """
- attn_output = multi_head_attention(pre_process_layer(enc_input,
- preprocess_cmd,
- prepostprocess_dropout,
- name=name + '_pre_att'),
- None,
- None,
- attn_bias,
- d_key,
- d_value,
- d_model,
- n_head,
- attention_dropout,
- param_initializer=param_initializer,
- name=name + '_multi_head_att')
- attn_output = post_process_layer(enc_input,
- attn_output,
- postprocess_cmd,
- prepostprocess_dropout,
- name=name + '_post_att')
- ffd_output = positionwise_feed_forward(pre_process_layer(attn_output,
- preprocess_cmd,
- prepostprocess_dropout,
- name=name + '_pre_ffn'),
- d_inner_hid,
- d_model,
- relu_dropout,
- hidden_act,
- param_initializer=param_initializer,
- name=name + '_ffn')
- return post_process_layer(attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + '_post_ffn')
-
-
-def encoder(enc_input,
- attn_bias,
- n_layer,
- n_head,
- d_key,
- d_value,
- d_model,
- d_inner_hid,
- prepostprocess_dropout,
- attention_dropout,
- relu_dropout,
- hidden_act,
- preprocess_cmd="n",
- postprocess_cmd="da",
- param_initializer=None,
- name=''):
- """
- The encoder is composed of a stack of identical layers returned by calling
- encoder_layer.
- """
- for i in range(n_layer):
- enc_output = encoder_layer(enc_input,
- attn_bias,
- n_head,
- d_key,
- d_value,
- d_model,
- d_inner_hid,
- prepostprocess_dropout,
- attention_dropout,
- relu_dropout,
- hidden_act,
- preprocess_cmd,
- postprocess_cmd,
- param_initializer=param_initializer,
- name=name + '_layer_' + str(i))
- enc_input = enc_output
- enc_output = pre_process_layer(enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
-
- return enc_output
diff --git a/modules/text/language_model/rbt3/module.py b/modules/text/language_model/rbt3/module.py
index b35e0cd8..3833c987 100644
--- a/modules/text/language_model/rbt3/module.py
+++ b/modules/text/language_model/rbt3/module.py
@@ -1,7 +1,6 @@
-# coding:utf-8
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
-# Licensed under the Apache License, Version 2.0 (the "License"
+# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
@@ -12,62 +11,120 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+from typing import Dict
import os
+import math
-from paddlehub import TransformerModule
-from paddlehub.module.module import moduleinfo
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
-from rbt3.model.bert import BertConfig, BertModel
+from paddlenlp.transformers.roberta.modeling import RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel
+from paddlenlp.transformers.roberta.tokenizer import RobertaTokenizer
+from paddlenlp.metrics import ChunkEvaluator
+from paddlehub.module.module import moduleinfo
+from paddlehub.module.nlp_module import TransformerModule
+from paddlehub.utils.log import logger
@moduleinfo(
name="rbt3",
- version="1.0.0",
+ version="2.0.1",
summary="rbt3, 3-layer, 768-hidden, 12-heads, 38M parameters ",
author="ymcui",
author_email="ymcui@ir.hit.edu.cn",
type="nlp/semantic_model",
+ meta=TransformerModule,
)
-class BertWwm(TransformerModule):
- def _initialize(self):
- self.MAX_SEQ_LEN = 512
- self.params_path = os.path.join(self.directory, "assets", "params")
- self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt")
+class Roberta(nn.Layer):
+ """
+ RoBERTa model
+ """
- bert_config_path = os.path.join(self.directory, "assets", "bert_config_rbt3.json")
- self.bert_config = BertConfig(bert_config_path)
+ def __init__(
+ self,
+ task: str = None,
+ load_checkpoint: str = None,
+ label_map: Dict = None,
+ num_classes: int = 2,
+ **kwargs,
+ ):
+ super(Roberta, self).__init__()
+ if label_map:
+ self.label_map = label_map
+ self.num_classes = len(label_map)
+ else:
+ self.num_classes = num_classes
- def net(self, input_ids, position_ids, segment_ids, input_mask):
- """
- create neural network.
+ if task == 'sequence_classification':
+ task = 'seq-cls'
+ logger.warning(
+ "current task name 'sequence_classification' was renamed to 'seq-cls', "
+ "'sequence_classification' has been deprecated and will be removed in the future.",
+ )
+ if task == 'seq-cls':
+ self.model = RobertaForSequenceClassification.from_pretrained(
+ pretrained_model_name_or_path='rbt3',
+ num_classes=self.num_classes,
+ **kwargs
+ )
+ self.criterion = paddle.nn.loss.CrossEntropyLoss()
+ self.metric = paddle.metric.Accuracy()
+ elif task == 'token-cls':
+ self.model = RobertaForTokenClassification.from_pretrained(
+ pretrained_model_name_or_path='rbt3',
+ num_classes=self.num_classes,
+ **kwargs
+ )
+ self.criterion = paddle.nn.loss.CrossEntropyLoss()
+ self.metric = ChunkEvaluator(
+ label_list=[self.label_map[i] for i in sorted(self.label_map.keys())]
+ )
+ elif task is None:
+ self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='rbt3', **kwargs)
+ else:
+ raise RuntimeError("Unknown task {}, task should be one in {}".format(
+ task, self._tasks_supported))
- Args:
- input_ids (tensor): the word ids.
- position_ids (tensor): the position ids.
- segment_ids (tensor): the segment ids.
- input_mask (tensor): the padding mask.
+ self.task = task
- Returns:
- pooled_output (tensor): sentence-level output for classification task.
- sequence_output (tensor): token-level output for sequence task.
- """
- bert = BertModel(src_ids=input_ids,
- position_ids=position_ids,
- sentence_ids=segment_ids,
- input_mask=input_mask,
- config=self.bert_config,
- use_fp16=False)
- pooled_output = bert.get_pooled_output()
- sequence_output = bert.get_sequence_output()
- return pooled_output, sequence_output
+ if load_checkpoint is not None and os.path.isfile(load_checkpoint):
+ state_dict = paddle.load(load_checkpoint)
+ self.set_state_dict(state_dict)
+ logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))
+ def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None):
+ result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
+ if self.task == 'seq-cls':
+ logits = result
+ probs = F.softmax(logits, axis=1)
+ if labels is not None:
+ loss = self.criterion(logits, labels)
+ correct = self.metric.compute(probs, labels)
+ acc = self.metric.update(correct)
+ return probs, loss, {'acc': acc}
+ return probs
+ elif self.task == 'token-cls':
+ logits = result
+ token_level_probs = F.softmax(logits, axis=-1)
+ preds = token_level_probs.argmax(axis=-1)
+ if labels is not None:
+ loss = self.criterion(logits, labels.unsqueeze(-1))
+ num_infer_chunks, num_label_chunks, num_correct_chunks = \
+ self.metric.compute(None, seq_lengths, preds, labels)
+ self.metric.update(
+ num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
+ _, _, f1_score = map(float, self.metric.accumulate())
+ return token_level_probs, loss, {'f1_score': f1_score}
+ return token_level_probs
+ else:
+ sequence_output, pooled_output = result
+ return sequence_output, pooled_output
-if __name__ == '__main__':
- test_module = BertWwm()
+ @staticmethod
+ def get_tokenizer(*args, **kwargs):
+ """
+ Gets the tokenizer that is customized for this module.
+ """
+ return RobertaTokenizer.from_pretrained(
+ pretrained_model_name_or_path='rbt3', *args, **kwargs)
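The `token-cls` branch added to `forward` above wires paddlenlp's `ChunkEvaluator` into the metric flow (`compute` → `update` → `accumulate`). A standalone sketch of that flow, mirroring the calls in the patched `forward`; the label list and tensors are toy values, not the demo's data.

```python
import paddle
from paddlenlp.metrics import ChunkEvaluator

metric = ChunkEvaluator(label_list=['B-PER', 'I-PER', 'O'])

# One toy sequence of length 3 whose predictions match the labels exactly.
seq_lengths = paddle.to_tensor([3])
preds = paddle.to_tensor([[0, 1, 2]])
labels = paddle.to_tensor([[0, 1, 2]])

# Same call pattern as the patched forward(): compute -> update -> accumulate.
num_infer, num_label, num_correct = metric.compute(None, seq_lengths, preds, labels)
metric.update(num_infer.numpy(), num_label.numpy(), num_correct.numpy())
precision, recall, f1 = map(float, metric.accumulate())
print(precision, recall, f1)  # expect 1.0 1.0 1.0 for this perfect toy match
```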
diff --git a/modules/text/language_model/rbtl3/README.md b/modules/text/language_model/rbtl3/README.md
index 53107271..f1dd9c43 100644
--- a/modules/text/language_model/rbtl3/README.md
+++ b/modules/text/language_model/rbtl3/README.md
@@ -1,5 +1,5 @@
```shell
-$ hub install rbtl3==1.0.0
+$ hub install rbtl3==2.0.1
```
@@ -9,94 +9,130 @@ $ hub install rbtl3==1.0.0
## API
```python
-def context(
- trainable=True,
- max_seq_len=128
+def __init__(
+ task=None,
+ load_checkpoint=None,
+ label_map=None,
+ num_classes=2,
+ **kwargs,
)
```
-用于获取Module的上下文信息,得到输入、输出以及预训练的Paddle Program副本
-**参数**
-
-> trainable:设置为True时,Module中的参数在Fine-tune时也会随之训练,否则保持不变。
-> max_seq_len:BERT模型的最大序列长度,若序列长度不足,会通过padding方式补到**max_seq_len**, 若序列长度大于该值,则会以截断方式让序列长度为**max_seq_len**,max_seq_len可取值范围为0~512;
-
-**返回**
-> inputs:dict类型,有以下字段:
-> >**input_ids**存放输入文本tokenize后各token对应BERT词汇表的word ids, shape为\[batch_size, max_seq_len\],int64类型;
-> >**position_ids**存放输入文本tokenize后各token所在该文本的位置,shape为\[batch_size, max_seq_len\],int64类型;
-> >**segment_ids**存放各token所在文本的标识(token属于文本1或者文本2),shape为\[batch_size, max_seq_len\],int64类型;
-> >**input_mask**存放token是否为padding的标识,shape为\[batch_size, max_seq_len\],int64类型;
->
-> outputs:dict类型,Module的输出特征,有以下字段:
-> >**pooled_output**字段存放句子粒度的特征,可用于文本分类等任务,shape为 \[batch_size, 768\],int64类型;
-> >**sequence_output**字段存放字粒度的特征,可用于序列标注等任务,shape为 \[batch_size, seq_len, 768\],int64类型;
->
-> program:包含该Module计算图的Program。
+创建Module对象(动态图组网版本)。
+**参数**
+* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。
+* `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。
+* `label_map`:预测时的类别映射表。
+* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。
+* `**kwargs`:用户额外指定的关键字字典类型的参数。
```python
-def get_embedding(
- texts,
- use_gpu=False,
- batch_size=1
+def predict(
+ data,
+ max_seq_len=128,
+ batch_size=1,
+ use_gpu=False
)
```
-用于获取输入文本的句子粒度特征与字粒度特征
-
**参数**
-> texts:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
-> use_gpu:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
+* `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,
+ 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。
+* `max_seq_len`:模型处理文本的最大长度。
+* `batch_size`:模型批处理大小。
+* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
**返回**
-> results:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。
->
-
```python
-def get_params_layer()
+def get_embedding(
+ data,
+ max_seq_len=128,
+ batch_size=1,
+ use_gpu=False
+)
```
-用于获取参数层信息,该方法与ULMFiTStrategy联用可以严格按照层数设置分层学习率与逐层解冻。
+用于获取输入文本的句子粒度特征与字粒度特征
**参数**
-> 无
+* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
+* `max_seq_len`:模型处理文本的最大长度。
+* `batch_size`:模型批处理大小。
+* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
**返回**
-> params_layer:dict类型,key为参数名,值为参数所在层数
+* `results`:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。
+
**代码示例**
```python
import paddlehub as hub
-# Load $ hub install rbtl3 pretrained model
-module = hub.Module(name="rbtl3")
-inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
+data = [
+ ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'],
+ ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'],
+ ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'],
+]
+label_map = {0: 'negative', 1: 'positive'}
+
+model = hub.Module(
+ name='rbtl3',
+ version='2.0.1',
+ task='seq-cls',
+ load_checkpoint='/path/to/parameters',
+ label_map=label_map)
+results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
+for idx, text in enumerate(data):
+ print('Data: {} \t Label: {}'.format(text, results[idx]))
+```
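+
+A hedged sketch of the new `token-cls` usage (not part of the official docs; the BIO-style `label_map`, the checkpoint path and the sample sentence below are placeholders):
+
+```python
+import paddlehub as hub
+
+# Hypothetical BIO-style label map for a NER-like sequence labeling task.
+label_map = {0: 'B-PER', 1: 'I-PER', 2: 'B-ORG', 3: 'I-ORG', 4: 'B-LOC', 5: 'I-LOC', 6: 'O'}
+
+model = hub.Module(
+    name='rbtl3',
+    version='2.0.1',
+    task='token-cls',
+    load_checkpoint='/path/to/token_cls_parameters',  # placeholder path to a fine-tuned checkpoint
+    label_map=label_map)
+
+data = [['天气预报说今天要下雨']]
+# predict is expected to return one label per token for each input sample.
+results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
+print(results)
+```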
-# Must feed all the tensor of rbtl3's module need
-input_ids = inputs["input_ids"]
-position_ids = inputs["position_ids"]
-segment_ids = inputs["segment_ids"]
-input_mask = inputs["input_mask"]
+详情可参考PaddleHub示例:
+- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)
-# Use "pooled_output" for sentence-level output.
-pooled_output = outputs["pooled_output"]
+## 服务部署
-# Use "sequence_output" for token-level output.
-sequence_output = outputs["sequence_output"]
+PaddleHub Serving可以部署一个在线获取预训练词向量的服务。
-# Use "get_embedding" to get embedding result.
-embedding_result = module.get_embedding(texts=[["Sample1_text_a"],["Sample2_text_a","Sample2_text_b"]], use_gpu=True)
+### Step1: 启动PaddleHub Serving
-# Use "get_params_layer" to get params layer and used to ULMFiTStrategy.
-params_layer = module.get_params_layer()
-strategy = hub.finetune.strategy.ULMFiTStrategy(frz_params_layer=params_layer, dis_params_layer=params_layer)
+运行启动命令:
+
+```shell
+$ hub serving start -m rbtl3
+```
+
+这样就完成了一个获取预训练词向量的服务化API的部署,默认端口号为8866。
+
+**NOTE:** 如使用GPU预测,则需要在启动服务之前设置CUDA_VISIBLE_DEVICES环境变量,否则无需设置。
+
+### Step2: 发送预测请求
+
+配置好服务端后,以下几行代码即可发送预测请求,获取预测结果。
+
+```python
+import requests
+import json
+
+# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ... ]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# 以key的方式指定text传入预测方法时的参数,此例中为"data"
+# 对应本地部署,则为module.get_embedding(data=text)
+data = {"data": text}
+# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip
+url = "http://10.12.121.132:8866/predict/rbtl3"
+# 指定post请求的headers为application/json方式
+headers = {"Content-Type": "application/json"}
+
+r = requests.post(url=url, headers=headers, data=json.dumps(data))
+print(r.json())
```
## 查看代码
@@ -109,12 +145,16 @@ https://github.com/ymcui/Chinese-BERT-wwm
## 依赖
-paddlepaddle >= 1.6.2
+paddlepaddle >= 2.0.0
-paddlehub >= 1.6.0
+paddlehub >= 2.0.0
## 更新历史
* 1.0.0
初始发布
+
+* 2.0.1
+
+ 全面升级动态图,接口有所变化。任务名称调整,增加序列标注任务`token-cls`
diff --git a/modules/text/language_model/rbtl3/model/__init__.py b/modules/text/language_model/rbtl3/model/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modules/text/language_model/rbtl3/model/bert.py b/modules/text/language_model/rbtl3/model/bert.py
deleted file mode 100644
index 8c27ad34..00000000
--- a/modules/text/language_model/rbtl3/model/bert.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import six
-import json
-
-import paddle.fluid as fluid
-
-from rbtl3.model.transformer_encoder import encoder, pre_process_layer
-
-
-class BertConfig(object):
- def __init__(self, config_path):
- self._config_dict = self._parse(config_path)
-
- def _parse(self, config_path):
- try:
- with open(config_path) as json_file:
- config_dict = json.load(json_file)
- except Exception:
- raise IOError("Error in parsing bert model config file '%s'" % config_path)
- else:
- return config_dict
-
- def __getitem__(self, key):
- return self._config_dict[key]
-
- def print_config(self):
- for arg, value in sorted(six.iteritems(self._config_dict)):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-class BertModel(object):
- def __init__(self, src_ids, position_ids, sentence_ids, input_mask, config, weight_sharing=True, use_fp16=False):
-
- self._emb_size = config['hidden_size']
- self._n_layer = config['num_hidden_layers']
- self._n_head = config['num_attention_heads']
- self._voc_size = config['vocab_size']
- self._max_position_seq_len = config['max_position_embeddings']
- self._sent_types = config['type_vocab_size']
- self._hidden_act = config['hidden_act']
- self._prepostprocess_dropout = config['hidden_dropout_prob']
- self._attention_dropout = config['attention_probs_dropout_prob']
- self._weight_sharing = weight_sharing
-
- self._word_emb_name = "word_embedding"
- self._pos_emb_name = "pos_embedding"
- self._sent_emb_name = "sent_embedding"
- self._dtype = "float16" if use_fp16 else "float32"
-
- # Initialize all weigths by truncated normal initializer, and all biases
- # will be initialized by constant zero by default.
- self._param_initializer = fluid.initializer.TruncatedNormal(scale=config['initializer_range'])
-
- self._build_model(src_ids, position_ids, sentence_ids, input_mask)
-
- def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
- # padding id in vocabulary must be set to 0
- emb_out = fluid.layers.embedding(input=src_ids,
- size=[self._voc_size, self._emb_size],
- dtype=self._dtype,
- param_attr=fluid.ParamAttr(name=self._word_emb_name,
- initializer=self._param_initializer),
- is_sparse=False)
- position_emb_out = fluid.layers.embedding(input=position_ids,
- size=[self._max_position_seq_len, self._emb_size],
- dtype=self._dtype,
- param_attr=fluid.ParamAttr(name=self._pos_emb_name,
- initializer=self._param_initializer))
-
- sent_emb_out = fluid.layers.embedding(sentence_ids,
- size=[self._sent_types, self._emb_size],
- dtype=self._dtype,
- param_attr=fluid.ParamAttr(name=self._sent_emb_name,
- initializer=self._param_initializer))
-
- emb_out = emb_out + position_emb_out
- emb_out = emb_out + sent_emb_out
-
- emb_out = pre_process_layer(emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
-
- if self._dtype == "float16":
- input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
-
- self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True)
- self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
- n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1)
- n_head_self_attn_mask.stop_gradient = True
-
- self._enc_out = encoder(enc_input=emb_out,
- attn_bias=n_head_self_attn_mask,
- n_layer=self._n_layer,
- n_head=self._n_head,
- d_key=self._emb_size // self._n_head,
- d_value=self._emb_size // self._n_head,
- d_model=self._emb_size,
- d_inner_hid=self._emb_size * 4,
- prepostprocess_dropout=self._prepostprocess_dropout,
- attention_dropout=self._attention_dropout,
- relu_dropout=0,
- hidden_act=self._hidden_act,
- preprocess_cmd="",
- postprocess_cmd="dan",
- param_initializer=self._param_initializer,
- name='encoder')
-
- def get_sequence_output(self):
- return self._enc_out
-
- def get_pooled_output(self):
- """Get the first feature of each sequence for classification"""
-
- next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1])
- next_sent_feat = fluid.layers.fc(input=next_sent_feat,
- size=self._emb_size,
- act="tanh",
- param_attr=fluid.ParamAttr(name="pooled_fc.w_0",
- initializer=self._param_initializer),
- bias_attr="pooled_fc.b_0")
- return next_sent_feat
-
- def get_pretraining_output(self, mask_label, mask_pos, labels):
- """Get the loss & accuracy for pretraining"""
-
- mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
-
- # extract the first token feature in each sentence
- next_sent_feat = self.get_pooled_output()
- reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size])
- # extract masked tokens' feature
- mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
-
- # transform: fc
- mask_trans_feat = fluid.layers.fc(input=mask_feat,
- size=self._emb_size,
- act=self._hidden_act,
- param_attr=fluid.ParamAttr(name='mask_lm_trans_fc.w_0',
- initializer=self._param_initializer),
- bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
- # transform: layer norm
- mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name='mask_lm_trans')
-
- mask_lm_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0",
- initializer=fluid.initializer.Constant(value=0.0))
- if self._weight_sharing:
- fc_out = fluid.layers.matmul(x=mask_trans_feat,
- y=fluid.default_main_program().global_block().var(self._word_emb_name),
- transpose_y=True)
- fc_out += fluid.layers.create_parameter(shape=[self._voc_size],
- dtype=self._dtype,
- attr=mask_lm_out_bias_attr,
- is_bias=True)
-
- else:
- fc_out = fluid.layers.fc(input=mask_trans_feat,
- size=self._voc_size,
- param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0",
- initializer=self._param_initializer),
- bias_attr=mask_lm_out_bias_attr)
-
- mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label)
- mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
-
- next_sent_fc_out = fluid.layers.fc(input=next_sent_feat,
- size=2,
- param_attr=fluid.ParamAttr(name="next_sent_fc.w_0",
- initializer=self._param_initializer),
- bias_attr="next_sent_fc.b_0")
-
- next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(logits=next_sent_fc_out,
- label=labels,
- return_softmax=True)
-
- next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels)
-
- mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
-
- loss = mean_next_sent_loss + mean_mask_lm_loss
- return next_sent_acc, mean_mask_lm_loss, loss
diff --git a/modules/text/language_model/rbtl3/model/transformer_encoder.py b/modules/text/language_model/rbtl3/model/transformer_encoder.py
deleted file mode 100644
index b15d8388..00000000
--- a/modules/text/language_model/rbtl3/model/transformer_encoder.py
+++ /dev/null
@@ -1,295 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Transformer encoder."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from functools import partial
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-
-def multi_head_attention(queries,
- keys,
- values,
- attn_bias,
- d_key,
- d_value,
- d_model,
- n_head=1,
- dropout_rate=0.,
- cache=None,
- param_initializer=None,
- name='multi_head_att'):
- """
- Multi-Head Attention. Note that attn_bias is added to the logit before
- computing softmax activiation to mask certain selected positions so that
- they will not considered in attention weights.
- """
- keys = queries if keys is None else keys
- values = keys if values is None else values
-
- if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
- raise ValueError("Inputs: quries, keys and values should all be 3-D tensors.")
-
- def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
- """
- Add linear projection to queries, keys, and values.
- """
- q = layers.fc(input=queries,
- size=d_key * n_head,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_query_fc.w_0', initializer=param_initializer),
- bias_attr=name + '_query_fc.b_0')
- k = layers.fc(input=keys,
- size=d_key * n_head,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_key_fc.w_0', initializer=param_initializer),
- bias_attr=name + '_key_fc.b_0')
- v = layers.fc(input=values,
- size=d_value * n_head,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_value_fc.w_0', initializer=param_initializer),
- bias_attr=name + '_value_fc.b_0')
- return q, k, v
-
- def __split_heads(x, n_head):
- """
- Reshape the last dimension of inpunt tensor x so that it becomes two
- dimensions and then transpose. Specifically, input a tensor with shape
- [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
- with shape [bs, n_head, max_sequence_length, hidden_dim].
- """
- hidden_size = x.shape[-1]
- # The value 0 in shape attr means copying the corresponding dimension
- # size of the input as the output dimension size.
- reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
-
- # permuate the dimensions into:
- # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
- return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
-
- def __combine_heads(x):
- """
- Transpose and then reshape the last two dimensions of inpunt tensor x
- so that it becomes one dimension, which is reverse to __split_heads.
- """
- if len(x.shape) == 3: return x
- if len(x.shape) != 4:
- raise ValueError("Input(x) should be a 4-D Tensor.")
-
- trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
- # The value 0 in shape attr means copying the corresponding dimension
- # size of the input as the output dimension size.
- return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True)
-
- def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
- """
- Scaled Dot-Product Attention
- """
- scaled_q = layers.scale(x=q, scale=d_key**-0.5)
- product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
- if attn_bias:
- product += attn_bias
- weights = layers.softmax(product)
- if dropout_rate:
- weights = layers.dropout(weights,
- dropout_prob=dropout_rate,
- dropout_implementation="upscale_in_train",
- is_test=False)
- out = layers.matmul(weights, v)
- return out
-
- q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
-
- if cache is not None: # use cache and concat time steps
- # Since the inplace reshape in __split_heads changes the shape of k and
- # v, which is the cache input for next time step, reshape the cache
- # input from the previous time step first.
- k = cache["k"] = layers.concat([layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1)
- v = cache["v"] = layers.concat([layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1)
-
- q = __split_heads(q, n_head)
- k = __split_heads(k, n_head)
- v = __split_heads(v, n_head)
-
- ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate)
-
- out = __combine_heads(ctx_multiheads)
-
- # Project back to the model size.
- proj_out = layers.fc(input=out,
- size=d_model,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_output_fc.w_0', initializer=param_initializer),
- bias_attr=name + '_output_fc.b_0')
- return proj_out
-
-
-def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name='ffn'):
- """
- Position-wise Feed-Forward Networks.
- This module consists of two linear transformations with a ReLU activation
- in between, which is applied to each position separately and identically.
- """
- hidden = layers.fc(input=x,
- size=d_inner_hid,
- num_flatten_dims=2,
- act=hidden_act,
- param_attr=fluid.ParamAttr(name=name + '_fc_0.w_0', initializer=param_initializer),
- bias_attr=name + '_fc_0.b_0')
- if dropout_rate:
- hidden = layers.dropout(hidden,
- dropout_prob=dropout_rate,
- dropout_implementation="upscale_in_train",
- is_test=False)
- out = layers.fc(input=hidden,
- size=d_hid,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + '_fc_1.w_0', initializer=param_initializer),
- bias_attr=name + '_fc_1.b_0')
- return out
-
-
-def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., name=''):
- """
- Add residual connection, layer normalization and droput to the out tensor
- optionally according to the value of process_cmd.
- This will be used before or after multi-head attention and position-wise
- feed-forward networks.
- """
- for cmd in process_cmd:
- if cmd == "a": # add residual connection
- out = out + prev_out if prev_out else out
- elif cmd == "n": # add layer normalization
- out_dtype = out.dtype
- if out_dtype == fluid.core.VarDesc.VarType.FP16:
- out = layers.cast(x=out, dtype="float32")
- out = layers.layer_norm(out,
- begin_norm_axis=len(out.shape) - 1,
- param_attr=fluid.ParamAttr(name=name + '_layer_norm_scale',
- initializer=fluid.initializer.Constant(1.)),
- bias_attr=fluid.ParamAttr(name=name + '_layer_norm_bias',
- initializer=fluid.initializer.Constant(0.)))
- if out_dtype == fluid.core.VarDesc.VarType.FP16:
- out = layers.cast(x=out, dtype="float16")
- elif cmd == "d": # add dropout
- if dropout_rate:
- out = layers.dropout(out,
- dropout_prob=dropout_rate,
- dropout_implementation="upscale_in_train",
- is_test=False)
- return out
-
-
-pre_process_layer = partial(pre_post_process_layer, None)
-post_process_layer = pre_post_process_layer
-
-
-def encoder_layer(enc_input,
- attn_bias,
- n_head,
- d_key,
- d_value,
- d_model,
- d_inner_hid,
- prepostprocess_dropout,
- attention_dropout,
- relu_dropout,
- hidden_act,
- preprocess_cmd="n",
- postprocess_cmd="da",
- param_initializer=None,
- name=''):
- """The encoder layers that can be stacked to form a deep encoder.
- This module consits of a multi-head (self) attention followed by
- position-wise feed-forward networks and both the two components companied
- with the post_process_layer to add residual connection, layer normalization
- and droput.
- """
- attn_output = multi_head_attention(pre_process_layer(enc_input,
- preprocess_cmd,
- prepostprocess_dropout,
- name=name + '_pre_att'),
- None,
- None,
- attn_bias,
- d_key,
- d_value,
- d_model,
- n_head,
- attention_dropout,
- param_initializer=param_initializer,
- name=name + '_multi_head_att')
- attn_output = post_process_layer(enc_input,
- attn_output,
- postprocess_cmd,
- prepostprocess_dropout,
- name=name + '_post_att')
- ffd_output = positionwise_feed_forward(pre_process_layer(attn_output,
- preprocess_cmd,
- prepostprocess_dropout,
- name=name + '_pre_ffn'),
- d_inner_hid,
- d_model,
- relu_dropout,
- hidden_act,
- param_initializer=param_initializer,
- name=name + '_ffn')
- return post_process_layer(attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + '_post_ffn')
-
-
-def encoder(enc_input,
- attn_bias,
- n_layer,
- n_head,
- d_key,
- d_value,
- d_model,
- d_inner_hid,
- prepostprocess_dropout,
- attention_dropout,
- relu_dropout,
- hidden_act,
- preprocess_cmd="n",
- postprocess_cmd="da",
- param_initializer=None,
- name=''):
- """
- The encoder is composed of a stack of identical layers returned by calling
- encoder_layer.
- """
- for i in range(n_layer):
- enc_output = encoder_layer(enc_input,
- attn_bias,
- n_head,
- d_key,
- d_value,
- d_model,
- d_inner_hid,
- prepostprocess_dropout,
- attention_dropout,
- relu_dropout,
- hidden_act,
- preprocess_cmd,
- postprocess_cmd,
- param_initializer=param_initializer,
- name=name + '_layer_' + str(i))
- enc_input = enc_output
- enc_output = pre_process_layer(enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
-
- return enc_output
diff --git a/modules/text/language_model/rbtl3/module.py b/modules/text/language_model/rbtl3/module.py
index a60c30a4..500fc42c 100644
--- a/modules/text/language_model/rbtl3/module.py
+++ b/modules/text/language_model/rbtl3/module.py
@@ -1,7 +1,6 @@
-# coding:utf-8
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
-# Licensed under the Apache License, Version 2.0 (the "License"
+# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
@@ -12,62 +11,120 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+from typing import Dict
import os
+import math
-from paddlehub import TransformerModule
-from paddlehub.module.module import moduleinfo
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
-from rbtl3.model.bert import BertConfig, BertModel
+from paddlenlp.transformers.roberta.modeling import RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel
+from paddlenlp.transformers.roberta.tokenizer import RobertaTokenizer
+from paddlenlp.metrics import ChunkEvaluator
+from paddlehub.module.module import moduleinfo
+from paddlehub.module.nlp_module import TransformerModule
+from paddlehub.utils.log import logger
@moduleinfo(
name="rbtl3",
- version="1.0.0",
+ version="2.0.1",
summary="rbtl3, 3-layer, 1024-hidden, 16-heads, 61M parameters ",
author="ymcui",
author_email="ymcui@ir.hit.edu.cn",
type="nlp/semantic_model",
+ meta=TransformerModule,
)
-class BertWwm(TransformerModule):
- def _initialize(self):
- self.MAX_SEQ_LEN = 512
- self.params_path = os.path.join(self.directory, "assets", "params")
- self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt")
+class Roberta(nn.Layer):
+ """
+ RoBERTa model
+ """
- bert_config_path = os.path.join(self.directory, "assets", "bert_config_rbtl3.json")
- self.bert_config = BertConfig(bert_config_path)
+ def __init__(
+ self,
+ task: str = None,
+ load_checkpoint: str = None,
+ label_map: Dict = None,
+ num_classes: int = 2,
+ **kwargs,
+ ):
+ super(Roberta, self).__init__()
+ if label_map:
+ self.label_map = label_map
+ self.num_classes = len(label_map)
+ else:
+ self.num_classes = num_classes
- def net(self, input_ids, position_ids, segment_ids, input_mask):
- """
- create neural network.
+ if task == 'sequence_classification':
+ task = 'seq-cls'
+ logger.warning(
+ "current task name 'sequence_classification' was renamed to 'seq-cls', "
+ "'sequence_classification' has been deprecated and will be removed in the future.",
+ )
+ if task == 'seq-cls':
+ self.model = RobertaForSequenceClassification.from_pretrained(
+ pretrained_model_name_or_path='rbtl3',
+ num_classes=self.num_classes,
+ **kwargs
+ )
+ self.criterion = paddle.nn.loss.CrossEntropyLoss()
+ self.metric = paddle.metric.Accuracy()
+ elif task == 'token-cls':
+ self.model = RobertaForTokenClassification.from_pretrained(
+ pretrained_model_name_or_path='rbtl3',
+ num_classes=self.num_classes,
+ **kwargs
+ )
+ self.criterion = paddle.nn.loss.CrossEntropyLoss()
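+ # ChunkEvaluator reports span-level precision/recall/F1; the label list below follows the id order defined by label_map.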
+ self.metric = ChunkEvaluator(
+ label_list=[self.label_map[i] for i in sorted(self.label_map.keys())]
+ )
+ elif task is None:
+ self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='rbtl3', **kwargs)
+ else:
+ raise RuntimeError("Unknown task {}, task should be one in {}".format(
+ task, self._tasks_supported))
- Args:
- input_ids (tensor): the word ids.
- position_ids (tensor): the position ids.
- segment_ids (tensor): the segment ids.
- input_mask (tensor): the padding mask.
+ self.task = task
- Returns:
- pooled_output (tensor): sentence-level output for classification task.
- sequence_output (tensor): token-level output for sequence task.
- """
- bert = BertModel(src_ids=input_ids,
- position_ids=position_ids,
- sentence_ids=segment_ids,
- input_mask=input_mask,
- config=self.bert_config,
- use_fp16=False)
- pooled_output = bert.get_pooled_output()
- sequence_output = bert.get_sequence_output()
- return pooled_output, sequence_output
+ if load_checkpoint is not None and os.path.isfile(load_checkpoint):
+ state_dict = paddle.load(load_checkpoint)
+ self.set_state_dict(state_dict)
+ logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))
+ def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None):
+ result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
+ if self.task == 'seq-cls':
+ logits = result
+ probs = F.softmax(logits, axis=1)
+ if labels is not None:
+ loss = self.criterion(logits, labels)
+ correct = self.metric.compute(probs, labels)
+ acc = self.metric.update(correct)
+ return probs, loss, {'acc': acc}
+ return probs
+ elif self.task == 'token-cls':
+ logits = result
+ token_level_probs = F.softmax(logits, axis=-1)
+ preds = token_level_probs.argmax(axis=-1)
+ if labels is not None:
+ loss = self.criterion(logits, labels.unsqueeze(-1))
+ num_infer_chunks, num_label_chunks, num_correct_chunks = \
+ self.metric.compute(None, seq_lengths, preds, labels)
+ self.metric.update(
+ num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
+ _, _, f1_score = map(float, self.metric.accumulate())
+ return token_level_probs, loss, {'f1_score': f1_score}
+ return token_level_probs
+ else:
+ sequence_output, pooled_output = result
+ return sequence_output, pooled_output
-if __name__ == '__main__':
- test_module = BertWwm()
+ @staticmethod
+ def get_tokenizer(*args, **kwargs):
+ """
+ Gets the tokenizer that is customized for this module.
+ """
+ return RobertaTokenizer.from_pretrained(
+ pretrained_model_name_or_path='rbtl3', *args, **kwargs)
diff --git a/modules/text/language_model/roberta-wwm-ext-large/README.md b/modules/text/language_model/roberta-wwm-ext-large/README.md
index 77d1b02c..4d19bf2b 100644
--- a/modules/text/language_model/roberta-wwm-ext-large/README.md
+++ b/modules/text/language_model/roberta-wwm-ext-large/README.md
@@ -1,5 +1,5 @@
```shell
-$ hub install roberta-wwm-ext-large==2.0.0
+$ hub install roberta-wwm-ext-large==2.0.1
```
@@ -13,29 +13,35 @@ $ hub install roberta-wwm-ext-large==2.0.0
def __init__(
task=None,
load_checkpoint=None,
- label_map=None)
+ label_map=None,
+ num_classes=2,
+ **kwargs,
+)
```
创建Module对象(动态图组网版本)。
**参数**
-* `task`: 任务名称,可为`sequence_classification`。
+* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。
* `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。
* `label_map`:预测时的类别映射表。
+* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。
+* `**kwargs`:用户额外指定的关键字字典类型的参数。
```python
def predict(
data,
max_seq_len=128,
batch_size=1,
- use_gpu=False)
+ use_gpu=False
+)
```
**参数**
* `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,
- 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。
+ 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。
* `max_seq_len`:模型处理文本的最大长度
* `batch_size`:模型批处理大小
* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
@@ -44,7 +50,9 @@ def predict(
```python
def get_embedding(
- texts,
+ data,
+ max_seq_len=128,
+ batch_size=1,
use_gpu=False
)
```
@@ -53,7 +61,9 @@ def get_embedding(
**参数**
-* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
+* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
+* `max_seq_len`:模型处理文本的最大长度。
+* `batch_size`:模型批处理大小。
* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
**返回**
@@ -67,16 +77,16 @@ def get_embedding(
import paddlehub as hub
data = [
- '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般',
- '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片',
- '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。',
+ ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'],
+ ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'],
+ ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'],
]
label_map = {0: 'negative', 1: 'positive'}
model = hub.Module(
name='roberta-wwm-ext-large',
- version='2.0.0',
- task='sequence_classification',
+ version='2.0.1',
+ task='seq-cls',
load_checkpoint='/path/to/parameters',
label_map=label_map)
results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
@@ -84,7 +94,9 @@ for idx, text in enumerate(data):
print('Data: {} \t Lable: {}'.format(text, results[idx]))
```
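+
+A minimal, non-authoritative sketch of the embedding use case described by `get_embedding` above (`task=None`); the unpacking order simply follows the documented `[pooled_feature, seq_feature]` format:
+
+```python
+import paddlehub as hub
+
+# task=None keeps the bare pre-trained model, which exposes get_embedding.
+model = hub.Module(name='roberta-wwm-ext-large', version='2.0.1', task=None)
+data = [['今天是个好日子'], ['天气预报说今天要下雨']]
+embeddings = model.get_embedding(data, max_seq_len=50, batch_size=1, use_gpu=False)
+for pooled_feature, seq_feature in embeddings:
+    # pooled_feature: sentence-level feature; seq_feature: token-level features
+    print(len(pooled_feature), len(seq_feature))
+```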
-参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation
+详情可参考PaddleHub示例:
+- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)
## 服务部署
@@ -110,12 +122,12 @@ $ hub serving start -m roberta-wwm-ext-large
import requests
import json
-# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]}
-text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]]
-# 以key的方式指定text传入预测方法的时的参数,此例中为"texts"
-# 对应本地部署,则为module.get_embedding(texts=text)
-data = {"texts": text}
-# 发送post请求,content-type类型应指定json方式
+# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ... ]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# 以key的方式指定text传入预测方法时的参数,此例中为"data"
+# 对应本地部署,则为module.get_embedding(data=text)
+data = {"data": text}
+# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip
url = "http://10.12.121.132:8866/predict/roberta-wwm-ext-large"
# 指定post请求的headers为application/json方式
headers = {"Content-Type": "application/json"}
@@ -126,7 +138,7 @@ print(r.json())
## 查看代码
-https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/pretrain_langauge_models/BERT
+https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/paddlenlp/transformers/roberta
## 依赖
@@ -144,3 +156,7 @@ paddlehub >= 2.0.0
* 2.0.0
全面升级动态图,接口有所变化。
+
+* 2.0.1
+
+ 任务名称调整,增加序列标注任务`token-cls`
diff --git a/modules/text/language_model/roberta-wwm-ext-large/module.py b/modules/text/language_model/roberta-wwm-ext-large/module.py
index 7785bf1e..aa45811d 100644
--- a/modules/text/language_model/roberta-wwm-ext-large/module.py
+++ b/modules/text/language_model/roberta-wwm-ext-large/module.py
@@ -11,29 +11,31 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from typing import Dict, List, Optional, Union, Tuple
+from typing import Dict
import os
+import math
-from paddle.dataset.common import DATA_HOME
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
-from paddlehub import BertTokenizer
-from paddlehub.module.modeling_roberta import RobertaModel, RobertaForSequenceClassification
-from paddlehub.module.module import moduleinfo, serving
+from paddlenlp.transformers.roberta.modeling import RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel
+from paddlenlp.transformers.roberta.tokenizer import RobertaTokenizer
+from paddlenlp.metrics import ChunkEvaluator
+from paddlehub.module.module import moduleinfo
+from paddlehub.module.nlp_module import TransformerModule
from paddlehub.utils.log import logger
-from paddlehub.utils.utils import download
@moduleinfo(
name="roberta-wwm-ext-large",
- version="2.0.0",
+ version="2.0.1",
summary=
"chinese-roberta-wwm-ext-large, 24-layer, 1024-hidden, 16-heads, 340M parameters. The module is executed as paddle.dygraph.",
author="ymcui",
author_email="ymcui@ir.hit.edu.cn",
type="nlp/semantic_model",
+ meta=TransformerModule,
)
class Roberta(nn.Layer):
"""
@@ -42,181 +44,88 @@ class Roberta(nn.Layer):
def __init__(
self,
- task=None,
- load_checkpoint=None,
- label_map=None,
+ task: str = None,
+ load_checkpoint: str = None,
+ label_map: Dict = None,
+ num_classes: int = 2,
+ **kwargs,
):
super(Roberta, self).__init__()
- # TODO(zhangxuefei): add token_classification task
+ if label_map:
+ self.label_map = label_map
+ self.num_classes = len(label_map)
+ else:
+ self.num_classes = num_classes
+
if task == 'sequence_classification':
+ task = 'seq-cls'
+ logger.warning(
+ "current task name 'sequence_classification' was renamed to 'seq-cls', "
+ "'sequence_classification' has been deprecated and will be removed in the future.",
+ )
+ if task == 'seq-cls':
self.model = RobertaForSequenceClassification.from_pretrained(
- pretrained_model_name_or_path='roberta-wwm-ext-large')
+ pretrained_model_name_or_path='roberta-wwm-ext-large',
+ num_classes=self.num_classes,
+ **kwargs
+ )
self.criterion = paddle.nn.loss.CrossEntropyLoss()
- self.metric = paddle.metric.Accuracy(name='acc_accumulation')
+ self.metric = paddle.metric.Accuracy()
+ elif task == 'token-cls':
+ self.model = RobertaForTokenClassification.from_pretrained(
+ pretrained_model_name_or_path='roberta-wwm-ext-large',
+ num_classes=self.num_classes,
+ **kwargs
+ )
+ self.criterion = paddle.nn.loss.CrossEntropyLoss()
+ self.metric = ChunkEvaluator(
+ label_list=[self.label_map[i] for i in sorted(self.label_map.keys())]
+ )
elif task is None:
- self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext-large')
+ self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext-large', **kwargs)
else:
- raise RuntimeError("Unknown task %s, task should be sequence_classification" % task)
+ raise RuntimeError("Unknown task {}, task should be one in {}".format(
+ task, self._tasks_supported))
self.task = task
- self.label_map = label_map
if load_checkpoint is not None and os.path.isfile(load_checkpoint):
state_dict = paddle.load(load_checkpoint)
self.set_state_dict(state_dict)
logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))
- def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None):
+ def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None):
result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
- if self.task is not None:
+ if self.task == 'seq-cls':
logits = result
probs = F.softmax(logits, axis=1)
if labels is not None:
loss = self.criterion(logits, labels)
correct = self.metric.compute(probs, labels)
acc = self.metric.update(correct)
- return probs, loss, acc
+ return probs, loss, {'acc': acc}
return probs
+ elif self.task == 'token-cls':
+ logits = result
+ token_level_probs = F.softmax(logits, axis=-1)
+ preds = token_level_probs.argmax(axis=-1)
+ if labels is not None:
+ loss = self.criterion(logits, labels.unsqueeze(-1))
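+ # seq_lengths gives the number of real (non-padding) tokens per example, so the
+ # chunk metric only counts predictions inside the valid part of each sequence.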
+ num_infer_chunks, num_label_chunks, num_correct_chunks = \
+ self.metric.compute(None, seq_lengths, preds, labels)
+ self.metric.update(
+ num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
+ _, _, f1_score = map(float, self.metric.accumulate())
+ return token_level_probs, loss, {'f1_score': f1_score}
+ return token_level_probs
else:
sequence_output, pooled_output = result
return sequence_output, pooled_output
- def get_vocab_path(self):
- """
- Gets the path of the module vocabulary path.
- """
- save_path = os.path.join(DATA_HOME, 'roberta-wwm-ext-large', 'vocab.txt')
- if not os.path.exists(save_path) or not os.path.isfile(save_path):
- url = "https://paddlenlp.bj.bcebos.com/models/transformers/roberta_large/vocab.txt"
- download(url, os.path.join(DATA_HOME, 'roberta-wwm-ext-large'))
- return save_path
-
- def get_tokenizer(self, tokenize_chinese_chars=True):
+ @staticmethod
+ def get_tokenizer(*args, **kwargs):
"""
Gets the tokenizer that is customized for this module.
- Args:
- tokenize_chinese_chars (:obj: bool , defaults to :obj: True):
- Whether to tokenize chinese characters or not.
- Returns:
- tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module.
- """
- return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path())
-
- def training_step(self, batch: List[paddle.Tensor], batch_idx: int):
- """
- One step for training, which should be called as forward computation.
- Args:
- batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed,
- such as input_ids, sent_ids, pos_ids, input_mask and labels.
- batch_idx(int): The index of batch.
- Returns:
- results(:obj: Dict) : The model outputs, such as loss and metrics.
- """
- predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2])
- return {'loss': avg_loss, 'metrics': {'acc': acc}}
-
- def validation_step(self, batch: List[paddle.Tensor], batch_idx: int):
- """
- One step for validation, which should be called as forward computation.
- Args:
- batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed,
- such as input_ids, sent_ids, pos_ids, input_mask and labels.
- batch_idx(int): The index of batch.
- Returns:
- results(:obj: Dict) : The model outputs, such as metrics.
- """
- predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2])
- return {'metrics': {'acc': acc}}
-
- def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False):
"""
- Predicts the data labels.
-
- Args:
- data (obj:`List(str)`): The processed data whose each element is the raw text.
- max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`):
- If set to a number, will limit the total sequence returned so that it has a maximum length.
- batch_size(obj:`int`, defaults to 1): The number of batch.
- use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not.
-
- Returns:
- results(obj:`list`): All the predictions labels.
- """
- # TODO(zhangxuefei): add task token_classification task predict.
- if self.task not in ['sequence_classification']:
- raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task)
-
- paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu')
- tokenizer = self.get_tokenizer()
-
- examples = []
- for text in data:
- if len(text) == 1:
- encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len)
- elif len(text) == 2:
- encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len)
- else:
- raise RuntimeError(
- 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text))
- examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids']))
-
- def _batchify_fn(batch):
- input_ids = [entry[0] for entry in batch]
- segment_ids = [entry[1] for entry in batch]
- return input_ids, segment_ids
-
- # Seperates data into some batches.
- batches = []
- one_batch = []
- for example in examples:
- one_batch.append(example)
- if len(one_batch) == batch_size:
- batches.append(one_batch)
- one_batch = []
- if one_batch:
- # The last batch whose size is less than the config batch_size setting.
- batches.append(one_batch)
-
- results = []
- self.eval()
- for batch in batches:
- input_ids, segment_ids = _batchify_fn(batch)
- input_ids = paddle.to_tensor(input_ids)
- segment_ids = paddle.to_tensor(segment_ids)
-
- # TODO(zhangxuefei): add task token_classification postprocess after prediction.
- if self.task == 'sequence_classification':
- probs = self(input_ids, segment_ids)
- idx = paddle.argmax(probs, axis=1).numpy()
- idx = idx.tolist()
- labels = [self.label_map[i] for i in idx]
- results.extend(labels)
-
- return results
-
- @serving
- def get_embedding(self, texts, use_gpu=False):
- if self.task is not None:
- raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task)
-
- paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu')
-
- tokenizer = self.get_tokenizer()
- results = []
- for text in texts:
- if len(text) == 1:
- encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False)
- elif len(text) == 2:
- encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False)
- else:
- raise RuntimeError(
- 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text))
-
- input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0)
- segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0)
- sequence_output, pooled_output = self(input_ids, segment_ids)
-
- sequence_output = sequence_output.squeeze(0)
- pooled_output = pooled_output.squeeze(0)
- results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist()))
- return results
+ return RobertaTokenizer.from_pretrained(
+ pretrained_model_name_or_path='roberta-wwm-ext-large', *args, **kwargs)
diff --git a/modules/text/language_model/roberta-wwm-ext/README.md b/modules/text/language_model/roberta-wwm-ext/README.md
index 8f4eeb80..9ee71b85 100644
--- a/modules/text/language_model/roberta-wwm-ext/README.md
+++ b/modules/text/language_model/roberta-wwm-ext/README.md
@@ -1,5 +1,5 @@
```shell
-$ hub install roberta-wwm-ext==2.0.0
+$ hub install roberta-wwm-ext==2.0.1
```
@@ -13,29 +13,35 @@ $ hub install roberta-wwm-ext==2.0.0
def __init__(
task=None,
load_checkpoint=None,
- label_map=None)
+ label_map=None,
+ num_classes=2,
+ **kwargs,
+)
```
创建Module对象(动态图组网版本)。
**参数**
-* `task`: 任务名称,可为`sequence_classification`。
+* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。
* `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。
* `label_map`:预测时的类别映射表。
+* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。
+* `**kwargs`:用户额外指定的关键字字典类型的参数。
```python
def predict(
data,
max_seq_len=128,
batch_size=1,
- use_gpu=False)
+ use_gpu=False
+)
```
**参数**
* `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,
- 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。
+ 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。
* `max_seq_len`:模型处理文本的最大长度
* `batch_size`:模型批处理大小
* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
@@ -44,7 +50,9 @@ def predict(
```python
def get_embedding(
- texts,
+ data,
+ max_seq_len=128,
+ batch_size=1,
use_gpu=False
)
```
@@ -53,7 +61,9 @@ def get_embedding(
**参数**
-* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
+* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
+* `max_seq_len`:模型处理文本的最大长度。
+* `batch_size`:模型批处理大小。
* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
**返回**
@@ -67,16 +77,16 @@ def get_embedding(
import paddlehub as hub
data = [
- '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般',
- '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片',
- '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。',
+ ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'],
+ ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'],
+ ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'],
]
label_map = {0: 'negative', 1: 'positive'}
model = hub.Module(
name='roberta-wwm-ext',
- version='2.0.0',
- task='sequence_classification',
+ version='2.0.1',
+ task='seq-cls',
load_checkpoint='/path/to/parameters',
label_map=label_map)
results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
@@ -84,7 +94,9 @@ for idx, text in enumerate(data):
print('Data: {} \t Lable: {}'.format(text, results[idx]))
```
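+
+For fine-tuning on `token-cls`, a rough sketch along the lines of the PaddleHub sequence labeling demo (the BIO label scheme, checkpoint dir and hyper-parameters below are illustrative assumptions, not fixed values):
+
+```python
+import paddle
+import paddlehub as hub
+
+# Assumed BIO label scheme matching the MSRA-NER dataset shipped with PaddleHub.
+label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
+label_map = {idx: label for idx, label in enumerate(label_list)}
+
+model = hub.Module(name='roberta-wwm-ext', version='2.0.1', task='token-cls', label_map=label_map)
+train_dataset = hub.datasets.MSRA_NER(tokenizer=model.get_tokenizer(), max_seq_len=128, mode='train')
+
+optimizer = paddle.optimizer.AdamW(learning_rate=5e-5, parameters=model.parameters())
+trainer = hub.Trainer(model, optimizer, checkpoint_dir='token_cls_ckpt', use_gpu=False)
+trainer.train(train_dataset, epochs=3, batch_size=32, save_interval=1)
+```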
-参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation
+详情可参考PaddleHub示例:
+- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)
## 服务部署
@@ -110,12 +122,12 @@ $ hub serving start -m roberta-wwm-ext
import requests
import json
-# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]}
-text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]]
-# 以key的方式指定text传入预测方法的时的参数,此例中为"texts"
-# 对应本地部署,则为module.get_embedding(texts=text)
-data = {"texts": text}
-# 发送post请求,content-type类型应指定json方式
+# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ... ]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# 以key的方式指定text传入预测方法时的参数,此例中为"data"
+# 对应本地部署,则为module.get_embedding(data=text)
+data = {"data": text}
+# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip
url = "http://10.12.121.132:8866/predict/roberta-wwm-ext"
# 指定post请求的headers为application/json方式
headers = {"Content-Type": "application/json"}
@@ -126,7 +138,7 @@ print(r.json())
## 查看代码
-https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/pretrain_langauge_models/BERT
+https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/paddlenlp/transformers/roberta
## 依赖
@@ -144,3 +156,7 @@ paddlehub >= 2.0.0
* 2.0.0
全面升级动态图,接口有所变化。
+
+* 2.0.1
+
+ 任务名称调整,增加序列标注任务`token-cls`
diff --git a/modules/text/language_model/roberta-wwm-ext/module.py b/modules/text/language_model/roberta-wwm-ext/module.py
index a4df8146..8fa2bbe7 100644
--- a/modules/text/language_model/roberta-wwm-ext/module.py
+++ b/modules/text/language_model/roberta-wwm-ext/module.py
@@ -11,29 +11,31 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from typing import Dict, List, Optional, Union, Tuple
+from typing import Dict
import os
+import math
-from paddle.dataset.common import DATA_HOME
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
-from paddlehub import BertTokenizer
-from paddlehub.module.modeling_roberta import RobertaModel, RobertaForSequenceClassification
-from paddlehub.module.module import moduleinfo, serving
+from paddlenlp.transformers.roberta.modeling import RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel
+from paddlenlp.transformers.roberta.tokenizer import RobertaTokenizer
+from paddlenlp.metrics import ChunkEvaluator
+from paddlehub.module.module import moduleinfo
+from paddlehub.module.nlp_module import TransformerModule
from paddlehub.utils.log import logger
-from paddlehub.utils.utils import download
@moduleinfo(
name="roberta-wwm-ext",
- version="2.0.0",
+ version="2.0.1",
summary=
"chinese-roberta-wwm-ext, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.",
author="ymcui",
author_email="ymcui@ir.hit.edu.cn",
type="nlp/semantic_model",
+ meta=TransformerModule,
)
class Roberta(nn.Layer):
"""
@@ -42,181 +44,88 @@ class Roberta(nn.Layer):
def __init__(
self,
- task=None,
- load_checkpoint=None,
- label_map=None,
+ task: str = None,
+ load_checkpoint: str = None,
+ label_map: Dict = None,
+ num_classes: int = 2,
+ **kwargs,
):
super(Roberta, self).__init__()
- # TODO(zhangxuefei): add token_classification task
+ if label_map:
+ self.label_map = label_map
+ self.num_classes = len(label_map)
+ else:
+ self.num_classes = num_classes
+
if task == 'sequence_classification':
+ task = 'seq-cls'
+ logger.warning(
+ "current task name 'sequence_classification' was renamed to 'seq-cls', "
+ "'sequence_classification' has been deprecated and will be removed in the future.",
+ )
+ if task == 'seq-cls':
self.model = RobertaForSequenceClassification.from_pretrained(
- pretrained_model_name_or_path='roberta-wwm-ext')
+ pretrained_model_name_or_path='roberta-wwm-ext',
+ num_classes=self.num_classes,
+ **kwargs
+ )
self.criterion = paddle.nn.loss.CrossEntropyLoss()
- self.metric = paddle.metric.Accuracy(name='acc_accumulation')
+ self.metric = paddle.metric.Accuracy()
+ elif task == 'token-cls':
+ self.model = RobertaForTokenClassification.from_pretrained(
+ pretrained_model_name_or_path='roberta-wwm-ext',
+ num_classes=self.num_classes,
+ **kwargs
+ )
+ self.criterion = paddle.nn.loss.CrossEntropyLoss()
+ self.metric = ChunkEvaluator(
+ label_list=[self.label_map[i] for i in sorted(self.label_map.keys())]
+ )
elif task is None:
- self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext')
+ self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext', **kwargs)
else:
- raise RuntimeError("Unknown task %s, task should be sequence_classification" % task)
+ raise RuntimeError("Unknown task {}, task should be one in {}".format(
+ task, self._tasks_supported))
self.task = task
- self.label_map = label_map
if load_checkpoint is not None and os.path.isfile(load_checkpoint):
state_dict = paddle.load(load_checkpoint)
self.set_state_dict(state_dict)
logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))
- def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None):
+ def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None):
result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
- if self.task is not None:
+ if self.task == 'seq-cls':
logits = result
probs = F.softmax(logits, axis=1)
if labels is not None:
loss = self.criterion(logits, labels)
correct = self.metric.compute(probs, labels)
acc = self.metric.update(correct)
- return probs, loss, acc
+ return probs, loss, {'acc': acc}
return probs
+ elif self.task == 'token-cls':
+ logits = result
+ token_level_probs = F.softmax(logits, axis=-1)
+ preds = token_level_probs.argmax(axis=-1)
+ if labels is not None:
+ loss = self.criterion(logits, labels.unsqueeze(-1))
+ num_infer_chunks, num_label_chunks, num_correct_chunks = \
+ self.metric.compute(None, seq_lengths, preds, labels)
+ self.metric.update(
+ num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
+ _, _, f1_score = map(float, self.metric.accumulate())
+ return token_level_probs, loss, {'f1_score': f1_score}
+ return token_level_probs
else:
sequence_output, pooled_output = result
return sequence_output, pooled_output
- def get_vocab_path(self):
- """
- Gets the path of the module vocabulary path.
- """
- save_path = os.path.join(DATA_HOME, 'roberta-wwm-ext', 'vocab.txt')
- if not os.path.exists(save_path) or not os.path.isfile(save_path):
- url = "https://paddlenlp.bj.bcebos.com/models/transformers/roberta_base/vocab.txt"
- download(url, os.path.join(DATA_HOME, 'roberta-wwm-ext'))
- return save_path
-
- def get_tokenizer(self, tokenize_chinese_chars=True):
+ @staticmethod
+ def get_tokenizer(*args, **kwargs):
"""
Gets the tokenizer that is customized for this module.
- Args:
- tokenize_chinese_chars (:obj: bool , defaults to :obj: True):
- Whether to tokenize chinese characters or not.
- Returns:
- tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module.
- """
- return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path())
-
- def training_step(self, batch: List[paddle.Tensor], batch_idx: int):
- """
- One step for training, which should be called as forward computation.
- Args:
- batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed,
- such as input_ids, sent_ids, pos_ids, input_mask and labels.
- batch_idx(int): The index of batch.
- Returns:
- results(:obj: Dict) : The model outputs, such as loss and metrics.
- """
- predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2])
- return {'loss': avg_loss, 'metrics': {'acc': acc}}
-
- def validation_step(self, batch: List[paddle.Tensor], batch_idx: int):
- """
- One step for validation, which should be called as forward computation.
- Args:
- batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed,
- such as input_ids, sent_ids, pos_ids, input_mask and labels.
- batch_idx(int): The index of batch.
- Returns:
- results(:obj: Dict) : The model outputs, such as metrics.
- """
- predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2])
- return {'metrics': {'acc': acc}}
-
- def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False):
"""
- Predicts the data labels.
-
- Args:
- data (obj:`List(Union(str))`): The processed data (the one sequence or sequence pair) whose each element is the raw text.
- max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`):
- If set to a number, will limit the total sequence returned so that it has a maximum length.
- batch_size(obj:`int`, defaults to 1): The number of batch.
- use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not.
-
- Returns:
- results(obj:`list`): All the predictions labels.
- """
- # TODO(zhangxuefei): add task token_classification task predict.
- if self.task not in ['sequence_classification']:
- raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task)
-
- paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu')
- tokenizer = self.get_tokenizer()
-
- examples = []
- for text in data:
- if len(text) == 1:
- encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len)
- elif len(text) == 2:
- encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len)
- else:
- raise RuntimeError(
- 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text))
- examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids']))
-
- def _batchify_fn(batch):
- input_ids = [entry[0] for entry in batch]
- segment_ids = [entry[1] for entry in batch]
- return input_ids, segment_ids
-
- # Seperates data into some batches.
- batches = []
- one_batch = []
- for example in examples:
- one_batch.append(example)
- if len(one_batch) == batch_size:
- batches.append(one_batch)
- one_batch = []
- if one_batch:
- # The last batch whose size is less than the config batch_size setting.
- batches.append(one_batch)
-
- results = []
- self.eval()
- for batch in batches:
- input_ids, segment_ids = _batchify_fn(batch)
- input_ids = paddle.to_tensor(input_ids)
- segment_ids = paddle.to_tensor(segment_ids)
-
- # TODO(zhangxuefei): add task token_classification postprocess after prediction.
- if self.task == 'sequence_classification':
- probs = self(input_ids, segment_ids)
- idx = paddle.argmax(probs, axis=1).numpy()
- idx = idx.tolist()
- labels = [self.label_map[i] for i in idx]
- results.extend(labels)
-
- return results
-
- @serving
- def get_embedding(self, texts, use_gpu=False):
- if self.task is not None:
- raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task)
-
- paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu')
-
- tokenizer = self.get_tokenizer()
- results = []
- for text in texts:
- if len(text) == 1:
- encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False)
- elif len(text) == 2:
- encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False)
- else:
- raise RuntimeError(
- 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text))
-
- input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0)
- segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0)
- sequence_output, pooled_output = self(input_ids, segment_ids)
-
- sequence_output = sequence_output.squeeze(0)
- pooled_output = pooled_output.squeeze(0)
- results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist()))
- return results
+ return RobertaTokenizer.from_pretrained(
+ pretrained_model_name_or_path='roberta-wwm-ext', *args, **kwargs)
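Note (not part of the patch): a hedged usage sketch of the tokenizer that `get_tokenizer()` now returns, assuming paddlenlp exposes `RobertaTokenizer.from_pretrained` for this model name.

```python
# Sketch only: tokenize a Chinese sentence with the paddlenlp RobertaTokenizer
# that the module's get_tokenizer() is assumed to return.
from paddlenlp.transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext')
tokens = tokenizer.tokenize('今天天气很好')          # wordpiece-style sub tokens
token_ids = tokenizer.convert_tokens_to_ids(tokens)  # map tokens to vocab ids
print(tokens, token_ids)
```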
diff --git a/paddlehub/datasets/base_nlp_dataset.py b/paddlehub/datasets/base_nlp_dataset.py
index acca7b8c..1c9ae13a 100644
--- a/paddlehub/datasets/base_nlp_dataset.py
+++ b/paddlehub/datasets/base_nlp_dataset.py
@@ -246,15 +246,9 @@ class TextClassificationDataset(BaseNLPDataset, paddle.io.Dataset):
def __getitem__(self, idx):
record = self.records[idx]
if 'label' in record.keys():
- if isinstance(self.tokenizer, BertTokenizer):
- return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['label'])
- elif isinstance(self.tokenizer, CustomTokenizer):
- return np.array(record['text']), np.array(record['seq_len']), np.array(record['label'])
+ return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['label'], dtype=np.int64)
else:
- if isinstance(self.tokenizer, BertTokenizer):
- return np.array(record['input_ids']), np.array(record['segment_ids'])
- elif isinstance(self.tokenizer, CustomTokenizer):
- return np.array(record['text']), np.array(record['seq_len'])
+ return np.array(record['input_ids']), np.array(record['segment_ids'])
def __len__(self):
return len(self.records)
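Note (not part of the patch): the explicit `dtype=np.int64` matters because numpy's default integer type is int32 on Windows, while Paddle's loss functions expect int64 labels. A minimal illustration:

```python
# Show the platform-dependent default integer dtype versus the forced int64.
import numpy as np

label_default = np.array([3])                # int32 on Windows, int64 on most Linux builds
label_fixed = np.array([3], dtype=np.int64)  # always int64, matching Paddle's expectation
print(label_default.dtype, label_fixed.dtype)
```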
@@ -269,8 +263,9 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset):
data_file: str = None,
label_file: str = None,
label_list: list = None,
- split_char="\002",
- no_entity_label="O",
+ split_char: str = "\002",
+ no_entity_label: str = "O",
+ ignore_label: int = -100,
is_file_with_header: bool = False):
super(SeqLabelingDataset, self).__init__(
base_path=base_path,
@@ -283,6 +278,7 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset):
self.no_entity_label = no_entity_label
self.split_char = split_char
+ self.ignore_label = ignore_label
self.examples = self._read_file(self.data_file, is_file_with_header)
self.records = self._convert_examples_to_records(self.examples)
@@ -327,8 +323,7 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset):
continue
if labels:
record["label"] = []
- tokens_with_specical_token = self.tokenizer.decode(
- record, only_convert_to_tokens=True)
+ tokens_with_specical_token = self.tokenizer.convert_ids_to_tokens(record['input_ids'])
tokens_index = 0
for token in tokens_with_specical_token:
if tokens_index < len(
@@ -336,6 +331,8 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset):
record["label"].append(
self.label_list.index(labels[tokens_index]))
tokens_index += 1
+ elif token in [self.tokenizer.pad_token]:
+ record["label"].append(self.ignore_label) # label of special token
else:
record["label"].append(
self.label_list.index(self.no_entity_label))
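Note (an assumption, not shown in this patch): positions labelled with `ignore_label` (-100 by default) are intended to be skipped by the loss through `ignore_index`, so padded positions do not contribute to the token-cls gradient.

```python
# Sketch: a padded position labelled -100 is excluded from the cross-entropy loss.
import paddle

num_labels = 7
logits = paddle.rand([4, num_labels])          # 4 token positions
labels = paddle.to_tensor([6, 0, 1, -100])     # last position is padding
loss_fn = paddle.nn.CrossEntropyLoss(ignore_index=-100)
print(loss_fn(logits, labels).numpy())
```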
@@ -351,7 +348,7 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset):
ret_tokens = []
ret_labels = []
for token, label in zip(tokens, labels):
- sub_token = self.tokenizer.tokenize(token)
+ sub_token = self.tokenizer(token)
if len(sub_token) == 0:
continue
ret_tokens.extend(sub_token)
@@ -370,7 +367,7 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset):
else:
ret_tokens = []
for token in tokens:
- sub_token = self.tokenizer.tokenize(token)
+ sub_token = self.tokenizer(token)
if len(sub_token) == 0:
continue
ret_tokens.extend(sub_token)
@@ -381,15 +378,9 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset):
def __getitem__(self, idx):
record = self.records[idx]
if 'label' in record.keys():
- if isinstance(self.tokenizer, BertTokenizer):
- return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['label'])
- else: # TODO(chenxiaojie): add CustomTokenizer supported
- raise NotImplementedError
+ return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['seq_len']), np.array(record['label'], dtype=np.int64)
else:
- if isinstance(self.tokenizer, BertTokenizer):
- return np.array(record['input_ids']), np.array(record['segment_ids'])
- else: # TODO(chenxiaojie): add CustomTokenizer supported
- raise NotImplementedError
+ return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['seq_len'])
def __len__(self):
return len(self.records)
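Note (not part of the patch): a minimal sketch of the record layout the updated `__getitem__` now returns for a labelled token-cls example, i.e. four arrays instead of three, with the sequence length inserted before the int64 label array (the values below are made up).

```python
# Rebuild a sample tuple the way the new __getitem__ does for a labelled record.
import numpy as np

record = {'input_ids': [101, 2769, 102], 'segment_ids': [0, 0, 0],
          'seq_len': 3, 'label': [6, 6, 6]}
sample = (np.array(record['input_ids']),
          np.array(record['segment_ids']),
          np.array(record['seq_len']),
          np.array(record['label'], dtype=np.int64))
print([arr.dtype for arr in sample])
```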
diff --git a/paddlehub/datasets/msra_ner.py b/paddlehub/datasets/msra_ner.py
index c9f4b2e3..8440e7c0 100644
--- a/paddlehub/datasets/msra_ner.py
+++ b/paddlehub/datasets/msra_ner.py
@@ -31,8 +31,16 @@ class MSRA_NER(SeqLabelingDataset):
for research purposes. For more information please refer to
https://www.microsoft.com/en-us/download/details.aspx?id=52531
"""
- def __init__(self, tokenizer: Union[BertTokenizer, CustomTokenizer], max_seq_len: int = 128, mode: str = 'train'):
+
+ def __init__(
+ self,
+ tokenizer: Union[BertTokenizer, CustomTokenizer],
+ max_seq_len: int = 128,
+ mode: str = 'train',
+ ):
base_path = os.path.join(DATA_HOME, "msra_ner")
+ label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
+
if mode == 'train':
data_file = 'train.tsv'
elif mode == 'test':
@@ -46,6 +54,6 @@ class MSRA_NER(SeqLabelingDataset):
mode=mode,
data_file=data_file,
label_file=None,
- label_list=["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"],
+ label_list=label_list,
is_file_with_header=True,
)
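Note (not part of the patch): a hedged sketch of building the dataset; the import paths and the choice of tokenizer are assumptions, and the msra_ner files are assumed to be available under DATA_HOME.

```python
# Sketch: instantiate the MSRA_NER sequence-labeling dataset with a BERT tokenizer.
from paddlenlp.transformers import BertTokenizer
from paddlehub.datasets import MSRA_NER

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
train_ds = MSRA_NER(tokenizer=tokenizer, max_seq_len=128, mode='train')
print(len(train_ds), train_ds.label_list)
```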
diff --git a/paddlehub/module/modeling_bert.py b/paddlehub/module/modeling_bert.py
deleted file mode 100644
index 5ab602b8..00000000
--- a/paddlehub/module/modeling_bert.py
+++ /dev/null
@@ -1,289 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# FIXME(zhangxuefei): remove this file after paddlenlp is released.
-
-import paddle
-import paddle.nn as nn
-
-from paddlehub.module.nlp_module import PretrainedModel, register_base_model
-
-
-class BertEmbeddings(nn.Layer):
- """
- Include embeddings from word, position and token_type embeddings
- """
-
- def __init__(self,
- vocab_size,
- hidden_size=768,
- hidden_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=16):
- super(BertEmbeddings, self).__init__()
- self.word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
- self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
- self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
- self.layer_norm = nn.LayerNorm(hidden_size)
- self.dropout = nn.Dropout(hidden_dropout_prob)
-
- def forward(self, input_ids, token_type_ids=None, position_ids=None):
- if position_ids is None:
- # maybe need use shape op to unify static graph and dynamic graph
- seq_length = input_ids.shape[1]
- position_ids = paddle.arange(0, seq_length, dtype="int64")
- if token_type_ids is None:
- token_type_ids = paddle.zeros_like(input_ids, dtype="int64")
-
- input_embedings = self.word_embeddings(input_ids)
- position_embeddings = self.position_embeddings(position_ids)
- token_type_embeddings = self.token_type_embeddings(token_type_ids)
-
- embeddings = input_embedings + position_embeddings + token_type_embeddings
- embeddings = self.layer_norm(embeddings)
- embeddings = self.dropout(embeddings)
- return embeddings
-
-
-class BertPooler(nn.Layer):
- """
- """
-
- def __init__(self, hidden_size):
- super(BertPooler, self).__init__()
- self.dense = nn.Linear(hidden_size, hidden_size)
- self.activation = nn.Tanh()
-
- def forward(self, hidden_states):
- # We "pool" the model by simply taking the hidden state corresponding
- # to the first token.
- first_token_tensor = hidden_states[:, 0]
- pooled_output = self.dense(first_token_tensor)
- pooled_output = self.activation(pooled_output)
- return pooled_output
-
-
-class BertPretrainedModel(PretrainedModel):
- """
- An abstract class for pretrained BERT models. It provides BERT related
- `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`,
- `pretrained_init_configuration`, `base_model_prefix` for downloading and
- loading pretrained models. See `PretrainedModel` for more details.
- """
-
- model_config_file = "model_config.json"
- pretrained_init_configuration = {
- "bert-base-uncased": {
- "vocab_size": 30522,
- "hidden_size": 768,
- "num_hidden_layers": 12,
- "num_attention_heads": 12,
- "intermediate_size": 3072,
- "hidden_act": "gelu",
- "hidden_dropout_prob": 0.1,
- "attention_probs_dropout_prob": 0.1,
- "max_position_embeddings": 512,
- "type_vocab_size": 2,
- "initializer_range": 0.02,
- "pad_token_id": 0,
- },
- "bert-large-uncased": {
- "vocab_size": 30522,
- "hidden_size": 1024,
- "num_hidden_layers": 24,
- "num_attention_heads": 16,
- "intermediate_size": 4096,
- "hidden_act": "gelu",
- "hidden_dropout_prob": 0.1,
- "attention_probs_dropout_prob": 0.1,
- "max_position_embeddings": 512,
- "type_vocab_size": 2,
- "initializer_range": 0.02,
- "pad_token_id": 0,
- },
- "bert-base-multilingual-uncased": {
- "vocab_size": 105879,
- "hidden_size": 768,
- "num_hidden_layers": 12,
- "num_attention_heads": 12,
- "intermediate_size": 3072,
- "hidden_act": "gelu",
- "hidden_dropout_prob": 0.1,
- "attention_probs_dropout_prob": 0.1,
- "max_position_embeddings": 512,
- "type_vocab_size": 2,
- "initializer_range": 0.02,
- "pad_token_id": 0,
- },
- "bert-base-cased": {
- "vocab_size": 30522,
- "hidden_size": 768,
- "num_hidden_layers": 12,
- "num_attention_heads": 12,
- "intermediate_size": 3072,
- "hidden_act": "gelu",
- "hidden_dropout_prob": 0.1,
- "attention_probs_dropout_prob": 0.1,
- "max_position_embeddings": 512,
- "type_vocab_size": 2,
- "initializer_range": 0.02,
- "pad_token_id": 0,
- },
- "bert-base-chinese": {
- "vocab_size": 21128,
- "hidden_size": 768,
- "num_hidden_layers": 12,
- "num_attention_heads": 12,
- "intermediate_size": 3072,
- "hidden_act": "gelu",
- "hidden_dropout_prob": 0.1,
- "attention_probs_dropout_prob": 0.1,
- "max_position_embeddings": 512,
- "type_vocab_size": 2,
- "initializer_range": 0.02,
- "pad_token_id": 0,
- },
- "bert-base-multilingual-cased": {
- "vocab_size": 119547,
- "hidden_size": 768,
- "num_hidden_layers": 12,
- "num_attention_heads": 12,
- "intermediate_size": 3072,
- "hidden_act": "gelu",
- "hidden_dropout_prob": 0.1,
- "attention_probs_dropout_prob": 0.1,
- "max_position_embeddings": 512,
- "type_vocab_size": 2,
- "initializer_range": 0.02,
- "pad_token_id": 0,
- },
- "bert-large-cased": {
- "vocab_size": 28996,
- "hidden_size": 1024,
- "num_hidden_layers": 24,
- "num_attention_heads": 16,
- "intermediate_size": 4096,
- "hidden_act": "gelu",
- "hidden_dropout_prob": 0.1,
- "attention_probs_dropout_prob": 0.1,
- "max_position_embeddings": 512,
- "type_vocab_size": 2,
- "initializer_range": 0.02,
- "pad_token_id": 0,
- },
- }
- resource_files_names = {"model_state": "model_state.pdparams"}
- pretrained_resource_files_map = {
- "model_state": {
- "bert-base-uncased": "https://paddlenlp.bj.bcebos.com/models/transformers/bert-base-uncased.pdparams",
- "bert-large-uncased": "https://paddlenlp.bj.bcebos.com/models/transformers/bert-large-uncased.pdparams",
- "bert-base-multilingual-uncased":
- "http://paddlenlp.bj.bcebos.com/models/transformers/bert-base-multilingual-uncased.pdparams",
- "bert-base-cased": "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-base-cased.pdparams",
- "bert-base-chinese": "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-base-chinese.pdparams",
- "bert-base-multilingual-cased":
- "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-base-multilingual-cased.pdparamss",
- "bert-large-cased": "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-large-cased.pdparams"
- }
- }
- base_model_prefix = "bert"
-
- def init_weights(self, layer):
- """ Initialization hook """
- if isinstance(layer, (nn.Linear, nn.Embedding)):
- # only support dygraph, use truncated_normal and make it inplace
- # and configurable later
- layer.weight.set_value(
- paddle.tensor.normal(
- mean=0.0,
- std=self.initializer_range
- if hasattr(self, "initializer_range") else self.bert.config["initializer_range"],
- shape=layer.weight.shape))
- elif isinstance(layer, nn.LayerNorm):
- layer._epsilon = 1e-12
-
-
-@register_base_model
-class BertModel(BertPretrainedModel):
- """
- """
-
- def __init__(self,
- vocab_size,
- hidden_size=768,
- num_hidden_layers=12,
- num_attention_heads=12,
- intermediate_size=3072,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=16,
- initializer_range=0.02,
- pad_token_id=0):
- super(BertModel, self).__init__()
- self.pad_token_id = pad_token_id
- self.initializer_range = initializer_range
- self.embeddings = BertEmbeddings(vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings,
- type_vocab_size)
- encoder_layer = nn.TransformerEncoderLayer(
- hidden_size,
- num_attention_heads,
- intermediate_size,
- dropout=hidden_dropout_prob,
- activation=hidden_act,
- attn_dropout=attention_probs_dropout_prob,
- act_dropout=0)
- self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
- self.pooler = BertPooler(hidden_size)
- self.apply(self.init_weights)
-
- def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
- if attention_mask is None:
- attention_mask = paddle.unsqueeze(
- (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e9, axis=[1, 2])
- embedding_output = self.embeddings(
- input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
- encoder_outputs = self.encoder(embedding_output, attention_mask)
- sequence_output = encoder_outputs
- pooled_output = self.pooler(sequence_output)
- return sequence_output, pooled_output
-
-
-class BertForSequenceClassification(BertPretrainedModel):
- """
- Model for sentence (pair) classification task with BERT.
- Args:
- bert (BertModel): An instance of BertModel.
- num_classes (int, optional): The number of classes. Default 2
- dropout (float, optional): The dropout probability for output of BERT.
- If None, use the same value as `hidden_dropout_prob` of `BertModel`
- instance `bert`. Default None
- """
-
- def __init__(self, bert, num_classes=2, dropout=None):
- super(BertForSequenceClassification, self).__init__()
- self.num_classes = num_classes
- self.bert = bert # allow bert to be config
- self.dropout = nn.Dropout(dropout if dropout is not None else self.bert.config["hidden_dropout_prob"])
- self.classifier = nn.Linear(self.bert.config["hidden_size"], num_classes)
- self.apply(self.init_weights)
-
- def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
- _, pooled_output = self.bert(
- input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask)
-
- pooled_output = self.dropout(pooled_output)
- logits = self.classifier(pooled_output)
- return logits
diff --git a/paddlehub/module/modeling_ernie.py b/paddlehub/module/modeling_ernie.py
deleted file mode 100644
index ef43785b..00000000
--- a/paddlehub/module/modeling_ernie.py
+++ /dev/null
@@ -1,243 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# FIXME(zhangxuefei): remove this file after paddlenlp is released.
-
-import paddle
-import paddle.nn as nn
-
-from paddlehub.module.nlp_module import PretrainedModel, register_base_model
-
-
-class ErnieEmbeddings(nn.Layer):
- """
- Include embeddings from word, position and token_type embeddings
- """
-
- def __init__(self,
- vocab_size,
- hidden_size=768,
- hidden_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=2,
- pad_token_id=0):
- super(ErnieEmbeddings, self).__init__()
- self.word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=pad_token_id)
- self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
- self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
- self.layer_norm = nn.LayerNorm(hidden_size)
- self.dropout = nn.Dropout(hidden_dropout_prob)
-
- def forward(self, input_ids, token_type_ids=None, position_ids=None):
- if position_ids is None:
- # maybe need use shape op to unify static graph and dynamic graph
- seq_length = input_ids.shape[1]
- position_ids = paddle.arange(0, seq_length, dtype="int64")
- if token_type_ids is None:
- token_type_ids = paddle.zeros_like(input_ids, dtype="int64")
-
- input_embedings = self.word_embeddings(input_ids)
- position_embeddings = self.position_embeddings(position_ids)
- token_type_embeddings = self.token_type_embeddings(token_type_ids)
-
- embeddings = input_embedings + position_embeddings + token_type_embeddings
- embeddings = self.layer_norm(embeddings)
- embeddings = self.dropout(embeddings)
- return embeddings
-
-
-class ErniePooler(nn.Layer):
- """
- """
-
- def __init__(self, hidden_size):
- super(ErniePooler, self).__init__()
- self.dense = nn.Linear(hidden_size, hidden_size)
- self.activation = nn.Tanh()
-
- def forward(self, hidden_states):
- # We "pool" the model by simply taking the hidden state corresponding
- # to the first token.
- first_token_tensor = hidden_states[:, 0]
- pooled_output = self.dense(first_token_tensor)
- pooled_output = self.activation(pooled_output)
- return pooled_output
-
-
-class ErniePretrainedModel(PretrainedModel):
- """
- An abstract class for pretrained ERNIE models. It provides ERNIE related
- `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`,
- `pretrained_init_configuration`, `base_model_prefix` for downloading and
- loading pretrained models. See `PretrainedModel` for more details.
- """
-
- model_config_file = "model_config.json"
- pretrained_init_configuration = {
- "ernie": {
- "attention_probs_dropout_prob": 0.1,
- "hidden_act": "relu",
- "hidden_dropout_prob": 0.1,
- "hidden_size": 768,
- "initializer_range": 0.02,
- "max_position_embeddings": 513,
- "num_attention_heads": 12,
- "num_hidden_layers": 12,
- "type_vocab_size": 2,
- "vocab_size": 18000,
- "pad_token_id": 0,
- },
- "ernie_tiny": {
- "attention_probs_dropout_prob": 0.1,
- "hidden_act": "relu",
- "hidden_dropout_prob": 0.1,
- "hidden_size": 1024,
- "initializer_range": 0.02,
- "intermediate_size": 4096,
- "max_position_embeddings": 600,
- "num_attention_heads": 16,
- "num_hidden_layers": 3,
- "type_vocab_size": 2,
- "vocab_size": 50006,
- "pad_token_id": 0,
- },
- "ernie_v2_eng_base": {
- "attention_probs_dropout_prob": 0.1,
- "hidden_act": "gelu",
- "hidden_dropout_prob": 0.1,
- "hidden_size": 768,
- "initializer_range": 0.02,
- "max_position_embeddings": 512,
- "num_attention_heads": 12,
- "num_hidden_layers": 12,
- "type_vocab_size": 4,
- "vocab_size": 30522,
- "pad_token_id": 0,
- },
- "ernie_v2_eng_large": {
- "attention_probs_dropout_prob": 0.1,
- "intermediate_size": 4096,
- "hidden_act": "gelu",
- "hidden_dropout_prob": 0.1,
- "hidden_size": 1024,
- "initializer_range": 0.02,
- "max_position_embeddings": 512,
- "num_attention_heads": 16,
- "num_hidden_layers": 24,
- "type_vocab_size": 4,
- "vocab_size": 30522,
- "pad_token_id": 0,
- },
- }
- resource_files_names = {"model_state": "model_state.pdparams"}
- pretrained_resource_files_map = {
- "model_state": {
- "ernie":
- "https://paddlenlp.bj.bcebos.com/models/transformers/ernie/ernie_v1_chn_base.pdparams",
- "ernie_tiny":
- "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/ernie_tiny.pdparams",
- "ernie_v2_eng_base":
- "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_base/ernie_v2_eng_base.pdparams",
- "ernie_v2_eng_large":
- "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_large/ernie_v2_eng_large.pdparams",
- }
- }
- base_model_prefix = "ernie"
-
- def init_weights(self, layer):
- """ Initialization hook """
- if isinstance(layer, (nn.Linear, nn.Embedding)):
- # only support dygraph, use truncated_normal and make it inplace
- # and configurable later
- layer.weight.set_value(
- paddle.tensor.normal(
- mean=0.0,
- std=self.initializer_range
- if hasattr(self, "initializer_range") else self.ernie.config["initializer_range"],
- shape=layer.weight.shape))
-
-
-@register_base_model
-class ErnieModel(ErniePretrainedModel):
- """
- """
-
- def __init__(self,
- vocab_size,
- hidden_size=768,
- num_hidden_layers=12,
- num_attention_heads=12,
- intermediate_size=3072,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=16,
- initializer_range=0.02,
- pad_token_id=0):
- super(ErnieModel, self).__init__()
- self.pad_token_id = pad_token_id
- self.initializer_range = initializer_range
- self.embeddings = ErnieEmbeddings(vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings,
- type_vocab_size, pad_token_id)
- encoder_layer = nn.TransformerEncoderLayer(
- hidden_size,
- num_attention_heads,
- intermediate_size,
- dropout=hidden_dropout_prob,
- activation=hidden_act,
- attn_dropout=attention_probs_dropout_prob,
- act_dropout=0)
- self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
- self.pooler = ErniePooler(hidden_size)
- self.apply(self.init_weights)
-
- def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
- if attention_mask is None:
- attention_mask = paddle.unsqueeze(
- (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e9, axis=[1, 2])
- embedding_output = self.embeddings(
- input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
- encoder_outputs = self.encoder(embedding_output, attention_mask)
- sequence_output = encoder_outputs
- pooled_output = self.pooler(sequence_output)
- return sequence_output, pooled_output
-
-
-class ErnieForSequenceClassification(ErniePretrainedModel):
- """
- Model for sentence (pair) classification task with ERNIE.
- Args:
- ernie (ErnieModel): An instance of `ErnieModel`.
- num_classes (int, optional): The number of classes. Default 2
- dropout (float, optional): The dropout probability for output of ERNIE.
- If None, use the same value as `hidden_dropout_prob` of `ErnieModel`
- instance `Ernie`. Default None
- """
-
- def __init__(self, ernie, num_classes=2, dropout=None):
- super(ErnieForSequenceClassification, self).__init__()
- self.num_classes = num_classes
- self.ernie = ernie # allow ernie to be config
- self.dropout = nn.Dropout(dropout if dropout is not None else self.ernie.config["hidden_dropout_prob"])
- self.classifier = nn.Linear(self.ernie.config["hidden_size"], num_classes)
- self.apply(self.init_weights)
-
- def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
- _, pooled_output = self.ernie(
- input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask)
-
- pooled_output = self.dropout(pooled_output)
- logits = self.classifier(pooled_output)
- return logits
diff --git a/paddlehub/module/modeling_roberta.py b/paddlehub/module/modeling_roberta.py
deleted file mode 100644
index 62d75539..00000000
--- a/paddlehub/module/modeling_roberta.py
+++ /dev/null
@@ -1,215 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# FIXME(zhangxuefei): remove this file after paddlenlp is released.
-
-import paddle
-import paddle.nn as nn
-
-from paddlehub.module.nlp_module import PretrainedModel, register_base_model
-
-
-class RobertaEmbeddings(nn.Layer):
- """
- Include embeddings from word, position and token_type embeddings
- """
-
- def __init__(self,
- vocab_size,
- hidden_size=768,
- hidden_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=16,
- pad_token_id=0):
- super(RobertaEmbeddings, self).__init__()
- self.word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=pad_token_id)
- self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
- self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
- self.layer_norm = nn.LayerNorm(hidden_size)
- self.dropout = nn.Dropout(hidden_dropout_prob)
-
- def forward(self, input_ids, token_type_ids=None, position_ids=None):
- if position_ids is None:
- # maybe need use shape op to unify static graph and dynamic graph
- seq_length = input_ids.shape[1]
- position_ids = paddle.arange(0, seq_length, dtype="int64")
- if token_type_ids is None:
- token_type_ids = paddle.zeros_like(input_ids, dtype="int64")
-
- input_embedings = self.word_embeddings(input_ids)
- position_embeddings = self.position_embeddings(position_ids)
- token_type_embeddings = self.token_type_embeddings(token_type_ids)
-
- embeddings = input_embedings + position_embeddings + token_type_embeddings
- embeddings = self.layer_norm(embeddings)
- embeddings = self.dropout(embeddings)
- return embeddings
-
-
-class RobertaPooler(nn.Layer):
- """
- """
-
- def __init__(self, hidden_size):
- super(RobertaPooler, self).__init__()
- self.dense = nn.Linear(hidden_size, hidden_size)
- self.activation = nn.Tanh()
-
- def forward(self, hidden_states):
- # We "pool" the model by simply taking the hidden state corresponding
- # to the first token.
- first_token_tensor = hidden_states[:, 0]
- pooled_output = self.dense(first_token_tensor)
- pooled_output = self.activation(pooled_output)
- return pooled_output
-
-
-class RobertaPretrainedModel(PretrainedModel):
- """
- An abstract class for pretrained RoBERTa models. It provides RoBERTa related
- `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`,
- `pretrained_init_configuration`, `base_model_prefix` for downloading and
- loading pretrained models. See `PretrainedModel` for more details.
- """
-
- model_config_file = "model_config.json"
- pretrained_init_configuration = {
- "roberta-wwm-ext": {
- "attention_probs_dropout_prob": 0.1,
- "hidden_act": "gelu",
- "hidden_dropout_prob": 0.1,
- "hidden_size": 768,
- "initializer_range": 0.02,
- "intermediate_size": 3072,
- "max_position_embeddings": 512,
- "num_attention_heads": 12,
- "num_hidden_layers": 12,
- "type_vocab_size": 2,
- "vocab_size": 21128,
- "pad_token_id": 0
- },
- "roberta-wwm-ext-large": {
- "attention_probs_dropout_prob": 0.1,
- "hidden_act": "gelu",
- "hidden_dropout_prob": 0.1,
- "hidden_size": 1024,
- "initializer_range": 0.02,
- "intermediate_size": 4096,
- "max_position_embeddings": 512,
- "num_attention_heads": 16,
- "num_hidden_layers": 24,
- "type_vocab_size": 2,
- "vocab_size": 21128,
- "pad_token_id": 0
- }
- }
- resource_files_names = {"model_state": "model_state.pdparams"}
- pretrained_resource_files_map = {
- "model_state": {
- "roberta-wwm-ext":
- "https://paddlenlp.bj.bcebos.com/models/transformers/roberta_base/roberta_chn_base.pdparams",
- "roberta-wwm-ext-large":
- "https://paddlenlp.bj.bcebos.com/models/transformers/roberta_large/roberta_chn_large.pdparams",
- }
- }
- base_model_prefix = "roberta"
-
- def init_weights(self, layer):
- """ Initialization hook """
- if isinstance(layer, (nn.Linear, nn.Embedding)):
- # only support dygraph, use truncated_normal and make it inplace
- # and configurable later
- layer.weight.set_value(
- paddle.tensor.normal(
- mean=0.0,
- std=self.initializer_range
- if hasattr(self, "initializer_range") else self.roberta.config["initializer_range"],
- shape=layer.weight.shape))
- elif isinstance(layer, nn.LayerNorm):
- layer._epsilon = 1e-12
-
-
-@register_base_model
-class RobertaModel(RobertaPretrainedModel):
- """
- """
-
- def __init__(self,
- vocab_size,
- hidden_size=768,
- num_hidden_layers=12,
- num_attention_heads=12,
- intermediate_size=3072,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=16,
- initializer_range=0.02,
- pad_token_id=0):
- super(RobertaModel, self).__init__()
- self.pad_token_id = pad_token_id
- self.initializer_range = initializer_range
- self.embeddings = RobertaEmbeddings(vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings,
- type_vocab_size, pad_token_id)
- encoder_layer = nn.TransformerEncoderLayer(
- hidden_size,
- num_attention_heads,
- intermediate_size,
- dropout=hidden_dropout_prob,
- activation=hidden_act,
- attn_dropout=attention_probs_dropout_prob,
- act_dropout=0)
- self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
- self.pooler = RobertaPooler(hidden_size)
- self.apply(self.init_weights)
-
- def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
- if attention_mask is None:
- attention_mask = paddle.unsqueeze(
- (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e9, axis=[1, 2])
- embedding_output = self.embeddings(
- input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
- encoder_outputs = self.encoder(embedding_output, attention_mask)
- sequence_output = encoder_outputs
- pooled_output = self.pooler(sequence_output)
- return sequence_output, pooled_output
-
-
-class RobertaForSequenceClassification(RobertaPretrainedModel):
- """
- Model for sentence (pair) classification task with RoBERTa.
- Args:
- roberta (RobertaModel): An instance of `RobertaModel`.
- num_classes (int, optional): The number of classes. Default 2
- dropout (float, optional): The dropout probability for output of RoBERTa.
- If None, use the same value as `hidden_dropout_prob` of `RobertaModel`
- instance `Roberta`. Default None
- """
-
- def __init__(self, roberta, num_classes=2, dropout=None):
- super(RobertaForSequenceClassification, self).__init__()
- self.num_classes = num_classes
- self.roberta = roberta # allow roberta to be config
- self.dropout = nn.Dropout(dropout if dropout is not None else self.roberta.config["hidden_dropout_prob"])
- self.classifier = nn.Linear(self.roberta.config["hidden_size"], num_classes)
- self.apply(self.init_weights)
-
- def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
- _, pooled_output = self.roberta(
- input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask)
-
- pooled_output = self.dropout(pooled_output)
- logits = self.classifier(pooled_output)
- return logits
diff --git a/paddlehub/module/nlp_module.py b/paddlehub/module/nlp_module.py
index 37c86f3a..ddfd546c 100644
--- a/paddlehub/module/nlp_module.py
+++ b/paddlehub/module/nlp_module.py
@@ -453,8 +453,11 @@ class TransformerModule(RunModule, TextServing):
Returns:
results(:obj: Dict) : The model outputs, such as loss and metrics.
"""
- predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2])
- return {'loss': avg_loss, 'metrics': {'acc': acc}}
+ if self.task == 'seq-cls':
+ predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2])
+ elif self.task == 'token-cls':
+ predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], seq_lengths=batch[2], labels=batch[3])
+ return {'loss': avg_loss, 'metrics': metric}
def validation_step(self, batch: List[paddle.Tensor], batch_idx: int):
"""
@@ -466,8 +469,11 @@ class TransformerModule(RunModule, TextServing):
Returns:
results(:obj: Dict) : The model outputs, such as metrics.
"""
- predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2])
- return {'metrics': {'acc': acc}}
+ if self.task == 'seq-cls':
+ predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2])
+ elif self.task == 'token-cls':
+ predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], seq_lengths=batch[2], labels=batch[3])
+ return {'metrics': metric}
def get_embedding(self, data: List[List[str]], max_seq_len=128, batch_size=1, use_gpu=False):
"""
diff --git a/requirements.txt b/requirements.txt
index fe998426..8102079e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,4 +16,4 @@ tqdm
visualdl >= 2.0.0
# gunicorn not support windows
gunicorn >= 19.10.0; sys_platform != "win32"
-paddlenlp >= 2.0.0b
\ No newline at end of file
+paddlenlp >= 2.0.0b2
\ No newline at end of file
--
GitLab