Unverified commit 8c9d8f56 authored by Steffy-zxf, committed by GitHub

update codes for paddlenlp text cls example

Parent 7fae3401
...@@ -20,7 +20,7 @@
| bert-base-chinese | 0.93833 | 0.94750 |
| bert-wwm-chinese | 0.94583 | 0.94917 |
| bert-wwm-ext-chinese | 0.94667 | 0.95500 |
-| ernie | 0.94667 | 0.95333 |
+| ernie-1.0 | 0.94667 | 0.95333 |
| ernie-tiny | 0.93917 | 0.94833 |
| roberta-wwm-ext | 0.94750 | 0.95250 |
| roberta-wwm-ext-large | 0.95250 | 0.95333 |
...@@ -68,13 +68,16 @@ python train.py --model_type ernie --model_name ernie-tiny --n_gpu 1 --save_dir
Configurable parameters:
* `model_type`: required; the model type, one of bert, ernie, or roberta.
-* `model_name`: required; the short name of the specific model. For `model_type=ernie`, model_name can be `ernie-1.0` or `ernie-tiny`; for `model_type=bert`, model_name can be `bert-base-chinese`;
-for `model_type=roberta`, model_name can be `roberta-wwm-ext-large` or `roberta-wwm-ext`.
+* `model_name`: required; the short name of the specific model.
+For `model_type=ernie`, model_name can be `ernie-1.0` or `ernie-tiny`.
+For `model_type=bert`, model_name can be `bert-base-chinese`, `bert-wwm-chinese`, or `bert-wwm-ext-chinese`.
+For `model_type=roberta`, model_name can be `roberta-wwm-ext-large`, `roberta-wwm-ext`, `rbt3`, or `rbtl3`.
* `save_dir`: required; the directory where trained models are saved.
* `max_seq_length`: optional; the maximum sequence length used by the ERNIE/BERT model, at most 512. Lower this value if you run out of GPU memory; defaults to 128.
* `batch_size`: optional; the batch size. Adjust it to your GPU memory, and lower it if you run out of memory; defaults to 32.
* `learning_rate`: optional; the peak learning rate for fine-tuning; defaults to 5e-5.
* `weight_decay`: optional; the strength of the regularization term, used to prevent overfitting; defaults to 0.00.
+* `epochs`: the number of training epochs; defaults to 3.
* `warmup_proption`: optional; the proportion of learning-rate warmup. With 0.1, the learning rate grows from 0 to learning_rate over the first 10% of training steps and then slowly decays (see the sketch after this list); defaults to 0.1.
* `init_from_ckpt`: optional; a path to model parameters for warm-starting training; defaults to None.
* `seed`: optional; the random seed; defaults to 1000.
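The warmup-then-decay schedule described for `warmup_proption` can be written down explicitly. A minimal sketch, assuming the `LinearDecayWithWarmup` scheduler available in recent PaddleNLP releases; the step counts below are illustrative, not taken from this example:

```python
# Sketch of the warmup schedule described above, assuming
# paddlenlp.transformers.LinearDecayWithWarmup is available;
# the step counts are illustrative.
from paddlenlp.transformers import LinearDecayWithWarmup

learning_rate = 5e-5            # default peak learning rate
warmup_proportion = 0.1         # default warmup proportion
num_training_steps = 1200 * 3   # e.g. 1200 batches per epoch, 3 epochs

lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps,
                                     warmup_proportion)
# The LR rises linearly from 0 to 5e-5 over the first 10% of steps,
# then decays linearly back towards 0.
```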
......
...@@ -235,8 +235,8 @@ def do_train(args):
    train_dataset, dev_dataset, test_dataset = ppnlp.datasets.ChnSentiCorp.get_datasets(
        ['train', 'dev', 'test'])
-    if args.model_name == 'ernie_tiny':
-        # ErnieTinyTokenizer is special for ernie_tiny pretrained model.
+    if args.model_name == 'ernie-tiny':
+        # ErnieTinyTokenizer is special for ernie-tiny pretrained model.
        tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained(
            args.model_name)
    else:
......
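For context, a hedged sketch of the tokenizer selection this hunk touches; the `else` branch is an assumption about the elided code, not a quote of it:

```python
# Hedged sketch of the tokenizer selection; the else branch is an
# assumption about the elided code, not a quote of it.
import paddlenlp as ppnlp

model_name = 'ernie-tiny'   # illustrative; normally comes from --model_name
if model_name == 'ernie-tiny':
    # ernie-tiny ships a sentencepiece-based tokenizer, so it cannot be
    # loaded through the generic ERNIE tokenizer.
    tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained(model_name)
else:
    # e.g. for ernie-1.0 and other ERNIE checkpoints
    tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained(model_name)
```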
...@@ -63,8 +63,7 @@
The main code structure of this project and what each file does:
```text
-.
-├── data.py    # data loading
+rnn/
├── predict.py # model prediction
├── utils.py   # data processing utilities
├── train.py   # main entry point for model training, including training and evaluation
...@@ -81,10 +80,6 @@ from paddlenlp.datasets import ChnSentiCorp
train_ds, dev_ds, test_ds = ChnSentiCorp.get_datasets(['train', 'dev', 'test'])
```
-#### Custom dataset
-The training data consists of two columns: "text_a\tlabel"
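The removed note above documented the custom dataset format: one example per line, with the text and the label separated by a tab. A minimal reader sketch for that format ('train.tsv' is a made-up file name):

```python
# Minimal reader for the "text_a\tlabel" format described above;
# 'train.tsv' is a made-up file name used for illustration.
def read_custom_dataset(path):
    examples = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            text_a, label = line.rstrip('\n').split('\t')
            examples.append((text_a, label))
    return examples

# examples = read_custom_dataset('train.tsv')
```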
### Model training
Before training the model, first download the vocabulary file word_dict.txt, which is used to build the word-to-id mapping.
...@@ -112,9 +107,9 @@ GPU launch:
* `vocab_path`: the path to the vocabulary file.
* `use_gpu`: whether to train on GPU; defaults to `False`.
* `network`: the model network name; defaults to `bilstm_attn` and can be switched to bilstm, bigru, birnn, bow, lstm, rnn, gru, bilstm_attn, textcnn, etc.
-* `lr`: the learning rate; defaults to 5e-4.
+* `lr`: the learning rate; defaults to 5e-5.
* `batch_size`: the size of one batch; defaults to 64.
-* `epochs`: the number of training epochs; defaults to 5.
+* `epochs`: the number of training epochs; defaults to 10.
* `save_dir`: the directory where trained models are saved.
* `init_from_ckpt`: the checkpoint path from which to resume training.
......
...@@ -14,7 +14,9 @@
from functools import partial
import argparse
import os
+import random
+import numpy as np
import paddle
import paddlenlp as ppnlp
from paddlenlp.data import Stack, Tuple, Pad
...@@ -24,18 +26,25 @@ from utils import load_vocab, convert_example
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
-parser.add_argument("--epochs", type=int, default=3, help="Number of epochs for training.")
+parser.add_argument("--epochs", type=int, default=10, help="Number of epochs for training.")
parser.add_argument('--use_gpu', type=eval, default=False, help="Whether to use GPU for training; input should be True or False")
-parser.add_argument("--lr", type=float, default=5e-4, help="Learning rate used to train.")
+parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate used to train.")
parser.add_argument("--save_dir", type=str, default='chekpoints/', help="Directory to save model checkpoint")
parser.add_argument("--batch_size", type=int, default=64, help="Total number of examples in one training batch.")
-parser.add_argument("--vocab_path", type=str, default="./word_dict.txt", help="The path to the vocabulary file.")
+parser.add_argument("--vocab_path", type=str, default="./senta_word_dict.txt", help="The path to the vocabulary file.")
parser.add_argument('--network', type=str, default="bilstm_attn", help="Which network would you like to choose: bow, lstm, bilstm, gru, bigru, rnn, birnn, bilstm_attn or textcnn?")
parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.")
args = parser.parse_args()
# yapf: enable
+def set_seed(seed=1000):
+    """Sets the random seed to make training reproducible."""
+    random.seed(seed)
+    np.random.seed(seed)
+    paddle.seed(seed)
def create_dataloader(dataset,
                      trans_fn=None,
                      mode='train',
...@@ -79,6 +88,7 @@ def create_dataloader(dataset,
if __name__ == "__main__":
+    set_seed()
    paddle.set_device('gpu') if args.use_gpu else paddle.set_device('cpu')
    # Loads vocab.
...@@ -103,7 +113,7 @@ if __name__ == "__main__":
    trans_fn = partial(
        convert_example,
        vocab=vocab,
-        unk_token_id=vocab['[UNK]'],
+        unk_token_id=vocab.get('[UNK]', 1),
        is_test=False)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab['[PAD]']),  # input_ids
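For reference, a small self-contained sketch of how `Tuple`, `Pad`, and `Stack` compose in a batchify function like the one above; the sample ids are made up:

```python
# How Tuple/Pad/Stack compose; the sample ids below are made up.
from paddlenlp.data import Stack, Tuple, Pad

batchify_fn = Tuple(
    Pad(axis=0, pad_val=0),   # pad input_ids to the longest sequence in the batch
    Stack(dtype='int64'),     # stack the valid lengths
    Stack(dtype='int64'))     # stack the labels

input_ids, lengths, labels = batchify_fn(
    [([2, 5, 7], 3, 1), ([4, 9], 2, 0)])
# input_ids -> [[2, 5, 7], [4, 9, 0]], lengths -> [3, 2], labels -> [1, 0]
```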
...@@ -116,6 +126,7 @@ if __name__ == "__main__":
        batch_size=args.batch_size,
        mode='train',
        use_gpu=args.use_gpu,
+        pad_token_id=vocab.get('[PAD]', 0),
        batchify_fn=batchify_fn)
    dev_loader = create_dataloader(
        dev_ds,
...@@ -123,6 +134,7 @@ if __name__ == "__main__":
        batch_size=args.batch_size,
        mode='validation',
        use_gpu=args.use_gpu,
+        pad_token_id=vocab.get('[PAD]', 0),
        batchify_fn=batchify_fn)
    test_loader = create_dataloader(
        test_ds,
...@@ -130,6 +142,7 @@ if __name__ == "__main__":
        batch_size=args.batch_size,
        mode='test',
        use_gpu=args.use_gpu,
+        pad_token_id=vocab.get('[PAD]', 0),
        batchify_fn=batchify_fn)
    optimizer = paddle.optimizer.Adam(
......
...@@ -52,36 +52,6 @@ def convert_tokens_to_ids(tokens, vocab):
    return ids
-def convert_example(example, vocab, unk_token_id=1, is_test=False):
-    """
-    Builds model inputs from a sequence for sequence classification tasks.
-    It uses `jieba.cut` to tokenize the text.
-    Args:
-        example(obj:`list[str]`): List of input data, containing the text and the label if it has one.
-        vocab(obj:`dict`): The vocabulary.
-        unk_token_id(obj:`int`, defaults to 1): The unknown token id.
-        is_test(obj:`bool`, defaults to `False`): Whether the example contains a label or not.
-    Returns:
-        input_ids(obj:`list[int]`): The list of token ids.
-        valid_length(obj:`int`): The valid length of the input sequence.
-        label(obj:`numpy.array`, data type of int64, optional): The input label if not is_test.
-    """
-    input_ids = []
-    for token in jieba.cut(example[0]):
-        token_id = vocab.get(token, unk_token_id)
-        input_ids.append(token_id)
-    valid_length = len(input_ids)
-    if not is_test:
-        label = np.array(example[-1], dtype="int64")
-        return input_ids, valid_length, label
-    else:
-        return input_ids, valid_length
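For reference, a hedged usage sketch of the helper deleted above (the same logic lives on in utils.py further down); the toy vocabulary is made up:

```python
# Toy usage of convert_example as defined above; the vocab is made up,
# and unknown tokens produced by jieba map to id 1 ([UNK]).
vocab = {'[PAD]': 0, '[UNK]': 1, '电影': 2, '好看': 3}
example = ['这部电影好看', '1']   # text, label

input_ids, valid_length, label = convert_example(example, vocab,
                                                 unk_token_id=vocab['[UNK]'])
# e.g. input_ids -> [1, 2, 3]: '这部' is out of vocabulary, so it becomes 1
```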
def pad_texts_to_max_seq_len(texts, max_seq_len, pad_token_id=0):
    """
    Pads each text to the max sequence length if it is shorter than that length.
...@@ -148,8 +118,7 @@ def convert_example(example, vocab, unk_token_id=1, is_test=False):
    for token in jieba.cut(example[0]):
        token_id = vocab.get(token, unk_token_id)
        input_ids.append(token_id)
-    valid_length = len(input_ids)
-    valid_length = np.array(valid_length, dtype="int64")
+    valid_length = np.array(len(input_ids), dtype='int64')
    if not is_test:
        label = np.array(example[-1], dtype="int64")
......
...@@ -78,13 +78,6 @@ PaddleNLP provides a rich set of pretrained models and convenient access to PaddlePa
```text
sentence_transformers/
-├── checkpoint
-│   ├── model_100
-│   │   ├── model_state.pdparams
-│   │   ├── tokenizer_config.json
-│   │   └── vocab.txt
-│   ├── ...
├── model.py   # Sentence Transformer network definition
├── README.md  # documentation
└── train.py   # model training and evaluation
...@@ -102,13 +95,16 @@ python train.py --model_type ernie --model_name ernie-1.0 --n_gpu 1 --save_dir .
Configurable parameters:
* `model_type`: required; the model type, one of bert, ernie, or roberta.
-* `model_name`: required; the short name of the specific model. For `model_type=ernie`, model_name can be `ernie-1.0` or `ernie-tiny`; for `model_type=bert`, model_name can be `bert-base-chinese`;
-for `model_type=roberta`, model_name can be `roberta-wwm-ext-large` or `roberta-wwm-ext`.
+* `model_name`: required; the short name of the specific model.
+For `model_type=ernie`, model_name can be `ernie-1.0` or `ernie-tiny`.
+For `model_type=bert`, model_name can be `bert-base-chinese`, `bert-wwm-chinese`, or `bert-wwm-ext-chinese`.
+For `model_type=roberta`, model_name can be `roberta-wwm-ext-large`, `roberta-wwm-ext`, `rbt3`, or `rbtl3`.
* `save_dir`: required; the directory where trained models are saved.
* `max_seq_length`: optional; the maximum sequence length used by the ERNIE/BERT model, at most 512. Lower this value if you run out of GPU memory; defaults to 128.
* `batch_size`: optional; the batch size. Adjust it to your GPU memory, and lower it if you run out of memory; defaults to 32.
* `learning_rate`: optional; the peak learning rate for fine-tuning; defaults to 5e-5.
* `weight_decay`: optional; the strength of the regularization term, used to prevent overfitting; defaults to 0.00.
+* `epochs`: the number of training epochs; defaults to 3.
* `warmup_proption`: optional; the proportion of learning-rate warmup. With 0.1, the learning rate grows from 0 to learning_rate over the first 10% of training steps and then slowly decays; defaults to 0.1.
* `init_from_ckpt`: optional; a path to model parameters for warm-starting training; defaults to None.
* `seed`: optional; the random seed; defaults to 1000.
......
...@@ -13,12 +13,11 @@ The SimNet framework is widely used across Baidu products and mainly includes BOW, CNN, RNN, MM
| Model | Description |
| ------------------------------------------------ | ------------------------------------------------------------ |
| BOW (Bag Of Words) | Non-sequential model; represents a sentence as the sum of the vectors of the words it contains |
-| RNN (Recurrent Neural Network) | Sequential model; processes sequential information effectively |
+| CNN | Sequential model; uses convolution to extract features from local regions |
| GRU (Gated Recurrent Unit) | Sequential model; handles long-distance dependencies in text well |
| LSTM (Long Short Term Memory) | Sequential model; handles long-distance dependencies in text well |
-## TBD: add model results
| Model | dev acc | test acc |
| ---- | ------- | -------- |
| BoW | 0.7290 | 0.75232 |
...@@ -53,8 +52,7 @@ The SimNet framework is widely used across Baidu products and mainly includes BOW, CNN, RNN, MM
The main code structure of this project and what each file does:
```text
-.
-├── data.py    # data loading
+simnet/
├── predict.py # model prediction
├── utils.py   # data processing utilities
├── train.py   # main entry point for model training, including training and evaluation
......
...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-__version__ = '2.0.0a5'
+__version__ = '2.0.0a6'
from . import data
from . import datasets
......
...@@ -63,7 +63,7 @@ class Senta(nn.Layer):
                vocab_size,
                num_classes,
                emb_dim,
-               direction='bidrectional',
+               direction='bidirectional',
                padding_idx=pad_token_id)
        elif network == 'cnn':
            self.model = CNNModel(
......
...@@ -221,7 +221,7 @@ class GRUEncoder(nn.Layer):
            E.g., setting num_layers=2 would mean stacking two GRUs together to form a stacked GRU,
            with the second GRU taking in outputs of the first GRU and computing the final results.
        direction (obj:`str`, optional, defaults to obj:`forward`): The direction of the network.
-            It can be "forward", "backward" and "bidirectional".
+            It can be "forward" and "bidirectional".
            When "bidirectional", the way to merge outputs of forward and backward is concatenating.
        dropout (obj:`float`, optional, defaults to 0.0): If non-zero, introduces a Dropout layer
            on the outputs of each GRU layer except the last layer, with dropout probability equal to dropout.
...@@ -344,7 +344,7 @@ class LSTMEncoder(nn.Layer):
            E.g., setting num_layers=2 would mean stacking two LSTMs together to form a stacked LSTM,
            with the second LSTM taking in outputs of the first LSTM and computing the final results.
        direction (obj:`str`, optional, defaults to obj:`forward`): The direction of the network.
-            It can be "forward", "backward" and "bidirectional".
+            It can be "forward" and "bidirectional".
            When "bidirectional", the way to merge outputs of forward and backward is concatenating.
        dropout (obj:`float`, optional, defaults to 0.0): If non-zero, introduces a Dropout layer
            on the outputs of each LSTM layer except the last layer, with dropout probability equal to dropout.
...@@ -468,7 +468,7 @@ class RNNEncoder(nn.Layer):
            E.g., setting num_layers=2 would mean stacking two RNNs together to form a stacked RNN,
            with the second RNN taking in outputs of the first RNN and computing the final results.
        direction (obj:`str`, optional, defaults to obj:`forward`): The direction of the network.
-            It can be "forward", "backward" and "bidirectional".
+            It can be "forward" and "bidirectional".
            When "bidirectional", the way to merge outputs of forward and backward is concatenating.
        dropout (obj:`float`, optional, defaults to 0.0): If non-zero, introduces a Dropout layer
            on the outputs of each RNN layer except the last layer, with dropout probability equal to dropout.
......
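A hedged usage sketch of the direction contract these docstrings now describe, assuming the `paddlenlp.seq2vec` module layout and encoder signatures shown here:

```python
# Hedged sketch: after this change, direction is "forward" or
# "bidirectional"; the paddlenlp.seq2vec API shown here is assumed.
import paddle
from paddlenlp.seq2vec import GRUEncoder

encoder = GRUEncoder(input_size=128, hidden_size=64, direction='bidirectional')
inputs = paddle.randn([2, 10, 128])                  # batch, seq_len, emb_dim
seq_lens = paddle.to_tensor([10, 7], dtype='int64')
output = encoder(inputs, sequence_length=seq_lens)
print(output.shape)   # [2, 128]: forward and backward final states concatenated
```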
...@@ -156,9 +156,9 @@ class ErniePretrainedModel(PretrainedModel):
            "ernie-tiny":
            "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/ernie_tiny.pdparams",
            "ernie-2.0-en":
-            "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_base/ernie-2.0-en.pdparams",
+            "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_base/ernie_v2_eng_base.pdparams",
            "ernie-2.0-large-en":
-            "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_large/ernie-2.0-large-en.pdparams",
+            "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_large/ernie_v2_eng_large.pdparams",
        }
    }
    base_model_prefix = "ernie"
......