From 344d03bf0b933d2d656f8e9333d003f9b0e3c2e0 Mon Sep 17 00:00:00 2001
From: kinghuin
Date: Sat, 12 Dec 2020 16:56:11 +0800
Subject: [PATCH] fix lac typo and image url (#5028)

---
 PaddleNLP/examples/lexical_analysis/README.md | 19 ++++++++++---------
 PaddleNLP/examples/lexical_analysis/eval.py   |  4 ++--
 .../examples/lexical_analysis/predict.py      |  4 ++--
 PaddleNLP/examples/lexical_analysis/train.py  |  6 +++---
 4 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/PaddleNLP/examples/lexical_analysis/README.md b/PaddleNLP/examples/lexical_analysis/README.md
index 15682f1b..debbafc5 100644
--- a/PaddleNLP/examples/lexical_analysis/README.md
+++ b/PaddleNLP/examples/lexical_analysis/README.md
@@ -4,7 +4,7 @@
 The input to the lexical analysis task is a string (referred to below as a "sentence"), and the output is the word boundaries together with the part-of-speech and entity categories of the words in the sentence. Sequence labeling is the classic way to model lexical analysis: we use a GRU-based network to learn features and feed the learned features into a CRF decoding layer to complete the sequence labeling. The model structure is shown below:
-![GRU-CRF-MODEL](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/lexical_analysis/gru-crf-model.png)
+![GRU-CRF-MODEL](https://paddlenlp.bj.bcebos.com/imgs/gru-crf-model.png)
 
 1. The input is represented in one-hot form, with each character encoded as an id
 2. The one-hot sequence is mapped through the character vocabulary into a sequence of real-valued character embeddings;
@@ -18,7 +18,9 @@
 - Python >= 3.6
 
-- PaddlePaddle >= 2.0.0, see [Quick Install](https://www.paddlepaddle.org.cn/install/quick) for installation instructions.
+- PaddlePaddle >= 2.0.0rc1, see [Quick Install](https://www.paddlepaddle.org.cn/install/quick) for installation instructions.
+
+- PaddleNLP >= 2.0.0b, installed with: `pip install "paddlenlp>=2.0.0b"`
 
 ### 2.2 Data Preparation
@@ -59,34 +61,33 @@ export CUDA_VISIBLE_DEVICES=0,1  # multi-GPU training is supported
 ```bash
 python -m paddle.distributed.launch train.py \
-    --root ./lexical_analysis_dataset_tiny \
+    --data_dir ./lexical_analysis_dataset_tiny \
     --model_save_dir ./save_dir \
     --epochs 10 \
     --batch_size 32 \
-    --use_gpu True
+    --use_gpu True \
+    # --init_checkpoint ./save_dir/final
 ```
 
-where root is the path to the folder containing the dataset.
+where data_dir is the path to the folder containing the dataset and init_checkpoint is the path from which to load a model; setting init_checkpoint starts incremental training.
 
 ### 2.4 Model Evaluation
 
 By loading the model saved during training, you can validate it on the test set. Launch it as follows:
 
 ```bash
-python eval.py --root ./lexical_analysis_dataset_tiny \
+python eval.py --data_dir ./lexical_analysis_dataset_tiny \
     --init_checkpoint ./save_dir/final \
     --batch_size 32 \
     --use_gpu True
 ```
 
-where init_checkpoint is the path from which to load the model.
-
 ### 2.5 Model Prediction
 
 Model prediction can be run on unlabeled data:
 
 ```bash
-python predict.py --root ./lexical_analysis_dataset_tiny \
+python predict.py --data_dir ./lexical_analysis_dataset_tiny \
     --init_checkpoint ./save_dir/final \
     --batch_size 32 \
     --use_gpu True
diff --git a/PaddleNLP/examples/lexical_analysis/eval.py b/PaddleNLP/examples/lexical_analysis/eval.py
index 6fc58f4d..50742de7 100644
--- a/PaddleNLP/examples/lexical_analysis/eval.py
+++ b/PaddleNLP/examples/lexical_analysis/eval.py
@@ -28,7 +28,7 @@ from model import BiGruCrf
 
 # yapf: disable
 parser = argparse.ArgumentParser(__doc__)
-parser.add_argument("--root", type=str, default=None, help="The folder where the dataset is located.")
+parser.add_argument("--data_dir", type=str, default=None, help="The folder where the dataset is located.")
 parser.add_argument("--init_checkpoint", type=str, default=None, help="Path to init model.")
 parser.add_argument("--batch_size", type=int, default=300, help="The number of sequences contained in a mini-batch.")
 parser.add_argument("--max_seq_len", type=int, default=64, help="Number of words in the longest sequence.")
@@ -44,7 +44,7 @@ def evaluate(args):
     paddle.set_device("gpu" if args.use_gpu else "cpu")
 
     # create dataset.
-    test_dataset = LacDataset(args.root, mode='test')
+    test_dataset = LacDataset(args.data_dir, mode='test')
     batchify_fn = lambda samples, fn=Tuple(
         Pad(axis=0, pad_val=0),  # word_ids
         Stack(),  # length
diff --git a/PaddleNLP/examples/lexical_analysis/predict.py b/PaddleNLP/examples/lexical_analysis/predict.py
index 0ac7ca90..accdd78b 100644
--- a/PaddleNLP/examples/lexical_analysis/predict.py
+++ b/PaddleNLP/examples/lexical_analysis/predict.py
@@ -27,7 +27,7 @@ from model import BiGruCrf
 
 # yapf: disable
 parser = argparse.ArgumentParser(__doc__)
-parser.add_argument("--root", type=str, default=None, help="The folder where the dataset is located.")
+parser.add_argument("--data_dir", type=str, default=None, help="The folder where the dataset is located.")
 parser.add_argument("--init_checkpoint", type=str, default=None, help="Path to init model.")
 parser.add_argument("--batch_size", type=int, default=300, help="The number of sequences contained in a mini-batch.")
 parser.add_argument("--max_seq_len", type=int, default=64, help="Number of words in the longest sequence.")
@@ -43,7 +43,7 @@ def infer(args):
     paddle.set_device("gpu" if args.use_gpu else "cpu")
 
     # create dataset.
-    infer_dataset = LacDataset(args.root, mode='infer')
+    infer_dataset = LacDataset(args.data_dir, mode='infer')
 
     batchify_fn = lambda samples, fn=Tuple(
         Pad(axis=0, pad_val=0),  # word_ids
diff --git a/PaddleNLP/examples/lexical_analysis/train.py b/PaddleNLP/examples/lexical_analysis/train.py
index 14a1b239..1bb08d5b 100644
--- a/PaddleNLP/examples/lexical_analysis/train.py
+++ b/PaddleNLP/examples/lexical_analysis/train.py
@@ -28,7 +28,7 @@ from paddlenlp.metrics import ChunkEvaluator
 
 # yapf: disable
 parser = argparse.ArgumentParser(__doc__)
-parser.add_argument("--root", type=str, default=None, help="The folder where the dataset is located.")
+parser.add_argument("--data_dir", type=str, default=None, help="The folder where the dataset is located.")
 parser.add_argument("--init_checkpoint", type=str, default=None, help="Path to init model.")
 parser.add_argument("--model_save_dir", type=str, default=None, help="The model will be saved in this path.")
 parser.add_argument("--epochs", type=int, default=10, help="Number of epochs, i.e. iterations over the corpus.")
@@ -51,8 +51,8 @@ def train(args):
         paddle.set_device("cpu")
 
     # create dataset.
-    train_dataset = LacDataset(args.root, mode='train')
-    test_dataset = LacDataset(args.root, mode='test')
+    train_dataset = LacDataset(args.data_dir, mode='train')
+    test_dataset = LacDataset(args.data_dir, mode='test')
 
     batchify_fn = lambda samples, fn=Tuple(
         Pad(axis=0, pad_val=0),  # word_ids
-- 
GitLab
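The README hunk above describes the model as one-hot ids mapped to character embeddings, a GRU-based feature learner, and a CRF decoding layer. Below is a minimal sketch of that architecture, not the example's actual model.py: `TinyGruCrf` and every size in it are illustrative assumptions, and it assumes the `LinearChainCrf`/`LinearChainCrfLoss`/`ViterbiDecoder` layers behave as in PaddleNLP's sequence-labeling tutorials.

```python
import paddle.nn as nn
from paddlenlp.layers.crf import LinearChainCrf, LinearChainCrfLoss, ViterbiDecoder


class TinyGruCrf(nn.Layer):
    """Illustrative GRU-CRF tagger; all hyperparameters are made up for the sketch."""

    def __init__(self, vocab_size=100, num_labels=5, emb_dim=32, hidden_size=32):
        super().__init__()
        # README steps 1-2: one-hot ids -> real-valued character embeddings.
        self.embedder = nn.Embedding(vocab_size, emb_dim)
        # GRU-based feature learning over the embedding sequence.
        self.gru = nn.GRU(emb_dim, hidden_size, direction="bidirect")
        # Per-token emission scores; +2 accounts for the CRF's start/stop tags.
        self.fc = nn.Linear(hidden_size * 2, num_labels + 2)
        # CRF decoding layer on top of the learned features.
        self.crf = LinearChainCrf(num_labels)
        self.crf_loss = LinearChainCrfLoss(self.crf)
        self.decoder = ViterbiDecoder(self.crf.transitions)

    def forward(self, word_ids, lengths, labels=None):
        feats, _ = self.gru(self.embedder(word_ids))  # [batch, seq_len, 2*hidden]
        emissions = self.fc(feats)                    # [batch, seq_len, num_labels+2]
        if labels is not None:
            return self.crf_loss(emissions, lengths, labels)  # training loss
        _, prediction = self.decoder(emissions, lengths)      # Viterbi best paths
        return prediction
```

The patched scripts follow roughly the same shape: train.py optimizes a CRF loss, while eval.py and predict.py load `--init_checkpoint` and run Viterbi decoding.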
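The eval.py, predict.py, and train.py hunks all build the same `batchify_fn` out of `Pad`, `Stack`, and `Tuple` from `paddlenlp.data`. Here is a self-contained sketch of how that collation works; `ToyLacDataset` and its sample values are hypothetical stand-ins for the example's `LacDataset`, with the (word_ids, length, label_ids) field order taken from the comments in the diff.

```python
from paddle.io import DataLoader, Dataset
from paddlenlp.data import Pad, Stack, Tuple


class ToyLacDataset(Dataset):
    """Hypothetical stand-in for the example's LacDataset."""

    def __init__(self):
        # Each sample: (word_ids, length, label_ids); the values are made up.
        self.samples = [([2, 14, 5], 3, [0, 1, 1]),
                        ([7, 9], 2, [2, 2])]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


# Tuple applies one batching function per sample field: Pad right-pads the
# variable-length id lists with pad_val=0, Stack batches the scalar lengths.
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=0),  # word_ids  -> [batch, max_seq_len]
    Stack(),                 # length    -> [batch]
    Pad(axis=0, pad_val=0),  # label_ids -> [batch, max_seq_len]
): fn(samples)

loader = DataLoader(ToyLacDataset(), batch_size=2, collate_fn=batchify_fn)
for word_ids, lengths, label_ids in loader:
    print(word_ids.shape, lengths.numpy(), label_ids.shape)
```

Keeping the per-field batching functions in a `Tuple` keeps the collation declarative: adding a field to the dataset only requires adding one entry to the `Tuple`.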