diff --git a/BERT/README.md b/BERT/README.md
index 84e05b2df588be9fb42e541f84321203c34009b6..6f122caf113a20fd1e0a5b46f43fa8d7aa280c2e 100644
--- a/BERT/README.md
+++ b/BERT/README.md
@@ -12,7 +12,7 @@
   - 支持 BERT GPU 多卡 Fine-tuning
   - 提供 BERT 预测接口 demo, 方便多硬件设备生产环境的部署

-2)支持FP16/FP32混合精度训练和 Fine-tuning,节省显存开销、加速训练过程;
+2)支持 FP16/FP32 混合精度训练和 Fine-tuning,节省显存开销、加速训练过程;

 3)提供转换成 Paddle Fluid 参数格式的 [BERT 开源预训练模型](https://github.com/google-research/bert) 供下载,以进行下游任务的 Fine-tuning, 包括如下模型:

@@ -155,7 +155,7 @@ python -u run_classifier.py --task_name ${TASK_NAME} \
                    --num_iteration_per_drop_scope 1
 ```

-这里的 `chinese_L-12_H-768_A-12` 即是转换后的中文预训练模型。需要注意的是,BERT 支持按两种方式构建一个 batch 的数据,`in_tokens` 参数影响 `batch_size` 参数的意义,如果 `in_tokens` 为 `true` 则按照 token 个数构建 batch, 如不设定则按照 example 个数来构建 batch. 训练过程中会输出训练误差、训练速度等信息,训练结束后会输出如下所示的在验证集上的测试结果:
+这里的 `chinese_L-12_H-768_A-12` 即是转换后的中文预训练模型。需要注意的是,BERT on PaddlePaddle 支持按两种方式构建一个 batch 的数据,`in_tokens` 参数影响 `batch_size` 参数的意义,如果 `in_tokens` 为 `true` 则按照 token 个数构建 batch, 如不设定则按照 example 个数来构建 batch. 训练过程中会输出训练误差、训练速度等信息,训练结束后会输出如下所示的在验证集上的测试结果:

 ```
 [dev evaluation] ave loss: 0.622958, ave acc: 0.770281, elapsed time: 8.946956 s
diff --git a/BERT/predict_classifier.py b/BERT/predict_classifier.py
index 3b644996bc05e216277dbcde177ecfdc16b10430..fe813ffeffdd2beead9742e66507aac1b3d62db4 100644
--- a/BERT/predict_classifier.py
+++ b/BERT/predict_classifier.py
@@ -109,13 +109,13 @@ def main(args):

     # Due to the design that ParallelExecutor would drop small batches (mostly the last batch)
     # So using ParallelExecutor may left some data unpredicted
-    # if prediction of each and every example is needed, use Executor instead
+    # if prediction of each and every example is needed, please use Executor instead
     predict_exe = fluid.ParallelExecutor(
         use_cuda=args.use_cuda, main_program=predict_prog)

     predict_pyreader.decorate_tensor_provider(
         processor.data_generator(
-            batch_size=args.batch_size, phase='test', epoch=1))
+            batch_size=args.batch_size, phase='test', epoch=1, shuffle=False))

     predict_pyreader.start()
     all_results = []
diff --git a/BERT/run_classifier.py b/BERT/run_classifier.py
index fd5e307eefe0deb837690a7cbd7fc1b65fd35cb0..5ba2ca9224ac0b0710cc078a923c365be5fad847 100644
--- a/BERT/run_classifier.py
+++ b/BERT/run_classifier.py
@@ -65,7 +65,7 @@ data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data process
 data_g.add_arg("data_dir", str, None, "Path to training data.")
 data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
 data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
-data_g.add_arg("batch_size", int, 8192, "Total examples' number in batch for training. see also --in_tokens.")
+data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
 data_g.add_arg("in_tokens", bool, False,
                "If set, the batch size will be the maximum number of tokens in one batch. "
                "Otherwise, it will be the maximum number of examples in one batch.")
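
The README hunk and the new `batch_size` default of 32 in `run_classifier.py` both hinge on how `--in_tokens` changes the meaning of `--batch_size`. The sketch below is a minimal illustration of that distinction only; `make_batches` is a hypothetical helper written for this example and is not the repository's actual data reader.

```python
# Illustrative sketch of the two batching modes controlled by --in_tokens.
# Not the repository's real reader code; it only mirrors the documented semantics.

def make_batches(examples, batch_size, in_tokens=False):
    """Group tokenized examples (lists of token ids) into batches.

    in_tokens=False: batch_size is the number of examples per batch
                     (matches the new default of 32).
    in_tokens=True:  batch_size caps the padded token count per batch,
                     i.e. len(batch) * longest-sequence-in-batch
                     (a token budget, e.g. the old 8192 default).
    """
    batch, max_len = [], 0
    for ex in examples:
        max_len = max(max_len, len(ex))
        if in_tokens:
            would_overflow = (len(batch) + 1) * max_len > batch_size
        else:
            would_overflow = len(batch) + 1 > batch_size
        if would_overflow and batch:
            yield batch
            batch, max_len = [], len(ex)
        batch.append(ex)
    if batch:
        yield batch


if __name__ == "__main__":
    data = [[0] * n for n in (5, 7, 120, 30, 64, 200, 16)]
    # Example-count batching: 3 examples per batch -> batch sizes [3, 3, 1].
    print([len(b) for b in make_batches(data, batch_size=3)])
    # Token-budget batching: at most ~256 padded tokens per batch.
    print([len(b) for b in make_batches(data, batch_size=256, in_tokens=True)])
```

Example-count batching pairs naturally with a small default such as 32, while token-budget batching pairs with a much larger value like the old 8192; since `in_tokens` is off by default, the diff adjusts the default accordingly. The `shuffle=False` added to the prediction-time `data_generator` call is in the same spirit: it presumably keeps the test set in its original order at inference so predictions line up with the input examples.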