Merge pull request #1 from PaddlePaddle/develop

update

Merge pull request #1 from PaddlePaddle/develop
update
3a4d6312 · zhengya01 · GitHub · a171e58e · 17be726c · 3a4d6312
14 changed file
--- a/BERT/.run_ce.sh
+++ b/BERT/.run_ce.sh
+# Flags exported into the environment of the training process started below.
+export FLAGS_enable_parallel_graph=1
+export FLAGS_sync_nccl_allreduce=1
+
+# Job configuration: pretrained model directory name, task name, data and checkpoint locations.
+BERT_BASE_PATH="chinese_L-12_H-768_A-12"
+TASK_NAME='xnli'
+DATA_PATH=data/xnli/XNLI-MT-1.0
+CKPT_PATH=pretrain_model
+
+# Run one epoch of fine-tuning in CE mode (--enable_ce) with a fixed random
+# seed and shuffling disabled. The callers below pipe its stdout into _ce.py.
+# The pretrained-model paths are derived from BERT_BASE_PATH so the model
+# directory name is defined in exactly one place.
+train(){
+python -u run_classifier.py --task_name ${TASK_NAME} \
+                   --use_cuda true \
+                   --do_train true \
+                   --do_val false \
+                   --do_test false \
+                   --batch_size 8192 \
+                   --in_tokens true \
+                   --init_checkpoint pretrain_model/${BERT_BASE_PATH}/ \
+                   --data_dir ${DATA_PATH} \
+                   --vocab_path pretrain_model/${BERT_BASE_PATH}/vocab.txt \
+                   --checkpoints ${CKPT_PATH} \
+                   --save_steps 1000 \
+                   --weight_decay  0.01 \
+                   --warmup_proportion 0.0 \
+                   --validation_steps 25 \
+                   --epoch 1 \
+                   --max_seq_len 512 \
+                   --bert_config_path pretrain_model/${BERT_BASE_PATH}/bert_config.json \
+                   --learning_rate 1e-4 \
+                   --skip_steps 10 \
+                   --random_seed 100 \
+                   --enable_ce \
+                   --shuffle false
+}
+
+# Single-GPU run: the training output is piped into _ce.py, which reads it from stdin.
+export CUDA_VISIBLE_DEVICES=0
+train | python _ce.py
+
+# Four-GPU run of the same job.
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+train | python _ce.py
--- a/BERT/__init__.py
+++ b/BERT/__init__.py
--- a/BERT/_ce.py
+++ b/BERT/_ce.py
+####this file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.insert(0, os.environ['ceroot'])
+#sys.path.append('.')
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE: kpi.py should be shared across models in some way!
+
+# KPI trackers for the XNLI CE job: one cost / accuracy / duration triple for
+# the 1-card run and one for the 4-card run. log_to_ce() looks trackers up by
+# these names, so they must match the names found in the "kpis" log lines.
+# NOTE(review): the numeric arguments are presumably a tolerance and a
+# skip-head count as defined by kpi.py -- confirm against that module.
+train_cost_xnli_card1_kpi = CostKpi('train_cost_xnli_card1', 0.002, 0, actived=True)
+train_acc_xnli_card1_kpi = AccKpi('train_acc_xnli_card1', 0.002, 0, actived=True)
+train_duration_xnli_card1_kpi = DurationKpi(
+    'train_duration_xnli_card1', 0.01, 0, actived=True)
+train_cost_xnli_card4_kpi = CostKpi('train_cost_xnli_card4', 0.002, 0, actived=True)
+train_acc_xnli_card4_kpi = AccKpi('train_acc_xnli_card4', 0.02, 0, actived=True)
+train_duration_xnli_card4_kpi = DurationKpi(
+    'train_duration_xnli_card4', 0.03, 0, actived=True)
+
+# Every tracker that log_to_ce() is allowed to update.
+tracking_kpis = [
+        train_cost_xnli_card1_kpi,
+        train_acc_xnli_card1_kpi,
+        train_duration_xnli_card1_kpi,
+        train_cost_xnli_card4_kpi,
+        train_acc_xnli_card4_kpi,
+        train_duration_xnli_card4_kpi,
+]
+
+
+def parse_log(log):
+    '''
+    Yield (kpi_name, kpi_value) pairs extracted from a training log.
+
+    The log is split on newlines; each line is stripped and split on tabs,
+    and the resulting field list is echoed to stdout. Only lines with exactly
+    three fields whose first field is the literal "kpis" are KPI records:
+
+        kpis<TAB>train_cost_xnli_card1<TAB>1.0
+
+    For those, the second field is the KPI name and the third is converted
+    with float(), so a non-numeric value raises ValueError.
+    '''
+    for raw_line in log.split('\n'):
+        fields = raw_line.strip().split('\t')
+        print(fields)
+        if len(fields) != 3 or fields[0] != 'kpis':
+            continue
+        print("-----%s" % fields)
+        name = fields[1]
+        yield name, float(fields[2])
+
+
+def log_to_ce(log):
+    '''
+    Record every KPI value found in `log` on its tracker.
+
+    Trackers are indexed by name from `tracking_kpis`. Each pair produced by
+    parse_log() is echoed, added to its tracker with add_record() and then
+    written out with persist(). A KPI name that is not present in
+    `tracking_kpis` raises KeyError.
+    '''
+    trackers_by_name = {tracker.name: tracker for tracker in tracking_kpis}
+
+    for name, value in parse_log(log):
+        print(name, value)
+        tracker = trackers_by_name[name]
+        tracker.add_record(value)
+        tracker.persist()
+
+
+if __name__ == '__main__':
+    # The whole training log is read from stdin before any parsing happens.
+    log = sys.stdin.read()
+    # Echo the raw log between marker lines, then extract and record the KPIs.
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
--- a/BERT/run_classifier.py
+++ b/BERT/run_classifier.py
@@ -32,6 +32,7 @@ from model.classifier import create_model
 from optimization import optimization
 from utils.args import ArgumentGroup, print_arguments, check_cuda
 from utils.init import init_pretraining_params, init_checkpoint
+from utils.cards import get_cards
 import dist_utils

 num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
@@ -87,6 +88,8 @@ run_type_g.add_arg("do_train",                     bool,   True,  "Whether to pe
 run_type_g.add_arg("do_val",                       bool,   True,  "Whether to perform evaluation on dev data set.")
 run_type_g.add_arg("do_test",                      bool,   True,  "Whether to perform evaluation on test data set.")

+parser.add_argument("--enable_ce", action='store_true', help="The flag indicating whether to run the task for continuous evaluation.")
+
 args = parser.parse_args()
 # yapf: enable.

@@ -298,6 +301,7 @@ def main(args):
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        throughput = []
+        ce_info = []
        while True:
            try:
                # steps += 1
@@ -341,6 +345,7 @@ def main(args):
                           current_epoch, current_example, num_train_examples,
                           steps, np.sum(total_cost) / np.sum(total_num_seqs),
                           np.sum(total_acc) / np.sum(total_num_seqs))
+                    ce_info.append([np.sum(total_cost) / np.sum(total_num_seqs), np.sum(total_acc) / np.sum(total_num_seqs), used_time])
                    if steps > 0 :
                        throughput.append( args.skip_steps / used_time)
                        log_record = log_record + ", speed: %f steps/s" % (args.skip_steps / used_time)
@@ -388,6 +393,24 @@ def main(args):
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break
+        if args.enable_ce:  # print tab-separated "kpis" lines, the format _ce.py parses
+            card_num = get_cards()
+            ce_cost = 0
+            ce_acc = 0
+            ce_time = 0
+            try:  # report the second-to-last logged record; zeros are kept if it is missing
+                ce_cost = ce_info[-2][0]
+                ce_acc = ce_info[-2][1]
+                ce_time = ce_info[-2][2]
+            except IndexError:  # fewer than two records were logged; other errors now propagate
+                print("ce info error")
+            print("kpis\ttrain_duration_%s_card%s\t%s" %
+                (args.task_name, card_num, ce_time))
+            print("kpis\ttrain_cost_%s_card%s\t%f" %
+                (args.task_name, card_num, ce_cost))
+            print("kpis\ttrain_acc_%s_card%s\t%f" %
+                (args.task_name, card_num, ce_acc))
+

    # final eval on dev set
    if args.do_val:

--- a/BERT/utils/cards.py
+++ b/BERT/utils/cards.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+
+def get_cards():
+    """
+    Return the number of comma-separated entries in CUDA_VISIBLE_DEVICES.
+
+    Returns 0 when the variable is unset or set to the empty string.
+    """
+    visible = os.environ.get('CUDA_VISIBLE_DEVICES', '')
+    if visible == '':
+        return 0
+    return len(visible.split(","))
+
+
--- a/ELMo/train.py
+++ b/ELMo/train.py
@@ -18,6 +18,7 @@ from __future__ import print_function

 import six
 import numpy as np
+import random
 import time
 import os
 import math

--- a/ERNIE/.run_ce.sh
+++ b/ERNIE/.run_ce.sh
+# -e: exit on the first failing command, -u: treat unset variables as errors,
+# -x: trace each command before it runs.
+set -eux
+
+# Flag exported into the environment of the training process started below.
+export FLAGS_sync_nccl_allreduce=1
+# Locations of the pretrained model and of the task data sets.
+MODEL_PATH=ERNIE_1.0.1
+TASK_DATA_PATH=task_data
+
+# Run one epoch of XNLI fine-tuning in CE mode (--enable_ce) with a fixed
+# random seed and shuffling disabled; dev and test evaluation are enabled.
+# The callers below pipe its stdout into _ce.py.
+train() {
+python -u run_classifier.py \
+                   --use_cuda true \
+                   --do_train true \
+                   --do_val true \
+                   --do_test true \
+                   --verbose true \
+                   --batch_size 8192 \
+                   --in_tokens true \
+                   --init_pretraining_params ${MODEL_PATH}/params \
+                   --train_set ${TASK_DATA_PATH}/xnli/train.tsv \
+                   --dev_set ${TASK_DATA_PATH}/xnli/dev.tsv \
+                   --test_set ${TASK_DATA_PATH}/xnli/test.tsv \
+                   --vocab_path config/vocab.txt \
+                   --label_map ${TASK_DATA_PATH}/xnli/label_map.json \
+                   --ernie_config_path config/ernie_config.json \
+                   --checkpoints ./checkpoints \
+                   --save_steps 2000 \
+                   --weight_decay  0.01 \
+                   --warmup_proportion 0.0 \
+                   --validation_steps 25 \
+                   --epoch 1 \
+                   --max_seq_len 512 \
+                   --learning_rate 1e-4 \
+                   --skip_steps 10 \
+                   --num_iteration_per_drop_scope 1 \
+                   --num_labels 3 \
+                   --random_seed 100 \
+                   --enable_ce \
+                   --shuffle false
+}
+
+# Single-GPU run: the training output is piped into _ce.py, which reads it from stdin.
+export CUDA_VISIBLE_DEVICES=0
+train | python _ce.py
+
+# Four-GPU run of the same job.
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+train | python _ce.py
--- a/ERNIE/README.md
+++ b/ERNIE/README.md
@@ -279,7 +279,7 @@ text_a  label
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=7

-python -u ernir_encoder.py \
+python -u ernie_encoder.py \
                   --use_cuda true \
                   --batch_size 32 \
                   --output_dir "./test" \
@@ -295,3 +295,25 @@ python -u ernir_encoder.py \
 #### 如何获取输入句子中每个 token 经过 ERNIE 编码后的 Embedding 表示？

 [解决方案同上](#如何获取输入句子经过-ERNIE-编码后的-Embedding-表示?)
+
+#### 如何利用 finetune 得到的模型对新数据进行批量预测？
+
+我们以分类任务为例，给出了分类任务进行批量预测的脚本, 使用示例如下:
+
+```
+python -u predict_classifier.py \
+       --use_cuda true \
+       --batch_size 32 \
+       --vocab_path config/vocab.txt \
+       --init_checkpoint "./checkpoints/step_100" \
+       --do_lower_case true \
+       --max_seq_len 128 \
+       --ernie_config_path config/ernie_config.json \
+       --do_prediction true \
+       --predict_set ${TASK_DATA_PATH}/lcqmc/test.tsv \
+       --num_labels 2
+```
+
+实际使用时，需要通过 `init_checkpoint` 指定预测用的模型，通过 `predict_set` 指定待预测的数据文件，通过 `num_labels` 配置分类的类别数目;
+
+**Note**: predict_set 的数据格式与 dev_set 和 test_set 的数据格式完全一致，是由 text_a、text_b(可选) 、label 组成的2列/3列 tsv 文件，predict_set 中的 label 列起到占位符的作用，全部置 0 即可;
--- a/ERNIE/__init__.py
+++ b/ERNIE/__init__.py
--- a/ERNIE/_ce.py
+++ b/ERNIE/_ce.py
+####this file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.insert(0, os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE: kpi.py should be shared across models in some way!
+
+# KPI trackers for the ERNIE CE job: one loss / accuracy / duration triple for
+# the 1-card run and one for the 4-card run. log_to_ce() looks trackers up by
+# these names, so they must match the names found in the "kpis" log lines.
+# NOTE(review): the numeric arguments are presumably a tolerance and a
+# skip-head count as defined by kpi.py -- confirm against that module.
+train_loss_card1_kpi = CostKpi('train_loss_card1', 0.03, 0, actived=True)
+train_acc_card1_kpi = AccKpi('train_acc_card1', 0.06, 0, actived=True)
+train_duration_card1_kpi = DurationKpi(
+    'train_duration_card1', 0.01, 0, actived=True)
+train_loss_card4_kpi = CostKpi('train_loss_card4', 0.01, 0, actived=True)
+train_acc_card4_kpi = AccKpi('train_acc_card4', 0.02, 0, actived=True)
+train_duration_card4_kpi = DurationKpi(
+    'train_duration_card4', 0.02, 0, actived=True)
+
+# Every tracker that log_to_ce() is allowed to update.
+tracking_kpis = [
+        train_loss_card1_kpi,
+        train_acc_card1_kpi,
+        train_duration_card1_kpi,
+        train_loss_card4_kpi,
+        train_acc_card4_kpi,
+        train_duration_card4_kpi,
+]
+
+
+def parse_log(log):
+    '''
+    Yield (kpi_name, kpi_value) pairs extracted from a training log.
+
+    The log is split on newlines; each line is stripped and split on tabs,
+    and the resulting field list is echoed to stdout. Only lines with exactly
+    three fields whose first field is the literal "kpis" are KPI records:
+
+        kpis<TAB>train_loss_card1<TAB>1.0
+
+    For those, the second field is the KPI name and the third is converted
+    with float(), so a non-numeric value raises ValueError.
+    '''
+    for raw_line in log.split('\n'):
+        fields = raw_line.strip().split('\t')
+        print(fields)
+        if len(fields) != 3 or fields[0] != 'kpis':
+            continue
+        print("-----%s" % fields)
+        name = fields[1]
+        yield name, float(fields[2])
+
+
+def log_to_ce(log):
+    '''
+    Record every KPI value found in `log` on its tracker.
+
+    Trackers are indexed by name from `tracking_kpis`. Each pair produced by
+    parse_log() is echoed, added to its tracker with add_record() and then
+    written out with persist(). A KPI name that is not present in
+    `tracking_kpis` raises KeyError.
+    '''
+    trackers_by_name = {tracker.name: tracker for tracker in tracking_kpis}
+
+    for name, value in parse_log(log):
+        print(name, value)
+        tracker = trackers_by_name[name]
+        tracker.add_record(value)
+        tracker.persist()
+
+
+if __name__ == '__main__':
+    # The whole training log is read from stdin before any parsing happens.
+    log = sys.stdin.read()
+    # Echo the raw log between marker lines, then extract and record the KPIs.
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
--- a/ERNIE/finetune_args.py
+++ b/ERNIE/finetune_args.py
@@ -74,4 +74,7 @@ run_type_g.add_arg("do_train",                     bool,   True,  "Whether to pe
 run_type_g.add_arg("do_val",                       bool,   True,  "Whether to perform evaluation on dev data set.")
 run_type_g.add_arg("do_test",                      bool,   True,  "Whether to perform evaluation on test data set.")
 run_type_g.add_arg("metrics",                      bool,   True,  "Whether to perform evaluation on test data set.")
+run_type_g.add_arg("shuffle",                      bool,   True,  "Whether to shuffle the training data.")
+
+parser.add_argument("--enable_ce", action='store_true', help="The flag indicating whether to run the task for continuous evaluation.")
 # yapf: enable
--- a/ERNIE/predict_classifier.py
+++ b/ERNIE/predict_classifier.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Load classifier's checkpoint to do prediction or save inference model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import argparse
+import numpy as np
+import multiprocessing
+import paddle.fluid as fluid
+
+from reader.task_reader import ClassifyReader
+from model.ernie import ErnieConfig
+from finetune.classifier import create_model
+
+from utils.args import ArgumentGroup, print_arguments
+from utils.init import init_pretraining_params
+from finetune_args import parser
+
+# yapf: disable
+# NOTE: this rebinds `parser`, replacing the one imported from finetune_args
+# above, so only the options declared below are accepted by this script.
+# Arguments are parsed at import time and exposed as the module-level `args`.
+parser = argparse.ArgumentParser(__doc__)
+model_g = ArgumentGroup(parser, "model", "options to init, resume and save model.")
+model_g.add_arg("ernie_config_path",            str,  None,  "Path to the json file for ERNIE model config.")
+model_g.add_arg("init_checkpoint",              str,  None,  "Path to the checkpoint whose parameters are loaded for prediction.")
+model_g.add_arg("use_fp16",                     bool, False, "Whether to resume parameters from fp16 checkpoint.")
+model_g.add_arg("num_labels",                   int,  2,     "num labels for classify")
+
+data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options.")
+data_g.add_arg("predict_set",         str,  None,  "Predict set file")
+data_g.add_arg("vocab_path",          str,  None,  "Vocabulary path.")
+data_g.add_arg("label_map_config",    str,  None,  "Label_map_config json file.")
+data_g.add_arg("max_seq_len",         int,  128,   "Number of words of the longest sequence.")
+data_g.add_arg("batch_size",          int,  32,    "Number of examples in each prediction batch.")
+data_g.add_arg("do_lower_case",       bool, True,
+               "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
+
+run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
+run_type_g.add_arg("use_cuda",          bool,   True,  "If set, use GPU for prediction.")
+run_type_g.add_arg("do_prediction",     bool,   True,  "Whether to do prediction on test set.")
+
+args = parser.parse_args()
+# yapf: enable.
+
+def main(args):
+    """Run batch prediction with a fine-tuned classifier and print the results.
+
+    Builds the prediction program, loads parameters from
+    ``args.init_checkpoint``, feeds ``args.predict_set`` through the network
+    for a single unshuffled epoch, and prints one tab-separated
+    ``index, value`` line for every entry of the fetched ``probs`` output.
+
+    Args:
+        args: parsed command-line namespace. This function reads
+            ``ernie_config_path``, ``vocab_path``, ``label_map_config``,
+            ``max_seq_len``, ``do_lower_case``, ``use_cuda``,
+            ``init_checkpoint``, ``predict_set`` and ``batch_size``; the
+            whole namespace is also handed to ``create_model``.
+
+    Raises:
+        ValueError: if ``args.init_checkpoint`` is not set.
+    """
+    ernie_config = ErnieConfig(args.ernie_config_path)
+    ernie_config.print_config()
+
+    reader = ClassifyReader(
+        vocab_path=args.vocab_path,
+        label_map_config=args.label_map_config,
+        max_seq_len=args.max_seq_len,
+        do_lower_case=args.do_lower_case,
+        in_tokens=False)
+
+    predict_prog = fluid.Program()
+    predict_startup = fluid.Program()
+    with fluid.program_guard(predict_prog, predict_startup):
+        with fluid.unique_name.guard():
+            predict_pyreader, probs, feed_target_names = create_model(
+                args,
+                pyreader_name='predict_reader',
+                ernie_config=ernie_config,
+                is_prediction=True)
+
+    predict_prog = predict_prog.clone(for_test=True)
+
+    # Prediction runs on one place only: GPU 0 when CUDA is requested,
+    # otherwise the CPU. No device count is needed.
+    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(predict_startup)
+
+    if not args.init_checkpoint:
+        raise ValueError("args 'init_checkpoint' should be set for prediction!")
+    init_pretraining_params(exe, args.init_checkpoint, predict_prog)
+
+    predict_exe = fluid.Executor(place)
+
+    predict_data_generator = reader.data_generator(
+        input_file=args.predict_set,
+        batch_size=args.batch_size,
+        epoch=1,
+        shuffle=False)
+
+    predict_pyreader.decorate_tensor_provider(predict_data_generator)
+
+    predict_pyreader.start()
+    all_results = []
+    # Keep running batches until the reader signals that the data is exhausted.
+    while True:
+        try:
+            results = predict_exe.run(program=predict_prog, fetch_list=[probs.name])
+            all_results.extend(results[0])
+        except fluid.core.EOFException:
+            predict_pyreader.reset()
+            break
+
+    np.set_printoptions(precision=4, suppress=True)
+    print("-------------- prediction results --------------")
+    for index, result in enumerate(all_results):
+        print(str(index) + '\t{}'.format(result))
+
+
+if __name__ == '__main__':
+    # `args` was parsed at module import time; report it, then run prediction.
+    print_arguments(args)
+    main(args)
--- a/ERNIE/run_classifier.py
+++ b/ERNIE/run_classifier.py
@@ -29,6 +29,7 @@ from finetune.classifier import create_model, evaluate
 from optimization import optimization
 from utils.args import print_arguments, check_cuda
 from utils.init import init_pretraining_params, init_checkpoint
+from utils.cards import get_cards
 from finetune_args import parser

 args = parser.parse_args()
@@ -67,7 +68,7 @@ def main(args):
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
-            shuffle=True,
+            shuffle=args.shuffle,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)
@@ -85,6 +86,8 @@ def main(args):
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
+        if args.random_seed is not None and args.enable_ce:
+            train_program.random_seed = args.random_seed

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
@@ -187,6 +190,7 @@ def main(args):
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

+        ce_info = []
        time_begin = time.time()
        while True:
            try:
@@ -213,6 +217,7 @@ def main(args):
                          (current_epoch, current_example, num_train_examples,
                           steps, outputs["loss"], outputs["accuracy"],
                           args.skip_steps / used_time))
+                    ce_info.append([outputs["loss"], outputs["accuracy"], used_time])
                    time_begin = time.time()

                if steps % args.save_steps == 0:
@@ -246,6 +251,24 @@ def main(args):
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break
+        if args.enable_ce:  # print tab-separated "kpis" lines, the format _ce.py parses
+            card_num = get_cards()
+            ce_loss = 0
+            ce_acc = 0
+            ce_time = 0
+            try:  # report the second-to-last logged record; zeros are kept if it is missing
+                ce_loss = ce_info[-2][0]
+                ce_acc = ce_info[-2][1]
+                ce_time = ce_info[-2][2]
+            except IndexError:  # fewer than two records were logged; other errors now propagate
+                print("ce info error")
+            print("kpis\ttrain_duration_card%s\t%s" %
+                (card_num, ce_time))
+            print("kpis\ttrain_loss_card%s\t%f" %
+                (card_num, ce_loss))
+            print("kpis\ttrain_acc_card%s\t%f" %
+                (card_num, ce_acc))
+

    # final eval on dev set
    if args.do_val:

--- a/ERNIE/utils/cards.py
+++ b/ERNIE/utils/cards.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+
+def get_cards():
+    """
+    Return the number of comma-separated entries in CUDA_VISIBLE_DEVICES.
+
+    Returns 0 when the variable is unset or set to the empty string.
+    """
+    visible = os.environ.get('CUDA_VISIBLE_DEVICES', '')
+    if visible == '':
+        return 0
+    return len(visible.split(","))
+
+