Merge pull request #2 from PaddlePaddle/develop

update

Merge pull request #2 from PaddlePaddle/develop
update
34f1f624 · zhengya01 · GitHub · 3a4d6312 · 4ddf6218 · 34f1f624
141 changed file
--- a/.gitignore
+++ b/.gitignore
 *.pyc
+*.un~
--- a/.metas/ernie2.0_arch.png
+++ b/.metas/ernie2.0_arch.png
--- a/.metas/ernie2.0_model.png
+++ b/.metas/ernie2.0_model.png
--- a/ERNIE/.run_ce.sh
+++ b/ERNIE/.run_ce.sh
--- a/BERT/README.md
+++ b/BERT/README.md
--- a/BERT/_ce.py
+++ b/BERT/_ce.py
-####this file is only used for continuous evaluation test!
-
-import os
-import sys
-sys.path.insert(0, os.environ['ceroot'])
-#sys.path.append('.')
-from kpi import CostKpi, DurationKpi, AccKpi
-
-#### NOTE kpi.py should shared in models in some way!!!!
-
-train_cost_xnli_card1_kpi = CostKpi('train_cost_xnli_card1', 0.002, 0, actived=True)
-train_acc_xnli_card1_kpi = AccKpi('train_acc_xnli_card1', 0.002, 0, actived=True)
-train_duration_xnli_card1_kpi = DurationKpi(
-    'train_duration_xnli_card1', 0.01, 0, actived=True)
-train_cost_xnli_card4_kpi = CostKpi('train_cost_xnli_card4', 0.002, 0, actived=True)
-train_acc_xnli_card4_kpi = AccKpi('train_acc_xnli_card4', 0.02, 0, actived=True)
-train_duration_xnli_card4_kpi = DurationKpi(
-    'train_duration_xnli_card4', 0.03, 0, actived=True)
-
-tracking_kpis = [
-        train_cost_xnli_card1_kpi,
-        train_acc_xnli_card1_kpi,
-        train_duration_xnli_card1_kpi,
-        train_cost_xnli_card4_kpi,
-        train_acc_xnli_card4_kpi,
-        train_duration_xnli_card4_kpi,
-]
-
-
-def parse_log(log):
-    '''
-    This method should be implemented by model developers.
-    The suggestion:
-    each line in the log should be key, value, for example:
-    "
-    train_cost\t1.0
-    test_cost\t1.0
-    train_cost\t1.0
-    train_cost\t1.0
-    train_acc\t1.2
-    "
-    '''
-    for line in log.split('\n'):
-        fs = line.strip().split('\t')
-        print(fs)
-        if len(fs) == 3 and fs[0] == 'kpis':
-            print("-----%s" % fs)
-            kpi_name = fs[1]
-            kpi_value = float(fs[2])
-            yield kpi_name, kpi_value
-
-
-def log_to_ce(log):
-    kpi_tracker = {}
-    for kpi in tracking_kpis:
-        kpi_tracker[kpi.name] = kpi
-
-    for (kpi_name, kpi_value) in parse_log(log):
-        print(kpi_name, kpi_value)
-        kpi_tracker[kpi_name].add_record(kpi_value)
-        kpi_tracker[kpi_name].persist()
-
-
-if __name__ == '__main__':
-    log = sys.stdin.read()
-    print("*****")
-    print(log)
-    print("****")
-    log_to_ce(log)
--- a/BERT/batching.py
+++ b/BERT/batching.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Mask, padding and batching."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-
-def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
-    """
-    Add mask for batch_tokens, return out, mask_label, mask_pos;
-    Note: mask_pos responding the batch_tokens after padded;
-    """
-    max_len = max([len(sent) for sent in batch_tokens])
-    mask_label = []
-    mask_pos = []
-    prob_mask = np.random.rand(total_token_num)
-    # Note: the first token is [CLS], so [low=1]
-    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
-    pre_sent_len = 0
-    prob_index = 0
-    for sent_index, sent in enumerate(batch_tokens):
-        mask_flag = False
-        prob_index += pre_sent_len
-        for token_index, token in enumerate(sent):
-            prob = prob_mask[prob_index + token_index]
-            if prob > 0.15:
-                continue
-            elif 0.03 < prob <= 0.15:
-                # mask
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    sent[token_index] = MASK
-                    mask_flag = True
-                    mask_pos.append(sent_index * max_len + token_index)
-            elif 0.015 < prob <= 0.03:
-                # random replace
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    sent[token_index] = replace_ids[prob_index + token_index]
-                    mask_flag = True
-                    mask_pos.append(sent_index * max_len + token_index)
-            else:
-                # keep the original token
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    mask_pos.append(sent_index * max_len + token_index)
-        pre_sent_len = len(sent)
-
-        # ensure at least mask one word in a sentence
-        while not mask_flag:
-            token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
-            if sent[token_index] != SEP and sent[token_index] != CLS:
-                mask_label.append(sent[token_index])
-                sent[token_index] = MASK
-                mask_flag = True
-                mask_pos.append(sent_index * max_len + token_index)
-    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
-    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
-    return batch_tokens, mask_label, mask_pos
-
-
-def prepare_batch_data(insts,
-                       total_token_num,
-                       voc_size=0,
-                       pad_id=None,
-                       cls_id=None,
-                       sep_id=None,
-                       mask_id=None,
-                       return_input_mask=True,
-                       return_max_len=True,
-                       return_num_token=False):
-    """
-    1. generate Tensor of data
-    2. generate Tensor of position
-    3. generate self attention mask, [shape: batch_size *  max_len * max_len]
-    """
-
-    batch_src_ids = [inst[0] for inst in insts]
-    batch_sent_ids = [inst[1] for inst in insts]
-    batch_pos_ids = [inst[2] for inst in insts]
-    labels_list = []
-    # compatible with squad, whose example includes start/end positions, 
-    # or unique id
-
-    for i in range(3, len(insts[0]), 1):
-        labels = [inst[i] for inst in insts]
-        labels = np.array(labels).astype("int64").reshape([-1, 1])
-        labels_list.append(labels)
-
-    # First step: do mask without padding
-    if mask_id >= 0:
-        out, mask_label, mask_pos = mask(
-            batch_src_ids,
-            total_token_num,
-            vocab_size=voc_size,
-            CLS=cls_id,
-            SEP=sep_id,
-            MASK=mask_id)
-    else:
-        out = batch_src_ids
-    # Second step: padding
-    src_id, self_input_mask = pad_batch_data(
-        out, pad_idx=pad_id, return_input_mask=True)
-    pos_id = pad_batch_data(
-        batch_pos_ids,
-        pad_idx=pad_id,
-        return_pos=False,
-        return_input_mask=False)
-    sent_id = pad_batch_data(
-        batch_sent_ids,
-        pad_idx=pad_id,
-        return_pos=False,
-        return_input_mask=False)
-
-    if mask_id >= 0:
-        return_list = [
-            src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
-        ] + labels_list
-    else:
-        return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
-
-    return return_list if len(return_list) > 1 else return_list[0]
-
-
-def pad_batch_data(insts,
-                   pad_idx=0,
-                   return_pos=False,
-                   return_input_mask=False,
-                   return_max_len=False,
-                   return_num_token=False):
-    """
-    Pad the instances to the max sequence length in batch, and generate the
-    corresponding position data and input mask.
-    """
-    return_list = []
-    max_len = max(len(inst) for inst in insts)
-    # Any token included in dict can be used to pad, since the paddings' loss
-    # will be masked out by weights and make no effect on parameter gradients.
-
-    inst_data = np.array([
-        list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
-    ])
-    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
-
-    # position data
-    if return_pos:
-        inst_pos = np.array([
-            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
-            for inst in insts
-        ])
-
-        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
-
-    if return_input_mask:
-        # This is used to avoid attention on paddings.
-        input_mask_data = np.array([[1] * len(inst) + [0] *
-                                    (max_len - len(inst)) for inst in insts])
-        input_mask_data = np.expand_dims(input_mask_data, axis=-1)
-        return_list += [input_mask_data.astype("float32")]
-
-    if return_max_len:
-        return_list += [max_len]
-
-    if return_num_token:
-        num_token = 0
-        for inst in insts:
-            num_token += len(inst)
-        return_list += [num_token]
-
-    return return_list if len(return_list) > 1 else return_list[0]
-
-
-if __name__ == "__main__":
-    pass
--- a/BERT/convert_params.py
+++ b/BERT/convert_params.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert Google official BERT models to Fluid parameters."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import collections
-from utils.args import print_arguments
-import tensorflow as tf
-import paddle.fluid as fluid
-from tensorflow.python import pywrap_tensorflow
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(__doc__)
-    parser.add_argument(
-        "--init_tf_checkpoint",
-        type=str,
-        required=True,
-        help="Initial TF checkpoint (a pre-trained BERT model).")
-
-    parser.add_argument(
-        "--fluid_params_dir",
-        type=str,
-        required=True,
-        help="The directory to store converted Fluid parameters.")
-    args = parser.parse_args()
-    return args
-
-
-def parse(init_checkpoint):
-    tf_fluid_param_name_map = collections.OrderedDict()
-    tf_param_name_shape_map = collections.OrderedDict()
-
-    init_vars = tf.train.list_variables(init_checkpoint)
-    for (var_name, var_shape) in init_vars:
-        fluid_param_name = ''
-        if var_name.startswith('bert/'):
-            key = var_name[5:]
-            if (key.startswith('embeddings/')):
-                if (key.endswith('LayerNorm/gamma')):
-                    fluid_param_name = 'pre_encoder_layer_norm_scale'
-                elif (key.endswith('LayerNorm/beta')):
-                    fluid_param_name = 'pre_encoder_layer_norm_bias'
-                elif (key.endswith('position_embeddings')):
-                    fluid_param_name = 'pos_embedding'
-                elif (key.endswith('word_embeddings')):
-                    fluid_param_name = 'word_embedding'
-                elif (key.endswith('token_type_embeddings')):
-                    fluid_param_name = 'sent_embedding'
-                else:
-                    print("ignored param: %s" % var_name)
-            elif (key.startswith('encoder/')):
-                key = key[8:]
-                layer_num = int(key[key.find('_') + 1:key.find('/')])
-                suffix = "encoder_layer_" + str(layer_num)
-                if key.endswith('attention/output/LayerNorm/beta'):
-                    fluid_param_name = suffix + '_post_att_layer_norm_bias'
-                elif key.endswith('attention/output/LayerNorm/gamma'):
-                    fluid_param_name = suffix + '_post_att_layer_norm_scale'
-                elif key.endswith('attention/output/dense/bias'):
-                    fluid_param_name = suffix + '_multi_head_att_output_fc.b_0'
-                elif key.endswith('attention/output/dense/kernel'):
-                    fluid_param_name = suffix + '_multi_head_att_output_fc.w_0'
-                elif key.endswith('attention/self/key/bias'):
-                    fluid_param_name = suffix + '_multi_head_att_key_fc.b_0'
-                elif key.endswith('attention/self/key/kernel'):
-                    fluid_param_name = suffix + '_multi_head_att_key_fc.w_0'
-                elif key.endswith('attention/self/query/bias'):
-                    fluid_param_name = suffix + '_multi_head_att_query_fc.b_0'
-                elif key.endswith('attention/self/query/kernel'):
-                    fluid_param_name = suffix + '_multi_head_att_query_fc.w_0'
-                elif key.endswith('attention/self/value/bias'):
-                    fluid_param_name = suffix + '_multi_head_att_value_fc.b_0'
-                elif key.endswith('attention/self/value/kernel'):
-                    fluid_param_name = suffix + '_multi_head_att_value_fc.w_0'
-                elif key.endswith('intermediate/dense/bias'):
-                    fluid_param_name = suffix + '_ffn_fc_0.b_0'
-                elif key.endswith('intermediate/dense/kernel'):
-                    fluid_param_name = suffix + '_ffn_fc_0.w_0'
-                elif key.endswith('output/LayerNorm/beta'):
-                    fluid_param_name = suffix + '_post_ffn_layer_norm_bias'
-                elif key.endswith('output/LayerNorm/gamma'):
-                    fluid_param_name = suffix + '_post_ffn_layer_norm_scale'
-                elif key.endswith('output/dense/bias'):
-                    fluid_param_name = suffix + '_ffn_fc_1.b_0'
-                elif key.endswith('output/dense/kernel'):
-                    fluid_param_name = suffix + '_ffn_fc_1.w_0'
-                else:
-                    print("ignored param: %s" % var_name)
-            elif (key.startswith('pooler/')):
-                if key.endswith('dense/bias'):
-                    fluid_param_name = 'pooled_fc.b_0'
-                elif key.endswith('dense/kernel'):
-                    fluid_param_name = 'pooled_fc.w_0'
-                else:
-                    print("ignored param: %s" % var_name)
-            else:
-                print("ignored param: %s" % var_name)
-
-        elif var_name.startswith('cls/'):
-            if var_name == 'cls/predictions/output_bias':
-                fluid_param_name = 'mask_lm_out_fc.b_0'
-            elif var_name == 'cls/predictions/transform/LayerNorm/beta':
-                fluid_param_name = 'mask_lm_trans_layer_norm_bias'
-            elif var_name == 'cls/predictions/transform/LayerNorm/gamma':
-                fluid_param_name = 'mask_lm_trans_layer_norm_scale'
-            elif var_name == 'cls/predictions/transform/dense/bias':
-                fluid_param_name = 'mask_lm_trans_fc.b_0'
-            elif var_name == 'cls/predictions/transform/dense/kernel':
-                fluid_param_name = 'mask_lm_trans_fc.w_0'
-            elif var_name == 'cls/seq_relationship/output_bias':
-                fluid_param_name = 'next_sent_fc.b_0'
-            elif var_name == 'cls/seq_relationship/output_weights':
-                fluid_param_name = 'next_sent_fc.w_0'
-            elif var_name == 'cls/squad/output_weights':
-                fluid_param_name = 'cls_squad_out_w'
-            elif var_name == 'cls/squad/output_bias':
-                fluid_param_name = 'cls_squad_out_b'
-            else:
-                print("ignored param: %s" % var_name)
-        else:
-            if var_name == 'output_weights':
-                fluid_param_name = 'cls_out_w'
-            elif var_name == 'output_bias':
-                fluid_param_name = 'cls_out_b'
-            else:
-                print("ignored param: %s" % var_name)
-
-        if fluid_param_name != '':
-            tf_fluid_param_name_map[var_name] = fluid_param_name
-            tf_param_name_shape_map[var_name] = var_shape
-            fluid_param_name = ''
-
-    return tf_fluid_param_name_map, tf_param_name_shape_map
-
-
-def convert(args):
-    tf_fluid_param_name_map, tf_param_name_shape_map = parse(
-        args.init_tf_checkpoint)
-    program = fluid.Program()
-    global_block = program.global_block()
-    for param in tf_fluid_param_name_map:
-        global_block.create_parameter(
-            name=tf_fluid_param_name_map[param],
-            shape=tf_param_name_shape_map[param],
-            dtype='float32',
-            initializer=fluid.initializer.Constant(value=0.0))
-
-    place = fluid.core.CPUPlace()
-    exe = fluid.Executor(place)
-    exe.run(program)
-
-    print('---------------------- Converted Parameters -----------------------')
-    print('###### [TF param name] --> [Fluid param name]  [param shape] ######')
-    print('-------------------------------------------------------------------')
-
-    reader = pywrap_tensorflow.NewCheckpointReader(args.init_tf_checkpoint)
-    for param in tf_fluid_param_name_map:
-        value = reader.get_tensor(param)
-        if param == 'cls/seq_relationship/output_weights':
-            value = np.transpose(value)
-        if param == 'cls/squad/output_weights':
-            value = np.transpose(value)
-        if param == 'output_weights':
-            value = np.transpose(value)
-        fluid.global_scope().find_var(tf_fluid_param_name_map[
-            param]).get_tensor().set(value, place)
-        print(param, ' --> ', tf_fluid_param_name_map[param], '  ', value.shape)
-
-    fluid.io.save_params(exe, args.fluid_params_dir, main_program=program)
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    print_arguments(args)
-    convert(args)
--- a/BERT/data/demo_config/vocab.txt
+++ b/BERT/data/demo_config/vocab.txt
--- a/BERT/data/demo_wiki_tokens.txt
+++ b/BERT/data/demo_wiki_tokens.txt
--- a/BERT/data/train/demo_wiki_train.gz
+++ b/BERT/data/train/demo_wiki_train.gz
--- a/BERT/data/validation/demo_wiki_validation.gz
+++ b/BERT/data/validation/demo_wiki_validation.gz
--- a/BERT/dist_utils.py
+++ b/BERT/dist_utils.py
-#  Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import os
-import paddle.fluid as fluid
-
-def nccl2_prepare(trainer_id, startup_prog, main_prog):
-   config = fluid.DistributeTranspilerConfig()
-   config.mode = "nccl2"
-   t = fluid.DistributeTranspiler(config=config)
-   t.transpile(trainer_id,
-      trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'),
-      current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'),
-      startup_program=startup_prog,
-      program=main_prog)
-
-def prepare_for_multi_process(exe, build_strategy, train_prog):
-   # prepare for multi-process
-   trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0))
-   num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
-   if num_trainers < 2: return
-   print("PADDLE_TRAINERS_NUM", num_trainers)
-   print("PADDLE_TRAINER_ID", trainer_id)
-   build_strategy.num_trainers =  num_trainers
-   build_strategy.trainer_id = trainer_id
-   # NOTE(zcd): use multi processes to train the model,
-   # and each process use one GPU card.
-   startup_prog = fluid.Program()
-   nccl2_prepare(trainer_id, startup_prog, train_prog)
-   # the startup_prog are run two times, but it doesn't matter.
-   exe.run(startup_prog) 
--- a/BERT/inference/CMakeLists.txt
+++ b/BERT/inference/CMakeLists.txt
-CMAKE_MINIMUM_REQUIRED(VERSION 3.2)
-PROJECT(inference_demo)
-SET(CMAKE_C_COMPILER gcc)
-SET(CMAKE_CXX_COMPILER g++)
-ADD_COMPILE_OPTIONS(-std=c++11 -D_GLIBCXX_USE_CXX11_ABI=0)
-
-SET(FLUID_INFER_LIB fluid_inference)
-SET(FLUID_INC_PATH ${FLUID_INFER_LIB}/paddle/include)
-SET(FLUID_LIB_PATH ${FLUID_INFER_LIB}/paddle/lib)
-
-SET(GLOG_INC_PATH ${FLUID_INFER_LIB}/third_party/install/glog/include)
-SET(GLOG_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/glog/lib)
-
-SET(GFLAGS_INC_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/include)
-SET(GFLAGS_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/lib)
-SET(MKLDNN_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mkldnn/lib)
-SET(MKLML_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mklml/lib)
-
-INCLUDE_DIRECTORIES(${FLUID_INC_PATH})
-INCLUDE_DIRECTORIES(${GLOG_INC_PATH})
-INCLUDE_DIRECTORIES(${GFLAGS_INC_PATH})
-
-LINK_DIRECTORIES(${FLUID_LIB_PATH})
-LINK_DIRECTORIES(${GLOG_LIB_PATH})
-LINK_DIRECTORIES(${GFLAGS_LIB_PATH})
-LINK_DIRECTORIES(${MKLML_LIB_PATH})
-LINK_DIRECTORIES(${MKLDNN_LIB_PATH})
-
-ADD_EXECUTABLE(inference inference.cc)
-TARGET_LINK_LIBRARIES(inference dl paddle_fluid glog gflags pthread)
-
--- a/BERT/inference/README.md
+++ b/BERT/inference/README.md
-# BERT模型inference demo
-
-## 数据预处理
-实际应用场景中，模型部署之后用户还需要编写对应的程序对输入进行处理，然后把得到的数据传给模型进行预测。这里为了演示的需要，用 `gen_demo_data.py` 来进行数据处理，包括 tokenization，batching，numericalization，并且把处理后的数据输出为文本文件。使用方法如下：
-
-``` bash
-TASK_NAME="xnli"
-DATA_PATH=/path/to/xnli/data/
-BERT_BASE_PATH=/path/to/bert/pretrained/model/
-python gen_demo_data.py \
-    --task_name ${TASK_NAME} \
-    --data_path ${DATA_PATH} \
-    --vocab_path "${BERT_BASE_PATH}/vocab.txt" \
-    --batch_size 4096 \
-    --in_tokens \
-    > data.txt
-```
-
-**生成的数据格式**
-
-生成的数据一行代表一个 `batch`, 包含四个字段
-
-```text
-src_id, pos_id, segment_id, input_mask
-```
-
-字段之间按照分号(;)分隔，其中各字段内部 `shape` 和 `data` 按照冒号(:)分隔，`shape` 和 `data` 内部按空格分隔，`input_mask` 为 FLOAT32 类型，其余字段为 INT64 类型。
-
-## 编译和运行
-
-为了编译 inference demo，`C++` 编译器需要支持 `C++11` 标准。
-
-首先下载对应的 [PaddlePaddle预测库](http://paddlepaddle.org/documentation/docs/zh/1.3/advanced_usage/deploy/inference/build_and_install_lib_cn.html) , 根据使用的 paddle 的版本和配置状况 (是否使用 avx, mkl, 以及 cuda, cudnn 版本) 选择下载对应的版本，并解压至 `inference` 目录，会得到 `fluid_inference` 子目录。
-
-假设`paddle_infer_lib_path`是刚才解压得到的`fluid_inference`子目录的绝对路径，设置运行相关的环境变量(以 `cpu_avx_mkl` 版本为例)
-
-``` bash
-LD_LIBRARY_PATH=${paddle_infer_lib_path}/paddle/lib/:$LD_LIBRARY_PATH
-LD_LIBRARY_PATH=${paddle_infer_lib_path}/third_party/install/mklml/lib:$LD_LIBRARY_PATH
-LD_LIBRARY_PATH=${paddle_infer_lib_path}/third_party/install/mkldnn/lib:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH
-```
-
-编译 demo
-
-``` bash
-mkdir build && cd build
-cmake .. -DFLUID_INFER_LIB=${paddle_infer_lib_path}
-make
-```
-
-这会在 `build` 目录下生成运行 `inference` 可执行文件。
-
-运行 demo
-
-```bash
-./inference --logtostderr \
-    --model_dir $INFERENCE_MODEL_PATH \
-    --data $DATA_PATH \
-    --repeat $REPEAT_TIMES
-    --output_prediction \
-    --use_gpu \
-```
-
-参数 `repeat` 设置了执行预测的循环次数，一般在性能测试时可以设置其为大于 1 的某个整数，以观察多次预测的平均时间消耗。 在设置了 `output_prediction` 之后，预测程序会将每个样本的预测结果以概率的形式输出，其格式为：
-
-```
-样本id \t 类别0概率 \t 类别1概率 \t 类别2概率 ...
-```
-
-最后，在支持 NV GPUs 的环境中可以使能 `use_gpu`，否则就会在 CPU 上执行预测。
--- a/BERT/inference/gen_demo_data.py
+++ b/BERT/inference/gen_demo_data.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import sys
-sys.path.append("..")
-from reader import cls
-
-
-def main():
-    args = parse_args()
-    task_name = args.task_name.lower()
-    processors = {
-        'xnli': cls.XnliProcessor,
-        'cola': cls.ColaProcessor,
-        'mrpc': cls.MrpcProcessor,
-        'mnli': cls.MnliProcessor,
-    }
-
-    processor = processors[task_name](data_dir=args.data_path,
-                                      vocab_path=args.vocab_path,
-                                      max_seq_len=args.max_seq_len,
-                                      do_lower_case=args.do_lower_case,
-                                      in_tokens=args.in_tokens,
-                                      random_seed=args.random_seed)
-    example = processor.get_test_examples(args.data_path)[0]
-    gen = processor.data_generator(
-        args.batch_size, phase='test', epoch=1, shuffle=False)()
-
-    for i, data in enumerate(gen):
-        data = data[:4]
-        sample = []
-        for field in data:
-            shape_str = ' '.join(map(str, field.shape))
-            data_str = ' '.join(map(str, field.reshape(-1).tolist()))
-            sample.append(shape_str + ':' + data_str)
-        print(';'.join(sample))
-
-
-def str2bool(v):
-    # because argparse does not support to parse "true, False" as python
-    # boolean directly
-    return v.lower() in ("true", "t", "1")
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(prog="bert data prepare")
-    parser.add_argument(
-        "--task_name",
-        type=str,
-        default='xnli',
-        choices=["xnli", "mnli", "cola", "mrpc"],
-        help="task name, used to specify data preprocessor")
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=4096,
-        help="batch size, see also --in_tokens")
-    parser.add_argument(
-        "--in_tokens",
-        action='store_true',
-        help="if set, batch_size means token number in a batch, otherwise "
-        "it means example number in a batch")
-    parser.add_argument(
-        '--do_lower_case',
-        type=str2bool,
-        default=True,
-        choices=[True, False],
-        help="Whether to lower case the input text. Should be True for uncased "
-        "models and False for cased models.")
-    parser.add_argument("--vocab_path", type=str, help="path of vocabulary")
-    parser.add_argument("--data_path", type=str, help="path of data to process")
-    parser.add_argument(
-        "--max_seq_len", type=int, default=128, help="max sequence length")
-    parser.add_argument(
-        "--random_seed", type=int, default=0, help="random seed")
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    main()
--- a/BERT/inference/inference.cc
+++ b/BERT/inference/inference.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-#include <paddle_inference_api.h>
-#include <chrono>
-#include <iostream>
-#include <fstream>
-#include <numeric>
-#include <sstream>
-#include <string>
-#include <vector>
-
-DEFINE_string(model_dir, "", "Inference model directory.");
-DEFINE_string(data, "", "Input data path.");
-DEFINE_int32(repeat, 1, "Repeat times.");
-DEFINE_int32(num_labels, 3, "Number of labels.");
-DEFINE_bool(output_prediction, false, "Whether to output the prediction results.");
-DEFINE_bool(use_gpu, false, "Whether to use GPU for prediction.");
-
-template <typename T>
-void GetValueFromStream(std::stringstream *ss, T *t) {
-  (*ss) >> (*t);
-}
-
-template <>
-void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
-  *t = ss->str();
-}
-
-// Split string to vector
-template <typename T>
-void Split(const std::string &line, char sep, std::vector<T> *v) {
-  std::stringstream ss;
-  T t;
-  for (auto c : line) {
-    if (c != sep) {
-      ss << c;
-    } else {
-      GetValueFromStream<T>(&ss, &t);
-      v->push_back(std::move(t));
-      ss.str({});
-      ss.clear();
-    }
-  }
-
-  if (!ss.str().empty()) {
-    GetValueFromStream<T>(&ss, &t);
-    v->push_back(std::move(t));
-    ss.str({});
-    ss.clear();
-  }
-}
-
-template <typename T>
-constexpr paddle::PaddleDType GetPaddleDType();
-
-template <>
-constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
-  return paddle::PaddleDType::INT64;
-}
-
-template <>
-constexpr paddle::PaddleDType GetPaddleDType<float>() {
-  return paddle::PaddleDType::FLOAT32;
-}
-
-
-// Parse tensor from string
-template <typename T>
-bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
-  std::vector<std::string> data;
-  Split(field, ':', &data);
-  if (data.size() < 2) {
-    LOG(ERROR) << "parse tensor error!";
-    return false;
-  }
-
-  std::string shape_str = data[0];
-
-  std::vector<int> shape;
-  Split(shape_str, ' ', &shape);
-
-  std::string mat_str = data[1];
-
-  std::vector<T> mat;
-  Split(mat_str, ' ', &mat);
-
-  tensor->shape = shape;
-  auto size =
-      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
-      sizeof(T);
-  tensor->data.Resize(size);
-  std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
-  tensor->dtype = GetPaddleDType<T>();
-
-  return true;
-}
-
-// Parse input tensors from string
-bool ParseLine(const std::string &line,
-               std::vector<paddle::PaddleTensor> *tensors) {
-  std::vector<std::string> fields;
-  Split(line, ';', &fields);
-
-  if (fields.size() < 4) return false;
-
-  tensors->clear();
-  tensors->reserve(4);
-
-  int i = 0;
-  // src_id
-  paddle::PaddleTensor src_id;
-  ParseTensor<int64_t>(fields[i++], &src_id);
-  tensors->push_back(src_id);
-
-  // pos_id
-  paddle::PaddleTensor pos_id;
-  ParseTensor<int64_t>(fields[i++], &pos_id);
-  tensors->push_back(pos_id);
-
-  // segment_id
-  paddle::PaddleTensor segment_id;
-  ParseTensor<int64_t>(fields[i++], &segment_id);
-  tensors->push_back(segment_id);
-
-  // input mask
-  paddle::PaddleTensor input_mask;
-  ParseTensor<float>(fields[i++], &input_mask);
-  tensors->push_back(input_mask);
-
-  return true;
-}
-
-template <typename T>
-void PrintTensor(const paddle::PaddleTensor &t) {
-  std::stringstream ss;
-  ss.str({});
-  ss.clear();
-  ss << "Tensor: shape[";
-  for (auto i: t.shape) {
-    ss << i << " ";
-  }
-  ss << "], data[";
-  T *data = static_cast<T *>(t.data.data());
-  for (int i = 0; i < t.data.length() / sizeof(T); i++) {
-    ss << data[i] << " ";
-  }
-
-  ss << "]";
-  LOG(INFO) << ss.str();
-}
-
-void PrintInputs(const std::vector<paddle::PaddleTensor> &inputs) {
-  for (const auto &t : inputs) {
-    if (t.dtype == paddle::PaddleDType::INT64) {
-      PrintTensor<int64_t>(t);
-    } else {
-      PrintTensor<float>(t);
-    }
-  }
-}
-
-// Print outputs to log
-void PrintOutputs(const std::vector<paddle::PaddleTensor> &outputs, int &cnt) {
-  for (size_t i = 0; i < outputs.front().data.length() / sizeof(float); 
-       i += FLAGS_num_labels) {
-    std::cout << cnt << "\t";
-    for (size_t j = 0; j < FLAGS_num_labels; ++j) {
-      std::cout  << static_cast<float *>(outputs.front().data.data())[i+j] << "\t";
-    }
-    std::cout << std::endl;
-    cnt += 1;
-  }
-}
-
-bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs) {
-  if (FLAGS_data.empty()) {
-    LOG(ERROR) << "please set input data path";
-    return false;
-  }
-
-  std::ifstream fin(FLAGS_data);
-  std::string line;
-
-  int lineno = 0;
-  while (std::getline(fin, line)) {
-    std::vector<paddle::PaddleTensor> feed_data;
-    if (!ParseLine(line, &feed_data)) {
-      LOG(ERROR) << "Parse line[" << lineno << "] error!";
-    } else {
-      inputs->push_back(std::move(feed_data));
-    }
-  }
-
-  return true;
-}
-
-int main(int argc, char *argv[]) {
-  google::InitGoogleLogging(*argv);
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-
-  if (FLAGS_model_dir.empty()) {
-    LOG(ERROR) << "please set model dir";
-    return -1;
-  }
-
-  paddle::NativeConfig config;
-  config.model_dir = FLAGS_model_dir;
-  if (FLAGS_use_gpu) {
-    config.use_gpu = true;
-    config.fraction_of_gpu_memory = 0.15;
-    config.device = 0;
-  }
-
-  auto predictor = CreatePaddlePredictor(config);
-
-  std::vector<std::vector<paddle::PaddleTensor>> inputs;
-  if (!LoadInputData(&inputs)) {
-    LOG(ERROR) << "load input data error!";
-    return -1;
-  }
-
-  std::vector<paddle::PaddleTensor> fetch;
-  int total_time{0};
-  int num_samples{0};
-  int out_cnt = 0;
-  for (int i = 0; i < FLAGS_repeat; i++) {
-    for (auto feed : inputs) {
-      fetch.clear();
-      auto start = std::chrono::system_clock::now();
-      predictor->Run(feed, &fetch);
-      if (FLAGS_output_prediction && i == 0) {
-	PrintOutputs(fetch, out_cnt);
-      }
-      auto end = std::chrono::system_clock::now();
-      if (!fetch.empty()) {
-        total_time +=
-            std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
-                .count();
-        num_samples += fetch.front().data.length() / FLAGS_num_labels / sizeof(float);
-      }
-    }
-  }
-  
-
-  auto per_sample_ms =
-      static_cast<float>(total_time) / num_samples;
-  LOG(INFO) << "Run on " << num_samples 
-            << " samples over "<< FLAGS_repeat << " times, average latency: " << per_sample_ms
-            << "ms per sample.";
-
-  return 0;
-}
--- a/BERT/model/classifier.py
+++ b/BERT/model/classifier.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Model for classifier."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import paddle.fluid as fluid
-
-from model.bert import BertModel
-
-
-def create_model(args,
-                 pyreader_name,
-                 bert_config,
-                 num_labels,
-                 is_prediction=False):
-    pyreader = fluid.layers.py_reader(
-        capacity=50,
-        shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1]],
-        dtypes=['int64', 'int64', 'int64', 'float32', 'int64'],
-        lod_levels=[0, 0, 0, 0, 0],
-        name=pyreader_name,
-        use_double_buffer=True)
-
-    (src_ids, pos_ids, sent_ids, input_mask,
-     labels) = fluid.layers.read_file(pyreader)
-
-    bert = BertModel(
-        src_ids=src_ids,
-        position_ids=pos_ids,
-        sentence_ids=sent_ids,
-        input_mask=input_mask,
-        config=bert_config,
-        use_fp16=args.use_fp16)
-
-    cls_feats = bert.get_pooled_output()
-    cls_feats = fluid.layers.dropout(
-        x=cls_feats,
-        dropout_prob=0.1,
-        dropout_implementation="upscale_in_train")
-    logits = fluid.layers.fc(
-        input=cls_feats,
-        size=num_labels,
-        param_attr=fluid.ParamAttr(
-            name="cls_out_w",
-            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
-        bias_attr=fluid.ParamAttr(
-            name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
-
-    if is_prediction:
-        probs = fluid.layers.softmax(logits)
-        feed_targets_name = [
-            src_ids.name, pos_ids.name, sent_ids.name, input_mask.name
-        ]
-        return pyreader, probs, feed_targets_name
-
-    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
-        logits=logits, label=labels, return_softmax=True)
-    loss = fluid.layers.mean(x=ce_loss)
-
-    if args.use_fp16 and args.loss_scaling > 1.0:
-        loss *= args.loss_scaling
-
-    num_seqs = fluid.layers.create_tensor(dtype='int64')
-    accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)
-
-    return pyreader, loss, probs, accuracy, num_seqs
--- a/BERT/model/transformer_encoder.py
+++ b/BERT/model/transformer_encoder.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Transformer encoder."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from functools import partial
-import numpy as np
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-
-def multi_head_attention(queries,
-                         keys,
-                         values,
-                         attn_bias,
-                         d_key,
-                         d_value,
-                         d_model,
-                         n_head=1,
-                         dropout_rate=0.,
-                         cache=None,
-                         param_initializer=None,
-                         name='multi_head_att'):
-    """
-    Multi-Head Attention. Note that attn_bias is added to the logit before
-    computing softmax activiation to mask certain selected positions so that
-    they will not considered in attention weights.
-    """
-    keys = queries if keys is None else keys
-    values = keys if values is None else values
-
-    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
-        raise ValueError(
-            "Inputs: quries, keys and values should all be 3-D tensors.")
-
-    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
-        """
-        Add linear projection to queries, keys, and values.
-        """
-        q = layers.fc(input=queries,
-                      size=d_key * n_head,
-                      num_flatten_dims=2,
-                      param_attr=fluid.ParamAttr(
-                          name=name + '_query_fc.w_0',
-                          initializer=param_initializer),
-                      bias_attr=name + '_query_fc.b_0')
-        k = layers.fc(input=keys,
-                      size=d_key * n_head,
-                      num_flatten_dims=2,
-                      param_attr=fluid.ParamAttr(
-                          name=name + '_key_fc.w_0',
-                          initializer=param_initializer),
-                      bias_attr=name + '_key_fc.b_0')
-        v = layers.fc(input=values,
-                      size=d_value * n_head,
-                      num_flatten_dims=2,
-                      param_attr=fluid.ParamAttr(
-                          name=name + '_value_fc.w_0',
-                          initializer=param_initializer),
-                      bias_attr=name + '_value_fc.b_0')
-        return q, k, v
-
-    def __split_heads(x, n_head):
-        """
-        Reshape the last dimension of inpunt tensor x so that it becomes two
-        dimensions and then transpose. Specifically, input a tensor with shape
-        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
-        with shape [bs, n_head, max_sequence_length, hidden_dim].
-        """
-        hidden_size = x.shape[-1]
-        # The value 0 in shape attr means copying the corresponding dimension
-        # size of the input as the output dimension size.
-        reshaped = layers.reshape(
-            x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
-
-        # permuate the dimensions into:
-        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
-        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
-
-    def __combine_heads(x):
-        """
-        Transpose and then reshape the last two dimensions of inpunt tensor x
-        so that it becomes one dimension, which is reverse to __split_heads.
-        """
-        if len(x.shape) == 3: return x
-        if len(x.shape) != 4:
-            raise ValueError("Input(x) should be a 4-D Tensor.")
-
-        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
-        # The value 0 in shape attr means copying the corresponding dimension
-        # size of the input as the output dimension size.
-        return layers.reshape(
-            x=trans_x,
-            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
-            inplace=True)
-
-    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
-        """
-        Scaled Dot-Product Attention
-        """
-        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
-        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
-        if attn_bias:
-            product += attn_bias
-        weights = layers.softmax(product)
-        if dropout_rate:
-            weights = layers.dropout(
-                weights,
-                dropout_prob=dropout_rate,
-                dropout_implementation="upscale_in_train",
-                is_test=False)
-        out = layers.matmul(weights, v)
-        return out
-
-    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
-
-    if cache is not None:  # use cache and concat time steps
-        # Since the inplace reshape in __split_heads changes the shape of k and
-        # v, which is the cache input for next time step, reshape the cache
-        # input from the previous time step first.
-        k = cache["k"] = layers.concat(
-            [layers.reshape(
-                cache["k"], shape=[0, 0, d_model]), k], axis=1)
-        v = cache["v"] = layers.concat(
-            [layers.reshape(
-                cache["v"], shape=[0, 0, d_model]), v], axis=1)
-
-    q = __split_heads(q, n_head)
-    k = __split_heads(k, n_head)
-    v = __split_heads(v, n_head)
-
-    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
-                                                  dropout_rate)
-
-    out = __combine_heads(ctx_multiheads)
-
-    # Project back to the model size.
-    proj_out = layers.fc(input=out,
-                         size=d_model,
-                         num_flatten_dims=2,
-                         param_attr=fluid.ParamAttr(
-                             name=name + '_output_fc.w_0',
-                             initializer=param_initializer),
-                         bias_attr=name + '_output_fc.b_0')
-    return proj_out
-
-
-def positionwise_feed_forward(x,
-                              d_inner_hid,
-                              d_hid,
-                              dropout_rate,
-                              hidden_act,
-                              param_initializer=None,
-                              name='ffn'):
-    """
-    Position-wise Feed-Forward Networks.
-    This module consists of two linear transformations with a ReLU activation
-    in between, which is applied to each position separately and identically.
-    """
-    hidden = layers.fc(input=x,
-                       size=d_inner_hid,
-                       num_flatten_dims=2,
-                       act=hidden_act,
-                       param_attr=fluid.ParamAttr(
-                           name=name + '_fc_0.w_0',
-                           initializer=param_initializer),
-                       bias_attr=name + '_fc_0.b_0')
-    if dropout_rate:
-        hidden = layers.dropout(
-            hidden,
-            dropout_prob=dropout_rate,
-            dropout_implementation="upscale_in_train",
-            is_test=False)
-    out = layers.fc(input=hidden,
-                    size=d_hid,
-                    num_flatten_dims=2,
-                    param_attr=fluid.ParamAttr(
-                        name=name + '_fc_1.w_0', initializer=param_initializer),
-                    bias_attr=name + '_fc_1.b_0')
-    return out
-
-
-def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
-                           name=''):
-    """
-    Add residual connection, layer normalization and droput to the out tensor
-    optionally according to the value of process_cmd.
-    This will be used before or after multi-head attention and position-wise
-    feed-forward networks.
-    """
-    for cmd in process_cmd:
-        if cmd == "a":  # add residual connection
-            out = out + prev_out if prev_out else out
-        elif cmd == "n":  # add layer normalization
-            out_dtype = out.dtype
-            if out_dtype == fluid.core.VarDesc.VarType.FP16:
-                out = layers.cast(x=out, dtype="float32")
-            out = layers.layer_norm(
-                out,
-                begin_norm_axis=len(out.shape) - 1,
-                param_attr=fluid.ParamAttr(
-                    name=name + '_layer_norm_scale',
-                    initializer=fluid.initializer.Constant(1.)),
-                bias_attr=fluid.ParamAttr(
-                    name=name + '_layer_norm_bias',
-                    initializer=fluid.initializer.Constant(0.)))
-            if out_dtype == fluid.core.VarDesc.VarType.FP16:
-                out = layers.cast(x=out, dtype="float16")
-        elif cmd == "d":  # add dropout
-            if dropout_rate:
-                out = layers.dropout(
-                    out,
-                    dropout_prob=dropout_rate,
-                    dropout_implementation="upscale_in_train",
-                    is_test=False)
-    return out
-
-
-pre_process_layer = partial(pre_post_process_layer, None)
-post_process_layer = pre_post_process_layer
-
-
-def encoder_layer(enc_input,
-                  attn_bias,
-                  n_head,
-                  d_key,
-                  d_value,
-                  d_model,
-                  d_inner_hid,
-                  prepostprocess_dropout,
-                  attention_dropout,
-                  relu_dropout,
-                  hidden_act,
-                  preprocess_cmd="n",
-                  postprocess_cmd="da",
-                  param_initializer=None,
-                  name=''):
-    """The encoder layers that can be stacked to form a deep encoder.
-    This module consits of a multi-head (self) attention followed by
-    position-wise feed-forward networks and both the two components companied
-    with the post_process_layer to add residual connection, layer normalization
-    and droput.
-    """
-    attn_output = multi_head_attention(
-        pre_process_layer(
-            enc_input,
-            preprocess_cmd,
-            prepostprocess_dropout,
-            name=name + '_pre_att'),
-        None,
-        None,
-        attn_bias,
-        d_key,
-        d_value,
-        d_model,
-        n_head,
-        attention_dropout,
-        param_initializer=param_initializer,
-        name=name + '_multi_head_att')
-    attn_output = post_process_layer(
-        enc_input,
-        attn_output,
-        postprocess_cmd,
-        prepostprocess_dropout,
-        name=name + '_post_att')
-    ffd_output = positionwise_feed_forward(
-        pre_process_layer(
-            attn_output,
-            preprocess_cmd,
-            prepostprocess_dropout,
-            name=name + '_pre_ffn'),
-        d_inner_hid,
-        d_model,
-        relu_dropout,
-        hidden_act,
-        param_initializer=param_initializer,
-        name=name + '_ffn')
-    return post_process_layer(
-        attn_output,
-        ffd_output,
-        postprocess_cmd,
-        prepostprocess_dropout,
-        name=name + '_post_ffn')
-
-
-def encoder(enc_input,
-            attn_bias,
-            n_layer,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            prepostprocess_dropout,
-            attention_dropout,
-            relu_dropout,
-            hidden_act,
-            preprocess_cmd="n",
-            postprocess_cmd="da",
-            param_initializer=None,
-            name=''):
-    """
-    The encoder is composed of a stack of identical layers returned by calling
-    encoder_layer.
-    """
-    for i in range(n_layer):
-        enc_output = encoder_layer(
-            enc_input,
-            attn_bias,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            prepostprocess_dropout,
-            attention_dropout,
-            relu_dropout,
-            hidden_act,
-            preprocess_cmd,
-            postprocess_cmd,
-            param_initializer=param_initializer,
-            name=name + '_layer_' + str(i))
-        enc_input = enc_output
-    enc_output = pre_process_layer(
-        enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
-
-    return enc_output
--- a/BERT/optimization.py
+++ b/BERT/optimization.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Optimization and learning rate scheduling."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import paddle.fluid as fluid
-from utils.fp16 import create_master_params_grads, master_param_to_train_param
-
-
-def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
-    """ Applies linear warmup of learning rate from 0 and decay to 0."""
-    with fluid.default_main_program()._lr_schedule_guard():
-        lr = fluid.layers.tensor.create_global_var(
-            shape=[1],
-            value=0.0,
-            dtype='float32',
-            persistable=True,
-            name="scheduled_learning_rate")
-
-        global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
-
-        with fluid.layers.control_flow.Switch() as switch:
-            with switch.case(global_step < warmup_steps):
-                warmup_lr = learning_rate * (global_step / warmup_steps)
-                fluid.layers.tensor.assign(warmup_lr, lr)
-            with switch.default():
-                decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
-                    learning_rate=learning_rate,
-                    decay_steps=num_train_steps,
-                    end_learning_rate=0.0,
-                    power=1.0,
-                    cycle=False)
-                fluid.layers.tensor.assign(decayed_lr, lr)
-
-        return lr
-
-
-def optimization(loss,
-                 warmup_steps,
-                 num_train_steps,
-                 learning_rate,
-                 train_program,
-                 startup_prog,
-                 weight_decay,
-                 scheduler='linear_warmup_decay',
-                 use_fp16=False,
-                 loss_scaling=1.0):
-    if warmup_steps > 0:
-        if scheduler == 'noam_decay':
-            scheduled_lr = fluid.layers.learning_rate_scheduler\
-             .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
-                         warmup_steps)
-        elif scheduler == 'linear_warmup_decay':
-            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
-                                               num_train_steps)
-        else:
-            raise ValueError("Unkown learning rate scheduler, should be "
-                             "'noam_decay' or 'linear_warmup_decay'")
-        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
-    else:
-        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
-        scheduled_lr = learning_rate
-
-    clip_norm_thres = 1.0
-    # When using mixed precision training, scale the gradient clip threshold
-    # by loss_scaling
-    if use_fp16 and loss_scaling > 1.0:
-        clip_norm_thres *= loss_scaling
-    fluid.clip.set_gradient_clip(
-        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))
-
-    def exclude_from_weight_decay(name):
-        if name.find("layer_norm") > -1:
-            return True
-        bias_suffix = ["_bias", "_b", ".b_0"]
-        for suffix in bias_suffix:
-            if name.endswith(suffix):
-                return True
-        return False
-
-    param_list = dict()
-
-    if use_fp16:
-        param_grads = optimizer.backward(loss)
-        master_param_grads = create_master_params_grads(
-            param_grads, train_program, startup_prog, loss_scaling)
-
-        for param, _ in master_param_grads:
-            param_list[param.name] = param * 1.0
-            param_list[param.name].stop_gradient = True
-
-        optimizer.apply_gradients(master_param_grads)
-
-        if weight_decay > 0:
-            for param, grad in master_param_grads:
-                if exclude_from_weight_decay(param.name.rstrip(".master")):
-                    continue
-                with param.block.program._optimized_guard(
-                    [param, grad]), fluid.framework.name_scope("weight_decay"):
-                    updated_param = param - param_list[
-                        param.name] * weight_decay * scheduled_lr
-                    fluid.layers.assign(output=param, input=updated_param)
-
-        master_param_to_train_param(master_param_grads, param_grads,
-                                    train_program)
-
-    else:
-        for param in train_program.global_block().all_parameters():
-            param_list[param.name] = param * 1.0
-            param_list[param.name].stop_gradient = True
-
-        _, param_grads = optimizer.minimize(loss)
-
-        if weight_decay > 0:
-            for param, grad in param_grads:
-                if exclude_from_weight_decay(param.name):
-                    continue
-                with param.block.program._optimized_guard(
-                    [param, grad]), fluid.framework.name_scope("weight_decay"):
-                    updated_param = param - param_list[
-                        param.name] * weight_decay * scheduled_lr
-                    fluid.layers.assign(output=param, input=updated_param)
-
-    return scheduled_lr
--- a/BERT/reader/cls.py
+++ b/BERT/reader/cls.py
--- a/BERT/reader/pretraining.py
+++ b/BERT/reader/pretraining.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from __future__ import division
-
-import os
-import numpy as np
-import types
-import gzip
-import logging
-import re
-import six
-import collections
-import tokenization
-
-import paddle
-import paddle.fluid as fluid
-
-from batching import prepare_batch_data
-
-
-class DataReader(object):
-    def __init__(self,
-                 data_dir,
-                 vocab_path,
-                 batch_size=4096,
-                 in_tokens=True,
-                 max_seq_len=512,
-                 shuffle_files=True,
-                 epoch=100,
-                 voc_size=0,
-                 is_test=False,
-                 generate_neg_sample=False):
-
-        self.vocab = self.load_vocab(vocab_path)
-        self.data_dir = data_dir
-        self.batch_size = batch_size
-        self.in_tokens = in_tokens
-        self.shuffle_files = shuffle_files
-        self.epoch = epoch
-        self.current_epoch = 0
-        self.current_file_index = 0
-        self.total_file = 0
-        self.current_file = None
-        self.voc_size = voc_size
-        self.max_seq_len = max_seq_len
-        self.pad_id = self.vocab["[PAD]"]
-        self.cls_id = self.vocab["[CLS]"]
-        self.sep_id = self.vocab["[SEP]"]
-        self.mask_id = self.vocab["[MASK]"]
-        self.is_test = is_test
-        self.generate_neg_sample = generate_neg_sample
-        if self.in_tokens:
-            assert self.batch_size >= self.max_seq_len, "The number of " \
-                   "tokens in batch should not be smaller than max seq length."
-
-        if self.is_test:
-            self.epoch = 1
-            self.shuffle_files = False
-
-    def get_progress(self):
-        """return current progress of traning data
-        """
-        return self.current_epoch, self.current_file_index, self.total_file, self.current_file
-
-    def parse_line(self, line, max_seq_len=512):
-        """ parse one line to token_ids, sentence_ids, pos_ids, label
-        """
-        line = line.strip().decode().split(";")
-        assert len(line) == 4, "One sample must have 4 fields!"
-        (token_ids, sent_ids, pos_ids, label) = line
-        token_ids = [int(token) for token in token_ids.split(" ")]
-        sent_ids = [int(token) for token in sent_ids.split(" ")]
-        pos_ids = [int(token) for token in pos_ids.split(" ")]
-        assert len(token_ids) == len(sent_ids) == len(
-            pos_ids
-        ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids)"
-        label = int(label)
-        if len(token_ids) > max_seq_len:
-            return None
-        return [token_ids, sent_ids, pos_ids, label]
-
-    def read_file(self, file):
-        assert file.endswith('.gz'), "[ERROR] %s is not a gzip file" % file
-        file_path = self.data_dir + "/" + file
-        with gzip.open(file_path, "rb") as f:
-            for line in f:
-                parsed_line = self.parse_line(
-                    line, max_seq_len=self.max_seq_len)
-                if parsed_line is None:
-                    continue
-                yield parsed_line
-
-    def convert_to_unicode(self, text):
-        """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-        if six.PY3:
-            if isinstance(text, str):
-                return text
-            elif isinstance(text, bytes):
-                return text.decode("utf-8", "ignore")
-            else:
-                raise ValueError("Unsupported string type: %s" % (type(text)))
-        elif six.PY2:
-            if isinstance(text, str):
-                return text.decode("utf-8", "ignore")
-            elif isinstance(text, unicode):
-                return text
-            else:
-                raise ValueError("Unsupported string type: %s" % (type(text)))
-        else:
-            raise ValueError("Not running on Python2 or Python 3?")
-
-    def load_vocab(self, vocab_file):
-        """Loads a vocabulary file into a dictionary."""
-        vocab = collections.OrderedDict()
-        fin = open(vocab_file)
-        for num, line in enumerate(fin):
-            items = self.convert_to_unicode(line.strip()).split("\t")
-            if len(items) > 2:
-                break
-            token = items[0]
-            index = items[1] if len(items) == 2 else num
-            token = token.strip()
-            vocab[token] = int(index)
-        return vocab
-
-    def random_pair_neg_samples(self, pos_samples):
-        """ randomly generate negtive samples using pos_samples
-
-            Args:
-                pos_samples: list of positive samples
-            
-            Returns:
-                neg_samples: list of negtive samples
-        """
-        np.random.shuffle(pos_samples)
-        num_sample = len(pos_samples)
-        neg_samples = []
-        miss_num = 0
-
-        for i in range(num_sample):
-            pair_index = (i + 1) % num_sample
-            origin_src_ids = pos_samples[i][0]
-            origin_sep_index = origin_src_ids.index(2)
-            pair_src_ids = pos_samples[pair_index][0]
-            pair_sep_index = pair_src_ids.index(2)
-
-            src_ids = origin_src_ids[:origin_sep_index + 1] + pair_src_ids[
-                pair_sep_index + 1:]
-            if len(src_ids) >= self.max_seq_len:
-                miss_num += 1
-                continue
-            sent_ids = [0] * len(origin_src_ids[:origin_sep_index + 1]) + [
-                1
-            ] * len(pair_src_ids[pair_sep_index + 1:])
-            pos_ids = list(range(len(src_ids)))
-            neg_sample = [src_ids, sent_ids, pos_ids, 0]
-            assert len(src_ids) == len(sent_ids) == len(
-                pos_ids
-            ), "[ERROR]len(src_id) == lne(sent_id) == len(pos_id) must be True"
-            neg_samples.append(neg_sample)
-        return neg_samples, miss_num
-
-    def mixin_negtive_samples(self, pos_sample_generator, buffer=1000):
-        """ 1. generate negtive samples by randomly group sentence_1 and sentence_2 of positive samples
-            2. combine negtive samples and positive samples
-            
-            Args:
-                pos_sample_generator: a generator producing a parsed positive sample, which is a list: [token_ids, sent_ids, pos_ids, 1]
-
-            Returns:
-                sample: one sample from shuffled positive samples and negtive samples
-        """
-        pos_samples = []
-        num_total_miss = 0
-        pos_sample_num = 0
-        try:
-            while True:
-                while len(pos_samples) < buffer:
-                    pos_sample = next(pos_sample_generator)
-                    label = pos_sample[3]
-                    assert label == 1, "positive sample's label must be 1"
-                    pos_samples.append(pos_sample)
-                    pos_sample_num += 1
-
-                neg_samples, miss_num = self.random_pair_neg_samples(
-                    pos_samples)
-                num_total_miss += miss_num
-                samples = pos_samples + neg_samples
-                pos_samples = []
-                np.random.shuffle(samples)
-                for sample in samples:
-                    yield sample
-        except StopIteration:
-            print("stopiteration: reach end of file")
-            if len(pos_samples) == 1:
-                yield pos_samples[0]
-            elif len(pos_samples) == 0:
-                yield None
-            else:
-                neg_samples, miss_num = self.random_pair_neg_samples(
-                    pos_samples)
-                num_total_miss += miss_num
-                samples = pos_samples + neg_samples
-                pos_samples = []
-                np.random.shuffle(samples)
-                for sample in samples:
-                    yield sample
-            print("miss_num:%d\tideal_total_sample_num:%d\tmiss_rate:%f" %
-                  (num_total_miss, pos_sample_num * 2,
-                   num_total_miss / (pos_sample_num * 2)))
-
-    def data_generator(self):
-        """
-        data_generator
-        """
-        files = os.listdir(self.data_dir)
-        self.total_file = len(files)
-        assert self.total_file > 0, "[Error] data_dir is empty"
-
-        def wrapper():
-            def reader():
-                for epoch in range(self.epoch):
-                    self.current_epoch = epoch + 1
-                    if self.shuffle_files:
-                        np.random.shuffle(files)
-                    for index, file in enumerate(files):
-                        self.current_file_index = index + 1
-                        self.current_file = file
-                        sample_generator = self.read_file(file)
-                        if not self.is_test and self.generate_neg_sample:
-                            sample_generator = self.mixin_negtive_samples(
-                                sample_generator)
-                        for sample in sample_generator:
-                            if sample is None:
-                                continue
-                            yield sample
-
-            def batch_reader(reader, batch_size, in_tokens):
-                batch, total_token_num, max_len = [], 0, 0
-                for parsed_line in reader():
-                    token_ids, sent_ids, pos_ids, label = parsed_line
-                    max_len = max(max_len, len(token_ids))
-                    if in_tokens:
-                        to_append = (len(batch) + 1) * max_len <= batch_size
-                    else:
-                        to_append = len(batch) < batch_size
-                    if to_append:
-                        batch.append(parsed_line)
-                        total_token_num += len(token_ids)
-                    else:
-                        yield batch, total_token_num
-                        batch, total_token_num, max_len = [parsed_line], len(
-                            token_ids), len(token_ids)
-
-                if len(batch) > 0:
-                    yield batch, total_token_num
-
-            for batch_data, total_token_num in batch_reader(
-                    reader, self.batch_size, self.in_tokens):
-                yield prepare_batch_data(
-                    batch_data,
-                    total_token_num,
-                    voc_size=self.voc_size,
-                    pad_id=self.pad_id,
-                    cls_id=self.cls_id,
-                    sep_id=self.sep_id,
-                    mask_id=self.mask_id,
-                    return_input_mask=True,
-                    return_max_len=False,
-                    return_num_token=False)
-
-        return wrapper
-
-
-if __name__ == "__main__":
-    pass
--- a/BERT/reader/squad.py
+++ b/BERT/reader/squad.py
--- a/BERT/run_squad.py
+++ b/BERT/run_squad.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Finetuning on SQuAD."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import collections
-import multiprocessing
-import os
-import time
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-
-from reader.squad import DataProcessor, write_predictions
-from model.bert import BertConfig, BertModel
-from utils.args import ArgumentGroup, print_arguments, check_cuda
-from optimization import optimization
-from utils.init import init_pretraining_params, init_checkpoint
-
-# yapf: disable
-parser = argparse.ArgumentParser(__doc__)
-model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
-model_g.add_arg("bert_config_path",         str,  None,           "Path to the json file for bert model config.")
-model_g.add_arg("init_checkpoint",          str,  None,           "Init checkpoint to resume training from.")
-model_g.add_arg("init_pretraining_params",  str,  None,
-                "Init pre-training params which preforms fine-tuning from. If the "
-                 "arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
-model_g.add_arg("checkpoints",              str,  "checkpoints",  "Path to save checkpoints.")
-
-train_g = ArgumentGroup(parser, "training", "training options.")
-train_g.add_arg("epoch",             int,    3,      "Number of epoches for fine-tuning.")
-train_g.add_arg("learning_rate",     float,  5e-5,   "Learning rate used to train with warmup.")
-train_g.add_arg("lr_scheduler",      str,    "linear_warmup_decay",
-                "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
-train_g.add_arg("weight_decay",      float,  0.01,   "Weight decay rate for L2 regularizer.")
-train_g.add_arg("warmup_proportion", float,  0.1,
-                "Proportion of training steps to perform linear learning rate warmup for.")
-train_g.add_arg("save_steps",        int,    1000,   "The steps interval to save checkpoints.")
-train_g.add_arg("use_fp16",          bool,   False,  "Whether to use fp16 mixed precision training.")
-train_g.add_arg("loss_scaling",      float,  1.0,
-                "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
-
-log_g = ArgumentGroup(parser, "logging", "logging related.")
-log_g.add_arg("skip_steps",          int,    10,    "The steps interval to print loss.")
-log_g.add_arg("verbose",             bool,   False, "Whether to output verbose log.")
-
-data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
-data_g.add_arg("train_file",                str,   None,  "SQuAD json for training. E.g., train-v1.1.json.")
-data_g.add_arg("predict_file",              str,   None,  "SQuAD json for predictions. E.g. dev-v1.1.json or test-v1.1.json.")
-data_g.add_arg("vocab_path",                str,   None,  "Vocabulary path.")
-data_g.add_arg("version_2_with_negative",   bool,  False,
-               "If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
-data_g.add_arg("max_seq_len",               int,   512,   "Number of words of the longest seqence.")
-data_g.add_arg("max_query_length",          int,   64,    "Max query length.")
-data_g.add_arg("max_answer_length",         int,   30,    "Max answer length.")
-data_g.add_arg("batch_size",                int,   12,    "Total examples' number in batch for training. see also --in_tokens.")
-data_g.add_arg("in_tokens",                 bool,  False,
-               "If set, the batch size will be the maximum number of tokens in one batch. "
-               "Otherwise, it will be the maximum number of examples in one batch.")
-data_g.add_arg("do_lower_case",             bool,  True,
-               "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
-data_g.add_arg("doc_stride",                int,   128,
-               "When splitting up a long document into chunks, how much stride to take between chunks.")
-data_g.add_arg("n_best_size",               int,   20,
-               "The total number of n-best predictions to generate in the nbest_predictions.json output file.")
-data_g.add_arg("null_score_diff_threshold", float, 0.0,
-               "If null_score - best_non_null is greater than the threshold predict null.")
-data_g.add_arg("random_seed",               int,   0,      "Random seed.")
-
-run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
-run_type_g.add_arg("use_cuda",                     bool,   True,  "If set, use GPU for training.")
-run_type_g.add_arg("use_fast_executor",            bool,   False, "If set, use fast parallel executor (in experiment).")
-run_type_g.add_arg("num_iteration_per_drop_scope", int,    1,     "Ihe iteration intervals to clean up temporary variables.")
-run_type_g.add_arg("do_train",                     bool,   True,  "Whether to perform training.")
-run_type_g.add_arg("do_predict",                   bool,   True,  "Whether to perform prediction.")
-
-args = parser.parse_args()
-# yapf: enable.
-
-def create_model(pyreader_name, bert_config, is_training=False):
-    if is_training:
-        pyreader = fluid.layers.py_reader(
-            capacity=50,
-            shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                    [-1, args.max_seq_len, 1],
-                    [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
-            dtypes=[
-                'int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
-            lod_levels=[0, 0, 0, 0, 0, 0],
-            name=pyreader_name,
-            use_double_buffer=True)
-        (src_ids, pos_ids, sent_ids, input_mask, start_positions,
-         end_positions) = fluid.layers.read_file(pyreader)
-    else:
-        pyreader = fluid.layers.py_reader(
-            capacity=50,
-            shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                    [-1, args.max_seq_len, 1],
-                    [-1, args.max_seq_len, 1], [-1, 1]],
-            dtypes=['int64', 'int64', 'int64', 'float32', 'int64'],
-            lod_levels=[0, 0, 0, 0, 0],
-            name=pyreader_name,
-            use_double_buffer=True)
-        (src_ids, pos_ids, sent_ids, input_mask, unique_id) = fluid.layers.read_file(pyreader)
-
-    bert = BertModel(
-        src_ids=src_ids,
-        position_ids=pos_ids,
-        sentence_ids=sent_ids,
-        input_mask=input_mask,
-        config=bert_config,
-        use_fp16=args.use_fp16)
-
-    enc_out = bert.get_sequence_output()
-
-    logits = fluid.layers.fc(
-        input=enc_out,
-        size=2,
-        num_flatten_dims=2,
-        param_attr=fluid.ParamAttr(
-            name="cls_squad_out_w",
-            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
-        bias_attr=fluid.ParamAttr(
-            name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.)))
-
-    logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
-    start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
-
-    batch_ones = fluid.layers.fill_constant_batch_size_like(
-        input=start_logits, dtype='int64', shape=[1], value=1)
-    num_seqs = fluid.layers.reduce_sum(input=batch_ones)
-
-    if is_training:
-
-        def compute_loss(logits, positions):
-            loss = fluid.layers.softmax_with_cross_entropy(
-                logits=logits, label=positions)
-            loss = fluid.layers.mean(x=loss)
-            return loss
-
-        start_loss = compute_loss(start_logits, start_positions)
-        end_loss = compute_loss(end_logits, end_positions)
-        total_loss = (start_loss + end_loss) / 2.0
-        if args.use_fp16 and args.loss_scaling > 1.0:
-            total_loss = total_loss * args.loss_scaling
-
-        return pyreader, total_loss, num_seqs
-    else:
-        return pyreader, unique_id, start_logits, end_logits, num_seqs
-
-
-RawResult = collections.namedtuple("RawResult",
-                                   ["unique_id", "start_logits", "end_logits"])
-
-
-def predict(test_exe, test_program, test_pyreader, fetch_list, processor):
-    if not os.path.exists(args.checkpoints):
-        os.makedirs(args.checkpoints)
-    output_prediction_file = os.path.join(args.checkpoints, "predictions.json")
-    output_nbest_file = os.path.join(args.checkpoints, "nbest_predictions.json")
-    output_null_log_odds_file = os.path.join(args.checkpoints, "null_odds.json")
-
-    test_pyreader.start()
-    all_results = []
-    time_begin = time.time()
-    while True:
-        try:
-            np_unique_ids, np_start_logits, np_end_logits, np_num_seqs = test_exe.run(
-                fetch_list=fetch_list, program=test_program)
-            for idx in range(np_unique_ids.shape[0]):
-                if len(all_results) % 1000 == 0:
-                    print("Processing example: %d" % len(all_results))
-                unique_id = int(np_unique_ids[idx])
-                start_logits = [float(x) for x in np_start_logits[idx].flat]
-                end_logits = [float(x) for x in np_end_logits[idx].flat]
-                all_results.append(
-                    RawResult(
-                        unique_id=unique_id,
-                        start_logits=start_logits,
-                        end_logits=end_logits))
-        except fluid.core.EOFException:
-            test_pyreader.reset()
-            break
-    time_end = time.time()
-
-    features = processor.get_features(
-        processor.predict_examples, is_training=False)
-    write_predictions(processor.predict_examples, features, all_results,
-                      args.n_best_size, args.max_answer_length,
-                      args.do_lower_case, output_prediction_file,
-                      output_nbest_file, output_null_log_odds_file,
-                      args.version_2_with_negative,
-                      args.null_score_diff_threshold, args.verbose)
-
-
-def train(args):
-    bert_config = BertConfig(args.bert_config_path)
-    bert_config.print_config()
-
-    if not (args.do_train or args.do_predict):
-        raise ValueError("For args `do_train` and `do_predict`, at "
-                         "least one of them must be True.")
-
-    if args.use_cuda:
-        place = fluid.CUDAPlace(0)
-        dev_count = fluid.core.get_cuda_device_count()
-    else:
-        place = fluid.CPUPlace()
-        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-    exe = fluid.Executor(place)
-
-    processor = DataProcessor(
-        vocab_path=args.vocab_path,
-        do_lower_case=args.do_lower_case,
-        max_seq_length=args.max_seq_len,
-        in_tokens=args.in_tokens,
-        doc_stride=args.doc_stride,
-        max_query_length=args.max_query_length)
-
-    startup_prog = fluid.Program()
-    if args.random_seed is not None:
-        startup_prog.random_seed = args.random_seed
-
-    if args.do_train:
-        train_data_generator = processor.data_generator(
-            data_path=args.train_file,
-            batch_size=args.batch_size,
-            phase='train',
-            shuffle=True,
-            dev_count=dev_count,
-            version_2_with_negative=args.version_2_with_negative,
-            epoch=args.epoch)
-
-        num_train_examples = processor.get_num_examples(phase='train')
-        if args.in_tokens:
-            max_train_steps = args.epoch * num_train_examples // (
-                args.batch_size // args.max_seq_len) // dev_count
-        else:
-            max_train_steps = args.epoch * num_train_examples // (
-                args.batch_size) // dev_count
-        warmup_steps = int(max_train_steps * args.warmup_proportion)
-        print("Device count: %d" % dev_count)
-        print("Num train examples: %d" % num_train_examples)
-        print("Max train steps: %d" % max_train_steps)
-        print("Num warmup steps: %d" % warmup_steps)
-
-        train_program = fluid.Program()
-        with fluid.program_guard(train_program, startup_prog):
-            with fluid.unique_name.guard():
-                train_pyreader, loss, num_seqs = create_model(
-                    pyreader_name='train_reader',
-                    bert_config=bert_config,
-                    is_training=True)
-
-                scheduled_lr = optimization(
-                    loss=loss,
-                    warmup_steps=warmup_steps,
-                    num_train_steps=max_train_steps,
-                    learning_rate=args.learning_rate,
-                    train_program=train_program,
-                    startup_prog=startup_prog,
-                    weight_decay=args.weight_decay,
-                    scheduler=args.lr_scheduler,
-                    use_fp16=args.use_fp16,
-                    loss_scaling=args.loss_scaling)
-
-                fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name])
-
-        if args.verbose:
-            if args.in_tokens:
-                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-                    program=train_program,
-                    batch_size=args.batch_size // args.max_seq_len)
-            else:
-                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-                    program=train_program, batch_size=args.batch_size)
-            print("Theoretical memory usage in training:  %.3f - %.3f %s" %
-                  (lower_mem, upper_mem, unit))
-
-    if args.do_predict:
-        test_prog = fluid.Program()
-        with fluid.program_guard(test_prog, startup_prog):
-            with fluid.unique_name.guard():
-                test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model(
-                    pyreader_name='test_reader',
-                    bert_config=bert_config,
-                    is_training=False)
-
-                fluid.memory_optimize(test_prog, skip_opt_set=[unique_ids.name,
-                    start_logits.name, end_logits.name, num_seqs.name])
-
-        test_prog = test_prog.clone(for_test=True)
-
-    exe.run(startup_prog)
-
-    if args.do_train:
-        if args.init_checkpoint and args.init_pretraining_params:
-            print(
-                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
-                "both are set! Only arg 'init_checkpoint' is made valid.")
-        if args.init_checkpoint:
-            init_checkpoint(
-                exe,
-                args.init_checkpoint,
-                main_program=startup_prog,
-                use_fp16=args.use_fp16)
-        elif args.init_pretraining_params:
-            init_pretraining_params(
-                exe,
-                args.init_pretraining_params,
-                main_program=startup_prog,
-                use_fp16=args.use_fp16)
-    elif args.do_predict:
-        if not args.init_checkpoint:
-            raise ValueError("args 'init_checkpoint' should be set if"
-                             "only doing prediction!")
-        init_checkpoint(
-            exe,
-            args.init_checkpoint,
-            main_program=startup_prog,
-            use_fp16=args.use_fp16)
-
-    if args.do_train:
-        exec_strategy = fluid.ExecutionStrategy()
-        exec_strategy.use_experimental_executor = args.use_fast_executor
-        exec_strategy.num_threads = dev_count
-        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
-
-        train_exe = fluid.ParallelExecutor(
-            use_cuda=args.use_cuda,
-            loss_name=loss.name,
-            exec_strategy=exec_strategy,
-            main_program=train_program)
-
-        train_pyreader.decorate_tensor_provider(train_data_generator)
-
-        train_pyreader.start()
-        steps = 0
-        total_cost, total_num_seqs = [], []
-        time_begin = time.time()
-        while steps < max_train_steps:
-            try:
-                steps += 1
-                if steps % args.skip_steps == 0:
-                    if warmup_steps <= 0:
-                        fetch_list = [loss.name, num_seqs.name]
-                    else:
-                        fetch_list = [
-                            loss.name, scheduled_lr.name, num_seqs.name
-                        ]
-                else:
-                    fetch_list = []
-
-                outputs = train_exe.run(fetch_list=fetch_list)
-
-                if steps % args.skip_steps == 0:
-                    if warmup_steps <= 0:
-                        np_loss, np_num_seqs = outputs
-                    else:
-                        np_loss, np_lr, np_num_seqs = outputs
-                    total_cost.extend(np_loss * np_num_seqs)
-                    total_num_seqs.extend(np_num_seqs)
-
-                    if args.verbose:
-                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
-                        )
-                        verbose += "learning rate: %f" % (
-                            np_lr[0]
-                            if warmup_steps > 0 else args.learning_rate)
-                        print(verbose)
-
-                    time_end = time.time()
-                    used_time = time_end - time_begin
-                    current_example, epoch = processor.get_train_progress()
-
-                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
-                          "speed: %f steps/s" %
-                          (epoch, current_example, num_train_examples, steps,
-                           np.sum(total_cost) / np.sum(total_num_seqs),
-                           args.skip_steps / used_time))
-                    total_cost, total_num_seqs = [], []
-                    time_begin = time.time()
-
-                if steps % args.save_steps == 0 or steps == max_train_steps:
-                    save_path = os.path.join(args.checkpoints,
-                                             "step_" + str(steps))
-                    fluid.io.save_persistables(exe, save_path, train_program)
-            except fluid.core.EOFException:
-                save_path = os.path.join(args.checkpoints,
-                                         "step_" + str(steps) + "_final")
-                fluid.io.save_persistables(exe, save_path, train_program)
-                train_pyreader.reset()
-                break
-
-    if args.do_predict:
-        test_pyreader.decorate_tensor_provider(
-            processor.data_generator(
-                data_path=args.predict_file,
-                batch_size=args.batch_size,
-                phase='predict',
-                shuffle=False,
-                dev_count=1,
-                epoch=1))
-
-        predict(exe, test_prog, test_pyreader, [
-            unique_ids.name, start_logits.name, end_logits.name, num_seqs.name
-        ], processor)
-
-
-if __name__ == '__main__':
-    print_arguments(args)
-    check_cuda(args.use_cuda)
-    train(args)
--- a/BERT/test_local_dist.sh
+++ b/BERT/test_local_dist.sh
-#!/bin/bash
-set -xe
-
-# Paddle debug envs
-export GLOG_v=1
-export GLOG_logtostderr=1
-
-# Unset proxy
-unset https_proxy http_proxy
-
-# NCCL debug envs
-export NCCL_P2P_DISABLE=1
-export NCCL_DEBUG=INFO
-# Comment it if your nccl support IB
-export NCCL_IB_DISABLE=1
-
-# Add your nodes endpoints here.
-export worker_endpoints=127.0.0.1:9184,127.0.0.1:9185
-export current_endpoint=127.0.0.1:9184
-export CUDA_VISIBLE_DEVICES=0
-
-./train.sh -local n > 0.log 2>&1 &
-
-# Add your nodes endpoints here.
-export current_endpoint=127.0.0.1:9185
-export CUDA_VISIBLE_DEVICES=1
-
-./train.sh -local n > 1.log 2>&1 &
--- a/BERT/tokenization.py
+++ b/BERT/tokenization.py
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import unicodedata
-import six
-
-
-def convert_to_unicode(text):
-    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-    if six.PY3:
-        if isinstance(text, str):
-            return text
-        elif isinstance(text, bytes):
-            return text.decode("utf-8", "ignore")
-        else:
-            raise ValueError("Unsupported string type: %s" % (type(text)))
-    elif six.PY2:
-        if isinstance(text, str):
-            return text.decode("utf-8", "ignore")
-        elif isinstance(text, unicode):
-            return text
-        else:
-            raise ValueError("Unsupported string type: %s" % (type(text)))
-    else:
-        raise ValueError("Not running on Python2 or Python 3?")
-
-
-def printable_text(text):
-    """Returns text encoded in a way suitable for print or `tf.logging`."""
-
-    # These functions want `str` for both Python2 and Python3, but in one case
-    # it's a Unicode string and in the other it's a byte string.
-    if six.PY3:
-        if isinstance(text, str):
-            return text
-        elif isinstance(text, bytes):
-            return text.decode("utf-8", "ignore")
-        else:
-            raise ValueError("Unsupported string type: %s" % (type(text)))
-    elif six.PY2:
-        if isinstance(text, str):
-            return text
-        elif isinstance(text, unicode):
-            return text.encode("utf-8")
-        else:
-            raise ValueError("Unsupported string type: %s" % (type(text)))
-    else:
-        raise ValueError("Not running on Python2 or Python 3?")
-
-
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    fin = open(vocab_file)
-    for num, line in enumerate(fin):
-        items = convert_to_unicode(line.strip()).split("\t")
-        if len(items) > 2:
-            break
-        token = items[0]
-        index = items[1] if len(items) == 2 else num
-        token = token.strip()
-        vocab[token] = int(index)
-    return vocab
-
-
-def convert_by_vocab(vocab, items):
-    """Converts a sequence of [tokens|ids] using the vocab."""
-    output = []
-    for item in items:
-        output.append(vocab[item])
-    return output
-
-
-def convert_tokens_to_ids(vocab, tokens):
-    return convert_by_vocab(vocab, tokens)
-
-
-def convert_ids_to_tokens(inv_vocab, ids):
-    return convert_by_vocab(inv_vocab, ids)
-
-
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a peice of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-
-
-class FullTokenizer(object):
-    """Runs end-to-end tokenziation."""
-
-    def __init__(self, vocab_file, do_lower_case=True):
-        self.vocab = load_vocab(vocab_file)
-        self.inv_vocab = {v: k for k, v in self.vocab.items()}
-        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
-
-    def tokenize(self, text):
-        split_tokens = []
-        for token in self.basic_tokenizer.tokenize(text):
-            for sub_token in self.wordpiece_tokenizer.tokenize(token):
-                split_tokens.append(sub_token)
-
-        return split_tokens
-
-    def convert_tokens_to_ids(self, tokens):
-        return convert_by_vocab(self.vocab, tokens)
-
-    def convert_ids_to_tokens(self, ids):
-        return convert_by_vocab(self.inv_vocab, ids)
-
-
-class CharTokenizer(object):
-    """Runs end-to-end tokenziation."""
-
-    def __init__(self, vocab_file, do_lower_case=True):
-        self.vocab = load_vocab(vocab_file)
-        self.inv_vocab = {v: k for k, v in self.vocab.items()}
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
-
-    def tokenize(self, text):
-        split_tokens = []
-        for token in text.lower().split(" "):
-            for sub_token in self.wordpiece_tokenizer.tokenize(token):
-                split_tokens.append(sub_token)
-
-        return split_tokens
-
-    def convert_tokens_to_ids(self, tokens):
-        return convert_by_vocab(self.vocab, tokens)
-
-    def convert_ids_to_tokens(self, ids):
-        return convert_by_vocab(self.inv_vocab, ids)
-
-
-class BasicTokenizer(object):
-    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
-
-    def __init__(self, do_lower_case=True):
-        """Constructs a BasicTokenizer.
-
-        Args:
-            do_lower_case: Whether to lower case the input.
-        """
-        self.do_lower_case = do_lower_case
-
-    def tokenize(self, text):
-        """Tokenizes a piece of text."""
-        text = convert_to_unicode(text)
-        text = self._clean_text(text)
-
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        text = self._tokenize_chinese_chars(text)
-
-        orig_tokens = whitespace_tokenize(text)
-        split_tokens = []
-        for token in orig_tokens:
-            if self.do_lower_case:
-                token = token.lower()
-                token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text):
-        """Splits punctuation on a piece of text."""
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #     https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
-            (cp >= 0x3400 and cp <= 0x4DBF) or  #
-            (cp >= 0x20000 and cp <= 0x2A6DF) or  #
-            (cp >= 0x2A700 and cp <= 0x2B73F) or  #
-            (cp >= 0x2B740 and cp <= 0x2B81F) or  #
-            (cp >= 0x2B820 and cp <= 0x2CEAF) or
-            (cp >= 0xF900 and cp <= 0xFAFF) or  #
-            (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xfffd or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-class WordpieceTokenizer(object):
-    """Runs WordPiece tokenziation."""
-
-    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-
-    def tokenize(self, text):
-        """Tokenizes a piece of text into its word pieces.
-
-        This uses a greedy longest-match-first algorithm to perform tokenization
-        using the given vocabulary.
-
-        For example:
-            input = "unaffable"
-            output = ["un", "##aff", "##able"]
-
-        Args:
-            text: A single token or whitespace separated tokens. This should have
-                already been passed through `BasicTokenizer.
-
-        Returns:
-            A list of wordpiece tokens.
-        """
-
-        text = convert_to_unicode(text)
-
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
-
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
-
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
-
-
-def _is_whitespace(char):
-    """Checks whether `chars` is a whitespace character."""
-    # \t, \n, and \r are technically contorl characters but we treat them
-    # as whitespace since they are generally considered as such.
-    if char == " " or char == "\t" or char == "\n" or char == "\r":
-        return True
-    cat = unicodedata.category(char)
-    if cat == "Zs":
-        return True
-    return False
-
-
-def _is_control(char):
-    """Checks whether `chars` is a control character."""
-    # These are technically control characters but we count them as whitespace
-    # characters.
-    if char == "\t" or char == "\n" or char == "\r":
-        return False
-    cat = unicodedata.category(char)
-    if cat.startswith("C"):
-        return True
-    return False
-
-
-def _is_punctuation(char):
-    """Checks whether `chars` is a punctuation character."""
-    cp = ord(char)
-    # We treat all non-letter/number ASCII as punctuation.
-    # Characters such as "^", "$", and "`" are not in the Unicode
-    # Punctuation class but we treat them as punctuation anyways, for
-    # consistency.
-    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
-        (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
-        return True
-    cat = unicodedata.category(char)
-    if cat.startswith("P"):
-        return True
-    return False
--- a/BERT/train.py
+++ b/BERT/train.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT pretraining."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import time
-import argparse
-import numpy as np
-import multiprocessing
-
-import paddle
-import paddle.fluid as fluid
-
-from reader.pretraining import DataReader
-from model.bert import BertModel, BertConfig
-from optimization import optimization
-from utils.args import ArgumentGroup, print_arguments, check_cuda
-from utils.init import init_checkpoint, init_pretraining_params
-
-# yapf: disable
-parser = argparse.ArgumentParser(__doc__)
-model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
-model_g.add_arg("bert_config_path",      str,  "./config/bert_config.json",  "Path to the json file for bert model config.")
-model_g.add_arg("init_checkpoint",       str,  None,                         "Init checkpoint to resume training from.")
-model_g.add_arg("checkpoints",           str,  "checkpoints",                "Path to save checkpoints.")
-model_g.add_arg("weight_sharing",        bool, True,                         "If set, share weights between word embedding and masked lm.")
-model_g.add_arg("generate_neg_sample",   bool, True,                         "If set, randomly generate negtive samples by positive samples.")
-
-train_g = ArgumentGroup(parser, "training", "training options.")
-train_g.add_arg("epoch",             int,    100,     "Number of epoches for training.")
-train_g.add_arg("learning_rate",     float,  0.0001,  "Learning rate used to train with warmup.")
-train_g.add_arg("lr_scheduler",      str,    "linear_warmup_decay",
-                "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
-train_g.add_arg("weight_decay",      float,  0.01,    "Weight decay rate for L2 regularizer.")
-train_g.add_arg("num_train_steps",   int,    1000000, "Total steps to perform pretraining.")
-train_g.add_arg("warmup_steps",      int,    4000,    "Total steps to perform warmup when pretraining.")
-train_g.add_arg("save_steps",        int,    10000,   "The steps interval to save checkpoints.")
-train_g.add_arg("validation_steps",  int,    1000,    "The steps interval to evaluate model performance.")
-train_g.add_arg("use_fp16",          bool,   False,   "Whether to use fp16 mixed precision training.")
-train_g.add_arg("loss_scaling",      float,  1.0,
-                "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
-
-log_g = ArgumentGroup(parser,     "logging", "logging related.")
-log_g.add_arg("skip_steps",          int,    10,    "The steps interval to print loss.")
-log_g.add_arg("verbose",             bool,   False, "Whether to output verbose log.")
-
-data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
-data_g.add_arg("data_dir",            str,  "./data/train/",       "Path to training data.")
-data_g.add_arg("validation_set_dir",  str,  "./data/validation/",  "Path to validation data.")
-data_g.add_arg("test_set_dir",        str,  None,                  "Path to test data.")
-data_g.add_arg("vocab_path",          str,  "./config/vocab.txt",  "Vocabulary path.")
-data_g.add_arg("max_seq_len",         int,  512,                   "Tokens' number of the longest seqence allowed.")
-data_g.add_arg("batch_size",          int,  8192,
-               "The total number of examples in one batch for training, see also --in_tokens.")
-data_g.add_arg("in_tokens",           bool, True,
-               "If set, the batch size will be the maximum number of tokens in one batch. "
-               "Otherwise, it will be the maximum number of examples in one batch.")
-
-run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
-run_type_g.add_arg("is_distributed",               bool,   False,  "If set, then start distributed training.")
-run_type_g.add_arg("use_cuda",                     bool,   True,   "If set, use GPU for training.")
-run_type_g.add_arg("use_fast_executor",            bool,   False,  "If set, use fast parallel executor (in experiment).")
-run_type_g.add_arg("num_iteration_per_drop_scope", int,    1,      "Ihe iteration intervals to clean up temporary variables.")
-run_type_g.add_arg("do_test",                      bool,   False,  "Whether to perform evaluation on test data set.")
-
-args = parser.parse_args()
-# yapf: enable.
-
-
-def create_model(pyreader_name, bert_config):
-    pyreader = fluid.layers.py_reader(
-        capacity=70,
-        shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                [-1, args.max_seq_len, 1],
-                [-1, args.max_seq_len, 1], [-1, 1], [-1, 1],
-                [-1, 1]],
-        dtypes=[
-            'int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64'
-        ],
-        lod_levels=[0, 0, 0, 0, 0, 0, 0],
-        name=pyreader_name,
-        use_double_buffer=True)
-
-    (src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels) = fluid.layers.read_file(pyreader)
-
-    bert = BertModel(
-        src_ids=src_ids,
-        position_ids=pos_ids,
-        sentence_ids=sent_ids,
-        input_mask=input_mask,
-        config=bert_config,
-        weight_sharing=args.weight_sharing,
-        use_fp16=args.use_fp16)
-
-    next_sent_acc, mask_lm_loss, total_loss = bert.get_pretraining_output(
-        mask_label, mask_pos, labels)
-
-    if args.use_fp16 and args.loss_scaling > 1.0:
-        total_loss *= args.loss_scaling
-
-    return pyreader, next_sent_acc, mask_lm_loss, total_loss
-
-
-def predict_wrapper(args,
-                    exe,
-                    bert_config,
-                    test_prog=None,
-                    pyreader=None,
-                    fetch_list=None):
-    # Context to do validation.
-    data_path = args.test_set_dir if args.do_test else args.validation_set_dir
-    data_reader = DataReader(
-        data_path,
-        vocab_path=args.vocab_path,
-        batch_size=args.batch_size,
-        in_tokens=args.in_tokens,
-        voc_size=bert_config['vocab_size'],
-        shuffle_files=False,
-        epoch=1,
-        max_seq_len=args.max_seq_len,
-        is_test=True)
-
-    if args.do_test:
-        assert args.init_checkpoint is not None, "[FATAL] Please use --init_checkpoint '/path/to/checkpoints' \
-                                                  to specify you pretrained model checkpoints"
-
-        init_pretraining_params(exe, args.init_checkpoint, test_prog)
-
-    def predict(exe=exe, pyreader=pyreader):
-
-        pyreader.decorate_tensor_provider(data_reader.data_generator())
-        pyreader.start()
-
-        cost = 0
-        lm_cost = 0
-        acc = 0
-        steps = 0
-        time_begin = time.time()
-        while True:
-            try:
-                each_next_acc, each_mask_lm_cost, each_total_cost = exe.run(
-                    fetch_list=fetch_list, program=test_prog)
-                acc += each_next_acc
-                lm_cost += each_mask_lm_cost
-                cost += each_total_cost
-                steps += 1
-                if args.do_test and steps % args.skip_steps == 0:
-                    print("[test_set] steps: %d" % steps)
-
-            except fluid.core.EOFException:
-                pyreader.reset()
-                break
-
-        used_time = time.time() - time_begin
-        return cost, lm_cost, acc, steps, (args.skip_steps / used_time)
-
-    return predict
-
-
-def test(args):
-    bert_config = BertConfig(args.bert_config_path)
-    bert_config.print_config()
-
-    test_prog = fluid.Program()
-    test_startup = fluid.Program()
-    with fluid.program_guard(test_prog, test_startup):
-        with fluid.unique_name.guard():
-            test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
-                pyreader_name='test_reader', bert_config=bert_config)
-
-    test_prog = test_prog.clone(for_test=True)
-
-    place = fluid.CUDAPlace(0) if args.use_cuda == True else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    exe.run(test_startup)
-
-    predict = predict_wrapper(
-        args,
-        exe,
-        bert_config,
-        test_prog=test_prog,
-        pyreader=test_pyreader,
-        fetch_list=[next_sent_acc.name, mask_lm_loss.name, total_loss.name])
-
-    print("test begin")
-    loss, lm_loss, acc, steps, speed = predict()
-    print(
-        "[test_set] loss: %f, global ppl: %f, next_sent_acc: %f, speed: %f steps/s"
-        % (np.mean(np.array(loss) / steps),
-           np.exp(np.mean(np.array(lm_loss) / steps)),
-           np.mean(np.array(acc) / steps), speed))
-
-
-def train(args):
-    print("pretraining start")
-    bert_config = BertConfig(args.bert_config_path)
-    bert_config.print_config()
-
-    train_program = fluid.Program()
-    startup_prog = fluid.Program()
-    with fluid.program_guard(train_program, startup_prog):
-        with fluid.unique_name.guard():
-            train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
-                pyreader_name='train_reader', bert_config=bert_config)
-            scheduled_lr = optimization(
-                loss=total_loss,
-                warmup_steps=args.warmup_steps,
-                num_train_steps=args.num_train_steps,
-                learning_rate=args.learning_rate,
-                train_program=train_program,
-                startup_prog=startup_prog,
-                weight_decay=args.weight_decay,
-                scheduler=args.lr_scheduler,
-                use_fp16=args.use_fp16,
-                loss_scaling=args.loss_scaling)
-
-            fluid.memory_optimize(
-                input_program=train_program,
-                skip_opt_set=[
-                    next_sent_acc.name, mask_lm_loss.name, total_loss.name
-                ])
-
-    test_prog = fluid.Program()
-    with fluid.program_guard(test_prog, startup_prog):
-        with fluid.unique_name.guard():
-            test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
-                pyreader_name='test_reader', bert_config=bert_config)
-
-    test_prog = test_prog.clone(for_test=True)
-
-    if args.use_cuda:
-        place = fluid.CUDAPlace(0)
-        dev_count = fluid.core.get_cuda_device_count()
-    else:
-        place = fluid.CPUPlace()
-        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-
-    print("Device count %d" % dev_count)
-    if args.verbose:
-        if args.in_tokens:
-            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-         program=train_program,
-         batch_size=args.batch_size // args.max_seq_len)
-        else:
-            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
-         program=train_program, batch_size=args.batch_size)
-        print("Theoretical memory usage in training: %.3f - %.3f %s" %
-              (lower_mem, upper_mem, unit))
-
-    nccl2_num_trainers = 1
-    nccl2_trainer_id = 0
-    print("args.is_distributed:", args.is_distributed)
-    if args.is_distributed:
-        worker_endpoints_env = os.getenv("worker_endpoints")
-        worker_endpoints = worker_endpoints_env.split(",")
-        trainers_num = len(worker_endpoints)
-        current_endpoint = os.getenv("current_endpoint")
-        trainer_id = worker_endpoints.index(current_endpoint)
-        if trainer_id == 0:
-            print("train_id == 0, sleep 60s")
-            time.sleep(60)
-        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
-              trainer_id:{}"
-                            .format(worker_endpoints, trainers_num,
-                                    current_endpoint, trainer_id))
-
-        # prepare nccl2 env.
-        config = fluid.DistributeTranspilerConfig()
-        config.mode = "nccl2"
-        t = fluid.DistributeTranspiler(config=config)
-        t.transpile(
-            trainer_id,
-            trainers=worker_endpoints_env,
-            current_endpoint=current_endpoint,
-            program=train_program,
-            startup_program=startup_prog)
-        nccl2_num_trainers = trainers_num
-        nccl2_trainer_id = trainer_id
-
-    exe = fluid.Executor(place)
-    exe.run(startup_prog)
-
-    if args.init_checkpoint and args.init_checkpoint != "":
-        init_checkpoint(exe, args.init_checkpoint, train_program, args.use_fp16)
-
-    data_reader = DataReader(
-        data_dir=args.data_dir,
-        batch_size=args.batch_size,
-        in_tokens=args.in_tokens,
-        vocab_path=args.vocab_path,
-        voc_size=bert_config['vocab_size'],
-        epoch=args.epoch,
-        max_seq_len=args.max_seq_len,
-        generate_neg_sample=args.generate_neg_sample)
-
-    exec_strategy = fluid.ExecutionStrategy()
-    exec_strategy.use_experimental_executor = args.use_fast_executor
-    exec_strategy.num_threads = dev_count
-    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
-
-    # use_ngraph is for CPU only, please refer to README_ngraph.md for details
-    use_ngraph = os.getenv('FLAGS_use_ngraph')
-    if not use_ngraph:
-        train_exe = fluid.ParallelExecutor(
-            use_cuda=args.use_cuda,
-            loss_name=total_loss.name,
-            exec_strategy=exec_strategy,
-            main_program=train_program,
-            num_trainers=nccl2_num_trainers,
-            trainer_id=nccl2_trainer_id)
-    else:
-        train_exe = exe
-
-    if args.validation_set_dir and args.validation_set_dir != "":
-        predict = predict_wrapper(
-            args,
-            exe,
-            bert_config,
-            test_prog=test_prog,
-            pyreader=test_pyreader,
-            fetch_list=[
-                next_sent_acc.name, mask_lm_loss.name, total_loss.name
-            ])
-
-    train_pyreader.decorate_tensor_provider(data_reader.data_generator())
-    train_pyreader.start()
-    steps = 0
-    cost = []
-    lm_cost = []
-    acc = []
-    time_begin = time.time()
-    while steps < args.num_train_steps:
-        try:
-            steps += nccl2_num_trainers
-            skip_steps = args.skip_steps * nccl2_num_trainers
-
-            if nccl2_trainer_id != 0:
-                if use_ngraph:
-                    train_exe.run(fetch_list=[], program=train_program)
-                else:
-                    train_exe.run(fetch_list=[])
-                continue
-
-            if steps % skip_steps != 0:
-                if use_ngraph:
-                    train_exe.run(fetch_list=[], program=train_program)
-                else:
-                    train_exe.run(fetch_list=[])
-
-            else:
-                if use_ngraph:
-                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run(
-                        fetch_list=[
-                            next_sent_acc.name, mask_lm_loss.name, total_loss.name,
-                            scheduled_lr.name], program=train_program)
-                else:
-                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run(
-                        fetch_list=[
-                            next_sent_acc.name, mask_lm_loss.name, total_loss.name,
-                            scheduled_lr.name])
-
-                acc.extend(each_next_acc)
-                lm_cost.extend(each_mask_lm_cost)
-                cost.extend(each_total_cost)
-
-                print("feed_queue size", train_pyreader.queue.size())
-                time_end = time.time()
-                used_time = time_end - time_begin
-                epoch, current_file_index, total_file, current_file = data_reader.get_progress(
-                )
-                print("current learning_rate:%f" % np_lr[0])
-                print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
-                      "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s"
-                      % (epoch, current_file_index, total_file, steps,
-                         np.mean(np.array(cost)),
-                         np.mean(np.exp(np.array(lm_cost))),
-                         np.mean(np.array(acc)), skip_steps / used_time,
-                         current_file))
-                cost = []
-                lm_cost = []
-                acc = []
-                time_begin = time.time()
-
-            if steps % args.save_steps == 0:
-                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
-                fluid.io.save_persistables(exe, save_path, train_program)
-
-            if args.validation_set_dir and steps % args.validation_steps == 0:
-                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict(
-                )
-                print("[validation_set] epoch: %d, step: %d, "
-                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
-                      "next_sent_acc: %f, speed: %f steps/s" %
-                      (epoch, steps,
-                       np.mean(np.array(vali_cost) / vali_steps),
-                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
-                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
-                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))
-
-        except fluid.core.EOFException:
-            train_pyreader.reset()
-            break
-
-if __name__ == '__main__':
-    print_arguments(args)
-    check_cuda(args.use_cuda)
-    if args.do_test:
-        test(args)
-    else:
-        train(args)
--- a/BERT/train.sh
+++ b/BERT/train.sh
-#!/bin/bash
-
-set -xe
-
-while true ; do
-  case "$1" in
-    -local) is_local="$2" ; shift 2 ;;
-    *)
-       if [[ ${#1} > 0 ]]; then
-          echo "not supported arugments ${1}" ; exit 1 ;
-       else
-           break
-       fi
-       ;;
-  esac
-done
-
-case "$is_local" in
-    n) is_distributed="--is_distributed true" ;;
-    y) is_distributed="--is_distributed false" ;;
-    *) echo "not support argument -local: ${is_local}" ; exit 1 ;;
-esac
-
-# pretrain config
-SAVE_STEPS=10000
-BATCH_SIZE=4096
-LR_RATE=1e-4
-WEIGHT_DECAY=0.01
-MAX_LEN=512
-TRAIN_DATA_DIR=data/train
-VALIDATION_DATA_DIR=data/validation
-CONFIG_PATH=data/demo_config/bert_config.json
-VOCAB_PATH=data/demo_config/vocab.txt
-
-# Change your train arguments:
-python -u ./train.py ${is_distributed}\
-        --use_cuda true\
-        --weight_sharing true\
-        --batch_size ${BATCH_SIZE} \
-        --data_dir ${TRAIN_DATA_DIR} \
-        --validation_set_dir ${VALIDATION_DATA_DIR} \
-        --bert_config_path ${CONFIG_PATH} \
-        --vocab_path ${VOCAB_PATH} \
-        --generate_neg_sample true\
-        --checkpoints ./output \
-        --save_steps ${SAVE_STEPS} \
-        --learning_rate ${LR_RATE} \
-        --weight_decay ${WEIGHT_DECAY:-0} \
-        --max_seq_len ${MAX_LEN} \
-        --skip_steps 20 \
-        --validation_steps 1000 \
-        --num_iteration_per_drop_scope 10 \
-        --use_fp16 false \
-        --loss_scaling 8.0
-       
--- a/BERT/utils/args.py
+++ b/BERT/utils/args.py
--- a/BERT/utils/cards.py
+++ b/BERT/utils/cards.py
--- a/ELMo/run.sh
+++ b/ELMo/run.sh
--- a/ELMo/LAC_demo/bilm.py
+++ b/ELMo/LAC_demo/bilm.py
--- a/ELMo/LAC_demo/conf/q2b.dic
+++ b/ELMo/LAC_demo/conf/q2b.dic
--- a/ELMo/LAC_demo/data/dev/dev.tsv
+++ b/ELMo/LAC_demo/data/dev/dev.tsv
--- a/ELMo/LAC_demo/data/tag.dic
+++ b/ELMo/LAC_demo/data/tag.dic
--- a/ELMo/LAC_demo/data/train/train.tsv
+++ b/ELMo/LAC_demo/data/train/train.tsv
--- a/ELMo/LAC_demo/network.py
+++ b/ELMo/LAC_demo/network.py
--- a/ELMo/LAC_demo/reader.py
+++ b/ELMo/LAC_demo/reader.py
--- a/ELMo/LAC_demo/run.sh
+++ b/ELMo/LAC_demo/run.sh
--- a/ELMo/LAC_demo/train.py
+++ b/ELMo/LAC_demo/train.py
--- a/ELMo/README.md
+++ b/ELMo/README.md
--- a/ELMo/args.py
+++ b/ELMo/args.py
--- a/ELMo/data.py
+++ b/ELMo/data.py
--- a/ELMo/data/dev/sentence_file.txt
+++ b/ELMo/data/dev/sentence_file.txt
--- a/ELMo/data/dev/sentence_file_2.txt
+++ b/ELMo/data/dev/sentence_file_2.txt
--- a/ELMo/data/train/sentence_file.txt
+++ b/ELMo/data/train/sentence_file.txt
--- a/ELMo/data/train/sentence_file_1.txt
+++ b/ELMo/data/train/sentence_file_1.txt
--- a/ELMo/data/vocabulary_min5k.txt
+++ b/ELMo/data/vocabulary_min5k.txt
--- a/ELMo/lm_model.py
+++ b/ELMo/lm_model.py
--- a/ELMo/train.py
+++ b/ELMo/train.py
--- a/ERNIE/README.md
+++ b/ERNIE/README.md
--- a/ERNIE/finetune/__init__.py
+++ b/ERNIE/finetune/__init__.py
--- a/ERNIE/model/__init__.py
+++ b/ERNIE/model/__init__.py
--- a/ERNIE/reader/__init__.py
+++ b/ERNIE/reader/__init__.py
--- a/ERNIE/utils/__init__.py
+++ b/ERNIE/utils/__init__.py
--- a/ERNIE/utils/fp16.py
+++ b/ERNIE/utils/fp16.py
--- a/ERNIE/utils/init.py
+++ b/ERNIE/utils/init.py
--- a/README.md
+++ b/README.md
--- a/README.zh.md
+++ b/README.zh.md
--- a/BERT/__init__.py
+++ b/BERT/__init__.py
--- a/ERNIE/_ce.py
+++ b/ERNIE/_ce.py
--- a/ERNIE/batching.py
+++ b/ERNIE/batching.py
--- a/BERT/predict_classifier.py
+++ b/BERT/predict_classifier.py
--- a/ERNIE/config/ernie_config.json
+++ b/ERNIE/config/ernie_config.json
--- a/ERNIE/config/vocab.txt
+++ b/ERNIE/config/vocab.txt
--- a/config/vocab_en.txt
+++ b/config/vocab_en.txt
--- a/ERNIE/data/demo_train_set.gz
+++ b/ERNIE/data/demo_train_set.gz
--- a/ERNIE/data/demo_valid_set.gz
+++ b/ERNIE/data/demo_valid_set.gz
--- a/ERNIE/data/train_filelist
+++ b/ERNIE/data/train_filelist
--- a/ERNIE/data/valid_filelist
+++ b/ERNIE/data/valid_filelist
--- a/ERNIE/ernie_encoder.py
+++ b/ERNIE/ernie_encoder.py
--- a/BERT/model/__init__.py
+++ b/BERT/model/__init__.py
--- a/ERNIE/finetune/classifier.py
+++ b/ERNIE/finetune/classifier.py
--- a/finetune/mrc.py
+++ b/finetune/mrc.py
--- a/ERNIE/finetune/sequence_label.py
+++ b/ERNIE/finetune/sequence_label.py
--- a/ERNIE/finetune_args.py
+++ b/ERNIE/finetune_args.py
--- a/BERT/reader/__init__.py
+++ b/BERT/reader/__init__.py
--- a/BERT/model/bert.py
+++ b/BERT/model/bert.py
--- a/ERNIE/model/ernie.py
+++ b/ERNIE/model/ernie.py
--- a/ERNIE/model/transformer_encoder.py
+++ b/ERNIE/model/transformer_encoder.py
--- a/ERNIE/optimization.py
+++ b/ERNIE/optimization.py
--- a/ERNIE/predict_classifier.py
+++ b/ERNIE/predict_classifier.py
--- a/ERNIE/pretrain_args.py
+++ b/ERNIE/pretrain_args.py
--- a/BERT/utils/__init__.py
+++ b/BERT/utils/__init__.py
--- a/ERNIE/reader/pretraining.py
+++ b/ERNIE/reader/pretraining.py
--- a/ERNIE/reader/task_reader.py
+++ b/ERNIE/reader/task_reader.py
--- a/BERT/run_classifier.py
+++ b/BERT/run_classifier.py
--- a/ERNIE/run_classifier.py
+++ b/ERNIE/run_classifier.py
--- a/ERNIE/run_sequence_labeling.py
+++ b/ERNIE/run_sequence_labeling.py
--- a/script/en_glue/ernie_base/CoLA/task.sh
+++ b/script/en_glue/ernie_base/CoLA/task.sh
--- a/script/en_glue/ernie_base/MNLI/task.sh
+++ b/script/en_glue/ernie_base/MNLI/task.sh
--- a/script/en_glue/ernie_base/MRPC/task.sh
+++ b/script/en_glue/ernie_base/MRPC/task.sh
--- a/script/en_glue/ernie_base/QNLI/task.sh
+++ b/script/en_glue/ernie_base/QNLI/task.sh
--- a/script/en_glue/ernie_base/QQP/task.sh
+++ b/script/en_glue/ernie_base/QQP/task.sh
--- a/script/en_glue/ernie_base/RTE/task.sh
+++ b/script/en_glue/ernie_base/RTE/task.sh
--- a/script/en_glue/ernie_base/SST-2/task.sh
+++ b/script/en_glue/ernie_base/SST-2/task.sh
--- a/script/en_glue/ernie_base/STS-B/task.sh
+++ b/script/en_glue/ernie_base/STS-B/task.sh
--- a/script/en_glue/ernie_base/WNLI/task.sh
+++ b/script/en_glue/ernie_base/WNLI/task.sh
--- a/BERT/data/demo_config/bert_config.json
+++ b/BERT/data/demo_config/bert_config.json
--- a/script/en_glue/ernie_base/vocab.txt
+++ b/script/en_glue/ernie_base/vocab.txt
--- a/script/en_glue/ernie_large/CoLA/task.sh
+++ b/script/en_glue/ernie_large/CoLA/task.sh
--- a/script/en_glue/ernie_large/MNLI/task.sh
+++ b/script/en_glue/ernie_large/MNLI/task.sh
--- a/script/en_glue/ernie_large/MRPC/task.sh
+++ b/script/en_glue/ernie_large/MRPC/task.sh
--- a/script/en_glue/ernie_large/QNLI/task.sh
+++ b/script/en_glue/ernie_large/QNLI/task.sh
--- a/script/en_glue/ernie_large/QQP/task.sh
+++ b/script/en_glue/ernie_large/QQP/task.sh
--- a/script/en_glue/ernie_large/RTE/task.sh
+++ b/script/en_glue/ernie_large/RTE/task.sh
--- a/script/en_glue/ernie_large/SST-2/task.sh
+++ b/script/en_glue/ernie_large/SST-2/task.sh
--- a/script/en_glue/ernie_large/STS-B/task.sh
+++ b/script/en_glue/ernie_large/STS-B/task.sh
--- a/script/en_glue/ernie_large/WNLI/task.sh
+++ b/script/en_glue/ernie_large/WNLI/task.sh
--- a/script/en_glue/ernie_large/ernie_config.json
+++ b/script/en_glue/ernie_large/ernie_config.json
--- a/script/en_glue/ernie_large/vocab.txt
+++ b/script/en_glue/ernie_large/vocab.txt
--- a/script/en_glue/preprocess/cvt.sh
+++ b/script/en_glue/preprocess/cvt.sh
--- a/script/en_glue/preprocess/mnli.py
+++ b/script/en_glue/preprocess/mnli.py
--- a/script/en_glue/preprocess/qnli.py
+++ b/script/en_glue/preprocess/qnli.py
--- a/script/zh_task/ernie_base/run_ChnSentiCorp.sh
+++ b/script/zh_task/ernie_base/run_ChnSentiCorp.sh
--- a/script/zh_task/ernie_base/run_bq.sh
+++ b/script/zh_task/ernie_base/run_bq.sh
--- a/script/zh_task/ernie_base/run_cmrc2018.sh
+++ b/script/zh_task/ernie_base/run_cmrc2018.sh
--- a/script/zh_task/ernie_base/run_dbqa.sh
+++ b/script/zh_task/ernie_base/run_dbqa.sh
--- a/script/zh_task/ernie_base/run_drcd.sh
+++ b/script/zh_task/ernie_base/run_drcd.sh
--- a/script/zh_task/ernie_base/run_lcqmc.sh
+++ b/script/zh_task/ernie_base/run_lcqmc.sh
--- a/script/zh_task/ernie_base/run_msra_ner.sh
+++ b/script/zh_task/ernie_base/run_msra_ner.sh
--- a/script/zh_task/ernie_base/run_thuc.sh
+++ b/script/zh_task/ernie_base/run_thuc.sh
--- a/script/zh_task/ernie_base/run_xnli.sh
+++ b/script/zh_task/ernie_base/run_xnli.sh
--- a/ERNIE/script/run_ChnSentiCorp.sh
+++ b/ERNIE/script/run_ChnSentiCorp.sh
--- a/script/zh_task/ernie_large/run_bq.sh
+++ b/script/zh_task/ernie_large/run_bq.sh
--- a/script/zh_task/ernie_large/run_cmrc2018.sh
+++ b/script/zh_task/ernie_large/run_cmrc2018.sh
--- a/ERNIE/script/run_dbqa.sh
+++ b/ERNIE/script/run_dbqa.sh
--- a/script/zh_task/ernie_large/run_drcd.sh
+++ b/script/zh_task/ernie_large/run_drcd.sh
--- a/ERNIE/script/run_lcqmc.sh
+++ b/ERNIE/script/run_lcqmc.sh
--- a/ERNIE/script/run_msra_ner.sh
+++ b/ERNIE/script/run_msra_ner.sh
--- a/script/zh_task/ernie_large/run_thuc.sh
+++ b/script/zh_task/ernie_large/run_thuc.sh
--- a/ERNIE/script/run_xnli.sh
+++ b/ERNIE/script/run_xnli.sh
--- a/ERNIE/script/pretrain.sh
+++ b/ERNIE/script/pretrain.sh
--- a/ERNIE/tokenization.py
+++ b/ERNIE/tokenization.py
--- a/ERNIE/train.py
+++ b/ERNIE/train.py
--- a/ERNIE/__init__.py
+++ b/ERNIE/__init__.py
--- a/ERNIE/utils/args.py
+++ b/ERNIE/utils/args.py
--- a/ERNIE/utils/cards.py
+++ b/ERNIE/utils/cards.py
--- a/utils/cmrc2018_eval.py
+++ b/utils/cmrc2018_eval.py
--- a/BERT/utils/fp16.py
+++ b/BERT/utils/fp16.py
--- a/BERT/utils/init.py
+++ b/BERT/utils/init.py