未验证 提交 34f1f624 编写于 作者: Z zhengya01 提交者: GitHub

Merge pull request #2 from PaddlePaddle/develop

update
此差异已折叠。
####this file is only used for continuous evaluation test!
import os
import sys
sys.path.insert(0, os.environ['ceroot'])
#sys.path.append('.')
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE: kpi.py should be shared across models in some way.
# KPI objects for the single-card ("card1") XNLI run: cost, accuracy, duration.
# NOTE(review): the two numeric positional arguments are passed straight to the
# kpi module; they look like a tolerance and a skip count -- confirm in kpi.py.
train_cost_xnli_card1_kpi = CostKpi('train_cost_xnli_card1', 0.002, 0, actived=True)
train_acc_xnli_card1_kpi = AccKpi('train_acc_xnli_card1', 0.002, 0, actived=True)
train_duration_xnli_card1_kpi = DurationKpi(
'train_duration_xnli_card1', 0.01, 0, actived=True)
# KPI objects for the four-card ("card4") XNLI run.
train_cost_xnli_card4_kpi = CostKpi('train_cost_xnli_card4', 0.002, 0, actived=True)
train_acc_xnli_card4_kpi = AccKpi('train_acc_xnli_card4', 0.02, 0, actived=True)
train_duration_xnli_card4_kpi = DurationKpi(
'train_duration_xnli_card4', 0.03, 0, actived=True)
# Every KPI that log_to_ce() may receive; it indexes these objects by their
# `name` attribute, so a KPI name found in the log must be listed here.
tracking_kpis = [
train_cost_xnli_card1_kpi,
train_acc_xnli_card1_kpi,
train_duration_xnli_card1_kpi,
train_cost_xnli_card4_kpi,
train_acc_xnli_card4_kpi,
train_duration_xnli_card4_kpi,
]
def parse_log(log):
    """
    Yield (kpi_name, kpi_value) pairs extracted from a training log.

    A line is used only when, after stripping surrounding whitespace, it
    consists of exactly three tab-separated fields and the first field is
    the literal "kpis":

        kpis<TAB>train_cost_xnli_card1<TAB>1.0

    The second field is the KPI name; the third is converted with float().
    Every split line is echoed to stdout, and accepted lines are echoed a
    second time with a "-----" prefix.
    """
    for raw_line in log.split('\n'):
        fields = raw_line.strip().split('\t')
        print(fields)
        # Guard clause: anything that is not a 3-field "kpis" record is skipped.
        if len(fields) != 3 or fields[0] != 'kpis':
            continue
        print("-----%s" % fields)
        _, name, raw_value = fields
        yield name, float(raw_value)
def log_to_ce(log):
    """
    Feed every KPI record found in `log` to its tracker and persist it.

    Trackers come from the module-level `tracking_kpis` list and are looked
    up by their `name` attribute. A KPI name that appears in the log but is
    not tracked raises KeyError. persist() is called after each record.
    """
    trackers_by_name = {tracked.name: tracked for tracked in tracking_kpis}
    for name, value in parse_log(log):
        print(name, value)
        tracker = trackers_by_name[name]
        tracker.add_record(value)
        tracker.persist()
# Script entry point: read the whole training log from stdin, echo it between
# marker lines for debugging, then record the KPIs it contains.
if __name__ == '__main__':
    log = sys.stdin.read()
    print("*****")
    print(log)
    print("****")
    log_to_ce(log)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
    """
    Add mask for batch_tokens, return out, mask_label, mask_pos.

    For every token that is neither CLS nor SEP a random number in [0, 1)
    decides what happens:
      * > 0.15           : token is left alone and not recorded;
      * (0.03, 0.15]     : token is replaced by MASK;
      * (0.015, 0.03]    : token is replaced by a random id in [1, vocab_size);
      * <= 0.015         : token is kept but still recorded as a label.
    If nothing in a sentence was replaced, one interior non-special token is
    picked at random and replaced by MASK.

    Args:
        batch_tokens: list of token-id lists; modified IN PLACE.
        total_token_num: number of random draws to prepare; must be at least
            the total number of tokens in batch_tokens.
        vocab_size: exclusive upper bound for random replacement ids.
        CLS, SEP, MASK: ids of the special tokens.

    Returns:
        (batch_tokens, mask_label, mask_pos) where mask_label and mask_pos are
        int64 arrays of shape [N, 1]. mask_pos indexes the batch as if every
        sentence were already padded to the longest sentence
        (sent_index * max_len + token_index).
    """
    max_len = max([len(sent) for sent in batch_tokens])
    mask_label = []
    mask_pos = []
    prob_mask = np.random.rand(total_token_num)
    # Note: the first token is [CLS], so [low=1]
    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
    prob_index = 0
    for sent_index, sent in enumerate(batch_tokens):
        mask_flag = False
        row_offset = sent_index * max_len
        for token_index, token in enumerate(sent):
            prob = prob_mask[prob_index + token_index]
            # Unselected tokens and special tokens are never touched.
            if prob > 0.15 or token == SEP or token == CLS:
                continue
            mask_label.append(token)
            mask_pos.append(row_offset + token_index)
            if prob > 0.03:
                # mask
                sent[token_index] = MASK
                mask_flag = True
            elif prob > 0.015:
                # random replace
                sent[token_index] = replace_ids[prob_index + token_index]
                mask_flag = True
            # else: keep the original token (recorded, but mask_flag stays
            # False so the fallback below may still replace a token).

        # ensure at least mask one word in a sentence
        if not mask_flag:
            # The fallback samples interior positions only. Without this check
            # a sentence with no maskable interior token (e.g. [CLS, SEP])
            # would either raise in randint or loop forever.
            interior = sent[1:len(sent) - 1]
            has_candidate = any(tok != SEP and tok != CLS for tok in interior)
            while has_candidate and not mask_flag:
                # Index the 1-element result explicitly; int() on an array is
                # deprecated in recent NumPy releases.
                token_index = int(
                    np.random.randint(
                        1, high=len(sent) - 1, size=1)[0])
                if sent[token_index] != SEP and sent[token_index] != CLS:
                    mask_label.append(sent[token_index])
                    sent[token_index] = MASK
                    mask_flag = True
                    mask_pos.append(row_offset + token_index)

        # The random draws of the next sentence start after this one.
        prob_index += len(sent)

    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
    return batch_tokens, mask_label, mask_pos
def prepare_batch_data(insts,
                       total_token_num,
                       voc_size=0,
                       pad_id=None,
                       cls_id=None,
                       sep_id=None,
                       mask_id=None,
                       return_input_mask=True,
                       return_max_len=True,
                       return_num_token=False):
    """
    1. generate Tensor of data
    2. generate Tensor of position
    3. generate self attention mask, [shape: batch_size * max_len * max_len]

    Args:
        insts: list of examples; each example is a sequence whose first three
            items are the token ids, sentence ids and position ids. Any
            further items (labels, start/end positions, unique ids, ...) are
            returned as int64 arrays of shape [-1, 1].
        total_token_num, voc_size, cls_id, sep_id, mask_id: forwarded to
            mask() when masking is enabled.
        pad_id: padding value used for token, position and sentence ids.
        return_input_mask, return_max_len, return_num_token: accepted for
            interface compatibility; this function does not read them.

    Returns:
        [src_id, pos_id, sent_id, input_mask, (mask_label, mask_pos,)
        *labels]; mask_label and mask_pos are present only when masking is
        enabled, i.e. when mask_id is a non-negative id.
    """
    batch_src_ids = [inst[0] for inst in insts]
    batch_sent_ids = [inst[1] for inst in insts]
    batch_pos_ids = [inst[2] for inst in insts]

    # compatible with squad, whose example includes start/end positions,
    # or unique id
    labels_list = [
        np.array([inst[i] for inst in insts]).astype("int64").reshape([-1, 1])
        for i in range(3, len(insts[0]))
    ]

    # `mask_id` defaults to None; comparing None with 0 raises TypeError on
    # Python 3, so treat a missing id as "masking disabled".
    use_mask = mask_id is not None and mask_id >= 0

    # First step: do mask without padding
    if use_mask:
        out, mask_label, mask_pos = mask(
            batch_src_ids,
            total_token_num,
            vocab_size=voc_size,
            CLS=cls_id,
            SEP=sep_id,
            MASK=mask_id)
    else:
        out = batch_src_ids

    # Second step: padding
    src_id, self_input_mask = pad_batch_data(
        out, pad_idx=pad_id, return_input_mask=True)
    pos_id = pad_batch_data(
        batch_pos_ids,
        pad_idx=pad_id,
        return_pos=False,
        return_input_mask=False)
    sent_id = pad_batch_data(
        batch_sent_ids,
        pad_idx=pad_id,
        return_pos=False,
        return_input_mask=False)

    return_list = [src_id, pos_id, sent_id, self_input_mask]
    if use_mask:
        return_list += [mask_label, mask_pos]
    return_list += labels_list

    return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
                   pad_idx=0,
                   return_pos=False,
                   return_input_mask=False,
                   return_max_len=False,
                   return_num_token=False):
    """
    Right-pad every instance to the longest one in the batch.

    The first output is always the padded data as an int64 array of shape
    [batch, max_len, 1]. Depending on the flags the following are appended,
    in this order: position ids (int64, padded with pad_idx), the input mask
    (float32, 1 for real tokens and 0 for padding, shape [batch, max_len, 1]),
    the maximum length, and the total number of real tokens.

    Returns the single array when no optional output was requested, otherwise
    a list of all outputs.
    """
    longest = max(len(inst) for inst in insts)

    def tail(inst, fill):
        # Padding needed to bring `inst` up to the batch's longest length.
        return [fill] * (longest - len(inst))

    # Any token included in dict can be used to pad, since the paddings' loss
    # will be masked out by weights and make no effect on parameter gradients.
    padded = np.array([list(inst) + tail(inst, pad_idx) for inst in insts])
    outputs = [padded.astype("int64").reshape([-1, longest, 1])]

    if return_pos:
        positions = np.array(
            [list(range(len(inst))) + tail(inst, pad_idx) for inst in insts])
        outputs.append(positions.astype("int64").reshape([-1, longest, 1]))

    if return_input_mask:
        # This is used to avoid attention on paddings.
        valid = np.array([[1] * len(inst) + tail(inst, 0) for inst in insts])
        outputs.append(np.expand_dims(valid, axis=-1).astype("float32"))

    if return_max_len:
        outputs.append(longest)

    if return_num_token:
        outputs.append(sum(len(inst) for inst in insts))

    if len(outputs) > 1:
        return outputs
    return outputs[0]
# This module only provides helpers; running it as a script does nothing.
if __name__ == "__main__":
    pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Google official BERT models to Fluid parameters."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import collections
from utils.args import print_arguments
import tensorflow as tf
import paddle.fluid as fluid
from tensorflow.python import pywrap_tensorflow
def parse_args():
    """
    Parse the command line of the conversion script.

    Returns:
        argparse.Namespace with two required string attributes:
        `init_tf_checkpoint` (the TF checkpoint to read) and
        `fluid_params_dir` (where the converted parameters are stored).
    """
    # The first positional parameter of ArgumentParser is `prog`, so the
    # module docstring has to be passed by keyword to show up as the
    # description instead of replacing the program name in the usage line.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--init_tf_checkpoint",
        type=str,
        required=True,
        help="Initial TF checkpoint (a pre-trained BERT model).")
    parser.add_argument(
        "--fluid_params_dir",
        type=str,
        required=True,
        help="The directory to store converted Fluid parameters.")
    args = parser.parse_args()
    return args
def parse(init_checkpoint):
    """
    Build the TF-to-Fluid parameter name mapping for a checkpoint.

    Every variable listed by tf.train.list_variables(init_checkpoint) is
    matched against the rules below; unmatched variables are reported with
    an "ignored param" line on stdout and left out of the result.

    Returns:
        (name_map, shape_map): two OrderedDicts keyed by the TF variable
        name, holding the Fluid parameter name and the variable shape.
    """

    def first_suffix_match(text, table):
        # Tables are ordered: the first suffix that matches wins, so more
        # specific suffixes must be listed before shorter ones they end with.
        for suffix, target in table:
            if text.endswith(suffix):
                return target
        return None

    embedding_rules = (
        ('LayerNorm/gamma', 'pre_encoder_layer_norm_scale'),
        ('LayerNorm/beta', 'pre_encoder_layer_norm_bias'),
        ('position_embeddings', 'pos_embedding'),
        ('word_embeddings', 'word_embedding'),
        ('token_type_embeddings', 'sent_embedding'),
    )
    # Values are appended to "encoder_layer_<N>".
    encoder_rules = (
        ('attention/output/LayerNorm/beta', '_post_att_layer_norm_bias'),
        ('attention/output/LayerNorm/gamma', '_post_att_layer_norm_scale'),
        ('attention/output/dense/bias', '_multi_head_att_output_fc.b_0'),
        ('attention/output/dense/kernel', '_multi_head_att_output_fc.w_0'),
        ('attention/self/key/bias', '_multi_head_att_key_fc.b_0'),
        ('attention/self/key/kernel', '_multi_head_att_key_fc.w_0'),
        ('attention/self/query/bias', '_multi_head_att_query_fc.b_0'),
        ('attention/self/query/kernel', '_multi_head_att_query_fc.w_0'),
        ('attention/self/value/bias', '_multi_head_att_value_fc.b_0'),
        ('attention/self/value/kernel', '_multi_head_att_value_fc.w_0'),
        ('intermediate/dense/bias', '_ffn_fc_0.b_0'),
        ('intermediate/dense/kernel', '_ffn_fc_0.w_0'),
        ('output/LayerNorm/beta', '_post_ffn_layer_norm_bias'),
        ('output/LayerNorm/gamma', '_post_ffn_layer_norm_scale'),
        ('output/dense/bias', '_ffn_fc_1.b_0'),
        ('output/dense/kernel', '_ffn_fc_1.w_0'),
    )
    pooler_rules = (
        ('dense/bias', 'pooled_fc.b_0'),
        ('dense/kernel', 'pooled_fc.w_0'),
    )
    # Exact-name rules for variables under "cls/".
    cls_rules = {
        'cls/predictions/output_bias': 'mask_lm_out_fc.b_0',
        'cls/predictions/transform/LayerNorm/beta':
        'mask_lm_trans_layer_norm_bias',
        'cls/predictions/transform/LayerNorm/gamma':
        'mask_lm_trans_layer_norm_scale',
        'cls/predictions/transform/dense/bias': 'mask_lm_trans_fc.b_0',
        'cls/predictions/transform/dense/kernel': 'mask_lm_trans_fc.w_0',
        'cls/seq_relationship/output_bias': 'next_sent_fc.b_0',
        'cls/seq_relationship/output_weights': 'next_sent_fc.w_0',
        'cls/squad/output_weights': 'cls_squad_out_w',
        'cls/squad/output_bias': 'cls_squad_out_b',
    }
    # Exact-name rules for variables outside "bert/" and "cls/".
    other_rules = {
        'output_weights': 'cls_out_w',
        'output_bias': 'cls_out_b',
    }

    name_map = collections.OrderedDict()
    shape_map = collections.OrderedDict()

    for var_name, var_shape in tf.train.list_variables(init_checkpoint):
        target = None
        if var_name.startswith('bert/'):
            key = var_name[5:]
            if key.startswith('embeddings/'):
                target = first_suffix_match(key, embedding_rules)
            elif key.startswith('encoder/'):
                key = key[8:]
                # The layer index sits between the first '_' and the first
                # '/' of the remaining key (e.g. "layer_3/...").
                layer_num = int(key[key.find('_') + 1:key.find('/')])
                tail = first_suffix_match(key, encoder_rules)
                if tail is not None:
                    target = "encoder_layer_" + str(layer_num) + tail
            elif key.startswith('pooler/'):
                target = first_suffix_match(key, pooler_rules)
        elif var_name.startswith('cls/'):
            target = cls_rules.get(var_name)
        else:
            target = other_rules.get(var_name)

        if target is None:
            print("ignored param: %s" % var_name)
            continue

        name_map[var_name] = target
        shape_map[var_name] = var_shape

    return name_map, shape_map
def convert(args):
    """
    Copy the mapped variables of a TF checkpoint into Fluid parameters.

    A fresh Fluid program is filled with one zero-initialised float32
    parameter per mapped variable and run once on CPU. Each TF tensor is
    then read from `args.init_tf_checkpoint`, written into the matching
    Fluid variable, and finally all parameters are saved to
    `args.fluid_params_dir`. One line per converted parameter is printed.
    """
    name_map, shape_map = parse(args.init_tf_checkpoint)

    program = fluid.Program()
    block = program.global_block()
    for tf_name, fluid_name in name_map.items():
        block.create_parameter(
            name=fluid_name,
            shape=shape_map[tf_name],
            dtype='float32',
            initializer=fluid.initializer.Constant(value=0.0))

    place = fluid.core.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(program)

    print('---------------------- Converted Parameters -----------------------')
    print('###### [TF param name] --> [Fluid param name] [param shape] ######')
    print('-------------------------------------------------------------------')

    # These weight matrices are stored transposed before being written.
    # NOTE(review): presumably TF keeps them as [out, in] while the Fluid fc
    # layer expects [in, out] -- confirm.
    transposed_params = (
        'cls/seq_relationship/output_weights',
        'cls/squad/output_weights',
        'output_weights',
    )

    reader = pywrap_tensorflow.NewCheckpointReader(args.init_tf_checkpoint)
    for tf_name, fluid_name in name_map.items():
        value = reader.get_tensor(tf_name)
        if tf_name in transposed_params:
            value = np.transpose(value)
        target_var = fluid.global_scope().find_var(fluid_name)
        target_var.get_tensor().set(value, place)
        print(tf_name, ' --> ', fluid_name, ' ', value.shape)

    fluid.io.save_params(exe, args.fluid_params_dir, main_program=program)
# Script entry point: parse the command line, print the parsed arguments,
# then run the conversion.
if __name__ == '__main__':
    args = parse_args()
    print_arguments(args)
    convert(args)
此差异已折叠。
此差异已折叠。
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import paddle.fluid as fluid
def nccl2_prepare(trainer_id, startup_prog, main_prog):
    """
    Transpile the given programs with a DistributeTranspiler in "nccl2" mode.

    The trainer endpoint list and this process's own endpoint are read from
    the PADDLE_TRAINER_ENDPOINTS and PADDLE_CURRENT_ENDPOINT environment
    variables and passed on as-is (None when a variable is unset).
    """
    transpiler_config = fluid.DistributeTranspilerConfig()
    transpiler_config.mode = "nccl2"
    transpiler = fluid.DistributeTranspiler(config=transpiler_config)

    all_endpoints = os.environ.get('PADDLE_TRAINER_ENDPOINTS')
    own_endpoint = os.environ.get('PADDLE_CURRENT_ENDPOINT')
    transpiler.transpile(
        trainer_id,
        trainers=all_endpoints,
        current_endpoint=own_endpoint,
        startup_program=startup_prog,
        program=main_prog)
def prepare_for_multi_process(exe, build_strategy, train_prog):
    """
    Configure multi-process training when more than one trainer is used.

    The trainer id and trainer count come from the PADDLE_TRAINER_ID
    (default 0) and PADDLE_TRAINERS_NUM (default 1) environment variables.
    With fewer than two trainers the function returns immediately without
    touching its arguments. Otherwise both values are printed and stored on
    `build_strategy`, `train_prog` is transpiled for nccl2 with a new startup
    program, and that startup program is executed with `exe`.
    """
    env = os.environ
    rank = int(env.get('PADDLE_TRAINER_ID', 0))
    world_size = int(env.get('PADDLE_TRAINERS_NUM', 1))
    if world_size < 2:
        return

    print("PADDLE_TRAINERS_NUM", world_size)
    print("PADDLE_TRAINER_ID", rank)
    build_strategy.num_trainers = world_size
    build_strategy.trainer_id = rank

    # NOTE(zcd): use multi processes to train the model,
    # and each process use one GPU card.
    nccl_startup = fluid.Program()
    nccl2_prepare(rank, nccl_startup, train_prog)
    # The startup program ends up being run twice, which is harmless.
    exe.run(nccl_startup)
# Build script for the BERT inference demo executable (`inference`).
CMAKE_MINIMUM_REQUIRED(VERSION 3.2)
PROJECT(inference_demo)

SET(CMAKE_C_COMPILER gcc)
SET(CMAKE_CXX_COMPILER g++)
# NOTE(review): _GLIBCXX_USE_CXX11_ABI=0 is presumably needed to match the ABI
# of the prebuilt inference library -- confirm against the library in use.
ADD_COMPILE_OPTIONS(-std=c++11 -D_GLIBCXX_USE_CXX11_ABI=0)

# Root of the unpacked inference library. Only fall back to the local
# `fluid_inference` directory when the caller did not pass
# -DFLUID_INFER_LIB=<path>; an unconditional SET() would create a normal
# variable that shadows the value given on the command line.
IF(NOT DEFINED FLUID_INFER_LIB)
  SET(FLUID_INFER_LIB fluid_inference)
ENDIF()

# Header and library locations inside the inference library tree.
SET(FLUID_INC_PATH ${FLUID_INFER_LIB}/paddle/include)
SET(FLUID_LIB_PATH ${FLUID_INFER_LIB}/paddle/lib)
SET(GLOG_INC_PATH ${FLUID_INFER_LIB}/third_party/install/glog/include)
SET(GLOG_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/glog/lib)
SET(GFLAGS_INC_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/include)
SET(GFLAGS_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/lib)
SET(MKLDNN_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mkldnn/lib)
SET(MKLML_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mklml/lib)

INCLUDE_DIRECTORIES(${FLUID_INC_PATH})
INCLUDE_DIRECTORIES(${GLOG_INC_PATH})
INCLUDE_DIRECTORIES(${GFLAGS_INC_PATH})

LINK_DIRECTORIES(${FLUID_LIB_PATH})
LINK_DIRECTORIES(${GLOG_LIB_PATH})
LINK_DIRECTORIES(${GFLAGS_LIB_PATH})
LINK_DIRECTORIES(${MKLML_LIB_PATH})
LINK_DIRECTORIES(${MKLDNN_LIB_PATH})

ADD_EXECUTABLE(inference inference.cc)
TARGET_LINK_LIBRARIES(inference dl paddle_fluid glog gflags pthread)
# BERT模型inference demo
## 数据预处理
实际应用场景中,模型部署之后用户还需要编写对应的程序对输入进行处理,然后把得到的数据传给模型进行预测。这里为了演示的需要,用 `gen_demo_data.py` 来进行数据处理,包括 tokenization,batching,numericalization,并且把处理后的数据输出为文本文件。使用方法如下:
``` bash
TASK_NAME="xnli"
DATA_PATH=/path/to/xnli/data/
BERT_BASE_PATH=/path/to/bert/pretrained/model/
python gen_demo_data.py \
--task_name ${TASK_NAME} \
--data_path ${DATA_PATH} \
--vocab_path "${BERT_BASE_PATH}/vocab.txt" \
--batch_size 4096 \
--in_tokens \
> data.txt
```
**生成的数据格式**
生成的数据一行代表一个 `batch`, 包含四个字段
```text
src_id, pos_id, segment_id, input_mask
```
字段之间按照分号(;)分隔,其中各字段内部 `shape` 和 `data` 按照冒号(:)分隔,`shape` 和 `data` 内部按空格分隔,`input_mask` 为 FLOAT32 类型,其余字段为 INT64 类型。
## 编译和运行
为了编译 inference demo,`C++` 编译器需要支持 `C++11` 标准。
首先下载对应的 [PaddlePaddle预测库](http://paddlepaddle.org/documentation/docs/zh/1.3/advanced_usage/deploy/inference/build_and_install_lib_cn.html) , 根据使用的 paddle 的版本和配置状况 (是否使用 avx, mkl, 以及 cuda, cudnn 版本) 选择下载对应的版本,并解压至 `inference` 目录,会得到 `fluid_inference` 子目录。
假设`paddle_infer_lib_path`是刚才解压得到的`fluid_inference`子目录的绝对路径,设置运行相关的环境变量(以 `cpu_avx_mkl` 版本为例)
``` bash
LD_LIBRARY_PATH=${paddle_infer_lib_path}/paddle/lib/:$LD_LIBRARY_PATH
LD_LIBRARY_PATH=${paddle_infer_lib_path}/third_party/install/mklml/lib:$LD_LIBRARY_PATH
LD_LIBRARY_PATH=${paddle_infer_lib_path}/third_party/install/mkldnn/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH
```
编译 demo
``` bash
mkdir build && cd build
cmake .. -DFLUID_INFER_LIB=${paddle_infer_lib_path}
make
```
这会在 `build` 目录下生成 `inference` 可执行文件。
运行 demo
```bash
./inference --logtostderr \
--model_dir $INFERENCE_MODEL_PATH \
--data $DATA_PATH \
--repeat $REPEAT_TIMES \
--output_prediction \
--use_gpu
```
参数 `repeat` 设置了执行预测的循环次数,一般在性能测试时可以设置其为大于 1 的某个整数,以观察多次预测的平均时间消耗。 在设置了 `output_prediction` 之后,预测程序会将每个样本的预测结果以概率的形式输出,其格式为:
```
样本id \t 类别0概率 \t 类别1概率 \t 类别2概率 ...
```
最后,在支持 NV GPUs 的环境中可以使能 `use_gpu`,否则就会在 CPU 上执行预测。
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import sys
sys.path.append("..")
from reader import cls
def main():
    """
    Print the test set of the selected task as text, one batch per line.

    For each batch produced by the task processor only the first four
    fields are written. A field is rendered as "<shape>:<data>" with
    space-separated numbers, and the fields of a batch are joined with ';'.
    """
    args = parse_args()
    task_name = args.task_name.lower()
    processor_by_task = {
        'xnli': cls.XnliProcessor,
        'cola': cls.ColaProcessor,
        'mrpc': cls.MrpcProcessor,
        'mnli': cls.MnliProcessor,
    }
    processor_cls = processor_by_task[task_name]
    processor = processor_cls(
        data_dir=args.data_path,
        vocab_path=args.vocab_path,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed)

    # The value is not used below; the call is kept because it may have
    # effects inside the processor -- TODO confirm and drop if it has none.
    example = processor.get_test_examples(args.data_path)[0]

    batches = processor.data_generator(
        args.batch_size, phase='test', epoch=1, shuffle=False)()
    for batch in batches:
        rendered = []
        for field in batch[:4]:
            shape_part = ' '.join(str(dim) for dim in field.shape)
            data_part = ' '.join(
                str(item) for item in field.reshape(-1).tolist())
            rendered.append(shape_part + ':' + data_part)
        print(';'.join(rendered))
def str2bool(v):
    """
    Convert a command-line string to a bool.

    argparse cannot turn strings such as "true" or "False" into Python
    booleans by itself. The comparison is case-insensitive; only "true",
    "t" and "1" give True, every other string gives False.
    """
    truthy_spellings = ("true", "t", "1")
    normalized = v.lower()
    return normalized in truthy_spellings
def parse_args():
    """
    Parse the command line of the demo data generator.

    Returns an argparse.Namespace with: task_name, batch_size, in_tokens,
    do_lower_case, vocab_path, data_path, max_seq_len and random_seed.
    vocab_path and data_path have no default and are None when omitted.
    """
    parser = argparse.ArgumentParser(prog="bert data prepare")
    add = parser.add_argument

    add("--task_name",
        type=str,
        default='xnli',
        choices=["xnli", "mnli", "cola", "mrpc"],
        help="task name, used to specify data preprocessor")
    add("--batch_size",
        type=int,
        default=4096,
        help="batch size, see also --in_tokens")
    add("--in_tokens",
        action='store_true',
        help="if set, batch_size means token number in a batch, otherwise "
        "it means example number in a batch")
    add('--do_lower_case',
        type=str2bool,
        default=True,
        choices=[True, False],
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    add("--vocab_path", type=str, help="path of vocabulary")
    add("--data_path", type=str, help="path of data to process")
    add("--max_seq_len", type=int, default=128, help="max sequence length")
    add("--random_seed", type=int, default=0, help="random seed")

    return parser.parse_args()
# Script entry point: generate the demo data and write it to stdout.
if __name__ == "__main__":
    main()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <paddle_inference_api.h>
#include <chrono>
#include <iostream>
#include <fstream>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
// Command-line flags of the demo (gflags); each is read below as FLAGS_<name>.
// model_dir and data are required: the program reports an error when either
// is empty.
DEFINE_string(model_dir, "", "Inference model directory.");
DEFINE_string(data, "", "Input data path.");
// Number of passes over the whole input; predictions are printed only during
// the first pass.
DEFINE_int32(repeat, 1, "Repeat times.");
// Number of float values that make up one sample in the first output tensor.
DEFINE_int32(num_labels, 3, "Number of labels.");
DEFINE_bool(output_prediction, false, "Whether to output the prediction results.");
DEFINE_bool(use_gpu, false, "Whether to use GPU for prediction.");
// Reads one value of type T from the stream with operator>>.
template <typename T>
void GetValueFromStream(std::stringstream *ss, T *t) {
  std::stringstream &stream = *ss;
  stream >> *t;
}

// Strings take the whole buffered content verbatim, whitespace included,
// instead of the single whitespace-delimited word operator>> would give.
template <>
void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
  t->assign(ss->str());
}
// Splits `line` on `sep`, converts every piece to T with GetValueFromStream
// and appends the results to `v` (existing elements of `v` are kept).
// Empty pieces between two separators are converted like any other piece;
// an empty piece after the last separator is dropped.
template <typename T>
void Split(const std::string &line, char sep, std::vector<T> *v) {
  std::istringstream pieces(line);
  std::stringstream piece_stream;
  std::string piece;
  T value;
  // std::getline yields one piece per separator plus a final piece only
  // when characters remain after the last separator.
  while (std::getline(pieces, piece, sep)) {
    piece_stream.str(piece);
    piece_stream.clear();  // drop error flags left by the previous piece
    GetValueFromStream<T>(&piece_stream, &value);
    v->push_back(std::move(value));
  }
}
// Compile-time mapping from a C++ element type to the paddle::PaddleDType
// enumerator used to tag a tensor. Only the primary template is declared
// here; the specializations below cover int64_t and float.
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
// int64_t elements are tagged as INT64.
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
// float elements are tagged as FLOAT32.
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
// Parse tensor from string
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
std::vector<std::string> data;
Split(field, ':', &data);
if (data.size() < 2) {
LOG(ERROR) << "parse tensor error!";
return false;
}
std::string shape_str = data[0];
std::vector<int> shape;
Split(shape_str, ' ', &shape);
std::string mat_str = data[1];
std::vector<T> mat;
Split(mat_str, ' ', &mat);
tensor->shape = shape;
auto size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
sizeof(T);
tensor->data.Resize(size);
std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
tensor->dtype = GetPaddleDType<T>();
return true;
}
// Parses one input line into the four feed tensors, in this order:
// src_id (int64), pos_id (int64), segment_id (int64), input_mask (float).
// Fields are separated by ';'; fields after the fourth are ignored.
// Returns false when the line has fewer than four fields or when any field
// fails to parse. `tensors` is cleared once the field count is accepted and
// is filled only when all four fields parse successfully.
bool ParseLine(const std::string &line,
               std::vector<paddle::PaddleTensor> *tensors) {
  std::vector<std::string> fields;
  Split(line, ';', &fields);
  if (fields.size() < 4) return false;

  tensors->clear();
  tensors->reserve(4);

  paddle::PaddleTensor src_id;
  paddle::PaddleTensor pos_id;
  paddle::PaddleTensor segment_id;
  paddle::PaddleTensor input_mask;

  // Stop at the first field that cannot be parsed instead of feeding a
  // half-initialized tensor to the predictor.
  const bool parsed = ParseTensor<int64_t>(fields[0], &src_id) &&
                      ParseTensor<int64_t>(fields[1], &pos_id) &&
                      ParseTensor<int64_t>(fields[2], &segment_id) &&
                      ParseTensor<float>(fields[3], &input_mask);
  if (!parsed) return false;

  tensors->push_back(std::move(src_id));
  tensors->push_back(std::move(pos_id));
  tensors->push_back(std::move(segment_id));
  tensors->push_back(std::move(input_mask));
  return true;
}
// Logs (at INFO level) the shape of `t` followed by its data, with the data
// buffer interpreted as an array of T.
template <typename T>
void PrintTensor(const paddle::PaddleTensor &t) {
  std::stringstream text;
  text << "Tensor: shape[";
  for (auto dim : t.shape) {
    text << dim << " ";
  }
  text << "], data[";

  const T *values = static_cast<T *>(t.data.data());
  // The buffer length is in bytes; convert it to a number of elements.
  const auto count = t.data.length() / sizeof(T);
  for (decltype(t.data.length() / sizeof(T)) idx = 0; idx < count; ++idx) {
    text << values[idx] << " ";
  }
  text << "]";

  LOG(INFO) << text.str();
}
// Logs every input tensor. Tensors tagged INT64 are printed as int64_t;
// every other dtype is printed as float.
void PrintInputs(const std::vector<paddle::PaddleTensor> &inputs) {
  for (auto it = inputs.begin(); it != inputs.end(); ++it) {
    const bool is_int64 = it->dtype == paddle::PaddleDType::INT64;
    if (!is_int64) {
      PrintTensor<float>(*it);
      continue;
    }
    PrintTensor<int64_t>(*it);
  }
}
// Prints the first output tensor to stdout, one sample per line:
//   <sample id> \t <value 0> \t <value 1> \t ...
// The tensor is read as floats, FLAGS_num_labels values per sample. `cnt` is
// the running sample id and is incremented once per printed line.
// Nothing is printed when `outputs` is empty or FLAGS_num_labels is not
// positive; trailing values that do not form a complete row are skipped.
void PrintOutputs(const std::vector<paddle::PaddleTensor> &outputs, int &cnt) {
  // front() on an empty vector is undefined, and a non-positive label count
  // would keep the row loop from ever advancing.
  if (outputs.empty() || FLAGS_num_labels <= 0) return;

  const paddle::PaddleTensor &scores = outputs.front();
  const float *values = static_cast<float *>(scores.data.data());
  const size_t total = scores.data.length() / sizeof(float);
  const size_t num_labels = static_cast<size_t>(FLAGS_num_labels);

  // Only complete rows are printed so no read goes past the buffer.
  for (size_t row = 0; row + num_labels <= total; row += num_labels) {
    std::cout << cnt << "\t";
    for (size_t col = 0; col < num_labels; ++col) {
      std::cout << values[row + col] << "\t";
    }
    std::cout << std::endl;
    cnt += 1;
  }
}
// Reads the file named by --data and appends one parsed feed (a vector of
// tensors) to `inputs` per line. Lines that fail to parse are reported with
// their zero-based line number and skipped.
// Returns false when --data is empty or the file cannot be opened.
bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs) {
  if (FLAGS_data.empty()) {
    LOG(ERROR) << "please set input data path";
    return false;
  }

  std::ifstream fin(FLAGS_data);
  // Without this check a missing file would look like an empty data set.
  if (!fin.is_open()) {
    LOG(ERROR) << "cannot open input data file: " << FLAGS_data;
    return false;
  }

  std::string line;
  int lineno = 0;
  while (std::getline(fin, line)) {
    std::vector<paddle::PaddleTensor> feed_data;
    if (!ParseLine(line, &feed_data)) {
      LOG(ERROR) << "Parse line[" << lineno << "] error!";
    } else {
      inputs->push_back(std::move(feed_data));
    }
    ++lineno;  // keep the reported line number in step with the file
  }

  return true;
}
int main(int argc, char *argv[]) {
google::InitGoogleLogging(*argv);
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir.empty()) {
LOG(ERROR) << "please set model dir";
return -1;
}
paddle::NativeConfig config;
config.model_dir = FLAGS_model_dir;
if (FLAGS_use_gpu) {
config.use_gpu = true;
config.fraction_of_gpu_memory = 0.15;
config.device = 0;
}
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<paddle::PaddleTensor>> inputs;
if (!LoadInputData(&inputs)) {
LOG(ERROR) << "load input data error!";
return -1;
}
std::vector<paddle::PaddleTensor> fetch;
int total_time{0};
int num_samples{0};
int out_cnt = 0;
for (int i = 0; i < FLAGS_repeat; i++) {
for (auto feed : inputs) {
fetch.clear();
auto start = std::chrono::system_clock::now();
predictor->Run(feed, &fetch);
if (FLAGS_output_prediction && i == 0) {
PrintOutputs(fetch, out_cnt);
}
auto end = std::chrono::system_clock::now();
if (!fetch.empty()) {
total_time +=
std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
.count();
num_samples += fetch.front().data.length() / FLAGS_num_labels / sizeof(float);
}
}
}
auto per_sample_ms =
static_cast<float>(total_time) / num_samples;
LOG(INFO) << "Run on " << num_samples
<< " samples over "<< FLAGS_repeat << " times, average latency: " << per_sample_ms
<< "ms per sample.";
return 0;
}
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model for classifier."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from model.bert import BertModel
def create_model(args,
                 pyreader_name,
                 bert_config,
                 num_labels,
                 is_prediction=False):
    """
    Build the classification network on top of BertModel.

    A py_reader feeds five tensors: source ids, position ids, sentence ids
    (int64, [-1, max_seq_len, 1]), the input mask (float32, same shape) and
    the labels (int64, [-1, 1]). The pooled BERT output goes through dropout
    and a fully connected layer with `num_labels` outputs whose parameters
    are named "cls_out_w" and "cls_out_b".

    Returns:
        When is_prediction is True: (pyreader, probs, feed_targets_name),
        where feed_targets_name lists the names of the four input variables
        (labels excluded).
        Otherwise: (pyreader, loss, probs, accuracy, num_seqs). The mean
        cross-entropy loss is multiplied by args.loss_scaling when
        args.use_fp16 is set and the scaling factor is greater than 1.0.
    """
    # Four sequence-shaped inputs followed by the label column.
    input_shapes = [[-1, args.max_seq_len, 1] for _ in range(4)]
    input_shapes.append([-1, 1])
    pyreader = fluid.layers.py_reader(
        capacity=50,
        shapes=input_shapes,
        dtypes=['int64', 'int64', 'int64', 'float32', 'int64'],
        lod_levels=[0, 0, 0, 0, 0],
        name=pyreader_name,
        use_double_buffer=True)

    inputs = fluid.layers.read_file(pyreader)
    (src_ids, pos_ids, sent_ids, input_mask, labels) = inputs

    bert = BertModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        input_mask=input_mask,
        config=bert_config,
        use_fp16=args.use_fp16)

    pooled = bert.get_pooled_output()
    cls_feats = fluid.layers.dropout(
        x=pooled,
        dropout_prob=0.1,
        dropout_implementation="upscale_in_train")

    weight_attr = fluid.ParamAttr(
        name="cls_out_w",
        initializer=fluid.initializer.TruncatedNormal(scale=0.02))
    bias_attr = fluid.ParamAttr(
        name="cls_out_b", initializer=fluid.initializer.Constant(0.))
    logits = fluid.layers.fc(
        input=cls_feats,
        size=num_labels,
        param_attr=weight_attr,
        bias_attr=bias_attr)

    if is_prediction:
        probs = fluid.layers.softmax(logits)
        feed_targets_name = [
            var.name for var in (src_ids, pos_ids, sent_ids, input_mask)
        ]
        return pyreader, probs, feed_targets_name

    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=logits, label=labels, return_softmax=True)
    loss = fluid.layers.mean(x=ce_loss)

    if args.use_fp16 and args.loss_scaling > 1.0:
        loss *= args.loss_scaling

    num_seqs = fluid.layers.create_tensor(dtype='int64')
    accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)

    return pyreader, loss, probs, accuracy, num_seqs
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from functools import partial
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
def multi_head_attention(queries,
                         keys,
                         values,
                         attn_bias,
                         d_key,
                         d_value,
                         d_model,
                         n_head=1,
                         dropout_rate=0.,
                         cache=None,
                         param_initializer=None,
                         name='multi_head_att'):
    """
    Multi-Head Attention.

    ``attn_bias`` is added to the attention logits before the softmax, so it
    can be used to mask selected positions out of the attention weights.

    Args:
        queries: 3-D tensor of queries.
        keys: 3-D tensor of keys, or None to reuse ``queries``.
        values: 3-D tensor of values, or None to reuse ``keys``.
        attn_bias: tensor added to the scaled q*k^T product, or None to skip.
        d_key: per-head size of the query/key projections.
        d_value: per-head size of the value projection.
        d_model: size of the output projection.
        n_head: number of attention heads.
        dropout_rate: dropout probability applied to the attention weights;
            a falsy value disables dropout.
        cache: optional dict with entries "k" and "v" holding the keys/values
            of previous time steps; it is updated in place.
        param_initializer: initializer for the projection weights.
        name: prefix for the parameter names created by this layer.

    Returns:
        The attention output projected to ``d_model``.

    Raises:
        ValueError: if queries, keys and values are not all 3-D.
    """
    keys = queries if keys is None else keys
    values = keys if values is None else values

    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs: quries, keys and values should all be 3-D tensors.")

    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Add linear projection to queries, keys, and values.
        """
        q = layers.fc(input=queries,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_query_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_query_fc.b_0')
        k = layers.fc(input=keys,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_key_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_key_fc.b_0')
        v = layers.fc(input=values,
                      size=d_value * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_value_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_value_fc.b_0')
        return q, k, v

    def __split_heads(x, n_head):
        """
        Reshape the last dimension of input tensor x so that it becomes two
        dimensions and then transpose. Specifically, input a tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        hidden_size = x.shape[-1]
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        reshaped = layers.reshape(
            x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)

        # Permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of input tensor x
        so that it becomes one dimension, which is reverse to __split_heads.
        """
        if len(x.shape) == 3:
            return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")

        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        return layers.reshape(
            x=trans_x,
            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
            inplace=True)

    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention
        """
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        # Compare against None explicitly: attn_bias is a tensor, and relying
        # on the truth value of a tensor object is fragile.
        if attn_bias is not None:
            product += attn_bias
        weights = layers.softmax(product)
        if dropout_rate:
            weights = layers.dropout(
                weights,
                dropout_prob=dropout_rate,
                dropout_implementation="upscale_in_train",
                is_test=False)
        out = layers.matmul(weights, v)
        return out

    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)

    if cache is not None:  # use cache and concat time steps
        # Since the inplace reshape in __split_heads changes the shape of k and
        # v, which is the cache input for next time step, reshape the cache
        # input from the previous time step first.
        # NOTE(review): the cache is reshaped to d_model for both k and v;
        # this presumably assumes d_key * n_head == d_value * n_head == d_model.
        k = cache["k"] = layers.concat(
            [layers.reshape(
                cache["k"], shape=[0, 0, d_model]), k], axis=1)
        v = cache["v"] = layers.concat(
            [layers.reshape(
                cache["v"], shape=[0, 0, d_model]), v], axis=1)

    q = __split_heads(q, n_head)
    k = __split_heads(k, n_head)
    v = __split_heads(v, n_head)

    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
                                                  dropout_rate)

    out = __combine_heads(ctx_multiheads)

    # Project back to the model size.
    proj_out = layers.fc(input=out,
                         size=d_model,
                         num_flatten_dims=2,
                         param_attr=fluid.ParamAttr(
                             name=name + '_output_fc.w_0',
                             initializer=param_initializer),
                         bias_attr=name + '_output_fc.b_0')
    return proj_out
def positionwise_feed_forward(x,
                              d_inner_hid,
                              d_hid,
                              dropout_rate,
                              hidden_act,
                              param_initializer=None,
                              name='ffn'):
    """
    Position-wise Feed-Forward Network.

    Two fully connected layers applied with ``num_flatten_dims=2``: the first
    maps to ``d_inner_hid`` and uses the ``hidden_act`` activation, the second
    maps to ``d_hid``. Dropout is applied between them when ``dropout_rate``
    is truthy.
    """
    inner_weight = fluid.ParamAttr(
        name=name + '_fc_0.w_0', initializer=param_initializer)
    inner = layers.fc(input=x,
                      size=d_inner_hid,
                      num_flatten_dims=2,
                      act=hidden_act,
                      param_attr=inner_weight,
                      bias_attr=name + '_fc_0.b_0')

    if dropout_rate:
        inner = layers.dropout(
            inner,
            dropout_prob=dropout_rate,
            dropout_implementation="upscale_in_train",
            is_test=False)

    outer_weight = fluid.ParamAttr(
        name=name + '_fc_1.w_0', initializer=param_initializer)
    return layers.fc(input=inner,
                     size=d_hid,
                     num_flatten_dims=2,
                     param_attr=outer_weight,
                     bias_attr=name + '_fc_1.b_0')
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
                           name=''):
    """
    Optionally add a residual connection, layer normalization and dropout to
    ``out``, in the order given by the characters of ``process_cmd``.

    This is used before or after multi-head attention and the position-wise
    feed-forward network.

    Args:
        prev_out: tensor added to ``out`` by the "a" command, or None to make
            "a" a no-op.
        out: tensor to process.
        process_cmd: string of commands; "a" adds the residual, "n" applies
            layer normalization, "d" applies dropout. Other characters are
            ignored.
        dropout_rate: dropout probability for "d"; a falsy value disables it.
        name: prefix for the layer-norm parameter names.

    Returns:
        The processed tensor.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            # Test for None explicitly: the truth value of a tensor/array is
            # not a reliable "is it present" check.
            if prev_out is not None:
                out = out + prev_out
        elif cmd == "n":  # add layer normalization
            out_dtype = out.dtype
            # FP16 input is cast to float32 for layer_norm and cast back after.
            if out_dtype == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float32")
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_scale',
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(
                    name=name + '_layer_norm_bias',
                    initializer=fluid.initializer.Constant(0.)))
            if out_dtype == fluid.core.VarDesc.VarType.FP16:
                out = layers.cast(x=out, dtype="float16")
        elif cmd == "d":  # add dropout
            if dropout_rate:
                out = layers.dropout(
                    out,
                    dropout_prob=dropout_rate,
                    dropout_implementation="upscale_in_train",
                    is_test=False)
    return out


# Pre-processing never has a residual input, so prev_out is bound to None.
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
def encoder_layer(enc_input,
                  attn_bias,
                  n_head,
                  d_key,
                  d_value,
                  d_model,
                  d_inner_hid,
                  prepostprocess_dropout,
                  attention_dropout,
                  relu_dropout,
                  hidden_act,
                  preprocess_cmd="n",
                  postprocess_cmd="da",
                  param_initializer=None,
                  name=''):
    """One stackable encoder layer.

    Runs multi-head self-attention and then the position-wise feed-forward
    network. Each sub-layer's input goes through pre_process_layer
    (``preprocess_cmd``) and its output through post_process_layer
    (``postprocess_cmd``) together with the sub-layer's unprocessed input.
    """
    # Self-attention sub-layer: keys and values default to the queries.
    attn_input = pre_process_layer(
        enc_input,
        preprocess_cmd,
        prepostprocess_dropout,
        name=name + '_pre_att')
    attn_output = multi_head_attention(
        attn_input,
        None,
        None,
        attn_bias,
        d_key,
        d_value,
        d_model,
        n_head,
        attention_dropout,
        param_initializer=param_initializer,
        name=name + '_multi_head_att')
    attn_output = post_process_layer(
        enc_input,
        attn_output,
        postprocess_cmd,
        prepostprocess_dropout,
        name=name + '_post_att')

    # Feed-forward sub-layer.
    ffn_input = pre_process_layer(
        attn_output,
        preprocess_cmd,
        prepostprocess_dropout,
        name=name + '_pre_ffn')
    ffd_output = positionwise_feed_forward(
        ffn_input,
        d_inner_hid,
        d_model,
        relu_dropout,
        hidden_act,
        param_initializer=param_initializer,
        name=name + '_ffn')
    return post_process_layer(
        attn_output,
        ffd_output,
        postprocess_cmd,
        prepostprocess_dropout,
        name=name + '_post_ffn')
def encoder(enc_input,
            attn_bias,
            n_layer,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            hidden_act,
            preprocess_cmd="n",
            postprocess_cmd="da",
            param_initializer=None,
            name=''):
    """
    Stack ``n_layer`` encoder layers built by encoder_layer.

    Each layer is named ``name + '_layer_' + str(i)`` and consumes the output
    of the previous one. The final output is passed through pre_process_layer
    with ``preprocess_cmd`` under the fixed name "post_encoder".

    Returns:
        The processed output of the last layer. With ``n_layer == 0`` the
        input itself goes straight to the final processing step.
    """
    # Start from the input so the final step is defined even with no layers
    # (previously enc_output was unbound when n_layer == 0).
    enc_output = enc_input
    for i in range(n_layer):
        enc_output = encoder_layer(
            enc_output,
            attn_bias,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            hidden_act,
            preprocess_cmd,
            postprocess_cmd,
            param_initializer=param_initializer,
            name=name + '_layer_' + str(i))

    enc_output = pre_process_layer(
        enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")

    return enc_output
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from utils.fp16 import create_master_params_grads, master_param_to_train_param
def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
    """Build a learning rate that warms up linearly, then decays linearly.

    While the global step is below ``warmup_steps`` the rate is
    ``learning_rate * global_step / warmup_steps``; afterwards it follows a
    power-1 polynomial decay to 0.0 over ``num_train_steps``.

    Returns:
        The global variable "scheduled_learning_rate" holding the rate.
    """
    scheduler = fluid.layers.learning_rate_scheduler
    tensor_ops = fluid.layers.tensor

    with fluid.default_main_program()._lr_schedule_guard():
        lr = tensor_ops.create_global_var(
            shape=[1],
            value=0.0,
            dtype='float32',
            persistable=True,
            name="scheduled_learning_rate")

        global_step = scheduler._decay_step_counter()

        with fluid.layers.control_flow.Switch() as switch:
            in_warmup = global_step < warmup_steps
            with switch.case(in_warmup):
                tensor_ops.assign(
                    learning_rate * (global_step / warmup_steps), lr)
            with switch.default():
                tensor_ops.assign(
                    scheduler.polynomial_decay(
                        learning_rate=learning_rate,
                        decay_steps=num_train_steps,
                        end_learning_rate=0.0,
                        power=1.0,
                        cycle=False),
                    lr)

        return lr
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 loss_scaling=1.0):
    """Attach an Adam optimizer with LR scheduling, clipping and weight decay.

    Args:
        loss: loss variable to minimize.
        warmup_steps: number of warmup steps; when <= 0 the constant
            ``learning_rate`` is used without any scheduler.
        num_train_steps: total steps, used by 'linear_warmup_decay'.
        learning_rate: peak / constant learning rate.
        train_program: program whose parameters are optimized.
        startup_prog: startup program (used for fp16 master parameters).
        weight_decay: decoupled weight-decay coefficient; <= 0 disables it.
        scheduler: 'noam_decay' or 'linear_warmup_decay'.
        use_fp16: whether to optimize fp32 master copies of fp16 parameters.
        loss_scaling: loss scaling factor used with fp16.

    Returns:
        The scheduled learning rate (a variable, or the float
        ``learning_rate`` when no warmup is used).

    Raises:
        ValueError: if ``scheduler`` is unknown while ``warmup_steps > 0``.
    """
    master_suffix = ".master"

    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler.noam_decay(
                1 / (warmup_steps * (learning_rate**2)), warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unkown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
    else:
        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
        scheduled_lr = learning_rate

    clip_norm_thres = 1.0
    # When using mixed precision training, scale the gradient clip threshold
    # by loss_scaling
    if use_fp16 and loss_scaling > 1.0:
        clip_norm_thres *= loss_scaling
    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))

    def exclude_from_weight_decay(name):
        """Return True for layer-norm and bias parameters."""
        if name.find("layer_norm") > -1:
            return True
        return name.endswith(("_bias", "_b", ".b_0"))

    def strip_master_suffix(name):
        """Drop one trailing ".master" from a parameter name.

        str.rstrip(".master") must not be used here: it strips any run of the
        characters '.', 'm', 'a', 's', 't', 'e', 'r', so for example
        "x_bias.master" would become "x_bi" and escape the bias exclusion.
        """
        if name.endswith(master_suffix):
            return name[:-len(master_suffix)]
        return name

    # Snapshot of each parameter taken before the optimizer update; the
    # weight-decay term is computed from these values.
    param_list = dict()

    if use_fp16:
        param_grads = optimizer.backward(loss)
        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)

        for param, _ in master_param_grads:
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        optimizer.apply_gradients(master_param_grads)

        if weight_decay > 0:
            for param, grad in master_param_grads:
                if exclude_from_weight_decay(strip_master_suffix(param.name)):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

        master_param_to_train_param(master_param_grads, param_grads,
                                    train_program)

    else:
        for param in train_program.global_block().all_parameters():
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        _, param_grads = optimizer.minimize(loss)

        if weight_decay > 0:
            for param, grad in param_grads:
                if exclude_from_weight_decay(param.name):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

    return scheduled_lr
此差异已折叠。
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from __future__ import division
import os
import numpy as np
import types
import gzip
import logging
import re
import six
import collections
import tokenization
import paddle
import paddle.fluid as fluid
from batching import prepare_batch_data
class DataReader(object):
    """Reads gzip-compressed pre-training samples and yields padded batches."""

    def __init__(self,
                 data_dir,
                 vocab_path,
                 batch_size=4096,
                 in_tokens=True,
                 max_seq_len=512,
                 shuffle_files=True,
                 epoch=100,
                 voc_size=0,
                 is_test=False,
                 generate_neg_sample=False):
        """Store the configuration and load the vocabulary.

        Args:
            data_dir: directory containing the ``.gz`` data files.
            vocab_path: vocabulary file; must define [PAD], [CLS], [SEP]
                and [MASK].
            batch_size: tokens per batch when ``in_tokens`` is true, otherwise
                samples per batch.
            in_tokens: whether ``batch_size`` counts tokens.
            max_seq_len: samples longer than this are dropped.
            shuffle_files: shuffle the file order at each epoch.
            epoch: number of passes over the files.
            voc_size: vocabulary size forwarded to prepare_batch_data.
            is_test: when true, forces a single unshuffled epoch.
            generate_neg_sample: mix generated negative samples into training.
        """
        self.vocab = self.load_vocab(vocab_path)
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.in_tokens = in_tokens
        self.shuffle_files = shuffle_files
        self.epoch = epoch
        self.current_epoch = 0
        self.current_file_index = 0
        self.total_file = 0
        self.current_file = None
        self.voc_size = voc_size
        self.max_seq_len = max_seq_len
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.mask_id = self.vocab["[MASK]"]
        self.is_test = is_test
        self.generate_neg_sample = generate_neg_sample
        if self.in_tokens:
            assert self.batch_size >= self.max_seq_len, "The number of " \
                   "tokens in batch should not be smaller than max seq length."

        if self.is_test:
            self.epoch = 1
            self.shuffle_files = False

    def get_progress(self):
        """Return (current_epoch, current_file_index, total_file, current_file)."""
        return self.current_epoch, self.current_file_index, self.total_file, self.current_file

    def parse_line(self, line, max_seq_len=512):
        """Parse one raw line into [token_ids, sent_ids, pos_ids, label].

        ``line`` is a bytes line with four ';'-separated fields; the first
        three are space-separated integers of equal length, the last is an
        integer label. Returns None when the sample is longer than
        ``max_seq_len``.
        """
        line = line.strip().decode().split(";")
        assert len(line) == 4, "One sample must have 4 fields!"
        (token_ids, sent_ids, pos_ids, label) = line
        token_ids = [int(token) for token in token_ids.split(" ")]
        sent_ids = [int(token) for token in sent_ids.split(" ")]
        pos_ids = [int(token) for token in pos_ids.split(" ")]
        assert len(token_ids) == len(sent_ids) == len(
            pos_ids
        ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids)"
        label = int(label)
        if len(token_ids) > max_seq_len:
            return None
        return [token_ids, sent_ids, pos_ids, label]

    def read_file(self, file):
        """Yield the parsed samples of one gzip file inside ``data_dir``."""
        assert file.endswith('.gz'), "[ERROR] %s is not a gzip file" % file
        file_path = os.path.join(self.data_dir, file)
        with gzip.open(file_path, "rb") as f:
            for line in f:
                parsed_line = self.parse_line(
                    line, max_seq_len=self.max_seq_len)
                if parsed_line is None:
                    continue
                yield parsed_line

    def convert_to_unicode(self, text):
        """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
        if six.PY3:
            if isinstance(text, str):
                return text
            elif isinstance(text, bytes):
                return text.decode("utf-8", "ignore")
            else:
                raise ValueError("Unsupported string type: %s" % (type(text)))
        elif six.PY2:
            if isinstance(text, str):
                return text.decode("utf-8", "ignore")
            elif isinstance(text, unicode):
                return text
            else:
                raise ValueError("Unsupported string type: %s" % (type(text)))
        else:
            raise ValueError("Not running on Python2 or Python 3?")

    def load_vocab(self, vocab_file):
        """Load a vocabulary file into an ordered token -> id dictionary.

        Each line is either ``token`` (id = line number) or ``token<TAB>id``.
        Reading stops at the first line with more than two tab-separated
        fields.
        """
        vocab = collections.OrderedDict()
        # Use a context manager so the file handle is always closed.
        with open(vocab_file) as fin:
            for num, line in enumerate(fin):
                items = self.convert_to_unicode(line.strip()).split("\t")
                if len(items) > 2:
                    break
                token = items[0]
                index = items[1] if len(items) == 2 else num
                token = token.strip()
                vocab[token] = int(index)
        return vocab

    def random_pair_neg_samples(self, pos_samples):
        """Randomly generate negative samples from positive samples.

        ``pos_samples`` is shuffled in place; sample i contributes its first
        segment (up to and including the first separator) and sample i+1 its
        remainder after the first separator.

        Args:
            pos_samples: list of positive samples
        Returns:
            (neg_samples, miss_num): the generated negative samples and the
            number of candidates dropped for reaching ``max_seq_len``.
        """
        np.random.shuffle(pos_samples)
        num_sample = len(pos_samples)
        neg_samples = []
        miss_num = 0

        for i in range(num_sample):
            pair_index = (i + 1) % num_sample
            origin_src_ids = pos_samples[i][0]
            # NOTE(review): the separator id is hard-coded as 2 here rather
            # than taken from self.sep_id -- confirm it matches the vocab.
            origin_sep_index = origin_src_ids.index(2)
            pair_src_ids = pos_samples[pair_index][0]
            pair_sep_index = pair_src_ids.index(2)

            src_ids = origin_src_ids[:origin_sep_index + 1] + pair_src_ids[
                pair_sep_index + 1:]
            if len(src_ids) >= self.max_seq_len:
                miss_num += 1
                continue
            sent_ids = [0] * len(origin_src_ids[:origin_sep_index + 1]) + [
                1
            ] * len(pair_src_ids[pair_sep_index + 1:])
            pos_ids = list(range(len(src_ids)))
            neg_sample = [src_ids, sent_ids, pos_ids, 0]
            assert len(src_ids) == len(sent_ids) == len(
                pos_ids
            ), "[ERROR]len(src_id) == lne(sent_id) == len(pos_id) must be True"
            neg_samples.append(neg_sample)
        return neg_samples, miss_num

    def mixin_negtive_samples(self, pos_sample_generator, buffer=1000):
        """Mix generated negative samples into a stream of positive samples.

        1. generate negative samples by randomly pairing sentence_1 and
           sentence_2 of positive samples
        2. combine negative samples and positive samples

        Args:
            pos_sample_generator: a generator producing a parsed positive
                sample, which is a list: [token_ids, sent_ids, pos_ids, 1]
            buffer: number of positive samples gathered before pairing.
        Yields:
            one sample from the shuffled positive and negative samples; a
            single None is yielded when the trailing buffer is empty.
        """
        pos_samples = []
        num_total_miss = 0
        pos_sample_num = 0
        try:
            while True:
                while len(pos_samples) < buffer:
                    pos_sample = next(pos_sample_generator)
                    label = pos_sample[3]
                    assert label == 1, "positive sample's label must be 1"
                    pos_samples.append(pos_sample)
                    pos_sample_num += 1

                neg_samples, miss_num = self.random_pair_neg_samples(
                    pos_samples)
                num_total_miss += miss_num
                samples = pos_samples + neg_samples
                pos_samples = []
                np.random.shuffle(samples)
                for sample in samples:
                    yield sample
        except StopIteration:
            print("stopiteration: reach end of file")
            if len(pos_samples) == 1:
                yield pos_samples[0]
            elif len(pos_samples) == 0:
                yield None
            else:
                neg_samples, miss_num = self.random_pair_neg_samples(
                    pos_samples)
                num_total_miss += miss_num
                samples = pos_samples + neg_samples
                pos_samples = []
                np.random.shuffle(samples)
                for sample in samples:
                    yield sample
            # An empty input has no ideal sample count; report a 0.0 miss
            # rate instead of dividing by zero.
            ideal_total = pos_sample_num * 2
            miss_rate = num_total_miss / ideal_total if ideal_total else 0.0
            print("miss_num:%d\tideal_total_sample_num:%d\tmiss_rate:%f" %
                  (num_total_miss, ideal_total, miss_rate))

    def data_generator(self):
        """Return a generator function yielding batches from prepare_batch_data.

        The files of ``data_dir`` are listed once, when this method is called.
        """
        files = os.listdir(self.data_dir)
        self.total_file = len(files)
        assert self.total_file > 0, "[Error] data_dir is empty"

        def wrapper():
            def reader():
                # Stream samples file by file for every epoch, updating the
                # progress counters exposed by get_progress().
                for epoch in range(self.epoch):
                    self.current_epoch = epoch + 1
                    if self.shuffle_files:
                        np.random.shuffle(files)
                    for index, file in enumerate(files):
                        self.current_file_index = index + 1
                        self.current_file = file
                        sample_generator = self.read_file(file)
                        if not self.is_test and self.generate_neg_sample:
                            sample_generator = self.mixin_negtive_samples(
                                sample_generator)
                        for sample in sample_generator:
                            if sample is None:
                                continue
                            yield sample

            def batch_reader(reader, batch_size, in_tokens):
                # Group samples into batches. In token mode a sample is added
                # only while (batch length + 1) * longest sample seen so far
                # still fits in batch_size.
                batch, total_token_num, max_len = [], 0, 0
                for parsed_line in reader():
                    token_ids, sent_ids, pos_ids, label = parsed_line
                    max_len = max(max_len, len(token_ids))
                    if in_tokens:
                        to_append = (len(batch) + 1) * max_len <= batch_size
                    else:
                        to_append = len(batch) < batch_size
                    if to_append:
                        batch.append(parsed_line)
                        total_token_num += len(token_ids)
                    else:
                        yield batch, total_token_num
                        batch, total_token_num, max_len = [parsed_line], len(
                            token_ids), len(token_ids)

                if len(batch) > 0:
                    yield batch, total_token_num

            for batch_data, total_token_num in batch_reader(
                    reader, self.batch_size, self.in_tokens):
                yield prepare_batch_data(
                    batch_data,
                    total_token_num,
                    voc_size=self.voc_size,
                    pad_id=self.pad_id,
                    cls_id=self.cls_id,
                    sep_id=self.sep_id,
                    mask_id=self.mask_id,
                    return_input_mask=True,
                    return_max_len=False,
                    return_num_token=False)

        return wrapper
# Running this module as a script is a deliberate no-op.
if __name__ == "__main__":
    pass
此差异已折叠。
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on SQuAD."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import multiprocessing
import os
import time
import numpy as np
import paddle
import paddle.fluid as fluid
from reader.squad import DataProcessor, write_predictions
from model.bert import BertConfig, BertModel
from utils.args import ArgumentGroup, print_arguments, check_cuda
from optimization import optimization
from utils.init import init_pretraining_params, init_checkpoint
# Command-line interface. Options are registered in named groups and parsed
# once at import time into the module-level `args`, which the functions below
# read directly.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
# Model configuration and checkpoint locations.
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
                "Init pre-training params which preforms fine-tuning from. If the "
                "arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
# Optimization hyper-parameters.
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
                "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
                "Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("loss_scaling", float, 1.0,
                "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
# Logging verbosity and frequency.
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
# Input files and SQuAD pre/post-processing options.
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("train_file", str, None, "SQuAD json for training. E.g., train-v1.1.json.")
data_g.add_arg("predict_file", str, None, "SQuAD json for predictions. E.g. dev-v1.1.json or test-v1.1.json.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("version_2_with_negative", bool, False,
               "If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("max_query_length", int, 64, "Max query length.")
data_g.add_arg("max_answer_length", int, 30, "Max answer length.")
data_g.add_arg("batch_size", int, 12, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
               "If set, the batch size will be the maximum number of tokens in one batch. "
               "Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
               "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("doc_stride", int, 128,
               "When splitting up a long document into chunks, how much stride to take between chunks.")
data_g.add_arg("n_best_size", int, 20,
               "The total number of n-best predictions to generate in the nbest_predictions.json output file.")
data_g.add_arg("null_score_diff_threshold", float, 0.0,
               "If null_score - best_non_null is greater than the threshold predict null.")
data_g.add_arg("random_seed", int, 0, "Random seed.")
# Device selection and which phases (train / predict) to run.
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "Ihe iteration intervals to clean up temporary variables.")
run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
run_type_g.add_arg("do_predict", bool, True, "Whether to perform prediction.")
args = parser.parse_args()
# yapf: enable.
def create_model(pyreader_name, bert_config, is_training=False):
    """Build the SQuAD span-prediction graph on top of BERT.

    A py_reader named ``pyreader_name`` feeds four per-token inputs followed
    by either the start/end positions (training) or the example unique id
    (inference). The BERT sequence output is projected to two logits per
    token, split into start and end logits.

    Returns:
        Training: ``(pyreader, total_loss, num_seqs)``.
        Inference: ``(pyreader, unique_id, start_logits, end_logits,
        num_seqs)``.
    """
    # Four [-1, max_seq_len, 1] slots are common to both modes.
    slot_shapes = [[-1, args.max_seq_len, 1] for _ in range(4)]
    slot_dtypes = ['int64', 'int64', 'int64', 'float32']
    if is_training:
        # start_positions and end_positions
        slot_shapes += [[-1, 1], [-1, 1]]
        slot_dtypes += ['int64', 'int64']
    else:
        # unique_id
        slot_shapes += [[-1, 1]]
        slot_dtypes += ['int64']

    pyreader = fluid.layers.py_reader(
        capacity=50,
        shapes=slot_shapes,
        dtypes=slot_dtypes,
        lod_levels=[0] * len(slot_shapes),
        name=pyreader_name,
        use_double_buffer=True)
    fields = fluid.layers.read_file(pyreader)
    if is_training:
        (src_ids, pos_ids, sent_ids, input_mask, start_positions,
         end_positions) = fields
    else:
        (src_ids, pos_ids, sent_ids, input_mask, unique_id) = fields

    bert = BertModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        input_mask=input_mask,
        config=bert_config,
        use_fp16=args.use_fp16)

    sequence_output = bert.get_sequence_output()

    span_weight = fluid.ParamAttr(
        name="cls_squad_out_w",
        initializer=fluid.initializer.TruncatedNormal(scale=0.02))
    span_bias = fluid.ParamAttr(
        name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.))
    logits = fluid.layers.fc(
        input=sequence_output,
        size=2,
        num_flatten_dims=2,
        param_attr=span_weight,
        bias_attr=span_bias)

    # Move the size-2 logit axis to the front, then split it into the start
    # and end logits.
    logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
    start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)

    # Sum of a batch-sized tensor of ones.
    batch_ones = fluid.layers.fill_constant_batch_size_like(
        input=start_logits, dtype='int64', shape=[1], value=1)
    num_seqs = fluid.layers.reduce_sum(input=batch_ones)

    if not is_training:
        return pyreader, unique_id, start_logits, end_logits, num_seqs

    def compute_loss(logits, positions):
        """Mean softmax cross-entropy of ``logits`` against ``positions``."""
        per_example = fluid.layers.softmax_with_cross_entropy(
            logits=logits, label=positions)
        return fluid.layers.mean(x=per_example)

    start_loss = compute_loss(start_logits, start_positions)
    end_loss = compute_loss(end_logits, end_positions)
    total_loss = (start_loss + end_loss) / 2.0
    if args.use_fp16 and args.loss_scaling > 1.0:
        total_loss = total_loss * args.loss_scaling

    return pyreader, total_loss, num_seqs
# Raw model output for one feature: its unique id plus the start/end logits.
RawResult = collections.namedtuple(
    "RawResult", "unique_id start_logits end_logits")
def predict(test_exe, test_program, test_pyreader, fetch_list, processor):
    """Run inference over the predict set and write the prediction files.

    Drains ``test_pyreader`` through ``test_exe`` and collects one RawResult
    per row, then calls write_predictions to write predictions.json,
    nbest_predictions.json and null_odds.json under ``args.checkpoints``
    (created if missing).

    Args:
        test_exe: executor used to run ``test_program``.
        test_program: inference program.
        test_pyreader: reader feeding ``test_program``; it is started here
            and reset once exhausted.
        fetch_list: names of the four fetched variables, in the order
            unique ids, start logits, end logits, num_seqs.
        processor: data processor providing ``predict_examples`` and
            ``get_features``.
    """
    if not os.path.exists(args.checkpoints):
        os.makedirs(args.checkpoints)
    output_prediction_file = os.path.join(args.checkpoints, "predictions.json")
    output_nbest_file = os.path.join(args.checkpoints, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(args.checkpoints, "null_odds.json")

    test_pyreader.start()
    all_results = []
    while True:
        try:
            # The fourth fetch (num_seqs) is not needed here.
            np_unique_ids, np_start_logits, np_end_logits, _ = test_exe.run(
                fetch_list=fetch_list, program=test_program)
            for idx in range(np_unique_ids.shape[0]):
                if len(all_results) % 1000 == 0:
                    print("Processing example: %d" % len(all_results))
                unique_id = int(np_unique_ids[idx])
                start_logits = [float(x) for x in np_start_logits[idx].flat]
                end_logits = [float(x) for x in np_end_logits[idx].flat]
                all_results.append(
                    RawResult(
                        unique_id=unique_id,
                        start_logits=start_logits,
                        end_logits=end_logits))
        except fluid.core.EOFException:
            # The reader is exhausted: reset it and stop.
            test_pyreader.reset()
            break

    features = processor.get_features(
        processor.predict_examples, is_training=False)
    write_predictions(processor.predict_examples, features, all_results,
                      args.n_best_size, args.max_answer_length,
                      args.do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file,
                      args.version_2_with_negative,
                      args.null_score_diff_threshold, args.verbose)
def train(args):
    """Fine-tune BERT on SQuAD and/or run prediction, as selected by ``args``.

    Builds the training and/or inference program, initializes parameters from
    a checkpoint or pre-training params, runs the training loop with periodic
    logging and checkpointing, and finally runs predict().

    Args:
        args: parsed command-line arguments.

    Raises:
        ValueError: if neither ``do_train`` nor ``do_predict`` is set, or if
            only prediction is requested without ``init_checkpoint``.
    """
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if not (args.do_train or args.do_predict):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(
        vocab_path=args.vocab_path,
        do_lower_case=args.do_lower_case,
        max_seq_length=args.max_seq_len,
        in_tokens=args.in_tokens,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            version_2_with_negative=args.version_2_with_negative,
            epoch=args.epoch)

        num_train_examples = processor.get_num_examples(phase='train')
        if args.in_tokens:
            # batch_size counts tokens: convert it to a number of sequences.
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, num_seqs = create_model(
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    is_training=True)

                scheduled_lr = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    loss_scaling=args.loss_scaling)

                fluid.memory_optimize(
                    train_program, skip_opt_set=[loss.name, num_seqs.name])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training:  %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_predict:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # Keep the inference graph's counter under its own name so it
                # does not overwrite the training `num_seqs` that the training
                # loop fetches below.
                (test_pyreader, unique_ids, start_logits, end_logits,
                 test_num_seqs) = create_model(
                     pyreader_name='test_reader',
                     bert_config=bert_config,
                     is_training=False)

                fluid.memory_optimize(
                    test_prog,
                    skip_opt_set=[
                        unique_ids.name, start_logits.name, end_logits.name,
                        test_num_seqs.name
                    ])

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_predict:
        if not args.init_checkpoint:
            # The two literals are concatenated, so the first one needs a
            # trailing space.
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing prediction!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=loss.name,
            exec_strategy=exec_strategy,
            main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)

        train_pyreader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        while steps < max_train_steps:
            try:
                steps += 1
                # Only fetch values on logging steps; the learning rate is a
                # fetchable variable only when a scheduler is in use.
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        np_loss, np_num_seqs = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            np_lr[0]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()

                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.sum(total_cost) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0 or steps == max_train_steps:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
            except fluid.core.EOFException:
                # The reader ran out of data before max_train_steps: save a
                # final checkpoint and stop.
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.do_predict:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(
                data_path=args.predict_file,
                batch_size=args.batch_size,
                phase='predict',
                shuffle=False,
                dev_count=1,
                epoch=1))

        predict(exe, test_prog, test_pyreader, [
            unique_ids.name, start_logits.name, end_logits.name,
            test_num_seqs.name
        ], processor)
# Script entry point: print the parsed arguments, call check_cuda with the
# use_cuda flag, then run training and/or prediction.
if __name__ == '__main__':
    print_arguments(args)
    check_cuda(args.use_cuda)
    train(args)
#!/bin/bash
# Launches two ./train.sh workers in the background, one per GPU, both on
# 127.0.0.1. Each worker's output goes to its own log file (0.log / 1.log).
# -x: trace every command; -e: abort on the first failing command.
set -xe
# Paddle debug envs
export GLOG_v=1
export GLOG_logtostderr=1
# Unset proxy
unset https_proxy http_proxy
# NCCL debug envs
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
# Comment this out if your NCCL supports IB
export NCCL_IB_DISABLE=1
# Add your nodes' endpoints here (comma-separated list of all workers).
export worker_endpoints=127.0.0.1:9184,127.0.0.1:9185
# Worker 0: first endpoint, GPU 0.
export current_endpoint=127.0.0.1:9184
export CUDA_VISIBLE_DEVICES=0
./train.sh -local n > 0.log 2>&1 &
# Worker 1: second endpoint, GPU 1.
export current_endpoint=127.0.0.1:9185
export CUDA_VISIBLE_DEVICES=1
./train.sh -local n > 1.log 2>&1 &
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
def convert_to_unicode(text):
    """Return `text` as a Unicode string, decoding byte strings as UTF-8.

    Undecodable bytes are dropped ("ignore"). Raises ValueError when `text`
    is neither a text nor a byte string, or when the interpreter is neither
    Python 2 nor Python 3.
    """
    if six.PY3:
        # Python 3: `str` is already Unicode, `bytes` must be decoded.
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        if isinstance(text, str):
            return text
        raise ValueError("Unsupported string type: %s" % (type(text)))

    if six.PY2:
        # Python 2: `unicode` is already Unicode, `str` holds raw bytes.
        if isinstance(text, unicode):  # noqa: F821 - Python 2 only name
            return text
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        raise ValueError("Unsupported string type: %s" % (type(text)))

    raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
    """Return `text` as the native `str` type, suitable for printing.

    The native `str` is a Unicode string on Python 3 and a byte string on
    Python 2, so the conversion direction depends on the interpreter.
    Raises ValueError for unsupported input types or interpreters.
    """
    if six.PY3:
        # Native str is Unicode: decode bytes, pass text through.
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        if isinstance(text, str):
            return text
        raise ValueError("Unsupported string type: %s" % (type(text)))

    if six.PY2:
        # Native str is bytes: encode unicode, pass byte strings through.
        if isinstance(text, unicode):  # noqa: F821 - Python 2 only name
            return text.encode("utf-8")
        if isinstance(text, str):
            return text
        raise ValueError("Unsupported string type: %s" % (type(text)))

    raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
    """Loads a vocabulary file into an ordered token -> id dictionary.

    Each line is either ``token`` (the id is the zero-based line number) or
    ``token<TAB>id``. Loading stops at the first line that has more than two
    tab-separated fields.

    Args:
        vocab_file: Path to the vocabulary file; it is read as UTF-8 and
            undecodable bytes are dropped.

    Returns:
        A `collections.OrderedDict` mapping Unicode tokens to int ids, in
        file order.
    """
    # Local import keeps this function self-contained; `io.open` behaves the
    # same on Python 2 and 3 and always yields Unicode lines.
    import io

    vocab = collections.OrderedDict()
    # The context manager closes the handle even if a line fails to parse,
    # and the explicit encoding avoids depending on the locale default.
    with io.open(vocab_file, encoding="utf-8", errors="ignore") as fin:
        for num, line in enumerate(fin):
            items = line.strip().split("\t")
            if len(items) > 2:
                # Malformed line: stop reading the rest of the file.
                break
            token = items[0]
            index = items[1] if len(items) == 2 else num
            token = token.strip()
            vocab[token] = int(index)
    return vocab
def convert_by_vocab(vocab, items):
    """Map every element of `items` through the `vocab` lookup table.

    Works in either direction (tokens -> ids or ids -> tokens) depending on
    the mapping supplied. A missing element raises KeyError.
    """
    return [vocab[entry] for entry in items]


def convert_tokens_to_ids(vocab, tokens):
    """Translate a sequence of tokens into their ids using `vocab`."""
    return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
    """Translate a sequence of ids back into tokens using `inv_vocab`."""
    return convert_by_vocab(inv_vocab, ids)
def whitespace_tokenize(text):
    """Strip `text` and split it on runs of whitespace.

    Returns an empty list when nothing but whitespace is left.
    """
    stripped = text.strip()
    return stripped.split() if stripped else []
class FullTokenizer(object):
    """End-to-end tokenizer: basic tokenization followed by WordPiece."""

    def __init__(self, vocab_file, do_lower_case=True):
        """Load the vocabulary and set up both tokenization stages.

        Args:
            vocab_file: Path passed to `load_vocab`.
            do_lower_case: Forwarded to `BasicTokenizer`.
        """
        self.vocab = load_vocab(vocab_file)
        # Reverse lookup table: id -> token.
        self.inv_vocab = dict(
            (index, token) for token, index in self.vocab.items())
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        """Return the word pieces of every basic token of `text`, in order."""
        return [
            piece
            for word in self.basic_tokenizer.tokenize(text)
            for piece in self.wordpiece_tokenizer.tokenize(word)
        ]

    def convert_tokens_to_ids(self, tokens):
        """Map tokens to ids with the loaded vocabulary."""
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        """Map ids back to tokens with the inverted vocabulary."""
        return convert_by_vocab(self.inv_vocab, ids)
class CharTokenizer(object):
    """Tokenizer that lower-cases, splits on single spaces, then applies
    WordPiece to every resulting chunk."""

    def __init__(self, vocab_file, do_lower_case=True):
        """Load the vocabulary and create the WordPiece stage.

        Args:
            vocab_file: Path passed to `load_vocab`.
            do_lower_case: Accepted for signature parity with
                `FullTokenizer`; it is not read here, and `tokenize`
                always lower-cases its input.
        """
        self.vocab = load_vocab(vocab_file)
        # Reverse lookup table: id -> token.
        self.inv_vocab = dict(
            (index, token) for token, index in self.vocab.items())
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        """Return the word pieces of each space-separated, lower-cased chunk."""
        return [
            piece
            for chunk in text.lower().split(" ")
            for piece in self.wordpiece_tokenizer.tokenize(chunk)
        ]

    def convert_tokens_to_ids(self, tokens):
        """Map tokens to ids with the loaded vocabulary."""
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        """Map ids back to tokens with the inverted vocabulary."""
        return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
    """Basic tokenization: text cleanup, CJK spacing, optional lower-casing
    with accent stripping, and punctuation splitting."""

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer.

        Args:
            do_lower_case: Whether to lower case the input (accents are
                stripped in the same step).
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Tokenizes a piece of text and returns the list of tokens."""
        cleaned = self._clean_text(convert_to_unicode(text))
        # CJK characters are surrounded by spaces so that each one becomes
        # its own token in the whitespace split below.
        spaced = self._tokenize_chinese_chars(cleaned)

        pieces = []
        for word in whitespace_tokenize(spaced):
            if self.do_lower_case:
                word = self._run_strip_accents(word.lower())
            pieces.extend(self._run_split_on_punc(word))

        # Re-join and re-split to drop any empty fragments.
        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Removes combining marks (category "Mn") after NFD normalization."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(
            ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text):
        """Splits `text` so that every punctuation character stands alone."""
        groups = []
        # True while the last group is a run of non-punctuation characters
        # that may still grow.
        run_open = False
        for ch in text:
            if _is_punctuation(ch):
                groups.append([ch])
                run_open = False
            else:
                if not run_open:
                    groups.append([])
                    run_open = True
                groups[-1].append(ch)
        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Pads every CJK character with one space on each side."""
        return "".join(
            " " + ch + " " if self._is_chinese_char(ord(ch)) else ch
            for ch in text)

    def _is_chinese_char(self, cp):
        """Checks whether code point `cp` lies in a CJK ideograph range."""
        # Inclusive code point ranges of the CJK Unified Ideographs blocks
        # (base block, extensions and compatibility ideographs). Hangul,
        # Hiragana and Katakana live in other blocks and are not included.
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Drops NUL, U+FFFD and control characters; maps whitespace to " "."""
        kept = []
        for ch in text:
            code = ord(ch)
            if code in (0, 0xfffd) or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)
class WordpieceTokenizer(object):
    """Greedy longest-match-first WordPiece tokenizer."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        """Store the vocabulary and the tokenizer limits.

        Args:
            vocab: Container supporting `in` for word-piece membership.
            unk_token: Token emitted for words that cannot be split.
            max_input_chars_per_word: Longer words become `unk_token`.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        At every position the longest vocabulary entry is taken; pieces
        that do not start a word carry a "##" prefix.

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text: A single token or whitespace separated tokens. This should
                have already been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens. A word that is too long or cannot be
            fully covered by vocabulary entries yields a single `unk_token`.
        """
        text = convert_to_unicode(text)

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            n_chars = len(chars)
            if n_chars > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            pieces = []  # becomes None once the word is found unsplittable
            start = 0
            while start < n_chars:
                matched = None
                # Try the longest candidate first, shrinking from the right.
                for end in range(n_chars, start, -1):
                    candidate = "".join(chars[start:end])
                    if start > 0:
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        matched = candidate
                        break
                if matched is None:
                    pieces = None
                    break
                pieces.append(matched)
                start = end

            if pieces is None:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(pieces)
        return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT pretraining."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import argparse
import numpy as np
import multiprocessing
import paddle
import paddle.fluid as fluid
from reader.pretraining import DataReader
from model.bert import BertModel, BertConfig
from optimization import optimization
from utils.args import ArgumentGroup, print_arguments, check_cuda
from utils.init import init_checkpoint, init_pretraining_params
# Command-line interface of the pretraining script. Options are registered
# through `ArgumentGroup.add_arg` (imported from utils.args); judging by the
# calls below its positional arguments are (name, type, default, help) --
# TODO confirm against utils/args.py. The arguments are parsed right here at
# module level, so `args` is a module-wide global read by the functions below.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
# Model definition and checkpoint locations.
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, "./config/bert_config.json", "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
model_g.add_arg("weight_sharing", bool, True, "If set, share weights between word embedding and masked lm.")
model_g.add_arg("generate_neg_sample", bool, True, "If set, randomly generate negtive samples by positive samples.")
# Optimisation schedule, checkpoint/validation intervals and mixed precision.
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 100, "Number of epoches for training.")
train_g.add_arg("learning_rate", float, 0.0001, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("num_train_steps", int, 1000000, "Total steps to perform pretraining.")
train_g.add_arg("warmup_steps", int, 4000, "Total steps to perform warmup when pretraining.")
train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
# Console logging.
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
# Input data locations and batching.
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("data_dir", str, "./data/train/", "Path to training data.")
data_g.add_arg("validation_set_dir", str, "./data/validation/", "Path to validation data.")
data_g.add_arg("test_set_dir", str, None, "Path to test data.")
data_g.add_arg("vocab_path", str, "./config/vocab.txt", "Vocabulary path.")
data_g.add_arg("max_seq_len", int, 512, "Tokens' number of the longest seqence allowed.")
data_g.add_arg("batch_size", int, 8192,
"The total number of examples in one batch for training, see also --in_tokens.")
data_g.add_arg("in_tokens", bool, True,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
# Execution mode: device, distributed training, executor tuning, test-only run.
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("is_distributed", bool, False, "If set, then start distributed training.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "Ihe iteration intervals to clean up temporary variables.")
run_type_g.add_arg("do_test", bool, False, "Whether to perform evaluation on test data set.")
# Parsing happens on import of this module, not inside a main() function.
args = parser.parse_args()
# yapf: enable.
def create_model(pyreader_name, bert_config):
    """Create a py_reader and the BERT pretraining network reading from it.

    Reads `max_seq_len`, `weight_sharing`, `use_fp16` and `loss_scaling` from
    the module-level `args`.

    Args:
        pyreader_name: Name given to the created py_reader.
        bert_config: Configuration object handed to `BertModel`.

    Returns:
        Tuple `(pyreader, next_sent_acc, mask_lm_loss, total_loss)`; the loss
        is multiplied by `args.loss_scaling` when fp16 is enabled and the
        scaling factor is greater than 1.0.
    """
    # Seven input slots: four per-token tensors followed by three tensors
    # with a trailing dimension of 1.
    feed_shapes = [[-1, args.max_seq_len, 1] for _ in range(4)]
    feed_shapes += [[-1, 1] for _ in range(3)]
    feed_dtypes = [
        'int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64'
    ]

    pyreader = fluid.layers.py_reader(
        capacity=70,
        shapes=feed_shapes,
        dtypes=feed_dtypes,
        lod_levels=[0] * 7,
        name=pyreader_name,
        use_double_buffer=True)

    feeds = fluid.layers.read_file(pyreader)
    (src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos,
     labels) = feeds

    bert = BertModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        input_mask=input_mask,
        config=bert_config,
        weight_sharing=args.weight_sharing,
        use_fp16=args.use_fp16)

    pretraining_outputs = bert.get_pretraining_output(mask_label, mask_pos,
                                                      labels)
    next_sent_acc, mask_lm_loss, total_loss = pretraining_outputs

    scale_loss = args.use_fp16 and args.loss_scaling > 1.0
    if scale_loss:
        total_loss *= args.loss_scaling

    return pyreader, next_sent_acc, mask_lm_loss, total_loss
def predict_wrapper(args,
                    exe,
                    bert_config,
                    test_prog=None,
                    pyreader=None,
                    fetch_list=None):
    """Build a `predict()` closure that runs one pass over the eval data.

    The data comes from `args.test_set_dir` when `args.do_test` is set and
    from `args.validation_set_dir` otherwise. In test mode the pretrained
    parameters are loaded from `args.init_checkpoint` before returning.

    Args:
        args: Parsed command-line arguments.
        exe: Executor used to run `test_prog`.
        bert_config: Config object; only `bert_config['vocab_size']` is read.
        test_prog: Program to evaluate.
        pyreader: Reader feeding `test_prog`.
        fetch_list: Names fetched on every step; three values are expected
            (next-sentence accuracy, masked-LM loss, total loss).

    Returns:
        A callable returning `(cost, lm_cost, acc, steps, speed)` where the
        first three are sums over all evaluated steps and `speed` is the
        number of evaluated steps per second.

    Raises:
        AssertionError: In test mode when `args.init_checkpoint` is None.
    """
    # Context to do validation.
    data_path = args.test_set_dir if args.do_test else args.validation_set_dir
    data_reader = DataReader(
        data_path,
        vocab_path=args.vocab_path,
        batch_size=args.batch_size,
        in_tokens=args.in_tokens,
        voc_size=bert_config['vocab_size'],
        shuffle_files=False,
        epoch=1,
        max_seq_len=args.max_seq_len,
        is_test=True)

    if args.do_test:
        # Implicit string concatenation instead of a backslash continuation,
        # so source indentation can never leak into the message.
        assert args.init_checkpoint is not None, (
            "[FATAL] Please use --init_checkpoint '/path/to/checkpoints' "
            "to specify your pretrained model checkpoints")
        init_pretraining_params(exe, args.init_checkpoint, test_prog)

    def predict(exe=exe, pyreader=pyreader):
        """Evaluate until the reader is exhausted and return the totals."""
        pyreader.decorate_tensor_provider(data_reader.data_generator())
        pyreader.start()

        cost = 0
        lm_cost = 0
        acc = 0
        steps = 0
        time_begin = time.time()
        while True:
            try:
                each_next_acc, each_mask_lm_cost, each_total_cost = exe.run(
                    fetch_list=fetch_list, program=test_prog)
                acc += each_next_acc
                lm_cost += each_mask_lm_cost
                cost += each_total_cost
                steps += 1
                if args.do_test and steps % args.skip_steps == 0:
                    print("[test_set] steps: %d" % steps)
            except fluid.core.EOFException:
                # The reader signals the end of the data set.
                pyreader.reset()
                break

        used_time = time.time() - time_begin
        # Speed is measured over the steps actually executed in this pass;
        # the logging interval (args.skip_steps) is unrelated to the elapsed
        # time here. Guard against a zero-length measurement.
        speed = steps / used_time if used_time > 0 else 0.0
        return cost, lm_cost, acc, steps, speed

    return predict
def test(args):
    """Evaluate a pretrained model on the test set and print the metrics.

    Builds a standalone test program, runs its startup program, obtains the
    evaluation closure from `predict_wrapper` and prints the loss, the global
    perplexity, the next-sentence accuracy and the speed.
    """
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    test_prog = fluid.Program()
    test_startup = fluid.Program()
    with fluid.program_guard(test_prog, test_startup), \
            fluid.unique_name.guard():
        test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
            pyreader_name='test_reader', bert_config=bert_config)

    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda == True:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(test_startup)

    fetch_names = [next_sent_acc.name, mask_lm_loss.name, total_loss.name]
    predict = predict_wrapper(
        args,
        exe,
        bert_config,
        test_prog=test_prog,
        pyreader=test_pyreader,
        fetch_list=fetch_names)

    print("test begin")
    loss, lm_loss, acc, steps, speed = predict()

    # `predict` returns sums over all steps, hence the division by `steps`.
    mean_loss = np.mean(np.array(loss) / steps)
    global_ppl = np.exp(np.mean(np.array(lm_loss) / steps))
    mean_acc = np.mean(np.array(acc) / steps)
    print(
        "[test_set] loss: %f, global ppl: %f, next_sent_acc: %f, speed: %f steps/s"
        % (mean_loss, global_ppl, mean_acc, speed))
def train(args):
    """Run BERT pretraining, optionally distributed, with periodic
    logging, checkpointing and validation.

    Args:
        args: Parsed command-line arguments (see the parser definition in
            this module).

    The loop ends when `steps` reaches `args.num_train_steps` or when the
    training reader raises `fluid.core.EOFException`.
    """
    print("pretraining start")
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()
    # Build the training program: model, optimizer and LR schedule.
    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='train_reader', bert_config=bert_config)
            scheduled_lr = optimization(
                loss=total_loss,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_program,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_fp16,
                loss_scaling=args.loss_scaling)
            # The fetched variables are excluded from memory optimization.
            fluid.memory_optimize(
                input_program=train_program,
                skip_opt_set=[
                    next_sent_acc.name, mask_lm_loss.name, total_loss.name
                ])
    # Build the validation program. It shares `startup_prog` with training.
    # NOTE: next_sent_acc / mask_lm_loss / total_loss are rebound here to the
    # test program's variables; the training loop below fetches by `.name`.
    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='test_reader', bert_config=bert_config)
    test_prog = test_prog.clone(for_test=True)
    # Pick the device and the number of local devices.
    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        # CPU_NUM overrides the detected core count when it is set.
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    print("Device count %d" % dev_count)
    if args.verbose:
        if args.in_tokens:
            # batch_size counts tokens here, so convert it to sequences.
            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                program=train_program,
                batch_size=args.batch_size // args.max_seq_len)
        else:
            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                program=train_program, batch_size=args.batch_size)
        print("Theoretical memory usage in training: %.3f - %.3f %s" %
              (lower_mem, upper_mem, unit))
    # Single-trainer defaults; overwritten below for distributed runs.
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        # The cluster layout comes from the environment variables
        # `worker_endpoints` (comma-separated) and `current_endpoint`.
        worker_endpoints_env = os.getenv("worker_endpoints")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("current_endpoint")
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            # NOTE(review): presumably gives the other trainers time to come
            # up before trainer 0 proceeds -- confirm the intent.
            print("train_id == 0, sleep 60s")
            time.sleep(60)
        # The backslash continues the string literal itself, so the next
        # line must stay at column 0 to keep the printed text unchanged.
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
trainer_id:{}"
              .format(worker_endpoints, trainers_num,
                      current_endpoint, trainer_id))
        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=worker_endpoints_env,
            current_endpoint=current_endpoint,
            program=train_program,
            startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    # Optionally resume from an existing checkpoint.
    if args.init_checkpoint and args.init_checkpoint != "":
        init_checkpoint(exe, args.init_checkpoint, train_program, args.use_fp16)
    data_reader = DataReader(
        data_dir=args.data_dir,
        batch_size=args.batch_size,
        in_tokens=args.in_tokens,
        vocab_path=args.vocab_path,
        voc_size=bert_config['vocab_size'],
        epoch=args.epoch,
        max_seq_len=args.max_seq_len,
        generate_neg_sample=args.generate_neg_sample)
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = args.use_fast_executor
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
    # use_ngraph is for CPU only, please refer to README_ngraph.md for details
    use_ngraph = os.getenv('FLAGS_use_ngraph')
    if not use_ngraph:
        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=total_loss.name,
            exec_strategy=exec_strategy,
            main_program=train_program,
            num_trainers=nccl2_num_trainers,
            trainer_id=nccl2_trainer_id)
    else:
        # With nGraph the plain executor is used, which is why every run()
        # call below passes `program=train_program` explicitly in that case.
        train_exe = exe
    # `predict` only exists when a validation directory is configured; the
    # validation branch inside the loop is guarded by the same setting.
    if args.validation_set_dir and args.validation_set_dir != "":
        predict = predict_wrapper(
            args,
            exe,
            bert_config,
            test_prog=test_prog,
            pyreader=test_pyreader,
            fetch_list=[
                next_sent_acc.name, mask_lm_loss.name, total_loss.name
            ])
    train_pyreader.decorate_tensor_provider(data_reader.data_generator())
    train_pyreader.start()
    steps = 0
    # Metrics accumulated since the last log line.
    cost = []
    lm_cost = []
    acc = []
    time_begin = time.time()
    while steps < args.num_train_steps:
        try:
            # The step counter and the logging interval are both scaled by
            # the number of trainers.
            steps += nccl2_num_trainers
            skip_steps = args.skip_steps * nccl2_num_trainers
            # Trainers other than trainer 0 only train: the `continue` skips
            # logging, checkpointing and validation for them.
            if nccl2_trainer_id != 0:
                if use_ngraph:
                    train_exe.run(fetch_list=[], program=train_program)
                else:
                    train_exe.run(fetch_list=[])
                continue
            if steps % skip_steps != 0:
                # Non-logging step: run without fetching anything.
                if use_ngraph:
                    train_exe.run(fetch_list=[], program=train_program)
                else:
                    train_exe.run(fetch_list=[])
            else:
                # Logging step: fetch the metrics and the learning rate.
                if use_ngraph:
                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run(
                        fetch_list=[
                            next_sent_acc.name, mask_lm_loss.name, total_loss.name,
                            scheduled_lr.name], program=train_program)
                else:
                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run(
                        fetch_list=[
                            next_sent_acc.name, mask_lm_loss.name, total_loss.name,
                            scheduled_lr.name])
                acc.extend(each_next_acc)
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)
                print("feed_queue size", train_pyreader.queue.size())
                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file = data_reader.get_progress(
                )
                print("current learning_rate:%f" % np_lr[0])
                print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                      "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s"
                      % (epoch, current_file_index, total_file, steps,
                         np.mean(np.array(cost)),
                         np.mean(np.exp(np.array(lm_cost))),
                         np.mean(np.array(acc)), skip_steps / used_time,
                         current_file))
                # Start a fresh accumulation window.
                cost = []
                lm_cost = []
                acc = []
                time_begin = time.time()
            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
            if args.validation_set_dir and steps % args.validation_steps == 0:
                # NOTE(review): `epoch` is only assigned in the logging branch
                # above, so this relies on a logging step having happened
                # before the first validation step.
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict(
                )
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps,
                       np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))
        except fluid.core.EOFException:
            # The training reader ran out of data: reset it and stop.
            train_pyreader.reset()
            break
# Script entry point: echo the parsed arguments, verify the CUDA setting,
# then run either the test-set evaluation or the pretraining loop.
if __name__ == "__main__":
    print_arguments(args)
    check_cuda(args.use_cuda)
    entry_point = test if args.do_test else train
    entry_point(args)
#!/bin/bash
# Launches BERT pretraining through ./train.py.
#
# Usage: train.sh -local <y|n>
#   -local y   single-node run   (passes --is_distributed false)
#   -local n   distributed run   (passes --is_distributed true)
#
# -x: echo every command before running it; -e: stop at the first failure.
set -xe

# Parse the command line; `-local` is the only supported option.
while true ; do
  case "$1" in
    -local) is_local="$2" ; shift 2 ;;
    *)
      # A non-empty leftover argument is an error; an empty one means all
      # arguments have been consumed. `-n` tests emptiness directly instead
      # of comparing the length as a string with `>`.
      if [[ -n "$1" ]]; then
        echo "not supported arugments ${1}" ; exit 1 ;
      else
        break
      fi
      ;;
  esac
done

# Translate -local into the flag understood by train.py.
case "$is_local" in
  n) is_distributed="--is_distributed true" ;;
  y) is_distributed="--is_distributed false" ;;
  *) echo "not support argument -local: ${is_local}" ; exit 1 ;;
esac

# pretrain config
SAVE_STEPS=10000
BATCH_SIZE=4096
LR_RATE=1e-4
WEIGHT_DECAY=0.01
MAX_LEN=512
TRAIN_DATA_DIR=data/train
VALIDATION_DATA_DIR=data/validation
CONFIG_PATH=data/demo_config/bert_config.json
VOCAB_PATH=data/demo_config/vocab.txt

# Change your train arguments:
# ${is_distributed} is deliberately left unquoted: it holds two words (the
# flag and its value) that must be split into separate arguments. Every
# continuation backslash is preceded by a space so that adjacent arguments
# can never be glued together.
python -u ./train.py ${is_distributed} \
  --use_cuda true \
  --weight_sharing true \
  --batch_size "${BATCH_SIZE}" \
  --data_dir "${TRAIN_DATA_DIR}" \
  --validation_set_dir "${VALIDATION_DATA_DIR}" \
  --bert_config_path "${CONFIG_PATH}" \
  --vocab_path "${VOCAB_PATH}" \
  --generate_neg_sample true \
  --checkpoints ./output \
  --save_steps "${SAVE_STEPS}" \
  --learning_rate "${LR_RATE}" \
  --weight_decay "${WEIGHT_DECAY:-0}" \
  --max_seq_len "${MAX_LEN}" \
  --skip_steps 20 \
  --validation_steps 1000 \
  --num_iteration_per_drop_scope 10 \
  --use_fp16 false \
  --loss_scaling 8.0
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册