Unverified commit 3a4d6312, authored by zhengya01, committed by GitHub

Merge pull request #1 from PaddlePaddle/develop

update
export FLAGS_enable_parallel_graph=1
export FLAGS_sync_nccl_allreduce=1
BERT_BASE_PATH="chinese_L-12_H-768_A-12"
TASK_NAME='xnli'
DATA_PATH=data/xnli/XNLI-MT-1.0
CKPT_PATH=pretrain_model
train(){
python -u run_classifier.py --task_name ${TASK_NAME} \
--use_cuda true \
--do_train true \
--do_val false \
--do_test false \
--batch_size 8192 \
--in_tokens true \
--init_checkpoint pretrain_model/chinese_L-12_H-768_A-12/ \
--data_dir ${DATA_PATH} \
--vocab_path pretrain_model/chinese_L-12_H-768_A-12/vocab.txt \
--checkpoints ${CKPT_PATH} \
--save_steps 1000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--validation_steps 25 \
--epoch 1 \
--max_seq_len 512 \
--bert_config_path pretrain_model/chinese_L-12_H-768_A-12/bert_config.json \
--learning_rate 1e-4 \
--skip_steps 10 \
--random_seed 100 \
--enable_ce \
--shuffle false
}
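# Run single-card and multi-card training below; the training log is piped into _ce.py,
# which picks out the "kpis\t<name>\t<value>" lines and records them for continuous evaluation.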
export CUDA_VISIBLE_DEVICES=0
train | python _ce.py
export CUDA_VISIBLE_DEVICES=0,1,2,3
train | python _ce.py
#### This file is only used for the continuous evaluation test!
import os
import sys
sys.path.insert(0, os.environ['ceroot'])
#sys.path.append('.')
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE: kpi.py should be shared across models in some way!
train_cost_xnli_card1_kpi = CostKpi('train_cost_xnli_card1', 0.002, 0, actived=True)
train_acc_xnli_card1_kpi = AccKpi('train_acc_xnli_card1', 0.002, 0, actived=True)
train_duration_xnli_card1_kpi = DurationKpi(
'train_duration_xnli_card1', 0.01, 0, actived=True)
train_cost_xnli_card4_kpi = CostKpi('train_cost_xnli_card4', 0.002, 0, actived=True)
train_acc_xnli_card4_kpi = AccKpi('train_acc_xnli_card4', 0.02, 0, actived=True)
train_duration_xnli_card4_kpi = DurationKpi(
'train_duration_xnli_card4', 0.03, 0, actived=True)
tracking_kpis = [
train_cost_xnli_card1_kpi,
train_acc_xnli_card1_kpi,
train_duration_xnli_card1_kpi,
train_cost_xnli_card4_kpi,
train_acc_xnli_card4_kpi,
train_duration_xnli_card4_kpi,
]
def parse_log(log):
'''
This method should be implemented by model developers.
Suggestion: each KPI line in the log should be a tab-separated key/value pair, for example:
"
train_cost\t1.0
test_cost\t1.0
train_cost\t1.0
train_cost\t1.0
train_acc\t1.2
"
'''
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
print("-----%s" % fs)
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
print("*****")
print(log)
print("****")
log_to_ce(log)
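# For reference: parse_log() only picks up lines of the form "kpis\t<name>\t<value>".
# A hypothetical fragment of the piped training log (values made up for illustration):
#   kpis\ttrain_cost_xnli_card1\t0.8321
#   kpis\ttrain_acc_xnli_card1\t0.6675
#   kpis\ttrain_duration_xnli_card1\t12.50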
...@@ -32,6 +32,7 @@ from model.classifier import create_model
from optimization import optimization
from utils.args import ArgumentGroup, print_arguments, check_cuda
from utils.init import init_pretraining_params, init_checkpoint
from utils.cards import get_cards
import dist_utils
num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
...@@ -87,6 +88,8 @@ run_type_g.add_arg("do_train", bool, True, "Whether to pe
run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")
parser.add_argument("--enable_ce", action='store_true', help="The flag indicating whether to run the task for continuous evaluation.")
args = parser.parse_args()
# yapf: enable.
...@@ -298,6 +301,7 @@ def main(args):
total_cost, total_acc, total_num_seqs = [], [], []
time_begin = time.time()
throughput = []
ce_info = []
while True:
try:
# steps += 1
...@@ -341,6 +345,7 @@ def main(args):
current_epoch, current_example, num_train_examples,
steps, np.sum(total_cost) / np.sum(total_num_seqs),
np.sum(total_acc) / np.sum(total_num_seqs))
ce_info.append([np.sum(total_cost) / np.sum(total_num_seqs), np.sum(total_acc) / np.sum(total_num_seqs), used_time])
if steps > 0 :
throughput.append( args.skip_steps / used_time)
log_record = log_record + ", speed: %f steps/s" % (args.skip_steps / used_time)
...@@ -388,6 +393,24 @@ def main(args):
fluid.io.save_persistables(exe, save_path, train_program)
train_pyreader.reset()
break
if args.enable_ce:
card_num = get_cards()
ce_cost = 0
ce_acc = 0
ce_time = 0
try:
ce_cost = ce_info[-2][0]
ce_acc = ce_info[-2][1]
ce_time = ce_info[-2][2]
except:
print("ce info error")
print("kpis\ttrain_duration_%s_card%s\t%s" %
(args.task_name, card_num, ce_time))
print("kpis\ttrain_cost_%s_card%s\t%f" %
(args.task_name, card_num, ce_cost))
print("kpis\ttrain_acc_%s_card%s\t%f" %
(args.task_name, card_num, ce_acc))
# final eval on dev set
if args.do_val:
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
def get_cards():
"""
Return the number of GPU cards based on CUDA_VISIBLE_DEVICES.
"""
num = 0
cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if cards != '':
num = len(cards.split(","))
return num
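# Usage sketch (hypothetical values): with CUDA_VISIBLE_DEVICES="0,1,2,3" get_cards()
# returns 4; with the variable unset or empty it returns 0.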
...@@ -18,6 +18,7 @@ from __future__ import print_function
import six
import numpy as np
import random
import time
import os
import math
......
set -eux
export FLAGS_sync_nccl_allreduce=1
MODEL_PATH=ERNIE_1.0.1
TASK_DATA_PATH=task_data
train() {
python -u run_classifier.py \
--use_cuda true \
--do_train true \
--do_val true \
--do_test true \
--verbose true \
--batch_size 8192 \
--in_tokens true \
--init_pretraining_params ${MODEL_PATH}/params \
--train_set ${TASK_DATA_PATH}/xnli/train.tsv \
--dev_set ${TASK_DATA_PATH}/xnli/dev.tsv \
--test_set ${TASK_DATA_PATH}/xnli/test.tsv \
--vocab_path config/vocab.txt \
--label_map ${TASK_DATA_PATH}/xnli/label_map.json \
--ernie_config_path config/ernie_config.json \
--checkpoints ./checkpoints \
--save_steps 2000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--validation_steps 25 \
--epoch 1 \
--max_seq_len 512 \
--learning_rate 1e-4 \
--skip_steps 10 \
--num_iteration_per_drop_scope 1 \
--num_labels 3 \
--random_seed 100 \
--enable_ce \
--shuffle false
}
export CUDA_VISIBLE_DEVICES=0
train | python _ce.py
export CUDA_VISIBLE_DEVICES=0,1,2,3
train | python _ce.py
...@@ -279,7 +279,7 @@ text_a label
export FLAGS_sync_nccl_allreduce=1
export CUDA_VISIBLE_DEVICES=7
python -u ernie_encoder.py \
--use_cuda true \
--batch_size 32 \
--output_dir "./test" \
...@@ -295,3 +295,25 @@ python -u ernir_encoder.py \
#### How do I get the embedding of each token in an input sentence after ERNIE encoding?
[Same solution as above](#如何获取输入句子经过-ERNIE-编码后的-Embedding-表示?)
#### How do I use a fine-tuned model for batch prediction on new data?
Taking the classification task as an example, we provide a script for batch prediction. Usage is as follows:
```
python -u predict_classifier.py \
--use_cuda true \
--batch_size 32 \
--vocab_path config/vocab.txt \
--init_checkpoint "./checkpoints/step_100" \
--do_lower_case true \
--max_seq_len 128 \
--ernie_config_path config/ernie_config.json \
--do_predict true \
--predict_set ${TASK_DATA_PATH}/lcqmc/test.tsv \
--num_labels 2
```
In actual use, specify the model to predict with via `init_checkpoint`, the data file to be predicted via `predict_set`, and the number of classes via `num_labels`.
**Note**: The data format of predict_set is exactly the same as that of dev_set and test_set: a 2- or 3-column tsv file made up of text_a, text_b (optional), and label. The label column in predict_set is only a placeholder and can simply be set to all zeros.
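For illustration only (this snippet is not part of the original document), a 3-column predict_set for a sentence-pair classification task could look like the following, with tab-separated text_a, text_b, and placeholder labels of 0:
```
first sentence of pair 1	second sentence of pair 1	0
first sentence of pair 2	second sentence of pair 2	0
```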
#### This file is only used for the continuous evaluation test!
import os
import sys
sys.path.insert(0, os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE: kpi.py should be shared across models in some way!
train_loss_card1_kpi = CostKpi('train_loss_card1', 0.03, 0, actived=True)
train_acc_card1_kpi = AccKpi('train_acc_card1', 0.06, 0, actived=True)
train_duration_card1_kpi = DurationKpi(
'train_duration_card1', 0.01, 0, actived=True)
train_loss_card4_kpi = CostKpi('train_loss_card4', 0.01, 0, actived=True)
train_acc_card4_kpi = AccKpi('train_acc_card4', 0.02, 0, actived=True)
train_duration_card4_kpi = DurationKpi(
'train_duration_card4', 0.02, 0, actived=True)
tracking_kpis = [
train_loss_card1_kpi,
train_acc_card1_kpi,
train_duration_card1_kpi,
train_loss_card4_kpi,
train_acc_card4_kpi,
train_duration_card4_kpi,
]
def parse_log(log):
'''
This method should be implemented by model developers.
Suggestion: each KPI line in the log should be a tab-separated key/value pair, for example:
"
train_loss\t1.0
test_loss\t1.0
train_loss\t1.0
train_acc\t1.2
"
'''
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
print("-----%s" % fs)
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
print("*****")
print(log)
print("****")
log_to_ce(log)
...@@ -74,4 +74,7 @@ run_type_g.add_arg("do_train", bool, True, "Whether to pe
run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("metrics", bool, True, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("shuffle", bool, True, "")
parser.add_argument("--enable_ce", action='store_true', help="The flag indicating whether to run the task for continuous evaluation.")
# yapf: enable
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Load classifier's checkpoint to do prediction or save inference model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import argparse
import numpy as np
import multiprocessing
import paddle.fluid as fluid
from reader.task_reader import ClassifyReader
from model.ernie import ErnieConfig
from finetune.classifier import create_model
from utils.args import ArgumentGroup, print_arguments
from utils.init import init_pretraining_params
from finetune_args import parser
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "options to init, resume and save model.")
model_g.add_arg("ernie_config_path", str, None, "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("use_fp16", bool, False, "Whether to resume parameters from fp16 checkpoint.")
model_g.add_arg("num_labels", int, 2, "num labels for classify")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options.")
data_g.add_arg("predict_set", str, None, "Predict set file")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("label_map_config", str, None, "Label_map_config json file.")
data_g.add_arg("max_seq_len", int, 128, "Number of words of the longest seqence.")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("do_prediction", bool, True, "Whether to do prediction on test set.")
args = parser.parse_args()
# yapf: enable.
def main(args):
ernie_config = ErnieConfig(args.ernie_config_path)
ernie_config.print_config()
reader = ClassifyReader(
vocab_path=args.vocab_path,
label_map_config=args.label_map_config,
max_seq_len=args.max_seq_len,
do_lower_case=args.do_lower_case,
in_tokens=False)
predict_prog = fluid.Program()
predict_startup = fluid.Program()
with fluid.program_guard(predict_prog, predict_startup):
with fluid.unique_name.guard():
predict_pyreader, probs, feed_target_names = create_model(
args,
pyreader_name='predict_reader',
ernie_config=ernie_config,
is_prediction=True)
predict_prog = predict_prog.clone(for_test=True)
if args.use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place)
exe.run(predict_startup)
if args.init_checkpoint:
init_pretraining_params(exe, args.init_checkpoint, predict_prog)
else:
raise ValueError("args 'init_checkpoint' should be set for prediction!")
predict_exe = fluid.Executor(place)
predict_data_generator = reader.data_generator(
input_file=args.predict_set,
batch_size=args.batch_size,
epoch=1,
shuffle=False)
predict_pyreader.decorate_tensor_provider(predict_data_generator)
predict_pyreader.start()
all_results = []
time_begin = time.time()
while True:
try:
results = predict_exe.run(program=predict_prog, fetch_list=[probs.name])
all_results.extend(results[0])
except fluid.core.EOFException:
predict_pyreader.reset()
break
time_end = time.time()
np.set_printoptions(precision=4, suppress=True)
print("-------------- prediction results --------------")
for index, result in enumerate(all_results):
print(str(index) + '\t{}'.format(result))
if __name__ == '__main__':
print_arguments(args)
main(args)
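# Note: the loop above prints one probability vector per example. If discrete class ids
# are needed, a minimal post-processing sketch (not part of the original script) would be:
#   pred_labels = [int(np.argmax(p)) for p in all_results]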
...@@ -29,6 +29,7 @@ from finetune.classifier import create_model, evaluate
from optimization import optimization
from utils.args import print_arguments, check_cuda
from utils.init import init_pretraining_params, init_checkpoint
from utils.cards import get_cards
from finetune_args import parser
args = parser.parse_args()
...@@ -67,7 +68,7 @@ def main(args):
input_file=args.train_set,
batch_size=args.batch_size,
epoch=args.epoch,
shuffle=args.shuffle,
phase="train")
num_train_examples = reader.get_num_examples(args.train_set)
...@@ -85,6 +86,8 @@ def main(args):
print("Num warmup steps: %d" % warmup_steps)
train_program = fluid.Program()
if args.random_seed is not None and args.enable_ce:
train_program.random_seed = args.random_seed
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
...@@ -187,6 +190,7 @@ def main(args):
if warmup_steps > 0:
graph_vars["learning_rate"] = scheduled_lr
ce_info = []
time_begin = time.time()
while True:
try:
...@@ -213,6 +217,7 @@ def main(args):
(current_epoch, current_example, num_train_examples,
steps, outputs["loss"], outputs["accuracy"],
args.skip_steps / used_time))
ce_info.append([outputs["loss"], outputs["accuracy"], used_time])
time_begin = time.time()
if steps % args.save_steps == 0:
...@@ -246,6 +251,24 @@ def main(args):
fluid.io.save_persistables(exe, save_path, train_program)
train_pyreader.reset()
break
if args.enable_ce:
card_num = get_cards()
ce_loss = 0
ce_acc = 0
ce_time = 0
try:
ce_loss = ce_info[-2][0]
ce_acc = ce_info[-2][1]
ce_time = ce_info[-2][2]
except:
print("ce info error")
print("kpis\ttrain_duration_card%s\t%s" %
(card_num, ce_time))
print("kpis\ttrain_loss_card%s\t%f" %
(card_num, ce_loss))
print("kpis\ttrain_acc_card%s\t%f" %
(card_num, ce_acc))
# final eval on dev set
if args.do_val:
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
def get_cards():
"""
Return the number of GPU cards based on CUDA_VISIBLE_DEVICES.
"""
num = 0
cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if cards != '':
num = len(cards.split(","))
return num