提交 65904378 编写于 作者: Z zhengya01

add ce for BERT

上级 a171e58e
# PaddlePaddle executor flags (names are self-describing): turn on the
# parallel-graph executor and synchronous NCCL all-reduce for multi-GPU runs.
export FLAGS_enable_parallel_graph=1
export FLAGS_sync_nccl_allreduce=1
# Pretrained Chinese BERT-base directory name.
# NOTE(review): BERT_BASE_PATH is never referenced below — the paths inside
# train() are hard-coded; presumably it was meant to be interpolated. Confirm.
BERT_BASE_PATH="chinese_L-12_H-768_A-12"
# Fine-tuning task plus its data and checkpoint locations.
TASK_NAME='xnli'
DATA_PATH=data/xnli/XNLI-MT-1.0
CKPT_PATH=pretrain_model
# Run one fine-tuning pass of run_classifier.py on the configured task.
# Reads TASK_NAME, DATA_PATH and CKPT_PATH from the environment; the
# trained script prints "kpis\t<name>\t<value>" lines consumed by _ce.py.
train(){
  local cli_args=(
    --task_name "${TASK_NAME}"
    --use_cuda true
    --do_train true
    --do_val false
    --do_test false
    --batch_size 8192
    --in_tokens true
    --init_checkpoint pretrain_model/chinese_L-12_H-768_A-12/
    --data_dir "${DATA_PATH}"
    --vocab_path pretrain_model/chinese_L-12_H-768_A-12/vocab.txt
    --checkpoints "${CKPT_PATH}"
    --save_steps 1000
    --weight_decay 0.01
    --warmup_proportion 0.0
    --validation_steps 25
    --epoch 1
    --max_seq_len 512
    --bert_config_path pretrain_model/chinese_L-12_H-768_A-12/bert_config.json
    --learning_rate 1e-4
    --skip_steps 10
    --random_seed 100
    --enable_ce
    --shuffle false
  )
  python -u run_classifier.py "${cli_args[@]}"
}
# Single-card run: run_classifier.py's get_cards() counts the entries in
# CUDA_VISIBLE_DEVICES, so KPIs below are emitted as *_card1.
export CUDA_VISIBLE_DEVICES=0
train | python _ce.py
# Four-card run: same training command, KPIs emitted as *_card4.
export CUDA_VISIBLE_DEVICES=0,1,2,3
train | python _ce.py
####this file is only used for continuous evaluation test!
import os
import sys
sys.path.insert(0, os.environ['ceroot'])
#sys.path.append('.')
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE kpi.py should shared in models in some way!!!!
# KPI trackers for continuous evaluation.  The positional arguments are
# (name, threshold, ...) — presumably the allowed relative drift; confirm
# against kpi.py.  'actived' is the kpi library's own (misspelled) keyword
# argument and must not be renamed here.
train_cost_xnli_card1_kpi = CostKpi('train_cost_xnli_card1', 0.002, 0, actived=True)
train_acc_xnli_card1_kpi = AccKpi('train_acc_xnli_card1', 0.002, 0, actived=True)
train_duration_xnli_card1_kpi = DurationKpi(
    'train_duration_xnli_card1', 0.01, 0, actived=True)
train_cost_xnli_card4_kpi = CostKpi('train_cost_xnli_card4', 0.002, 0, actived=True)
train_acc_xnli_card4_kpi = AccKpi('train_acc_xnli_card4', 0.02, 0, actived=True)
train_duration_xnli_card4_kpi = DurationKpi(
    'train_duration_xnli_card4', 0.03, 0, actived=True)
# Registry of every KPI that log_to_ce() is allowed to record.
tracking_kpis = [
    train_cost_xnli_card1_kpi,
    train_acc_xnli_card1_kpi,
    train_duration_xnli_card1_kpi,
    train_cost_xnli_card4_kpi,
    train_acc_xnli_card4_kpi,
    train_duration_xnli_card4_kpi,
]
def parse_log(log):
    '''Scan a training log for KPI report lines and yield them.

    Each reported line has the tab-separated form::

        kpis\t<kpi_name>\t<value>

    Every line's split fields are echoed for debugging; matching lines
    additionally print a "-----" marker and yield ``(name, float(value))``.
    '''
    for raw_line in log.split('\n'):
        fields = raw_line.strip().split('\t')
        print(fields)
        # A KPI line is exactly three fields with the 'kpis' sentinel first.
        if fields[0] == 'kpis' and len(fields) == 3:
            print("-----%s" % fields)
            name, value = fields[1], float(fields[2])
            yield name, value
def log_to_ce(log):
    """Extract KPI values from `log` and persist them to the CE backend.

    Builds a name -> tracker index from the module-level ``tracking_kpis``
    registry, then records and persists every ``(name, value)`` pair that
    ``parse_log`` yields.  KPI names that are not registered (e.g. when the
    shell script runs a TASK_NAME other than 'xnli') are reported and
    skipped instead of raising KeyError, so one stray line cannot abort
    the whole CE upload.
    """
    kpi_tracker = {kpi.name: kpi for kpi in tracking_kpis}
    for kpi_name, kpi_value in parse_log(log):
        print(kpi_name, kpi_value)
        tracker = kpi_tracker.get(kpi_name)
        if tracker is None:
            # Unregistered KPI: warn and continue rather than crash.
            print("log_to_ce: unknown kpi '%s', skipped" % kpi_name)
            continue
        tracker.add_record(kpi_value)
        tracker.persist()
if __name__ == '__main__':
    # The shell driver pipes the training output here (`train | python _ce.py`).
    # Read the whole log from stdin, echo it between markers for the CI
    # console, then push the parsed KPI values to the tracker.
    log = sys.stdin.read()
    print("*****")
    print(log)
    print("****")
    log_to_ce(log)
...@@ -87,6 +87,8 @@ run_type_g.add_arg("do_train", bool, True, "Whether to pe ...@@ -87,6 +87,8 @@ run_type_g.add_arg("do_train", bool, True, "Whether to pe
run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.") run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.") run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")
parser.add_argument("--enable_ce", action='store_true', help="The flag indicating whether to run the task for continuous evaluation.")
args = parser.parse_args() args = parser.parse_args()
# yapf: enable. # yapf: enable.
...@@ -298,6 +300,7 @@ def main(args): ...@@ -298,6 +300,7 @@ def main(args):
total_cost, total_acc, total_num_seqs = [], [], [] total_cost, total_acc, total_num_seqs = [], [], []
time_begin = time.time() time_begin = time.time()
throughput = [] throughput = []
ce_info = []
while True: while True:
try: try:
# steps += 1 # steps += 1
...@@ -341,6 +344,7 @@ def main(args): ...@@ -341,6 +344,7 @@ def main(args):
current_epoch, current_example, num_train_examples, current_epoch, current_example, num_train_examples,
steps, np.sum(total_cost) / np.sum(total_num_seqs), steps, np.sum(total_cost) / np.sum(total_num_seqs),
np.sum(total_acc) / np.sum(total_num_seqs)) np.sum(total_acc) / np.sum(total_num_seqs))
ce_info.append([np.sum(total_cost) / np.sum(total_num_seqs), np.sum(total_acc) / np.sum(total_num_seqs), used_time])
if steps > 0 : if steps > 0 :
throughput.append( args.skip_steps / used_time) throughput.append( args.skip_steps / used_time)
log_record = log_record + ", speed: %f steps/s" % (args.skip_steps / used_time) log_record = log_record + ", speed: %f steps/s" % (args.skip_steps / used_time)
...@@ -388,6 +392,24 @@ def main(args): ...@@ -388,6 +392,24 @@ def main(args):
fluid.io.save_persistables(exe, save_path, train_program) fluid.io.save_persistables(exe, save_path, train_program)
train_pyreader.reset() train_pyreader.reset()
break break
if args.enable_ce:
card_num = get_cards()
ce_cost = 0
ce_acc = 0
ce_time = 0
try:
ce_cost = ce_info[-2][0]
ce_acc = ce_info[-2][1]
ce_time = ce_info[-2][2]
except:
print("ce info error")
print("kpis\ttrain_duration_%s_card%s\t%s" %
(args.task_name, card_num, ce_time))
print("kpis\ttrain_cost_%s_card%s\t%f" %
(args.task_name, card_num, ce_cost))
print("kpis\ttrain_acc_%s_card%s\t%f" %
(args.task_name, card_num, ce_acc))
# final eval on dev set # final eval on dev set
if args.do_val: if args.do_val:
...@@ -413,6 +435,14 @@ def main(args): ...@@ -413,6 +435,14 @@ def main(args):
[loss.name, accuracy.name, num_seqs.name], "test") [loss.name, accuracy.name, num_seqs.name], "test")
def get_cards():
    """Return the number of GPU cards selected via CUDA_VISIBLE_DEVICES.

    Counts the comma-separated entries of the variable; returns 0 when it
    is unset or empty.
    """
    devices = os.environ.get('CUDA_VISIBLE_DEVICES', '')
    return len(devices.split(",")) if devices != '' else 0
if __name__ == '__main__': if __name__ == '__main__':
print_arguments(args) print_arguments(args)
check_cuda(args.use_cuda) check_cuda(args.use_cuda)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册