Unverified · Commit bb144d76 · Author: pkpk · Committer: GitHub

Add model check for DMTK (#2700)

* test=develop

* test=develop

* test=develop
Parent 3cd5b2c9
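This commit wires a CUDA pre-flight check into each DMTK entry script: every script appends `../../models/` to `sys.path`, imports `check_cuda` from the shared `model_check` module, and calls `check_cuda(args.use_cuda)` right after argument parsing, so a CPU-only PaddlePaddle install fails fast with a clear message instead of crashing later. The implementation of `model_check.check_cuda` is not part of this diff; the snippet below is only a minimal sketch of what such a helper typically does, assuming it relies on `fluid.is_compiled_with_cuda()` (the signature, default message, and exception handling are illustrative, not the actual code in models/model_check.py).

# Illustrative sketch only -- the real helper lives in models/model_check.py
# and is not shown in this diff.
import sys

import paddle.fluid as fluid


def check_cuda(use_cuda,
               err="use_cuda=True was set, but this PaddlePaddle build has no "
                   "CUDA support. Install paddlepaddle-gpu or set use_cuda=False."):
    """Exit early when CUDA is requested but unavailable in the installed Paddle."""
    try:
        if use_cuda and not fluid.is_compiled_with_cuda():
            print(err)
            sys.exit(1)
    except Exception:
        # Never let the pre-flight check itself break training.
        pass

With a helper like this in place, each script only needs the one-line call after `print_arguments(args)`, as the hunks below show.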
@@ -20,9 +20,12 @@ except ImportError as e:
     import pickle  #python 3
 sys.path.append('../../models/dialogue_model_toolkit/auto_dialogue_evaluation/')
+sys.path.append('../../models/')
 from net import Network
 import config
+from model_check import check_cuda
 def train(args):
     """Train
@@ -73,8 +76,9 @@ def train(args):
     print("device count %d" % dev_count)
     print("theoretical memory usage: ")
-    print(fluid.contrib.memory_usage(
-        program=train_program, batch_size=args.batch_size))
+    print(
+        fluid.contrib.memory_usage(
+            program=train_program, batch_size=args.batch_size))
     exe = fluid.Executor(place)
     exe.run(train_startup)
@@ -155,8 +159,8 @@ def train(args):
         main_program=train_program)
     print("Save model at step %d ... " % step)
-    print(time.strftime('%Y-%m-%d %H:%M:%S',
-                        time.localtime(time.time())))
+    print(
+        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
     best_recall = recall_dict['1_in_10']
     return best_recall
@@ -252,8 +256,9 @@ def finetune(args):
     print("device count %d" % dev_count)
     print("theoretical memory usage: ")
-    print(fluid.contrib.memory_usage(
-        program=train_program, batch_size=args.batch_size))
+    print(
+        fluid.contrib.memory_usage(
+            program=train_program, batch_size=args.batch_size))
     exe = fluid.Executor(place)
     exe.run(train_startup)
@@ -321,8 +326,8 @@ def finetune(args):
         main_program=train_program)
     print("Save model at step %d ... " % step)
-    print(time.strftime('%Y-%m-%d %H:%M:%S',
-                        time.localtime(time.time())))
+    print(
+        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
     best_cor = cor
     return best_cor
@@ -466,6 +471,8 @@ def main():
     args = config.parse_args()
     config.print_arguments(args)
+    check_cuda(args.use_cuda)
     if args.do_train == True:
         if args.loss_type == 'CLS':
             train(args)
......
@@ -5,12 +5,13 @@ Evaluation
 import sys
 import six
 import numpy as np
+from sklearn.metrics import average_precision_score
 def evaluate_ubuntu(file_path):
     """
     Evaluate on ubuntu data
     """
     def get_p_at_n_in_m(data, n, m, ind):
         """
         Recall n at m
@@ -18,7 +19,7 @@ def evaluate_ubuntu(file_path):
         pos_score = data[ind][0]
         curr = data[ind:ind + m]
         curr = sorted(curr, key=lambda x: x[0], reverse=True)
         if curr[n - 1][0] <= pos_score:
             return 1
         return 0
@@ -56,7 +57,8 @@ def evaluate_ubuntu(file_path):
         "1_in_2": p_at_1_in_2 / length,
         "1_in_10": p_at_1_in_10 / length,
         "2_in_10": p_at_2_in_10 / length,
-        "5_in_10": p_at_5_in_10 / length}
+        "5_in_10": p_at_5_in_10 / length
+    }
     return result_dict
@@ -65,6 +67,7 @@ def evaluate_douban(file_path):
     """
     Evaluate douban data
     """
+
     def mean_average_precision(sort_data):
         """
         Evaluate mean average precision
@@ -76,7 +79,7 @@ def evaluate_douban(file_path):
                 count_1 += 1
                 sum_precision += 1.0 * count_1 / (index + 1)
         return sum_precision / count_1
     def mean_reciprocal_rank(sort_data):
         """
         Evaluate MRR
@@ -84,7 +87,7 @@ def evaluate_douban(file_path):
         sort_lable = [s_d[1] for s_d in sort_data]
         assert 1 in sort_lable
         return 1.0 / (1 + sort_lable.index(1))
     def precision_at_position_1(sort_data):
         """
         Evaluate precision
@@ -93,7 +96,7 @@ def evaluate_douban(file_path):
             return 1
         else:
             return 0
     def recall_at_position_k_in_10(sort_data, k):
         """"
         Evaluate recall
@@ -101,7 +104,7 @@ def evaluate_douban(file_path):
         sort_lable = [s_d[1] for s_d in sort_data]
         select_lable = sort_lable[:k]
         return 1.0 * select_lable.count(1) / sort_lable.count(1)
     def evaluation_one_session(data):
         """
         Evaluate one session
@@ -147,7 +150,6 @@ def evaluate_douban(file_path):
         "P_1": 1.0 * sum_p_1 / total_num,
         "1_in_10": 1.0 * sum_r_1 / total_num,
         "2_in_10": 1.0 * sum_r_2 / total_num,
-        "5_in_10": 1.0 * sum_r_5 / total_num}
+        "5_in_10": 1.0 * sum_r_5 / total_num
+    }
     return result_dict
@@ -20,9 +20,12 @@ except ImportError as e:
     import pickle  #python 3
 sys.path.append('../../models/dialogue_model_toolkit/deep_attention_matching/')
+sys.path.append('../../models/')
+from model_check import check_cuda
 from net import Net
 def evaluate(score_path, result_file_path):
     """
     Evaluate both douban and ubuntu dataset
@@ -70,6 +73,7 @@ def test_with_pyreader(exe, program, pyreader, fetch_list, score_path, batches,
     """
     Test with pyreader
     """
+
     def data_provider():
         """
         Data reader
@@ -145,10 +149,12 @@ def train(args):
             staircase=True))
     optimizer.minimize(loss)
     print("begin memory optimization ...")
-    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
+    print(
+        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
     fluid.memory_optimize(train_program)
     print("end memory optimization ...")
-    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
+    print(
+        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
     test_program = fluid.Program()
     test_startup = fluid.Program()
@@ -270,6 +276,7 @@ def train(args):
     """
     Train on one epoch with pyreader
     """
+
     def data_provider():
         """
         Data reader
@@ -467,6 +474,9 @@ def get_cards():
 if __name__ == '__main__':
     args = config.parse_args()
     config.print_arguments(args)
+    check_cuda(args.use_cuda)
     if args.do_train:
         train(args)
......
@@ -34,11 +34,15 @@ import define_predict_pack
 import reader.data_reader as reader
 _WORK_DIR = os.path.split(os.path.realpath(__file__))[0]
-sys.path.append('../../models/dialogue_model_toolkit/dialogue_general_understanding')
+sys.path.append(
+    '../../models/dialogue_model_toolkit/dialogue_general_understanding')
+sys.path.append('../../models/')
 from bert import BertConfig, BertModel
 from create_model import create_model
 import define_paradigm
+from model_check import check_cuda
 def main(args):
@@ -55,10 +59,10 @@ def main(args):
         'udc': reader.UDCProcessor,
         'swda': reader.SWDAProcessor,
         'mrda': reader.MRDAProcessor,
         'atis_slot': reader.ATISSlotProcessor,
         'atis_intent': reader.ATISIntentProcessor,
         'dstc2': reader.DSTC2Processor,
         'dstc2_asr': reader.DSTC2Processor,
     }
     in_tokens = {
@@ -67,16 +71,16 @@ def main(args):
         'mrda': True,
         'atis_slot': False,
         'atis_intent': True,
         'dstc2': True,
         'dstc2_asr': True
     }
     processor = processors[task_name](data_dir=args.data_dir,
                                       vocab_path=args.vocab_path,
                                       max_seq_len=args.max_seq_len,
                                       do_lower_case=args.do_lower_case,
                                       in_tokens=in_tokens[task_name],
                                       task_name=task_name,
                                       random_seed=args.random_seed)
     num_labels = len(processor.get_labels())
@@ -117,10 +121,7 @@ def main(args):
         use_cuda=args.use_cuda, main_program=predict_prog)
     test_data_generator = processor.data_generator(
-        batch_size=args.batch_size,
-        phase='test',
-        epoch=1,
-        shuffle=False)
+        batch_size=args.batch_size, phase='test', epoch=1, shuffle=False)
     predict_pyreader.decorate_tensor_provider(test_data_generator)
     predict_pyreader.start()
@@ -138,15 +139,15 @@ def main(args):
     np.set_printoptions(precision=4, suppress=True)
     print("-------------- prediction results --------------")
     print("example_id\t" + ' '.join(processor.get_labels()))
     if in_tokens[task_name]:
         for index, result in enumerate(all_results):
             tags = pred_func(result)
             print("%s\t%s" % (index, tags))
     else:
         tags = pred_func(all_results, args.max_seq_len)
         for index, tag in enumerate(tags):
             print("%s\t%s" % (index, tag))
     if args.save_inference_model_path:
         _, ckpt_dir = os.path.split(args.init_checkpoint)
         dir_name = ckpt_dir + '_inference_model'
@@ -158,7 +159,10 @@ def main(args):
         main_program=predict_prog)
 if __name__ == '__main__':
     args = parser.parse_args()
     print_arguments(args)
+    check_cuda(args.use_cuda)
     main(args)
@@ -33,7 +33,11 @@ from utils.args import print_arguments
 from utils.init import init_checkpoint, init_pretraining_params
 _WORK_DIR = os.path.split(os.path.realpath(__file__))[0]
-sys.path.append('../../models/dialogue_model_toolkit/dialogue_general_understanding')
+sys.path.append(
+    '../../models/dialogue_model_toolkit/dialogue_general_understanding')
+sys.path.append('../../models/')
+from model_check import check_cuda
 from bert import BertConfig, BertModel
 from create_model import create_model
@@ -46,11 +50,12 @@ def evaluate(test_exe, test_program, test_pyreader, fetch_list, eval_phase):
     total_cost, total_acc, total_num_seqs = [], [], []
     time_begin = time.time()
     while True:
         try:
             if len(fetch_list) > 2:
-                np_loss, np_acc, np_num_seqs = test_exe.run(fetch_list=fetch_list)
+                np_loss, np_acc, np_num_seqs = test_exe.run(
+                    fetch_list=fetch_list)
                 total_acc.extend(np_acc * np_num_seqs)
             else:
                 np_loss, np_num_seqs = test_exe.run(fetch_list=fetch_list)
             total_cost.extend(np_loss * np_num_seqs)
             total_num_seqs.extend(np_num_seqs)
@@ -58,26 +63,28 @@ def evaluate(test_exe, test_program, test_pyreader, fetch_list, eval_phase):
             test_pyreader.reset()
             break
     time_end = time.time()
-    current_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
+    current_time = time.strftime('%Y-%m-%d %H:%M:%S',
+                                 time.localtime(time.time()))
     if len(fetch_list) > 2:
-        print("[%s evaluation] %s ave loss: %f, ave acc: %f, elapsed time: %f s" %
-              (eval_phase, current_time, np.sum(total_cost) / np.sum(total_num_seqs),
-               np.sum(total_acc) / np.sum(total_num_seqs), time_end - time_begin))
+        print("[%s evaluation] %s ave loss: %f, ave acc: %f, elapsed time: %f s"
+              % (eval_phase, current_time, np.sum(total_cost) /
+                 np.sum(total_num_seqs), np.sum(total_acc) /
+                 np.sum(total_num_seqs), time_end - time_begin))
     else:
         print("[%s evaluation] %s ave loss: %f, elapsed time: %f s" %
-              (eval_phase, current_time, np.sum(total_cost) / np.sum(total_num_seqs),
-               time_end - time_begin))
+              (eval_phase, current_time, np.sum(total_cost) /
+               np.sum(total_num_seqs), time_end - time_begin))
 def main(args):
     """main function"""
     bert_config = BertConfig(args.bert_config_path)
     bert_config.print_config()
     if args.use_cuda:
         place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
         dev_count = fluid.core.get_cuda_device_count()
     else:
         place = fluid.CPUPlace()
         dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
     exe = fluid.Executor(place)
@@ -105,14 +112,14 @@ def main(args):
     processor = processors[task_name](data_dir=args.data_dir,
                                       vocab_path=args.vocab_path,
                                       max_seq_len=args.max_seq_len,
                                       do_lower_case=args.do_lower_case,
                                       in_tokens=in_tokens[task_name],
                                       task_name=task_name,
                                       random_seed=args.random_seed)
     num_labels = len(processor.get_labels())
     if not (args.do_train or args.do_val or args.do_test):
         raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                          "least one of them must be True.")
@@ -120,7 +127,7 @@ def main(args):
     if args.random_seed is not None:
         startup_prog.random_seed = args.random_seed
     if args.do_train:
         train_data_generator = processor.data_generator(
             batch_size=args.batch_size,
             phase='train',
@@ -128,10 +135,10 @@ def main(args):
             shuffle=True)
         num_train_examples = processor.get_num_examples(phase='train')
         if in_tokens[task_name]:
             max_train_steps = args.epoch * num_train_examples // (
                 args.batch_size // args.max_seq_len) // dev_count
         else:
             max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
         warmup_steps = int(max_train_steps * args.warmup_proportion)
@@ -168,24 +175,25 @@ def main(args):
                 use_fp16=args.use_fp16,
                 loss_scaling=args.loss_scaling)
         if accuracy is not None:
-            skip_opt_set = [loss.name, probs.name, accuracy.name, num_seqs.name]
+            skip_opt_set = [
+                loss.name, probs.name, accuracy.name, num_seqs.name
+            ]
         else:
             skip_opt_set = [loss.name, probs.name, num_seqs.name]
         fluid.memory_optimize(
-            input_program=train_program,
-            skip_opt_set=skip_opt_set)
+            input_program=train_program, skip_opt_set=skip_opt_set)
         if args.verbose:
             if in_tokens[task_name]:
                 lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                     program=train_program,
                     batch_size=args.batch_size // args.max_seq_len)
             else:
                 lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                     program=train_program, batch_size=args.batch_size)
             print("Theoretical memory usage in training: %.3f - %.3f %s" %
                   (lower_mem, upper_mem, unit))
     if args.do_val or args.do_test:
         test_prog = fluid.Program()
@@ -203,37 +211,37 @@ def main(args):
             accuracy = test_results.get("accuracy", None)
             num_seqs = test_results.get("num_seqs", None)
         test_prog = test_prog.clone(for_test=True)
     exe.run(startup_prog)
     if args.do_train:
         if args.init_checkpoint and args.init_pretraining_params:
             print(
                 "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                 "both are set! Only arg 'init_checkpoint' is made valid.")
         if args.init_checkpoint:
             init_checkpoint(
                 exe,
                 args.init_checkpoint,
                 main_program=startup_prog,
                 use_fp16=args.use_fp16)
         elif args.init_pretraining_params:
             init_pretraining_params(
                 exe,
                 args.init_pretraining_params,
                 main_program=startup_prog,
                 use_fp16=args.use_fp16)
     elif args.do_val or args.do_test:
         if not args.init_checkpoint:
             raise ValueError("args 'init_checkpoint' should be set if"
                              "only doing validation or testing!")
         init_checkpoint(
             exe,
             args.init_checkpoint,
             main_program=startup_prog,
             use_fp16=args.use_fp16)
     if args.do_train:
         exec_strategy = fluid.ExecutionStrategy()
         exec_strategy.use_experimental_executor = args.use_fast_executor
         exec_strategy.num_threads = dev_count
@@ -245,115 +253,136 @@ def main(args):
             exec_strategy=exec_strategy,
             main_program=train_program)
         train_pyreader.decorate_tensor_provider(train_data_generator)
     else:
         train_exe = None
     if args.do_val or args.do_test:
         test_exe = fluid.ParallelExecutor(
             use_cuda=args.use_cuda,
             main_program=test_prog,
             share_vars_from=train_exe)
     if args.do_train:
         train_pyreader.start()
         steps = 0
         total_cost, total_acc, total_num_seqs = [], [], []
         time_begin = time.time()
         ce_info = []
         while True:
             try:
                 steps += 1
                 if steps % args.skip_steps == 0:
                     if warmup_steps <= 0:
                         if accuracy is not None:
-                            fetch_list = [loss.name, accuracy.name, num_seqs.name]
+                            fetch_list = [
+                                loss.name, accuracy.name, num_seqs.name
+                            ]
                         else:
                             fetch_list = [loss.name, num_seqs.name]
                     else:
                         if accuracy is not None:
                             fetch_list = [
                                 loss.name, accuracy.name, scheduled_lr.name,
                                 num_seqs.name
                             ]
                         else:
-                            fetch_list = [loss.name, scheduled_lr.name, num_seqs.name]
+                            fetch_list = [
+                                loss.name, scheduled_lr.name, num_seqs.name
+                            ]
                 else:
                     fetch_list = []
                 if accuracy is not None:
                     fetch_test_list = [loss.name, accuracy.name, num_seqs.name]
                 else:
                     fetch_test_list = [loss.name, num_seqs.name]
                 outputs = train_exe.run(fetch_list=fetch_list)
                 if steps % args.skip_steps == 0:
                     if warmup_steps <= 0:
                         if accuracy is not None:
                             np_loss, np_acc, np_num_seqs = outputs
                         else:
                             np_loss, np_num_seqs = outputs
                     else:
                         if accuracy is not None:
                             np_loss, np_acc, np_lr, np_num_seqs = outputs
                         else:
                             np_loss, np_lr, np_num_seqs = outputs
                     total_cost.extend(np_loss * np_num_seqs)
                     total_num_seqs.extend(np_num_seqs)
                     if accuracy is not None:
                         total_acc.extend(np_acc * np_num_seqs)
                     if args.verbose:
-                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
+                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
+                        )
                         verbose += "learning rate: %f" % (
                             np_lr[0]
                             if warmup_steps > 0 else args.learning_rate)
                         print(verbose)
-                    current_example, current_epoch = processor.get_train_progress()
+                    current_example, current_epoch = processor.get_train_progress(
+                    )
                     time_end = time.time()
                     used_time = time_end - time_begin
-                    current_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
+                    current_time = time.strftime('%Y-%m-%d %H:%M:%S',
+                                                 time.localtime(time.time()))
                     if accuracy is not None:
-                        print("%s epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
-                              "ave acc: %f, speed: %f steps/s" %
-                              (current_time, current_epoch, current_example, num_train_examples,
-                               steps, np.sum(total_cost) / np.sum(total_num_seqs),
-                               np.sum(total_acc) / np.sum(total_num_seqs),
-                               args.skip_steps / used_time))
-                        ce_info.append([np.sum(total_cost) / np.sum(total_num_seqs), np.sum(total_acc) / np.sum(total_num_seqs), args.skip_steps / used_time])
+                        print(
+                            "%s epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
+                            "ave acc: %f, speed: %f steps/s" %
+                            (current_time, current_epoch, current_example,
+                             num_train_examples, steps,
+                             np.sum(total_cost) / np.sum(total_num_seqs),
+                             np.sum(total_acc) / np.sum(total_num_seqs),
+                             args.skip_steps / used_time))
+                        ce_info.append([
+                            np.sum(total_cost) / np.sum(total_num_seqs),
+                            np.sum(total_acc) / np.sum(total_num_seqs),
+                            args.skip_steps / used_time
+                        ])
                     else:
-                        print("%s epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
-                              "speed: %f steps/s" %
-                              (current_time, current_epoch, current_example, num_train_examples,
-                               steps, np.sum(total_cost) / np.sum(total_num_seqs),
-                               args.skip_steps / used_time))
-                        ce_info.append([np.sum(total_cost) / np.sum(total_num_seqs), args.skip_steps / used_time])
+                        print(
+                            "%s epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
+                            "speed: %f steps/s" %
+                            (current_time, current_epoch, current_example,
+                             num_train_examples, steps,
+                             np.sum(total_cost) / np.sum(total_num_seqs),
+                             args.skip_steps / used_time))
+                        ce_info.append([
+                            np.sum(total_cost) / np.sum(total_num_seqs),
+                            args.skip_steps / used_time
+                        ])
                     total_cost, total_acc, total_num_seqs = [], [], []
                     time_begin = time.time()
                 if steps % args.save_steps == 0:
-                    save_path = os.path.join(args.checkpoints, "step_" + str(steps))
+                    save_path = os.path.join(args.checkpoints,
+                                             "step_" + str(steps))
                     fluid.io.save_persistables(exe, save_path, train_program)
                 if steps % args.validation_steps == 0:
                     #evaluate dev set
                     if args.do_val:
                         test_pyreader.decorate_tensor_provider(
                             processor.data_generator(
                                 batch_size=args.batch_size,
                                 phase='dev',
                                 epoch=1,
                                 shuffle=False))
-                        evaluate(test_exe, test_prog, test_pyreader, fetch_test_list, "dev")
+                        evaluate(test_exe, test_prog, test_pyreader,
+                                 fetch_test_list, "dev")
                     #evaluate test set
                     if args.do_test:
                         test_pyreader.decorate_tensor_provider(
                             processor.data_generator(
                                 batch_size=args.batch_size,
                                 phase='test',
                                 epoch=1,
                                 shuffle=False))
-                        evaluate(test_exe, test_prog, test_pyreader, fetch_test_list, "test")
+                        evaluate(test_exe, test_prog, test_pyreader,
+                                 fetch_test_list, "test")
             except fluid.core.EOFException:
                 save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                 fluid.io.save_persistables(exe, save_path, train_program)
@@ -372,30 +401,28 @@ def main(args):
         except:
             print("ce info error")
         print("kpis\teach_step_duration_%s_card%s\t%s" %
               (task_name, card_num, ce_time))
-        print("kpis\ttrain_loss_%s_card%s\t%f" %
-              (task_name, card_num, ce_loss))
-        print("kpis\ttrain_acc_%s_card%s\t%f" %
-              (task_name, card_num, ce_acc))
+        print("kpis\ttrain_loss_%s_card%s\t%f" % (task_name, card_num, ce_loss))
+        print("kpis\ttrain_acc_%s_card%s\t%f" % (task_name, card_num, ce_acc))
     #final eval on dev set
     if args.do_val:
         test_pyreader.decorate_tensor_provider(
             processor.data_generator(
                 batch_size=args.batch_size, phase='dev', epoch=1,
                 shuffle=False))
         print("Final validation result:")
         evaluate(test_exe, test_prog, test_pyreader, fetch_test_list, "dev")
     #final eval on test set
     if args.do_test:
         test_pyreader.decorate_tensor_provider(
             processor.data_generator(
                 batch_size=args.batch_size,
                 phase='test',
                 epoch=1,
                 shuffle=False))
         print("Final test result:")
         evaluate(test_exe, test_prog, test_pyreader, fetch_test_list, "test")
@@ -408,7 +435,10 @@ def get_cards():
     return num
 if __name__ == '__main__':
     args = parser.parse_args()
     print_arguments(args)
+    check_cuda(args.use_cuda)
     main(args)