diff --git a/PaddleNLP/unarchived/neural_machine_translation/rnn_search/args.py b/PaddleNLP/unarchived/neural_machine_translation/rnn_search/args.py
index 5544a385ce2ce6c3e9c26cd9a59dd40d5113e827..8d27f9abf05d34d4b94b0a03f13a31eee661311c 100644
--- a/PaddleNLP/unarchived/neural_machine_translation/rnn_search/args.py
+++ b/PaddleNLP/unarchived/neural_machine_translation/rnn_search/args.py
@@ -119,5 +119,15 @@ def parse_args():
         help="The flag indicating whether to run the task "
         "for continuous evaluation.")
 
+    parser.add_argument(
+        "--parallel",
+        action='store_true',
+        help="Whether execute with the data_parallel mode.")
+
+    parser.add_argument(
+        "--profile",
+        action='store_true',
+        help="Whether enable the profile.")
+
     args = parser.parse_args()
     return args
diff --git a/PaddleNLP/unarchived/neural_machine_translation/rnn_search/train.py b/PaddleNLP/unarchived/neural_machine_translation/rnn_search/train.py
index c5657bd902827e0d78d5b03a03f8da6edf60db9d..820c383a0682b61c728657cdc1349fcd46d5a650 100644
--- a/PaddleNLP/unarchived/neural_machine_translation/rnn_search/train.py
+++ b/PaddleNLP/unarchived/neural_machine_translation/rnn_search/train.py
@@ -20,12 +20,13 @@ import numpy as np
 import time
 import os
 import random
-
 import math
+import contextlib
 
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.framework as framework
+import paddle.fluid.profiler as profiler
 from paddle.fluid.executor import Executor
 
 import reader
@@ -45,7 +46,16 @@ import pickle
 SEED = 123
 
 
-def train():
+@contextlib.contextmanager
+def profile_context(profile=True):
+    if profile:
+        with profiler.profiler('All', 'total', 'seq2seq.profile'):
+            yield
+    else:
+        yield
+
+
+def main():
     args = parse_args()
 
     num_layers = args.num_layers
@@ -106,6 +116,29 @@ def train():
     exe = Executor(place)
     exe.run(framework.default_startup_program())
 
+    device_count = len(fluid.cuda_places()) if args.use_gpu else len(
+        fluid.cpu_places())
+    if device_count > 1:
+        raise Exception("Training using multi-GPUs is not supported now.")
+
+    exec_strategy = fluid.ExecutionStrategy()
+    exec_strategy.num_threads = device_count
+    exec_strategy.num_iteration_per_drop_scope = 100
+
+    build_strategy = fluid.BuildStrategy()
+    build_strategy.enable_inplace = True
+    build_strategy.memory_optimize = False
+# build_strategy.fuse_all_optimizer_ops = True
+
+    if args.parallel:
+        train_program = fluid.compiler.CompiledProgram(
+            framework.default_main_program()).with_data_parallel(
+                loss_name=loss.name,
+                build_strategy=build_strategy,
+                exec_strategy=exec_strategy)
+    else:
+        train_program = framework.default_main_program()
+
     train_data_prefix = args.train_data_prefix
     eval_data_prefix = args.eval_data_prefix
     test_data_prefix = args.test_data_prefix
@@ -160,63 +193,78 @@ def train():
         return ppl
 
-    ce_time = []
-    ce_ppl = []
-    max_epoch = args.max_epoch
-    for epoch_id in range(max_epoch):
-        start_time = time.time()
-        print("epoch id", epoch_id)
-        if args.enable_ce:
-            train_data_iter = reader.get_data_iter(train_data, batch_size, enable_ce=True)
-        else:
-            train_data_iter = reader.get_data_iter(train_data, batch_size)
-
+    def train():
+        ce_time = []
+        ce_ppl = []
+        max_epoch = args.max_epoch
+        for epoch_id in range(max_epoch):
+            start_time = time.time()
+            if args.enable_ce:
+                train_data_iter = reader.get_data_iter(train_data, batch_size, enable_ce=True)
+            else:
+                train_data_iter = reader.get_data_iter(train_data, batch_size)
+
+
+            total_loss = 0
+            word_count = 0.0
+            batch_times = []
+            for batch_id, batch in enumerate(train_data_iter):
+                batch_start_time = time.time()
+                input_data_feed, word_num = prepare_input(batch, epoch_id=epoch_id)
+                fetch_outs = exe.run(program=train_program,
+                                     feed=input_data_feed,
+                                     fetch_list=[loss.name],
+                                     use_program_cache=True)
+
+                cost_train = np.array(fetch_outs[0])
+
+                total_loss += cost_train * batch_size
+                word_count += word_num
+                batch_end_time = time.time()
+                batch_time = batch_end_time - batch_start_time
+                batch_times.append(batch_time)
+
+                if batch_id > 0 and batch_id % 100 == 0:
+                    print(
+                        "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f"
+                        % (epoch_id, batch_id, batch_time, np.exp(total_loss / word_count)))
+                    ce_ppl.append(np.exp(total_loss / word_count))
+                    total_loss = 0.0
+                    word_count = 0.0
+
+            end_time = time.time()
+            epoch_time = end_time - start_time
+            ce_time.append(epoch_time)
+            print(
+                "\nTrain epoch:[%d]; Epoch Time: %.5f; avg_time: %.5f s/step\n"
+                % (epoch_id, epoch_time, sum(batch_times) / len(batch_times)))
+
+            if not args.profile:
+                dir_name = args.model_path + "/epoch_" + str(epoch_id)
+                print("begin to save", dir_name)
+                fluid.io.save_params(exe, dir_name)
+                print("save finished")
+                dev_ppl = eval(valid_data)
+                print("dev ppl", dev_ppl)
+                test_ppl = eval(test_data)
+                print("test ppl", test_ppl)
-        total_loss = 0
-        word_count = 0.0
-        for batch_id, batch in enumerate(train_data_iter):
-
-            input_data_feed, word_num = prepare_input(batch, epoch_id=epoch_id)
-            fetch_outs = exe.run(feed=input_data_feed,
-                                 fetch_list=[loss.name],
-                                 use_program_cache=True)
-
-            cost_train = np.array(fetch_outs[0])
-
-            total_loss += cost_train * batch_size
-            word_count += word_num
-
-            if batch_id > 0 and batch_id % 100 == 0:
-                print("ppl", batch_id, np.exp(total_loss / word_count))
-                ce_ppl.append(np.exp(total_loss / word_count))
-                total_loss = 0.0
-                word_count = 0.0
-        end_time = time.time()
-        time_gap = end_time - start_time
-        ce_time.append(time_gap)
-
-        dir_name = args.model_path + "/epoch_" + str(epoch_id)
-        print("begin to save", dir_name)
-        fluid.io.save_params(exe, dir_name)
-        print("save finished")
-        dev_ppl = eval(valid_data)
-        print("dev ppl", dev_ppl)
-        test_ppl = eval(test_data)
-        print("test ppl", test_ppl)
-
-    if args.enable_ce:
-        card_num = get_cards()
-        _ppl = 0
-        _time = 0
-        try:
-            _time = ce_time[-1]
-            _ppl = ce_ppl[-1]
-        except:
-            print("ce info error")
-        print("kpis\ttrain_duration_card%s\t%s" %
-              (card_num, _time))
-        print("kpis\ttrain_ppl_card%s\t%f" %
-              (card_num, _ppl))
+        if args.enable_ce:
+            card_num = get_cards()
+            _ppl = 0
+            _time = 0
+            try:
+                _time = ce_time[-1]
+                _ppl = ce_ppl[-1]
+            except:
+                print("ce info error")
+            print("kpis\ttrain_duration_card%s\t%s" %
+                  (card_num, _time))
+            print("kpis\ttrain_ppl_card%s\t%f" %
+                  (card_num, _ppl))
+
+    with profile_context(args.profile):
+        train()
 
 
 def get_cards():
@@ -228,4 +276,4 @@ def get_cards():
 
 
 if __name__ == '__main__':
-    train()
+    main()
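
Usage sketch for the two new switches introduced above; both are opt-in, so existing runs are unchanged. The invocation below is hypothetical and omits the dataset, vocabulary, and model-path options already defined in args.py, which still have to be supplied as before:

    python train.py --parallel --profile

With --profile set, the whole training run executes inside profiler.profiler('All', 'total', 'seq2seq.profile'), so the timing report is written to seq2seq.profile, and the per-epoch checkpoint/evaluation block is skipped (the "if not args.profile:" branch). With --parallel set, the default main program is compiled via CompiledProgram(...).with_data_parallel(...) with the build and execution strategies shown above and passed to exe.run as train_program; without it, the plain default main program is used.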