未验证 提交 e99b607e 编写于 作者: Y Yiqun Liu 提交者: GitHub

Enhance the training processing of rnn_rearch (#3168)

* Refine the print and add timing for each step and epoch.

* Enable the profile.
上级 2ef93ad2
...@@ -119,5 +119,15 @@ def parse_args(): ...@@ -119,5 +119,15 @@ def parse_args():
help="The flag indicating whether to run the task " help="The flag indicating whether to run the task "
"for continuous evaluation.") "for continuous evaluation.")
parser.add_argument(
"--parallel",
action='store_true',
help="Whether execute with the data_parallel mode.")
parser.add_argument(
"--profile",
action='store_true',
help="Whether enable the profile.")
args = parser.parse_args() args = parser.parse_args()
return args return args
...@@ -20,12 +20,13 @@ import numpy as np ...@@ -20,12 +20,13 @@ import numpy as np
import time import time
import os import os
import random import random
import math import math
import contextlib
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
import paddle.fluid.profiler as profiler
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
import reader import reader
...@@ -45,7 +46,16 @@ import pickle ...@@ -45,7 +46,16 @@ import pickle
SEED = 123 SEED = 123
def train(): @contextlib.contextmanager
def profile_context(profile=True):
if profile:
with profiler.profiler('All', 'total', 'seq2seq.profile'):
yield
else:
yield
def main():
args = parse_args() args = parse_args()
num_layers = args.num_layers num_layers = args.num_layers
...@@ -106,6 +116,29 @@ def train(): ...@@ -106,6 +116,29 @@ def train():
exe = Executor(place) exe = Executor(place)
exe.run(framework.default_startup_program()) exe.run(framework.default_startup_program())
device_count = len(fluid.cuda_places()) if args.use_gpu else len(
fluid.cpu_places())
if device_count > 1:
raise Exception("Training using multi-GPUs is not supported now.")
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = device_count
exec_strategy.num_iteration_per_drop_scope = 100
build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = True
build_strategy.memory_optimize = False
# build_strategy.fuse_all_optimizer_ops = True
if args.parallel:
train_program = fluid.compiler.CompiledProgram(
framework.default_main_program()).with_data_parallel(
loss_name=loss.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
else:
train_program = framework.default_main_program()
train_data_prefix = args.train_data_prefix train_data_prefix = args.train_data_prefix
eval_data_prefix = args.eval_data_prefix eval_data_prefix = args.eval_data_prefix
test_data_prefix = args.test_data_prefix test_data_prefix = args.test_data_prefix
...@@ -160,63 +193,78 @@ def train(): ...@@ -160,63 +193,78 @@ def train():
return ppl return ppl
ce_time = [] def train():
ce_ppl = [] ce_time = []
max_epoch = args.max_epoch ce_ppl = []
for epoch_id in range(max_epoch): max_epoch = args.max_epoch
start_time = time.time() for epoch_id in range(max_epoch):
print("epoch id", epoch_id) start_time = time.time()
if args.enable_ce: if args.enable_ce:
train_data_iter = reader.get_data_iter(train_data, batch_size, enable_ce=True) train_data_iter = reader.get_data_iter(train_data, batch_size, enable_ce=True)
else: else:
train_data_iter = reader.get_data_iter(train_data, batch_size) train_data_iter = reader.get_data_iter(train_data, batch_size)
total_loss = 0
word_count = 0.0
batch_times = []
for batch_id, batch in enumerate(train_data_iter):
batch_start_time = time.time()
input_data_feed, word_num = prepare_input(batch, epoch_id=epoch_id)
fetch_outs = exe.run(program=train_program,
feed=input_data_feed,
fetch_list=[loss.name],
use_program_cache=True)
cost_train = np.array(fetch_outs[0])
total_loss += cost_train * batch_size
word_count += word_num
batch_end_time = time.time()
batch_time = batch_end_time - batch_start_time
batch_times.append(batch_time)
if batch_id > 0 and batch_id % 100 == 0:
print(
"-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f"
% (epoch_id, batch_id, batch_time, np.exp(total_loss / word_count)))
ce_ppl.append(np.exp(total_loss / word_count))
total_loss = 0.0
word_count = 0.0
end_time = time.time()
epoch_time = end_time - start_time
ce_time.append(epoch_time)
print(
"\nTrain epoch:[%d]; Epoch Time: %.5f; avg_time: %.5f s/step\n"
% (epoch_id, epoch_time, sum(batch_times) / len(batch_times)))
if not args.profile:
dir_name = args.model_path + "/epoch_" + str(epoch_id)
print("begin to save", dir_name)
fluid.io.save_params(exe, dir_name)
print("save finished")
dev_ppl = eval(valid_data)
print("dev ppl", dev_ppl)
test_ppl = eval(test_data)
print("test ppl", test_ppl)
total_loss = 0 if args.enable_ce:
word_count = 0.0 card_num = get_cards()
for batch_id, batch in enumerate(train_data_iter): _ppl = 0
_time = 0
input_data_feed, word_num = prepare_input(batch, epoch_id=epoch_id) try:
fetch_outs = exe.run(feed=input_data_feed, _time = ce_time[-1]
fetch_list=[loss.name], _ppl = ce_ppl[-1]
use_program_cache=True) except:
print("ce info error")
cost_train = np.array(fetch_outs[0]) print("kpis\ttrain_duration_card%s\t%s" %
(card_num, _time))
total_loss += cost_train * batch_size print("kpis\ttrain_ppl_card%s\t%f" %
word_count += word_num (card_num, _ppl))
if batch_id > 0 and batch_id % 100 == 0: with profile_context(args.profile):
print("ppl", batch_id, np.exp(total_loss / word_count)) train()
ce_ppl.append(np.exp(total_loss / word_count))
total_loss = 0.0
word_count = 0.0
end_time = time.time()
time_gap = end_time - start_time
ce_time.append(time_gap)
dir_name = args.model_path + "/epoch_" + str(epoch_id)
print("begin to save", dir_name)
fluid.io.save_params(exe, dir_name)
print("save finished")
dev_ppl = eval(valid_data)
print("dev ppl", dev_ppl)
test_ppl = eval(test_data)
print("test ppl", test_ppl)
if args.enable_ce:
card_num = get_cards()
_ppl = 0
_time = 0
try:
_time = ce_time[-1]
_ppl = ce_ppl[-1]
except:
print("ce info error")
print("kpis\ttrain_duration_card%s\t%s" %
(card_num, _time))
print("kpis\ttrain_ppl_card%s\t%f" %
(card_num, _ppl))
def get_cards(): def get_cards():
...@@ -228,4 +276,4 @@ def get_cards(): ...@@ -228,4 +276,4 @@ def get_cards():
if __name__ == '__main__': if __name__ == '__main__':
train() main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册