Commit 5a50806e authored by LiuChiaChi

Merge branch 'develop' of https://github.com/PaddlePaddle/models into update-seq2seq-attn

@@ -34,6 +34,24 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+
+class TimeCostAverage(object):
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.cnt = 0
+        self.total_time = 0
+
+    def record(self, usetime):
+        self.cnt += 1
+        self.total_time += usetime
+
+    def get_average(self):
+        if self.cnt == 0:
+            return 0
+        return self.total_time / self.cnt
+
 def build_program(is_train, main_prog, startup_prog, args):
     """build program, and add backward op in program according to different mode
@@ -225,7 +243,11 @@ def train(args):
     compiled_train_prog = best_strategy_compiled(args, train_prog,
                                                  train_fetch_vars[0], exe)
+    batch_cost_avg = TimeCostAverage()
+
     #NOTE: this for benchmark
     total_batch_num = 0
     for pass_id in range(args.num_epochs):
         if num_trainers > 1 and not args.use_dali:
@@ -234,7 +256,6 @@ def train(args):
         train_batch_id = 0
         train_batch_time_record = []
         train_batch_metrics_record = []
-        train_batch_time_print_step = []
 
         if not args.use_dali:
             train_iter = train_data_loader()
@@ -252,25 +273,18 @@ def train(args):
             t2 = time.time()
             train_batch_elapse = t2 - t1
             train_batch_time_record.append(train_batch_elapse)
+            batch_cost_avg.record(train_batch_elapse)
             train_batch_metrics_avg = np.mean(
                 np.array(train_batch_metrics), axis=1)
             train_batch_metrics_record.append(train_batch_metrics_avg)
             if trainer_id == 0:
-                if train_batch_id % args.print_step == 0:
-                    if len(train_batch_time_print_step) == 0:
-                        train_batch_time_print_step_avg = train_batch_elapse
-                    else:
-                        train_batch_time_print_step_avg = np.mean(
-                            train_batch_time_print_step)
-                        train_batch_time_print_step = []
-                    print_info("batch", train_batch_metrics_avg,
-                               train_batch_time_print_step_avg, pass_id,
-                               train_batch_id, args.print_step)
-                else:
-                    train_batch_time_print_step.append(train_batch_elapse)
+                print_info("batch", train_batch_metrics_avg,
+                           batch_cost_avg.get_average(), pass_id,
+                           train_batch_id, args.print_step)
                 sys.stdout.flush()
+                if train_batch_id % args.print_step == 0:
+                    batch_cost_avg.reset()
             train_batch_id += 1
             t1 = time.time()
             #NOTE: this for benchmark profiler
......
@@ -49,6 +49,19 @@ import pickle
 SEED = 123
 
+class TimeCostAverage(object):
+    def __init__(self):
+        self.reset()
+    def reset(self):
+        self.cnt = 0
+        self.total_time = 0
+    def record(self, usetime):
+        self.cnt += 1
+        self.total_time += usetime
+    def get_average(self):
+        if self.cnt == 0:
+            return 0
+        return self.total_time / self.cnt
 
 @contextlib.contextmanager
 def profile_context(profile=True, profiler_path='/tmp/paddingrnn.profile'):
@@ -293,8 +306,10 @@ def main():
     total_loss = 0
     iters = 0
+    batch_cost_avg = TimeCostAverage()
 
     init_hidden, init_cell = generate_init_data()
+    batch_start_time = time.time()
     for batch_id, batch in enumerate(train_data_iter):
         input_data_feed = prepare_input(
             batch,
@@ -303,7 +318,6 @@ def main():
             epoch_id=epoch_id,
             with_lr=True,
             device_count=device_count)
-        batch_start_time = time.time()
         fetch_outs = exe.run(train_program,
                              feed=input_data_feed,
                              fetch_list=[
@@ -313,6 +327,7 @@ def main():
                              use_program_cache=True)
         batch_time = time.time() - batch_start_time
         batch_times.append(batch_time)
+        batch_cost_avg.record(batch_time)
 
         cost_train = np.array(fetch_outs[0])
         lr = np.array(fetch_outs[1])
@@ -324,13 +339,17 @@ def main():
             ppl = np.exp(total_loss / iters)
             print(
                 "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
-                % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))
+                % (epoch_id, batch_id, batch_cost_avg.get_average(), ppl[0], lr[0]))
+            batch_cost_avg.reset()
 
         # profiler tools for benchmark
         if args.profile and batch_id == log_interval:
             profiler.reset_profiler()
         elif args.profile and batch_id == (log_interval + 5):
             break
+        batch_start_time = time.time()
 
     ppl = np.exp(total_loss / iters)
     return ppl
@@ -342,6 +361,7 @@ def main():
     total_loss = 0
     iters = 0
+    batch_cost_avg = TimeCostAverage()
 
     dataloader.start()
     batch_id = 0
@@ -355,6 +375,7 @@ def main():
             batch_time = time.time() - batch_start_time
             batch_times.append(batch_time)
             batch_start_time = time.time()
+            batch_cost_avg.record(batch_time)
 
             new_lr = generate_new_lr(epoch_id, device_count)
             data_feeds['learning_rate'] = new_lr
@@ -381,7 +402,8 @@ def main():
                 ppl = np.exp(total_loss / iters)
                 print(
                     "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
-                    % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))
+                    % (epoch_id, batch_id, batch_cost_avg.get_average(), ppl[0], lr[0]))
+                batch_cost_avg.reset()
             batch_id += 1
             # profiler tools for benchmark
......
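In the PaddingRNN hunks above, batch_start_time also moves: instead of being stamped just before exe.run, it is set once before the loop and refreshed at the end of every iteration, so the recorded cost covers input preparation as well as execution. A minimal sketch of that placement, with hypothetical stand-ins (prepare_batch, run_batch, log_interval) for the real steps:

import time

def prepare_batch(i):
    # hypothetical input-preparation step
    time.sleep(0.002)
    return i

def run_batch(batch):
    # hypothetical executor step
    time.sleep(0.008)

batch_cost_avg = TimeCostAverage()   # class shown above
log_interval = 10                    # assumed log frequency
batch_start_time = time.time()       # stamped once, before the loop
for batch_id in range(50):
    batch = prepare_batch(batch_id)  # now included in the measured cost
    run_batch(batch)
    batch_cost_avg.record(time.time() - batch_start_time)
    if batch_id > 0 and batch_id % log_interval == 0:
        print("avg batch cost: %.5f s" % batch_cost_avg.get_average())
        batch_cost_avg.reset()
    batch_start_time = time.time()   # refreshed at loop end for the next batch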
@@ -38,6 +38,19 @@ args = parse_args()
 if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
     print_arguments(args)
 
+class TimeCostAverage(object):
+    def __init__(self):
+        self.reset()
+    def reset(self):
+        self.cnt = 0
+        self.total_time = 0
+    def record(self, usetime):
+        self.cnt += 1
+        self.total_time += usetime
+    def get_average(self):
+        if self.cnt == 0:
+            return 0
+        return self.total_time / self.cnt
 
 def eval(net, test_data_loader, eop):
     total_loss = 0.0
@@ -170,6 +183,10 @@ def train_mobilenet():
         t_last = 0
         # 4.1 for each batch, call net() , backward(), and minimize()
+        batch_cost_avg = TimeCostAverage()
+        batch_reader_avg = TimeCostAverage()
+        batch_net_avg = TimeCostAverage()
+        batch_backward_avg = TimeCostAverage()
         batch_start = time.time()
         for img, label in train_data_loader():
             if args.max_iter and total_batch_num == args.max_iter:
@@ -208,16 +225,25 @@ def train_mobilenet():
             # NOTE: used for benchmark
             train_batch_cost = time.time() - batch_start
+            batch_cost_avg.record(train_batch_cost)
+            batch_reader_avg.record(batch_reader_end - batch_start)
+            batch_net_avg.record(batch_net_end - batch_reader_end)
+            batch_backward_avg.record(batch_backward_end - batch_net_end)
             total_batch_num = total_batch_num + 1
             if batch_id % args.print_step == 0:
                 print(
                     "[Epoch %d, batch %d], avg_loss %.5f, acc_top1 %.5f, acc_top5 %.5f, batch_cost: %.5f s, net_t: %.5f s, backward_t: %.5f s, reader_t: %.5f s"
                     % (eop, batch_id, avg_loss.numpy(), acc_top1.numpy(),
-                       acc_top5.numpy(), train_batch_cost,
-                       batch_net_end - batch_reader_end,
-                       batch_backward_end - batch_net_end,
-                       batch_reader_end - batch_start))
+                       acc_top5.numpy(), batch_cost_avg.get_average(),
+                       batch_net_avg.get_average(),
+                       batch_backward_avg.get_average(),
+                       batch_reader_avg.get_average()))
                 sys.stdout.flush()
+                batch_cost_avg.reset()
+                batch_net_avg.reset()
+                batch_backward_avg.reset()
+                batch_reader_avg.reset()
             batch_start = time.time()
         if args.ce:
......
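The MobileNet hunks split each iteration into reader, forward, and backward phases by stamping the clock at phase boundaries and feeding each delta into its own TimeCostAverage. A sketch of that phase-timing pattern, with hypothetical workload functions standing in for the real reader and model calls:

import time

def read_batch():
    time.sleep(0.003)   # hypothetical data loading

def forward():
    time.sleep(0.005)   # hypothetical net() call

def backward_and_update():
    time.sleep(0.004)   # hypothetical backward() + minimize()

# One accumulator per phase, plus one for the whole batch (class shown above).
batch_cost_avg = TimeCostAverage()
batch_reader_avg = TimeCostAverage()
batch_net_avg = TimeCostAverage()
batch_backward_avg = TimeCostAverage()

print_step = 10  # assumed log frequency
batch_start = time.time()
for batch_id in range(30):
    read_batch()
    batch_reader_end = time.time()
    forward()
    batch_net_end = time.time()
    backward_and_update()
    batch_backward_end = time.time()

    # Each phase cost is the difference between consecutive timestamps.
    batch_reader_avg.record(batch_reader_end - batch_start)
    batch_net_avg.record(batch_net_end - batch_reader_end)
    batch_backward_avg.record(batch_backward_end - batch_net_end)
    batch_cost_avg.record(time.time() - batch_start)

    if batch_id % print_step == 0:
        print("batch: %.5f s, net: %.5f s, backward: %.5f s, reader: %.5f s"
              % (batch_cost_avg.get_average(), batch_net_avg.get_average(),
                 batch_backward_avg.get_average(), batch_reader_avg.get_average()))
        for avg in (batch_cost_avg, batch_reader_avg,
                    batch_net_avg, batch_backward_avg):
            avg.reset()

    batch_start = time.time()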
@@ -37,6 +37,19 @@ if sys.version[0] == '2':
     reload(sys)
     sys.setdefaultencoding("utf-8")
 
+class TimeCostAverage(object):
+    def __init__(self):
+        self.reset()
+    def reset(self):
+        self.cnt = 0
+        self.total_time = 0
+    def record(self, usetime):
+        self.cnt += 1
+        self.total_time += usetime
+    def get_average(self):
+        if self.cnt == 0:
+            return 0
+        return self.total_time / self.cnt
 
 class SimpleLSTMRNN(fluid.Layer):
     def __init__(self,
@@ -405,10 +418,17 @@ def train_ptb_lm():
         init_hidden = to_variable(init_hidden_data)
         init_cell = to_variable(init_cell_data)
+        batch_cost_avg = TimeCostAverage()
+        reader_cost_avg = TimeCostAverage()
         batch_start = time.time()
         for batch_id, batch in enumerate(train_data_loader):
             if args.max_iter and total_batch_num == args.max_iter:
                 return
+            train_reader_cost = time.time() - batch_start
+            reader_cost_avg.record(train_reader_cost)
             x, y = batch
             dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
@@ -426,13 +446,17 @@ def train_ptb_lm():
             total_batch_num = total_batch_num + 1  #this is for benchmark
             train_batch_cost = time.time() - batch_start
+            batch_cost_avg.record(train_batch_cost)
             if batch_id > 0 and batch_id % log_interval == 0:
                 ppl = np.exp(total_loss / iters)
                 print(
-                    "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch_cost: %.5f s"
+                    "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch_cost: %.5f s, reader_cost: %.5f s"
                     % (epoch_id, batch_id, ppl[0],
                        sgd._global_learning_rate().numpy(), out_loss,
-                       train_batch_cost))
+                       batch_cost_avg.get_average(), reader_cost_avg.get_average()))
+                batch_cost_avg.reset()
+                reader_cost_avg.reset()
             batch_start = time.time()
         ppl = np.exp(total_loss / iters)
......