diff --git a/PaddleCV/image_classification/train.py b/PaddleCV/image_classification/train.py index 150b44b6e747efae84b1efb5803be30c4f0528ce..bed4fe56b8dd05cc026d5afd68899640d6a50176 100755 --- a/PaddleCV/image_classification/train.py +++ b/PaddleCV/image_classification/train.py @@ -34,6 +34,24 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +class TimeCostAverage(object): + def __init__(self): + self.reset() + + def reset(self): + self.cnt = 0 + self.total_time = 0 + + def record(self, usetime): + self.cnt += 1 + self.total_time += usetime + + def get_average(self): + if self.cnt == 0: + return 0 + return self.total_time / self.cnt + + def build_program(is_train, main_prog, startup_prog, args): """build program, and add backward op in program accroding to different mode @@ -225,7 +243,11 @@ def train(args): compiled_train_prog = best_strategy_compiled(args, train_prog, train_fetch_vars[0], exe) + + batch_cost_avg = TimeCostAverage() + #NOTE: this for benchmark + total_batch_num = 0 for pass_id in range(args.num_epochs): if num_trainers > 1 and not args.use_dali: @@ -234,7 +256,6 @@ def train(args): train_batch_id = 0 train_batch_time_record = [] train_batch_metrics_record = [] - train_batch_time_print_step = [] if not args.use_dali: train_iter = train_data_loader() @@ -252,25 +273,18 @@ def train(args): t2 = time.time() train_batch_elapse = t2 - t1 train_batch_time_record.append(train_batch_elapse) + batch_cost_avg.record(train_batch_elapse) train_batch_metrics_avg = np.mean( np.array(train_batch_metrics), axis=1) train_batch_metrics_record.append(train_batch_metrics_avg) if trainer_id == 0: - if train_batch_id % args.print_step == 0: - if len(train_batch_time_print_step) == 0: - train_batch_time_print_step_avg = train_batch_elapse - else: - train_batch_time_print_step_avg = np.mean( - train_batch_time_print_step) - train_batch_time_print_step = [] - print_info("batch", train_batch_metrics_avg, - train_batch_time_print_step_avg, pass_id, - train_batch_id, args.print_step) - else: - train_batch_time_print_step.append(train_batch_elapse) - + print_info("batch", train_batch_metrics_avg, + batch_cost_avg.get_average(), pass_id, + train_batch_id, args.print_step) sys.stdout.flush() + if train_batch_id % args.print_step == 0: + batch_cost_avg.reset() train_batch_id += 1 t1 = time.time() #NOTE: this for benchmark profiler diff --git a/PaddleNLP/language_model/train.py b/PaddleNLP/language_model/train.py index 33f9651f6500601e461f14e665b81862f97b93c7..5ff5e59da4849ab6ca984a5cec0453cac9720824 100644 --- a/PaddleNLP/language_model/train.py +++ b/PaddleNLP/language_model/train.py @@ -49,6 +49,19 @@ import pickle SEED = 123 +class TimeCostAverage(object): + def __init__(self): + self.reset() + def reset(self): + self.cnt = 0 + self.total_time = 0 + def record(self, usetime): + self.cnt += 1 + self.total_time += usetime + def get_average(self): + if self.cnt == 0: + return 0 + return self.total_time / self.cnt @contextlib.contextmanager def profile_context(profile=True, profiler_path='/tmp/paddingrnn.profile'): @@ -293,8 +306,10 @@ def main(): total_loss = 0 iters = 0 + batch_cost_avg = TimeCostAverage() init_hidden, init_cell = generate_init_data() + batch_start_time = time.time() for batch_id, batch in enumerate(train_data_iter): input_data_feed = prepare_input( batch, @@ -303,7 +318,6 @@ def main(): epoch_id=epoch_id, with_lr=True, device_count=device_count) - batch_start_time = time.time() fetch_outs = exe.run(train_program, feed=input_data_feed, fetch_list=[ @@ -313,6 +327,7 @@ def main(): use_program_cache=True) batch_time = time.time() - batch_start_time batch_times.append(batch_time) + batch_cost_avg.record(batch_time) cost_train = np.array(fetch_outs[0]) lr = np.array(fetch_outs[1]) @@ -324,13 +339,17 @@ def main(): ppl = np.exp(total_loss / iters) print( "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f" - % (epoch_id, batch_id, batch_time, ppl[0], lr[0])) + % (epoch_id, batch_id, batch_cost_avg.get_average(), ppl[0], lr[0])) + batch_cost_avg.reset() # profiler tools for benchmark if args.profile and batch_id == log_interval: profiler.reset_profiler() elif args.profile and batch_id == (log_interval + 5): break + + batch_start_time = time.time() + ppl = np.exp(total_loss / iters) return ppl @@ -342,6 +361,7 @@ def main(): total_loss = 0 iters = 0 + batch_cost_avg = TimeCostAverage() dataloader.start() batch_id = 0 @@ -355,6 +375,7 @@ def main(): batch_time = time.time() - batch_start_time batch_times.append(batch_time) batch_start_time = time.time() + batch_cost_avg.record(batch_time) new_lr = generate_new_lr(epoch_id, device_count) data_feeds['learning_rate'] = new_lr @@ -381,7 +402,8 @@ def main(): ppl = np.exp(total_loss / iters) print( "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f" - % (epoch_id, batch_id, batch_time, ppl[0], lr[0])) + % (epoch_id, batch_id, batch_cost_avg.get_average(), ppl[0], lr[0])) + batch_cost_avg.reset() batch_id += 1 # profiler tools for benchmark diff --git a/dygraph/mobilenet/train.py b/dygraph/mobilenet/train.py index debd22abcccf96ebc91607133aff278c3b184088..f620842b01df1f838512b8f099ca7a28e526c865 100644 --- a/dygraph/mobilenet/train.py +++ b/dygraph/mobilenet/train.py @@ -38,6 +38,19 @@ args = parse_args() if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0: print_arguments(args) +class TimeCostAverage(object): + def __init__(self): + self.reset() + def reset(self): + self.cnt = 0 + self.total_time = 0 + def record(self, usetime): + self.cnt += 1 + self.total_time += usetime + def get_average(self): + if self.cnt == 0: + return 0 + return self.total_time / self.cnt def eval(net, test_data_loader, eop): total_loss = 0.0 @@ -170,6 +183,10 @@ def train_mobilenet(): t_last = 0 # 4.1 for each batch, call net() , backward(), and minimize() + batch_cost_avg = TimeCostAverage() + batch_reader_avg = TimeCostAverage() + batch_net_avg = TimeCostAverage() + batch_backward_avg = TimeCostAverage() batch_start = time.time() for img, label in train_data_loader(): if args.max_iter and total_batch_num == args.max_iter: @@ -208,16 +225,25 @@ def train_mobilenet(): # NOTE: used for benchmark train_batch_cost = time.time() - batch_start + batch_cost_avg.record(train_batch_cost) + batch_reader_avg.record(batch_reader_end - batch_start) + batch_net_avg.record(batch_net_end - batch_reader_end) + batch_backward_avg.record(batch_backward_end - batch_net_end) + total_batch_num = total_batch_num + 1 if batch_id % args.print_step == 0: print( "[Epoch %d, batch %d], avg_loss %.5f, acc_top1 %.5f, acc_top5 %.5f, batch_cost: %.5f s, net_t: %.5f s, backward_t: %.5f s, reader_t: %.5f s" % (eop, batch_id, avg_loss.numpy(), acc_top1.numpy(), - acc_top5.numpy(), train_batch_cost, - batch_net_end - batch_reader_end, - batch_backward_end - batch_net_end, - batch_reader_end - batch_start)) + acc_top5.numpy(), batch_cost_avg.get_average(), + batch_net_avg.get_average(), + batch_backward_avg.get_average(), + batch_reader_avg.get_average())) sys.stdout.flush() + batch_cost_avg.reset() + batch_net_avg.reset() + batch_backward_avg.reset() + batch_reader_avg.reset() batch_start = time.time() if args.ce: diff --git a/dygraph/ptb_lm/ptb_dy.py b/dygraph/ptb_lm/ptb_dy.py index f38a8c93209e9aa4e3bab8c403744bedeb465425..086901d7d5efb8a773551cdeaacb29829d966e80 100644 --- a/dygraph/ptb_lm/ptb_dy.py +++ b/dygraph/ptb_lm/ptb_dy.py @@ -37,6 +37,19 @@ if sys.version[0] == '2': reload(sys) sys.setdefaultencoding("utf-8") +class TimeCostAverage(object): + def __init__(self): + self.reset() + def reset(self): + self.cnt = 0 + self.total_time = 0 + def record(self, usetime): + self.cnt += 1 + self.total_time += usetime + def get_average(self): + if self.cnt == 0: + return 0 + return self.total_time / self.cnt class SimpleLSTMRNN(fluid.Layer): def __init__(self, @@ -405,10 +418,17 @@ def train_ptb_lm(): init_hidden = to_variable(init_hidden_data) init_cell = to_variable(init_cell_data) + batch_cost_avg = TimeCostAverage() + reader_cost_avg = TimeCostAverage() + batch_start = time.time() for batch_id, batch in enumerate(train_data_loader): if args.max_iter and total_batch_num == args.max_iter: return + + train_reader_cost = time.time() - batch_start + reader_cost_avg.record(train_reader_cost) + x, y = batch dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, @@ -426,13 +446,17 @@ def train_ptb_lm(): total_batch_num = total_batch_num + 1 #this is for benchmark train_batch_cost = time.time() - batch_start + batch_cost_avg.record(train_batch_cost) + if batch_id > 0 and batch_id % log_interval == 0: ppl = np.exp(total_loss / iters) print( - "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch_cost: %.5f s" + "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch_cost: %.5f s, reader_cost: %.5f s" % (epoch_id, batch_id, ppl[0], sgd._global_learning_rate().numpy(), out_loss, - train_batch_cost)) + batch_cost_avg.get_average(), reader_cost_avg.get_average())) + batch_cost_avg.reset() + reader_cost_avg.reset() batch_start = time.time() ppl = np.exp(total_loss / iters)