diff --git a/PaddleCV/image_classification/train.py b/PaddleCV/image_classification/train.py
index 150b44b6e747efae84b1efb5803be30c4f0528ce..bed4fe56b8dd05cc026d5afd68899640d6a50176 100755
--- a/PaddleCV/image_classification/train.py
+++ b/PaddleCV/image_classification/train.py
@@ -34,6 +34,24 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
+class TimeCostAverage(object):
+    """Running mean of the timings recorded since the last reset()."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.cnt, self.total_time = 0, 0
+
+    def record(self, usetime):
+        self.cnt += 1
+        self.total_time += usetime
+
+    def get_average(self):
+        # An empty window reports 0 instead of dividing by zero.
+        return self.total_time / self.cnt if self.cnt else 0
+
+
 def build_program(is_train, main_prog, startup_prog, args):
     """build program, and add backward op in program accroding to different mode
 
@@ -225,7 +243,11 @@ def train(args):
 
     compiled_train_prog = best_strategy_compiled(args, train_prog,
                                                  train_fetch_vars[0], exe)
+
+    batch_cost_avg = TimeCostAverage()
+
     #NOTE: this for benchmark
+
     total_batch_num = 0
     for pass_id in range(args.num_epochs):
         if num_trainers > 1 and not args.use_dali:
@@ -234,7 +256,6 @@ def train(args):
         train_batch_id = 0
         train_batch_time_record = []
         train_batch_metrics_record = []
-        train_batch_time_print_step = []
 
         if not args.use_dali:
             train_iter = train_data_loader()
@@ -252,25 +273,18 @@ def train(args):
             t2 = time.time()
             train_batch_elapse = t2 - t1
             train_batch_time_record.append(train_batch_elapse)
+            batch_cost_avg.record(train_batch_elapse)
 
             train_batch_metrics_avg = np.mean(
                 np.array(train_batch_metrics), axis=1)
             train_batch_metrics_record.append(train_batch_metrics_avg)
             if trainer_id == 0:
-                if train_batch_id % args.print_step == 0:
-                    if len(train_batch_time_print_step) == 0:
-                        train_batch_time_print_step_avg = train_batch_elapse
-                    else:
-                        train_batch_time_print_step_avg = np.mean(
-                            train_batch_time_print_step)
-                    train_batch_time_print_step = []
-                    print_info("batch", train_batch_metrics_avg,
-                               train_batch_time_print_step_avg, pass_id,
-                               train_batch_id, args.print_step)
-                else:
-                    train_batch_time_print_step.append(train_batch_elapse)
-
+                print_info("batch", train_batch_metrics_avg,
+                           batch_cost_avg.get_average(), pass_id,
+                           train_batch_id, args.print_step)
                 sys.stdout.flush()
+                if train_batch_id % args.print_step == 0:
+                    batch_cost_avg.reset()
             train_batch_id += 1
             t1 = time.time()
             #NOTE: this for benchmark profiler
diff --git a/PaddleNLP/language_model/train.py b/PaddleNLP/language_model/train.py
index 33f9651f6500601e461f14e665b81862f97b93c7..5ff5e59da4849ab6ca984a5cec0453cac9720824 100644
--- a/PaddleNLP/language_model/train.py
+++ b/PaddleNLP/language_model/train.py
@@ -49,6 +49,19 @@ import pickle
 
 SEED = 123
 
+class TimeCostAverage(object):
+    """Running mean of the timings recorded since the last reset()."""
+    def __init__(self):
+        self.reset()
+    def reset(self):
+        self.cnt, self.total_time = 0, 0
+    def record(self, usetime):
+        # usetime is one measured value; it joins the running sum.
+        self.cnt += 1
+        self.total_time += usetime
+    def get_average(self):
+        # An empty window reports 0 instead of dividing by zero.
+        return self.total_time / self.cnt if self.cnt else 0
 
 @contextlib.contextmanager
 def profile_context(profile=True, profiler_path='/tmp/paddingrnn.profile'):
@@ -293,8 +306,10 @@ def main():
 
         total_loss = 0
         iters = 0
+        batch_cost_avg = TimeCostAverage()
 
         init_hidden, init_cell = generate_init_data()
+        batch_start_time = time.time()
         for batch_id, batch in enumerate(train_data_iter):
             input_data_feed = prepare_input(
                 batch,
@@ -303,7 +318,6 @@ def main():
                 epoch_id=epoch_id,
                 with_lr=True,
                 device_count=device_count)
-            batch_start_time = time.time()
             fetch_outs = exe.run(train_program,
                                  feed=input_data_feed,
                                  fetch_list=[
@@ -313,6 +327,7 @@ def main():
                                  use_program_cache=True)
             batch_time = time.time() - batch_start_time
             batch_times.append(batch_time)
+            batch_cost_avg.record(batch_time)
 
             cost_train = np.array(fetch_outs[0])
             lr = np.array(fetch_outs[1])
@@ -324,13 +339,17 @@ def main():
                 ppl = np.exp(total_loss / iters)
                 print(
                     "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
-                    % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))
+                    % (epoch_id, batch_id, batch_cost_avg.get_average(), ppl[0], lr[0]))
+                batch_cost_avg.reset()
 
             # profiler tools for benchmark
             if args.profile and batch_id == log_interval:
                 profiler.reset_profiler()
             elif args.profile and batch_id == (log_interval + 5):
                 break
+
+            batch_start_time = time.time()
+
         ppl = np.exp(total_loss / iters)
         return ppl
 
@@ -342,6 +361,7 @@ def main():
 
         total_loss = 0
         iters = 0
+        batch_cost_avg = TimeCostAverage()
 
         dataloader.start()
         batch_id = 0
@@ -355,6 +375,7 @@ def main():
                     batch_time = time.time() - batch_start_time
                     batch_times.append(batch_time)
                     batch_start_time = time.time()
+                    batch_cost_avg.record(batch_time)
 
                 new_lr = generate_new_lr(epoch_id, device_count)
                 data_feeds['learning_rate'] = new_lr
@@ -381,7 +402,8 @@ def main():
                     ppl = np.exp(total_loss / iters)
                     print(
                         "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
-                        % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))
+                        % (epoch_id, batch_id, batch_cost_avg.get_average(), ppl[0], lr[0]))
+                    batch_cost_avg.reset()
 
                 batch_id += 1
                 # profiler tools for benchmark
diff --git a/dygraph/mobilenet/train.py b/dygraph/mobilenet/train.py
index debd22abcccf96ebc91607133aff278c3b184088..f620842b01df1f838512b8f099ca7a28e526c865 100644
--- a/dygraph/mobilenet/train.py
+++ b/dygraph/mobilenet/train.py
@@ -38,6 +38,19 @@ args = parse_args()
 if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
     print_arguments(args)
 
+class TimeCostAverage(object):
+    """Running mean of the timings recorded since the last reset()."""
+    def __init__(self):
+        self.reset()
+    def reset(self):
+        self.cnt, self.total_time = 0, 0
+    def record(self, usetime):
+        # usetime is one measured value; it joins the running sum.
+        self.cnt += 1
+        self.total_time += usetime
+    def get_average(self):
+        # An empty window reports 0 instead of dividing by zero.
+        return self.total_time / self.cnt if self.cnt else 0
 
 def eval(net, test_data_loader, eop):
     total_loss = 0.0
@@ -170,6 +183,10 @@ def train_mobilenet():
             t_last = 0
 
             # 4.1 for each batch, call net() , backward(), and minimize()
+            batch_cost_avg = TimeCostAverage()
+            batch_reader_avg = TimeCostAverage()
+            batch_net_avg = TimeCostAverage()
+            batch_backward_avg = TimeCostAverage()
             batch_start = time.time()
             for img, label in train_data_loader():
                 if args.max_iter and total_batch_num == args.max_iter:
@@ -208,16 +225,25 @@ def train_mobilenet():
 
                 # NOTE: used for benchmark
                 train_batch_cost = time.time() - batch_start
+                batch_cost_avg.record(train_batch_cost)
+                batch_reader_avg.record(batch_reader_end - batch_start)
+                batch_net_avg.record(batch_net_end - batch_reader_end)
+                batch_backward_avg.record(batch_backward_end - batch_net_end)
+
                 total_batch_num = total_batch_num + 1
                 if batch_id % args.print_step == 0:
                     print(
                         "[Epoch %d, batch %d], avg_loss %.5f, acc_top1 %.5f, acc_top5 %.5f, batch_cost: %.5f s, net_t: %.5f s, backward_t: %.5f s, reader_t: %.5f s"
                         % (eop, batch_id, avg_loss.numpy(), acc_top1.numpy(),
-                           acc_top5.numpy(), train_batch_cost,
-                           batch_net_end - batch_reader_end,
-                           batch_backward_end - batch_net_end,
-                           batch_reader_end - batch_start))
+                           acc_top5.numpy(), batch_cost_avg.get_average(),
+                           batch_net_avg.get_average(),
+                           batch_backward_avg.get_average(),
+                           batch_reader_avg.get_average()))
                     sys.stdout.flush()
+                    batch_cost_avg.reset()
+                    batch_net_avg.reset()
+                    batch_backward_avg.reset()
+                    batch_reader_avg.reset()
                 batch_start = time.time()
 
             if args.ce:
diff --git a/dygraph/ptb_lm/ptb_dy.py b/dygraph/ptb_lm/ptb_dy.py
index f38a8c93209e9aa4e3bab8c403744bedeb465425..086901d7d5efb8a773551cdeaacb29829d966e80 100644
--- a/dygraph/ptb_lm/ptb_dy.py
+++ b/dygraph/ptb_lm/ptb_dy.py
@@ -37,6 +37,19 @@ if sys.version[0] == '2':
     reload(sys)
     sys.setdefaultencoding("utf-8")
 
+class TimeCostAverage(object):
+    """Running mean of the timings recorded since the last reset()."""
+    def __init__(self):
+        self.reset()
+    def reset(self):
+        self.cnt, self.total_time = 0, 0
+    def record(self, usetime):
+        # usetime is one measured value; it joins the running sum.
+        self.cnt += 1
+        self.total_time += usetime
+    def get_average(self):
+        # An empty window reports 0 instead of dividing by zero.
+        return self.total_time / self.cnt if self.cnt else 0
 
 class SimpleLSTMRNN(fluid.Layer):
     def __init__(self,
@@ -405,10 +418,17 @@ def train_ptb_lm():
             init_hidden = to_variable(init_hidden_data)
             init_cell = to_variable(init_cell_data)
 
+            batch_cost_avg = TimeCostAverage()
+            reader_cost_avg = TimeCostAverage()
+
             batch_start = time.time()
             for batch_id, batch in enumerate(train_data_loader):
                 if args.max_iter and total_batch_num == args.max_iter:
                     return
+
+                train_reader_cost = time.time() - batch_start
+                reader_cost_avg.record(train_reader_cost)
+
                 x, y = batch
 
                 dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
@@ -426,13 +446,17 @@ def train_ptb_lm():
                 total_batch_num = total_batch_num + 1  #this is for benchmark
 
                 train_batch_cost = time.time() - batch_start
+                batch_cost_avg.record(train_batch_cost)
+
                 if batch_id > 0 and batch_id % log_interval == 0:
                     ppl = np.exp(total_loss / iters)
                     print(
-                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch_cost: %.5f s"
+                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch_cost: %.5f s, reader_cost: %.5f s"
                         % (epoch_id, batch_id, ppl[0],
                            sgd._global_learning_rate().numpy(), out_loss,
-                           train_batch_cost))
+                           batch_cost_avg.get_average(), reader_cost_avg.get_average()))
+                    batch_cost_avg.reset()
+                    reader_cost_avg.reset()
                 batch_start = time.time()
 
             ppl = np.exp(total_loss / iters)