refine resnet benchmark print (#4893)

* fix ptb_dy time print for benchmark, test=develop
* refine resnet benchmark print, test=develop

refine resnet benchmark print (#4893)
* fix ptb_dy time print for benchmark, test=develop
* refine resnet benchmark print, test=develop
afaf06e7 · wanghuancoder · GitHub · 4000dfb1 · afaf06e7 · afaf06e7
隐藏空白更改
内联并排

Showing with 32 additions and 14 deletions

PaddleCV/image_classification/train.py PaddleCV/image_classification/train.py +17 -5

PaddleCV/image_classification/utils/utility.py PaddleCV/image_classification/utils/utility.py +15 -9

未找到文件。
--- a/PaddleCV/image_classification/train.py
+++ b/PaddleCV/image_classification/train.py
@@ -120,6 +120,7 @@ def validate(args,
    test_batch_time_record = []
    test_batch_metrics_record = []
    test_batch_id = 0
+
    if int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) > 1:
        compiled_program = test_prog
    else:
@@ -245,6 +246,7 @@ def train(args):
                                                 train_fetch_vars[0], exe)

    batch_cost_avg = TimeCostAverage()
+    reader_cost_avg = TimeCostAverage()

    #NOTE: this for benchmark

@@ -267,11 +269,14 @@ def train(args):
            #NOTE: this is for benchmark
            if args.max_iter and total_batch_num == args.max_iter:
                return
+            t2 = time.time()
+            reader_cost = t2 - t1
+            reader_cost_avg.record(reader_cost)
            train_batch_metrics = exe.run(compiled_train_prog,
                                          feed=batch,
                                          fetch_list=train_fetch_list)
-            t2 = time.time()
-            train_batch_elapse = t2 - t1
+            t3 = time.time()
+            train_batch_elapse = t3 - t1
            train_batch_time_record.append(train_batch_elapse)
            batch_cost_avg.record(train_batch_elapse)

@@ -279,11 +284,18 @@ def train(args):
                np.array(train_batch_metrics), axis=1)
            train_batch_metrics_record.append(train_batch_metrics_avg)
            if trainer_id == 0:
-                print_info("batch", train_batch_metrics_avg,
-                           batch_cost_avg.get_average(), pass_id,
-                           train_batch_id, args.print_step)
+                print_info(
+                    "batch",
+                    train_batch_metrics_avg,
+                    batch_cost_avg.get_average(),
+                    pass_id,
+                    train_batch_id,
+                    args.print_step,
+                    reader_cost=reader_cost_avg.get_average(),
+                    ips=args.batch_size / batch_cost_avg.get_average())
                sys.stdout.flush()
                if train_batch_id % args.print_step == 0:
+                    reader_cost_avg.reset()
                    batch_cost_avg.reset()
            train_batch_id += 1
            t1 = time.time()

--- a/PaddleCV/image_classification/utils/utility.py
+++ b/PaddleCV/image_classification/utils/utility.py
@@ -420,7 +420,9 @@ def print_info(info_mode,
               batch_id=0,
               print_step=1,
               device_num=1,
-               class_dim=5):
+               class_dim=5,
+               reader_cost=0.0,
+               ips=0.0):
    """print function

    Args:
@@ -439,25 +441,28 @@ def print_info(info_mode,
            if len(metrics) == 2:
                loss, lr = metrics
                logger.info(
-                    "[Pass {0}, train batch {1}] \tloss {2}, lr {3}, elapse {4}".
+                    "[Pass {0}, train batch {1}] \tloss {2}, lr {3}, reader_cost: {5}, batch_cost: {4}, ips: {6}".
                    format(pass_id, batch_id, "%.5f" % loss, "%.5f" % lr,
-                           "%2.4f sec" % time_info))
+                           "%2.4f sec" % time_info, "%.5f sec" % reader_cost,
+                           "%.5f images/sec" % ips))
            # train and no mixup output
            elif len(metrics) == 4:
                loss, acc1, acc5, lr = metrics
                logger.info(
-                    "[Pass {0}, train batch {1}] \tloss {2}, acc1 {3}, acc{7} {4}, lr {5}, elapse {6}".
+                    "[Pass {0}, train batch {1}] \tloss {2}, acc1 {3}, acc{7} {4}, lr {5}, reader_cost: {8}, batch_cost: {6}, ips: {9}".
                    format(pass_id, batch_id, "%.5f" % loss, "%.5f" % acc1,
                           "%.5f" % acc5, "%.5f" % lr, "%2.4f sec" % time_info,
-                           min(class_dim, 5)))
+                           min(class_dim, 5), "%.5f sec" % reader_cost,
+                           "%.5f images/sec" % ips))
            # test output
            elif len(metrics) == 3:
                loss, acc1, acc5 = metrics
                logger.info(
-                    "[Pass {0}, test  batch {1}] \tloss {2}, acc1 {3}, acc{6} {4}, elapse {5}".
+                    "[Pass {0}, test  batch {1}] \tloss {2}, acc1 {3}, acc{6} {4}, reader_cost: {7}, batch_cost: {5}, ips: {8}".
                    format(pass_id, batch_id, "%.5f" % loss, "%.5f" % acc1,
                           "%.5f" % acc5, "%2.4f sec" % time_info,
-                           min(class_dim, 5)))
+                           min(class_dim, 5), "%.5f sec" % reader_cost,
+                           "%.5f images/sec" % ips))
            else:
                raise Exception(
                    "length of metrics {} is not implemented, It maybe caused by wrong format of build_program_output".
@@ -525,8 +530,9 @@ def best_strategy_compiled(args,
            fluid.require_version(min_version='1.7.0')
            build_strategy.fuse_bn_act_ops = args.fuse_bn_act_ops
        except Exception as e:
-            logger.info("PaddlePaddle version 1.7.0 or higher is "
-            "required when you want to fuse batch_norm and activation_op.")
+            logger.info(
+                "PaddlePaddle version 1.7.0 or higher is "
+                "required when you want to fuse batch_norm and activation_op.")
        build_strategy.fuse_elewise_add_act_ops = args.fuse_elewise_add_act_ops

        exec_strategy = fluid.ExecutionStrategy()