diff --git a/PaddleCV/image_classification/train.py b/PaddleCV/image_classification/train.py
index 9584b6fd7c71fab77e16f8e5deb870555d699c42..f2620ae2f22e655e936e2f6dfa3fa0f28162ea3e 100755
--- a/PaddleCV/image_classification/train.py
+++ b/PaddleCV/image_classification/train.py
@@ -120,6 +120,7 @@ def validate(args,
     test_batch_time_record = []
     test_batch_metrics_record = []
     test_batch_id = 0
+
     if int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) > 1:
         compiled_program = test_prog
     else:
@@ -245,6 +246,7 @@ def train(args):
                                                  train_fetch_vars[0], exe)
 
     batch_cost_avg = TimeCostAverage()
+    reader_cost_avg = TimeCostAverage()
 
     #NOTE: this for benchmark
 
@@ -267,11 +269,14 @@ def train(args):
             #NOTE: this is for benchmark
             if args.max_iter and total_batch_num == args.max_iter:
                 return
+            t2 = time.time()
+            reader_cost = t2 - t1
+            reader_cost_avg.record(reader_cost)
             train_batch_metrics = exe.run(compiled_train_prog,
                                           feed=batch,
                                           fetch_list=train_fetch_list)
-            t2 = time.time()
-            train_batch_elapse = t2 - t1
+            t3 = time.time()
+            train_batch_elapse = t3 - t1
             train_batch_time_record.append(train_batch_elapse)
             batch_cost_avg.record(train_batch_elapse)
 
@@ -279,11 +284,18 @@ def train(args):
                 np.array(train_batch_metrics), axis=1)
             train_batch_metrics_record.append(train_batch_metrics_avg)
             if trainer_id == 0:
-                print_info("batch", train_batch_metrics_avg,
-                           batch_cost_avg.get_average(), pass_id,
-                           train_batch_id, args.print_step)
+                print_info(
+                    "batch",
+                    train_batch_metrics_avg,
+                    batch_cost_avg.get_average(),
+                    pass_id,
+                    train_batch_id,
+                    args.print_step,
+                    reader_cost=reader_cost_avg.get_average(),
+                    ips=args.batch_size / batch_cost_avg.get_average())
                 sys.stdout.flush()
             if train_batch_id % args.print_step == 0:
+                reader_cost_avg.reset()
                 batch_cost_avg.reset()
             train_batch_id += 1
             t1 = time.time()
diff --git a/PaddleCV/image_classification/utils/utility.py b/PaddleCV/image_classification/utils/utility.py
index 45c85102acfc7ebbc45e291ebe633211d69b9f42..d69b5c9c2b122d5cc1538f414581fd638374ac86 100644
--- a/PaddleCV/image_classification/utils/utility.py
+++ b/PaddleCV/image_classification/utils/utility.py
@@ -420,7 +420,9 @@ def print_info(info_mode,
                batch_id=0,
                print_step=1,
                device_num=1,
-               class_dim=5):
+               class_dim=5,
+               reader_cost=0.0,
+               ips=0.0):
     """print function
 
     Args:
@@ -439,25 +441,28 @@ def print_info(info_mode,
             if len(metrics) == 2:
                 loss, lr = metrics
                 logger.info(
-                    "[Pass {0}, train batch {1}] \tloss {2}, lr {3}, elapse {4}".
+                    "[Pass {0}, train batch {1}] \tloss {2}, lr {3}, reader_cost: {5}, batch_cost: {4}, ips: {6}".
                     format(pass_id, batch_id, "%.5f" % loss, "%.5f" % lr,
-                           "%2.4f sec" % time_info))
+                           "%2.4f sec" % time_info, "%.5f sec" % reader_cost,
+                           "%.5f images/sec" % ips))
             # train and no mixup output
             elif len(metrics) == 4:
                 loss, acc1, acc5, lr = metrics
                 logger.info(
-                    "[Pass {0}, train batch {1}] \tloss {2}, acc1 {3}, acc{7} {4}, lr {5}, elapse {6}".
+                    "[Pass {0}, train batch {1}] \tloss {2}, acc1 {3}, acc{7} {4}, lr {5}, reader_cost: {8}, batch_cost: {6}, ips: {9}".
                     format(pass_id, batch_id, "%.5f" % loss, "%.5f" % acc1,
                            "%.5f" % acc5, "%.5f" % lr, "%2.4f sec" % time_info,
-                           min(class_dim, 5)))
+                           min(class_dim, 5), "%.5f sec" % reader_cost,
+                           "%.5f images/sec" % ips))
             # test output
             elif len(metrics) == 3:
                 loss, acc1, acc5 = metrics
                 logger.info(
-                    "[Pass {0}, test batch {1}] \tloss {2}, acc1 {3}, acc{6} {4}, elapse {5}".
+                    "[Pass {0}, test batch {1}] \tloss {2}, acc1 {3}, acc{6} {4}, reader_cost: {7}, batch_cost: {5}, ips: {8}".
                     format(pass_id, batch_id, "%.5f" % loss, "%.5f" % acc1,
                            "%.5f" % acc5, "%2.4f sec" % time_info,
-                           min(class_dim, 5)))
+                           min(class_dim, 5), "%.5f sec" % reader_cost,
+                           "%.5f images/sec" % ips))
             else:
                 raise Exception(
                     "length of metrics {} is not implemented, It maybe caused by wrong format of build_program_output".
@@ -525,8 +530,9 @@ def best_strategy_compiled(args,
             fluid.require_version(min_version='1.7.0')
             build_strategy.fuse_bn_act_ops = args.fuse_bn_act_ops
         except Exception as e:
-            logger.info("PaddlePaddle version 1.7.0 or higher is "
-                        "required when you want to fuse batch_norm and activation_op.")
+            logger.info(
+                "PaddlePaddle version 1.7.0 or higher is "
+                "required when you want to fuse batch_norm and activation_op.")
         build_strategy.fuse_elewise_add_act_ops = args.fuse_elewise_add_act_ops
 
         exec_strategy = fluid.ExecutionStrategy()
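
Note on the helper this patch leans on: TimeCostAverage lives in utils/utility.py and is only used, not shown, in the hunks above. The sketch below is a minimal assumed implementation that matches how the patch calls it (record(), get_average(), reset()) and spells out how the timestamps relate; it is illustrative only, not the repository's actual code, and the demo batch size of 256 is hypothetical.

class TimeCostAverage(object):
    """Running average of per-batch costs; reset at every print_step window."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.cnt = 0
        self.total_time = 0.0

    def record(self, usetime):
        self.cnt += 1
        self.total_time += usetime

    def get_average(self):
        # Avoid division by zero before the first record() call.
        return self.total_time / self.cnt if self.cnt else 0.0


# Timing windows in the train loop above (t1 is taken at the end of the
# previous iteration):
#   reader_cost = t2 - t1   # time spent waiting on the data reader
#   batch_cost  = t3 - t1   # reader wait plus one exe.run() step
#   ips         = batch_size / average batch_cost   # images per second

if __name__ == "__main__":
    avg = TimeCostAverage()
    for cost in (0.12, 0.10, 0.11):
        avg.record(cost)
    batch_size = 256  # hypothetical value for illustration
    print("batch_cost: %.5f sec, ips: %.5f images/sec" %
          (avg.get_average(), batch_size / avg.get_average()))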