From 1fbb0875455f27c18e5b6d51505151090e7713cd Mon Sep 17 00:00:00 2001 From: ruri Date: Mon, 10 Feb 2020 18:40:53 +0800 Subject: [PATCH] fix multi-cards multi-process bug (#4251) * fix multi-card multi-process bug --- PaddleCV/image_classification/train.py | 17 ++++++++++------- .../image_classification/utils/dist_utils.py | 4 ++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/PaddleCV/image_classification/train.py b/PaddleCV/image_classification/train.py index e61333fd..c4d5f26a 100755 --- a/PaddleCV/image_classification/train.py +++ b/PaddleCV/image_classification/train.py @@ -102,13 +102,16 @@ def validate(args, test_batch_time_record = [] test_batch_metrics_record = [] test_batch_id = 0 - compiled_program = best_strategy_compiled( - args, - test_prog, - test_fetch_list[0], - exe, - mode="val", - share_prog=train_prog) + if int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) > 1: + compiled_program = test_prog + else: + compiled_program = best_strategy_compiled( + args, + test_prog, + test_fetch_list[0], + exe, + mode="val", + share_prog=train_prog) for batch in test_iter: t1 = time.time() test_batch_metrics = exe.run(program=compiled_program, diff --git a/PaddleCV/image_classification/utils/dist_utils.py b/PaddleCV/image_classification/utils/dist_utils.py index c98a64dd..681c260e 100755 --- a/PaddleCV/image_classification/utils/dist_utils.py +++ b/PaddleCV/image_classification/utils/dist_utils.py @@ -85,8 +85,8 @@ def prepare_for_multi_process(exe, build_strategy, train_prog): trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0)) num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) if num_trainers < 2: return - logger.info("PADDLE_TRAINERS_NUM", num_trainers) - logger.info("PADDLE_TRAINER_ID", trainer_id) + logger.info("PADDLE_TRAINERS_NUM %s" % num_trainers) + logger.info("PADDLE_TRAINER_ID %s" % trainer_id) build_strategy.num_trainers = num_trainers build_strategy.trainer_id = trainer_id # NOTE(zcd): use multi processes to train the model, -- GitLab