From c2ec3397647146cdc8029e57905462b5c8ebd16d Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 25 Apr 2019 15:39:56 +0800 Subject: [PATCH] update --- .../dist_train/dist_train.py | 18 +++++++----------- .../dist_train/run_mp_mode.sh | 4 +++- PaddleCV/image_classification/reader.py | 9 +++++---- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/PaddleCV/image_classification/dist_train/dist_train.py b/PaddleCV/image_classification/dist_train/dist_train.py index dfa4237a..1d059f4b 100644 --- a/PaddleCV/image_classification/dist_train/dist_train.py +++ b/PaddleCV/image_classification/dist_train/dist_train.py @@ -66,7 +66,6 @@ def parse_args(): add_arg('split_var', bool, True, "Split params on pserver.") add_arg('async_mode', bool, False, "Async distributed training, only for pserver mode.") add_arg('reduce_strategy', str, "allreduce", "Choose from reduce or allreduce.") - add_arg('skip_unbalanced_data', bool, False, "Skip data not if data not balanced on nodes.") add_arg('enable_sequential_execution', bool, False, "Skip data not if data not balanced on nodes.") #for dgc add_arg('enable_dgc', bool, False, "Skip data not if data not balanced on nodes.") @@ -85,13 +84,11 @@ def get_device_num(): device_num = subprocess.check_output(['nvidia-smi', '-L']).decode().count('\n') return device_num -def prepare_reader(is_train, pyreader, args, pass_id=0): - # NOTE: allways set reader infinite when nccl2 mode to balance data - # between ranks - is_infinite = (args.update_method == "nccl2") +def prepare_reader(is_train, pyreader, args, pass_id=1): + # NOTE: always use infinite reader for dist training if is_train: reader = train(data_dir=args.data_dir, pass_id_as_seed=pass_id, - infinite=is_infinite) + infinite=True) else: reader = val(data_dir=args.data_dir) if is_train: @@ -335,9 +332,8 @@ def train_parallel(args): num_samples = 0 start_time = time.time() batch_id = 1 - # use pass_id+1 as per pass global shuffle for distributed training - prepare_reader(True, train_pyreader, args, pass_id + 1) - train_pyreader.start() + if pass_id == 0: + train_pyreader.start() while True: try: if batch_id % 30 == 0: @@ -355,11 +351,10 @@ def train_parallel(args): break num_samples += args.batch_size batch_id += 1 - if (args.skip_unbalanced_data or args.update_method == "nccl2") and batch_id >= steps_per_pass: + if batch_id >= steps_per_pass: break print_train_time(start_time, time.time(), num_samples) - train_pyreader.reset() if pass_id >= args.start_test_pass: if args.multi_batch_repeat > 1: copyback_repeat_bn_params(train_prog) @@ -375,6 +370,7 @@ def train_parallel(args): if not os.path.isdir(model_path): os.makedirs(model_path) fluid.io.save_persistables(startup_exe, model_path, main_program=train_prog) + train_pyreader.reset() startup_exe.close() print("total train time: ", time.time() - over_all_start) diff --git a/PaddleCV/image_classification/dist_train/run_mp_mode.sh b/PaddleCV/image_classification/dist_train/run_mp_mode.sh index bf04e078..4c268b6c 100755 --- a/PaddleCV/image_classification/dist_train/run_mp_mode.sh +++ b/PaddleCV/image_classification/dist_train/run_mp_mode.sh @@ -15,5 +15,7 @@ PADDLE_TRAINING_ROLE="TRAINER" \ PADDLE_CURRENT_ENDPOINT="127.0.0.1:716${i}" \ PADDLE_TRAINER_ID="${i}" \ FLAGS_selected_gpus="${i}" \ -python dist_train.py --model $MODEL --update_method nccl2 --batch_size 32 --fp16 1 --scale_loss 8 &> logs/tr$i.log & +python -u dist_train.py --model $MODEL --update_method nccl2 \ +--batch_size 32 \ +--fp16 0 --scale_loss 1 &> logs/tr$i.log & done diff --git a/PaddleCV/image_classification/reader.py b/PaddleCV/image_classification/reader.py index 11c078cb..373a3023 100644 --- a/PaddleCV/image_classification/reader.py +++ b/PaddleCV/image_classification/reader.py @@ -12,7 +12,7 @@ np.random.seed(0) DATA_DIM = 224 THREAD = 8 -BUF_SIZE = 102400 +BUF_SIZE = 1024 DATA_DIR = 'data/ILSVRC2012' @@ -131,7 +131,7 @@ def _reader_creator(file_list, color_jitter=False, rotate=False, data_dir=DATA_DIR, - pass_id_as_seed=0, + pass_id_as_seed=1, infinite=False): def reader(): with open(file_list) as flist: @@ -176,7 +176,7 @@ def _reader_creator(file_list, return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE) -def train(data_dir=DATA_DIR, pass_id_as_seed=0, infinite=False): +def train(data_dir=DATA_DIR, pass_id_as_seed=1, infinite=False): file_list = os.path.join(data_dir, 'train_list.txt') return _reader_creator( file_list, @@ -185,7 +185,8 @@ def train(data_dir=DATA_DIR, pass_id_as_seed=0, infinite=False): color_jitter=False, rotate=False, data_dir=data_dir, - pass_id_as_seed=pass_id_as_seed) + pass_id_as_seed=pass_id_as_seed, + infinite=infinite) def val(data_dir=DATA_DIR): -- GitLab