diff --git a/PaddleCV/image_classification/ResNeXt101_vd_32x4d_local.sh b/PaddleCV/image_classification/ResNeXt101_vd_32x4d_local.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ff2dcdda81f85859a772583f899629c128a2cec9
--- /dev/null
+++ b/PaddleCV/image_classification/ResNeXt101_vd_32x4d_local.sh
@@ -0,0 +1,23 @@
+#Training details
+#Missed
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export FLAGS_fast_eager_deletion_mode=1
+export FLAGS_eager_delete_tensor_gb=0.0
+export FLAGS_fraction_of_gpu_memory_to_use=0.98
+
+#ResNeXt101_vd_32x4d
+python -m paddle.distributed.launch \
+       --use_paddlecloud \
+       --selected_gpus="0,1,2,3,4,5,6,7" \
+       --log_dir=mylog \
+       train.py \
+       --model=ResNeXt101_vd_32x4d \
+       --batch_size=256 \
+       --lr_strategy=cosine_decay \
+       --lr=0.1 \
+       --num_epochs=200 \
+       --model_save_dir=output/ \
+       --l2_decay=1e-4 \
+       --use_mixup=True \
+       --use_label_smoothing=True \
+       --label_smoothing_epsilon=0.1
diff --git a/PaddleCV/image_classification/reader.py b/PaddleCV/image_classification/reader.py
index 239f9eae585b23f5b7c0f6b0d0aa912676f3d6e7..4789596fadbe5d1c8257ea5852baeb31d061d46f 100644
--- a/PaddleCV/image_classification/reader.py
+++ b/PaddleCV/image_classification/reader.py
@@ -275,8 +275,7 @@ class ImageNetReader:
             batch_size = 1
         else:
             if settings.use_gpu:
-                batch_size = settings.batch_size // paddle.fluid.core.get_cuda_device_count(
-                )
+                batch_size = settings.batch_size // num_trainers
             else:
                 batch_size = settings.batch_size // int(
                     os.environ.get('CPU_NUM', 1))
@@ -296,7 +295,6 @@ class ImageNetReader:
                             full_lines)
                     elif shuffle:
                         np.random.shuffle(full_lines)
-
                    batch_data = []
                    for line in full_lines:
                        img_path, label = line.split()
@@ -362,10 +360,10 @@ class ImageNetReader:
 
             if settings.use_mixup == True:
                 reader = create_mixup_reader(settings, reader)
+            num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
 
             reader = fluid.io.batch(
                 reader,
-                batch_size=int(settings.batch_size /
-                               paddle.fluid.core.get_cuda_device_count()),
+                batch_size = settings.batch_size // num_trainers,
                 drop_last=True)
             return reader
diff --git a/PaddleCV/image_classification/train.py b/PaddleCV/image_classification/train.py
index b11dab999e0f9cf7e61fcd81c869caac7b0e57ad..c45ff12ecdb618ecd9959321e2f04fb9a3c3fa92 100755
--- a/PaddleCV/image_classification/train.py
+++ b/PaddleCV/image_classification/train.py
@@ -29,6 +29,9 @@ from utils import *
 import models
 from build_model import create_model
 
+from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy # new line 1
+from paddle.fluid.incubate.fleet.base import role_maker # new line 2
+
 
 def build_program(is_train, main_prog, startup_prog, args):
     """build program, and add grad op in program accroding to different mode
@@ -62,12 +65,24 @@ def build_program(is_train, main_prog, startup_prog, args):
         # add backward op in program
         if is_train:
             optimizer = create_optimizer(args)
-            avg_cost = loss_out[0]
-            optimizer.minimize(avg_cost)
             #XXX: fetch learning rate now, better implement is required here.
             global_lr = optimizer._global_learning_rate()
             global_lr.persistable = True
             loss_out.append(global_lr)
+            avg_cost = loss_out[0]
+
+            #################################
+            # configure DistributedStrategy #
+            #################################
+            dist_strategy = DistributedStrategy()
+            dist_strategy.nccl_comm_num = 2
+            exec_strategy = fluid.ExecutionStrategy()
+            exec_strategy.num_threads = 3
+            exec_strategy.num_iteration_per_drop_scope = 30
+            dist_strategy.exec_strategy = exec_strategy
+            dist_strategy.fuse_all_reduce_ops = True
+            optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) # new line 5
+            optimizer.minimize(avg_cost)
         if args.use_ema:
             global_steps = fluid.layers.learning_rate_scheduler._decay_step_counter(
             )
@@ -120,6 +135,9 @@ def train(args):
     Args:
         args: all arguments.
     """
+    role = role_maker.PaddleCloudRoleMaker(is_collective=True) # new line 3
+    fleet.init(role) # new line 4
+
     startup_prog = fluid.Program()
     train_prog = fluid.Program()
     test_prog = fluid.Program()
@@ -176,8 +194,7 @@ def train(args):
     train_data_loader.set_sample_list_generator(train_reader, places)
     test_data_loader.set_sample_list_generator(test_reader, place)
 
-    compiled_train_prog = best_strategy_compiled(args, train_prog,
-                                                 train_fetch_vars[0], exe)
+    compiled_train_prog = fleet.main_program # change line 1
     #NOTE: this for benchmark
     total_batch_num = 0
     for pass_id in range(args.num_epochs):