Commit bcd2b431 authored by gavin1332

example for training with fleet

Parent c6f6a54a
#Training details
#Missed
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export FLAGS_fast_eager_deletion_mode=1
export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_fraction_of_gpu_memory_to_use=0.98
#ResNeXt101_vd_32x4d
python -m paddle.distributed.launch \
--use_paddlecloud \
--selected_gpus="0,1,2,3,4,5,6,7" \
--log_dir=mylog \
train.py \
--model=ResNeXt101_vd_32x4d \
--batch_size=256 \
--lr_strategy=cosine_decay \
--lr=0.1 \
--num_epochs=200 \
--model_save_dir=output/ \
--l2_decay=1e-4 \
--use_mixup=True \
--use_label_smoothing=True \
--label_smoothing_epsilon=0.1
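The launch module above starts one trainer process per visible GPU and hands each process its trainer metadata through environment variables. Below is a minimal sketch of how a training script can read that metadata to derive its per-trainer share of the global batch; it assumes paddle.distributed.launch exports PADDLE_TRAINERS_NUM and PADDLE_TRAINER_ID, and the variable names are illustrative only.
import os

global_batch_size = 256  # the --batch_size value passed to train.py above
num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0))

# each trainer process consumes an equal share of the global batch
per_trainer_batch_size = global_batch_size // num_trainers
print('trainer %d uses batch size %d' % (trainer_id, per_trainer_batch_size))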
......@@ -275,8 +275,7 @@ class ImageNetReader:
batch_size = 1
else:
if settings.use_gpu:
batch_size = settings.batch_size // paddle.fluid.core.get_cuda_device_count()
batch_size = settings.batch_size // num_trainers
else:
batch_size = settings.batch_size // int(
os.environ.get('CPU_NUM', 1))
......@@ -296,7 +295,6 @@ class ImageNetReader:
full_lines)
elif shuffle:
np.random.shuffle(full_lines)
batch_data = []
for line in full_lines:
img_path, label = line.split()
......@@ -362,10 +360,10 @@ class ImageNetReader:
if settings.use_mixup == True:
reader = create_mixup_reader(settings, reader)
num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
reader = fluid.io.batch(
reader,
batch_size=int(settings.batch_size / paddle.fluid.core.get_cuda_device_count()),
batch_size = settings.batch_size // num_trainers,
drop_last=True)
return reader
......
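The net effect of the reader changes is that each trainer batches its own shard of the data using settings.batch_size // num_trainers rather than dividing by the local GPU count. A self-contained sketch of that batching pattern follows; sample_reader and the global batch size of 256 are illustrative, not taken from the repository.
import os
import paddle.fluid as fluid

def sample_reader():
    # stand-in for the ImageNet sample generator
    for i in range(1000):
        yield [i]

num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
batched_reader = fluid.io.batch(
    sample_reader,
    batch_size=256 // num_trainers,  # per-trainer share of the global batch
    drop_last=True)

for batch in batched_reader():
    pass  # each batch holds only this trainer's samples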
......@@ -29,6 +29,9 @@ from utils import *
import models
from build_model import create_model
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy # new line 1
from paddle.fluid.incubate.fleet.base import role_maker # new line 2
def build_program(is_train, main_prog, startup_prog, args):
"""build program, and add grad op in program accroding to different mode
......@@ -62,12 +65,24 @@ def build_program(is_train, main_prog, startup_prog, args):
# add backward op in program
if is_train:
optimizer = create_optimizer(args)
avg_cost = loss_out[0]
optimizer.minimize(avg_cost)
#XXX: fetch learning rate now, better implement is required here.
global_lr = optimizer._global_learning_rate()
global_lr.persistable = True
loss_out.append(global_lr)
avg_cost = loss_out[0]
#################################
# configure DistributedStrategy #
#################################
dist_strategy = DistributedStrategy()
dist_strategy.nccl_comm_num = 2
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = 3
exec_strategy.num_iteration_per_drop_scope = 30
dist_strategy.exec_strategy = exec_strategy
dist_strategy.fuse_all_reduce_ops = True
optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) # new line 5
optimizer.minimize(avg_cost)
if args.use_ema:
global_steps = fluid.layers.learning_rate_scheduler._decay_step_counter()
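The DistributedStrategy block above wires the existing optimizer into fleet: the strategy object carries the NCCL and execution settings, and fleet.distributed_optimizer wraps the base optimizer before minimize is called. A condensed sketch of the same wiring on a toy loss follows; the network and the Momentum optimizer are illustrative, and fleet.init must already have been called, as train() does below.
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy

# toy network standing in for the repository's model
x = fluid.data(name='x', shape=[None, 8], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
avg_cost = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

dist_strategy = DistributedStrategy()
dist_strategy.nccl_comm_num = 2               # two NCCL communicators, as in the diff
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = 3
exec_strategy.num_iteration_per_drop_scope = 30
dist_strategy.exec_strategy = exec_strategy
dist_strategy.fuse_all_reduce_ops = True      # fuse gradient all-reduce calls

optimizer = fluid.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
optimizer.minimize(avg_cost)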
......@@ -120,6 +135,9 @@ def train(args):
Args:
args: all arguments.
"""
role = role_maker.PaddleCloudRoleMaker(is_collective=True) # new line 3
fleet.init(role) # new line 4
startup_prog = fluid.Program()
train_prog = fluid.Program()
test_prog = fluid.Program()
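fleet.init has to run before any program is built so that every launched process knows its place in the job; PaddleCloudRoleMaker(is_collective=True) reads the trainer environment prepared by paddle.distributed.launch. A small sketch of that step, assuming worker_index()/worker_num() are available on the incubate fleet object:
from paddle.fluid.incubate.fleet.collective import fleet
from paddle.fluid.incubate.fleet.base import role_maker

role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
# each launched process now knows its rank and the world size
print('worker %d of %d' % (fleet.worker_index(), fleet.worker_num()))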
......@@ -176,8 +194,7 @@ def train(args):
train_data_loader.set_sample_list_generator(train_reader, places)
test_data_loader.set_sample_list_generator(test_reader, place)
compiled_train_prog = best_strategy_compiled(args, train_prog, train_fetch_vars[0], exe)
compiled_train_prog = fleet.main_program # change line 1
#NOTE: this for benchmark
total_batch_num = 0
for pass_id in range(args.num_epochs):
......
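With fleet driving the run, train() no longer builds its own CompiledProgram: the program each worker executes is taken directly from fleet.main_program (change line 1 above), which already reflects the DistributedStrategy configured in build_program, so the ExecutionStrategy knobs travel through the strategy object instead of through best_strategy_compiled.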