MPI训练打印学习率出错
Created by: minhozhou
版本信息:Python 3、Paddle 1.6.1、CPU-MPI。
出错信息:CPU 单机训练可以正常输出当前学习率(没有使用 GPU);分布式训练使用 Fleet 接口,基于 PaddleCloud 的 MPI,此时打印学习率出错。
optimizer代码:
def optimization(cloud_train, base_lr, loss, train_steps, optimizer='adam'):
    """Build the learning-rate schedule and optimizer, then minimize ``loss``.

    The learning rate follows ``fluid.layers.polynomial_decay`` with
    ``power=1.0`` and ``cycle=False``, going from ``base_lr`` to
    ``0.0001 * base_lr`` over ``train_steps`` steps.

    Args:
        cloud_train: When equal to ``1``, the optimizer is wrapped with
            ``fleet.distributed_optimizer`` using a transpiler config with
            ``sync_mode=False`` and ``runtime_split_send_recv=True``.
        base_lr: Initial learning rate of the schedule.
        loss: Variable passed to ``minimize``.
        train_steps: Number of decay steps of the schedule.
        optimizer: Optimizer name, either ``'sgd'`` or ``'adam'``.

    Returns:
        The decayed learning-rate variable (marked persistable).

    Raises:
        ValueError: If ``optimizer`` is neither ``'sgd'`` nor ``'adam'``.
    """
    # Validate first so an unsupported name fails before any op is added
    # to the program, and report which name was rejected.
    if optimizer not in ('sgd', 'adam'):
        raise ValueError(
            "unsupported optimizer %r; expected 'sgd' or 'adam'" % (optimizer,))

    decayed_lr = fluid.layers.polynomial_decay(
        learning_rate=base_lr,
        decay_steps=train_steps,
        end_learning_rate=0.0001 * base_lr,
        power=1.0,
        cycle=False)
    # Marked persistable so the value can be fetched by name by the caller.
    decayed_lr.persistable = True

    if optimizer == 'sgd':
        opt = fluid.optimizer.SGD(
            decayed_lr,
            regularization=fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=0.0025))
    else:
        # Do not use Adam's (GPU) lazy mode.
        opt = fluid.optimizer.Adam(decayed_lr)

    if cloud_train == 1:
        config = DistributeTranspilerConfig()
        config.sync_mode = False
        config.runtime_split_send_recv = True
        opt = fleet.distributed_optimizer(opt, config)

    log.info('learning rate:%f' % (base_lr))
    opt.minimize(loss)
    return decayed_lr
MPI分布式train部分代码:
# Distributed (Fleet/MPI) training path.
# Build the LR schedule and optimizer; the base LR passed in is args.lr scaled by num_devices.
decayed_lr = optimization(args.cloud_train, args.lr * num_devices, loss, train_steps, args.optimizer)
# The Fleet worker is initialised before any program is executed.
fleet.init_worker()
# One CPU executor is used for both the startup program and training.
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fleet.startup_program)
# build_complied_prog (sic) is defined outside this snippet; it receives Fleet's main program.
compiled_prog = build_complied_prog(fleet.main_program, loss)
# The same executor is passed as both train_exe and exe.
train_loop(exe, exe, compiled_prog, decayed_lr, loss, reader, args, item_embed, predict_input_list)
单机train部分代码:
# Single-machine training path.
# Build the LR schedule and optimizer; the base LR passed in is args.lr scaled by num_devices.
decayed_lr = optimization(args.cloud_train, args.lr * num_devices, loss, train_steps, args.optimizer)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
# Run the default startup program before training.
exe.run(fluid.default_startup_program())
train_prog = fluid.default_main_program()
# get_parallel_exe is defined outside this snippet; its result is used as the training executor.
train_exe = get_parallel_exe(train_prog, loss)
# Here train_exe and exe are different objects, unlike the MPI path.
train_loop(train_exe, exe, train_prog, decayed_lr, loss, reader, args, item_embed, predict_input_list)
train_loop代码:
def train_loop(train_exe, exe, program, decayed_lr, loss, data_reader, args, src_embed, input_data_list):
    """Run the training loop, logging the learning rate on every step.

    Args:
        train_exe: Object whose ``run(program, fetch_list=..., feed=...)`` is
            called once per batch and must return a pair
            ``(learning_rate_value, loss_value)``.
        exe: Not used in this function.
        program: Program passed through to ``train_exe.run``.
        decayed_lr: Learning-rate variable; its ``name`` is fetched each step.
        loss: Loss variable fetched each step.
        data_reader: Callable returning an iterable of feed batches; it must
            also expose ``queue.size()``, which is reported in the log line.
        args: Must provide ``output_path`` and ``epoch``.
        src_embed: Not used in this function.
        input_data_list: Not used in this function.
    """
    model_save_dir = args.output_path
    if not os.path.exists(model_save_dir):
        # exist_ok closes the window in which another trainer process creates
        # the same directory between the existence check and makedirs.
        os.makedirs(model_save_dir, exist_ok=True)
    # Parsed from the environment but not otherwise used in this function.
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    step = 0
    for epoch in range(args.epoch):
        for data in data_reader():
            begin_time = time.time()
            # NOTE(review): the accompanying report says this learning-rate
            # fetch works on a single machine but fails under Fleet/MPI;
            # presumably the transpiled trainer program no longer holds the
            # LR-decay variable -- confirm against the transpiler behaviour.
            decayed_lr_val, loss_val = train_exe.run(program, fetch_list=[decayed_lr.name, loss], feed=data)
            log.info(decayed_lr_val)
            step += 1
            if step % 10 == 0:
                # "speed" is the wall-clock duration of this single step.
                log.info("epoch %s: step %s: loss %.5f speed: %.5f s/step reader qsize: %s" %
                         (epoch, step, np.mean(loss_val), time.time() - begin_time, data_reader.queue.size()))