Problems in employing multiple programs
Created by: leonleeldc
I am using multiple programs to develop a GAN model, but for some reason the discriminator program and the train program conflict: when I put them together, I always get NaN after a few iterations within a single epoch.
I guess this issue is related to the gradient computation. I found a few similar issues reported for dygraph, though I am not sure whether they are related:
https://github.com/PaddlePaddle/Paddle/issues/21965
https://github.com/PaddlePaddle/Paddle/issues/21886
https://github.com/PaddlePaddle/Paddle/issues/22377
Namely, when we do multiple backward() operations, the gradients become 0, which then affects the later updates or predictions.
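A minimal dygraph snippet of the pattern those issues describe would look like the following (illustrative only, not from my project; it assumes Paddle 1.7's fluid.dygraph.Linear):

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    fc = fluid.dygraph.Linear(4, 1)
    x = fluid.dygraph.to_variable(np.random.rand(2, 4).astype('float32'))

    # first forward/backward pass: gradients should be non-zero
    loss1 = fluid.layers.reduce_mean(fc(x))
    loss1.backward()
    print(fc.weight.gradient())

    # second forward/backward pass on the same parameters:
    # this is the point where the linked issues report zeroed gradients
    loss2 = fluid.layers.reduce_mean(fc(x))
    loss2.backward()
    print(fc.weight.gradient())

My own code below is static-graph (program_guard), so I am not sure the dygraph behaviour applies, but the symptom is similar.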
For clarity, I have copied part of my script below. The troublemakers are:
g_train_op = get_g_train_ops(g_train_prog, config, g_loss, md_loss)
d_train_op = get_d_train_ops(d_train_prog, config, d_loss)
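Both helpers follow roughly the same pattern; here is a simplified sketch of the discriminator one (the config key 'd_lr' and the Adam betas are placeholders, not my exact values):

def get_d_train_ops(d_train_prog, config, d_loss):
    # build the discriminator optimizer inside the program that exe.run() will later execute
    with fluid.program_guard(d_train_prog):
        d_optimizer = fluid.optimizer.Adam(learning_rate=config['d_lr'], beta1=0.5, beta2=0.9)
        d_optimizer.minimize(d_loss)
    return d_loss  # fetched as d_train_op during training

get_g_train_ops does the same on g_train_prog with g_loss and md_loss. The full training function follows.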
def real_train_dict_reduce(generator, generator_meta, mediator, discriminator, oracle_loader, config):
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 1
    exec_strategy.use_experimental_executor = True
    batch_size = config['batch_size']
    num_sentences = config['num_sentences']
    vocab_size = config['vocab_size']
    seq_len = config['seq_len']
    data_dir = config['data_dir']
    dataset = config['dataset']
    log_dir = config['log_dir']
    sample_dir = config['sample_dir']
    npre_epochs = config['npre_epochs']
    pretrain_temp = config['pretrain_temperature']
    # filenames
    oracle_file = os.path.join(sample_dir, 'oracle_{}.txt'.format(dataset))
    gen_file = os.path.join(sample_dir, 'generator.txt')
    gen_text_file = os.path.join(sample_dir, 'generator_text.txt')
    csv_file = os.path.join(log_dir, 'experiment-log-rmcgan.csv')
    data_file = os.path.join(data_dir, '{}.txt'.format(dataset))
    if dataset == 'image_coco_1000':
        test_file = os.path.join(data_dir, 'testdata/test_coco.txt')
    elif dataset == 'emnlp_news':
        test_file = os.path.join(data_dir, 'testdata/test_emnlp.txt')
    else:
        raise NotImplementedError('Unknown dataset!')
    print('data_file', data_file)
    print('test_file', test_file)
    # create necessary directories
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    startup_program = fluid.default_startup_program()
    program = fluid.default_main_program()
    d_train_prog = fluid.default_main_program()

    with fluid.program_guard(d_train_prog, startup_program):
        x_real = fluid.layers.data(name='x_real', shape=[batch_size, seq_len], append_batch_size=False,
                                   dtype='int32')
        # x_real = fluid.layers.unsqueeze(input=x_real, axes=[-1])
        x_real_onehot = fluid.layers.one_hot(
            fluid.layers.reshape(x_real, shape=[x_real.shape[0], x_real.shape[1], 1]),
            vocab_size)  # batch_size x seq_len x vocab_size
        # get discriminator output
        d_out_real = discriminator(x_onehot=x_real_onehot, batch_size=batch_size, seq_len=config['seq_len'],
                                   vocab_size=vocab_size, dis_emb_dim=config['dis_emb_dim'],
                                   num_rep=config['num_rep'], sn=config['sn'])
    with fluid.program_guard(program, startup_program):
        # with fluid.unique_name.guard():
        # train with different datasets
        x_real = fluid.layers.data(name='x_real', shape=[batch_size, seq_len], append_batch_size=False,
                                   dtype='int32')  # tokens of oracle sequences
        temperature = fluid.layers.create_global_var(shape=[1], value=1., dtype='float32',
                                                     persistable=True, force_cpu=True, name='temperature')
        static_temperature_pre = fluid.layers.create_global_var(shape=[1], value=pretrain_temp, dtype='float32',
                                                                persistable=True, force_cpu=True,
                                                                name='static_temperature')
        x_fake_onehot_appr, x_fake, g_pretrain_loss, gen_o, _, x_fake_nongumbel, w_dict_g = generator(
            'generator', program, x_real=x_real, temperature=temperature,
            temperature_pretrain=static_temperature_pre, w_dict=None,
            vocab_size=vocab_size, batch_size=batch_size, seq_len=config['seq_len'],
            gen_emb_dim=config['gen_emb_dim'], mem_slots=config['mem_slots'],
            head_size=config['head_size'], num_heads=config['num_heads'],
            hidden_dim=config['hidden_dim'], start_token=config['start_token'])

        # (non-meta) mediator divergence minimization loss -1 * how to do it
        # md_pre_g_predictions_fake is produced by the mediator (that part of the script is omitted here)
        md_loss = fluid.layers.reduce_sum(
            x_fake_nongumbel * (fluid.layers.log(md_pre_g_predictions_fake + 1e-10)
                                - fluid.layers.log(x_fake_nongumbel + 1e-10))
        ) / batch_size / config['seq_len'] * config['md_coeff']

        # GAN / Divergence type
        print('the shape of x_real is in real_train_dict=', x_real.shape)
        d_out_fake = discriminator(x_onehot=x_fake_onehot_appr, batch_size=batch_size, seq_len=config['seq_len'],
                                   vocab_size=vocab_size, dis_emb_dim=config['dis_emb_dim'],
                                   num_rep=config['num_rep'], sn=config['sn'])
        assert list(x_real_onehot.shape) == [batch_size, config['seq_len'], vocab_size]
        log_pg, g_loss, d_loss = get_losses(d_out_real, d_out_fake, x_real_onehot, x_fake_onehot_appr,
                                            gen_o, discriminator, config)
        # compute meta-update
        grad_clip = config['grad_clip']
        grad_out = paddle.fluid.gradients(g_loss, list(w_dict_g.values()))
        print(grad_out)
        grads = [fluid.layers.clip_by_norm(grad, grad_clip) if grad is not None else grad for grad in grad_out]
        gvs = dict(zip(w_dict_g.keys(), grads))

        # gradient descent step
        onward_w_dict = OrderedDict()
        for i, key in enumerate(w_dict_g.keys()):
            onward_w_dict[key] = w_dict_g[key]
            grad = gvs[key]
            if grad is not None:
                onward_w_dict[key] = w_dict_g[key] - config['gadv_lr'] * grad

        # inference with new parameters
        _, x_fake_meta, _, _, meta_pre_g_predictions_real, gen_x_nongumbel_meta, _ = generator_meta(
            'generator', program=program, x_real=x_real, vocab_size=vocab_size, batch_size=batch_size,
            seq_len=config['seq_len'], gen_emb_dim=config['gen_emb_dim'], mem_slots=config['mem_slots'],
            head_size=config['head_size'], num_heads=config['num_heads'], hidden_dim=config['hidden_dim'],
            start_token=config['start_token'], temperature=temperature,
            temperature_pretrain=static_temperature_pre, w_dict=onward_w_dict)
        # ToDo
        from_vars = get_from_var(program)
        to_vars = get_to_vars(program)
        assign_ops = []
        print("########## update target graph ##########")
        for from_var, to_var in zip(from_vars, to_vars):
            assign_ops.append(fluid.layers.assign(from_var, to_var))

    index_word_dict = get_oracle_file(data_file, oracle_file, config['seq_len'])
    oracle_loader.create_batches(oracle_file)
    print("########## update target graph ##########")
    # Train ops
    pretrain_prog = program.clone()
    g_pretrain_op = get_pretrain_ops(pretrain_prog, config, g_pretrain_loss)
    g_train_prog = program.clone()
    g_train_op = get_g_train_ops(g_train_prog, config, g_loss, md_loss)
    d_train_op = get_d_train_ops(d_train_prog, config, d_loss)
    log = open(csv_file, 'w')
    use_gpu = True
    if use_gpu:
        exe = fluid.Executor(fluid.CUDAPlace(0))
    else:
        exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup_program)

    metrics = get_metrics(program, config, oracle_loader, test_file, gen_text_file, g_pretrain_loss, x_real, exe)
    sum_writer = get_summary_writer(os.path.join(log_dir, 'summary'))
    for epoch in range(npre_epochs):
        # pre-training
        print("epoch: ", epoch)
        g_pretrain_loss_np = pre_train_epoch(exe, pretrain_prog, g_pretrain_loss, x_real, oracle_loader)

        ntest_pre = 1  # 10
        ntest_pre_bleu = ntest_pre * 2
        if np.mod(epoch, ntest_pre) == 0:
            generate_samples(exe, g_train_prog, x_fake, batch_size, num_sentences, x_real, oracle_loader, gen_file)
            get_real_test_file(gen_file, gen_text_file, index_word_dict)
            msg = 'pre_gen_epoch:' + str(epoch) + ', g_pre_loss: %.4f' % g_pretrain_loss_np \
                  + ', num_batch: %d' % oracle_loader.num_batch
            metric_names = [metric.get_name() for metric in metrics]
            if np.mod(epoch, ntest_pre_bleu) == 0 and epoch > 0:
                # `scores` for the metrics is computed elsewhere in my script (omitted from this excerpt)
                for (name, score) in zip(metric_names, scores):
                    msg += ', ' + name + ': %.4f' % score
            print(msg)