"Input(Out@GRAD) shouldn't be null" 错误怎么排查 — a follow-up question to issue #16528
Created by: leonleeldc
I noticed that the issue Input(Out@GRAD) shouldn't be null错误怎么排查 #16528 (closed) is similar. But it seems somewhat different as well. 具体说来,我这边定义了一个generator: def generator(parent, program, x_real, temperature, temperature_pretrain, w_dict, vocab_size, batch_size, seq_len, gen_emb_dim, mem_slots, head_size, num_heads, hidden_dim, start_token): IS_SPARSE = True #parent = 'generator' start_tokens = pd.fluid.layers.fill_constant(shape=[batch_size, 1], value=start_token, dtype='int32') output_size = mem_slots * head_size * num_heads
gen_mem = RelationalMemory(parent, program=program, mem_slots=mem_slots, head_size=head_size, num_heads=num_heads)
# build relation memory module
if w_dict is None:
w_dict = OrderedDict()
create_variable = True
else:
create_variable = False
if create_variable:
g_variables = pd.fluid.layers.create_parameter(shape=[vocab_size, 1], dtype='float32', name='g_variables',
default_initializer=create_linear_initializer(vocab_size))
w_dict[parent+'/g_embeddings'] = pd.fluid.layers.nn.embedding(g_variables, size=[vocab_size,gen_emb_dim],param_attr=parent+'/g_embeddings')
w_dict[parent+'/g_output_weight'] = pd.fluid.layers.create_parameter(shape=[output_size, vocab_size],dtype = 'float32', name=parent+'/g_output_weight',
default_initializer=create_linear_initializer(output_size))
w_dict[parent+'/g_output_bias'] = pd.fluid.layers.create_parameter(shape=[vocab_size], dtype='float32', name=parent+'/g_output_bias',
default_initializer=create_bias_initializer())
# initial states
init_states = gen_mem.initial_state(batch_size)
# ---------- generate tokens and approximated one-hot results (Adversarial) ---------
gen_o, gen_x, gen_x_onehot_adv, gen_x_onehot_adv_nongumbel = [], [], [], []
x_t = pd.fluid.layers.nn.embedding(start_tokens, size=[vocab_size,gen_emb_dim], param_attr=parent+'/g_embeddings')
h_tm1 = init_states
for i in range(seq_len):
mem_o_t, h_t, w_dict = gen_mem(x_t, h_tm1, w_dict)
o_t = pd.fluid.layers.matmul(mem_o_t, w_dict[parent+'/g_output_weight']) + w_dict[parent+'/g_output_bias']
gumbel_t = add_gumbel(o_t)
next_token = pd.fluid.layers.argmax(gumbel_t, axis=1)
next_token.stop_gradient = True
next_token_onehot = pd.fluid.layers.one_hot(fluid.layers.unsqueeze(next_token,[1]), vocab_size)
x_onehot_appr = pd.fluid.layers.nn.softmax(pd.fluid.layers.elementwise_mul(gumbel_t, temperature))
x_tp1 = pd.fluid.layers.nn.embedding(fluid.layers.unsqueeze(next_token, [1]), size=[vocab_size, gen_emb_dim],
param_attr=parent+'/g_embeddings')
gen_o.append(pd.fluid.layers.reduce_sum(pd.fluid.layers.elementwise_mul(next_token_onehot, x_onehot_appr), 1))
gen_x.append(next_token)
gen_x_onehot_adv.append(x_onehot_appr)
gen_x_onehot_adv_nongumbel.append(pd.fluid.layers.nn.softmax(o_t))
x_t = x_tp1
h_tm1 = h_t
gen_o = pd.fluid.layers.transpose(pd.fluid.layers.stack(gen_o, 0), perm=[1, 0]) # batch_size x seq_len
gen_x = pd.fluid.layers.transpose(pd.fluid.layers.stack(gen_x, 0), perm=[1, 0]) # batch_size x seq_len
gen_x_onehot_adv = pd.fluid.layers.transpose(pd.fluid.layers.stack(gen_x_onehot_adv,0), perm=[1, 0, 2]) # batch_size x seq_len x vocab_size
gen_x_onehot_adv_nongumbel = pd.fluid.layers.transpose(pd.fluid.layers.stack(gen_x_onehot_adv_nongumbel, 0), perm=[1, 0, 2])
# ----------- pre-training for generator -----------------
x_emb = pd.fluid.layers.transpose(fluid.layers.squeeze(pd.fluid.layers.nn.embedding(
fluid.layers.unsqueeze(x_real, [2]), size=[vocab_size, gen_emb_dim],
param_attr=parent+'/g_embeddings'),[]), perm=[1, 0, 2]) # seq_len x batch_size x emb_dim
x_t_pre = pd.fluid.layers.nn.embedding(fluid.layers.unsqueeze(start_tokens,[1]), size=[vocab_size, gen_emb_dim], param_attr=parent+'/g_embeddings')
h_tm1_pre = init_states
g_predictions, g_predictions_nongumbel = [], []
for i in range(seq_len):
mem_o_t, h_t, w_dict = gen_mem(x_t_pre, h_tm1_pre, w_dict)
# o_t = g_output_unit(mem_o_t)
o_t = pd.fluid.layers.matmul(mem_o_t, w_dict[parent+'/g_output_weight']) + w_dict[ parent+'/g_output_bias']
g_predictions.append(pd.fluid.layers.nn.softmax(o_t * temperature_pretrain))
x_t_pre = x_emb[i]
h_tm1_pre = h_t
g_predictions = pd.fluid.layers.transpose(pd.fluid.layers.stack(g_predictions, 0), perm=[1, 0, 2])
# pre-training loss how to do -1 *
x_real = pd.fluid.layers.reshape(x_real, [-1])
print('what does x_real look like?')
x_real = fluid.layers.unsqueeze(x_real, [1])
x_real = pd.fluid.layers.one_hot(x_real, vocab_size)
pretrain_loss = 0-pd.fluid.layers.reduce_sum(
x_real * pd.fluid.layers.log(
pd.fluid.layers.clip(pd.fluid.layers.reshape(g_predictions, [-1, vocab_size]), 1e-20, 1.0)
)
) / (seq_len * batch_size)
return gen_x_onehot_adv, gen_x, pretrain_loss, gen_o, g_predictions, gen_x_onehot_adv_nongumbel, w_dict
针对pretrain_loss,我接着定义了如下优化方法:

# generator pre-training
grad_clip = 5.0  # keep the same with the previous setting
pretrain_opt = fluid.optimizer.AdamOptimizer(gpre_lr, beta1=0.9, beta2=0.999)
fluid.clip.set_gradient_clip(
    clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=grad_clip))
pretrain_opt.minimize(g_pretrain_loss)

然后minimize(g_pretrain_loss)就报错如下:
  File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/backward.py", line 706, in append_backward
    append_backward_vars(root_block, fwd_op_num, grad_to_var, grad_info_map)
  File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/backward.py", line 518, in append_backward_vars
    op_desc.infer_shape(block.desc)
paddle.fluid.core_avx.EnforceNotMet: Invoke operator unsqueeze2_grad error.
Python Call stacks:
  File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/framework.py", line 1814, in append_op
    attrs=kwargs.get("attrs", None))
  File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/layer_helper.py", line 43, in append_op
    return self.main_program.current_block().append_op(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/layers/nn.py", line 7273, in unsqueeze
    "XShape": x_shape})
  File "/media/data2/dingcheng/workspace/baidu/ccl/MetaCotRelGAN/models/rmc_vanilla_dict.py", line 106, in generator
    x_real = fluid.layers.unsqueeze(x_real, [1])
  File "/media/data2/dingcheng/workspace/baidu/ccl/MetaCotRelGAN/real/real_gan/real_train_dict.py", line 92, in real_train_dict
    start_token=config['start_token'])
  File "run_dict.py", line 144, in main
    real_train_dict(main_program, generator, generator_meta, mediator, discriminator, oracle_loader, config)
  File "run_dict.py", line 151, in <module>
    main()
C++ Call stacks:
Input(Out@GRAD) shouldn't be null. at [/paddle/paddle/fluid/operators/unsqueeze_op.cc:246]
根据上次#16528中所述,他们发现: 训练代码中用到了softmax_with_cross_entropy和sigmoid_cross_entropy_with_logits,这两个op针对label不计算梯度,这点与tf对应接口实现不同。 但是我这里感觉不是同样的原因。能帮忙看看吗?