rnn block 没有梯度的问题
Created by: dongfangyixi
rnn block中,如果rnn out 不是block中最后的op结果的时候,会报没有梯度的错误。 测试代码如下 paddle版本为dev paddlepaddle_gpu-latest-cp36-cp36m-linux_x86_64.whl
import paddle.fluid as fluid
def to_lodtensor(data, place, return_num=False, dtype="int64"):
    """Pack a batch of variable-length sequences into a fluid LoDTensor.

    Args:
        data: iterable of sequences; each sequence holds either scalars or
            np.ndarray feature vectors (all of the same width).
        place: fluid place (e.g. CPUPlace) on which the tensor is created.
        return_num: when True, also return the total number of steps.
        dtype: numpy dtype name the flattened data is cast to.

    Returns:
        A fluid.LoDTensor, plus the total step count when return_num is True.
    """
    lengths = [len(seq) for seq in data]
    # Level-1 LoD offsets are the running prefix sums of the sequence
    # lengths, starting from 0.
    offsets = [0] + np.cumsum(lengths).tolist()
    # Feature width: ndarray entries keep their own width, scalars get 1.
    width = len(data[0][0]) if isinstance(data[0][0], np.ndarray) else 1
    flat = np.concatenate(data, axis=0).astype(dtype)
    flat = flat.reshape([len(flat), width])
    tensor = fluid.LoDTensor()
    tensor.set(flat, place)
    tensor.set_lod([offsets])
    if return_num:
        return tensor, offsets[-1]
    return tensor
# Hyper-parameters for the minimal reproduction model.
data_dim = 10
batch_size = 3
main_program = fluid.Program()
start_program = fluid.Program()
with fluid.program_guard(main_program, start_program):
    with fluid.unique_name.guard():
        # Level-1 sequence input and a per-step int64 class label.
        rnn_input = fluid.layers.data(name='rnn_input', shape = [data_dim], dtype='float32')
        # Initial memory value; need_reorder=True below because it is a
        # batch-ordered tensor, not a step input.
        rnn_mem = fluid.layers.zeros(shape=(batch_size, data_dim), dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        rnn = fluid.layers.DynamicRNN()
        with rnn.block():
            x = rnn.step_input(rnn_input)
            pre_state = rnn.memory(init=rnn_mem, need_reorder=True)
            rnn_out = fluid.layers.fc([x, pre_state], size=data_dim)
            # NOTE(review): `state` is computed FROM `rnn_out`, so the tensor
            # handed to rnn.output() is not produced by the last op in the
            # block -- this ordering is exactly what triggers the reported
            # "not initialized" gradient error at backward time.
            state = pre_state * rnn_out
            rnn.update_memory(pre_state, state)
            rnn.output(rnn_out)
        rnn_out = rnn()
        # NOTE(review): cross_entropy expects probabilities but fc emits raw
        # logits; presumably acceptable for this gradient-bug repro -- confirm.
        cost = fluid.layers.cross_entropy(input=rnn_out, label=label)
        avg_cost = fluid.layers.reduce_mean(cost)
        optimizer = fluid.optimizer.SGDOptimizer(learning_rate=1e-5)
        optimizer.minimize(avg_cost)
print("model build successful!")
# NOTE(review): numpy is imported only here, after to_lodtensor was defined;
# that works because the import executes before to_lodtensor is first called.
import numpy as np
place = fluid.CPUPlace()
# 3 sequences x 5 steps x dim 10 -> 15 total steps in the flattened batch.
inp = to_lodtensor(np.random.random(size=(batch_size, 5, data_dim)), place, dtype='float32')
# 15 labels = 3 sequences * 5 steps. random() yields floats in [0, 1), so
# casting to int64 makes every label 0 -- fine for a crash repro.
label = to_lodtensor(np.random.random(size=(15, 1)), place, dtype='int64')
from paddle.fluid.executor import Executor
exe = Executor(place=place)
# Run the startup program once to initialize parameters, then one
# forward/backward pass of the main program (this is where the error fires).
exe.run(start_program)
avg_cost_v = exe.run(main_program,
                     feed={'rnn_input': inp,
                           'label': label},
                     fetch_list=[avg_cost])
print("avg cost: ", avg_cost_v)
模型可以build,可以打印出"model build successful!",但是在exe run的时候运行报错:
InvalidArgumentError: The Tensor in the elementwise_add_grad Op's Input Variable Out@GRAD(tmp_0@GRAD) is not initialized.
[Hint: Expected t->IsInitialized() == true, but received t->IsInitialized():0 != true:1.] at (/paddle/paddle/fluid/framework/operator.cc:1218)
[operator < elementwise_add_grad > error]
可以只关注rnn block中的逻辑:
with rnn.block():
x = rnn.step_input(rnn_input)
pre_state = rnn.memory(init=rnn_mem, need_reorder=True)
rnn_out = fluid.layers.fc([x, pre_state], size=data_dim)
state = pre_state * rnn_out
rnn.update_memory(pre_state, state)
rnn.output(rnn_out)
第五行 state = pre_state * rnn_out ,每个step中更新的state需要 rnn_out 来计算,这样rnn.output添加的tensor就不是block中最后一个op计算出来的结果,于是会报上述没有梯度的错误。如果改成:
with rnn.block():
x = rnn.step_input(rnn_input)
pre_state = rnn.memory(init=rnn_mem, need_reorder=True)
rnn_out = fluid.layers.fc([x, pre_state], size=data_dim)
state = pre_state
rnn.update_memory(pre_state, state)
rnn.output(rnn_out)
第五行不乘rnn_out,运行没有问题。但是我们的模型逻辑是新的state 需要rnn out 来更新。
同样的逻辑在pytorch和tf中都是正确可运行的。请帮忙看一下这个问题,谢谢!