Gradients of the recurrent op in StaticRNN are not propagated correctly
Created by: zhaoyuchen2018
When testing StaticRNN with the test case below, we found that the gradient of w1 is not propagated correctly.
```python
import sys
import io
import numpy as np

import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
from paddle.fluid.executor import Executor
import paddle.fluid.layers as layers
from paddle.fluid.layers.control_flow import StaticRNN

batch_size = 1
seq_len = 1
hidden_size = 10

x = layers.data(name='x', shape=[-1, batch_size, hidden_size], dtype='float32')
x += 0.0
emb = layers.data(name='emb', shape=[-1, batch_size, hidden_size], dtype='float32')
w1 = layers.create_parameter(shape=[hidden_size, hidden_size], dtype='float32', name='w1')
w2 = layers.create_parameter(shape=[hidden_size, hidden_size], dtype='float32', name='w2')

pre_h = layers.fill_constant(shape=[seq_len, hidden_size], dtype='float32', value=0.0)
pre_h.stop_gradient = False


def dot_attention(query, memory):
    attn = layers.matmul(query, memory, transpose_y=True)
    weight = layers.softmax(attn)
    weight_memory = layers.matmul(weight, memory)
    return weight_memory, weight


rnn = StaticRNN()
emb += 0.0
# y depends on w1 and is consumed inside the RNN step, so w1 should receive a gradient.
y = layers.matmul(x, w1)

with rnn.step():
    step_in = rnn.step_input(x)
    pre_h = rnn.memory(init=pre_h)
    new_h = layers.matmul(pre_h, w2)
    new_h, _ = dot_attention(new_h, y)
    rnn.update_memory(pre_h, new_h)
    rnn.step_output(new_h)

rnn_out = rnn()
loss = layers.reduce_sum(rnn_out)
loss.persistable = True

sgd = fluid.optimizer.SGD(1.0)
sgd.minimize(loss)

fluid.default_startup_program().random_seed = 123
place = core.CPUPlace()
exe = Executor(place)
exe.run(framework.default_startup_program())

param_list = fluid.default_main_program().block(0).all_parameters()

np.random.seed(10)
w1_np = np.random.uniform(-0.1, 0.1, (hidden_size, hidden_size)).astype('float32')
w2_np = np.random.uniform(-0.1, 0.1, (hidden_size, hidden_size)).astype('float32')
fluid.global_scope().find_var('w1').get_tensor().set(w1_np, place)
fluid.global_scope().find_var('w2').get_tensor().set(w2_np, place)

input = np.random.uniform(-0.1, 0.1, (seq_len, batch_size, hidden_size)).astype('float32')
emb = np.random.uniform(-0.1, 0.1, (seq_len, batch_size, hidden_size)).astype('float32')

out = exe.run(feed={'x': input, 'emb': emb}, fetch_list=[loss.name])

new_w1 = np.asarray(fluid.global_scope().find_var('w1').get_tensor())
new_w2 = np.asarray(fluid.global_scope().find_var('w2').get_tensor())
# With SGD(lr=1.0), one training step should change w1 unless its gradient is zero.
print(new_w1 - w1_np)
```
The printed `new_w1 - w1_np` is all zeros. Since one SGD step with learning rate 1.0 was executed, an unchanged w1 means the gradient flowing back to w1 from the recurrent block is zero.
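For reference, the missing gradient can also be observed by fetching w1's gradient variable directly instead of comparing parameter values. The snippet below is a minimal diagnostic sketch meant to be appended to the test case above; it assumes the gradient variable created by `sgd.minimize(loss)` follows fluid's `<param>@GRAD` naming convention.

```python
# Diagnostic sketch (assumption: after sgd.minimize(loss) the gradient of
# parameter 'w1' is named 'w1@GRAD' in the default main program).
grad_var = fluid.default_main_program().block(0).var('w1@GRAD')
grad_var.persistable = True  # keep the gradient tensor alive so it can be fetched

loss_val, w1_grad = exe.run(
    feed={'x': input, 'emb': emb},
    fetch_list=[loss.name, 'w1@GRAD'])
print(np.abs(w1_grad).max())  # expected to be non-zero; prints 0.0 if w1 gets no gradient
```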