提交 b6b7ab63 作者: guosheng

Fix calculations in gru_unit_op to be consistent with gru_op

上级 f191c820
...@@ -146,35 +146,27 @@ class GRUUnitGradKernel : public framework::OpKernel<T> { ...@@ -146,35 +146,27 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
auto* weight_grad = auto* weight_grad =
context.Output<Tensor>(framework::GradVarName("Weight")); context.Output<Tensor>(framework::GradVarName("Weight"));
auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias")); auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
input_grad->mutable_data<T>(context.GetPlace());
hidden_prev_grad->mutable_data<T>(context.GetPlace());
weight_grad->mutable_data<T>(context.GetPlace());
Tensor gate_grad; Tensor gate_grad;
gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
Tensor reset_hidden_prev_grad; Tensor reset_hidden_prev_grad;
reset_hidden_prev_grad.mutable_data<T>(reset_hidden_prev->dims(),
context.GetPlace());
int batch_size = input->dims()[0];
int frame_size = hidden_prev->dims()[1];
const T* hidden_prev_data = hidden_prev->data<T>(); const T* hidden_prev_data = hidden_prev->data<T>();
T* hidden_prev_grad_data = hidden_prev_grad->data<T>();
const T* weight_data = weight->data<T>(); const T* weight_data = weight->data<T>();
T* weight_grad_data = weight_grad->data<T>(); T* gate_grad_data =
T* gate_grad_data = gate_grad.data<T>(); gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
const T* reset_hidden_prev_data = reset_hidden_prev->data<T>(); const T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.data<T>(); T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data<T>(
reset_hidden_prev->dims(), context.GetPlace());
auto h_p = EigenMatrix<T>::From(*hidden_prev); auto h_p = EigenMatrix<T>::From(*hidden_prev);
auto g = EigenMatrix<T>::From(*gate); auto g = EigenMatrix<T>::From(*gate);
auto d_h = EigenMatrix<T>::From(*hidden_grad); auto d_h = EigenMatrix<T>::From(*hidden_grad);
auto d_x = EigenMatrix<T>::From(*input_grad);
auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
auto d_g = EigenMatrix<T>::From(gate_grad); auto d_g = EigenMatrix<T>::From(gate_grad);
auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad); auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
auto place = context.GetEigenDevice<Place>(); auto place = context.GetEigenDevice<Place>();
int batch_size = input->dims()[0];
int frame_size = hidden_prev->dims()[1];
Eigen::array<int, 2> extents({{batch_size, frame_size}}); Eigen::array<int, 2> extents({{batch_size, frame_size}});
Eigen::array<int, 2> u_offsets({{0, 0}}); Eigen::array<int, 2> u_offsets({{0, 0}});
auto u = g.slice(u_offsets, extents); // update gate auto u = g.slice(u_offsets, extents); // update gate
...@@ -195,28 +187,42 @@ class GRUUnitGradKernel : public framework::OpKernel<T> { ...@@ -195,28 +187,42 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
gate_grad_data + frame_size * 2, frame_size * 3, gate_grad_data + frame_size * 2, frame_size * 3,
weight_data + frame_size * frame_size * 2, frame_size, weight_data + frame_size * frame_size * 2, frame_size,
0, reset_hidden_prev_grad_data, frame_size); 0, reset_hidden_prev_grad_data, frame_size);
// backward for state_weight
math::gemm<Place, T>(
context.device_context(), true, false, frame_size, frame_size,
batch_size, 1, reset_hidden_prev_data, frame_size,
gate_grad_data + frame_size * 2, frame_size * 3, 0,
weight_grad_data + frame_size * frame_size * 2, frame_size);
// backward for unactivated reset gate // backward for unactivated reset gate
ActGradCompute(context.Attr<int>("gate_activation"), place, r, r, ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
d_g.slice(r_offsets, extents), d_r_h_p * h_p); d_g.slice(r_offsets, extents), d_r_h_p * h_p);
// backward for update_gate_weight and reset_gate_weight // backward for weight
math::gemm<Place, T>(context.device_context(), true, false, frame_size, if (weight_grad) {
frame_size * 2, batch_size, 1, hidden_prev_data, T* weight_grad_data = weight_grad->mutable_data<T>(context.GetPlace());
frame_size, gate_grad_data, frame_size * 3, 0, // backward for state_weight
weight_grad_data, frame_size * 2); math::gemm<Place, T>(
context.device_context(), true, false, frame_size, frame_size,
batch_size, 1, reset_hidden_prev_data, frame_size,
gate_grad_data + frame_size * 2, frame_size * 3, 0,
weight_grad_data + frame_size * frame_size * 2, frame_size);
// backward for update_gate_weight and reset_gate_weight
math::gemm<Place, T>(context.device_context(), true, false, frame_size,
frame_size * 2, batch_size, 1, hidden_prev_data,
frame_size, gate_grad_data, frame_size * 3, 0,
weight_grad_data, frame_size * 2);
}
// backward for hidden_prev // backward for hidden_prev
d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u); if (hidden_prev_grad) {
math::gemm<Place, T>(context.device_context(), false, true, batch_size, T* hidden_prev_grad_data =
frame_size, frame_size * 2, 1, gate_grad_data, hidden_prev_grad->mutable_data<T>(context.GetPlace());
frame_size * 3, weight_data, frame_size * 2, 1, auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
hidden_prev_grad_data, frame_size); d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
math::gemm<Place, T>(context.device_context(), false, true, batch_size,
frame_size, frame_size * 2, 1, gate_grad_data,
frame_size * 3, weight_data, frame_size * 2, 1,
hidden_prev_grad_data, frame_size);
}
// backward for input // backward for input
d_x.device(place) = d_g; if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
auto d_x = EigenMatrix<T>::From(*input_grad);
d_x.device(place) = d_g;
}
// backward for bias // backward for bias
if (bias_grad) { if (bias_grad) {
bias_grad->mutable_data<T>(context.GetPlace()); bias_grad->mutable_data<T>(context.GetPlace());
......
...@@ -28,8 +28,8 @@ def relu(x): ...@@ -28,8 +28,8 @@ def relu(x):
class TestGRUUnitOp(OpTest): class TestGRUUnitOp(OpTest):
batch_size = 3 batch_size = 5
frame_size = 5 frame_size = 10
activate = { activate = {
GRUActivationType.identity: identity, GRUActivationType.identity: identity,
GRUActivationType.sigmoid: sigmoid, GRUActivationType.sigmoid: sigmoid,
...@@ -92,9 +92,7 @@ class TestGRUUnitOp(OpTest): ...@@ -92,9 +92,7 @@ class TestGRUUnitOp(OpTest):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
self.check_grad( self.check_grad(['Input', 'HiddenPrev', 'Weight'], ['Hidden'])
['Input', 'HiddenPrev', 'Weight'], ['Hidden'],
max_relative_error=0.007)
class TestGRUUnitOpWithBias(TestGRUUnitOp): class TestGRUUnitOpWithBias(TestGRUUnitOp):
...@@ -110,9 +108,12 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp): ...@@ -110,9 +108,12 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp):
} }
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'])
def test_check_grad_ingore_input(self):
self.check_grad( self.check_grad(
['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'], ['HiddenPrev', 'Weight', 'Bias'], ['Hidden'],
max_relative_error=0.007) no_grad_set=set('Input'))
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册