From f60f0eae11f9712f2f7955bb351f82dc2c2c412c Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 28 Sep 2017 20:00:23 -0700 Subject: [PATCH] Using double precision to stabilize lstm gradient check --- paddle/operators/lstm_unit_op.cc | 13 +++++++------ paddle/operators/lstm_unit_op.cu | 14 ++++++++------ paddle/operators/lstm_unit_op.h | 8 ++++---- .../paddle/v2/framework/tests/test_lstm_unit_op.py | 6 +++--- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/operators/lstm_unit_op.cc index bd75b001cb..dad56731de 100644 --- a/paddle/operators/lstm_unit_op.cc +++ b/paddle/operators/lstm_unit_op.cc @@ -47,7 +47,6 @@ class LstmUnitOp : public framework::OperatorWithKernel { } }; -template class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker { public: LstmUnitOpMaker(framework::OpProto* proto, @@ -68,7 +67,7 @@ Equation: H = C * sigm(o) )DOC"); - AddAttr("forget_bias", "The forget bias of Lstm Unit.") + AddAttr("forget_bias", "The forget bias of Lstm Unit.") .SetDefault(0.0); } }; @@ -93,9 +92,11 @@ class LstmUnitGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, - lstm_unit_grad, ops::LstmUnitGradOp); +REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad, + ops::LstmUnitGradOp); REGISTER_OP_CPU_KERNEL(lstm_unit, - ops::LstmUnitKernel); + ops::LstmUnitKernel, + ops::LstmUnitKernel); REGISTER_OP_CPU_KERNEL( - lstm_unit_grad, ops::LstmUnitGradKernel); + lstm_unit_grad, ops::LstmUnitGradKernel, + ops::LstmUnitGradKernel); diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu index b1db0d5322..49ea550b6f 100644 --- a/paddle/operators/lstm_unit_op.cu +++ b/paddle/operators/lstm_unit_op.cu @@ -89,7 +89,7 @@ __global__ void LSTMUnitGradientKernel(const int nthreads, const int dim, } } -template +template class LstmUnitOpCUDAKernel : public
framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -101,7 +101,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel { auto* c_tensor = ctx.Output("C"); auto* h_tensor = ctx.Output("H"); - auto forget_bias = static_cast(ctx.Attr("forget_bias")); + auto forget_bias = static_cast(ctx.Attr("forget_bias")); int b_size = c_tensor->dims()[0]; int D = c_tensor->dims()[1]; @@ -120,7 +120,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel { } }; -template +template class LstmUnitGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -153,7 +153,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel { int N = c_tensor->dims()[0]; int D = c_tensor->dims()[1]; - auto forget_bias = static_cast(ctx.Attr("forget_bias")); + auto forget_bias = static_cast(ctx.Attr("forget_bias")); int block = 512; int n = N * D; @@ -169,5 +169,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel); +REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel, + ops::LstmUnitOpCUDAKernel); +REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel, + ops::LstmUnitGradOpCUDAKernel); diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h index 0dc9a7d9a7..a0ff498c1d 100644 --- a/paddle/operators/lstm_unit_op.h +++ b/paddle/operators/lstm_unit_op.h @@ -32,7 +32,7 @@ inline T tanh(T x) { return 2. * sigmoid(2. 
* x) - 1.; } -template +template class LstmUnitKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -44,7 +44,7 @@ class LstmUnitKernel : public framework::OpKernel { auto* c_tensor = ctx.Output("C"); auto* h_tensor = ctx.Output("H"); - auto forget_bias = static_cast(ctx.Attr("forget_bias")); + auto forget_bias = static_cast(ctx.Attr("forget_bias")); int b_size = c_tensor->dims()[0]; int D = c_tensor->dims()[1]; @@ -75,7 +75,7 @@ class LstmUnitKernel : public framework::OpKernel { } }; -template +template class LstmUnitGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -108,7 +108,7 @@ class LstmUnitGradKernel : public framework::OpKernel { int N = c_tensor->dims()[0]; int D = c_tensor->dims()[1]; - auto forget_bias = static_cast(ctx.Attr("forget_bias")); + auto forget_bias = static_cast(ctx.Attr("forget_bias")); for (int n = 0; n < N; ++n) { for (int d = 0; d < D; ++d) { diff --git a/python/paddle/v2/framework/tests/test_lstm_unit_op.py b/python/paddle/v2/framework/tests/test_lstm_unit_op.py index 8ce65bfc31..365ee560e1 100644 --- a/python/paddle/v2/framework/tests/test_lstm_unit_op.py +++ b/python/paddle/v2/framework/tests/test_lstm_unit_op.py @@ -14,8 +14,8 @@ def tanh_np(x): class LstmUnitTest(OpTest): def setUp(self): self.op_type = "lstm_unit" - x_np = np.random.normal(size=(5, 16)).astype("float32") - c_np = np.random.normal(size=(5, 4)).astype("float32") + x_np = np.random.normal(size=(5, 16)).astype("float64") + c_np = np.random.normal(size=(5, 4)).astype("float64") i_np, f_np, o_np, j_np = np.split(x_np, 4, axis=1) forget_bias_np = 0. self.attrs = {'forget_bias': 0.} @@ -31,7 +31,7 @@ class LstmUnitTest(OpTest): self.check_output() def test_check_grad(self): - self.check_grad(['X', 'C_prev'], ['C', 'H'], max_relative_error=0.01) + self.check_grad(['X', 'C_prev'], ['C', 'H']) if __name__ == "__main__": -- GitLab