diff --git a/paddle/operators/lstmp_op.cc b/paddle/operators/lstmp_op.cc
index 4c7f7713eea5..266612294c69 100644
--- a/paddle/operators/lstmp_op.cc
+++ b/paddle/operators/lstmp_op.cc
@@ -39,21 +39,12 @@ class LSTMPOp : public framework::OperatorWithKernel {
                    "Output(BatchGate) of LSTMP should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
                    "Output(BatchGate) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
+                   "Output(BatchHidden) of LSTMP should not be null.");

     auto in_dims = ctx->GetInputDim("Input");
     PADDLE_ENFORCE_EQ(in_dims.size(), 2, "Input(X)'s rank must be 2.");

-    if (ctx->HasInput("H0")) {
-      PADDLE_ENFORCE(ctx->HasInput("C0"),
-                     "Input(C0) and Input(H0) of LSTMP should not "
-                     "be null at the same time.");
-      auto h_dims = ctx->GetInputDim("H0");
-      auto c_dims = ctx->GetInputDim("C0");
-      PADDLE_ENFORCE(h_dims == c_dims,
-                     "The dimension of Input(H0) and Input(C0) "
-                     "should be the same.");
-    }
-
     int frame_size = in_dims[1] / 4;
     auto w_dims = ctx->GetInputDim("Weight");
     auto proj_dims = ctx->GetInputDim("ProjWeight");
@@ -75,6 +66,18 @@ class LSTMPOp : public framework::OperatorWithKernel {
                       "should be %d.",
                       frame_size);

+    if (ctx->HasInput("H0")) {
+      PADDLE_ENFORCE(ctx->HasInput("C0"),
+                     "Input(C0) and Input(H0) of LSTMP should not "
+                     "be null at the same time.");
+      auto h_dims = ctx->GetInputDim("H0");
+      auto c_dims = ctx->GetInputDim("C0");
+      PADDLE_ENFORCE(h_dims == c_dims,
+                     "The dimension of Input(H0) and Input(C0) "
+                     "should be the same.");
+      ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]});
+    }
+
     auto b_dims = ctx->GetInputDim("Bias");
     PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
     PADDLE_ENFORCE_EQ(b_dims[0], 1,
@@ -98,6 +101,7 @@ class LSTMPOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Cell", out_dims);
     ctx->SetOutputDim("BatchGate", in_dims);
     ctx->SetOutputDim("BatchCellPreAct", out_dims);
+    ctx->SetOutputDim("BatchHidden", out_dims);
     ctx->ShareLoD("Input", "Projection");
     ctx->ShareLoD("Input", "Cell");
   }
@@ -169,6 +173,15 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
               "(LoDTensor) This LoDTensor is obtained in the forward and used "
               "in the backward.")
        .AsIntermediate();
+    AddOutput("BatchHidden",
+              "(LoDTensor) This LoDTensor is obtained in the forward and used "
+              "in the backward.")
+        .AsIntermediate();
+    AddOutput("OrderedP0",
+              "(Tensor) the projection of the initial hidden state "
+              "H0. This is a tensor with shape (N x P), where N is the "
+              "batch size and P is the hidden size.")
+        .AsIntermediate();
    AddAttr<bool>("use_peepholes",
                  "(bool, defalut: True) "
                  "whether to enable diagonal/peephole connections.")
@@ -177,6 +190,12 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(bool, defalut: False) "
                  "whether to compute reversed LSTMP.")
        .SetDefault(false);
+    AddAttr<bool>("share_cell_act",
+                  "(bool, default: True) "
+                  "whether to share the activation with the cell output. "
+                  "If false, the projection is linear; otherwise it passes "
+                  "through the same activation as the cell output.")
+        .SetDefault(true);
    AddAttr<std::string>(
        "gate_activation",
        "(string, default: sigmoid)"
@@ -213,7 +232,7 @@ o_t = \sigma(W_{ox}x_{t} + W_{oh}r_{t-1} + W_{oc}c_t + b_o) \\

 h_t = o_t \odot act_h(c_t)

-r_t = W_{rh}h_t
+r_t = act_h'(W_{rh}h_t)

 $$

 where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
@@ -229,7 +248,8 @@ layer. The $\odot$ is the element-wise product of the vectors.
 $act_g$ and $act_h$ are the cell input and cell output activation functions
 and `tanh` is usually
-used for them.
+used for them. If `share_cell_act` is set to `False`, $act_h'$ will be the
+identity; otherwise it will be the same as $act_h$.

 Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
 operations on the input $x_{t}$ are NOT included in this operator.
@@ -246,12 +266,14 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Input"),
                    "Input(Input) of LSTMP should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
-                   "Input(Hidden) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Projection"),
+                   "Input(Projection) of LSTMP should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Cell"),
                    "Input(Cell) of LSTMP should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Weight"),
                    "Input(Weight) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
+                   "Input(ProjWeight) of LSTMP should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Bias"),
                    "Input(Bias) of LSTMP should not be null.");
@@ -268,6 +290,7 @@ class LSTMPGradOp : public framework::OperatorWithKernel {

     SetOutGradDim("Input");
     SetOutGradDim("Weight");
+    SetOutGradDim("ProjWeight");
     SetOutGradDim("Bias");
     SetOutGradDim("H0");
     SetOutGradDim("C0");
diff --git a/paddle/operators/lstmp_op.cu.cc b/paddle/operators/lstmp_op.cu
similarity index 100%
rename from paddle/operators/lstmp_op.cu.cc
rename to paddle/operators/lstmp_op.cu
diff --git a/paddle/operators/lstmp_op.h b/paddle/operators/lstmp_op.h
index f5a38b2ff500..9467ccdb5a8e 100644
--- a/paddle/operators/lstmp_op.h
+++ b/paddle/operators/lstmp_op.h
@@ -13,18 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include "paddle/framework/op_registry.h"
+#include "paddle/operators/activation_op.h"
 #include "paddle/operators/math/detail/activation_functions.h"
 #include "paddle/operators/math/lstm_compute.h"
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/sequence2batch.h"

+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
 namespace paddle {
 namespace operators {

 using LoDTensor = framework::LoDTensor;
 using Tensor = framework::Tensor;

+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
                              const framework::Tensor& src, const size_t* index,
@@ -37,6 +44,21 @@ inline void ReorderInitState(const DeviceContext& ctx,

 template <typename DeviceContext, typename T>
 class LSTMPKernel : public framework::OpKernel<T> {
  public:
+  template <typename Device, typename X, typename Y>
+  void ActCompute(const math::detail::ActivationType act_type, const Device& d,
+                  X x, Y y) const {
+    if (act_type == math::detail::ActivationType::kIdentity)
+      y.device(d) = x;
+    else if (act_type == math::detail::ActivationType::kSigmoid)
+      SigmoidFunctor<T>()(d, x, y);
+    else if (act_type == math::detail::ActivationType::kTanh)
+      TanhFunctor<T>()(d, x, y);
+    else if (act_type == math::detail::ActivationType::kReLU)
+      ReluFunctor<T>()(d, x, y);
+    else
+      PADDLE_THROW("unsupported activation type");
+  }
+
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* input = ctx.Input<LoDTensor>("Input");
     auto* weight = ctx.Input<Tensor>("Weight");
@@ -44,6 +66,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
     auto* bias = ctx.Input<Tensor>("Bias");

     auto* hidden_t0 = ctx.Input<Tensor>("H0");
+    auto* ordered_proj0 = ctx.Output<Tensor>("OrderedP0");
     auto* cell_t0 = ctx.Input<Tensor>("C0");

     auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
@@ -97,12 +120,13 @@ class LSTMPKernel : public framework::OpKernel<T> {
     }

     // Use the local variable as here.
-    LoDTensor batch_hidden, batch_proj, batch_cell;
+    LoDTensor batch_proj, batch_cell;
     auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
-    batch_hidden.mutable_data<T>(dims, ctx.GetPlace());     // T x D
+    batch_cell_pre_act->mutable_data<T>(dims, ctx.GetPlace());
+    auto* batch_hidden = ctx.Output<LoDTensor>("BatchHidden");
+    batch_hidden->mutable_data<T>(dims, ctx.GetPlace());    // T x D
     batch_proj.mutable_data<T>(proj_dims, ctx.GetPlace());  // T x P
     batch_cell.mutable_data<T>(dims, ctx.GetPlace());       // T x D
-    batch_cell_pre_act->mutable_data<T>(dims, ctx.GetPlace());

     auto batch_starts = batch_gate->lod()[0];
     size_t num_batch = batch_starts.size() - 1;
@@ -112,13 +136,15 @@ class LSTMPKernel : public framework::OpKernel<T> {
         ctx.Attr<std::string>("cell_activation"));
     auto cand_act = math::detail::GetActivationType(
         ctx.Attr<std::string>("candidate_activation"));
+    auto share_cell_act = ctx.Attr<bool>("share_cell_act");
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();

     for (size_t n = 0; n < num_batch; n++) {
       int bstart = static_cast<int>(batch_starts[n]);
       int bend = static_cast<int>(batch_starts[n + 1]);

       Tensor gate_t = batch_gate->Slice(bstart, bend);
-      Tensor hidden_t = batch_hidden.Slice(bstart, bend);
+      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
       Tensor proj_t = batch_proj.Slice(bstart, bend);
       Tensor cell_t = batch_cell.Slice(bstart, bend);
       Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
@@ -140,15 +166,19 @@ class LSTMPKernel : public framework::OpKernel<T> {
         // Since the batch computing for LSTMP reorders the input sequence
         // according to their length. The initialized hidden state also needs
         // to reorder.
-        Tensor ordered_h0, ordered_proj0;
-        ordered_proj0.Resize({1, proj_weight->dims()[1]});
-        ordered_proj0.mutable_data<T>(ctx.GetPlace());
+
+        Tensor ordered_h0;
+        ordered_proj0->mutable_data<T>(ctx.GetPlace());
         ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
                                            &ordered_h0, true);
         math::matmul<DeviceContext, T>(device_ctx, ordered_h0, false,
                                        *proj_weight, false, static_cast<T>(1.0),
-                                       &ordered_proj0, static_cast<T>(0.0));
-        math::matmul<DeviceContext, T>(device_ctx, ordered_proj0, false,
+                                       ordered_proj0, static_cast<T>(0.0));
+        if (share_cell_act) {
+          auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
+          ActCompute(cell_act, place, proj0_dev, proj0_dev);
+        }
+        math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, false,
                                        *weight, false, static_cast<T>(1.0),
                                        &gate_t, static_cast<T>(1.0));
       }
@@ -164,6 +194,10 @@ class LSTMPKernel : public framework::OpKernel<T> {
       math::matmul<DeviceContext, T>(device_ctx, hidden_t, false, *proj_weight,
                                      false, static_cast<T>(1.0), &proj_t,
                                      static_cast<T>(0.0));
+      if (share_cell_act) {
+        auto proj_t_dev = EigenMatrix<T>::From(proj_t);
+        ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
+      }
     }

     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
@@ -180,9 +214,26 @@ class LSTMPKernel : public framework::OpKernel<T> {
 template <typename DeviceContext, typename T>
 class LSTMPGradKernel : public framework::OpKernel<T> {
  public:
+  template <typename Device, typename X, typename Y, typename DX, typename DY>
+  void ActGradCompute(const math::detail::ActivationType act_type,
+                      const Device& d, X x, Y y, DX dx, DY dy) const {
+    // x is dummy and won't be used even in Relu(use y instead)
+    if (act_type == math::detail::ActivationType::kIdentity)
+      dx.device(d) = dy;
+    else if (act_type == math::detail::ActivationType::kSigmoid)
+      SigmoidGradFunctor<T>()(d, x, y, dy, dx);
+    else if (act_type == math::detail::ActivationType::kTanh)
+      TanhGradFunctor<T>()(d, x, y, dy, dx);
+    else if (act_type == math::detail::ActivationType::kReLU)
+      ReluGradFunctor<T>()(d, x, y, dy, dx);
+    else
+      PADDLE_THROW("unsupported activation type");
+  }
+
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* input = ctx.Input<LoDTensor>("Input");
     auto* weight = ctx.Input<Tensor>("Weight");
+    auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
     auto* bias = ctx.Input<Tensor>("Bias");

     auto* proj_out = ctx.Input<LoDTensor>("Projection");
@@ -190,14 +241,19 @@ class LSTMPGradKernel : public framework::OpKernel<T> {

     auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
     auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
+    auto* batch_hidden = ctx.Input<LoDTensor>("BatchHidden");

-    auto* hidden_g = ctx.Input<LoDTensor>(framework::GradVarName("Projection"));
+    auto* projection_g =
+        ctx.Input<LoDTensor>(framework::GradVarName("Projection"));

     auto* in_g = ctx.Output<LoDTensor>(framework::GradVarName("Input"));
     auto* weight_g = ctx.Output<Tensor>(framework::GradVarName("Weight"));
+    auto* proj_weight_g =
+        ctx.Output<Tensor>(framework::GradVarName("ProjWeight"));
     auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));

     auto* h0 = ctx.Input<Tensor>("H0");
+    auto* ordered_proj0 = ctx.Input<Tensor>("OrderedP0");
     auto* c0 = ctx.Input<Tensor>("C0");

     auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
@@ -209,6 +265,10 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       weight_g->mutable_data<T>(ctx.GetPlace());
       zero(device_ctx, weight_g, static_cast<T>(0.0));
     }
+    if (proj_weight_g) {
+      proj_weight_g->mutable_data<T>(ctx.GetPlace());
+      zero(device_ctx, proj_weight_g, static_cast<T>(0.0));
+    }

     // ordered_h0/c0 is the reordered hidden/cell initialization.
     // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
@@ -224,7 +284,8 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     }

     auto in_dims = input->dims();
-    auto out_dims = hidden_g->dims();
+    auto out_dims = cell_out->dims();
+    framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
     int frame_size = static_cast<int>(in_dims[1] / 4);
     PADDLE_ENFORCE_EQ(frame_size, out_dims[1]);
@@ -267,10 +328,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       to_batch(ctx, src, dst, false);
     };

-    LoDTensor batch_proj, batch_proj_g, batch_cell;
-    ToBatch(device_ctx, *proj_out, out_dims, batch_proj);
-    ToBatch(device_ctx, *hidden_g, out_dims, batch_proj_g);
-    ToBatch(device_ctx, *cell_out, out_dims, batch_cell);
+    LoDTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell;
+    batch_hidden_g.mutable_data<T>(out_dims, ctx.GetPlace());
+    ToBatch(device_ctx, *proj_out, proj_dims, batch_proj);        // T x P
+    ToBatch(device_ctx, *projection_g, proj_dims, batch_proj_g);  // T x P
+    ToBatch(device_ctx, *cell_out, out_dims, batch_cell);         // T x D

     LoDTensor batch_cell_g, batch_gate_g;
     batch_cell_g.mutable_data<T>(out_dims, ctx.GetPlace());
@@ -286,6 +348,8 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
         ctx.Attr<std::string>("cell_activation"));
     auto cand_act = math::detail::GetActivationType(
         ctx.Attr<std::string>("candidate_activation"));
+    auto share_cell_act = ctx.Attr<bool>("share_cell_act");
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();

     auto batch_starts = batch_gate->lod()[0];
     size_t num_batch = batch_starts.size() - 1;
@@ -293,6 +357,19 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       int bstart = static_cast<int>(batch_starts[n]);
       int bend = static_cast<int>(batch_starts[n + 1]);

+      Tensor cur_proj = batch_proj.Slice(bstart, bend);
+      Tensor proj_g = batch_proj_g.Slice(bstart, bend);
+      if (share_cell_act) {
+        auto cur_proj_dev = EigenMatrix<T>::From(cur_proj);
+        auto proj_g_dev = EigenMatrix<T>::From(proj_g);
+        ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev,
+                       proj_g_dev);
+      }
+      Tensor out_g = batch_hidden_g.Slice(bstart, bend);
+      math::matmul<DeviceContext, T>(device_ctx, proj_g, false, *proj_weight,
+                                     true, static_cast<T>(1.0), &out_g,
+                                     static_cast<T>(0.0));
+
       Tensor gate = batch_gate->Slice(bstart, bend);
       Tensor cell = batch_cell.Slice(bstart, bend);
       Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
@@ -300,7 +377,6 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       lstmp_value.state_value = cell.data<T>();
       lstmp_value.state_active_value = cell_pre_act.data<T>();

-      Tensor out_g = batch_proj_g.Slice(bstart, bend);
       Tensor gate_g = batch_gate_g.Slice(bstart, bend);
       Tensor cell_g = batch_cell_g.Slice(bstart, bend);
       lstmp_grad.state_grad = cell_g.data<T>();
@@ -337,19 +413,48 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
                                          false, static_cast<T>(1.0), weight_g,
                                          static_cast<T>(1.0));
        }
+        if (proj_weight_g) {
+          /* backward proj weight */
+          Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+          math::matmul<DeviceContext, T>(device_ctx, hidden_t, true, proj_g,
+                                         false, static_cast<T>(1.0),
+                                         proj_weight_g, static_cast<T>(1.0));
+        }
      } else {
        if (h0 && weight_g) {
          ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
                                             &ordered_h0, true);
-          math::matmul<DeviceContext, T>(device_ctx, ordered_h0, true, gate_g,
-                                         false, static_cast<T>(1.0), weight_g,
-                                         static_cast<T>(1.0));
+          if (weight_g) {
+            math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, true,
+                                           gate_g, false, static_cast<T>(1.0),
+                                           weight_g, static_cast<T>(1.0));
+          }
        }
-        if (h0 && h0_g) {
+        if (h0 && (h0_g || proj_weight_g)) {
          ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
+          Tensor proj0_g;
+          proj0_g.Resize({in_dims[0], proj_weight->dims()[1]});
+          proj0_g.mutable_data<T>(ctx.GetPlace());
          math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight,
-                                         true, static_cast<T>(1.0),
-                                         &ordered_h0_g, static_cast<T>(0.0));
+                                         true, static_cast<T>(1.0), &proj0_g,
+                                         static_cast<T>(0.0));
+          if (share_cell_act) {
+            auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
+            auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
+            ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev,
+                           proj0_g_dev);
+          }
+          // Tensor proj0_g = proj_g.Slice(bstart, bend);
+          if (h0_g) {
+            math::matmul<DeviceContext, T>(
+                device_ctx, proj0_g, false, *proj_weight, true,
+                static_cast<T>(1.0), &ordered_h0_g, static_cast<T>(0.0));
+          }
+          if (proj_weight_g) {
+            math::matmul<DeviceContext, T>(device_ctx, ordered_h0, true,
+                                           proj0_g, false, static_cast<T>(1.0),
+                                           proj_weight_g, static_cast<T>(1.0));
+          }
        }
      }
    }
diff --git a/python/paddle/v2/fluid/tests/test_lstmp_op.py b/python/paddle/v2/fluid/tests/test_lstmp_op.py
index e35712ec069a..81e06063fc7d 100644
--- a/python/paddle/v2/fluid/tests/test_lstmp_op.py
+++ b/python/paddle/v2/fluid/tests/test_lstmp_op.py
@@ -62,7 +62,8 @@ def lstmp(
         is_reverse=False,
         act_gate=None,
         act_cell=None,
-        act_cand=None):
+        act_cand=None,
+        share_cell_act=True):
     def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand):
         g = np.dot(r_pre, w_r)  # 1 x 4D
         g = g + x
@@ -85,6 +86,8 @@ def lstmp(
         h = g_o * act_cell(c)
         # projection
         r = np.dot(h, w_rh)
+        if share_cell_act:
+            r = act_cell(r)
         return r, c

     def _reverse(x, lod):
@@ -107,6 +110,8 @@ def lstmp(
         seq_len = offset[i + 1] - offset[i]
         x = input[offset[i]:offset[i + 1], :]
         r_pre = np.dot(h0[i], w_rh)  # 1 x P
+        if share_cell_act:
+            r_pre = act_cell(r_pre)
         c_pre = c0[i]  # 1 x D
         for j in range(seq_len):
             # compute one step
@@ -138,6 +143,7 @@ class TestLstmOp(OpTest):
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'

+        self.share_cell_act = True
         self.has_initial_state = False
         self.is_reverse = False
         self.use_peepholes = True
@@ -167,7 +173,7 @@ class TestLstmOp(OpTest):
         w_rh = np.random.normal(size=(self.D, self.P)).astype('float64')
         r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
                      ACTVATION[self.act_gate], ACTVATION[self.act_cell],
-                     ACTVATION[self.act_cand])
+                     ACTVATION[self.act_cand], self.share_cell_act)

         self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}

@@ -192,28 +198,30 @@ class TestLstmOp(OpTest):
     def test_check_output(self):
         self.check_output(atol=1e-8)

-    """
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
         N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4)
-    """
+            ['Input', 'Weight', 'Bias'], ['Projection'],
+            max_relative_error=5e-3)


-"""
 class TestLstmOpHasInitial(TestLstmOp):
     def set_argument(self):
         self.lod = [[0, 2, 5, 7]]
         self.D = 16
+        self.P = 5

         self.act_gate = 'sigmoid'
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'

+        self.share_cell_act = True
         self.has_initial_state = True
         self.is_reverse = True
         self.use_peepholes = True
@@ -221,63 +229,74 @@ class TestLstmOpHasInitial(TestLstmOp):

     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
         N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
-            max_relative_error=5e-4)
+            ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Projection'],
+            max_relative_error=5e-3)

     def test_check_grad_ingore_bias(self):
         N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight'], ['Hidden'],
-            max_relative_error=5e-4,
+            ['Input', 'Weight'], ['Projection'],
+            max_relative_error=5e-3,
             no_grad_set=set('Bias'))

     def test_check_grad_ingore_weight(self):
         N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Bias'], ['Hidden'],
-            max_relative_error=5e-4,
+            ['Input', 'Bias'], ['Projection'],
+            max_relative_error=5e-3,
             no_grad_set=set('Weight'))

     def test_check_grad_ingore_input(self):
         N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Weight', 'Bias'], ['Hidden'],
-            max_relative_error=5e-4,
+            ['Weight', 'Bias'], ['Projection'],
+            max_relative_error=5e-3,
             no_grad_set=set('Input'))

     def test_check_grad_ingore_h0(self):
         N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
-            max_relative_error=5e-4,
+            ['Input', 'Weight', 'Bias', 'C0'], ['Projection'],
+            max_relative_error=5e-3,
             no_grad_set=set('H0'))

     def test_check_grad_ingore_c0(self):
         N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
-            max_relative_error=5e-4,
+            ['Input', 'Weight', 'Bias', 'H0'], ['Projection'],
+            max_relative_error=5e-3,
             no_grad_set=set('C0'))
-"""


 class TestLstmOpRerverse(TestLstmOp):
@@ -290,6 +309,7 @@ class TestLstmOpRerverse(TestLstmOp):
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'

+        self.share_cell_act = True
         self.has_initial_state = False
         self.is_reverse = True
         self.use_peepholes = True
@@ -305,6 +325,7 @@ class TestLstmOpNotUsePeepholes(TestLstmOp):
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'

+        self.share_cell_act = True
         self.has_initial_state = False
         self.is_reverse = True
         self.use_peepholes = False
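
The sketch below is a minimal, standalone numpy illustration (hypothetical helper names `proj_forward`/`proj_backward`, not the operator's API) of the projection path that `share_cell_act` toggles and of the gradients the new `ActGradCompute` and `math::matmul` calls in `LSTMPGradKernel` correspond to: with $r = act_h(h W_{rh})$, $\partial L/\partial h = (\partial L/\partial r \odot act_h'(r))\,W_{rh}^T$ and $\partial L/\partial W_{rh} = h^T(\partial L/\partial r \odot act_h'(r))$. It assumes `tanh` as the shared activation and checks the `ProjWeight` gradient against a finite difference on random data.

```python
import numpy as np

# Projection path of LSTMP that `share_cell_act` toggles (illustrative sketch).
def proj_forward(h, w_rh, share_cell_act=True):
    r = np.dot(h, w_rh)               # r = h * W_rh                 (T x P)
    if share_cell_act:
        r = np.tanh(r)                # r = act_h(h * W_rh)
    return r

def proj_backward(h, w_rh, r, d_r, share_cell_act=True):
    if share_cell_act:
        d_r = d_r * (1.0 - r * r)     # tanh' expressed through the output r
    d_h = np.dot(d_r, w_rh.T)         # cf. matmul(proj_g, ProjWeight^T) -> out_g
    d_w_rh = np.dot(h.T, d_r)         # cf. matmul(hidden_t^T, proj_g) -> ProjWeight@GRAD
    return d_h, d_w_rh

# Finite-difference check of the ProjWeight gradient on random data.
rng = np.random.RandomState(0)
T, D, P = 5, 4, 3
h, w_rh = rng.randn(T, D), rng.randn(D, P)
d_r = rng.randn(T, P)                 # upstream gradient w.r.t. the projection

r = proj_forward(h, w_rh)
_, d_w_rh = proj_backward(h, w_rh, r, d_r)

eps = 1e-6
num = np.zeros_like(w_rh)
for i in range(D):
    for j in range(P):
        w_p, w_m = w_rh.copy(), w_rh.copy()
        w_p[i, j] += eps
        w_m[i, j] -= eps
        num[i, j] = np.sum(
            d_r * (proj_forward(h, w_p) - proj_forward(h, w_m))) / (2 * eps)

print(np.max(np.abs(num - d_w_rh)))   # near zero: analytic and numeric agree
```

The same two formulas explain why the grad kernel first applies `ActGradCompute` to `proj_g` in place and only then multiplies by `ProjWeight` (transposed) to recover the hidden-state gradient, and why `hidden_t` rather than the projected output is used when accumulating `ProjWeight@GRAD`.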