diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc
index 9a34daf98490926ecf0e6f60ee58e2e02364f028..24f84597cd7301af6521b8c1032e69569ba6f03a 100644
--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
@@ -31,8 +31,6 @@ class GRUUnitOp : public framework::OperatorWithKernel {
                    "Input(%s) of GRUUnitOp should not be null.", "HiddenPrev");
     PADDLE_ENFORCE(ctx->HasInput("Weight"),
                    "Input(%s) of GRUUnitOp should not be null.", "Weight");
-    PADDLE_ENFORCE(ctx->HasInput("Bias"),
-                   "Input(%s) of GRUUnitOp should not be null.", "Bias");
     PADDLE_ENFORCE(ctx->HasOutput("Gate"),
                    "Output(%s) of GRUUnitOp should not be null.", "Gate");
     PADDLE_ENFORCE(ctx->HasOutput("ResetHiddenPrev"),
@@ -43,14 +41,11 @@ class GRUUnitOp : public framework::OperatorWithKernel {
     auto input_dims = ctx->GetInputDim("Input");
     auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev");
     auto weight_dims = ctx->GetInputDim("Weight");
-    auto bias_dims = ctx->GetInputDim("Bias");
     int batch_size = input_dims[0];
     int input_size = input_dims[1];
     int frame_size = hidden_prev_dims[1];
     int weight_height = weight_dims[0];
     int weight_width = weight_dims[1];
-    int bias_height = bias_dims[0];
-    int bias_width = bias_dims[1];
     PADDLE_ENFORCE_EQ(
         input_size, frame_size * 3,
         "The input_size must be 3 times of frame_size in GRUUnitOp.");
@@ -60,10 +55,16 @@ class GRUUnitOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         weight_width, frame_size * 3,
         "The shape of Weight matrix must be [frame_size, frame_size * 3].");
-    PADDLE_ENFORCE_EQ(bias_height, 1,
-                      "The shape of Bias must be [1, frame_size * 3].");
-    PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
-                      "The shape of Bias must be [1, frame_size * 3].");
+    auto bias = Input("Bias");
+    if (bias != framework::kEmptyVarName) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+    }
     ctx->SetOutputDim("Gate", {batch_size, frame_size * 3});
     ctx->SetOutputDim("ResetHiddenPrev", {batch_size, frame_size});
     ctx->SetOutputDim("Hidden", {batch_size, frame_size});
@@ -139,8 +140,6 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
                    "HiddenPrev");
     PADDLE_ENFORCE(ctx->HasInput("Weight"),
                    "Input(%s) of GRUUnitGradOp should not be null.", "Weight");
-    PADDLE_ENFORCE(ctx->HasInput("Bias"),
-                   "Input(%s) of GRUUnitGradOp should not be null.", "Bias");
     PADDLE_ENFORCE(ctx->HasInput("Gate"),
                    "Input(%s) of GRUUnitGradOp should not be null.", "Gate");
     PADDLE_ENFORCE(ctx->HasInput("ResetHiddenPrev"),
@@ -160,14 +159,11 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
     auto input_dims = ctx->GetInputDim("Input");
     auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev");
     auto weight_dims = ctx->GetInputDim("Weight");
-    auto bias_dims = ctx->GetInputDim("Bias");
     // int batch_size = input_dims[0];
     int input_size = input_dims[1];
     int frame_size = hidden_prev_dims[1];
     int weight_height = weight_dims[0];
     int weight_width = weight_dims[1];
-    int bias_height = bias_dims[0];
-    int bias_width = bias_dims[1];
     PADDLE_ENFORCE_EQ(
         input_size, frame_size * 3,
         "The input_size must be 3 times of frame_size in GRUUnitOp.");
@@ -177,10 +173,19 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         weight_width, frame_size * 3,
         "The shape of Weight matrix must be [frame_size, frame_size * 3].");
-    PADDLE_ENFORCE_EQ(bias_height, 1,
-                      "The shape of Bias must be [1, frame_size * 3].");
-    PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
-                      "The shape of Bias must be [1, frame_size * 3].");
+    auto bias = Input("Bias");
+    if (bias != framework::kEmptyVarName) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      auto bias_grad_name = framework::GradVarName("Bias");
+      if (ctx->HasOutput(bias_grad_name))
+        ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
     auto input_grad_name = framework::GradVarName("Input");
     if (ctx->HasOutput(input_grad_name))
       ctx->SetOutputDim(input_grad_name, input_dims);
@@ -190,9 +195,6 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
     auto weight_grad_name = framework::GradVarName("Weight");
     if (ctx->HasOutput(weight_grad_name))
       ctx->SetOutputDim(weight_grad_name, weight_dims);
-    auto bias_grad_name = framework::GradVarName("Bias");
-    if (ctx->HasOutput(bias_grad_name))
-      ctx->SetOutputDim(bias_grad_name, bias_dims);
   }
 };
 
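The InferShape changes above follow the usual optional-input pattern: an input that is declared but not fed resolves to framework::kEmptyVarName, so its shape checks (and, in the grad op, its gradient dim) run only inside the guard. A minimal Python sketch of the resulting shape contract, assuming a hypothetical helper name (this is not Paddle API):

```python
def infer_gru_unit_shapes(input_shape, hidden_prev_shape, weight_shape,
                          bias_shape=None):
    """Mirror of GRUUnitOp::InferShape; bias_shape=None models kEmptyVarName."""
    batch_size, input_size = input_shape
    frame_size = hidden_prev_shape[1]
    assert input_size == frame_size * 3, \
        'input_size must be 3 times frame_size'
    assert weight_shape == (frame_size, frame_size * 3), \
        'Weight must be [frame_size, frame_size * 3]'
    # Bias is optional: validate its shape only when it is provided.
    if bias_shape is not None:
        assert bias_shape == (1, frame_size * 3), \
            'Bias must be [1, frame_size * 3]'
    return {
        'Gate': (batch_size, frame_size * 3),
        'ResetHiddenPrev': (batch_size, frame_size),
        'Hidden': (batch_size, frame_size),
    }
```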
3]."); - PADDLE_ENFORCE_EQ(bias_height, 1, - "The shape of Bias must be [1, frame_size * 3]."); - PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, - "The shape of Bias must be [1, frame_size * 3]."); + auto bias = Input("Bias"); + if (bias != framework::kEmptyVarName) { + auto bias_dims = ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) + ctx->SetOutputDim(bias_grad_name, bias_dims); + } auto input_grad_name = framework::GradVarName("Input"); if (ctx->HasOutput(input_grad_name)) ctx->SetOutputDim(input_grad_name, input_dims); @@ -190,9 +195,6 @@ class GRUUnitGradOp : public framework::OperatorWithKernel { auto weight_grad_name = framework::GradVarName("Weight"); if (ctx->HasOutput(weight_grad_name)) ctx->SetOutputDim(weight_grad_name, weight_dims); - auto bias_grad_name = framework::GradVarName("Bias"); - if (ctx->HasOutput(bias_grad_name)) - ctx->SetOutputDim(bias_grad_name, bias_dims); } }; diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h index e97aa38ac642617dafa658bdcf74591b75e10b03..c53e7d9827e0395e6ce613302e732b2797f83cdd 100644 --- a/paddle/operators/gru_unit_op.h +++ b/paddle/operators/gru_unit_op.h @@ -64,16 +64,20 @@ class GRUUnitKernel : public framework::OpKernel { auto x = EigenMatrix::From(*input); auto h_p = EigenMatrix::From(*hidden_prev); - auto b = EigenMatrix::From(*bias); auto g = EigenMatrix::From(*gate); auto r_h_p = EigenMatrix::From(*reset_hidden_prev); auto h = EigenMatrix::From(*hidden); auto place = context.GetEigenDevice(); // calculate unactivated gate outputs - g.device(place) = x + - b.reshape(Eigen::array({{1, frame_size * 3}})) - .broadcast(Eigen::array({{batch_size, 1}})); + if (bias) { + auto b = EigenMatrix::From(*bias); + g.device(place) = x + + b.reshape(Eigen::array({{1, frame_size * 3}})) + .broadcast(Eigen::array({{batch_size, 1}})); + } else { + g.device(place) = x; + } const T* hidden_prev_data = hidden_prev->data(); const T* weight_data = weight->data(); T* gate_data = gate->data(); @@ -145,7 +149,6 @@ class GRUUnitGradKernel : public framework::OpKernel { input_grad->mutable_data(context.GetPlace()); hidden_prev_grad->mutable_data(context.GetPlace()); weight_grad->mutable_data(context.GetPlace()); - bias_grad->mutable_data(context.GetPlace()); Tensor gate_grad; gate_grad.mutable_data(input->dims(), context.GetPlace()); Tensor reset_hidden_prev_grad; @@ -168,7 +171,6 @@ class GRUUnitGradKernel : public framework::OpKernel { auto d_h = EigenMatrix::From(*hidden_grad); auto d_x = EigenMatrix::From(*input_grad); auto d_h_p = EigenMatrix::From(*hidden_prev_grad); - auto d_b = EigenMatrix::From(*bias_grad); auto d_g = EigenMatrix::From(gate_grad); auto d_r_h_p = EigenMatrix::From(reset_hidden_prev_grad); auto place = context.GetEigenDevice(); @@ -216,7 +218,11 @@ class GRUUnitGradKernel : public framework::OpKernel { // backward for input d_x.device(place) = d_g; // backward for bias - d_b.device(place) = d_g.sum(Eigen::array({{0}})); + if (bias_grad) { + bias_grad->mutable_data(context.GetPlace()); + auto d_b = EigenMatrix::From(*bias_grad); + d_b.device(place) = d_g.sum(Eigen::array({{0}})); + } } }; diff --git a/python/paddle/v2/framework/tests/test_gru_unit_op.py 
diff --git a/python/paddle/v2/framework/tests/test_gru_unit_op.py b/python/paddle/v2/framework/tests/test_gru_unit_op.py
index bc8b3406e6549e5102da6ff37cb31c453ba2301c..57625362d21905d257f46ff5330841a20438773a 100644
--- a/python/paddle/v2/framework/tests/test_gru_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_gru_unit_op.py
@@ -28,6 +28,8 @@ def relu(x):
 
 
 class TestGRUUnitOp(OpTest):
+    batch_size = 3
+    frame_size = 5
     activate = {
         GRUActivationType.identity: identity,
         GRUActivationType.sigmoid: sigmoid,
@@ -35,9 +37,9 @@ class TestGRUUnitOp(OpTest):
         GRUActivationType.relu: relu,
     }
 
-    def setUp(self):
-        batch_size = 3
-        frame_size = 5
+    def set_inputs(self):
+        batch_size = self.batch_size
+        frame_size = self.frame_size
         self.op_type = 'gru_unit'
         self.inputs = {
             'Input': np.random.uniform(
@@ -47,18 +49,21 @@ class TestGRUUnitOp(OpTest):
             'Weight': np.random.uniform(
                 -1. / math.sqrt(frame_size), 1. / math.sqrt(frame_size),
                 (frame_size, frame_size * 3)).astype('float32'),
-            'Bias': np.random.uniform(-0.1, 0.1,
-                                      (1, frame_size * 3)).astype('float32')
         }
         self.attrs = {
             'activation': GRUActivationType.tanh,
            'gate_activation': GRUActivationType.sigmoid
         }
+
+    def set_outputs(self):
         # GRU calculations
+        batch_size = self.batch_size
+        frame_size = self.frame_size
         x = self.inputs['Input']
         h_p = self.inputs['HiddenPrev']
         w = self.inputs['Weight']
-        b = self.inputs['Bias']
+        b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros(
+            (1, frame_size * 3))
         g = x + np.tile(b, (batch_size, 1))
         w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
             (frame_size, frame_size * 2))
@@ -73,12 +78,33 @@ class TestGRUUnitOp(OpTest):
             g[:, frame_size * 2:])
         g = np.hstack((u_r, c))
         h = u * h_p + (1 - u) * c
-
         self.outputs = {'Gate': g, 'ResetHiddenPrev': r_h_p, 'Hidden': h}
 
+    def setUp(self):
+        self.set_inputs()
+        self.set_outputs()
+
     def test_check_output(self):
         self.check_output()
 
+    def test_check_grad(self):
+        self.check_grad(
+            ['Input', 'HiddenPrev', 'Weight'], ['Hidden'],
+            max_relative_error=0.007)
+
+
+class TestGRUUnitOpWithBias(TestGRUUnitOp):
+    def set_inputs(self):
+        batch_size = self.batch_size
+        frame_size = self.frame_size
+        super(TestGRUUnitOpWithBias, self).set_inputs()
+        self.inputs['Bias'] = np.random.uniform(
+            -0.1, 0.1, (1, frame_size * 3)).astype('float32')
+        self.attrs = {
+            'activation': GRUActivationType.identity,
+            'gate_activation': GRUActivationType.sigmoid
+        }
+
     def test_check_grad(self):
         self.check_grad(
             ['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'],
             max_relative_error=0.007)
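The base test now omits 'Bias' from both the inputs and the check_grad list, since no Bias gradient exists when the input is unbound; only TestGRUUnitOpWithBias wires it and checks it. The bias gradient the guarded backward branch produces is just a column-wise sum of the gate pre-activation gradient, which is easy to sanity-check in NumPy (a hedged sketch; d_g below is synthetic data with the test's batch_size=3, frame_size=5):

```python
import numpy as np

# With g = x + tile(b, (batch_size, 1)), dL/db sums the gate gradient
# over the batch dimension, matching d_g.sum(Eigen::array<int, 1>({{0}})).
d_g = np.random.rand(3, 5 * 3)            # [batch_size, frame_size * 3]
d_b = d_g.sum(axis=0, keepdims=True)      # [1, frame_size * 3]
assert d_b.shape == (1, 5 * 3)
```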