diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index 8d0f59745fd58eb975b952369a53e48584a45126..603dc7f4bd681415bcd540e06365437c6885fedd 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -75,8 +75,8 @@ class MulOpGrad : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(y_dims[1] == out_dims[1],
                    "Out@GRAD M X N must equal to Y dims 1, N ");
-    x_grad->Resize(x_dims);
-    y_grad->Resize(y_dims);
+    if (x_grad) x_grad->Resize(x_dims);
+    if (y_grad) y_grad->Resize(y_dims);
   }
 };
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index 8facc0281449785bf40726f23ca2fd5d166ff272..66ed2f81c7b65c73c665b83e323f76228a332de6 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -31,13 +31,13 @@ template <typename Place, typename T>
 class MulKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<Tensor>("X");
-    auto* Y = context.Input<Tensor>("Y");
-    auto* Z = context.Output<Tensor>("Out");
-    Z->mutable_data<T>(context.GetPlace());
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Input<Tensor>("Y");
+    auto* z = context.Output<Tensor>("Out");
+    z->mutable_data<T>(context.GetPlace());
     auto* device_context =
         const_cast<platform::DeviceContext*>(context.device_context_);
-    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+    math::matmul<Place, T>(*x, false, *y, false, 1, z, 0, device_context);
   }
 };
@@ -45,20 +45,24 @@ template <typename Place, typename T>
 class MulGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* X = ctx.Input<Tensor>("X");
-    auto* Y = ctx.Input<Tensor>("Y");
-    auto* dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));

-    auto* dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dY = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    dX->mutable_data<T>(ctx.GetPlace());
-    dY->mutable_data<T>(ctx.GetPlace());
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     auto* device_context =
         const_cast<platform::DeviceContext*>(ctx.device_context_);
-    // dX = dOut * Y'. dX: M x K, dOut : M x N, Y : K x N
-    math::matmul<Place, T>(*dOut, false, *Y, true, 1, dX, 0, device_context);
-    // dY = X' * dOut. dY: K x N, dOut : M x N, X : M x K
-    math::matmul<Place, T>(*X, true, *dOut, false, 1, dY, 0, device_context);
+    if (dx) {
+      // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
+      dx->mutable_data<T>(ctx.GetPlace());
+      math::matmul<Place, T>(*dout, false, *y, true, 1, dx, 0, device_context);
+    }
+    if (dy) {
+      dy->mutable_data<T>(ctx.GetPlace());
+      // dy = x' * dout. dy: K x N, dout : M x N, x : M x K
+      math::matmul<Place, T>(*x, true, *dout, false, 1, dy, 0, device_context);
+    }
   }
 };
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index 63de91254f4b75587cb2fb29aeb8ff7358ba8e76..a9dfba3e95604fc259753dec3b536c2eb5e1ebe8 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -64,8 +64,10 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel {
     auto dims0 = ctx.Input<Tensor>("X")->dims();
     auto dims1 = ctx.Input<Tensor>("b")->dims();
     PADDLE_ENFORCE_EQ(1, dims1.size(), "b dims should be 1")
-    ctx.Output<Tensor>(framework::GradVarName("X"))->Resize(dims0);
-    ctx.Output<Tensor>(framework::GradVarName("b"))->Resize(dims1);
+    auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *db = ctx.Output<Tensor>(framework::GradVarName("b"));
+    if (dx) dx->Resize(dims0);
+    if (db) db->Resize(dims1);
   }
 };
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
index 1cbd8bb31ad90a32d8a4e3bb59617d0b5384e470..4e926d9f2947f37b71e81c0fa592b0c66b19c640 100644
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -51,20 +51,24 @@ template <typename Place, typename T>
 class RowwiseAddGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
     auto* db = context.Output<Tensor>(framework::GradVarName("b"));
-    dX->mutable_data<T>(context.GetPlace());
-    db->mutable_data<T>(context.GetPlace());
-    auto OutGrad = EigenMatrix<T>::From(*dOut);
+    auto out_grad = EigenMatrix<T>::From(*dout);
     auto place = context.GetEigenDevice<Place>();
-    EigenMatrix<T>::From(*dX).device(place) = OutGrad;
+    if (dx) {
+      dx->mutable_data<T>(context.GetPlace());
+      EigenMatrix<T>::From(*dx).device(place) = out_grad;
+    }

-    // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html
-    // colwise add
-    Eigen::array<int, 1> dims{{0}}; /* dimension to reduce */
-    EigenVector<T>::Flatten(*db).device(place) = OutGrad.sum(dims);
+    if (db) {
+      db->mutable_data<T>(context.GetPlace());
+      // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html
+      // colwise add
+      Eigen::array<int, 1> dims{{0}}; /* dimension to reduce */
+      EigenVector<T>::Flatten(*db).device(place) = out_grad.sum(dims);
+    }
   }
 };
 }  // namespace operators
diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc
index 35c185ad80f93d1005c1616dcffd2e61bcd54222..9b5068f07c863f4a863e22cd670634663496333b 100644
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -50,8 +50,8 @@ class ScatterGradOp : public framework::OperatorWithKernel {
     auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
     auto *Ref = ctx.Input<Tensor>("Ref");

-    dRef->Resize(Ref->dims());
-    dUpdates->Resize(Updates->dims());
+    if (dRef) dRef->Resize(Ref->dims());
+    if (dUpdates) dUpdates->Resize(Updates->dims());
   }
 };
diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h
index e9595638a86a4a4536ddad4e6f20fd80a54b1608..7551480211a5203ed590487bdda8269c561cde1b 100644
--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
@@ -49,10 +49,12 @@ class ScatterGradientOpKernel : public framework::OpKernel {
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));

     // In place gradient: dRef = dO
-    dRef->ShareDataWith<T>(*dOut);
-    dUpdates->mutable_data<T>(ctx.GetPlace());
-    // Gradient by Gather: dUpdates += dO[Index]
-    Gather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    if (dRef) dRef->ShareDataWith<T>(*dOut);
+    if (dUpdates) {
+      dUpdates->mutable_data<T>(ctx.GetPlace());
+      // Gradient by Gather: dUpdates += dO[Index]
+      Gather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    }
   }
 };
diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py
index 518f828bacd60e7cb8375b22c6c3296f9bfeb5ea..82ab7ad39b41b291b9c9b66efccc3511dd2fd471 100644
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -286,6 +286,9 @@ class GradientChecker(unittest.TestCase):
         for no_grad in no_grad_set:
             if no_grad not in in_names:
                 raise ValueError("no_grad should be in in_names")
+            if no_grad in inputs_to_check:
+                raise ValueError("no_grad should not be in inputs_to_check")
+
         backward_op = core.Operator.backward(forward_op, no_grad_set)

         places = [core.CPUPlace()]
@@ -301,9 +304,25 @@
         check_names = [grad_var_name(name) for name in inputs_to_check]
         for place in places:
-            # get analytical gradients according to different device
-            analytic_grads = self.__get_gradient(forward_op, backward_op,
-                                                 input_vars, check_names, place)
+            # analytic_grads = self.__get_gradient(forward_op, backward_op,
+            #                                      input_vars, check_names, place)
+            # In fact, the two commented-out lines above could replace the code
+            # below. However, most gradient operators need to handle the case
+            # where the gradients of one or more inputs are not needed, so the
+            # unit test framework explicitly exercises that case through the
+            # loop below. In addition, if none of the inputs needs a gradient,
+            # core.Operator.backward() returns a NOP operator; that case is not
+            # tested here.
+            analytic_grads = []
+            for name in inputs_to_check:
+                no_grads = list(no_grad_set)
+                no_grads.extend(filter(lambda x: x != name, inputs_to_check))
+                backward_op = core.Operator.backward(forward_op, set(no_grads))
+                # get analytical gradients according to different device
+                analytic_grads.extend(
+                    self.__get_gradient(forward_op, backward_op, input_vars,
+                                        [grad_var_name(name)], place))
             self.__assert_is_close(numeric_grads, analytic_grads, check_names,
                                    max_relative_error,
                                    "Gradient Check On %s" % str(place))
diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py
index ee0d81a64efcb81bae8b11b856c201a86da274e9..81371b1d11269ef7e168ffda497401e385290431 100644
--- a/python/paddle/v2/framework/tests/test_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_mul_op.py
@@ -16,13 +16,14 @@ class TestMulOp(unittest.TestCase):
         self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}


-class MulGradOpTest(GradientChecker):
+class TestMulGradOp(GradientChecker):
     def test_mul(self):
         op = create_op("mul")
         inputs = {
             'X': np.random.random((32, 84)).astype("float32"),
             'Y': np.random.random((84, 100)).astype("float32")
         }
+        self.compare_grad(op, inputs)
         # mul op will enlarge the relative error
         self.check_grad(
             op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5)
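The rationale added to gradient_checker.py above is the heart of this change: each input's gradient is now checked on its own, with every other input pushed into the no-grad set, so an operator that receives a null gradient output is exercised explicitly. The standalone NumPy sketch below is not part of the patch and every name in it is illustrative; it only verifies the matmul gradient formulas used in MulGradKernel, dX = dOut * Y' and dY = X' * dOut, under the same one-input-at-a-time scheme, skipping any gradient that was not requested just as the C++ kernel skips a null dx or dy.

# A minimal NumPy sketch, independent of the Paddle framework; all names are
# illustrative. It mirrors the per-input strategy of the updated checker.
import numpy as np


def mul_grad(x, y, dout, needs=("X", "Y")):
    # Compute only the requested gradients, the way the C++ kernel skips a
    # null dx or dy.
    grads = {}
    if "X" in needs:
        grads["X"] = np.dot(dout, y.T)  # dX = dOut * Y': M x K
    if "Y" in needs:
        grads["Y"] = np.dot(x.T, dout)  # dY = X' * dOut: K x N
    return grads


def numeric_grad(f, a, delta=1e-3):
    # Central finite differences of sum(f(a)) with respect to each element.
    g = np.zeros_like(a)
    it = np.nditer(a, flags=["multi_index"])
    while not it.finished:
        idx = it.multi_index
        orig = a[idx]
        a[idx] = orig + delta
        plus = f(a).sum()
        a[idx] = orig - delta
        minus = f(a).sum()
        a[idx] = orig
        g[idx] = (plus - minus) / (2.0 * delta)
        it.iternext()
    return g


if __name__ == "__main__":
    np.random.seed(0)
    x = np.random.random((3, 4))
    y = np.random.random((4, 5))
    dout = np.ones((3, 5))  # gradient of sum(Out) with respect to Out

    # Check each input separately, leaving the other one out, as the updated
    # GradientChecker does by growing the no-grad set per input.
    analytic_x = mul_grad(x, y, dout, needs=("X",))["X"]
    analytic_y = mul_grad(x, y, dout, needs=("Y",))["Y"]
    assert np.allclose(analytic_x, numeric_grad(lambda a: np.dot(a, y), x))
    assert np.allclose(analytic_y, numeric_grad(lambda a: np.dot(x, a), y))
    print("mul gradients OK")

Checking one input at a time also catches a kernel that would crash on a missing gradient output, which is exactly the failure mode the if (dx) / if (dy) guards in the C++ code prevent.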
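In the same spirit, the column-wise Eigen reduction in RowwiseAddGradKernel (db is dOut summed over dimension 0) can be sanity-checked with a few lines of NumPy. Again this is only a sketch under the assumption Out = X + b with b broadcast over rows, not Paddle API; the names are illustrative.

# Sketch of the rowwise_add gradients: dX passes dOut through unchanged and
# db reduces dOut over the row dimension, matching out_grad.sum(dims) with
# dims = {0} in the Eigen code above.
import numpy as np

np.random.seed(0)
x = np.random.random((6, 4))
b = np.random.random(4)
dout = np.random.random((6, 4))  # upstream gradient of Out = x + b

dx = dout                # Out depends on x element-wise
db = dout.sum(axis=0)    # column-wise sum, i.e. reduce dimension 0
assert dx.shape == x.shape and db.shape == b.shape

# Finite-difference check of db: perturb one element of b at a time and
# measure the change of the scalar loss sum((x + b) * dout).
delta = 1e-3
for j in range(b.size):
    b_plus = b.copy()
    b_plus[j] += delta
    b_minus = b.copy()
    b_minus[j] -= delta
    numeric = (((x + b_plus) * dout).sum() -
               ((x + b_minus) * dout).sum()) / (2.0 * delta)
    assert abs(numeric - db[j]) < 1e-6
print("rowwise_add gradients OK")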