diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 43eb4de2c1d4ff61f295c05af9dfd5f26c2ba365..0fa1fca2bcd3117e1e9a6a54c343b2d0d8c3822b 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -103,12 +103,15 @@ set(DEPS_OPS
     recurrent_op
     cond_op
     cross_entropy_op
-    softmax_with_cross_entropy_op)
+    softmax_with_cross_entropy_op
+    sum_op)
+
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
   DEPS framework_proto tensor net_op)
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
+op_library(sum_op DEPS net_op)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index c54843faa698654dafac786979045bebf0ebc95d..7c422b477083fba4661aeb427422abb623b172bb 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #include "paddle/operators/sum_op.h"
 #include <vector>
+#include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
@@ -57,21 +58,23 @@ or not. But the output only shares the LoD with the first input.
   }
 };
 
-class SumGradOp : public framework::OperatorWithKernel {
+class SumGradOp : public NetOp {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
+  SumGradOp(const std::string& type, const framework::VariableNameMap& inputs,
+            const framework::VariableNameMap& outputs,
+            const framework::AttributeMap& attrs)
+      : NetOp(type, inputs, outputs, attrs) {
+    auto& x_grad_names = Outputs(framework::GradVarName("X"));
+    auto out_grad_name = this->Input(framework::GradVarName("Out"));
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    auto x_grad_names = ctx->Outputs(framework::GradVarName("X"));
-    size_t x_length = x_grad_names.size();
-    std::vector<framework::DDim> x_grad_dims;
-    x_grad_dims.reserve(x_length);
-    for (size_t i = 0; i < x_length; ++i) {
-      x_grad_dims.push_back(out_grad_dims);
+    framework::AttributeMap grad_attrs;
+    grad_attrs["scale"] = 1.0f;
+    for (auto& x_grad_name : x_grad_names) {
+      AppendOp(framework::OpRegistry::CreateOp(
+          "scale", {{"X", {out_grad_name}}}, {{"Out", {x_grad_name}}},
+          grad_attrs));
     }
-    ctx->SetOutputsDim(framework::GradVarName("X"), x_grad_dims);
+    CompleteAddOp(false);
   }
 };
 
@@ -81,5 +84,3 @@ class SumGradOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(sum, ops::SumOp, ops::SumOpMaker, sum_grad, ops::SumGradOp);
 REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(sum_grad,
-                       ops::SumGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sum_op.cu b/paddle/operators/sum_op.cu
index a465cf3659ba7c51338abadfc62962fb6755a39d..7129e6bf62d5608037c683add68412166ac86c81 100644
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
@@ -13,6 +13,4 @@ limitations under the License. */
 #include "paddle/operators/sum_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(sum_grad,
-                       ops::SumGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h
index 7e8fbb9e41c694df9169ea583ce47c33d3bcf2bb..91e5da8b40d452db8715990cdbe2731b3aea44b9 100644
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -42,24 +42,5 @@ class SumKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
-class SumGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto outs = context.MultiOutput<Tensor>(framework::GradVarName("X"));
-    for (auto out : outs) {
-      out->mutable_data<T>(context.GetPlace());
-    }
-
-    auto place = context.GetEigenDevice<Place>();
-    auto in = EigenVector<T>::Flatten(*input);
-    for (auto out : outs) {
-      auto result = EigenVector<T>::Flatten(*out);
-      result.device(place) = in;
-    }
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle