diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
index 1e7e5e02c0181f8828a59b9403ac24f40347f8b6..5b8d08a8943ddeed29731a7b6660619f9a7d4ef3 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
@@ -12,17 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_ASCEND_CL
 #include <memory>
 #include <string>
 
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
 
 namespace paddle {
 namespace operators {
+using Tensor = framework::Tensor;
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -39,12 +40,127 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename T>
+class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+
+    // NOTE(zhiqiu): It seems Ascend Sub follows the broadcast semantics with
+    // default axis=-1?
+    // So, the sub_grad should do reduce if needed.
+    // For example, the shape of each variable in elementwise_sub:
+    // x, dx: [2, 3, 5]
+    // y, dy: [1, 5]
+    // out, dout: [2, 3, 5]
+    // Then, out = x - y => dx = dout, dy = -dout
+    // And the shape of dy can be computed by a two-stage reduce:
+    // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
+    // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.
+
+    if (dx) {
+      dx->mutable_data<T>(ctx.GetPlace());
+      // For dx
+      // stage 1
+      auto reduce_ndim = dout->dims().size() - dx->dims().size();
+      std::vector<int> axes;
+      for (auto i = 0; i < reduce_ndim; ++i) {
+        axes.push_back(i);
+      }
+      Tensor* tmp_dout = const_cast<Tensor*>(dout);
+      Tensor reduced_dout(dx->type());
+      if (axes.size() != 0) {
+        std::vector<int64_t> reduced_dout_dims;
+        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
+          reduced_dout_dims.push_back(dout->dims()[i]);
+        }
+        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
+        reduced_dout.mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
+                                  {{"axes", axes}, {"keep_dims", false}});
+        runner.Run(stream);
+        tmp_dout = &reduced_dout;
+      }
+
+      // stage 2
+      axes.clear();
+      for (auto i = 0; i < dx->dims().size(); ++i) {
+        if (dx->dims()[i] == 1) {
+          axes.push_back(i);
+        }
+      }
+      if (axes.size() != 0) {
+        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
+                                  {{"axes", axes}, {"keep_dims", true}});
+        runner.Run(stream);
+      } else {
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .Wait();
+        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
+      }
+    }
+
+    if (dy) {
+      // For dy
+      // stage 1
+      auto reduce_ndim = dout->dims().size() - dy->dims().size();
+      std::vector<int> axes;
+      for (auto i = 0; i < reduce_ndim; ++i) {
+        axes.push_back(i);
+      }
+      Tensor* tmp_dout = const_cast<Tensor*>(dout);
+      Tensor reduced_dout(dout->type());
+      if (axes.size() != 0) {
+        std::vector<int64_t> reduced_dout_dims;
+        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
+          reduced_dout_dims.push_back(dout->dims()[i]);
+        }
+        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
+        reduced_dout.mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
+                                  {{"axes", axes}, {"keep_dims", false}});
+        runner.Run(stream);
+        tmp_dout = &reduced_dout;
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .Wait();
+      }
+
+      // stage 2
+      axes.clear();
+      for (auto i = 0; i < dy->dims().size(); ++i) {
+        if (dy->dims()[i] == 1) {
+          axes.push_back(i);
+        }
+      }
+      if (axes.size() != 0) {
+        dy->mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy},
+                                  {{"axes", axes}, {"keep_dims", true}});
+        runner.Run(stream);
+      } else {
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .Wait();
+        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dy);
+      }
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(elementwise_add, ops::ElementwiseAddNPUKernel<float>,
+                       ops::ElementwiseAddNPUKernel<plat::float16>);
 
-REGISTER_OP_NPU_KERNEL(
-    elementwise_add,
-    ops::ElementwiseAddNPUKernel<paddle::platform::NPUDeviceContext, float>);
-#endif
+REGISTER_OP_NPU_KERNEL(elementwise_add_grad,
+                       ops::ElementwiseAddGradNPUKernel<float>,
+                       ops::ElementwiseAddGradNPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
index 0cb8fd1c5781f4a154782e744ab4b0ccd0e92d9a..df6fae6c8484a016a3589339a3a7820d20d7dcca 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
@@ -74,6 +74,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
                                     {{"Out", {"Out"}}}, attrs);
 
   op->Run(*scope, place);
+  ctx.Wait();
 
   std::vector<T> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
@@ -125,57 +126,64 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
 
   // run
   f::AttributeMap attrs;
-  auto op = f::OpRegistry::CreateOp(op_type,
{"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}}, - {{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs); + auto op = f::OpRegistry::CreateOp( + op_type, {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}}, + {{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs); auto place = ctx.GetPlace(); - op->Run(*scope, place); - - std::vector dx_vec; - TensorToVector(*tensor_dx, ctx, &dx_vec); - - std::vector dy_vec; - TensorToVector(*tensor_dy, ctx, &dy_vec); - - ctx.Wait(); - float expected_x, expected_y; - if (op_type == "elementwise_add_grad") { - expected_x = 1.0; - expected_y = 6.0; - } else if (op_type == "elementwise_sub_grad") { - expected_x = 1.0; - expected_y = -6.0; - } - - for (uint32_t i = 0; i < dx_vec.size(); i++) { - EXPECT_EQ(dx_vec[i], static_cast(expected_x)); - } - for (uint32_t i = 0; i < dy_vec.size(); i++) { - EXPECT_EQ(dy_vec[i], static_cast(expected_y)); - } + op->Run(*scope, place); + ctx.Wait(); + + std::vector dx_vec; + TensorToVector(*tensor_dx, ctx, &dx_vec); + + std::vector dy_vec; + TensorToVector(*tensor_dy, ctx, &dy_vec); + + ctx.Wait(); + float expected_x, expected_y; + if (op_type == "elementwise_add_grad") { + expected_x = 1.0; + expected_y = 6.0; + } else if (op_type == "elementwise_sub_grad") { + expected_x = 1.0; + expected_y = -6.0; + } + + for (uint32_t i = 0; i < dx_vec.size(); i++) { + EXPECT_EQ(dx_vec[i], static_cast(expected_x)); + } + for (uint32_t i = 0; i < dy_vec.size(); i++) { + EXPECT_EQ(dy_vec[i], static_cast(expected_y)); + } } TEST(elementwise_add, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_add"); + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "elementwise_add"); } TEST(elementwise_sub, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_sub"); + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "elementwise_sub"); } TEST(elementwise_sub, NPU_fp16) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_sub"); + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "elementwise_sub"); } TEST(elementwise_sub_grad, NPU) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx, "elementwise_sub_grad"); + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + CompareGrad(&scope, ctx, "elementwise_sub_grad"); +} + +TEST(elementwise_add_grad, NPU) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + CompareGrad(&scope, ctx, "elementwise_add_grad"); } diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index c3cf76451f62fe23dc88b77f4385f928a4910dbb..809445c2862035c182e827840d6e8440f80d47c4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
 limitations under the License. */
 
-#ifdef PADDLE_WITH_ASCEND_CL
 #include <memory>
 #include <string>
 
@@ -24,7 +23,7 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -43,7 +42,7 @@ class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -51,8 +50,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
 
-    dx->mutable_data<T>(ctx.GetPlace());
-    dy->mutable_data<T>(ctx.GetPlace());
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
 
     // NOTE(zhiqiu): It seems Ascend Sub follows the broadcast semantics with
     // default axis=-1?
@@ -66,89 +66,92 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
     // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
     // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.
 
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    // For dx
-    // stage 1
-    auto reduce_ndim = dout->dims().size() - dx->dims().size();
-    std::vector<int> axes;
-    for (auto i = 0; i < reduce_ndim; ++i) {
-      axes.push_back(i);
-    }
-    Tensor* tmp_dout = const_cast<Tensor*>(dout);
-    Tensor reduced_dout(dx->type());
-    if (axes.size() != 0) {
-      std::vector<int64_t> reduced_dout_dims;
-      for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
-        reduced_dout_dims.push_back(dout->dims()[i]);
-      }
-      reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
-      reduced_dout.mutable_data<T>(ctx.GetPlace());
-      auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
-                                {{"axes", axes}, {"keep_dims", false}});
-      runner.Run(stream);
-      tmp_dout = &reduced_dout;
-    }
-
-    // stage 2
-    axes.clear();
-    for (auto i = 0; i < dx->dims().size(); ++i) {
-      if (dx->dims()[i] == 1) {
+    if (dx) {
+      dx->mutable_data<T>(ctx.GetPlace());
+      // For dx
+      // stage 1
+      auto reduce_ndim = dout->dims().size() - dx->dims().size();
+      std::vector<int> axes;
+      for (auto i = 0; i < reduce_ndim; ++i) {
         axes.push_back(i);
       }
-    }
-    if (axes.size() != 0) {
-      auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
-                                {{"axes", axes}, {"keep_dims", true}});
-      runner.Run(stream);
-    } else {
-      framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
-    }
-
-    // For dy
-    // stage 1
-    reduce_ndim = dout->dims().size() - dy->dims().size();
-    axes.clear();
-    for (auto i = 0; i < reduce_ndim; ++i) {
-      axes.push_back(i);
-    }
-    tmp_dout = const_cast<Tensor*>(dout);
-    Tensor reduced_dy(dy->type());
+      Tensor* tmp_dout = const_cast<Tensor*>(dout);
+      Tensor reduced_dout(dx->type());
+      if (axes.size() != 0) {
+        std::vector<int64_t> reduced_dout_dims;
+        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
+          reduced_dout_dims.push_back(dout->dims()[i]);
+        }
+        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
+        reduced_dout.mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
+                                  {{"axes", axes}, {"keep_dims", false}});
+        runner.Run(stream);
+        tmp_dout = &reduced_dout;
+      }
 
-    if (axes.size() != 0) {
-      std::vector<int64_t> reduced_dout_dims;
-      for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
-        reduced_dout_dims.push_back(dout->dims()[i]);
+      // stage 2
+      axes.clear();
+      for (auto i = 0; i < dx->dims().size(); ++i) {
+        if (dx->dims()[i] == 1) {
+          axes.push_back(i);
+        }
+      }
+      if (axes.size() != 0) {
+        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
+                                  {{"axes", axes}, {"keep_dims", true}});
+        runner.Run(stream);
+      } else {
+        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
       }
-      reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
-      reduced_dout.mutable_data<T>(ctx.GetPlace());
-      auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
-                                {{"axes", axes}, {"keep_dims", false}});
-      runner.Run(stream);
-      tmp_dout = &reduced_dout;
     }
-
-    // stage 2
-    axes.clear();
-    Tensor* tmp_dy = tmp_dout;
-    for (auto i = 0; i < dy->dims().size(); ++i) {
-      if (dy->dims()[i] == 1) {
+    if (dy) {
+      dy->mutable_data<T>(ctx.GetPlace());
+      // For dy
+      // stage 1
+      auto reduce_ndim = dout->dims().size() - dy->dims().size();
+      std::vector<int> axes;
+      for (auto i = 0; i < reduce_ndim; ++i) {
         axes.push_back(i);
       }
-    }
-    if (axes.size() != 0) {
-      reduced_dy.Resize(dy->dims());
-      reduced_dy.mutable_data<T>(ctx.GetPlace());
-      auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy},
-                                {{"axes", axes}, {"keep_dims", true}});
+      Tensor* tmp_dout = const_cast<Tensor*>(dout);
+      Tensor reduced_dy(dy->type());
+      Tensor reduced_dout(dy->type());
+
+      if (axes.size() != 0) {
+        std::vector<int64_t> reduced_dout_dims;
+        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
+          reduced_dout_dims.push_back(dout->dims()[i]);
+        }
+        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
+        reduced_dout.mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
+                                  {{"axes", axes}, {"keep_dims", false}});
+        runner.Run(stream);
+        tmp_dout = &reduced_dout;
+      }
+
+      // stage 2
+      axes.clear();
+      Tensor* tmp_dy = tmp_dout;
+      for (auto i = 0; i < dy->dims().size(); ++i) {
+        if (dy->dims()[i] == 1) {
+          axes.push_back(i);
+        }
+      }
+      if (axes.size() != 0) {
+        reduced_dy.Resize(dy->dims());
+        reduced_dy.mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy},
+                                  {{"axes", axes}, {"keep_dims", true}});
+        runner.Run(stream);
+        tmp_dy = &reduced_dy;
+      }
+
+      // stage 3, negative
+      auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
       runner.Run(stream);
-      tmp_dy = &reduced_dy;
     }
-
-    // stage 3, negative
-    auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
-    runner.Run(stream);
   }
 };
 
@@ -156,16 +159,11 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel<float>,
+                       ops::ElementwiseSubNPUKernel<plat::float16>);
 
-REGISTER_OP_NPU_KERNEL(
-    elementwise_sub,
-    ops::ElementwiseSubNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::ElementwiseSubNPUKernel<paddle::platform::NPUDeviceContext,
-                                 paddle::platform::float16>);
-
-REGISTER_OP_NPU_KERNEL(
-    elementwise_sub_grad,
-    ops::ElementwiseSubGradNPUKernel<paddle::platform::NPUDeviceContext,
-                                     float>,
-    ops::ElementwiseSubGradNPUKernel<paddle::platform::NPUDeviceContext,
-                                     paddle::platform::float16>);
-#endif
+REGISTER_OP_NPU_KERNEL(elementwise_sub_grad,
+                       ops::ElementwiseSubGradNPUKernel<float>,
+                       ops::ElementwiseSubGradNPUKernel<plat::float16>);
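
For reference, the following is a minimal standalone sketch (plain C++ on the host, not the Paddle or Ascend NPU API) of the two-stage reduction described in the NOTE(zhiqiu) comment above. It uses the same shapes as that example (dout: [2, 3, 5], dy: [1, 5]); the variable names D0/D1/D2 and the loops are illustrative only. Each element of dy comes out as 6, matching expected_y in the elementwise_add_grad test; for elementwise_sub_grad the kernel additionally negates this result (the "Neg" stage 3).

// Sketch only: emulates ReduceSumD stage 1 and stage 2 on the CPU.
#include <cstdio>
#include <vector>

int main() {
  // dout has shape [2, 3, 5], filled with 1.0f, stored row-major.
  const int D0 = 2, D1 = 3, D2 = 5;
  std::vector<float> dout(D0 * D1 * D2, 1.0f);

  // Stage 1: reduce the leading axes that dy does not have
  // ([2, 3, 5] -> [3, 5], i.e. sum over axis 0, keep_dims = false).
  std::vector<float> stage1(D1 * D2, 0.0f);
  for (int i = 0; i < D0; ++i)
    for (int j = 0; j < D1; ++j)
      for (int k = 0; k < D2; ++k)
        stage1[j * D2 + k] += dout[(i * D1 + j) * D2 + k];

  // Stage 2: reduce the axes where dy's extent is 1
  // ([3, 5] -> [1, 5], i.e. sum over axis 0, keep_dims = true).
  std::vector<float> dy(D2, 0.0f);
  for (int j = 0; j < D1; ++j)
    for (int k = 0; k < D2; ++k)
      dy[k] += stage1[j * D2 + k];

  // For elementwise_add, dy is exactly this reduced dout; every entry is 2 * 3 = 6.
  for (int k = 0; k < D2; ++k) printf("%.1f ", dy[k]);
  printf("\n");
  return 0;
}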