diff --git a/paddle/fluid/operators/arg_max_op_xpu.cc b/paddle/fluid/operators/arg_max_op_xpu.cc
index 8060b5cf755c0ef4f0bb0c87405c8da809db33c8..71ec26ea5a7927c5fae86b1d954a32628f65ade0 100644
--- a/paddle/fluid/operators/arg_max_op_xpu.cc
+++ b/paddle/fluid/operators/arg_max_op_xpu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
diff --git a/paddle/fluid/operators/arg_min_op_npu.cc b/paddle/fluid/operators/arg_min_op_npu.cc
index f776412c16239f3b391268feeddc68e18ddc324c..cc81e320080b74464344aaf038654184d311a363 100644
--- a/paddle/fluid/operators/arg_min_op_npu.cc
+++ b/paddle/fluid/operators/arg_min_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include "paddle/fluid/operators/arg_min_max_op_base.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc
index e36dd322e0ea1d1f018564473dd9a3f6b7a7734c..f2a57b4b9bdfb178df5c7fd26cba86bf78f970a5 100644
--- a/paddle/fluid/operators/argsort_op_npu.cc
+++ b/paddle/fluid/operators/argsort_op_npu.cc
@@ -1,8 +1,11 @@
 /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,156 +18,142 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
+using Tensor = framework::Tensor;
+using NPUDeviceContext = platform::NPUDeviceContext;
+
+template <typename T>  // NPU "Transpose": *out = in permuted by *perm
+static void TranposeNPU(const framework::ExecutionContext& ctx,
+                        const aclrtStream& stream, std::vector<int64_t>* perm,
+                        const Tensor& in, Tensor* out) {
+  out->mutable_data<T>(ctx.GetPlace());  // out's shape is set by the caller
+  NpuOpRunner runner;
+  runner.SetType("Transpose")
+      .AddInput(in)
+      .AddInput(std::vector<int64_t>(*perm))  // copy: callers reuse *perm
+      .AddOutput(*out)
+      .Run(stream);
+}
+
+static void CastToInt64(const framework::ExecutionContext& ctx,
+                        const aclrtStream& stream, const Tensor& in,
+                        Tensor* out) {  // runs NPU "Cast": in -> int64 *out
+  out->mutable_data<int64_t>(ctx.GetPlace());  // allocate int64 output
+  NpuOpRunner runner;
+  runner.SetType("Cast")
+      .AddInput(in)
+      .AddOutput(*out)
+      .AddAttr("dst_type", ACL_INT64)  // target dtype of the cast
+      .Run(stream);
+}
+
+template <typename T>  // NPU argsort forward kernel built on the "Sort" op
 class ArgsortNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* input = ctx.Input<framework::Tensor>("X");
     auto* output = ctx.Output<framework::Tensor>("Out");
-    output->mutable_data<T>(ctx.GetPlace());
     auto* indices = ctx.Output<framework::Tensor>("Indices");
-    indices->mutable_data<int32_t>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    bool descending = ctx.Attr<bool>("descending");
 
-    int32_t axis = ctx.Attr<int>("axis");
-    auto in_dims = indices->dims();
+    auto in_dims = input->dims();
     axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-    bool descending = ctx.Attr<bool>("descending");
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    framework::NPUAttributeMap sort_attr_input = {
-        {"axis", static_cast<int32_t>(-1)}, {"descending", descending}};
+
+    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
+    framework::NPUAttributeMap attr = {{"axis", -1},  // sort the last dim
+                                       {"descending", descending}};
+
+    Tensor indices_tmp(framework::proto::VarType::INT32);  // int32 scratch
+    indices_tmp.Resize(indices->dims());
 
     if (axis == -1 || axis + 1 == in_dims.size()) {
-      const auto& sort_runner =
-          NpuOpRunner("Sort", {*input}, {*output, *indices}, sort_attr_input);
-      sort_runner.Run(stream);
+      output->mutable_data<T>(ctx.GetPlace());
+      indices_tmp.mutable_data<int32_t>(ctx.GetPlace());
+      const auto& runner =
+          NpuOpRunner("Sort", {*input}, {*output, indices_tmp}, attr);
+      runner.Run(stream);
     } else {
-      // transpose
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.push_back(i);
+      std::vector<int64_t> perm;
+      for (int64_t i = 0; i < in_dims.size(); i++) {
+        perm.emplace_back(i);
       }
-      trans.push_back(axis);
-      framework::DDim trans_dims(in_dims);
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
+      std::swap(perm[axis], perm[in_dims.size() - 1]);  // axis <-> last dim
+
+      std::vector<int64_t> shape;
+      for (size_t i = 0; i < perm.size(); i++) {
+        shape.emplace_back(in_dims[perm[i]]);
       }
-      framework::NPUAttributeMap trans_attr_input = {{"perm", trans}};
-      Tensor trans_input;
-      trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
-      const auto& trans_input_runner =
-          NpuOpRunner("TransposeD", {*input}, {trans_input}, trans_attr_input);
-      trans_input_runner.Run(stream);
-      Tensor trans_indices;
-      trans_indices.mutable_data<int32_t>(trans_dims, ctx.GetPlace());
-      const auto& trans_indice_runner = NpuOpRunner(
-          "TransposeD", {*indices}, {trans_indices}, trans_attr_input);
-      trans_indice_runner.Run(stream);
-      Tensor trans_output;
+      auto trans_dims = framework::make_ddim(shape);
+
+      Tensor trans_input(input->type());
+      trans_input.Resize(trans_dims);
+      TranposeNPU<T>(ctx, stream, &perm, *input, &trans_input);
+
+      Tensor trans_output(input->type());
+      Tensor trans_indices(framework::proto::VarType::INT32);
       trans_output.mutable_data<T>(trans_dims, ctx.GetPlace());
-      const auto& trans_output_runner = NpuOpRunner(
-          "TransposeD", {*output}, {trans_output}, trans_attr_input);
-      trans_output_runner.Run(stream);
-      const auto& sort_runner =
-          NpuOpRunner("Sort", {trans_input}, {trans_output, trans_indices},
-                      sort_attr_input);
-      sort_runner.Run(stream);
-      // transpose back
-      const auto& trans_indices_back_runner = NpuOpRunner(
-          "TransposeD", {trans_indices}, {*indices}, trans_attr_input);
-      trans_indices_back_runner.Run(stream);
-      const auto& trans_output_back_runner = NpuOpRunner(
-          "TransposeD", {trans_output}, {*output}, trans_attr_input);
-      trans_output_back_runner.Run(stream);
+      trans_indices.mutable_data<int32_t>(trans_dims, ctx.GetPlace());
+
+      const auto& runner = NpuOpRunner("Sort", {trans_input},
+                                       {trans_output, trans_indices}, attr);
+      runner.Run(stream);
+
+      TranposeNPU<T>(ctx, stream, &perm, trans_output, output);  // swap back
+      TranposeNPU<int32_t>(ctx, stream, &perm, trans_indices, &indices_tmp);
     }
+    CastToInt64(ctx, stream, indices_tmp, indices);  // int32 -> int64
   }
 };
 
-template <typename Type>
-static void ReshapeNPU(const framework::Tensor* input,
-                       const std::vector<Type>& input_shapes,
-                       framework::Tensor* output) {
-  output->ShareDataWith(*input);
-  output->Resize(framework::make_ddim(std::move(input_shapes)));
-}
-
 template <typename T, typename Type>
 static void FullAssignNPU(const framework::ExecutionContext& ctx,
-                          Type ind_lastdim, Type outer_dim,
-                          const framework::DDim& trans_dims,
-                          const framework::Tensor* input,
-                          const framework::Tensor* indices,
-                          framework::Tensor* t_out) {
-  // reshape input
-  Type input_shape = ind_lastdim * outer_dim;
-  std::vector<Type> input_shapes = {input_shape};
-  Tensor input_reshape_tensor(input->type());
-  ReshapeNPU<Type>(input, input_shapes, &input_reshape_tensor);
-  // reshape index
-  std::vector<Type> index_shapes = {outer_dim, ind_lastdim};
-  framework::DDim ind_2d = framework::make_ddim({outer_dim, ind_lastdim});
-  Tensor ind_2d_tensor(indices->type());
-  ReshapeNPU<Type>(indices, index_shapes, &ind_2d_tensor);
-  // range_flatten_index
-  std::vector<int32_t> range_flatten_index;
-  for (Type i = 0; i < input_shape; i += ind_lastdim) {
-    range_flatten_index.push_back(static_cast<int32_t>(i));
+                          const aclrtStream& stream,
+                          const framework::DDim in_dims, const Tensor& input,
+                          const Tensor& indices, Tensor* t_out) {
+  const int64_t input_height =
+      framework::product(framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
+  const int64_t input_width = in_dims[in_dims.size() - 1];  // last dim
+
+  Tensor input_tmp;  // 1-D view sharing input's buffer
+  input_tmp.ShareDataWith(input);
+  input_tmp.Resize(
+      framework::make_ddim(std::vector<int64_t>{input_height * input_width}));
+
+  Tensor indices_tmp;  // [height, width] view sharing indices' buffer
+  indices_tmp.ShareDataWith(indices);
+  indices_tmp.Resize(
+      framework::make_ddim(std::vector<int64_t>{input_height, input_width}));
+
+  std::vector<int64_t> indexs_value;  // flat offset of each row start
+  for (Type i = 0; i < input_height; i++) {
+    indexs_value.push_back(i * input_width);
   }
-  Tensor range_flatten_index_tensor(framework::proto::VarType::INT32);
-  range_flatten_index_tensor.Resize(framework::make_ddim({outer_dim}));
-  range_flatten_index_tensor.mutable_data<int32_t>(
-      {static_cast<int>(range_flatten_index.size())}, ctx.GetPlace());
-  TensorFromVector(range_flatten_index, ctx.device_context(),
-                   &range_flatten_index_tensor);
-  Tensor range_flatten_index_expand_tensor(range_flatten_index_tensor.type());
-  std::vector<Type> flatten_shape = {outer_dim, 1};
-  ReshapeNPU<Type>(&range_flatten_index_tensor, flatten_shape,
-                   &range_flatten_index_expand_tensor);
-  auto stream =
-      ctx.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
-  Tensor ind_2d_add_tensor;
-  ind_2d_add_tensor.mutable_data<int32_t>(ind_2d, ctx.GetPlace());
-  const auto& runner_ind_2d_tensor = NpuOpRunner(
-      std::string("Add"), {ind_2d_tensor, range_flatten_index_expand_tensor},
-      {ind_2d_add_tensor}, {});
-  runner_ind_2d_tensor.Run(stream);
-  Tensor ind_reshape_tensor(ind_2d_add_tensor.type());
-  ReshapeNPU<Type>(&ind_2d_add_tensor, input_shapes, &ind_reshape_tensor);
-  Tensor ind_reshape_expand_tensor(ind_reshape_tensor.type());
-  std::vector<Type> ind_shape = {input_shape, 1};
-  ReshapeNPU<Type>(&ind_reshape_tensor, ind_shape, &ind_reshape_expand_tensor);
-  // expand_index
-  Tensor input_scatter_tensor;
-  input_scatter_tensor.Resize({input_shape});
-  input_scatter_tensor.mutable_data<T>(ctx.GetPlace());
-  Tensor input_scatter_tensor_ori;
-  input_scatter_tensor_ori.Resize({input_shape});
-  input_scatter_tensor_ori.mutable_data<T>(ctx.GetPlace());
-  std::vector<Type> trans_shapes;
-
-  for (int i = 0; i < trans_dims.size(); i++) {
-    trans_shapes.push_back(trans_dims[i]);
-  }
-  NpuOpRunner runner_scatter;
-  runner_scatter.SetType("TensorScatterUpdate")
-      .AddInput(input_scatter_tensor_ori)
-      .AddInput(ind_reshape_expand_tensor)
-      .AddInput(input_reshape_tensor)
-      .AddOutput(input_scatter_tensor);
-  runner_scatter.Run(stream);
-  framework::TensorCopy(input_scatter_tensor, ctx.GetPlace(),
-                        ctx.template device_context<platform::DeviceContext>(),
-                        t_out);
-  t_out->Resize(framework::make_ddim(trans_shapes));
+  Tensor indexs_tmp(indices.type());
+  framework::TensorFromVector<int64_t>(indexs_value, ctx.device_context(),
+                                       &indexs_tmp);
+  indexs_tmp.Resize(
+      framework::make_ddim(std::vector<int64_t>{input_height, 1}));  // [H, 1]
+
+  Tensor indices_index(indices.type());  // indices + row offset
+  indices_index.mutable_data<int64_t>(indices_tmp.dims(), ctx.GetPlace());
+  const auto& runner_add =
+      NpuOpRunner("Add", {indices_tmp, indexs_tmp}, {indices_index}, {});
+  runner_add.Run(stream);
+
+  indices_index.Resize(framework::make_ddim(
+      std::vector<int64_t>{input_height * input_width, 1}));  // [N, 1]
+
+  t_out->mutable_data<T>(ctx.GetPlace());
+  Tensor out_tmp(t_out->type());
+  out_tmp.ShareDataWith(*t_out);
+
+  const auto& runner =
+      NpuOpRunner("TensorScatterUpdate", {input_tmp, indices_index, input_tmp},
+                  {out_tmp}, {});
+  runner.Run(stream);  // scatter: flat slot indices_index[i] <- input[i]
 }
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ArgsortGradNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -172,75 +161,42 @@ class ArgsortGradNPUKernel : public framework::OpKernel<T> {
     auto* dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
     int axis = ctx.Attr<int>("axis");
+
     auto in_dims = indices->dims();
     axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-    auto place = ctx.GetPlace();
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    dX->mutable_data<T>(ctx.GetPlace());
-    Tensor dxt;
-    dxt.mutable_data<T>(dX->dims(), place);
-    const auto& runner_flatten =
-        NpuOpRunner(std::string("Flatten"), {*dX}, {dxt}, {});
-    runner_flatten.Run(stream);
-    FillNpuTensorWithConstant<T>(&dxt, static_cast<T>(0));
     if (dO->numel() == 0) return;
-    // Do full assig  n
-    if (axis == -1 || axis + 1 == in_dims.size()) {
-      const int64_t outer_dim = framework::product(
-          framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t ind_lastdim = in_dims[in_dims.size() - 1];
-      FullAssignNPU<T, int64_t>(ctx, ind_lastdim, outer_dim, in_dims, dO,
-                                indices, dX);
 
+    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
+
+    if (axis == -1 || axis + 1 == in_dims.size()) {
+      FullAssignNPU<T, int64_t>(ctx, stream, in_dims, *dO, *indices, dX);
     } else {
-      // If not full assign do transpose
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.push_back(i);
+      std::vector<int64_t> perm;
+      for (int64_t i = 0; i < in_dims.size(); i++) {
+        perm.emplace_back(i);
       }
-      trans.push_back(axis);
-      framework::DDim trans_dims(in_dims);
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-      }
-      std::vector<int> axis;
-      for (size_t i = 0; i < trans.size(); i++) {
-        axis.push_back(in_dims[trans[i]]);
+      std::swap(perm[axis], perm[in_dims.size() - 1]);
+
+      std::vector<int64_t> shape;
+      for (size_t i = 0; i < perm.size(); i++) {
+        shape.emplace_back(in_dims[perm[i]]);
       }
-      framework::NPUAttributeMap attr_input = {{"perm", trans}};
-      Tensor trans_dO;
-      trans_dO.mutable_data<T>(trans_dims, ctx.GetPlace());
-      Tensor trans_ind;
-      trans_ind.mutable_data<int32_t>(trans_dims, ctx.GetPlace());
-      // Do transpose
-      const auto& runner_transpose_dx = NpuOpRunner(
-          std::string("TransposeD"), {*dO}, {trans_dO}, {attr_input});
-      runner_transpose_dx.Run(stream);
-      const auto& runner_transpose_ind = NpuOpRunner(
-          std::string("TransposeD"), {*indices}, {trans_ind}, {attr_input});
-      runner_transpose_ind.Run(stream);
-
-      const int64_t outer_dim = framework::product(
-          framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t ind_lastdim = trans_dims[trans_dims.size() - 1];
-
-      Tensor tmp_out;
-      tmp_out.mutable_data<T>(trans_dims, ctx.GetPlace());
-
-      FullAssignNPU<T, int64_t>(ctx, ind_lastdim, outer_dim, trans_dims,
-                                &trans_dO, &trans_ind, &tmp_out);
-
-      // transpose back
-      const auto& runner_transpose_out = NpuOpRunner(
-          std::string("TransposeD"), {tmp_out}, {*dX}, {attr_input});
-      runner_transpose_out.Run(stream);
+      auto trans_dims = framework::make_ddim(shape);
+
+      Tensor trans_dout(dO->type());
+      Tensor trans_ids(indices->type());
+      trans_dout.Resize(trans_dims);
+      trans_ids.Resize(trans_dims);
+
+      TranposeNPU<T>(ctx, stream, &perm, *dO, &trans_dout);
+      TranposeNPU<int64_t>(ctx, stream, &perm, *indices, &trans_ids);
+
+      Tensor trans_dx(dO->type());
+      trans_dx.Resize(trans_dims);
+      FullAssignNPU<T, int64_t>(ctx, stream, trans_dims, trans_dout, trans_ids,
+                                &trans_dx);
+
+      TranposeNPU<T>(ctx, stream, &perm, trans_dx, dX);
     }
   }
 };
@@ -251,11 +207,8 @@ class ArgsortGradNPUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_NPU_KERNEL(
-    argsort, ops::ArgsortNPUKernel<plat::NPUDeviceContext, float>,
-    ops::ArgsortNPUKernel<plat::NPUDeviceContext, plat::float16>);
+REGISTER_OP_NPU_KERNEL(argsort, ops::ArgsortNPUKernel<float>,
+                       ops::ArgsortNPUKernel<plat::float16>);
 
-REGISTER_OP_NPU_KERNEL(argsort_grad,
-                       ops::ArgsortGradNPUKernel<plat::NPUDeviceContext, float>,
-                       ops::ArgsortGradNPUKernel<plat::NPUDeviceContext,
-                                                 paddle::platform::float16>);
+REGISTER_OP_NPU_KERNEL(argsort_grad, ops::ArgsortGradNPUKernel<float>,
+                       ops::ArgsortGradNPUKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc
index 486e85b0f0dfca43ea6de6e17d97ba3a2a404196..0c0eb1577e8029cf5c464f8bc4c94b25f07f8834 100644
--- a/paddle/fluid/operators/cumsum_op_npu.cc
+++ b/paddle/fluid/operators/cumsum_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/cum_op.h"
diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc
index b5c8bfff0dc39f0d53308c702addf2fcf83bf796..50d247d9c059068abb30f9f3c62e94cad61a3c0d 100644
--- a/paddle/fluid/operators/dropout_op_npu.cc
+++ b/paddle/fluid/operators/dropout_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include <memory>
 #include <string>
diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc
index 85fe86a9e606f33216513c2f4ce541e0b86b8eff..4b0e0770573a6f7091f1a0db7534e923eeb61d99 100644
--- a/paddle/fluid/operators/expand_v2_op_npu.cc
+++ b/paddle/fluid/operators/expand_v2_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include "paddle/fluid/operators/expand_v2_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc b/paddle/fluid/operators/huber_loss_op_npu.cc
index a94261559415440752708931f3edf230a758df85..33cbaec4dfc462cf4d0a2c38da1597bba51687e7 100644
--- a/paddle/fluid/operators/huber_loss_op_npu.cc
+++ b/paddle/fluid/operators/huber_loss_op_npu.cc
@@ -1,13 +1,16 @@
 /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include "paddle/fluid/operators/huber_loss_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc
index d893fbd01962891c4f39f9f3eeb964c734d331a7..b30c7ac810c0112072483acaa4b9cff783bf72b7 100644
--- a/paddle/fluid/operators/interpolate_v2_op_npu.cc
+++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include "paddle/fluid/operators/interpolate_v2_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
diff --git a/paddle/fluid/operators/is_empty_op_npu.cc b/paddle/fluid/operators/is_empty_op_npu.cc
index 9155afecd021b73d8168ad221bc64cf556255218..01579abd74d234d1511040ef237dcdd7158f1ec1 100644
--- a/paddle/fluid/operators/is_empty_op_npu.cc
+++ b/paddle/fluid/operators/is_empty_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include "paddle/fluid/operators/is_empty_op.h"
 
diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc
index a8d906d4b5cad8cb798d43ee329dff63ff0a3b77..74b44165dcc4c139f6a1b33966a7da002d4c63fe 100644
--- a/paddle/fluid/operators/log_loss_op_npu.cc
+++ b/paddle/fluid/operators/log_loss_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include "paddle/fluid/operators/log_loss_op.h"
 #include <cmath>
diff --git a/paddle/fluid/operators/meshgrid_op_npu.cc b/paddle/fluid/operators/meshgrid_op_npu.cc
index 9605fa092f0697a2240215dccd913d34de9039b3..f22e2e178ef85144c43a8927ce33304a5e1907c6 100644
--- a/paddle/fluid/operators/meshgrid_op_npu.cc
+++ b/paddle/fluid/operators/meshgrid_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include "paddle/fluid/operators/meshgrid_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
diff --git a/paddle/fluid/operators/pad3d_op_npu.cc b/paddle/fluid/operators/pad3d_op_npu.cc
index 3a1fba945500323970502f2b01f9f013f05aba42..483c895e0e65a8fcac2d5cc8dcdd85f47b6b00c4 100644
--- a/paddle/fluid/operators/pad3d_op_npu.cc
+++ b/paddle/fluid/operators/pad3d_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
index b343fc88d7b8d38f08a1cd494bf349c1ec3f047b..5efc7e9b869b7d921b0b510c8da843b510102d1a 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include "paddle/fluid/operators/npu_op_runner.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc
index 834b63f199e37dcfe06560da37506b8978ca2249..b5f571c7fea2ca6cfca235adfc5d58cf3e767e2d 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc
index 6f3b40dbbf39424aea8ffc2321f91552b34cf210..400a09330a3483cebf7b389cbe5c8a6b2626eb23 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc
index 52351a98bce37d47e7c3417d64c1bcb926ebeb03..a9092d7e2abbcef6ce21a890bef3a61ec581514e 100644
--- a/paddle/fluid/operators/slice_op_npu.cc
+++ b/paddle/fluid/operators/slice_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include "paddle/fluid/operators/slice_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc
index cdabc28255b518913e973ad53f5f2228d9dd6ebf..6e7e03911370fd2b816722348724e986dbeb23d0 100644
--- a/paddle/fluid/operators/tril_triu_op_npu.cc
+++ b/paddle/fluid/operators/tril_triu_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 
 #include "paddle/fluid/operators/tril_triu_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
diff --git a/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py
index 824266578b9e571ace99db01b8ecc95827e1afe3..2589b2a316a16ee02c16c07f6eb0cb4a9dd015f0 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py
@@ -18,7 +18,7 @@ import numpy as np
 import unittest
 import sys
 sys.path.append("..")
-from op_test import OpTest, _set_use_system_allocator
+from op_test import OpTest
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -63,9 +63,6 @@ class TestArgsortOp(OpTest):
         self.__class__.use_npu = True
         self.__class__.no_need_check_grad = True
 
-    def init_kernel_type(self):
-        self.use_mkldnn = False
-
     def init_inputshape(self):
         self.input_shape = (2, 2, 2, 3, 3)
 
@@ -158,7 +155,8 @@ class TestArgsortOpAxis0NPUFP32(TestArgsortOp):
         self.__class__.use_npu = True
 
     def test_check_grad(self):
-        self.check_grad_with_place(self.place, ["X"], "Out")
+        self.check_grad_with_place(
+            self.place, ["X"], "Out", max_relative_error=0.03)
 
 
 class TestArgsortOpAxis1NPUFP32(TestArgsortOpAxis0NPUFP32):