Unverified commit 3523bbe8, authored by Qi Li, committed by GitHub

[NPU] fix argsort op, test=develop (#36576)

* [NPU] fix argsort op, test=develop

* remove debug files, test=develop

* fix typo, test=develop

* address review comments, test=develop
Parent 43dcf235
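
For context, Paddle's argsort produces both the sorted values (Out) and the positions those values came from (Indices). A minimal illustration via the public Python API (illustrative only, not part of this diff):

    import paddle

    x = paddle.to_tensor([[3.0, 1.0, 2.0],
                          [9.0, 7.0, 8.0]])
    # Out: sorted values; Indices: original positions along `axis`.
    out = paddle.sort(x, axis=-1, descending=False)     # [[1., 2., 3.], [7., 8., 9.]]
    ids = paddle.argsort(x, axis=-1, descending=False)  # [[1, 2, 0], [1, 2, 0]]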
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #ifdef PADDLE_WITH_XPU
...
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/operators/arg_min_max_op_base.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
...
 /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,156 +18,142 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename DeviceContext, typename T>
+using Tensor = framework::Tensor;
+using NPUDeviceContext = platform::NPUDeviceContext;
+
+template <typename T>
+static void TranposeNPU(const framework::ExecutionContext& ctx,
+                        const aclrtStream& stream, std::vector<int64_t>* perm,
+                        const Tensor& in, Tensor* out) {
+  out->mutable_data<T>(ctx.GetPlace());
+  NpuOpRunner runner;
+  runner.SetType("Transpose")
+      .AddInput(in)
+      .AddInput(std::move(*perm))
+      .AddOutput(*out)
+      .Run(stream);
+}
+
+static void CastToInt64(const framework::ExecutionContext& ctx,
+                        const aclrtStream& stream, const Tensor& in,
+                        Tensor* out) {
+  out->mutable_data<int64_t>(ctx.GetPlace());
+  NpuOpRunner runner;
+  runner.SetType("Cast")
+      .AddInput(in)
+      .AddOutput(*out)
+      .AddAttr("dst_type", ACL_INT64)
+      .Run(stream);
+}
+
+template <typename T>
 class ArgsortNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* input = ctx.Input<framework::Tensor>("X");
     auto* output = ctx.Output<framework::Tensor>("Out");
-    output->mutable_data<T>(ctx.GetPlace());
     auto* indices = ctx.Output<framework::Tensor>("Indices");
-    indices->mutable_data<int32_t>(ctx.GetPlace());
-
-    int32_t axis = ctx.Attr<int>("axis");
-    auto in_dims = indices->dims();
+    int axis = ctx.Attr<int>("axis");
+    bool descending = ctx.Attr<bool>("descending");
+    auto in_dims = input->dims();
     axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-    bool descending = ctx.Attr<bool>("descending");
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    framework::NPUAttributeMap sort_attr_input = {
-        {"axis", static_cast<int32_t>(-1)}, {"descending", descending}};
+
+    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
+    framework::NPUAttributeMap attr = {{"axis", -1},
+                                       {"descending", descending}};
+
+    Tensor indices_tmp(framework::proto::VarType::INT32);
+    indices_tmp.Resize(indices->dims());

     if (axis == -1 || axis + 1 == in_dims.size()) {
-      const auto& sort_runner =
-          NpuOpRunner("Sort", {*input}, {*output, *indices}, sort_attr_input);
-      sort_runner.Run(stream);
+      output->mutable_data<T>(ctx.GetPlace());
+      indices_tmp.mutable_data<int32_t>(ctx.GetPlace());
+      const auto& runner =
+          NpuOpRunner("Sort", {*input}, {*output, indices_tmp}, attr);
+      runner.Run(stream);
     } else {
-      // transpose
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(axis);
-      framework::DDim trans_dims(in_dims);
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-      }
-      framework::NPUAttributeMap trans_attr_input = {{"perm", trans}};
-      Tensor trans_input;
-      trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
-      const auto& trans_input_runner =
-          NpuOpRunner("TransposeD", {*input}, {trans_input}, trans_attr_input);
-      trans_input_runner.Run(stream);
-      Tensor trans_indices;
-      trans_indices.mutable_data<int32_t>(trans_dims, ctx.GetPlace());
-      const auto& trans_indice_runner = NpuOpRunner(
-          "TransposeD", {*indices}, {trans_indices}, trans_attr_input);
-      trans_indice_runner.Run(stream);
-      Tensor trans_output;
-      trans_output.mutable_data<T>(trans_dims, ctx.GetPlace());
-      const auto& trans_output_runner = NpuOpRunner(
-          "TransposeD", {*output}, {trans_output}, trans_attr_input);
-      trans_output_runner.Run(stream);
-      const auto& sort_runner =
-          NpuOpRunner("Sort", {trans_input}, {trans_output, trans_indices},
-                      sort_attr_input);
-      sort_runner.Run(stream);
-      // transpose back
-      const auto& trans_indices_back_runner = NpuOpRunner(
-          "TransposeD", {trans_indices}, {*indices}, trans_attr_input);
-      trans_indices_back_runner.Run(stream);
-      const auto& trans_output_back_runner = NpuOpRunner(
-          "TransposeD", {trans_output}, {*output}, trans_attr_input);
-      trans_output_back_runner.Run(stream);
+      std::vector<int64_t> perm;
+      for (int64_t i = 0; i < in_dims.size(); i++) {
+        perm.emplace_back(i);
+      }
+      std::swap(perm[axis], perm[in_dims.size() - 1]);
+
+      std::vector<int64_t> shape;
+      for (size_t i = 0; i < perm.size(); i++) {
+        shape.emplace_back(in_dims[perm[i]]);
+      }
+      auto trans_dims = framework::make_ddim(shape);
+
+      Tensor trans_input(input->type());
+      trans_input.Resize(trans_dims);
+      TranposeNPU<T>(ctx, stream, &perm, *input, &trans_input);
+
+      Tensor trans_output(input->type());
+      Tensor trans_indices(framework::proto::VarType::INT32);
+      trans_output.mutable_data<T>(trans_dims, ctx.GetPlace());
+      trans_indices.mutable_data<int32_t>(trans_dims, ctx.GetPlace());
+
+      const auto& runner = NpuOpRunner("Sort", {trans_input},
+                                       {trans_output, trans_indices}, attr);
+      runner.Run(stream);
+
+      TranposeNPU<T>(ctx, stream, &perm, trans_output, output);
+      TranposeNPU<int32_t>(ctx, stream, &perm, trans_indices, &indices_tmp);
     }
+
+    CastToInt64(ctx, stream, indices_tmp, indices);
   }
 };
-template <typename Type>
-static void ReshapeNPU(const framework::Tensor* input,
-                       const std::vector<Type>& input_shapes,
-                       framework::Tensor* output) {
-  output->ShareDataWith(*input);
-  output->Resize(framework::make_ddim(std::move(input_shapes)));
-}
-
 template <typename T, typename Type>
 static void FullAssignNPU(const framework::ExecutionContext& ctx,
-                          Type ind_lastdim, Type outer_dim,
-                          const framework::DDim& trans_dims,
-                          const framework::Tensor* input,
-                          const framework::Tensor* indices,
-                          framework::Tensor* t_out) {
-  // reshape input
-  Type input_shape = ind_lastdim * outer_dim;
-  std::vector<Type> input_shapes = {input_shape};
-  Tensor input_reshape_tensor(input->type());
-  ReshapeNPU<Type>(input, input_shapes, &input_reshape_tensor);
-  // reshape index
-  std::vector<Type> index_shapes = {outer_dim, ind_lastdim};
-  framework::DDim ind_2d = framework::make_ddim({outer_dim, ind_lastdim});
-  Tensor ind_2d_tensor(indices->type());
-  ReshapeNPU<Type>(indices, index_shapes, &ind_2d_tensor);
-  // range_flatten_index
-  std::vector<int32_t> range_flatten_index;
-  for (Type i = 0; i < input_shape; i += ind_lastdim) {
-    range_flatten_index.push_back(static_cast<int32_t>(i));
-  }
-  Tensor range_flatten_index_tensor(framework::proto::VarType::INT32);
-  range_flatten_index_tensor.Resize(framework::make_ddim({outer_dim}));
-  range_flatten_index_tensor.mutable_data<int32_t>(
-      {static_cast<int>(range_flatten_index.size())}, ctx.GetPlace());
-  TensorFromVector(range_flatten_index, ctx.device_context(),
-                   &range_flatten_index_tensor);
-  Tensor range_flatten_index_expand_tensor(range_flatten_index_tensor.type());
-  std::vector<Type> flatten_shape = {outer_dim, 1};
-  ReshapeNPU<Type>(&range_flatten_index_tensor, flatten_shape,
-                   &range_flatten_index_expand_tensor);
-  auto stream =
-      ctx.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
-  Tensor ind_2d_add_tensor;
-  ind_2d_add_tensor.mutable_data<int32_t>(ind_2d, ctx.GetPlace());
-  const auto& runner_ind_2d_tensor = NpuOpRunner(
-      std::string("Add"), {ind_2d_tensor, range_flatten_index_expand_tensor},
-      {ind_2d_add_tensor}, {});
-  runner_ind_2d_tensor.Run(stream);
-  Tensor ind_reshape_tensor(ind_2d_add_tensor.type());
-  ReshapeNPU<Type>(&ind_2d_add_tensor, input_shapes, &ind_reshape_tensor);
-  Tensor ind_reshape_expand_tensor(ind_reshape_tensor.type());
-  std::vector<Type> ind_shape = {input_shape, 1};
-  ReshapeNPU<Type>(&ind_reshape_tensor, ind_shape, &ind_reshape_expand_tensor);
-  // expand_index
-  Tensor input_scatter_tensor;
-  input_scatter_tensor.Resize({input_shape});
-  input_scatter_tensor.mutable_data<T>(ctx.GetPlace());
-  Tensor input_scatter_tensor_ori;
-  input_scatter_tensor_ori.Resize({input_shape});
-  input_scatter_tensor_ori.mutable_data<T>(ctx.GetPlace());
-  std::vector<Type> trans_shapes;
-  for (int i = 0; i < trans_dims.size(); i++) {
-    trans_shapes.push_back(trans_dims[i]);
-  }
-  NpuOpRunner runner_scatter;
-  runner_scatter.SetType("TensorScatterUpdate")
-      .AddInput(input_scatter_tensor_ori)
-      .AddInput(ind_reshape_expand_tensor)
-      .AddInput(input_reshape_tensor)
-      .AddOutput(input_scatter_tensor);
-  runner_scatter.Run(stream);
-  framework::TensorCopy(input_scatter_tensor, ctx.GetPlace(),
-                        ctx.template device_context<platform::DeviceContext>(),
-                        t_out);
-  t_out->Resize(framework::make_ddim(trans_shapes));
+                          const aclrtStream& stream,
+                          const framework::DDim in_dims, const Tensor& input,
+                          const Tensor& indices, Tensor* t_out) {
+  const int64_t input_height =
+      framework::product(framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
+  const int64_t input_width = in_dims[in_dims.size() - 1];
+
+  Tensor input_tmp;
+  input_tmp.ShareDataWith(input);
+  input_tmp.Resize(
+      framework::make_ddim(std::vector<int64_t>{input_height * input_width}));
+
+  Tensor indices_tmp;
+  indices_tmp.ShareDataWith(indices);
+  indices_tmp.Resize(
+      framework::make_ddim(std::vector<int64_t>{input_height, input_width}));
+
+  std::vector<int64_t> indexs_value;
+  for (Type i = 0; i < input_height; i++) {
+    indexs_value.push_back(i * input_width);
+  }
+  Tensor indexs_tmp(indices.type());
+  framework::TensorFromVector<int64_t>(indexs_value, ctx.device_context(),
+                                       &indexs_tmp);
+  indexs_tmp.Resize(
+      framework::make_ddim(std::vector<int64_t>{input_height, 1}));
+
+  Tensor indices_index(indices.type());
+  indices_index.mutable_data<int64_t>(indices_tmp.dims(), ctx.GetPlace());
+  const auto& runner_add =
+      NpuOpRunner("Add", {indices_tmp, indexs_tmp}, {indices_index}, {});
+  runner_add.Run(stream);
+
+  indices_index.Resize(
+      framework::make_ddim(std::vector<int64_t>{input_height * input_width}));
+
+  t_out->mutable_data<T>(ctx.GetPlace());
+  Tensor out_tmp(t_out->type());
+  out_tmp.ShareDataWith(*t_out);
+
+  const auto& runner =
+      NpuOpRunner("TensorScatterUpdate", {input_tmp, indices_index, input_tmp},
+                  {out_tmp}, {});
+  runner.Run(stream);
 }
-template <typename DeviceContext, typename T>
+template <typename T>
 class ArgsortGradNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -172,75 +161,42 @@ class ArgsortGradNPUKernel : public framework::OpKernel<T> {
     auto* dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
     int axis = ctx.Attr<int>("axis");
     auto in_dims = indices->dims();
     axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-    auto place = ctx.GetPlace();
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    dX->mutable_data<T>(ctx.GetPlace());
-    Tensor dxt;
-    dxt.mutable_data<T>(dX->dims(), place);
-    const auto& runner_flatten =
-        NpuOpRunner(std::string("Flatten"), {*dX}, {dxt}, {});
-    runner_flatten.Run(stream);
-    FillNpuTensorWithConstant<T>(&dxt, static_cast<T>(0));
     if (dO->numel() == 0) return;
-    // Do full assign
-    if (axis == -1 || axis + 1 == in_dims.size()) {
-      const int64_t outer_dim = framework::product(
-          framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t ind_lastdim = in_dims[in_dims.size() - 1];
-      FullAssignNPU<T, int64_t>(ctx, ind_lastdim, outer_dim, in_dims, dO,
-                                indices, dX);
+
+    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
+
+    if (axis == -1 || axis + 1 == in_dims.size()) {
+      FullAssignNPU<T, int64_t>(ctx, stream, in_dims, *dO, *indices, dX);
     } else {
-      // If not full assign do transpose
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(axis);
-      framework::DDim trans_dims(in_dims);
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-      }
-      std::vector<int> axis;
-      for (size_t i = 0; i < trans.size(); i++) {
-        axis.push_back(in_dims[trans[i]]);
-      }
-      framework::NPUAttributeMap attr_input = {{"perm", trans}};
-      Tensor trans_dO;
-      trans_dO.mutable_data<T>(trans_dims, ctx.GetPlace());
-      Tensor trans_ind;
-      trans_ind.mutable_data<int32_t>(trans_dims, ctx.GetPlace());
-      // Do transpose
-      const auto& runner_transpose_dx = NpuOpRunner(
-          std::string("TransposeD"), {*dO}, {trans_dO}, {attr_input});
-      runner_transpose_dx.Run(stream);
-      const auto& runner_transpose_ind = NpuOpRunner(
-          std::string("TransposeD"), {*indices}, {trans_ind}, {attr_input});
-      runner_transpose_ind.Run(stream);
-      const int64_t outer_dim = framework::product(
-          framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t ind_lastdim = trans_dims[trans_dims.size() - 1];
-      Tensor tmp_out;
-      tmp_out.mutable_data<T>(trans_dims, ctx.GetPlace());
-      FullAssignNPU<T, int64_t>(ctx, ind_lastdim, outer_dim, trans_dims,
-                                &trans_dO, &trans_ind, &tmp_out);
-      // transpose back
-      const auto& runner_transpose_out = NpuOpRunner(
-          std::string("TransposeD"), {tmp_out}, {*dX}, {attr_input});
-      runner_transpose_out.Run(stream);
+      std::vector<int64_t> perm;
+      for (int64_t i = 0; i < in_dims.size(); i++) {
+        perm.emplace_back(i);
+      }
+      std::swap(perm[axis], perm[in_dims.size() - 1]);
+
+      std::vector<int64_t> shape;
+      for (size_t i = 0; i < perm.size(); i++) {
+        shape.emplace_back(in_dims[perm[i]]);
+      }
+      auto trans_dims = framework::make_ddim(shape);
+
+      Tensor trans_dout(dO->type());
+      Tensor trans_ids(indices->type());
+      trans_dout.Resize(trans_dims);
+      trans_ids.Resize(trans_dims);
+
+      TranposeNPU<T>(ctx, stream, &perm, *dO, &trans_dout);
+      TranposeNPU<int64_t>(ctx, stream, &perm, *indices, &trans_ids);
+
+      Tensor trans_dx(dO->type());
+      trans_dx.Resize(trans_dims);
+      FullAssignNPU<T, int64_t>(ctx, stream, trans_dims, trans_dout, trans_ids,
+                                &trans_dx);
+
+      TranposeNPU<T>(ctx, stream, &perm, trans_dx, dX);
     }
   }
 };
...
@@ -251,11 +207,8 @@ class ArgsortGradNPUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(
-    argsort, ops::ArgsortNPUKernel<plat::NPUDeviceContext, float>,
-    ops::ArgsortNPUKernel<plat::NPUDeviceContext, plat::float16>);
+REGISTER_OP_NPU_KERNEL(argsort, ops::ArgsortNPUKernel<float>,
+                       ops::ArgsortNPUKernel<plat::float16>);
-REGISTER_OP_NPU_KERNEL(argsort_grad,
-                       ops::ArgsortGradNPUKernel<plat::NPUDeviceContext, float>,
-                       ops::ArgsortGradNPUKernel<plat::NPUDeviceContext,
-                                                 paddle::platform::float16>);
+REGISTER_OP_NPU_KERNEL(argsort_grad, ops::ArgsortGradNPUKernel<float>,
+                       ops::ArgsortGradNPUKernel<paddle::platform::float16>);
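
A note on the structure of the rewritten kernel: `Sort` is always invoked with axis = -1, so for any other axis the kernel swaps that axis with the last one via `perm`, sorts, and applies the same `perm` again; a single swap is its own inverse, which is why one permutation serves for both directions. A rough NumPy sketch of the same strategy (an illustration under that reading, not code from this commit):

    import numpy as np

    def argsort_via_last_axis(x, axis=-1, descending=False):
        axis = axis % x.ndim
        perm = list(range(x.ndim))
        perm[axis], perm[-1] = perm[-1], perm[axis]  # a single swap: self-inverse
        xt = np.transpose(x, perm)                   # move target axis last
        ids = np.argsort(-xt if descending else xt, axis=-1)
        out = np.take_along_axis(xt, ids, axis=-1)
        # the same perm transposes the results back to the original layout
        return np.transpose(out, perm), np.transpose(ids, perm)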
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/cum_op.h"
...
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include <memory>
 #include <string>
...
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/operators/expand_v2_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
...
 /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/operators/huber_loss_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
...
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/operators/interpolate_v2_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
...
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/operators/is_empty_op.h"
...
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/operators/log_loss_op.h"
 #include <cmath>
...
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/operators/meshgrid_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
...
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
...
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/operators/npu_op_runner.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
...
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
...
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
...
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/operators/slice_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
...
...
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/operators/tril_triu_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
...
...
@@ -18,7 +18,7 @@ import numpy as np
 import unittest
 import sys
 sys.path.append("..")
-from op_test import OpTest, _set_use_system_allocator
+from op_test import OpTest
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
...
@@ -63,9 +63,6 @@ class TestArgsortOp(OpTest):
         self.__class__.use_npu = True
         self.__class__.no_need_check_grad = True

-    def init_kernel_type(self):
-        self.use_mkldnn = False
-
     def init_inputshape(self):
         self.input_shape = (2, 2, 2, 3, 3)
...
@@ -158,7 +155,8 @@ class TestArgsortOpAxis0NPUFP32(TestArgsortOp):
         self.__class__.use_npu = True

     def test_check_grad(self):
-        self.check_grad_with_place(self.place, ["X"], "Out")
+        self.check_grad_with_place(
+            self.place, ["X"], "Out", max_relative_error=0.03)

 class TestArgsortOpAxis1NPUFP32(TestArgsortOpAxis0NPUFP32):
...
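
For reference, the scatter that FullAssignNPU performs in the grad kernel earlier in this diff has a compact NumPy analogue: treat the input as rows whose width is the sorted (last) axis, offset each row's indices by the row's start, and assign the incoming gradient back to the original element positions, i.e. dX[i, Indices[i, j]] = dOut[i, j]. A hypothetical sketch, not code from this commit:

    import numpy as np

    def full_assign(d_out, indices):
        # rows = product of leading dims, width = sorted (last) axis
        h = int(np.prod(d_out.shape[:-1]))
        w = d_out.shape[-1]
        # per-row offsets turn row-local indices into flat positions
        flat_ids = indices.reshape(h, w) + (np.arange(h) * w)[:, None]
        d_x = np.empty(h * w, dtype=d_out.dtype)
        d_x[flat_ids.ravel()] = d_out.ravel()  # scatter, cf. TensorScatterUpdate
        return d_x.reshape(d_out.shape)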