Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
342252c9
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
342252c9
编写于
3月 19, 2021
作者:
L
Leo Chen
提交者:
GitHub
3月 19, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[NPU] change transpose to transpose2 (#31734)
* change transpose to transpose2 * fix bug
上级
7b450e78
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
69 additions
and
77 deletions
+69
-77
paddle/fluid/operators/transpose_op_npu.cc
paddle/fluid/operators/transpose_op_npu.cc
+36
-38
paddle/fluid/operators/transpose_op_npu_test.cc
paddle/fluid/operators/transpose_op_npu_test.cc
+32
-38
python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
...paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
+1
-1
未找到文件。
paddle/fluid/operators/transpose_op_npu.cc
浏览文件 @
342252c9
...
...
@@ -9,75 +9,73 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#i
fdef PADDLE_WITH_ASCEND_CL
#i
nclude <iostream>
#include <memory>
#include <string>
#include <iostream>
#include "paddle/fluid/operators/npu_op_runner.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/expand_op.h"
#include "paddle/fluid/operators/npu_op_runner.h"
namespace
paddle
{
namespace
operators
{
// NPU kernel for the forward transpose2 op.
// Reads input tensor "X", permutes its dimensions according to the integer
// attribute vector "axis", and writes the result to output "Out" by invoking
// the Ascend "TransposeD" operator on the NPU stream.
// NOTE(review): the diff view had duplicated old/new Compute bodies; this is
// the single post-commit implementation.
template <typename DeviceContext, typename T>
class TransposeNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<framework::LoDTensor>("X");
    auto* out = ctx.Output<framework::LoDTensor>("Out");
    // "axis" holds the permutation: out dim i comes from x dim axis[i].
    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
    framework::NPUAttributeMap attr_input = {{"perm", axis}};
    out->mutable_data<T>(ctx.device_context().GetPlace());
    auto runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input);
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);
  }
};
// NPU kernel for the transpose2 gradient op.
// Computes d(X) from d(Out) by applying the inverse permutation of "axis":
// if the forward used perm, the backward uses perm^-1, built so that
// reversed_axis[axis[i]] = i.
// NOTE(review): the diff view duplicated the Compute signature and the
// `stream` declaration; this is the single post-commit implementation.
template <typename T>
class TransposeGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out_grad =
        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
    auto* x_grad =
        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
    // Invert the forward permutation.
    std::vector<int> reversed_axis(axis);
    for (size_t i = 0; i < axis.size(); i++) {
      reversed_axis[axis[i]] = i;
    }
    x_grad->mutable_data<T>(ctx.GetPlace());
    framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}};
    auto runner =
        NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input);
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);
  }
};
}
}
}
// namespace operators
}
// namespace paddle
namespace ops = paddle::operators;

// Register the forward kernel for transpose2 (the op name after this commit;
// the pre-commit "transpose" registration is intentionally gone).
// float16 is listed once — the diff artifact duplicated it.
REGISTER_OP_NPU_KERNEL(
    transpose2,
    ops::TransposeNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::TransposeNPUKernel<paddle::platform::NPUDeviceContext,
                            paddle::platform::float16>,
    ops::TransposeNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::TransposeNPUKernel<paddle::platform::NPUDeviceContext, uint8_t>,
    ops::TransposeNPUKernel<paddle::platform::NPUDeviceContext, int8_t>);

// Register the backward kernel for transpose2_grad with the same dtype set.
REGISTER_OP_NPU_KERNEL(
    transpose2_grad, ops::TransposeGradNPUKernel<float>,
    ops::TransposeGradNPUKernel<paddle::platform::float16>,
    ops::TransposeGradNPUKernel<int>,
    ops::TransposeGradNPUKernel<uint8_t>,
    ops::TransposeGradNPUKernel<int8_t>);

#endif  // PADDLE_WITH_ASCEND_CL
paddle/fluid/operators/transpose_op_npu_test.cc
浏览文件 @
342252c9
...
...
@@ -13,12 +13,12 @@ limitations under the License. */
#include <unistd.h>
#endif
#include <string>
#include <cmath>
#include <iostream>
#include <numeric>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include <numeric>
#include <iostream>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
...
...
@@ -32,17 +32,18 @@ namespace f = paddle::framework;
// Shorthand namespace aliases used throughout the test file.
namespace p = paddle::platform;
namespace m = paddle::operators::math;

// Pull in the transpose2 op definition and its NPU device kernel so the
// test binary links them. (The old USE_OP(transpose) lines were removed by
// this commit; the diff view showed both.)
USE_OP(transpose2);
USE_OP_DEVICE_KERNEL(transpose2, NPU);
template
<
typename
T
>
void
Compare
(
f
::
Scope
*
scope
,
const
p
::
DeviceContext
&
ctx
)
{
// init
// init
auto
x
=
scope
->
Var
(
"X"
);
auto
out
=
scope
->
Var
(
"Out"
);
auto
xshape
=
scope
->
Var
(
"XShape"
);
auto
*
x_t
=
x
->
GetMutable
<
f
::
LoDTensor
>
();
auto
*
out_t
=
out
->
GetMutable
<
f
::
LoDTensor
>
();
auto
*
xshape_t
=
xshape
->
GetMutable
<
f
::
LoDTensor
>
();
auto
place
=
ctx
.
GetPlace
();
int
dim0
=
2
;
...
...
@@ -54,12 +55,13 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
ctx
.
Wait
();
out_t
->
mutable_data
<
T
>
(
place
);
ctx
.
Wait
();
f
::
AttributeMap
attrs
=
{
{
"axis"
,
std
::
vector
<
int
>
({
1
,
0
})},
{
"data_format"
,
std
::
string
(
"AnyLayout"
)}
};
auto
op
=
f
::
OpRegistry
::
CreateOp
(
"transpose"
,
{{
"X"
,
{
"X"
}}},
{{
"Out"
,
{
"Out"
}}},
attrs
);
xshape_t
->
Resize
({
dim0
,
dim1
});
xshape_t
->
mutable_data
<
T
>
(
place
);
f
::
AttributeMap
attrs
=
{{
"axis"
,
std
::
vector
<
int
>
({
1
,
0
})},
{
"data_format"
,
std
::
string
(
"AnyLayout"
)}};
auto
op
=
f
::
OpRegistry
::
CreateOp
(
"transpose2"
,
{{
"X"
,
{
"X"
}}},
{{
"Out"
,
{
"Out"
}},
{
"XShape"
,
{
"XShape"
}}},
attrs
);
ctx
.
Wait
();
op
->
Run
(
*
scope
,
place
);
ctx
.
Wait
();
...
...
@@ -76,47 +78,42 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
EXPECT_EQ
(
out_v
[
5
],
5
);
}
template
<
typename
T
>
void
CompareGrad
(
f
::
Scope
*
scope
,
const
p
::
DeviceContext
&
ctx
)
{
// init
auto
x
=
scope
->
Var
(
"X
"
);
// init
auto
x
shape
=
scope
->
Var
(
"XShape
"
);
auto
x_grad
=
scope
->
Var
(
"X@GRAD"
);
auto
out
=
scope
->
Var
(
"Out"
);
auto
out_grad
=
scope
->
Var
(
"Out@GRAD"
);
auto
*
x_grad_t
=
x_grad
->
GetMutable
<
f
::
LoDTensor
>
();
auto
*
x
_t
=
x
->
GetMutable
<
f
::
LoDTensor
>
();
auto
*
x
shape_t
=
xshape
->
GetMutable
<
f
::
LoDTensor
>
();
auto
*
out_grad_t
=
out_grad
->
GetMutable
<
f
::
LoDTensor
>
();
auto
*
out_t
=
out
->
GetMutable
<
f
::
LoDTensor
>
();
int
dim0
=
2
;
int
dim1
=
3
;
auto
place
=
ctx
.
GetPlace
();
TensorFromVector
(
std
::
vector
<
T
>
({
0
,
1
,
2
,
3
,
4
,
5
}),
ctx
,
out_grad_t
);
TensorFromVector
(
std
::
vector
<
T
>
({
0
,
1
,
2
,
3
,
4
,
5
}),
ctx
,
x_t
);
ctx
.
Wait
();
x_grad_t
->
Resize
({
dim0
,
dim1
});
x_t
->
Resize
({
dim0
,
dim1
});
xshape_t
->
Resize
(
{
0
,
dim0
,
dim1
});
// NOTE(zhiqiu): 0 is needed, see its infershape function
out_grad_t
->
Resize
({
dim0
,
dim1
});
out_t
->
Resize
({
dim0
,
dim1
});
x_grad_t
->
mutable_data
<
T
>
(
place
);
out_t
->
mutable_data
<
T
>
(
place
);
ctx
.
Wait
();
f
::
AttributeMap
attrs
=
{
{
"axis"
,
std
::
vector
<
int
>
({
1
,
0
})},
{
"data_format"
,
std
::
string
(
"AnyLayout"
)}
};
f
::
AttributeMap
attrs
=
{{
"axis"
,
std
::
vector
<
int
>
({
1
,
0
})},
{
"data_format"
,
std
::
string
(
"AnyLayout"
)}};
auto
op
=
f
::
OpRegistry
::
CreateOp
(
"transpose_grad"
,
{{
"Out@GRAD"
,
{
"Out@GRAD"
}},
{
"X"
,
{
"X"
}},
{
"Out"
,
{
"Out"
}}},
"transpose2_grad"
,
{{
"Out@GRAD"
,
{
"Out@GRAD"
}},
{
"XShape"
,
{
"XShape"
}}},
{{
"X@GRAD"
,
{
"X@GRAD"
}}},
attrs
);
op
->
Run
(
*
scope
,
place
);
ctx
.
Wait
();
ctx
.
Wait
();
std
::
vector
<
T
>
out_v
;
TensorToVector
(
*
x_grad_t
,
ctx
,
&
out_v
);
ctx
.
Wait
();
ctx
.
Wait
();
EXPECT_EQ
(
x_grad_t
->
numel
(),
dim0
*
dim1
);
EXPECT_EQ
(
out_v
[
0
],
0
);
...
...
@@ -125,19 +122,16 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
EXPECT_EQ
(
out_v
[
3
],
4
);
EXPECT_EQ
(
out_v
[
4
],
2
);
EXPECT_EQ
(
out_v
[
5
],
5
);
}
// Forward test: runs the transpose2 NPU kernel with float32 inputs on NPU 0.
// (Diff view merged the old TEST(transpose, ...) header with the new
// TEST(transpose2, ...) header; only the latter exists post-commit.)
TEST(transpose2, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx);
}
// Backward test: runs the transpose2_grad NPU kernel with float32 inputs on
// NPU 0. (Diff view merged the old TEST(transpose_grad, ...) header with the
// new TEST(transpose2_grad, ...) header; only the latter exists post-commit.)
TEST(transpose2_grad, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  CompareGrad<float>(&scope, ctx);
}
python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
浏览文件 @
342252c9
...
...
@@ -30,7 +30,7 @@ paddle.enable_static()
class
TestTransposeOp
(
OpTest
):
def
setUp
(
self
):
self
.
set_npu
()
self
.
op_type
=
"transpose"
self
.
op_type
=
"transpose
2
"
self
.
place
=
paddle
.
NPUPlace
(
0
)
self
.
init_dtype
()
self
.
init_input_output
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录