From 342252c90206e1fa56cc8ad6e8106632fb827bc3 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Fri, 19 Mar 2021 12:36:11 +0800
Subject: [PATCH] [NPU] change transpose to transpose2 (#31734)

* change transpose to transpose2

* fix bug
---
 paddle/fluid/operators/transpose_op_npu.cc    | 74 +++++++++----------
 .../fluid/operators/transpose_op_npu_test.cc  | 70 ++++++++----------
 .../unittests/npu/test_transpose_op_npu.py    |  2 +-
 3 files changed, 69 insertions(+), 77 deletions(-)

diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc
index 2d71bfdc725..994b8e534f8 100644
--- a/paddle/fluid/operators/transpose_op_npu.cc
+++ b/paddle/fluid/operators/transpose_op_npu.cc
@@ -9,75 +9,73 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_ASCEND_CL
+#include <memory>
 #include <string>
 #include <vector>
-#include <memory>
-#include "paddle/fluid/operators/npu_op_runner.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/expand_op.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
 
 namespace paddle {
 namespace operators {
 
 template <typename T>
 class TransposeNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* out = ctx.Output<framework::LoDTensor>("Out");
-    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
-    framework::NPUAttributeMap attr_input = {{"perm", axis}};
-    out->mutable_data<T>(ctx.device_context().GetPlace());
-    auto runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input);
-    auto stream = ctx.template device_context<paddle::platform::NPUDeviceContext>().stream();
-    runner.Run(stream);
-
-  }
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+    framework::NPUAttributeMap attr_input = {{"perm", axis}};
+    out->mutable_data<T>(ctx.device_context().GetPlace());
+    auto runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input);
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+    runner.Run(stream);
+  }
 };
 
 template <typename T>
 class TransposeGradNPUKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto* out_grad = ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-    auto* x_grad = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out_grad =
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto* x_grad =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
     std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
     std::vector<int> reversed_axis(axis);
     for (size_t i = 0; i < axis.size(); i++) {
       reversed_axis[axis[i]] = i;
    }
-
+
     x_grad->mutable_data<T>(ctx.GetPlace());
     framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}};
     auto runner =
         NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input);
-    auto stream = ctx.template device_context<paddle::platform::NPUDeviceContext>().stream();
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
     runner.Run(stream);
   }
 };
 
-}
-}
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_NPU_KERNEL(transpose,
+REGISTER_OP_NPU_KERNEL(
+    transpose2,
     ops::TransposeNPUKernel<float>,
-    ops::TransposeNPUKernel<paddle::platform::float16>,
+    ops::TransposeNPUKernel<paddle::platform::float16>,
     ops::TransposeNPUKernel<int>,
     ops::TransposeNPUKernel<uint8_t>,
-    ops::TransposeNPUKernel<int8_t>
-);
-
-REGISTER_OP_NPU_KERNEL(transpose_grad,
-    ops::TransposeGradNPUKernel<float>,
-    ops::TransposeGradNPUKernel<paddle::platform::float16>,
-    ops::TransposeGradNPUKernel<int>,
-    ops::TransposeGradNPUKernel<uint8_t>,
-    ops::TransposeGradNPUKernel<int8_t>
-);
-
-
-
-#endif
+    ops::TransposeNPUKernel<int8_t>);
+REGISTER_OP_NPU_KERNEL(transpose2_grad, ops::TransposeGradNPUKernel<float>,
+                       ops::TransposeGradNPUKernel<paddle::platform::float16>,
+                       ops::TransposeGradNPUKernel<int>,
+                       ops::TransposeGradNPUKernel<uint8_t>,
+                       ops::TransposeGradNPUKernel<int8_t>);
diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc
index c7a791956fb..36f7a695358 100644
--- a/paddle/fluid/operators/transpose_op_npu_test.cc
+++ b/paddle/fluid/operators/transpose_op_npu_test.cc
@@ -13,12 +13,12 @@ limitations under the License. */
 #include <unistd.h>
 #endif
 
-#include <memory>
 #include <iostream>
+#include <memory>
+#include <numeric>
+#include <string>
 #include <thread>  // NOLINT
 #include <vector>
-#include <string>
-#include <numeric>
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -32,17 +32,18 @@ namespace f = paddle::framework;
 namespace p = paddle::platform;
 namespace m = paddle::operators::math;
 
-USE_OP(transpose);
-USE_OP_DEVICE_KERNEL(transpose, NPU);
-
+USE_OP(transpose2);
+USE_OP_DEVICE_KERNEL(transpose2, NPU);
 
 template <typename T>
 void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
+  // init
   auto x = scope->Var("X");
   auto out = scope->Var("Out");
+  auto xshape = scope->Var("XShape");
   auto* x_t = x->GetMutable<f::LoDTensor>();
   auto* out_t = out->GetMutable<f::LoDTensor>();
+  auto* xshape_t = xshape->GetMutable<f::LoDTensor>();
   auto place = ctx.GetPlace();
 
   int dim0 = 2;
@@ -54,12 +55,13 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
   ctx.Wait();
   out_t->mutable_data<T>(place);
   ctx.Wait();
-  f::AttributeMap attrs = {
-      {"axis", std::vector<int>({1, 0})},
-      {"data_format", std::string("AnyLayout")}
-  };
-  auto op = f::OpRegistry::CreateOp("transpose", {{"X", {"X"}}},
-                                    {{"Out", {"Out"}}}, attrs);
+  xshape_t->Resize({dim0, dim1});
+  xshape_t->mutable_data<T>(place);
+  f::AttributeMap attrs = {{"axis", std::vector<int>({1, 0})},
+                           {"data_format", std::string("AnyLayout")}};
+  auto op = f::OpRegistry::CreateOp("transpose2", {{"X", {"X"}}},
+                                    {{"Out", {"Out"}}, {"XShape", {"XShape"}}},
+                                    attrs);
   ctx.Wait();
   op->Run(*scope, place);
   ctx.Wait();
@@ -76,47 +78,42 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
   EXPECT_EQ(out_v[5], 5);
 }
 
-
 template <typename T>
 void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto x = scope->Var("X");
+  // init
+  auto xshape = scope->Var("XShape");
   auto x_grad = scope->Var("X@GRAD");
-  auto out = scope->Var("Out");
   auto out_grad = scope->Var("Out@GRAD");
 
   auto* x_grad_t = x_grad->GetMutable<f::LoDTensor>();
-  auto* x_t = x->GetMutable<f::LoDTensor>();
+  auto* xshape_t = xshape->GetMutable<f::LoDTensor>();
   auto* out_grad_t = out_grad->GetMutable<f::LoDTensor>();
-  auto* out_t = out->GetMutable<f::LoDTensor>();
+
   int dim0 = 2;
   int dim1 = 3;
   auto place = ctx.GetPlace();
 
   TensorFromVector(std::vector<T>({0, 1, 2, 3, 4, 5}), ctx, out_grad_t);
-  TensorFromVector(std::vector<T>({0, 1, 2, 3, 4, 5}), ctx, x_t);
   ctx.Wait();
+
   x_grad_t->Resize({dim0, dim1});
-  x_t->Resize({dim0, dim1});
+  xshape_t->Resize(
+      {0, dim0,
+       dim1});  // NOTE(zhiqiu): 0 is needed, see its infershape function
   out_grad_t->Resize({dim0, dim1});
-  out_t->Resize({dim0, dim1});
-
   x_grad_t->mutable_data<T>(place);
-  out_t->mutable_data<T>(place);
-  ctx.Wait();
-  f::AttributeMap attrs = {
-      {"axis", std::vector<int>({1, 0})},
-      {"data_format", std::string("AnyLayout")}
-  };
+  f::AttributeMap attrs = {{"axis", std::vector<int>({1, 0})},
+                           {"data_format", std::string("AnyLayout")}};
+
   auto op = f::OpRegistry::CreateOp(
-      "transpose_grad",
-      {{"Out@GRAD", {"Out@GRAD"}}, {"X", {"X"}}, {"Out", {"Out"}}},
+      "transpose2_grad", {{"Out@GRAD", {"Out@GRAD"}}, {"XShape", {"XShape"}}},
{"XShape"}}}, {{"X@GRAD", {"X@GRAD"}}}, attrs); + op->Run(*scope, place); - ctx.Wait(); + ctx.Wait(); std::vector out_v; TensorToVector(*x_grad_t, ctx, &out_v); - ctx.Wait(); + ctx.Wait(); EXPECT_EQ(x_grad_t->numel(), dim0 * dim1); EXPECT_EQ(out_v[0], 0); @@ -125,19 +122,16 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { EXPECT_EQ(out_v[3], 4); EXPECT_EQ(out_v[4], 2); EXPECT_EQ(out_v[5], 5); - } - -TEST(transpose, NPU_fp32) { +TEST(transpose2, NPU_fp32) { f::Scope scope; p::NPUDeviceContext ctx(p::NPUPlace(0)); Compare(&scope, ctx); } -TEST(transpose_grad, NPU_fp32) { +TEST(transpose2_grad, NPU_fp32) { f::Scope scope; p::NPUDeviceContext ctx(p::NPUPlace(0)); CompareGrad(&scope, ctx); } - diff --git a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py index 797531a6c0f..17f6a0ae1ca 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py @@ -30,7 +30,7 @@ paddle.enable_static() class TestTransposeOp(OpTest): def setUp(self): self.set_npu() - self.op_type = "transpose" + self.op_type = "transpose2" self.place = paddle.NPUPlace(0) self.init_dtype() self.init_input_output() -- GitLab