diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 78509b145280ea45a0ed09cb4e56b8d06356b3c6..6fe18f2479478a49819da2608dc7c3a0bf5d3017 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -167,6 +167,10 @@ set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 add_subdirectory(benchmark)
 
 cc_test(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_op)
+# Built only when WITH_ASCEND_CL is set: C++ unit test for transpose_op_npu.
+if(WITH_ASCEND_CL)
+  cc_test(transpose_op_npu_test
+          SRCS transpose_op_npu_test.cc
+          DEPS op_registry transpose_op scope device_context enforce executor)
+endif()
+
 
 if(WITH_MKLDNN)
 include(mkldnn/inplace_op_tests.cmake)
diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2d71bfdc7251086c137aaf24f70aa0af9a820b9e
--- /dev/null
+++ b/paddle/fluid/operators/transpose_op_npu.cc
@@ -0,0 +1,83 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_ASCEND_CL
+#include <memory>
+#include <string>
+#include <iostream>
+
+#include "paddle/fluid/operators/npu_op_runner.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/expand_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Forward transpose kernel for NPU.
+//
+// Compute() reads the input tensor "X" and the integer-vector attribute
+// "axis", allocates the output tensor "Out" on the place reported by the
+// execution context's device context, and runs the "TransposeD" NPU operator
+// with "axis" forwarded as its "perm" attribute on the NPUDeviceContext
+// stream.
+//
+// Template parameters:
+//   DeviceContext - not referenced in the body.
+//   T             - element type used when allocating the output.
+template <typename DeviceContext, typename T>
+class TransposeNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* input = ctx.Input<framework::LoDTensor>("X");
+    auto* output = ctx.Output<framework::LoDTensor>("Out");
+    const std::vector<int> perm = ctx.Attr<std::vector<int>>("axis");
+
+    // Allocate the output buffer before the tensor is handed to the runner.
+    output->mutable_data<T>(ctx.device_context().GetPlace());
+
+    const framework::NPUAttributeMap runner_attrs = {{"perm", perm}};
+    NpuOpRunner runner("TransposeD", {*input}, {*output}, runner_attrs);
+    runner.Run(
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream());
+  }
+};
+
+// Backward transpose kernel for NPU.
+//
+// Compute() reads the gradient of "Out", builds the inverse of the "axis"
+// permutation, allocates the gradient of "X", and runs the "TransposeD" NPU
+// operator with the inverse permutation as its "perm" attribute on the
+// NPUDeviceContext stream.
+//
+// T is the element type used when allocating the output gradient.
+template <typename T>
+class TransposeGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out_grad =
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto* x_grad =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+
+    // Invert the permutation: if output dim i came from input dim axis[i],
+    // then input dim axis[i] comes from output dim i.
+    // NOTE(review): indexing assumes every axis[i] is in [0, axis.size());
+    // negative axes are not normalised here - confirm the op's shape
+    // inference rejects them.
+    std::vector<int> reversed_axis(axis);
+    for (size_t i = 0; i < axis.size(); i++) {
+      reversed_axis[axis[i]] = static_cast<int>(i);
+    }
+
+    // Allocate the output gradient before running the operator, mirroring
+    // the forward kernel; without this the runner is handed a tensor whose
+    // buffer has not been allocated by this kernel.
+    x_grad->mutable_data<T>(ctx.device_context().GetPlace());
+
+    framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}};
+    auto runner =
+        NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input);
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+    runner.Run(stream);
+  }
+};
+
+}
+}
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// Forward kernels: float, float16, int, uint8_t and int8_t.
+REGISTER_OP_NPU_KERNEL(
+    transpose, ops::TransposeNPUKernel<plat::NPUDeviceContext, float>,
+    ops::TransposeNPUKernel<plat::NPUDeviceContext, plat::float16>,
+    ops::TransposeNPUKernel<plat::NPUDeviceContext, int>,
+    ops::TransposeNPUKernel<plat::NPUDeviceContext, uint8_t>,
+    ops::TransposeNPUKernel<plat::NPUDeviceContext, int8_t>);
+
+// Backward kernels for the same element types.
+REGISTER_OP_NPU_KERNEL(transpose_grad, ops::TransposeGradNPUKernel<float>,
+                       ops::TransposeGradNPUKernel<plat::float16>,
+                       ops::TransposeGradNPUKernel<int>,
+                       ops::TransposeGradNPUKernel<uint8_t>,
+                       ops::TransposeGradNPUKernel<int8_t>);
+
+
+
+#endif
+
diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c7a791956fbe56ce162379b59bcecc0fff2b3871
--- /dev/null
+++ b/paddle/fluid/operators/transpose_op_npu_test.cc
@@ -0,0 +1,143 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <string>
+#include <cmath>
+#include <thread>  // NOLINT
+#include <vector>
+#include <numeric>
+#include <iostream>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/string/printf.h"
+
+// Short aliases used throughout this test file.
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+// NOTE(review): `m` is not referenced anywhere in this file as shown.
+namespace m = paddle::operators::math;
+
+// Reference the transpose operator and its NPU kernel so that
+// OpRegistry::CreateOp("transpose", ...) below can find them.
+// NOTE(review): presumably these macros force the registration objects to be
+// linked into the test binary - confirm against op_registry.h.
+USE_OP(transpose);
+USE_OP_DEVICE_KERNEL(transpose, NPU);
+
+
+// Runs the forward "transpose" op with axis {1, 0} on a 2x3 tensor holding
+// 0..5 and checks the flattened output element by element.
+//
+// scope: scope in which the variables "X" and "Out" are created.
+// ctx:   device context supplying the place the op runs on; also used for the
+//        host<->device copies and synchronisation.
+template <typename T>
+void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
+  // init
+  auto x = scope->Var("X");
+  auto out = scope->Var("Out");
+  auto* x_t = x->GetMutable<f::LoDTensor>();
+  auto* out_t = out->GetMutable<f::LoDTensor>();
+  auto place = ctx.GetPlace();
+
+  int dim0 = 2;
+  int dim1 = 3;
+  TensorFromVector(std::vector<T>({0, 1, 2, 3, 4, 5}), ctx, x_t);
+  ctx.Wait();
+  x_t->Resize({dim0, dim1});
+  out_t->Resize({dim0, dim1});
+  ctx.Wait();
+  out_t->mutable_data<T>(place);
+  ctx.Wait();
+  f::AttributeMap attrs = {
+      {"axis", std::vector<int>({1, 0})},
+      {"data_format", std::string("AnyLayout")}};
+  auto op = f::OpRegistry::CreateOp("transpose", {{"X", {"X"}}},
+                                    {{"Out", {"Out"}}}, attrs);
+  ctx.Wait();
+  op->Run(*scope, place);
+  ctx.Wait();
+  std::vector<T> out_v;
+  TensorToVector(*out_t, ctx, &out_v);
+  ctx.Wait();
+
+  EXPECT_EQ(out_t->numel(), dim0 * dim1);
+
+  // Row-major contents of the transposed 3x2 result.
+  const std::vector<T> expected({0, 3, 1, 4, 2, 5});
+  // Fatal check: stop here rather than index past the end of out_v if the op
+  // produced an unexpected number of elements.
+  ASSERT_EQ(out_v.size(), expected.size());
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_EQ(out_v[i], expected[i]) << "mismatch at flat index " << i;
+  }
+}
+
+
+// Runs the "transpose_grad" op with axis {1, 0}, feeding an Out@GRAD tensor
+// holding 0..5, and checks the flattened X@GRAD element by element.
+//
+// scope: scope in which "X", "X@GRAD", "Out" and "Out@GRAD" are created.
+// ctx:   device context supplying the place the op runs on; also used for the
+//        host<->device copies and synchronisation.
+template <typename T>
+void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
+  // init
+  auto x = scope->Var("X");
+  auto x_grad = scope->Var("X@GRAD");
+  auto out = scope->Var("Out");
+  auto out_grad = scope->Var("Out@GRAD");
+
+  auto* x_grad_t = x_grad->GetMutable<f::LoDTensor>();
+  auto* x_t = x->GetMutable<f::LoDTensor>();
+  auto* out_grad_t = out_grad->GetMutable<f::LoDTensor>();
+  auto* out_t = out->GetMutable<f::LoDTensor>();
+  int dim0 = 2;
+  int dim1 = 3;
+  auto place = ctx.GetPlace();
+
+  TensorFromVector(std::vector<T>({0, 1, 2, 3, 4, 5}), ctx, out_grad_t);
+  TensorFromVector(std::vector<T>({0, 1, 2, 3, 4, 5}), ctx, x_t);
+  ctx.Wait();
+  // NOTE(review): with axis {1, 0} and X of shape {dim0, dim1}, Out and
+  // Out@GRAD would normally be {dim1, dim0}; all four tensors are given
+  // {dim0, dim1} here - confirm this is intended.
+  x_grad_t->Resize({dim0, dim1});
+  x_t->Resize({dim0, dim1});
+  out_grad_t->Resize({dim0, dim1});
+  out_t->Resize({dim0, dim1});
+
+  x_grad_t->mutable_data<T>(place);
+  out_t->mutable_data<T>(place);
+  ctx.Wait();
+  f::AttributeMap attrs = {
+      {"axis", std::vector<int>({1, 0})},
+      {"data_format", std::string("AnyLayout")}};
+  auto op = f::OpRegistry::CreateOp(
+      "transpose_grad",
+      {{"Out@GRAD", {"Out@GRAD"}}, {"X", {"X"}}, {"Out", {"Out"}}},
+      {{"X@GRAD", {"X@GRAD"}}}, attrs);
+  op->Run(*scope, place);
+  ctx.Wait();
+  std::vector<T> out_v;
+  TensorToVector(*x_grad_t, ctx, &out_v);
+  ctx.Wait();
+
+  EXPECT_EQ(x_grad_t->numel(), dim0 * dim1);
+
+  // Expected flattened gradient values.
+  const std::vector<T> expected({0, 3, 1, 4, 2, 5});
+  // Fatal check: stop here rather than index past the end of out_v if the op
+  // produced an unexpected number of elements.
+  ASSERT_EQ(out_v.size(), expected.size());
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_EQ(out_v[i], expected[i]) << "mismatch at flat index " << i;
+  }
+}
+
+
+// Forward transpose on NPU device 0 with float elements.
+TEST(transpose, NPU_fp32) {
+  f::Scope test_scope;
+  p::NPUDeviceContext npu_ctx(p::NPUPlace(0));
+  Compare<float>(&test_scope, npu_ctx);
+}
+
+// Backward transpose on NPU device 0 with float elements.
+TEST(transpose_grad, NPU_fp32) {
+  f::Scope test_scope;
+  p::NPUDeviceContext npu_ctx(p::NPUPlace(0));
+  CompareGrad<float>(&test_scope, npu_ctx);
+}
+
diff --git a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..797531a6c0f99eb3f5481296f92d43058a9611a7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
@@ -0,0 +1,74 @@
+#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest, _set_use_system_allocator
+import paddle
+import paddle.fluid as fluid
+
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestTransposeOp(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "transpose"
+        self.place = paddle.NPUPlace(0)
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x)}
+        self.attrs = {'axis': [0, 2, 1, 3], 'data_format': 'AnyLayout'}
+        self.outputs = {'Out': self.out}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def init_kernel_type(self):
+        self.use_mkldnn = False
+
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [8, 512, 12, 64]).astype(self.dtype)
+        self.out = np.transpose(self.x, [0, 2, 1, 3])
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def init_axis(self):
+        self.axis = -1
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestTransposeOpFP16(TestTransposeOp):
+    no_need_check_grad = True
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+# Run the unittest test runner when this file is executed as a script.
+if __name__ == '__main__':
+    unittest.main()