From f1143f0cdbab985d1d3878ff27a47409d237e932 Mon Sep 17 00:00:00 2001
From: Aurelius84
Date: Thu, 20 Jan 2022 18:31:17 +0800
Subject: [PATCH] [Pten] Migrate bfloat16/float16/complex from paddle::platform into pten::common (#39044)

* Migrate bfloat16/float16/complex from platform into pten::common

* fix typo

* fix code style
---
 cmake/inference_lib.cmake                     |    8 +-
 paddle/fluid/CMakeLists.txt                   |    5 -
 paddle/fluid/framework/data_type.h            |   11 -
 paddle/fluid/framework/data_type_test.cc      |    7 -
 paddle/fluid/framework/dlpack_tensor.cc       |    7 -
 paddle/fluid/framework/dlpack_tensor_test.cc  |    6 -
 paddle/fluid/framework/tensor_test.cc         |    6 -
 paddle/fluid/operators/assign_op.cc           |    3 -
 paddle/fluid/operators/assign_op_npu.cc       |    3 -
 paddle/fluid/operators/assign_op_xpu.cc       |    3 -
 .../collective/c_allreduce_max_op.cc          |    3 -
 .../collective/c_allreduce_max_op.cu.cc       |    7 -
 .../collective/c_allreduce_max_op_npu.cc      |    1 -
 .../collective/c_allreduce_max_op_xpu.cc      |    7 -
 .../collective/c_allreduce_min_op.cc          |    3 -
 .../collective/c_allreduce_min_op.cu.cc       |    7 -
 .../collective/c_allreduce_min_op_npu.cc      |    1 -
 .../collective/c_allreduce_min_op_xpu.cc      |    7 -
 .../collective/c_allreduce_prod_op.cc         |    3 -
 .../collective/c_allreduce_prod_op.cu.cc      |    7 -
 .../collective/c_allreduce_prod_op_npu.cc     |    1 -
 .../collective/c_allreduce_prod_op_xpu.cc     |    7 -
 .../collective/c_allreduce_sum_op.cc          |    3 -
 .../collective/c_allreduce_sum_op.cu.cc       |    7 -
 .../collective/c_allreduce_sum_op_npu.cc      |    1 -
 .../collective/c_allreduce_sum_op_xpu.cc      |    7 -
 .../operators/collective/c_reduce_max_op.cc   |    3 -
 .../collective/c_reduce_max_op.cu.cc          |    7 -
 .../collective/c_reduce_max_op_npu.cc         |    1 -
 .../collective/c_reduce_max_op_xpu.cc         |    7 -
 .../operators/collective/c_reduce_min_op.cc   |    3 -
 .../collective/c_reduce_min_op.cu.cc          |    7 -
 .../collective/c_reduce_min_op_npu.cc         |    1 -
 .../collective/c_reduce_min_op_xpu.cc         |    7 -
 .../operators/collective/c_reduce_prod_op.cc  |    3 -
 .../collective/c_reduce_prod_op.cu.cc         |    7 -
 .../collective/c_reduce_prod_op_npu.cc        |    1 -
 .../collective/c_reduce_prod_op_xpu.cc        |    7 -
 .../operators/collective/c_reduce_sum_op.cc   |    3 -
 .../collective/c_reduce_sum_op.cu.cc          |    7 -
 .../collective/c_reduce_sum_op_npu.cc         |    1 -
 .../collective/c_reduce_sum_op_xpu.cc         |    7 -
 .../operators/controlflow/fetch_v2_op.cc      |    3 -
 .../elementwise/elementwise_add_op.cc         |    7 -
 .../elementwise/elementwise_sub_op.cc         |    7 -
 paddle/fluid/operators/isfinite_v2_op.cc      |    1 -
 .../fluid/operators/math/concat_and_split.cc  |    4 +-
 paddle/fluid/operators/math/prelu.cu          |    7 +-
 paddle/fluid/operators/memcpy_d2h_op.cc       |    3 -
 paddle/fluid/operators/memcpy_h2d_op.cc       |    3 -
 paddle/fluid/operators/memcpy_op.cc           |    3 -
 paddle/fluid/operators/reshape_op.cc          |    3 -
 paddle/fluid/platform/bfloat16.h              |  391 +-----
 paddle/fluid/platform/complex.h               |  528 +-------
 .../platform/device/gpu/cuda/cudnn_helper.h   |    6 -
 .../platform/device/gpu/rocm/miopen_helper.h  |    6 -
 paddle/fluid/platform/float16.h               | 1068 +----------------
 paddle/pten/CMakeLists.txt                    |    5 -
 paddle/pten/api/lib/tensor.cc                 |    4 +-
 paddle/pten/common/bfloat16.h                 |  409 ++++++
 paddle/pten/common/complex.h                  |  548 +++++++++
 paddle/pten/common/data_type.h                |   14 +-
 paddle/pten/common/float16.h                  | 1094 +++++++++++++++++
 paddle/pten/core/dense_tensor.cc              |    6 +-
 paddle/pten/kernels/complex_kernel.h          |    2 +-
 paddle/pten/kernels/cpu/complex_kernel.cc     |    2 +-
 paddle/pten/kernels/cpu/dot_grad_kernel.cc    |    2 +-
 paddle/pten/kernels/cpu/dot_kernel.cc         |    2 +-
 paddle/pten/kernels/cpu/math_kernel.cc        |    4 +-
 paddle/pten/kernels/cpu/matmul_grad_kernel.cc |    2 +-
 paddle/pten/kernels/cpu/matmul_kernel.cc      |    2 +-
 paddle/pten/kernels/cpu/scale_kernel.cc       |    2 +-
 paddle/pten/kernels/cpu/sign_kernel.cc        |    2 +-
 paddle/pten/kernels/empty_kernel.cc           |    2 +-
 .../pten/kernels/funcs/elementwise_functor.h  |    2 +-
 paddle/pten/kernels/funcs/transpose.cc        |    6 +-
 paddle/pten/kernels/funcs/transpose.cu        |    6 +-
 paddle/pten/kernels/gpu/cast_kernel.cu        |    4 +-
 paddle/pten/kernels/gpu/complex_kernel.cu     |    2 +-
 paddle/pten/kernels/gpu/dot_grad_kernel.cu    |    2 +-
 paddle/pten/kernels/gpu/dot_kernel.cu         |    2 +-
 paddle/pten/kernels/gpu/math_kernel.cu        |    4 +-
 paddle/pten/kernels/gpu/matmul_grad_kernel.cu |    2 +-
 paddle/pten/kernels/gpu/matmul_kernel.cu      |    2 +-
 paddle/pten/kernels/gpu/scale_kernel.cu       |    2 +-
 paddle/pten/kernels/gpu/sign_kernel.cu        |    2 +-
 python/setup.py.in                            |    3 -
 87 files changed, 2108 insertions(+), 2276 deletions(-)
 create mode 100644 paddle/pten/common/bfloat16.h
 create mode 100644 paddle/pten/common/complex.h
 create mode 100644 paddle/pten/common/float16.h

diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 530e5e7f24..b11eac058a 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -228,13 +228,7 @@ copy(inference_lib_dist
         DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/api/)
 copy(inference_lib_dist
         SRCS ${PADDLE_SOURCE_DIR}/paddle/pten/common/*.h
-             ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/bfloat16.h
-             ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex.h
-             ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h
-        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/
-             ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/
-             ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/
-             ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/)
+        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/)
 copy(inference_lib_dist
         SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h
         DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index f4b68ec65b..08c2d2e055 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -1,8 +1,3 @@
-# Adapt to custom op mechanism: Include the header files related to the data type
-# to avoid exposing the path of the underlying file, remove it after moving
-# float16.h/complex.h/bfloat16.h into pten
-include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
-
 add_subdirectory(memory)
 add_subdirectory(platform)
 add_subdirectory(distributed)
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
index ec8284b825..791e9a83fa 100644
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -24,17 +24,6 @@ limitations under the License.
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" -namespace paddle { -namespace platform { -struct bfloat16; -template -struct complex; -struct float16; -template -struct complex; -} // namespace platform -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 5599edcd22..8e762b913f 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -18,13 +18,6 @@ #include "gtest/gtest.h" #include "paddle/fluid/framework/tensor.h" -namespace paddle { -namespace platform { -struct bfloat16; -struct float16; -} // namespace platform -} // namespace paddle - TEST(DataType, float16) { using paddle::framework::Tensor; using paddle::platform::CPUPlace; diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index ef705aae15..2ceeed694a 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -14,13 +14,6 @@ #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/data_type.h" -namespace paddle { -namespace platform { -struct bfloat16; -struct float16; -} // namespace platform -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 8639caf4da..829908bd98 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -16,12 +16,6 @@ #include #include -namespace paddle { -namespace platform { -struct float16; -} // namespace platform -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index a58f4a6b5f..27e0968736 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -17,12 +17,6 @@ #include #include -namespace paddle { -namespace platform { -struct float16; -} // namespace platform -} // namespace paddle - namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index da5ee4dd82..684ac5bafd 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -24,9 +24,6 @@ class Variable; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc index 5be1beaa3d..017c2deb18 100644 --- a/paddle/fluid/operators/assign_op_npu.cc +++ b/paddle/fluid/operators/assign_op_npu.cc @@ -26,9 +26,6 @@ class Variable; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/assign_op_xpu.cc b/paddle/fluid/operators/assign_op_xpu.cc index 26c879c3fb..b95be3096f 100644 --- a/paddle/fluid/operators/assign_op_xpu.cc +++ b/paddle/fluid/operators/assign_op_xpu.cc @@ -25,9 +25,6 @@ class Variable; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc 
b/paddle/fluid/operators/collective/c_allreduce_max_op.cc index 4ea1876da2..26e814b228 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc index 17b49eda2f..34054103aa 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc index 4dece4a372..16bb14e757 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc index 96da390d45..296a8b3a63 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc index 75a484ef87..d39aa4f604 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc index 4eca34fb50..4e8b6f9d0a 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc index 48e1d2eeb5..d99ab2809b 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc index bded822296..b8a8cfab3f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc index c49e72eac2..b14a384832 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc index 74acbacf2b..61f76c178d 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc index f3d14afe0a..1c7ba1e951 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc index a1d439cfda..6e2b2df297 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc index 72659282af..d55d5f48ac 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc @@ -21,9 +21,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc index cfd508be27..8fe7fce21e 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc index b66e2e1968..920f69ea9d 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc index bacdf7fb53..d23572e6d6 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cc index 8a5ed7d7bd..3208b6f01a 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc index 9668c68c7d..7e260346b4 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc index f35b4c2f70..50d52e0ad1 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc index 82a10b24da..3ae1af1d08 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cc index c2ecf2419a..5ab01f42af 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc index 7f5b4cd360..77a75ed0b7 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc index 6ebb7e4c40..b94da957e8 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc index b1136b7966..500ea2abe6 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cc index a689b9db15..14a4da7638 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc index c3de32b9fb..07e431f7bc 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc index f0b7021e79..7515ffad25 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc index f6def80a19..15ef322896 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cc index b7f521b371..7b74386b3b 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc index a4a651be3c..d9826422c1 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc index dd4dbbd5f3..6f056520df 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc index ec928bd6a0..f3ec15ca9e 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc index d7f74c44bd..caa67139a9 100644 --- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 216178f7d8..0c2476fde0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -18,13 +18,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" -namespace paddle { -namespace platform { -template -struct complex; -} // namespace platform -} // namespace paddle - namespace paddle { namespace framework { class OpDesc; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index f5290a69bb..98b47407b9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -18,13 +18,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_op.h" -namespace paddle { -namespace platform { -template -struct complex; -} // namespace platform -} // namespace paddle - namespace paddle { namespace framework { class OpDesc; diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index d3391fddd3..316197ac23 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -34,7 +34,6 @@ class OverflowKernel; } // namespace operators namespace platform { class CPUDeviceContext; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 2d23f52c0b..4f12630d1e 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -16,6 +16,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/float16.h" namespace pten { class DenseTensor; @@ -25,8 +27,6 @@ namespace paddle { namespace framework {} // namespace framework namespace platform { class CPUDeviceContext; -struct bfloat16; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/math/prelu.cu b/paddle/fluid/operators/math/prelu.cu index d06490ee57..03a307bfdb 100644 --- a/paddle/fluid/operators/math/prelu.cu +++ b/paddle/fluid/operators/math/prelu.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/prelu.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -110,15 +111,15 @@ void PreluScalarDirectCUDAFunctor::operator()(gpuStream_t stream, } template class PreluChannelWiseDirectCUDAFunctor; -template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; template class PreluChannelWiseDirectCUDAFunctor; template class PreluElementWiseDirectCUDAFunctor; -template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; template class PreluElementWiseDirectCUDAFunctor; template class PreluScalarDirectCUDAFunctor; -template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; template class PreluScalarDirectCUDAFunctor; } // namespace math diff --git a/paddle/fluid/operators/memcpy_d2h_op.cc b/paddle/fluid/operators/memcpy_d2h_op.cc index 1aaa4c2367..9ad30d72eb 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.cc +++ b/paddle/fluid/operators/memcpy_d2h_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc index 4e0f353a7a..c8e1e17d65 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.cc +++ b/paddle/fluid/operators/memcpy_h2d_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/memcpy_op.cc b/paddle/fluid/operators/memcpy_op.cc index d1eeff0b05..52e493ffcb 100644 --- 
a/paddle/fluid/operators/memcpy_op.cc +++ b/paddle/fluid/operators/memcpy_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 01e13cd1dc..4c9be6d0cc 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -30,9 +30,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h index 100444eee7..1160187f83 100644 --- a/paddle/fluid/platform/bfloat16.h +++ b/paddle/fluid/platform/bfloat16.h @@ -14,396 +14,11 @@ #pragma once -#include - -#include -#include -#include -#include - -#ifdef PADDLE_WITH_CUDA -#include -#endif - -#if defined(__CUDACC__) && CUDA_VERSION >= 11000 -#define PADDLE_CUDA_BF16 -#include -#endif - -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif - -#if (defined(__CUDACC__) || defined(__HIPCC__)) -#define HOSTDEVICE __host__ __device__ -#define DEVICE __device__ -#define HOST __host__ -#else -#define HOSTDEVICE -#define DEVICE -#define HOST -#endif +#include "paddle/pten/common/bfloat16.h" namespace paddle { namespace platform { - -struct PADDLE_ALIGN(2) bfloat16 { - public: - uint16_t x; - - // Constructors - bfloat16() = default; - bfloat16(const bfloat16& o) = default; - bfloat16& operator=(const bfloat16& o) = default; - bfloat16(bfloat16&& o) = default; - bfloat16& operator=(bfloat16&& o) = default; - ~bfloat16() = default; - - HOSTDEVICE inline explicit bfloat16(float val) { -#ifdef PADDLE_WITH_HIP - uint32_t res = 0; - uint32_t* tempRes; - // We should be using memcpy in order to respect the strict aliasing rule - // but it fails in the HIP environment. - tempRes = reinterpret_cast(&val); - res = *tempRes; - x = res >> 16; -#else -#if defined(PADDLE_CUDA_BF16) - __nv_bfloat16 tmp = __float2bfloat16(val); - x = *reinterpret_cast(&tmp); -#else - std::memcpy(&x, reinterpret_cast(&val) + 2, 2); -#endif -#endif - } - -#if defined(PADDLE_CUDA_BF16) - HOSTDEVICE inline explicit bfloat16(const __nv_bfloat16& val) { - x = *reinterpret_cast(&val); - } -#endif - - template - HOSTDEVICE inline explicit bfloat16(const T& val) - : x(bfloat16(static_cast(val)).x) {} - -// Assignment operators -#if defined(PADDLE_CUDA_BF16) - HOSTDEVICE inline bfloat16& operator=(const __nv_bfloat16& val) { - x = *reinterpret_cast(&val); - return *this; - } -#endif - - HOSTDEVICE inline bfloat16& operator=(bool b) { - x = b ? 
0x3f80 : 0; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(int8_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(uint8_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(int16_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(uint16_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(int32_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(uint32_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(int64_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(uint64_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(float val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(double val) { - x = bfloat16(val).x; - return *this; - } - - // Conversion opertors - HOSTDEVICE inline explicit operator float() const { -#ifdef PADDLE_WITH_HIP - uint32_t res = 0; - // We should be using memcpy in order to respect the strict aliasing rule - // but it fails in the HIP environment. - uint16_t temp = x; - uint16_t* temp_ptr = reinterpret_cast(&temp); - res = *temp_ptr; - return res; -#else -#ifdef PADDLE_CUDA_BF16 - return __bfloat162float(*reinterpret_cast(&x)); -#else - float val = 0.f; - uint16_t temp = x; - std::memcpy(reinterpret_cast(&val) + 2, - reinterpret_cast(&temp), 2); - return val; -#endif -#endif - } - -#ifdef PADDLE_CUDA_BF16 - HOSTDEVICE inline explicit operator __nv_bfloat16() const { - return *reinterpret_cast(&x); - } -#endif - - HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } - - HOSTDEVICE inline explicit operator int8_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint8_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int16_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int32_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator double() const { - return static_cast(static_cast(*this)); - } -}; - -HOSTDEVICE inline bfloat16 operator+(const bfloat16& a, const bfloat16& b) { - return bfloat16(static_cast(a) + static_cast(b)); -} - -HOSTDEVICE inline bfloat16 operator-(const bfloat16& a, const bfloat16& b) { - return bfloat16(static_cast(a) - static_cast(b)); -} - -HOSTDEVICE inline bfloat16 operator*(const bfloat16& a, const bfloat16& b) { - return bfloat16(static_cast(a) * static_cast(b)); -} - -HOSTDEVICE inline bfloat16 operator/(const bfloat16& a, const bfloat16& b) { - return bfloat16(static_cast(a) / static_cast(b)); -} - -HOSTDEVICE inline bfloat16 operator-(const bfloat16& a) { - bfloat16 res; - res.x = a.x ^ 0x8000; - return res; -} - -HOSTDEVICE inline bfloat16& operator+=(bfloat16& a, // NOLINT - const bfloat16& b) { - a = bfloat16(static_cast(a) + 
static_cast(b)); - return a; -} - -HOSTDEVICE inline bfloat16& operator-=(bfloat16& a, // NOLINT - const bfloat16& b) { - a = bfloat16(static_cast(a) - static_cast(b)); - return a; -} - -HOSTDEVICE inline bfloat16& operator*=(bfloat16& a, // NOLINT - const bfloat16& b) { - a = bfloat16(static_cast(a) * static_cast(b)); - return a; -} - -HOSTDEVICE inline bfloat16& operator/=(bfloat16& a, // NOLINT - const bfloat16& b) { - a = bfloat16(static_cast(a) / static_cast(b)); - return a; -} - -HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) { - bfloat16 res; - res.x = a; - return res; -} - -// Comparison operators -HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) { - return static_cast(a) == static_cast(b); -} - -HOSTDEVICE inline bool operator!=(const bfloat16& a, const bfloat16& b) { - return static_cast(a) != static_cast(b); -} - -HOSTDEVICE inline bool operator<(const bfloat16& a, const bfloat16& b) { - return static_cast(a) < static_cast(b); -} - -HOSTDEVICE inline bool operator<=(const bfloat16& a, const bfloat16& b) { - return static_cast(a) <= static_cast(b); -} - -HOSTDEVICE inline bool operator>(const bfloat16& a, const bfloat16& b) { - return static_cast(a) > static_cast(b); -} - -HOSTDEVICE inline bool operator>=(const bfloat16& a, const bfloat16& b) { - return static_cast(a) >= static_cast(b); -} - -HOSTDEVICE inline bool(isnan)(const bfloat16& a) { - return (a.x & 0x7FFF) > 0x7F80; -} - -HOSTDEVICE inline bool(isinf)(const bfloat16& a) { - return (a.x & 0x7F80) == 0x7F80; -} - -HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { - return !((isnan)(a)) && !((isinf)(a)); -} - -inline std::ostream& operator<<(std::ostream& os, const bfloat16& a) { - os << a.x; - return os; -} - +using bfloat16 = pten::dtype::bfloat16; +using namespace pten::dtype; // NOLINT } // namespace platform } // namespace paddle - -namespace std { - -template <> -struct is_pod { - static const bool value = - is_trivial::value && - is_standard_layout::value; -}; - -template <> -struct is_floating_point - : std::integral_constant< - bool, std::is_same::type>::value> {}; -template <> -struct is_signed { - static const bool value = true; -}; - -template <> -struct is_unsigned { - static const bool value = false; -}; - -inline bool isnan(const paddle::platform::bfloat16& a) { - return paddle::platform::isnan(a); -} - -inline bool isinf(const paddle::platform::bfloat16& a) { - return paddle::platform::isinf(a); -} - -template <> -struct numeric_limits { - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = true; - static const bool has_quiet_NaN = true; - static const bool has_signaling_NaN = true; - static const float_denorm_style has_denorm = denorm_present; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_to_nearest; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 8; - static const int digits10 = 2; - static const int max_digits10 = 9; - static const int radix = 2; - static const int min_exponent = -125; - static const int min_exponent10 = -37; - static const int max_exponent = 128; - static const int max_exponent10 = 38; - static const bool traps = true; - static const bool tinyness_before = false; - - static paddle::platform::bfloat16(min)() { - return 
paddle::platform::raw_uint16_to_bfloat16(0x007f); - } - static paddle::platform::bfloat16 lowest() { - return paddle::platform::raw_uint16_to_bfloat16(0xff7f); - } - static paddle::platform::bfloat16(max)() { - return paddle::platform::raw_uint16_to_bfloat16(0x7f7f); - } - static paddle::platform::bfloat16 epsilon() { - return paddle::platform::raw_uint16_to_bfloat16(0x3400); - } - static paddle::platform::bfloat16 round_error() { - return paddle::platform::bfloat16(0.5); - } - static paddle::platform::bfloat16 infinity() { - return paddle::platform::raw_uint16_to_bfloat16(0x7f80); - } - static paddle::platform::bfloat16 quiet_NaN() { - return paddle::platform::raw_uint16_to_bfloat16(0xffc1); - } - static paddle::platform::bfloat16 signaling_NaN() { - return paddle::platform::raw_uint16_to_bfloat16(0xff81); - } - static paddle::platform::bfloat16 denorm_min() { - return paddle::platform::raw_uint16_to_bfloat16(0x0001); - } -}; - -} // namespace std diff --git a/paddle/fluid/platform/complex.h b/paddle/fluid/platform/complex.h index e50b741338..de12bc55a5 100644 --- a/paddle/fluid/platform/complex.h +++ b/paddle/fluid/platform/complex.h @@ -14,536 +14,14 @@ #pragma once -#include - -#include -#include -#include -#include -#ifdef PADDLE_WITH_CUDA -#include -#include -#endif // PADDLE_WITH_CUDA - -#ifdef PADDLE_WITH_HIP -#include -#include // NOLINT -#endif - -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif - -#if (defined(__CUDACC__) || defined(__HIPCC__)) -#define HOSTDEVICE __host__ __device__ -#define DEVICE __device__ -#define HOST __host__ -#else -#define HOSTDEVICE -#define DEVICE -#define HOST -#endif - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -// todo -#define PADDLE_WITH_CUDA_OR_HIP_COMPLEX -#endif +#include "paddle/pten/common/complex.h" namespace paddle { namespace platform { - -template -struct PADDLE_ALIGN(sizeof(T) * 2) complex { - public: - T real; - T imag; - - using value_type = T; - - complex() = default; - complex(const complex& o) = default; - complex& operator=(const complex& o) = default; - complex(complex&& o) = default; - complex& operator=(complex&& o) = default; - ~complex() = default; - - HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - - template - HOSTDEVICE inline explicit complex(const thrust::complex& c) { - real = c.real(); - imag = c.imag(); - } - - template - HOSTDEVICE inline explicit operator thrust::complex() const { - return thrust::complex(real, imag); - } - -#ifdef PADDLE_WITH_HIP - HOSTDEVICE inline explicit operator hipFloatComplex() const { - return make_hipFloatComplex(real, imag); - } - - HOSTDEVICE inline explicit operator hipDoubleComplex() const { - return make_hipDoubleComplex(real, imag); - } -#else - HOSTDEVICE inline explicit operator cuFloatComplex() const { - return make_cuFloatComplex(real, imag); - } - - HOSTDEVICE inline explicit operator cuDoubleComplex() const { - return make_cuDoubleComplex(real, imag); - } -#endif -#endif - - template ::value || - std::is_integral::value, - int>::type = 0> - HOSTDEVICE complex(const T1& val) { - real = static_cast(val); - imag = static_cast(0.0); - } - - template - HOSTDEVICE explicit complex( - const std::enable_if_t::value, complex>& - val) { - real = val.real; - imag = val.imag; - } - - template - HOSTDEVICE explicit complex( - const std::enable_if_t::value, complex>& - val) { - real = val.real; - imag = val.imag; - 
} - - template - HOSTDEVICE inline explicit operator std::complex() const { - return static_cast>(std::complex(real, imag)); - } - - template - HOSTDEVICE complex(const std::complex& val) - : real(val.real()), imag(val.imag()) {} - - template ::value || - std::is_integral::value, - int>::type = 0> - HOSTDEVICE inline complex& operator=(const T1& val) { - real = static_cast(val); - imag = static_cast(0.0); - return *this; - } - - HOSTDEVICE inline explicit operator bool() const { - return static_cast(this->real) || static_cast(this->imag); - } - - HOSTDEVICE inline explicit operator int8_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint8_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int16_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int32_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator float() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator double() const { - return static_cast(this->real); - } -}; - -template -HOSTDEVICE inline complex operator+(const complex& a, - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::complex(a) + thrust::complex(b)); -#else - return complex(a.real + b.real, a.imag + b.imag); -#endif -} - -template -HOSTDEVICE inline complex operator-(const complex& a, - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::complex(a) - thrust::complex(b)); -#else - return complex(a.real - b.real, a.imag - b.imag); -#endif -} - -template -HOSTDEVICE inline complex operator*(const complex& a, - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::complex(a) * thrust::complex(b)); -#else - return complex(a.real * b.real - a.imag * b.imag, - a.imag * b.real + b.imag * a.real); -#endif -} - -template -HOSTDEVICE inline complex operator/(const complex& a, - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::complex(a) / thrust::complex(b)); -#else - T denominator = b.real * b.real + b.imag * b.imag; - return complex((a.real * b.real + a.imag * b.imag) / denominator, - (a.imag * b.real - a.real * b.imag) / denominator); -#endif -} - -template -HOSTDEVICE inline complex operator-(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(-thrust::complex(a.real, a.imag)); -#else - complex res; - res.real = -a.real; - res.imag = -a.imag; - return res; -#endif -} - -template -HOSTDEVICE inline complex& operator+=(complex& a, // NOLINT - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex(thrust::complex(a.real, a.imag) += - thrust::complex(b.real, b.imag)); - return a; 
-#else - a.real += b.real; - a.imag += b.imag; - return a; -#endif -} - -template -HOSTDEVICE inline complex& operator-=(complex& a, // NOLINT - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex(thrust::complex(a.real, a.imag) -= - thrust::complex(b.real, b.imag)); - return a; -#else - a.real -= b.real; - a.imag -= b.imag; - return a; -#endif -} - -template -HOSTDEVICE inline complex& operator*=(complex& a, // NOLINT - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex(thrust::complex(a.real, a.imag) *= - thrust::complex(b.real, b.imag)); - return a; -#else - a.real = a.real * b.real - a.imag * b.imag; - a.imag = a.imag * b.real + b.imag * a.real; - return a; -#endif -} - -template -HOSTDEVICE inline complex& operator/=(complex& a, // NOLINT - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex(thrust::complex(a.real, a.imag) /= - thrust::complex(b.real, b.imag)); - return a; -#else - T denominator = b.real * b.real + b.imag * b.imag; - a.real = (a.real * b.real + a.imag * b.imag) / denominator; - a.imag = (a.imag * b.real - a.real * b.imag) / denominator; - return a; -#endif -} - template -HOSTDEVICE inline complex raw_uint16_to_complex64(uint16_t a) { - complex res; - res.real = a; - res.imag = 0.0; - return res; -} +using complex = pten::dtype::complex; -template -HOSTDEVICE inline bool operator==(const complex& a, const complex& b) { - return a.real == b.real && a.imag == b.imag; -} - -template -HOSTDEVICE inline bool operator!=(const complex& a, const complex& b) { - return a.real != b.real || a.imag != b.imag; -} - -template -HOSTDEVICE inline bool operator<(const complex& a, const complex& b) { - return a.real < b.real; -} - -template -HOSTDEVICE inline bool operator<=(const complex& a, const complex& b) { - return a.real <= b.real; -} - -template -HOSTDEVICE inline bool operator>(const complex& a, const complex& b) { - return a.real > b.real; -} - -template -HOSTDEVICE inline bool operator>=(const complex& a, const complex& b) { - return a.real >= b.real; -} - -template -HOSTDEVICE inline complex(max)(const complex& a, const complex& b) { - return (a.real >= b.real) ? a : b; -} - -template -HOSTDEVICE inline complex(min)(const complex& a, const complex& b) { - return (a.real < b.real) ? 
a : b; -} - -template -HOSTDEVICE inline bool(isnan)(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return ::isnan(a.real) || ::isnan(a.imag); -#else - return std::isnan(a.real) || std::isnan(a.imag); -#endif -} - -template -HOSTDEVICE inline bool isinf(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return ::isinf(a.real) || ::isinf(a.imag); -#else - return std::isinf(a.real) || std::isinf(a.imag); -#endif -} - -template -HOSTDEVICE inline bool isfinite(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return ::isfinite(a.real) || ::isfinite(a.imag); -#else - return std::isfinite(a.real) || std::isfinite(a.imag); -#endif -} - -template -HOSTDEVICE inline T abs(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return thrust::abs(thrust::complex(a)); -#else - return std::abs(std::complex(a)); -#endif -} - -template -HOSTDEVICE inline T arg(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return thrust::arg(thrust::complex(a)); -#else - return std::arg(std::complex(a)); -#endif -} - -template -HOSTDEVICE inline complex pow(const complex& a, const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::pow(thrust::complex(a), thrust::complex(b))); -#else - return complex(std::pow(std::complex(a), std::complex(b))); -#endif -} - -template -HOSTDEVICE inline complex sqrt(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::sqrt(thrust::complex(a))); -#else - return complex(std::sqrt(std::complex(a))); -#endif -} - -template -HOSTDEVICE inline complex tanh(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::tanh(thrust::complex(a))); -#else - return complex(std::tanh(std::complex(a))); -#endif -} - -template -HOSTDEVICE inline complex log(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::log(thrust::complex(a))); -#else - return complex(std::log(std::complex(a))); -#endif -} - -template -inline std::ostream& operator<<(std::ostream& os, const complex& a) { - os << "real:" << a.real << " imag:" << a.imag; - return os; -} +using namespace pten::dtype; // NOLINT } // namespace platform } // namespace paddle - -namespace std { - -template -struct is_pod> { - static const bool value = true; -}; - -template -struct is_floating_point> - : std::integral_constant {}; - -template -struct is_signed> { - static const bool value = false; -}; - -template -struct is_unsigned> { - static const bool value = false; -}; - -template -inline bool isnan(const paddle::platform::complex& a) { - return paddle::platform::isnan(a); -} - -template -inline bool isinf(const paddle::platform::complex& a) { - return paddle::platform::isinf(a); -} - -template -struct numeric_limits> { - static const bool is_specialized = false; - static const bool is_signed = false; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = false; - static const bool 
has_quiet_NaN = false; - static const bool has_signaling_NaN = false; - static const float_denorm_style has_denorm = denorm_absent; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_toward_zero; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 0; - static const int digits10 = 0; - static const int max_digits10 = 0; - static const int radix = 0; - static const int min_exponent = 0; - static const int min_exponent10 = 0; - static const int max_exponent = 0; - static const int max_exponent10 = 0; - static const bool traps = false; - static const bool tinyness_before = false; - - static paddle::platform::complex(min)() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex lowest() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex(max)() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex epsilon() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex round_error() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex infinity() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex quiet_NaN() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex signaling_NaN() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex denorm_min() { - return paddle::platform::complex(0.0, 0.0); - } -}; - -} // namespace std diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h index 2bcdbaa201..f6b11b3267 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h @@ -23,12 +23,6 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/macros.h" -namespace paddle { -namespace platform { -struct float16; -} // namespace platform -} // namespace paddle - DECLARE_bool(cudnn_deterministic); namespace paddle { diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h index bd8d05f812..34b9d57e05 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h @@ -27,12 +27,6 @@ limitations under the License. */ // MIOPEN do not have epslion definition #define CUDNN_BN_MIN_EPSILON 1e-05 -namespace paddle { -namespace platform { -struct float16; -} // namespace platform -} // namespace paddle - DECLARE_bool(cudnn_deterministic); namespace paddle { diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index b6d088421a..3d9fa994b6 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -14,1073 +14,11 @@ limitations under the License. 
*/ #pragma once -#include - -#include -#include -#include - -#ifdef PADDLE_WITH_CUDA -#include -#endif // PADDLE_WITH_CUDA - -#ifdef PADDLE_WITH_HIP -#include -#endif - -#if defined(__CUDACC__) && CUDA_VERSION >= 7050 -#define PADDLE_CUDA_FP16 -#include -#endif - -#ifdef __HIPCC__ -#define PADDLE_CUDA_FP16 -#include -#endif - -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif - -#define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600) - -#if (defined(__CUDACC__) || defined(__HIPCC__)) -#define HOSTDEVICE __host__ __device__ -#define DEVICE __device__ -#define HOST __host__ -#else -#define HOSTDEVICE -#define DEVICE -#define HOST -#endif +#include "paddle/pten/common/float16.h" namespace paddle { namespace platform { - -// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated -// and aligned at least on a 2-byte boundary, which leads to efficient -// memory access of float16 struct and also makes float16 compatible -// with CUDA half, ARM float16_t data types. -struct PADDLE_ALIGN(2) float16 { - public: - uint16_t x; - - // The following defaulted special class member functions - // are added to make float16 pass the std::is_trivial test - float16() = default; - float16(const float16& o) = default; - float16& operator=(const float16& o) = default; - float16(float16&& o) = default; - float16& operator=(float16&& o) = default; - ~float16() = default; - -// Constructors -#ifdef PADDLE_CUDA_FP16 - HOSTDEVICE inline explicit float16(const half& h) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 - x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; -#else - x = h.x; -#endif // CUDA_VERSION >= 9000 -#endif - } -#endif // PADDLE_CUDA_FP16 - -#ifdef PADDLE_WITH_NATIVE_FP16 - // __fp16 is a native half precision data type for arm cpu, - // float16_t is an alias for __fp16 - HOSTDEVICE inline explicit float16(const float16_t& h) { - x = *reinterpret_cast(&h); - } -#endif - - HOSTDEVICE inline explicit float16(float val) { -#if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) - half tmp = __float2half(val); - x = *reinterpret_cast(&tmp); - -#elif defined(PADDLE_WITH_NATIVE_FP16) - float32x4_t tmp = vld1q_dup_f32(&val); - float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0); - x = *reinterpret_cast(&res); - -#elif defined(__F16C__) - x = _cvtss_sh(val, 0); - -#else - // Conversion routine adapted from - // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion - Bits v, s; - v.f = val; - uint32_t sign = v.si & sigN; - v.si ^= sign; - sign >>= shiftSign; // logical shift - s.si = mulN; - s.si = s.f * v.f; // correct subnormals - v.si ^= (s.si ^ v.si) & -(minN > v.si); - v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); - v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); - v.ui >>= shift; // logical shift - v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); - v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); - x = v.ui | sign; - -#endif - } - - HOSTDEVICE inline explicit float16(bool b) : x(b ? 
0x3c00 : 0) {} - - template - HOSTDEVICE inline explicit float16(const T& val) - : x(float16(static_cast(val)).x) {} - -// Assignment operators -#ifdef PADDLE_CUDA_FP16 - HOSTDEVICE inline float16& operator=(const half& rhs) { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 - x = reinterpret_cast<__half_raw*>(const_cast(&rhs))->x; -#else - x = rhs.x; -#endif - return *this; - } -#endif - -#ifdef PADDLE_WITH_NATIVE_FP16 - HOSTDEVICE inline float16& operator=(const float16_t& rhs) { - x = *reinterpret_cast(&rhs); - return *this; - } -#endif - - HOSTDEVICE inline float16& operator=(bool b) { - x = b ? 0x3c00 : 0; - return *this; - } - - HOSTDEVICE inline float16& operator=(int8_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(uint8_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(int16_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(uint16_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(int32_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(uint32_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(int64_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(uint64_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(float val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(double val) { - x = float16(val).x; - return *this; - } - -// Conversion opertors -#ifdef PADDLE_CUDA_FP16 - HOSTDEVICE inline half to_half() const { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 - __half_raw h; - h.x = x; - return half(h); -#else - half h; - h.x = x; - return h; -#endif // CUDA_VERSION >= 9000 - } -#endif // PADDLE_CUDA_FP16 - -#ifdef PADDLE_WITH_NATIVE_FP16 - HOSTDEVICE inline explicit operator float16_t() const { - return *reinterpret_cast(this); - } -#endif - - HOSTDEVICE inline operator float() const { -#if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) - half tmp = *reinterpret_cast(this); - return __half2float(tmp); - -#elif defined(PADDLE_WITH_NATIVE_FP16) - float16x4_t res = vld1_dup_f16(reinterpret_cast(this)); - return vgetq_lane_f32(vcvt_f32_f16(res), 0); - -#elif defined(__F16C__) - return _cvtsh_ss(this->x); - -#else - // Conversion routine adapted from - // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion - Bits v; - v.ui = this->x; - int32_t sign = v.si & sigC; - v.si ^= sign; - sign <<= shiftSign; - v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); - v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); - Bits s; - s.si = mulC; - s.f *= v.si; - int32_t mask = -(norC > v.si); - v.si <<= shift; - v.si ^= (s.si ^ v.si) & mask; - v.si |= sign; - return v.f; - -#endif - } - - HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } - - HOSTDEVICE inline explicit operator int8_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint8_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int16_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int32_t() const { - return 
static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline operator double() const { - return static_cast(static_cast(*this)); - } - - private: - union Bits { - float f; - int32_t si; - uint32_t ui; - }; - - static const int shift = 13; - static const int shiftSign = 16; - - static const int32_t infN = 0x7F800000; - static const int32_t maxN = 0x477FE000; // max flt16 as flt32 - static const int32_t minN = 0x38800000; // min flt16 normal as flt32 - static const int32_t sigN = 0x80000000; // sign bit - - static constexpr int32_t infC = infN >> shift; - static constexpr int32_t nanN = (infC + 1) - << shift; // minimum flt16 nan as float32 - static constexpr int32_t maxC = maxN >> shift; - static constexpr int32_t minC = minN >> shift; - static constexpr int32_t sigC = sigN >> shiftSign; - - static const int32_t mulN = 0x52000000; // (1 << 23) / minN - static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) - static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted - static const int32_t norC = 0x00400; // min flt32 normal downshifted - - static constexpr int32_t maxD = infC - maxC - 1; - static constexpr int32_t minD = minC - subC - 1; -}; - -// Arithmetic operators on GPU -// CUDA 9.0 provides built-in arithmetic operators for half while -// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are -// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in -// CUDA 9.0 regarding the half data type. 
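// Note on the removed block below: each raw-half operator uses the CUDA half
// intrinsics (__hadd, __hsub, __hmul, __hneg, __heq, ...) only when compiling
// device code for compute capability >= 5.3. Otherwise the arithmetic
// operators fall back to converting both operands to float through float16,
// computing in float, and converting back with float16::to_half(), while the
// comparisons fall back to comparing the float values.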
-// ROCM has built-in arithmetic operators as not defined -// __HIP_NO_HALF_OPERATORS__ -#if defined(PADDLE_CUDA_FP16) && !defined(__HIPCC__) && CUDA_VERSION < 9000 -DEVICE inline half operator+(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hadd(a, b); -#else - float res = static_cast(float16(a)) + static_cast(float16(b)); - return float16(res).to_half(); -#endif -} - -DEVICE inline half operator-(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hsub(a, b); -#else - float res = static_cast(float16(a)) - static_cast(float16(b)); - return float16(res).to_half(); -#endif -} - -DEVICE inline half operator*(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hmul(a, b); -#else - float res = static_cast(float16(a)) * static_cast(float16(b)); - return float16(res).to_half(); -#endif -} - -DEVICE inline half operator/(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - float num = __half2float(a); - float denom = __half2float(b); - return __float2half(num / denom); -#else - float res = static_cast(float16(a)) / static_cast(float16(b)); - return float16(res).to_half(); -#endif -} - -DEVICE inline half operator-(const half& a) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hneg(a); -#else - float res = -static_cast(float16(a)); - return float16(res).to_half(); -#endif -} - -#ifndef PADDLE_WITH_HIP // not defined __HIP_NO_HALF_OPERATORS__ -DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT - a = a + b; - return a; -} - -DEVICE inline half& operator-=(half& a, const half& b) { // NOLINT - a = a - b; - return a; -} - -DEVICE inline half& operator*=(half& a, const half& b) { // NOLINT - a = a * b; - return a; -} - -DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT - a = a / b; - return a; -} -#endif - -DEVICE inline bool operator==(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __heq(a, b); -#else - return static_cast(float16(a)) == static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator!=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hne(a, b); -#else - return static_cast(float16(a)) != static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator<(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hlt(a, b); -#else - return static_cast(float16(a)) < static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator<=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hle(a, b); -#else - return static_cast(float16(a)) <= static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator>(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hgt(a, b); -#else - return static_cast(float16(a)) > static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator>=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hge(a, b); -#else - return static_cast(float16(a)) >= static_cast(float16(b)); -#endif -} - -#endif // PADDLE_CUDA_FP16 - -// Arithmetic operators for float16 on GPU -#if defined(PADDLE_CUDA_FP16) -// HIPCC has compile error if call __device__ function __hadd, __hsub, etc. 
-// in __host__ __device__ function -#if defined(__HIPCC__) -DEVICE inline float16 operator+(const float16& a, const float16& b) { - return float16(__hadd(a.to_half(), b.to_half())); -} -HOST inline float16 operator+(const float16& a, const float16& b) { - return float16(static_cast(a) + static_cast(b)); -} -#else -HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hadd(a.to_half(), b.to_half())); -#else - return float16(static_cast(a) + static_cast(b)); -#endif -} -#endif - -#if defined(__HIPCC__) -DEVICE inline float16 operator-(const float16& a, const float16& b) { - return float16(__hsub(a.to_half(), b.to_half())); -} -HOST inline float16 operator-(const float16& a, const float16& b) { - return float16(static_cast(a) - static_cast(b)); -} -#else -HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hsub(a.to_half(), b.to_half())); -#else - return float16(static_cast(a) - static_cast(b)); -#endif -} -#endif - -#if defined(__HIPCC__) -DEVICE inline float16 operator*(const float16& a, const float16& b) { - return float16(__hmul(a.to_half(), b.to_half())); -} -HOST inline float16 operator*(const float16& a, const float16& b) { - return float16(static_cast(a) * static_cast(b)); -} -#else -HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hmul(a.to_half(), b.to_half())); -#else - return float16(static_cast(a) * static_cast(b)); -#endif -} -#endif - -#if defined(__HIPCC__) -DEVICE inline float16 operator/(const float16& a, const float16& b) { - return float16(__hdiv(a.to_half(), b.to_half())); -} -HOST inline float16 operator/(const float16& a, const float16& b) { - return float16(static_cast(a) / static_cast(b)); -} -#else -HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - // TODO(kexinzhao): check which cuda version starts to support __hdiv - float num = __half2float(a.to_half()); - float denom = __half2float(b.to_half()); - return float16(num / denom); -#else - return float16(static_cast(a) / static_cast(b)); -#endif -} -#endif - -#if defined(__HIPCC__) -DEVICE inline float16 operator-(const float16& a) { - return float16(__hneg(a.to_half())); -} -HOST inline float16 operator-(const float16& a) { - float16 res; - res.x = a.x ^ 0x8000; - return res; -} -#else -HOSTDEVICE inline float16 operator-(const float16& a) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hneg(a.to_half())); -#else - float16 res; - res.x = a.x ^ 0x8000; - return res; -#endif -} -#endif - -HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { // NOLINT - a = a + b; - return a; -} - -HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { // NOLINT - a = a - b; - return a; -} - -HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { // NOLINT - a = a * b; - return a; -} - -HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT - a = a / b; - return a; -} - -// HIPCC has compile error if call __device__ function __heq, __hne, etc. 
-// in __host__ __device__ function -#if defined(__HIPCC__) -DEVICE inline bool operator==(const float16& a, const float16& b) { - return __heq(a.to_half(), b.to_half()); -} -HOST inline bool operator==(const float16& a, const float16& b) { - return static_cast(a) == static_cast(b); -} -#else // __HIPCC__ -HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __heq(a.to_half(), b.to_half()); -#else - return static_cast(a) == static_cast(b); -#endif -} -#endif // __HIPCC__ - -#if defined(__HIPCC__) -DEVICE inline bool operator!=(const float16& a, const float16& b) { - return __hne(a.to_half(), b.to_half()); -} -HOST inline bool operator!=(const float16& a, const float16& b) { - return static_cast(a) != static_cast(b); -} -#else // __HIPCC__ -HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hne(a.to_half(), b.to_half()); -#else - return static_cast(a) != static_cast(b); -#endif -} -#endif // __HIPCC__ - -#if defined(__HIPCC__) -DEVICE inline bool operator<(const float16& a, const float16& b) { - return __hlt(a.to_half(), b.to_half()); -} -HOST inline bool operator<(const float16& a, const float16& b) { - return static_cast(a) < static_cast(b); -} -#else // __HIPCC__ -HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hlt(a.to_half(), b.to_half()); -#else - return static_cast(a) < static_cast(b); -#endif -} -#endif // __HIPCC__ - -#if defined(__HIPCC__) -DEVICE inline bool operator<=(const float16& a, const float16& b) { - return __hle(a.to_half(), b.to_half()); -} -HOST inline bool operator<=(const float16& a, const float16& b) { - return static_cast(a) <= static_cast(b); -} -#else // __HIPCC__ -HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hle(a.to_half(), b.to_half()); -#else - return static_cast(a) <= static_cast(b); -#endif -} -#endif // __HIPCC__ - -#if defined(__HIPCC__) -DEVICE inline bool operator>(const float16& a, const float16& b) { - return __hgt(a.to_half(), b.to_half()); -} -HOST inline bool operator>(const float16& a, const float16& b) { - return static_cast(a) > static_cast(b); -} -#else // __HIPCC__ -HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hgt(a.to_half(), b.to_half()); -#else - return static_cast(a) > static_cast(b); -#endif -} -#endif // __HIPCC__ - -#if defined(__HIPCC__) -DEVICE inline bool operator>=(const float16& a, const float16& b) { - return __hge(a.to_half(), b.to_half()); -} -HOST inline bool operator>=(const float16& a, const float16& b) { - return static_cast(a) >= static_cast(b); -} -#else // __HIPCC__ -HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hge(a.to_half(), b.to_half()); -#else - return static_cast(a) >= static_cast(b); -#endif -} -#endif // __HIPCC__ - -// Arithmetic operators for float16 on ARMv8.2-A CPU -#elif defined(PADDLE_WITH_NATIVE_FP16) -inline float16 operator+(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fadd h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] 
"r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 operator-(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fsub h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 operator*(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fmul h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 operator/(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fdiv h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 operator-(const float16& a) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "fneg h0, h0\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0"); - return res; -} - -inline float16& operator+=(float16& a, const float16& b) { // NOLINT - a = a + b; - return a; -} - -inline float16& operator-=(float16& a, const float16& b) { // NOLINT - a = a - b; - return a; -} - -inline float16& operator*=(float16& a, const float16& b) { // NOLINT - a = a * b; - return a; -} - -inline float16& operator/=(float16& a, const float16& b) { // NOLINT - a = a / b; - return a; -} - -inline bool operator==(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fcmeq h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } - -inline bool operator<(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v1.h}[0], [%[a_ptr]]\n" - "ld1 {v0.h}[0], [%[b_ptr]]\n" - "fcmgt h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator<=(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v1.h}[0], [%[a_ptr]]\n" - "ld1 {v0.h}[0], [%[b_ptr]]\n" - "fcmge h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator>(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fcmgt h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator>=(const float16& 
a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fcmge h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -// Arithmetic operators for float16, software emulated on other CPU -#else -inline float16 operator+(const float16& a, const float16& b) { - return float16(static_cast(a) + static_cast(b)); -} - -inline float16 operator-(const float16& a, const float16& b) { - return float16(static_cast(a) - static_cast(b)); -} - -inline float16 operator*(const float16& a, const float16& b) { - return float16(static_cast(a) * static_cast(b)); -} - -inline float16 operator/(const float16& a, const float16& b) { - return float16(static_cast(a) / static_cast(b)); -} - -inline float16 operator-(const float16& a) { - float16 res; - res.x = a.x ^ 0x8000; - return res; -} - -inline float16& operator+=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) + static_cast(b)); - return a; -} - -inline float16& operator-=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) - static_cast(b)); - return a; -} - -inline float16& operator*=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) * static_cast(b)); - return a; -} - -inline float16& operator/=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) / static_cast(b)); - return a; -} - -inline bool operator==(const float16& a, const float16& b) { - return static_cast(a) == static_cast(b); -} - -inline bool operator!=(const float16& a, const float16& b) { - return static_cast(a) != static_cast(b); -} - -inline bool operator<(const float16& a, const float16& b) { - return static_cast(a) < static_cast(b); -} - -inline bool operator<=(const float16& a, const float16& b) { - return static_cast(a) <= static_cast(b); -} - -inline bool operator>(const float16& a, const float16& b) { - return static_cast(a) > static_cast(b); -} - -inline bool operator>=(const float16& a, const float16& b) { - return static_cast(a) >= static_cast(b); -} -#endif - -HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) { - float16 res; - res.x = a; - return res; -} - -// HIPCC has compile error if call __device__ function __hisnan in __host__ -// __device__ function -#if defined(PADDLE_CUDA_FP16) && defined(__HIPCC__) -DEVICE inline bool(isnan)(const float16& a) { return __hisnan(a.to_half()); } -HOST inline bool(isnan)(const float16& a) { return (a.x & 0x7fff) > 0x7c00; } -#else -HOSTDEVICE inline bool(isnan)(const float16& a) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hisnan(a.to_half()); -#else - return (a.x & 0x7fff) > 0x7c00; -#endif -} -#endif - -HOSTDEVICE inline bool(isinf)(const float16& a) { - return (a.x & 0x7fff) == 0x7c00; -} - -HOSTDEVICE inline bool(isfinite)(const float16& a) { - return !((isnan)(a)) && !((isinf)(a)); -} - -HOSTDEVICE inline float16(abs)(const float16& a) { -#if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)) - return float16(::fabs(static_cast(a))); -#else - return float16(std::abs(static_cast(a))); -#endif -} - -inline std::ostream& operator<<(std::ostream& os, const float16& a) { - os << static_cast(a); - return os; -} - +using float16 = pten::dtype::float16; +using namespace pten::dtype; // NOLINT } // namespace platform } // 
namespace paddle - -namespace std { - -// Override the std::is_pod::value for float16 -// The reason is that different compilers implemented std::is_pod based on -// different C++ standards. float16 class is a plain old data in C++11 given -// that it is both trivial and standard_layout. -// However, std::is_pod in nvcc 8.0 host c++ compiler follows C++0x and is -// more restricted in that you cannot provide any customized -// constructor in float16. Hence, we override is_pod here following C++11 -// so that .cu files can be successfully compiled by nvcc. -template <> -struct is_pod { - static const bool value = - is_trivial::value && - is_standard_layout::value; -}; - -template <> -struct is_floating_point - : std::integral_constant< - bool, std::is_same::type>::value> {}; -template <> -struct is_signed { - static const bool value = true; -}; - -template <> -struct is_unsigned { - static const bool value = false; -}; - -inline bool isnan(const paddle::platform::float16& a) { - return paddle::platform::isnan(a); -} - -inline bool isinf(const paddle::platform::float16& a) { - return paddle::platform::isinf(a); -} - -template <> -struct numeric_limits { - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = true; - static const bool has_quiet_NaN = true; - static const bool has_signaling_NaN = true; - static const float_denorm_style has_denorm = denorm_present; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_to_nearest; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 11; - static const int digits10 = 3; - static const int max_digits10 = 5; - static const int radix = 2; - static const int min_exponent = -13; - static const int min_exponent10 = -4; - static const int max_exponent = 16; - static const int max_exponent10 = 4; - static const bool traps = true; - static const bool tinyness_before = false; - - HOSTDEVICE static paddle::platform::float16(min)() { - return paddle::platform::raw_uint16_to_float16(0x400); - } - HOSTDEVICE static paddle::platform::float16 lowest() { - return paddle::platform::raw_uint16_to_float16(0xfbff); - } - HOSTDEVICE static paddle::platform::float16(max)() { - return paddle::platform::raw_uint16_to_float16(0x7bff); - } - HOSTDEVICE static paddle::platform::float16 epsilon() { - return paddle::platform::raw_uint16_to_float16(0x0800); - } - HOSTDEVICE static paddle::platform::float16 round_error() { - return paddle::platform::float16(0.5); - } - HOSTDEVICE static paddle::platform::float16 infinity() { - return paddle::platform::raw_uint16_to_float16(0x7c00); - } - HOSTDEVICE static paddle::platform::float16 quiet_NaN() { - return paddle::platform::raw_uint16_to_float16(0x7e00); - } - HOSTDEVICE static paddle::platform::float16 signaling_NaN() { - return paddle::platform::raw_uint16_to_float16(0x7e00); - } - HOSTDEVICE static paddle::platform::float16 denorm_min() { - return paddle::platform::raw_uint16_to_float16(0x1); - } -}; - -HOSTDEVICE inline paddle::platform::float16 abs( - const paddle::platform::float16& a) { - return paddle::platform::abs(a); -} - -} // namespace std diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt index a9b7c7581b..9b6e5d70cd 100644 --- a/paddle/pten/CMakeLists.txt +++ b/paddle/pten/CMakeLists.txt @@ -1,8 +1,3 @@ -# Adapt to 
custom op mechanism: Include the header files related to the data type -# to avoid exposing the path of the underlying file, remove it after moving -# float16.h/complex.h/bfloat16.h into pten -include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) - # paddle experimental common components add_subdirectory(common) diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc index e5dd1ca5f8..cb70d26f94 100644 --- a/paddle/pten/api/lib/tensor.cc +++ b/paddle/pten/api/lib/tensor.cc @@ -49,11 +49,11 @@ limitations under the License. */ */ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/cuda_stream.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" namespace paddle { namespace experimental { diff --git a/paddle/pten/common/bfloat16.h b/paddle/pten/common/bfloat16.h new file mode 100644 index 0000000000..0d9db3dbe9 --- /dev/null +++ b/paddle/pten/common/bfloat16.h @@ -0,0 +1,409 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include + +#ifdef PADDLE_WITH_CUDA +#include +#endif + +#if defined(__CUDACC__) && CUDA_VERSION >= 11000 +#define PADDLE_CUDA_BF16 +#include +#endif + +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#if (defined(__CUDACC__) || defined(__HIPCC__)) +#define HOSTDEVICE __host__ __device__ +#define DEVICE __device__ +#define HOST __host__ +#else +#define HOSTDEVICE +#define DEVICE +#define HOST +#endif + +namespace pten { +namespace dtype { + +struct PADDLE_ALIGN(2) bfloat16 { + public: + uint16_t x; + + // Constructors + bfloat16() = default; + bfloat16(const bfloat16& o) = default; + bfloat16& operator=(const bfloat16& o) = default; + bfloat16(bfloat16&& o) = default; + bfloat16& operator=(bfloat16&& o) = default; + ~bfloat16() = default; + + HOSTDEVICE inline explicit bfloat16(float val) { +#ifdef PADDLE_WITH_HIP + uint32_t res = 0; + uint32_t* tempRes; + // We should be using memcpy in order to respect the strict aliasing rule + // but it fails in the HIP environment. 
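// Reinterpreting the float's bits and shifting right by 16 keeps the upper
// half of the IEEE-754 representation, i.e. a truncating float -> bfloat16
// conversion; the non-CUDA fallback further below copies the same two
// high-order bytes with std::memcpy on little-endian targets.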
+ tempRes = reinterpret_cast(&val); + res = *tempRes; + x = res >> 16; +#else +#if defined(PADDLE_CUDA_BF16) + __nv_bfloat16 tmp = __float2bfloat16(val); + x = *reinterpret_cast(&tmp); +#else + std::memcpy(&x, reinterpret_cast(&val) + 2, 2); +#endif +#endif + } + +#if defined(PADDLE_CUDA_BF16) + HOSTDEVICE inline explicit bfloat16(const __nv_bfloat16& val) { + x = *reinterpret_cast(&val); + } +#endif + + template + HOSTDEVICE inline explicit bfloat16(const T& val) + : x(bfloat16(static_cast(val)).x) {} + +// Assignment operators +#if defined(PADDLE_CUDA_BF16) + HOSTDEVICE inline bfloat16& operator=(const __nv_bfloat16& val) { + x = *reinterpret_cast(&val); + return *this; + } +#endif + + HOSTDEVICE inline bfloat16& operator=(bool b) { + x = b ? 0x3f80 : 0; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int8_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint8_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int16_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint16_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int32_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint32_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int64_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint64_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(float val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(double val) { + x = bfloat16(val).x; + return *this; + } + + // Conversion opertors + HOSTDEVICE inline explicit operator float() const { +#ifdef PADDLE_WITH_HIP + uint32_t res = 0; + // We should be using memcpy in order to respect the strict aliasing rule + // but it fails in the HIP environment. 
+ uint16_t temp = x; + uint16_t* temp_ptr = reinterpret_cast(&temp); + res = *temp_ptr; + return res; +#else +#ifdef PADDLE_CUDA_BF16 + return __bfloat162float(*reinterpret_cast(&x)); +#else + float val = 0.f; + uint16_t temp = x; + std::memcpy( + reinterpret_cast(&val) + 2, reinterpret_cast(&temp), 2); + return val; +#endif +#endif + } + +#ifdef PADDLE_CUDA_BF16 + HOSTDEVICE inline explicit operator __nv_bfloat16() const { + return *reinterpret_cast(&x); + } +#endif + + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(static_cast(*this)); + } +}; + +HOSTDEVICE inline bfloat16 operator+(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) + static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator-(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) - static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator*(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) * static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator/(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) / static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator-(const bfloat16& a) { + bfloat16 res; + res.x = a.x ^ 0x8000; + return res; +} + +HOSTDEVICE inline bfloat16& operator+=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) + static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator-=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) - static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator*=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) * static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator/=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) / static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) { + bfloat16 res; + res.x = a; + return res; +} + +// Comparison operators +HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) { + return static_cast(a) == static_cast(b); +} + +HOSTDEVICE inline bool operator!=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) != static_cast(b); +} + +HOSTDEVICE inline bool operator<(const bfloat16& a, const bfloat16& b) { + return static_cast(a) < static_cast(b); +} + +HOSTDEVICE inline bool operator<=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) <= static_cast(b); +} + +HOSTDEVICE inline bool operator>(const bfloat16& a, const bfloat16& b) { + return static_cast(a) > static_cast(b); +} + +HOSTDEVICE inline bool operator>=(const 
bfloat16& a, const bfloat16& b) { + return static_cast(a) >= static_cast(b); +} + +HOSTDEVICE inline bool(isnan)(const bfloat16& a) { + return (a.x & 0x7FFF) > 0x7F80; +} + +HOSTDEVICE inline bool(isinf)(const bfloat16& a) { + return (a.x & 0x7F80) == 0x7F80; +} + +HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { + return !((isnan)(a)) && !((isinf)(a)); +} + +inline std::ostream& operator<<(std::ostream& os, const bfloat16& a) { + os << a.x; + return os; +} + +} // namespace dtype +} // namespace pten + +namespace std { + +template <> +struct is_pod { + static const bool value = is_trivial::value && + is_standard_layout::value; +}; + +template <> +struct is_floating_point + : std::integral_constant< + bool, + std::is_same< + pten::dtype::bfloat16, + typename std::remove_cv::type>::value> {}; +template <> +struct is_signed { + static const bool value = true; +}; + +template <> +struct is_unsigned { + static const bool value = false; +}; + +inline bool isnan(const pten::dtype::bfloat16& a) { + return pten::dtype::isnan(a); +} + +inline bool isinf(const pten::dtype::bfloat16& a) { + return pten::dtype::isinf(a); +} + +template <> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 8; + static const int digits10 = 2; + static const int max_digits10 = 9; + static const int radix = 2; + static const int min_exponent = -125; + static const int min_exponent10 = -37; + static const int max_exponent = 128; + static const int max_exponent10 = 38; + static const bool traps = true; + static const bool tinyness_before = false; + + static pten::dtype::bfloat16(min)() { + return pten::dtype::raw_uint16_to_bfloat16(0x007f); + } + static pten::dtype::bfloat16 lowest() { + return pten::dtype::raw_uint16_to_bfloat16(0xff7f); + } + static pten::dtype::bfloat16(max)() { + return pten::dtype::raw_uint16_to_bfloat16(0x7f7f); + } + static pten::dtype::bfloat16 epsilon() { + return pten::dtype::raw_uint16_to_bfloat16(0x3400); + } + static pten::dtype::bfloat16 round_error() { + return pten::dtype::bfloat16(0.5); + } + static pten::dtype::bfloat16 infinity() { + return pten::dtype::raw_uint16_to_bfloat16(0x7f80); + } + static pten::dtype::bfloat16 quiet_NaN() { + return pten::dtype::raw_uint16_to_bfloat16(0xffc1); + } + static pten::dtype::bfloat16 signaling_NaN() { + return pten::dtype::raw_uint16_to_bfloat16(0xff81); + } + static pten::dtype::bfloat16 denorm_min() { + return pten::dtype::raw_uint16_to_bfloat16(0x0001); + } +}; + +} // namespace std diff --git a/paddle/pten/common/complex.h b/paddle/pten/common/complex.h new file mode 100644 index 0000000000..cd6b081ede --- /dev/null +++ b/paddle/pten/common/complex.h @@ -0,0 +1,548 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_HIP +#include +#include // NOLINT +#endif + +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#if (defined(__CUDACC__) || defined(__HIPCC__)) +#define HOSTDEVICE __host__ __device__ +#define DEVICE __device__ +#define HOST __host__ +#else +#define HOSTDEVICE +#define DEVICE +#define HOST +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +// todo +#define PADDLE_WITH_CUDA_OR_HIP_COMPLEX +#endif + +namespace pten { +namespace dtype { + +template +struct PADDLE_ALIGN(sizeof(T) * 2) complex { + public: + T real; + T imag; + + using value_type = T; + + complex() = default; + complex(const complex& o) = default; + complex& operator=(const complex& o) = default; + complex(complex&& o) = default; + complex& operator=(complex&& o) = default; + ~complex() = default; + + HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + + template + HOSTDEVICE inline explicit complex(const thrust::complex& c) { + real = c.real(); + imag = c.imag(); + } + + template + HOSTDEVICE inline explicit operator thrust::complex() const { + return thrust::complex(real, imag); + } + +#ifdef PADDLE_WITH_HIP + HOSTDEVICE inline explicit operator hipFloatComplex() const { + return make_hipFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator hipDoubleComplex() const { + return make_hipDoubleComplex(real, imag); + } +#else + HOSTDEVICE inline explicit operator cuFloatComplex() const { + return make_cuFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator cuDoubleComplex() const { + return make_cuDoubleComplex(real, imag); + } +#endif +#endif + + template ::value || + std::is_integral::value, + int>::type = 0> + HOSTDEVICE complex(const T1& val) { + real = static_cast(val); + imag = static_cast(0.0); + } + + template + HOSTDEVICE explicit complex( + const std::enable_if_t::value, complex>& + val) { + real = val.real; + imag = val.imag; + } + + template + HOSTDEVICE explicit complex( + const std::enable_if_t::value, complex>& + val) { + real = val.real; + imag = val.imag; + } + + template + HOSTDEVICE inline explicit operator std::complex() const { + return static_cast>(std::complex(real, imag)); + } + + template + HOSTDEVICE complex(const std::complex& val) + : real(val.real()), imag(val.imag()) {} + + template ::value || + std::is_integral::value, + int>::type = 0> + HOSTDEVICE inline complex& operator=(const T1& val) { + real = static_cast(val); + imag = static_cast(0.0); + return *this; + } + + HOSTDEVICE inline explicit operator bool() const { + return static_cast(this->real) || static_cast(this->imag); + } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator 
int16_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator float() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(this->real); + } +}; + +template +HOSTDEVICE inline complex operator+(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) + thrust::complex(b)); +#else + return complex(a.real + b.real, a.imag + b.imag); +#endif +} + +template +HOSTDEVICE inline complex operator-(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) - thrust::complex(b)); +#else + return complex(a.real - b.real, a.imag - b.imag); +#endif +} + +template +HOSTDEVICE inline complex operator*(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) * thrust::complex(b)); +#else + return complex(a.real * b.real - a.imag * b.imag, + a.imag * b.real + b.imag * a.real); +#endif +} + +template +HOSTDEVICE inline complex operator/(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) / thrust::complex(b)); +#else + T denominator = b.real * b.real + b.imag * b.imag; + return complex((a.real * b.real + a.imag * b.imag) / denominator, + (a.imag * b.real - a.real * b.imag) / denominator); +#endif +} + +template +HOSTDEVICE inline complex operator-(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(-thrust::complex(a.real, a.imag)); +#else + complex res; + res.real = -a.real; + res.imag = -a.imag; + return res; +#endif +} + +template +HOSTDEVICE inline complex& operator+=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) += + thrust::complex(b.real, b.imag)); + return a; +#else + a.real += b.real; + a.imag += b.imag; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator-=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) -= + thrust::complex(b.real, b.imag)); + return a; +#else + a.real -= b.real; + a.imag -= b.imag; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator*=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) *= + thrust::complex(b.real, b.imag)); + return a; +#else + a.real = a.real * b.real - a.imag * b.imag; + a.imag 
= a.imag * b.real + b.imag * a.real; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator/=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) /= + thrust::complex(b.real, b.imag)); + return a; +#else + T denominator = b.real * b.real + b.imag * b.imag; + a.real = (a.real * b.real + a.imag * b.imag) / denominator; + a.imag = (a.imag * b.real - a.real * b.imag) / denominator; + return a; +#endif +} + +template +HOSTDEVICE inline complex raw_uint16_to_complex64(uint16_t a) { + complex res; + res.real = a; + res.imag = 0.0; + return res; +} + +template +HOSTDEVICE inline bool operator==(const complex& a, const complex& b) { + return a.real == b.real && a.imag == b.imag; +} + +template +HOSTDEVICE inline bool operator!=(const complex& a, const complex& b) { + return a.real != b.real || a.imag != b.imag; +} + +template +HOSTDEVICE inline bool operator<(const complex& a, const complex& b) { + return a.real < b.real; +} + +template +HOSTDEVICE inline bool operator<=(const complex& a, const complex& b) { + return a.real <= b.real; +} + +template +HOSTDEVICE inline bool operator>(const complex& a, const complex& b) { + return a.real > b.real; +} + +template +HOSTDEVICE inline bool operator>=(const complex& a, const complex& b) { + return a.real >= b.real; +} + +template +HOSTDEVICE inline complex(max)(const complex& a, const complex& b) { + return (a.real >= b.real) ? a : b; +} + +template +HOSTDEVICE inline complex(min)(const complex& a, const complex& b) { + return (a.real < b.real) ? a : b; +} + +template +HOSTDEVICE inline bool(isnan)(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isnan(a.real) || ::isnan(a.imag); +#else + return std::isnan(a.real) || std::isnan(a.imag); +#endif +} + +template +HOSTDEVICE inline bool isinf(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isinf(a.real) || ::isinf(a.imag); +#else + return std::isinf(a.real) || std::isinf(a.imag); +#endif +} + +template +HOSTDEVICE inline bool isfinite(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isfinite(a.real) || ::isfinite(a.imag); +#else + return std::isfinite(a.real) || std::isfinite(a.imag); +#endif +} + +template +HOSTDEVICE inline T abs(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return thrust::abs(thrust::complex(a)); +#else + return std::abs(std::complex(a)); +#endif +} + +template +HOSTDEVICE inline T arg(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return thrust::arg(thrust::complex(a)); +#else + return std::arg(std::complex(a)); +#endif +} + +template +HOSTDEVICE inline complex pow(const complex& a, const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::pow(thrust::complex(a), thrust::complex(b))); +#else + return complex(std::pow(std::complex(a), std::complex(b))); +#endif +} + +template +HOSTDEVICE inline complex sqrt(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return 
complex(thrust::sqrt(thrust::complex(a))); +#else + return complex(std::sqrt(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex tanh(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::tanh(thrust::complex(a))); +#else + return complex(std::tanh(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex log(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::log(thrust::complex(a))); +#else + return complex(std::log(std::complex(a))); +#endif +} + +template +inline std::ostream& operator<<(std::ostream& os, const complex& a) { + os << "real:" << a.real << " imag:" << a.imag; + return os; +} +} // namespace dtype +} // namespace pten + +namespace std { + +template +struct is_pod> { + static const bool value = true; +}; + +template +struct is_floating_point> + : std::integral_constant {}; + +template +struct is_signed> { + static const bool value = false; +}; + +template +struct is_unsigned> { + static const bool value = false; +}; + +template +inline bool isnan(const pten::dtype::complex& a) { + return pten::dtype::isnan(a); +} + +template +inline bool isinf(const pten::dtype::complex& a) { + return pten::dtype::isinf(a); +} + +template +struct numeric_limits> { + static const bool is_specialized = false; + static const bool is_signed = false; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = false; + static const bool has_quiet_NaN = false; + static const bool has_signaling_NaN = false; + static const float_denorm_style has_denorm = denorm_absent; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_toward_zero; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 0; + static const int digits10 = 0; + static const int max_digits10 = 0; + static const int radix = 0; + static const int min_exponent = 0; + static const int min_exponent10 = 0; + static const int max_exponent = 0; + static const int max_exponent10 = 0; + static const bool traps = false; + static const bool tinyness_before = false; + + static pten::dtype::complex(min)() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex lowest() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex(max)() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex epsilon() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex round_error() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex infinity() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex quiet_NaN() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex signaling_NaN() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex denorm_min() { + return pten::dtype::complex(0.0, 0.0); + } +}; + +} // namespace std diff --git a/paddle/pten/common/data_type.h b/paddle/pten/common/data_type.h index 306507d2d2..3e31d8ba69 100644 --- a/paddle/pten/common/data_type.h +++ b/paddle/pten/common/data_type.h @@ -14,19 +14,19 @@ limitations under the License. 
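Sketch (illustrative, not part of the patch): the data_type.h hunk below re-points the public aliases at the pten::dtype types, so the two spellings name the same type, which can be checked at compile time:

    #include <type_traits>
    #include "paddle/pten/common/data_type.h"

    // Holds after this patch: the experimental alias and the pten type are
    // one and the same.
    static_assert(std::is_same<paddle::experimental::float16,
                               pten::dtype::float16>::value,
                  "paddle::experimental::float16 aliases pten::dtype::float16");
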
*/ #pragma once -#include "bfloat16.h" // NOLINT -#include "complex.h" // NOLINT -#include "float16.h" // NOLINT +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" #include "paddle/pten/api/ext/exception.h" namespace paddle { namespace experimental { -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; -using float16 = ::paddle::platform::float16; -using bfloat16 = ::paddle::platform::bfloat16; +using complex64 = ::pten::dtype::complex; +using complex128 = ::pten::dtype::complex; +using float16 = ::pten::dtype::float16; +using bfloat16 = ::pten::dtype::bfloat16; enum class DataType { UNDEFINED = 0, diff --git a/paddle/pten/common/float16.h b/paddle/pten/common/float16.h new file mode 100644 index 0000000000..b9ee731f92 --- /dev/null +++ b/paddle/pten/common/float16.h @@ -0,0 +1,1094 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include + +#ifdef PADDLE_WITH_CUDA +#include +#endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_HIP +#include +#endif + +#if defined(__CUDACC__) && CUDA_VERSION >= 7050 +#define PADDLE_CUDA_FP16 +#include +#endif + +#ifdef __HIPCC__ +#define PADDLE_CUDA_FP16 +#include +#endif + +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600) + +#if (defined(__CUDACC__) || defined(__HIPCC__)) +#define HOSTDEVICE __host__ __device__ +#define DEVICE __device__ +#define HOST __host__ +#else +#define HOSTDEVICE +#define DEVICE +#define HOST +#endif + +namespace pten { +namespace dtype { + +// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated +// and aligned at least on a 2-byte boundary, which leads to efficient +// memory access of float16 struct and also makes float16 compatible +// with CUDA half, ARM float16_t data types. 
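// Descriptive note (not in the original header): given PADDLE_ALIGN(2) and the
// defaulted special members of the struct defined below, the following checks
// are expected to hold, which is what makes float16 layout-compatible with
// CUDA half and ARM float16_t storage:
//   static_assert(sizeof(float16) == 2, "float16 must occupy two bytes");
//   static_assert(alignof(float16) == 2, "float16 must be 2-byte aligned");
//   static_assert(std::is_trivial<float16>::value, "float16 stays trivial");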
+struct PADDLE_ALIGN(2) float16 { + public: + uint16_t x; + + // The following defaulted special class member functions + // are added to make float16 pass the std::is_trivial test + float16() = default; + float16(const float16& o) = default; + float16& operator=(const float16& o) = default; + float16(float16&& o) = default; + float16& operator=(float16&& o) = default; + ~float16() = default; + +// Constructors +#ifdef PADDLE_CUDA_FP16 + HOSTDEVICE inline explicit float16(const half& h) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 + x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; +#else + x = h.x; +#endif // CUDA_VERSION >= 9000 +#endif + } +#endif // PADDLE_CUDA_FP16 + +#ifdef PADDLE_WITH_NATIVE_FP16 + // __fp16 is a native half precision data type for arm cpu, + // float16_t is an alias for __fp16 + HOSTDEVICE inline explicit float16(const float16_t& h) { + x = *reinterpret_cast(&h); + } +#endif + + HOSTDEVICE inline explicit float16(float val) { +#if defined(PADDLE_CUDA_FP16) && \ + (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) + half tmp = __float2half(val); + x = *reinterpret_cast(&tmp); + +#elif defined(PADDLE_WITH_NATIVE_FP16) + float32x4_t tmp = vld1q_dup_f32(&val); + float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0); + x = *reinterpret_cast(&res); + +#elif defined(__F16C__) + x = _cvtss_sh(val, 0); + +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v, s; + v.f = val; + uint32_t sign = v.si & sigN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + x = v.ui | sign; + +#endif + } + + HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {} + + template + HOSTDEVICE inline explicit float16(const T& val) + : x(float16(static_cast(val)).x) {} + +// Assignment operators +#ifdef PADDLE_CUDA_FP16 + HOSTDEVICE inline float16& operator=(const half& rhs) { +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 + x = reinterpret_cast<__half_raw*>(const_cast(&rhs))->x; +#else + x = rhs.x; +#endif + return *this; + } +#endif + +#ifdef PADDLE_WITH_NATIVE_FP16 + HOSTDEVICE inline float16& operator=(const float16_t& rhs) { + x = *reinterpret_cast(&rhs); + return *this; + } +#endif + + HOSTDEVICE inline float16& operator=(bool b) { + x = b ? 
0x3c00 : 0; + return *this; + } + + HOSTDEVICE inline float16& operator=(int8_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint8_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(int16_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint16_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(int32_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint32_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(int64_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint64_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(float val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(double val) { + x = float16(val).x; + return *this; + } + +// Conversion opertors +#ifdef PADDLE_CUDA_FP16 + HOSTDEVICE inline half to_half() const { +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 + __half_raw h; + h.x = x; + return half(h); +#else + half h; + h.x = x; + return h; +#endif // CUDA_VERSION >= 9000 + } +#endif // PADDLE_CUDA_FP16 + +#ifdef PADDLE_WITH_NATIVE_FP16 + HOSTDEVICE inline explicit operator float16_t() const { + return *reinterpret_cast(this); + } +#endif + + HOSTDEVICE inline operator float() const { +#if defined(PADDLE_CUDA_FP16) && \ + (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) + half tmp = *reinterpret_cast(this); + return __half2float(tmp); + +#elif defined(PADDLE_WITH_NATIVE_FP16) + float16x4_t res = vld1_dup_f16(reinterpret_cast(this)); + return vgetq_lane_f32(vcvt_f32_f16(res), 0); + +#elif defined(__F16C__) + return _cvtsh_ss(this->x); + +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v; + v.ui = this->x; + int32_t sign = v.si & sigC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + +#endif + } + + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline operator double() const { + return static_cast(static_cast(*this)); + } + + private: + union Bits { + float f; + int32_t si; + uint32_t ui; + }; + + static const int shift = 13; + static const int shiftSign = 16; + + static const int32_t 
infN = 0x7F800000; + static const int32_t maxN = 0x477FE000; // max flt16 as flt32 + static const int32_t minN = 0x38800000; // min flt16 normal as flt32 + static const int32_t sigN = 0x80000000; // sign bit + + static constexpr int32_t infC = infN >> shift; + static constexpr int32_t nanN = (infC + 1) + << shift; // minimum flt16 nan as float32 + static constexpr int32_t maxC = maxN >> shift; + static constexpr int32_t minC = minN >> shift; + static constexpr int32_t sigC = sigN >> shiftSign; + + static const int32_t mulN = 0x52000000; // (1 << 23) / minN + static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) + static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted + static const int32_t norC = 0x00400; // min flt32 normal downshifted + + static constexpr int32_t maxD = infC - maxC - 1; + static constexpr int32_t minD = minC - subC - 1; +}; + +// Arithmetic operators on GPU +// CUDA 9.0 provides built-in arithmetic operators for half while +// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are +// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in +// CUDA 9.0 regarding the half data type. +// ROCM has built-in arithmetic operators as not defined +// __HIP_NO_HALF_OPERATORS__ +#if defined(PADDLE_CUDA_FP16) && !defined(__HIPCC__) && CUDA_VERSION < 9000 +DEVICE inline half operator+(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hadd(a, b); +#else + float res = static_cast(float16(a)) + static_cast(float16(b)); + return float16(res).to_half(); +#endif +} + +DEVICE inline half operator-(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hsub(a, b); +#else + float res = static_cast(float16(a)) - static_cast(float16(b)); + return float16(res).to_half(); +#endif +} + +DEVICE inline half operator*(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hmul(a, b); +#else + float res = static_cast(float16(a)) * static_cast(float16(b)); + return float16(res).to_half(); +#endif +} + +DEVICE inline half operator/(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +#else + float res = static_cast(float16(a)) / static_cast(float16(b)); + return float16(res).to_half(); +#endif +} + +DEVICE inline half operator-(const half& a) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hneg(a); +#else + float res = -static_cast(float16(a)); + return float16(res).to_half(); +#endif +} + +#ifndef PADDLE_WITH_HIP // not defined __HIP_NO_HALF_OPERATORS__ +DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT + a = a + b; + return a; +} + +DEVICE inline half& operator-=(half& a, const half& b) { // NOLINT + a = a - b; + return a; +} + +DEVICE inline half& operator*=(half& a, const half& b) { // NOLINT + a = a * b; + return a; +} + +DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT + a = a / b; + return a; +} +#endif + +DEVICE inline bool operator==(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __heq(a, b); +#else + return static_cast(float16(a)) == static_cast(float16(b)); +#endif +} + +DEVICE inline bool operator!=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hne(a, b); +#else + return static_cast(float16(a)) != static_cast(float16(b)); +#endif +} + +DEVICE 
inline bool operator<(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(a, b); +#else + return static_cast(float16(a)) < static_cast(float16(b)); +#endif +} + +DEVICE inline bool operator<=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hle(a, b); +#else + return static_cast(float16(a)) <= static_cast(float16(b)); +#endif +} + +DEVICE inline bool operator>(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hgt(a, b); +#else + return static_cast(float16(a)) > static_cast(float16(b)); +#endif +} + +DEVICE inline bool operator>=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hge(a, b); +#else + return static_cast(float16(a)) >= static_cast(float16(b)); +#endif +} + +#endif // PADDLE_CUDA_FP16 + +// Arithmetic operators for float16 on GPU +#if defined(PADDLE_CUDA_FP16) +// HIPCC has compile error if call __device__ function __hadd, __hsub, etc. +// in __host__ __device__ function +#if defined(__HIPCC__) +DEVICE inline float16 operator+(const float16& a, const float16& b) { + return float16(__hadd(a.to_half(), b.to_half())); +} +HOST inline float16 operator+(const float16& a, const float16& b) { + return float16(static_cast(a) + static_cast(b)); +} +#else +HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return float16(__hadd(a.to_half(), b.to_half())); +#else + return float16(static_cast(a) + static_cast(b)); +#endif +} +#endif + +#if defined(__HIPCC__) +DEVICE inline float16 operator-(const float16& a, const float16& b) { + return float16(__hsub(a.to_half(), b.to_half())); +} +HOST inline float16 operator-(const float16& a, const float16& b) { + return float16(static_cast(a) - static_cast(b)); +} +#else +HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return float16(__hsub(a.to_half(), b.to_half())); +#else + return float16(static_cast(a) - static_cast(b)); +#endif +} +#endif + +#if defined(__HIPCC__) +DEVICE inline float16 operator*(const float16& a, const float16& b) { + return float16(__hmul(a.to_half(), b.to_half())); +} +HOST inline float16 operator*(const float16& a, const float16& b) { + return float16(static_cast(a) * static_cast(b)); +} +#else +HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return float16(__hmul(a.to_half(), b.to_half())); +#else + return float16(static_cast(a) * static_cast(b)); +#endif +} +#endif + +#if defined(__HIPCC__) +DEVICE inline float16 operator/(const float16& a, const float16& b) { + return float16(__hdiv(a.to_half(), b.to_half())); +} +HOST inline float16 operator/(const float16& a, const float16& b) { + return float16(static_cast(a) / static_cast(b)); +} +#else +HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + // TODO(kexinzhao): check which cuda version starts to support __hdiv + float num = __half2float(a.to_half()); + float denom = __half2float(b.to_half()); + return float16(num / denom); +#else + return float16(static_cast(a) / static_cast(b)); +#endif +} +#endif + +#if defined(__HIPCC__) +DEVICE inline float16 operator-(const float16& a) { + return float16(__hneg(a.to_half())); +} +HOST inline float16 operator-(const float16& a) { + float16 res; + 
res.x = a.x ^ 0x8000; + return res; +} +#else +HOSTDEVICE inline float16 operator-(const float16& a) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return float16(__hneg(a.to_half())); +#else + float16 res; + res.x = a.x ^ 0x8000; + return res; +#endif +} +#endif + +HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { // NOLINT + a = a + b; + return a; +} + +HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { // NOLINT + a = a - b; + return a; +} + +HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { // NOLINT + a = a * b; + return a; +} + +HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT + a = a / b; + return a; +} + +// HIPCC has compile error if call __device__ function __heq, __hne, etc. +// in __host__ __device__ function +#if defined(__HIPCC__) +DEVICE inline bool operator==(const float16& a, const float16& b) { + return __heq(a.to_half(), b.to_half()); +} +HOST inline bool operator==(const float16& a, const float16& b) { + return static_cast(a) == static_cast(b); +} +#else // __HIPCC__ +HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __heq(a.to_half(), b.to_half()); +#else + return static_cast(a) == static_cast(b); +#endif +} +#endif // __HIPCC__ + +#if defined(__HIPCC__) +DEVICE inline bool operator!=(const float16& a, const float16& b) { + return __hne(a.to_half(), b.to_half()); +} +HOST inline bool operator!=(const float16& a, const float16& b) { + return static_cast(a) != static_cast(b); +} +#else // __HIPCC__ +HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hne(a.to_half(), b.to_half()); +#else + return static_cast(a) != static_cast(b); +#endif +} +#endif // __HIPCC__ + +#if defined(__HIPCC__) +DEVICE inline bool operator<(const float16& a, const float16& b) { + return __hlt(a.to_half(), b.to_half()); +} +HOST inline bool operator<(const float16& a, const float16& b) { + return static_cast(a) < static_cast(b); +} +#else // __HIPCC__ +HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(a.to_half(), b.to_half()); +#else + return static_cast(a) < static_cast(b); +#endif +} +#endif // __HIPCC__ + +#if defined(__HIPCC__) +DEVICE inline bool operator<=(const float16& a, const float16& b) { + return __hle(a.to_half(), b.to_half()); +} +HOST inline bool operator<=(const float16& a, const float16& b) { + return static_cast(a) <= static_cast(b); +} +#else // __HIPCC__ +HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hle(a.to_half(), b.to_half()); +#else + return static_cast(a) <= static_cast(b); +#endif +} +#endif // __HIPCC__ + +#if defined(__HIPCC__) +DEVICE inline bool operator>(const float16& a, const float16& b) { + return __hgt(a.to_half(), b.to_half()); +} +HOST inline bool operator>(const float16& a, const float16& b) { + return static_cast(a) > static_cast(b); +} +#else // __HIPCC__ +HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hgt(a.to_half(), b.to_half()); +#else + return static_cast(a) > static_cast(b); +#endif +} +#endif // __HIPCC__ + +#if defined(__HIPCC__) +DEVICE inline bool operator>=(const float16& a, const float16& b) { + return __hge(a.to_half(), 
b.to_half()); +} +HOST inline bool operator>=(const float16& a, const float16& b) { + return static_cast(a) >= static_cast(b); +} +#else // __HIPCC__ +HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hge(a.to_half(), b.to_half()); +#else + return static_cast(a) >= static_cast(b); +#endif +} +#endif // __HIPCC__ + +// Arithmetic operators for float16 on ARMv8.2-A CPU +#elif defined(PADDLE_WITH_NATIVE_FP16) +inline float16 operator+(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fadd h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +inline float16 operator-(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fsub h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +inline float16 operator*(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fmul h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +inline float16 operator/(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fdiv h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +inline float16 operator-(const float16& a) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "fneg h0, h0\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0"); + return res; +} + +inline float16& operator+=(float16& a, const float16& b) { // NOLINT + a = a + b; + return a; +} + +inline float16& operator-=(float16& a, const float16& b) { // NOLINT + a = a - b; + return a; +} + +inline float16& operator*=(float16& a, const float16& b) { // NOLINT + a = a * b; + return a; +} + +inline float16& operator/=(float16& a, const float16& b) { // NOLINT + a = a / b; + return a; +} + +inline bool operator==(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmeq h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } + +inline bool operator<(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + 
+inline bool operator<=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +inline bool operator>(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +inline bool operator>=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +// Arithmetic operators for float16, software emulated on other CPU +#else +inline float16 operator+(const float16& a, const float16& b) { + return float16(static_cast(a) + static_cast(b)); +} + +inline float16 operator-(const float16& a, const float16& b) { + return float16(static_cast(a) - static_cast(b)); +} + +inline float16 operator*(const float16& a, const float16& b) { + return float16(static_cast(a) * static_cast(b)); +} + +inline float16 operator/(const float16& a, const float16& b) { + return float16(static_cast(a) / static_cast(b)); +} + +inline float16 operator-(const float16& a) { + float16 res; + res.x = a.x ^ 0x8000; + return res; +} + +inline float16& operator+=(float16& a, const float16& b) { // NOLINT + a = float16(static_cast(a) + static_cast(b)); + return a; +} + +inline float16& operator-=(float16& a, const float16& b) { // NOLINT + a = float16(static_cast(a) - static_cast(b)); + return a; +} + +inline float16& operator*=(float16& a, const float16& b) { // NOLINT + a = float16(static_cast(a) * static_cast(b)); + return a; +} + +inline float16& operator/=(float16& a, const float16& b) { // NOLINT + a = float16(static_cast(a) / static_cast(b)); + return a; +} + +inline bool operator==(const float16& a, const float16& b) { + return static_cast(a) == static_cast(b); +} + +inline bool operator!=(const float16& a, const float16& b) { + return static_cast(a) != static_cast(b); +} + +inline bool operator<(const float16& a, const float16& b) { + return static_cast(a) < static_cast(b); +} + +inline bool operator<=(const float16& a, const float16& b) { + return static_cast(a) <= static_cast(b); +} + +inline bool operator>(const float16& a, const float16& b) { + return static_cast(a) > static_cast(b); +} + +inline bool operator>=(const float16& a, const float16& b) { + return static_cast(a) >= static_cast(b); +} +#endif + +HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) { + float16 res; + res.x = a; + return res; +} + +// HIPCC has compile error if call __device__ function __hisnan in __host__ +// __device__ function +#if defined(PADDLE_CUDA_FP16) && defined(__HIPCC__) +DEVICE inline bool(isnan)(const float16& a) { return __hisnan(a.to_half()); } +HOST inline bool(isnan)(const float16& a) { return (a.x & 0x7fff) > 0x7c00; } +#else +HOSTDEVICE inline bool(isnan)(const float16& a) { +#if defined(PADDLE_CUDA_FP16) && 
defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hisnan(a.to_half()); +#else + return (a.x & 0x7fff) > 0x7c00; +#endif +} +#endif + +HOSTDEVICE inline bool(isinf)(const float16& a) { + return (a.x & 0x7fff) == 0x7c00; +} + +HOSTDEVICE inline bool(isfinite)(const float16& a) { + return !((isnan)(a)) && !((isinf)(a)); +} + +HOSTDEVICE inline float16(abs)(const float16& a) { +#if defined(PADDLE_CUDA_FP16) && \ + (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)) + return float16(::fabs(static_cast(a))); +#else + return float16(std::abs(static_cast(a))); +#endif +} + +inline std::ostream& operator<<(std::ostream& os, const float16& a) { + os << static_cast(a); + return os; +} + +} // namespace dtype +} // namespace pten + +namespace std { + +// Override the std::is_pod::value for float16 +// The reason is that different compilers implemented std::is_pod based on +// different C++ standards. float16 class is a plain old data in C++11 given +// that it is both trivial and standard_layout. +// However, std::is_pod in nvcc 8.0 host c++ compiler follows C++0x and is +// more restricted in that you cannot provide any customized +// constructor in float16. Hence, we override is_pod here following C++11 +// so that .cu files can be successfully compiled by nvcc. +template <> +struct is_pod { + static const bool value = is_trivial::value && + is_standard_layout::value; +}; + +template <> +struct is_floating_point + : std::integral_constant< + bool, + std::is_same< + pten::dtype::float16, + typename std::remove_cv::type>::value> {}; +template <> +struct is_signed { + static const bool value = true; +}; + +template <> +struct is_unsigned { + static const bool value = false; +}; + +inline bool isnan(const pten::dtype::float16& a) { + return pten::dtype::isnan(a); +} + +inline bool isinf(const pten::dtype::float16& a) { + return pten::dtype::isinf(a); +} + +template <> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 11; + static const int digits10 = 3; + static const int max_digits10 = 5; + static const int radix = 2; + static const int min_exponent = -13; + static const int min_exponent10 = -4; + static const int max_exponent = 16; + static const int max_exponent10 = 4; + static const bool traps = true; + static const bool tinyness_before = false; + + HOSTDEVICE static pten::dtype::float16(min)() { + return pten::dtype::raw_uint16_to_float16(0x400); + } + HOSTDEVICE static pten::dtype::float16 lowest() { + return pten::dtype::raw_uint16_to_float16(0xfbff); + } + HOSTDEVICE static pten::dtype::float16(max)() { + return pten::dtype::raw_uint16_to_float16(0x7bff); + } + HOSTDEVICE static pten::dtype::float16 epsilon() { + return pten::dtype::raw_uint16_to_float16(0x0800); + } + HOSTDEVICE static pten::dtype::float16 round_error() { + return pten::dtype::float16(0.5); + } + HOSTDEVICE static pten::dtype::float16 infinity() { + return pten::dtype::raw_uint16_to_float16(0x7c00); + } + HOSTDEVICE 
static pten::dtype::float16 quiet_NaN() { + return pten::dtype::raw_uint16_to_float16(0x7e00); + } + HOSTDEVICE static pten::dtype::float16 signaling_NaN() { + return pten::dtype::raw_uint16_to_float16(0x7e00); + } + HOSTDEVICE static pten::dtype::float16 denorm_min() { + return pten::dtype::raw_uint16_to_float16(0x1); + } +}; + +HOSTDEVICE inline pten::dtype::float16 abs(const pten::dtype::float16& a) { + return pten::dtype::abs(a); +} + +} // namespace std diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index cfe2cfa03e..4008b6f6ce 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/pten/core/dense_tensor.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/convert_utils.h" diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h index d12fc730fe..ff27144eb4 100644 --- a/paddle/pten/kernels/complex_kernel.h +++ b/paddle/pten/kernels/complex_kernel.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" diff --git a/paddle/pten/kernels/cpu/complex_kernel.cc b/paddle/pten/kernels/cpu/complex_kernel.cc index 59a7577153..6cdba15620 100644 --- a/paddle/pten/kernels/cpu/complex_kernel.cc +++ b/paddle/pten/kernels/cpu/complex_kernel.cc @@ -19,7 +19,7 @@ #include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" PT_REGISTER_KERNEL(conj, CPU, diff --git a/paddle/pten/kernels/cpu/dot_grad_kernel.cc b/paddle/pten/kernels/cpu/dot_grad_kernel.cc index ed927f820f..91202cf836 100644 --- a/paddle/pten/kernels/cpu/dot_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_grad_kernel.cc @@ -18,7 +18,7 @@ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" PT_REGISTER_KERNEL(dot_grad, CPU, diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc index 0baf9ba0a8..5745737bba 100644 --- a/paddle/pten/kernels/cpu/dot_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_kernel.cc @@ -18,7 +18,7 @@ #include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" namespace pten { diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index 83388d0d9a..7841dd4113 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -25,8 +25,8 @@ // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/complex.h" namespace pten { diff --git a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc index 4738e21573..955f2b017b 100644 --- a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/pten/kernels/matmul_grad_kernel.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/impl/matmul_grad_kernel_impl.h" diff --git a/paddle/pten/kernels/cpu/matmul_kernel.cc b/paddle/pten/kernels/cpu/matmul_kernel.cc index f749e9cb27..51def07d40 100644 --- a/paddle/pten/kernels/cpu/matmul_kernel.cc +++ b/paddle/pten/kernels/cpu/matmul_kernel.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" #include "paddle/pten/kernels/impl/matmul_kernel_impl.h" PT_REGISTER_KERNEL(matmul, diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc index 7088bba01a..52949b5896 100644 --- a/paddle/pten/kernels/cpu/scale_kernel.cc +++ b/paddle/pten/kernels/cpu/scale_kernel.cc @@ -22,7 +22,7 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/pten/common/bfloat16.h" namespace pten { template diff --git a/paddle/pten/kernels/cpu/sign_kernel.cc b/paddle/pten/kernels/cpu/sign_kernel.cc index 25fa2bb5fe..642d6ff6c5 100644 --- a/paddle/pten/kernels/cpu/sign_kernel.cc +++ b/paddle/pten/kernels/cpu/sign_kernel.cc @@ -19,6 +19,6 @@ limitations under the License. */ #include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/pten/common/bfloat16.h" PT_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, pten::SignKernel, float, double) {} diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index c133d7fc79..d6a155dca0 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -16,7 +16,7 @@ #include "paddle/pten/backends/all_context.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" namespace pten { diff --git a/paddle/pten/kernels/funcs/elementwise_functor.h b/paddle/pten/kernels/funcs/elementwise_functor.h index a61d0de8fe..6b89902456 100644 --- a/paddle/pten/kernels/funcs/elementwise_functor.h +++ b/paddle/pten/kernels/funcs/elementwise_functor.h @@ -15,8 +15,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/common/float16.h" namespace pten { namespace funcs { diff --git a/paddle/pten/kernels/funcs/transpose.cc b/paddle/pten/kernels/funcs/transpose.cc index 5a40abbd1b..77d26fcbc3 100644 --- a/paddle/pten/kernels/funcs/transpose.cc +++ b/paddle/pten/kernels/funcs/transpose.cc @@ -18,9 +18,9 @@ #include "paddle/pten/core/dense_tensor.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" namespace pten { namespace math { diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu index 77a345d7a0..045bfdbdb0 100644 --- a/paddle/pten/kernels/funcs/transpose.cu +++ b/paddle/pten/kernels/funcs/transpose.cu @@ -19,9 +19,9 @@ #include "paddle/pten/kernels/funcs/transpose.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu index aa61155221..3774c56370 100644 --- a/paddle/pten/kernels/gpu/cast_kernel.cu +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -21,10 +21,10 @@ // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/float16.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/complex_kernel.cu b/paddle/pten/kernels/gpu/complex_kernel.cu index 1c82077793..cd9c95de2a 100644 --- a/paddle/pten/kernels/gpu/complex_kernel.cu +++ b/paddle/pten/kernels/gpu/complex_kernel.cu @@ -19,7 +19,7 @@ #include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" PT_REGISTER_KERNEL(conj, GPU, diff --git a/paddle/pten/kernels/gpu/dot_grad_kernel.cu b/paddle/pten/kernels/gpu/dot_grad_kernel.cu index 4b0d7fed4c..90c37ea1b0 100644 --- a/paddle/pten/kernels/gpu/dot_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_grad_kernel.cu @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" PT_REGISTER_KERNEL(dot_grad, GPU, diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu index 18bab5c15a..5fe397e128 100644 --- a/paddle/pten/kernels/gpu/dot_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_kernel.cu @@ -20,7 +20,7 @@ // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 1fd085ab5f..80931db56c 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -27,9 +27,9 @@ limitations under the License. */ namespace cub = hipcub; #endif -#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/kernel_registry.h" diff --git a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu index 993b17f6b8..31c44673f9 100644 --- a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/pten/kernels/matmul_grad_kernel.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/impl/matmul_grad_kernel_impl.h" diff --git a/paddle/pten/kernels/gpu/matmul_kernel.cu b/paddle/pten/kernels/gpu/matmul_kernel.cu index a3ab88913a..f9fdbd27bf 100644 --- a/paddle/pten/kernels/gpu/matmul_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_kernel.cu @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" #include "paddle/pten/kernels/impl/matmul_kernel_impl.h" PT_REGISTER_KERNEL(matmul, diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index b49902ff5e..e729dad3b3 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/funcs/elementwise_base.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/float16.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/sign_kernel.cu b/paddle/pten/kernels/gpu/sign_kernel.cu index 16356507dc..2a96ff6530 100644 --- a/paddle/pten/kernels/gpu/sign_kernel.cu +++ b/paddle/pten/kernels/gpu/sign_kernel.cu @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/float16.h" using float16 = paddle::platform::float16; diff --git a/python/setup.py.in b/python/setup.py.in index 6b38facb5f..aee4e149b0 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -569,9 +569,6 @@ headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/common')) + # pten common headers # For paddle uew custom op, only copy data type headers from `paddle/fluid/platform` # to `paddle/pten/api/ext`, - ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/bfloat16.h'] + - ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex.h'] + - ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/float16.h'] + ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h']) if '${WITH_MKLDNN}' == 'ON': -- GitLab
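
For reference, a minimal usage sketch of the migrated API (illustrative only, not part of the patch): it assumes a translation unit compiled inside a Paddle source tree where the paddle/pten include paths are visible to the compiler, and the file name and values below are made up. It only exercises the surface this change moves, i.e. the headers now living under paddle/pten/common and the types under pten::dtype.

// example_pten_dtype.cc -- illustrative sketch, not part of this patch.
// Assumes the Paddle include directories are on the compiler's include path.
#include <iostream>

#include "paddle/pten/common/bfloat16.h"  // previously paddle/fluid/platform/bfloat16.h
#include "paddle/pten/common/complex.h"   // previously paddle/fluid/platform/complex.h
#include "paddle/pten/common/float16.h"   // previously paddle/fluid/platform/float16.h

int main() {
  // The types are now spelled pten::dtype::* instead of paddle::platform::*.
  pten::dtype::float16 h(1.5f);               // 16-bit storage, h.x == 0x3e00
  pten::dtype::bfloat16 b(2.0f);
  pten::dtype::complex<float> c(1.0f, -2.0f);

  // Arithmetic, the math helpers and the std:: specializations behave as before the move.
  pten::dtype::float16 sum = h + pten::dtype::float16(0.5f);
  std::cout << "sum      = " << sum << "\n";                       // 2
  std::cout << "b        = " << static_cast<float>(b) << "\n";     // 2
  std::cout << "tanh(c)  = " << pten::dtype::tanh(c) << "\n";      // prints "real:... imag:..."
  std::cout << std::boolalpha
            << "isnan(sum) = " << std::isnan(sum) << "\n";         // false
  return 0;
}

Downstream code that goes through the paddle::experimental aliases (complex64, complex128, float16, bfloat16) needs no source changes, since the data_type.h hunk above retargets those usings from ::paddle::platform to ::pten::dtype.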