Unverified commit f1143f0c authored by Aurelius84, committed by GitHub

[Pten] Migrate bfloat16/float16/complex from paddle::platform into pten::common (#39044)

* Migrate bfloat16/float16/complex from platform into pten::common

* fix typo

* fix code style
Parent 655f76d2
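The hunks below move the bfloat16/complex implementations under paddle/pten/common/ while keeping the old paddle::platform names alive as aliases. As a minimal before/after sketch of what callers see (the include paths are taken from the diff; the rest is illustrative):

```cpp
// Before: #include "paddle/fluid/platform/bfloat16.h"
//         paddle::platform::bfloat16 v(1.0f);
// After: the implementation lives in pten::dtype, and paddle::platform
// keeps a compatibility alias (see the shim hunks further down).
#include "paddle/pten/common/bfloat16.h"

int main() {
  pten::dtype::bfloat16 v(1.0f);                 // new canonical spelling
  return static_cast<float>(v) == 1.0f ? 0 : 1;  // 1.0 round-trips exactly
}
```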
......@@ -228,13 +228,7 @@ copy(inference_lib_dist
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/api/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/pten/common/*.h
${PADDLE_SOURCE_DIR}/paddle/fluid/platform/bfloat16.h
${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex.h
${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/)
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
......
# Adapt to the custom op mechanism: include the data-type-related header files
# to avoid exposing the paths of the underlying files; remove this after
# float16.h/complex.h/bfloat16.h are moved into pten
include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
add_subdirectory(memory)
add_subdirectory(platform)
add_subdirectory(distributed)
......
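The include_directories line above is a stopgap for the custom-op mechanism: with paddle/fluid/platform on the include path, extension code can pull in the data-type headers without spelling out the fluid-internal path. A hedged illustration of that reading (the short include form is my interpretation of the comment, not something shown in this diff):

```cpp
// Hypothetical custom-op translation unit. Because CMake put
// paddle/fluid/platform on the include path, the short form resolves
// to paddle/fluid/platform/bfloat16.h without exposing that path:
#include "bfloat16.h"

paddle::platform::bfloat16 half(0.5f);
```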
......@@ -24,17 +24,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace platform {
struct bfloat16;
template <typename T>
struct complex;
struct float16;
template <typename T>
struct complex;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -18,13 +18,6 @@
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace platform {
struct bfloat16;
struct float16;
} // namespace platform
} // namespace paddle
TEST(DataType, float16) {
using paddle::framework::Tensor;
using paddle::platform::CPUPlace;
......
......@@ -14,13 +14,6 @@
#include "paddle/fluid/framework/dlpack_tensor.h"
#include "paddle/fluid/framework/data_type.h"
namespace paddle {
namespace platform {
struct bfloat16;
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -16,12 +16,6 @@
#include <glog/logging.h>
#include <gtest/gtest.h>
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -17,12 +17,6 @@
#include <gtest/gtest.h>
#include <string>
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace framework = paddle::framework;
namespace platform = paddle::platform;
......
......@@ -24,9 +24,6 @@ class Variable;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -26,9 +26,6 @@ class Variable;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -25,9 +25,6 @@ class Variable;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -23,9 +23,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -17,7 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -23,9 +23,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -17,7 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -23,9 +23,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -17,7 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -21,9 +21,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -17,7 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -23,9 +23,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -17,7 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -23,9 +23,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -17,7 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -23,9 +23,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -17,7 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -23,9 +23,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -17,7 +17,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -26,9 +26,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -18,13 +18,6 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace platform {
template <typename T>
struct complex;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
class OpDesc;
......
......@@ -18,13 +18,6 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace platform {
template <typename T>
struct complex;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
class OpDesc;
......
......@@ -34,7 +34,6 @@ class OverflowKernel;
} // namespace operators
namespace platform {
class CPUDeviceContext;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,6 +16,8 @@ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#endif
#include "paddle/pten/common/bfloat16.h"
#include "paddle/pten/common/float16.h"
namespace pten {
class DenseTensor;
......@@ -25,8 +27,6 @@ namespace paddle {
namespace framework {} // namespace framework
namespace platform {
class CPUDeviceContext;
struct bfloat16;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/prelu.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
......@@ -110,15 +111,15 @@ void PreluScalarDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
}
template class PreluChannelWiseDirectCUDAFunctor<float>;
template class PreluChannelWiseDirectCUDAFunctor<paddle::platform::float16>;
template class PreluChannelWiseDirectCUDAFunctor<platform::float16>;
template class PreluChannelWiseDirectCUDAFunctor<double>;
template class PreluElementWiseDirectCUDAFunctor<float>;
template class PreluElementWiseDirectCUDAFunctor<paddle::platform::float16>;
template class PreluElementWiseDirectCUDAFunctor<platform::float16>;
template class PreluElementWiseDirectCUDAFunctor<double>;
template class PreluScalarDirectCUDAFunctor<float>;
template class PreluScalarDirectCUDAFunctor<paddle::platform::float16>;
template class PreluScalarDirectCUDAFunctor<platform::float16>;
template class PreluScalarDirectCUDAFunctor<double>;
} // namespace math
......
......@@ -23,9 +23,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -23,9 +23,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -26,9 +26,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -30,9 +30,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -14,396 +14,11 @@
#pragma once
#include <stdint.h>
#include <cmath>
#include <cstring>
#include <iostream>
#include <limits>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif
#if defined(__CUDACC__) && CUDA_VERSION >= 11000
#define PADDLE_CUDA_BF16
#include <cuda_bf16.h>
#endif
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif
#if (defined(__CUDACC__) || defined(__HIPCC__))
#define HOSTDEVICE __host__ __device__
#define DEVICE __device__
#define HOST __host__
#else
#define HOSTDEVICE
#define DEVICE
#define HOST
#endif
#include "paddle/pten/common/bfloat16.h"
namespace paddle {
namespace platform {
struct PADDLE_ALIGN(2) bfloat16 {
public:
uint16_t x;
// Constructors
bfloat16() = default;
bfloat16(const bfloat16& o) = default;
bfloat16& operator=(const bfloat16& o) = default;
bfloat16(bfloat16&& o) = default;
bfloat16& operator=(bfloat16&& o) = default;
~bfloat16() = default;
HOSTDEVICE inline explicit bfloat16(float val) {
#ifdef PADDLE_WITH_HIP
uint32_t res = 0;
uint32_t* tempRes;
// We should be using memcpy in order to respect the strict aliasing rule
// but it fails in the HIP environment.
tempRes = reinterpret_cast<uint32_t*>(&val);
res = *tempRes;
x = res >> 16;
#else
#if defined(PADDLE_CUDA_BF16)
__nv_bfloat16 tmp = __float2bfloat16(val);
x = *reinterpret_cast<uint16_t*>(&tmp);
#else
std::memcpy(&x, reinterpret_cast<char*>(&val) + 2, 2);
#endif
#endif
}
#if defined(PADDLE_CUDA_BF16)
HOSTDEVICE inline explicit bfloat16(const __nv_bfloat16& val) {
x = *reinterpret_cast<const unsigned short*>(&val);
}
#endif
template <class T>
HOSTDEVICE inline explicit bfloat16(const T& val)
: x(bfloat16(static_cast<float>(val)).x) {}
// Assignment operators
#if defined(PADDLE_CUDA_BF16)
HOSTDEVICE inline bfloat16& operator=(const __nv_bfloat16& val) {
x = *reinterpret_cast<const unsigned short*>(&val);
return *this;
}
#endif
HOSTDEVICE inline bfloat16& operator=(bool b) {
x = b ? 0x3f80 : 0;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(int8_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(uint8_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(int16_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(uint16_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(int32_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(uint32_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(int64_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(uint64_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(float val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(double val) {
x = bfloat16(val).x;
return *this;
}
// Conversion operators
HOSTDEVICE inline explicit operator float() const {
#ifdef PADDLE_WITH_HIP
uint32_t res = 0;
// We should be using memcpy in order to respect the strict aliasing rule
// but it fails in the HIP environment.
uint16_t temp = x;
uint16_t* temp_ptr = reinterpret_cast<uint16_t*>(&temp);
res = *temp_ptr;
return res;
#else
#ifdef PADDLE_CUDA_BF16
return __bfloat162float(*reinterpret_cast<const __nv_bfloat16*>(&x));
#else
float val = 0.f;
uint16_t temp = x;
std::memcpy(reinterpret_cast<char*>(&val) + 2,
reinterpret_cast<char*>(&temp), 2);
return val;
#endif
#endif
}
#ifdef PADDLE_CUDA_BF16
HOSTDEVICE inline explicit operator __nv_bfloat16() const {
return *reinterpret_cast<const __nv_bfloat16*>(&x);
}
#endif
HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
HOSTDEVICE inline explicit operator int8_t() const {
return static_cast<int8_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator uint8_t() const {
return static_cast<uint8_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator int16_t() const {
return static_cast<int16_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator uint16_t() const {
return static_cast<uint16_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator int32_t() const {
return static_cast<int32_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator uint32_t() const {
return static_cast<uint32_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator int64_t() const {
return static_cast<int64_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator uint64_t() const {
return static_cast<uint64_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator double() const {
return static_cast<double>(static_cast<float>(*this));
}
};
HOSTDEVICE inline bfloat16 operator+(const bfloat16& a, const bfloat16& b) {
return bfloat16(static_cast<float>(a) + static_cast<float>(b));
}
HOSTDEVICE inline bfloat16 operator-(const bfloat16& a, const bfloat16& b) {
return bfloat16(static_cast<float>(a) - static_cast<float>(b));
}
HOSTDEVICE inline bfloat16 operator*(const bfloat16& a, const bfloat16& b) {
return bfloat16(static_cast<float>(a) * static_cast<float>(b));
}
HOSTDEVICE inline bfloat16 operator/(const bfloat16& a, const bfloat16& b) {
return bfloat16(static_cast<float>(a) / static_cast<float>(b));
}
HOSTDEVICE inline bfloat16 operator-(const bfloat16& a) {
bfloat16 res;
res.x = a.x ^ 0x8000;
return res;
}
HOSTDEVICE inline bfloat16& operator+=(bfloat16& a, // NOLINT
const bfloat16& b) {
a = bfloat16(static_cast<float>(a) + static_cast<float>(b));
return a;
}
HOSTDEVICE inline bfloat16& operator-=(bfloat16& a, // NOLINT
const bfloat16& b) {
a = bfloat16(static_cast<float>(a) - static_cast<float>(b));
return a;
}
HOSTDEVICE inline bfloat16& operator*=(bfloat16& a, // NOLINT
const bfloat16& b) {
a = bfloat16(static_cast<float>(a) * static_cast<float>(b));
return a;
}
HOSTDEVICE inline bfloat16& operator/=(bfloat16& a, // NOLINT
const bfloat16& b) {
a = bfloat16(static_cast<float>(a) / static_cast<float>(b));
return a;
}
HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) {
bfloat16 res;
res.x = a;
return res;
}
// Comparison operators
HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) {
return static_cast<float>(a) == static_cast<float>(b);
}
HOSTDEVICE inline bool operator!=(const bfloat16& a, const bfloat16& b) {
return static_cast<float>(a) != static_cast<float>(b);
}
HOSTDEVICE inline bool operator<(const bfloat16& a, const bfloat16& b) {
return static_cast<float>(a) < static_cast<float>(b);
}
HOSTDEVICE inline bool operator<=(const bfloat16& a, const bfloat16& b) {
return static_cast<float>(a) <= static_cast<float>(b);
}
HOSTDEVICE inline bool operator>(const bfloat16& a, const bfloat16& b) {
return static_cast<float>(a) > static_cast<float>(b);
}
HOSTDEVICE inline bool operator>=(const bfloat16& a, const bfloat16& b) {
return static_cast<float>(a) >= static_cast<float>(b);
}
HOSTDEVICE inline bool(isnan)(const bfloat16& a) {
return (a.x & 0x7FFF) > 0x7F80;
}
HOSTDEVICE inline bool(isinf)(const bfloat16& a) {
return (a.x & 0x7F80) == 0x7F80;
}
HOSTDEVICE inline bool(isfinite)(const bfloat16& a) {
return !((isnan)(a)) && !((isinf)(a));
}
inline std::ostream& operator<<(std::ostream& os, const bfloat16& a) {
os << a.x;
return os;
}
using bfloat16 = pten::dtype::bfloat16;
using namespace pten::dtype; // NOLINT
} // namespace platform
} // namespace paddle
namespace std {
template <>
struct is_pod<paddle::platform::bfloat16> {
static const bool value =
is_trivial<paddle::platform::bfloat16>::value &&
is_standard_layout<paddle::platform::bfloat16>::value;
};
template <>
struct is_floating_point<paddle::platform::bfloat16>
: std::integral_constant<
bool, std::is_same<paddle::platform::bfloat16,
typename std::remove_cv<
paddle::platform::bfloat16>::type>::value> {};
template <>
struct is_signed<paddle::platform::bfloat16> {
static const bool value = true;
};
template <>
struct is_unsigned<paddle::platform::bfloat16> {
static const bool value = false;
};
inline bool isnan(const paddle::platform::bfloat16& a) {
return paddle::platform::isnan(a);
}
inline bool isinf(const paddle::platform::bfloat16& a) {
return paddle::platform::isinf(a);
}
template <>
struct numeric_limits<paddle::platform::bfloat16> {
static const bool is_specialized = true;
static const bool is_signed = true;
static const bool is_integer = false;
static const bool is_exact = false;
static const bool has_infinity = true;
static const bool has_quiet_NaN = true;
static const bool has_signaling_NaN = true;
static const float_denorm_style has_denorm = denorm_present;
static const bool has_denorm_loss = false;
static const std::float_round_style round_style = std::round_to_nearest;
static const bool is_iec559 = false;
static const bool is_bounded = false;
static const bool is_modulo = false;
static const int digits = 8;
static const int digits10 = 2;
static const int max_digits10 = 9;
static const int radix = 2;
static const int min_exponent = -125;
static const int min_exponent10 = -37;
static const int max_exponent = 128;
static const int max_exponent10 = 38;
static const bool traps = true;
static const bool tinyness_before = false;
static paddle::platform::bfloat16(min)() {
return paddle::platform::raw_uint16_to_bfloat16(0x007f);
}
static paddle::platform::bfloat16 lowest() {
return paddle::platform::raw_uint16_to_bfloat16(0xff7f);
}
static paddle::platform::bfloat16(max)() {
return paddle::platform::raw_uint16_to_bfloat16(0x7f7f);
}
static paddle::platform::bfloat16 epsilon() {
return paddle::platform::raw_uint16_to_bfloat16(0x3400);
}
static paddle::platform::bfloat16 round_error() {
return paddle::platform::bfloat16(0.5);
}
static paddle::platform::bfloat16 infinity() {
return paddle::platform::raw_uint16_to_bfloat16(0x7f80);
}
static paddle::platform::bfloat16 quiet_NaN() {
return paddle::platform::raw_uint16_to_bfloat16(0xffc1);
}
static paddle::platform::bfloat16 signaling_NaN() {
return paddle::platform::raw_uint16_to_bfloat16(0xff81);
}
static paddle::platform::bfloat16 denorm_min() {
return paddle::platform::raw_uint16_to_bfloat16(0x0001);
}
};
} // namespace std
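After this hunk, paddle/fluid/platform/bfloat16.h is reduced to a compatibility shim: the whole implementation above is deleted and only the `using` declarations remain. Note that an alias makes the old and new names the very same type, not a layout-compatible copy; a standalone check (assuming both headers are on the include path):

```cpp
#include <type_traits>
#include "paddle/fluid/platform/bfloat16.h"  // now just the alias shim

// No ABI or overload-resolution change for existing fluid code:
static_assert(std::is_same<paddle::platform::bfloat16,
                           pten::dtype::bfloat16>::value,
              "platform::bfloat16 is the pten type, not a copy of it");
```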
......@@ -14,536 +14,14 @@
#pragma once
#include <stdint.h>
#include <complex>
#include <cstring>
#include <iostream>
#include <limits>
#ifdef PADDLE_WITH_CUDA
#include <cuComplex.h>
#include <thrust/complex.h>
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include <hip/hip_complex.h>
#include <thrust/complex.h> // NOLINT
#endif
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif
#if (defined(__CUDACC__) || defined(__HIPCC__))
#define HOSTDEVICE __host__ __device__
#define DEVICE __device__
#define HOST __host__
#else
#define HOSTDEVICE
#define DEVICE
#define HOST
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// todo
#define PADDLE_WITH_CUDA_OR_HIP_COMPLEX
#endif
#include "paddle/pten/common/complex.h"
namespace paddle {
namespace platform {
template <typename T>
struct PADDLE_ALIGN(sizeof(T) * 2) complex {
public:
T real;
T imag;
using value_type = T;
complex() = default;
complex(const complex<T>& o) = default;
complex& operator=(const complex<T>& o) = default;
complex(complex<T>&& o) = default;
complex& operator=(complex<T>&& o) = default;
~complex() = default;
HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T1>
HOSTDEVICE inline explicit complex(const thrust::complex<T1>& c) {
real = c.real();
imag = c.imag();
}
template <typename T1>
HOSTDEVICE inline explicit operator thrust::complex<T1>() const {
return thrust::complex<T1>(real, imag);
}
#ifdef PADDLE_WITH_HIP
HOSTDEVICE inline explicit operator hipFloatComplex() const {
return make_hipFloatComplex(real, imag);
}
HOSTDEVICE inline explicit operator hipDoubleComplex() const {
return make_hipDoubleComplex(real, imag);
}
#else
HOSTDEVICE inline explicit operator cuFloatComplex() const {
return make_cuFloatComplex(real, imag);
}
HOSTDEVICE inline explicit operator cuDoubleComplex() const {
return make_cuDoubleComplex(real, imag);
}
#endif
#endif
template <typename T1,
typename std::enable_if<std::is_floating_point<T1>::value ||
std::is_integral<T1>::value,
int>::type = 0>
HOSTDEVICE complex(const T1& val) {
real = static_cast<T>(val);
imag = static_cast<T>(0.0);
}
template <typename T1 = T>
HOSTDEVICE explicit complex(
const std::enable_if_t<std::is_same<T1, float>::value, complex<double>>&
val) {
real = val.real;
imag = val.imag;
}
template <typename T1 = T>
HOSTDEVICE explicit complex(
const std::enable_if_t<std::is_same<T1, double>::value, complex<float>>&
val) {
real = val.real;
imag = val.imag;
}
template <typename T1>
HOSTDEVICE inline explicit operator std::complex<T1>() const {
return static_cast<std::complex<T1>>(std::complex<T>(real, imag));
}
template <typename T1>
HOSTDEVICE complex(const std::complex<T1>& val)
: real(val.real()), imag(val.imag()) {}
template <typename T1,
typename std::enable_if<std::is_floating_point<T1>::value ||
std::is_integral<T1>::value,
int>::type = 0>
HOSTDEVICE inline complex& operator=(const T1& val) {
real = static_cast<T>(val);
imag = static_cast<T>(0.0);
return *this;
}
HOSTDEVICE inline explicit operator bool() const {
return static_cast<bool>(this->real) || static_cast<bool>(this->imag);
}
HOSTDEVICE inline explicit operator int8_t() const {
return static_cast<int8_t>(this->real);
}
HOSTDEVICE inline explicit operator uint8_t() const {
return static_cast<uint8_t>(this->real);
}
HOSTDEVICE inline explicit operator int16_t() const {
return static_cast<int16_t>(this->real);
}
HOSTDEVICE inline explicit operator uint16_t() const {
return static_cast<uint16_t>(this->real);
}
HOSTDEVICE inline explicit operator int32_t() const {
return static_cast<int32_t>(this->real);
}
HOSTDEVICE inline explicit operator uint32_t() const {
return static_cast<uint32_t>(this->real);
}
HOSTDEVICE inline explicit operator int64_t() const {
return static_cast<int64_t>(this->real);
}
HOSTDEVICE inline explicit operator uint64_t() const {
return static_cast<uint64_t>(this->real);
}
HOSTDEVICE inline explicit operator float() const {
return static_cast<float>(this->real);
}
HOSTDEVICE inline explicit operator double() const {
return static_cast<double>(this->real);
}
};
template <typename T>
HOSTDEVICE inline complex<T> operator+(const complex<T>& a,
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::complex<T>(a) + thrust::complex<T>(b));
#else
return complex<T>(a.real + b.real, a.imag + b.imag);
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> operator-(const complex<T>& a,
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::complex<T>(a) - thrust::complex<T>(b));
#else
return complex<T>(a.real - b.real, a.imag - b.imag);
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> operator*(const complex<T>& a,
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::complex<T>(a) * thrust::complex<T>(b));
#else
return complex<T>(a.real * b.real - a.imag * b.imag,
a.imag * b.real + b.imag * a.real);
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> operator/(const complex<T>& a,
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::complex<T>(a) / thrust::complex<T>(b));
#else
T denominator = b.real * b.real + b.imag * b.imag;
return complex<T>((a.real * b.real + a.imag * b.imag) / denominator,
(a.imag * b.real - a.real * b.imag) / denominator);
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> operator-(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(-thrust::complex<T>(a.real, a.imag));
#else
complex<T> res;
res.real = -a.real;
res.imag = -a.imag;
return res;
#endif
}
template <typename T>
HOSTDEVICE inline complex<T>& operator+=(complex<T>& a, // NOLINT
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
a = complex<T>(thrust::complex<T>(a.real, a.imag) +=
thrust::complex<T>(b.real, b.imag));
return a;
#else
a.real += b.real;
a.imag += b.imag;
return a;
#endif
}
template <typename T>
HOSTDEVICE inline complex<T>& operator-=(complex<T>& a, // NOLINT
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
a = complex<T>(thrust::complex<T>(a.real, a.imag) -=
thrust::complex<T>(b.real, b.imag));
return a;
#else
a.real -= b.real;
a.imag -= b.imag;
return a;
#endif
}
template <typename T>
HOSTDEVICE inline complex<T>& operator*=(complex<T>& a, // NOLINT
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
a = complex<T>(thrust::complex<T>(a.real, a.imag) *=
thrust::complex<T>(b.real, b.imag));
return a;
#else
a.real = a.real * b.real - a.imag * b.imag;
a.imag = a.imag * b.real + b.imag * a.real;
return a;
#endif
}
template <typename T>
HOSTDEVICE inline complex<T>& operator/=(complex<T>& a, // NOLINT
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
a = complex<T>(thrust::complex<T>(a.real, a.imag) /=
thrust::complex<T>(b.real, b.imag));
return a;
#else
T denominator = b.real * b.real + b.imag * b.imag;
a.real = (a.real * b.real + a.imag * b.imag) / denominator;
a.imag = (a.imag * b.real - a.real * b.imag) / denominator;
return a;
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> raw_uint16_to_complex64(uint16_t a) {
complex<T> res;
res.real = a;
res.imag = 0.0;
return res;
}
template <typename T>
using complex = pten::dtype::complex<T>;
template <typename T>
HOSTDEVICE inline bool operator==(const complex<T>& a, const complex<T>& b) {
return a.real == b.real && a.imag == b.imag;
}
template <typename T>
HOSTDEVICE inline bool operator!=(const complex<T>& a, const complex<T>& b) {
return a.real != b.real || a.imag != b.imag;
}
template <typename T>
HOSTDEVICE inline bool operator<(const complex<T>& a, const complex<T>& b) {
return a.real < b.real;
}
template <typename T>
HOSTDEVICE inline bool operator<=(const complex<T>& a, const complex<T>& b) {
return a.real <= b.real;
}
template <typename T>
HOSTDEVICE inline bool operator>(const complex<T>& a, const complex<T>& b) {
return a.real > b.real;
}
template <typename T>
HOSTDEVICE inline bool operator>=(const complex<T>& a, const complex<T>& b) {
return a.real >= b.real;
}
template <typename T>
HOSTDEVICE inline complex<T>(max)(const complex<T>& a, const complex<T>& b) {
return (a.real >= b.real) ? a : b;
}
template <typename T>
HOSTDEVICE inline complex<T>(min)(const complex<T>& a, const complex<T>& b) {
return (a.real < b.real) ? a : b;
}
template <typename T>
HOSTDEVICE inline bool(isnan)(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return ::isnan(a.real) || ::isnan(a.imag);
#else
return std::isnan(a.real) || std::isnan(a.imag);
#endif
}
template <typename T>
HOSTDEVICE inline bool isinf(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return ::isinf(a.real) || ::isinf(a.imag);
#else
return std::isinf(a.real) || std::isinf(a.imag);
#endif
}
template <typename T>
HOSTDEVICE inline bool isfinite(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return ::isfinite(a.real) || ::isfinite(a.imag);
#else
return std::isfinite(a.real) || std::isfinite(a.imag);
#endif
}
template <typename T>
HOSTDEVICE inline T abs(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return thrust::abs(thrust::complex<T>(a));
#else
return std::abs(std::complex<T>(a));
#endif
}
template <typename T>
HOSTDEVICE inline T arg(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return thrust::arg(thrust::complex<T>(a));
#else
return std::arg(std::complex<T>(a));
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> pow(const complex<T>& a, const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::pow(thrust::complex<T>(a), thrust::complex<T>(b)));
#else
return complex<T>(std::pow(std::complex<T>(a), std::complex<T>(b)));
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> sqrt(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::sqrt(thrust::complex<T>(a)));
#else
return complex<T>(std::sqrt(std::complex<T>(a)));
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> tanh(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::tanh(thrust::complex<T>(a)));
#else
return complex<T>(std::tanh(std::complex<T>(a)));
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> log(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::log(thrust::complex<T>(a)));
#else
return complex<T>(std::log(std::complex<T>(a)));
#endif
}
template <typename T>
inline std::ostream& operator<<(std::ostream& os, const complex<T>& a) {
os << "real:" << a.real << " imag:" << a.imag;
return os;
}
using namespace pten::dtype; // NOLINT
} // namespace platform
} // namespace paddle
namespace std {
template <typename T>
struct is_pod<paddle::platform::complex<T>> {
static const bool value = true;
};
template <typename T>
struct is_floating_point<paddle::platform::complex<T>>
: std::integral_constant<bool, false> {};
template <typename T>
struct is_signed<paddle::platform::complex<T>> {
static const bool value = false;
};
template <typename T>
struct is_unsigned<paddle::platform::complex<T>> {
static const bool value = false;
};
template <typename T>
inline bool isnan(const paddle::platform::complex<T>& a) {
return paddle::platform::isnan(a);
}
template <typename T>
inline bool isinf(const paddle::platform::complex<T>& a) {
return paddle::platform::isinf(a);
}
template <typename T>
struct numeric_limits<paddle::platform::complex<T>> {
static const bool is_specialized = false;
static const bool is_signed = false;
static const bool is_integer = false;
static const bool is_exact = false;
static const bool has_infinity = false;
static const bool has_quiet_NaN = false;
static const bool has_signaling_NaN = false;
static const float_denorm_style has_denorm = denorm_absent;
static const bool has_denorm_loss = false;
static const std::float_round_style round_style = std::round_toward_zero;
static const bool is_iec559 = false;
static const bool is_bounded = false;
static const bool is_modulo = false;
static const int digits = 0;
static const int digits10 = 0;
static const int max_digits10 = 0;
static const int radix = 0;
static const int min_exponent = 0;
static const int min_exponent10 = 0;
static const int max_exponent = 0;
static const int max_exponent10 = 0;
static const bool traps = false;
static const bool tinyness_before = false;
static paddle::platform::complex<T>(min)() {
return paddle::platform::complex<T>(0.0, 0.0);
}
static paddle::platform::complex<T> lowest() {
return paddle::platform::complex<T>(0.0, 0.0);
}
static paddle::platform::complex<T>(max)() {
return paddle::platform::complex<T>(0.0, 0.0);
}
static paddle::platform::complex<T> epsilon() {
return paddle::platform::complex<T>(0.0, 0.0);
}
static paddle::platform::complex<T> round_error() {
return paddle::platform::complex<T>(0.0, 0.0);
}
static paddle::platform::complex<T> infinity() {
return paddle::platform::complex<T>(0.0, 0.0);
}
static paddle::platform::complex<T> quiet_NaN() {
return paddle::platform::complex<T>(0.0, 0.0);
}
static paddle::platform::complex<T> signaling_NaN() {
return paddle::platform::complex<T>(0.0, 0.0);
}
static paddle::platform::complex<T> denorm_min() {
return paddle::platform::complex<T>(0.0, 0.0);
}
};
} // namespace std
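The complex shim is analogous, except the compatibility name must be an alias template since complex is parameterized on its element type. A sketch of the guarantee this gives callers (assuming both headers are on the include path):

```cpp
#include <type_traits>
#include "paddle/fluid/platform/complex.h"  // now just the alias shim

// An alias template instantiates to the very same specialization:
static_assert(std::is_same<paddle::platform::complex<float>,
                           pten::dtype::complex<float>>::value,
              "platform::complex<float> is pten::dtype::complex<float>");
```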
......@@ -23,12 +23,6 @@ limitations under the License. */
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
DECLARE_bool(cudnn_deterministic);
namespace paddle {
......
......@@ -27,12 +27,6 @@ limitations under the License. */
// MIOPEN does not have an epsilon definition
#define CUDNN_BN_MIN_EPSILON 1e-05
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
DECLARE_bool(cudnn_deterministic);
namespace paddle {
......
This diff is collapsed.
# Adapt to the custom op mechanism: include the data-type-related header files
# to avoid exposing the paths of the underlying files; remove this after
# float16.h/complex.h/bfloat16.h are moved into pten
include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
# paddle experimental common components
add_subdirectory(common)
......
......@@ -49,11 +49,11 @@ limitations under the License. */
*/
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/common/float16.h"
namespace paddle {
namespace experimental {
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <cmath>
#include <cstring>
#include <iostream>
#include <limits>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif
#if defined(__CUDACC__) && CUDA_VERSION >= 11000
#define PADDLE_CUDA_BF16
#include <cuda_bf16.h>
#endif
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif
#if (defined(__CUDACC__) || defined(__HIPCC__))
#define HOSTDEVICE __host__ __device__
#define DEVICE __device__
#define HOST __host__
#else
#define HOSTDEVICE
#define DEVICE
#define HOST
#endif
namespace pten {
namespace dtype {
struct PADDLE_ALIGN(2) bfloat16 {
public:
uint16_t x;
// Constructors
bfloat16() = default;
bfloat16(const bfloat16& o) = default;
bfloat16& operator=(const bfloat16& o) = default;
bfloat16(bfloat16&& o) = default;
bfloat16& operator=(bfloat16&& o) = default;
~bfloat16() = default;
HOSTDEVICE inline explicit bfloat16(float val) {
#ifdef PADDLE_WITH_HIP
uint32_t res = 0;
uint32_t* tempRes;
// We should be using memcpy in order to respect the strict aliasing rule
// but it fails in the HIP environment.
tempRes = reinterpret_cast<uint32_t*>(&val);
res = *tempRes;
x = res >> 16;
#else
#if defined(PADDLE_CUDA_BF16)
__nv_bfloat16 tmp = __float2bfloat16(val);
x = *reinterpret_cast<uint16_t*>(&tmp);
#else
std::memcpy(&x, reinterpret_cast<char*>(&val) + 2, 2);  // high 16 bits (little-endian)
#endif
#endif
}
#if defined(PADDLE_CUDA_BF16)
HOSTDEVICE inline explicit bfloat16(const __nv_bfloat16& val) {
x = *reinterpret_cast<const unsigned short*>(&val);
}
#endif
template <class T>
HOSTDEVICE inline explicit bfloat16(const T& val)
: x(bfloat16(static_cast<float>(val)).x) {}
// Assignment operators
#if defined(PADDLE_CUDA_BF16)
HOSTDEVICE inline bfloat16& operator=(const __nv_bfloat16& val) {
x = *reinterpret_cast<const unsigned short*>(&val);
return *this;
}
#endif
HOSTDEVICE inline bfloat16& operator=(bool b) {
x = b ? 0x3f80 : 0;  // 0x3f80 is the bfloat16 bit pattern of 1.0f
return *this;
}
HOSTDEVICE inline bfloat16& operator=(int8_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(uint8_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(int16_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(uint16_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(int32_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(uint32_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(int64_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(uint64_t val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(float val) {
x = bfloat16(val).x;
return *this;
}
HOSTDEVICE inline bfloat16& operator=(double val) {
x = bfloat16(val).x;
return *this;
}
// Conversion operators
HOSTDEVICE inline explicit operator float() const {
#ifdef PADDLE_WITH_HIP
uint32_t res = 0;
// We should be using memcpy in order to respect the strict aliasing rule
// but it fails in the HIP environment.
uint16_t temp = x;
uint16_t* temp_ptr = reinterpret_cast<uint16_t*>(&temp);
res = *temp_ptr;
return res;
#else
#ifdef PADDLE_CUDA_BF16
return __bfloat162float(*reinterpret_cast<const __nv_bfloat16*>(&x));
#else
float val = 0.f;
uint16_t temp = x;
std::memcpy(
reinterpret_cast<char*>(&val) + 2, reinterpret_cast<char*>(&temp), 2);
return val;
#endif
#endif
}
#ifdef PADDLE_CUDA_BF16
HOSTDEVICE inline explicit operator __nv_bfloat16() const {
return *reinterpret_cast<const __nv_bfloat16*>(&x);
}
#endif
HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
HOSTDEVICE inline explicit operator int8_t() const {
return static_cast<int8_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator uint8_t() const {
return static_cast<uint8_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator int16_t() const {
return static_cast<int16_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator uint16_t() const {
return static_cast<uint16_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator int32_t() const {
return static_cast<int32_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator uint32_t() const {
return static_cast<uint32_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator int64_t() const {
return static_cast<int64_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator uint64_t() const {
return static_cast<uint64_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator double() const {
return static_cast<double>(static_cast<float>(*this));
}
};
HOSTDEVICE inline bfloat16 operator+(const bfloat16& a, const bfloat16& b) {
return bfloat16(static_cast<float>(a) + static_cast<float>(b));
}
HOSTDEVICE inline bfloat16 operator-(const bfloat16& a, const bfloat16& b) {
return bfloat16(static_cast<float>(a) - static_cast<float>(b));
}
HOSTDEVICE inline bfloat16 operator*(const bfloat16& a, const bfloat16& b) {
return bfloat16(static_cast<float>(a) * static_cast<float>(b));
}
HOSTDEVICE inline bfloat16 operator/(const bfloat16& a, const bfloat16& b) {
return bfloat16(static_cast<float>(a) / static_cast<float>(b));
}
HOSTDEVICE inline bfloat16 operator-(const bfloat16& a) {
bfloat16 res;
res.x = a.x ^ 0x8000;  // flip the IEEE-754 sign bit
return res;
}
HOSTDEVICE inline bfloat16& operator+=(bfloat16& a, // NOLINT
const bfloat16& b) {
a = bfloat16(static_cast<float>(a) + static_cast<float>(b));
return a;
}
HOSTDEVICE inline bfloat16& operator-=(bfloat16& a, // NOLINT
const bfloat16& b) {
a = bfloat16(static_cast<float>(a) - static_cast<float>(b));
return a;
}
HOSTDEVICE inline bfloat16& operator*=(bfloat16& a, // NOLINT
const bfloat16& b) {
a = bfloat16(static_cast<float>(a) * static_cast<float>(b));
return a;
}
HOSTDEVICE inline bfloat16& operator/=(bfloat16& a, // NOLINT
const bfloat16& b) {
a = bfloat16(static_cast<float>(a) / static_cast<float>(b));
return a;
}
HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) {
bfloat16 res;
res.x = a;
return res;
}
// Comparison operators
HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) {
return static_cast<float>(a) == static_cast<float>(b);
}
HOSTDEVICE inline bool operator!=(const bfloat16& a, const bfloat16& b) {
return static_cast<float>(a) != static_cast<float>(b);
}
HOSTDEVICE inline bool operator<(const bfloat16& a, const bfloat16& b) {
return static_cast<float>(a) < static_cast<float>(b);
}
HOSTDEVICE inline bool operator<=(const bfloat16& a, const bfloat16& b) {
return static_cast<float>(a) <= static_cast<float>(b);
}
HOSTDEVICE inline bool operator>(const bfloat16& a, const bfloat16& b) {
return static_cast<float>(a) > static_cast<float>(b);
}
HOSTDEVICE inline bool operator>=(const bfloat16& a, const bfloat16& b) {
return static_cast<float>(a) >= static_cast<float>(b);
}
HOSTDEVICE inline bool(isnan)(const bfloat16& a) {
return (a.x & 0x7FFF) > 0x7F80;  // exponent all ones, nonzero mantissa
}
HOSTDEVICE inline bool(isinf)(const bfloat16& a) {
return (a.x & 0x7F80) == 0x7F80;  // exponent field all ones
}
HOSTDEVICE inline bool(isfinite)(const bfloat16& a) {
return !((isnan)(a)) && !((isinf)(a));
}
inline std::ostream& operator<<(std::ostream& os, const bfloat16& a) {
os << a.x;
return os;
}
} // namespace dtype
} // namespace pten
namespace std {
template <>
struct is_pod<pten::dtype::bfloat16> {
static const bool value = is_trivial<pten::dtype::bfloat16>::value &&
is_standard_layout<pten::dtype::bfloat16>::value;
};
template <>
struct is_floating_point<pten::dtype::bfloat16>
: std::integral_constant<
bool,
std::is_same<
pten::dtype::bfloat16,
typename std::remove_cv<pten::dtype::bfloat16>::type>::value> {};
template <>
struct is_signed<pten::dtype::bfloat16> {
static const bool value = true;
};
template <>
struct is_unsigned<pten::dtype::bfloat16> {
static const bool value = false;
};
inline bool isnan(const pten::dtype::bfloat16& a) {
return pten::dtype::isnan(a);
}
inline bool isinf(const pten::dtype::bfloat16& a) {
return pten::dtype::isinf(a);
}
template <>
struct numeric_limits<pten::dtype::bfloat16> {
static const bool is_specialized = true;
static const bool is_signed = true;
static const bool is_integer = false;
static const bool is_exact = false;
static const bool has_infinity = true;
static const bool has_quiet_NaN = true;
static const bool has_signaling_NaN = true;
static const float_denorm_style has_denorm = denorm_present;
static const bool has_denorm_loss = false;
static const std::float_round_style round_style = std::round_to_nearest;
static const bool is_iec559 = false;
static const bool is_bounded = false;
static const bool is_modulo = false;
static const int digits = 8;
static const int digits10 = 2;
static const int max_digits10 = 9;
static const int radix = 2;
static const int min_exponent = -125;
static const int min_exponent10 = -37;
static const int max_exponent = 128;
static const int max_exponent10 = 38;
static const bool traps = true;
static const bool tinyness_before = false;
static pten::dtype::bfloat16(min)() {
return pten::dtype::raw_uint16_to_bfloat16(0x007f);
}
static pten::dtype::bfloat16 lowest() {
return pten::dtype::raw_uint16_to_bfloat16(0xff7f);
}
static pten::dtype::bfloat16(max)() {
return pten::dtype::raw_uint16_to_bfloat16(0x7f7f);
}
static pten::dtype::bfloat16 epsilon() {
return pten::dtype::raw_uint16_to_bfloat16(0x3400);
}
static pten::dtype::bfloat16 round_error() {
return pten::dtype::bfloat16(0.5);
}
static pten::dtype::bfloat16 infinity() {
return pten::dtype::raw_uint16_to_bfloat16(0x7f80);
}
static pten::dtype::bfloat16 quiet_NaN() {
return pten::dtype::raw_uint16_to_bfloat16(0xffc1);
}
static pten::dtype::bfloat16 signaling_NaN() {
return pten::dtype::raw_uint16_to_bfloat16(0xff81);
}
static pten::dtype::bfloat16 denorm_min() {
return pten::dtype::raw_uint16_to_bfloat16(0x0001);
}
};
} // namespace std
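For reference, a minimal usage sketch of the new header (the include path follows this diff; note that on the CPU fallback path construction truncates to the top 16 bits of the IEEE-754 float, and that operator<< streams the raw bit pattern rather than the numeric value):

```cpp
#include <iostream>
#include "paddle/pten/common/bfloat16.h"

int main() {
  pten::dtype::bfloat16 a(1.5f);   // 1.5 fits the 8-bit significand exactly
  pten::dtype::bfloat16 b(0.25f);  // so does 0.25
  pten::dtype::bfloat16 c = a + b; // arithmetic round-trips through float
  std::cout << static_cast<float>(c) << "\n";  // 1.75
  std::cout << c << "\n";  // prints c.x, the raw uint16_t representation
  return 0;
}
```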
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <complex>
#include <cstring>
#include <iostream>
#include <limits>
#ifdef PADDLE_WITH_CUDA
#include <cuComplex.h>
#include <thrust/complex.h>
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include <hip/hip_complex.h>
#include <thrust/complex.h> // NOLINT
#endif
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif
#if (defined(__CUDACC__) || defined(__HIPCC__))
#define HOSTDEVICE __host__ __device__
#define DEVICE __device__
#define HOST __host__
#else
#define HOSTDEVICE
#define DEVICE
#define HOST
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// todo
#define PADDLE_WITH_CUDA_OR_HIP_COMPLEX
#endif
namespace pten {
namespace dtype {
template <typename T>
struct PADDLE_ALIGN(sizeof(T) * 2) complex {
public:
T real;
T imag;
using value_type = T;
complex() = default;
complex(const complex<T>& o) = default;
complex& operator=(const complex<T>& o) = default;
complex(complex<T>&& o) = default;
complex& operator=(complex<T>&& o) = default;
~complex() = default;
HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T1>
HOSTDEVICE inline explicit complex(const thrust::complex<T1>& c) {
real = c.real();
imag = c.imag();
}
template <typename T1>
HOSTDEVICE inline explicit operator thrust::complex<T1>() const {
return thrust::complex<T1>(real, imag);
}
#ifdef PADDLE_WITH_HIP
HOSTDEVICE inline explicit operator hipFloatComplex() const {
return make_hipFloatComplex(real, imag);
}
HOSTDEVICE inline explicit operator hipDoubleComplex() const {
return make_hipDoubleComplex(real, imag);
}
#else
HOSTDEVICE inline explicit operator cuFloatComplex() const {
return make_cuFloatComplex(real, imag);
}
HOSTDEVICE inline explicit operator cuDoubleComplex() const {
return make_cuDoubleComplex(real, imag);
}
#endif
#endif
template <typename T1,
typename std::enable_if<std::is_floating_point<T1>::value ||
std::is_integral<T1>::value,
int>::type = 0>
HOSTDEVICE complex(const T1& val) {
real = static_cast<T>(val);
imag = static_cast<T>(0.0);
}
template <typename T1 = T>
HOSTDEVICE explicit complex(
const std::enable_if_t<std::is_same<T1, float>::value, complex<double>>&
val) {
real = val.real;
imag = val.imag;
}
template <typename T1 = T>
HOSTDEVICE explicit complex(
const std::enable_if_t<std::is_same<T1, double>::value, complex<float>>&
val) {
real = val.real;
imag = val.imag;
}
template <typename T1>
HOSTDEVICE inline explicit operator std::complex<T1>() const {
return static_cast<std::complex<T1>>(std::complex<T>(real, imag));
}
template <typename T1>
HOSTDEVICE complex(const std::complex<T1>& val)
: real(val.real()), imag(val.imag()) {}
template <typename T1,
typename std::enable_if<std::is_floating_point<T1>::value ||
std::is_integral<T1>::value,
int>::type = 0>
HOSTDEVICE inline complex& operator=(const T1& val) {
real = static_cast<T>(val);
imag = static_cast<T>(0.0);
return *this;
}
HOSTDEVICE inline explicit operator bool() const {
return static_cast<bool>(this->real) || static_cast<bool>(this->imag);
}
HOSTDEVICE inline explicit operator int8_t() const {
return static_cast<int8_t>(this->real);
}
HOSTDEVICE inline explicit operator uint8_t() const {
return static_cast<uint8_t>(this->real);
}
HOSTDEVICE inline explicit operator int16_t() const {
return static_cast<int16_t>(this->real);
}
HOSTDEVICE inline explicit operator uint16_t() const {
return static_cast<uint16_t>(this->real);
}
HOSTDEVICE inline explicit operator int32_t() const {
return static_cast<int32_t>(this->real);
}
HOSTDEVICE inline explicit operator uint32_t() const {
return static_cast<uint32_t>(this->real);
}
HOSTDEVICE inline explicit operator int64_t() const {
return static_cast<int64_t>(this->real);
}
HOSTDEVICE inline explicit operator uint64_t() const {
return static_cast<uint64_t>(this->real);
}
HOSTDEVICE inline explicit operator float() const {
return static_cast<float>(this->real);
}
HOSTDEVICE inline explicit operator double() const {
return static_cast<double>(this->real);
}
};
template <typename T>
HOSTDEVICE inline complex<T> operator+(const complex<T>& a,
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::complex<T>(a) + thrust::complex<T>(b));
#else
return complex<T>(a.real + b.real, a.imag + b.imag);
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> operator-(const complex<T>& a,
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::complex<T>(a) - thrust::complex<T>(b));
#else
return complex<T>(a.real - b.real, a.imag - b.imag);
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> operator*(const complex<T>& a,
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::complex<T>(a) * thrust::complex<T>(b));
#else
return complex<T>(a.real * b.real - a.imag * b.imag,
a.imag * b.real + b.imag * a.real);
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> operator/(const complex<T>& a,
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::complex<T>(a) / thrust::complex<T>(b));
#else
T denominator = b.real * b.real + b.imag * b.imag;
return complex<T>((a.real * b.real + a.imag * b.imag) / denominator,
(a.imag * b.real - a.real * b.imag) / denominator);
#endif
}
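// For reference, the scalar fallback above implements the usual identity
//   a / b = a * conj(b) / |b|^2
//         = ((ar*br + ai*bi) + i*(ai*br - ar*bi)) / (br*br + bi*bi),
// which is exactly what the two component expressions compute.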
template <typename T>
HOSTDEVICE inline complex<T> operator-(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(-thrust::complex<T>(a.real, a.imag));
#else
complex<T> res;
res.real = -a.real;
res.imag = -a.imag;
return res;
#endif
}
template <typename T>
HOSTDEVICE inline complex<T>& operator+=(complex<T>& a, // NOLINT
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
a = complex<T>(thrust::complex<T>(a.real, a.imag) +=
thrust::complex<T>(b.real, b.imag));
return a;
#else
a.real += b.real;
a.imag += b.imag;
return a;
#endif
}
template <typename T>
HOSTDEVICE inline complex<T>& operator-=(complex<T>& a, // NOLINT
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
a = complex<T>(thrust::complex<T>(a.real, a.imag) -=
thrust::complex<T>(b.real, b.imag));
return a;
#else
a.real -= b.real;
a.imag -= b.imag;
return a;
#endif
}
template <typename T>
HOSTDEVICE inline complex<T>& operator*=(complex<T>& a, // NOLINT
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
a = complex<T>(thrust::complex<T>(a.real, a.imag) *=
thrust::complex<T>(b.real, b.imag));
return a;
#else
  // Compute into temporaries: updating a.real in place would feed the new
  // value into the a.imag expression below.
  T res_real = a.real * b.real - a.imag * b.imag;
  T res_imag = a.imag * b.real + b.imag * a.real;
  a.real = res_real;
  a.imag = res_imag;
  return a;
#endif
}
template <typename T>
HOSTDEVICE inline complex<T>& operator/=(complex<T>& a, // NOLINT
const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
a = complex<T>(thrust::complex<T>(a.real, a.imag) /=
thrust::complex<T>(b.real, b.imag));
return a;
#else
  T denominator = b.real * b.real + b.imag * b.imag;
  // Use temporaries for the same aliasing reason as operator*= above.
  T res_real = (a.real * b.real + a.imag * b.imag) / denominator;
  T res_imag = (a.imag * b.real - a.real * b.imag) / denominator;
  a.real = res_real;
  a.imag = res_imag;
  return a;
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> raw_uint16_to_complex64(uint16_t a) {
complex<T> res;
res.real = a;
res.imag = 0.0;
return res;
}
template <typename T>
HOSTDEVICE inline bool operator==(const complex<T>& a, const complex<T>& b) {
return a.real == b.real && a.imag == b.imag;
}
template <typename T>
HOSTDEVICE inline bool operator!=(const complex<T>& a, const complex<T>& b) {
return a.real != b.real || a.imag != b.imag;
}
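// Note: the relational operators below (and the min/max helpers further
// down) order complex values by their real part only.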
template <typename T>
HOSTDEVICE inline bool operator<(const complex<T>& a, const complex<T>& b) {
return a.real < b.real;
}
template <typename T>
HOSTDEVICE inline bool operator<=(const complex<T>& a, const complex<T>& b) {
return a.real <= b.real;
}
template <typename T>
HOSTDEVICE inline bool operator>(const complex<T>& a, const complex<T>& b) {
return a.real > b.real;
}
template <typename T>
HOSTDEVICE inline bool operator>=(const complex<T>& a, const complex<T>& b) {
return a.real >= b.real;
}
template <typename T>
HOSTDEVICE inline complex<T>(max)(const complex<T>& a, const complex<T>& b) {
return (a.real >= b.real) ? a : b;
}
template <typename T>
HOSTDEVICE inline complex<T>(min)(const complex<T>& a, const complex<T>& b) {
return (a.real < b.real) ? a : b;
}
template <typename T>
HOSTDEVICE inline bool(isnan)(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return ::isnan(a.real) || ::isnan(a.imag);
#else
return std::isnan(a.real) || std::isnan(a.imag);
#endif
}
template <typename T>
HOSTDEVICE inline bool isinf(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return ::isinf(a.real) || ::isinf(a.imag);
#else
return std::isinf(a.real) || std::isinf(a.imag);
#endif
}
template <typename T>
HOSTDEVICE inline bool isfinite(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
  // A complex value is finite only when both components are finite.
  return ::isfinite(a.real) && ::isfinite(a.imag);
#else
  return std::isfinite(a.real) && std::isfinite(a.imag);
#endif
}
template <typename T>
HOSTDEVICE inline T abs(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return thrust::abs(thrust::complex<T>(a));
#else
return std::abs(std::complex<T>(a));
#endif
}
template <typename T>
HOSTDEVICE inline T arg(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return thrust::arg(thrust::complex<T>(a));
#else
return std::arg(std::complex<T>(a));
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> pow(const complex<T>& a, const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::pow(thrust::complex<T>(a), thrust::complex<T>(b)));
#else
return complex<T>(std::pow(std::complex<T>(a), std::complex<T>(b)));
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> sqrt(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::sqrt(thrust::complex<T>(a)));
#else
return complex<T>(std::sqrt(std::complex<T>(a)));
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> tanh(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::tanh(thrust::complex<T>(a)));
#else
return complex<T>(std::tanh(std::complex<T>(a)));
#endif
}
template <typename T>
HOSTDEVICE inline complex<T> log(const complex<T>& a) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
(defined(__CUDA_ARCH__) || defined(__HIPCC__))
return complex<T>(thrust::log(thrust::complex<T>(a)));
#else
return complex<T>(std::log(std::complex<T>(a)));
#endif
}
template <typename T>
inline std::ostream& operator<<(std::ostream& os, const complex<T>& a) {
os << "real:" << a.real << " imag:" << a.imag;
return os;
}
} // namespace dtype
} // namespace pten
namespace std {
template <typename T>
struct is_pod<pten::dtype::complex<T>> {
static const bool value = true;
};
template <typename T>
struct is_floating_point<pten::dtype::complex<T>>
: std::integral_constant<bool, false> {};
template <typename T>
struct is_signed<pten::dtype::complex<T>> {
static const bool value = false;
};
template <typename T>
struct is_unsigned<pten::dtype::complex<T>> {
static const bool value = false;
};
template <typename T>
inline bool isnan(const pten::dtype::complex<T>& a) {
return pten::dtype::isnan(a);
}
template <typename T>
inline bool isinf(const pten::dtype::complex<T>& a) {
return pten::dtype::isinf(a);
}
template <typename T>
struct numeric_limits<pten::dtype::complex<T>> {
static const bool is_specialized = false;
static const bool is_signed = false;
static const bool is_integer = false;
static const bool is_exact = false;
static const bool has_infinity = false;
static const bool has_quiet_NaN = false;
static const bool has_signaling_NaN = false;
static const float_denorm_style has_denorm = denorm_absent;
static const bool has_denorm_loss = false;
static const std::float_round_style round_style = std::round_toward_zero;
static const bool is_iec559 = false;
static const bool is_bounded = false;
static const bool is_modulo = false;
static const int digits = 0;
static const int digits10 = 0;
static const int max_digits10 = 0;
static const int radix = 0;
static const int min_exponent = 0;
static const int min_exponent10 = 0;
static const int max_exponent = 0;
static const int max_exponent10 = 0;
static const bool traps = false;
static const bool tinyness_before = false;
static pten::dtype::complex<T>(min)() {
return pten::dtype::complex<T>(0.0, 0.0);
}
static pten::dtype::complex<T> lowest() {
return pten::dtype::complex<T>(0.0, 0.0);
}
static pten::dtype::complex<T>(max)() {
return pten::dtype::complex<T>(0.0, 0.0);
}
static pten::dtype::complex<T> epsilon() {
return pten::dtype::complex<T>(0.0, 0.0);
}
static pten::dtype::complex<T> round_error() {
return pten::dtype::complex<T>(0.0, 0.0);
}
static pten::dtype::complex<T> infinity() {
return pten::dtype::complex<T>(0.0, 0.0);
}
static pten::dtype::complex<T> quiet_NaN() {
return pten::dtype::complex<T>(0.0, 0.0);
}
static pten::dtype::complex<T> signaling_NaN() {
return pten::dtype::complex<T>(0.0, 0.0);
}
static pten::dtype::complex<T> denorm_min() {
return pten::dtype::complex<T>(0.0, 0.0);
}
};
} // namespace std
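Taken together, the struct, its operators, and the std-namespace specializations above let pten::dtype::complex stand in for a scalar type in generic host code. The sketch below is illustrative only: the include path is an assumption based on the install rules in this patch, and the main() harness is not part of the source.

#include <complex>
#include <iostream>

#include "paddle/pten/common/complex.h"  // assumed install path

int main() {
  using c64 = pten::dtype::complex<float>;

  c64 a(3.0f);          // scalar ctor: imaginary part defaults to 0
  c64 b(1.0f, -2.0f);
  c64 c = a * b + b;    // free operators defined above

  // Round trip through std::complex via the explicit conversion operator.
  std::complex<double> sc = static_cast<std::complex<double>>(c);
  c64 back(sc);

  // The std-namespace helpers dispatch to pten::dtype::isnan / isinf.
  if (!std::isnan(back) && !std::isinf(back)) {
    std::cout << back << std::endl;  // prints "real:... imag:..."
  }
  return 0;
}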
......@@ -14,19 +14,19 @@ limitations under the License. */
#pragma once
#include "bfloat16.h" // NOLINT
#include "complex.h" // NOLINT
#include "float16.h" // NOLINT
#include "paddle/pten/common/bfloat16.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/api/ext/exception.h"
namespace paddle {
namespace experimental {
using complex64 = ::paddle::platform::complex<float>;
using complex128 = ::paddle::platform::complex<double>;
using float16 = ::paddle::platform::float16;
using bfloat16 = ::paddle::platform::bfloat16;
using complex64 = ::pten::dtype::complex<float>;
using complex128 = ::pten::dtype::complex<double>;
using float16 = ::pten::dtype::float16;
using bfloat16 = ::pten::dtype::bfloat16;
enum class DataType {
UNDEFINED = 0,
......
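Because the public aliases now target pten::dtype, code that spells out paddle::experimental::complex64 keeps compiling unchanged. A compile-time check of that equivalence (the header path here is an assumption, not taken from the patch):

#include <type_traits>

#include "paddle/pten/common/data_type.h"  // assumed location of the aliases above

static_assert(std::is_same<paddle::experimental::complex64,
                           ::pten::dtype::complex<float>>::value,
              "complex64 now resolves to the pten type");
static_assert(std::is_same<paddle::experimental::float16,
                           ::pten::dtype::float16>::value,
              "float16 now resolves to the pten type");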
......@@ -15,9 +15,9 @@ limitations under the License. */
#include "paddle/pten/core/dense_tensor.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/common/bfloat16.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/api/lib/utils/storage.h"
#include "paddle/pten/core/convert_utils.h"
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/infermeta/unary.h"
#include "paddle/pten/kernels/empty_kernel.h"
......
......@@ -19,7 +19,7 @@
#include "paddle/pten/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/common/complex.h"
PT_REGISTER_KERNEL(conj,
CPU,
......
......@@ -18,7 +18,7 @@
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/common/complex.h"
PT_REGISTER_KERNEL(dot_grad,
CPU,
......
......@@ -18,7 +18,7 @@
#include "paddle/pten/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/common/complex.h"
namespace pten {
......
......@@ -25,8 +25,8 @@
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/common/bfloat16.h"
#include "paddle/pten/common/complex.h"
namespace pten {
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/pten/kernels/matmul_grad_kernel.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/kernels/impl/matmul_grad_kernel_impl.h"
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/kernels/impl/matmul_kernel_impl.h"
PT_REGISTER_KERNEL(matmul,
......
......@@ -22,7 +22,7 @@ limitations under the License. */
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/pten/common/bfloat16.h"
namespace pten {
template <typename T, typename Context>
......
......@@ -19,6 +19,6 @@ limitations under the License. */
#include "paddle/pten/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/pten/common/bfloat16.h"
PT_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, pten::SignKernel, float, double) {}
......@@ -16,7 +16,7 @@
#include "paddle/pten/backends/all_context.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/common/complex.h"
namespace pten {
......
......@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/pten/common/float16.h"
namespace pten {
namespace funcs {
......
......@@ -18,9 +18,9 @@
#include "paddle/pten/core/dense_tensor.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/common/bfloat16.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/common/float16.h"
namespace pten {
namespace math {
......
......@@ -19,9 +19,9 @@
#include "paddle/pten/kernels/funcs/transpose.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/common/bfloat16.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/common/float16.h"
namespace pten {
......
......@@ -21,10 +21,10 @@
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/common/bfloat16.h"
#include "paddle/pten/common/float16.h"
namespace pten {
......
......@@ -19,7 +19,7 @@
#include "paddle/pten/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/common/complex.h"
PT_REGISTER_KERNEL(conj,
GPU,
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/common/complex.h"
PT_REGISTER_KERNEL(dot_grad,
GPU,
......
......@@ -20,7 +20,7 @@
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/common/complex.h"
namespace pten {
......
......@@ -27,9 +27,9 @@ limitations under the License. */
namespace cub = hipcub;
#endif
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/kernel_registry.h"
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/pten/kernels/matmul_grad_kernel.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/kernels/impl/matmul_grad_kernel_impl.h"
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/pten/common/complex.h"
#include "paddle/pten/kernels/impl/matmul_kernel_impl.h"
PT_REGISTER_KERNEL(matmul,
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/kernels/funcs/elementwise_base.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/common/float16.h"
namespace pten {
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/pten/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/common/float16.h"
using float16 = paddle::platform::float16;
......
......@@ -569,9 +569,6 @@ headers = (
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/common')) + # pten common headers
# For paddle's new custom op, only copy data type headers from `paddle/fluid/platform`
# to `paddle/pten/api/ext`,
['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/bfloat16.h'] +
['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex.h'] +
['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/float16.h'] +
['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'])
if '${WITH_MKLDNN}' == 'ON':
......