From f1143f0cdbab985d1d3878ff27a47409d237e932 Mon Sep 17 00:00:00 2001
From: Aurelius84
Date: Thu, 20 Jan 2022 18:31:17 +0800
Subject: [PATCH] [Pten] Migrate bfloat16/float16/complex from paddle::platform into pten::common (#39044)

* Migrate bfloat16/float16/complex from platform into pten::common

* fix typo

* fix code style
---
 cmake/inference_lib.cmake                     |    8 +-
 paddle/fluid/CMakeLists.txt                   |    5 -
 paddle/fluid/framework/data_type.h            |   11 -
 paddle/fluid/framework/data_type_test.cc      |    7 -
 paddle/fluid/framework/dlpack_tensor.cc       |    7 -
 paddle/fluid/framework/dlpack_tensor_test.cc  |    6 -
 paddle/fluid/framework/tensor_test.cc         |    6 -
 paddle/fluid/operators/assign_op.cc           |    3 -
 paddle/fluid/operators/assign_op_npu.cc       |    3 -
 paddle/fluid/operators/assign_op_xpu.cc       |    3 -
 .../collective/c_allreduce_max_op.cc          |    3 -
 .../collective/c_allreduce_max_op.cu.cc       |    7 -
 .../collective/c_allreduce_max_op_npu.cc      |    1 -
 .../collective/c_allreduce_max_op_xpu.cc      |    7 -
 .../collective/c_allreduce_min_op.cc          |    3 -
 .../collective/c_allreduce_min_op.cu.cc       |    7 -
 .../collective/c_allreduce_min_op_npu.cc      |    1 -
 .../collective/c_allreduce_min_op_xpu.cc      |    7 -
 .../collective/c_allreduce_prod_op.cc         |    3 -
 .../collective/c_allreduce_prod_op.cu.cc      |    7 -
 .../collective/c_allreduce_prod_op_npu.cc     |    1 -
 .../collective/c_allreduce_prod_op_xpu.cc     |    7 -
 .../collective/c_allreduce_sum_op.cc          |    3 -
 .../collective/c_allreduce_sum_op.cu.cc       |    7 -
 .../collective/c_allreduce_sum_op_npu.cc      |    1 -
 .../collective/c_allreduce_sum_op_xpu.cc      |    7 -
 .../operators/collective/c_reduce_max_op.cc   |    3 -
 .../collective/c_reduce_max_op.cu.cc          |    7 -
 .../collective/c_reduce_max_op_npu.cc         |    1 -
 .../collective/c_reduce_max_op_xpu.cc         |    7 -
 .../operators/collective/c_reduce_min_op.cc   |    3 -
 .../collective/c_reduce_min_op.cu.cc          |    7 -
 .../collective/c_reduce_min_op_npu.cc         |    1 -
 .../collective/c_reduce_min_op_xpu.cc         |    7 -
 .../operators/collective/c_reduce_prod_op.cc  |    3 -
 .../collective/c_reduce_prod_op.cu.cc         |    7 -
 .../collective/c_reduce_prod_op_npu.cc        |    1 -
 .../collective/c_reduce_prod_op_xpu.cc        |    7 -
 .../operators/collective/c_reduce_sum_op.cc   |    3 -
 .../collective/c_reduce_sum_op.cu.cc          |    7 -
 .../collective/c_reduce_sum_op_npu.cc         |    1 -
 .../collective/c_reduce_sum_op_xpu.cc         |    7 -
 .../operators/controlflow/fetch_v2_op.cc      |    3 -
 .../elementwise/elementwise_add_op.cc         |    7 -
 .../elementwise/elementwise_sub_op.cc         |    7 -
 paddle/fluid/operators/isfinite_v2_op.cc      |    1 -
 .../fluid/operators/math/concat_and_split.cc  |    4 +-
 paddle/fluid/operators/math/prelu.cu          |    7 +-
 paddle/fluid/operators/memcpy_d2h_op.cc       |    3 -
 paddle/fluid/operators/memcpy_h2d_op.cc       |    3 -
 paddle/fluid/operators/memcpy_op.cc           |    3 -
 paddle/fluid/operators/reshape_op.cc          |    3 -
 paddle/fluid/platform/bfloat16.h              |  391 +-----
 paddle/fluid/platform/complex.h               |  528 +-------
 .../platform/device/gpu/cuda/cudnn_helper.h   |    6 -
 .../platform/device/gpu/rocm/miopen_helper.h  |    6 -
 paddle/fluid/platform/float16.h               | 1068 +----------------
 paddle/pten/CMakeLists.txt                    |    5 -
 paddle/pten/api/lib/tensor.cc                 |    4 +-
 paddle/pten/common/bfloat16.h                 |  409 ++++++
 paddle/pten/common/complex.h                  |  548 +++++++++
 paddle/pten/common/data_type.h                |   14 +-
 paddle/pten/common/float16.h                  | 1094 +++++++++++++++++
 paddle/pten/core/dense_tensor.cc              |    6 +-
 paddle/pten/kernels/complex_kernel.h          |    2 +-
 paddle/pten/kernels/cpu/complex_kernel.cc     |    2 +-
 paddle/pten/kernels/cpu/dot_grad_kernel.cc    |    2 +-
 paddle/pten/kernels/cpu/dot_kernel.cc         |    2 +-
 paddle/pten/kernels/cpu/math_kernel.cc        |    4 +-
 paddle/pten/kernels/cpu/matmul_grad_kernel.cc |    2 +-
 paddle/pten/kernels/cpu/matmul_kernel.cc      |    2 +-
 paddle/pten/kernels/cpu/scale_kernel.cc       |    2 +-
 paddle/pten/kernels/cpu/sign_kernel.cc        |    2 +-
 paddle/pten/kernels/empty_kernel.cc           |    2 +-
 .../pten/kernels/funcs/elementwise_functor.h  |    2 +-
 paddle/pten/kernels/funcs/transpose.cc        |    6 +-
 paddle/pten/kernels/funcs/transpose.cu        |    6 +-
 paddle/pten/kernels/gpu/cast_kernel.cu        |    4 +-
 paddle/pten/kernels/gpu/complex_kernel.cu     |    2 +-
 paddle/pten/kernels/gpu/dot_grad_kernel.cu    |    2 +-
 paddle/pten/kernels/gpu/dot_kernel.cu         |    2 +-
 paddle/pten/kernels/gpu/math_kernel.cu        |    4 +-
 paddle/pten/kernels/gpu/matmul_grad_kernel.cu |    2 +-
 paddle/pten/kernels/gpu/matmul_kernel.cu      |    2 +-
 paddle/pten/kernels/gpu/scale_kernel.cu       |    2 +-
 paddle/pten/kernels/gpu/sign_kernel.cu        |    2 +-
 python/setup.py.in                            |    3 -
 87 files changed, 2108 insertions(+), 2276 deletions(-)
 create mode 100644 paddle/pten/common/bfloat16.h
 create mode 100644 paddle/pten/common/complex.h
 create mode 100644 paddle/pten/common/float16.h

diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 530e5e7f24..b11eac058a 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -228,13 +228,7 @@ copy(inference_lib_dist
         DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/api/)
 copy(inference_lib_dist
         SRCS ${PADDLE_SOURCE_DIR}/paddle/pten/common/*.h
-             ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/bfloat16.h
-             ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex.h
-             ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h
-        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/
-             ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/
-             ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/
-             ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/)
+        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/pten/common/)
 copy(inference_lib_dist
         SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h
         DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index f4b68ec65b..08c2d2e055 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -1,8 +1,3 @@
-# Adapt to custom op mechanism: Include the header files related to the data type
-# to avoid exposing the path of the underlying file, remove it after moving
-# float16.h/complex.h/bfloat16.h into pten
-include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
-
 add_subdirectory(memory)
 add_subdirectory(platform)
 add_subdirectory(distributed)
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
index ec8284b825..791e9a83fa 100644
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -24,17 +24,6 @@ limitations under the License.
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" -namespace paddle { -namespace platform { -struct bfloat16; -template -struct complex; -struct float16; -template -struct complex; -} // namespace platform -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 5599edcd22..8e762b913f 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -18,13 +18,6 @@ #include "gtest/gtest.h" #include "paddle/fluid/framework/tensor.h" -namespace paddle { -namespace platform { -struct bfloat16; -struct float16; -} // namespace platform -} // namespace paddle - TEST(DataType, float16) { using paddle::framework::Tensor; using paddle::platform::CPUPlace; diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index ef705aae15..2ceeed694a 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -14,13 +14,6 @@ #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/data_type.h" -namespace paddle { -namespace platform { -struct bfloat16; -struct float16; -} // namespace platform -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 8639caf4da..829908bd98 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -16,12 +16,6 @@ #include #include -namespace paddle { -namespace platform { -struct float16; -} // namespace platform -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index a58f4a6b5f..27e0968736 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -17,12 +17,6 @@ #include #include -namespace paddle { -namespace platform { -struct float16; -} // namespace platform -} // namespace paddle - namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index da5ee4dd82..684ac5bafd 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -24,9 +24,6 @@ class Variable; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc index 5be1beaa3d..017c2deb18 100644 --- a/paddle/fluid/operators/assign_op_npu.cc +++ b/paddle/fluid/operators/assign_op_npu.cc @@ -26,9 +26,6 @@ class Variable; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/assign_op_xpu.cc b/paddle/fluid/operators/assign_op_xpu.cc index 26c879c3fb..b95be3096f 100644 --- a/paddle/fluid/operators/assign_op_xpu.cc +++ b/paddle/fluid/operators/assign_op_xpu.cc @@ -25,9 +25,6 @@ class Variable; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc 
b/paddle/fluid/operators/collective/c_allreduce_max_op.cc index 4ea1876da2..26e814b228 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc index 17b49eda2f..34054103aa 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc index 4dece4a372..16bb14e757 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc index 96da390d45..296a8b3a63 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc index 75a484ef87..d39aa4f604 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc index 4eca34fb50..4e8b6f9d0a 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc index 48e1d2eeb5..d99ab2809b 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc index bded822296..b8a8cfab3f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc index c49e72eac2..b14a384832 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc index 74acbacf2b..61f76c178d 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc index f3d14afe0a..1c7ba1e951 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc index a1d439cfda..6e2b2df297 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc index 72659282af..d55d5f48ac 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc @@ -21,9 +21,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc index cfd508be27..8fe7fce21e 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc index b66e2e1968..920f69ea9d 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc index bacdf7fb53..d23572e6d6 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cc index 8a5ed7d7bd..3208b6f01a 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc index 9668c68c7d..7e260346b4 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc index f35b4c2f70..50d52e0ad1 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc index 82a10b24da..3ae1af1d08 100644 --- a/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cc index c2ecf2419a..5ab01f42af 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc index 7f5b4cd360..77a75ed0b7 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc index 6ebb7e4c40..b94da957e8 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc index b1136b7966..500ea2abe6 100644 --- a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cc index a689b9db15..14a4da7638 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc index c3de32b9fb..07e431f7bc 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc index f0b7021e79..7515ffad25 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc index f6def80a19..15ef322896 100644 --- a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cc index b7f521b371..7b74386b3b 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc index a4a651be3c..d9826422c1 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc @@ -14,13 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc index dd4dbbd5f3..6f056520df 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace platform { struct ASCENDPlace; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc index ec928bd6a0..f3ec15ca9e 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc @@ -14,13 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_reduce_op.h" -namespace paddle { -namespace platform { - -struct float16; -} // namespace platform -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc index d7f74c44bd..caa67139a9 100644 --- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 216178f7d8..0c2476fde0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -18,13 +18,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" -namespace paddle { -namespace platform { -template -struct complex; -} // namespace platform -} // namespace paddle - namespace paddle { namespace framework { class OpDesc; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index f5290a69bb..98b47407b9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -18,13 +18,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_op.h" -namespace paddle { -namespace platform { -template -struct complex; -} // namespace platform -} // namespace paddle - namespace paddle { namespace framework { class OpDesc; diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index d3391fddd3..316197ac23 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -34,7 +34,6 @@ class OverflowKernel; } // namespace operators namespace platform { class CPUDeviceContext; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 2d23f52c0b..4f12630d1e 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -16,6 +16,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/float16.h" namespace pten { class DenseTensor; @@ -25,8 +27,6 @@ namespace paddle { namespace framework {} // namespace framework namespace platform { class CPUDeviceContext; -struct bfloat16; -struct float16; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/math/prelu.cu b/paddle/fluid/operators/math/prelu.cu index d06490ee57..03a307bfdb 100644 --- a/paddle/fluid/operators/math/prelu.cu +++ b/paddle/fluid/operators/math/prelu.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/prelu.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -110,15 +111,15 @@ void PreluScalarDirectCUDAFunctor::operator()(gpuStream_t stream, } template class PreluChannelWiseDirectCUDAFunctor; -template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; template class PreluChannelWiseDirectCUDAFunctor; template class PreluElementWiseDirectCUDAFunctor; -template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; template class PreluElementWiseDirectCUDAFunctor; template class PreluScalarDirectCUDAFunctor; -template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; template class PreluScalarDirectCUDAFunctor; } // namespace math diff --git a/paddle/fluid/operators/memcpy_d2h_op.cc b/paddle/fluid/operators/memcpy_d2h_op.cc index 1aaa4c2367..9ad30d72eb 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.cc +++ b/paddle/fluid/operators/memcpy_d2h_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc index 4e0f353a7a..c8e1e17d65 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.cc +++ b/paddle/fluid/operators/memcpy_h2d_op.cc @@ -23,9 +23,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/memcpy_op.cc b/paddle/fluid/operators/memcpy_op.cc index d1eeff0b05..52e493ffcb 100644 --- 
a/paddle/fluid/operators/memcpy_op.cc +++ b/paddle/fluid/operators/memcpy_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 01e13cd1dc..4c9be6d0cc 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -30,9 +30,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -struct float16; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h index 100444eee7..1160187f83 100644 --- a/paddle/fluid/platform/bfloat16.h +++ b/paddle/fluid/platform/bfloat16.h @@ -14,396 +14,11 @@ #pragma once -#include - -#include -#include -#include -#include - -#ifdef PADDLE_WITH_CUDA -#include -#endif - -#if defined(__CUDACC__) && CUDA_VERSION >= 11000 -#define PADDLE_CUDA_BF16 -#include -#endif - -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif - -#if (defined(__CUDACC__) || defined(__HIPCC__)) -#define HOSTDEVICE __host__ __device__ -#define DEVICE __device__ -#define HOST __host__ -#else -#define HOSTDEVICE -#define DEVICE -#define HOST -#endif +#include "paddle/pten/common/bfloat16.h" namespace paddle { namespace platform { - -struct PADDLE_ALIGN(2) bfloat16 { - public: - uint16_t x; - - // Constructors - bfloat16() = default; - bfloat16(const bfloat16& o) = default; - bfloat16& operator=(const bfloat16& o) = default; - bfloat16(bfloat16&& o) = default; - bfloat16& operator=(bfloat16&& o) = default; - ~bfloat16() = default; - - HOSTDEVICE inline explicit bfloat16(float val) { -#ifdef PADDLE_WITH_HIP - uint32_t res = 0; - uint32_t* tempRes; - // We should be using memcpy in order to respect the strict aliasing rule - // but it fails in the HIP environment. - tempRes = reinterpret_cast(&val); - res = *tempRes; - x = res >> 16; -#else -#if defined(PADDLE_CUDA_BF16) - __nv_bfloat16 tmp = __float2bfloat16(val); - x = *reinterpret_cast(&tmp); -#else - std::memcpy(&x, reinterpret_cast(&val) + 2, 2); -#endif -#endif - } - -#if defined(PADDLE_CUDA_BF16) - HOSTDEVICE inline explicit bfloat16(const __nv_bfloat16& val) { - x = *reinterpret_cast(&val); - } -#endif - - template - HOSTDEVICE inline explicit bfloat16(const T& val) - : x(bfloat16(static_cast(val)).x) {} - -// Assignment operators -#if defined(PADDLE_CUDA_BF16) - HOSTDEVICE inline bfloat16& operator=(const __nv_bfloat16& val) { - x = *reinterpret_cast(&val); - return *this; - } -#endif - - HOSTDEVICE inline bfloat16& operator=(bool b) { - x = b ? 
0x3f80 : 0; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(int8_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(uint8_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(int16_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(uint16_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(int32_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(uint32_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(int64_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(uint64_t val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(float val) { - x = bfloat16(val).x; - return *this; - } - - HOSTDEVICE inline bfloat16& operator=(double val) { - x = bfloat16(val).x; - return *this; - } - - // Conversion opertors - HOSTDEVICE inline explicit operator float() const { -#ifdef PADDLE_WITH_HIP - uint32_t res = 0; - // We should be using memcpy in order to respect the strict aliasing rule - // but it fails in the HIP environment. - uint16_t temp = x; - uint16_t* temp_ptr = reinterpret_cast(&temp); - res = *temp_ptr; - return res; -#else -#ifdef PADDLE_CUDA_BF16 - return __bfloat162float(*reinterpret_cast(&x)); -#else - float val = 0.f; - uint16_t temp = x; - std::memcpy(reinterpret_cast(&val) + 2, - reinterpret_cast(&temp), 2); - return val; -#endif -#endif - } - -#ifdef PADDLE_CUDA_BF16 - HOSTDEVICE inline explicit operator __nv_bfloat16() const { - return *reinterpret_cast(&x); - } -#endif - - HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } - - HOSTDEVICE inline explicit operator int8_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint8_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int16_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int32_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator double() const { - return static_cast(static_cast(*this)); - } -}; - -HOSTDEVICE inline bfloat16 operator+(const bfloat16& a, const bfloat16& b) { - return bfloat16(static_cast(a) + static_cast(b)); -} - -HOSTDEVICE inline bfloat16 operator-(const bfloat16& a, const bfloat16& b) { - return bfloat16(static_cast(a) - static_cast(b)); -} - -HOSTDEVICE inline bfloat16 operator*(const bfloat16& a, const bfloat16& b) { - return bfloat16(static_cast(a) * static_cast(b)); -} - -HOSTDEVICE inline bfloat16 operator/(const bfloat16& a, const bfloat16& b) { - return bfloat16(static_cast(a) / static_cast(b)); -} - -HOSTDEVICE inline bfloat16 operator-(const bfloat16& a) { - bfloat16 res; - res.x = a.x ^ 0x8000; - return res; -} - -HOSTDEVICE inline bfloat16& operator+=(bfloat16& a, // NOLINT - const bfloat16& b) { - a = bfloat16(static_cast(a) + 
static_cast(b)); - return a; -} - -HOSTDEVICE inline bfloat16& operator-=(bfloat16& a, // NOLINT - const bfloat16& b) { - a = bfloat16(static_cast(a) - static_cast(b)); - return a; -} - -HOSTDEVICE inline bfloat16& operator*=(bfloat16& a, // NOLINT - const bfloat16& b) { - a = bfloat16(static_cast(a) * static_cast(b)); - return a; -} - -HOSTDEVICE inline bfloat16& operator/=(bfloat16& a, // NOLINT - const bfloat16& b) { - a = bfloat16(static_cast(a) / static_cast(b)); - return a; -} - -HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) { - bfloat16 res; - res.x = a; - return res; -} - -// Comparison operators -HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) { - return static_cast(a) == static_cast(b); -} - -HOSTDEVICE inline bool operator!=(const bfloat16& a, const bfloat16& b) { - return static_cast(a) != static_cast(b); -} - -HOSTDEVICE inline bool operator<(const bfloat16& a, const bfloat16& b) { - return static_cast(a) < static_cast(b); -} - -HOSTDEVICE inline bool operator<=(const bfloat16& a, const bfloat16& b) { - return static_cast(a) <= static_cast(b); -} - -HOSTDEVICE inline bool operator>(const bfloat16& a, const bfloat16& b) { - return static_cast(a) > static_cast(b); -} - -HOSTDEVICE inline bool operator>=(const bfloat16& a, const bfloat16& b) { - return static_cast(a) >= static_cast(b); -} - -HOSTDEVICE inline bool(isnan)(const bfloat16& a) { - return (a.x & 0x7FFF) > 0x7F80; -} - -HOSTDEVICE inline bool(isinf)(const bfloat16& a) { - return (a.x & 0x7F80) == 0x7F80; -} - -HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { - return !((isnan)(a)) && !((isinf)(a)); -} - -inline std::ostream& operator<<(std::ostream& os, const bfloat16& a) { - os << a.x; - return os; -} - +using bfloat16 = pten::dtype::bfloat16; +using namespace pten::dtype; // NOLINT } // namespace platform } // namespace paddle - -namespace std { - -template <> -struct is_pod { - static const bool value = - is_trivial::value && - is_standard_layout::value; -}; - -template <> -struct is_floating_point - : std::integral_constant< - bool, std::is_same::type>::value> {}; -template <> -struct is_signed { - static const bool value = true; -}; - -template <> -struct is_unsigned { - static const bool value = false; -}; - -inline bool isnan(const paddle::platform::bfloat16& a) { - return paddle::platform::isnan(a); -} - -inline bool isinf(const paddle::platform::bfloat16& a) { - return paddle::platform::isinf(a); -} - -template <> -struct numeric_limits { - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = true; - static const bool has_quiet_NaN = true; - static const bool has_signaling_NaN = true; - static const float_denorm_style has_denorm = denorm_present; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_to_nearest; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 8; - static const int digits10 = 2; - static const int max_digits10 = 9; - static const int radix = 2; - static const int min_exponent = -125; - static const int min_exponent10 = -37; - static const int max_exponent = 128; - static const int max_exponent10 = 38; - static const bool traps = true; - static const bool tinyness_before = false; - - static paddle::platform::bfloat16(min)() { - return 
paddle::platform::raw_uint16_to_bfloat16(0x007f); - } - static paddle::platform::bfloat16 lowest() { - return paddle::platform::raw_uint16_to_bfloat16(0xff7f); - } - static paddle::platform::bfloat16(max)() { - return paddle::platform::raw_uint16_to_bfloat16(0x7f7f); - } - static paddle::platform::bfloat16 epsilon() { - return paddle::platform::raw_uint16_to_bfloat16(0x3400); - } - static paddle::platform::bfloat16 round_error() { - return paddle::platform::bfloat16(0.5); - } - static paddle::platform::bfloat16 infinity() { - return paddle::platform::raw_uint16_to_bfloat16(0x7f80); - } - static paddle::platform::bfloat16 quiet_NaN() { - return paddle::platform::raw_uint16_to_bfloat16(0xffc1); - } - static paddle::platform::bfloat16 signaling_NaN() { - return paddle::platform::raw_uint16_to_bfloat16(0xff81); - } - static paddle::platform::bfloat16 denorm_min() { - return paddle::platform::raw_uint16_to_bfloat16(0x0001); - } -}; - -} // namespace std diff --git a/paddle/fluid/platform/complex.h b/paddle/fluid/platform/complex.h index e50b741338..de12bc55a5 100644 --- a/paddle/fluid/platform/complex.h +++ b/paddle/fluid/platform/complex.h @@ -14,536 +14,14 @@ #pragma once -#include - -#include -#include -#include -#include -#ifdef PADDLE_WITH_CUDA -#include -#include -#endif // PADDLE_WITH_CUDA - -#ifdef PADDLE_WITH_HIP -#include -#include // NOLINT -#endif - -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif - -#if (defined(__CUDACC__) || defined(__HIPCC__)) -#define HOSTDEVICE __host__ __device__ -#define DEVICE __device__ -#define HOST __host__ -#else -#define HOSTDEVICE -#define DEVICE -#define HOST -#endif - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -// todo -#define PADDLE_WITH_CUDA_OR_HIP_COMPLEX -#endif +#include "paddle/pten/common/complex.h" namespace paddle { namespace platform { - -template -struct PADDLE_ALIGN(sizeof(T) * 2) complex { - public: - T real; - T imag; - - using value_type = T; - - complex() = default; - complex(const complex& o) = default; - complex& operator=(const complex& o) = default; - complex(complex&& o) = default; - complex& operator=(complex&& o) = default; - ~complex() = default; - - HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - - template - HOSTDEVICE inline explicit complex(const thrust::complex& c) { - real = c.real(); - imag = c.imag(); - } - - template - HOSTDEVICE inline explicit operator thrust::complex() const { - return thrust::complex(real, imag); - } - -#ifdef PADDLE_WITH_HIP - HOSTDEVICE inline explicit operator hipFloatComplex() const { - return make_hipFloatComplex(real, imag); - } - - HOSTDEVICE inline explicit operator hipDoubleComplex() const { - return make_hipDoubleComplex(real, imag); - } -#else - HOSTDEVICE inline explicit operator cuFloatComplex() const { - return make_cuFloatComplex(real, imag); - } - - HOSTDEVICE inline explicit operator cuDoubleComplex() const { - return make_cuDoubleComplex(real, imag); - } -#endif -#endif - - template ::value || - std::is_integral::value, - int>::type = 0> - HOSTDEVICE complex(const T1& val) { - real = static_cast(val); - imag = static_cast(0.0); - } - - template - HOSTDEVICE explicit complex( - const std::enable_if_t::value, complex>& - val) { - real = val.real; - imag = val.imag; - } - - template - HOSTDEVICE explicit complex( - const std::enable_if_t::value, complex>& - val) { - real = val.real; - imag = val.imag; - 
} - - template - HOSTDEVICE inline explicit operator std::complex() const { - return static_cast>(std::complex(real, imag)); - } - - template - HOSTDEVICE complex(const std::complex& val) - : real(val.real()), imag(val.imag()) {} - - template ::value || - std::is_integral::value, - int>::type = 0> - HOSTDEVICE inline complex& operator=(const T1& val) { - real = static_cast(val); - imag = static_cast(0.0); - return *this; - } - - HOSTDEVICE inline explicit operator bool() const { - return static_cast(this->real) || static_cast(this->imag); - } - - HOSTDEVICE inline explicit operator int8_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint8_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int16_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int32_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator float() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator double() const { - return static_cast(this->real); - } -}; - -template -HOSTDEVICE inline complex operator+(const complex& a, - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::complex(a) + thrust::complex(b)); -#else - return complex(a.real + b.real, a.imag + b.imag); -#endif -} - -template -HOSTDEVICE inline complex operator-(const complex& a, - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::complex(a) - thrust::complex(b)); -#else - return complex(a.real - b.real, a.imag - b.imag); -#endif -} - -template -HOSTDEVICE inline complex operator*(const complex& a, - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::complex(a) * thrust::complex(b)); -#else - return complex(a.real * b.real - a.imag * b.imag, - a.imag * b.real + b.imag * a.real); -#endif -} - -template -HOSTDEVICE inline complex operator/(const complex& a, - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::complex(a) / thrust::complex(b)); -#else - T denominator = b.real * b.real + b.imag * b.imag; - return complex((a.real * b.real + a.imag * b.imag) / denominator, - (a.imag * b.real - a.real * b.imag) / denominator); -#endif -} - -template -HOSTDEVICE inline complex operator-(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(-thrust::complex(a.real, a.imag)); -#else - complex res; - res.real = -a.real; - res.imag = -a.imag; - return res; -#endif -} - -template -HOSTDEVICE inline complex& operator+=(complex& a, // NOLINT - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex(thrust::complex(a.real, a.imag) += - thrust::complex(b.real, b.imag)); - return a; 
-#else - a.real += b.real; - a.imag += b.imag; - return a; -#endif -} - -template -HOSTDEVICE inline complex& operator-=(complex& a, // NOLINT - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex(thrust::complex(a.real, a.imag) -= - thrust::complex(b.real, b.imag)); - return a; -#else - a.real -= b.real; - a.imag -= b.imag; - return a; -#endif -} - -template -HOSTDEVICE inline complex& operator*=(complex& a, // NOLINT - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex(thrust::complex(a.real, a.imag) *= - thrust::complex(b.real, b.imag)); - return a; -#else - a.real = a.real * b.real - a.imag * b.imag; - a.imag = a.imag * b.real + b.imag * a.real; - return a; -#endif -} - -template -HOSTDEVICE inline complex& operator/=(complex& a, // NOLINT - const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex(thrust::complex(a.real, a.imag) /= - thrust::complex(b.real, b.imag)); - return a; -#else - T denominator = b.real * b.real + b.imag * b.imag; - a.real = (a.real * b.real + a.imag * b.imag) / denominator; - a.imag = (a.imag * b.real - a.real * b.imag) / denominator; - return a; -#endif -} - template -HOSTDEVICE inline complex raw_uint16_to_complex64(uint16_t a) { - complex res; - res.real = a; - res.imag = 0.0; - return res; -} +using complex = pten::dtype::complex; -template -HOSTDEVICE inline bool operator==(const complex& a, const complex& b) { - return a.real == b.real && a.imag == b.imag; -} - -template -HOSTDEVICE inline bool operator!=(const complex& a, const complex& b) { - return a.real != b.real || a.imag != b.imag; -} - -template -HOSTDEVICE inline bool operator<(const complex& a, const complex& b) { - return a.real < b.real; -} - -template -HOSTDEVICE inline bool operator<=(const complex& a, const complex& b) { - return a.real <= b.real; -} - -template -HOSTDEVICE inline bool operator>(const complex& a, const complex& b) { - return a.real > b.real; -} - -template -HOSTDEVICE inline bool operator>=(const complex& a, const complex& b) { - return a.real >= b.real; -} - -template -HOSTDEVICE inline complex(max)(const complex& a, const complex& b) { - return (a.real >= b.real) ? a : b; -} - -template -HOSTDEVICE inline complex(min)(const complex& a, const complex& b) { - return (a.real < b.real) ? 
a : b; -} - -template -HOSTDEVICE inline bool(isnan)(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return ::isnan(a.real) || ::isnan(a.imag); -#else - return std::isnan(a.real) || std::isnan(a.imag); -#endif -} - -template -HOSTDEVICE inline bool isinf(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return ::isinf(a.real) || ::isinf(a.imag); -#else - return std::isinf(a.real) || std::isinf(a.imag); -#endif -} - -template -HOSTDEVICE inline bool isfinite(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return ::isfinite(a.real) || ::isfinite(a.imag); -#else - return std::isfinite(a.real) || std::isfinite(a.imag); -#endif -} - -template -HOSTDEVICE inline T abs(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return thrust::abs(thrust::complex(a)); -#else - return std::abs(std::complex(a)); -#endif -} - -template -HOSTDEVICE inline T arg(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return thrust::arg(thrust::complex(a)); -#else - return std::arg(std::complex(a)); -#endif -} - -template -HOSTDEVICE inline complex pow(const complex& a, const complex& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::pow(thrust::complex(a), thrust::complex(b))); -#else - return complex(std::pow(std::complex(a), std::complex(b))); -#endif -} - -template -HOSTDEVICE inline complex sqrt(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::sqrt(thrust::complex(a))); -#else - return complex(std::sqrt(std::complex(a))); -#endif -} - -template -HOSTDEVICE inline complex tanh(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::tanh(thrust::complex(a))); -#else - return complex(std::tanh(std::complex(a))); -#endif -} - -template -HOSTDEVICE inline complex log(const complex& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex(thrust::log(thrust::complex(a))); -#else - return complex(std::log(std::complex(a))); -#endif -} - -template -inline std::ostream& operator<<(std::ostream& os, const complex& a) { - os << "real:" << a.real << " imag:" << a.imag; - return os; -} +using namespace pten::dtype; // NOLINT } // namespace platform } // namespace paddle - -namespace std { - -template -struct is_pod> { - static const bool value = true; -}; - -template -struct is_floating_point> - : std::integral_constant {}; - -template -struct is_signed> { - static const bool value = false; -}; - -template -struct is_unsigned> { - static const bool value = false; -}; - -template -inline bool isnan(const paddle::platform::complex& a) { - return paddle::platform::isnan(a); -} - -template -inline bool isinf(const paddle::platform::complex& a) { - return paddle::platform::isinf(a); -} - -template -struct numeric_limits> { - static const bool is_specialized = false; - static const bool is_signed = false; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = false; - static const bool 
has_quiet_NaN = false; - static const bool has_signaling_NaN = false; - static const float_denorm_style has_denorm = denorm_absent; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_toward_zero; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 0; - static const int digits10 = 0; - static const int max_digits10 = 0; - static const int radix = 0; - static const int min_exponent = 0; - static const int min_exponent10 = 0; - static const int max_exponent = 0; - static const int max_exponent10 = 0; - static const bool traps = false; - static const bool tinyness_before = false; - - static paddle::platform::complex(min)() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex lowest() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex(max)() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex epsilon() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex round_error() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex infinity() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex quiet_NaN() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex signaling_NaN() { - return paddle::platform::complex(0.0, 0.0); - } - static paddle::platform::complex denorm_min() { - return paddle::platform::complex(0.0, 0.0); - } -}; - -} // namespace std diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h index 2bcdbaa201..f6b11b3267 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h @@ -23,12 +23,6 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/macros.h" -namespace paddle { -namespace platform { -struct float16; -} // namespace platform -} // namespace paddle - DECLARE_bool(cudnn_deterministic); namespace paddle { diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h index bd8d05f812..34b9d57e05 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h @@ -27,12 +27,6 @@ limitations under the License. */ // MIOPEN do not have epslion definition #define CUDNN_BN_MIN_EPSILON 1e-05 -namespace paddle { -namespace platform { -struct float16; -} // namespace platform -} // namespace paddle - DECLARE_bool(cudnn_deterministic); namespace paddle { diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index b6d088421a..3d9fa994b6 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -14,1073 +14,11 @@ limitations under the License. 
*/ #pragma once -#include - -#include -#include -#include - -#ifdef PADDLE_WITH_CUDA -#include -#endif // PADDLE_WITH_CUDA - -#ifdef PADDLE_WITH_HIP -#include -#endif - -#if defined(__CUDACC__) && CUDA_VERSION >= 7050 -#define PADDLE_CUDA_FP16 -#include -#endif - -#ifdef __HIPCC__ -#define PADDLE_CUDA_FP16 -#include -#endif - -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif - -#define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600) - -#if (defined(__CUDACC__) || defined(__HIPCC__)) -#define HOSTDEVICE __host__ __device__ -#define DEVICE __device__ -#define HOST __host__ -#else -#define HOSTDEVICE -#define DEVICE -#define HOST -#endif +#include "paddle/pten/common/float16.h" namespace paddle { namespace platform { - -// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated -// and aligned at least on a 2-byte boundary, which leads to efficient -// memory access of float16 struct and also makes float16 compatible -// with CUDA half, ARM float16_t data types. -struct PADDLE_ALIGN(2) float16 { - public: - uint16_t x; - - // The following defaulted special class member functions - // are added to make float16 pass the std::is_trivial test - float16() = default; - float16(const float16& o) = default; - float16& operator=(const float16& o) = default; - float16(float16&& o) = default; - float16& operator=(float16&& o) = default; - ~float16() = default; - -// Constructors -#ifdef PADDLE_CUDA_FP16 - HOSTDEVICE inline explicit float16(const half& h) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 - x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; -#else - x = h.x; -#endif // CUDA_VERSION >= 9000 -#endif - } -#endif // PADDLE_CUDA_FP16 - -#ifdef PADDLE_WITH_NATIVE_FP16 - // __fp16 is a native half precision data type for arm cpu, - // float16_t is an alias for __fp16 - HOSTDEVICE inline explicit float16(const float16_t& h) { - x = *reinterpret_cast(&h); - } -#endif - - HOSTDEVICE inline explicit float16(float val) { -#if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) - half tmp = __float2half(val); - x = *reinterpret_cast(&tmp); - -#elif defined(PADDLE_WITH_NATIVE_FP16) - float32x4_t tmp = vld1q_dup_f32(&val); - float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0); - x = *reinterpret_cast(&res); - -#elif defined(__F16C__) - x = _cvtss_sh(val, 0); - -#else - // Conversion routine adapted from - // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion - Bits v, s; - v.f = val; - uint32_t sign = v.si & sigN; - v.si ^= sign; - sign >>= shiftSign; // logical shift - s.si = mulN; - s.si = s.f * v.f; // correct subnormals - v.si ^= (s.si ^ v.si) & -(minN > v.si); - v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); - v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); - v.ui >>= shift; // logical shift - v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); - v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); - x = v.ui | sign; - -#endif - } - - HOSTDEVICE inline explicit float16(bool b) : x(b ? 
0x3c00 : 0) {} - - template - HOSTDEVICE inline explicit float16(const T& val) - : x(float16(static_cast(val)).x) {} - -// Assignment operators -#ifdef PADDLE_CUDA_FP16 - HOSTDEVICE inline float16& operator=(const half& rhs) { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 - x = reinterpret_cast<__half_raw*>(const_cast(&rhs))->x; -#else - x = rhs.x; -#endif - return *this; - } -#endif - -#ifdef PADDLE_WITH_NATIVE_FP16 - HOSTDEVICE inline float16& operator=(const float16_t& rhs) { - x = *reinterpret_cast(&rhs); - return *this; - } -#endif - - HOSTDEVICE inline float16& operator=(bool b) { - x = b ? 0x3c00 : 0; - return *this; - } - - HOSTDEVICE inline float16& operator=(int8_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(uint8_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(int16_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(uint16_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(int32_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(uint32_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(int64_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(uint64_t val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(float val) { - x = float16(val).x; - return *this; - } - - HOSTDEVICE inline float16& operator=(double val) { - x = float16(val).x; - return *this; - } - -// Conversion opertors -#ifdef PADDLE_CUDA_FP16 - HOSTDEVICE inline half to_half() const { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 - __half_raw h; - h.x = x; - return half(h); -#else - half h; - h.x = x; - return h; -#endif // CUDA_VERSION >= 9000 - } -#endif // PADDLE_CUDA_FP16 - -#ifdef PADDLE_WITH_NATIVE_FP16 - HOSTDEVICE inline explicit operator float16_t() const { - return *reinterpret_cast(this); - } -#endif - - HOSTDEVICE inline operator float() const { -#if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) - half tmp = *reinterpret_cast(this); - return __half2float(tmp); - -#elif defined(PADDLE_WITH_NATIVE_FP16) - float16x4_t res = vld1_dup_f16(reinterpret_cast(this)); - return vgetq_lane_f32(vcvt_f32_f16(res), 0); - -#elif defined(__F16C__) - return _cvtsh_ss(this->x); - -#else - // Conversion routine adapted from - // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion - Bits v; - v.ui = this->x; - int32_t sign = v.si & sigC; - v.si ^= sign; - sign <<= shiftSign; - v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); - v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); - Bits s; - s.si = mulC; - s.f *= v.si; - int32_t mask = -(norC > v.si); - v.si <<= shift; - v.si ^= (s.si ^ v.si) & mask; - v.si |= sign; - return v.f; - -#endif - } - - HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } - - HOSTDEVICE inline explicit operator int8_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint8_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int16_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int32_t() const { - return 
static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline operator double() const { - return static_cast(static_cast(*this)); - } - - private: - union Bits { - float f; - int32_t si; - uint32_t ui; - }; - - static const int shift = 13; - static const int shiftSign = 16; - - static const int32_t infN = 0x7F800000; - static const int32_t maxN = 0x477FE000; // max flt16 as flt32 - static const int32_t minN = 0x38800000; // min flt16 normal as flt32 - static const int32_t sigN = 0x80000000; // sign bit - - static constexpr int32_t infC = infN >> shift; - static constexpr int32_t nanN = (infC + 1) - << shift; // minimum flt16 nan as float32 - static constexpr int32_t maxC = maxN >> shift; - static constexpr int32_t minC = minN >> shift; - static constexpr int32_t sigC = sigN >> shiftSign; - - static const int32_t mulN = 0x52000000; // (1 << 23) / minN - static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) - static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted - static const int32_t norC = 0x00400; // min flt32 normal downshifted - - static constexpr int32_t maxD = infC - maxC - 1; - static constexpr int32_t minD = minC - subC - 1; -}; - -// Arithmetic operators on GPU -// CUDA 9.0 provides built-in arithmetic operators for half while -// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are -// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in -// CUDA 9.0 regarding the half data type. 
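// Note on the removed block below: each raw-half operator uses the CUDA half
// intrinsics (__hadd, __hsub, __hmul, __hneg, __heq, ...) only when compiling
// device code for compute capability >= 5.3. Otherwise the arithmetic
// operators fall back to converting both operands to float through float16,
// computing in float, and converting back with float16::to_half(), while the
// comparisons fall back to comparing the float values.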
-// ROCM has built-in arithmetic operators as not defined -// __HIP_NO_HALF_OPERATORS__ -#if defined(PADDLE_CUDA_FP16) && !defined(__HIPCC__) && CUDA_VERSION < 9000 -DEVICE inline half operator+(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hadd(a, b); -#else - float res = static_cast(float16(a)) + static_cast(float16(b)); - return float16(res).to_half(); -#endif -} - -DEVICE inline half operator-(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hsub(a, b); -#else - float res = static_cast(float16(a)) - static_cast(float16(b)); - return float16(res).to_half(); -#endif -} - -DEVICE inline half operator*(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hmul(a, b); -#else - float res = static_cast(float16(a)) * static_cast(float16(b)); - return float16(res).to_half(); -#endif -} - -DEVICE inline half operator/(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - float num = __half2float(a); - float denom = __half2float(b); - return __float2half(num / denom); -#else - float res = static_cast(float16(a)) / static_cast(float16(b)); - return float16(res).to_half(); -#endif -} - -DEVICE inline half operator-(const half& a) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hneg(a); -#else - float res = -static_cast(float16(a)); - return float16(res).to_half(); -#endif -} - -#ifndef PADDLE_WITH_HIP // not defined __HIP_NO_HALF_OPERATORS__ -DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT - a = a + b; - return a; -} - -DEVICE inline half& operator-=(half& a, const half& b) { // NOLINT - a = a - b; - return a; -} - -DEVICE inline half& operator*=(half& a, const half& b) { // NOLINT - a = a * b; - return a; -} - -DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT - a = a / b; - return a; -} -#endif - -DEVICE inline bool operator==(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __heq(a, b); -#else - return static_cast(float16(a)) == static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator!=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hne(a, b); -#else - return static_cast(float16(a)) != static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator<(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hlt(a, b); -#else - return static_cast(float16(a)) < static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator<=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hle(a, b); -#else - return static_cast(float16(a)) <= static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator>(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hgt(a, b); -#else - return static_cast(float16(a)) > static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator>=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hge(a, b); -#else - return static_cast(float16(a)) >= static_cast(float16(b)); -#endif -} - -#endif // PADDLE_CUDA_FP16 - -// Arithmetic operators for float16 on GPU -#if defined(PADDLE_CUDA_FP16) -// HIPCC has compile error if call __device__ function __hadd, __hsub, etc. 
-// in __host__ __device__ function -#if defined(__HIPCC__) -DEVICE inline float16 operator+(const float16& a, const float16& b) { - return float16(__hadd(a.to_half(), b.to_half())); -} -HOST inline float16 operator+(const float16& a, const float16& b) { - return float16(static_cast(a) + static_cast(b)); -} -#else -HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hadd(a.to_half(), b.to_half())); -#else - return float16(static_cast(a) + static_cast(b)); -#endif -} -#endif - -#if defined(__HIPCC__) -DEVICE inline float16 operator-(const float16& a, const float16& b) { - return float16(__hsub(a.to_half(), b.to_half())); -} -HOST inline float16 operator-(const float16& a, const float16& b) { - return float16(static_cast(a) - static_cast(b)); -} -#else -HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hsub(a.to_half(), b.to_half())); -#else - return float16(static_cast(a) - static_cast(b)); -#endif -} -#endif - -#if defined(__HIPCC__) -DEVICE inline float16 operator*(const float16& a, const float16& b) { - return float16(__hmul(a.to_half(), b.to_half())); -} -HOST inline float16 operator*(const float16& a, const float16& b) { - return float16(static_cast(a) * static_cast(b)); -} -#else -HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hmul(a.to_half(), b.to_half())); -#else - return float16(static_cast(a) * static_cast(b)); -#endif -} -#endif - -#if defined(__HIPCC__) -DEVICE inline float16 operator/(const float16& a, const float16& b) { - return float16(__hdiv(a.to_half(), b.to_half())); -} -HOST inline float16 operator/(const float16& a, const float16& b) { - return float16(static_cast(a) / static_cast(b)); -} -#else -HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - // TODO(kexinzhao): check which cuda version starts to support __hdiv - float num = __half2float(a.to_half()); - float denom = __half2float(b.to_half()); - return float16(num / denom); -#else - return float16(static_cast(a) / static_cast(b)); -#endif -} -#endif - -#if defined(__HIPCC__) -DEVICE inline float16 operator-(const float16& a) { - return float16(__hneg(a.to_half())); -} -HOST inline float16 operator-(const float16& a) { - float16 res; - res.x = a.x ^ 0x8000; - return res; -} -#else -HOSTDEVICE inline float16 operator-(const float16& a) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hneg(a.to_half())); -#else - float16 res; - res.x = a.x ^ 0x8000; - return res; -#endif -} -#endif - -HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { // NOLINT - a = a + b; - return a; -} - -HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { // NOLINT - a = a - b; - return a; -} - -HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { // NOLINT - a = a * b; - return a; -} - -HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT - a = a / b; - return a; -} - -// HIPCC has compile error if call __device__ function __heq, __hne, etc. 
-// in __host__ __device__ function -#if defined(__HIPCC__) -DEVICE inline bool operator==(const float16& a, const float16& b) { - return __heq(a.to_half(), b.to_half()); -} -HOST inline bool operator==(const float16& a, const float16& b) { - return static_cast(a) == static_cast(b); -} -#else // __HIPCC__ -HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __heq(a.to_half(), b.to_half()); -#else - return static_cast(a) == static_cast(b); -#endif -} -#endif // __HIPCC__ - -#if defined(__HIPCC__) -DEVICE inline bool operator!=(const float16& a, const float16& b) { - return __hne(a.to_half(), b.to_half()); -} -HOST inline bool operator!=(const float16& a, const float16& b) { - return static_cast(a) != static_cast(b); -} -#else // __HIPCC__ -HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hne(a.to_half(), b.to_half()); -#else - return static_cast(a) != static_cast(b); -#endif -} -#endif // __HIPCC__ - -#if defined(__HIPCC__) -DEVICE inline bool operator<(const float16& a, const float16& b) { - return __hlt(a.to_half(), b.to_half()); -} -HOST inline bool operator<(const float16& a, const float16& b) { - return static_cast(a) < static_cast(b); -} -#else // __HIPCC__ -HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hlt(a.to_half(), b.to_half()); -#else - return static_cast(a) < static_cast(b); -#endif -} -#endif // __HIPCC__ - -#if defined(__HIPCC__) -DEVICE inline bool operator<=(const float16& a, const float16& b) { - return __hle(a.to_half(), b.to_half()); -} -HOST inline bool operator<=(const float16& a, const float16& b) { - return static_cast(a) <= static_cast(b); -} -#else // __HIPCC__ -HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hle(a.to_half(), b.to_half()); -#else - return static_cast(a) <= static_cast(b); -#endif -} -#endif // __HIPCC__ - -#if defined(__HIPCC__) -DEVICE inline bool operator>(const float16& a, const float16& b) { - return __hgt(a.to_half(), b.to_half()); -} -HOST inline bool operator>(const float16& a, const float16& b) { - return static_cast(a) > static_cast(b); -} -#else // __HIPCC__ -HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hgt(a.to_half(), b.to_half()); -#else - return static_cast(a) > static_cast(b); -#endif -} -#endif // __HIPCC__ - -#if defined(__HIPCC__) -DEVICE inline bool operator>=(const float16& a, const float16& b) { - return __hge(a.to_half(), b.to_half()); -} -HOST inline bool operator>=(const float16& a, const float16& b) { - return static_cast(a) >= static_cast(b); -} -#else // __HIPCC__ -HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hge(a.to_half(), b.to_half()); -#else - return static_cast(a) >= static_cast(b); -#endif -} -#endif // __HIPCC__ - -// Arithmetic operators for float16 on ARMv8.2-A CPU -#elif defined(PADDLE_WITH_NATIVE_FP16) -inline float16 operator+(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fadd h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] 
"r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 operator-(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fsub h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 operator*(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fmul h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 operator/(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fdiv h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 operator-(const float16& a) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "fneg h0, h0\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0"); - return res; -} - -inline float16& operator+=(float16& a, const float16& b) { // NOLINT - a = a + b; - return a; -} - -inline float16& operator-=(float16& a, const float16& b) { // NOLINT - a = a - b; - return a; -} - -inline float16& operator*=(float16& a, const float16& b) { // NOLINT - a = a * b; - return a; -} - -inline float16& operator/=(float16& a, const float16& b) { // NOLINT - a = a / b; - return a; -} - -inline bool operator==(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fcmeq h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } - -inline bool operator<(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v1.h}[0], [%[a_ptr]]\n" - "ld1 {v0.h}[0], [%[b_ptr]]\n" - "fcmgt h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator<=(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v1.h}[0], [%[a_ptr]]\n" - "ld1 {v0.h}[0], [%[b_ptr]]\n" - "fcmge h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator>(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fcmgt h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator>=(const float16& 
a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fcmge h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -// Arithmetic operators for float16, software emulated on other CPU -#else -inline float16 operator+(const float16& a, const float16& b) { - return float16(static_cast(a) + static_cast(b)); -} - -inline float16 operator-(const float16& a, const float16& b) { - return float16(static_cast(a) - static_cast(b)); -} - -inline float16 operator*(const float16& a, const float16& b) { - return float16(static_cast(a) * static_cast(b)); -} - -inline float16 operator/(const float16& a, const float16& b) { - return float16(static_cast(a) / static_cast(b)); -} - -inline float16 operator-(const float16& a) { - float16 res; - res.x = a.x ^ 0x8000; - return res; -} - -inline float16& operator+=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) + static_cast(b)); - return a; -} - -inline float16& operator-=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) - static_cast(b)); - return a; -} - -inline float16& operator*=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) * static_cast(b)); - return a; -} - -inline float16& operator/=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) / static_cast(b)); - return a; -} - -inline bool operator==(const float16& a, const float16& b) { - return static_cast(a) == static_cast(b); -} - -inline bool operator!=(const float16& a, const float16& b) { - return static_cast(a) != static_cast(b); -} - -inline bool operator<(const float16& a, const float16& b) { - return static_cast(a) < static_cast(b); -} - -inline bool operator<=(const float16& a, const float16& b) { - return static_cast(a) <= static_cast(b); -} - -inline bool operator>(const float16& a, const float16& b) { - return static_cast(a) > static_cast(b); -} - -inline bool operator>=(const float16& a, const float16& b) { - return static_cast(a) >= static_cast(b); -} -#endif - -HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) { - float16 res; - res.x = a; - return res; -} - -// HIPCC has compile error if call __device__ function __hisnan in __host__ -// __device__ function -#if defined(PADDLE_CUDA_FP16) && defined(__HIPCC__) -DEVICE inline bool(isnan)(const float16& a) { return __hisnan(a.to_half()); } -HOST inline bool(isnan)(const float16& a) { return (a.x & 0x7fff) > 0x7c00; } -#else -HOSTDEVICE inline bool(isnan)(const float16& a) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hisnan(a.to_half()); -#else - return (a.x & 0x7fff) > 0x7c00; -#endif -} -#endif - -HOSTDEVICE inline bool(isinf)(const float16& a) { - return (a.x & 0x7fff) == 0x7c00; -} - -HOSTDEVICE inline bool(isfinite)(const float16& a) { - return !((isnan)(a)) && !((isinf)(a)); -} - -HOSTDEVICE inline float16(abs)(const float16& a) { -#if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)) - return float16(::fabs(static_cast(a))); -#else - return float16(std::abs(static_cast(a))); -#endif -} - -inline std::ostream& operator<<(std::ostream& os, const float16& a) { - os << static_cast(a); - return os; -} - +using float16 = pten::dtype::float16; +using namespace pten::dtype; // NOLINT } // namespace platform } // 
namespace paddle - -namespace std { - -// Override the std::is_pod::value for float16 -// The reason is that different compilers implemented std::is_pod based on -// different C++ standards. float16 class is a plain old data in C++11 given -// that it is both trivial and standard_layout. -// However, std::is_pod in nvcc 8.0 host c++ compiler follows C++0x and is -// more restricted in that you cannot provide any customized -// constructor in float16. Hence, we override is_pod here following C++11 -// so that .cu files can be successfully compiled by nvcc. -template <> -struct is_pod { - static const bool value = - is_trivial::value && - is_standard_layout::value; -}; - -template <> -struct is_floating_point - : std::integral_constant< - bool, std::is_same::type>::value> {}; -template <> -struct is_signed { - static const bool value = true; -}; - -template <> -struct is_unsigned { - static const bool value = false; -}; - -inline bool isnan(const paddle::platform::float16& a) { - return paddle::platform::isnan(a); -} - -inline bool isinf(const paddle::platform::float16& a) { - return paddle::platform::isinf(a); -} - -template <> -struct numeric_limits { - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = true; - static const bool has_quiet_NaN = true; - static const bool has_signaling_NaN = true; - static const float_denorm_style has_denorm = denorm_present; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_to_nearest; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 11; - static const int digits10 = 3; - static const int max_digits10 = 5; - static const int radix = 2; - static const int min_exponent = -13; - static const int min_exponent10 = -4; - static const int max_exponent = 16; - static const int max_exponent10 = 4; - static const bool traps = true; - static const bool tinyness_before = false; - - HOSTDEVICE static paddle::platform::float16(min)() { - return paddle::platform::raw_uint16_to_float16(0x400); - } - HOSTDEVICE static paddle::platform::float16 lowest() { - return paddle::platform::raw_uint16_to_float16(0xfbff); - } - HOSTDEVICE static paddle::platform::float16(max)() { - return paddle::platform::raw_uint16_to_float16(0x7bff); - } - HOSTDEVICE static paddle::platform::float16 epsilon() { - return paddle::platform::raw_uint16_to_float16(0x0800); - } - HOSTDEVICE static paddle::platform::float16 round_error() { - return paddle::platform::float16(0.5); - } - HOSTDEVICE static paddle::platform::float16 infinity() { - return paddle::platform::raw_uint16_to_float16(0x7c00); - } - HOSTDEVICE static paddle::platform::float16 quiet_NaN() { - return paddle::platform::raw_uint16_to_float16(0x7e00); - } - HOSTDEVICE static paddle::platform::float16 signaling_NaN() { - return paddle::platform::raw_uint16_to_float16(0x7e00); - } - HOSTDEVICE static paddle::platform::float16 denorm_min() { - return paddle::platform::raw_uint16_to_float16(0x1); - } -}; - -HOSTDEVICE inline paddle::platform::float16 abs( - const paddle::platform::float16& a) { - return paddle::platform::abs(a); -} - -} // namespace std diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt index a9b7c7581b..9b6e5d70cd 100644 --- a/paddle/pten/CMakeLists.txt +++ b/paddle/pten/CMakeLists.txt @@ -1,8 +1,3 @@ -# Adapt to 
custom op mechanism: Include the header files related to the data type -# to avoid exposing the path of the underlying file, remove it after moving -# float16.h/complex.h/bfloat16.h into pten -include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) - # paddle experimental common components add_subdirectory(common) diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc index e5dd1ca5f8..cb70d26f94 100644 --- a/paddle/pten/api/lib/tensor.cc +++ b/paddle/pten/api/lib/tensor.cc @@ -49,11 +49,11 @@ limitations under the License. */ */ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/cuda_stream.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" namespace paddle { namespace experimental { diff --git a/paddle/pten/common/bfloat16.h b/paddle/pten/common/bfloat16.h new file mode 100644 index 0000000000..0d9db3dbe9 --- /dev/null +++ b/paddle/pten/common/bfloat16.h @@ -0,0 +1,409 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include + +#ifdef PADDLE_WITH_CUDA +#include +#endif + +#if defined(__CUDACC__) && CUDA_VERSION >= 11000 +#define PADDLE_CUDA_BF16 +#include +#endif + +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#if (defined(__CUDACC__) || defined(__HIPCC__)) +#define HOSTDEVICE __host__ __device__ +#define DEVICE __device__ +#define HOST __host__ +#else +#define HOSTDEVICE +#define DEVICE +#define HOST +#endif + +namespace pten { +namespace dtype { + +struct PADDLE_ALIGN(2) bfloat16 { + public: + uint16_t x; + + // Constructors + bfloat16() = default; + bfloat16(const bfloat16& o) = default; + bfloat16& operator=(const bfloat16& o) = default; + bfloat16(bfloat16&& o) = default; + bfloat16& operator=(bfloat16&& o) = default; + ~bfloat16() = default; + + HOSTDEVICE inline explicit bfloat16(float val) { +#ifdef PADDLE_WITH_HIP + uint32_t res = 0; + uint32_t* tempRes; + // We should be using memcpy in order to respect the strict aliasing rule + // but it fails in the HIP environment. 
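// Reinterpreting the float's bits and shifting right by 16 keeps the upper
// half of the IEEE-754 representation, i.e. a truncating float -> bfloat16
// conversion; the non-CUDA fallback further below copies the same two
// high-order bytes with std::memcpy on little-endian targets.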
+ tempRes = reinterpret_cast(&val); + res = *tempRes; + x = res >> 16; +#else +#if defined(PADDLE_CUDA_BF16) + __nv_bfloat16 tmp = __float2bfloat16(val); + x = *reinterpret_cast(&tmp); +#else + std::memcpy(&x, reinterpret_cast(&val) + 2, 2); +#endif +#endif + } + +#if defined(PADDLE_CUDA_BF16) + HOSTDEVICE inline explicit bfloat16(const __nv_bfloat16& val) { + x = *reinterpret_cast(&val); + } +#endif + + template + HOSTDEVICE inline explicit bfloat16(const T& val) + : x(bfloat16(static_cast(val)).x) {} + +// Assignment operators +#if defined(PADDLE_CUDA_BF16) + HOSTDEVICE inline bfloat16& operator=(const __nv_bfloat16& val) { + x = *reinterpret_cast(&val); + return *this; + } +#endif + + HOSTDEVICE inline bfloat16& operator=(bool b) { + x = b ? 0x3f80 : 0; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int8_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint8_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int16_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint16_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int32_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint32_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int64_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint64_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(float val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(double val) { + x = bfloat16(val).x; + return *this; + } + + // Conversion opertors + HOSTDEVICE inline explicit operator float() const { +#ifdef PADDLE_WITH_HIP + uint32_t res = 0; + // We should be using memcpy in order to respect the strict aliasing rule + // but it fails in the HIP environment. 
+ uint16_t temp = x; + uint16_t* temp_ptr = reinterpret_cast(&temp); + res = *temp_ptr; + return res; +#else +#ifdef PADDLE_CUDA_BF16 + return __bfloat162float(*reinterpret_cast(&x)); +#else + float val = 0.f; + uint16_t temp = x; + std::memcpy( + reinterpret_cast(&val) + 2, reinterpret_cast(&temp), 2); + return val; +#endif +#endif + } + +#ifdef PADDLE_CUDA_BF16 + HOSTDEVICE inline explicit operator __nv_bfloat16() const { + return *reinterpret_cast(&x); + } +#endif + + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(static_cast(*this)); + } +}; + +HOSTDEVICE inline bfloat16 operator+(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) + static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator-(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) - static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator*(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) * static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator/(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) / static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator-(const bfloat16& a) { + bfloat16 res; + res.x = a.x ^ 0x8000; + return res; +} + +HOSTDEVICE inline bfloat16& operator+=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) + static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator-=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) - static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator*=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) * static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator/=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) / static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) { + bfloat16 res; + res.x = a; + return res; +} + +// Comparison operators +HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) { + return static_cast(a) == static_cast(b); +} + +HOSTDEVICE inline bool operator!=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) != static_cast(b); +} + +HOSTDEVICE inline bool operator<(const bfloat16& a, const bfloat16& b) { + return static_cast(a) < static_cast(b); +} + +HOSTDEVICE inline bool operator<=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) <= static_cast(b); +} + +HOSTDEVICE inline bool operator>(const bfloat16& a, const bfloat16& b) { + return static_cast(a) > static_cast(b); +} + +HOSTDEVICE inline bool operator>=(const 
bfloat16& a, const bfloat16& b) { + return static_cast(a) >= static_cast(b); +} + +HOSTDEVICE inline bool(isnan)(const bfloat16& a) { + return (a.x & 0x7FFF) > 0x7F80; +} + +HOSTDEVICE inline bool(isinf)(const bfloat16& a) { + return (a.x & 0x7F80) == 0x7F80; +} + +HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { + return !((isnan)(a)) && !((isinf)(a)); +} + +inline std::ostream& operator<<(std::ostream& os, const bfloat16& a) { + os << a.x; + return os; +} + +} // namespace dtype +} // namespace pten + +namespace std { + +template <> +struct is_pod { + static const bool value = is_trivial::value && + is_standard_layout::value; +}; + +template <> +struct is_floating_point + : std::integral_constant< + bool, + std::is_same< + pten::dtype::bfloat16, + typename std::remove_cv::type>::value> {}; +template <> +struct is_signed { + static const bool value = true; +}; + +template <> +struct is_unsigned { + static const bool value = false; +}; + +inline bool isnan(const pten::dtype::bfloat16& a) { + return pten::dtype::isnan(a); +} + +inline bool isinf(const pten::dtype::bfloat16& a) { + return pten::dtype::isinf(a); +} + +template <> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 8; + static const int digits10 = 2; + static const int max_digits10 = 9; + static const int radix = 2; + static const int min_exponent = -125; + static const int min_exponent10 = -37; + static const int max_exponent = 128; + static const int max_exponent10 = 38; + static const bool traps = true; + static const bool tinyness_before = false; + + static pten::dtype::bfloat16(min)() { + return pten::dtype::raw_uint16_to_bfloat16(0x007f); + } + static pten::dtype::bfloat16 lowest() { + return pten::dtype::raw_uint16_to_bfloat16(0xff7f); + } + static pten::dtype::bfloat16(max)() { + return pten::dtype::raw_uint16_to_bfloat16(0x7f7f); + } + static pten::dtype::bfloat16 epsilon() { + return pten::dtype::raw_uint16_to_bfloat16(0x3400); + } + static pten::dtype::bfloat16 round_error() { + return pten::dtype::bfloat16(0.5); + } + static pten::dtype::bfloat16 infinity() { + return pten::dtype::raw_uint16_to_bfloat16(0x7f80); + } + static pten::dtype::bfloat16 quiet_NaN() { + return pten::dtype::raw_uint16_to_bfloat16(0xffc1); + } + static pten::dtype::bfloat16 signaling_NaN() { + return pten::dtype::raw_uint16_to_bfloat16(0xff81); + } + static pten::dtype::bfloat16 denorm_min() { + return pten::dtype::raw_uint16_to_bfloat16(0x0001); + } +}; + +} // namespace std diff --git a/paddle/pten/common/complex.h b/paddle/pten/common/complex.h new file mode 100644 index 0000000000..cd6b081ede --- /dev/null +++ b/paddle/pten/common/complex.h @@ -0,0 +1,548 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_HIP +#include +#include // NOLINT +#endif + +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#if (defined(__CUDACC__) || defined(__HIPCC__)) +#define HOSTDEVICE __host__ __device__ +#define DEVICE __device__ +#define HOST __host__ +#else +#define HOSTDEVICE +#define DEVICE +#define HOST +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +// todo +#define PADDLE_WITH_CUDA_OR_HIP_COMPLEX +#endif + +namespace pten { +namespace dtype { + +template +struct PADDLE_ALIGN(sizeof(T) * 2) complex { + public: + T real; + T imag; + + using value_type = T; + + complex() = default; + complex(const complex& o) = default; + complex& operator=(const complex& o) = default; + complex(complex&& o) = default; + complex& operator=(complex&& o) = default; + ~complex() = default; + + HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + + template + HOSTDEVICE inline explicit complex(const thrust::complex& c) { + real = c.real(); + imag = c.imag(); + } + + template + HOSTDEVICE inline explicit operator thrust::complex() const { + return thrust::complex(real, imag); + } + +#ifdef PADDLE_WITH_HIP + HOSTDEVICE inline explicit operator hipFloatComplex() const { + return make_hipFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator hipDoubleComplex() const { + return make_hipDoubleComplex(real, imag); + } +#else + HOSTDEVICE inline explicit operator cuFloatComplex() const { + return make_cuFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator cuDoubleComplex() const { + return make_cuDoubleComplex(real, imag); + } +#endif +#endif + + template ::value || + std::is_integral::value, + int>::type = 0> + HOSTDEVICE complex(const T1& val) { + real = static_cast(val); + imag = static_cast(0.0); + } + + template + HOSTDEVICE explicit complex( + const std::enable_if_t::value, complex>& + val) { + real = val.real; + imag = val.imag; + } + + template + HOSTDEVICE explicit complex( + const std::enable_if_t::value, complex>& + val) { + real = val.real; + imag = val.imag; + } + + template + HOSTDEVICE inline explicit operator std::complex() const { + return static_cast>(std::complex(real, imag)); + } + + template + HOSTDEVICE complex(const std::complex& val) + : real(val.real()), imag(val.imag()) {} + + template ::value || + std::is_integral::value, + int>::type = 0> + HOSTDEVICE inline complex& operator=(const T1& val) { + real = static_cast(val); + imag = static_cast(0.0); + return *this; + } + + HOSTDEVICE inline explicit operator bool() const { + return static_cast(this->real) || static_cast(this->imag); + } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator 
int16_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator float() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(this->real); + } +}; + +template +HOSTDEVICE inline complex operator+(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) + thrust::complex(b)); +#else + return complex(a.real + b.real, a.imag + b.imag); +#endif +} + +template +HOSTDEVICE inline complex operator-(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) - thrust::complex(b)); +#else + return complex(a.real - b.real, a.imag - b.imag); +#endif +} + +template +HOSTDEVICE inline complex operator*(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) * thrust::complex(b)); +#else + return complex(a.real * b.real - a.imag * b.imag, + a.imag * b.real + b.imag * a.real); +#endif +} + +template +HOSTDEVICE inline complex operator/(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) / thrust::complex(b)); +#else + T denominator = b.real * b.real + b.imag * b.imag; + return complex((a.real * b.real + a.imag * b.imag) / denominator, + (a.imag * b.real - a.real * b.imag) / denominator); +#endif +} + +template +HOSTDEVICE inline complex operator-(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(-thrust::complex(a.real, a.imag)); +#else + complex res; + res.real = -a.real; + res.imag = -a.imag; + return res; +#endif +} + +template +HOSTDEVICE inline complex& operator+=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) += + thrust::complex(b.real, b.imag)); + return a; +#else + a.real += b.real; + a.imag += b.imag; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator-=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) -= + thrust::complex(b.real, b.imag)); + return a; +#else + a.real -= b.real; + a.imag -= b.imag; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator*=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) *= + thrust::complex(b.real, b.imag)); + return a; +#else + a.real = a.real * b.real - a.imag * b.imag; + a.imag 
= a.imag * b.real + b.imag * a.real; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator/=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) /= + thrust::complex(b.real, b.imag)); + return a; +#else + T denominator = b.real * b.real + b.imag * b.imag; + a.real = (a.real * b.real + a.imag * b.imag) / denominator; + a.imag = (a.imag * b.real - a.real * b.imag) / denominator; + return a; +#endif +} + +template +HOSTDEVICE inline complex raw_uint16_to_complex64(uint16_t a) { + complex res; + res.real = a; + res.imag = 0.0; + return res; +} + +template +HOSTDEVICE inline bool operator==(const complex& a, const complex& b) { + return a.real == b.real && a.imag == b.imag; +} + +template +HOSTDEVICE inline bool operator!=(const complex& a, const complex& b) { + return a.real != b.real || a.imag != b.imag; +} + +template +HOSTDEVICE inline bool operator<(const complex& a, const complex& b) { + return a.real < b.real; +} + +template +HOSTDEVICE inline bool operator<=(const complex& a, const complex& b) { + return a.real <= b.real; +} + +template +HOSTDEVICE inline bool operator>(const complex& a, const complex& b) { + return a.real > b.real; +} + +template +HOSTDEVICE inline bool operator>=(const complex& a, const complex& b) { + return a.real >= b.real; +} + +template +HOSTDEVICE inline complex(max)(const complex& a, const complex& b) { + return (a.real >= b.real) ? a : b; +} + +template +HOSTDEVICE inline complex(min)(const complex& a, const complex& b) { + return (a.real < b.real) ? a : b; +} + +template +HOSTDEVICE inline bool(isnan)(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isnan(a.real) || ::isnan(a.imag); +#else + return std::isnan(a.real) || std::isnan(a.imag); +#endif +} + +template +HOSTDEVICE inline bool isinf(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isinf(a.real) || ::isinf(a.imag); +#else + return std::isinf(a.real) || std::isinf(a.imag); +#endif +} + +template +HOSTDEVICE inline bool isfinite(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isfinite(a.real) || ::isfinite(a.imag); +#else + return std::isfinite(a.real) || std::isfinite(a.imag); +#endif +} + +template +HOSTDEVICE inline T abs(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return thrust::abs(thrust::complex(a)); +#else + return std::abs(std::complex(a)); +#endif +} + +template +HOSTDEVICE inline T arg(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return thrust::arg(thrust::complex(a)); +#else + return std::arg(std::complex(a)); +#endif +} + +template +HOSTDEVICE inline complex pow(const complex& a, const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::pow(thrust::complex(a), thrust::complex(b))); +#else + return complex(std::pow(std::complex(a), std::complex(b))); +#endif +} + +template +HOSTDEVICE inline complex sqrt(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return 
complex(thrust::sqrt(thrust::complex(a))); +#else + return complex(std::sqrt(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex tanh(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::tanh(thrust::complex(a))); +#else + return complex(std::tanh(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex log(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::log(thrust::complex(a))); +#else + return complex(std::log(std::complex(a))); +#endif +} + +template +inline std::ostream& operator<<(std::ostream& os, const complex& a) { + os << "real:" << a.real << " imag:" << a.imag; + return os; +} +} // namespace dtype +} // namespace pten + +namespace std { + +template +struct is_pod> { + static const bool value = true; +}; + +template +struct is_floating_point> + : std::integral_constant {}; + +template +struct is_signed> { + static const bool value = false; +}; + +template +struct is_unsigned> { + static const bool value = false; +}; + +template +inline bool isnan(const pten::dtype::complex& a) { + return pten::dtype::isnan(a); +} + +template +inline bool isinf(const pten::dtype::complex& a) { + return pten::dtype::isinf(a); +} + +template +struct numeric_limits> { + static const bool is_specialized = false; + static const bool is_signed = false; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = false; + static const bool has_quiet_NaN = false; + static const bool has_signaling_NaN = false; + static const float_denorm_style has_denorm = denorm_absent; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_toward_zero; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 0; + static const int digits10 = 0; + static const int max_digits10 = 0; + static const int radix = 0; + static const int min_exponent = 0; + static const int min_exponent10 = 0; + static const int max_exponent = 0; + static const int max_exponent10 = 0; + static const bool traps = false; + static const bool tinyness_before = false; + + static pten::dtype::complex(min)() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex lowest() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex(max)() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex epsilon() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex round_error() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex infinity() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex quiet_NaN() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex signaling_NaN() { + return pten::dtype::complex(0.0, 0.0); + } + static pten::dtype::complex denorm_min() { + return pten::dtype::complex(0.0, 0.0); + } +}; + +} // namespace std diff --git a/paddle/pten/common/data_type.h b/paddle/pten/common/data_type.h index 306507d2d2..3e31d8ba69 100644 --- a/paddle/pten/common/data_type.h +++ b/paddle/pten/common/data_type.h @@ -14,19 +14,19 @@ limitations under the License. 
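Sketch (illustrative, not part of the patch): the data_type.h hunk below re-points the public aliases at the pten::dtype types, so the two spellings name the same type, which can be checked at compile time:

    #include <type_traits>
    #include "paddle/pten/common/data_type.h"

    // Holds after this patch: the experimental alias and the pten type are
    // one and the same.
    static_assert(std::is_same<paddle::experimental::float16,
                               pten::dtype::float16>::value,
                  "paddle::experimental::float16 aliases pten::dtype::float16");
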
*/ #pragma once -#include "bfloat16.h" // NOLINT -#include "complex.h" // NOLINT -#include "float16.h" // NOLINT +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" #include "paddle/pten/api/ext/exception.h" namespace paddle { namespace experimental { -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; -using float16 = ::paddle::platform::float16; -using bfloat16 = ::paddle::platform::bfloat16; +using complex64 = ::pten::dtype::complex; +using complex128 = ::pten::dtype::complex; +using float16 = ::pten::dtype::float16; +using bfloat16 = ::pten::dtype::bfloat16; enum class DataType { UNDEFINED = 0, diff --git a/paddle/pten/common/float16.h b/paddle/pten/common/float16.h new file mode 100644 index 0000000000..b9ee731f92 --- /dev/null +++ b/paddle/pten/common/float16.h @@ -0,0 +1,1094 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include + +#ifdef PADDLE_WITH_CUDA +#include +#endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_HIP +#include +#endif + +#if defined(__CUDACC__) && CUDA_VERSION >= 7050 +#define PADDLE_CUDA_FP16 +#include +#endif + +#ifdef __HIPCC__ +#define PADDLE_CUDA_FP16 +#include +#endif + +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600) + +#if (defined(__CUDACC__) || defined(__HIPCC__)) +#define HOSTDEVICE __host__ __device__ +#define DEVICE __device__ +#define HOST __host__ +#else +#define HOSTDEVICE +#define DEVICE +#define HOST +#endif + +namespace pten { +namespace dtype { + +// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated +// and aligned at least on a 2-byte boundary, which leads to efficient +// memory access of float16 struct and also makes float16 compatible +// with CUDA half, ARM float16_t data types. 
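// Descriptive note (not in the original header): given PADDLE_ALIGN(2) and the
// defaulted special members of the struct defined below, the following checks
// are expected to hold, which is what makes float16 layout-compatible with
// CUDA half and ARM float16_t storage:
//   static_assert(sizeof(float16) == 2, "float16 must occupy two bytes");
//   static_assert(alignof(float16) == 2, "float16 must be 2-byte aligned");
//   static_assert(std::is_trivial<float16>::value, "float16 stays trivial");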
+struct PADDLE_ALIGN(2) float16 { + public: + uint16_t x; + + // The following defaulted special class member functions + // are added to make float16 pass the std::is_trivial test + float16() = default; + float16(const float16& o) = default; + float16& operator=(const float16& o) = default; + float16(float16&& o) = default; + float16& operator=(float16&& o) = default; + ~float16() = default; + +// Constructors +#ifdef PADDLE_CUDA_FP16 + HOSTDEVICE inline explicit float16(const half& h) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 + x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; +#else + x = h.x; +#endif // CUDA_VERSION >= 9000 +#endif + } +#endif // PADDLE_CUDA_FP16 + +#ifdef PADDLE_WITH_NATIVE_FP16 + // __fp16 is a native half precision data type for arm cpu, + // float16_t is an alias for __fp16 + HOSTDEVICE inline explicit float16(const float16_t& h) { + x = *reinterpret_cast(&h); + } +#endif + + HOSTDEVICE inline explicit float16(float val) { +#if defined(PADDLE_CUDA_FP16) && \ + (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) + half tmp = __float2half(val); + x = *reinterpret_cast(&tmp); + +#elif defined(PADDLE_WITH_NATIVE_FP16) + float32x4_t tmp = vld1q_dup_f32(&val); + float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0); + x = *reinterpret_cast(&res); + +#elif defined(__F16C__) + x = _cvtss_sh(val, 0); + +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v, s; + v.f = val; + uint32_t sign = v.si & sigN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + x = v.ui | sign; + +#endif + } + + HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {} + + template + HOSTDEVICE inline explicit float16(const T& val) + : x(float16(static_cast(val)).x) {} + +// Assignment operators +#ifdef PADDLE_CUDA_FP16 + HOSTDEVICE inline float16& operator=(const half& rhs) { +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 + x = reinterpret_cast<__half_raw*>(const_cast(&rhs))->x; +#else + x = rhs.x; +#endif + return *this; + } +#endif + +#ifdef PADDLE_WITH_NATIVE_FP16 + HOSTDEVICE inline float16& operator=(const float16_t& rhs) { + x = *reinterpret_cast(&rhs); + return *this; + } +#endif + + HOSTDEVICE inline float16& operator=(bool b) { + x = b ? 
0x3c00 : 0; + return *this; + } + + HOSTDEVICE inline float16& operator=(int8_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint8_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(int16_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint16_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(int32_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint32_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(int64_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint64_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(float val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(double val) { + x = float16(val).x; + return *this; + } + +// Conversion opertors +#ifdef PADDLE_CUDA_FP16 + HOSTDEVICE inline half to_half() const { +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 + __half_raw h; + h.x = x; + return half(h); +#else + half h; + h.x = x; + return h; +#endif // CUDA_VERSION >= 9000 + } +#endif // PADDLE_CUDA_FP16 + +#ifdef PADDLE_WITH_NATIVE_FP16 + HOSTDEVICE inline explicit operator float16_t() const { + return *reinterpret_cast(this); + } +#endif + + HOSTDEVICE inline operator float() const { +#if defined(PADDLE_CUDA_FP16) && \ + (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) + half tmp = *reinterpret_cast(this); + return __half2float(tmp); + +#elif defined(PADDLE_WITH_NATIVE_FP16) + float16x4_t res = vld1_dup_f16(reinterpret_cast(this)); + return vgetq_lane_f32(vcvt_f32_f16(res), 0); + +#elif defined(__F16C__) + return _cvtsh_ss(this->x); + +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v; + v.ui = this->x; + int32_t sign = v.si & sigC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + +#endif + } + + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline operator double() const { + return static_cast(static_cast(*this)); + } + + private: + union Bits { + float f; + int32_t si; + uint32_t ui; + }; + + static const int shift = 13; + static const int shiftSign = 16; + + static const int32_t 
infN = 0x7F800000; + static const int32_t maxN = 0x477FE000; // max flt16 as flt32 + static const int32_t minN = 0x38800000; // min flt16 normal as flt32 + static const int32_t sigN = 0x80000000; // sign bit + + static constexpr int32_t infC = infN >> shift; + static constexpr int32_t nanN = (infC + 1) + << shift; // minimum flt16 nan as float32 + static constexpr int32_t maxC = maxN >> shift; + static constexpr int32_t minC = minN >> shift; + static constexpr int32_t sigC = sigN >> shiftSign; + + static const int32_t mulN = 0x52000000; // (1 << 23) / minN + static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) + static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted + static const int32_t norC = 0x00400; // min flt32 normal downshifted + + static constexpr int32_t maxD = infC - maxC - 1; + static constexpr int32_t minD = minC - subC - 1; +}; + +// Arithmetic operators on GPU +// CUDA 9.0 provides built-in arithmetic operators for half while +// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are +// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in +// CUDA 9.0 regarding the half data type. +// ROCM has built-in arithmetic operators as not defined +// __HIP_NO_HALF_OPERATORS__ +#if defined(PADDLE_CUDA_FP16) && !defined(__HIPCC__) && CUDA_VERSION < 9000 +DEVICE inline half operator+(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hadd(a, b); +#else + float res = static_cast(float16(a)) + static_cast(float16(b)); + return float16(res).to_half(); +#endif +} + +DEVICE inline half operator-(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hsub(a, b); +#else + float res = static_cast(float16(a)) - static_cast(float16(b)); + return float16(res).to_half(); +#endif +} + +DEVICE inline half operator*(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hmul(a, b); +#else + float res = static_cast(float16(a)) * static_cast(float16(b)); + return float16(res).to_half(); +#endif +} + +DEVICE inline half operator/(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +#else + float res = static_cast(float16(a)) / static_cast(float16(b)); + return float16(res).to_half(); +#endif +} + +DEVICE inline half operator-(const half& a) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hneg(a); +#else + float res = -static_cast(float16(a)); + return float16(res).to_half(); +#endif +} + +#ifndef PADDLE_WITH_HIP // not defined __HIP_NO_HALF_OPERATORS__ +DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT + a = a + b; + return a; +} + +DEVICE inline half& operator-=(half& a, const half& b) { // NOLINT + a = a - b; + return a; +} + +DEVICE inline half& operator*=(half& a, const half& b) { // NOLINT + a = a * b; + return a; +} + +DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT + a = a / b; + return a; +} +#endif + +DEVICE inline bool operator==(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __heq(a, b); +#else + return static_cast(float16(a)) == static_cast(float16(b)); +#endif +} + +DEVICE inline bool operator!=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hne(a, b); +#else + return static_cast(float16(a)) != static_cast(float16(b)); +#endif +} + +DEVICE 
inline bool operator<(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(a, b); +#else + return static_cast(float16(a)) < static_cast(float16(b)); +#endif +} + +DEVICE inline bool operator<=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hle(a, b); +#else + return static_cast(float16(a)) <= static_cast(float16(b)); +#endif +} + +DEVICE inline bool operator>(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hgt(a, b); +#else + return static_cast(float16(a)) > static_cast(float16(b)); +#endif +} + +DEVICE inline bool operator>=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hge(a, b); +#else + return static_cast(float16(a)) >= static_cast(float16(b)); +#endif +} + +#endif // PADDLE_CUDA_FP16 + +// Arithmetic operators for float16 on GPU +#if defined(PADDLE_CUDA_FP16) +// HIPCC has compile error if call __device__ function __hadd, __hsub, etc. +// in __host__ __device__ function +#if defined(__HIPCC__) +DEVICE inline float16 operator+(const float16& a, const float16& b) { + return float16(__hadd(a.to_half(), b.to_half())); +} +HOST inline float16 operator+(const float16& a, const float16& b) { + return float16(static_cast(a) + static_cast(b)); +} +#else +HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return float16(__hadd(a.to_half(), b.to_half())); +#else + return float16(static_cast(a) + static_cast(b)); +#endif +} +#endif + +#if defined(__HIPCC__) +DEVICE inline float16 operator-(const float16& a, const float16& b) { + return float16(__hsub(a.to_half(), b.to_half())); +} +HOST inline float16 operator-(const float16& a, const float16& b) { + return float16(static_cast(a) - static_cast(b)); +} +#else +HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return float16(__hsub(a.to_half(), b.to_half())); +#else + return float16(static_cast(a) - static_cast(b)); +#endif +} +#endif + +#if defined(__HIPCC__) +DEVICE inline float16 operator*(const float16& a, const float16& b) { + return float16(__hmul(a.to_half(), b.to_half())); +} +HOST inline float16 operator*(const float16& a, const float16& b) { + return float16(static_cast(a) * static_cast(b)); +} +#else +HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return float16(__hmul(a.to_half(), b.to_half())); +#else + return float16(static_cast(a) * static_cast(b)); +#endif +} +#endif + +#if defined(__HIPCC__) +DEVICE inline float16 operator/(const float16& a, const float16& b) { + return float16(__hdiv(a.to_half(), b.to_half())); +} +HOST inline float16 operator/(const float16& a, const float16& b) { + return float16(static_cast(a) / static_cast(b)); +} +#else +HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + // TODO(kexinzhao): check which cuda version starts to support __hdiv + float num = __half2float(a.to_half()); + float denom = __half2float(b.to_half()); + return float16(num / denom); +#else + return float16(static_cast(a) / static_cast(b)); +#endif +} +#endif + +#if defined(__HIPCC__) +DEVICE inline float16 operator-(const float16& a) { + return float16(__hneg(a.to_half())); +} +HOST inline float16 operator-(const float16& a) { + float16 res; + 
res.x = a.x ^ 0x8000; + return res; +} +#else +HOSTDEVICE inline float16 operator-(const float16& a) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return float16(__hneg(a.to_half())); +#else + float16 res; + res.x = a.x ^ 0x8000; + return res; +#endif +} +#endif + +HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { // NOLINT + a = a + b; + return a; +} + +HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { // NOLINT + a = a - b; + return a; +} + +HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { // NOLINT + a = a * b; + return a; +} + +HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT + a = a / b; + return a; +} + +// HIPCC has compile error if call __device__ function __heq, __hne, etc. +// in __host__ __device__ function +#if defined(__HIPCC__) +DEVICE inline bool operator==(const float16& a, const float16& b) { + return __heq(a.to_half(), b.to_half()); +} +HOST inline bool operator==(const float16& a, const float16& b) { + return static_cast(a) == static_cast(b); +} +#else // __HIPCC__ +HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __heq(a.to_half(), b.to_half()); +#else + return static_cast(a) == static_cast(b); +#endif +} +#endif // __HIPCC__ + +#if defined(__HIPCC__) +DEVICE inline bool operator!=(const float16& a, const float16& b) { + return __hne(a.to_half(), b.to_half()); +} +HOST inline bool operator!=(const float16& a, const float16& b) { + return static_cast(a) != static_cast(b); +} +#else // __HIPCC__ +HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hne(a.to_half(), b.to_half()); +#else + return static_cast(a) != static_cast(b); +#endif +} +#endif // __HIPCC__ + +#if defined(__HIPCC__) +DEVICE inline bool operator<(const float16& a, const float16& b) { + return __hlt(a.to_half(), b.to_half()); +} +HOST inline bool operator<(const float16& a, const float16& b) { + return static_cast(a) < static_cast(b); +} +#else // __HIPCC__ +HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(a.to_half(), b.to_half()); +#else + return static_cast(a) < static_cast(b); +#endif +} +#endif // __HIPCC__ + +#if defined(__HIPCC__) +DEVICE inline bool operator<=(const float16& a, const float16& b) { + return __hle(a.to_half(), b.to_half()); +} +HOST inline bool operator<=(const float16& a, const float16& b) { + return static_cast(a) <= static_cast(b); +} +#else // __HIPCC__ +HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hle(a.to_half(), b.to_half()); +#else + return static_cast(a) <= static_cast(b); +#endif +} +#endif // __HIPCC__ + +#if defined(__HIPCC__) +DEVICE inline bool operator>(const float16& a, const float16& b) { + return __hgt(a.to_half(), b.to_half()); +} +HOST inline bool operator>(const float16& a, const float16& b) { + return static_cast(a) > static_cast(b); +} +#else // __HIPCC__ +HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hgt(a.to_half(), b.to_half()); +#else + return static_cast(a) > static_cast(b); +#endif +} +#endif // __HIPCC__ + +#if defined(__HIPCC__) +DEVICE inline bool operator>=(const float16& a, const float16& b) { + return __hge(a.to_half(), 
b.to_half()); +} +HOST inline bool operator>=(const float16& a, const float16& b) { + return static_cast(a) >= static_cast(b); +} +#else // __HIPCC__ +HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hge(a.to_half(), b.to_half()); +#else + return static_cast(a) >= static_cast(b); +#endif +} +#endif // __HIPCC__ + +// Arithmetic operators for float16 on ARMv8.2-A CPU +#elif defined(PADDLE_WITH_NATIVE_FP16) +inline float16 operator+(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fadd h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +inline float16 operator-(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fsub h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +inline float16 operator*(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fmul h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +inline float16 operator/(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fdiv h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +inline float16 operator-(const float16& a) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "fneg h0, h0\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0"); + return res; +} + +inline float16& operator+=(float16& a, const float16& b) { // NOLINT + a = a + b; + return a; +} + +inline float16& operator-=(float16& a, const float16& b) { // NOLINT + a = a - b; + return a; +} + +inline float16& operator*=(float16& a, const float16& b) { // NOLINT + a = a * b; + return a; +} + +inline float16& operator/=(float16& a, const float16& b) { // NOLINT + a = a / b; + return a; +} + +inline bool operator==(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmeq h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } + +inline bool operator<(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + 
+inline bool operator<=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +inline bool operator>(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +inline bool operator>=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +// Arithmetic operators for float16, software emulated on other CPU +#else +inline float16 operator+(const float16& a, const float16& b) { + return float16(static_cast(a) + static_cast(b)); +} + +inline float16 operator-(const float16& a, const float16& b) { + return float16(static_cast(a) - static_cast(b)); +} + +inline float16 operator*(const float16& a, const float16& b) { + return float16(static_cast(a) * static_cast(b)); +} + +inline float16 operator/(const float16& a, const float16& b) { + return float16(static_cast(a) / static_cast(b)); +} + +inline float16 operator-(const float16& a) { + float16 res; + res.x = a.x ^ 0x8000; + return res; +} + +inline float16& operator+=(float16& a, const float16& b) { // NOLINT + a = float16(static_cast(a) + static_cast(b)); + return a; +} + +inline float16& operator-=(float16& a, const float16& b) { // NOLINT + a = float16(static_cast(a) - static_cast(b)); + return a; +} + +inline float16& operator*=(float16& a, const float16& b) { // NOLINT + a = float16(static_cast(a) * static_cast(b)); + return a; +} + +inline float16& operator/=(float16& a, const float16& b) { // NOLINT + a = float16(static_cast(a) / static_cast(b)); + return a; +} + +inline bool operator==(const float16& a, const float16& b) { + return static_cast(a) == static_cast(b); +} + +inline bool operator!=(const float16& a, const float16& b) { + return static_cast(a) != static_cast(b); +} + +inline bool operator<(const float16& a, const float16& b) { + return static_cast(a) < static_cast(b); +} + +inline bool operator<=(const float16& a, const float16& b) { + return static_cast(a) <= static_cast(b); +} + +inline bool operator>(const float16& a, const float16& b) { + return static_cast(a) > static_cast(b); +} + +inline bool operator>=(const float16& a, const float16& b) { + return static_cast(a) >= static_cast(b); +} +#endif + +HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) { + float16 res; + res.x = a; + return res; +} + +// HIPCC has compile error if call __device__ function __hisnan in __host__ +// __device__ function +#if defined(PADDLE_CUDA_FP16) && defined(__HIPCC__) +DEVICE inline bool(isnan)(const float16& a) { return __hisnan(a.to_half()); } +HOST inline bool(isnan)(const float16& a) { return (a.x & 0x7fff) > 0x7c00; } +#else +HOSTDEVICE inline bool(isnan)(const float16& a) { +#if defined(PADDLE_CUDA_FP16) && 
defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hisnan(a.to_half()); +#else + return (a.x & 0x7fff) > 0x7c00; +#endif +} +#endif + +HOSTDEVICE inline bool(isinf)(const float16& a) { + return (a.x & 0x7fff) == 0x7c00; +} + +HOSTDEVICE inline bool(isfinite)(const float16& a) { + return !((isnan)(a)) && !((isinf)(a)); +} + +HOSTDEVICE inline float16(abs)(const float16& a) { +#if defined(PADDLE_CUDA_FP16) && \ + (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)) + return float16(::fabs(static_cast(a))); +#else + return float16(std::abs(static_cast(a))); +#endif +} + +inline std::ostream& operator<<(std::ostream& os, const float16& a) { + os << static_cast(a); + return os; +} + +} // namespace dtype +} // namespace pten + +namespace std { + +// Override the std::is_pod::value for float16 +// The reason is that different compilers implemented std::is_pod based on +// different C++ standards. float16 class is a plain old data in C++11 given +// that it is both trivial and standard_layout. +// However, std::is_pod in nvcc 8.0 host c++ compiler follows C++0x and is +// more restricted in that you cannot provide any customized +// constructor in float16. Hence, we override is_pod here following C++11 +// so that .cu files can be successfully compiled by nvcc. +template <> +struct is_pod { + static const bool value = is_trivial::value && + is_standard_layout::value; +}; + +template <> +struct is_floating_point + : std::integral_constant< + bool, + std::is_same< + pten::dtype::float16, + typename std::remove_cv::type>::value> {}; +template <> +struct is_signed { + static const bool value = true; +}; + +template <> +struct is_unsigned { + static const bool value = false; +}; + +inline bool isnan(const pten::dtype::float16& a) { + return pten::dtype::isnan(a); +} + +inline bool isinf(const pten::dtype::float16& a) { + return pten::dtype::isinf(a); +} + +template <> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 11; + static const int digits10 = 3; + static const int max_digits10 = 5; + static const int radix = 2; + static const int min_exponent = -13; + static const int min_exponent10 = -4; + static const int max_exponent = 16; + static const int max_exponent10 = 4; + static const bool traps = true; + static const bool tinyness_before = false; + + HOSTDEVICE static pten::dtype::float16(min)() { + return pten::dtype::raw_uint16_to_float16(0x400); + } + HOSTDEVICE static pten::dtype::float16 lowest() { + return pten::dtype::raw_uint16_to_float16(0xfbff); + } + HOSTDEVICE static pten::dtype::float16(max)() { + return pten::dtype::raw_uint16_to_float16(0x7bff); + } + HOSTDEVICE static pten::dtype::float16 epsilon() { + return pten::dtype::raw_uint16_to_float16(0x0800); + } + HOSTDEVICE static pten::dtype::float16 round_error() { + return pten::dtype::float16(0.5); + } + HOSTDEVICE static pten::dtype::float16 infinity() { + return pten::dtype::raw_uint16_to_float16(0x7c00); + } + HOSTDEVICE 
static pten::dtype::float16 quiet_NaN() { + return pten::dtype::raw_uint16_to_float16(0x7e00); + } + HOSTDEVICE static pten::dtype::float16 signaling_NaN() { + return pten::dtype::raw_uint16_to_float16(0x7e00); + } + HOSTDEVICE static pten::dtype::float16 denorm_min() { + return pten::dtype::raw_uint16_to_float16(0x1); + } +}; + +HOSTDEVICE inline pten::dtype::float16 abs(const pten::dtype::float16& a) { + return pten::dtype::abs(a); +} + +} // namespace std diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index cfe2cfa03e..4008b6f6ce 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/pten/core/dense_tensor.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/convert_utils.h" diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h index d12fc730fe..ff27144eb4 100644 --- a/paddle/pten/kernels/complex_kernel.h +++ b/paddle/pten/kernels/complex_kernel.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" diff --git a/paddle/pten/kernels/cpu/complex_kernel.cc b/paddle/pten/kernels/cpu/complex_kernel.cc index 59a7577153..6cdba15620 100644 --- a/paddle/pten/kernels/cpu/complex_kernel.cc +++ b/paddle/pten/kernels/cpu/complex_kernel.cc @@ -19,7 +19,7 @@ #include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" PT_REGISTER_KERNEL(conj, CPU, diff --git a/paddle/pten/kernels/cpu/dot_grad_kernel.cc b/paddle/pten/kernels/cpu/dot_grad_kernel.cc index ed927f820f..91202cf836 100644 --- a/paddle/pten/kernels/cpu/dot_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_grad_kernel.cc @@ -18,7 +18,7 @@ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" PT_REGISTER_KERNEL(dot_grad, CPU, diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc index 0baf9ba0a8..5745737bba 100644 --- a/paddle/pten/kernels/cpu/dot_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_kernel.cc @@ -18,7 +18,7 @@ #include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" namespace pten { diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index 83388d0d9a..7841dd4113 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -25,8 +25,8 @@ // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/complex.h" namespace pten { diff --git a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc index 4738e21573..955f2b017b 100644 --- a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/pten/kernels/matmul_grad_kernel.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/impl/matmul_grad_kernel_impl.h" diff --git a/paddle/pten/kernels/cpu/matmul_kernel.cc b/paddle/pten/kernels/cpu/matmul_kernel.cc index f749e9cb27..51def07d40 100644 --- a/paddle/pten/kernels/cpu/matmul_kernel.cc +++ b/paddle/pten/kernels/cpu/matmul_kernel.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" #include "paddle/pten/kernels/impl/matmul_kernel_impl.h" PT_REGISTER_KERNEL(matmul, diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc index 7088bba01a..52949b5896 100644 --- a/paddle/pten/kernels/cpu/scale_kernel.cc +++ b/paddle/pten/kernels/cpu/scale_kernel.cc @@ -22,7 +22,7 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/pten/common/bfloat16.h" namespace pten { template diff --git a/paddle/pten/kernels/cpu/sign_kernel.cc b/paddle/pten/kernels/cpu/sign_kernel.cc index 25fa2bb5fe..642d6ff6c5 100644 --- a/paddle/pten/kernels/cpu/sign_kernel.cc +++ b/paddle/pten/kernels/cpu/sign_kernel.cc @@ -19,6 +19,6 @@ limitations under the License. */ #include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/pten/common/bfloat16.h" PT_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, pten::SignKernel, float, double) {} diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index c133d7fc79..d6a155dca0 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -16,7 +16,7 @@ #include "paddle/pten/backends/all_context.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" namespace pten { diff --git a/paddle/pten/kernels/funcs/elementwise_functor.h b/paddle/pten/kernels/funcs/elementwise_functor.h index a61d0de8fe..6b89902456 100644 --- a/paddle/pten/kernels/funcs/elementwise_functor.h +++ b/paddle/pten/kernels/funcs/elementwise_functor.h @@ -15,8 +15,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/common/float16.h" namespace pten { namespace funcs { diff --git a/paddle/pten/kernels/funcs/transpose.cc b/paddle/pten/kernels/funcs/transpose.cc index 5a40abbd1b..77d26fcbc3 100644 --- a/paddle/pten/kernels/funcs/transpose.cc +++ b/paddle/pten/kernels/funcs/transpose.cc @@ -18,9 +18,9 @@ #include "paddle/pten/core/dense_tensor.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" namespace pten { namespace math { diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu index 77a345d7a0..045bfdbdb0 100644 --- a/paddle/pten/kernels/funcs/transpose.cu +++ b/paddle/pten/kernels/funcs/transpose.cu @@ -19,9 +19,9 @@ #include "paddle/pten/kernels/funcs/transpose.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu index aa61155221..3774c56370 100644 --- a/paddle/pten/kernels/gpu/cast_kernel.cu +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -21,10 +21,10 @@ // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/float16.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/complex_kernel.cu b/paddle/pten/kernels/gpu/complex_kernel.cu index 1c82077793..cd9c95de2a 100644 --- a/paddle/pten/kernels/gpu/complex_kernel.cu +++ b/paddle/pten/kernels/gpu/complex_kernel.cu @@ -19,7 +19,7 @@ #include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" PT_REGISTER_KERNEL(conj, GPU, diff --git a/paddle/pten/kernels/gpu/dot_grad_kernel.cu b/paddle/pten/kernels/gpu/dot_grad_kernel.cu index 4b0d7fed4c..90c37ea1b0 100644 --- a/paddle/pten/kernels/gpu/dot_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_grad_kernel.cu @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" PT_REGISTER_KERNEL(dot_grad, GPU, diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu index 18bab5c15a..5fe397e128 100644 --- a/paddle/pten/kernels/gpu/dot_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_kernel.cu @@ -20,7 +20,7 @@ // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 1fd085ab5f..80931db56c 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -27,9 +27,9 @@ limitations under the License. */ namespace cub = hipcub; #endif -#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/complex.h" +#include "paddle/pten/common/float16.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/kernel_registry.h" diff --git a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu index 993b17f6b8..31c44673f9 100644 --- a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/pten/kernels/matmul_grad_kernel.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/impl/matmul_grad_kernel_impl.h" diff --git a/paddle/pten/kernels/gpu/matmul_kernel.cu b/paddle/pten/kernels/gpu/matmul_kernel.cu index a3ab88913a..f9fdbd27bf 100644 --- a/paddle/pten/kernels/gpu/matmul_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_kernel.cu @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/common/complex.h" #include "paddle/pten/kernels/impl/matmul_kernel_impl.h" PT_REGISTER_KERNEL(matmul, diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index b49902ff5e..e729dad3b3 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/funcs/elementwise_base.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/float16.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/sign_kernel.cu b/paddle/pten/kernels/gpu/sign_kernel.cu index 16356507dc..2a96ff6530 100644 --- a/paddle/pten/kernels/gpu/sign_kernel.cu +++ b/paddle/pten/kernels/gpu/sign_kernel.cu @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/float16.h" using float16 = paddle::platform::float16; diff --git a/python/setup.py.in b/python/setup.py.in index 6b38facb5f..aee4e149b0 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -569,9 +569,6 @@ headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/common')) + # pten common headers # For paddle uew custom op, only copy data type headers from `paddle/fluid/platform` # to `paddle/pten/api/ext`, - ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/bfloat16.h'] + - ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex.h'] + - ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/float16.h'] + ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h']) if '${WITH_MKLDNN}' == 'ON': -- GitLab
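
For reference, a minimal usage sketch of the migrated API (illustrative only, not part of the patch): it assumes a translation unit compiled inside a Paddle source tree where the paddle/pten include paths are visible to the compiler, and the file name and values below are made up. It only exercises the surface this change moves, i.e. the headers now living under paddle/pten/common and the types under pten::dtype.

// example_pten_dtype.cc -- illustrative sketch, not part of this patch.
// Assumes the Paddle include directories are on the compiler's include path.
#include <iostream>

#include "paddle/pten/common/bfloat16.h"  // previously paddle/fluid/platform/bfloat16.h
#include "paddle/pten/common/complex.h"   // previously paddle/fluid/platform/complex.h
#include "paddle/pten/common/float16.h"   // previously paddle/fluid/platform/float16.h

int main() {
  // The types are now spelled pten::dtype::* instead of paddle::platform::*.
  pten::dtype::float16 h(1.5f);               // 16-bit storage, h.x == 0x3e00
  pten::dtype::bfloat16 b(2.0f);
  pten::dtype::complex<float> c(1.0f, -2.0f);

  // Arithmetic, the math helpers and the std:: specializations behave as before the move.
  pten::dtype::float16 sum = h + pten::dtype::float16(0.5f);
  std::cout << "sum      = " << sum << "\n";                       // 2
  std::cout << "b        = " << static_cast<float>(b) << "\n";     // 2
  std::cout << "tanh(c)  = " << pten::dtype::tanh(c) << "\n";      // prints "real:... imag:..."
  std::cout << std::boolalpha
            << "isnan(sum) = " << std::isnan(sum) << "\n";         // false
  return 0;
}

Downstream code that goes through the paddle::experimental aliases (complex64, complex128, float16, bfloat16) needs no source changes, since the data_type.h hunk above retargets those usings from ::paddle::platform to ::pten::dtype.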