PaddlePaddle / Paddle, commit 878e117b (unverified)
Authored by Chen Weihang on Mar 19, 2021; committed via GitHub on Mar 19, 2021.
[CustomOp] Support float16 in custom op (#31725)

* support float16 in custom op
* fix failed unittests
Parent: c9e1d9dc
Showing 13 changed files with 116 additions and 29 deletions (+116, -29):
cmake/inference_lib.cmake                                           +3   -0
paddle/fluid/extension/include/ext_dispatch.h                       +16  -0
paddle/fluid/extension/include/ext_dtype.h                          +6   -0
paddle/fluid/extension/src/ext_tensor.cc                            +16  -0
paddle/fluid/framework/custom_tensor_test.cc                        +11  -0
paddle/fluid/framework/custom_tensor_utils.h                        +4   -0
python/paddle/fluid/tests/custom_op/CMakeLists.txt                  +2   -11
python/paddle/fluid/tests/custom_op/custom_relu_op.cu               +5   -4
python/paddle/fluid/tests/custom_op/dispatch_test_op.cc             +18  -0
python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py      +14  -6
python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py    +12  -2
python/paddle/fluid/tests/custom_op/test_dispatch_jit.py            +6   -0
python/setup.py.in                                                  +3   -6
cmake/inference_lib.cmake
@@ -198,6 +198,9 @@ copy(inference_lib_dist
 copy(inference_lib_dist
         SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex128.h
         DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
+copy(inference_lib_dist
+        SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h
+        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
 
 # CAPI inference library for only inference
 set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING
paddle/fluid/extension/include/ext_dispatch.h
@@ -47,6 +47,22 @@ namespace paddle {
     }                                                                      \
   }()
 
+#define PD_DISPATCH_FLOATING_AND_HALF_TYPES(TYPE, NAME, ...)               \
+  [&] {                                                                    \
+    const auto& __dtype__ = TYPE;                                          \
+    switch (__dtype__) {                                                   \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float,       \
+                           __VA_ARGS__)                                    \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double,      \
+                           __VA_ARGS__)                                    \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT16,              \
+                           paddle::float16, __VA_ARGS__)                   \
+      default:                                                             \
+        PD_THROW("function " #NAME " is not implemented for data type `",  \
+                 ::paddle::ToString(__dtype__), "`");                      \
+    }                                                                      \
+  }()
+
 ///////// Integral Dispatch Marco ///////////
 
 #define PD_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...)                        \
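For orientation: each PD_PRIVATE_CASE_TYPE expands to a case label that binds the matched C++ type to the name data_t inside the trailing lambda, so a single kernel template covers float, double, and now paddle::float16. Below is a minimal sketch of how an op author might use the new macro on CPU; the scale_cpu_kernel helper and the doubling op are illustrative, not part of this commit, and the paddle/extension.h umbrella include is assumed.

// Hypothetical example op, not from this commit.
#include "paddle/extension.h"

template <typename data_t>
void scale_cpu_kernel(const data_t* x, data_t* out, int64_t n) {
  for (int64_t i = 0; i < n; ++i) {
    // static_cast keeps the literal valid for paddle::float16 too.
    out[i] = x[i] * static_cast<data_t>(2.0);
  }
}

std::vector<paddle::Tensor> ScaleForward(const paddle::Tensor& x) {
  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
  out.reshape(x.shape());
  // FLOAT32, FLOAT64 and FLOAT16 inputs are dispatched; any other dtype
  // reaches the default branch and throws via PD_THROW.
  PD_DISPATCH_FLOATING_AND_HALF_TYPES(
      x.type(), "scale_cpu_kernel", ([&] {
        scale_cpu_kernel<data_t>(
            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
      }));
  return {out};
}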
paddle/fluid/extension/include/ext_dtype.h
@@ -19,11 +19,13 @@ limitations under the License. */
 #include "complex128.h"     // NOLINT
 #include "complex64.h"      // NOLINT
 #include "ext_exception.h"  // NOLINT
+#include "float16.h"        // NOLINT
 
 namespace paddle {
 
 using complex64 = paddle::platform::complex64;
 using complex128 = paddle::platform::complex128;
+using float16 = paddle::platform::float16;
 
 enum class DataType {
   BOOL,
@@ -32,6 +34,7 @@ enum class DataType {
   INT16,
   INT32,
   INT64,
+  FLOAT16,
   FLOAT32,
   FLOAT64,
   COMPLEX64,
@@ -53,6 +56,8 @@ inline std::string ToString(DataType dtype) {
       return "int32_t";
     case DataType::INT64:
      return "int64_t";
+    case DataType::FLOAT16:
+      return "float16";
     case DataType::FLOAT32:
       return "float";
    case DataType::FLOAT64:
@@ -73,6 +78,7 @@ inline std::string ToString(DataType dtype) {
   _(int16_t, DataType::INT16)                 \
   _(int, DataType::INT32)                     \
   _(int64_t, DataType::INT64)                 \
+  _(float16, DataType::FLOAT16)               \
   _(float, DataType::FLOAT32)                 \
   _(double, DataType::FLOAT64)                \
   _(complex64, DataType::COMPLEX64)           \
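Worth noting: ToString is what the dispatch macros interpolate into PD_THROW messages, so the new FLOAT16 case keeps error text readable for half-precision inputs. A tiny guard built from the same pieces (hypothetical, for illustration only):

// Hypothetical check, not from this commit: reject non-fp16 inputs
// with a readable dtype name in the error message.
void CheckHalf(const paddle::Tensor& t) {
  if (t.type() != paddle::DataType::FLOAT16) {
    PD_THROW("expected float16 input, got `", paddle::ToString(t.type()), "`");
  }
}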
paddle/fluid/extension/src/ext_tensor.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
@@ -170,6 +171,8 @@ DataType Tensor::type() const {
     return DataType::COMPLEX64;
   } else if (type == framework::proto::VarType::COMPLEX128) {
     return DataType::COMPLEX128;
+  } else if (type == framework::proto::VarType::FP16) {
+    return DataType::FLOAT16;
   }
   // TODO(JiabinYang) Support more dtype here
   return DataType::FLOAT32;
@@ -229,6 +232,8 @@ template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex64>(
     const PlaceType &target_place) const;
 template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex128>(
     const PlaceType &target_place) const;
+template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::float16>(
+    const PlaceType &target_place) const;
 
 template PD_DLL_DECL float *Tensor::data<float>() const;
 template PD_DLL_DECL double *Tensor::data<double>() const;
@@ -242,6 +247,8 @@ template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::data<paddle::platform::complex64>() const;
 template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::data<paddle::platform::complex128>() const;
+template PD_DLL_DECL paddle::platform::float16 *
+Tensor::data<paddle::platform::float16>() const;
 
 template PD_DLL_DECL float *Tensor::mutable_data<float>();
 template PD_DLL_DECL double *Tensor::mutable_data<double>();
@@ -255,6 +262,8 @@ template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::mutable_data<paddle::platform::complex64>();
 template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::mutable_data<paddle::platform::complex128>();
+template PD_DLL_DECL paddle::platform::float16 *
+Tensor::mutable_data<paddle::platform::float16>();
 
 template PD_DLL_DECL float *Tensor::mutable_data<float>(const PlaceType &place);
 template PD_DLL_DECL double *Tensor::mutable_data<double>(
@@ -274,6 +283,8 @@ template PD_DLL_DECL paddle::platform::complex64 *
 Tensor::mutable_data<paddle::platform::complex64>(const PlaceType &place);
 template PD_DLL_DECL paddle::platform::complex128 *
 Tensor::mutable_data<paddle::platform::complex128>(const PlaceType &place);
+template PD_DLL_DECL paddle::platform::float16 *
+Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);
 
 std::vector<int64_t> Tensor::shape() const {
   GET_CASTED_TENSOR
@@ -344,6 +355,11 @@ Tensor Tensor::cast(const DataType &target_type) const {
           CastDataType<paddle::platform::complex128>(*tensor, rlt_tensor_, ctx));
       break;
+    case framework::proto::VarType::FP16:
+      framework::VisitDataType(
+          dst_type,
+          CastDataType<paddle::platform::float16>(*tensor, rlt_tensor_, ctx));
+      break;
     // TODO(JiabinYang) Support more dtype here
     default:
       PADDLE_THROW(platform::errors::Unimplemented(
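These PD_DLL_DECL instantiations are load-bearing: the Tensor templates are defined in this .cc file rather than in a header, so without an explicit instantiation a custom op linking against the Paddle shared library could not resolve the float16 symbols. A caller sketch under that assumption (the wrapper function is hypothetical):

// Hypothetical wrapper: moves a half-precision tensor to GPU. It links
// only because copy_to<paddle::platform::float16> is explicitly
// instantiated and exported above.
paddle::Tensor ToGpuHalf(const paddle::Tensor& half_tensor) {
  return half_tensor.copy_to<paddle::platform::float16>(
      paddle::PlaceType::kGPU);
}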
paddle/fluid/framework/custom_tensor_test.cc
@@ -113,6 +113,8 @@ void GroupTestCopy() {
   TestCopyTensor<paddle::complex64>();
   VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu";
   TestCopyTensor<paddle::complex128>();
+  VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<paddle::float16>();
 }
 
 void GroupTestCast() {
@@ -134,6 +136,8 @@ void GroupTestCast() {
   TestCast<paddle::complex64>(paddle::DataType::FLOAT32);
   VLOG(2) << "complex128 cast";
   TestCast<paddle::complex128>(paddle::DataType::FLOAT32);
+  VLOG(2) << "float16 cast";
+  TestCast<paddle::float16>(paddle::DataType::FLOAT16);
 }
 
 void GroupTestDtype() {
@@ -146,6 +150,7 @@ void GroupTestDtype() {
   CHECK(TestDtype<uint8_t>() == paddle::DataType::UINT8);
   CHECK(TestDtype<paddle::complex64>() == paddle::DataType::COMPLEX64);
   CHECK(TestDtype<paddle::complex128>() == paddle::DataType::COMPLEX128);
+  CHECK(TestDtype<paddle::float16>() == paddle::DataType::FLOAT16);
 }
 
 void GroupTestDtypeConvert() {
@@ -178,6 +183,9 @@ void GroupTestDtypeConvert() {
   CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
             paddle::DataType::COMPLEX128) ==
         paddle::framework::proto::VarType::COMPLEX128);
+  CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(
+            paddle::DataType::FLOAT16) ==
+        paddle::framework::proto::VarType::FP16);
   // proto -> enum
   CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
             paddle::framework::proto::VarType::FP64) ==
@@ -207,6 +215,9 @@ void GroupTestDtypeConvert() {
   CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
             paddle::framework::proto::VarType::COMPLEX128) ==
         paddle::DataType::COMPLEX128);
+  CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType(
+            paddle::framework::proto::VarType::FP16) ==
+        paddle::DataType::FLOAT16);
 }
 
 TEST(CustomTensor, copyTest) {
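The TestCast<paddle::float16> call above exercises the new FP16 branch in Tensor::cast from ext_tensor.cc. From user code the same path would look roughly like this hypothetical wrapper:

// Upcast a float16 tensor to float32 via the dtype switch in Tensor::cast.
paddle::Tensor HalfToFloat(const paddle::Tensor& half_tensor) {
  return half_tensor.cast(paddle::DataType::FLOAT32);
}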
paddle/fluid/framework/custom_tensor_utils.h
@@ -60,6 +60,8 @@ class CustomTensorUtils {
         return framework::proto::VarType::COMPLEX64;
       case paddle::DataType::COMPLEX128:
         return framework::proto::VarType::COMPLEX128;
+      case paddle::DataType::FLOAT16:
+        return framework::proto::VarType::FP16;
       case paddle::DataType::BOOL:
         return framework::proto::VarType::BOOL;
       default:
@@ -91,6 +93,8 @@ class CustomTensorUtils {
         return paddle::DataType::COMPLEX64;
       case framework::proto::VarType::COMPLEX128:
         return paddle::DataType::COMPLEX128;
+      case framework::proto::VarType::FP16:
+        return paddle::DataType::FLOAT16;
       case framework::proto::VarType::BOOL:
         return paddle::DataType::BOOL;
       default:
python/paddle/fluid/tests/custom_op/CMakeLists.txt
@@ -13,24 +13,15 @@ endif()
 py_test(test_sysconfig SRCS test_sysconfig.py)
 
-# 'test_dispatch' compile .cc file
+# CPU custom op tests: only compile .cc file
 py_test(test_dispatch_jit SRCS test_dispatch_jit.py)
 set_tests_properties(test_dispatch_jit PROPERTIES TIMEOUT 120)
 py_test(test_multi_out_jit SRCS test_multi_out_jit.py)
 set_tests_properties(test_multi_out_jit PROPERTIES TIMEOUT 120)
 py_test(test_custom_attrs_jit SRCS test_custom_attrs_jit.py)
 set_tests_properties(test_custom_attrs_jit PROPERTIES TIMEOUT 120)
 py_test(test_custom_concat SRCS test_custom_concat.py)
 set_tests_properties(test_custom_concat PROPERTIES TIMEOUT 120)
 py_test(test_custom_conj SRCS test_custom_conj.py)
 set_tests_properties(test_custom_conj PROPERTIES TIMEOUT 120)
 
 # other tests
 py_test(test_check_abi SRCS test_check_abi.py)
 cc_test(test_check_error SRCS test_check_error.cc DEPS gtest)
 
 if(NOT LINUX)
python/paddle/fluid/tests/custom_op/custom_relu_op.cu
@@ -20,7 +20,7 @@ __global__ void relu_cuda_forward_kernel(const data_t* x,
                                          const int num) {
   int gid = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
-    y[i] = max(x[i], static_cast<data_t>(0.));
+    y[i] = x[i] > static_cast<data_t>(0.) ? x[i] : static_cast<data_t>(0.);
   }
 }
@@ -31,7 +31,8 @@ __global__ void relu_cuda_backward_kernel(const data_t* dy,
                                           const int num) {
   int gid = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
-    dx[i] = dy[i] * (y[i] > 0 ? 1. : 0.);
+    dx[i] = dy[i] * (y[i] > static_cast<data_t>(0.) ? static_cast<data_t>(1.)
+                                                    : static_cast<data_t>(0.));
   }
 }
@@ -42,7 +43,7 @@ std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
   int numel = x.size();
   int block = 512;
   int grid = (numel + block - 1) / block;
-  PD_DISPATCH_FLOATING_TYPES(
+  PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       x.type(), "relu_cuda_forward_kernel", ([&] {
         relu_cuda_forward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
             x.data<data_t>(), out.mutable_data<data_t>(x.place()), numel);
@@ -60,7 +61,7 @@ std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
   int numel = out.size();
   int block = 512;
   int grid = (numel + block - 1) / block;
-  PD_DISPATCH_FLOATING_TYPES(
+  PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       out.type(), "relu_cuda_backward_kernel", ([&] {
         relu_cuda_backward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
             grad_out.data<data_t>(),
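A plausible reading of the kernel rewrites: CUDA's max() has no overload for paddle::float16, so the forward kernel would stop compiling once the half type is dispatched; the comparison form needs only operator> and an explicit conversion from double, both of which float16 provides, and the backward kernel gets the matching static_cast treatment so the literals 1. and 0. convert to data_t as well. The pattern in isolation, as plain C++:

// Compiles uniformly for float, double, and paddle::float16: no max()
// overload is required, only operator> and a conversion from double.
template <typename data_t>
data_t relu_value(data_t x) {
  return x > static_cast<data_t>(0.) ? x : static_cast<data_t>(0.);
}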
python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
@@ -118,3 +118,21 @@ PD_BUILD_OP(dispatch_test_float_and_integer_and_complex)
     .Inputs({"X"})
     .Outputs({"Out"})
     .SetKernelFn(PD_KERNEL(DispatchTestFloatAndIntegerAndComplex));
+
+std::vector<paddle::Tensor> DispatchTestFloatAndHalf(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+  out.reshape(x.shape());
+
+  PD_DISPATCH_FLOATING_AND_HALF_TYPES(
+      x.type(), "assign_cpu_kernel", ([&] {
+        assign_cpu_kernel<data_t>(
+            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+      }));
+
+  return {out};
+}
+
+PD_BUILD_OP(dispatch_test_float_and_half)
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(DispatchTestFloatAndHalf));
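For reference, assign_cpu_kernel is defined earlier in this file rather than in the hunk; it is a plain element-wise copy, roughly of this shape:

// Sketch of the shared helper the dispatch tests call (the exact
// definition lives near the top of dispatch_test_op.cc, outside this hunk).
template <typename data_t>
void assign_cpu_kernel(const data_t* x_data, data_t* out_data,
                       int64_t x_numel) {
  for (int64_t i = 0; i < x_numel; ++i) {
    out_data[i] = x_data[i];
  }
}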
python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
@@ -50,11 +50,17 @@ class TestJITLoad(unittest.TestCase):
             custom_module.custom_relu, custom_module.custom_relu_dup
         ]
         self.dtypes = ['float32', 'float64']
-        self.devices = ['cpu', 'gpu']
+        if paddle.is_compiled_with_cuda():
+            self.dtypes.append('float16')
+        self.devices = ['cpu']
+        if paddle.is_compiled_with_cuda():
+            self.devices.append('gpu')
 
     def test_static(self):
         for device in self.devices:
             for dtype in self.dtypes:
+                if device == 'cpu' and dtype == 'float16':
+                    continue
                 x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
                 for custom_op in self.custom_ops:
                     out = custom_relu_static(custom_op, device, dtype, x)
@@ -68,6 +74,8 @@ class TestJITLoad(unittest.TestCase):
     def test_dynamic(self):
         for device in self.devices:
             for dtype in self.dtypes:
+                if device == 'cpu' and dtype == 'float16':
+                    continue
                 x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
                 for custom_op in self.custom_ops:
                     out, x_grad = custom_relu_dynamic(custom_op, device, dtype,
@@ -87,7 +95,7 @@ class TestJITLoad(unittest.TestCase):
         caught_exception = False
         try:
             x = np.random.uniform(-1, 1, [4, 8]).astype('int32')
-            custom_relu_dynamic(custom_module.custom_relu, 'cpu', 'float32', x)
+            custom_relu_dynamic(custom_module.custom_relu, 'cpu', 'int32', x)
         except OSError as e:
             caught_exception = True
         self.assertTrue(
@@ -105,15 +113,15 @@ class TestJITLoad(unittest.TestCase):
         caught_exception = False
         try:
-            x = np.random.uniform(-1, 1, [4, 8]).astype('int64')
-            custom_relu_dynamic(custom_module.custom_relu, 'gpu', 'float32', x)
+            x = np.random.uniform(-1, 1, [4, 8]).astype('int32')
+            custom_relu_dynamic(custom_module.custom_relu, 'gpu', 'int32', x)
         except OSError as e:
             caught_exception = True
         self.assertTrue(
-            "function \"relu_cuda_forward_kernel\" is not implemented for data type `int64_t`"
+            "function \"relu_cuda_forward_kernel\" is not implemented for data type `int32_t`"
             in str(e))
         self.assertTrue(
-            "python/paddle/fluid/tests/custom_op/custom_relu_op.cu:49" in str(e))
+            "python/paddle/fluid/tests/custom_op/custom_relu_op.cu:50" in str(e))
         self.assertTrue(caught_exception)
python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
@@ -26,7 +26,7 @@ from paddle.utils.cpp_extension.extension_utils import run_cmd
 def custom_relu_dynamic(func, device, dtype, np_x, use_func=True):
     paddle.set_device(device)
 
-    t = paddle.to_tensor(np_x)
+    t = paddle.to_tensor(np_x, dtype=dtype)
     t.stop_gradient = False
 
     out = func(t) if use_func else paddle.nn.functional.relu(t)
@@ -171,7 +171,11 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
         ]
         self.dtypes = ['float32', 'float64']
-        self.devices = ['cpu', 'gpu']
+        if paddle.is_compiled_with_cuda():
+            self.dtypes.append('float16')
+        self.devices = ['cpu']
+        if paddle.is_compiled_with_cuda():
+            self.devices.append('gpu')
 
         # config seed
         SEED = 2021
@@ -181,6 +185,8 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
     def test_static(self):
         for device in self.devices:
             for dtype in self.dtypes:
+                if device == 'cpu' and dtype == 'float16':
+                    continue
                 x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
                 for custom_op in self.custom_ops:
                     out = custom_relu_static(custom_op, device, dtype, x)
@@ -194,6 +200,8 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
     def test_static_pe(self):
         for device in self.devices:
             for dtype in self.dtypes:
+                if device == 'cpu' and dtype == 'float16':
+                    continue
                 x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
                 for custom_op in self.custom_ops:
                     out = custom_relu_static_pe(custom_op, device, dtype, x)
@@ -207,6 +215,8 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
     def test_dynamic(self):
         for device in self.devices:
             for dtype in self.dtypes:
+                if device == 'cpu' and dtype == 'float16':
+                    continue
                 x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
                 for custom_op in self.custom_ops:
                     out, x_grad = custom_relu_dynamic(custom_op, device, dtype,
python/paddle/fluid/tests/custom_op/test_dispatch_jit.py
@@ -83,6 +83,12 @@ class TestJitDispatch(unittest.TestCase):
             self.run_dispatch_test(
                 dispatch_op.dispatch_test_float_and_integer_and_complex, dtype)
 
+    def test_dispatch_float_and_half(self):
+        dtypes = ["float32", "float64", "float16"]
+        for dtype in dtypes:
+            self.run_dispatch_test(
+                dispatch_op.dispatch_test_float_and_half, dtype)
+
 
 if __name__ == '__main__':
     unittest.main()
python/setup.py.in
@@ -453,15 +453,12 @@ class InstallHeaders(Command):
     def copy_data_type_headers(self, header):
         if os.name == 'nt':
-            data_type_headers = ['platform\\complex64.h', 'platform\\complex128.h']
+            data_type_headers = ['platform\\complex64.h', 'platform\\complex128.h', 'platform\\float16.h']
         else:
-            data_type_headers = ['platform/complex64.h', 'platform/complex128.h']
+            data_type_headers = ['platform/complex64.h', 'platform/complex128.h', 'platform/float16.h']
         for dtype_header in data_type_headers:
             if dtype_header in header:
-                if os.name == 'nt':
-                    install_dir = os.path.join(self.install_dir, "paddle\\fluid\\extension\\include")
-                else:
-                    install_dir = os.path.join(self.install_dir, "paddle/fluid/extension/include")
+                install_dir = os.path.join(self.install_dir, "paddle/fluid/extension/include")
                 if not os.path.exists(install_dir):
                     self.mkpath(install_dir)
                 return self.copy_file(header, install_dir)