Unverified · Commit c91b1b91 · Authored by: thunder95 · Committed by: GitHub

[PaddlePaddle Hackathon 3 No.45 & 46]: Support the float16 data type for Paddle cumsum and logcumsumexp (#45952)

Parent: a3436672
@@ -27,11 +27,25 @@ namespace cub = hipcub;
 #endif

 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/kernel_registry.h"

 namespace phi {

+template <typename T>
+class CumTypeTrait {
+ public:
+  using Type = T;
+};
+
+template <>
+class CumTypeTrait<phi::dtype::float16> {
+ public:
+  using Type = __half;
+};
+
 template <typename T, int BLOCK_SIZE>
 __device__ void BlockReverse(
     const T* idata, T* odata, int src_base, int dst_base, int valid_item) {
@@ -39,7 +53,7 @@ __device__ void BlockReverse(
   int tx = threadIdx.x;
   int offset = tx;
-  T src_data = 0;
+  T src_data = static_cast<T>(0);
   int src_offset = BLOCK_SIZE - offset - 1;
   if (src_offset < valid_item) {
     src_data = idata[src_base + src_offset];
@@ -160,14 +174,18 @@ __global__ void BlockScanKernel(T* d_out,
                                 int scan_size,
                                 bool exclusive,
                                 Op op) {
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
   typedef cub::
-      BlockLoad<T, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_TRANSPOSE>
+      BlockLoad<MT, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_TRANSPOSE>
          BlockLoadT;
-  typedef cub::
-      BlockStore<T, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_STORE_TRANSPOSE>
-          BlockStoreT;
-  typedef cub::BlockScan<T, BLOCK_THREADS> BlockScanT;
+  typedef cub::BlockStore<MT,
+                          BLOCK_THREADS,
+                          ITEMS_PER_THREAD,
+                          cub::BLOCK_STORE_TRANSPOSE>
+      BlockStoreT;
+  typedef cub::BlockScan<MT, BLOCK_THREADS> BlockScanT;
   // Allocate type-safe, repurposable shared memory for collectives
   __shared__ union {
     typename BlockLoadT::TempStorage load;
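Note: MPTypeTrait picks a wider accumulation type for the block scan (for phi::dtype::float16 this is presumably float), so partial sums are not kept in half precision. The following numpy sketch is my own illustration of why that matters; the array size and values are not taken from the patch.

    import numpy as np

    np.random.seed(0)
    x = np.random.rand(10000).astype(np.float16)

    half_acc = np.cumsum(x)                    # running sum kept in float16
    wide_acc = np.cumsum(x, dtype=np.float32)  # accumulate in float32, as MT does for fp16

    # the half-precision running sum drifts noticeably once the total grows
    print(float(half_acc[-1]), float(wide_acc[-1]))
    print(abs(float(half_acc[-1]) - float(wide_acc[-1])) / float(wide_acc[-1]))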
@@ -176,8 +194,7 @@ __global__ void BlockScanKernel(T* d_out,
   } temp_storage;

   int bx = blockIdx.x;
-
-  BlockPrefixCallbackOp<T, Op> prefix_op(Identity<T, Op>::value, op);
+  BlockPrefixCallbackOp<MT, Op> prefix_op(Identity<MT, Op>::value, op);

   // Obtain this block's segment of consecutive keys (blocked across threads)
   int item_per_block = BLOCK_THREADS * ITEMS_PER_THREAD;
@@ -192,7 +209,7 @@ __global__ void BlockScanKernel(T* d_out,
     int offset = block_offset + bx * scan_size;
-    T thread_keys[ITEMS_PER_THREAD];
+    MT thread_keys[ITEMS_PER_THREAD];
     BlockLoadT(temp_storage.load)
         .Load(d_in + offset, thread_keys, valid_item, 0);
@@ -241,17 +258,22 @@ void ScanKernel(const Context& dev_ctx,
   // Use thrust for parallel acceleration when the input size is equal to the
   // length of the ‘axis’ dimension.
-  if (std::is_same<Op, cub::Sum>::value && size == out_dims[axis]) {
+  if (!std::is_same<T, phi::dtype::float16>::value &&
+      std::is_same<Op, cub::Sum>::value && size == out_dims[axis]) {
 #ifdef __HIPCC__
     const auto& policy = thrust::hip::par.on(dev_ctx.stream());
 #else
     const auto& policy = thrust::cuda::par.on(dev_ctx.stream());
 #endif
+    using CumType = typename CumTypeTrait<T>::Type;
+    CumType* out_data_ptr = reinterpret_cast<CumType*>(out_data);
+    const CumType* in_data_ptr = reinterpret_cast<const CumType*>(in_data);
     if (reverse) {
-      thrust::reverse_iterator<thrust::device_ptr<const T>> reversed_in(
-          thrust::device_pointer_cast(in_data) + size);
-      thrust::reverse_iterator<thrust::device_ptr<T>> reversed_out(
-          thrust::device_pointer_cast(out_data) + size);
+      thrust::reverse_iterator<thrust::device_ptr<const CumType>> reversed_in(
+          thrust::device_pointer_cast(in_data_ptr) + size);
+      thrust::reverse_iterator<thrust::device_ptr<CumType>> reversed_out(
+          thrust::device_pointer_cast(out_data_ptr) + size);
       if (exclusive) {
         thrust::exclusive_scan(
             policy, reversed_in, reversed_in + size, reversed_out);
@@ -261,11 +283,14 @@ void ScanKernel(const Context& dev_ctx,
       }
     } else {
       if (exclusive) {
-        thrust::exclusive_scan(policy, in_data, in_data + size, out_data);
+        thrust::exclusive_scan(
+            policy, in_data_ptr, in_data_ptr + size, out_data_ptr);
       } else {
-        thrust::inclusive_scan(policy, in_data, in_data + size, out_data);
+        thrust::inclusive_scan(
+            policy, in_data_ptr, in_data_ptr + size, out_data_ptr);
       }
     }
     return;
   }
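Note: the thrust fast path above is taken only for cub::Sum when the scan covers the whole tensor and, after this patch, only for non-float16 inputs. For reference, the scan variants it dispatches (inclusive vs. exclusive, plus the reverse case built from reverse iterators) behave like the following numpy sketch, which is my own illustration rather than Paddle code.

    import numpy as np

    x = np.array([1., 2., 3., 4.], dtype=np.float32)

    inclusive = np.cumsum(x)                            # [ 1.  3.  6. 10.]
    exclusive = np.concatenate(([0.], inclusive[:-1]))  # [ 0.  1.  3.  6.]

    # reverse=True: scan from the last element, i.e. reverse input and output
    reverse_inclusive = np.cumsum(x[::-1])[::-1]        # [10.  9.  7.  4.]
    print(inclusive, exclusive, reverse_inclusive)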
@@ -305,7 +330,6 @@ void ScanKernel(const Context& dev_ctx,
   int outer_size = height / scan_size;
   int inner_size = width;
-  // Consider the size of shared memory, here block size is 128
   dim3 scan_grid(outer_size, inner_size);
   dim3 reverse_grid = scan_grid;
   if (reverse) {
@@ -380,6 +404,7 @@ void LogcumsumexpKernel(const Context& dev_ctx,
 }  // namespace phi

+#ifdef PADDLE_WITH_HIP
 PD_REGISTER_KERNEL(cumsum,
                    GPU,
                    ALL_LAYOUT,
@@ -392,3 +417,23 @@ PD_REGISTER_KERNEL(cumsum,
 PD_REGISTER_KERNEL(
     logcumsumexp, GPU, ALL_LAYOUT, phi::LogcumsumexpKernel, float, double) {}
+#else
+PD_REGISTER_KERNEL(cumsum,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CumsumKernel,
+                   float,
+                   double,
+                   int16_t,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(logcumsumexp,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LogcumsumexpKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+#endif
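Note: per the guard above, the float16 registrations are added only on the CUDA (non-HIP) build; the ROCm branch keeps the previous dtype list. A caller that wants the half-precision path could guard on the build roughly as in this hypothetical sketch (the helper name is mine, not part of the patch).

    import paddle

    def cumsum_prefer_fp16(x, axis=-1):
        # use the float16 kernel only where this patch registers it
        if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm():
            return paddle.cumsum(x.astype('float16'), axis=axis)
        return paddle.cumsum(x.astype('float32'), axis=axis)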
@@ -20,9 +20,19 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/logcumsumexp_grad_impl.h"

+#ifdef PADDLE_WITH_HIP
 PD_REGISTER_KERNEL(logcumsumexp_grad,
                    GPU,
                    ALL_LAYOUT,
                    phi::LogcumsumexpGradKernel,
                    float,
                    double) {}
+#else
+PD_REGISTER_KERNEL(logcumsumexp_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LogcumsumexpGradKernel,
+                   phi::dtype::float16,
+                   float,
+                   double) {}
+#endif
@@ -12,9 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#pragma once
 #include <limits>

 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/cum_kernel.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
@@ -55,32 +57,38 @@ void LogcumsumexpGradKernel(const Context& dev_ctx,
   auto eigen_d_out = EigenVector<T>::Flatten(d_out);
   auto& place = *dev_ctx.eigen_device();

+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   DenseTensor output_pos;
   output_pos.Resize(d_out.dims());
-  dev_ctx.template Alloc<T>(&output_pos);
-  auto eigen_output_pos = EigenVector<T>::Flatten(output_pos);
+  dev_ctx.template Alloc<MT>(&output_pos);
+  auto eigen_output_pos = EigenVector<MT>::Flatten(output_pos);
   DenseTensor output_neg;
   output_neg.Resize(d_out.dims());
-  dev_ctx.template Alloc<T>(&output_neg);
-  auto eigen_output_neg = EigenVector<T>::Flatten(output_neg);
+  dev_ctx.template Alloc<MT>(&output_neg);
+  auto eigen_output_neg = EigenVector<MT>::Flatten(output_neg);
   DenseTensor tmp;
   tmp.Resize(d_out.dims());
-  dev_ctx.template Alloc<T>(&tmp);
-  auto eigen_tmp = EigenVector<T>::Flatten(tmp);
+  dev_ctx.template Alloc<MT>(&tmp);
+  auto eigen_tmp = EigenVector<MT>::Flatten(tmp);

   eigen_tmp.device(place) =
-      eigen_d_out.unaryExpr(LogGradPositiveFunctor<T>()) - eigen_out;
-  LogcumsumexpKernel<T, Context>(
+      eigen_d_out.template cast<MT>().unaryExpr(LogGradPositiveFunctor<MT>()) -
+      eigen_out.template cast<MT>();
+  LogcumsumexpKernel<MT, Context>(
       dev_ctx, tmp, axis, flatten, exclusive, reverse, &output_pos);
-  eigen_output_pos.device(place) = (eigen_output_pos + eigen_x).exp();
+  auto out_pos = eigen_output_pos + eigen_x.template cast<MT>();
+  eigen_output_pos.device(place) = out_pos.exp();

   eigen_tmp.device(place) =
-      eigen_d_out.unaryExpr(LogGradNegativeFunctor<T>()) - eigen_out;
-  LogcumsumexpKernel<T, Context>(
+      eigen_d_out.template cast<MT>().unaryExpr(LogGradNegativeFunctor<MT>()) -
+      eigen_out.template cast<MT>();
+  LogcumsumexpKernel<MT, Context>(
       dev_ctx, tmp, axis, flatten, exclusive, reverse, &output_neg);
-  eigen_output_neg.device(place) = (eigen_output_neg + eigen_x).exp();
+  auto out_neg = eigen_output_neg + eigen_x.template cast<MT>();
+  eigen_output_neg.device(place) = out_neg.exp();

   auto eigen_d_x = EigenVector<T>::Flatten(*d_x);
-  eigen_d_x.device(place) = eigen_output_pos - eigen_output_neg;
+  eigen_d_x.device(place) =
+      (eigen_output_pos - eigen_output_neg).template cast<T>();
 }

 }  // namespace phi
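Note: the positive/negative split above follows the usual logcumsumexp gradient written in log space. For the inclusive, non-reversed case (the reverse and exclusive flags only change the scan direction and offset), with y = logcumsumexp(x) along the scanned axis and g the upstream gradient, the backward pass computes

    \[
      y_j = \log \sum_{i \le j} e^{x_i},
      \qquad
      \frac{\partial L}{\partial x_i}
        = \sum_{j \ge i} g_j \, e^{x_i - y_j}
        = e^{\,x_i + \tilde{y}^{+}_i} - e^{\,x_i + \tilde{y}^{-}_i},
    \]

where \(g^{+} = \max(g, 0)\), \(g^{-} = \max(-g, 0)\), and \(\tilde{y}^{\pm}\) is the logcumsumexp of \(\log g^{\pm} - y\) scanned from the last element toward the first. The decomposition can be checked numerically against a brute-force Jacobian; the sketch below is my own reference code, not Paddle's.

    import numpy as np

    def logcumsumexp(a):
        return np.logaddexp.accumulate(a)

    def rev_logcumsumexp(a):
        return np.logaddexp.accumulate(a[::-1])[::-1]

    np.random.seed(0)
    x = np.random.randn(8)
    g = np.random.randn(8)          # upstream gradient
    y = logcumsumexp(x)

    # brute force: dL/dx_i = sum_{j >= i} g_j * exp(x_i - y_j)
    jac = np.zeros((8, 8))
    for j in range(8):
        jac[j, : j + 1] = np.exp(x[: j + 1] - y[j])
    dx_ref = jac.T @ g

    # positive/negative decomposition, mirroring the kernel above
    with np.errstate(divide='ignore'):
        log_pos = np.log(np.clip(g, 0.0, None))    # -inf where g <= 0
        log_neg = np.log(np.clip(-g, 0.0, None))   # -inf where g >= 0
    dx = (np.exp(rev_logcumsumexp(log_pos - y) + x) -
          np.exp(rev_logcumsumexp(log_neg - y) + x))

    print(np.allclose(dx, dx_ref))  # True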
@@ -199,6 +199,32 @@ class TestSumOp7(OpTest):
         self.check_grad(['X'], 'Out')


+class TestCumsumFP16(unittest.TestCase):
+
+    def check_main(self, x_np, dtype):
+        paddle.disable_static()
+        x = paddle.to_tensor(x_np.astype(dtype))
+        x.stop_gradient = False
+        y = paddle.cumsum(x, dtype=dtype)
+        x_g = paddle.grad(y, [x])
+        y_np = y.numpy().astype('float32')
+        x_g_np = x_g[0].numpy().astype('float32')
+        paddle.enable_static()
+        return y_np, x_g_np
+
+    def test_main(self):
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        np.random.seed(20)
+        x_np = np.random.random([10, 12])
+        y_np_1, x_g_np_1 = self.check_main(x_np, 'float16')
+        y_np_2, x_g_np_2 = self.check_main(x_np, 'float32')
+        np.testing.assert_allclose(y_np_1, y_np_2, rtol=1e-03)
+        np.testing.assert_allclose(x_g_np_1, x_g_np_2, rtol=1e-03)
+
+
 class TestSumOpExclusive1(OpTest):

     def setUp(self):
@@ -289,6 +315,24 @@ class TestSumOpExclusive5(OpTest):
         self.check_output()


+class TestSumOpExclusiveFP16(OpTest):
+
+    def setUp(self):
+        self.op_type = "cumsum"
+        self.attrs = {'axis': 2, "exclusive": True, "dtype": "float16"}
+        a = np.random.random((4, 5, 3096)).astype("float64")
+        self.inputs = {'X': a}
+        self.outputs = {
+            'Out':
+            np.concatenate((np.zeros(
+                (4, 5, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestSumOpReverseExclusive(OpTest):

     def setUp(self):
...
@@ -210,6 +210,8 @@ class BaseTestCases:
             input, attrs = self.input_and_attrs()
             self.inputs = {'X': input}
             self.attrs = attrs
+            if "dtype" in attrs:
+                del attrs["dtype"]
             self.outputs = {'Out': np_logcumsumexp(input, **attrs)}

         def test_check_output(self):
@@ -264,5 +266,36 @@ class TestLogcumsumexpOp4(BaseTestCases.BaseOpTest):
         }


+class TestLogcumsumexpFP16(unittest.TestCase):
+
+    def check_main(self, x_np, dtype, axis=None):
+        paddle.disable_static()
+        x = paddle.to_tensor(x_np.astype(dtype))
+        x.stop_gradient = False
+        y = paddle.logcumsumexp(x, dtype=dtype, axis=axis)
+        x_g = paddle.grad(y, [x])
+        y_np = y.numpy().astype('float32')
+        x_g_np = x_g[0].numpy().astype('float32')
+        paddle.enable_static()
+        return y_np, x_g_np
+
+    def test_main(self):
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        np.random.seed(20)
+        x_np = np.random.random([10, 12])
+
+        y_np_1, x_g_np_1 = self.check_main(x_np, 'float16')
+        y_np_2, x_g_np_2 = self.check_main(x_np, 'float32')
+        np.testing.assert_allclose(y_np_1, y_np_2, rtol=1e-03)
+        np.testing.assert_allclose(x_g_np_1, x_g_np_2, rtol=1e-03)
+
+        y_np_1, x_g_np_1 = self.check_main(x_np, 'float16', axis=1)
+        y_np_2, x_g_np_2 = self.check_main(x_np, 'float32', axis=1)
+        np.testing.assert_allclose(y_np_1, y_np_2, rtol=1e-03)
+        np.testing.assert_allclose(x_g_np_1, x_g_np_2, rtol=2e-03)
+
+
 if __name__ == '__main__':
     unittest.main()
@@ -3173,7 +3173,7 @@ def cumsum(x, axis=None, dtype=None, name=None):
     Args:
         x (Tensor): The input tensor needed to be cumsumed.
         axis (int, optional): The dimension to accumulate along. -1 means the last dimension. The default (None) is to compute the cumsum over the flattened array.
-        dtype (str, optional): The data type of the output tensor, can be float32, float64, int32, int64. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None.
+        dtype (str, optional): The data type of the output tensor, can be float16, float32, float64, int32, int64. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.

     Returns:
@@ -3246,7 +3246,7 @@ def logcumsumexp(x, axis=None, dtype=None, name=None):
     Args:
         x (Tensor): The input tensor.
         axis (int, optional): The dimension to do the operation along. -1 means the last dimension. The default (None) is to compute the cumsum over the flattened array.
-        dtype (str, optional): The data type of the output tensor, can be float32, float64. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None.
+        dtype (str, optional): The data type of the output tensor, can be float16, float32, float64. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.

     Returns:
@@ -3295,7 +3295,8 @@ def logcumsumexp(x, axis=None, dtype=None, name=None):
            return _legacy_C_ops.logcumsumexp(x, 'axis', axis, 'flatten',
                                              flatten)

-    check_variable_and_dtype(x, 'x', ['float32', 'float64'], "logcumsumexp")
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             "logcumsumexp")

    helper = LayerHelper('logcumsumexp', **locals())
    out = helper.create_variable_for_type_inference(x.dtype)
...
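Note: with these changes the dtype argument of both APIs accepts 'float16'. A minimal usage sketch, assuming a CUDA build of Paddle that includes the kernels registered above:

    import numpy as np
    import paddle

    x = paddle.to_tensor(np.random.rand(10, 12), dtype='float16')
    y = paddle.cumsum(x, axis=1, dtype='float16')
    z = paddle.logcumsumexp(x, axis=1, dtype='float16')
    print(y.dtype, z.dtype)  # paddle.float16 paddle.float16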