Unverified · Commit 01eeba5e authored by Siming Dai, committed by GitHub

[AMP OP&Test] Support fp16/bf16 for cumsum (#51694)

* add fp16 unittest

* support bf16 and add unittest

* fix according to review
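
For reference, a minimal usage sketch of what this change enables (a hedged illustration, not part of the commit; it assumes a CUDA build of Paddle in which bfloat16 casting is available, and all variable names are made up):

import paddle

# With this commit, the GPU cumsum forward/backward kernels are also
# registered for float16 and bfloat16, so the calls below dispatch to
# the new dtypes instead of failing with a kernel-not-found error.
x = paddle.rand([5, 6, 10])                            # float32 input
y_fp16 = paddle.cumsum(x.astype('float16'), axis=2)    # fp16 kernel
y_bf16 = paddle.cumsum(x.astype('bfloat16'), axis=2)   # bf16 kernel
print(y_fp16.dtype, y_bf16.dtype)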
Parent 9c238d2b
@@ -29,6 +29,7 @@ namespace cub = hipcub;
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/core/kernel_registry.h"
@@ -82,5 +83,6 @@ PD_REGISTER_KERNEL(cumsum_grad,
int16_t,
int,
int64_t,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
#endif
@@ -28,6 +28,7 @@ namespace cub = hipcub;
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/core/kernel_registry.h"
@@ -217,7 +218,8 @@ __global__ void BlockScanKernel(T* d_out,
}
template <typename Context, typename T>
typename std::enable_if<!std::is_same<T, phi::dtype::float16>::value>::type
typename std::enable_if<!std::is_same<T, phi::dtype::float16>::value &&
!std::is_same<T, phi::dtype::bfloat16>::value>::type
ThrustCumsumKernel(const Context& dev_ctx,
const T* in_data,
T* out_data,
@@ -261,6 +263,15 @@ ThrustCumsumKernel(const Context& dev_ctx,
bool reverse,
bool exclusive) {}
template <typename Context, typename T>
typename std::enable_if<std::is_same<T, phi::dtype::bfloat16>::value>::type
ThrustCumsumKernel(const Context& dev_ctx,
const phi::dtype::bfloat16* in_data,
phi::dtype::bfloat16* out_data,
int64_t size,
bool reverse,
bool exclusive) {}
template <typename T, typename Context, typename Op>
void ScanKernel(const Context& dev_ctx,
const DenseTensor& x,
@@ -301,6 +312,7 @@ void ScanKernel(const Context& dev_ctx,
// Use thrust for parallel acceleration when the input size is equal to the
// length of the ‘axis’ dimension.
if (!std::is_same<T, phi::dtype::float16>::value &&
!std::is_same<T, phi::dtype::bfloat16>::value &&
std::is_same<Op, cub::Sum>::value && size == out_dims[axis]) {
ThrustCumsumKernel<Context, T>(
dev_ctx, in_data, out_data, size, reverse, exclusive);
@@ -440,7 +452,8 @@ PD_REGISTER_KERNEL(cumsum,
int16_t,
int,
int64_t,
phi::dtype::float16) {}
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(logcumsumexp,
GPU,
......
@@ -17,7 +17,7 @@ import tempfile
import unittest
import numpy as np
from op_test import OpTest
from op_test import OpTest, convert_float_to_uint16
import paddle
import paddle.fluid as fluid
@@ -117,10 +117,15 @@ class TestSumOp1(OpTest):
self.op_type = "cumsum"
self.prim_op_type = "prim"
self.python_api = paddle.cumsum
self.enable_cinn = True
self.attrs = {'axis': 2}
self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
self.outputs = {'Out': self.inputs['X'].cumsum(axis=2)}
self.set_enable_cinn()
self.init_dtype()
self.set_attrs_input_output()
if self.dtype == np.uint16:
self.inputs = {'X': convert_float_to_uint16(self.x)}
self.outputs = {'Out': convert_float_to_uint16(self.out)}
else:
self.inputs = {'X': self.x}
self.outputs = {'Out': self.out}
def test_check_output(self):
self.check_output()
@@ -128,109 +133,56 @@ class TestSumOp1(OpTest):
def test_check_grad(self):
self.check_grad(['X'], 'Out', check_prim=True)
def init_dtype(self):
self.dtype = self.dtype_ = np.float64
class TestSumOp2(OpTest):
def setUp(self):
self.op_type = "cumsum"
self.prim_op_type = "prim"
self.python_api = paddle.cumsum
def set_enable_cinn(self):
self.enable_cinn = True
self.attrs = {'axis': -1, 'reverse': True}
self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
self.outputs = {
'Out': np.flip(
np.flip(self.inputs['X'], axis=2).cumsum(axis=2), axis=2
)
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', check_prim=True)
def set_attrs_input_output(self):
self.attrs = {'axis': 2}
self.x = np.random.random((5, 6, 10)).astype(self.dtype_)
self.out = self.x.cumsum(axis=2)
class TestSumOp3(OpTest):
def setUp(self):
self.op_type = "cumsum"
self.prim_op_type = "prim"
self.python_api = paddle.cumsum
self.enable_cinn = True
self.attrs = {'axis': 1}
self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)}
class TestSumOp2(TestSumOp1):
def set_attrs_input_output(self):
self.attrs = {'axis': -1, 'reverse': True}
self.x = np.random.random((5, 6, 10)).astype(self.dtype_)
self.out = np.flip(np.flip(self.x, axis=2).cumsum(axis=2), axis=2)
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', check_prim=True)
class TestSumOp3(TestSumOp1):
def set_attrs_input_output(self):
self.attrs = {'axis': 1}
self.x = np.random.random((5, 6, 10)).astype(self.dtype_)
self.out = self.x.cumsum(axis=1)
class TestSumOp4(OpTest):
def setUp(self):
self.op_type = "cumsum"
self.prim_op_type = "prim"
self.python_api = paddle.cumsum
self.enable_cinn = True
class TestSumOp4(TestSumOp1):
def set_attrs_input_output(self):
self.attrs = {'axis': 0}
self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)}
self.x = np.random.random((5, 6, 10)).astype(self.dtype_)
self.out = self.x.cumsum(axis=0)
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', check_prim=True)
class TestSumOp5(TestSumOp1):
def set_attrs_input_output(self):
self.x = np.random.random((5, 20)).astype(self.dtype_)
self.out = self.x.cumsum(axis=1)
class TestSumOp5(OpTest):
def setUp(self):
self.op_type = "cumsum"
self.prim_op_type = "prim"
self.python_api = paddle.cumsum
self.enable_cinn = True
self.inputs = {'X': np.random.random((5, 20)).astype("float64")}
self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', check_prim=True)
class TestSumOp6(OpTest):
def setUp(self):
self.op_type = "cumsum"
self.prim_op_type = "prim"
self.python_api = paddle.cumsum
class TestSumOp6(TestSumOp1):
def set_attrs_input_output(self):
self.attrs = {'axis': -1, 'flatten': True}
self.inputs = {'X': np.random.random((5, 6, 5)).astype("float64")}
self.outputs = {'Out': self.inputs['X'].cumsum()}
self.enable_cinn = False
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', check_prim=True)
class TestSumOp7(OpTest):
def setUp(self):
self.op_type = "cumsum"
self.prim_op_type = "prim"
self.python_api = paddle.cumsum
self.enable_cinn = True
self.inputs = {'X': np.random.random((100)).astype("float64")}
self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)}
self.x = np.random.random((5, 6, 5)).astype(self.dtype_)
self.out = self.x.cumsum()
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', check_prim=True)
class TestSumOp7(TestSumOp1):
def set_attrs_input_output(self):
self.x = np.random.random((100)).astype(self.dtype_)
self.out = self.x.cumsum(axis=0)
class TestCumsumFP16(unittest.TestCase):
@@ -263,19 +215,15 @@ class TestSumOpExclusive1(OpTest):
self.op_type = "cumsum"
self.prim_op_type = "prim"
self.python_api = paddle.cumsum
self.enable_cinn = True
self.attrs = {'axis': 2, "exclusive": True}
a = np.random.random((4, 5, 20)).astype("float64")
self.inputs = {'X': a}
self.outputs = {
'Out': np.concatenate(
(
np.zeros((4, 5, 1), dtype=np.float64),
a[:, :, :-1].cumsum(axis=2),
),
axis=2,
)
}
self.set_enable_cinn()
self.init_dtype()
self.set_attrs_input_output()
if self.dtype == np.uint16:
self.inputs = {'X': convert_float_to_uint16(self.x)}
self.outputs = {'Out': convert_float_to_uint16(self.out)}
else:
self.inputs = {'X': self.x}
self.outputs = {'Out': self.out}
def test_check_output(self):
self.check_output()
@@ -283,103 +231,98 @@ class TestSumOpExclusive1(OpTest):
def test_check_grad(self):
self.check_grad(['X'], 'Out', check_prim=True)
def init_dtype(self):
self.dtype = self.dtype_ = np.float64
class TestSumOpExclusive2(OpTest):
def setUp(self):
self.op_type = "cumsum"
self.prim_op_type = "prim"
self.python_api = paddle.cumsum
def set_enable_cinn(self):
self.enable_cinn = True
self.attrs = {'axis': 2, "exclusive": True}
a = np.random.random((1, 1, 100)).astype("float64")
self.inputs = {'X': a}
self.outputs = {
'Out': np.concatenate(
(
np.zeros((1, 1, 1), dtype=np.float64),
a[:, :, :-1].cumsum(axis=2),
),
axis=2,
)
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', check_prim=True)
def set_attrs_input_output(self):
self.attrs = {'axis': 2, 'exclusive': True}
self.x = np.random.random((4, 5, 20)).astype(self.dtype_)
self.out = np.concatenate(
(
np.zeros((4, 5, 1), dtype=self.dtype_),
self.x[:, :, :-1].cumsum(axis=2),
),
axis=2,
)
class TestSumOpExclusive3(OpTest):
def setUp(self):
self.op_type = "cumsum"
self.prim_op_type = "prim"
self.python_api = paddle.cumsum
self.enable_cinn = True
self.attrs = {'axis': 2, "exclusive": True}
a = np.random.random((4, 5, 20)).astype("float64")
self.inputs = {'X': a}
self.outputs = {
'Out': np.concatenate(
(
np.zeros((4, 5, 1), dtype=np.float64),
a[:, :, :-1].cumsum(axis=2),
),
axis=2,
)
}
class TestSumOpExclusive2(TestSumOpExclusive1):
def set_attrs_input_output(self):
self.attrs = {'axis': 2, 'exclusive': True}
self.x = np.random.random((1, 1, 100)).astype(self.dtype_)
self.out = np.concatenate(
(
np.zeros((1, 1, 1), dtype=self.dtype_),
self.x[:, :, :-1].cumsum(axis=2),
),
axis=2,
)
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', check_prim=True)
class TestSumOpExclusive3(TestSumOpExclusive1):
def set_attrs_input_output(self):
self.attrs = {'axis': 2, 'exclusive': True}
self.x = np.random.random((4, 5, 20)).astype(self.dtype_)
self.out = np.concatenate(
(
np.zeros((4, 5, 1), dtype=self.dtype_),
self.x[:, :, :-1].cumsum(axis=2),
),
axis=2,
)
class TestSumOpExclusive4(OpTest):
def setUp(self):
self.op_type = "cumsum"
self.prim_op_type = "prim"
self.python_api = paddle.cumsum
self.enable_cinn = True
self.attrs = {'axis': 2, "exclusive": True}
a = np.random.random((1, 1, 100)).astype("float64")
self.inputs = {'X': a}
self.outputs = {
'Out': np.concatenate(
(
np.zeros((1, 1, 1), dtype=np.float64),
a[:, :, :-1].cumsum(axis=2),
),
axis=2,
)
}
class TestSumOpExclusive4(TestSumOpExclusive1):
def set_attrs_input_output(self):
self.attrs = {'axis': 2, 'exclusive': True}
self.x = np.random.random((1, 1, 100)).astype(self.dtype_)
self.out = np.concatenate(
(
np.zeros((1, 1, 1), dtype=self.dtype_),
self.x[:, :, :-1].cumsum(axis=2),
),
axis=2,
)
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', check_prim=True)
class TestSumOpExclusive5(TestSumOpExclusive1):
def set_attrs_input_output(self):
self.attrs = {'axis': 2, 'exclusive': True}
self.x = np.random.random((4, 5, 40)).astype(self.dtype_)
self.out = np.concatenate(
(
np.zeros((4, 5, 1), dtype=self.dtype_),
self.x[:, :, :-1].cumsum(axis=2),
),
axis=2,
)
class TestSumOpExclusive5(OpTest):
class TestSumOpExclusiveFP16(OpTest):
def setUp(self):
self.op_type = "cumsum"
self.prim_op_type = "prim"
self.python_api = paddle.cumsum
self.enable_cinn = True
self.init_dtype()
self.enable_cinn = False
self.attrs = {'axis': 2, "exclusive": True}
a = np.random.random((4, 5, 40)).astype("float64")
self.inputs = {'X': a}
self.outputs = {
'Out': np.concatenate(
(
np.zeros((4, 5, 1), dtype=np.float64),
a[:, :, :-1].cumsum(axis=2),
),
axis=2,
)
}
self.x = np.random.random((4, 5, 20)).astype(self.dtype)
self.out = np.concatenate(
(
np.zeros((4, 5, 1), dtype=self.dtype),
self.x[:, :, :-1].cumsum(axis=2),
),
axis=2,
)
if self.dtype == np.uint16:
self.inputs = {'X': convert_float_to_uint16(self.x)}
self.outputs = {'Out': convert_float_to_uint16(self.out)}
else:
self.inputs = {'X': self.x}
self.outputs = {'Out': self.out}
def test_check_output(self):
self.check_output()
@@ -387,25 +330,37 @@ class TestSumOpExclusive5(OpTest):
def test_check_grad(self):
self.check_grad(['X'], 'Out', check_prim=True)
def init_dtype(self):
self.dtype = np.float16
class TestSumOpExclusiveFP16(OpTest):
class TestSumOpReverseExclusive(OpTest):
def setUp(self):
self.op_type = "cumsum"
self.prim_op_type = "prim"
self.python_api = paddle.cumsum
self.enable_cinn = False
self.attrs = {'axis': 2, "exclusive": True}
a = np.random.random((4, 5, 20)).astype("float16")
self.inputs = {'X': a}
self.outputs = {
'Out': np.concatenate(
(
np.zeros((4, 5, 1), dtype=np.float16),
a[:, :, :-1].cumsum(axis=2),
),
axis=2,
)
self.set_enable_cinn()
self.init_dtype()
self.attrs = {
'axis': 2,
'reverse': True,
'exclusive': True,
}
self.x = np.random.random((4, 5, 6)).astype(self.dtype_)
a = np.flip(self.x, axis=2)
self.out = np.concatenate(
(
np.flip(a[:, :, :-1].cumsum(axis=2), axis=2),
np.zeros((4, 5, 1), dtype=self.dtype_),
),
axis=2,
)
if self.dtype == np.uint16:
self.inputs = {'X': convert_float_to_uint16(self.x)}
self.outputs = {'Out': convert_float_to_uint16(self.out)}
else:
self.inputs = {'X': self.x}
self.outputs = {'Out': self.out}
def test_check_output(self):
self.check_output()
@@ -413,32 +368,89 @@ class TestSumOpExclusiveFP16(OpTest):
def test_check_grad(self):
self.check_grad(['X'], 'Out', check_prim=True)
def init_dtype(self):
self.dtype = self.dtype_ = np.float64
class TestSumOpReverseExclusive(OpTest):
def setUp(self):
self.op_type = "cumsum"
self.prim_op_type = "prim"
self.python_api = paddle.cumsum
def set_enable_cinn(self):
self.enable_cinn = True
self.attrs = {'axis': 2, 'reverse': True, "exclusive": True}
a = np.random.random((4, 5, 6)).astype("float64")
self.inputs = {'X': a}
a = np.flip(a, axis=2)
self.outputs = {
'Out': np.concatenate(
(
np.flip(a[:, :, :-1].cumsum(axis=2), axis=2),
np.zeros((4, 5, 1), dtype=np.float64),
),
axis=2,
)
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', check_prim=True)
def create_test_fp16_class(parent, max_relative_error=1e-2):
class TestCumsumFP16Op(parent):
def init_dtype(self):
self.dtype = self.dtype_ = np.float16
def set_enable_cinn(self):
self.enable_cinn = False
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(
['X'],
'Out',
check_prim=True,
)
cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
TestCumsumFP16Op.__name__ = cls_name
globals()[cls_name] = TestCumsumFP16Op
create_test_fp16_class(TestSumOp1)
create_test_fp16_class(TestSumOp2)
create_test_fp16_class(TestSumOp3)
create_test_fp16_class(TestSumOp4)
create_test_fp16_class(TestSumOp5)
create_test_fp16_class(TestSumOp6)
create_test_fp16_class(TestSumOpExclusive1)
create_test_fp16_class(TestSumOpExclusive2)
create_test_fp16_class(TestSumOpExclusive3)
create_test_fp16_class(TestSumOpExclusive4)
create_test_fp16_class(TestSumOpExclusive5)
create_test_fp16_class(TestSumOpReverseExclusive)
def create_test_bf16_class(parent):
@unittest.skipIf(
not core.is_compiled_with_cuda()
or not core.is_bfloat16_supported(core.CUDAPlace(0)),
"core is not compiled with CUDA or not support bfloat16",
)
class TestCumsumBF16Op(parent):
def init_dtype(self):
self.dtype = np.uint16
self.dtype_ = np.float32
def set_enable_cinn(self):
self.enable_cinn = False
def test_check_output(self):
place = paddle.CUDAPlace(0)
self.check_output_with_place(place, check_prim=True)
def test_check_grad(self):
place = paddle.CUDAPlace(0)
self.check_grad_with_place(place, ["X"], "Out", check_prim=True)
cls_name = "{0}_{1}".format(parent.__name__, "BF16")
TestCumsumBF16Op.__name__ = cls_name
globals()[cls_name] = TestCumsumBF16Op
create_test_bf16_class(TestSumOp1)
create_test_bf16_class(TestSumOp2)
create_test_bf16_class(TestSumOp3)
create_test_bf16_class(TestSumOp4)
create_test_bf16_class(TestSumOp5)
create_test_bf16_class(TestSumOp6)
create_test_bf16_class(TestSumOpExclusive1)
create_test_bf16_class(TestSumOpExclusive2)
create_test_bf16_class(TestSumOpExclusive3)
create_test_bf16_class(TestSumOpExclusive4)
create_test_bf16_class(TestSumOpExclusive5)
create_test_bf16_class(TestSumOpReverseExclusive)
class BadInputTest(unittest.TestCase):
......