Unverified · Commit 01eeba5e authored by Siming Dai, committed by GitHub

[AMP OP&Test] Support fp16/bf16 for cumsum (#51694)

* add fp16 unittest

* support bf16 and add unittest

* fix according to review
Parent: 9c238d2b
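
As a quick orientation before the diff: the change lets paddle.cumsum run natively on float16 and bfloat16 GPU tensors. A minimal usage sketch, not part of this commit, assuming a CUDA build of Paddle that exposes the 'float16'/'bfloat16' dtypes and a GPU that supports them:

import paddle

paddle.set_device('gpu')
x = paddle.rand([5, 6, 10], dtype='float32')

# Low-precision inputs now dispatch to the fp16/bf16 cumsum kernels added below.
y_fp16 = paddle.cumsum(x.astype('float16'), axis=2)
y_bf16 = paddle.cumsum(x.astype('bfloat16'), axis=2)
print(y_fp16.dtype, y_bf16.dtype)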
@@ -29,6 +29,7 @@ namespace cub = hipcub;
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -82,5 +83,6 @@ PD_REGISTER_KERNEL(cumsum_grad,
                    int16_t,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 #endif
@@ -28,6 +28,7 @@ namespace cub = hipcub;
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -217,7 +218,8 @@ __global__ void BlockScanKernel(T* d_out,
 }
 
 template <typename Context, typename T>
-typename std::enable_if<!std::is_same<T, phi::dtype::float16>::value>::type
+typename std::enable_if<!std::is_same<T, phi::dtype::float16>::value &&
+                        !std::is_same<T, phi::dtype::bfloat16>::value>::type
 ThrustCumsumKernel(const Context& dev_ctx,
                    const T* in_data,
                    T* out_data,
@@ -261,6 +263,15 @@ ThrustCumsumKernel(const Context& dev_ctx,
                    bool reverse,
                    bool exclusive) {}
 
+template <typename Context, typename T>
+typename std::enable_if<std::is_same<T, phi::dtype::bfloat16>::value>::type
+ThrustCumsumKernel(const Context& dev_ctx,
+                   const phi::dtype::bfloat16* in_data,
+                   phi::dtype::bfloat16* out_data,
+                   int64_t size,
+                   bool reverse,
+                   bool exclusive) {}
+
 template <typename T, typename Context, typename Op>
 void ScanKernel(const Context& dev_ctx,
                 const DenseTensor& x,
@@ -301,6 +312,7 @@ void ScanKernel(const Context& dev_ctx,
   // Use thrust for parallel acceleration when the input size is equal to the
   // length of the ‘axis’ dimension.
   if (!std::is_same<T, phi::dtype::float16>::value &&
+      !std::is_same<T, phi::dtype::bfloat16>::value &&
       std::is_same<Op, cub::Sum>::value && size == out_dims[axis]) {
     ThrustCumsumKernel<Context, T>(
         dev_ctx, in_data, out_data, size, reverse, exclusive);
@@ -440,7 +452,8 @@ PD_REGISTER_KERNEL(cumsum,
                    int16_t,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_KERNEL(logcumsumexp,
                    GPU,
...
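
For context on how the new low-precision paths can be exercised outside the OpTest framework, here is a hedged sketch that compares the fp16/bf16 GPU results against a float32 reference. The tolerances are illustrative only and are not the ones used by the unit tests below; it also assumes the installed build supports casting to 'bfloat16'.

import numpy as np
import paddle

paddle.set_device('gpu')
x32 = paddle.rand([4, 5, 20], dtype='float32')
ref = paddle.cumsum(x32, axis=2).numpy()

for dt in ('float16', 'bfloat16'):
    # Cast down, run the low-precision kernel, cast back up for comparison.
    out = paddle.cumsum(x32.astype(dt), axis=2).astype('float32').numpy()
    np.testing.assert_allclose(out, ref, rtol=1e-2, atol=1e-2)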
@@ -17,7 +17,7 @@ import tempfile
 import unittest
 
 import numpy as np
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 
 import paddle
 import paddle.fluid as fluid
@@ -117,10 +117,15 @@ class TestSumOp1(OpTest):
         self.op_type = "cumsum"
         self.prim_op_type = "prim"
         self.python_api = paddle.cumsum
-        self.enable_cinn = True
-        self.attrs = {'axis': 2}
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=2)}
+        self.set_enable_cinn()
+        self.init_dtype()
+        self.set_attrs_input_output()
+        if self.dtype == np.uint16:
+            self.inputs = {'X': convert_float_to_uint16(self.x)}
+            self.outputs = {'Out': convert_float_to_uint16(self.out)}
+        else:
+            self.inputs = {'X': self.x}
+            self.outputs = {'Out': self.out}
 
     def test_check_output(self):
         self.check_output()
@@ -128,109 +133,56 @@ class TestSumOp1(OpTest):
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', check_prim=True)
 
-
-class TestSumOp2(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.enable_cinn = True
-        self.attrs = {'axis': -1, 'reverse': True}
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.outputs = {
-            'Out': np.flip(
-                np.flip(self.inputs['X'], axis=2).cumsum(axis=2), axis=2
-            )
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
-
-
-class TestSumOp3(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.enable_cinn = True
-        self.attrs = {'axis': 1}
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
-
-
-class TestSumOp4(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.enable_cinn = True
-        self.attrs = {'axis': 0}
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
-
-
-class TestSumOp5(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.enable_cinn = True
-        self.inputs = {'X': np.random.random((5, 20)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
-
-
-class TestSumOp6(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.attrs = {'axis': -1, 'flatten': True}
-        self.inputs = {'X': np.random.random((5, 6, 5)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].cumsum()}
-        self.enable_cinn = False
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
-
-
-class TestSumOp7(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.enable_cinn = True
-        self.inputs = {'X': np.random.random((100)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
+    def init_dtype(self):
+        self.dtype = self.dtype_ = np.float64
+
+    def set_enable_cinn(self):
+        self.enable_cinn = True
+
+    def set_attrs_input_output(self):
+        self.attrs = {'axis': 2}
+        self.x = np.random.random((5, 6, 10)).astype(self.dtype_)
+        self.out = self.x.cumsum(axis=2)
+
+
+class TestSumOp2(TestSumOp1):
+    def set_attrs_input_output(self):
+        self.attrs = {'axis': -1, 'reverse': True}
+        self.x = np.random.random((5, 6, 10)).astype(self.dtype_)
+        self.out = np.flip(np.flip(self.x, axis=2).cumsum(axis=2), axis=2)
+
+
+class TestSumOp3(TestSumOp1):
+    def set_attrs_input_output(self):
+        self.attrs = {'axis': 1}
+        self.x = np.random.random((5, 6, 10)).astype(self.dtype_)
+        self.out = self.x.cumsum(axis=1)
+
+
+class TestSumOp4(TestSumOp1):
+    def set_attrs_input_output(self):
+        self.attrs = {'axis': 0}
+        self.x = np.random.random((5, 6, 10)).astype(self.dtype_)
+        self.out = self.x.cumsum(axis=0)
+
+
+class TestSumOp5(TestSumOp1):
+    def set_attrs_input_output(self):
+        self.x = np.random.random((5, 20)).astype(self.dtype_)
+        self.out = self.x.cumsum(axis=1)
+
+
+class TestSumOp6(TestSumOp1):
+    def set_attrs_input_output(self):
+        self.attrs = {'axis': -1, 'flatten': True}
+        self.x = np.random.random((5, 6, 5)).astype(self.dtype_)
+        self.out = self.x.cumsum()
+
+
+class TestSumOp7(TestSumOp1):
+    def set_attrs_input_output(self):
+        self.x = np.random.random((100)).astype(self.dtype_)
+        self.out = self.x.cumsum(axis=0)
 
 
 class TestCumsumFP16(unittest.TestCase):
@@ -263,19 +215,15 @@ class TestSumOpExclusive1(OpTest):
         self.op_type = "cumsum"
         self.prim_op_type = "prim"
         self.python_api = paddle.cumsum
-        self.enable_cinn = True
-        self.attrs = {'axis': 2, "exclusive": True}
-        a = np.random.random((4, 5, 20)).astype("float64")
-        self.inputs = {'X': a}
-        self.outputs = {
-            'Out': np.concatenate(
-                (
-                    np.zeros((4, 5, 1), dtype=np.float64),
-                    a[:, :, :-1].cumsum(axis=2),
-                ),
-                axis=2,
-            )
-        }
+        self.set_enable_cinn()
+        self.init_dtype()
+        self.set_attrs_input_output()
+        if self.dtype == np.uint16:
+            self.inputs = {'X': convert_float_to_uint16(self.x)}
+            self.outputs = {'Out': convert_float_to_uint16(self.out)}
+        else:
+            self.inputs = {'X': self.x}
+            self.outputs = {'Out': self.out}
 
     def test_check_output(self):
         self.check_output()
@@ -283,103 +231,98 @@ class TestSumOpExclusive1(OpTest):
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', check_prim=True)
 
-
-class TestSumOpExclusive2(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.enable_cinn = True
-        self.attrs = {'axis': 2, "exclusive": True}
-        a = np.random.random((1, 1, 100)).astype("float64")
-        self.inputs = {'X': a}
-        self.outputs = {
-            'Out': np.concatenate(
-                (
-                    np.zeros((1, 1, 1), dtype=np.float64),
-                    a[:, :, :-1].cumsum(axis=2),
-                ),
-                axis=2,
-            )
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
-
-
-class TestSumOpExclusive3(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.enable_cinn = True
-        self.attrs = {'axis': 2, "exclusive": True}
-        a = np.random.random((4, 5, 20)).astype("float64")
-        self.inputs = {'X': a}
-        self.outputs = {
-            'Out': np.concatenate(
-                (
-                    np.zeros((4, 5, 1), dtype=np.float64),
-                    a[:, :, :-1].cumsum(axis=2),
-                ),
-                axis=2,
-            )
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
-
-
-class TestSumOpExclusive4(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.enable_cinn = True
-        self.attrs = {'axis': 2, "exclusive": True}
-        a = np.random.random((1, 1, 100)).astype("float64")
-        self.inputs = {'X': a}
-        self.outputs = {
-            'Out': np.concatenate(
-                (
-                    np.zeros((1, 1, 1), dtype=np.float64),
-                    a[:, :, :-1].cumsum(axis=2),
-                ),
-                axis=2,
-            )
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
-
-
-class TestSumOpExclusive5(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.enable_cinn = True
-        self.attrs = {'axis': 2, "exclusive": True}
-        a = np.random.random((4, 5, 40)).astype("float64")
-        self.inputs = {'X': a}
-        self.outputs = {
-            'Out': np.concatenate(
-                (
-                    np.zeros((4, 5, 1), dtype=np.float64),
-                    a[:, :, :-1].cumsum(axis=2),
-                ),
-                axis=2,
-            )
-        }
+    def init_dtype(self):
+        self.dtype = self.dtype_ = np.float64
+
+    def set_enable_cinn(self):
+        self.enable_cinn = True
+
+    def set_attrs_input_output(self):
+        self.attrs = {'axis': 2, 'exclusive': True}
+        self.x = np.random.random((4, 5, 20)).astype(self.dtype_)
+        self.out = np.concatenate(
+            (
+                np.zeros((4, 5, 1), dtype=self.dtype_),
+                self.x[:, :, :-1].cumsum(axis=2),
+            ),
+            axis=2,
+        )
+
+
+class TestSumOpExclusive2(TestSumOpExclusive1):
+    def set_attrs_input_output(self):
+        self.attrs = {'axis': 2, 'exclusive': True}
+        self.x = np.random.random((1, 1, 100)).astype(self.dtype_)
+        self.out = np.concatenate(
+            (
+                np.zeros((1, 1, 1), dtype=self.dtype_),
+                self.x[:, :, :-1].cumsum(axis=2),
+            ),
+            axis=2,
+        )
+
+
+class TestSumOpExclusive3(TestSumOpExclusive1):
+    def set_attrs_input_output(self):
+        self.attrs = {'axis': 2, 'exclusive': True}
+        self.x = np.random.random((4, 5, 20)).astype(self.dtype_)
+        self.out = np.concatenate(
+            (
+                np.zeros((4, 5, 1), dtype=self.dtype_),
+                self.x[:, :, :-1].cumsum(axis=2),
+            ),
+            axis=2,
+        )
+
+
+class TestSumOpExclusive4(TestSumOpExclusive1):
+    def set_attrs_input_output(self):
+        self.attrs = {'axis': 2, 'exclusive': True}
+        self.x = np.random.random((1, 1, 100)).astype(self.dtype_)
+        self.out = np.concatenate(
+            (
+                np.zeros((1, 1, 1), dtype=self.dtype_),
+                self.x[:, :, :-1].cumsum(axis=2),
+            ),
+            axis=2,
+        )
+
+
+class TestSumOpExclusive5(TestSumOpExclusive1):
+    def set_attrs_input_output(self):
+        self.attrs = {'axis': 2, 'exclusive': True}
+        self.x = np.random.random((4, 5, 40)).astype(self.dtype_)
+        self.out = np.concatenate(
+            (
+                np.zeros((4, 5, 1), dtype=self.dtype_),
+                self.x[:, :, :-1].cumsum(axis=2),
+            ),
+            axis=2,
+        )
+
+
+class TestSumOpExclusiveFP16(OpTest):
+    def setUp(self):
+        self.op_type = "cumsum"
+        self.prim_op_type = "prim"
+        self.python_api = paddle.cumsum
+        self.init_dtype()
+        self.enable_cinn = False
+        self.attrs = {'axis': 2, "exclusive": True}
+        self.x = np.random.random((4, 5, 20)).astype(self.dtype)
+        self.out = np.concatenate(
+            (
+                np.zeros((4, 5, 1), dtype=self.dtype),
+                self.x[:, :, :-1].cumsum(axis=2),
+            ),
+            axis=2,
+        )
+        if self.dtype == np.uint16:
+            self.inputs = {'X': convert_float_to_uint16(self.x)}
+            self.outputs = {'Out': convert_float_to_uint16(self.out)}
+        else:
+            self.inputs = {'X': self.x}
+            self.outputs = {'Out': self.out}
 
     def test_check_output(self):
         self.check_output()
@@ -387,25 +330,37 @@ class TestSumOpExclusive5(OpTest):
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', check_prim=True)
 
-
-class TestSumOpExclusiveFP16(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.enable_cinn = False
-        self.attrs = {'axis': 2, "exclusive": True}
-        a = np.random.random((4, 5, 20)).astype("float16")
-        self.inputs = {'X': a}
-        self.outputs = {
-            'Out': np.concatenate(
-                (
-                    np.zeros((4, 5, 1), dtype=np.float16),
-                    a[:, :, :-1].cumsum(axis=2),
-                ),
-                axis=2,
-            )
-        }
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+class TestSumOpReverseExclusive(OpTest):
+    def setUp(self):
+        self.op_type = "cumsum"
+        self.prim_op_type = "prim"
+        self.python_api = paddle.cumsum
+        self.set_enable_cinn()
+        self.init_dtype()
+        self.attrs = {
+            'axis': 2,
+            'reverse': True,
+            'exclusive': True,
+        }
+        self.x = np.random.random((4, 5, 6)).astype(self.dtype_)
+        a = np.flip(self.x, axis=2)
+        self.out = np.concatenate(
+            (
+                np.flip(a[:, :, :-1].cumsum(axis=2), axis=2),
+                np.zeros((4, 5, 1), dtype=self.dtype_),
+            ),
+            axis=2,
+        )
+        if self.dtype == np.uint16:
+            self.inputs = {'X': convert_float_to_uint16(self.x)}
+            self.outputs = {'Out': convert_float_to_uint16(self.out)}
+        else:
+            self.inputs = {'X': self.x}
+            self.outputs = {'Out': self.out}
 
     def test_check_output(self):
         self.check_output()
@@ -413,32 +368,89 @@ class TestSumOpExclusiveFP16(OpTest):
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', check_prim=True)
 
-
-class TestSumOpReverseExclusive(OpTest):
-    def setUp(self):
-        self.op_type = "cumsum"
-        self.prim_op_type = "prim"
-        self.python_api = paddle.cumsum
-        self.enable_cinn = True
-        self.attrs = {'axis': 2, 'reverse': True, "exclusive": True}
-        a = np.random.random((4, 5, 6)).astype("float64")
-        self.inputs = {'X': a}
-        a = np.flip(a, axis=2)
-        self.outputs = {
-            'Out': np.concatenate(
-                (
-                    np.flip(a[:, :, :-1].cumsum(axis=2), axis=2),
-                    np.zeros((4, 5, 1), dtype=np.float64),
-                ),
-                axis=2,
-            )
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
+    def init_dtype(self):
+        self.dtype = self.dtype_ = np.float64
+
+    def set_enable_cinn(self):
+        self.enable_cinn = True
+
+
+def create_test_fp16_class(parent, max_relative_error=1e-2):
+    class TestCumsumFP16Op(parent):
+        def init_dtype(self):
+            self.dtype = self.dtype_ = np.float16
+
+        def set_enable_cinn(self):
+            self.enable_cinn = False
+
+        def test_check_output(self):
+            self.check_output()
+
+        def test_check_grad(self):
+            self.check_grad(
+                ['X'],
+                'Out',
+                check_prim=True,
+            )
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
+    TestCumsumFP16Op.__name__ = cls_name
+    globals()[cls_name] = TestCumsumFP16Op
+
+
+create_test_fp16_class(TestSumOp1)
+create_test_fp16_class(TestSumOp2)
+create_test_fp16_class(TestSumOp3)
+create_test_fp16_class(TestSumOp4)
+create_test_fp16_class(TestSumOp5)
+create_test_fp16_class(TestSumOp6)
+create_test_fp16_class(TestSumOpExclusive1)
+create_test_fp16_class(TestSumOpExclusive2)
+create_test_fp16_class(TestSumOpExclusive3)
+create_test_fp16_class(TestSumOpExclusive4)
+create_test_fp16_class(TestSumOpExclusive5)
+create_test_fp16_class(TestSumOpReverseExclusive)
+
+
+def create_test_bf16_class(parent):
+    @unittest.skipIf(
+        not core.is_compiled_with_cuda()
+        or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+        "core is not compiled with CUDA or not support bfloat16",
+    )
+    class TestCumsumBF16Op(parent):
+        def init_dtype(self):
+            self.dtype = np.uint16
+            self.dtype_ = np.float32
+
+        def set_enable_cinn(self):
+            self.enable_cinn = False
+
+        def test_check_output(self):
+            place = paddle.CUDAPlace(0)
+            self.check_output_with_place(place, check_prim=True)
+
+        def test_check_grad(self):
+            place = paddle.CUDAPlace(0)
+            self.check_grad_with_place(place, ["X"], "Out", check_prim=True)
+
+    cls_name = "{0}_{1}".format(parent.__name__, "BF16")
+    TestCumsumBF16Op.__name__ = cls_name
+    globals()[cls_name] = TestCumsumBF16Op
+
+
+create_test_bf16_class(TestSumOp1)
+create_test_bf16_class(TestSumOp2)
+create_test_bf16_class(TestSumOp3)
+create_test_bf16_class(TestSumOp4)
+create_test_bf16_class(TestSumOp5)
+create_test_bf16_class(TestSumOp6)
+create_test_bf16_class(TestSumOpExclusive1)
+create_test_bf16_class(TestSumOpExclusive2)
+create_test_bf16_class(TestSumOpExclusive3)
+create_test_bf16_class(TestSumOpExclusive4)
+create_test_bf16_class(TestSumOpExclusive5)
+create_test_bf16_class(TestSumOpReverseExclusive)
+
 
 class BadInputTest(unittest.TestCase):
...
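
A note on the np.uint16 branch used throughout the refactored tests: for bfloat16 cases the inputs and expected outputs are stored as the raw 16-bit patterns of the float values. A hedged numpy sketch of what a helper like convert_float_to_uint16 does conceptually (the real helper lives in op_test and may round rather than truncate):

import numpy as np

def float_to_bf16_bits(x):
    # Keep the upper 16 bits of each float32 value; bfloat16 shares float32's
    # exponent, so dropping the low mantissa bits yields its bit pattern.
    x = np.ascontiguousarray(np.asarray(x, dtype=np.float32))
    return (x.view(np.uint32) >> 16).astype(np.uint16)

a = np.random.random((4, 5, 20)).astype(np.float32)
bits = float_to_bf16_bits(a)  # uint16 array, same shape as a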