diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h
index 252ed90e44114bc652aa190dd7aa050c983fa14a..a77527c081650386c3b59dd7bee4ce36113ac58f 100644
--- a/paddle/phi/backends/gpu/gpu_primitives.h
+++ b/paddle/phi/backends/gpu/gpu_primitives.h
@@ -445,6 +445,57 @@ CUDA_ATOMIC_WRAPPER(Max, phi::dtype::float16) {
 }
 #endif
 
+inline static __device__ uint32_t bf16_max_to_low_half(uint32_t val, float x) {
+  phi::dtype::bfloat16 low_half;
+  // The bfloat16 in the lower 16 bits
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+  low_half =
+      static_cast<phi::dtype::bfloat16>(max(static_cast<float>(low_half), x));
+  return (val & 0xFFFF0000u) | low_half.x;
+}
+
+inline static __device__ uint32_t bf16_max_to_high_half(uint32_t val, float x) {
+  phi::dtype::bfloat16 high_half;
+  // The bfloat16 in the higher 16 bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half =
+      static_cast<phi::dtype::bfloat16>(max(static_cast<float>(high_half), x));
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+}
+
+CUDA_ATOMIC_WRAPPER(Max, phi::dtype::bfloat16) {
+  if (*address >= val) {
+    return *address;
+  }
+  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+      reinterpret_cast<char *>(address) -
+      (reinterpret_cast<uintptr_t>(address) & 0x02));
+  float val_f = static_cast<float>(val);
+  uint32_t old = *address_as_ui;
+  uint32_t assumed;
+  if (((uintptr_t)address & 0x02) == 0) {
+    // The bfloat16 value stays at the lower 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(
+          address_as_ui, assumed, bf16_max_to_low_half(assumed, val_f));
+    } while (old != assumed);
+    phi::dtype::bfloat16 ret;
+    ret.x = old & 0xFFFFu;
+    return ret;
+  } else {
+    // The bfloat16 value stays at the higher 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(
+          address_as_ui, assumed, bf16_max_to_high_half(assumed, val_f));
+    } while (old != assumed);
+    phi::dtype::bfloat16 ret;
+    ret.x = old >> 16;
+    return ret;
+  }
+}
+
 // For atomicMin
 USE_CUDA_ATOMIC(Min, int);
 USE_CUDA_ATOMIC(Min, unsigned int);
@@ -580,6 +631,57 @@ CUDA_ATOMIC_WRAPPER(Min, phi::dtype::float16) {
 }
 #endif
 
+inline static __device__ uint32_t bf16_min_to_low_half(uint32_t val, float x) {
+  phi::dtype::bfloat16 low_half;
+  // The bfloat16 in the lower 16 bits
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+  low_half =
+      static_cast<phi::dtype::bfloat16>(min(static_cast<float>(low_half), x));
+  return (val & 0xFFFF0000u) | low_half.x;
+}
+
+inline static __device__ uint32_t bf16_min_to_high_half(uint32_t val, float x) {
+  phi::dtype::bfloat16 high_half;
+  // The bfloat16 in the higher 16 bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half =
+      static_cast<phi::dtype::bfloat16>(min(static_cast<float>(high_half), x));
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+}
+
+CUDA_ATOMIC_WRAPPER(Min, phi::dtype::bfloat16) {
+  if (*address <= val) {
+    return *address;
+  }
+  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+      reinterpret_cast<char *>(address) -
+      (reinterpret_cast<uintptr_t>(address) & 0x02));
+  float val_f = static_cast<float>(val);
+  uint32_t old = *address_as_ui;
+  uint32_t assumed;
+  if (((uintptr_t)address & 0x02) == 0) {
+    // The bfloat16 value stays at the lower 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(
+          address_as_ui, assumed, bf16_min_to_low_half(assumed, val_f));
+    } while (old != assumed);
+    phi::dtype::bfloat16 ret;
+    ret.x = old & 0xFFFFu;
+    return ret;
+  } else {
+    // The bfloat16 value stays at the higher 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(
+          address_as_ui, assumed, bf16_min_to_high_half(assumed, val_f));
+    } while (old != assumed);
+    phi::dtype::bfloat16 ret;
+    ret.x = old >> 16;
+    return ret;
+  }
+}
+
 #ifdef PADDLE_WITH_CUDA
 /*
  * One thead block deals with elementwise atomicAdd for vector of len.
diff --git a/paddle/phi/kernels/funcs/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu
index 2624b5850e1b2a94bf9e7b6870d07b02d68e7da8..0b6df55bdeff19096808fd5468b2431c2eb73250 100644
--- a/paddle/phi/kernels/funcs/segment_pooling.cu
+++ b/paddle/phi/kernels/funcs/segment_pooling.cu
@@ -451,6 +451,8 @@ template class SegmentPoolFunctor<GPUContext, int64_t, int>;
 template class SegmentPoolFunctor<GPUContext, int64_t, int64_t>;
 template class SegmentPoolFunctor<GPUContext, dtype::float16, int>;
 template class SegmentPoolFunctor<GPUContext, dtype::float16, int64_t>;
+template class SegmentPoolFunctor<GPUContext, dtype::bfloat16, int>;
+template class SegmentPoolFunctor<GPUContext, dtype::bfloat16, int64_t>;
 
 template class SegmentPoolGradFunctor<GPUContext, float, int>;
 template class SegmentPoolGradFunctor<GPUContext, float, int64_t>;
@@ -462,6 +464,8 @@ template class SegmentPoolGradFunctor<GPUContext, int64_t, int>;
 template class SegmentPoolGradFunctor<GPUContext, int64_t, int64_t>;
 template class SegmentPoolGradFunctor<GPUContext, dtype::float16, int>;
 template class SegmentPoolGradFunctor<GPUContext, dtype::float16, int64_t>;
+template class SegmentPoolGradFunctor<GPUContext, dtype::bfloat16, int>;
+template class SegmentPoolGradFunctor<GPUContext, dtype::bfloat16, int64_t>;
 
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
index 86800ed840528d438f8cd8c87b91a8d2275a8634..0b73580d5c94b6f57575fb2fc2a11d2a53dfb39c 100644
--- a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
@@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(segment_pool_grad,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/segment_pool_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_kernel.cu
index 9da2ef519a42bc2e54b889116dfb9b8048b6caf4..526c46e32496ce8ba432c0e7cb177f1c654b5c61 100644
--- a/paddle/phi/kernels/gpu/segment_pool_kernel.cu
+++ b/paddle/phi/kernels/gpu/segment_pool_kernel.cu
@@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(segment_pool,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/python/paddle/fluid/tests/unittests/test_segment_ops.py b/python/paddle/fluid/tests/unittests/test_segment_ops.py
index 763b814f4b1434030787f4e896724baf72da1ca8..ab71a515c4a33c2ad1c6017495d8ebb773b66552 100644
--- a/python/paddle/fluid/tests/unittests/test_segment_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_segment_ops.py
@@ -15,7 +15,7 @@
 import unittest
 
 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16
 
 import paddle
 from paddle.fluid import core
@@ -84,7 +84,10 @@ def segment_pool_split(X, SegmentIds, pooltype):
 
 
 class TestSegmentOps(OpTest):
     def set_data(self):
-        x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        if self.dtype == np.uint16:
+            x = np.random.uniform(-1, 1, self.shape).astype(self.np_dtype)
+        else:
+            x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
         segment_ids = self.set_segment(len(x), len(x) // 5 + 1)
         return x, segment_ids
@@ -110,10 +113,14 @@ class TestSegmentOps(OpTest):
         x, segment_ids = self.set_data()
         result = self.compute(x, segment_ids)
         self.inputs = {
-            'X': x.astype(self.dtype),
+            'X': x,
             'SegmentIds': segment_ids.astype(np.int64),
         }
-        self.outputs = {'Out': result.astype(self.dtype)}
+        if self.dtype == np.uint16:
+            self.outputs = {'Out': result.astype(self.np_dtype)}
+        else:
+            self.outputs = {'Out': result.astype(self.dtype)}
+        self.convert_bf16()
 
     def test_check_output(self):
         self.check_output()
@@ -121,6 +128,12 @@
     def test_check_grad(self):
         self.check_grad(["X"], "Out")
 
+    def convert_bf16(self):
+        if self.dtype == np.uint16:
+            self.inputs['X'] = convert_float_to_uint16(self.inputs['X'])
+            self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out'])
+            self.place = core.CUDAPlace(0)
+
 
 class TestSegmentSum2(TestSegmentOps):
     def prepare(self):
@@ -141,23 +154,16 @@ class TestSegmentSum2(TestSegmentOps):
 
 class TestSegmentMax(TestSegmentOps):
     def compute(self, x, segment_ids):
-        return compute_segment_min_max(x, segment_ids, pooltype="MAX")
+        result, self.gradient = compute_segment_min_max(
+            x, segment_ids, pooltype="MAX"
+        )
+        return result
 
     def prepare(self):
         super().prepare()
         self.shape = [40, 20]
         self.attrs = {'pooltype': "MAX"}
 
-    def setUp(self):
-        self.prepare()
-        x, segment_ids = self.set_data()
-        result, self.gradient = self.compute(x, segment_ids)
-        self.inputs = {
-            'X': x.astype(self.dtype),
-            'SegmentIds': segment_ids.astype(np.int32),
-        }
-        self.outputs = {'Out': result.astype(self.dtype)}
-
     def test_check_grad(self):
         self.check_grad(["X"], "Out", user_defined_grads=[self.gradient])
 
@@ -170,7 +176,10 @@ class TestSegmentMax2(TestSegmentMax):
 
 class TestSegmentMin(TestSegmentMax):
     def compute(self, x, segment_ids):
-        return compute_segment_min_max(x, segment_ids, pooltype="MIN")
+        result, self.gradient = compute_segment_min_max(
+            x, segment_ids, pooltype="MIN"
+        )
+        return result
 
     def prepare(self):
         super().prepare()
@@ -197,12 +206,17 @@ class TestSegmentMean(TestSegmentOps):
         x, segment_ids = self.set_data()
         result = self.compute(x, segment_ids)
         self.inputs = {'X': x, 'SegmentIds': segment_ids}
+        if self.dtype == np.uint16:
+            astype = self.np_dtype
+        else:
+            astype = self.dtype
         self.outputs = {
             'Out': result,
             'SummedIds': compute_segment_sum(
-                np.ones([len(x), 1]).astype(self.dtype), segment_ids
+                np.ones([len(x), 1]).astype(astype), segment_ids
             ),
         }
+        self.convert_bf16()
 
 
 class TestSegmentMean2(TestSegmentMean):
@@ -213,6 +227,106 @@
         self.attrs = {'pooltype': "MEAN"}
 
 
+class TestSegmentSumFP16Op(TestSegmentOps):
+    def prepare(self):
+        super().prepare()
+        self.dtype = np.float16
+
+
+class TestSegmentMaxFP16Op(TestSegmentMax):
+    def prepare(self):
+        super().prepare()
+        self.dtype = np.float16
+
+
+class TestSegmentMinFP16Op(TestSegmentMin):
+    def prepare(self):
+        super().prepare()
+        self.dtype = np.float16
+
+
+class TestSegmentMeanFP16Op(TestSegmentMean):
+    def prepare(self):
+        super().prepare()
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or does not support bfloat16",
+)
+class TestSegmentSumBF16Op(TestSegmentOps):
+    def prepare(self):
+        super().prepare()
+        self.dtype = np.uint16
+        self.np_dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ["X"], "Out")
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or does not support bfloat16",
+)
+class TestSegmentMaxBF16Op(TestSegmentMax):
+    def prepare(self):
+        super().prepare()
+        self.dtype = np.uint16
+        self.np_dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(
["X"], "Out", user_defined_grads=[self.gradient] + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestSegmentMinBF16Op(TestSegmentMin): + def prepare(self): + super().prepare() + self.dtype = np.uint16 + self.np_dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ["X"], "Out", user_defined_grads=[self.gradient] + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestSegmentMeanBF16Op(TestSegmentMean): + def prepare(self): + super().prepare() + self.dtype = np.uint16 + self.np_dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + class API_SegmentOpsTest(unittest.TestCase): def test_static(self): with paddle.static.program_guard(paddle.static.Program()): diff --git a/python/paddle/geometric/math.py b/python/paddle/geometric/math.py index fdec045ec1317b3c827119162724814ace48c273..9e07e1c77a7c012299ad2aac58bde814fa83cd8c 100644 --- a/python/paddle/geometric/math.py +++ b/python/paddle/geometric/math.py @@ -56,7 +56,7 @@ def segment_sum(data, segment_ids, name=None): check_variable_and_dtype( data, "X", - ("float32", "float64", "int32", "int64", "float16"), + ("float32", "float64", "int32", "int64", "float16", "uint16"), "segment_pool", ) check_variable_and_dtype( @@ -114,7 +114,7 @@ def segment_mean(data, segment_ids, name=None): check_variable_and_dtype( data, "X", - ("float32", "float64", "int32", "int64", "float16"), + ("float32", "float64", "int32", "int64", "float16", "uint16"), "segment_pool", ) check_variable_and_dtype( @@ -170,7 +170,7 @@ def segment_min(data, segment_ids, name=None): check_variable_and_dtype( data, "X", - ("float32", "float64", "int32", "int64", "float16"), + ("float32", "float64", "int32", "int64", "float16", "uint16"), "segment_pool", ) check_variable_and_dtype( @@ -226,7 +226,7 @@ def segment_max(data, segment_ids, name=None): check_variable_and_dtype( data, "X", - ("float32", "float64", "int32", "int64", "float16"), + ("float32", "float64", "int32", "int64", "float16", "uint16"), "segment_pool", ) check_variable_and_dtype(