diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h
index 252ed90e44114bc652aa190dd7aa050c983fa14a..a77527c081650386c3b59dd7bee4ce36113ac58f 100644
--- a/paddle/phi/backends/gpu/gpu_primitives.h
+++ b/paddle/phi/backends/gpu/gpu_primitives.h
@@ -445,6 +445,57 @@ CUDA_ATOMIC_WRAPPER(Max, phi::dtype::float16) {
 }
 #endif
 
+inline static __device__ uint32_t bf16_max_to_low_half(uint32_t val, float x) {
+  phi::dtype::bfloat16 low_half;
+  // The bfloat16 in the lower 16 bits
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+  low_half =
+      static_cast<phi::dtype::bfloat16>(max(static_cast<float>(low_half), x));
+  return (val & 0xFFFF0000u) | low_half.x;
+}
+
+inline static __device__ uint32_t bf16_max_to_high_half(uint32_t val, float x) {
+  phi::dtype::bfloat16 high_half;
+  // The bfloat16 in the higher 16 bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half =
+      static_cast<phi::dtype::bfloat16>(max(static_cast<float>(high_half), x));
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+}
+
+CUDA_ATOMIC_WRAPPER(Max, phi::dtype::bfloat16) {
+  if (*address >= val) {
+    return *address;
+  }
+  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+      reinterpret_cast<char *>(address) -
+      (reinterpret_cast<uintptr_t>(address) & 0x02));
+  float val_f = static_cast<float>(val);
+  uint32_t old = *address_as_ui;
+  uint32_t assumed;
+  if (((uintptr_t)address & 0x02) == 0) {
+    // The bfloat16 value stays at the lower 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(
+          address_as_ui, assumed, bf16_max_to_low_half(assumed, val_f));
+    } while (old != assumed);
+    phi::dtype::bfloat16 ret;
+    ret.x = old & 0xFFFFu;
+    return ret;
+  } else {
+    // The bfloat16 value stays at the higher 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(
+          address_as_ui, assumed, bf16_max_to_high_half(assumed, val_f));
+    } while (old != assumed);
+    phi::dtype::bfloat16 ret;
+    ret.x = old >> 16;
+    return ret;
+  }
+}
+
 // For atomicMin
 USE_CUDA_ATOMIC(Min, int);
 USE_CUDA_ATOMIC(Min, unsigned int);
@@ -580,6 +631,57 @@ CUDA_ATOMIC_WRAPPER(Min, phi::dtype::float16) {
 }
 #endif
 
+inline static __device__ uint32_t bf16_min_to_low_half(uint32_t val, float x) {
+  phi::dtype::bfloat16 low_half;
+  // The bfloat16 in the lower 16 bits
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+  low_half =
+      static_cast<phi::dtype::bfloat16>(min(static_cast<float>(low_half), x));
+  return (val & 0xFFFF0000u) | low_half.x;
+}
+
+inline static __device__ uint32_t bf16_min_to_high_half(uint32_t val, float x) {
+  phi::dtype::bfloat16 high_half;
+  // The bfloat16 in the higher 16 bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half =
+      static_cast<phi::dtype::bfloat16>(min(static_cast<float>(high_half), x));
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+}
+
+CUDA_ATOMIC_WRAPPER(Min, phi::dtype::bfloat16) {
+  if (*address <= val) {
+    return *address;
+  }
+  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+      reinterpret_cast<char *>(address) -
+      (reinterpret_cast<uintptr_t>(address) & 0x02));
+  float val_f = static_cast<float>(val);
+  uint32_t old = *address_as_ui;
+  uint32_t assumed;
+  if (((uintptr_t)address & 0x02) == 0) {
+    // The bfloat16 value stays at the lower 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(
+          address_as_ui, assumed, bf16_min_to_low_half(assumed, val_f));
+    } while (old != assumed);
+    phi::dtype::bfloat16 ret;
+    ret.x = old & 0xFFFFu;
+    return ret;
+  } else {
+    // The bfloat16 value stays at the higher 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(
+          address_as_ui, assumed, bf16_min_to_high_half(assumed, val_f));
+    } while (old != assumed);
+    phi::dtype::bfloat16 ret;
+    ret.x = old >> 16;
+    return ret;
+  }
+}
+
 #ifdef PADDLE_WITH_CUDA
 /*
  * One thead block deals with elementwise atomicAdd for vector of len.
diff --git a/paddle/phi/kernels/funcs/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu
index 2624b5850e1b2a94bf9e7b6870d07b02d68e7da8..0b6df55bdeff19096808fd5468b2431c2eb73250 100644
--- a/paddle/phi/kernels/funcs/segment_pooling.cu
+++ b/paddle/phi/kernels/funcs/segment_pooling.cu
@@ -451,6 +451,8 @@ template class SegmentPoolFunctor<GPUContext, int64_t, int>;
 template class SegmentPoolFunctor<GPUContext, int64_t, int64_t>;
 template class SegmentPoolFunctor<GPUContext, dtype::float16, int>;
 template class SegmentPoolFunctor<GPUContext, dtype::float16, int64_t>;
+template class SegmentPoolFunctor<GPUContext, dtype::bfloat16, int>;
+template class SegmentPoolFunctor<GPUContext, dtype::bfloat16, int64_t>;
 
 template class SegmentPoolGradFunctor<GPUContext, float, int>;
 template class SegmentPoolGradFunctor<GPUContext, float, int64_t>;
@@ -462,6 +464,8 @@ template class SegmentPoolGradFunctor<GPUContext, int64_t, int>;
 template class SegmentPoolGradFunctor<GPUContext, int64_t, int64_t>;
 template class SegmentPoolGradFunctor<GPUContext, dtype::float16, int>;
 template class SegmentPoolGradFunctor<GPUContext, dtype::float16, int64_t>;
+template class SegmentPoolGradFunctor<GPUContext, dtype::bfloat16, int>;
+template class SegmentPoolGradFunctor<GPUContext, dtype::bfloat16, int64_t>;
 
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
index 86800ed840528d438f8cd8c87b91a8d2275a8634..0b73580d5c94b6f57575fb2fc2a11d2a53dfb39c 100644
--- a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
@@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(segment_pool_grad,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/segment_pool_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_kernel.cu
index 9da2ef519a42bc2e54b889116dfb9b8048b6caf4..526c46e32496ce8ba432c0e7cb177f1c654b5c61 100644
--- a/paddle/phi/kernels/gpu/segment_pool_kernel.cu
+++ b/paddle/phi/kernels/gpu/segment_pool_kernel.cu
@@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(segment_pool,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/python/paddle/fluid/tests/unittests/test_segment_ops.py b/python/paddle/fluid/tests/unittests/test_segment_ops.py
index 763b814f4b1434030787f4e896724baf72da1ca8..ab71a515c4a33c2ad1c6017495d8ebb773b66552 100644
--- a/python/paddle/fluid/tests/unittests/test_segment_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_segment_ops.py
@@ -15,7 +15,7 @@
 import unittest
 
 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16
 
 import paddle
 from paddle.fluid import core
@@ -84,7 +84,10 @@ def segment_pool_split(X, SegmentIds, pooltype):
 
 
 class TestSegmentOps(OpTest):
     def set_data(self):
-        x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        if self.dtype == np.uint16:
+            x = np.random.uniform(-1, 1, self.shape).astype(self.np_dtype)
+        else:
+            x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
         segment_ids = self.set_segment(len(x), len(x) // 5 + 1)
         return x, segment_ids
@@ -110,10 +113,14 @@ class TestSegmentOps(OpTest):
         x, segment_ids = self.set_data()
         result = self.compute(x, segment_ids)
         self.inputs = {
-            'X': x.astype(self.dtype),
+            'X': x,
             'SegmentIds': segment_ids.astype(np.int64),
         }
-        self.outputs = {'Out': result.astype(self.dtype)}
+        if self.dtype == np.uint16:
+            self.outputs = {'Out': result.astype(self.np_dtype)}
+        else:
+            self.outputs = {'Out': result.astype(self.dtype)}
+        self.convert_bf16()
 
     def test_check_output(self):
         self.check_output()
@@ -121,6 +128,12 @@
     def test_check_grad(self):
         self.check_grad(["X"], "Out")
 
+    def convert_bf16(self):
+        if self.dtype == np.uint16:
+            self.inputs['X'] = convert_float_to_uint16(self.inputs['X'])
+            self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out'])
+            self.place = core.CUDAPlace(0)
+
 
 class TestSegmentSum2(TestSegmentOps):
     def prepare(self):
@@ -141,23 +154,16 @@ class TestSegmentSum2(TestSegmentOps):
 
 class TestSegmentMax(TestSegmentOps):
     def compute(self, x, segment_ids):
-        return compute_segment_min_max(x, segment_ids, pooltype="MAX")
+        result, self.gradient = compute_segment_min_max(
+            x, segment_ids, pooltype="MAX"
+        )
+        return result
 
     def prepare(self):
         super().prepare()
         self.shape = [40, 20]
         self.attrs = {'pooltype': "MAX"}
 
-    def setUp(self):
-        self.prepare()
-        x, segment_ids = self.set_data()
-        result, self.gradient = self.compute(x, segment_ids)
-        self.inputs = {
-            'X': x.astype(self.dtype),
-            'SegmentIds': segment_ids.astype(np.int32),
-        }
-        self.outputs = {'Out': result.astype(self.dtype)}
-
     def test_check_grad(self):
         self.check_grad(["X"], "Out", user_defined_grads=[self.gradient])
 
@@ -170,7 +176,10 @@ class TestSegmentMax2(TestSegmentMax):
 
 class TestSegmentMin(TestSegmentMax):
     def compute(self, x, segment_ids):
-        return compute_segment_min_max(x, segment_ids, pooltype="MIN")
+        result, self.gradient = compute_segment_min_max(
+            x, segment_ids, pooltype="MIN"
+        )
+        return result
 
     def prepare(self):
         super().prepare()
@@ -197,12 +206,17 @@ class TestSegmentMean(TestSegmentOps):
         x, segment_ids = self.set_data()
         result = self.compute(x, segment_ids)
         self.inputs = {'X': x, 'SegmentIds': segment_ids}
+        if self.dtype == np.uint16:
+            astype = self.np_dtype
+        else:
+            astype = self.dtype
         self.outputs = {
             'Out': result,
             'SummedIds': compute_segment_sum(
-                np.ones([len(x), 1]).astype(self.dtype), segment_ids
+                np.ones([len(x), 1]).astype(astype), segment_ids
             ),
         }
+        self.convert_bf16()
 
 
 class TestSegmentMean2(TestSegmentMean):
@@ -213,6 +227,106 @@
         self.attrs = {'pooltype': "MEAN"}
 
 
+class TestSegmentSumFP16Op(TestSegmentOps):
+    def prepare(self):
+        super().prepare()
+        self.dtype = np.float16
+
+
+class TestSegmentMaxFP16Op(TestSegmentMax):
+    def prepare(self):
+        super().prepare()
+        self.dtype = np.float16
+
+
+class TestSegmentMinFP16Op(TestSegmentMin):
+    def prepare(self):
+        super().prepare()
+        self.dtype = np.float16
+
+
+class TestSegmentMeanFP16Op(TestSegmentMean):
+    def prepare(self):
+        super().prepare()
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or does not support bfloat16",
+)
+class TestSegmentSumBF16Op(TestSegmentOps):
+    def prepare(self):
+        super().prepare()
+        self.dtype = np.uint16
+        self.np_dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ["X"], "Out")
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or does not support bfloat16",
+)
+class TestSegmentMaxBF16Op(TestSegmentMax):
+    def prepare(self):
+        super().prepare()
+        self.dtype = np.uint16
+        self.np_dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(
["X"], "Out", user_defined_grads=[self.gradient] + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestSegmentMinBF16Op(TestSegmentMin): + def prepare(self): + super().prepare() + self.dtype = np.uint16 + self.np_dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ["X"], "Out", user_defined_grads=[self.gradient] + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestSegmentMeanBF16Op(TestSegmentMean): + def prepare(self): + super().prepare() + self.dtype = np.uint16 + self.np_dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + class API_SegmentOpsTest(unittest.TestCase): def test_static(self): with paddle.static.program_guard(paddle.static.Program()): diff --git a/python/paddle/geometric/math.py b/python/paddle/geometric/math.py index fdec045ec1317b3c827119162724814ace48c273..9e07e1c77a7c012299ad2aac58bde814fa83cd8c 100644 --- a/python/paddle/geometric/math.py +++ b/python/paddle/geometric/math.py @@ -56,7 +56,7 @@ def segment_sum(data, segment_ids, name=None): check_variable_and_dtype( data, "X", - ("float32", "float64", "int32", "int64", "float16"), + ("float32", "float64", "int32", "int64", "float16", "uint16"), "segment_pool", ) check_variable_and_dtype( @@ -114,7 +114,7 @@ def segment_mean(data, segment_ids, name=None): check_variable_and_dtype( data, "X", - ("float32", "float64", "int32", "int64", "float16"), + ("float32", "float64", "int32", "int64", "float16", "uint16"), "segment_pool", ) check_variable_and_dtype( @@ -170,7 +170,7 @@ def segment_min(data, segment_ids, name=None): check_variable_and_dtype( data, "X", - ("float32", "float64", "int32", "int64", "float16"), + ("float32", "float64", "int32", "int64", "float16", "uint16"), "segment_pool", ) check_variable_and_dtype( @@ -226,7 +226,7 @@ def segment_max(data, segment_ids, name=None): check_variable_and_dtype( data, "X", - ("float32", "float64", "int32", "int64", "float16"), + ("float32", "float64", "int32", "int64", "float16", "uint16"), "segment_pool", ) check_variable_and_dtype(