diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h
index c51417730850cfd65ce8c98fe39a2aca524dfa03..b7dbf252103c6f30ed3d9f985cdc86eef26b17dc 100644
--- a/paddle/phi/kernels/funcs/elementwise_functor.h
+++ b/paddle/phi/kernels/funcs/elementwise_functor.h
@@ -563,6 +563,20 @@ struct RemainderFunctor {
   }
 };
 
+template <>
+struct RemainderFunctor<dtype::bfloat16> {
+  inline HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16 a,
+                                               const dtype::bfloat16 b) const {
+    float b_float = static_cast<float>(b);
+    float res = fmod(static_cast<float>(a), b_float);
+
+    // According to PR #26732: in dividend % divisor, the
+    // remainder shall have the same sign as the divisor.
+    if ((res != 0.0f) && ((res < 0.0f) != (b_float < 0.0f))) res += b_float;
+    return static_cast<dtype::bfloat16>(res);
+  }
+};
+
 template <typename T>
 struct InverseRemainderFunctor {
   inline HOSTDEVICE T operator()(const T a, const T b) const {
diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu
index e1eb5f85288394bb8b4d8661e3c71959fca9b96b..c6e693ddd3b90f860cf866003d66cc253728e9db 100644
--- a/paddle/phi/kernels/kps/elementwise_kernel.cu
+++ b/paddle/phi/kernels/kps/elementwise_kernel.cu
@@ -117,7 +117,8 @@ PD_REGISTER_KERNEL(remainder,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(
     floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
 PD_REGISTER_KERNEL(elementwise_pow,
diff --git a/paddle/phi/kernels/legacy/kps/elementwise_raw_kernel.cu b/paddle/phi/kernels/legacy/kps/elementwise_raw_kernel.cu
index 8fab237f0a5ee7440438d567990d91a4e54e75cb..23ea1264c63f420dca2e7e80bf45659447e619d7 100644
--- a/paddle/phi/kernels/legacy/kps/elementwise_raw_kernel.cu
+++ b/paddle/phi/kernels/legacy/kps/elementwise_raw_kernel.cu
@@ -157,7 +157,8 @@ PD_REGISTER_KERNEL(remainder_raw,
                    double,
                    int,
                    float16,
-                   int64_t) {}
+                   int64_t,
+                   bfloat16) {}
 PD_REGISTER_KERNEL(floor_divide_raw,
                    KPS,
                    ALL_LAYOUT,
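Note on the new `RemainderFunctor<dtype::bfloat16>` above: it follows Python's modulo convention rather than C's `fmod`. The raw `fmod` result carries the sign of the dividend, so whenever its sign disagrees with the divisor's, one divisor is added back. A minimal Python sketch of the same logic (the name `py_style_remainder` is mine, not part of the patch):

```python
import math

def py_style_remainder(a: float, b: float) -> float:
    # C-style fmod keeps the sign of the dividend a.
    res = math.fmod(a, b)
    # Shift by one divisor when the signs disagree, so the
    # remainder takes the sign of the divisor b (Python semantics).
    if res != 0.0 and (res < 0.0) != (b < 0.0):
        res += b
    return res

assert py_style_remainder(-7.0, 3.0) == 2.0   # math.fmod gives -1.0
assert py_style_remainder(7.0, -3.0) == -2.0  # math.fmod gives 1.0
```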
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
index 55ebc5d28cff9cd818b1e7505fa6e2b9eba81ddb..8f34328241fef62b9c56806c7ea56890da6ed365 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
@@ -16,10 +16,15 @@ import random
 import unittest
 
 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import (
+    OpTest,
+    convert_float_to_uint16,
+    convert_uint16_to_float,
+)
 
 import paddle
 from paddle import fluid
+from paddle.fluid import core
 
 
 class TestElementwiseModOp(OpTest):
@@ -106,14 +111,17 @@ class TestElementwiseModOpFloat(TestElementwiseModOp):
             self.check_output()
 
 
-class TestElementwiseModOpFp16(TestElementwiseModOp):
+@unittest.skipIf(
+    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+)
+class TestElementwiseModFP16Op(TestElementwiseModOp):
     def init_dtype(self):
         self.dtype = np.float16
 
     def init_input_output(self):
         self.x = np.random.uniform(-1000, 1000, [10, 10]).astype(self.dtype)
         self.y = np.random.uniform(-100, 100, [10, 10]).astype(self.dtype)
-        self.out = np.mod(self.x, self.y)
+        self.out = np.fmod(self.y + np.fmod(self.x, self.y), self.y)
 
     def test_check_output(self):
         if self.attrs['axis'] == -1:
@@ -122,6 +130,83 @@ class TestElementwiseModOpFp16(TestElementwiseModOp):
             self.check_output()
 
 
+class TestElementwiseModFP16Op_ZeroDim1(TestElementwiseModFP16Op):
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, []).astype(np.float16)
+        self.y = np.random.uniform(0, 1000, []).astype(np.float16)
+        self.out = np.fmod(self.y + np.fmod(self.x, self.y), self.y)
+
+
+class TestElementwiseModFP16Op_ZeroDim2(TestElementwiseModFP16Op):
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, [10, 10]).astype(np.float16)
+        self.y = np.random.uniform(0, 1000, []).astype(np.float16)
+        self.out = np.fmod(self.y + np.fmod(self.x, self.y), self.y)
+
+
+class TestElementwiseModFP16Op_ZeroDim3(TestElementwiseModFP16Op):
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, []).astype(np.float16)
+        self.y = np.random.uniform(0, 1000, [10, 10]).astype(np.float16)
+        self.out = np.fmod(self.y + np.fmod(self.x, self.y), self.y)
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or does not support bfloat16",
+)
+class TestElementwiseModBF16Op(OpTest):
+    def init_kernel_type(self):
+        self.use_mkldnn = False
+
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, [10, 10]).astype(np.float32)
+        self.x = convert_uint16_to_float(convert_float_to_uint16(self.x))
+        self.y = np.random.uniform(0, 1000, [10, 10]).astype(np.float32)
+        self.y = convert_uint16_to_float(convert_float_to_uint16(self.y))
+        self.out = np.fmod(self.y + np.fmod(self.x, self.y), self.y)
+
+    def setUp(self):
+        self.op_type = "elementwise_mod"
+        self.python_api = paddle.remainder
+        self.public_python_api = paddle.remainder
+        self.axis = -1
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+        self.inputs = {
+            'X': convert_float_to_uint16(
+                OpTest.np_dtype_to_fluid_dtype(self.x)
+            ),
+            'Y': convert_float_to_uint16(
+                OpTest.np_dtype_to_fluid_dtype(self.y)
+            ),
+        }
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
+        self.outputs = {'Out': convert_float_to_uint16(self.out)}
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+    def init_axis(self):
+        pass
+
+
+class TestElementwiseModBF16Op_ZeroDim1(TestElementwiseModBF16Op):
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, []).astype("float32")
+        self.x = convert_uint16_to_float(convert_float_to_uint16(self.x))
+        self.y = np.random.uniform(0, 1000, []).astype("float32")
+        self.y = convert_uint16_to_float(convert_float_to_uint16(self.y))
+        self.out = np.fmod(self.y + np.fmod(self.x, self.y), self.y)
+
+
 class TestElementwiseModOpDouble(TestElementwiseModOpFloat):
     def init_dtype(self):
         self.dtype = np.float64
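The tests compute the expected output as `np.fmod(self.y + np.fmod(self.x, self.y), self.y)` rather than `np.mod(self.x, self.y)`. The two agree mathematically, but the nested-`fmod` form mirrors the fmod-plus-sign-fix arithmetic the kernel itself performs, which presumably keeps the reference closer to the kernel at reduced precision. A quick standalone check of the identity (not part of the patch):

```python
import numpy as np

x = np.array([-7.0, 7.0, 7.5, -7.5])
y = np.array([3.0, -3.0, 2.0, 2.0])

ref = np.fmod(y + np.fmod(x, y), y)  # form used in the tests
mod = np.mod(x, y)                   # Python-style modulo

print(ref)  # [ 2.  -2.   1.5  0.5]
print(mod)  # [ 2.  -2.   1.5  0.5]
```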
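The BF16 tests also round-trip their float32 inputs through `convert_float_to_uint16`/`convert_uint16_to_float` before computing the reference, so the float32 reference is evaluated on values exactly representable in bfloat16 and the comparison does not inherit input quantization error. A rough stand-in for those helpers, assuming bfloat16 is the top 16 bits of a float32 (the real helpers in `eager_op_test` may round rather than truncate):

```python
import numpy as np

def float_to_bf16_bits(x):
    # Truncate float32 to bfloat16 by keeping the high 16 bits.
    return (np.asarray(x, np.float32).view(np.uint32) >> 16).astype(np.uint16)

def bf16_bits_to_float(bits):
    # Widen bfloat16 bits back to float32 (low mantissa bits become zero).
    return (bits.astype(np.uint32) << 16).view(np.float32)

x = np.random.uniform(0, 10000, [10, 10]).astype(np.float32)
x_q = bf16_bits_to_float(float_to_bf16_bits(x))
# Quantization is idempotent: a second round-trip changes nothing.
assert np.all(x_q == bf16_bits_to_float(float_to_bf16_bits(x_q)))
```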