diff --git a/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu
index b4ff277b5026cec7d33ff697722dbb4888c601cf..7b4472c5223182b4b299e9f0694878501d297ccd 100644
--- a/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu
@@ -16,7 +16,63 @@
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h"
+#include "paddle/phi/kernels/funcs/broadcast_function.h"
+#include "paddle/phi/kernels/funcs/compare_functors.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
+
+namespace phi {
+
+// Max-reduce gradient: x_grad = out_grad * EqualFunctor(out, x), where out and
+// out_grad are viewed with every reduced axis set to 1 and broadcast against x.
+// keep_dim is not consulted; the reduced axes come from dims / reduce_all.
+template <typename T, typename Context>
+void ReduceMaxGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out,
+                         const DenseTensor& out_grad,
+                         const IntArray& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DenseTensor* x_grad) {
+  dev_ctx.Alloc(x_grad, x.dtype());
+  reduce_all = recompute_reduce_all(x, dims, reduce_all);
+
+  // Shape of x with each reduced axis collapsed to size 1.
+  int dim_size = x.dims().size();
+  auto reduce_dims =
+      funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all);
+  auto update_dims = vectorize(x.dims());
+  for (auto i : reduce_dims) {
+    update_dims[i] = 1;
+  }
+
+  // Views sharing data with out / out_grad, resized so they broadcast to x.
+  phi::DenseTensor new_out(out.type());
+  new_out.ShareDataWith(out);
+  new_out.Resize(phi::make_ddim(update_dims));
+  phi::DenseTensor new_out_grad(out_grad.type());
+  new_out_grad.ShareDataWith(out_grad);
+  new_out_grad.Resize(phi::make_ddim(update_dims));
+
+  // Local (not heap) tensor: destroyed automatically even if a call throws.
+  phi::DenseTensor equal_out;
+  equal_out.Resize(x.dims());
+  dev_ctx.template Alloc<T>(&equal_out);
+
+  // 1. equal_out = Equal(out, x), with out broadcast to x's shape
+  std::vector<const phi::DenseTensor*> equal_inputs = {&new_out, &x};
+  std::vector<phi::DenseTensor*> equal_outputs = {&equal_out};
+  funcs::BroadcastKernel<phi::ElementwiseType::kBinary, T, T>(
+      dev_ctx, equal_inputs, &equal_outputs, 0, funcs::EqualFunctor<T>());
+
+  // 2. x_grad = out_grad * equal_out, with out_grad broadcast to x's shape
+  std::vector<const phi::DenseTensor*> mul_inputs = {&new_out_grad, &equal_out};
+  std::vector<phi::DenseTensor*> mul_outputs = {x_grad};
+  funcs::BroadcastKernel<phi::ElementwiseType::kBinary, T, T>(
+      dev_ctx, mul_inputs, &mul_outputs, 0, funcs::MultiplyFunctor<T>());
+}
+}  // namespace phi
 
 PD_REGISTER_KERNEL(max_grad,
                    GPU,
@@ -25,4 +81,6 @@ PD_REGISTER_KERNEL(max_grad,
                    float,
                    double,
                    int,
-                   int64_t) {}
+                   int64_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/kps/reduce_max_kernel.cu b/paddle/phi/kernels/kps/reduce_max_kernel.cu
index 9c0fdb52c4279026fdc4f25584c7aa9d1e437ea3..a03035dcf1932d0d516bf435d5ee49a79a5cb7fd 100644
--- a/paddle/phi/kernels/kps/reduce_max_kernel.cu
+++ b/paddle/phi/kernels/kps/reduce_max_kernel.cu
@@ -36,6 +36,14 @@ void MaxRawKernel(const Context& dev_ctx,
 #ifdef PADDLE_WITH_XPU_KP
 PD_REGISTER_KERNEL(max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float) {}
 #else
-PD_REGISTER_KERNEL(
-    max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {}
+PD_REGISTER_KERNEL(max_raw,
+                   KPS,
+                   ALL_LAYOUT,
+                   phi::MaxRawKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 #endif
diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/reduce_max_kernel.cc
index 23da5bd4cd54edf0b53bb825a2a0ebbe7b03d9e3..7892fc879c713cd7d8813b31627b3d4c2ac9a889 100644
--- a/paddle/phi/kernels/reduce_max_kernel.cc
+++ b/paddle/phi/kernels/reduce_max_kernel.cc
@@ -34,7 +34,20 @@ void MaxKernel(const Context& dev_ctx,
 PD_REGISTER_KERNEL(
     max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_CUDA)
+PD_REGISTER_KERNEL(max,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MaxKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#endif
+
+#if defined(PADDLE_WITH_HIP)
 PD_REGISTER_KERNEL(
     max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
 #endif
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index 01b25b543117c8ea3f36973fb50a7e594fa377ad..050879369244d45909bf63f8ad595a9db55385ba 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -251,18 +251,6 @@ class TestMaxOp(OpTest):
             only_check_prim=True,
         )
 
-    def test_raise_error(self):
-        if core.is_compiled_with_cuda():
-            self.inputs = {'X': np.random.random((5, 6, 10)).astype("float16")}
-            place = core.CUDAPlace(0)
-            with self.assertRaises(RuntimeError) as cm:
-                self.check_output_with_place(place)
-            error_msg = str(cm.exception).split("\n")[-2].strip().split(".")[0]
-            self.assertEqual(
-                error_msg,
-                "NotFoundError: The kernel (reduce_max) with key (GPU, Undefined(AnyLayout), float16) is not found and GPU kernel cannot fallback to CPU one",
-            )
-
 
 class TestMaxOp_ZeroDim(OpTest):
     """Remove Max with subgradient from gradient check to confirm the success of CI."""
@@ -292,7 +280,7 @@ class TestMaxOp_ZeroDim(OpTest):
         )
 
 
-class TestMaxOp_FP32(OpTest):
+class TestMaxFP32Op(OpTest):
     """Remove Max with subgradient from gradient check to confirm the success of CI."""
 
     def setUp(self):
@@ -300,13 +288,19 @@ class TestMaxOp_FP32(OpTest):
         self.prim_op_type = "prim"
         self.python_api = paddle.max
         self.public_python_api = paddle.max
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")}
+        self.init_dtype()
+        if self.dtype == np.uint16:
+            x = np.random.random((5, 6, 10)).astype(np.float32)
+            self.inputs = {'X': convert_float_to_uint16(x)}
+        else:
+            x = np.random.random((5, 6, 10)).astype(self.dtype)
+            self.inputs = {'X': x}
         self.attrs = {'dim': [-1], 'keep_dim': True}
-        self.outputs = {
-            'Out': self.inputs['X'].max(
-                axis=tuple(self.attrs['dim']), keepdims=True
-            )
-        }
+        out = x.max(axis=tuple(self.attrs['dim']), keepdims=True)
+        if self.dtype == np.uint16:
+            self.outputs = {'Out': convert_float_to_uint16(out)}
+        else:
+            self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
@@ -320,6 +314,37 @@ class TestMaxOp_FP32(OpTest):
             only_check_prim=True,
         )
 
+    def init_dtype(self):
+        self.dtype = np.float32
+
+
+class TestMaxFP16Op(TestMaxFP32Op):
+    def init_dtype(self):
+        self.dtype = np.float16  # setUp casts the input to this dtype
+
+
+@unittest.skipIf(
+    not (
+        core.is_compiled_with_cuda()
+        and core.is_bfloat16_supported(core.CUDAPlace(0))
+    ),
+    "core is not compiled with CUDA or not support the bfloat16",
+)
+class TestMaxBF16Op(TestMaxFP32Op):
+    """bfloat16 variant of the max test; data is carried as uint16."""
+
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        self.check_output_with_place(core.CUDAPlace(0))
+
+    def test_check_grad(self):
+        # Only the composite op supports gradient checking for reduce_max.
+        place = core.CUDAPlace(0)
+        kwargs = {'check_prim': True, 'only_check_prim': True}
+        self.check_grad_with_place(place, ['X'], 'Out', **kwargs)
+
 
 @skip_check_grad_ci(
     reason="reduce_min is discontinuous non-derivable function,"
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index fe41200378793d1c52cb7c74701bdb6b824808c5..0e6b55142bf70d3f635341c70d06646ef046e82d 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -2348,7 +2348,10 @@ def max(x, axis=None, keepdim=False, name=None):
         reduce_all, axis = _get_reduce_axis_with_tensor(axis, x)
         helper = LayerHelper('max', **locals())
         check_variable_and_dtype(
-            x, 'x', ['float32', 'float64', 'int32', 'int64'], 'max'
+            x,
+            'x',
+            ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'],
+            'max',
         )
         if not isinstance(axis, Variable) and paddle.utils._contain_var(axis):
             axis = paddle.utils._convert_to_tensor_list(axis)