// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_XPU
#include <memory>
#include <string>

#include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h"
#include "paddle/fluid/platform/xpu_header.h"

namespace paddle {
namespace operators {

// Forward kernel: reduce_max is the shared XPU reduction driver
// parameterized with the device-side reduce_max primitive.
template <typename DeviceContext, typename T>
class ReduceMaxXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    XPUReduce<DeviceContext, T>(context, xpu::reduce_max<T>);
  }
};

// Backward kernel for reduce_max:
//   dX = dOut broadcast to X's shape, kept only at positions where X equals
//   the (broadcast) reduced maximum; every other position receives 0.
template <typename DeviceContext, typename T>
class ReduceMaxGradXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto dims = context.Attr<std::vector<int>>("dim");
    bool reduce_all = context.Attr<bool>("reduce_all");
    auto* x = context.Input<framework::Tensor>("X");
    auto* out = context.Input<framework::Tensor>("Out");
    auto* out_grad =
        context.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* x_grad =
        context.Output<framework::Tensor>(framework::GradVarName("X"));

    // Casting the gradient to a different dtype is not supported on XPU.
    int in_dtype = context.Attr<int>("in_dtype");
    PADDLE_ENFORCE_EQ(
        in_dtype == -1, true,
        platform::errors::InvalidArgument(
            "XPU only support in_dtype == -1 in reduce_max_grad op."));

    auto& dev_ctx = context.template device_context<DeviceContext>();
    x_grad->mutable_data<T>(context.GetPlace());
    const T* x_data = x->data<T>();
    const T* out_data = out->data<T>();
    const T* out_grad_data = out_grad->data<T>();
    auto* x_grad_data = x_grad->data<T>();

    // Normalize negative reduction axes into [0, rank).
    const auto& input_dim_size = x->dims().size();
    std::vector<int> true_dims;
    for (size_t i = 0; i < dims.size(); ++i) {
      if (dims[i] < 0) {
        true_dims.push_back(dims[i] + input_dim_size);
      } else {
        true_dims.push_back(dims[i]);
      }
    }

    // xdims: full input shape; ydims: reduced shape with every reduced axis
    // collapsed to 1, so Out/dOut can be broadcast back to the input shape.
    std::vector<int> ydims(input_dim_size);
    std::vector<int> xdims(input_dim_size);
    std::set<int> dims_set(true_dims.begin(), true_dims.end());
    for (auto i = 0; i < input_dim_size; i++) {
      xdims[i] = x->dims()[i];
      if (dims_set.find(i) != dims_set.end() || reduce_all) {
        ydims[i] = 1;
      } else {
        ydims[i] = x->dims()[i];
      }
    }

    // Device scratch buffers, each sized to the full input:
    //   brocast1: Out broadcast to X's shape (later reused as a zero tensor)
    //   brocast2: dOut broadcast to X's shape
    //   equal:    element-wise mask of (X == broadcast Out)
    // NOTE(review): these are freed only on the success path; if any
    // PADDLE_ENFORCE below throws, the xpu_malloc'd memory leaks.
    T* brocast1 = nullptr;
    T* brocast2 = nullptr;
    bool* equal = nullptr;
    PADDLE_ENFORCE_EQ(
        xpu_malloc(reinterpret_cast<void**>(&brocast1),
                   x->numel() * sizeof(T)),
        XPU_SUCCESS,
        platform::errors::ResourceExhausted("XPU has no enough memory"));
    PADDLE_ENFORCE_EQ(
        xpu_malloc(reinterpret_cast<void**>(&equal),
                   x->numel() * sizeof(bool)),
        XPU_SUCCESS,
        platform::errors::ResourceExhausted("XPU has no enough memory"));
    PADDLE_ENFORCE_EQ(
        xpu_malloc(reinterpret_cast<void**>(&brocast2),
                   x->numel() * sizeof(T)),
        XPU_SUCCESS,
        platform::errors::ResourceExhausted("XPU has no enough memory"));

    // step 1. broadcast out and out_grad back to the input shape
    int r = xpu::broadcast<T>(dev_ctx.x_context(), out_data, brocast1, ydims,
                              xdims);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU broadcast in reduce_max_grad op return"
                                   " wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
    r = xpu::broadcast<T>(dev_ctx.x_context(), out_grad_data, brocast2, ydims,
                          xdims);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU broadcast in reduce_max_grad op return"
                                   " wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
    // step 2. compare the broadcast maximum against x to build the mask
    r = xpu::elementwise_equal<T>(dev_ctx.x_context(), x_data, brocast1, equal,
                                  x->numel());
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU elementwise_equal in reduce_max_grad "
                                   "op return wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
    // step 3. x_grad = select(mask, broadcast dOut, 0)
    // brocast1 is reused here as an all-zero tensor for the "else" branch.
    r = xpu::constant<T>(dev_ctx.x_context(), brocast1, x->numel(), 0);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU constant in reduce_max_grad op return"
                                   " wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
    r = xpu::select<T>(dev_ctx.x_context(), equal, brocast2, brocast1,
                       x_grad_data, xdims, xdims);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU select in reduce_max_grad op return"
                                   " wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));

    // Synchronize before freeing scratch memory that in-flight kernels
    // may still be reading.
    if (dev_ctx.x_context()->xpu_stream) {
      dev_ctx.Wait();
    }
    xpu_free(brocast1);
    xpu_free(brocast2);
    xpu_free(equal);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_XPU_KERNEL(
    reduce_max,
    ops::ReduceMaxXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
    reduce_max_grad,
    ops::ReduceMaxGradXPUKernel<paddle::platform::XPUDeviceContext, float>);

#endif
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#ifdef PADDLE_WITH_XPU
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <vector>

#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/platform/xpu_header.h"

namespace paddle {
namespace operators {

// Shared driver for the XPU reduce kernels (reduce_sum, reduce_max, ...).
//
// `func` is the device-side reduction primitive, with signature
//   int(xpu::Context*, const T* x, T* y,
//       const std::vector<int>& xdims, const std::vector<int>& reduce_dims)
// returning an xpu::Error_t status code.
//
// The driver normalizes the "dim" attribute, degenerates to a plain copy
// when no axis actually needs reducing, and otherwise invokes `func`.
template <typename DeviceContext, typename T>
void XPUReduce(
    const framework::ExecutionContext& context,
    std::function<int(xpu::Context*, const T*, T*, const std::vector<int>&,
                      const std::vector<int>&)>
        func) {
  PADDLE_ENFORCE_EQ(
      platform::is_xpu_place(context.GetPlace()), true,
      platform::errors::Unavailable("This kernel only runs on XPU."));
  bool reduce_all = context.Attr<bool>("reduce_all");
  auto dims = context.Attr<std::vector<int>>("dim");
  auto* x = context.Input<framework::Tensor>("X");
  auto* y = context.Output<framework::Tensor>("Out");
  y->mutable_data<T>(context.GetPlace());
  auto& dev_ctx = context.template device_context<DeviceContext>();

  // Casting the output to a different dtype is not supported on XPU.
  int out_dtype = context.Attr<int>("out_dtype");
  PADDLE_ENFORCE_EQ(out_dtype == -1, true,
                    platform::errors::InvalidArgument(
                        "XPU only support out_dtype == -1 in reduce op."));

  const auto* x_data = x->data<T>();
  auto* y_data = y->data<T>();

  // Normalize negative reduction axes into [0, rank).
  const auto& input_dim_size = x->dims().size();
  std::vector<int> true_dims;
  for (size_t i = 0; i < dims.size(); ++i) {
    if (dims[i] < 0) {
      true_dims.push_back(dims[i] + input_dim_size);
    } else {
      true_dims.push_back(dims[i]);
    }
  }

  // reduce_dims keeps only the axes that genuinely shrink the tensor:
  // axes of extent 1 are dropped, since reducing them is a no-op.
  std::vector<int> reduce_dims;
  std::vector<int> xdims(input_dim_size);
  for (int i = 0; i < input_dim_size; ++i) {
    xdims[i] = x->dims()[i];
  }
  if (reduce_all) {
    for (int i = 0; i < input_dim_size; ++i) {
      reduce_dims.push_back(i);
    }
  } else {
    std::set<int> dims_set(true_dims.begin(), true_dims.end());
    for (auto i = 0; i < input_dim_size; i++) {
      if (dims_set.find(i) != dims_set.end()) {
        if (x->dims()[i] != 1) {
          reduce_dims.push_back(i);
        }
      }
    }
  }

  if (reduce_dims.size() == 0) {
    // Nothing to reduce (every requested axis has extent 1): plain copy.
    // NOTE(review): the length argument is numel * sizeof(T) — presumably
    // xpu::copy takes a byte count; confirm against the XPU API headers.
    int r = xpu::copy<T>(dev_ctx.x_context(), x_data, y_data,
                         x->numel() * sizeof(T));
    PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
                      platform::errors::External("XPU copy in reduce op return "
                                                 "wrong value[%d %s].",
                                                 r, XPUAPIErrorMsg[r]));
  } else {
    int r = func(dev_ctx.x_context(), x_data, y_data, xdims, reduce_dims);
    PADDLE_ENFORCE_EQ(
        r == xpu::Error_t::SUCCESS, true,
        platform::errors::External("XPU reduce op return wrong value[%d %s].",
                                   r, XPUAPIErrorMsg[r]));
  }
}

}  // namespace operators
}  // namespace paddle
#endif
#ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" #include #include +#include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h" #include "paddle/fluid/platform/xpu_header.h" namespace paddle { @@ -25,71 +25,7 @@ template class ReduceSumXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_xpu_place(context.GetPlace()), true, - platform::errors::Unavailable("This kernel only runs on XPU.")); - bool reduce_all = context.Attr("reduce_all"); - auto dims = context.Attr>("dim"); - auto* x = context.Input("X"); - auto* y = context.Output("Out"); - y->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - - int out_dtype = context.Attr("out_dtype"); - PADDLE_ENFORCE_EQ( - out_dtype == -1, true, - platform::errors::InvalidArgument( - "XPU only support out_dtype == -1 in reduce_sum op.")); - - const auto* x_data = x->data(); - auto* y_data = y->data(); - const auto& input_dim_size = x->dims().size(); - std::vector true_dims; - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) { - true_dims.push_back(dims[i] + input_dim_size); - } else { - true_dims.push_back(dims[i]); - } - } - - std::vector reduce_dims; - std::vector xdims((input_dim_size)); - for (int i = 0; i < input_dim_size; ++i) { - xdims[i] = x->dims()[i]; - } - if (reduce_all) { - for (int i = 0; i < input_dim_size; ++i) { - reduce_dims.push_back(i); - } - } else { - std::set dims_set(true_dims.begin(), true_dims.end()); - for (auto i = 0; i < input_dim_size; i++) { - if (dims_set.find(i) != dims_set.end()) { - if (x->dims()[i] != 1) { - reduce_dims.push_back(i); - } - } - } - } - - if (reduce_dims.size() == 0) { - int r = xpu::copy(dev_ctx.x_context(), x_data, y_data, - x->numel() * sizeof(T)); - PADDLE_ENFORCE_EQ( - r == xpu::Error_t::SUCCESS, true, - platform::errors::External("XPU copy in reduce_sum op return " - "wrong value[%d 
%s].", - r, XPUAPIErrorMsg[r])); - } else { - int r = xpu::reduce_sum(dev_ctx.x_context(), x_data, y_data, xdims, - reduce_dims); - PADDLE_ENFORCE_EQ( - r == xpu::Error_t::SUCCESS, true, - platform::errors::External("XPU reduce_sum in reduce_sum op return" - " wrong value[%d %s].", - r, XPUAPIErrorMsg[r])); - } + XPUReduce(context, xpu::reduce_sum); } }; diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py new file mode 100644 index 0000000000000000000000000000000000000000..55ed5442cf1f371c1f48e174b5761d04494d4892 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py @@ -0,0 +1,74 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from __future__ import print_function

import unittest
import numpy as np
import sys
sys.path.append("..")
from op_test_xpu import OpTest, XPUOpTest
from op_test import skip_check_grad_ci
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
from paddle.fluid.framework import convert_np_dtype_to_dtype_


class TestXPUReduceMaxOp(XPUOpTest):
    # Exercises the XPU reduce_max forward and backward kernels against a
    # numpy reference result.
    def setUp(self):
        self.init_op_type()
        self.initTestCase()
        self.use_xpu = True
        self.use_mkldnn = False
        self.attrs = {
            'dim': self.axis,
            'keep_dim': self.keep_dim,
            'reduce_all': self.reduce_all
        }
        self.inputs = {'X': np.random.random(self.shape).astype("float32")}
        # Expected output is computed with numpy's max, mirroring the
        # op's 'reduce_all' / 'dim' / 'keep_dim' attributes.
        if self.attrs['reduce_all']:
            self.outputs = {'Out': self.inputs['X'].max()}
        else:
            self.outputs = {
                'Out': self.inputs['X'].max(axis=self.axis,
                                            keepdims=self.attrs['keep_dim'])
            }

    def test_check_output(self):
        # Forward check; only runs on an XPU build, otherwise a no-op.
        if paddle.is_compiled_with_xpu():
            paddle.enable_static()
            place = paddle.XPUPlace(0)
            self.check_output_with_place(place)

    def test_check_grad(self):
        # Gradient check for X; only runs on an XPU build, otherwise a no-op.
        if paddle.is_compiled_with_xpu():
            paddle.enable_static()
            place = paddle.XPUPlace(0)
            self.check_grad_with_place(place, ['X'], 'Out')

    def init_op_type(self):
        # Base op configuration; subclasses may override these flags.
        self.op_type = "reduce_max"
        self.use_mkldnn = False
        self.keep_dim = False
        self.reduce_all = False

    def initTestCase(self):
        # Default case: reduce a (5, 6, 10) tensor along the last axis.
        self.shape = (5, 6, 10)
        self.axis = (-1, )


if __name__ == '__main__':
    unittest.main()