diff --git a/paddle/phi/api/ext/tensor_compat.h b/paddle/phi/api/ext/tensor_compat.h
index 7233744c65c3fd482810608cb04b6be5092e7f7b..17c0dd3f8732dde96d371d99bc8798692146a3f3 100644
--- a/paddle/phi/api/ext/tensor_compat.h
+++ b/paddle/phi/api/ext/tensor_compat.h
@@ -19,7 +19,7 @@ limitations under the License. */
 
 // Note(chenweihang): In order to be compatible with the original custom
 // operator Tensor interface, only available to external users, the file
-// cannot be includeed in paddle
+// cannot be included in paddle
 
 namespace paddle {
 using Tensor = experimental::Tensor;
diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
index 1dd6ef6776750c01fa78b6e6a269fea0df63f33d..00eef2d5a77316dcb3918ff32dde55b4fe9a1c73 100644
--- a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
@@ -28,4 +28,5 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU)
   set_tests_properties(test_custom_cpu_profiler_plugin PROPERTIES TIMEOUT 120)
   set_tests_properties(test_fleet_launch_custom_device PROPERTIES TIMEOUT 120)
   set_tests_properties(test_custom_cpu_to_static PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_custom_device_relu_setup PROPERTIES TIMEOUT 120)
 endif()
diff --git a/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc b/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da0563ffeb10e3762dc874676ffc9402d0529bc7
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc
@@ -0,0 +1,193 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <vector>
+
+#include "paddle/extension.h"
+
+#define CHECK_CPU_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+#define CHECK_CUSTOM_INPUT(x) \
+  PD_CHECK(x.is_custom_device(), #x " must be a custom Tensor.")
+
+template <typename data_t>
+void relu_cpu_forward_kernel(const data_t* x_data,
+                             data_t* out_data,
+                             int64_t x_numel) {
+  PD_CHECK(x_data != nullptr, "x_data is nullptr.");
+  PD_CHECK(out_data != nullptr, "out_data is nullptr.");
+  for (int64_t i = 0; i < x_numel; ++i) {
+    out_data[i] = std::max(static_cast<data_t>(0.), x_data[i]);
+  }
+}
+
+template <typename data_t>
+void relu_cpu_backward_kernel(const data_t* grad_out_data,
+                              const data_t* out_data,
+                              data_t* grad_x_data,
+                              int64_t out_numel) {
+  for (int64_t i = 0; i < out_numel; ++i) {
+    grad_x_data[i] =
+        grad_out_data[i] * (out_data[i] > static_cast<data_t>(0) ? 1. : 0.);
+  }
+}
+
+template <typename data_t>
+void relu_cpu_double_backward_kernel(const data_t* out_data,
+                                     const data_t* ddx_data,
+                                     data_t* ddout_data,
+                                     int64_t ddout_numel) {
+  for (int64_t i = 0; i < ddout_numel; ++i) {
+    ddout_data[i] =
+        ddx_data[i] * (out_data[i] > static_cast<data_t>(0) ? 1. : 0.);
+  }
+}
+
+std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
+  CHECK_CPU_INPUT(x);
+  auto out = paddle::empty_like(x);
+
+  PD_DISPATCH_FLOATING_TYPES(
+      x.type(), "relu_cpu_forward", ([&] {
+        relu_cpu_forward_kernel<data_t>(
+            x.data<data_t>(), out.data<data_t>(), x.numel());
+      }));
+
+  return {out};
+}
+
+std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& x,
+                                              const paddle::Tensor& out,
+                                              const paddle::Tensor& grad_out) {
+  auto grad_x = paddle::empty_like(x);
+
+  PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
+                               relu_cpu_backward_kernel<data_t>(
+                                   grad_out.data<data_t>(),
+                                   out.data<data_t>(),
+                                   grad_x.data<data_t>(),
+                                   out.size());
+                             }));
+
+  return {grad_x};
+}
+
+std::vector<paddle::Tensor> relu_cpu_double_backward(
+    const paddle::Tensor& out, const paddle::Tensor& ddx) {
+  CHECK_CPU_INPUT(out);
+  CHECK_CPU_INPUT(ddx);
+  auto ddout = paddle::empty(out.shape(), out.dtype(), out.place());
+
+  PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_double_backward", ([&] {
+                               relu_cpu_double_backward_kernel<data_t>(
+                                   out.data<data_t>(),
+                                   ddx.data<data_t>(),
+                                   ddout.mutable_data<data_t>(out.place()),
+                                   ddout.size());
+                             }));
+
+  return {ddout};
+}
+
+std::vector<paddle::Tensor> relu_custom_forward(const paddle::Tensor& x) {
+  CHECK_CUSTOM_INPUT(x);
+  auto out = paddle::relu(x);
+  return {out};
+}
+
+std::vector<paddle::Tensor> relu_custom_backward(
+    const paddle::Tensor& x,
+    const paddle::Tensor& out,
+    const paddle::Tensor& grad_out) {
+  CHECK_CUSTOM_INPUT(x);
+  CHECK_CUSTOM_INPUT(out);
+  auto grad_x = paddle::empty_like(x, x.dtype(), x.place());
+  auto ones = paddle::experimental::full_like(x, 1.0, x.dtype(), x.place());
+  auto zeros = paddle::experimental::full_like(x, 0.0, x.dtype(), x.place());
+  auto condition = paddle::experimental::greater_than(x, zeros);
+
+  grad_x = paddle::multiply(grad_out, paddle::where(condition, ones, zeros));
+
+  return {grad_x};
+}
+
+std::vector<paddle::Tensor> relu_custom_double_backward(
+    const paddle::Tensor& out, const paddle::Tensor& ddx) {
+  CHECK_CUSTOM_INPUT(out);
+  auto ddout = paddle::empty(out.shape(), out.dtype(), out.place());
+  auto ones =
+      paddle::experimental::full_like(out, 1.0, out.dtype(), out.place());
+  auto zeros =
+      paddle::experimental::full_like(out, 0.0, out.dtype(), out.place());
+  auto condition = paddle::experimental::greater_than(out, zeros);
+
+  ddout = paddle::multiply(ddx, paddle::where(condition, ones, zeros));
+
+  return {ddout};
+}
+
+std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
+  if (x.is_cpu()) {
+    return relu_cpu_forward(x);
+  } else if (x.is_custom_device()) {
+    return relu_custom_forward(x);
+  } else {
+    PD_THROW("Not implemented.");
+  }
+}
+
+std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
+                                         const paddle::Tensor& out,
+                                         const paddle::Tensor& grad_out) {
+  if (x.is_cpu()) {
+    return relu_cpu_backward(x, out, grad_out);
+  } else if (x.is_custom_device()) {
+    return relu_custom_backward(x, out, grad_out);
+  } else {
+    PD_THROW("Not implemented.");
+  }
+}
+
+std::vector<paddle::Tensor> ReluDoubleBackward(const paddle::Tensor& out,
+                                               const paddle::Tensor& ddx) {
+  if (out.is_cpu()) {
+    return relu_cpu_double_backward(out, ddx);
+  } else if (out.is_custom_device()) {
+    return relu_custom_double_backward(out, ddx);
+  } else {
+    PD_THROW("Not implemented.");
+  }
+}
+
+std::vector<std::vector<int64_t>> ReluDoubleBackwardInferShape(
+    const std::vector<int64_t>& out_shape,
+    const std::vector<int64_t>& ddx_shape) {
+  return {out_shape};
+}
+
+PD_BUILD_OP(custom_relu)
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(ReluForward));
+
+PD_BUILD_GRAD_OP(custom_relu)
+    .Inputs({"X", "Out", paddle::Grad("Out")})
+    .Outputs({paddle::Grad("X")})
+    .SetKernelFn(PD_KERNEL(ReluBackward));
+
+PD_BUILD_DOUBLE_GRAD_OP(custom_relu)
+    .Inputs({"Out", paddle::Grad(paddle::Grad("X"))})
+    .Outputs({paddle::Grad(paddle::Grad("Out"))})
+    .SetKernelFn(PD_KERNEL(ReluDoubleBackward))
+    .SetInferShapeFn(PD_INFER_SHAPE(ReluDoubleBackwardInferShape));
diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py b/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..760ad56cc3380e4d5b53fd65e07638e14d5859f5
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py
@@ -0,0 +1,325 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import tempfile
+import unittest
+from site import getsitepackages
+
+import numpy as np
+
+
+def custom_relu_dynamic(func, device, dtype, np_x, use_func=True):
+    import paddle
+
+    paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
+    paddle.set_device(device)
+
+    t = paddle.to_tensor(np_x, dtype=dtype)
+    t.stop_gradient = False
+    sys.stdout.flush()
+
+    out = func(t) if use_func else paddle.nn.functional.relu(t)
+    out.stop_gradient = False
+
+    out.backward()
+
+    paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
+    if t.grad is None:
+        return out.numpy(), t.grad
+    else:
+        return out.numpy(), t.grad.numpy()
+
+
+def custom_relu_static(func, device, dtype, np_x, use_func=True):
+    import paddle
+    import paddle.static as static
+
+    paddle.enable_static()
+    paddle.set_device(device)
+
+    with static.scope_guard(static.Scope()):
+        with static.program_guard(static.Program()):
+            x = static.data(name="X", shape=[None, 8], dtype=dtype)
+            x.stop_gradient = False
+            out = func(x) if use_func else paddle.nn.functional.relu(x)
+            static.append_backward(out)
+
+            exe = static.Executor()
+            exe.run(static.default_startup_program())
+            # in static graph mode, x's data has been overwritten by out
+            out_v = exe.run(
+                static.default_main_program(),
+                feed={"X": np_x},
+                fetch_list=[out.name],
+            )
+
+    paddle.disable_static()
+    return out_v
+
+
+def custom_relu_static_pe(func, device, dtype, np_x, use_func=True):
+    import paddle
+    import paddle.static as static
+
+    paddle.enable_static()
+    paddle.set_device(device)
+
+    places = paddle.CustomPlace("custom_cpu", 0)
+
+    with static.scope_guard(static.Scope()):
+        with static.program_guard(static.Program()):
+            x = static.data(name="X", shape=[None, 8], dtype=dtype)
+            x.stop_gradient = False
+            out = func(x) if use_func else paddle.nn.functional.relu(x)
+            static.append_backward(out)
+
+            exe = static.Executor()
+            exe.run(static.default_startup_program())
+
+            # in static graph mode, x's data has been overwritten by out
+            compiled_prog = static.CompiledProgram(
+                static.default_main_program()
+            ).with_data_parallel(loss_name=out.name, places=places)
+            out_v = exe.run(
+                compiled_prog, feed={"X": np_x}, fetch_list=[out.name]
+            )
+
+    paddle.disable_static()
+    return out_v
+
+
+def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True):
+    import paddle
+
+    paddle.set_device(device)
+    paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
+
+    t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False)
+
+    out = func(t) if use_func else paddle.nn.functional.relu(t)
+    dx = paddle.grad(
+        outputs=out,
+        inputs=t,
+        grad_outputs=paddle.ones_like(t),
+        create_graph=True,
+        retain_graph=True,
+    )
+
+    ddout = paddle.grad(
+        outputs=dx[0],
+        inputs=out.grad,
+        grad_outputs=paddle.ones_like(t),
+        create_graph=False,
+    )
+
+    paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
+    assert ddout[0].numpy() is not None
+    return dx[0].numpy(), ddout[0].numpy()
+
+
+class TestNewCustomOpSetUpInstall(unittest.TestCase):
+    def setUp(self):
+        # compile the plugin .so in a temp directory
+        self.cur_dir = os.path.dirname(os.path.abspath(__file__))
+        self.temp_dir = tempfile.TemporaryDirectory()
+        cmd = 'cd {} \
+            && git clone {} \
+            && cd PaddleCustomDevice \
+            && git fetch origin \
+            && git checkout {} -b dev \
+            && cd backends/custom_cpu \
+            && mkdir build && cd build && cmake .. && make -j8 \
+            && cd {}'.format(
+            self.temp_dir.name,
+            os.getenv('PLUGIN_URL'),
+            os.getenv('PLUGIN_TAG'),
+            self.cur_dir,
+        )
+        os.system(cmd)
+
+        # set environment for loading and registering compiled custom kernels;
+        # only valid in the current process
+        os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join(
+            self.cur_dir,
+            '{}/PaddleCustomDevice/backends/custom_cpu/build'.format(
+                self.temp_dir.name
+            ),
+        )
+
+        # `import paddle` loads custom_cpu.so, hence paddle must be imported
+        # after the PaddleCustomDevice build has finished
+        import paddle
+
+        # [Why a specific paddle_includes directory?]
+        # Add paddle_includes to pass CI; for more details,
+        # please refer to the comments in `paddle/fluid/tests/custom_op/utils.py`
+        paddle_includes = []
+        for site_packages_path in getsitepackages():
+            paddle_includes.append(
+                os.path.join(site_packages_path, 'paddle', 'include')
+            )
+            paddle_includes.append(
+                os.path.join(
+                    site_packages_path, 'paddle', 'include', 'third_party'
+                )
+            )
+
+        custom_module = paddle.utils.cpp_extension.load(
+            name='custom_device_relu',
+            sources=['custom_relu_op.cc'],
+            extra_include_paths=paddle_includes,  # add for Coverage CI
+            extra_cxx_cflags=["-w", "-g"],  # test for cc flags
+            # build_directory=self.cur_dir,
+            verbose=True,
+        )
+        self.custom_op = custom_module.custom_relu
+
+        self.dtypes = ["float32", "float64"]
+        self.device = "custom_cpu"
+
+        # configure the random seed
+        SEED = 2021
+        paddle.seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
+
+    def tearDown(self):
+        self.temp_dir.cleanup()
+        del os.environ['CUSTOM_DEVICE_ROOT']
+
+    def test_custom_device(self):
+        self._test_static()
+        self._test_static_pe()
+        self._test_dynamic()
+        self._test_double_grad_dynamic()
+        self._test_with_dataloader()
+
+    def _test_static(self):
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out = custom_relu_static(self.custom_op, self.device, dtype, x)
+            pd_out = custom_relu_static(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+
+    def _test_static_pe(self):
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out = custom_relu_static_pe(self.custom_op, self.device, dtype, x)
+            pd_out = custom_relu_static_pe(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+
+    def _test_dynamic(self):
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out, x_grad = custom_relu_dynamic(
+                self.custom_op, self.device, dtype, x
+            )
+            pd_out, pd_x_grad = custom_relu_dynamic(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+            np.testing.assert_array_equal(
+                x_grad,
+                pd_x_grad,
+                err_msg="custom op x grad: {},\n paddle api x grad: {}".format(
+                    x_grad, pd_x_grad
+                ),
+            )
+
+    def _test_double_grad_dynamic(self):
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out, dx_grad = custom_relu_double_grad_dynamic(
+                self.custom_op, self.device, dtype, x
+            )
+            pd_out, pd_dx_grad = custom_relu_double_grad_dynamic(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+            np.testing.assert_array_equal(
+                dx_grad,
+                pd_dx_grad,
+                err_msg="custom op dx grad: {},\n paddle api dx grad: {}".format(
+                    dx_grad, pd_dx_grad
+                ),
+            )
+
+    def _test_with_dataloader(self):
+        import paddle
+        from paddle.vision.transforms import Compose, Normalize
+
+        paddle.set_device(self.device)
+        # data loader
+        transform = Compose(
+            [Normalize(mean=[127.5], std=[127.5], data_format="CHW")]
+        )
+        train_dataset = paddle.vision.datasets.MNIST(
+            mode="train", transform=transform
+        )
+        train_loader = paddle.io.DataLoader(
+            train_dataset,
+            batch_size=64,
+            shuffle=True,
+            drop_last=True,
+            num_workers=0,
+        )
+
+        for batch_id, (image, _) in enumerate(train_loader()):
+            out = self.custom_op(image)
+            pd_out = paddle.nn.functional.relu(image)
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+
+            if batch_id == 5:
+                break
+
+
+if __name__ == "__main__":
+    if os.name == 'nt' or sys.platform.startswith('darwin'):
+        # only supported on Linux for now
+        exit()
+    unittest.main()
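
Reviewer note: for anyone who wants to exercise the new op outside the unittest harness, below is a minimal usage sketch. It is not part of the patch; it assumes the custom_cpu plugin from PaddleCustomDevice has already been built and that CUSTOM_DEVICE_ROOT points at its build directory, exactly as setUp() above arranges, and it only uses APIs that already appear in this diff (paddle.utils.cpp_extension.load and the generated custom_relu).

    # Usage sketch (assumption: CUSTOM_DEVICE_ROOT is set to the built
    # custom_cpu plugin directory before `import paddle`, as in setUp()).
    import numpy as np
    import paddle
    from paddle.utils.cpp_extension import load

    # JIT-compile custom_relu_op.cc from this diff into a loadable module.
    module = load(name="custom_device_relu", sources=["custom_relu_op.cc"])

    paddle.set_device("custom_cpu")
    x = paddle.to_tensor(np.random.uniform(-1, 1, [4, 8]).astype("float32"))
    x.stop_gradient = False

    out = module.custom_relu(x)  # ReluForward dispatches on x's place
    out.backward()               # ReluBackward, registered via PD_BUILD_GRAD_OP

    # The custom op should match the built-in ReLU exactly.
    np.testing.assert_array_equal(
        out.numpy(), paddle.nn.functional.relu(x).numpy()
    )

Because ReluForward falls back to relu_cpu_forward when x.is_cpu(), the same sketch also runs on a plain CPU place with paddle.set_device("cpu") and no plugin built.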