Unverified commit e60fd1f6, authored by Chen Weihang, committed by GitHub

[CustomOp] Split test and add inference test (#31078)

* split test & add inference test

* add timeout config

* change to setup install

* change to jit compile

* add verbose for test

* fix load setup name repeat

* polish details

* resolve conflict

* fix code format error
Parent commit: d3f09ad7
# New custom OP can support Windows/Linux now
-# 'test_simple_custom_op_jit/test_simple_custom_op_setup' compile .cc and .cu file
+# 'test_custom_relu_op_setup/jit' compile .cc and .cu file
-py_test(test_simple_custom_op_setup SRCS test_simple_custom_op_setup.py)
-py_test(test_simple_custom_op_jit SRCS test_simple_custom_op_jit.py)
+py_test(test_custom_relu_op_setup SRCS test_custom_relu_op_setup.py)
+py_test(test_custom_relu_op_jit SRCS test_custom_relu_op_jit.py)
# Compiling shared library will cost some time, but running process is very fast.
-set_tests_properties(test_simple_custom_op_setup PROPERTIES TIMEOUT 250)
-set_tests_properties(test_simple_custom_op_jit PROPERTIES TIMEOUT 180)
+set_tests_properties(test_custom_relu_op_setup PROPERTIES TIMEOUT 250)
+set_tests_properties(test_custom_relu_op_jit PROPERTIES TIMEOUT 180)
py_test(test_sysconfig SRCS test_sysconfig.py)
# 'test_dispatch' compile .cc file
-py_test(test_dispatch SRCS test_dispatch.py)
-set_tests_properties(test_dispatch PROPERTIES TIMEOUT 180)
+py_test(test_dispatch_jit SRCS test_dispatch_jit.py)
+set_tests_properties(test_dispatch_jit PROPERTIES TIMEOUT 180)
+py_test(test_multi_out_jit SRCS test_multi_out_jit.py)
+set_tests_properties(test_multi_out_jit PROPERTIES TIMEOUT 180)
if(NOT LINUX)
  return()
......
@@ -17,13 +17,6 @@
#include "paddle/extension.h"

-template <typename data_t>
-void fill_constant_cpu_kernel(data_t* out_data, int64_t x_numel, data_t value) {
-  for (int i = 0; i < x_numel; ++i) {
-    out_data[i] = value;
-  }
-}

template <typename data_t>
void relu_cpu_forward_kernel(const data_t* x_data,
                             data_t* out_data,
@@ -53,21 +46,8 @@ std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
    relu_cpu_forward_kernel<data_t>(
        x.data<data_t>(), out.mutable_data<data_t>(x.place()), x.size());
  }));
-  // fake multi output: Fake_float64 with float64 dtype
-  auto fake_float64 = paddle::Tensor(paddle::PlaceType::kCPU);
-  fake_float64.reshape(x.shape());
-  fill_constant_cpu_kernel<double>(
-      fake_float64.mutable_data<double>(x.place()), x.size(), 0.);
-  // fake multi output: ZFake_int32 with int32 dtype
-  auto zfake_int32 = paddle::Tensor(paddle::PlaceType::kCPU);
-  zfake_int32.reshape(x.shape());
-  fill_constant_cpu_kernel<int32_t>(
-      zfake_int32.mutable_data<int32_t>(x.place()), x.size(), 1);
-  return {out, fake_float64, zfake_int32};
+  return {out};
}

std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& x,
@@ -117,16 +97,16 @@ std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
}

std::vector<std::vector<int64_t>> ReluInferShape(std::vector<int64_t> x_shape) {
-  return {x_shape, x_shape, x_shape};
+  return {x_shape};
}

std::vector<paddle::DataType> ReluInferDType(paddle::DataType x_dtype) {
-  return {x_dtype, paddle::DataType::FLOAT64, paddle::DataType::INT32};
+  return {x_dtype};
}

-PD_BUILD_OP("relu2")
+PD_BUILD_OP("custom_relu")
    .Inputs({"X"})
-    .Outputs({"Out", "Fake_float64", "ZFake_int32"})
+    .Outputs({"Out"})
    .SetKernelFn(PD_KERNEL(ReluForward))
    .SetInferShapeFn(PD_INFER_SHAPE(ReluInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(ReluInferDType))
......
@@ -14,16 +14,6 @@
#include "paddle/extension.h"

-template <typename data_t>
-__global__ void fill_constant_cuda_kernel(data_t* y,
-                                          const int num,
-                                          data_t value) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
-    y[i] = value;
-  }
-}

template <typename data_t>
__global__ void relu_cuda_forward_kernel(const data_t* x,
                                         data_t* y,
@@ -57,18 +47,8 @@ std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
    relu_cuda_forward_kernel<data_t><<<grid, block>>>(
        x.data<data_t>(), out.mutable_data<data_t>(x.place()), numel);
  }));
-  // fake multi output: Fake_1
-  auto fake_float64 = paddle::Tensor(paddle::PlaceType::kGPU);
-  fake_float64.reshape(x.shape());
-  fill_constant_cuda_kernel<double><<<grid, block>>>(
-      fake_float64.mutable_data<double>(x.place()), numel, 0.);
-  // fake multi output: ZFake_1
-  auto zfake_int32 = paddle::Tensor(paddle::PlaceType::kGPU);
-  zfake_int32.reshape(x.shape());
-  fill_constant_cuda_kernel<int32_t><<<grid, block>>>(
-      zfake_int32.mutable_data<int32_t>(x.place()), numel, 1);
-  return {out, fake_float64, zfake_int32};
+  return {out};
}

std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
......
@@ -29,11 +29,11 @@ std::vector<std::vector<int64_t>> ReluInferShape(std::vector<int64_t> x_shape);
std::vector<paddle::DataType> ReluInferDType(paddle::DataType x_dtype);

-// Reuse codes in `relu_op_simple.cc/cu` to register another custom operator
+// Reuse codes in `custom_relu_op.cc/cu` to register another custom operator
// to test jointly compile multi operators at same time.
-PD_BUILD_OP("relu3")
+PD_BUILD_OP("custom_relu_dup")
    .Inputs({"X"})
-    .Outputs({"Out", "Fake_float64", "ZFake_int32"})
+    .Outputs({"Out"})
    .SetKernelFn(PD_KERNEL(ReluForward))
    .SetInferShapeFn(PD_INFER_SHAPE(ReluInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(ReluInferDType))
......
@@ -17,11 +17,14 @@ import os
from utils import paddle_includes, extra_compile_args
from paddle.utils.cpp_extension import CUDAExtension, setup

+# custom_relu_op_dup.cc is only used for multi ops test,
+# not a new op, if you want to test only one op, remove this
+# source file
setup(
-    name='simple_setup_relu2',
+    name='custom_relu_module_setup',
    ext_modules=CUDAExtension(  # test for not specific name here.
        sources=[
-            'relu_op_simple.cc', 'relu_op_simple.cu', 'relu_op3_simple.cc'
+            'custom_relu_op.cc', 'custom_relu_op.cu', 'custom_relu_op_dup.cc'
        ],  # test for multi ops
        include_dirs=paddle_includes,
        extra_compile_args=extra_compile_args))
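A minimal usage sketch (not part of this diff): once the egg above has been built and installed via `python custom_relu_setup.py install`, the ops registered through PD_BUILD_OP should be callable from Python under their declared names. The snippet below assumes the installed package is importable as `custom_relu_module_setup`, matching the setup() name.

import numpy as np
import paddle
import custom_relu_module_setup  # package built by the setup() call above (assumed importable)

x = paddle.to_tensor(np.random.uniform(-1, 1, [4, 8]).astype('float32'))
x.stop_gradient = False
out = custom_relu_module_setup.custom_relu(x)  # expected to match paddle.nn.functional.relu(x)
out.backward()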
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <vector>
#include "paddle/extension.h"
template <typename data_t>
void assign_cpu_kernel(const data_t* x_data,
data_t* out_data,
int64_t x_numel) {
for (int i = 0; i < x_numel; ++i) {
out_data[i] = x_data[i];
}
}
template <typename data_t>
void fill_constant_cpu_kernel(data_t* out_data, int64_t x_numel, data_t value) {
for (int i = 0; i < x_numel; ++i) {
out_data[i] = value;
}
}
std::vector<paddle::Tensor> MultiOutCPU(const paddle::Tensor& x) {
auto out = paddle::Tensor(paddle::PlaceType::kCPU);
out.reshape(x.shape());
PD_DISPATCH_FLOATING_TYPES(
x.type(), "assign_cpu_kernel", ([&] {
assign_cpu_kernel<data_t>(
x.data<data_t>(), out.mutable_data<data_t>(x.place()), x.size());
}));
// fake multi output: Fake_float64 with float64 dtype
auto fake_float64 = paddle::Tensor(paddle::PlaceType::kCPU);
fake_float64.reshape(x.shape());
fill_constant_cpu_kernel<double>(
fake_float64.mutable_data<double>(x.place()), x.size(), 0.);
// fake multi output: ZFake_int32 with int32 dtype
auto zfake_int32 = paddle::Tensor(paddle::PlaceType::kCPU);
zfake_int32.reshape(x.shape());
fill_constant_cpu_kernel<int32_t>(
zfake_int32.mutable_data<int32_t>(x.place()), x.size(), 1);
return {out, fake_float64, zfake_int32};
}
std::vector<std::vector<int64_t>> InferShape(std::vector<int64_t> x_shape) {
return {x_shape, x_shape, x_shape};
}
std::vector<paddle::DataType> InferDtype(paddle::DataType x_dtype) {
return {x_dtype, paddle::DataType::FLOAT64, paddle::DataType::INT32};
}
PD_BUILD_OP("multi_out")
.Inputs({"X"})
.Outputs({"Out", "Fake_float64", "ZFake_int32"})
.SetKernelFn(PD_KERNEL(MultiOutCPU))
.SetInferShapeFn(PD_INFER_SHAPE(InferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(InferDtype));
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import subprocess
import unittest
import paddle
import numpy as np
from paddle.utils.cpp_extension import load, get_build_directory
from paddle.utils.cpp_extension.extension_utils import run_cmd
from utils import paddle_includes, extra_compile_args
from test_custom_relu_op_setup import custom_relu_dynamic, custom_relu_static
# Because Windows don't use docker, the shared lib already exists in the
# cache dir, it will not be compiled again unless the shared lib is removed.
if os.name == 'nt':
cmd = 'del {}\\custom_relu_module_jit.pyd'.format(get_build_directory())
run_cmd(cmd, True)
# Compile and load custom op Just-In-Time.
# custom_relu_op_dup.cc is only used for multi ops test,
# not a new op, if you want to test only one op, remove this
# source file
custom_module = load(
name='custom_relu_module_jit',
sources=[
'custom_relu_op.cc', 'custom_relu_op.cu', 'custom_relu_op_dup.cc'
],
extra_include_paths=paddle_includes, # add for Coverage CI
extra_cflags=extra_compile_args, # add for Coverage CI
verbose=True)
class TestJITLoad(unittest.TestCase):
def setUp(self):
self.custom_ops = [
custom_module.custom_relu, custom_module.custom_relu_dup
]
self.dtypes = ['float32', 'float64']
self.devices = ['cpu', 'gpu']
def test_static(self):
for device in self.devices:
for dtype in self.dtypes:
x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
for custom_op in self.custom_ops:
out = custom_relu_static(custom_op, device, dtype, x)
pd_out = custom_relu_static(custom_op, device, dtype, x,
False)
self.assertTrue(
np.array_equal(out, pd_out),
"custom op out: {},\n paddle api out: {}".format(
out, pd_out))
def test_dynamic(self):
for device in self.devices:
for dtype in self.dtypes:
x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
for custom_op in self.custom_ops:
out, x_grad = custom_relu_dynamic(custom_op, device, dtype,
x)
pd_out, pd_x_grad = custom_relu_dynamic(custom_op, device,
dtype, x, False)
self.assertTrue(
np.array_equal(out, pd_out),
"custom op out: {},\n paddle api out: {}".format(
out, pd_out))
self.assertTrue(
np.array_equal(x_grad, pd_x_grad),
"custom op x grad: {},\n paddle api x grad: {}".format(
x_grad, pd_x_grad))
if __name__ == '__main__':
unittest.main()
@@ -23,13 +23,13 @@ import numpy as np
from paddle.utils.cpp_extension.extension_utils import run_cmd

-def relu2_dynamic(func, device, dtype, np_x, use_func=True):
+def custom_relu_dynamic(func, device, dtype, np_x, use_func=True):
    paddle.set_device(device)
    t = paddle.to_tensor(np_x)
    t.stop_gradient = False
-    out = func(t)[0] if use_func else paddle.nn.functional.relu(t)
+    out = func(t) if use_func else paddle.nn.functional.relu(t)
    out.stop_gradient = False
    out.backward()
@@ -37,7 +37,12 @@ def relu2_dynamic(func, device, dtype, np_x, use_func=True):
    return out.numpy(), t.grad

-def relu2_static(func, device, dtype, np_x, use_func=True):
+def custom_relu_static(func,
+                       device,
+                       dtype,
+                       np_x,
+                       use_func=True,
+                       test_infer=False):
    paddle.enable_static()
    paddle.set_device(device)
@@ -45,8 +50,7 @@ def relu2_static(func, device, dtype, np_x, use_func=True):
        with static.program_guard(static.Program()):
            x = static.data(name='X', shape=[None, 8], dtype=dtype)
            x.stop_gradient = False
-            # out, fake_float64, fake_int32
-            out = func(x)[0] if use_func else paddle.nn.functional.relu(x)
+            out = func(x) if use_func else paddle.nn.functional.relu(x)
            static.append_backward(out)
            exe = static.Executor()
@@ -60,7 +64,7 @@ def relu2_static(func, device, dtype, np_x, use_func=True):
    return out_v

-def relu2_static_pe(func, device, dtype, np_x, use_func=True):
+def custom_relu_static_pe(func, device, dtype, np_x, use_func=True):
    paddle.enable_static()
    paddle.set_device(device)
@@ -69,7 +73,7 @@ def relu2_static_pe(func, device, dtype, np_x, use_func=True):
        with static.program_guard(static.Program()):
            x = static.data(name='X', shape=[None, 8], dtype=dtype)
            x.stop_gradient = False
-            out = func(x)[0] if use_func else paddle.nn.functional.relu(x)
+            out = func(x) if use_func else paddle.nn.functional.relu(x)
            static.append_backward(out)
            exe = static.Executor()
@@ -87,16 +91,58 @@ def relu2_static_pe(func, device, dtype, np_x, use_func=True):
    return out_v
def custom_relu_static_inference(func, device, np_data, np_label, path_prefix):
paddle.set_device(device)
with static.scope_guard(static.Scope()):
with static.program_guard(static.Program()):
# simple module
data = static.data(
name='data', shape=[None, 1, 28, 28], dtype='float32')
label = static.data(name='label', shape=[None, 1], dtype='int64')
hidden = static.nn.fc(data, size=128)
hidden = func(hidden)
hidden = static.nn.fc(hidden, size=128)
predict = static.nn.fc(hidden, size=10, activation='softmax')
loss = paddle.nn.functional.cross_entropy(input=hidden, label=label)
avg_loss = paddle.mean(loss)
opt = paddle.optimizer.SGD(learning_rate=0.1)
opt.minimize(avg_loss)
# run start up model
exe = static.Executor()
exe.run(static.default_startup_program())
# train
for i in range(4):
avg_loss_v = exe.run(static.default_main_program(),
feed={'data': np_data,
'label': np_label},
fetch_list=[avg_loss])
# save inference model
static.save_inference_model(path_prefix, [data], [predict], exe)
# get train predict value
predict_v = exe.run(static.default_main_program(),
feed={'data': np_data,
'label': np_label},
fetch_list=[predict])
return predict_v
class TestNewCustomOpSetUpInstall(unittest.TestCase):
    def setUp(self):
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        # compile, install the custom op egg into site-packages under background
        if os.name == 'nt':
-            cmd = 'cd /d {} && python setup_install_simple.py install'.format(
+            cmd = 'cd /d {} && python custom_relu_setup.py install'.format(
                cur_dir)
        else:
-            cmd = 'cd {} && python setup_install_simple.py install'.format(
-                cur_dir)
+            cmd = 'cd {} && python custom_relu_setup.py install'.format(cur_dir)
        run_cmd(cmd)

        # NOTE(Aurelius84): Normally, it's no need to add following codes for users.
@@ -110,26 +156,36 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
        else:
            site_dir = site.getsitepackages()[0]
        custom_egg_path = [
-            x for x in os.listdir(site_dir) if 'simple_setup_relu2' in x
+            x for x in os.listdir(site_dir) if 'custom_relu_module_setup' in x
        ]
        assert len(custom_egg_path) == 1, "Matched egg number is %d." % len(
            custom_egg_path)
        sys.path.append(os.path.join(site_dir, custom_egg_path[0]))
        # usage: import the package directly
-        import simple_setup_relu2
-        self.custom_ops = [simple_setup_relu2.relu2, simple_setup_relu2.relu3]
+        import custom_relu_module_setup
+        # `custom_relu_dup` is the same op as `custom_relu`
+        self.custom_ops = [
+            custom_relu_module_setup.custom_relu,
+            custom_relu_module_setup.custom_relu_dup
+        ]
        self.dtypes = ['float32', 'float64']
        self.devices = ['cpu', 'gpu']
# config seed
SEED = 2021
paddle.seed(SEED)
paddle.framework.random._manual_program_seed(SEED)
    def test_static(self):
        for device in self.devices:
            for dtype in self.dtypes:
                x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
                for custom_op in self.custom_ops:
-                    out = relu2_static(custom_op, device, dtype, x)
-                    pd_out = relu2_static(custom_op, device, dtype, x, False)
+                    out = custom_relu_static(custom_op, device, dtype, x)
+                    pd_out = custom_relu_static(custom_op, device, dtype, x,
+                                                False)
                    self.assertTrue(
                        np.array_equal(out, pd_out),
                        "custom op out: {},\n paddle api out: {}".format(
@@ -140,8 +196,9 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
            for dtype in self.dtypes:
                x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
                for custom_op in self.custom_ops:
-                    out = relu2_static_pe(custom_op, device, dtype, x)
-                    pd_out = relu2_static_pe(custom_op, device, dtype, x, False)
+                    out = custom_relu_static_pe(custom_op, device, dtype, x)
+                    pd_out = custom_relu_static_pe(custom_op, device, dtype, x,
+                                                   False)
                    self.assertTrue(
                        np.array_equal(out, pd_out),
                        "custom op out: {},\n paddle api out: {}".format(
@@ -152,9 +209,10 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
            for dtype in self.dtypes:
                x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
                for custom_op in self.custom_ops:
-                    out, x_grad = relu2_dynamic(custom_op, device, dtype, x)
-                    pd_out, pd_x_grad = relu2_dynamic(custom_op, device, dtype,
-                                                      x, False)
+                    out, x_grad = custom_relu_dynamic(custom_op, device, dtype,
+                                                      x)
+                    pd_out, pd_x_grad = custom_relu_dynamic(custom_op, device,
+                                                            dtype, x, False)
                    self.assertTrue(
                        np.array_equal(out, pd_out),
                        "custom op out: {},\n paddle api out: {}".format(
@@ -164,6 +222,28 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
                        "custom op x grad: {},\n paddle api x grad: {}".format(
                            x_grad, pd_x_grad))
def test_static_save_and_load_inference_model(self):
paddle.enable_static()
np_data = np.random.random((1, 1, 28, 28)).astype("float32")
np_label = np.random.random((1, 1)).astype("int64")
path_prefix = "custom_op_inference/custom_relu"
for device in self.devices:
predict = custom_relu_static_inference(
self.custom_ops[0], device, np_data, np_label, path_prefix)
# load inference model
with static.scope_guard(static.Scope()):
exe = static.Executor()
[inference_program, feed_target_names,
fetch_targets] = static.load_inference_model(path_prefix, exe)
predict_infer = exe.run(inference_program,
feed={feed_target_names[0]: np_data},
fetch_list=fetch_targets)
self.assertTrue(
np.array_equal(predict, predict_infer),
"custom op predict: {},\n custom op infer predict: {}".
format(predict, predict_infer))
paddle.disable_static()
if __name__ == '__main__':
    unittest.main()
@@ -15,88 +15,51 @@
import os
import subprocess
import unittest
-import paddle
import numpy as np
+import paddle
+from paddle.utils.cpp_extension import load
from paddle.utils.cpp_extension import load, get_build_directory
from paddle.utils.cpp_extension.extension_utils import run_cmd
from utils import paddle_includes, extra_compile_args
-from test_simple_custom_op_setup import relu2_dynamic, relu2_static

# Because Windows don't use docker, the shared lib already exists in the
# cache dir, it will not be compiled again unless the shared lib is removed.
if os.name == 'nt':
-    cmd = 'del {}\\simple_jit_relu2.pyd'.format(get_build_directory())
+    cmd = 'del {}\\multi_out_jit.pyd'.format(get_build_directory())
    run_cmd(cmd, True)

# Compile and load custom op Just-In-Time.
-custom_module = load(
-    name='simple_jit_relu2',
-    sources=['relu_op_simple.cc', 'relu_op_simple.cu', 'relu_op3_simple.cc'],
+multi_out_module = load(
+    name='multi_out_jit',
+    sources=['multi_out_test_op.cc'],
    extra_include_paths=paddle_includes,  # add for Coverage CI
    extra_cflags=extra_compile_args,  # add for Coverage CI
    verbose=True)
-class TestJITLoad(unittest.TestCase):
-    def setUp(self):
-        self.custom_ops = [custom_module.relu2, custom_module.relu3]
-        self.dtypes = ['float32', 'float64']
-        self.devices = ['cpu', 'gpu']
-
-    def test_static(self):
-        for device in self.devices:
-            for dtype in self.dtypes:
-                x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
-                for custom_op in self.custom_ops:
-                    out = relu2_static(custom_op, device, dtype, x)
-                    pd_out = relu2_static(custom_op, device, dtype, x, False)
-                    self.assertTrue(
-                        np.array_equal(out, pd_out),
-                        "custom op out: {},\n paddle api out: {}".format(
-                            out, pd_out))
-
-    def test_dynamic(self):
-        for device in self.devices:
-            for dtype in self.dtypes:
-                x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
-                for custom_op in self.custom_ops:
-                    out, x_grad = relu2_dynamic(custom_op, device, dtype, x)
-                    pd_out, pd_x_grad = relu2_dynamic(custom_op, device, dtype,
-                                                      x, False)
-                    self.assertTrue(
-                        np.array_equal(out, pd_out),
-                        "custom op out: {},\n paddle api out: {}".format(
-                            out, pd_out))
-                    self.assertTrue(
-                        np.array_equal(x_grad, pd_x_grad),
-                        "custom op x grad: {},\n paddle api x grad: {}".format(
-                            x_grad, pd_x_grad))
class TestMultiOutputDtypes(unittest.TestCase):
    def setUp(self):
-        self.custom_op = custom_module.relu2
+        self.custom_op = multi_out_module.multi_out
        self.dtypes = ['float32', 'float64']
-        self.devices = ['cpu', 'gpu']
+        self.devices = ['cpu']

-    def test_static(self):
-        paddle.enable_static()
-        for device in self.devices:
-            for dtype in self.dtypes:
-                res = self.run_static(device, dtype)
-                self.check_multi_outputs(res)
-        paddle.disable_static()
-
-    def test_dynamic(self):
-        for device in self.devices:
-            for dtype in self.dtypes:
-                paddle.set_device(device)
-                x_data = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
-                x = paddle.to_tensor(x_data)
-                outs = self.custom_op(x)
-                self.assertTrue(len(outs) == 3)
-                self.check_multi_outputs(outs, True)
+    def run_static(self, device, dtype):
+        paddle.set_device(device)
+        x_data = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+
+        with paddle.static.scope_guard(paddle.static.Scope()):
+            with paddle.static.program_guard(paddle.static.Program()):
+                x = paddle.static.data(name='X', shape=[None, 8], dtype=dtype)
+                outs = self.custom_op(x)
+
+                exe = paddle.static.Executor()
+                exe.run(paddle.static.default_startup_program())
+                res = exe.run(paddle.static.default_main_program(),
+                              feed={'X': x_data},
+                              fetch_list=outs)
+
+        return res
    def check_multi_outputs(self, outs, is_dynamic=False):
        out, zero_float64, one_int32 = outs
@@ -112,22 +75,24 @@ class TestMultiOutputDtypes(unittest.TestCase):
        self.assertTrue(
            np.array_equal(one_int32, np.ones([4, 8]).astype('int32')))

-    def run_static(self, device, dtype):
-        paddle.set_device(device)
-        x_data = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
-
-        with paddle.static.scope_guard(paddle.static.Scope()):
-            with paddle.static.program_guard(paddle.static.Program()):
-                x = paddle.static.data(name='X', shape=[None, 8], dtype=dtype)
-                outs = self.custom_op(x)
-
-                exe = paddle.static.Executor()
-                exe.run(paddle.static.default_startup_program())
-                res = exe.run(paddle.static.default_main_program(),
-                              feed={'X': x_data},
-                              fetch_list=outs)
-
-        return res
+    def test_static(self):
+        paddle.enable_static()
+        for device in self.devices:
+            for dtype in self.dtypes:
+                res = self.run_static(device, dtype)
+                self.check_multi_outputs(res)
+        paddle.disable_static()
+
+    def test_dynamic(self):
+        for device in self.devices:
+            for dtype in self.dtypes:
+                paddle.set_device(device)
+                x_data = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+                x = paddle.to_tensor(x_data)
+                outs = self.custom_op(x)
+                self.assertTrue(len(outs) == 3)
+                self.check_multi_outputs(outs, True)
if __name__ == '__main__':
......
@@ -558,7 +558,7 @@ def load(name,
    log_v("build_directory: {}".format(build_directory), verbose)

-    file_path = os.path.join(build_directory, "setup.py")
+    file_path = os.path.join(build_directory, "{}_setup.py".format(name))
    sources = [os.path.abspath(source) for source in sources]

    # TODO(Aurelius84): split cflags and cuda_flags
......
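For context (a sketch of the effect of the change above, not part of the diff): `load()` now writes a per-module generated script, "{}_setup.py".format(name), into the shared build directory instead of a single "setup.py", so repeated JIT loads of different ops no longer overwrite each other's generated file. This corresponds to the "fix load setup name repeat" item in the commit message. A minimal illustration, using only names that appear in this diff:

from paddle.utils.cpp_extension import load

# Each call below generates its own script in the build directory, assumed to be
# custom_relu_module_jit_setup.py and multi_out_jit_setup.py respectively,
# following the changed line above.
relu_module = load(
    name='custom_relu_module_jit',
    sources=['custom_relu_op.cc', 'custom_relu_op.cu', 'custom_relu_op_dup.cc'],
    verbose=True)
multi_out_module = load(
    name='multi_out_jit',
    sources=['multi_out_test_op.cc'],
    verbose=True)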