[with_data_parallel][part3] remove with_data_parallel in unit test (#50568)

* remove with_data_parallel in unittest
* fix CI
* remove comment
* trigger CI
* revert part changes
* test_build_strategy_fusion_group_pass

[with_data_parallel][part3] remove with_data_parallel in unit test (#50568)
* remove with_data_parallel in unittest
* fix CI
* remove comment
* trigger CI
* revert part changes
* test_build_strategy_fusion_group_pass
b958fa75 · kangguangli · GitHub · 499b7f87 · b958fa75 · b958fa75
31 changed files
--- a/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_executor.py
+++ b/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_executor.py
@@ -235,9 +235,7 @@ class SwitchExecutorInterfaceWithFeed(unittest.TestCase):
        exe.run(startup_program)

        if use_compiled:
-            main_program = paddle.static.CompiledProgram(
-                main_program
-            ).with_data_parallel(fetch_vars[0].name, places=[self.place])
+            main_program = paddle.static.CompiledProgram(main_program)

        if use_str:  # test for fetch name
            fetch_vars = [x.name for x in fetch_vars]

--- a/python/paddle/fluid/tests/unittests/test_build_strategy_fusion_group_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_build_strategy_fusion_group_pass.py
@@ -32,7 +32,7 @@ class FusionGroupPaddingRNNTest(PaddingRNNTestBase):
        rnn_model = "static"
        config = RNNConfig("test", rnn_model)
        with fluid.scope_guard(fluid.Scope()):
-            self.train(config, parallel=True, use_program_cache=False)
+            self.train(config, use_program_cache=False)


 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/unittests/test_compiled_program.py
+++ b/python/paddle/fluid/tests/unittests/test_compiled_program.py
@@ -72,30 +72,6 @@ class TestCompiledProgram(unittest.TestCase):
            )
            np.testing.assert_array_equal(loss_data[0], self.loss)

-    def test_compiled_program_with_data_parallel(self):
-        with new_program_scope():
-            paddle.seed(self.seed)
-            paddle.framework.random._manual_program_seed(self.seed)
-            place = (
-                fluid.CUDAPlace(0)
-                if core.is_compiled_with_cuda()
-                else fluid.CPUPlace()
-            )
-            exe = fluid.Executor(place)
-
-            loss = simple_fc_net()
-            exe.run(fluid.default_startup_program())
-            compiled_prog = fluid.CompiledProgram(
-                fluid.default_main_program()
-            ).with_data_parallel(loss_name=loss.name, places=[place])
-
-            (loss_data,) = exe.run(
-                compiled_prog,
-                feed={"image": self.img, "label": self.label},
-                fetch_list=[loss.name],
-            )
-            np.testing.assert_array_equal(loss_data[0], self.loss)
-

 class TestCompiledProgramError(unittest.TestCase):
    def test_program_or_graph_error(self):
@@ -112,17 +88,6 @@ class TestCompiledProgramError(unittest.TestCase):
        )
        avg_loss = paddle.mean(loss)

-    def compile_program_not_compiled(self):
-        with fluid.program_guard(fluid.Program()):
-            # build model
-            self.build_simple_model()
-            # compile program
-            program = fluid.default_main_program()
-            compiled_program = fluid.CompiledProgram(
-                program
-            ).with_data_parallel()
-            return compiled_program
-
    def compile_program(self):
        with fluid.program_guard(fluid.Program()):
            # build model
@@ -149,34 +114,6 @@ class TestCompiledProgramError(unittest.TestCase):
            with self.assertRaises(ValueError):
                compiled_program._compile(scope, new_place)

-    def test_share_vars_from_error_no_parallel(self):
-        with fluid.program_guard(fluid.Program()):
-            source_program, _, _ = self.compile_program()
-            self.build_simple_model()
-            # compile program
-            program = fluid.default_main_program()
-            compiled_program = fluid.CompiledProgram(
-                program
-            ).with_data_parallel(share_vars_from=source_program)
-            scope = fluid.global_scope()
-            place = fluid.CPUPlace()
-            with self.assertRaises(ValueError):
-                compiled_program._compile(scope, place)
-
-    def test_share_vars_from_error_no_executor(self):
-        with fluid.program_guard(fluid.Program()):
-            source_program = self.compile_program_not_compiled()
-            self.build_simple_model()
-            # compile program
-            program = fluid.default_main_program()
-            compiled_program = fluid.CompiledProgram(
-                program
-            ).with_data_parallel(share_vars_from=source_program)
-            scope = fluid.global_scope()
-            place = fluid.CPUPlace()
-            with self.assertRaises(ValueError):
-                compiled_program._compile(scope, place)
-

 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_cuda_graph_static_mode.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_graph_static_mode.py
@@ -111,9 +111,7 @@ class TestCUDAGraphInStaticMode(unittest.TestCase):
            build_strategy.fix_op_run_order = True
            build_strategy.fuse_all_optimizer_ops = True
            compiled_program = paddle.static.CompiledProgram(
-                main
-            ).with_data_parallel(
-                loss_name=loss.name, build_strategy=build_strategy, places=place
+                main, build_strategy=build_strategy
            )
            image_t = scope.var(image.name).get_tensor()
            label_t = scope.var(label.name).get_tensor()

--- a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
@@ -106,13 +106,13 @@ class DatasetLoaderTestBase(unittest.TestCase):
        dataset._set_batch_size(BATCH_SIZE)

        if isinstance(place, fluid.CPUPlace):
-            file_num = 10
+            file_num = 1
            os.environ['CPU_NUM'] = str(file_num)
-            places = fluid.cpu_places()
+            places = [fluid.CPUPlace()]
            use_cuda = False
        else:
-            file_num = fluid.core.get_cuda_device_count()
-            places = fluid.cuda_places()
+            file_num = 1
+            places = [fluid.CUDAPlace(0)]
            use_cuda = True

        filelist = []
@@ -145,7 +145,7 @@ class DatasetLoaderTestBase(unittest.TestCase):
        dataloader = fluid.io.DataLoader.from_dataset(
            dataset=dataset, places=places, drop_last=self.drop_last
        )
-        prog = fluid.CompiledProgram(main_prog).with_data_parallel()
+        prog = fluid.CompiledProgram(main_prog)
        exe = fluid.Executor(place)

        exe.run(startup_prog)

--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py
@@ -19,7 +19,6 @@ import numpy as np
 os.environ['FLAGS_use_mkldnn'] = '0'
 os.environ['CPU_NUM'] = '4'

-import multiprocessing
 import unittest
 from functools import reduce

@@ -82,13 +81,6 @@ class TestExecutor(unittest.TestCase):
                    with fluid.unique_name.guard():
                        self.executor_main()

-        for p in places:
-            self.place = p
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                with fluid.scope_guard(fluid.Scope()):
-                    with fluid.unique_name.guard():
-                        self.pe_main()
-
    def prepare_feed(self, image, label, dev_cnt=1):
        batch_size = 32 * dev_cnt
        image_shape = (batch_size,) + tuple(image.shape[1:])
@@ -179,48 +171,6 @@ class TestExecutor(unittest.TestCase):
                fluid.global_scope(), persistables, non_persistables
            )

-    def pe_main(self):
-        image, label, loss = simple_fc_net()
-        loss.persistable = False
-        persistables, non_persistables = get_persistables_and_non_persistables(
-            fluid.default_main_program(), [loss.name]
-        )
-        self.assert_gc_vars(
-            fluid.default_main_program(), [loss.name], non_persistables
-        )
-
-        exe = fluid.Executor(self.place)
-        exe.run(fluid.default_startup_program())
-
-        exec_strategy = fluid.ExecutionStrategy()
-        exec_strategy.num_iteration_per_drop_scope = 100
-
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.memory_optimize = False
-        build_strategy.enable_inplace = False
-
-        prog = fluid.CompiledProgram(
-            fluid.default_main_program()
-        ).with_data_parallel(loss_name=loss.name, exec_strategy=exec_strategy)
-
-        dev_cnt = (
-            fluid.core.get_cuda_device_count()
-            if isinstance(self.place, fluid.CUDAPlace)
-            else int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-        )
-
-        for idx in range(10):
-            image_np, label_np = self.prepare_feed(image, label, dev_cnt)
-            feed = {image.name: image_np, label.name: label_np}
-
-            exe.run(program=prog, feed=feed, fetch_list=[loss])
-
-            local_scopes = prog._local_scopes
-            for scope in local_scopes:
-                kids = scope._kids()
-                self.assertTrue(len(kids) == 1)
-                self.assertScopeVar(kids[0], persistables, non_persistables)
-

 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
@@ -23,20 +23,13 @@ from fake_reader import fake_imdb_reader
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-from paddle.fluid import compiler


-def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
+def train(network, use_cuda, batch_size=32, pass_num=2):
    if use_cuda and not core.is_compiled_with_cuda():
        print('Skip use_cuda=True because Paddle is not compiled with cuda')
        return

-    if use_parallel_executor and os.name == 'nt':
-        print(
-            'Skip use_parallel_executor=True because Paddle comes without parallel support on windows'
-        )
-        return
-
    word_dict_size = 5147
    reader = fake_imdb_reader(word_dict_size, batch_size * 40)
    train_reader = paddle.batch(reader, batch_size=batch_size)
@@ -54,9 +47,7 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-    reader = feeder.decorate_reader(
-        train_reader, multi_devices=use_parallel_executor
-    )
+    reader = feeder.decorate_reader(train_reader, multi_devices=False)

    exe = fluid.Executor(place)
    fluid.default_startup_program().random_seed = 1
@@ -64,13 +55,7 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
    exe.run(fluid.default_startup_program())

    train_cp = fluid.default_main_program()
-    if use_parallel_executor:
-        train_cp = compiler.CompiledProgram(
-            fluid.default_main_program()
-        ).with_data_parallel(loss_name=cost.name)
-        fetch_list = [cost.name]
-    else:
-        fetch_list = [cost]
+    fetch_list = [cost]

    for pass_id in range(pass_num):
        batch_id = 0
@@ -94,12 +79,9 @@ class TestBase(unittest.TestCase):
            return

        for use_cuda in [True, False]:
-            for use_parallel_executor in [False, True]:
-                print(
-                    'network: {}, use_cuda: {}, use_parallel_executor: {}'.format(
-                        self.net.__name__, use_cuda, use_parallel_executor
-                    )
-                )
-                with fluid.program_guard(fluid.Program(), fluid.Program()):
-                    with fluid.scope_guard(core.Scope()):
-                        train(self.net, use_cuda, use_parallel_executor)
+            print(
+                'network: {}, use_cuda: {}'.format(self.net.__name__, use_cuda)
+            )
+            with fluid.program_guard(fluid.Program(), fluid.Program()):
+                with fluid.scope_guard(core.Scope()):
+                    train(self.net, use_cuda)
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
@@ -473,7 +473,7 @@ class PaddingRNNTestBase(unittest.TestCase):
        # You can override the function to set your own config.
        pass

-    def _prepare_program(self, config, parallel=True):
+    def _prepare_program(self, config):
        paddle.seed(config.random_seed)
        self.main_program = fluid.Program()
        self.startup_program = fluid.Program()
@@ -517,16 +517,7 @@ class PaddingRNNTestBase(unittest.TestCase):

        self.exe.run(self.startup_program)

-        if parallel:
-            self.train_program = fluid.compiler.CompiledProgram(
-                self.main_program
-            ).with_data_parallel(
-                loss_name=self.loss.name,
-                build_strategy=self.build_strategy,
-                exec_strategy=self.exec_strategy,
-            )
-        else:
-            self.train_program = self.main_program
+        self.train_program = self.main_program

    def _generate_init_data(self):
        init_hidden = np.zeros(
@@ -621,29 +612,27 @@ class PaddingRNNTestBase(unittest.TestCase):
            ppl = np.append(ppl, batch_ppl)
        return ppl

-    def train(self, config, parallel=True, use_program_cache=True):
+    def train(self, config, use_program_cache=True):
        self.set_customed_config()

        self.config = config
-        self._prepare_program(config, parallel)
+        self._prepare_program(config)
        ppl = np.zeros(shape=(0, config.batch_size))
        for epoch_id in range(config.max_epoch):
            train_ppl = self._train_an_epoch(epoch_id, use_program_cache)
            ppl = np.append(ppl, train_ppl)
        return ppl

-    def compare_padding_static_mode(
-        self, parallel=True, use_program_cache=True
-    ):
+    def compare_padding_static_mode(self, use_program_cache=True):
        '''
        Test that train ppl of padding mode is same to that of static graph mode
        '''
        config = RNNConfig('test', 'padding')
        with fluid.scope_guard(fluid.Scope()):
-            padding_rnn_ppl = self.train(config, parallel, use_program_cache)
+            padding_rnn_ppl = self.train(config, use_program_cache)
        config = RNNConfig('test', 'static')
        with fluid.scope_guard(fluid.Scope()):
-            static_rnn_ppl = self.train(config, parallel, use_program_cache)
+            static_rnn_ppl = self.train(config, use_program_cache)
        np.testing.assert_allclose(padding_rnn_ppl, static_rnn_ppl, rtol=0.001)


@@ -654,7 +643,7 @@ class EagerDeletionPaddingRNNTest(PaddingRNNTestBase):
        '''
        fluid.core._set_eager_deletion_mode(-1.0, 1.0, True)
        # When parallel is True, use_program_cache does not make a difference.
-        self.compare_padding_static_mode(parallel=True, use_program_cache=True)
+        self.compare_padding_static_mode(use_program_cache=True)

    def test_padding_mode_eager_deletion(self):
        '''
@@ -662,7 +651,7 @@ class EagerDeletionPaddingRNNTest(PaddingRNNTestBase):
        '''
        fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
        # When parallel is True, use_program_cache does not make a difference.
-        self.compare_padding_static_mode(parallel=True, use_program_cache=True)
+        self.compare_padding_static_mode(use_program_cache=True)


 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py
@@ -16,14 +16,12 @@ import os

 os.environ['CPU_NUM'] = '2'

-import multiprocessing
 import unittest

 import numpy

 import paddle
 import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
 import paddle.fluid.core as core
 import paddle.fluid.layers as layers
 from paddle.fluid.executor import Executor
@@ -41,30 +39,19 @@ class TestEagerDeletionWhileOpBase(unittest.TestCase):
            places.append(core.CUDAPlace(0))

        for p in places:
-            for with_data_parallel in [False, True]:
-                with fluid.program_guard(fluid.Program(), fluid.Program()):
-                    with fluid.scope_guard(fluid.Scope()):
-                        self.run_main(p, with_data_parallel)
+            with fluid.program_guard(fluid.Program(), fluid.Program()):
+                with fluid.scope_guard(fluid.Scope()):
+                    self.run_main(p)

-    def run_main(self, place, with_data_parallel):
+    def run_main(self, place):
        self.place = place
-        self.with_data_parallel = with_data_parallel

        if not core.is_compiled_with_cuda() and isinstance(
            self.place, core.CUDAPlace
        ):
            return

-        if isinstance(self.place, core.CUDAPlace):
-            device_cnt = (
-                core.get_cuda_device_count() if self.with_data_parallel else 1
-            )
-        else:
-            device_cnt = (
-                int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                if self.with_data_parallel
-                else 1
-            )
+        device_cnt = 1

        d0 = paddle.static.data("d0", shape=[-1, 10], dtype='float32')
        d1 = paddle.static.data("d1", shape=[-1, 10], dtype='float32')
@@ -139,19 +126,12 @@ class TestEagerDeletionWhileOpBase(unittest.TestCase):
        exe.run(fluid.default_startup_program())

        prog = fluid.default_main_program()
-        if self.with_data_parallel:
-            prog = compiler.CompiledProgram(
-                fluid.default_main_program()
-            ).with_data_parallel(loss_name=loss.name)

        for _ in range(5):
            d = []
            for i in range(3):
                tmp = numpy.random.random(size=[10]).astype('float32')
-                if not self.with_data_parallel:
-                    d.append(tmp)
-                else:
-                    d.append(numpy.array([tmp] * device_cnt))
+                d.append(numpy.array([tmp] * device_cnt))

            outs = exe.run(
                program=prog,

--- a/python/paddle/fluid/tests/unittests/test_executor_and_use_program_cache.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_and_use_program_cache.py
@@ -83,11 +83,11 @@ class TestExecutor(unittest.TestCase):

 class ExecutorPaddingRNNTest(PaddingRNNTestBase):
    def train_and_save_inference_program(
-        self, rnn_model="static", parallel=True, use_program_cache=True
+        self, rnn_model="static", use_program_cache=True
    ):
        config = RNNConfig("test", rnn_model)
        with fluid.scope_guard(fluid.Scope()):
-            self.train(config, parallel, use_program_cache)
+            self.train(config, use_program_cache)
            fluid.io.save_inference_model(
                main_program=self.main_program,
                feeded_var_names=self.feed_order,
@@ -101,7 +101,7 @@ class ExecutorPaddingRNNTest(PaddingRNNTestBase):
        for rnn_model in ["static", "padding"]:
            # Set parallel to False to use the default executor.
            self.train_and_save_inference_program(
-                rnn_model=rnn_model, parallel=True, use_program_cache=True
+                rnn_model=rnn_model, use_program_cache=True
            )

            x_np = np.random.random(

--- a/python/paddle/fluid/tests/unittests/test_executor_check_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_check_feed.py
@@ -64,9 +64,7 @@ class TestExecutor(unittest.TestCase):
                exe = fluid.Executor(cpu)
                lr, cost = self.net()
                exe.run(startup_program)
-                compiled_prog = fluid.CompiledProgram(
-                    main_program
-                ).with_data_parallel(loss_name=cost.name)
+                compiled_prog = fluid.CompiledProgram(main_program)
                train_data = [[1.0], [2.0], [3.0], [4.0]]
                y_true = [[2.0], [4.0], [6.0], [8.0]]
                a = 0

--- a/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py
@@ -119,9 +119,7 @@ class TestExecutor(unittest.TestCase):
                cpu = fluid.CPUPlace()
                exe = fluid.Executor(cpu)
                exe.run(startup_program)
-                compiled_prog = fluid.CompiledProgram(
-                    main_program
-                ).with_data_parallel(loss_name=cost.name)
+                compiled_prog = fluid.CompiledProgram(main_program)
                train_data = numpy.array([[1.0], [2.0], [3.0], [4.0]]).astype(
                    'float32'
                )

--- a/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py
@@ -72,28 +72,20 @@ class TestExecutorReturnTensorNotOverOverwritingWithLayers(unittest.TestCase):
    def setUp(self):
        pass

-    def calc_add_out(self, place=None, parallel=None):
+    def calc_add_out(self, place=None):
        x = paddle.ones(shape=[3, 3], dtype='float32')
        y = paddle.ones(shape=[3, 3], dtype='float32')
        out = paddle.add(x=x, y=y)
        program = fluid.default_main_program()
-        if parallel:
-            program = fluid.CompiledProgram(program).with_data_parallel(
-                places=place
-            )
        exe = fluid.Executor(place)
        out = exe.run(program, fetch_list=[out], return_numpy=False)
        return out

-    def calc_sub_out(self, place=None, parallel=None):
+    def calc_sub_out(self, place=None):
        x = paddle.ones(shape=[2, 2], dtype='float32')
        y = paddle.ones(shape=[2, 2], dtype='float32')
        out = paddle.subtract(x=x, y=y)
        program = fluid.default_main_program()
-        if parallel:
-            program = fluid.CompiledProgram(program).with_data_parallel(
-                places=place
-            )
        exe = fluid.Executor(place)
        out = exe.run(program, fetch_list=[out], return_numpy=False)
        return out
@@ -104,12 +96,11 @@ class TestExecutorReturnTensorNotOverOverwritingWithLayers(unittest.TestCase):
            places.append(fluid.CUDAPlace(0))

        for place in places:
-            for parallel in [True, False]:
-                add_out = self.calc_add_out(place, parallel)
-                add_out1 = np.array(add_out[0])
-                sub_out = self.calc_sub_out(place, parallel)
-                add_out2 = np.array(add_out[0])
-                np.testing.assert_array_equal(add_out1, add_out2)
+            add_out = self.calc_add_out(place)
+            add_out1 = np.array(add_out[0])
+            sub_out = self.calc_sub_out(place)
+            add_out2 = np.array(add_out[0])
+            np.testing.assert_array_equal(add_out1, add_out2)


 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py
+++ b/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py
@@ -20,7 +20,6 @@ import numpy as np

 import paddle
 import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
 import paddle.fluid.core as core

 os.environ['CPU_NUM'] = str(4)
@@ -46,16 +45,12 @@ class TestFeedData(unittest.TestCase):
            else int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
        )

-    def _get_feed_batch_size(self, use_cuda, use_parallel_executor):
+    def _get_feed_batch_size(self, use_cuda):
        """
        Returns actual fed data size. We should multiple the number of
        devices when it is using ParallelExecutor
        """
-        return (
-            self.data_batch_size * self._get_device_count(use_cuda)
-            if use_parallel_executor
-            else self.data_batch_size
-        )
+        return self.data_batch_size

    def _simple_fc_net(self, in_size, label_size, class_num, hidden_sizes):
        in_data = fluid.data(name="data", dtype='float32', shape=in_size)
@@ -85,57 +80,45 @@ class TestFeedData(unittest.TestCase):
        for use_cuda in (
            [True, False] if core.is_compiled_with_cuda() else [False]
        ):
-            for use_parallel_executor in [False, True]:
-                print('Test Parameters:'),
-                print(
-                    {
-                        'use_cuda': use_cuda,
-                        'use_parallel_executor': use_parallel_executor,
-                    }
-                )
-                # Test feeding without error
-                self._test_feed_data_match_shape_type(
-                    use_cuda, use_parallel_executor
-                )
-                self._test_feed_data_contains_neg_one(
-                    use_cuda, use_parallel_executor
-                )
-                self._test_feed_lod_tensor(use_cuda, use_parallel_executor)
-
-                # Test exception message when feeding with error
-                in_shape_tuple = (-1, 3, 4, 8)
-                error_shape_list = [self.data_batch_size, 3, 4, 5]
-
-                with self.assertRaises(ValueError) as shape_mismatch_err:
-                    self._test_feed_data_shape_mismatch(
-                        use_cuda, use_parallel_executor
-                    )
-                self.assertEqual(
-                    str(shape_mismatch_err.exception),
-                    "The fed Variable %r should have dimensions = %r, "
-                    "shape = %r, but received fed shape %r on each device"
-                    % (
-                        'data',
-                        len(in_shape_tuple),
-                        in_shape_tuple,
-                        error_shape_list,
-                    ),
-                )
-
-                with self.assertRaises(ValueError) as dtype_mismatch_err:
-                    self._test_feed_data_dtype_mismatch(
-                        use_cuda, use_parallel_executor
-                    )
-                self.assertEqual(
-                    str(dtype_mismatch_err.exception),
-                    "The data type of fed Variable %r must be 'int64', but "
-                    "received 'float64'" % ('label'),
-                )
-
-    def _test_feed_data_dtype_mismatch(self, use_cuda, use_parallel_executor):
-        feed_batch_size = self._get_feed_batch_size(
-            use_cuda, use_parallel_executor
-        )
+            print('Test Parameters:'),
+            print(
+                {
+                    'use_cuda': use_cuda,
+                }
+            )
+            # Test feeding without error
+            self._test_feed_data_match_shape_type(use_cuda)
+            self._test_feed_data_contains_neg_one(use_cuda)
+            self._test_feed_lod_tensor(use_cuda)
+
+            # Test exception message when feeding with error
+            in_shape_tuple = (-1, 3, 4, 8)
+            error_shape_list = [self.data_batch_size, 3, 4, 5]
+
+            with self.assertRaises(ValueError) as shape_mismatch_err:
+                self._test_feed_data_shape_mismatch(use_cuda)
+            self.assertEqual(
+                str(shape_mismatch_err.exception),
+                "The fed Variable %r should have dimensions = %r, "
+                "shape = %r, but received fed shape %r on each device"
+                % (
+                    'data',
+                    len(in_shape_tuple),
+                    in_shape_tuple,
+                    error_shape_list,
+                ),
+            )
+
+            with self.assertRaises(ValueError) as dtype_mismatch_err:
+                self._test_feed_data_dtype_mismatch(use_cuda)
+            self.assertEqual(
+                str(dtype_mismatch_err.exception),
+                "The data type of fed Variable %r must be 'int64', but "
+                "received 'float64'" % ('label'),
+            )
+
+    def _test_feed_data_dtype_mismatch(self, use_cuda):
+        feed_batch_size = self._get_feed_batch_size(use_cuda)
        in_size = [self.data_batch_size, 3, 4, 5]
        feed_in_data = np.random.uniform(
            size=[feed_batch_size, 3, 4, 5]
@@ -150,11 +133,10 @@ class TestFeedData(unittest.TestCase):
            feed_in_data,
            feed_label,
            use_cuda,
-            use_parallel_executor,
        )

-    def _test_feed_data_shape_mismatch(self, use_cuda, use_parallel_executor):
-        batch_size = self._get_feed_batch_size(use_cuda, use_parallel_executor)
+    def _test_feed_data_shape_mismatch(self, use_cuda):
+        batch_size = self._get_feed_batch_size(use_cuda)
        in_size = [None, 3, 4, 8]
        feed_in_data = np.random.uniform(size=[batch_size, 3, 4, 5]).astype(
            np.float32
@@ -169,11 +151,10 @@ class TestFeedData(unittest.TestCase):
            feed_in_data,
            feed_label,
            use_cuda,
-            use_parallel_executor,
        )

-    def _test_feed_data_contains_neg_one(self, use_cuda, use_parallel_executor):
-        batch_size = self._get_feed_batch_size(use_cuda, use_parallel_executor)
+    def _test_feed_data_contains_neg_one(self, use_cuda):
+        batch_size = self._get_feed_batch_size(use_cuda)
        in_size = [-1, 3, 4, 5]
        feed_in_data = np.random.uniform(size=[batch_size, 3, 4, 5]).astype(
            np.float32
@@ -188,13 +169,10 @@ class TestFeedData(unittest.TestCase):
            feed_in_data,
            feed_label,
            use_cuda,
-            use_parallel_executor,
        )

-    def _test_feed_data_match_shape_type(self, use_cuda, use_parallel_executor):
-        feed_batch_size = self._get_feed_batch_size(
-            use_cuda, use_parallel_executor
-        )
+    def _test_feed_data_match_shape_type(self, use_cuda):
+        feed_batch_size = self._get_feed_batch_size(use_cuda)
        in_size = [self.data_batch_size, 3, 4, 5]
        feed_in_data = np.random.uniform(
            size=[feed_batch_size, 3, 4, 5]
@@ -209,10 +187,9 @@ class TestFeedData(unittest.TestCase):
            feed_in_data,
            feed_label,
            use_cuda,
-            use_parallel_executor,
        )

-    def _test_feed_lod_tensor(self, use_cuda, use_parallel_executor):
+    def _test_feed_lod_tensor(self, use_cuda):
        device_count = self._get_device_count(use_cuda)

        in_size = [device_count, 3, 4, 5]
@@ -241,7 +218,6 @@ class TestFeedData(unittest.TestCase):
            feed_data_tensor,
            feed_label_tensor,
            use_cuda,
-            use_parallel_executor,
        )

    def _feed_data_in_executor(
@@ -251,7 +227,6 @@ class TestFeedData(unittest.TestCase):
        feed_in_data,
        feed_label,
        use_cuda,
-        use_parallel_executor,
    ):

        startup_program = fluid.Program()
@@ -268,10 +243,6 @@ class TestFeedData(unittest.TestCase):
        exe.run(startup_program)

        train_program = main_program
-        if use_parallel_executor:
-            train_program = compiler.CompiledProgram(
-                main_program
-            ).with_data_parallel(loss_name=loss.name)

        for i in range(self.iterations):
            fetches = exe.run(

--- a/python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py
+++ b/python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import os
 import unittest

 import numpy as np
@@ -44,7 +43,6 @@ class TestFetchLoDTensorArray(unittest.TestCase):
                return loss, array

    def check_network(self, use_cuda=True):
-        os.environ["CPU_NUM"] = str(2)
        main_program = fluid.Program()
        startup_program = fluid.Program()

@@ -60,35 +58,15 @@ class TestFetchLoDTensorArray(unittest.TestCase):
        feed_dict = {'image': image, 'label': label}

        build_strategy = fluid.BuildStrategy()
-        binary = fluid.CompiledProgram(main_program).with_data_parallel(
-            loss_name=loss.name, build_strategy=build_strategy
+        binary = fluid.CompiledProgram(
+            main_program, build_strategy=build_strategy
        )

-        device_num = fluid.core.get_cuda_device_count() if use_cuda else 2
        for _ in range(3):
            loss_v, array_v = exe.run(
-                binary,
-                feed=feed_dict,
-                fetch_list=[loss, array],
-                return_merged=False,
+                binary, feed=feed_dict, fetch_list=[loss, array]
            )
-            self.assertEqual(np.array(loss_v).shape, (device_num, 1))
-            self.assertEqual(
-                np.array(array_v[0][0]).shape, (batch_size / device_num, 784)
-            )
-            self.assertEqual(
-                np.array(array_v[0][1]).shape, (batch_size / device_num, 1)
-            )
-            self.assertEqual(np.array(array_v[0][2]).shape, (1,))
-
-        for _ in range(3):
-            loss_v, array_v = exe.run(
-                binary,
-                feed=feed_dict,
-                fetch_list=[loss, array],
-                return_merged=True,
-            )
-            self.assertEqual(np.array(loss_v).shape, (device_num,))
+            self.assertEqual(np.array(loss_v).shape, (1,))
            self.assertEqual(np.array(array_v[0]).shape, (batch_size, 784))
            self.assertEqual(np.array(array_v[1]).shape, (batch_size, 1))
            np.testing.assert_allclose(loss_v, array_v[2], rtol=1e-05)
@@ -98,13 +76,6 @@ class TestFetchLoDTensorArray(unittest.TestCase):
            self.check_network(use_cuda=True)
        self.check_network(use_cuda=False)

-    def test_fetch_unmerged_parallel_graph(self):
-        fluid.core.globals()['FLAGS_enable_parallel_graph'] = True
-        if fluid.core.is_compiled_with_cuda():
-            self.check_network(use_cuda=True)
-        self.check_network(use_cuda=False)
-        fluid.core.globals()['FLAGS_enable_parallel_graph'] = False
-

 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
@@ -83,8 +83,8 @@ class TestFuseBatchNormActPass(unittest.TestCase):
        # close fused_bn_act_ops
        build_strategy = fluid.BuildStrategy()
        build_strategy.fuse_bn_act_ops = False
-        binary = fluid.CompiledProgram(main_program).with_data_parallel(
-            loss_name=loss.name, build_strategy=build_strategy
+        binary = fluid.CompiledProgram(
+            main_program, build_strategy=build_strategy
        )
        train_reader = paddle.batch(
            paddle.dataset.mnist.train(), batch_size=batch_size
@@ -103,8 +103,8 @@ class TestFuseBatchNormActPass(unittest.TestCase):
        # open fused_bn_act_ops
        build_strategy_fused = fluid.BuildStrategy()
        build_strategy_fused.fuse_bn_act_ops = True
-        binary_fused = fluid.CompiledProgram(main_program).with_data_parallel(
-            loss_name=loss.name, build_strategy=build_strategy_fused
+        binary_fused = fluid.CompiledProgram(
+            main_program, build_strategy=build_strategy_fused
        )
        train_reader_fused = paddle.batch(
            paddle.dataset.mnist.train(), batch_size=batch_size

--- a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py
@@ -198,8 +198,8 @@ class TestFusedBnAddActAPI(unittest.TestCase):
        )
        build_strategy_fused = fluid.BuildStrategy()
        build_strategy_fused.fuse_bn_add_act_ops = True
-        binary_fused = fluid.CompiledProgram(main_program).with_data_parallel(
-            loss_name=loss.name, build_strategy=build_strategy_fused
+        binary_fused = fluid.CompiledProgram(
+            main_program, build_strategy=build_strategy_fused
        )
        exe = fluid.Executor(place)
        loss_vals_fused = []
@@ -221,8 +221,8 @@ class TestFusedBnAddActAPI(unittest.TestCase):
        # build_origin_program: turn off fused_bn_act_ops
        build_strategy = fluid.BuildStrategy()
        build_strategy.fuse_bn_add_act_ops = False
-        binary = fluid.CompiledProgram(main_program).with_data_parallel(
-            loss_name=loss.name, build_strategy=build_strategy_fused
+        binary = fluid.CompiledProgram(
+            main_program, build_strategy=build_strategy
+        )
        loss_vals = []
        scope = fluid.Scope()

--- a/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py
@@ -146,11 +146,8 @@ class TestFuseGemmEpilogueFWDBase(unittest.TestCase):
    def _test_output(self):
        build_strategy = paddle.static.BuildStrategy()
        build_strategy.fuse_gemm_epilogue = True
-        program = paddle.static.CompiledProgram(self.main_prog)
-        program = program.with_data_parallel(
-            loss_name=self.loss.name,
-            build_strategy=build_strategy,
-            places=paddle.static.cuda_places(),
+        program = paddle.static.CompiledProgram(
+            self.main_prog, build_strategy=build_strategy
        )

        result = self.exe.run(
@@ -332,11 +329,8 @@ class TestFuseGemmEpilogueBWDBase(unittest.TestCase):
    def _test_output(self):
        build_strategy = paddle.static.BuildStrategy()
        build_strategy.fuse_gemm_epilogue = True
-        program = paddle.static.CompiledProgram(self.main_prog)
-        program = program.with_data_parallel(
-            loss_name=self.loss.name,
-            build_strategy=build_strategy,
-            places=paddle.static.cuda_places(),
+        program = paddle.static.CompiledProgram(
+            self.main_prog, build_strategy=build_strategy
        )

        outs_res = self.exe.run(program, feed=self.feed, fetch_list=self.fetch)

--- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py
+++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
@@ -238,9 +238,7 @@ class TestInstance(unittest.TestCase):

        # will print warning message

-        cp_prog = CompiledProgram(program).with_data_parallel(
-            loss_name=avg_cost.name
-        )
+        cp_prog = CompiledProgram(program)

        save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, cp_prog)
        self.assertRaises(

--- a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py
@@ -97,9 +97,7 @@ class TestInplaceAddto(unittest.TestCase):

            strategy = fluid.BuildStrategy()
            strategy.enable_addto = enable_addto
-            compiled = fluid.CompiledProgram(main).with_data_parallel(
-                loss_name=loss.name, build_strategy=strategy
-            )
+            compiled = fluid.CompiledProgram(main, build_strategy=strategy)

            exe.run(startup)
            img = np.random.uniform(-128, 128, [8, 3, 224, 224]).astype(

--- a/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py
+++ b/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py
@@ -64,9 +64,7 @@ class TestSoftmaxWithXe(unittest.TestCase):
                build_strategy = fluid.BuildStrategy()
                build_strategy.enable_inplace = inplace
                prog = fluid.CompiledProgram(
-                    fluid.default_main_program()
-                ).with_data_parallel(
-                    build_strategy=build_strategy, places=place
+                    fluid.default_main_program(), build_strategy=build_strategy
                )

                fetch_list = [z_d.name, s_d.name]

--- a/python/paddle/fluid/tests/unittests/test_memory_reuse_exclude_feed_var.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_reuse_exclude_feed_var.py
@@ -41,8 +41,8 @@ class TestMemoryReuseExcludeFeedVar(unittest.TestCase):
        exe.run(fluid.default_startup_program())

        compiled_prog = fluid.CompiledProgram(
-            fluid.default_main_program()
-        ).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy)
+            fluid.default_main_program(), build_strategy=build_strategy
+        )

        image_tensor = fluid.LoDTensor()
        np_image = np.random.uniform(

--- a/python/paddle/fluid/tests/unittests/test_reader_reset.py
+++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py
@@ -68,9 +68,7 @@ class TestReaderReset(unittest.TestCase):
            paddle.batch(self.prepare_data(), batch_size=self.batch_size)
        )

-        train_cp = compiler.CompiledProgram(main_prog).with_data_parallel(
-            places=[place]
-        )
+        train_cp = compiler.CompiledProgram(main_prog)

        batch_id = 0
        pass_count = 0

--- a/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py
+++ b/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py
@@ -90,9 +90,7 @@ class TestResnet50Accuracy(unittest.TestCase):
        loss = self.build_program(main_program, startup_program)
        exe = paddle.static.Executor(place)

-        compiled_prog = paddle.static.CompiledProgram(
-            main_program
-        ).with_data_parallel(loss_name=loss.name)
+        compiled_prog = paddle.static.CompiledProgram(main_program)
        loss_vals = []
        scope = paddle.static.Scope()


--- a/python/paddle/fluid/tests/unittests/test_weight_decay.py
+++ b/python/paddle/fluid/tests/unittests/test_weight_decay.py
@@ -126,11 +126,7 @@ class TestWeightDecay(unittest.TestCase):
        build_strategy.memory_optimize = use_ir_memory_optimize

        train_cp = compiler.CompiledProgram(
-            fluid.default_main_program()
-        ).with_data_parallel(
-            loss_name=loss.name,
-            exec_strategy=exec_strategy,
-            build_strategy=build_strategy,
+            fluid.default_main_program(), build_strategy=build_strategy
        )

        loss_set = []

--- a/python/paddle/static/quantization/tests/test_graph.py
+++ b/python/paddle/static/quantization/tests/test_graph.py
@@ -76,11 +76,11 @@ class TestGraph(unittest.TestCase):
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        origin_binary = paddle.static.CompiledProgram(
-            graph.graph
-        ).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy)
+            graph.graph, build_strategy=build_strategy
+        )
        backup_binary = paddle.static.CompiledProgram(
-            backup_graph.graph
-        ).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy)
+            backup_graph.graph, build_strategy=build_strategy
+        )
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(startup)

--- a/python/paddle/static/quantization/tests/test_moving_average_abs_max_scale_op.py
+++ b/python/paddle/static/quantization/tests/test_moving_average_abs_max_scale_op.py
@@ -70,9 +70,7 @@ class TestMovingAverageAbsMaxScaleOp(unittest.TestCase):
        exe = paddle.static.Executor(place)
        exe.run(startup_program)

-        binary = paddle.static.CompiledProgram(main_program).with_data_parallel(
-            loss_name=loss.name
-        )
+        binary = paddle.static.CompiledProgram(main_program)

        img, label = init_data()
        feed_dict = {"image": img, "label": label}

--- a/python/paddle/static/quantization/tests/test_quantization_mkldnn_pass.py
+++ b/python/paddle/static/quantization/tests/test_quantization_mkldnn_pass.py
@@ -143,8 +143,8 @@ class TestMKLDNNTransformBasedFreezePass(unittest.TestCase):
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        binary = paddle.static.CompiledProgram(
-            main_graph.graph
-        ).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy)
+            main_graph.graph, build_strategy=build_strategy
+        )
        quantized_test_program = test_graph.to_program()
        iters = 5
        batch_size = 8

--- a/python/paddle/static/quantization/tests/test_quantization_pass.py
+++ b/python/paddle/static/quantization/tests/test_quantization_pass.py
@@ -373,8 +373,8 @@ class TestQuantizationFreezePass(unittest.TestCase):
        build_strategy.enable_inplace = False
        build_strategy.fuse_all_reduce_ops = False
        binary = paddle.static.CompiledProgram(
-            main_graph.graph
-        ).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy)
+            main_graph.graph, build_strategy=build_strategy
+        )
        quantized_test_program = test_graph.to_program()
        iters = 5
        batch_size = 8

--- a/python/paddle/static/quantization/tests/test_quantization_scale_pass.py
+++ b/python/paddle/static/quantization/tests/test_quantization_scale_pass.py
@@ -143,8 +143,8 @@ class TestQuantizationScalePass(unittest.TestCase):
        build_strategy.enable_inplace = False
        build_strategy.fuse_all_reduce_ops = False
        binary = paddle.static.CompiledProgram(
-            main_graph.graph
-        ).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy)
+            main_graph.graph, build_strategy=build_strategy
+        )
        iters = 5
        batch_size = 8


--- a/python/paddle/static/quantization/tests/test_user_defined_quantization.py
+++ b/python/paddle/static/quantization/tests/test_user_defined_quantization.py
@@ -191,8 +191,8 @@ class TestUserDefinedQuantization(unittest.TestCase):
        build_strategy.enable_inplace = False
        build_strategy.fuse_all_reduce_ops = False
        binary = paddle.static.CompiledProgram(
-            main_graph.graph
-        ).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy)
+            main_graph.graph, build_strategy=build_strategy
+        )
        iters = 5
        batch_size = 8