Unverified commit 39278731, authored by kangguangli, committed by GitHub

[Executor] remove run_program branch (#52471)

* remove run_program

* remove FLAGS_USE_STANDALONE_EXECUTOR
Parent 47c740e7
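For context, the sketch below is a simplified reconstruction (not the exact Paddle implementation) of the gating logic this commit deletes: the FLAGS_USE_STANDALONE_EXECUTOR environment variable used to decide whether Executor.run took the interpreter-core (standalone executor) path or fell back to the legacy _run_program path. After this commit the flag, the helper, and the fallback are all removed, and the interpreter-core path is used unconditionally.

import os

# Simplified reconstruction of the deleted helpers shown in the executor.py and
# framework.py hunks below; names mirror the removed code.
_enable_standalone_executor_ = os.environ.get('FLAGS_USE_STANDALONE_EXECUTOR', None)


def _is_enable_standalone_executor():
    # An unset flag counted as enabled, so the standalone executor was already the default.
    return (
        _enable_standalone_executor_ is None
        or _enable_standalone_executor_ in [1, '1', True, 'True', 'true']
    )

# Before: Executor.run() consulted this check and could fall back to _run_program().
# After:  both the check and the fallback are gone; the standalone executor always runs.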
@@ -473,9 +473,7 @@ class OptimizationTuner:
         parent_env = copy.copy(os.environ.copy())
         # env flags need for profile
-        new_env = {
-            "FLAGS_USE_STANDALONE_EXECUTOR": "False",
-        }
+        new_env = {}
         new_env.update(parent_env)
         # TODO if any rank hang or fail, kill all processes
...
@@ -34,7 +34,6 @@ from paddle.distributed.auto_parallel.utils import (
     ring_id_to_process_group,
 )
 from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole
-from paddle.fluid.executor import _is_enable_standalone_executor
 from paddle.static import default_main_program
 from paddle.utils import unique_name
@@ -97,8 +96,7 @@ class DataParallelOptimizationPass(PassBase):
         self.global_rank = int(self.get_attr("global_rank"))
         self.use_sharding = self.get_attr("use_sharding")
         self.coalesce_prefix = 'coalesce_grad'
-        if _is_enable_standalone_executor():
-            self.gradient_sync_stream = "gradient_sync_stream"
+        self.gradient_sync_stream = "gradient_sync_stream"
 
         with paddle.static.program_guard(main_program, startup_program):
             self._analyze_program()
@@ -316,8 +314,7 @@ class DataParallelOptimizationPass(PassBase):
     def _calc_wait_comms(self):
-        if _is_enable_standalone_executor():
-            return
+        return
 
         block = default_main_program().global_block()
@@ -602,7 +599,7 @@ class DataParallelOptimizationPass(PassBase):
         # multiple stream executor(standalone exe). This function just for standalone exe. Refactor here
         # in future when only one executor stay.
-        if not _is_enable_standalone_executor() or len(grad_groups) == 0:
+        if len(grad_groups) == 0:
             return
 
         block = default_main_program().global_block()
...
@@ -18,7 +18,6 @@ import numpy as np
 import paddle
 from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole
-from paddle.fluid.executor import _is_enable_standalone_executor
 
 from ..auto_parallel.dist_attribute import OperatorDistAttr, TensorDistAttr
 from ..auto_parallel.operators.common import (
@@ -460,10 +459,7 @@ class ClipGradByGloblNormPass(PassBase):
                 )
                 self.clip_helper._init_dist_attr(allreduce_op)
 
-                if (
-                    _is_enable_standalone_executor()
-                    and insert_leaf_fill_constant_node
-                ):
+                if insert_leaf_fill_constant_node:
                     # NOTE add naive deps for global norm sync in graph exe
                     j = idx - 1
...
@@ -35,7 +35,6 @@ from paddle.distributed.auto_parallel.utils import (
     set_var_dist_attr,
 )
 from paddle.distributed.fleet.meta_optimizers.sharding.utils import get_var_size
-from paddle.fluid.executor import _is_enable_standalone_executor
 from paddle.framework import core
 from paddle.static import default_main_program, default_startup_program
 from paddle.utils import unique_name
@@ -1168,7 +1167,7 @@ class ShardingPass(PassBase):
         P.S. this overlap pass is ONLY adapted for standalone executor (graph based) and stream awared allocator.
         """
-        if not _is_enable_standalone_executor() or (not self.enable_overlap):
+        if not self.enable_overlap:
             return
 
         self.grad_comm_group_stream_pairs = []
...
@@ -21,7 +21,6 @@ from paddle.distributed.auto_parallel.utils import (
     OpRole,
     insert_dependencies_for_vars,
 )
-from paddle.fluid.executor import _is_enable_standalone_executor
 
 from .auto_parallel_sharding import ShardingPass, _supported_optimizer_type
 from .pass_base import PassBase, register_pass
@@ -70,9 +69,7 @@ class AutoParalSupplementDepPass(PassBase):
     def _apply_single_impl(self, main_program, startup_program, context):
         # TODO general this pass for all case.
-        if not _is_enable_standalone_executor or not _sharding_pass_applied(
-            context
-        ):
+        if not _sharding_pass_applied(context):
             return
 
         self._dist_context = self.get_attr("dist_context", None)
...
@@ -493,14 +493,6 @@ def _to_name_str(var):
         return _to_str(var)
 
 
-def _is_enable_standalone_executor():
-    return (
-        framework._enable_standalone_executor_ is None
-        or framework._enable_standalone_executor_
-        in [1, '1', True, 'True', 'true']
-    )
-
-
 def _is_dy2st_enable_standalone_executor():
     return framework._dy2st_enable_standalone_executor_ in [
         1,
@@ -1004,8 +996,6 @@ class Executor:
             "__auto_checkpoint_executor__"
         )
 
-        # NOTE: Whether to use experimental executor `StandaloneExecutor`.
-        self._enable_interpreter_core = _is_enable_standalone_executor()
         self._executor_cache = _ExecutorCache()
 
         self._fleet_executor = None
@@ -1605,9 +1595,7 @@ class Executor:
                 return True
 
-        if self._enable_interpreter_core and _can_use_interpreter_core(
-            program, self.place
-        ):
+        if _can_use_interpreter_core(program, self.place):
             if feed is None:
                 feed = {}
@@ -1685,132 +1673,12 @@ class Executor:
         acp._auto_checkpoint(self, program)
 
-        # For backward compatibility, run directly.
-        if not compiled:
-            return self._run_program(
-                program,
-                feed=feed,
-                fetch_list=fetch_list,
-                feed_var_name=feed_var_name,
-                fetch_var_name=fetch_var_name,
-                scope=scope,
-                return_numpy=return_numpy,
-                use_program_cache=use_program_cache,
-            )
-
         program._compile(scope, self.place)
         assert (
             program._is_inference
         ), f"Program must have _is_inference = True, but get {program._is_inference}"
         return self._run_inference(program._executor, feed)
 
-    def _run_program(
-        self,
-        program,
-        feed,
-        fetch_list,
-        feed_var_name,
-        fetch_var_name,
-        scope,
-        return_numpy,
-        use_program_cache,
-    ):
-        from paddle.optimizer.lr import LRScheduler
-
-        if feed is None:
-            feed = {}
-        elif isinstance(feed, (list, tuple)):
-            assert len(feed) == 1, "Not compiled with data parallel"
-            feed = feed[0]
-        if not isinstance(feed, dict):
-            raise TypeError(
-                "feed requires dict as its Parameter. But you passed in %s"
-                % (type(feed))
-            )
-
-        assert program is not None, "The program should not be Empty"
-        if not isinstance(program, Program):
-            raise TypeError(
-                "Executor requires Program as its Parameter. But you passed in %s"
-                % (type(program))
-            )
-
-        if not isinstance(fetch_var_name, str):
-            raise TypeError(
-                "The name of fetch variable requires string as its Parameter. But you passed in %s"
-                % (type(fetch_var_name))
-            )
-
-        if use_program_cache:
-            cache_key = _get_strong_program_cache_key(program, feed, fetch_list)
-            cached_program = self._get_program_cache(cache_key)
-            cached_ctx = self._get_ctx_cache(cache_key)
-            cached_scope = self._get_scope_cache(cache_key)
-            if cached_program is None:
-                cached_program = _add_feed_fetch_ops(
-                    program=program,
-                    feed=feed,
-                    fetch_list=fetch_list,
-                    feed_var_name=feed_var_name,
-                    fetch_var_name=fetch_var_name,
-                )
-                self._add_program_cache(cache_key, cached_program)
-                fetch_list_str = list(map(_to_name_str, fetch_list))
-                cached_ctx = self._default_executor.prepare(
-                    cached_program.desc, 0, fetch_list_str, False
-                )
-                # currently, we cache program, vars, sub_scope here
-                # we suppose that in a life cycle of training, a user
-                # will not create many programs. So, here the basic
-                # rule of caching is to cache all unseen (program, var, scope)
-                # when a user use use_program_cache.
-                cached_scope = scope.new_scope()
-                self._default_executor.create_variables(
-                    cached_program.desc, cached_scope, 0
-                )
-                self._add_ctx_cache(cache_key, cached_ctx)
-                self._add_scope_cache(cache_key, cached_scope)
-            program = cached_program
-            ctx = cached_ctx
-            scope = cached_scope
-        else:
-            program = _add_feed_fetch_ops(
-                program=program,
-                feed=feed,
-                fetch_list=fetch_list,
-                feed_var_name=feed_var_name,
-                fetch_var_name=fetch_var_name,
-            )
-
-        self._feed_data(program, feed, feed_var_name, scope)
-        if hasattr(program, 'lr_schedulerr'):
-            assert isinstance(
-                program.lr_scheduler, LRScheduler
-            ), "must be LRScheduler"
-            lr_scheduler = program.lr_scheduler
-            lr_value = lr_scheduler()
-            lr_var = program.global_block().vars[lr_scheduler._var_name]
-            data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype))
-            tensor = core.get_variable_tensor(scope, lr_scheduler._var_name)
-            tensor.set(data, self.place)
-
-        if not use_program_cache:
-            self._default_executor.run(
-                program.desc, scope, 0, True, True, [fetch_var_name]
-            )
-        else:
-            self._default_executor.run_prepared_ctx(
-                ctx, scope, False, False, False
-            )
-
-        arr = scope.find_var(fetch_var_name).get_fetch_list()
-        tensors = arr._move_to_list()
-        if return_numpy:
-            return as_numpy(tensors)
-        else:
-            return tensors
-
     def _run_inference(self, exe, feed):
         return exe.run(feed)
...
@@ -116,9 +116,7 @@ _already_patch_eager_tensor = False
 _already_patch_varbase = False
 _current_cuda_graph_mode = None
 _global_flags_ = core.globals()
-_enable_standalone_executor_ = os.environ.get(
-    'FLAGS_USE_STANDALONE_EXECUTOR', None
-)
 _dy2st_enable_standalone_executor_ = os.environ.get(
     'FLAGS_DY2ST_USE_STANDALONE_EXECUTOR', 1
 )
@@ -270,17 +268,6 @@ ipu_index_attr_name = 'ipu_index'
 ipu_stage_attr_name = 'ipu_stage'
 
 
-@signature_safe_contextmanager
-def _enable_standalone_executor(enable=True):
-    global _enable_standalone_executor_
-    original_ = _enable_standalone_executor_
-    _enable_standalone_executor_ = enable
-    try:
-        yield
-    finally:
-        _enable_standalone_executor_ = original_
-
-
 @signature_safe_contextmanager
 def ipu_shard_guard(index=-1, stage=-1):
     """
...
@@ -1150,25 +1150,6 @@ if(WITH_GLOO)
                        PROPERTIES TIMEOUT 120)
 endif()
 
-if($ENV{USE_STANDALONE_EXECUTOR})
-  # these test will fail in some server due to PR#42149, temporarily set it use old executor.
-  set_tests_properties(test_apply_pass_to_program
-                       PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-  set_tests_properties(test_buffer_shared_memory_reuse_pass
-                       PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-  set_tests_properties(
-    test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
-    PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-  set_tests_properties(test_imperative_optimizer
-                       PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-  set_tests_properties(test_imperative_star_gan_with_gradient_penalty
-                       PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-  set_tests_properties(test_switch_autotune
-                       PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-  set_tests_properties(test_imperative_mnist_sorted_gradient
-                       PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-endif()
-
 set(TEST_CINN_OPS
     test_softmax_op
     test_expand_v2_op
...
@@ -201,9 +201,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
   set_tests_properties(test_trt_conv3d_op PROPERTIES TIMEOUT 60)
   set_tests_properties(test_trt_conv3d_transpose_op PROPERTIES TIMEOUT 60)
   set_tests_properties(test_trt_nearest_interp_v2_op PROPERTIES TIMEOUT 30)
-  set_tests_properties(
-    test_trt_multiclass_nms3_op PROPERTIES TIMEOUT 60 ENVIRONMENT
-                                FLAGS_USE_STANDALONE_EXECUTOR=0)
+  set_tests_properties(test_trt_multiclass_nms3_op PROPERTIES TIMEOUT 60)
 
   if(WITH_MKLDNN
      AND TENSORRT_FOUND
...
@@ -20,10 +20,6 @@ import paddle
 from paddle import _legacy_C_ops
 from paddle.fluid import core
 from paddle.fluid.dygraph.base import switch_to_static_graph
-from paddle.fluid.executor import (
-    _is_dy2st_enable_standalone_executor,
-    _is_enable_standalone_executor,
-)
 from paddle.fluid.framework import Variable
@@ -140,10 +136,7 @@ class TestRunProgram(unittest.TestCase):
             [out.name + '@GRAD'],
         ]
 
-        use_interpretorcore = (
-            _is_enable_standalone_executor()
-            and _is_dy2st_enable_standalone_executor()
-        )
+        use_interpretorcore = True
         attrs.extend(('use_interpretorcore', use_interpretorcore))
         if use_interpretorcore:
             attrs.extend(
...
@@ -21,10 +21,6 @@ import paddle
 from paddle import _legacy_C_ops, fluid
 from paddle.fluid import core, framework
 from paddle.fluid.dygraph.base import switch_to_static_graph
-from paddle.fluid.executor import (
-    _is_dy2st_enable_standalone_executor,
-    _is_enable_standalone_executor,
-)
 from paddle.fluid.framework import global_var
 
 paddle.enable_static()
@@ -240,10 +236,7 @@ class RunProgramOpTest(unittest.TestCase):
                 self.program_desc, self.fwd_op_num, len(outputs['Out'])
             )
 
-            use_interpretorcore = (
-                _is_enable_standalone_executor()
-                and _is_dy2st_enable_standalone_executor()
-            )
+            use_interpretorcore = True
             self.attrs.extend(('use_interpretorcore', use_interpretorcore))
             if use_interpretorcore:
                 self.attrs.extend(
@@ -292,10 +285,7 @@ class RunProgramOpTest(unittest.TestCase):
                 self.program_desc, self.fwd_op_num, len(outputs['Out'])
             )
 
-            use_interpretorcore = (
-                _is_enable_standalone_executor()
-                and _is_dy2st_enable_standalone_executor()
-            )
+            use_interpretorcore = True
             self.attrs.extend(('use_interpretorcore', use_interpretorcore))
             if use_interpretorcore:
                 self.attrs.extend(
...
@@ -21,10 +21,6 @@ import paddle
 from paddle import _legacy_C_ops
 from paddle.fluid import backward, core, framework, unique_name
 from paddle.fluid.dygraph.base import switch_to_static_graph
-from paddle.fluid.executor import (
-    _is_dy2st_enable_standalone_executor,
-    _is_enable_standalone_executor,
-)
 from paddle.fluid.framework import OpProtoHolder, _non_static_mode
 from paddle.jit.dy2static.partial_program import (
     LazyInitialized,
@@ -976,10 +972,7 @@ def _run_dygraph(instance, input, program_holder):
             )
         )
 
-    use_interpretorcore = (
-        _is_enable_standalone_executor()
-        and _is_dy2st_enable_standalone_executor()
-    )
+    use_interpretorcore = True
     attrs.extend(('use_interpretorcore', use_interpretorcore))
     if use_interpretorcore:
         attrs.extend(
...
@@ -11,15 +11,6 @@ if(WITH_TESTING)
   set_tests_properties(test_custom_relu_op_jit PROPERTIES TIMEOUT 180)
   set_tests_properties(test_custom_relu_model PROPERTIES TIMEOUT 180)
   set_tests_properties(test_context_pool PROPERTIES TIMEOUT 180)
-  if($ENV{USE_STANDALONE_EXECUTOR})
-    # these test will fail in some server due to PR#42149, temporarily set it use old executor.
-    set_tests_properties(
-      test_custom_relu_op_setup PROPERTIES ENVIRONMENT
-                                FLAGS_USE_STANDALONE_EXECUTOR=0)
-    set_tests_properties(
-      test_custom_relu_model PROPERTIES ENVIRONMENT
-                             FLAGS_USE_STANDALONE_EXECUTOR=0)
-  endif()
 endif()
 
 if(WITH_GPU AND WITH_DISTRIBUTE)
...
@@ -17,7 +17,7 @@ import unittest
 import numpy as np
 
 import paddle
-from paddle.fluid import core, framework
+from paddle.fluid import core
 from paddle.fluid.framework import Program, program_guard
 
 paddle.enable_static()
@@ -25,7 +25,7 @@ paddle.enable_static()
 
 # test the compatibility of new executor: run old
 # and new executor twice and check the result.
-# please override the _get_feeds() and build_prgram()
+# please override the _get_feeds() and build_prgram(), run_dygraph_once()
 class TestCompatibility(unittest.TestCase):
     def setUp(self):
         self.place = (
@@ -78,26 +78,53 @@ class TestCompatibility(unittest.TestCase):
             ret.append(exe.run(main_program, feed=feed, fetch_list=fetch_vars))
         return ret
 
-    def run_raw_executor(self, feed):
-        with framework._enable_standalone_executor(False):
-            out = self._run(feed)
+    def run_dygraph_once(self, feed):
+        x = paddle.tensor.fill_constant(shape=[1], dtype='float32', value=0.1)
+        y = paddle.tensor.fill_constant(shape=[1], dtype='float32', value=0.23)
+        if x < y:
+            out = [
+                paddle.tensor.fill_constant(
+                    shape=[1, 2], dtype='int32', value=1
+                ).numpy(),
+                paddle.tensor.fill_constant(
+                    shape=[2, 3], dtype='bool', value=True
+                ).numpy(),
+            ]
+        else:
+            out = [
+                paddle.tensor.fill_constant(
+                    shape=[3, 4], dtype='float32', value=3
+                ).numpy(),
+                paddle.tensor.fill_constant(
+                    shape=[4, 5], dtype='int64', value=2
+                ).numpy(),
+            ]
         return out
 
+    def run_dygraph(self, feed):
+        ret = []
+        for _ in range(self.iter_run):
+            ret.append(self.run_dygraph_once(feed))
+        return ret
+
     def run_new_executor(self, feed):
-        with framework._enable_standalone_executor(True):
-            out = self._run(feed)
+        out = self._run(feed)
         return out
 
     def test_with_feed(self):
         feed = self._get_feed()
+        paddle.enable_static()
         res = self.run_new_executor(feed)
-        gt = self.run_raw_executor(feed)
+        paddle.disable_static()
+        gt = self.run_dygraph(feed)
         for x, y in zip(gt, res):
             if isinstance(x, list):
                 for tx, ty in zip(x, y):
                     np.testing.assert_array_equal(tx, ty)
             elif isinstance(x, np.ndarray):
-                np.testing.assert_array_equal(tx, ty)
+                np.testing.assert_array_equal(x, y)
             else:
                 raise Exception("Not Implement!")
@@ -129,6 +156,12 @@ class TestWhile(TestCompatibility):
         exe = paddle.static.Executor(paddle.CPUPlace())
         return main_program, startup_program, i
 
+    def run_dygraph_once(self, feed):
+        i = 1
+        while i < 10:
+            i = i + 1
+        return [i]
+
 
 if __name__ == "__main__":
     unittest.main()
@@ -23,7 +23,7 @@ import unittest
 import numpy as np
 
 import paddle
-from paddle.fluid import core, framework
+from paddle.fluid import core
 from paddle.fluid.core import StandaloneExecutor
 from paddle.profiler import profiler
@@ -143,16 +143,15 @@ class ExecutorStatisticsTestCase(unittest.TestCase):
         scope = paddle.static.Scope()
         with paddle.static.scope_guard(scope):
-            with framework._enable_standalone_executor(enable):
-                exe = paddle.static.Executor(self.place)
-                helper_profiler = profiler.Profiler(
-                    targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)
-                )
-                helper_profiler.start()
-                for i in range(self.iter_n):
-                    exe.run(main_program, fetch_list=fetch_list)
-                    helper_profiler.step()
-                helper_profiler.stop()
+            exe = paddle.static.Executor(self.place)
+            helper_profiler = profiler.Profiler(
+                targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)
+            )
+            helper_profiler.start()
+            for i in range(self.iter_n):
+                exe.run(main_program, fetch_list=fetch_list)
+                helper_profiler.step()
+            helper_profiler.stop()
 
         self.assertTrue(os.path.exists(self.perf_path))
         with open(self.perf_path, 'r') as load_f:
@@ -183,15 +182,14 @@ class MultiStreamModelTestCase(unittest.TestCase):
         paddle.seed(2020)
         main_program, startup_program, fetch_list = build_program()
 
-        with framework._enable_standalone_executor(use_new_executor):
-            scope = core.Scope()
-            exe = paddle.static.Executor(self.place)
-            outs = []
-            for i in range(self.iter_n):
-                outs.append(
-                    exe.run(main_program, scope=scope, fetch_list=fetch_list)
-                )
-            print(outs)
+        scope = core.Scope()
+        exe = paddle.static.Executor(self.place)
+        outs = []
+        for i in range(self.iter_n):
+            outs.append(
+                exe.run(main_program, scope=scope, fetch_list=fetch_list)
+            )
+        print(outs)
 
         return outs
@@ -249,30 +247,46 @@ class SwitchExecutorInterfaceWithFeed(unittest.TestCase):
         return outs
 
-    def run_raw_executor(self, feed, use_compiled=False):
-        with framework._enable_standalone_executor(False):
-            # run construct program 1
-            out1 = self._run(
-                feed, use_str=False, is_double=False, use_compiled=use_compiled
-            )
-            # run construct program 2 with same executor
-            out2 = self._run(
-                feed, use_str=True, is_double=True, use_compiled=use_compiled
-            )
-
-        return [out1, out2]
+    def run_dygraph(self, feed):
+        def run_once(is_double):
+            paddle.seed(2020)
+            a = feed['a']
+            a = paddle.to_tensor(a, dtype='float32')
+            b = paddle.ones([2, 2]) * 2
+            t = paddle.nn.Linear(2, 2)(a)
+            c = t + b
+            if is_double:
+                c = c + c
+            return c.numpy()
+
+        out1 = []
+        for i in range(self.iter_run):
+            out1.append(run_once(False))
+
+        out2 = []
+        for i in range(self.iter_run):
+            out2.append(run_once(True))
+
+        return [out1, out2]
 
     def run_new_executor(self, feed, use_compiled=False):
-        with framework._enable_standalone_executor():
-            out = self.run_raw_executor(feed, use_compiled=use_compiled)
-        return out
+        # run construct program 1
+        out1 = self._run(
+            feed, use_str=False, is_double=False, use_compiled=use_compiled
+        )
+        # run construct program 2 with same executor
+        out2 = self._run(
+            feed, use_str=True, is_double=True, use_compiled=use_compiled
+        )
+        return [out1, out2]
 
     def test_with_feed(self):
         data = np.ones([2, 2], dtype="float32")
         feed = {"a": data, 'fake_input': data}
-        res = self.run_new_executor(feed)
-        gt = self.run_raw_executor(feed)
+        with paddle.fluid.framework._static_guard():
+            res = self.run_new_executor(feed)
+        with paddle.fluid.dygraph.guard():
+            gt = self.run_dygraph(feed)
         for x, y in zip(gt, res):
             np.testing.assert_array_equal(x, y)
@@ -280,8 +294,7 @@ class SwitchExecutorInterfaceWithFeed(unittest.TestCase):
         feed = [{'a': np.ones([2, 2], dtype="float32")}]
 
         with self.assertRaises(TypeError):
-            with framework._enable_standalone_executor():
-                self._run(feed[0], add_wrong_fetch=True)
+            self._run(feed[0], add_wrong_fetch=True)
 
     def test_empty_program(self):
         program = paddle.static.Program()
@@ -291,8 +304,7 @@ class SwitchExecutorInterfaceWithFeed(unittest.TestCase):
         for i in range(10):
             print(i, flush=1)
-            with framework._enable_standalone_executor():
-                out = exe.run(program, feed=None)
+            out = exe.run(program, feed=None)
 
 
 class TestException(unittest.TestCase):
@@ -328,8 +340,7 @@ class TestException(unittest.TestCase):
         return out
 
     def run_new_executor(self, feed):
-        with framework._enable_standalone_executor():
-            out = self._run(feed)
+        out = self._run(feed)
         return out
 
     def test_exception(self):
@@ -399,13 +410,11 @@ class TestInplaceApiWithDataTransform(unittest.TestCase):
             with paddle.fluid.device_guard("cpu"):
                 x = paddle.increment(x)
             exe = paddle.static.Executor(paddle.CUDAPlace(0))
-            with framework._enable_standalone_executor():
-
-                for i in range(10):
-                    (a,) = exe.run(
-                        paddle.static.default_main_program(), fetch_list=[x]
-                    )
-                    self.assertEqual(a[0], 1)
+            for i in range(10):
+                (a,) = exe.run(
+                    paddle.static.default_main_program(), fetch_list=[x]
+                )
+                self.assertEqual(a[0], 1)
 
 
 if __name__ == "__main__":
...
@@ -39,6 +39,14 @@ class TestMultiplyWrite(TestCompatibility):
         paddle.assign(inp2, out)
         return main_program, startup_program, out
 
+    def run_dygraph_once(self, feed):
+        out = paddle.full((1,), 1)
+        inp1 = paddle.full((1,), 2)
+        inp2 = paddle.full((1,), 3)
+        paddle.assign(inp1, out)
+        paddle.assign(inp2, out)
+        return [out.numpy()]
+
     def setUp(self):
         self.place = paddle.CPUPlace()
         self.iter_run = 5
...