Unverified commit 6df93364, authored by Ruibiao Chen, committed by GitHub

Enable startup program for standalone executor (#45314)

* Enable startup program for standalone executor

* Disable test_py_reader_using_executor

* Fix test_parallel_executor_mnist

* Fix CI errors

* Fix CI errors
Parent 23bc0e3c
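In user terms, the change lets `Executor.run()` send a startup `Program` through the standalone (new) executor when `FLAGS_USE_STANDALONE_EXECUTOR` is on, instead of falling back to the legacy path. A minimal hedged sketch using Paddle's public static-graph API (the flag name comes from the warning string in the diff below; exact behavior depends on the build):

```python
# Minimal sketch, assuming a Paddle build from around this commit.
# The flag is read during executor setup, so set it before creating
# the Executor.
import os
os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1'

import paddle

paddle.enable_static()
main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[None, 4], dtype='float32')
    y = paddle.static.nn.fc(x, size=2)

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup_prog)  # with this commit, also served by the standalone executor
```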
@@ -20,6 +20,7 @@
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
 #include "paddle/fluid/framework/new_executor/interpretercore_util.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/os_info.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 #include "paddle/fluid/platform/profiler/supplement_tracing.h"
@@ -28,7 +29,6 @@
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 
 PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
                             false,
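The hunk above only moves the `gpu_info.h` include so it sorts with the other platform headers; the exported flag it sits next to, `FLAGS_new_executor_use_inplace`, is unchanged (its description string is cut off by the diff view). As a hedged aside, flags declared with `PADDLE_DEFINE_EXPORTED_bool` are normally reachable from Python:

```python
# Hedged sketch: toggling an exported gflag from Python. Whether this
# particular flag has any effect depends on how your wheel was built.
import paddle

paddle.set_flags({'FLAGS_new_executor_use_inplace': True})
print(paddle.get_flags(['FLAGS_new_executor_use_inplace']))
```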
@@ -104,7 +104,7 @@ InterpreterCore::~InterpreterCore() {
 interpreter::CostInfo InterpreterCore::DryRun(
     const std::vector<std::string>& feed_names,
     const std::vector<framework::LoDTensor>& feed_tensors) {
-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(place_.device);
   }
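The guard change recurs in both `Run()` overloads below: `SetDeviceId` used to compile only in CUDA-with-HETERPS builds and now compiles in any CUDA or ROCm (HIP) build, so the interpreter rebinds the active device whenever `place_` is a GPU place. A hedged user-side illustration of why that matters:

```python
# Hedged sketch (assumes a CUDA/ROCm wheel and at least two devices):
# an Executor pinned to the second GPU relies on the interpreter
# calling SetDeviceId(place.device) before launching any kernels.
import paddle

paddle.enable_static()
exe = paddle.static.Executor(paddle.CUDAPlace(1))  # device id 1, not the default 0
```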
@@ -138,14 +138,16 @@ interpreter::CostInfo InterpreterCore::DryRun(
 paddle::framework::FetchList InterpreterCore::Run(
     const std::vector<std::string>& feed_names,
     const std::vector<framework::LoDTensor>& feed_tensors) {
-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(place_.device);
   }
 #endif
+#ifdef PADDLE_WITH_MKLDNN
+  platform::AttachPointerHashToMKLDNNKey(this, place_);
+#endif
 
   bool is_build = is_build_;
   Prepare(feed_names, feed_tensors, is_build);
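Judging by its name, `AttachPointerHashToMKLDNNKey(this, place_)` folds the interpreter's own address into the oneDNN cache key, so cached primitives from different executor instances don't collide (a hedged reading; the helper lives in `mkldnn_helper.h`). From Python, oneDNN execution on CPU is governed by a flag:

```python
# Hedged sketch: requires a CPU wheel built with oneDNN (MKLDNN) support.
import paddle

paddle.enable_static()
paddle.set_flags({'FLAGS_use_mkldnn': True})  # route supported CPU ops to oneDNN kernels
```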
@@ -180,14 +182,16 @@ paddle::framework::FetchList InterpreterCore::Run(
 paddle::framework::FetchList InterpreterCore::Run(
     const std::vector<std::string>& feed_names) {
-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(place_.device);
   }
 #endif
+#ifdef PADDLE_WITH_MKLDNN
+  platform::AttachPointerHashToMKLDNNKey(this, place_);
+#endif
 
   if (!is_build_) {
     paddle::framework::interpreter::build_variable_scope(
         block_, &var_scope_, create_local_scope_);
...
@@ -1615,51 +1615,46 @@ class Executor(object):
         # use StandaloneExecutor to run the program.
         if return_merged and self._enable_interpreter_core and _can_use_interpreter_core(
                 program, self.place):
-            inner_program = program._program if isinstance(
-                program, compiler.CompiledProgram) else program
-            if not inner_program._is_start_up_program_:
-                if feed is None:
-                    feed = {}
-                elif isinstance(feed, (list, tuple)):
-                    assert len(feed) == 1, "Not compiled with data parallel"
-                    feed = feed[0]
-                if not isinstance(feed, dict):
-                    raise TypeError(
-                        "feed requires dict as its Parameter. But you passed in %s"
-                        % (type(feed)))
-                feed = self._update_feed(program, feed)
-                program, new_exe = self._executor_cache.get_program_and_executor(
-                    program, feed, fetch_list, feed_var_name, fetch_var_name,
-                    self.place, scope)
-                self._feed_data(program, feed, feed_var_name, scope)
-                if hasattr(program, 'lr_sheduler'):
-                    from paddle.optimizer.lr import LRScheduler
-                    assert isinstance(program.lr_sheduler,
-                                      LRScheduler), "must be LRScheduler"
-                    lr_sheduler = program.lr_sheduler
-                    lr_value = lr_sheduler()
-                    lr_var = program.global_block().vars[lr_sheduler._var_name]
-                    data = np.array([lr_value
-                                     ]).astype(convert_dtype(lr_var.dtype))
-                    tensor = core.get_variable_tensor(scope,
-                                                      lr_sheduler._var_name)
-                    # NOTE(dev): `set` always call TensorCopySync that is a
-                    # blocking behavior. So we use `_copy_from` to replace it.
-                    cpu_tensor = _as_lodtensor(data, core.CPUPlace())
-                    # for ipu, tensor is allocated on cpu
-                    if core.is_compiled_with_ipu():
-                        tensor._copy_from(cpu_tensor, tensor._place())
-                    else:
-                        tensor._copy_from(cpu_tensor, self.place)
-                warnings.warn(
-                    "FLAGS_USE_STANDALONE_EXECUTOR is set to 1. New executor is used to execute Program."
-                )
+            if feed is None:
+                feed = {}
+            elif isinstance(feed, (list, tuple)):
+                assert len(feed) == 1, "Not compiled with data parallel"
+                feed = feed[0]
+            if not isinstance(feed, dict):
+                raise TypeError(
+                    "feed requires dict as its Parameter. But you passed in %s"
+                    % (type(feed)))
+            feed = self._update_feed(program, feed)
+            program, new_exe = self._executor_cache.get_program_and_executor(
+                program, feed, fetch_list, feed_var_name, fetch_var_name,
+                self.place, scope)
+            self._feed_data(program, feed, feed_var_name, scope)
+            if hasattr(program, 'lr_sheduler'):
+                from paddle.optimizer.lr import LRScheduler
+                assert isinstance(program.lr_sheduler,
+                                  LRScheduler), "must be LRScheduler"
+                lr_sheduler = program.lr_sheduler
+                lr_value = lr_sheduler()
+                lr_var = program.global_block().vars[lr_sheduler._var_name]
+                data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype))
+                tensor = core.get_variable_tensor(scope, lr_sheduler._var_name)
+                # NOTE(dev): `tensor.set(data, self.place)` always call TensorCopySync that is a blocking behavior. So we use `_copy_from` to replace it.
+                cpu_tensor = _as_lodtensor(data, core.CPUPlace())
+                # for ipu, tensor is allocated on cpu
+                if core.is_compiled_with_ipu():
+                    tensor._copy_from(cpu_tensor, tensor._place())
+                else:
+                    tensor._copy_from(cpu_tensor, self.place)
+            warnings.warn(
+                "FLAGS_USE_STANDALONE_EXECUTOR is set to 1. New executor is used to execute Program."
+            )
 
-                return new_exe.run(scope, list(feed.keys()), fetch_list,
-                                   return_numpy)
+            return new_exe.run(scope, list(feed.keys()), fetch_list,
+                               return_numpy)
 
         compiled = isinstance(program, compiler.CompiledProgram)
...
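The Python hunk is the heart of the commit: the `if not inner_program._is_start_up_program_:` wrapper is deleted, so startup programs take the standalone-executor path like any other program. The `lr_sheduler` branch it dedents (the misspelling is the attribute's actual name in the codebase) copies the scheduler's current value into the scope via `_copy_from` rather than the blocking `TensorCopySync`. A hedged sketch of the user-side pattern that exercises it:

```python
# Minimal sketch, assuming Paddle's public static-graph API; minimize()
# attaches the scheduler to the program as `lr_sheduler`, which
# Executor.run() then syncs as shown in the diff above.
import numpy as np
import paddle

paddle.enable_static()
main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[None, 1], dtype='float32')
    loss = paddle.mean(paddle.static.nn.fc(x, size=1))
    sched = paddle.optimizer.lr.StepDecay(learning_rate=0.1, step_size=10)
    paddle.optimizer.SGD(learning_rate=sched).minimize(loss)

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup_prog)
exe.run(main_prog,
        feed={'x': np.ones((1, 1), dtype='float32')},
        fetch_list=[loss])
sched.step()  # the next run() picks up the updated learning rate
```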
@@ -95,15 +95,12 @@ def simple_fc_net(in_size,
         py_reader = fluid.layers.create_py_reader_by_data(
             capacity=queue_capacity,
             use_double_buffer=use_double_buffer,
-            feed_list=[in_data, label],
-            name=unique_name.generate('py_reader_name'))
+            feed_list=[in_data, label])
     else:
-        py_reader = fluid.layers.py_reader(
-            capacity=queue_capacity,
-            shapes=[in_data.shape, label.shape],
-            dtypes=['float32', 'int64'],
-            name=unique_name.generate('py_reader_name'),
-            use_double_buffer=use_double_buffer)
+        py_reader = fluid.layers.py_reader(capacity=queue_capacity,
+                                           shapes=[in_data.shape, label.shape],
+                                           dtypes=['float32', 'int64'],
+                                           use_double_buffer=use_double_buffer)
     in_data, label = fluid.layers.read_file(py_reader)
...
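Both reader constructions in this test hunk drop their explicit `name=unique_name.generate('py_reader_name')` argument and fall back to the layer's default naming; per the commit message this accompanies disabling `test_py_reader_using_executor` and fixing the remaining reader tests under the new executor. For context, a hedged sketch of the legacy pattern the test builds:

```python
# Hedged sketch of the legacy fluid 1.x reader API this test exercises;
# it only runs on old Paddle versions that still ship fluid.layers.py_reader
# and friends.
import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[32], dtype='float32')
lbl = fluid.layers.data(name='lbl', shape=[1], dtype='int64')
reader = fluid.layers.create_py_reader_by_data(capacity=64,
                                               use_double_buffer=True,
                                               feed_list=[img, lbl])
img, lbl = fluid.layers.read_file(reader)
```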