Unverified commit 6df93364, authored by Ruibiao Chen, committed by GitHub

Enable startup program for standalone executor (#45314)

* Enable startup program for standalone executor

* Disable test_py_reader_using_executor

* Fix test_parallel_executor_mnist

* Fix CI errors

* Fix CI errors
Parent 23bc0e3c
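The executor.py change below deletes the `_is_start_up_program_` guard, so startup programs are now also routed through the standalone executor (InterpreterCore). A minimal sketch of the user-visible path, assuming a Paddle build of this era with the standalone executor enabled (it was gated by the FLAGS_USE_STANDALONE_EXECUTOR environment variable; that flag name is an assumption here, not part of this diff):

# Hedged sketch: after this commit, exe.run(startup_prog) is eligible for
# the InterpreterCore path instead of always falling back to the legacy executor.
import numpy as np
import paddle

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[None, 4], dtype='float32')
    y = paddle.static.nn.fc(x, size=2)

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup_prog)  # parameter initialization, previously excluded from this path
out, = exe.run(main_prog,
               feed={'x': np.random.rand(8, 4).astype('float32')},
               fetch_list=[y])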
paddle/fluid/framework/new_executor/interpretercore.cc
@@ -20,6 +20,7 @@
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
 #include "paddle/fluid/framework/new_executor/interpretercore_util.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/os_info.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 #include "paddle/fluid/platform/profiler/supplement_tracing.h"
@@ -28,7 +29,6 @@
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"

 PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
                             false,
@@ -104,7 +104,7 @@ InterpreterCore::~InterpreterCore() {
 interpreter::CostInfo InterpreterCore::DryRun(
     const std::vector<std::string>& feed_names,
     const std::vector<framework::LoDTensor>& feed_tensors) {
-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(place_.device);
   }
@@ -138,14 +138,16 @@ interpreter::CostInfo InterpreterCore::DryRun(
 paddle::framework::FetchList InterpreterCore::Run(
     const std::vector<std::string>& feed_names,
     const std::vector<framework::LoDTensor>& feed_tensors) {
-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(place_.device);
   }
 #endif
+
 #ifdef PADDLE_WITH_MKLDNN
   platform::AttachPointerHashToMKLDNNKey(this, place_);
 #endif
+
   bool is_build = is_build_;
   Prepare(feed_names, feed_tensors, is_build);
@@ -180,14 +182,16 @@ paddle::framework::FetchList InterpreterCore::Run(
 paddle::framework::FetchList InterpreterCore::Run(
     const std::vector<std::string>& feed_names) {
-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(place_.device);
   }
 #endif
+
 #ifdef PADDLE_WITH_MKLDNN
   platform::AttachPointerHashToMKLDNNKey(this, place_);
 #endif
+
   if (!is_build_) {
     paddle::framework::interpreter::build_variable_scope(
         block_, &var_scope_, create_local_scope_);
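In all three hunks above, the guard changes from `defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)` to `defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)`: any CUDA or ROCm build now calls platform::SetDeviceId(place_.device) before a run, not only HeterPS builds, so work is issued on the GPU the executor was constructed for even if the calling thread's current device is a different one. A hedged Python-side illustration (assumes a multi-GPU CUDA build):

# The place handed to the executor carries the device id that the C++ change
# above pins via SetDeviceId before DryRun/Run execute.
import paddle

paddle.enable_static()
place = paddle.CUDAPlace(1)          # target GPU 1 explicitly
exe = paddle.static.Executor(place)  # InterpreterCore keeps place_ = CUDAPlace(1)
exe.run(paddle.static.default_startup_program())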
python/paddle/fluid/executor.py
@@ -1615,9 +1615,7 @@ class Executor(object):
         # use StandaloneExecutor to run the program.
         if return_merged and self._enable_interpreter_core and _can_use_interpreter_core(
                 program, self.place):
-            inner_program = program._program if isinstance(
-                program, compiler.CompiledProgram) else program
-            if not inner_program._is_start_up_program_:
             if feed is None:
                 feed = {}
             elif isinstance(feed, (list, tuple)):
@@ -1641,12 +1639,9 @@ class Executor(object):
             lr_sheduler = program.lr_sheduler
             lr_value = lr_sheduler()
             lr_var = program.global_block().vars[lr_sheduler._var_name]
-            data = np.array([lr_value
-                             ]).astype(convert_dtype(lr_var.dtype))
-            tensor = core.get_variable_tensor(scope,
-                                              lr_sheduler._var_name)
-            # NOTE(dev): `set` always call TensorCopySync that is a
-            # blocking behavior. So we use `_copy_from` to replace it.
+            data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype))
+            tensor = core.get_variable_tensor(scope, lr_sheduler._var_name)
+            # NOTE(dev): `tensor.set(data, self.place)` always call TensorCopySync that is a blocking behavior. So we use `_copy_from` to replace it.
             cpu_tensor = _as_lodtensor(data, core.CPUPlace())
             # for ipu, tensor is allocated on cpu
             if core.is_compiled_with_ipu():
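The reflowed executor.py lines keep the same logic: the scheduler's new learning-rate value is staged in a CPU tensor and copied to the target place with `_copy_from`, avoiding the blocking TensorCopySync that `tensor.set(data, place)` would trigger. A hedged sketch of that pattern; `_as_lodtensor` and `_copy_from` are the Paddle-internal helpers the diff itself uses, and `lr_value`, `scope`, `lr_var_name`, and `place` stand in for values computed earlier in Executor.run:

# Non-blocking learning-rate update, per the NOTE(dev) comment above.
data = np.array([lr_value]).astype('float32')      # new LR as a 1-element array
tensor = core.get_variable_tensor(scope, lr_var_name)
cpu_tensor = _as_lodtensor(data, core.CPUPlace())  # stage the value on CPU
tensor._copy_from(cpu_tensor, place)               # async copy instead of TensorCopySync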
python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -95,14 +95,11 @@ def simple_fc_net(in_size,
         py_reader = fluid.layers.create_py_reader_by_data(
             capacity=queue_capacity,
             use_double_buffer=use_double_buffer,
-            feed_list=[in_data, label],
-            name=unique_name.generate('py_reader_name'))
+            feed_list=[in_data, label])
     else:
-        py_reader = fluid.layers.py_reader(
-            capacity=queue_capacity,
+        py_reader = fluid.layers.py_reader(capacity=queue_capacity,
             shapes=[in_data.shape, label.shape],
             dtypes=['float32', 'int64'],
-            name=unique_name.generate('py_reader_name'),
             use_double_buffer=use_double_buffer)
     in_data, label = fluid.layers.read_file(py_reader)
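For reference, fluid.layers.py_reader creates a queue-backed reader that fluid.layers.read_file unpacks into input variables; dropping the explicit name=unique_name.generate(...) arguments lets each reader take an auto-generated default name. A minimal hedged sketch of the legacy-fluid pattern this test exercises:

import paddle
import paddle.fluid as fluid

paddle.enable_static()
py_reader = fluid.layers.py_reader(capacity=64,
                                   shapes=[[-1, 32], [-1, 1]],
                                   dtypes=['float32', 'int64'],
                                   use_double_buffer=True)
in_data, label = fluid.layers.read_file(py_reader)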