diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index c679ae47a365962124c16488d64516d5c0824cc6..6be8aa776a839cb67cc80c74f6d381b72f9a7c25 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -20,6 +20,7 @@
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
 #include "paddle/fluid/framework/new_executor/interpretercore_util.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/os_info.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 #include "paddle/fluid/platform/profiler/supplement_tracing.h"
@@ -28,7 +29,6 @@
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 
 PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
                             false,
@@ -104,7 +104,7 @@ InterpreterCore::~InterpreterCore() {
 interpreter::CostInfo InterpreterCore::DryRun(
     const std::vector<std::string>& feed_names,
     const std::vector<framework::LoDTensor>& feed_tensors) {
-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(place_.device);
   }
@@ -138,14 +138,16 @@ interpreter::CostInfo InterpreterCore::DryRun(
 paddle::framework::FetchList InterpreterCore::Run(
     const std::vector<std::string>& feed_names,
     const std::vector<framework::LoDTensor>& feed_tensors) {
-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(place_.device);
   }
 #endif
+
 #ifdef PADDLE_WITH_MKLDNN
   platform::AttachPointerHashToMKLDNNKey(this, place_);
 #endif
+
   bool is_build = is_build_;
   Prepare(feed_names, feed_tensors, is_build);
 
@@ -180,14 +182,16 @@ paddle::framework::FetchList InterpreterCore::Run(
 
 paddle::framework::FetchList InterpreterCore::Run(
     const std::vector<std::string>& feed_names) {
-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(place_.device);
   }
 #endif
+
 #ifdef PADDLE_WITH_MKLDNN
   platform::AttachPointerHashToMKLDNNKey(this, place_);
 #endif
+
   if (!is_build_) {
     paddle::framework::interpreter::build_variable_scope(
         block_, &var_scope_, create_local_scope_);
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 5b92df7838c61913b59b5b60e629e4f626a5b60f..1ef053bab68f3cd058da9edf30dee39dfa71eef6 100755
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -1615,51 +1615,46 @@ class Executor(object):
         # use StandaloneExecutor to run the program.
         if return_merged and self._enable_interpreter_core and _can_use_interpreter_core(
                 program, self.place):
-            inner_program = program._program if isinstance(
-                program, compiler.CompiledProgram) else program
-            if not inner_program._is_start_up_program_:
-                if feed is None:
-                    feed = {}
-                elif isinstance(feed, (list, tuple)):
-                    assert len(feed) == 1, "Not compiled with data parallel"
-                    feed = feed[0]
-                if not isinstance(feed, dict):
-                    raise TypeError(
-                        "feed requires dict as its Parameter. But you passed in %s"
-                        % (type(feed)))
-                feed = self._update_feed(program, feed)
-
-                program, new_exe = self._executor_cache.get_program_and_executor(
-                    program, feed, fetch_list, feed_var_name, fetch_var_name,
-                    self.place, scope)
-
-                self._feed_data(program, feed, feed_var_name, scope)
-                if hasattr(program, 'lr_sheduler'):
-                    from paddle.optimizer.lr import LRScheduler
-                    assert isinstance(program.lr_sheduler,
-                                      LRScheduler), "must be LRScheduler"
-                    lr_sheduler = program.lr_sheduler
-                    lr_value = lr_sheduler()
-                    lr_var = program.global_block().vars[lr_sheduler._var_name]
-                    data = np.array([lr_value
-                                     ]).astype(convert_dtype(lr_var.dtype))
-                    tensor = core.get_variable_tensor(scope,
-                                                      lr_sheduler._var_name)
-                    # NOTE(dev): `set` always call TensorCopySync that is a
-                    # blocking behavior. So we use `_copy_from` to replace it.
-                    cpu_tensor = _as_lodtensor(data, core.CPUPlace())
-                    # for ipu, tensor is allocated on cpu
-                    if core.is_compiled_with_ipu():
-                        tensor._copy_from(cpu_tensor, tensor._place())
-                    else:
-                        tensor._copy_from(cpu_tensor, self.place)
-                warnings.warn(
-                    "FLAGS_USE_STANDALONE_EXECUTOR is set to 1. New executor is used to execute Program."
-                )
+            if feed is None:
+                feed = {}
+            elif isinstance(feed, (list, tuple)):
+                assert len(feed) == 1, "Not compiled with data parallel"
+                feed = feed[0]
+            if not isinstance(feed, dict):
+                raise TypeError(
+                    "feed requires dict as its Parameter. But you passed in %s"
+                    % (type(feed)))
+            feed = self._update_feed(program, feed)
+
+            program, new_exe = self._executor_cache.get_program_and_executor(
+                program, feed, fetch_list, feed_var_name, fetch_var_name,
+                self.place, scope)
+
+            self._feed_data(program, feed, feed_var_name, scope)
+            if hasattr(program, 'lr_sheduler'):
+                from paddle.optimizer.lr import LRScheduler
+                assert isinstance(program.lr_sheduler,
+                                  LRScheduler), "must be LRScheduler"
+                lr_sheduler = program.lr_sheduler
+                lr_value = lr_sheduler()
+                lr_var = program.global_block().vars[lr_sheduler._var_name]
+                data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype))
+                tensor = core.get_variable_tensor(scope, lr_sheduler._var_name)
+                # NOTE(dev): `tensor.set(data, self.place)` always call TensorCopySync that is a blocking behavior. So we use `_copy_from` to replace it.
+                cpu_tensor = _as_lodtensor(data, core.CPUPlace())
+                # for ipu, tensor is allocated on cpu
+                if core.is_compiled_with_ipu():
+                    tensor._copy_from(cpu_tensor, tensor._place())
+                else:
+                    tensor._copy_from(cpu_tensor, self.place)
+
+            warnings.warn(
+                "FLAGS_USE_STANDALONE_EXECUTOR is set to 1. New executor is used to execute Program."
+            )
 
-                return new_exe.run(scope, list(feed.keys()), fetch_list,
-                                   return_numpy)
+            return new_exe.run(scope, list(feed.keys()), fetch_list,
+                               return_numpy)
 
         compiled = isinstance(program, compiler.CompiledProgram)
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
index 830ade004d3a6cb57f3e48196f1eae8003c2d654..e52b6462bfb54b62bcb9fe53e17d80a723ca5b02 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -95,15 +95,12 @@ def simple_fc_net(in_size,
         py_reader = fluid.layers.create_py_reader_by_data(
             capacity=queue_capacity,
             use_double_buffer=use_double_buffer,
-            feed_list=[in_data, label],
-            name=unique_name.generate('py_reader_name'))
+            feed_list=[in_data, label])
     else:
-        py_reader = fluid.layers.py_reader(
-            capacity=queue_capacity,
-            shapes=[in_data.shape, label.shape],
-            dtypes=['float32', 'int64'],
-            name=unique_name.generate('py_reader_name'),
-            use_double_buffer=use_double_buffer)
+        py_reader = fluid.layers.py_reader(capacity=queue_capacity,
+                                           shapes=[in_data.shape, label.shape],
+                                           dtypes=['float32', 'int64'],
+                                           use_double_buffer=use_double_buffer)
 
     in_data, label = fluid.layers.read_file(py_reader)
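Usage sketch (not part of the patch): the `FLAGS_USE_STANDALONE_EXECUTOR` flag named in the warning above is what routes `Executor.run` through the new executor path touched by this diff; with the `_is_start_up_program_` special case removed, the startup program takes the same path. The toy network, variable names, and shapes below are illustrative assumptions, not code from this change.

```python
# Minimal sketch: enable the standalone (new) executor, then run a toy static
# program. Only FLAGS_USE_STANDALONE_EXECUTOR comes from the diff above; the
# rest of this example is illustrative.
import os
os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1'  # set before the Executor is created

import numpy as np
import paddle

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[None, 4], dtype='float32')
    y = paddle.static.nn.fc(x, size=2)

place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup_prog)  # startup program also goes through the new executor after this change
out, = exe.run(main_prog,
               feed={'x': np.random.rand(8, 4).astype('float32')},
               fetch_list=[y])
print(out.shape)  # (8, 2)
```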