diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index a6ca78174d837eb573068619a64e8747d644e05a..8367607adba06d0841dfd27a80b49a14ce194d92 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -79,12 +79,13 @@ void InterpreterCore::AddFetch(const std::vector<std::string>& fetch_names) {
 }
 
 paddle::framework::FetchList InterpreterCore::Run(
-    const std::vector<framework::Tensor>& feed_tensors) {
+    const std::vector<framework::LoDTensor>& feed_tensors) {
   auto FeedInput = [&] {
     for (size_t i = 0; i < feed_names_.size(); ++i) {
       auto* feed_var = global_scope_->Var(feed_names_[i]);
       auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
       feed_tensor->ShareDataWith(feed_tensors[i]);
+      feed_tensor->set_lod(feed_tensors[i].lod());
     }
   };
 
@@ -495,7 +496,7 @@ void InterpreterCore::CheckGC(const Instruction& instr) {
 }
 
 void InterpreterCore::DryRunPrepare(
-    const std::vector<framework::Tensor>& feed_tensors) {
+    const std::vector<framework::LoDTensor>& feed_tensors) {
   auto FeedInput = [&] {
     for (size_t i = 0; i < feed_names_.size(); ++i) {
       auto* feed_var = global_scope_->FindVar(feed_names_[i]);
@@ -504,6 +505,7 @@ void InterpreterCore::DryRunPrepare(
 
       auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
       feed_tensor->ShareDataWith(feed_tensors[i]);
+      feed_tensor->set_lod(feed_tensors[i].lod());
     }
   };
 
@@ -525,7 +527,7 @@ void InterpreterCore::DryRunPrepare(
 }
 
 const CostInfo& InterpreterCore::DryRun(
-    const std::vector<framework::Tensor>& feed_tensors) {
+    const std::vector<framework::LoDTensor>& feed_tensors) {
   DryRunPrepare(feed_tensors);
   // DryRun may be called many times.
   dry_run_profiler_.Reset();
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index 811843db5292a74e08a4e7ea2942335b2019643b..c91acb7827da89b77a250a9adc0fe3e059dbe2aa 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -46,9 +46,9 @@ class InterpreterCore {
                 const std::vector<std::string>& fetch_names);
 
   paddle::framework::FetchList Run(
-      const std::vector<framework::Tensor>& feed_tensors);
+      const std::vector<framework::LoDTensor>& feed_tensors);
 
-  const CostInfo& DryRun(const std::vector<framework::Tensor>& feed_tensors);
+  const CostInfo& DryRun(const std::vector<framework::LoDTensor>& feed_tensors);
 
  private:
   void Convert();
@@ -65,7 +65,7 @@ class InterpreterCore {
 
   void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
 
-  void DryRunPrepare(const std::vector<framework::Tensor>& feed_tensors);
+  void DryRunPrepare(const std::vector<framework::LoDTensor>& feed_tensors);
 
   void CheckGC(const Instruction& instr);
 
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc
index 61d1462053f4a32c5d0f3600e65fbc459ccaf39d..32e26f795a2cff9034586bfc4fa5cbc2aee916ce 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -287,7 +287,7 @@ void build_op_func_list(const platform::Place& place,
       for (size_t i = 0; i < var_name_item.second.size(); ++i) {
         auto var = var_name_item.second[i];
         auto& var_name = inputs_names[var_name_item.first].at(i);
-        auto tensor_in = static_cast<const Tensor*>(&(var->Get<LoDTensor>()));
+        auto tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
         if (!tensor_in->IsInitialized()) {
           continue;
         }
@@ -296,7 +296,9 @@ void build_op_func_list(const platform::Place& place,
             ->GetKernelTypeForVar(var_name_item.first, *tensor_in,
                                   expected_kernel_key);
         if (platform::is_same_place(kernel_type_for_var.place_,
-                                    expected_kernel_key.place_)) {
+                                    expected_kernel_key.place_) ||
+            (is_cuda_pinned_place(kernel_type_for_var.place_) &&
+             is_cpu_place(expected_kernel_key.place_))) {
           // record no need data transformer input var_id
           VLOG(3) << op->Type() << " found no data_transform var: " << var_name
                   << " with id: " << var_name;
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index 898c2d3d75e7e31e52088808d8ab51cf6da9eb00..474be9e889d2af446ca8335e08b68328ebfe02eb 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -47,7 +47,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
 
 paddle::framework::FetchList StandaloneExecutor::Run(
     const std::vector<std::string>& feed_names,
-    const std::vector<framework::Tensor>& feed_tensors,
+    const std::vector<framework::LoDTensor>& feed_tensors,
     const std::vector<std::string>& fetch_names) {
   auto core = GetInterpreterCore(feed_names, fetch_names);
 
@@ -56,7 +56,7 @@ paddle::framework::FetchList StandaloneExecutor::Run(
 
 const CostInfo& StandaloneExecutor::DryRun(
     const std::vector<std::string>& feed_names,
-    const std::vector<framework::Tensor>& feed_tensors) {
+    const std::vector<framework::LoDTensor>& feed_tensors) {
   auto core = GetInterpreterCore(feed_names, {});
 
   auto& cost_info = core->DryRun(feed_tensors);
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h
index 600c90e3a11a6aa25db36ff45d3a0bb4a076eea5..ba1c7df45c9d2f6a8823590fe2a8c3f61b6770e2 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.h
+++ b/paddle/fluid/framework/new_executor/standalone_executor.h
@@ -28,7 +28,7 @@ class ExecutorBase {
   virtual ~ExecutorBase() {}
   virtual paddle::framework::FetchList Run(
       const std::vector<std::string>& feed_names,
-      const std::vector<framework::Tensor>& feed_tensors,
+      const std::vector<framework::LoDTensor>& feed_tensors,
       const std::vector<std::string>& fetch_names) = 0;
 };
 
@@ -42,11 +42,11 @@ class StandaloneExecutor : public ExecutorBase {
 
   virtual paddle::framework::FetchList Run(
       const std::vector<std::string>& feed_names,
-      const std::vector<framework::Tensor>& feed_tensors,
+      const std::vector<framework::LoDTensor>& feed_tensors,
       const std::vector<std::string>& fetch_names);
 
   const CostInfo& DryRun(const std::vector<std::string>& feed_names,
-                         const std::vector<framework::Tensor>& feed_tensors);
+                         const std::vector<framework::LoDTensor>& feed_tensors);
 
  private:
   void BuildVariableOuterScope(const framework::ProgramDesc& pdesc,
diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc
index 355e52b9436e6250cd25f44293ca926b8584fc00..bf9874c02f6203a80194345af9a2e0b3894f39c9 100644
--- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc
+++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc
@@ -128,9 +128,12 @@ class FetchV2Kernel {
     if (fetch_var->IsType<framework::LoDTensor>()) {
       auto &src_item = fetch_var->Get<framework::LoDTensor>();
       auto *dst_item = &(BOOST_GET(framework::LoDTensor, fetch_list->at(col)));
-      PADDLE_ENFORCE_EQ(platform::is_cpu_place(src_item.place()), true,
-                        platform::errors::InvalidArgument(
-                            "Tensor's place of input(X) must be CPUPlace."));
+      bool check_place = platform::is_cpu_place(src_item.place()) ||
+                         platform::is_cuda_pinned_place(src_item.place());
+      PADDLE_ENFORCE_EQ(
+          check_place, true,
+          platform::errors::InvalidArgument("Tensor's place of input(X) must "
+                                            "be CPUPlace or CUDAPinnedPlace."));
       if (deepcopy) {
         DeepCopy(src_item, fetch_var_name, dst_item);
       } else {
@@ -188,8 +191,11 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_CPU_KERNEL_FUNCTOR(fetch_v2, float, ops::FetchV2Kernel, double,
-                               ops::FetchV2Kernel, int, ops::FetchV2Kernel,
-                               int64_t, ops::FetchV2Kernel, bool,
-                               ops::FetchV2Kernel, plat::float16,
-                               ops::FetchV2Kernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(
+    fetch_v2, float, ops::FetchV2Kernel, double, ops::FetchV2Kernel, int8_t,
+    ops::FetchV2Kernel, uint8_t, ops::FetchV2Kernel, int, ops::FetchV2Kernel,
+    int64_t, ops::FetchV2Kernel, bool, ops::FetchV2Kernel,
+    paddle::platform::bfloat16, ops::FetchV2Kernel,
+    paddle::platform::complex<float>, ops::FetchV2Kernel,
+    paddle::platform::complex<double>, ops::FetchV2Kernel, plat::float16,
+    ops::FetchV2Kernel, int16_t, ops::FetchV2Kernel);
diff --git a/paddle/fluid/operators/memcpy_d2h_op.cc b/paddle/fluid/operators/memcpy_d2h_op.cc
index 3158b0963a43add3b27de9a1e404a70b5155d975..1eb8d09c783b01253262d15b95b1edd6719e81d8 100644
--- a/paddle/fluid/operators/memcpy_d2h_op.cc
+++ b/paddle/fluid/operators/memcpy_d2h_op.cc
@@ -125,24 +125,33 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_CPU_KERNEL_FUNCTOR(memcpy_d2h, float, ops::MemcpyD2HKernel, double,
-                               ops::MemcpyD2HKernel, int, ops::MemcpyD2HKernel,
-                               int64_t, ops::MemcpyD2HKernel, bool,
-                               ops::MemcpyD2HKernel, plat::float16,
-                               ops::MemcpyD2HKernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(
+    memcpy_d2h, float, ops::MemcpyD2HKernel, double, ops::MemcpyD2HKernel,
+    int8_t, ops::MemcpyD2HKernel, uint8_t, ops::MemcpyD2HKernel, int,
+    ops::MemcpyD2HKernel, int64_t, ops::MemcpyD2HKernel, bool,
+    ops::MemcpyD2HKernel, paddle::platform::bfloat16, ops::MemcpyD2HKernel,
+    paddle::platform::complex<float>, ops::MemcpyD2HKernel,
+    paddle::platform::complex<double>, ops::MemcpyD2HKernel, plat::float16,
+    ops::MemcpyD2HKernel, int16_t, ops::MemcpyD2HKernel);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy_d2h, float, ops::MemcpyD2HKernel, double,
-                                ops::MemcpyD2HKernel, int, ops::MemcpyD2HKernel,
-                                int64_t, ops::MemcpyD2HKernel, bool,
-                                ops::MemcpyD2HKernel, plat::float16,
-                                ops::MemcpyD2HKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(
+    memcpy_d2h, float, ops::MemcpyD2HKernel, double, ops::MemcpyD2HKernel,
+    int8_t, ops::MemcpyD2HKernel, uint8_t, ops::MemcpyD2HKernel, int,
+    ops::MemcpyD2HKernel, int64_t, ops::MemcpyD2HKernel, bool,
+    ops::MemcpyD2HKernel, paddle::platform::bfloat16, ops::MemcpyD2HKernel,
+    paddle::platform::complex<float>, ops::MemcpyD2HKernel,
+    paddle::platform::complex<double>, ops::MemcpyD2HKernel, plat::float16,
+    ops::MemcpyD2HKernel, int16_t, ops::MemcpyD2HKernel);
 #endif
 
 #ifdef PADDLE_WITH_ASCEND_CL
-REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_d2h, float, ops::MemcpyD2HKernel, double,
-                               ops::MemcpyD2HKernel, int, ops::MemcpyD2HKernel,
-                               int64_t, ops::MemcpyD2HKernel, bool,
-                               ops::MemcpyD2HKernel, plat::float16,
-                               ops::MemcpyD2HKernel);
+REGISTER_OP_NPU_KERNEL_FUNCTOR(
+    memcpy_d2h, float, ops::MemcpyD2HKernel, double, ops::MemcpyD2HKernel,
+    int8_t, ops::MemcpyD2HKernel, uint8_t, ops::MemcpyD2HKernel, int,
+    ops::MemcpyD2HKernel, int64_t, ops::MemcpyD2HKernel, bool,
+    ops::MemcpyD2HKernel, paddle::platform::bfloat16, ops::MemcpyD2HKernel,
+    paddle::platform::complex<float>, ops::MemcpyD2HKernel,
+    paddle::platform::complex<double>, ops::MemcpyD2HKernel, plat::float16,
+    ops::MemcpyD2HKernel, int16_t, ops::MemcpyD2HKernel);
 #endif
diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc
index f100dc6f7a53ee122bf5d791e6a20b6e88097e57..0e27ec0dc75b779d03a6f3a9c5625a984c3b33ad 100644
--- a/paddle/fluid/operators/memcpy_h2d_op.cc
+++ b/paddle/fluid/operators/memcpy_h2d_op.cc
@@ -125,24 +125,33 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_CPU_KERNEL_FUNCTOR(memcpy_h2d, float, ops::MemcpyH2DKernel, double,
-                               ops::MemcpyH2DKernel, int, ops::MemcpyH2DKernel,
-                               int64_t, ops::MemcpyH2DKernel, bool,
-                               ops::MemcpyH2DKernel, plat::float16,
-                               ops::MemcpyH2DKernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(
+    memcpy_h2d, float, ops::MemcpyH2DKernel, double, ops::MemcpyH2DKernel,
+    int8_t, ops::MemcpyH2DKernel, uint8_t, ops::MemcpyH2DKernel, int,
+    ops::MemcpyH2DKernel, int64_t, ops::MemcpyH2DKernel, bool,
+    ops::MemcpyH2DKernel, paddle::platform::bfloat16, ops::MemcpyH2DKernel,
+    paddle::platform::complex<float>, ops::MemcpyH2DKernel,
+    paddle::platform::complex<double>, ops::MemcpyH2DKernel, plat::float16,
+    ops::MemcpyH2DKernel, int16_t, ops::MemcpyH2DKernel);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy_h2d, float, ops::MemcpyH2DKernel, double,
-                                ops::MemcpyH2DKernel, int, ops::MemcpyH2DKernel,
-                                int64_t, ops::MemcpyH2DKernel, bool,
-                                ops::MemcpyH2DKernel, plat::float16,
-                                ops::MemcpyH2DKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(
+    memcpy_h2d, float, ops::MemcpyH2DKernel, double, ops::MemcpyH2DKernel,
+    int8_t, ops::MemcpyH2DKernel, uint8_t, ops::MemcpyH2DKernel, int,
+    ops::MemcpyH2DKernel, int64_t, ops::MemcpyH2DKernel, bool,
+    ops::MemcpyH2DKernel, paddle::platform::bfloat16, ops::MemcpyH2DKernel,
+    paddle::platform::complex<float>, ops::MemcpyH2DKernel,
+    paddle::platform::complex<double>, ops::MemcpyH2DKernel, plat::float16,
+    ops::MemcpyH2DKernel, int16_t, ops::MemcpyH2DKernel);
 #endif
 
 #ifdef PADDLE_WITH_ASCEND_CL
-REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_h2d, float, ops::MemcpyH2DKernel, double,
-                               ops::MemcpyH2DKernel, int, ops::MemcpyH2DKernel,
-                               int64_t, ops::MemcpyH2DKernel, bool,
-                               ops::MemcpyH2DKernel, plat::float16,
-                               ops::MemcpyH2DKernel);
+REGISTER_OP_NPU_KERNEL_FUNCTOR(
+    memcpy_h2d, float, ops::MemcpyH2DKernel, double, ops::MemcpyH2DKernel,
+    int8_t, ops::MemcpyH2DKernel, uint8_t, ops::MemcpyH2DKernel, int,
+    ops::MemcpyH2DKernel, int64_t, ops::MemcpyH2DKernel, bool,
+    ops::MemcpyH2DKernel, paddle::platform::bfloat16, ops::MemcpyH2DKernel,
+    paddle::platform::complex<float>, ops::MemcpyH2DKernel,
+    paddle::platform::complex<double>, ops::MemcpyH2DKernel, plat::float16,
+    ops::MemcpyH2DKernel, int16_t, ops::MemcpyH2DKernel);
 #endif
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 2123569704f0bb7770f95c665aa3ded88bf1a414..d79bba7fd2f81e484169972a4bb43cc7dbe393de 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -2046,7 +2046,7 @@ All parameter, weight, gradient are variables in Paddle.
           [](StandaloneExecutor &self,
              const std::unordered_map<std::string, py::array> &input_dict,
              std::vector<std::string> fetch_names) {
-            std::vector<framework::Tensor> feed_tensors;
+            std::vector<framework::LoDTensor> feed_tensors;
             std::vector<std::string> feed_names;
 
             for (auto &item : input_dict) {
@@ -2066,10 +2066,10 @@ All parameter, weight, gradient are variables in Paddle.
           })
      .def("run",
           [](StandaloneExecutor &self,
-             const std::unordered_map<std::string, framework::Tensor>
+             const std::unordered_map<std::string, framework::LoDTensor>
                 &input_dict,
             std::vector<std::string> fetch_names) {
-            std::vector<framework::Tensor> feed_tensors;
+            std::vector<framework::LoDTensor> feed_tensors;
             std::vector<std::string> feed_names;
 
             for (auto &item : input_dict) {
@@ -2087,7 +2087,7 @@ All parameter, weight, gradient are variables in Paddle.
      .def("dry_run",
           [](StandaloneExecutor &self,
              const std::unordered_map<std::string, py::array> &input_dict) {
-            std::vector<framework::Tensor> feed_tensors;
+            std::vector<framework::LoDTensor> feed_tensors;
             std::vector<std::string> feed_names;
 
             for (auto &item : input_dict) {
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 6fba200f54099d608fa9e3f4f2f50a9506d300a3..377a40af7a3d538788d8a5c9ed1e8882a58cfbd3 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -485,10 +485,11 @@ handler = FetchHandlerExample(var_dict=var_dict)
 
 
 class _StandaloneExecutor(object):
-    def __init__(self, place, main_program):
+    def __init__(self, place, main_program, scope):
         self._place = core.Place()
         self._place.set_place(place)
         self._main_program = main_program
+        self._scope = scope
         self._new_exe = self._create_new_executor()
 
     def run(self, feed, fetch_list, return_numpy=True):
@@ -522,9 +523,8 @@ class _StandaloneExecutor(object):
     def _create_new_executor(self):
         # NOTE: It's a trick to set empty start_up program.
         startup_program = Program()
-        outer_scope = global_scope()
         new_exe = core.StandaloneExecutor(self._place, startup_program.desc,
-                                          self._main_program.desc, outer_scope)
+                                          self._main_program.desc, self._scope)
 
         return new_exe
 
@@ -585,11 +585,11 @@ class _ExecutorCache(object):
         self._place = place
         self._cached_executors = {}
 
-    def run(self, program, feed, fetch_list, return_numpy=True):
-        new_exe = self._get_exe_from_cache(program)
+    def run(self, program, scope, feed, fetch_list, return_numpy=True):
+        new_exe = self._get_exe_from_cache(program, scope)
         return new_exe.run(feed, fetch_list, return_numpy)
 
-    def _get_exe_from_cache(self, program):
+    def _get_exe_from_cache(self, program, scope):
         """
         Return cached _StandaloneExecutor instance. If not found, create associated
         _StandaloneExecutor instance with given program and cache it.
@@ -598,7 +598,7 @@ class _ExecutorCache(object):
             program, Program), "Required type(Program), but received {}".format(
                 type(program).__name__)
         if program not in self._cached_executors:
-            new_exe = _StandaloneExecutor(self._place, program)
+            new_exe = _StandaloneExecutor(self._place, program, scope)
             self._cached_executors[program] = new_exe
 
         return self._cached_executors[program]
@@ -1297,7 +1297,7 @@ class Executor(object):
         # NOTE: This is an experimental feature. If `export FLAGS_USE_STANDALONE_EXECUTOR=1 `,
         # use StandaloneExecutor to run the program.
         if self._enable_interpreter_core and not program._is_start_up_program_:
-            return self._executor_cache.run(program, feed, fetch_list,
+            return self._executor_cache.run(program, scope, feed, fetch_list,
                                             return_numpy)
 
         # use_prune can be overrided by putting optimize_ops in fetch_list
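
Usage sketch (illustrative, not part of the patch): a minimal Python snippet that exercises the path changed above, assuming a Paddle 2.x static-graph build where `FLAGS_USE_STANDALONE_EXECUTOR` is read from the environment before the `Executor` is created. The network, feed names, and shapes here are hypothetical.

    # Minimal sketch: exercise the standalone-executor path enabled above.
    import os
    os.environ["FLAGS_USE_STANDALONE_EXECUTOR"] = "1"  # opt in before creating the Executor

    import numpy as np
    import paddle

    paddle.enable_static()

    x = paddle.static.data(name="x", shape=[None, 4], dtype="float32")
    y = paddle.static.nn.fc(x, size=2)

    place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \
        else paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())

    # Executor.run now forwards the scope into _ExecutorCache.run(program,
    # scope, feed, fetch_list); with the set_lod() fix above, any LoD carried
    # by the feed tensors survives into the interpreter's global scope.
    out, = exe.run(paddle.static.default_main_program(),
                   feed={"x": np.random.rand(3, 4).astype("float32")},
                   fetch_list=[y])
    print(out.shape)  # (3, 2)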