diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index ffcb1b9f3ddbd1997bbdc27bb54a4c8e2dac4f3f..ec5a0362b6ee05309be16cfe7b6fdabb2e1b974f 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -21,11 +21,29 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
                                  VariableScope* global_scope,
                                  const std::vector<std::string>& feed_names,
                                  const std::vector<std::string>& fetch_names)
-    : place_(place), main_program_(main_prog), global_scope_(global_scope) {
+    : place_(place),
+      main_program_(main_prog),
+      global_scope_(global_scope),
+      fetch_context_pool_({place}) {
   is_build_ = false;
   feed_names_ = feed_names;
-  fetch_names_ = fetch_names;
-  // add feedop and fetchop to main_program
+
+  // Step1: add feedop and fetchop to main_program
+  auto* fetch_holder = main_program_.MutableBlock(0)->Var("fetch_vars");
+  fetch_holder->SetType(proto::VarType::FETCH_LIST);
+  fetch_holder->SetPersistable(true);
+
+  int i = 0;
+  for (auto& fetch_name : fetch_names) {
+    // append fetch op
+    auto* op = main_program_.MutableBlock(0)->AppendOp();
+    op->SetType("fetch_v2");
+    op->SetInput("X", {fetch_name});
+    op->SetOutput("Out", {"fetch_vars"});
+    op->SetAttr("col", {static_cast<int>(i)});
+    op->CheckAttrs();
+    i++;
+  }

   // prune

@@ -34,8 +52,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
   // convert to run graph
 }

-void InterpreterCore::Run(const std::vector<framework::Tensor>& feed_tensors,
-                          std::vector<framework::Tensor>* fetch_tensors) {
+paddle::framework::FetchList InterpreterCore::Run(
+    const std::vector<framework::Tensor>& feed_tensors) {
   if (is_build_ == false) {
     BuildVariableScope(main_program_, global_scope_);
   }
@@ -58,32 +76,8 @@ void InterpreterCore::Run(const std::vector<framework::Tensor>& feed_tensors,
     ExecuteInstructionList(vec_instruction_, *global_scope_, place_);
   }

-  for (size_t i = 0; i < fetch_names_.size(); ++i) {
-    auto it = global_scope_->name2id.find(fetch_names_[i]);
-    assert(it != global_scope_->name2id.end());
-    PADDLE_ENFORCE_NE(
-        it, global_scope_->name2id.end(),
-        platform::errors::NotFound(
-            "Can't find (%d) the fetch var (%s) in scope", i, fetch_names_[i]));
-
-    auto fetch_tensor =
-        global_scope_->var_list[it->second]->GetMutable<framework::LoDTensor>();
-
-    if (platform::is_gpu_place(fetch_tensor->place())) {
-      Tensor out;
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
-      auto* dev_ctx = pool.Get(place_);
-      dev_ctx->Wait();
-      TensorCopySync(*fetch_tensor, platform::CPUPlace(), &out);
-      dev_ctx->Wait();
-      fetch_tensors->push_back(out);
-    } else {
-      Tensor out;
-      TensorCopySync(*fetch_tensor, platform::CPUPlace(), &out);
-      fetch_tensors->push_back(out);
-    }
-  }
+  return *(global_scope_->var_list[global_scope_->name2id["fetch_vars"]]
+               ->GetMutable<framework::FetchList>());
 }

 void InterpreterCore::Convert() {
@@ -195,6 +189,9 @@ void InterpreterCore::BuildInstructionCtx(Instruction* instr_node,
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
+  if (instr_node->kernel_func_.operator_base_->Type() == "fetch_v2") {
+    dev_ctx = fetch_context_pool_.Get(place);
+  }
   Scope scope;

   instr_node->execution_ctx_.reset(new ExecutionContext(
@@ -206,6 +203,12 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
       instr_node.kernel_func_.operator_base_)
       ->InferShape(instr_node.infershape_ctx_.get());

+  if (instr_node.kernel_func_.operator_base_->Type() == "fetch_v2") {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* dev_ctx = pool.Get(place_);
+    dev_ctx->Wait();  // TODO(wanghuancoder)
+  }
+
   instr_node.kernel_func_.compute_func_(*instr_node.execution_ctx_.get());
 }

@@ -247,6 +250,8 @@ void InterpreterCore::ExecuteInstructionList(
     }
   }

+  fetch_context_pool_.Get(place)->Wait();
+
   for (size_t i = 0; i < working_var_ref.size(); ++i) {
     if (working_var_ref[i].var_ref_count_ != 0) {
       std::cerr << " var ref is not zero " << i << std::endl;
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index c102916e92b377426e7e4e330d8cb8c396b09472..b4db6923852e130fd2e70e624c44dfef68ba1ae5 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -35,8 +35,8 @@ class InterpreterCore {
                   const std::vector<std::string>& feed_names,
                   const std::vector<std::string>& fetch_names);

-  void Run(const std::vector<framework::Tensor>& feed_tensors,
-           std::vector<framework::Tensor>* fetch_tensors);
+  paddle::framework::FetchList Run(
+      const std::vector<framework::Tensor>& feed_tensors);

   static void BuildOpFuncList(const platform::Place& place,
                               const framework::ProgramDesc& pdesc,
@@ -64,7 +64,7 @@ class InterpreterCore {
                          VariableScope* var_scope);

   const platform::Place& place_;
-  const ProgramDesc& main_program_;
+  ProgramDesc main_program_;
   VariableScope* global_scope_;
   std::vector<VariableMetaInfo> vec_meta_info_;
@@ -80,7 +80,8 @@ class InterpreterCore {
   bool is_build_;

   std::vector<std::string> feed_names_;
-  std::vector<std::string> fetch_names_;
+
+  platform::DeviceContextPool fetch_context_pool_;
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index c312195feb518006543c9627b62956322b51bd76..3c419104907fb5a55f0c80aba5ce0f7391aedc65 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -47,15 +47,13 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
                                 &vec_func_list, &global_scope_);
 }

-int StandaloneExecutor::Run(const std::vector<std::string>& feed_names,
-                            const std::vector<framework::Tensor>& feed_tensors,
-                            const std::vector<std::string>& fetch_names,
-                            std::vector<framework::Tensor>* fetch_tensors) {
+paddle::framework::FetchList StandaloneExecutor::Run(
+    const std::vector<std::string>& feed_names,
+    const std::vector<framework::Tensor>& feed_tensors,
+    const std::vector<std::string>& fetch_names) {
   auto core = GetInterpreterCore(feed_names, fetch_names);

-  core->Run(feed_tensors, fetch_tensors);
-
-  return 0;
+  return core->Run(feed_tensors);
 }

 void StandaloneExecutor::BuildVariableOuterScope(
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h
index 8526f64c6bcfbc3c287b1d5ca0584f0378ed2c64..c8943e29371f6b42e70107dbd11a90fc506d115f 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.h
+++ b/paddle/fluid/framework/new_executor/standalone_executor.h
@@ -26,10 +26,10 @@ namespace framework {
 class ExecutorBase {
  public:
   virtual ~ExecutorBase() {}
-  virtual int Run(const std::vector<std::string>& feed_names,
-                  const std::vector<framework::Tensor>& feed_tensors,
-                  const std::vector<std::string>& fetch_names,
-                  std::vector<framework::Tensor>* fetch_tensors) = 0;
+  virtual paddle::framework::FetchList Run(
+      const std::vector<std::string>& feed_names,
+      const std::vector<framework::Tensor>& feed_tensors,
+      const std::vector<std::string>& fetch_names) = 0;
 };

 class StandaloneExecutor : public ExecutorBase {
@@ -40,10 +40,10 @@ class StandaloneExecutor : public ExecutorBase {

   ~StandaloneExecutor() {}

-  virtual int Run(const std::vector<std::string>& feed_names,
-                  const std::vector<framework::Tensor>& feed_tensors,
-                  const std::vector<std::string>& fetch_names,
-                  std::vector<framework::Tensor>* fetch_tensors);
+  virtual paddle::framework::FetchList Run(
+      const std::vector<std::string>& feed_names,
+      const std::vector<framework::Tensor>& feed_tensors,
+      const std::vector<std::string>& fetch_names);

  private:
   void BuildVariableOuterScope(const framework::ProgramDesc& pdesc,
diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..382f412742e619c8e1b9725d25054cab8b5b9799
--- /dev/null
+++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc
@@ -0,0 +1,210 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+
+#include "paddle/fluid/framework/data_layout_transform.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+class OpDesc;
+class InferShapeContext;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+struct CUDAPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace paddle {
+namespace operators {
+
+static void DataCopy(const framework::LoDTensor &src_item,
+                     const std::string &fetch_var_name,
+                     framework::LoDTensor *dst_item,
+                     const platform::DeviceContext &dev_ctx) {
+  if (src_item.IsInitialized() && src_item.numel() > 0) {
+#ifdef PADDLE_WITH_MKLDNN
+    // Conversion from MKL-DNN to Paddle
+    if (src_item.layout() == framework::DataLayout::kMKLDNN) {
+      framework::Tensor out;
+      // Convert to desired Paddle layout, apart from grads of filter
+      // as params are not a subject to paddle's data_format
+      framework::innerTransDataLayoutFromMKLDNN(
+          src_item.layout(), fetch_var_name == framework::GradVarName("Filter")
+                                 ? framework::DataLayout::kNCHW
+                                 : paddle::platform::MKLDNNDeviceContext::tls()
+                                       .get_cur_paddle_data_layout(),
+          src_item, &out, platform::CPUPlace());
+      TensorCopy(src_item, platform::CPUPlace(), dev_ctx, dst_item);
+    } else {
+      if (platform::is_gpu_place(src_item.place())) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+        TensorCopy(src_item, platform::CUDAPinnedPlace(), dev_ctx, dst_item);
+#endif
+      } else {
+        TensorCopy(src_item, platform::CPUPlace(), dst_item);
+      }
+    }
+#else
+    if (platform::is_gpu_place(src_item.place())) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+      TensorCopy(src_item, platform::CUDAPinnedPlace(), dev_ctx, dst_item);
+#endif
+    } else {
+      TensorCopy(src_item, platform::CPUPlace(), dst_item);
+    }
+#endif
+
+  } else {
+    // Not copy, if the src tensor is empty.
+    dst_item->clear();
+    dst_item->Resize({0});
+  }
+  dst_item->set_lod(src_item.lod());
+}
+
+class FetchV2Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {}
+
+ protected:
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string &var_name, const framework::Tensor &tensor,
+      const framework::OpKernelType &expected_kernel_type) const override {
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   expected_kernel_type.place_,
+                                   tensor.layout());
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
+  }
+};
+
+class FetchV2InferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    ctx->SyncTypeAndDataType("X", "Out");
+  }
+};
+
+class FetchV2Kernel {
+ public:
+  void operator()(const framework::ExecutionContext &ctx) const {
+    auto fetch_var_name = ctx.InputName("X");
+    auto *fetch_var = ctx.InputVar("X");
+    if (fetch_var == nullptr) {
+      return;
+    }
+    PADDLE_ENFORCE_EQ(ctx.HasOutput("Out"), true,
+                      platform::errors::NotFound(
+                          "Output(Out) of memcpy_d2h_op is not found."));
+    auto *out_var = ctx.OutputVar("Out");
+    // Get dev_ctx from ExecutionContext, it's D2H stream
+    auto &dev_ctx = ctx.device_context();
+
+    int col = ctx.Attr<int>("col");
+    PADDLE_ENFORCE_GE(
+        col, 0, platform::errors::InvalidArgument(
+                    "Expected the column index (the attribute 'col' of "
+                    "operator 'Fetch') of current fetching variable to be "
+                    "no less than 0. But received column index = %d.",
+                    col));
+
+    auto *fetch_list = out_var->GetMutable<framework::FetchList>();
+
+    if (static_cast<size_t>(col) >= fetch_list->size()) {
+      fetch_list->resize(col + 1);
+    }
+
+    if (fetch_var->IsType<framework::LoDTensor>()) {
+      auto &src_item = fetch_var->Get<framework::LoDTensor>();
+      auto *dst_item = &(BOOST_GET(framework::LoDTensor, fetch_list->at(col)));
+      DataCopy(src_item, fetch_var_name, dst_item, dev_ctx);
+    } else {
+      auto &src_item = fetch_var->Get<framework::LoDTensorArray>();
+      framework::LoDTensorArray tmp(src_item.size());
+      fetch_list->at(col) = tmp;
+      auto &dst_item =
+          BOOST_GET(framework::LoDTensorArray, fetch_list->at(col));
+      for (size_t i = 0; i < src_item.size(); ++i) {
+        DataCopy(src_item[i], fetch_var_name, &dst_item[i], dev_ctx);
+      }
+    }
+  }
+};
+
+class FetchV2OpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(LoDTensor) The resulted LoDTensor which is expected to return "
+             "to users.");
+    AddOutput("Out",
+              "(vector<LoDTensor>) A fetching list of LoDTensor which may have "
+              "different dimension, shape and data type.");
+    AddAttr<int>("col", "(int) The column index of fetching object.");
+    AddComment(R"DOC(
+FetchV2 Operator.
+
+It should not be configured by users directly.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OPERATOR(
+    fetch_v2, ops::FetchV2Op, ops::FetchV2OpProtoMaker,
+    ops::FetchV2InferVarType,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OP_CPU_KERNEL_FUNCTOR(fetch_v2, float, ops::FetchV2Kernel, double,
+                               ops::FetchV2Kernel, int, ops::FetchV2Kernel,
+                               int64_t, ops::FetchV2Kernel, bool,
+                               ops::FetchV2Kernel, plat::float16,
+                               ops::FetchV2Kernel);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ROCM)
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(fetch_v2, float, ops::FetchV2Kernel, double,
+                                ops::FetchV2Kernel, int, ops::FetchV2Kernel,
+                                int64_t, ops::FetchV2Kernel, bool,
+                                ops::FetchV2Kernel, plat::float16,
+                                ops::FetchV2Kernel);
+#endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+REGISTER_OP_NPU_KERNEL_FUNCTOR(fetch_v2, float, ops::FetchV2Kernel, double,
+                               ops::FetchV2Kernel, int, ops::FetchV2Kernel,
+                               int64_t, ops::FetchV2Kernel, bool,
+                               ops::FetchV2Kernel, plat::float16,
+                               ops::FetchV2Kernel);
+#endif
diff --git a/paddle/fluid/operators/controlflow/unity_build_rule.cmake b/paddle/fluid/operators/controlflow/unity_build_rule.cmake
index f75785bd961c2543a20877d6b68d84471df96f41..690a332d20b4c38a156f6393c5367f5c6f787965 100644
--- a/paddle/fluid/operators/controlflow/unity_build_rule.cmake
+++ b/paddle/fluid/operators/controlflow/unity_build_rule.cmake
@@ -10,6 +10,7 @@ register_unity_group(cc
     conditional_block_infer_op.cc
     feed_op.cc
     fetch_op.cc
+    fetch_v2_op.cc
     get_places_op.cc
     logical_op.cc
     bitwise_op.cc
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 426b539e80c76c9c07bfe164d1b89cf3c5ab85ac..a53bd1f535222804ad84b6464919e8553029bd2f 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1952,7 +1952,6 @@ All parameter, weight, gradient are variables in Paddle.
           [](StandaloneExecutor &self,
              const std::unordered_map<std::string, py::array> &input_dict,
              std::vector<std::string> fetch_names) {
-            pybind11::gil_scoped_release release;
             std::vector<framework::Tensor> feed_tensors;
             std::vector<std::string> feed_names;

@@ -1964,13 +1963,13 @@ All parameter, weight, gradient are variables in Paddle.
               feed_tensors.push_back(t);
             }

-            std::vector<framework::Tensor> fetch_tensors;
-            self.Run(feed_names, feed_tensors, fetch_names, &fetch_tensors);
-            std::vector<py::array> vec_ret;
-            for (size_t i = 0; i < fetch_tensors.size(); ++i) {
-              vec_ret.push_back(TensorToPyArray(fetch_tensors[i], true));
+            paddle::framework::FetchList ret;
+            {
+              pybind11::gil_scoped_release release;
+              ret = self.Run(feed_names, feed_tensors, fetch_names);
             }
-            return vec_ret;
+
+            return py::cast(std::move(ret));
           });

   m.def("init_gflags", framework::InitGflags);
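
Caller-side usage sketch (illustrative only, not taken from the patch above): with this change, StandaloneExecutor::Run no longer fills an output vector of tensors; it returns a paddle::framework::FetchList, and each fetch_v2 op writes its result into that list at the column index given by its "col" attribute. The sketch assumes an already constructed StandaloneExecutor named exec and prepared feed data (the executor's constructor arguments are not shown in this diff and are therefore elided), and it unpacks elements with BOOST_GET, mirroring fetch_v2_op.cc above.

    // Minimal sketch under the assumptions stated above.
    std::vector<std::string> feed_names = {"x"};
    std::vector<paddle::framework::Tensor> feed_tensors = {/* prepared elsewhere */};
    std::vector<std::string> fetch_names = {"out"};

    // Run now returns the fetch results directly instead of writing through an
    // output parameter; the returned list is what the fetch_v2 ops stored in
    // the "fetch_vars" variable.
    paddle::framework::FetchList fetch_list =
        exec.Run(feed_names, feed_tensors, fetch_names);

    // fetch_list.at(i) holds the result for fetch_names[i], either a LoDTensor
    // or a LoDTensorArray.
    auto &out = BOOST_GET(paddle::framework::LoDTensor, fetch_list.at(0));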