diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 950756c0394a5e6e851644e431b990418abffaaa..aea9ad20353966f3b9491d85129bfd62269cfcb0 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -418,15 +418,16 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
     VLOG(4) << "Run pten kernel: " << op->Type();
     VLOG(4) << instr_node.InnerRuntimeContext().get() << " "
             << &instr_node.DeviceContext();
+    pten::KernelContext pt_kernel_context;
     op_with_kernel->BuildPtenKernelContext(
         *instr_node.InnerRuntimeContext().get(),
-        const_cast<platform::DeviceContext*>(&instr_node.DeviceContext()));
+        const_cast<platform::DeviceContext*>(&instr_node.DeviceContext()),
+        &pt_kernel_context);
 
-    (*instr_node.PtenKernel())(instr_node.PtenKernelContext());
+    (*instr_node.PtenKernel())(&pt_kernel_context);
 
     op_with_kernel->WriteBackToOutputs(
-        instr_node.InnerRuntimeContext().get());
-    instr_node.PtenKernelContext()->ClearData();
+        instr_node.InnerRuntimeContext().get(), &pt_kernel_context);
   } else {
     instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get());
   }
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc
index 7ced4853c2d8ffa0a1d35fa58f5e8bd29c685a9f..214a1d728266b03d122ffc5fdf36d4617612f22b 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -425,13 +425,14 @@ void build_op_func_list(const platform::Place& place,
       }
 
       if (run_pten_kernel) {
-        op_with_kernel->BuildPtenKernelContext(runtime_context, dev_ctx);
+        pten::KernelContext pt_kernel_context;
+        op_with_kernel->BuildPtenKernelContext(runtime_context, dev_ctx,
+                                               &pt_kernel_context);
         op_func_node.pt_kernel_ = op_with_kernel->PtenKernel();
-        op_func_node.pt_kernel_context_ = op_with_kernel->PtenKernelContext();
 
-        (*op_func_node.pt_kernel_)(op_func_node.pt_kernel_context_);
-        op_with_kernel->WriteBackToOutputs(&runtime_context);
-        op_func_node.pt_kernel_context_->ClearData();
+        (*op_func_node.pt_kernel_)(&pt_kernel_context);
+        op_with_kernel->WriteBackToOutputs(&runtime_context,
+                                           &pt_kernel_context);
       } else {
         op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second);
         op_func_node.kernel_func_(exec_ctx);
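The same pattern recurs at every call site in this patch: rather than reusing a KernelContext cached on the operator and clearing it after each run, a fresh context now lives on the stack for exactly one kernel invocation and is destroyed when it goes out of scope. A minimal self-contained sketch of that lifetime change; every type and name below is an illustrative stand-in, not the real Paddle API:

```cpp
#include <vector>

// Toy stand-ins for pten::KernelContext and a pten kernel.
struct KernelContext {
  std::vector<int> inputs;
  std::vector<int> outputs;
};
using KernelFn = void (*)(KernelContext*);

void AddOneKernel(KernelContext* ctx) {
  ctx->outputs.push_back(ctx->inputs.at(0) + 1);
}

// After this patch: the context is a stack local scoped to one invocation,
// so there is no ClearData()-style manual cleanup and nothing to share.
int RunOnce(KernelFn kernel, int input) {
  KernelContext ctx;            // fresh context per run
  ctx.inputs.push_back(input);  // plays the role of BuildPtenKernelContext
  kernel(&ctx);                 // plays the role of (*pt_kernel_)(&ctx)
  return ctx.outputs.at(0);     // plays the role of WriteBackToOutputs
}                               // ctx destroyed here

int main() { return RunOnce(AddOneKernel, 41) == 42 ? 0 : 1; }
```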
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc
index 654746794da4edd720f597eaf152b4c4e3dd0c33..fb29e18887b4ee74b448323fb1d14409212e9f71 100644
--- a/paddle/fluid/framework/new_executor/new_executor_defs.cc
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc
@@ -688,10 +688,6 @@ pten::Kernel* Instruction::PtenKernel() const {
   return op_func_node_.pt_kernel_;
 }
 
-pten::KernelContext* Instruction::PtenKernelContext() const {
-  return op_func_node_.pt_kernel_context_;
-}
-
 OpFuncType Instruction::KernelType() const { return op_func_node_.type_; }
 
 OperatorBase* Instruction::OpBase() const {
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h
index 5d63eb33d424bd7f80e0b304e07a89212ba88ada..0ef85a25a237b5b97f4bba32dc28a436a5336174 100644
--- a/paddle/fluid/framework/new_executor/new_executor_defs.h
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -299,8 +299,7 @@ struct OpFuncNode {
   platform::DeviceContext* dev_ctx_;  // not owned
 
   // fit for pten kernel
-  pten::Kernel* pt_kernel_{nullptr};                  // not owned
-  pten::KernelContext* pt_kernel_context_{nullptr};   // not onwed
+  pten::Kernel* pt_kernel_{nullptr};  // not owned
 
   OpFuncType type_;
 };
@@ -322,8 +321,6 @@ class Instruction {
 
   pten::Kernel* PtenKernel() const;
 
-  pten::KernelContext* PtenKernelContext() const;
-
   OpFuncType KernelType() const;
 
   OperatorBase* OpBase() const;
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index aa21c8eed256b902f1bacb8a65429f87a713021a..ff12edb72c06ad21da7ff28849a054718b99d8fc 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1192,13 +1192,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     platform::RecordEvent record_event("compute",
                                        platform::EventRole::kInnerOp);
     if (run_pten_kernel_) {
-      if (pt_kernel_context_ == nullptr) {
-        pt_kernel_context_.reset(new pten::KernelContext());
-      }
-      BuildPtenKernelContext(*runtime_ctx, dev_ctx);
-      (*pt_kernel_)(pt_kernel_context_.get());
-      WriteBackToOutputs(runtime_ctx);
-      pt_kernel_context_->ClearData();
+      pten::KernelContext pt_kernel_context;
+      BuildPtenKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context);
+      (*pt_kernel_)(&pt_kernel_context);
+      WriteBackToOutputs(runtime_ctx, &pt_kernel_context);
     } else {
       (*kernel_func_)(
           ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx));
@@ -1791,18 +1788,9 @@ KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs(
 }
 
 void OperatorWithKernel::BuildPtenKernelContext(
-    const RuntimeContext& ctx, platform::DeviceContext* dev_ctx) const {
-  if (pt_kernel_context_ == nullptr) {
-    pt_kernel_context_.reset(new pten::KernelContext());
-  }
-  // TODO(chenweihang): now only work for very simple case,
-  // many cases need to be deal with later:
-  // 1. the input and output are not tensor
-  // 2. the dispensbale, duplicable input and output
-  // 3. needless attributes remove
-  // 4. use pt Tensor directly
-  // 5. kernel input is not DenseTensor
-  pt_kernel_context_->SetDeviceContext(dev_ctx);
+    const RuntimeContext& ctx, platform::DeviceContext* dev_ctx,
+    pten::KernelContext* pt_kernel_context) const {
+  pt_kernel_context->SetDeviceContext(dev_ctx);
 
   auto& input_names = std::get<0>(pt_kernel_signature_->args);
   auto& attr_names = std::get<1>(pt_kernel_signature_->args);
@@ -1836,33 +1824,14 @@ void OperatorWithKernel::BuildPtenKernelContext(
 
     // calcute the start and end index of the input tensors
     size_t start_idx =
-        (i == 0 ? 0 : pt_kernel_context_->InputRangeAt(i - 1).second);
+        (i == 0 ? 0 : pt_kernel_context->InputRangeAt(i - 1).second);
     size_t end_idx = start_idx + ins_vector.size();
-    auto current_vector_size = pt_kernel_context_->InputsSize();
 
-    // If the memory needed is less than the current memory allocated, we will
-    // reuse the current memory by using ReMakePtenDenseTensorFromVar.
-    // Otherwise,we will create new storage.
     for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
-      if (current_vector_size > start_idx + offset) {
-        auto& input_ptr =
-            pt_kernel_context_->MutableInputPtrAt(start_idx + offset);
-        if (input_ptr == nullptr) {
-          input_ptr = experimental::MakePtenTensorBaseFromVar(
-              *ins_vector[offset], in_def);
-        } else {
-          experimental::ReMakePtenDenseTensorFromVar(
-              *ins_vector[offset], in_def,
-              pt_kernel_context_->MutableInputAt<pten::DenseTensor>(start_idx +
-                                                                    offset));
-        }
-      } else {
-        pt_kernel_context_->EmplaceBackInputWithoutSetRange(
-            experimental::MakePtenTensorBaseFromVar(*ins_vector[offset],
-                                                    in_def));
-      }
+      pt_kernel_context->EmplaceBackInputWithoutSetRange(
+          experimental::MakePtenTensorBaseFromVar(*ins_vector[offset], in_def));
     }
-    pt_kernel_context_->AssignInputRange(std::make_pair(start_idx, end_idx), i);
+    pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
   }
 
   for (size_t i = 0; i < output_names.size(); ++i) {
@@ -1870,43 +1839,24 @@ void OperatorWithKernel::BuildPtenKernelContext(
     auto& outs_vector = ctx.outputs.at(output_names[i]);
 
     size_t start_idx =
-        (i == 0 ? 0 : pt_kernel_context_->OutputRangeAt(i - 1).second);
+        (i == 0 ? 0 : pt_kernel_context->OutputRangeAt(i - 1).second);
     size_t end_idx = start_idx + outs_vector.size();
-    auto current_vector_size = pt_kernel_context_->OutputsSize();
 
-    // If the memory needed is less than the current memory allocated, we will
-    // reuse the current memory by using ReMakePtenDenseTensorFromVar.
-    // Otherwise,we will create new storage.
     for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
-      if (current_vector_size > start_idx + offset) {
-        auto* buffer_tensor =
-            pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(start_idx +
-                                                                   offset);
-        if (buffer_tensor) {
-          experimental::ReMakePtenDenseTensorFromVar(outs_vector[offset],
-                                                     out_def, buffer_tensor);
-        }
-      } else {
-        pt_kernel_context_->EmplaceBackOutputWithoutSetRange(
-            experimental::MakePtenTensorBaseFromVar(outs_vector[offset],
-                                                    out_def));
-      }
+      pt_kernel_context->EmplaceBackOutputWithoutSetRange(
+          experimental::MakePtenTensorBaseFromVar(outs_vector[offset],
+                                                  out_def));
     }
 
     // Deal with the case that some outputs are NULL when run the kernel.
     // For example : the outputs of matmul_grad are dx and dy,
    // sometimes dx or dy may be NULL.
     if (outs_vector.empty()) {
-      if (current_vector_size > start_idx) {
-        pt_kernel_context_->SetOutputWithoutSetRange(start_idx, {nullptr});
-      } else {
-        pt_kernel_context_->EmplaceBackOutputWithoutSetRange({nullptr});
-      }
+      pt_kernel_context->EmplaceBackOutputWithoutSetRange({nullptr});
       end_idx = start_idx + 1;
     }
-    pt_kernel_context_->AssignOutputRange(std::make_pair(start_idx, end_idx),
-                                          i);
+    pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
   }
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
@@ -1915,11 +1865,11 @@ void OperatorWithKernel::BuildPtenKernelContext(
       if (attr_iter != Attrs().end()) {  // shape is in the attribute
         if (std::type_index(attr_iter->second.type()) ==
             std::type_index(typeid(std::vector<int64_t>))) {
-          pt_kernel_context_->EmplaceBackAttr(std::move(pten::ScalarArray(
+          pt_kernel_context->EmplaceBackAttr(std::move(pten::ScalarArray(
               BOOST_GET_CONST(std::vector<int64_t>, attr_iter->second))));
         } else if (std::type_index(attr_iter->second.type()) ==
                    std::type_index(typeid(std::vector<int32_t>))) {
-          pt_kernel_context_->EmplaceBackAttr(std::move(pten::ScalarArray(
+          pt_kernel_context->EmplaceBackAttr(std::move(pten::ScalarArray(
               BOOST_GET_CONST(std::vector<int32_t>, attr_iter->second))));
         } else {
           PADDLE_THROW(platform::errors::Unimplemented(
@@ -1930,10 +1880,10 @@ void OperatorWithKernel::BuildPtenKernelContext(
       } else {  // shape is in the input
         auto& ins_vector = ctx.inputs.at(attr_names[i]);
         if (ins_vector.size() == 1) {  // ShapeTensor
-          pt_kernel_context_->EmplaceBackAttr(std::move(
+          pt_kernel_context->EmplaceBackAttr(std::move(
               experimental::MakePtenScalarArrayFromVar(*ins_vector.front())));
         } else {  // ShapeTensorList
-          pt_kernel_context_->EmplaceBackAttr(std::move(
+          pt_kernel_context->EmplaceBackAttr(std::move(
               experimental::MakePtenScalarArrayFromVarList(ins_vector)));
         }
       }
@@ -1946,11 +1896,11 @@ void OperatorWithKernel::BuildPtenKernelContext(
       if (attr_iter != Attrs().end()) {  // scalar is in the attribute
         auto& attr = Attrs().at(attr_names[i]);
         if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
-          pt_kernel_context_->EmplaceBackAttr(
+          pt_kernel_context->EmplaceBackAttr(
               std::move(pten::Scalar(BOOST_GET_CONST(float, attr))));
         } else if (std::type_index(attr.type()) ==
                    std::type_index(typeid(std::string))) {
-          pt_kernel_context_->EmplaceBackAttr(
+          pt_kernel_context->EmplaceBackAttr(
               std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr))));
         } else {
           PADDLE_THROW(platform::errors::Unimplemented(
@@ -1960,7 +1910,7 @@ void OperatorWithKernel::BuildPtenKernelContext(
         }
       } else {
         auto& ins_vector = ctx.inputs.at(attr_names[i]);
-        pt_kernel_context_->EmplaceBackAttr(std::move(
+        pt_kernel_context->EmplaceBackAttr(std::move(
             experimental::MakePtenScalarFromVar(*ins_vector.front())));
       }
 
@@ -1968,17 +1918,17 @@ void OperatorWithKernel::BuildPtenKernelContext(
       // TODO(chenweihang): support other attrs later
       auto& attr = Attrs().at(attr_names[i]);
       if (attr_defs[i].type_index == std::type_index(typeid(int))) {
-        pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(int, attr));
+        pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int, attr));
       } else if (attr_defs[i].type_index == std::type_index(typeid(float))) {
-        pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(float, attr));
+        pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(float, attr));
       } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
-        pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
+        pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
       } else if (attr_defs[i].type_index ==
                  std::type_index(typeid(pten::DataType))) {
         auto data_type = pten::TransToPtenDataType(
             static_cast<framework::proto::VarType::Type>(
                 BOOST_GET_CONST(int, attr)));
-        pt_kernel_context_->EmplaceBackAttr(data_type);
+        pt_kernel_context->EmplaceBackAttr(data_type);
       } else if (attr_defs[i].type_index ==
                  std::type_index(typeid(std::vector<int64_t>))) {
         if (std::type_index(attr.type()) ==
@@ -1987,7 +1937,7 @@ void OperatorWithKernel::BuildPtenKernelContext(
           const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
           const std::vector<int64_t> vector_int64_attr(vector_int_attr.begin(),
                                                        vector_int_attr.end());
-          pt_kernel_context_->EmplaceBackAttr(vector_int64_attr);
+          pt_kernel_context->EmplaceBackAttr(vector_int64_attr);
         }
         // TODO(YuanRisheng) Need support vector<int64_t> attr
@@ -2001,20 +1951,16 @@
   }
 }
 
-void OperatorWithKernel::WriteBackToOutputs(RuntimeContext* ctx) const {
-  // auto& input_names = std::get<0>(pt_kernel_signature_->args);
-  // auto& attr_names = std::get<1>(pt_kernel_signature_->args);
+void OperatorWithKernel::WriteBackToOutputs(
+    RuntimeContext* ctx, pten::KernelContext* pt_kernel_context) const {
   auto& output_names = std::get<2>(pt_kernel_signature_->args);
 
-  // pt_kernel_context_
-
   for (size_t i = 0; i < output_names.size(); ++i) {
     auto& outs_vector = ctx->outputs.at(output_names[i]);
 
-    auto& range_pair = pt_kernel_context_->OutputRangeAt(i);
-    auto pten_outs =
-        pt_kernel_context_->MutableOutputBetween<pten::DenseTensor>(
-            range_pair.first, range_pair.second);
+    auto& range_pair = pt_kernel_context->OutputRangeAt(i);
+    auto pten_outs = pt_kernel_context->MutableOutputBetween<pten::DenseTensor>(
+        range_pair.first, range_pair.second);
 
     for (size_t j = 0; j < pten_outs.size(); ++j) {
       if (pten_outs[j]) {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 12946b416cf9f29f37de2d74d9a9db128e6a00db..3aab9165eae0afb1cc343874ccd5a699921f9ffa 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -589,16 +589,14 @@ class OperatorWithKernel : public OperatorBase {
   void ChoosePtenKernel(const ExecutionContext& ctx) const;
 
   void BuildPtenKernelContext(const RuntimeContext& ctx,
-                              platform::DeviceContext* dev_ctx) const;
+                              platform::DeviceContext* dev_ctx,
+                              pten::KernelContext* pt_kernel_context) const;
 
-  void WriteBackToOutputs(RuntimeContext* ctx) const;
+  void WriteBackToOutputs(RuntimeContext* ctx,
+                          pten::KernelContext* pt_kernel_context) const;
 
   pten::Kernel* PtenKernel() const { return pt_kernel_.get(); }
 
-  pten::KernelContext* PtenKernelContext() const {
-    return pt_kernel_context_.get();
-  }
-
   const OpKernelType* kernel_type() const { return kernel_type_.get(); }
 
  private:
@@ -657,9 +655,6 @@ class OperatorWithKernel : public OperatorBase {
   mutable bool run_pten_kernel_ = false;
   mutable std::unique_ptr<KernelSignature> pt_kernel_signature_;
   mutable std::unique_ptr<pten::Kernel> pt_kernel_;
-  // In order to reduce the compatibility phase
-  // performance overhead, temporarily cache KernelContext
-  mutable std::unique_ptr<pten::KernelContext> pt_kernel_context_;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
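With the cached context gone, the reuse branches deleted above (calling ReMakePtenDenseTensorFromVar whenever the previously allocated slot count covered the new index) are dead code: a freshly constructed context always starts empty, so only the emplace path survives, plus the [start, end) range bookkeeping per parameter, including the {nullptr} placeholder for absent outputs. A toy model of that bookkeeping; the types below only mimic the shape of the real KernelContext API and are hypothetical:

```cpp
#include <cstddef>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct Tensor { std::string name; };  // stand-in for pten::DenseTensor

struct ToyKernelContext {
  std::vector<std::shared_ptr<Tensor>> outputs;
  std::vector<std::pair<size_t, size_t>> output_ranges;  // [start, end) per param

  void EmplaceBackOutput(std::shared_ptr<Tensor> t) {
    outputs.push_back(std::move(t));
  }
  void AssignOutputRange(std::pair<size_t, size_t> range, size_t i) {
    if (output_ranges.size() <= i) output_ranges.resize(i + 1);
    output_ranges[i] = range;
  }
};

// Mirrors the surviving output loop: emplace every tensor of parameter i,
// then record its range; an empty parameter gets one {nullptr} slot.
void BuildOutputs(const std::vector<std::vector<std::string>>& out_vars,
                  ToyKernelContext* ctx) {
  for (size_t i = 0; i < out_vars.size(); ++i) {
    size_t start = (i == 0 ? 0 : ctx->output_ranges[i - 1].second);
    size_t end = start + out_vars[i].size();
    for (const auto& name : out_vars[i]) {
      ctx->EmplaceBackOutput(std::make_shared<Tensor>(Tensor{name}));
    }
    if (out_vars[i].empty()) {  // e.g. matmul_grad when dx or dy is absent
      ctx->EmplaceBackOutput(nullptr);
      end = start + 1;
    }
    ctx->AssignOutputRange({start, end}, i);
  }
}

int main() {
  ToyKernelContext ctx;
  BuildOutputs({{"dx"}, {}}, &ctx);  // second output parameter is NULL
  return (ctx.outputs.size() == 2 && ctx.outputs[1] == nullptr) ? 0 : 1;
}
```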
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index d8ee400e35082b857937e59afdc16d28423f4bc1..cc7fcf455a13d7b1a677d1bbad3c56144811dbff 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -409,8 +409,6 @@ void VarBase::_CopyGradientFrom(const VarBase& src) {
   }
 }
 
-pten::KernelContext OpBase::pt_kernel_context_;
-
 void OpBase::SetType(const std::string& type) {
   op_ = framework::OpRegistry::CreateOp(type, {}, {}, {}, false);
 }
@@ -426,8 +424,7 @@ static void OpBaseRunImpl(const framework::OperatorBase& op,
                           const NameVarMap<VarType>& outs,
                           const framework::AttributeMap& attrs,
                           const framework::AttributeMap& default_attrs,
-                          const platform::Place& place,
-                          pten::KernelContext* pt_kernel_context) {
+                          const platform::Place& place) {
   auto* op_kernel = dynamic_cast<const framework::OperatorWithKernel*>(&op);
   PADDLE_ENFORCE_NOT_NULL(
       op_kernel, platform::errors::PermissionDenied(
@@ -468,8 +465,8 @@ static void OpBaseRunImpl(const framework::OperatorBase& op,
    * after the execution of op, but the original input is directly
    * overwritten in the previous dynamic graph implemention.
    */
-  auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs,
-                                         default_attrs, pt_kernel_context);
+  auto prepared_op =
+      PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs, default_attrs);
   auto tmp_ins_ptr =
       PrepareData<VarType>(*op_kernel, ins, prepared_op.kernel_type());
   if (tmp_ins_ptr == nullptr) {
@@ -497,8 +494,7 @@ void OpBase::Run(const framework::OperatorBase& op,
                  const framework::AttributeMap& attrs,
                  const framework::AttributeMap& default_attrs,
                  const platform::Place& place) {
-  OpBaseRunImpl<VarBase>(op, ins, outs, attrs, default_attrs, place,
-                         &pt_kernel_context_);
+  OpBaseRunImpl<VarBase>(op, ins, outs, attrs, default_attrs, place);
 }
 
 void OpBase::Run(const framework::OperatorBase& op,
@@ -507,8 +503,7 @@ void OpBase::Run(const framework::OperatorBase& op,
                  const framework::AttributeMap& attrs,
                  const framework::AttributeMap& default_attrs,
                  const platform::Place& place) {
-  OpBaseRunImpl<VariableWrapper>(op, ins, outs, attrs, default_attrs, place,
-                                 &pt_kernel_context_);
+  OpBaseRunImpl<VariableWrapper>(op, ins, outs, attrs, default_attrs, place);
 }
 
 void ClearNoNeedBufferInputs(OpBase* op) {
diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h
index cb76a823532828a3b966638671c67258931493c7..3d0847605566b0f6df63eb48db4cb38df18dc8da 100644
--- a/paddle/fluid/imperative/op_base.h
+++ b/paddle/fluid/imperative/op_base.h
@@ -183,8 +183,6 @@ class OpBase {
                   const framework::AttributeMap& default_attrs,
                   const platform::Place& place);
 
-  static pten::KernelContext* GetKernelContext() { return &pt_kernel_context_; }
-
   bool HasVoidFunctionPostHook() const {
     return !void_function_post_hooks_.empty();
   }
@@ -212,9 +210,6 @@ class OpBase {
   std::unique_ptr<framework::OperatorBase> op_;
   platform::Place place_;
   size_t id_{-1UL};
-  // In order to reduce the compatibility phase
-  // performance overhead, temporarily cache KernelContext
-  static pten::KernelContext pt_kernel_context_;
   std::vector<std::shared_ptr<std::function<void()>>>
       void_function_post_hooks_;
 };
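The static OpBase::pt_kernel_context_ deleted here was a single object shared by every dynamic-graph op in the process, which is exactly what made tracing unsafe off the main thread: two threads running ops concurrently mutated the same context. A deliberately tiny illustration of the difference; a plain std::vector stands in for the context, and none of this is Paddle code:

```cpp
#include <thread>
#include <vector>

// Old scheme: one process-wide context mutated by every op run. Two
// threads calling into it concurrently would race on this object.
std::vector<int> shared_ctx;  // illustrative only, deliberately unused

// New scheme: the context lives on the calling thread's stack, so runs
// on different threads touch disjoint state.
void RunOpThreadSafe(int x, std::vector<int>* out) {
  std::vector<int> local_ctx;  // per-run context
  local_ctx.push_back(x);
  *out = local_ctx;            // no cross-thread sharing, no race
}

int main() {
  std::vector<int> a, b;
  std::thread t1(RunOpThreadSafe, 1, &a);
  std::thread t2(RunOpThreadSafe, 2, &b);
  t1.join();
  t2.join();
  // Had both threads appended to shared_ctx instead, the result would
  // depend on interleaving; with per-run contexts each is independent.
  return (a.size() == 1 && b.size() == 1) ? 0 : 1;
}
```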
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 46e974c8f43f375a32abc64592df9fd28b69741f..15a278c2e6464321f6a80f0a5e271532d7468859 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -117,7 +117,6 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
                        const framework::OpKernelType& kernel_type,
                        const framework::KernelSignature& kernel_signature,
                        const pten::Kernel& pt_kernel,
-                       pten::KernelContext* pt_kernel_context,
                        platform::DeviceContext* dev_ctx)
     : op_(op),
       ctx_(ctx),
@@ -126,8 +125,7 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
       dev_ctx_(dev_ctx),
       run_pten_kernel_(true),
       pt_kernel_signature_(kernel_signature),
-      pt_kernel_(pt_kernel),
-      pt_kernel_context_(pt_kernel_context) {}
+      pt_kernel_(pt_kernel) {}
 
 template <typename VarType>
 PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
@@ -135,8 +133,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
                        const framework::OperatorWithKernel& op,
                        const platform::Place& place,
                        const framework::AttributeMap& attrs,
-                       const framework::AttributeMap& default_attrs,
-                       pten::KernelContext* pt_kernel_context) {
+                       const framework::AttributeMap& default_attrs) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
@@ -178,7 +175,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 
     // TODO(chenweihang): using CPUKernel when miss device kernel case
     return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature,
-                      pt_kernel, pt_kernel_context, dev_ctx);
+                      pt_kernel, dev_ctx);
   } else {
     VLOG(6) << "Dynamic mode ChoosePtenKernel - kernel `" << pt_kernel_name
             << "` not found.";
@@ -247,10 +244,8 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VarBase>& ins,
                                const framework::OperatorWithKernel& op,
                                const platform::Place& place,
                                const framework::AttributeMap& attrs,
-                               const framework::AttributeMap& default_attrs,
-                               pten::KernelContext* pt_kernel_context) {
-  return PrepareImpl<VarBase>(ins, outs, op, place, attrs, default_attrs,
-                              pt_kernel_context);
+                               const framework::AttributeMap& default_attrs) {
+  return PrepareImpl<VarBase>(ins, outs, op, place, attrs, default_attrs);
 }
 
 PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
@@ -258,10 +253,9 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
                                const framework::OperatorWithKernel& op,
                                const platform::Place& place,
                                const framework::AttributeMap& attrs,
-                               const framework::AttributeMap& default_attrs,
-                               pten::KernelContext* pt_kernel_context) {
+                               const framework::AttributeMap& default_attrs) {
   return PrepareImpl<VariableWrapper>(ins, outs, op, place, attrs,
-                                      default_attrs, pt_kernel_context);
+                                      default_attrs);
 }
 
 template <typename VarType>
@@ -271,13 +265,6 @@ static void BuildDygraphPtenKernelContext(
     const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs,
     const framework::AttributeMap& default_attrs,
     platform::DeviceContext* dev_ctx, pten::KernelContext* kernel_ctx) {
-  // TODO(chenweihang): now only work for very simple case,
-  // many cases need to be deal with later:
-  // 1. the input and output are not tensor
-  // 2. the dispensbale, duplicable input and output
-  // 3. needless attributes remove
-  // 4. use pt Tensor directly
-  // 5. kernel input is not DenseTensor
   kernel_ctx->SetDeviceContext(dev_ctx);
 
   auto& input_names = std::get<0>(pt_kernel_signature.args);
@@ -312,26 +299,11 @@ static void BuildDygraphPtenKernelContext(
     size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second);
     size_t end_idx = start_idx + ins_vector.size();
-    auto current_vector_size = kernel_ctx->InputsSize();
 
-    // If the memory needed is less than the current memory allocated, we will
-    // reuse the current memory by using ReMakePtenDenseTensorFromVar.
-    // Otherwise,we will create new storage.
     for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
       const auto& variable = ins_vector[offset]->Var();
-      if (current_vector_size > start_idx + offset) {
-        auto& input_ptr = kernel_ctx->MutableInputPtrAt(start_idx + offset);
-        if (input_ptr == nullptr) {
-          input_ptr = experimental::MakePtenTensorBaseFromVar(variable, in_def);
-        } else {
-          experimental::ReMakePtenDenseTensorFromVar(
-              variable, in_def, kernel_ctx->MutableInputAt<pten::DenseTensor>(
-                                    start_idx + offset));
-        }
-      } else {
-        kernel_ctx->EmplaceBackInputWithoutSetRange(
-            experimental::MakePtenTensorBaseFromVar(variable, in_def));
-      }
+      kernel_ctx->EmplaceBackInputWithoutSetRange(
+          paddle::experimental::MakePtenTensorBaseFromVar(variable, in_def));
     }
     kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
   }
@@ -340,15 +312,10 @@ static void BuildDygraphPtenKernelContext(
     auto& out_def = output_defs.at(i);
 
     size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second);
-    auto current_vector_size = kernel_ctx->OutputsSize();
 
     auto iter = outs.find(output_names[i]);
     if (iter == outs.end()) {
-      if (current_vector_size > start_idx) {
-        kernel_ctx->SetOutputWithoutSetRange(start_idx, {nullptr});
-      } else {
-        kernel_ctx->EmplaceBackOutputWithoutSetRange({nullptr});
-      }
+      kernel_ctx->EmplaceBackOutputWithoutSetRange({nullptr});
       kernel_ctx->AssignOutputRange(std::make_pair(start_idx, start_idx + 1),
                                     i);
       continue;
@@ -357,27 +324,10 @@ static void BuildDygraphPtenKernelContext(
     auto& outs_vector = iter->second;
     size_t end_idx = start_idx + outs_vector.size();
 
-    // If the memory needed is less than the current memory allocated, we will
-    // reuse the current memory by using ReMakePtenDenseTensorFromVar.
-    // Otherwise,we will create new storage.
     for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
-      if (current_vector_size > start_idx + offset) {
-        auto* buffer_tensor =
-            kernel_ctx->MutableOutputAt<pten::DenseTensor>(start_idx + offset);
-        if (buffer_tensor) {
-          experimental::ReMakePtenDenseTensorFromVar(
-              outs_vector[offset]->MutableVar(), out_def, buffer_tensor);
-        } else {
-          kernel_ctx->SetOutputWithoutSetRange(
-              start_idx + offset,
-              experimental::MakePtenTensorBaseFromVar(
-                  outs_vector[offset]->MutableVar(), out_def));
-        }
-      } else {
-        kernel_ctx->EmplaceBackOutputWithoutSetRange(
-            experimental::MakePtenTensorBaseFromVar(
-                outs_vector[offset]->MutableVar(), out_def));
-      }
+      kernel_ctx->EmplaceBackOutputWithoutSetRange(
+          paddle::experimental::MakePtenTensorBaseFromVar(
+              outs_vector[offset]->MutableVar(), out_def));
     }
     kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
   }
@@ -556,19 +506,20 @@ static void PreparedOpRunPtImpl(
     const framework::OperatorBase& op,
     const framework::OpKernelType& kernel_type,
     const framework::KernelSignature& pt_kernel_signature,
-    const pten::Kernel& pt_kernel, pten::KernelContext* pt_kernel_context,
-    platform::DeviceContext* dev_ctx, const NameVarMap<VarType>& ins,
-    const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs,
+    const pten::Kernel& pt_kernel, platform::DeviceContext* dev_ctx,
+    const NameVarMap<VarType>& ins, const NameVarMap<VarType>& outs,
+    const framework::AttributeMap& attrs,
     const framework::AttributeMap& default_attrs) {
   DygraphInferShapeContext<VarType> infer_shape_ctx(
       &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type);
   op.Info().infer_shape_(&infer_shape_ctx);
 
+  pten::KernelContext pt_kernel_context;
   BuildDygraphPtenKernelContext<VarType>(pt_kernel_signature, pt_kernel, ins,
                                          outs, attrs, default_attrs, dev_ctx,
-                                         pt_kernel_context);
+                                         &pt_kernel_context);
 
-  pt_kernel(pt_kernel_context);
+  pt_kernel(&pt_kernel_context);
 
   if (FLAGS_benchmark) {
     dev_ctx->Wait();
@@ -578,10 +529,7 @@ static void PreparedOpRunPtImpl(
 #endif
   }
 
-  WriteBackToOutputs<VarType>(pt_kernel_signature, outs, pt_kernel_context);
-
-  // Ensure that it does not affect the VarBase life cycle management
-  pt_kernel_context->ClearData();
+  WriteBackToOutputs<VarType>(pt_kernel_signature, outs, &pt_kernel_context);
 
   // TODO(chenweihang): add debug flags later
   if (framework::IsComplexType(kernel_type.data_type_)) {
@@ -595,8 +543,8 @@ void PreparedOp::Run(const NameVarMap<VarBase>& ins,
                      const framework::AttributeMap& default_attrs) {
   if (run_pten_kernel_) {
     PreparedOpRunPtImpl<VarBase>(op_, kernel_type_, pt_kernel_signature_,
-                                 pt_kernel_, pt_kernel_context_, dev_ctx_, ins,
-                                 outs, attrs, default_attrs);
+                                 pt_kernel_, dev_ctx_, ins, outs, attrs,
+                                 default_attrs);
   } else {
     PreparedOpRunImpl<VarBase>(op_, ctx_, kernel_type_, func_, dev_ctx_, ins,
                                outs, attrs, default_attrs);
@@ -609,8 +557,8 @@ void PreparedOp::Run(const NameVarMap<VariableWrapper>& ins,
                      const framework::AttributeMap& default_attrs) {
   if (run_pten_kernel_) {
     PreparedOpRunPtImpl<VariableWrapper>(
-        op_, kernel_type_, pt_kernel_signature_, pt_kernel_, pt_kernel_context_,
-        dev_ctx_, ins, outs, attrs, default_attrs);
+        op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins,
+        outs, attrs, default_attrs);
   } else {
     PreparedOpRunImpl<VariableWrapper>(op_, ctx_, kernel_type_, func_, dev_ctx_,
                                        ins, outs, attrs, default_attrs);
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 29747e79ef6fad9cb714837285c06e2630c0c0be..22f016e2cadc1a3fccf57645252c96bea860eb6a 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -153,25 +153,21 @@ class PreparedOp {
              const framework::RuntimeContext& ctx,
              const framework::OpKernelType& kernel_type,
              const framework::KernelSignature& kernel_signature,
-             const pten::Kernel& pt_kernel,
-             pten::KernelContext* pt_kernel_context,
-             platform::DeviceContext* dev_ctx);
+             const pten::Kernel& pt_kernel, platform::DeviceContext* dev_ctx);
 
   static PreparedOp Prepare(const NameVarMap<VarBase>& ins,
                             const NameVarMap<VarBase>& outs,
                             const framework::OperatorWithKernel& op,
                             const platform::Place& place,
                             const framework::AttributeMap& attrs,
-                            const framework::AttributeMap& default_attrs,
-                            pten::KernelContext* pt_kernel_context = nullptr);
+                            const framework::AttributeMap& default_attrs);
 
   static PreparedOp Prepare(const NameVarMap<VariableWrapper>& ins,
                             const NameVarMap<VariableWrapper>& outs,
                             const framework::OperatorWithKernel& op,
                             const platform::Place& place,
                             const framework::AttributeMap& attrs,
-                            const framework::AttributeMap& default_attrs,
-                            pten::KernelContext* pt_kernel_context = nullptr);
+                            const framework::AttributeMap& default_attrs);
 
   void Run(const NameVarMap<VarBase>& in, const NameVarMap<VarBase>& out,
            const framework::AttributeMap& attrs,
@@ -196,9 +192,6 @@ class PreparedOp {
   bool run_pten_kernel_{false};
   framework::KernelSignature pt_kernel_signature_;
   pten::Kernel pt_kernel_;
-  // In order to reduce the compatibility phase
-  // performance overhead, temporarily cache KernelContext
-  pten::KernelContext* pt_kernel_context_;
 };
 
 }  // namespace imperative
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 682916a9b323b638dfe8c7bfb1e675ab133bab15..7ed9f08906a731559c70d4794242f8e3fc437752 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -231,8 +231,6 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
     OpBase::Run(*op, new_ins, outs, attrs, default_attrs, place);
   } catch (platform::EnforceNotMet& exception) {
     framework::AppendErrorOpHint(type, &exception);
-    // Compatible impl: clear pten kernel context data when throw error
-    OpBase::GetKernelContext()->ClearData();
     throw std::move(exception);
   } catch (std::exception& ex) {
     PADDLE_THROW(platform::errors::Fatal(
diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py
index 10a9358612960cca302f198b1f8eecd78536a9a0..a3e6ea6d1bc78a8426d604d02116d3b2667ae6a6 100644
--- a/python/paddle/fluid/dataloader/dataloader_iter.py
+++ b/python/paddle/fluid/dataloader/dataloader_iter.py
@@ -202,22 +202,6 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
             # APIs in this thread.
             _set_expected_place(legacy_expected_place)
 
-            # NOTE(chenweihang): [ Why need to set not to execute pten kernel here? ]
-            # Now, in order to ensure that the execution performance of the dynamic
-            # graph mode in pten compatible state does not decline significantly,
-            # we have adopted the approach of caching a KernelContext globally for
-            # the dynamic graph tracer to reduce the construction and deconstruction
-            # overhead of data interfaces such as the compatible state DenseTensor.
-            # The static graph is each op caches a KernelContext, but the op of
-            # the dynamic graph will be constructed and destroyed every round of
-            # execution, so it is impossible to cache KernelContext for each op.
-            # However, it is not thread-safe if using only one global kernel context in
-            # dynamic graph. If the pten op of paddle is used in the DataLoader thread,
-            # it may cause access errors. We temporarily do not execute pten kernel
-            # in this scenario and will find a better solution later and remove
-            # this setting.
-            set_flags({'FLAGS_run_pten_kernel': False})
-
             while not self._thread_done_event.is_set():
                 try:
                     indices = next(self._sampler_iter)
@@ -519,9 +503,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
             # APIs in this thread.
             _set_expected_place(legacy_expected_place)
 
-            # NOTE(chenweihang): See Note [ Why need to set not to execute pten kernel here? ]
-            set_flags({'FLAGS_run_pten_kernel': False})
-
            while not self._thread_done_event.is_set():
                batch = self._get_data()
                if not self._thread_done_event.is_set():
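The Python change falls out of the C++ ones: the FLAGS_run_pten_kernel=False guard existed only because the DataLoader worker thread could race the main thread on the tracer's globally cached KernelContext. With a per-invocation context there is nothing shared left to race on, so kernels may run from a loader-style worker thread again. A schematic, self-contained illustration of that usage; the names are hypothetical and none of this is the Paddle runtime:

```cpp
#include <thread>
#include <vector>

// Hypothetical stand-in for running one op with a per-invocation context.
int RunOp(int input) {
  std::vector<int> ctx;  // built, used, and destroyed inside this call
  ctx.push_back(input + 1);
  return ctx.back();
}

int main() {
  std::vector<int> results(4);
  // Stands in for the DataLoader worker thread, which previously had to
  // disable pten kernels; it can now run ops concurrently with the main
  // thread because no KernelContext is shared between them.
  std::thread loader([&] {
    for (int i = 0; i < 2; ++i) results[i] = RunOp(i);
  });
  for (int i = 2; i < 4; ++i) results[i] = RunOp(i);  // "main" thread
  loader.join();
  return (results[0] == 1 && results[3] == 4) ? 0 : 1;
}
```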