From 8dd0a3b935192ab7eb28f4af0ecc7884bdefd594 Mon Sep 17 00:00:00 2001
From: chenjian
Date: Fri, 24 Jun 2022 10:35:29 +0800
Subject: [PATCH] record memory and op supplement info (#43550)

* record memory and op supplement info
* update
* update
* fix a bug
* fix memory recording
* fix a bug
* update
* update
* fix a bug
* update
* fix a bug
* fix a bug
* fix a bug
* Revert "fix a bug"

This reverts commit c1d4df52762ba9ae7c7e27cd2ba4fc3a7ed9c7a5.

* fix a bug
* fix format
* fix
---
 .../framework/new_executor/interpretercore.cc |   6 +
 paddle/fluid/framework/operator.cc            | 357 ++++++---
 paddle/fluid/memory/allocation/CMakeLists.txt |   4 +-
 .../allocation/naive_best_fit_allocator.cc    | 113 +--
 .../memory/allocation/pinned_allocator.cc     |   9 +
 .../fluid/memory/allocation/stat_allocator.h  |  29 +-
 .../fluid/memory/detail/system_allocator.cc   | 126 +++-
 paddle/fluid/memory/memcpy.cc                 | 581 ++++++++++-----
 paddle/fluid/platform/CMakeLists.txt          |  21 +-
 paddle/fluid/platform/device/gpu/gpu_info.cc  | 101 ++-
 paddle/fluid/platform/profiler.cc             | 343 +++++++--
 paddle/fluid/platform/profiler.h              |  41 +-
 paddle/fluid/platform/profiler/CMakeLists.txt |   4 +-
 paddle/fluid/platform/profiler/common_event.h |  80 +-
 paddle/fluid/platform/profiler/host_tracer.cc |  98 ++-
 paddle/fluid/platform/profiler/mem_tracing.h  |  43 ++
 .../fluid/platform/profiler/profiler_test.cc  |  56 +-
 .../platform/profiler/supplement_tracing.h    |  45 ++
 paddle/fluid/pybind/pybind.cc                 | 697 ++++++++++++------
 19 files changed, 2031 insertions(+), 723 deletions(-)
 create mode 100644 paddle/fluid/platform/profiler/mem_tracing.h
 create mode 100644 paddle/fluid/platform/profiler/supplement_tracing.h

diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index dfa2179f44..c61243041a 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -24,6 +24,7 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/os_info.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/fluid/platform/profiler/supplement_tracing.h"
 #include "paddle/phi/core/kernel_context.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
@@ -558,6 +559,11 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
       op_with_kernel->Info().infer_shape_(
           instr_node.InnerInferShapeContext().get());
     }
+    infershape_event.End();
+    platform::RecordOpInfoSupplement(op->Type(),
+                                     op->Attrs(),
+                                     *(instr_node.InnerInferShapeContext()),
+                                     *(instr_node.InnerRuntimeContext()));
   }
 }

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index dbf6bec676..31c3ea7607 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -31,6 +31,7 @@ limitations under the License.
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_context.h" @@ -70,7 +71,8 @@ std::vector> kKernelPriority = { std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), }; -static DDim GetDimsDebug(const ScopeBase& scope, const std::string& name, +static DDim GetDimsDebug(const ScopeBase& scope, + const std::string& name, bool get_actual_dim = false) { Variable* var = scope.FindVar(name); if (var == nullptr) { @@ -264,7 +266,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { Type(), platform::TracerEventType::Operator, 1); auto op_name = platform::OpName(outputs_, Type()); platform::RecordEvent op_name_record_event( - op_name, platform::TracerEventType::Operator, + op_name, + platform::TracerEventType::Operator, FLAGS_enable_host_event_recorder_hook ? 20 : 1, platform::EventRole::kUniqueOp); RunImpl(scope, place); @@ -293,9 +296,11 @@ bool OperatorBase::HasInputs(const std::string& name) const { std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE( - ins.size(), 1UL, + ins.size(), + 1UL, platform::errors::InvalidArgument( - "Operator %s's input %s should contain only one variable.", type_, + "Operator %s's input %s should contain only one variable.", + type_, name)); return ins.empty() ? kEmptyVarName : ins[0]; } @@ -304,9 +309,10 @@ const std::vector& OperatorBase::Inputs( const std::string& name) const { auto it = inputs_.find(name); PADDLE_ENFORCE_NE( - it, inputs_.end(), - platform::errors::NotFound("Operator %s does not have the input %s.", - type_, name)); + it, + inputs_.end(), + platform::errors::NotFound( + "Operator %s does not have the input %s.", type_, name)); return it->second; } @@ -321,9 +327,11 @@ bool OperatorBase::HasOutputs(const std::string& name) const { std::string OperatorBase::Output(const std::string& name) const { auto& outs = Outputs(name); PADDLE_ENFORCE_LE( - outs.size(), 1UL, + outs.size(), + 1UL, platform::errors::InvalidArgument( - "Operator %s's output %s should contain only one variable.", type_, + "Operator %s's output %s should contain only one variable.", + type_, name)); return outs.empty() ? 
kEmptyVarName : outs[0]; } @@ -332,7 +340,8 @@ const std::vector& OperatorBase::Outputs( const std::string& name) const { auto it = outputs_.find(name); PADDLE_ENFORCE_NE( - it, outputs_.end(), + it, + outputs_.end(), platform::errors::NotFound( "Operator %s does not have an output called %s.", type_, name)); return it->second; @@ -480,18 +489,20 @@ void OperatorBase::CheckAllInputOutputSet() const { for (auto& in : info_->Proto().inputs()) { if (!in.dispensable() && !in.extra()) { PADDLE_ENFORCE_NE( - inputs_.find(in.name()), inputs_.end(), - platform::errors::NotFound("Operator %s's input (%s) is not set.", - Type(), in.name())); + inputs_.find(in.name()), + inputs_.end(), + platform::errors::NotFound( + "Operator %s's input (%s) is not set.", Type(), in.name())); } } for (auto& out : info_->Proto().outputs()) { if (!out.dispensable() && !out.extra()) { PADDLE_ENFORCE_NE( - outputs_.find(out.name()), outputs_.end(), - platform::errors::NotFound("Operator %s's output (%s) is not set.", - Type(), out.name())); + outputs_.find(out.name()), + outputs_.end(), + platform::errors::NotFound( + "Operator %s's output (%s) is not set.", Type(), out.name())); } } } @@ -564,10 +575,12 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const { if (it == ctx_.inputs.end()) return nullptr; PADDLE_ENFORCE_LE( - it->second.size(), 1UL, + it->second.size(), + 1UL, platform::errors::InvalidArgument( "Operator %s's input %s should contain only one variable.", - op_.Type(), name)); + op_.Type(), + name)); return it->second.empty() ? nullptr : it->second[0]; } @@ -576,10 +589,12 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const { if (it == ctx_.outputs.end()) return nullptr; PADDLE_ENFORCE_LE( - it->second.size(), 1UL, + it->second.size(), + 1UL, platform::errors::InvalidArgument( "Operator %s's output %s should contain only one variable.", - op_.Type(), name)); + op_.Type(), + name)); return it->second.empty() ? nullptr : it->second[0]; } @@ -594,10 +609,13 @@ const std::vector ExecutionContext::MultiInput( } std::vector res; res.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(res), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(res), [&](const Variable* var) -> const Tensor* { if (var == nullptr) return nullptr; - PADDLE_ENFORCE_EQ(var->IsType(), true, + PADDLE_ENFORCE_EQ(var->IsType(), + true, platform::errors::InvalidArgument( "Input variable should be LoDTensor, " "but the received type is %s.", @@ -617,7 +635,9 @@ std::vector ExecutionContext::MultiOutput( } std::vector res; res.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(res), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(res), [&](Variable* var) -> Tensor* { return var == nullptr ? 
nullptr : var->GetMutable(); @@ -675,7 +695,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const auto& in = it->second; if (in.size() == 0) return false; PADDLE_ENFORCE_EQ( - in.size(), 1UL, + in.size(), + 1UL, platform::errors::InvalidArgument( "Input %s should not contain more than one inputs.", name)); return in[0] != nullptr; @@ -693,7 +714,8 @@ class RuntimeInferShapeContext : public InferShapeContext { return false; } PADDLE_ENFORCE_EQ( - out.size(), 1UL, + out.size(), + 1UL, platform::errors::InvalidArgument( "Output %s should not contain more than one outputs.", name)); return out[0] != nullptr; @@ -750,11 +772,14 @@ class RuntimeInferShapeContext : public InferShapeContext { std::string GetInputNameByIdx(size_t idx) const override { auto& op_proto = paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; - PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(), + PADDLE_ENFORCE_LT(idx, + op_proto->inputs().size(), platform::errors::OutOfRange( "The index should be less than the size of inputs of " "operator %s, but got index is %d and size is %d", - op_.Type(), idx, op_proto->inputs().size())); + op_.Type(), + idx, + op_proto->inputs().size())); return op_proto->inputs()[idx].name(); } @@ -762,42 +787,55 @@ class RuntimeInferShapeContext : public InferShapeContext { auto& op_proto = paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; PADDLE_ENFORCE_LT( - idx, op_proto->outputs().size(), + idx, + op_proto->outputs().size(), platform::errors::OutOfRange( "The index should be less than the size of outputs of " "operator %s, but got index is %d and size is %d", - op_.Type(), idx, op_proto->outputs().size())); + op_.Type(), + idx, + op_proto->outputs().size())); return op_proto->outputs()[idx].name(); } - void ShareDim(const std::string& in, const std::string& out, size_t i = 0, + void ShareDim(const std::string& in, + const std::string& out, + size_t i = 0, size_t j = 0) override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); PADDLE_ENFORCE_NE( - in_it, ctx_.inputs.end(), + in_it, + ctx_.inputs.end(), platform::errors::NotFound("Input %s does not exist.", in)); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), + out_it, + ctx_.outputs.end(), platform::errors::NotFound("Output %s does not exist.", out)); - PADDLE_ENFORCE_LT(i, in_it->second.size(), + PADDLE_ENFORCE_LT(i, + in_it->second.size(), platform::errors::InvalidArgument( "The index of input dimension is out of range, " "excepted index less than %zu, but received %zu.", - in_it->second.size(), i)); - PADDLE_ENFORCE_LT(j, out_it->second.size(), + in_it->second.size(), + i)); + PADDLE_ENFORCE_LT(j, + out_it->second.size(), platform::errors::InvalidArgument( "The index of output dimension is out of range, " "excepted index less than %zu, but received %zu.", - out_it->second.size(), j)); + out_it->second.size(), + j)); Variable* in_var = in_it->second[i]; Variable* out_var = out_it->second[j]; PADDLE_ENFORCE_EQ( - in_var->Type(), out_var->Type(), + in_var->Type(), + out_var->Type(), platform::errors::InvalidArgument( - "The type of input (%s) and output (%s) are inconsistent.", in, + "The type of input (%s) and output (%s) are inconsistent.", + in, out)); if (in_var->IsType()) { @@ -821,19 +859,22 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::string& out) const override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); - PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(), + PADDLE_ENFORCE_NE(in_it, + ctx_.inputs.end(), 
platform::errors::NotFound( "Input [%s] found error in Op [%s]", in, op_.Type())); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), - platform::errors::NotFound("Output [%s] found error in Op [%s]", out, - op_.Type())); + out_it, + ctx_.outputs.end(), + platform::errors::NotFound( + "Output [%s] found error in Op [%s]", out, op_.Type())); auto& in_var_list = in_it->second; auto& out_var_list = out_it->second; PADDLE_ENFORCE_EQ( - in_var_list.size(), out_var_list.size(), + in_var_list.size(), + out_var_list.size(), platform::errors::PreconditionNotMet( "Op [%s]: Input var size should be equal with output var size", op_.Type())); @@ -848,10 +889,12 @@ class RuntimeInferShapeContext : public InferShapeContext { Variable* in_var = in_var_list[i]; if (!in_var->IsType()) return; Variable* out_var = out_var_list[i]; - PADDLE_ENFORCE_EQ(out_var->IsType(), true, + PADDLE_ENFORCE_EQ(out_var->IsType(), + true, platform::errors::PreconditionNotMet( "The %d-th output of Output(%s) must be LoDTensor.", - i, out_var_names[i])); + i, + out_var_names[i])); auto& in_tensor = in_var->Get(); auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); @@ -862,32 +905,41 @@ class RuntimeInferShapeContext : public InferShapeContext { } } - void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + void ShareLoD(const std::string& in, + const std::string& out, + size_t i = 0, size_t j = 0) const override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); PADDLE_ENFORCE_NE( - in_it, ctx_.inputs.end(), + in_it, + ctx_.inputs.end(), platform::errors::NotFound("Input %s does not exist.", in)); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), + out_it, + ctx_.outputs.end(), platform::errors::NotFound("Output %s does not exist.", out)); - PADDLE_ENFORCE_LT(i, in_it->second.size(), + PADDLE_ENFORCE_LT(i, + in_it->second.size(), platform::errors::InvalidArgument( "The index of input dimension is out of range, " "excepted index less than %zu, but received %zu.", - in_it->second.size(), i)); - PADDLE_ENFORCE_LT(j, out_it->second.size(), + in_it->second.size(), + i)); + PADDLE_ENFORCE_LT(j, + out_it->second.size(), platform::errors::InvalidArgument( "The index of output dimension is out of range, " "excepted index less than %zu, but received %zu.", - out_it->second.size(), j)); + out_it->second.size(), + j)); Variable* in_var = in_it->second.at(i); if (!in_var->IsType()) return; Variable* out_var = out_it->second.at(j); PADDLE_ENFORCE_EQ( - out_var->IsType(), true, + out_var->IsType(), + true, platform::errors::InvalidArgument( "The %zu-th output of Output(%s) must be LoDTensor.", j, out)); auto& in_tensor = in_var->Get(); @@ -922,7 +974,8 @@ class RuntimeInferShapeContext : public InferShapeContext { "set in the runtime kernel.")); } - void SetLoDLevel(const std::string& out, int32_t lod_level, + void SetLoDLevel(const std::string& out, + int32_t lod_level, size_t j = 0) const override { PADDLE_THROW(platform::errors::PreconditionNotMet( "SetLoDLevel is only used in compile time. 
The calculation of " @@ -965,10 +1018,12 @@ class RuntimeInferShapeContext : public InferShapeContext { DDim GetInputDim(const std::string& name) const override { const std::vector& vars = InputVars(name); PADDLE_ENFORCE_EQ( - vars.size(), 1UL, + vars.size(), + 1UL, platform::errors::InvalidArgument( "Input(%s) should hold one element, but now it holds %zu elements.", - name, vars.size())); + name, + vars.size())); return this->GetDim(vars[0]); } @@ -994,10 +1049,12 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetOutputDim(const std::string& name, const DDim& dim) override { auto& vars = OutputVars(name); PADDLE_ENFORCE_EQ( - vars.size(), 1UL, + vars.size(), + 1UL, platform::errors::InvalidArgument("Output(%s) should hold one element, " "but now it holds %zu elements.", - name, vars.size())); + name, + vars.size())); SetDim(vars[0], dim); } @@ -1034,7 +1091,9 @@ class RuntimeInferShapeContext : public InferShapeContext { std::vector GetDims(const std::vector& vars) const { std::vector ret; ret.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(ret), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(ret), [this](Variable* var) { return this->GetDim(var); }); return ret; } @@ -1060,12 +1119,14 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetDims(const std::vector& vars, const std::vector& dims) { size_t length = vars.size(); - PADDLE_ENFORCE_EQ(length, dims.size(), + PADDLE_ENFORCE_EQ(length, + dims.size(), platform::errors::InvalidArgument( "The number of input variables do not match the " "number of input dimensions, the number of variables " "is %zu, the number of dimensions is %zu.", - length, dims.size())); + length, + dims.size())); for (size_t i = 0; i < length; ++i) { if (vars[i] == nullptr) { continue; @@ -1084,9 +1145,12 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& vars) const { std::vector retv; retv.resize(vars.size()); - std::transform(vars.begin(), vars.end(), retv.begin(), + std::transform(vars.begin(), + vars.end(), + retv.begin(), std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType), - this, std::placeholders::_1)); + this, + std::placeholders::_1)); return retv; } @@ -1098,7 +1162,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& InputVars(const std::string& name) const { auto it = ctx_.inputs.find(name); PADDLE_ENFORCE_NE( - it, ctx_.inputs.end(), + it, + ctx_.inputs.end(), platform::errors::NotFound( "Operator (%s) does not have the input (%s).", op_.Type(), name)); return it->second; @@ -1107,7 +1172,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& OutputVars(const std::string& name) const { auto it = ctx_.outputs.find(name); PADDLE_ENFORCE_NE( - it, ctx_.outputs.end(), + it, + ctx_.outputs.end(), platform::errors::NotFound( "Operator (%s) does not have the outputs (%s).", op_.Type(), name)); return it->second; @@ -1143,20 +1209,23 @@ static void CheckTensorNANOrInf(const std::string& op_type, return; } PADDLE_ENFORCE_NE( - framework::TensorContainsInf(tensor), true, - platform::errors::Fatal("Operator %s output Tensor %s contains Inf.", - op_type, name)); + framework::TensorContainsInf(tensor), + true, + platform::errors::Fatal( + "Operator %s output Tensor %s contains Inf.", op_type, name)); PADDLE_ENFORCE_NE( - framework::TensorContainsNAN(tensor), true, - platform::errors::Fatal("Operator %s output Tensor %s contains NAN.", - op_type, name)); + 
framework::TensorContainsNAN(tensor), + true, + platform::errors::Fatal( + "Operator %s output Tensor %s contains NAN.", op_type, name)); } bool OperatorWithKernel::SupportGPU() const { auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( phi::TransToPhiKernelName(type_)); auto has_phi_kernel = - std::any_of(phi_kernels.begin(), phi_kernels.end(), + std::any_of(phi_kernels.begin(), + phi_kernels.end(), [](phi::KernelKeyMap::const_reference kern_pair) { return kern_pair.first.backend() == phi::Backend::GPU; }); @@ -1169,7 +1238,8 @@ bool OperatorWithKernel::SupportGPU() const { } else { auto& op_kernels = kernel_iter->second; return std::any_of( - op_kernels.begin(), op_kernels.end(), + op_kernels.begin(), + op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { return platform::is_gpu_place(kern_pair.first.place_); }); @@ -1181,7 +1251,8 @@ bool OperatorWithKernel::SupportNPU() const { auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( phi::TransToPhiKernelName(type_)); auto has_phi_kernel = - std::any_of(phi_kernels.begin(), phi_kernels.end(), + std::any_of(phi_kernels.begin(), + phi_kernels.end(), [](phi::KernelKeyMap::const_reference kern_pair) { return kern_pair.first.backend() == phi::Backend::NPU; }); @@ -1194,7 +1265,8 @@ bool OperatorWithKernel::SupportNPU() const { } else { auto& op_kernels = kernel_iter->second; return std::any_of( - op_kernels.begin(), op_kernels.end(), + op_kernels.begin(), + op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { return platform::is_npu_place(kern_pair.first.place_); }); @@ -1214,7 +1286,8 @@ bool OperatorWithKernel::SupportsMKLDNN( return false; } auto& op_kernels = op_kernel_iter->second; - return std::any_of(op_kernels.begin(), op_kernels.end(), + return std::any_of(op_kernels.begin(), + op_kernels.end(), [data_type](OpKernelMap::const_reference kern_pair) { return platform::is_cpu_place(kern_pair.first.place_) && kern_pair.first.library_type_ == @@ -1496,10 +1569,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, { platform::RecordEvent record_event("prepare_data", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); if (need_prepare_data_) { - transfer_scope = PrepareData(scope, *kernel_type_, - &transfered_inplace_vars, runtime_ctx); + transfer_scope = PrepareData( + scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx); } } // exec scope is the scope that kernel actually executed on. 
@@ -1509,9 +1583,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope, if (!all_kernels_must_compute_runtime_shape_) { platform::RecordEvent record_event("infer_shape", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx); this->Info().infer_shape_(&infer_shape_ctx); + record_event.End(); + platform::RecordOpInfoSupplement( + Type(), Attrs(), infer_shape_ctx, *runtime_ctx); } if (FLAGS_enable_unused_var_check) { @@ -1523,7 +1601,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, { platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); if (run_phi_kernel_) { phi::KernelContext pt_kernel_context; // Do data transform before building KernelContext @@ -1663,7 +1742,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), + kernels_iter, + all_op_kernels.end(), platform::errors::Unavailable( "There are no kernels which are registered in the %s operator.", type_)); @@ -1785,10 +1865,12 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator (%s) does not have kernel for %s.", type_, - KernelTypeToString(expected_kernel_key))); + PADDLE_ENFORCE_NE( + kernel_iter, + kernels.end(), + platform::errors::NotFound("Operator (%s) does not have kernel for %s.", + type_, + KernelTypeToString(expected_kernel_key))); std::lock_guard lock(cache_update_mutex_); if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) { @@ -1798,7 +1880,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { } void OperatorWithKernel::TransferInplaceVarsBack( - const Scope& scope, const std::vector& inplace_vars, + const Scope& scope, + const std::vector& inplace_vars, const Scope& transfer_scope) const { for (auto& var_name : inplace_vars) { VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; @@ -1809,8 +1892,9 @@ void OperatorWithKernel::TransferInplaceVarsBack( auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var); auto* var = transfer_scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument( - "The variable[%s] is nullptr.", var_name)); + PADDLE_ENFORCE_NOT_NULL(var, + platform::errors::InvalidArgument( + "The variable[%s] is nullptr.", var_name)); auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto original_dims = original_tensor->dims(); original_tensor->ShareDataWith(*transformed_tensor); @@ -1890,7 +1974,8 @@ void OperatorWithKernel::HandleComplexGradToRealGrad( } Scope* OperatorWithKernel::PrepareData( - const Scope& scope, const OpKernelType& expected_kernel_key, + const Scope& scope, + const OpKernelType& expected_kernel_key, std::vector* transfered_inplace_vars, RuntimeContext* ctx) const { Scope* new_scope = nullptr; @@ -1947,8 +2032,8 @@ Scope* OperatorWithKernel::PrepareData( input_vars[i] = trans_var; auto out = trans_var->GetMutable(); out->Resize(tensor_in->dims()); - platform::MatchShapeToLayout(out, tensor_in->layout(), - DataLayout::kNHWC); + platform::MatchShapeToLayout( + out, 
tensor_in->layout(), DataLayout::kNHWC); VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , " "but kNHWC layout" << var_name_item.first << " in Operator " << type_; @@ -1995,8 +2080,8 @@ Scope* OperatorWithKernel::PrepareData( if (!run_by_executor_ && (platform::is_gpu_place(kernel_type_for_var.place_) || platform::is_gpu_place(expected_kernel_key.place_))) { - new_scope = TryCreateTransferScope(kernel_type_for_var, - expected_kernel_key, &scope); + new_scope = TryCreateTransferScope( + kernel_type_for_var, expected_kernel_key, &scope); enable_cache_transfer_scope_ = true; } if (!new_scope) { @@ -2058,7 +2143,8 @@ Scope* OperatorWithKernel::PrepareData( } void OperatorWithKernel::ParseInputDataType( - const Variable* var, const std::string& name, + const Variable* var, + const std::string& name, proto::VarType::Type* data_type) const { if (var != nullptr) { const Tensor* t = nullptr; @@ -2078,17 +2164,20 @@ void OperatorWithKernel::ParseInputDataType( } if (t != nullptr) { PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " "contains uninitialized Tensor.", - Type(), name)); + Type(), + name)); *data_type = paddle::framework::TransToProtoVarType(t->dtype()); } } } void OperatorWithKernel::ParseMultiInputDataType( - const std::vector& vars, const std::string& name, + const std::vector& vars, + const std::string& name, proto::VarType::Type* data_type) const { proto::VarType::Type default_data_type = static_cast(-1); @@ -2112,10 +2201,12 @@ void OperatorWithKernel::ParseMultiInputDataType( } if (t != nullptr) { PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " "contains uninitialized Tensor.", - Type(), name)); + Type(), + name)); proto::VarType::Type tmp = paddle::framework::TransToProtoVarType(t->dtype()); PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type, @@ -2125,7 +2216,9 @@ void OperatorWithKernel::ParseMultiInputDataType( "consistent or reigster GetExpectedKernelType. 
The " "current variable type is (%s), but the " "previous variable type is (%s).", - Type(), name, DataTypeToString(tmp), + Type(), + name, + DataTypeToString(tmp), DataTypeToString(*data_type))); *data_type = tmp; } @@ -2146,7 +2239,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( } } PADDLE_ENFORCE_NE( - data_type, dafault_data_type, + data_type, + dafault_data_type, platform::errors::NotFound( "DataType should be indicated by input Variable at %s.", Type())); return data_type; @@ -2163,12 +2257,14 @@ proto::VarType::Type OperatorWithKernel::IndicateVarDataType( ParseMultiInputDataType(ctx.MultiInputVar(name), name, &data_type); } PADDLE_ENFORCE_NE( - data_type, dafault_data_type, + data_type, + dafault_data_type, platform::errors::InvalidArgument( "The Input Variable(%s) of (%s) Operator used to determine kernel " "data type is empty or not LoDTensor or SelectedRows or " "LoDTensorArray.", - name, Type())); + name, + Type())); return data_type; } @@ -2200,11 +2296,14 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( t, platform::errors::InvalidArgument( "The Tensor of variable %s is nullptr when promote complex types.")); - PADDLE_ENFORCE_EQ(t->IsInitialized(), true, + PADDLE_ENFORCE_EQ(t->IsInitialized(), + true, platform::errors::InvalidArgument( "The Tensor in the %s Op's Input Variable %s(%s) is " "not initialized.", - Type(), name, ctx.InputName(name))); + Type(), + name, + ctx.InputName(name))); return t; } @@ -2216,7 +2315,8 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( * the kernel data type. */ proto::VarType::Type OperatorWithKernel::IndicateOrPromoteVarDataTypes( - const ExecutionContext& ctx, const std::string& name1, + const ExecutionContext& ctx, + const std::string& name1, const std::string& name2) const { // 1. 
Get tensor auto* tensor_a = GetTensorFormInputSafely(ctx, name1); @@ -2238,10 +2338,11 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType( } OpKernelType OperatorWithKernel::GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, + const Tensor& tensor, const OpKernelType& expected_kernel_type) const { - return OpKernelType(expected_kernel_type.data_type_, tensor.place(), - tensor.layout()); + return OpKernelType( + expected_kernel_type.data_type_, tensor.place(), tensor.layout()); } phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( @@ -2264,16 +2365,19 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( } Scope* OperatorWithKernel::PreparePhiData( - const Scope& scope, const phi::Kernel& pt_kernel, + const Scope& scope, + const phi::Kernel& pt_kernel, const phi::KernelSignature& pt_kernel_signature, RuntimeContext* ctx) const { const auto& input_names = pt_kernel_signature.input_names; auto input_defs = pt_kernel.args_def().input_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(), + PADDLE_ENFORCE_EQ(input_names.size(), + input_defs.size(), platform::errors::InvalidArgument( "The size of inputs_args names (%d) must be equal to " "the size of kernel input_defs (%d).", - input_names.size(), input_defs.size())); + input_names.size(), + input_defs.size())); Scope* new_scope = nullptr; auto& name_map = Inputs(); const std::unordered_set* no_buffer_ins = nullptr; @@ -2362,7 +2466,8 @@ Scope* OperatorWithKernel::PreparePhiData( } void OperatorWithKernel::BuildPhiKernelContext( - const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, + const RuntimeContext& ctx, + platform::DeviceContext* dev_ctx, phi::KernelContext* pt_kernel_context) const { pt_kernel_context->SetDeviceContext(dev_ctx); @@ -2374,23 +2479,29 @@ void OperatorWithKernel::BuildPhiKernelContext( auto attr_defs = pt_kernel_->args_def().attribute_defs(); auto output_defs = pt_kernel_->args_def().output_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(), + PADDLE_ENFORCE_EQ(input_names.size(), + input_defs.size(), platform::errors::InvalidArgument( "The size of inputs_args names (%d) must be equal to " "the size of kernel input_defs (%d).", - input_names.size(), input_defs.size())); + input_names.size(), + input_defs.size())); - PADDLE_ENFORCE_EQ(output_names.size(), output_defs.size(), + PADDLE_ENFORCE_EQ(output_names.size(), + output_defs.size(), platform::errors::InvalidArgument( "The size of outputs_args names (%d) must be equal to " "the size of kernel output_defs (%d).", - output_names.size(), output_defs.size())); + output_names.size(), + output_defs.size())); - PADDLE_ENFORCE_EQ(attr_names.size(), attr_defs.size(), + PADDLE_ENFORCE_EQ(attr_names.size(), + attr_defs.size(), platform::errors::InvalidArgument( "The size of attribute_args names (%d) must be equal " "to the size of kernel attribute_defs (%d).", - attr_names.size(), attr_defs.size())); + attr_names.size(), + attr_defs.size())); for (size_t i = 0; i < input_names.size(); ++i) { auto it = ctx.inputs.find(input_names[i]); @@ -2572,7 +2683,8 @@ void OperatorWithKernel::BuildPhiKernelContext( break; case phi::AttributeType::SCALARS: { PADDLE_ENFORCE_NE( - attr_iter, Attrs().end(), + attr_iter, + Attrs().end(), platform::errors::NotFound("(%s) is not found in AttributeMap when " "buildind static KernelContext.", attr_names[i])); @@ -2636,7 +2748,8 @@ void OperatorWithKernel::BuildPhiKernelContext( } break; default: { PADDLE_ENFORCE_NE( - attr_iter, 
Attrs().end(), + attr_iter, + Attrs().end(), platform::errors::NotFound("(%s) is not found in AttributeMap when " "buildind static KernelContext.", attr_names[i])); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index e1b14c4bae..46a46b04b3 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -1,7 +1,7 @@ cc_library( allocator SRCS allocator.cc - DEPS place stats) + DEPS place stats profiler) cc_library( cpu_allocator SRCS cpu_allocator.cc @@ -21,7 +21,7 @@ cc_library( cc_library( naive_best_fit_allocator SRCS naive_best_fit_allocator.cc - DEPS allocator buddy_allocator profiler) + DEPS allocator buddy_allocator) cc_test( naive_best_fit_allocator_test SRCS naive_best_fit_allocator_test.cc diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 7cc95de831..9d5f048a16 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -32,7 +32,8 @@ #endif PADDLE_DEFINE_EXPORTED_bool( - init_allocated_mem, false, + init_allocated_mem, + false, "It is a mistake that the values of the memory allocated by " "BuddyAllocator are always zeroed in some op's implementation. " "To find this error in time, we use init_allocated_mem to indicate " @@ -77,7 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() { std::call_once(init_flag, []() { a = new detail::BuddyAllocator( std::unique_ptr(new detail::CPUAllocator), - platform::CpuMinChunkSize(), platform::CpuMaxChunkSize()); + platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); }); return a; @@ -95,7 +97,8 @@ void *Alloc(const platform::CPUPlace &place, size_t size) { } template <> -void Free(const platform::CPUPlace &place, void *p, +void Free(const platform::CPUPlace &place, + void *p, size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); @@ -125,7 +128,8 @@ void *Alloc(const platform::IPUPlace &place, size_t size) { return p; } template <> -void Free(const platform::IPUPlace &place, void *p, +void Free(const platform::IPUPlace &place, + void *p, size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); @@ -154,7 +158,8 @@ void *Alloc(const platform::XPUPlace &place, size_t size) { ret = xpu_malloc(reinterpret_cast(&p), size); } PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, + ret, + XPU_SUCCESS, platform::errors::External( "XPU API return wrong value[%d], no enough memory", ret)); if (FLAGS_init_allocated_mem) { @@ -171,7 +176,8 @@ void *Alloc(const platform::XPUPlace &place, size_t size) { } template <> -void Free(const platform::XPUPlace &place, void *p, +void Free(const platform::XPUPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_XPU VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); @@ -234,11 +240,13 @@ class NPUBuddyAllocatorList { BuddyAllocator *Get(int npu_id) { auto pos = std::distance( devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id)); - PADDLE_ENFORCE_LT(pos, devices_.size(), + PADDLE_ENFORCE_LT(pos, + devices_.size(), platform::errors::OutOfRange( "The index exceeds the size of devices, the size of " "devices is %d, the index is %d", - devices_.size(), pos)); + devices_.size(), + pos)); std::call_once(*init_flags_[pos], [this, pos] { platform::SetNPUDeviceId(devices_[pos]); @@ -246,7 +254,8 @@ 
class NPUBuddyAllocatorList { new BuddyAllocator(std::unique_ptr( new detail::NPUAllocator(devices_[pos])), platform::NPUMinChunkSize(), - platform::NPUMaxChunkSize(), EXTRA_PADDING_SIZE)); + platform::NPUMaxChunkSize(), + EXTRA_PADDING_SIZE)); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " << "'FLAGS_fraction_of_gpu_memory_to_use' " @@ -312,8 +321,10 @@ void *Alloc(const platform::NPUPlace &place, size_t size) { PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize " "%s, NpuMaxChunkSize %s, NPU memory used: %s.", - string::HumanReadableSize(size), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(Used(place)))); @@ -331,7 +342,8 @@ void *Alloc(const platform::NPUPlace &place, size_t size) { } template <> -void Free(const platform::NPUPlace &place, void *p, +void Free(const platform::NPUPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_ASCEND_CL VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); @@ -384,7 +396,8 @@ void *Alloc(const platform::NPUPinnedPlace &place, template <> void Free(const platform::NPUPinnedPlace &place, - void *p, size_t size) { + void *p, + size_t size) { #ifdef PADDLE_WITH_ASCEND_CL GetNPUPinnedBuddyAllocator()->Free(p); #else @@ -430,18 +443,21 @@ class GPUBuddyAllocatorList { BuddyAllocator *Get(int gpu_id) { auto pos = std::distance( devices_.begin(), std::find(devices_.begin(), devices_.end(), gpu_id)); - PADDLE_ENFORCE_LT(pos, devices_.size(), + PADDLE_ENFORCE_LT(pos, + devices_.size(), platform::errors::OutOfRange( "The index exceeds the size of devices, the size of " "devices is %d, the index is %d", - devices_.size(), pos)); + devices_.size(), + pos)); std::call_once(*init_flags_[pos], [this, pos] { platform::SetDeviceId(devices_[pos]); - allocators_[pos].reset(new BuddyAllocator( - std::unique_ptr( - new detail::GPUAllocator(devices_[pos])), - platform::GpuMinChunkSize(), platform::GpuMaxChunkSize())); + allocators_[pos].reset( + new BuddyAllocator(std::unique_ptr( + new detail::GPUAllocator(devices_[pos])), + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize())); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " << "'FLAGS_fraction_of_gpu_memory_to_use' " @@ -493,8 +509,10 @@ void *Alloc(const platform::CUDAPlace &place, PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize " "%s, GpuMaxChunkSize %s, GPU memory used: %s.", - string::HumanReadableSize(size), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(Used(place)))); @@ -515,7 +533,8 @@ void *Alloc(const platform::CUDAPlace &place, } template <> -void Free(const platform::CUDAPlace &place, void *p, +void Free(const platform::CUDAPlace &place, + void *p, size_t size) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) GetGPUBuddyAllocator(place.device)->Free(p); @@ -584,7 
+603,8 @@ void *Alloc(const platform::CUDAPinnedPlace &place, template <> void Free(const platform::CUDAPinnedPlace &place, - void *p, size_t size) { + void *p, + size_t size) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -630,18 +650,21 @@ class MLUBuddyAllocatorList { BuddyAllocator *Get(int mlu_id) { auto pos = std::distance( devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id)); - PADDLE_ENFORCE_LT(pos, devices_.size(), + PADDLE_ENFORCE_LT(pos, + devices_.size(), platform::errors::OutOfRange( "The index exceeds the size of devices, the size of " "devices is %d, the index is %d", - devices_.size(), pos)); + devices_.size(), + pos)); std::call_once(*init_flags_[pos], [this, pos] { platform::SetMLUDeviceId(devices_[pos]); - allocators_[pos].reset(new BuddyAllocator( - std::unique_ptr( - new detail::MLUAllocator(devices_[pos])), - platform::MLUMinChunkSize(), platform::MLUMaxChunkSize())); + allocators_[pos].reset( + new BuddyAllocator(std::unique_ptr( + new detail::MLUAllocator(devices_[pos])), + platform::MLUMinChunkSize(), + platform::MLUMaxChunkSize())); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " << "(mlu reuse gpu GFlags) " @@ -693,8 +716,10 @@ void *Alloc(const platform::MLUPlace &place, size_t size) { PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize " "%s, MLUMinChunkSize %s, MLU memory used: %s.", - string::HumanReadableSize(size), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(Used(place)))); @@ -711,7 +736,8 @@ void *Alloc(const platform::MLUPlace &place, size_t size) { } template <> -void Free(const platform::MLUPlace &place, void *p, +void Free(const platform::MLUPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_MLU VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); @@ -759,10 +785,12 @@ class BuddyAllocatorList { } BuddyAllocator *Get(int dev_id) { - PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(), + PADDLE_ENFORCE_NE(init_flags_.find(dev_id), + init_flags_.end(), platform::errors::OutOfRange( "Cannot find %s %d, please check visible devices.", - device_type_, dev_id)); + device_type_, + dev_id)); std::call_once(*init_flags_[dev_id], [this, dev_id] { phi::DeviceManager::SetDevice(device_type_, dev_id); @@ -773,7 +801,8 @@ class BuddyAllocatorList { new detail::CustomAllocator(device_type_, dev_id)), phi::DeviceManager::GetMinChunkSize(place), phi::DeviceManager::GetMaxChunkSize(place), - phi::DeviceManager::GetExtraPaddingSize(place), device_type_)); + phi::DeviceManager::GetExtraPaddingSize(place), + device_type_)); }); return allocators_[dev_id].get(); @@ -813,8 +842,11 @@ void *Alloc(const platform::CustomPlace &place, PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in %s:%d, avaliable %s, total %s, used " "%s. 
", - string::HumanReadableSize(size), place.GetDeviceType(), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.GetDeviceType(), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(total - avail))); } else { if (FLAGS_init_allocated_mem) { @@ -830,7 +862,8 @@ void *Alloc(const platform::CustomPlace &place, } template <> -void Free(const platform::CustomPlace &place, void *p, +void Free(const platform::CustomPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_CUSTOM_DEVICE VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); @@ -922,8 +955,6 @@ namespace allocation { phi::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size)); auto *tmp_alloc = new Allocation(ptr, size, place_); - platform::MemEvenRecorder::Instance().PushMemRecord( - static_cast(tmp_alloc), place_, size); return tmp_alloc; } @@ -931,8 +962,6 @@ void NaiveBestFitAllocator::FreeImpl(phi::Allocation *allocation) { paddle::platform::VisitPlace( allocation->place(), legacy::FreeVisitor(allocation->ptr(), allocation->size())); - platform::MemEvenRecorder::Instance().PopMemRecord( - static_cast(allocation), place_); delete allocation; } diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index ad11d81875..f1c0178faf 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/memory/allocation/pinned_allocator.h" #include "paddle/fluid/memory/stats.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" namespace paddle { namespace memory { namespace allocation { @@ -26,6 +27,10 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size()); + platform::RecordMemEvent(allocation->ptr(), + allocation->place(), + allocation->size(), + platform::TracerMemEventType::ReservedFree); delete allocation; } phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { @@ -36,6 +41,10 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); + platform::RecordMemEvent(ptr, + platform::CUDAPinnedPlace(), + size, + platform::TracerMemEventType::ReservedAllocate); return new Allocation(ptr, size, platform::CUDAPinnedPlace()); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/stat_allocator.h b/paddle/fluid/memory/allocation/stat_allocator.h index 8b54b96159..ef999dddf4 100644 --- a/paddle/fluid/memory/allocation/stat_allocator.h +++ b/paddle/fluid/memory/allocation/stat_allocator.h @@ -16,6 +16,7 @@ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/stats.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" namespace paddle { namespace memory { @@ -30,14 +31,18 @@ class StatAllocator : public Allocator { protected: void FreeImpl(phi::Allocation* allocation) override { - if (platform::is_cpu_place(allocation->place())) { - HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - -allocation->size()); + if (platform::is_cpu_place(allocation->place()) || + 
platform::is_cuda_pinned_place(allocation->place())) { + HOST_MEMORY_STAT_UPDATE( + Allocated, allocation->place().GetDeviceId(), -allocation->size()); } else { - DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - -allocation->size()); + DEVICE_MEMORY_STAT_UPDATE( + Allocated, allocation->place().GetDeviceId(), -allocation->size()); } - + platform::RecordMemEvent(allocation->ptr(), + allocation->place(), + allocation->size(), + platform::TracerMemEventType::Free); underlying_allocator_->Free(allocation); } @@ -48,12 +53,16 @@ class StatAllocator : public Allocator { const platform::Place& place = allocation->place(); if (platform::is_cpu_place(place) || platform::is_cuda_pinned_place(place)) { - HOST_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(), - allocation->size()); + HOST_MEMORY_STAT_UPDATE( + Allocated, place.GetDeviceId(), allocation->size()); } else { - DEVICE_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(), - allocation->size()); + DEVICE_MEMORY_STAT_UPDATE( + Allocated, place.GetDeviceId(), allocation->size()); } + platform::RecordMemEvent(allocation->ptr(), + allocation->place(), + allocation->size(), + platform::TracerMemEventType::Allocate); return allocation.release(); } diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 244445d59b..eb5c74e56d 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -41,6 +41,7 @@ limitations under the License. */ #endif #include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" DECLARE_bool(use_pinned_memory); DECLARE_double(fraction_of_gpu_memory_to_use); @@ -64,12 +65,14 @@ void* AlignedMalloc(size_t size) { #else int error = posix_memalign(&p, alignment, size); PADDLE_ENFORCE_EQ( - error, 0, + error, + 0, platform::errors::ResourceExhausted( "Fail to alloc memory of %ld size, error code is %d.", size, error)); #endif - PADDLE_ENFORCE_NOT_NULL(p, platform::errors::ResourceExhausted( - "Fail to alloc memory of %ld size.", size)); + PADDLE_ENFORCE_NOT_NULL(p, + platform::errors::ResourceExhausted( + "Fail to alloc memory of %ld size.", size)); return p; } @@ -95,7 +98,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) { } HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); - + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate); return p; } @@ -114,6 +118,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { #endif HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree); } bool CPUAllocator::UseGpu() const { return false; } @@ -146,7 +152,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "maximum GPU memory usage is limited to %d MB.\n" " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", - limit_size, limit_size); + limit_size, + limit_size); } PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -161,21 +168,29 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { "please set it to a higher value but less than 1.0.\n" " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", - gpu_id_, string::HumanReadableSize(size), gpu_id_, - string::HumanReadableSize(allocated), string::HumanReadableSize(avail), - gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + gpu_id_, + string::HumanReadableSize(size), + gpu_id_, + string::HumanReadableSize(allocated), + string::HumanReadableSize(avail), + gpu_id_, + FLAGS_fraction_of_gpu_memory_to_use, + err_msg)); } } void GPUAllocator::Free(void* p, size_t size, size_t index) { - PADDLE_ENFORCE_EQ(index, 0, + PADDLE_ENFORCE_EQ(index, + 0, platform::errors::InvalidArgument( "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(gpu_alloc_size_, size, + PADDLE_ENFORCE_GE(gpu_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, gpu_alloc_size_)); + size, + gpu_alloc_size_)); gpu_alloc_size_ -= size; platform::RecordedGpuFree(p, size, gpu_id_); @@ -213,6 +228,8 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { *index = 1; // PINNED memory cuda_pinnd_alloc_size_ += size; HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate); return p; } else { LOG(WARNING) << "cudaHostAlloc failed."; @@ -224,21 +241,25 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { gpuError_t err; - PADDLE_ENFORCE_EQ(index, 1, + PADDLE_ENFORCE_EQ(index, + 1, platform::errors::InvalidArgument( "The index should be 1, but got %d", index)); - PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size, + PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated cuda pinned memory (%d)", - size, cuda_pinnd_alloc_size_)); + size, + cuda_pinnd_alloc_size_)); cuda_pinnd_alloc_size_ -= size; #ifdef PADDLE_WITH_HIP err = hipHostFree(p); if (err != hipErrorDeinitialized) { PADDLE_ENFORCE_EQ( - err, hipSuccess, + err, + hipSuccess, platform::errors::Fatal( "hipFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } @@ -252,13 +273,16 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { // cudaFreeHost succeeds. if (err != cudaErrorCudartUnloading) { PADDLE_ENFORCE_EQ( - err, 0, + err, + 0, platform::errors::Fatal( "cudaFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } #endif HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree); } bool CUDAPinnedAllocator::UseGpu() const { return false; } @@ -289,7 +313,8 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) { "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "maximum GPU memory usage is limited to %d MB.\n" " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", - limit_size, limit_size); + limit_size, + limit_size); } PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -304,22 +329,29 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) { "please set it to a higher value but less than 1.0.\n" " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", - npu_id_, string::HumanReadableSize(size), npu_id_, - string::HumanReadableSize(avail), npu_id_, - FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + npu_id_, + string::HumanReadableSize(size), + npu_id_, + string::HumanReadableSize(avail), + npu_id_, + FLAGS_fraction_of_gpu_memory_to_use, + err_msg)); } } void NPUAllocator::Free(void* p, size_t size, size_t index) { VLOG(4) << "Free " << p << " size " << size; - PADDLE_ENFORCE_EQ(index, 0, + PADDLE_ENFORCE_EQ(index, + 0, platform::errors::InvalidArgument( "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(npu_alloc_size_, size, + PADDLE_ENFORCE_GE(npu_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, npu_alloc_size_)); + size, + npu_alloc_size_)); npu_alloc_size_ -= size; platform::RecordedNPUFree(p, size, npu_id_); @@ -358,21 +390,25 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) { void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) { aclError err; - PADDLE_ENFORCE_EQ(index, 1, + PADDLE_ENFORCE_EQ(index, + 1, platform::errors::InvalidArgument( "The index should be 1, but got %d", index)); - PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size, + PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated npu pinned memory (%d)", - size, npu_pinnd_alloc_size_)); + size, + npu_pinnd_alloc_size_)); npu_pinnd_alloc_size_ -= size; err = platform::NPUHostFree(p); if (err != ACL_ERROR_NONE) { PADDLE_ENFORCE_EQ( - err, 0, + err, + 0, platform::errors::Fatal( "NPUHostFree failed in NPUPinnedAllocator, error code is %d", err)); } @@ -407,7 +443,8 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) { "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "maximum MLU memory usage is limited to %d MB.\n" " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", - limit_size, limit_size); + limit_size, + limit_size); } PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -422,21 +459,29 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) { "please set it to a higher value but less than 1.0.\n" " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", - mlu_id_, string::HumanReadableSize(size), mlu_id_, - string::HumanReadableSize(allocated), string::HumanReadableSize(avail), - mlu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + mlu_id_, + string::HumanReadableSize(size), + mlu_id_, + string::HumanReadableSize(allocated), + string::HumanReadableSize(avail), + mlu_id_, + FLAGS_fraction_of_gpu_memory_to_use, + err_msg)); } } void MLUAllocator::Free(void* p, size_t size, size_t index) { - PADDLE_ENFORCE_EQ(index, 0, + PADDLE_ENFORCE_EQ(index, + 0, platform::errors::InvalidArgument( "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(mlu_alloc_size_, size, + PADDLE_ENFORCE_GE(mlu_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, mlu_alloc_size_)); + size, + mlu_alloc_size_)); mlu_alloc_size_ -= size; platform::RecordedMLUFree(p, size, mlu_id_); @@ -465,7 +510,9 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { "\n\nOut of memory error on %s %d. " "total memory is %s, used memory is %s, " "available memory is only %s.\n\n", - dev_type_, dev_id_, string::HumanReadableSize(total), + dev_type_, + dev_id_, + string::HumanReadableSize(total), string::HumanReadableSize(total - avail), string::HumanReadableSize(avail))); } @@ -474,14 +521,17 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { void CustomAllocator::Free(void* p, size_t size, size_t index) { VLOG(4) << "CustomAllocator::Free " << p << " size " << size; - PADDLE_ENFORCE_EQ(index, 0, + PADDLE_ENFORCE_EQ(index, + 0, platform::errors::InvalidArgument( "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(plug_alloc_size, size, + PADDLE_ENFORCE_GE(plug_alloc_size, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, plug_alloc_size)); + size, + plug_alloc_size)); plug_alloc_size -= size; auto place = platform::CustomPlace(dev_type_, dev_id_); auto device = phi::DeviceManager::GetDeviceWithPlace(place); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index c45180f600..f09cbfc3be 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/common/place.h" #ifdef PADDLE_WITH_XPU @@ -33,8 +33,12 @@ namespace memory { #ifdef PADDLE_WITH_CUSTOM_DEVICE template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::CustomPlace src_place, - const void* src, size_t num, void* stream) { + platform::CPUPlace dst_place, + void* dst, + platform::CustomPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); @@ -52,8 +56,12 @@ void Copy( template <> void Copy( - platform::CustomPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::CustomPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); @@ -70,8 +78,12 @@ void Copy( template <> void Copy( - platform::CustomPlace dst_place, void* dst, platform::CustomPlace src_place, - const void* src, size_t num, void* stream) { + platform::CustomPlace dst_place, + void* dst, + platform::CustomPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); @@ -102,9 +114,11 @@ void Copy( #endif // PADDLE_WITH_CUSTOM_DEVICE template <> -void Copy(platform::CPUPlace, void* dst, +void Copy(platform::CPUPlace, + void* dst, platform::CPUPlace, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); @@ -115,7 +129,8 @@ template <> void Copy(platform::IPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -123,7 +138,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::IPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -131,15 +147,18 @@ template <> void Copy(platform::IPUPlace dst_place, void* dst, platform::IPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } // NOTE: only for (CPUPlace and IPUPlace) -> (IPUPlace). template <> -void Copy(phi::IPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::IPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (src_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_src; @@ -152,8 +171,10 @@ void Copy(phi::IPUPlace dst_place, void* dst, // NOTE: only for (IPUPlace) -> (CPUPlace and IPUPlace). 
template <> -void Copy(phi::Place dst_place, void* dst, - phi::IPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::IPUPlace src_place, + const void* src, size_t num) { if (dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst; @@ -170,7 +191,8 @@ template <> void Copy(platform::XPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; return; @@ -182,7 +204,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::XPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; return; @@ -194,7 +217,8 @@ template <> void Copy(platform::XPUPlace dst_place, void* dst, platform::XPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; return; @@ -204,8 +228,10 @@ void Copy(platform::XPUPlace dst_place, // NOTE: only for (CPUPlace and XPUPlace) -> (XPUPlace). template <> -void Copy(phi::XPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::XPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (src_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_src; @@ -218,8 +244,10 @@ void Copy(phi::XPUPlace dst_place, void* dst, // NOTE: only for (XPUPlace) -> (CPUPlace and XPUPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::XPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::XPUPlace src_place, + const void* src, size_t num) { if (dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst; @@ -236,7 +264,8 @@ template <> void Copy(platform::NPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -248,7 +277,10 @@ void Copy(platform::NPUPlace dst_place, if (stream) { platform::RecordEvent record_event( "NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while sync operation @@ -267,7 +299,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -279,7 +312,10 @@ void Copy(platform::CPUPlace dst_place, if (stream) { platform::RecordEvent record_event( "NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -295,7 +331,8 @@ template <> void Copy(platform::NPUPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -307,7 +344,10 @@ void Copy(platform::NPUPlace dst_place, 
platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = @@ -329,7 +369,10 @@ void Copy(platform::NPUPlace dst_place, platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = @@ -346,8 +389,11 @@ void Copy(platform::NPUPlace dst_place, template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, - const void* src, size_t num) { + platform::CPUPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -356,8 +402,11 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -366,8 +415,11 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, - platform::NPUPinnedPlace src_place, const void* src, size_t num) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -376,8 +428,12 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::NPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); @@ -389,7 +445,10 @@ void Copy( platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -404,8 +463,12 @@ void Copy( template <> void Copy( - platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, - const void* src, size_t num, void* stream) { + platform::NPUPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); @@ -417,7 +480,10 @@ void Copy( platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while 
sync operation @@ -435,9 +501,12 @@ void Copy( // NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace. template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -504,52 +573,76 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace). template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + aclrtStream stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace) template <> -void Copy(phi::NPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::NPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::NPUPlace src_place, const void* src, - size_t num, aclrtStream stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::NPUPlace src_place, + const void* src, + size_t num, + aclrtStream stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace) template <> void Copy(phi::NPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num, + void* dst, + phi::Place src_place, + const void* src, + size_t num, aclrtStream stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::NPUPinnedPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, aclrtStream stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -557,16 +650,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace) -> (NPUPinnedPlace) template <> void Copy(phi::NPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num) { + void* dst, + phi::Place src_place, + const void* src, + size_t num) { 
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); } // NOTE: only for (NPUPinnedPlace) -> (CPUPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::NPUPinnedPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr); } #endif @@ -608,8 +705,12 @@ inline void SyncCUDAStream() { template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, void* stream) { + platform::CPUPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); @@ -619,10 +720,16 @@ void Copy( platform::RecordEvent record_event( "GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToHost, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToHost, reinterpret_cast(stream)); #endif } else { @@ -642,8 +749,12 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::CUDAPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); @@ -653,10 +764,16 @@ void Copy( platform::RecordEvent record_event( "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyHostToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyHostToDevice, reinterpret_cast(stream)); #endif } else { @@ -676,8 +793,12 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, void* stream) { + platform::CUDAPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -689,10 +810,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToDevice, reinterpret_cast(stream)); #endif } else { @@ -710,22 +837,29 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU", platform::TracerEventType::UserDefined, 1); - platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, reinterpret_cast(stream)); + platform::GpuMemcpyPeerAsync(dst, + dst_place.device, + src, + src_place.device, + num, + reinterpret_cast(stream)); } else { platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU", 
platform::TracerEventType::UserDefined, 1); - platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, - num); + platform::GpuMemcpyPeerSync( + dst, dst_place.device, src, src_place.device, num); } } } template <> void Copy( - platform::CPUPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + platform::CPUPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -734,8 +868,11 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CPUPlace src_place, const void* src, size_t num) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -744,8 +881,11 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -754,8 +894,12 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CUDAPlace src_place, const void* src, size_t num, void* stream) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -765,10 +909,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToHost, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToHost, reinterpret_cast(stream)); #endif } else { @@ -785,8 +935,11 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num, + platform::CUDAPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -798,10 +951,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyHostToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyHostToDevice, reinterpret_cast(stream)); #endif } else { @@ -818,9 +977,12 @@ void Copy( // NOTE: only for CPUPlace、CUDAPlace and CUDAPinnedPlace. 
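For context (not part of this patch): in the GPU overloads above, a non-null stream selects the GpuMemcpyAsync path, while a null stream falls back to a blocking copy; both paths are wrapped in a RecordEvent so they appear in the profiler timeline. A sketch with assumed buffer names:

    void AsyncUpload(void* gpu_dst, const void* pinned_src, size_t bytes,
                     gpuStream_t stream) {
      paddle::platform::CUDAPlace gpu(0);
      paddle::platform::CUDAPinnedPlace pinned;
      // Non-null stream -> GpuMemcpyAsync; nullptr blocks until the copy is done.
      paddle::memory::Copy(gpu, gpu_dst, pinned, pinned_src, bytes, stream);
    }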
template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -887,52 +1049,76 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CPUPlace). template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPlace) template <> -void Copy(phi::GPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::GPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (CUDAPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::GPUPlace src_place, const void* src, - size_t num, void* stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::GPUPlace src_place, + const void* src, + size_t num, + void* stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPinnedPlace) template <> void Copy(phi::GPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num, + void* dst, + phi::Place src_place, + const void* src, + size_t num, void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CUDAPinnedPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::GPUPinnedPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -940,16 +1126,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace) -> (CUDAPinnedPlace) template <> void Copy(phi::GPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num) { + void* dst, + phi::Place src_place, + const void* src, + size_t num) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); } // NOTE: only for (CUDAPinnedPlace) -> (CPUPlace) template <> -void Copy(phi::Place 
dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::GPUPinnedPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr); } #endif @@ -959,7 +1149,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::MLUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -970,8 +1161,8 @@ void Copy(platform::CPUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyD2HAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyD2HAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -988,7 +1179,8 @@ template <> void Copy(platform::MLUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -999,8 +1191,8 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyH2DAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyH2DAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -1017,7 +1209,8 @@ template <> void Copy(platform::MLUPlace dst_place, void* dst, platform::MLUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -1029,8 +1222,8 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyD2DAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyD2DAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -1050,25 +1243,32 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, reinterpret_cast(stream)); + platform::MLUMemcpyPeerAsync(dst, + dst_place.device, + src, + src_place.device, + num, + reinterpret_cast(stream)); } else { VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyPeerSync(dst, dst_place.device, src, src_place.device, - num); + platform::MLUMemcpyPeerSync( + dst, dst_place.device, src, src_place.device, num); } } } // NOTE: only for CPUPlace and MLUPlace. 
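Another illustrative sketch, not part of this patch: because each typed wrapper above forwards to the generic Place/Place overload, a caller holding only a runtime phi::Place on one side can still call Copy directly. The function and buffer names below are assumptions.

    void GatherToHost(void* host_dst, const void* dev_src, size_t bytes,
                      const phi::Place& src_place, void* stream) {
      // Deduces the CPUPlace <- Place specialization, which dispatches on
      // src_place.GetType() at runtime.
      paddle::memory::Copy(phi::CPUPlace(), host_dst, src_place, dev_src,
                           bytes, stream);
    }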
template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -1110,35 +1310,55 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace) template <> -void Copy(phi::MLUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::MLUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::MLUPlace src_place, const void* src, - size_t num, void* stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::MLUPlace src_place, + const void* src, + size_t num, + void* stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream. template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream. template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -1146,8 +1366,10 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: Only for CPUPlace, XPUPlace and PinnedPlace. template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -1224,16 +1446,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num); } // NOTE: Only for (CPUPlace and PinnedPlace) -> (CPUPlace). 
template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num); } @@ -1243,9 +1469,12 @@ void Copy(phi::CPUPlace dst_place, void* dst, !defined(PADDLE_WITH_MLU) template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT dst_place.GetType() == phi::AllocationType::CUSTOM) { platform::CPUPlace place_src; @@ -1265,17 +1494,23 @@ void Copy(phi::Place dst_place, void* dst, } template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } #endif diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index e01e2eb599..ffb3f7e6eb 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -354,7 +354,9 @@ if(WITH_GPU) enforce dynload_cuda new_profiler - stats) + stats + op_proto_maker + shape_inference) nv_library( device_memory_aligment SRCS device_memory_aligment.cc @@ -363,7 +365,14 @@ elseif(WITH_ROCM) hip_library( profiler SRCS profiler.cc profiler.cu - DEPS os_info device_tracer gpu_info enforce new_profiler stats) + DEPS os_info + device_tracer + gpu_info + enforce + new_profiler + stats + op_proto_maker + shape_inference) hip_library( device_memory_aligment SRCS device_memory_aligment.cc @@ -372,7 +381,13 @@ else() cc_library( profiler SRCS profiler.cc - DEPS os_info device_tracer enforce new_profiler stats) + DEPS os_info + device_tracer + enforce + new_profiler + stats + op_proto_maker + shape_inference) cc_library( device_memory_aligment SRCS device_memory_aligment.cc diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 6b302d2449..7cceb8ccec 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -29,6 +29,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/gpu/gpu_info.h" @@ -51,10 +52,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(enable_cublas_tensor_op_math); DECLARE_uint64(gpu_memory_limit_mb); -PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false, +PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, + false, "Whether to print the message of gpu memory usage " "at exit, mainly used for UT and CI."); -PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, true, +PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, + true, "Whether to print the message of gpu memory usage " "MB as a unit of measurement."); @@ -66,7 +69,10 @@ namespace platform { void GpuMemoryUsage(size_t *available, size_t *total) { size_t actual_available, actual_total; - RecordedGpuMemGetInfo(available, total, &actual_available, &actual_total, + RecordedGpuMemGetInfo(available, + total, + &actual_available, + &actual_total, platform::GetCurrentDeviceId()); } @@ -94,7 +100,8 @@ size_t GpuMaxAllocSize() { static size_t GpuAllocSize(bool realloc) { size_t available_to_alloc = GpuAvailableMemToAlloc(); PADDLE_ENFORCE_GT( - available_to_alloc, 0, + available_to_alloc, + 0, platform::errors::ResourceExhausted("Not enough available GPU memory.")); // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be // allocated by fraction @@ -105,7 +112,8 @@ static size_t GpuAllocSize(bool realloc) { ? flag_mb << 20 : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use); PADDLE_ENFORCE_GE( - available_to_alloc, alloc_bytes, + available_to_alloc, + alloc_bytes, platform::errors::ResourceExhausted("Not enough available GPU memory.")); VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) << " MiB, is it Re-alloc: " << realloc; @@ -192,13 +200,16 @@ class RecordedGpuMallocHelper { }); PADDLE_ENFORCE_GE( - dev_id, 0, + dev_id, + 0, platform::errors::OutOfRange( "Device id must be not less than 0, but got %d.", dev_id)); PADDLE_ENFORCE_LT( - dev_id, instances_.size(), + dev_id, + instances_.size(), platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.", - dev_id, instances_.size())); + dev_id, + instances_.size())); return instances_[dev_id].get(); } @@ -207,7 +218,8 @@ class RecordedGpuMallocHelper { * or cudaSuccess would be returned, and the cudaGetLastError() flag * would be clear. 
*/ - gpuError_t Malloc(void **ptr, size_t size, + gpuError_t Malloc(void **ptr, + size_t size, bool malloc_managed_memory = false) { LockGuardPtr lock(mtx_); if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { @@ -236,7 +248,10 @@ class RecordedGpuMallocHelper { cur_size_.fetch_add(size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size); - + platform::RecordMemEvent(ptr, + GPUPlace(dev_id_), + size, + platform::TracerMemEventType::ReservedAllocate); #ifdef PADDLE_WITH_TESTING gpu_ptrs.insert(*ptr); #endif @@ -275,6 +290,10 @@ class RecordedGpuMallocHelper { cur_size_.fetch_sub(size); STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); + platform::RecordMemEvent(ptr, + GPUPlace(dev_id_), + size, + platform::TracerMemEventType::ReservedFree); } else { platform::GpuGetLastError(); // clear the error flag when // cudaErrorCudartUnloading / @@ -300,7 +319,9 @@ class RecordedGpuMallocHelper { #endif } - bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, + bool GetMemInfo(size_t *avail, + size_t *total, + size_t *actual_avail, size_t *actual_total) { { CUDADeviceGuard guard(dev_id_); @@ -335,7 +356,8 @@ class RecordedGpuMallocHelper { #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10020 - CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size, + CUresult MemCreate(CUmemGenericAllocationHandle *handle, + size_t size, const CUmemAllocationProp *prop, unsigned long long flags) { // NOLINT auto result = @@ -371,7 +393,9 @@ class RecordedGpuMallocHelper { std::once_flag RecordedGpuMallocHelper::once_flag_; -gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id, +gpuError_t RecordedGpuMalloc(void **ptr, + size_t size, + int dev_id, bool malloc_managed_memory) { return RecordedGpuMallocHelper::Instance(dev_id)->Malloc( ptr, size, malloc_managed_memory); @@ -383,22 +407,28 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) { #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10020 -CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, +CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, + size_t size, const CUmemAllocationProp *prop, - unsigned long long flags, int dev_id) { // NOLINT - return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(handle, size, - prop, flags); + unsigned long long flags, + int dev_id) { // NOLINT + return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate( + handle, size, prop, flags); } -CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size, +CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, + size_t size, int dev_id) { return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size); } #endif #endif -bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, - size_t *actual_total, int dev_id) { +bool RecordedGpuMemGetInfo(size_t *avail, + size_t *total, + size_t *actual_avail, + size_t *actual_total, + int dev_id) { return RecordedGpuMallocHelper::Instance(dev_id)->GetMemInfo( avail, total, actual_avail, actual_total); } @@ -493,26 +523,35 @@ void GpuDestroyStream(gpuStream_t stream) { void GpuDeviceSync() { phi::backends::gpu::GpuDeviceSync(); } -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - gpuMemcpyKind kind, gpuStream_t stream) { +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t 
stream) { phi::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream); } -void GpuMemcpySync(void *dst, const void *src, size_t count, +void GpuMemcpySync(void *dst, + const void *src, + size_t count, gpuMemcpyKind kind) { phi::backends::gpu::GpuMemcpySync(dst, src, count, kind); } -void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, - int src_device, size_t count, gpuStream_t stream) { - phi::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device, - count, stream); +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { + phi::backends::gpu::GpuMemcpyPeerAsync( + dst, dst_device, src, src_device, count, stream); } -void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, - int src_device, size_t count) { - phi::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device, - count); +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { + phi::backends::gpu::GpuMemcpyPeerSync( + dst, dst_device, src, src_device, count); } void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 47141bd73a..0369202284 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -30,12 +30,16 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/nvtx.h" #endif +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/os_info.h" -PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false, +PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, + false, "Enable rpc profiler or not."); -DEFINE_bool(enable_host_event_recorder_hook, false, +DEFINE_bool(enable_host_event_recorder_hook, + false, "enable HostEventRecorder, hook Profiler"); namespace paddle { @@ -43,8 +47,11 @@ namespace platform { MemEvenRecorder MemEvenRecorder::recorder; -Event::Event(EventType type, std::string name, uint32_t thread_id, - EventRole role, std::string attr) +Event::Event(EventType type, + std::string name, + uint32_t thread_id, + EventRole role, + std::string attr) : type_(type), name_(name), thread_id_(thread_id), @@ -68,8 +75,10 @@ double Event::CudaElapsedMs(const Event &e) const { #endif } -RecordEvent::RecordEvent(const char *name, const TracerEventType type, - uint32_t level, const EventRole role) { +RecordEvent::RecordEvent(const char *name, + const TracerEventType type, + uint32_t level, + const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -100,8 +109,10 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type, start_ns_ = PosixInNsec(); } -RecordEvent::RecordEvent(const std::string &name, const TracerEventType type, - uint32_t level, const EventRole role) { +RecordEvent::RecordEvent(const std::string &name, + const TracerEventType type, + uint32_t level, + const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -130,8 +141,10 @@ RecordEvent::RecordEvent(const std::string &name, const TracerEventType type, start_ns_ = PosixInNsec(); } -RecordEvent::RecordEvent(const std::string &name, const std::string &attr, - const TracerEventType type, uint32_t level, +RecordEvent::RecordEvent(const std::string &name, + const std::string &attr, + const TracerEventType type, + uint32_t level, const EventRole role) { #ifndef _WIN32 
#ifdef PADDLE_WITH_CUDA @@ -215,8 +228,8 @@ void RecordEvent::End() { DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { uint64_t end_ns = PosixInNsec(); - tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(), - g_thread_id); + tracer->AddCPURecords( + CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id); } ClearCurAnnotation(); PopEvent(*name_, role_); @@ -226,7 +239,8 @@ void RecordEvent::End() { is_enabled_ = false; } -RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type, +RecordInstantEvent::RecordInstantEvent(const char *name, + TracerEventType type, uint32_t level) { if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) { return; @@ -236,20 +250,206 @@ RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type, name, start_end_ns, start_end_ns, EventRole::kOrdinary, type); } -void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, +RecordOpInfoSupplement::RecordOpInfoSupplement( + const std::string &type, + const framework::AttributeMap &attrs, + const framework::InferShapeContext &shape_ctx, + const framework::RuntimeContext &ctx) { + if (FLAGS_enable_host_event_recorder_hook == false) { + return; + } + std::map> input_shapes; + std::map> dtypes; + for (auto it = ctx.inputs.begin(); it != ctx.inputs.end(); it++) { + input_shapes[it->first] = shape_ctx.GetInputsDim(it->first); + dtypes[it->first] = shape_ctx.GetInputsVarType(it->first); + } + + const std::vector *callstack_ptr = nullptr; + std::vector callstack; + auto iter = attrs.find( + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + if (iter != attrs.end()) { + callstack_ptr = &BOOST_GET_CONST(std::vector, iter->second); + callstack = *callstack_ptr; + } + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), type, input_shapes, dtypes, callstack); +} + +RecordMemEvent::RecordMemEvent(const void *ptr, + const phi::Place &place, + size_t size, + const TracerMemEventType type) { + if (g_state == ProfilerState::kDisabled && + FLAGS_enable_host_event_recorder_hook == false) { + return; + } + if (type == TracerMemEventType::Allocate) { + uint64_t current_allocated; + uint64_t peak_allocated; + uint64_t current_reserved = 0; // 0 means keep the same as before + uint64_t peak_reserved = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_allocated = + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } else { + current_allocated = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PushMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } else if (type == TracerMemEventType::ReservedAllocate) { + uint64_t current_reserved; + uint64_t peak_reserved; + uint64_t current_allocated = 0; // 0 means keep the same as before + uint64_t peak_allocated = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_reserved = + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } else { + current_reserved = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, 
place.GetDeviceId()); + peak_reserved = + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PushMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } else if (type == TracerMemEventType::Free) { + uint64_t current_allocated; + uint64_t peak_allocated; + uint64_t current_reserved = 0; // 0 means keep the same as before + uint64_t peak_reserved = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_allocated = + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } else { + current_allocated = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PopMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } else if (type == TracerMemEventType::ReservedFree) { + uint64_t current_reserved; + uint64_t peak_reserved; + uint64_t current_allocated = 0; // 0 means keep the same as before + uint64_t peak_allocated = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_reserved = + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } else { + current_reserved = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PopMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } +} + +void MemEvenRecorder::PushMemRecord(const void *ptr, + const Place &place, size_t size) { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled) { + return; + } std::lock_guard guard(mtx_); auto &events = address_memevent_[place]; - PADDLE_ENFORCE_EQ(events.count(ptr), 0, + PADDLE_ENFORCE_EQ(events.count(ptr), + 0, platform::errors::InvalidArgument( "The Place can't exist in the stage of PushMemRecord")); - events.emplace(ptr, std::unique_ptr( - new MemEvenRecorder::RecordMemEvent(place, size))); + events.emplace(ptr, + std::unique_ptr( + new MemEvenRecorder::RecordMemEvent(place, size))); +} + +void MemEvenRecorder::PushMemRecord(const void *ptr, + const Place &place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) { + std::lock_guard guard(mtx_); + if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), + reinterpret_cast(ptr), + type, + size, + place, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + return; + } + if (type == TracerMemEventType::ReservedAllocate) { + // old profiler only analyse memory managed by paddle. 
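+    // ReservedAllocate marks growth of the allocator's device pool (for
+    // example, the raw cudaMalloc now traced in gpu_info.cc); the legacy
+    // profiler has no notion of reserved memory, so these events are kept
+    // only by the HostEventRecorder path above.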
+ return; + } + if (g_state == ProfilerState::kDisabled) return; + auto &events = address_memevent_[place]; + PADDLE_ENFORCE_EQ(events.count(ptr), + 0, + platform::errors::InvalidArgument( + "The Place can't exist in the stage of PushMemRecord")); + events.emplace(ptr, + std::unique_ptr( + new MemEvenRecorder::RecordMemEvent(place, size))); } void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled) { + return; + } std::lock_guard guard(mtx_); auto &events = address_memevent_[place]; auto iter = events.find(ptr); @@ -259,6 +459,41 @@ void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) { } } +void MemEvenRecorder::PopMemRecord(const void *ptr, + const Place &place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) { + std::lock_guard guard(mtx_); + if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), + reinterpret_cast(ptr), + type, + -size, + place, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + return; + } + if (type == TracerMemEventType::ReservedFree) { + // old profiler only analyse memory managed by paddle. + return; + } + if (g_state == ProfilerState::kDisabled) return; + auto &events = address_memevent_[place]; + auto iter = events.find(ptr); + // The ptr maybe not in address_memevent + if (iter != events.end()) { + events.erase(iter); + } +} + void MemEvenRecorder::Flush() { std::lock_guard guard(mtx_); address_memevent_.clear(); @@ -279,8 +514,13 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() { auto annotation_free = CurAnnotationName(); if (tracer) { - tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_, - annotation_free, g_mem_thread_id); + tracer->AddMemInfoRecord(start_ns_, + end_ns_, + bytes_, + place_, + alloc_in_, + annotation_free, + g_mem_thread_id); } PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free); } @@ -307,22 +547,38 @@ RecordBlock::~RecordBlock() { if (tracer) { // We try to put all blocks at the same nested depth in the // same timeline lane. and distinguish the using thread_id. 
- tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(), - g_thread_id); + tracer->AddCPURecords( + name_, start_ns_, PosixInNsec(), BlockDepth(), g_thread_id); } ClearCurBlock(); } -void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place &place, const std::string &annotation) { - GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes, - place, g_mem_thread_id, annotation); -} - -void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place &place, const std::string &annotation) { - GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place, - g_mem_thread_id, annotation); +void PushMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place &place, + const std::string &annotation) { + GetMemEventList().Record(EventType::kPushRange, + start_ns, + end_ns, + bytes, + place, + g_mem_thread_id, + annotation); +} + +void PopMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place &place, + const std::string &annotation) { + GetMemEventList().Record(EventType::kPopRange, + start_ns, + end_ns, + bytes, + place, + g_mem_thread_id, + annotation); } void Mark(const std::string &name) { @@ -334,17 +590,19 @@ void Mark(const std::string &name) { GetEventList().Record(EventType::kMark, name, g_thread_id); } -Event *PushEvent(const std::string &name, const EventRole role, +Event *PushEvent(const std::string &name, + const EventRole role, std::string attr) { - return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role, - attr); + return GetEventList().Record( + EventType::kPushRange, name, g_thread_id, role, attr); } void PopEvent(const std::string &name, const EventRole role, std::string attr) { GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr); } void EnableProfiler(ProfilerState state) { - PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled, + PADDLE_ENFORCE_NE(state, + ProfilerState::kDisabled, platform::errors::InvalidArgument( "Can't enable profiling, since the input state is" "ProfilerState::kDisabled")); @@ -380,7 +638,8 @@ void ResetProfiler() { (*it)->Clear(); } for (auto it = g_all_mem_event_lists.begin(); - it != g_all_mem_event_lists.end(); ++it) { + it != g_all_mem_event_lists.end(); + ++it) { (*it)->Clear(); } } @@ -576,8 +835,8 @@ static void EmulateEventPushAndPop( std::string name = prefix_stk.empty() ? evt.name : prefix_stk.top() + "/" + evt.name; const char *attr = (evt.attr == nullptr ? "none" : evt.attr); - Event *orig_evt = cur_thr_list->Record(EventType::kPushRange, name, tid, - evt.role, attr); + Event *orig_evt = cur_thr_list->Record( + EventType::kPushRange, name, tid, evt.role, attr); (*out)[tid][evt.end_ns] = std::make_pair(orig_evt, evt.start_ns); cur_thr_list->Record(EventType::kPopRange, name, tid, evt.role, attr); } @@ -593,8 +852,8 @@ static void EmulateCPURecordsAdd( for (const auto &thr_sec : host_sec.thr_sections) { uint64_t tid = thr_sec.thread_id; for (const auto &evt : thr_sec.events) { - tracer->AddCPURecords(evt.name, evt.start_ns, evt.end_ns, BlockDepth(), - tid); + tracer->AddCPURecords( + evt.name, evt.start_ns, evt.end_ns, BlockDepth(), tid); } } } diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 78275341cb..4773b1a177 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -30,6 +30,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -102,6 +104,22 @@ struct MemEvenRecorder { public: void PushMemRecord(const void* ptr, const Place& place, size_t size); void PopMemRecord(const void* ptr, const Place& place); + void PushMemRecord(const void* ptr, + const Place& place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved); + void PopMemRecord(const void* ptr, + const Place& place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved); void Flush(); static MemEvenRecorder& Instance() { return recorder; } @@ -160,7 +178,8 @@ struct EventList { std::vector Reduce() { std::vector result; for (auto& block : event_blocks) { - result.insert(result.begin(), std::make_move_iterator(block.begin()), + result.insert(result.begin(), + std::make_move_iterator(block.begin()), std::make_move_iterator(block.end())); } event_blocks.clear(); @@ -173,13 +192,21 @@ struct EventList { }; void Mark(const std::string& name); -void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place& place, const std::string& annotation); -void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place& place, const std::string& annotation); -Event* PushEvent(const std::string& name, const EventRole role, +void PushMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place& place, + const std::string& annotation); +void PopMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place& place, + const std::string& annotation); +Event* PushEvent(const std::string& name, + const EventRole role, const std::string attr = "none"); -void PopEvent(const std::string& name, const EventRole role, +void PopEvent(const std::string& name, + const EventRole role, const std::string attr = "none"); // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. 
diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index ea3111b736..1daed7db1e 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1,7 +1,7 @@ cc_library( host_tracer SRCS host_tracer.cc - DEPS enforce) + DEPS enforce ddim var_type_traits) cc_library( cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc @@ -10,7 +10,7 @@ add_subdirectory(mlu) cc_library( event_node SRCS event_node.cc - DEPS enforce) + DEPS enforce place) cc_library( profiler_utils SRCS utils.cc diff --git a/paddle/fluid/platform/profiler/common_event.h b/paddle/fluid/platform/profiler/common_event.h index 8fe3b15052..3e166d1d04 100644 --- a/paddle/fluid/platform/profiler/common_event.h +++ b/paddle/fluid/platform/profiler/common_event.h @@ -18,16 +18,21 @@ #include #include +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later #include "paddle/fluid/platform/profiler/trace_event.h" +#include "paddle/phi/core/ddim.h" namespace paddle { namespace platform { struct CommonEvent { public: - CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type) + CommonEvent(const char *name, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type) : name(name), start_ns(start_ns), end_ns(end_ns), @@ -35,8 +40,12 @@ struct CommonEvent { type(type) {} CommonEvent(std::function arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type, const std::string &attr_str) + const std::string &name_str, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type, + const std::string &attr_str) : start_ns(start_ns), end_ns(end_ns), role(role), type(type) { auto buf = static_cast(arena_allocator(name_str.length() + 1)); strncpy(buf, name_str.c_str(), name_str.length() + 1); @@ -47,8 +56,11 @@ struct CommonEvent { } CommonEvent(std::function arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type) + const std::string &name_str, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type) : start_ns(start_ns), end_ns(end_ns), role(role), type(type) { auto buf = static_cast(arena_allocator(name_str.length() + 1)); strncpy(buf, name_str.c_str(), name_str.length() + 1); @@ -63,5 +75,61 @@ struct CommonEvent { const char *attr = nullptr; // not owned, designed for performance }; +struct CommonMemEvent { + public: + CommonMemEvent(uint64_t timestamp_ns, + uint64_t addr, + TracerMemEventType type, + int64_t increase_bytes, + const Place &place, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) + : timestamp_ns(timestamp_ns), + addr(addr), + type(type), + increase_bytes(increase_bytes), + place(place), + peak_allocated(peak_allocated), + peak_reserved(peak_reserved) {} + uint64_t timestamp_ns; + uint64_t addr; + TracerMemEventType type; + int64_t increase_bytes; + Place place; + uint64_t current_allocated; + uint64_t current_reserved; + uint64_t peak_allocated; + uint64_t peak_reserved; +}; + +struct OperatorSupplementOriginEvent { + public: + OperatorSupplementOriginEvent( + std::function arena_allocator, + uint64_t timestamp_ns, + const std::string &type_name, + const std::map> &input_shapes, + const std::map> + &dtypes, + const std::vector callstack) + : 
timestamp_ns(timestamp_ns), + input_shapes(input_shapes), + dtypes(dtypes), + callstack(callstack) { + auto buf = static_cast(arena_allocator(type_name.length() + 1)); + strncpy(buf, type_name.c_str(), type_name.length() + 1); + op_type = buf; + } + uint64_t timestamp_ns; + const char *op_type = nullptr; // not owned, designed for performance + // input shapes + std::map> input_shapes; + std::map> dtypes; + // call stack + const std::vector callstack; +}; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index bde1395c12..1c2c00d75b 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -11,9 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/platform/profiler/host_tracer.h" +#include + #include "glog/logging.h" #include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/common_event.h" @@ -21,7 +22,8 @@ // Used to filter events, works like glog VLOG(level). // RecordEvent will works if host_trace_level >= level. -PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1, +PADDLE_DEFINE_EXPORTED_int64(host_trace_level, + 1, "RecordEvent will works " "if host_trace_level >= level."); @@ -50,6 +52,79 @@ void ProcessHostEvents(const HostEventSection& host_events, } } +void ProcessHostMemEvents( + const HostEventSection& host_mem_events, + TraceEventCollector* collector) { + for (const auto& thr_sec : host_mem_events.thr_sections) { + uint64_t tid = thr_sec.thread_id; + if (thr_sec.thread_name != kDefaultThreadName) { + collector->AddThreadName(tid, thr_sec.thread_name); + } + for (const auto& evt : thr_sec.events) { + MemTraceEvent event; + event.timestamp_ns = evt.timestamp_ns; + event.addr = evt.addr; + event.type = evt.type; + event.increase_bytes = evt.increase_bytes; + event.place = evt.place.DebugString(); + event.current_allocated = evt.current_allocated; + event.current_reserved = evt.current_reserved; + event.peak_allocated = evt.peak_allocated; + event.peak_reserved = evt.peak_reserved; + event.process_id = host_mem_events.process_id; + event.thread_id = tid; + collector->AddMemEvent(std::move(event)); + } + } +} + +void ProcessOperatorSupplementEvents( + const HostEventSection& op_supplement_events, + TraceEventCollector* collector) { + for (const auto& thr_sec : op_supplement_events.thr_sections) { + uint64_t tid = thr_sec.thread_id; + if (thr_sec.thread_name != kDefaultThreadName) { + collector->AddThreadName(tid, thr_sec.thread_name); + } + for (const auto& evt : thr_sec.events) { + OperatorSupplementEvent event; + event.timestamp_ns = evt.timestamp_ns; + event.op_type = evt.op_type; + std::map>> input_shapes; + std::map> dtypes; + std::string callstack; + for (auto it = evt.input_shapes.begin(); it != evt.input_shapes.end(); + it++) { + for (auto idx = 0lu; idx < it->second.size(); idx++) { + input_shapes[it->first].push_back(std::vector()); + for (auto dim_idx = 0; dim_idx < it->second.at(idx).size(); + dim_idx++) { + input_shapes[it->first][idx].push_back( + it->second.at(idx).at(dim_idx)); + } + } + } + for (auto it = evt.dtypes.begin(); it != evt.dtypes.end(); it++) { + for (auto idx = 0lu; idx < it->second.size(); idx++) { + dtypes[it->first].push_back( + framework::proto::VarType::Type_Name(it->second.at(idx))); + } + } + + 
std::ostringstream result_string; + for (auto it = evt.callstack.begin(); it != evt.callstack.end(); it++) { + result_string << (*it) << std::endl; + } + event.input_shapes = input_shapes; + event.dtypes = dtypes; + event.callstack = result_string.str(); + event.process_id = op_supplement_events.process_id; + event.thread_id = tid; + collector->AddOperatorSupplementEvent(std::move(event)); + } + } +} + } // namespace void HostTracer::PrepareTracing() { @@ -60,16 +135,21 @@ void HostTracer::PrepareTracing() { void HostTracer::StartTracing() { PADDLE_ENFORCE_EQ( - state_ == TracerState::READY || state_ == TracerState::STOPED, true, + state_ == TracerState::READY || state_ == TracerState::STOPED, + true, platform::errors::PreconditionNotMet("TracerState must be READY")); HostEventRecorder::GetInstance().GatherEvents(); + HostEventRecorder::GetInstance().GatherEvents(); + HostEventRecorder::GetInstance() + .GatherEvents(); HostTraceLevel::GetInstance().SetLevel(options_.trace_level); state_ = TracerState::STARTED; } void HostTracer::StopTracing() { PADDLE_ENFORCE_EQ( - state_, TracerState::STARTED, + state_, + TracerState::STARTED, platform::errors::PreconditionNotMet("TracerState must be STARTED")); HostTraceLevel::GetInstance().SetLevel(HostTraceLevel::kDisabled); state_ = TracerState::STOPED; @@ -77,11 +157,19 @@ void HostTracer::StopTracing() { void HostTracer::CollectTraceData(TraceEventCollector* collector) { PADDLE_ENFORCE_EQ( - state_, TracerState::STOPED, + state_, + TracerState::STOPED, platform::errors::PreconditionNotMet("TracerState must be STOPED")); HostEventSection host_events = HostEventRecorder::GetInstance().GatherEvents(); ProcessHostEvents(host_events, collector); + HostEventSection host_mem_events = + HostEventRecorder::GetInstance().GatherEvents(); + ProcessHostMemEvents(host_mem_events, collector); + HostEventSection op_supplement_events = + HostEventRecorder::GetInstance() + .GatherEvents(); + ProcessOperatorSupplementEvents(op_supplement_events, collector); } } // namespace platform diff --git a/paddle/fluid/platform/profiler/mem_tracing.h b/paddle/fluid/platform/profiler/mem_tracing.h new file mode 100644 index 0000000000..3d3508c7bd --- /dev/null +++ b/paddle/fluid/platform/profiler/mem_tracing.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler/trace_event.h" + +namespace paddle { +namespace platform { +// Memory event tracing. A trace marks memory manipulation such as allocation +// and free. +// The events can be used to draw memory variation curve. +class RecordMemEvent { + public: + /** + * @param ptr: Pointer address allocated or free. + * @param place: Device for this memory event. + * @param size: Memory size allocated or free. + * @param type: Denote manipulation type for this memory event. 
+ */ + explicit RecordMemEvent( + const void* ptr, + const Place& place, + size_t size, + const TracerMemEventType type = TracerMemEventType::Allocate); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc index 1f1fbcb71e..db8895576b 100644 --- a/paddle/fluid/platform/profiler/profiler_test.cc +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -23,6 +23,8 @@ #ifdef PADDLE_WITH_HIP #include #endif +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/profiler.h" @@ -41,10 +43,10 @@ TEST(ProfilerTest, TestHostTracer) { profiler->Prepare(); profiler->Start(); { - RecordInstantEvent("TestTraceLevel_record1", TracerEventType::UserDefined, - 2); - RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined, - 3); + RecordInstantEvent( + "TestTraceLevel_record1", TracerEventType::UserDefined, 2); + RecordInstantEvent( + "TestTraceLevel_record2", TracerEventType::UserDefined, 3); } auto profiler_result = profiler->Stop(); auto nodetree = profiler_result->GetNodeTrees(); @@ -93,3 +95,49 @@ TEST(ProfilerTest, TestCudaTracer) { EXPECT_GT(runtime_events.size(), 0u); #endif } + +TEST(ProfilerTest, TestHostTracerForMem) { + using paddle::platform::CPUPlace; + using paddle::platform::EnableHostEventRecorder; + using paddle::platform::MemTraceEventNode; + using paddle::platform::Profiler; + using paddle::platform::ProfilerOptions; + using paddle::platform::ProfilerResult; + using paddle::platform::RecordEvent; + using paddle::platform::RecordInstantEvent; + using paddle::platform::RecordMemEvent; + using paddle::platform::TracerEventType; + using paddle::platform::TracerMemEventType; + ProfilerOptions options; + options.trace_level = 1; + options.trace_switch = 3; + auto profiler = Profiler::Create(options); + EXPECT_TRUE(profiler); + EnableHostEventRecorder(); + profiler->Prepare(); + profiler->Start(); + { + RecordEvent event1( + "TestTracerForMem_phase1", TracerEventType::UserDefined, 1); + RecordMemEvent(reinterpret_cast(0), + CPUPlace(), + 1024, + TracerMemEventType::Allocate); + RecordMemEvent( + reinterpret_cast(0), CPUPlace(), 1024, TracerMemEventType::Free); + } + { + RecordEvent event2( + "TestTracerForMem_phase2", TracerEventType::UserDefined, 1); + RecordMemEvent(reinterpret_cast(1024), + CPUPlace(), + 1024, + TracerMemEventType::Allocate); + RecordMemEvent(reinterpret_cast(1024), + CPUPlace(), + 1024, + TracerMemEventType::Free); + } + auto profiler_result = profiler->Stop(); + auto nodetree = profiler_result->GetNodeTrees(); +} diff --git a/paddle/fluid/platform/profiler/supplement_tracing.h b/paddle/fluid/platform/profiler/supplement_tracing.h new file mode 100644 index 0000000000..46b1616d71 --- /dev/null +++ b/paddle/fluid/platform/profiler/supplement_tracing.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/platform/profiler/trace_event.h" + +namespace paddle { + +namespace framework { +class RuntimeContext; +} +namespace platform { + +class RecordOpInfoSupplement { + public: + /** + * @param type: Operator type name. + * @param attrs: Attribute map of op. + * @param shape_ctx: Infershape context object. + * @param ctx: Runtime context object. + */ + explicit RecordOpInfoSupplement(const std::string& type, + const framework::AttributeMap& attrs, + const framework::InferShapeContext& shape_ctx, + const framework::RuntimeContext& ctx); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b81f494f1a..b24c3546a3 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -382,7 +382,8 @@ static T PyObjectCast(PyObject *obj) { } catch (py::cast_error &) { PADDLE_THROW(platform::errors::InvalidArgument( "Python object is not type of %s, the real type is %s", - typeid(T).name(), obj->ob_type->tp_name)); + typeid(T).name(), + obj->ob_type->tp_name)); } } @@ -441,7 +442,8 @@ static std::vector inline GetNameList( } static void inline CreateVariableIfNotExit( - const py::handle &py_handle, const framework::Scope &scope, + const py::handle &py_handle, + const framework::Scope &scope, const framework::Executor *exe = nullptr) { std::vector vec_res; @@ -479,8 +481,9 @@ static void inline CreateVariableIfNotExit( PyObject *py_var_desc = PyObject_GetAttrString(PyList_GET_ITEM(py_obj, i), kVarDescField); PADDLE_ENFORCE_NOT_NULL( - py_var_desc, platform::errors::InvalidArgument( - "The var_desc of parameter to set is None")); + py_var_desc, + platform::errors::InvalidArgument( + "The var_desc of parameter to set is None")); auto var_desc = PyObjectCast(py_var_desc); Py_DECREF(py_var_desc); var = const_cast(&scope)->Var(para_name); @@ -515,7 +518,8 @@ static void AssertStaticGraphAndDygraphGradMakerNoDiff() { } } } - PADDLE_ENFORCE_EQ(ops.empty(), true, + PADDLE_ENFORCE_EQ(ops.empty(), + true, platform::errors::Unimplemented( "OperatorWithKernel [%s] have only static graph grad " "maker or have only dygraph grad maker, which is not " @@ -537,8 +541,10 @@ static int GetNCCLVersion() { #endif template -static void TensorCopyFrom(framework::Tensor *dst, const framework::Tensor &src, - const PlaceType &place, int64_t batch_size) { +static void TensorCopyFrom(framework::Tensor *dst, + const framework::Tensor &src, + const PlaceType &place, + int64_t batch_size) { if (batch_size < 0) { framework::TensorCopy(src, place, dst); } else { @@ -624,9 +630,10 @@ PYBIND11_MODULE(core_noavx, m) { PyCapsule_GetPointer(dltensor->ptr(), "dltensor")); PADDLE_ENFORCE_NOT_NULL( - dmt, platform::errors::InvalidArgument( - "from_dlpack received an invalid capsule. " - "Note that a DLPack tensor can be consumed only once.")); + dmt, + platform::errors::InvalidArgument( + "from_dlpack received an invalid capsule. 
" + "Note that a DLPack tensor can be consumed only once.")); PyCapsule_SetName(dltensor->ptr(), "used_dltensor"); DLTensor dl = dmt->dl_tensor; @@ -644,7 +651,8 @@ PYBIND11_MODULE(core_noavx, m) { }); m.def("_create_loaded_parameter", - [](const py::handle &vec_var_list, const Scope &scope, + [](const py::handle &vec_var_list, + const Scope &scope, const Executor *executor) { CreateVariableIfNotExit(vec_var_list, scope, executor); }); @@ -682,11 +690,12 @@ PYBIND11_MODULE(core_noavx, m) { << ", sci_mode=" << print_opt.sci_mode; }); - m.def("broadcast_shape", [](const std::vector &x_dim, - const std::vector &y_dim) { - return phi::vectorize(operators::details::BroadcastTwoDims( - phi::make_ddim(x_dim), phi::make_ddim(y_dim), -1)); - }); + m.def( + "broadcast_shape", + [](const std::vector &x_dim, const std::vector &y_dim) { + return phi::vectorize(operators::details::BroadcastTwoDims( + phi::make_ddim(x_dim), phi::make_ddim(y_dim), -1)); + }); m.def( "_append_python_callable_object_and_return_id", @@ -808,14 +817,22 @@ PYBIND11_MODULE(core_noavx, m) { self.EmplaceBackOutput(std::move(CastPyArg2Tensor(obj, 1))); } }) - .def("add_attr", [](paddle::CustomOpKernelContext &self, - bool attr) { self.EmplaceBackAttr(attr); }) - .def("add_attr", [](paddle::CustomOpKernelContext &self, - int attr) { self.EmplaceBackAttr(attr); }) - .def("add_attr", [](paddle::CustomOpKernelContext &self, - float attr) { self.EmplaceBackAttr(attr); }) - .def("add_attr", [](paddle::CustomOpKernelContext &self, - int64_t attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, bool attr) { + self.EmplaceBackAttr(attr); + }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, int attr) { + self.EmplaceBackAttr(attr); + }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, float attr) { + self.EmplaceBackAttr(attr); + }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, int64_t attr) { + self.EmplaceBackAttr(attr); + }) .def("add_attr", [](paddle::CustomOpKernelContext &self, const std::string &attr) { self.EmplaceBackAttr(attr); @@ -829,13 +846,14 @@ PYBIND11_MODULE(core_noavx, m) { .def("add_attr", [](paddle::CustomOpKernelContext &self, const std::vector &attr) { self.EmplaceBackAttr(attr); }) - .def("add_attr", [](paddle::CustomOpKernelContext &self, - const std::vector &attr) { - self.EmplaceBackAttr(attr); - }); + .def("add_attr", + [](paddle::CustomOpKernelContext &self, + const std::vector &attr) { + self.EmplaceBackAttr(attr); + }); - py::class_ framework_tensor(m, "Tensor", - py::buffer_protocol()); + py::class_ framework_tensor( + m, "Tensor", py::buffer_protocol()); g_framework_tensor_pytype = reinterpret_cast(framework_tensor.ptr()); framework_tensor @@ -918,80 +936,135 @@ PYBIND11_MODULE(core_noavx, m) { self.mutable_data(place); }) .def("_mutable_data", - [](framework::Tensor &self, paddle::platform::CPUPlace &place, + [](framework::Tensor &self, + paddle::platform::CPUPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", - [](framework::Tensor &self, paddle::platform::CustomPlace &place, + [](framework::Tensor &self, + paddle::platform::CustomPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", - [](framework::Tensor &self, paddle::platform::XPUPlace &place, + [](framework::Tensor &self, + 
paddle::platform::XPUPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", - [](framework::Tensor &self, paddle::platform::CUDAPlace &place, + [](framework::Tensor &self, + paddle::platform::CUDAPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", - [](framework::Tensor &self, paddle::platform::CUDAPinnedPlace &place, + [](framework::Tensor &self, + paddle::platform::CUDAPinnedPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", - [](framework::Tensor &self, paddle::platform::MLUPlace &place, + [](framework::Tensor &self, + paddle::platform::MLUPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_clear", &framework::Tensor::clear) .def("_mutable_data", - [](framework::Tensor &self, paddle::platform::NPUPlace &place, + [](framework::Tensor &self, + paddle::platform::NPUPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false, + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + 
&TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false, R"DOC( Set the data of Tensor on place with given numpy array. @@ -1077,25 +1150,26 @@ PYBIND11_MODULE(core_noavx, m) { ostr << self; return ostr.str(); }) /* ------ End of original Tensor ------ */ - .def( - "__init__", - [](framework::Tensor &instance, const std::vector> - &recursive_sequence_lengths) { - LoD new_lod; - new_lod.reserve(recursive_sequence_lengths.size()); - std::copy(recursive_sequence_lengths.begin(), - recursive_sequence_lengths.end(), - std::back_inserter(new_lod)); - LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, -1), true, - platform::errors::InvalidArgument( - "The provided recursive_sequence_lengths info is " - "invalid, " - "the LoD converted by recursive_sequence_lengths is %s", - new_lod)); - new (&instance) framework::Tensor(new_offset_lod); - }) + .def("__init__", + [](framework::Tensor &instance, + const std::vector> + &recursive_sequence_lengths) { + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE_EQ( + CheckLoD(new_offset_lod, -1), + true, + platform::errors::InvalidArgument( + "The provided recursive_sequence_lengths info is " + "invalid, " + "the LoD converted by recursive_sequence_lengths is %s", + new_lod)); + new (&instance) framework::Tensor(new_offset_lod); + }) .def("__init__", [](framework::Tensor &instance) { new (&instance) framework::Tensor(); @@ -1115,12 +1189,14 @@ PYBIND11_MODULE(core_noavx, m) { new_lod.reserve(lod.size()); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); PADDLE_ENFORCE_EQ( - CheckLoD(new_lod, vectorize(self.dims()).front()), true, + CheckLoD(new_lod, vectorize(self.dims()).front()), + true, platform::errors::InvalidArgument( "The provided LoD is invalid, the LoD is %s", new_lod)); self.set_lod(new_lod); }, - py::arg("lod"), R"DOC( + py::arg("lod"), + R"DOC( Set LoD of the Tensor. 
Args: @@ -1142,8 +1218,9 @@ PYBIND11_MODULE(core_noavx, m) { )DOC") .def( "set_recursive_sequence_lengths", - [](framework::Tensor &self, const std::vector> - &recursive_sequence_lengths) { + [](framework::Tensor &self, + const std::vector> + &recursive_sequence_lengths) { // the input recursive_sequence_lengths is length-based // level-of-detail info LoD new_lod; @@ -1153,7 +1230,8 @@ PYBIND11_MODULE(core_noavx, m) { std::back_inserter(new_lod)); LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true, + CheckLoD(new_offset_lod, vectorize(self.dims()).front()), + true, platform::errors::InvalidArgument( "The provided recursive_sequence_lengths info is " "invalid, " @@ -1162,7 +1240,8 @@ PYBIND11_MODULE(core_noavx, m) { new_lod)); self.set_lod(new_offset_lod); }, - py::arg("recursive_sequence_lengths"), R"DOC( + py::arg("recursive_sequence_lengths"), + R"DOC( Set LoD of the Tensor according to recursive sequence lengths. For example, if recursive_sequence_lengths=[[2, 3]], which means @@ -1630,7 +1709,8 @@ PYBIND11_MODULE(core_noavx, m) { new (&instance) phi::SelectedRows(); }) .def("__init__", - [](phi::SelectedRows &instance, const std::vector rows, + [](phi::SelectedRows &instance, + const std::vector rows, const int64_t &height) { new (&instance) phi::SelectedRows(rows, height); }) @@ -1693,8 +1773,10 @@ All parameter, weight, gradient are variables in Paddle. [](Variable &self, Strings str_list) { *self.GetMutable() = str_list; }) - .def("set_vocab", [](Variable &self, - Vocab vocab) { *self.GetMutable() = vocab; }) + .def("set_vocab", + [](Variable &self, Vocab vocab) { + *self.GetMutable() = vocab; + }) .def( "get_string_tensor", [](Variable &self) { return self.GetMutable(); }, @@ -1732,7 +1814,8 @@ All parameter, weight, gradient are variables in Paddle. .def( "get_reader", [](Variable &self) -> framework::ReaderHolder * { - PADDLE_ENFORCE_EQ(self.IsType(), true, + PADDLE_ENFORCE_EQ(self.IsType(), + true, platform::errors::InvalidArgument( "The variable is not type of ReaderHolder.")); return self.GetMutable(); @@ -1743,7 +1826,8 @@ All parameter, weight, gradient are variables in Paddle. [](Variable &self) -> Scope * { auto scope_vec = self.GetMutable>(); PADDLE_ENFORCE_GT( - scope_vec->size(), 0, + scope_vec->size(), + 0, platform::errors::InvalidArgument( "The size of scope_vec should be greater than 0")); return scope_vec->front(); @@ -1801,7 +1885,9 @@ All parameter, weight, gradient are variables in Paddle. out (core.Variable): the found or created variable. )DOC", py::return_value_policy::reference) - .def("find_var", &Scope::FindVar, py::arg("name"), + .def("find_var", + &Scope::FindVar, + py::arg("name"), R"DOC( Find variable named :code:`name` in the current scope or its parent scope. Return None if not found. @@ -1814,7 +1900,9 @@ All parameter, weight, gradient are variables in Paddle. )DOC", py::return_value_policy::reference) .def("size", &Scope::Size) - .def("erase", &Scope::EraseVars, py::arg("names"), + .def("erase", + &Scope::EraseVars, + py::arg("names"), R"DOC( Find variable named :code:`name` in the current scope or its parent scope. Return None if not found. @@ -1827,7 +1915,8 @@ All parameter, weight, gradient are variables in Paddle. )DOC", py::return_value_policy::reference) .def( - "new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, + "new_scope", + [](Scope &self) -> Scope * { return &self.NewScope(); }, R"DOC( Create a new sub-scope of the current scope. 
@@ -1835,7 +1924,8 @@ All parameter, weight, gradient are variables in Paddle. out (core._Scope): the created sub-scope. )DOC", py::return_value_policy::reference) - .def("drop_kids", &Scope::DropKids, + .def("drop_kids", + &Scope::DropKids, R"DOC( Delete all sub-scopes of the current scope. )DOC") @@ -1865,7 +1955,8 @@ All parameter, weight, gradient are variables in Paddle. if (info.HasOpProtoAndChecker()) { std::string str; PADDLE_ENFORCE_EQ( - info.Proto().SerializeToString(&str), true, + info.Proto().SerializeToString(&str), + true, platform::errors::Fatal( "Serialize OpProto Error. This could be a bug of Paddle.")); ret_values.emplace_back(str); @@ -1886,22 +1977,24 @@ All parameter, weight, gradient are variables in Paddle. } return res; }); - m.def( - "get_grad_op_desc", [](const OpDesc &op_desc, - const std::unordered_set &no_grad_set, - const std::vector &grad_sub_block) { - std::unordered_map grad_to_var; - std::vector> grad_op_descs = - framework::OpInfoMap::Instance() - .Get(op_desc.Type()) - .GradOpMaker()(op_desc, no_grad_set, &grad_to_var, - grad_sub_block); - std::vector grad_op_desc_ptrs(grad_op_descs.size()); - std::transform(grad_op_descs.begin(), grad_op_descs.end(), - grad_op_desc_ptrs.begin(), - [](std::unique_ptr &p) { return p.release(); }); - return std::make_pair(grad_op_desc_ptrs, grad_to_var); - }); + m.def("get_grad_op_desc", + [](const OpDesc &op_desc, + const std::unordered_set &no_grad_set, + const std::vector &grad_sub_block) { + std::unordered_map grad_to_var; + std::vector> grad_op_descs = + framework::OpInfoMap::Instance() + .Get(op_desc.Type()) + .GradOpMaker()( + op_desc, no_grad_set, &grad_to_var, grad_sub_block); + std::vector grad_op_desc_ptrs(grad_op_descs.size()); + std::transform( + grad_op_descs.begin(), + grad_op_descs.end(), + grad_op_desc_ptrs.begin(), + [](std::unique_ptr &p) { return p.release(); }); + return std::make_pair(grad_op_desc_ptrs, grad_to_var); + }); m.def("has_grad_op_maker", [](const std::string op_type) { return framework::OpInfoMap::Instance().Get(op_type).HasGradOpMaker(); }); @@ -1914,7 +2007,8 @@ All parameter, weight, gradient are variables in Paddle. return framework::OpInfoMap::Instance().Get(op_type).HasInferInplace(); }); m.def("infer_no_need_buffer_slots", - [](const std::string op_type, const framework::VariableNameMap &inputs, + [](const std::string op_type, + const framework::VariableNameMap &inputs, const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) { auto infer_func = framework::OpInfoMap::Instance() @@ -1927,20 +2021,21 @@ All parameter, weight, gradient are variables in Paddle. 
return empty; } }); - m.def("prune", [](const ProgramDesc &origin, - const std::set &feeded_var_names, - const std::vector> &targets) { - ProgramDesc prog_with_targets(origin); - - for (const auto &t : targets) { - prog_with_targets.MutableBlock(t[0])->Op(t[1])->SetIsTarget(true); - } - proto::ProgramDesc pruned_desc; - auto pruned_origin_block_id_map = - Prune(*prog_with_targets.Proto(), feeded_var_names, &pruned_desc); - return std::make_tuple(ProgramDesc(pruned_desc), - pruned_origin_block_id_map); - }); + m.def("prune", + [](const ProgramDesc &origin, + const std::set &feeded_var_names, + const std::vector> &targets) { + ProgramDesc prog_with_targets(origin); + + for (const auto &t : targets) { + prog_with_targets.MutableBlock(t[0])->Op(t[1])->SetIsTarget(true); + } + proto::ProgramDesc pruned_desc; + auto pruned_origin_block_id_map = + Prune(*prog_with_targets.Proto(), feeded_var_names, &pruned_desc); + return std::make_tuple(ProgramDesc(pruned_desc), + pruned_origin_block_id_map); + }); m.def( "prune_backward", [](const framework::ProgramDesc &program) { @@ -2168,7 +2263,8 @@ All parameter, weight, gradient are variables in Paddle. #endif return devices; }); - py::class_ customplace(m, "CustomPlace", + py::class_ customplace(m, + "CustomPlace", R"DOC( CustomPlace is a descriptor of a device. It represents a custom device on which a tensor will be allocated and a model will run. @@ -2182,7 +2278,8 @@ All parameter, weight, gradient are variables in Paddle. g_customplace_pytype = reinterpret_cast(customplace.ptr()); customplace .def("__init__", - [](platform::CustomPlace &self, const std::string &device_type, + [](platform::CustomPlace &self, + const std::string &device_type, int dev_id) { #ifdef PADDLE_WITH_CUSTOM_DEVICE if (UNLIKELY(dev_id < 0)) { @@ -2190,7 +2287,8 @@ All parameter, weight, gradient are variables in Paddle. "Invalid CustomPlace(%s, %d), device id must be 0 " "or " "positive integer", - device_type, dev_id); + device_type, + dev_id); std::exit(-1); } @@ -2211,7 +2309,11 @@ All parameter, weight, gradient are variables in Paddle. "inside " "[0, %d), because %s " "number on your machine is %d", - device_type, dev_id, dev_count, device_type, dev_count); + device_type, + dev_id, + dev_count, + device_type, + dev_count); std::exit(-1); } } @@ -2221,7 +2323,8 @@ All parameter, weight, gradient are variables in Paddle. "Invalid CustomPlace(%s, %d), the device type is " "not registered " "as a custom device.", - device_type, dev_id); + device_type, + dev_id); std::exit(-1); } #else @@ -2293,7 +2396,8 @@ All parameter, weight, gradient are variables in Paddle. LOG(ERROR) << string::Sprintf( "Invalid CUDAPlace(%d), must inside [0, %d), because GPU " "number on your machine is %d", - dev_id, platform::GetGPUDeviceCount(), + dev_id, + platform::GetGPUDeviceCount(), platform::GetGPUDeviceCount()); std::exit(-1); } @@ -2359,7 +2463,8 @@ All parameter, weight, gradient are variables in Paddle. LOG(ERROR) << string::Sprintf( "Invalid XPUPlace(%d), must inside [0, %d), because XPU " "number on your machine is %d", - dev_id, platform::GetXPUDeviceCount(), + dev_id, + platform::GetXPUDeviceCount(), platform::GetXPUDeviceCount()); std::exit(-1); } @@ -2524,7 +2629,8 @@ All parameter, weight, gradient are variables in Paddle. 
LOG(ERROR) << string::Sprintf( "Invalid NPUPlace(%d), must inside [0, %d), because NPU " "number on your machine is %d", - dev_id, platform::GetNPUDeviceCount(), + dev_id, + platform::GetNPUDeviceCount(), platform::GetNPUDeviceCount()); std::exit(-1); } @@ -2640,7 +2746,8 @@ All parameter, weight, gradient are variables in Paddle. LOG(ERROR) << string::Sprintf( "Invalid MLUPlace(%d), must inside [0, %d), because MLU " "number on your machine is %d", - dev_id, platform::GetMLUDeviceCount(), + dev_id, + platform::GetMLUDeviceCount(), platform::GetMLUDeviceCount()); std::exit(-1); } @@ -2713,8 +2820,10 @@ All parameter, weight, gradient are variables in Paddle. .def("mlu_device_id", [](platform::Place &self) { return self.device; }) .def("custom_device_id", [](platform::Place &self) { return self.device; }) - .def("set_place", [](platform::Place &self, - const platform::Place &other) { self = other; }) + .def("set_place", + [](platform::Place &self, const platform::Place &other) { + self = other; + }) .def("set_place", [](platform::Place &self, const platform::CPUPlace &cpu_place) { self = cpu_place; @@ -2759,7 +2868,8 @@ All parameter, weight, gradient are variables in Paddle. true, platform::errors::InvalidArgument( "Cannot parse user input to OpDesc")); - PADDLE_ENFORCE_EQ(desc.IsInitialized(), true, + PADDLE_ENFORCE_EQ(desc.IsInitialized(), + true, platform::errors::InvalidArgument( "The provided OpDesc is not " "initialized, the reason is: %s", @@ -2767,43 +2877,50 @@ All parameter, weight, gradient are variables in Paddle. return OpRegistry::CreateOp(desc); }) .def("run", - [](OperatorBase &self, const Scope &scope, + [](OperatorBase &self, + const Scope &scope, const platform::CPUPlace &place) { pybind11::gil_scoped_release release; self.Run(scope, place); }) .def("run", - [](OperatorBase &self, const Scope &scope, + [](OperatorBase &self, + const Scope &scope, const platform::XPUPlace &place) { pybind11::gil_scoped_release release; self.Run(scope, place); }) .def("run", - [](OperatorBase &self, const Scope &scope, + [](OperatorBase &self, + const Scope &scope, const platform::NPUPlace &place) { pybind11::gil_scoped_release release; self.Run(scope, place); }) .def("run", - [](OperatorBase &self, const Scope &scope, + [](OperatorBase &self, + const Scope &scope, const platform::CUDAPlace &place) { pybind11::gil_scoped_release release; self.Run(scope, place); }) .def("run", - [](OperatorBase &self, const Scope &scope, + [](OperatorBase &self, + const Scope &scope, const platform::CUDAPinnedPlace &place) { pybind11::gil_scoped_release release; self.Run(scope, place); }) .def("run", - [](OperatorBase &self, const Scope &scope, + [](OperatorBase &self, + const Scope &scope, const platform::MLUPlace &place) { pybind11::gil_scoped_release release; self.Run(scope, place); }) .def("run", - [](OperatorBase &self, const Scope &scope, + [](OperatorBase &self, + const Scope &scope, const platform::CustomPlace &place) { pybind11::gil_scoped_release release; self.Run(scope, place); @@ -2843,13 +2960,17 @@ All parameter, weight, gradient are variables in Paddle. 
py::class_(m, "Executor") .def(py::init()) .def("close", &Executor::Close) - .def("run_from_dataset", &Executor::RunFromDataset, + .def("run_from_dataset", + &Executor::RunFromDataset, py::call_guard()) - .def("release_trainer", &Executor::ReleaseTrainer, + .def("release_trainer", + &Executor::ReleaseTrainer, py::call_guard()) .def("init_for_dataset", - [](Executor &self, const ProgramDesc &prog, - const std::string &trainer_desc, Scope *scope, + [](Executor &self, + const ProgramDesc &prog, + const std::string &trainer_desc, + Scope *scope, Dataset *dataset) -> std::shared_ptr { pybind11::gil_scoped_release release; return self.InitForDataset(prog, trainer_desc, scope, dataset); @@ -2860,42 +2981,64 @@ All parameter, weight, gradient are variables in Paddle. self.RunFromDataset(trainer); }) .def("run_prepared_ctx", - [](Executor &self, ExecutorPrepareContext *ctx, Scope *scope, + [](Executor &self, + ExecutorPrepareContext *ctx, + Scope *scope, std::map *feed_targets, std::map *fetch_targets, - bool create_local_scope = true, bool create_vars = true, + bool create_local_scope = true, + bool create_vars = true, const std::string &feed_holder_name = "feed", const std::string &fetch_holder_name = "fetch") { pybind11::gil_scoped_release release; - self.RunPreparedContext(ctx, scope, feed_targets, fetch_targets, - create_local_scope, create_vars, - feed_holder_name, fetch_holder_name); + self.RunPreparedContext(ctx, + scope, + feed_targets, + fetch_targets, + create_local_scope, + create_vars, + feed_holder_name, + fetch_holder_name); }) .def("run_prepared_ctx", - [](Executor &self, ExecutorPrepareContext *ctx, Scope *scope, - bool create_local_scope = true, bool create_vars = true, + [](Executor &self, + ExecutorPrepareContext *ctx, + Scope *scope, + bool create_local_scope = true, + bool create_vars = true, bool keep_kids = false) { pybind11::gil_scoped_release release; - self.RunPreparedContext(ctx, scope, create_local_scope, - create_vars, keep_kids); + self.RunPreparedContext( + ctx, scope, create_local_scope, create_vars, keep_kids); }) .def("prepare", - [](Executor &self, const ProgramDesc &program, int block_id, + [](Executor &self, + const ProgramDesc &program, + int block_id, const std::vector &skip_ref_cnt_vars = std::vector(), bool force_disable_gc = false) { pybind11::gil_scoped_release release; - return self.Prepare(program, block_id, skip_ref_cnt_vars, - force_disable_gc); + return self.Prepare( + program, block_id, skip_ref_cnt_vars, force_disable_gc); }) .def("create_variables", &Executor::CreateVariables) - .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, - int block_id, bool create_local_scope, bool create_vars, - const std::vector &fetch_vars) { - pybind11::gil_scoped_release release; - self.Run(prog, scope, block_id, create_local_scope, create_vars, - fetch_vars); - }); + .def("run", + [](Executor &self, + const ProgramDesc &prog, + Scope *scope, + int block_id, + bool create_local_scope, + bool create_vars, + const std::vector &fetch_vars) { + pybind11::gil_scoped_release release; + self.Run(prog, + scope, + block_id, + create_local_scope, + create_vars, + fetch_vars); + }); py::class_(m, "CostInfo") .def(py::init<>()) @@ -2906,8 +3049,10 @@ All parameter, weight, gradient are variables in Paddle. }); py::class_(m, "StandaloneExecutor") - .def(py::init()) + .def(py::init()) .def("run", [](StandaloneExecutor &self, const std::unordered_map &input_dict, @@ -2951,7 +3096,8 @@ All parameter, weight, gradient are variables in Paddle. 
return py::cast(std::move(ret)); }) .def("run", - [](StandaloneExecutor &self, std::vector feed_names, + [](StandaloneExecutor &self, + std::vector feed_names, std::vector fetch_names) { paddle::framework::FetchList ret; { @@ -3036,20 +3182,27 @@ All parameter, weight, gradient are variables in Paddle. m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); m.def( "run_cmd", - [](const std::string &cmd, int time_out = -1, + [](const std::string &cmd, + int time_out = -1, int sleep_inter = -1) -> const std::string { - return paddle::framework::shell_get_command_output(cmd, time_out, - sleep_inter); + return paddle::framework::shell_get_command_output( + cmd, time_out, sleep_inter); }, - py::arg("cmd"), py::arg("time_out") = -1, py::arg("sleep_inter") = -1); + py::arg("cmd"), + py::arg("time_out") = -1, + py::arg("sleep_inter") = -1); m.def( "shell_execute_cmd", - [](const std::string &cmd, int time_out = 0, int sleep_inter = 0, + [](const std::string &cmd, + int time_out = 0, + int sleep_inter = 0, bool redirect_stderr = false) -> std::vector { - return paddle::framework::shell_execute_cmd(cmd, time_out, sleep_inter, - redirect_stderr); + return paddle::framework::shell_execute_cmd( + cmd, time_out, sleep_inter, redirect_stderr); }, - py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0, + py::arg("cmd"), + py::arg("time_out") = 0, + py::arg("sleep_inter") = 0, py::arg("redirect_stderr") = false); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -3064,13 +3217,16 @@ All parameter, weight, gradient are variables in Paddle. #endif m.def("set_feed_variable", - static_cast(&framework::SetFeedVariable)); + static_cast( + &framework::SetFeedVariable)); m.def("set_feed_variable", - static_cast(&framework::SetFeedVariable)); + static_cast( + &framework::SetFeedVariable)); m.def("get_fetch_variable", - [](const Scope &scope, const std::string &var_name, + [](const Scope &scope, + const std::string &var_name, size_t index) -> py::object { auto &var = framework::GetFetchVariable(scope, var_name, index); if (data_is_lod_tensor(var)) { @@ -3125,7 +3281,8 @@ All parameter, weight, gradient are variables in Paddle. .def("__len__", [](LoDTensorArray &self) { return self.size(); }) .def("__setitem__", [](LoDTensorArray &self, size_t i, const LoDTensor &t) { - PADDLE_ENFORCE_LT(i, self.size(), + PADDLE_ENFORCE_LT(i, + self.size(), platform::errors::InvalidArgument( "The index to set is larger than the size " "of LoDTensorArray.")); @@ -3139,7 +3296,8 @@ All parameter, weight, gradient are variables in Paddle. self.back().ShareDataWith(t); self.back().set_lod(t.lod()); }, - py::arg("tensor"), R"DOC( + py::arg("tensor"), + R"DOC( Append a LoDensor to LoDTensorArray. Args: @@ -3376,18 +3534,20 @@ All parameter, weight, gradient are variables in Paddle. m.def("reset_profiler", platform::ResetProfiler); m.def("register_pass", [](const std::string &pass_type, py::object callable) { PADDLE_ENFORCE_EQ( - framework::ir::PassRegistry::Instance().Has(pass_type), false, + framework::ir::PassRegistry::Instance().Has(pass_type), + false, platform::errors::AlreadyExists("Pass '%s' is registered more than " "once. 
Please use another name.", pass_type)); callable.inc_ref(); - framework::ir::PassRegistry::Instance().Insert(pass_type, [pass_type, - callable]() { - py::gil_scoped_acquire guard; - std::unique_ptr pass( - new framework::ir::GeneratePass(py::cast(callable()))); - return pass; - }); + framework::ir::PassRegistry::Instance().Insert( + pass_type, [pass_type, callable]() { + py::gil_scoped_acquire guard; + std::unique_ptr pass( + new framework::ir::GeneratePass( + py::cast(callable()))); + return pass; + }); }); m.def("get_pass", [](const std::string &pass_type) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_type); @@ -3397,11 +3557,32 @@ All parameter, weight, gradient are variables in Paddle. m.def("size_of_dtype", framework::SizeOfType); py::class_(m, "_ProfilerResult") .def(py::init<>()) - .def("get_data", &paddle::platform::ProfilerResult::GetData, + .def("get_data", + &paddle::platform::ProfilerResult::GetData, py::return_value_policy::automatic_reference) .def("save", &paddle::platform::ProfilerResult::Save) .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo); + py::class_(m, "MemPythonNode") + .def(py::init<>()) + .def_readwrite("timestamp_ns", + &paddle::platform::MemPythonNode::timestamp_ns) + .def_readwrite("addr", &paddle::platform::MemPythonNode::addr) + .def_readwrite("type", &paddle::platform::MemPythonNode::type) + .def_readwrite("process_id", &paddle::platform::MemPythonNode::process_id) + .def_readwrite("thread_id", &paddle::platform::MemPythonNode::thread_id) + .def_readwrite("increase_bytes", + &paddle::platform::MemPythonNode::increase_bytes) + .def_readwrite("place", &paddle::platform::MemPythonNode::place) + .def_readwrite("current_allocated", + &paddle::platform::MemPythonNode::current_allocated) + .def_readwrite("current_reserved", + &paddle::platform::MemPythonNode::current_reserved) + .def_readwrite("peak_allocated", + &paddle::platform::MemPythonNode::peak_allocated) + .def_readwrite("peak_reserved", + &paddle::platform::MemPythonNode::peak_reserved); + py::class_(m, "DevicePythonNode") .def(py::init<>()) .def_readwrite("name", &paddle::platform::DevicePythonNode::name) @@ -3424,15 +3605,22 @@ All parameter, weight, gradient are variables in Paddle. .def_readwrite("process_id", &paddle::platform::HostPythonNode::process_id) .def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id) + .def_readwrite("input_shapes", + &paddle::platform::HostPythonNode::input_shapes) + .def_readwrite("dtypes", &paddle::platform::HostPythonNode::dtypes) + .def_readwrite("callstack", &paddle::platform::HostPythonNode::callstack) .def_readwrite("children_node", &paddle::platform::HostPythonNode::children_node_ptrs) .def_readwrite("runtime_node", &paddle::platform::HostPythonNode::runtime_node_ptrs) .def_readwrite("device_node", - &paddle::platform::HostPythonNode::device_node_ptrs); + &paddle::platform::HostPythonNode::device_node_ptrs) + .def_readwrite("mem_node", + &paddle::platform::HostPythonNode::mem_node_ptrs); py::class_(m, "_Profiler") - .def("create", &paddle::platform::Profiler::Create, + .def("create", + &paddle::platform::Profiler::Create, py::return_value_policy::take_ownership) .def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported) .def("is_cnpapi_supported", @@ -3466,6 +3654,14 @@ All parameter, weight, gradient are variables in Paddle. 
})) .def("end", [](platform::RecordEvent *event) { event->End(); }); + py::enum_(m, "TracerMemEventType") + .value("Allocate", paddle::platform::TracerMemEventType::Allocate) + .value("Free", paddle::platform::TracerMemEventType::Free) + .value("ReservedAllocate", + paddle::platform::TracerMemEventType::ReservedAllocate) + .value("ReservedFree", + paddle::platform::TracerMemEventType::ReservedFree); + py::enum_(m, "TracerEventType") .value("Operator", paddle::platform::TracerEventType::Operator) .value("Dataloader", paddle::platform::TracerEventType::Dataloader) @@ -3509,22 +3705,29 @@ All parameter, weight, gradient are variables in Paddle. [](ir::Pass &self, const std::string &name, const std::string &attr) { self.Set(name, new std::string(attr)); }) - .def("set", [](ir::Pass &self, const std::string &name, - bool val) { self.Set(name, new bool(val)); }) - .def("set", [](ir::Pass &self, const std::string &name, - int val) { self.Set(name, new int(val)); }) .def("set", - [](ir::Pass &self, const std::string &name, + [](ir::Pass &self, const std::string &name, bool val) { + self.Set(name, new bool(val)); + }) + .def("set", + [](ir::Pass &self, const std::string &name, int val) { + self.Set(name, new int(val)); + }) + .def("set", + [](ir::Pass &self, + const std::string &name, std::vector set) { self.Set(name, new std::vector(set)); }) .def("set", - [](ir::Pass &self, const std::string &name, + [](ir::Pass &self, + const std::string &name, std::unordered_set set) { self.Set(name, new std::unordered_set(set)); }) .def("set", - [](ir::Pass &self, const std::string &name, + [](ir::Pass &self, + const std::string &name, std::unordered_set set) { self.Set(name, new std::unordered_set(set)); }) @@ -3769,7 +3972,8 @@ All parameter, weight, gradient are variables in Paddle. "reduce_strategy", [](const BuildStrategy &self) { return self.reduce_; }, [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -3799,7 +4003,8 @@ All parameter, weight, gradient are variables in Paddle. [](const BuildStrategy &self) { return self.gradient_scale_; }, [](BuildStrategy &self, BuildStrategy::GradientScaleStrategy strategy) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -3864,7 +4069,8 @@ All parameter, weight, gradient are variables in Paddle. "debug_graphviz_path", [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, [](BuildStrategy &self, const std::string &path) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -3891,7 +4097,8 @@ All parameter, weight, gradient are variables in Paddle. return self.enable_sequential_execution_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -3917,7 +4124,8 @@ All parameter, weight, gradient are variables in Paddle. 
return self.remove_unnecessary_lock_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -3995,7 +4203,8 @@ All parameter, weight, gradient are variables in Paddle. return self.fuse_elewise_add_act_ops_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -4020,7 +4229,8 @@ All parameter, weight, gradient are variables in Paddle. "fuse_gemm_epilogue", [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -4045,7 +4255,8 @@ All parameter, weight, gradient are variables in Paddle. "fuse_bn_act_ops", [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -4070,7 +4281,8 @@ All parameter, weight, gradient are variables in Paddle. "fuse_bn_add_act_ops", [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -4095,7 +4307,8 @@ All parameter, weight, gradient are variables in Paddle. "enable_auto_fusion", [](const BuildStrategy &self) { return self.enable_auto_fusion_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -4123,7 +4336,8 @@ All parameter, weight, gradient are variables in Paddle. return self.fuse_relu_depthwise_conv_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -4153,7 +4367,8 @@ All parameter, weight, gradient are variables in Paddle. self.fuse_broadcast_ops_ == paddle::none; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, " "cannot be configured again.")); @@ -4184,7 +4399,8 @@ All parameter, weight, gradient are variables in Paddle. self.fuse_all_optimizer_ops_ == paddle::none; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, " "cannot be configured again.")); @@ -4194,7 +4410,8 @@ All parameter, weight, gradient are variables in Paddle. 
"sync_batch_norm", [](const BuildStrategy &self) { return self.sync_batch_norm_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -4348,9 +4565,13 @@ All parameter, weight, gradient are variables in Paddle. }); pe.def(py::init &, - const std::vector &, const std::string &, - Scope *, std::vector &, const ExecutionStrategy &, - const BuildStrategy &, ir::Graph *>()) + const std::vector &, + const std::string &, + Scope *, + std::vector &, + const ExecutionStrategy &, + const BuildStrategy &, + ir::Graph *>()) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. We can only return Scope* @@ -4439,7 +4660,8 @@ All parameter, weight, gradient are variables in Paddle. PADDLE_THROW(platform::errors::Unimplemented( "Failed to convert type: %s when set IpuStrategy " "option: %s", - option.get_type(), option_name)); + option.get_type(), + option_name)); } self.InsertStringOption(option_name, option_val); } @@ -4447,7 +4669,8 @@ All parameter, weight, gradient are variables in Paddle. if (option_name.rfind("location_", 0) == 0) { for (auto option : element.second.cast()) { self.SetTensorLocation( - option_name, option.first.cast(), + option_name, + option.first.cast(), option.second.cast()); } } else if (option_name == "replicated_collectives_settings") { @@ -4501,17 +4724,19 @@ All parameter, weight, gradient are variables in Paddle. PADDLE_THROW(platform::errors::Unimplemented( "Failed to convert value type: %s when set " "IpuStrategy option: %s", - option.second.get_type(), option_key)); + option.second.get_type(), + option_key)); } - self.InsertStringPairOption(option_name, option_key, - option_val); + self.InsertStringPairOption( + option_name, option_key, option_val); } } } else { PADDLE_THROW(platform::errors::InvalidArgument( "Invalid IpuStrategy option value type: %s, please check " "input value for option: %s", - element.second.get_type(), option_name)); + element.second.get_type(), + option_name)); } } }) -- GitLab