From a224019056b9767e827a48c72b2e8a9270e04d79 Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 19 Jul 2022 16:13:09 +0800 Subject: [PATCH] Record op shape data for profiler [cherry-pick PR43405 43578 43822] (#44384) * add serialization for new field in event node (#43405) * add serialization for new field in event node * fix a bug * add more field to memory record (#43578) * Add infer shape in dygraph (#43822) * record memory and op supplement info * update * update * fix a bug * fix memory recording * fix a bug * update * update * fix a bug * update * fix a bug * fix a bug * fix a bug * update dygraph record * add infer shape record * fix * fix * fix * add comments * fix a bug * fix * fix * add record op info * fix file mode * add op input shape info * fix dependency --- .../framework/new_executor/interpretercore.cc | 80 ++- paddle/fluid/framework/operator.cc | 378 ++++++++---- paddle/fluid/imperative/CMakeLists.txt | 245 ++++++-- paddle/fluid/imperative/prepared_operator.cc | 249 ++++++-- paddle/fluid/memory/memcpy.cc | 581 ++++++++++++------ paddle/fluid/platform/profiler.cc | 222 +++++-- paddle/fluid/platform/profiler.h | 24 +- paddle/fluid/platform/profiler/CMakeLists.txt | 62 +- .../platform/profiler/chrometracing_logger.cc | 16 +- paddle/fluid/platform/profiler/common_event.h | 52 +- .../profiler/dump/deserialization_reader.cc | 93 ++- .../profiler/dump/deserialization_reader.h | 3 + .../platform/profiler/dump/nodetree.proto | 78 +++ .../profiler/dump/serialization_logger.cc | 101 ++- .../profiler/dump/serialization_logger.h | 3 + .../dump/test_serialization_logger.cc | 37 ++ paddle/fluid/platform/profiler/event_node.cc | 21 +- paddle/fluid/platform/profiler/event_node.h | 2 + .../fluid/platform/profiler/event_python.cc | 35 +- paddle/fluid/platform/profiler/event_python.h | 36 ++ .../platform/profiler/host_event_recorder.h | 74 ++- paddle/fluid/platform/profiler/host_tracer.cc | 77 ++- .../platform/profiler/supplement_tracing.h | 56 ++ .../platform/profiler/test_event_node.cc | 34 +- paddle/fluid/platform/profiler/trace_event.h | 20 +- paddle/fluid/platform/profiler/utils.cc | 3 +- paddle/fluid/pybind/pybind.cc | 4 + 27 files changed, 1975 insertions(+), 611 deletions(-) create mode 100644 paddle/fluid/platform/profiler/supplement_tracing.h diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index dd1c0d885ef..e26f45a8467 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -22,14 +22,17 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" #include "paddle/phi/core/kernel_context.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif -PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true, +PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, + true, "Use inplace in new executor"); -PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, +PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, + true, "Use local_scope in new executor(especially used " "in UT), can turn off for better performance"); @@ -167,8 +170,8 @@ paddle::framework::FetchList InterpreterCore::Run( // scope? } global_scope_->SetLocalScope(local_scope_); - paddle::framework::interpreter::build_variable_scope(block_, global_scope_, - create_local_scope_); + paddle::framework::interpreter::build_variable_scope( + block_, global_scope_, create_local_scope_); std::vector op_func_nodes; paddle::framework::interpreter::build_op_func_list( place_, block_, &op_func_nodes, global_scope_, create_local_scope_); @@ -490,7 +493,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { // If it is OperatorBase, InferShape do nothing. if (op_with_kernel != nullptr) { platform::RecordEvent infershape_event( - "infer_shape", platform::TracerEventType::OperatorInner, 1, + "infer_shape", + platform::TracerEventType::OperatorInner, + 1, platform::EventRole::kInnerOp); // see OperatorWithKernel::RunImpl in operator.cc for why @@ -499,6 +504,11 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { op_with_kernel->Info().infer_shape_( instr_node.InnerInferShapeContext().get()); } + infershape_event.End(); + platform::RecordOpInfoSupplement(op->Type(), + op->Attrs(), + *(instr_node.InnerInferShapeContext()), + *(instr_node.InnerRuntimeContext())); } } @@ -516,7 +526,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { { platform::RecordEvent compute_event( - "compute", platform::TracerEventType::OperatorInner, 1, + "compute", + platform::TracerEventType::OperatorInner, + 1, platform::EventRole::kInnerOp); if (op_with_kernel == nullptr) { instr_node.OpBase()->Run(*local_scope, place_); @@ -571,7 +583,8 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { if (op_with_kernel != nullptr && FLAGS_check_nan_inf) { VLOG(4) << "Check nan/inf"; framework::details::CheckOpHasNanOrInf( - *op, *global_scope_, + *op, + *global_scope_, place); // TODO(xiongkun03) change it to inner scope. } } @@ -596,10 +609,14 @@ void InterpreterCore::ExecuteInstructionList( for (size_t i = 0; i < dependecy_count_.size(); ++i) { if (dependecy_count_[i] == 0) { - async_work_queue_->AddTask(vec_instr.at(i).KernelType(), [ - this, i, atomic_deps = atomic_deps.get(), - atomic_var_ref = atomic_var_ref.get() - ] { RunInstructionAsync(i, atomic_deps, atomic_var_ref); }); + async_work_queue_->AddTask(vec_instr.at(i).KernelType(), + [this, + i, + atomic_deps = atomic_deps.get(), + atomic_var_ref = atomic_var_ref.get()] { + RunInstructionAsync( + i, atomic_deps, atomic_var_ref); + }); } } @@ -615,7 +632,8 @@ void InterpreterCore::ExecuteInstructionList( } VLOG(4) << "Cancel ok"; PADDLE_ENFORCE_EQ( - main_thread_blocker_.Clear(), 0, + main_thread_blocker_.Clear(), + 0, platform::errors::PreconditionNotMet( "main_thread_blocker_.Clear() return -1, clear failed")); VLOG(4) << "clear ok"; @@ -624,7 +642,8 @@ void InterpreterCore::ExecuteInstructionList( } void InterpreterCore::RunNextInstructions( - const Instruction& instr, std::queue* reserved_next_ops, + const Instruction& instr, + std::queue* reserved_next_ops, std::vector>* atomic_deps, std::vector>* atomic_var_ref) { auto& next_instr = instr.NextInstructions(); @@ -691,7 +710,8 @@ void InterpreterCore::RunNextInstructions( } void InterpreterCore::RunInstructionAsync( - size_t instr_id, std::vector>* atomic_deps, + size_t instr_id, + std::vector>* atomic_deps, std::vector>* atomic_var_ref) { std::queue ready_ops; ready_ops.push(instr_id); @@ -700,10 +720,10 @@ void InterpreterCore::RunInstructionAsync( ready_ops.pop(); auto& instr_node = vec_instruction_.at(instr_id); VLOG(5) << __func__ << " OP id:" << instr_node.Id() - << " name:" << instr_node.OpBase()->Type() - << " type:" << (instr_node.KernelType() == OpFuncType::kQueueSync - ? "kQueueSync" - : "kQueueAsync") + << " name:" << instr_node.OpBase()->Type() << " type:" + << (instr_node.KernelType() == OpFuncType::kQueueSync + ? "kQueueSync" + : "kQueueAsync") << " runs on " << platform::GetCurrentThreadName(); auto* op = instr_node.OpBase(); @@ -877,12 +897,14 @@ void InterpreterCore::CheckGC( } else { static_cast(gc_.get())->Add( - var_scope.Var(var_id), &gc_event_.at(instr_id), + var_scope.Var(var_id), + &gc_event_.at(instr_id), &instr.DeviceContext()); } #else static_cast(gc_.get())->Add( - var_scope.Var(var_id), &gc_event_.at(instr_id), + var_scope.Var(var_id), + &gc_event_.at(instr_id), &instr.DeviceContext()); #endif } @@ -891,20 +913,24 @@ void InterpreterCore::CheckGC( void InterpreterCore::Prepare( const std::vector& feed_names, - const std::vector& feed_tensors, bool prepare_feed) { - PADDLE_ENFORCE_EQ(feed_names.size(), feed_tensors.size(), + const std::vector& feed_tensors, + bool prepare_feed) { + PADDLE_ENFORCE_EQ(feed_names.size(), + feed_tensors.size(), platform::errors::PreconditionNotMet( "Required feed_names.size() == feed_tensors.size(), " "but received %d != %d", - feed_names.size(), feed_tensors.size())); + feed_names.size(), + feed_tensors.size())); auto FeedInput = [&] { VLOG(4) << "Feed inputs"; for (size_t i = 0; i < feed_names.size(); ++i) { auto* feed_var = global_scope_->FindVar(feed_names[i]); PADDLE_ENFORCE_NOT_NULL( - feed_var, platform::errors::NotFound( - "Variable %s should not be nullptr.", feed_names[i])); + feed_var, + platform::errors::NotFound("Variable %s should not be nullptr.", + feed_names[i])); auto feed_tensor = feed_var->GetMutable(); feed_tensor->ShareDataWith(feed_tensors[i]); @@ -913,8 +939,8 @@ void InterpreterCore::Prepare( }; if (!is_build_) { - paddle::framework::interpreter::build_variable_scope(block_, global_scope_, - create_local_scope_); + paddle::framework::interpreter::build_variable_scope( + block_, global_scope_, create_local_scope_); FeedInput(); std::vector op_func_nodes; paddle::framework::interpreter::build_op_func_list( diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f06ed0b496e..140525384c3 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -33,6 +33,7 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_context.h" @@ -59,7 +60,8 @@ class DenseTensor; DECLARE_bool(benchmark); DECLARE_bool(check_nan_inf); DECLARE_bool(enable_unused_var_check); -PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0, +PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, + 0, "number of threads for inner op"); DECLARE_bool(run_kp_kernel); DECLARE_bool(enable_host_event_recorder_hook); @@ -74,7 +76,8 @@ std::vector> kKernelPriority = { std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), }; -static DDim GetDimsDebug(const ScopeBase& scope, const std::string& name, +static DDim GetDimsDebug(const ScopeBase& scope, + const std::string& name, bool get_actual_dim = false) { Variable* var = scope.FindVar(name); if (var == nullptr) { @@ -268,7 +271,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { Type(), platform::TracerEventType::Operator, 1); auto op_name = platform::OpName(outputs_, Type()); platform::RecordEvent op_name_record_event( - op_name, platform::TracerEventType::Operator, + op_name, + platform::TracerEventType::Operator, FLAGS_enable_host_event_recorder_hook ? 20 : 1, platform::EventRole::kUniqueOp); RunImpl(scope, place); @@ -297,9 +301,11 @@ bool OperatorBase::HasInputs(const std::string& name) const { std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE( - ins.size(), 1UL, + ins.size(), + 1UL, platform::errors::InvalidArgument( - "Operator %s's input %s should contain only one variable.", type_, + "Operator %s's input %s should contain only one variable.", + type_, name)); return ins.empty() ? kEmptyVarName : ins[0]; } @@ -308,9 +314,10 @@ const std::vector& OperatorBase::Inputs( const std::string& name) const { auto it = inputs_.find(name); PADDLE_ENFORCE_NE( - it, inputs_.end(), - platform::errors::NotFound("Operator %s does not have the input %s.", - type_, name)); + it, + inputs_.end(), + platform::errors::NotFound( + "Operator %s does not have the input %s.", type_, name)); return it->second; } @@ -325,9 +332,11 @@ bool OperatorBase::HasOutputs(const std::string& name) const { std::string OperatorBase::Output(const std::string& name) const { auto& outs = Outputs(name); PADDLE_ENFORCE_LE( - outs.size(), 1UL, + outs.size(), + 1UL, platform::errors::InvalidArgument( - "Operator %s's output %s should contain only one variable.", type_, + "Operator %s's output %s should contain only one variable.", + type_, name)); return outs.empty() ? kEmptyVarName : outs[0]; } @@ -336,7 +345,8 @@ const std::vector& OperatorBase::Outputs( const std::string& name) const { auto it = outputs_.find(name); PADDLE_ENFORCE_NE( - it, outputs_.end(), + it, + outputs_.end(), platform::errors::NotFound( "Operator %s does not have an output called %s.", type_, name)); return it->second; @@ -484,18 +494,20 @@ void OperatorBase::CheckAllInputOutputSet() const { for (auto& in : info_->Proto().inputs()) { if (!in.dispensable() && !in.extra()) { PADDLE_ENFORCE_NE( - inputs_.find(in.name()), inputs_.end(), - platform::errors::NotFound("Operator %s's input (%s) is not set.", - Type(), in.name())); + inputs_.find(in.name()), + inputs_.end(), + platform::errors::NotFound( + "Operator %s's input (%s) is not set.", Type(), in.name())); } } for (auto& out : info_->Proto().outputs()) { if (!out.dispensable() && !out.extra()) { PADDLE_ENFORCE_NE( - outputs_.find(out.name()), outputs_.end(), - platform::errors::NotFound("Operator %s's output (%s) is not set.", - Type(), out.name())); + outputs_.find(out.name()), + outputs_.end(), + platform::errors::NotFound( + "Operator %s's output (%s) is not set.", Type(), out.name())); } } } @@ -568,10 +580,12 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const { if (it == ctx_.inputs.end()) return nullptr; PADDLE_ENFORCE_LE( - it->second.size(), 1UL, + it->second.size(), + 1UL, platform::errors::InvalidArgument( "Operator %s's input %s should contain only one variable.", - op_.Type(), name)); + op_.Type(), + name)); return it->second.empty() ? nullptr : it->second[0]; } @@ -580,10 +594,12 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const { if (it == ctx_.outputs.end()) return nullptr; PADDLE_ENFORCE_LE( - it->second.size(), 1UL, + it->second.size(), + 1UL, platform::errors::InvalidArgument( "Operator %s's output %s should contain only one variable.", - op_.Type(), name)); + op_.Type(), + name)); return it->second.empty() ? nullptr : it->second[0]; } @@ -598,10 +614,13 @@ const std::vector ExecutionContext::MultiInput( } std::vector res; res.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(res), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(res), [&](const Variable* var) -> const Tensor* { if (var == nullptr) return nullptr; - PADDLE_ENFORCE_EQ(var->IsType(), true, + PADDLE_ENFORCE_EQ(var->IsType(), + true, platform::errors::InvalidArgument( "Input variable should be LoDTensor, " "but the received type is %s.", @@ -621,7 +640,9 @@ std::vector ExecutionContext::MultiOutput( } std::vector res; res.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(res), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(res), [&](Variable* var) -> Tensor* { return var == nullptr ? nullptr : var->GetMutable(); @@ -679,7 +700,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const auto& in = it->second; if (in.size() == 0) return false; PADDLE_ENFORCE_EQ( - in.size(), 1UL, + in.size(), + 1UL, platform::errors::InvalidArgument( "Input %s should not contain more than one inputs.", name)); return in[0] != nullptr; @@ -697,7 +719,8 @@ class RuntimeInferShapeContext : public InferShapeContext { return false; } PADDLE_ENFORCE_EQ( - out.size(), 1UL, + out.size(), + 1UL, platform::errors::InvalidArgument( "Output %s should not contain more than one outputs.", name)); return out[0] != nullptr; @@ -754,11 +777,14 @@ class RuntimeInferShapeContext : public InferShapeContext { std::string GetInputNameByIdx(size_t idx) const override { auto& op_proto = paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; - PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(), + PADDLE_ENFORCE_LT(idx, + op_proto->inputs().size(), platform::errors::OutOfRange( "The index should be less than the size of inputs of " "operator %s, but got index is %d and size is %d", - op_.Type(), idx, op_proto->inputs().size())); + op_.Type(), + idx, + op_proto->inputs().size())); return op_proto->inputs()[idx].name(); } @@ -766,42 +792,55 @@ class RuntimeInferShapeContext : public InferShapeContext { auto& op_proto = paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; PADDLE_ENFORCE_LT( - idx, op_proto->outputs().size(), + idx, + op_proto->outputs().size(), platform::errors::OutOfRange( "The index should be less than the size of outputs of " "operator %s, but got index is %d and size is %d", - op_.Type(), idx, op_proto->outputs().size())); + op_.Type(), + idx, + op_proto->outputs().size())); return op_proto->outputs()[idx].name(); } - void ShareDim(const std::string& in, const std::string& out, size_t i = 0, + void ShareDim(const std::string& in, + const std::string& out, + size_t i = 0, size_t j = 0) override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); PADDLE_ENFORCE_NE( - in_it, ctx_.inputs.end(), + in_it, + ctx_.inputs.end(), platform::errors::NotFound("Input %s does not exist.", in)); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), + out_it, + ctx_.outputs.end(), platform::errors::NotFound("Output %s does not exist.", out)); - PADDLE_ENFORCE_LT(i, in_it->second.size(), + PADDLE_ENFORCE_LT(i, + in_it->second.size(), platform::errors::InvalidArgument( "The index of input dimension is out of range, " "excepted index less than %zu, but received %zu.", - in_it->second.size(), i)); - PADDLE_ENFORCE_LT(j, out_it->second.size(), + in_it->second.size(), + i)); + PADDLE_ENFORCE_LT(j, + out_it->second.size(), platform::errors::InvalidArgument( "The index of output dimension is out of range, " "excepted index less than %zu, but received %zu.", - out_it->second.size(), j)); + out_it->second.size(), + j)); Variable* in_var = in_it->second[i]; Variable* out_var = out_it->second[j]; PADDLE_ENFORCE_EQ( - in_var->Type(), out_var->Type(), + in_var->Type(), + out_var->Type(), platform::errors::InvalidArgument( - "The type of input (%s) and output (%s) are inconsistent.", in, + "The type of input (%s) and output (%s) are inconsistent.", + in, out)); if (in_var->IsType()) { @@ -825,19 +864,22 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::string& out) const override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); - PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(), + PADDLE_ENFORCE_NE(in_it, + ctx_.inputs.end(), platform::errors::NotFound( "Input [%s] found error in Op [%s]", in, op_.Type())); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), - platform::errors::NotFound("Output [%s] found error in Op [%s]", out, - op_.Type())); + out_it, + ctx_.outputs.end(), + platform::errors::NotFound( + "Output [%s] found error in Op [%s]", out, op_.Type())); auto& in_var_list = in_it->second; auto& out_var_list = out_it->second; PADDLE_ENFORCE_EQ( - in_var_list.size(), out_var_list.size(), + in_var_list.size(), + out_var_list.size(), platform::errors::PreconditionNotMet( "Op [%s]: Input var size should be equal with output var size", op_.Type())); @@ -852,10 +894,12 @@ class RuntimeInferShapeContext : public InferShapeContext { Variable* in_var = in_var_list[i]; if (!in_var->IsType()) return; Variable* out_var = out_var_list[i]; - PADDLE_ENFORCE_EQ(out_var->IsType(), true, + PADDLE_ENFORCE_EQ(out_var->IsType(), + true, platform::errors::PreconditionNotMet( "The %d-th output of Output(%s) must be LoDTensor.", - i, out_var_names[i])); + i, + out_var_names[i])); auto& in_tensor = in_var->Get(); auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); @@ -866,32 +910,41 @@ class RuntimeInferShapeContext : public InferShapeContext { } } - void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + void ShareLoD(const std::string& in, + const std::string& out, + size_t i = 0, size_t j = 0) const override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); PADDLE_ENFORCE_NE( - in_it, ctx_.inputs.end(), + in_it, + ctx_.inputs.end(), platform::errors::NotFound("Input %s does not exist.", in)); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), + out_it, + ctx_.outputs.end(), platform::errors::NotFound("Output %s does not exist.", out)); - PADDLE_ENFORCE_LT(i, in_it->second.size(), + PADDLE_ENFORCE_LT(i, + in_it->second.size(), platform::errors::InvalidArgument( "The index of input dimension is out of range, " "excepted index less than %zu, but received %zu.", - in_it->second.size(), i)); - PADDLE_ENFORCE_LT(j, out_it->second.size(), + in_it->second.size(), + i)); + PADDLE_ENFORCE_LT(j, + out_it->second.size(), platform::errors::InvalidArgument( "The index of output dimension is out of range, " "excepted index less than %zu, but received %zu.", - out_it->second.size(), j)); + out_it->second.size(), + j)); Variable* in_var = in_it->second.at(i); if (!in_var->IsType()) return; Variable* out_var = out_it->second.at(j); PADDLE_ENFORCE_EQ( - out_var->IsType(), true, + out_var->IsType(), + true, platform::errors::InvalidArgument( "The %zu-th output of Output(%s) must be LoDTensor.", j, out)); auto& in_tensor = in_var->Get(); @@ -926,7 +979,8 @@ class RuntimeInferShapeContext : public InferShapeContext { "set in the runtime kernel.")); } - void SetLoDLevel(const std::string& out, int32_t lod_level, + void SetLoDLevel(const std::string& out, + int32_t lod_level, size_t j = 0) const override { PADDLE_THROW(platform::errors::PreconditionNotMet( "SetLoDLevel is only used in compile time. The calculation of " @@ -969,10 +1023,12 @@ class RuntimeInferShapeContext : public InferShapeContext { DDim GetInputDim(const std::string& name) const override { const std::vector& vars = InputVars(name); PADDLE_ENFORCE_EQ( - vars.size(), 1UL, + vars.size(), + 1UL, platform::errors::InvalidArgument( "Input(%s) should hold one element, but now it holds %zu elements.", - name, vars.size())); + name, + vars.size())); return this->GetDim(vars[0]); } @@ -998,10 +1054,12 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetOutputDim(const std::string& name, const DDim& dim) override { auto& vars = OutputVars(name); PADDLE_ENFORCE_EQ( - vars.size(), 1UL, + vars.size(), + 1UL, platform::errors::InvalidArgument("Output(%s) should hold one element, " "but now it holds %zu elements.", - name, vars.size())); + name, + vars.size())); SetDim(vars[0], dim); } @@ -1038,7 +1096,9 @@ class RuntimeInferShapeContext : public InferShapeContext { std::vector GetDims(const std::vector& vars) const { std::vector ret; ret.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(ret), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(ret), [this](Variable* var) { return this->GetDim(var); }); return ret; } @@ -1064,12 +1124,14 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetDims(const std::vector& vars, const std::vector& dims) { size_t length = vars.size(); - PADDLE_ENFORCE_EQ(length, dims.size(), + PADDLE_ENFORCE_EQ(length, + dims.size(), platform::errors::InvalidArgument( "The number of input variables do not match the " "number of input dimensions, the number of variables " "is %zu, the number of dimensions is %zu.", - length, dims.size())); + length, + dims.size())); for (size_t i = 0; i < length; ++i) { if (vars[i] == nullptr) { continue; @@ -1088,9 +1150,12 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& vars) const { std::vector retv; retv.resize(vars.size()); - std::transform(vars.begin(), vars.end(), retv.begin(), + std::transform(vars.begin(), + vars.end(), + retv.begin(), std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType), - this, std::placeholders::_1)); + this, + std::placeholders::_1)); return retv; } @@ -1102,7 +1167,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& InputVars(const std::string& name) const { auto it = ctx_.inputs.find(name); PADDLE_ENFORCE_NE( - it, ctx_.inputs.end(), + it, + ctx_.inputs.end(), platform::errors::NotFound( "Operator (%s) does not have the input (%s).", op_.Type(), name)); return it->second; @@ -1111,7 +1177,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& OutputVars(const std::string& name) const { auto it = ctx_.outputs.find(name); PADDLE_ENFORCE_NE( - it, ctx_.outputs.end(), + it, + ctx_.outputs.end(), platform::errors::NotFound( "Operator (%s) does not have the outputs (%s).", op_.Type(), name)); return it->second; @@ -1132,20 +1199,23 @@ static void CheckTensorNANOrInf(const std::string& op_type, return; } PADDLE_ENFORCE_NE( - framework::TensorContainsInf(tensor), true, - platform::errors::Fatal("Operator %s output Tensor %s contains Inf.", - op_type, name)); + framework::TensorContainsInf(tensor), + true, + platform::errors::Fatal( + "Operator %s output Tensor %s contains Inf.", op_type, name)); PADDLE_ENFORCE_NE( - framework::TensorContainsNAN(tensor), true, - platform::errors::Fatal("Operator %s output Tensor %s contains NAN.", - op_type, name)); + framework::TensorContainsNAN(tensor), + true, + platform::errors::Fatal( + "Operator %s output Tensor %s contains NAN.", op_type, name)); } bool OperatorWithKernel::SupportGPU() const { auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( phi::TransToPhiKernelName(type_)); auto has_phi_kernel = - std::any_of(phi_kernels.begin(), phi_kernels.end(), + std::any_of(phi_kernels.begin(), + phi_kernels.end(), [](phi::KernelKeyMap::const_reference kern_pair) { return kern_pair.first.backend() == phi::Backend::GPU; }); @@ -1158,7 +1228,8 @@ bool OperatorWithKernel::SupportGPU() const { } else { auto& op_kernels = kernel_iter->second; return std::any_of( - op_kernels.begin(), op_kernels.end(), + op_kernels.begin(), + op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { return platform::is_gpu_place(kern_pair.first.place_); }); @@ -1170,7 +1241,8 @@ bool OperatorWithKernel::SupportNPU() const { auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( phi::TransToPhiKernelName(type_)); auto has_phi_kernel = - std::any_of(phi_kernels.begin(), phi_kernels.end(), + std::any_of(phi_kernels.begin(), + phi_kernels.end(), [](phi::KernelKeyMap::const_reference kern_pair) { return kern_pair.first.backend() == phi::Backend::NPU; }); @@ -1183,7 +1255,8 @@ bool OperatorWithKernel::SupportNPU() const { } else { auto& op_kernels = kernel_iter->second; return std::any_of( - op_kernels.begin(), op_kernels.end(), + op_kernels.begin(), + op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { return platform::is_npu_place(kern_pair.first.place_); }); @@ -1195,14 +1268,16 @@ bool OperatorWithKernel::SupportsMKLDNN( const proto::VarType::Type data_type) const { auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_); if (op_kernel_iter == OperatorWithKernel::AllOpKernels().end()) { - VLOG(6) << "Warning: " << type_ << " don't find its MKLDNN Kernel in Fluid " - "Registered Kernels. And We don't " - "search its kernels in phi lib, " - "SupportsMKLDNN() return false."; + VLOG(6) << "Warning: " << type_ + << " don't find its MKLDNN Kernel in Fluid " + "Registered Kernels. And We don't " + "search its kernels in phi lib, " + "SupportsMKLDNN() return false."; return false; } auto& op_kernels = op_kernel_iter->second; - return std::any_of(op_kernels.begin(), op_kernels.end(), + return std::any_of(op_kernels.begin(), + op_kernels.end(), [data_type](OpKernelMap::const_reference kern_pair) { return platform::is_cpu_place(kern_pair.first.place_) && kern_pair.first.library_type_ == @@ -1366,7 +1441,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) && !is_xpu_unsupport #endif - ) { + ) { run_phi_kernel_ = true; } else { auto& all_op_kernels = AllOpKernels(); @@ -1399,7 +1474,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #if defined(PADDLE_WITH_XPU_KP) || (is_xpu_unsupport && !is_xpu_kp_support) #endif - ) { + ) { auto pt_cpu_kernel_key = FallBackToCpu(*kernel_type_.get(), pt_kernel_key, *this); pt_kernel_.reset( @@ -1429,10 +1504,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, { platform::RecordEvent record_event("prepare_data", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); if (need_prepare_data_) { - transfer_scope = PrepareData(scope, *kernel_type_, - &transfered_inplace_vars, runtime_ctx); + transfer_scope = PrepareData( + scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx); } } // exec scope is the scope that kernel actually executed on. @@ -1442,9 +1518,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope, if (!all_kernels_must_compute_runtime_shape_) { platform::RecordEvent record_event("infer_shape", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx); this->Info().infer_shape_(&infer_shape_ctx); + record_event.End(); + platform::RecordOpInfoSupplement( + Type(), Attrs(), infer_shape_ctx, *runtime_ctx); } if (FLAGS_enable_unused_var_check) { @@ -1456,7 +1536,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, { platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); if (run_phi_kernel_) { phi::KernelContext pt_kernel_context; // Do data transform before building KernelContext @@ -1584,7 +1665,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), + kernels_iter, + all_op_kernels.end(), platform::errors::Unavailable( "There are no kernels which are registered in the %s operator.", type_)); @@ -1706,10 +1788,12 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator (%s) does not have kernel for %s.", type_, - KernelTypeToString(expected_kernel_key))); + PADDLE_ENFORCE_NE( + kernel_iter, + kernels.end(), + platform::errors::NotFound("Operator (%s) does not have kernel for %s.", + type_, + KernelTypeToString(expected_kernel_key))); std::lock_guard lock(cache_update_mutex_); if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) { @@ -1719,7 +1803,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { } void OperatorWithKernel::TransferInplaceVarsBack( - const Scope& scope, const std::vector& inplace_vars, + const Scope& scope, + const std::vector& inplace_vars, const Scope& transfer_scope) const { for (auto& var_name : inplace_vars) { VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; @@ -1730,8 +1815,9 @@ void OperatorWithKernel::TransferInplaceVarsBack( auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var); auto* var = transfer_scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument( - "The variable[%s] is nullptr.", var_name)); + PADDLE_ENFORCE_NOT_NULL(var, + platform::errors::InvalidArgument( + "The variable[%s] is nullptr.", var_name)); auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto original_dims = original_tensor->dims(); original_tensor->ShareDataWith(*transformed_tensor); @@ -1811,7 +1897,8 @@ void OperatorWithKernel::HandleComplexGradToRealGrad( } Scope* OperatorWithKernel::PrepareData( - const Scope& scope, const OpKernelType& expected_kernel_key, + const Scope& scope, + const OpKernelType& expected_kernel_key, std::vector* transfered_inplace_vars, RuntimeContext* ctx) const { Scope* new_scope = nullptr; @@ -1867,8 +1954,8 @@ Scope* OperatorWithKernel::PrepareData( input_vars[i] = trans_var; auto out = trans_var->GetMutable(); out->Resize(tensor_in->dims()); - platform::MatchShapeToLayout(out, tensor_in->layout(), - DataLayout::kNHWC); + platform::MatchShapeToLayout( + out, tensor_in->layout(), DataLayout::kNHWC); VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , " "but kNHWC layout" << var_name_item.first << " in Operator " << type_; @@ -1915,8 +2002,8 @@ Scope* OperatorWithKernel::PrepareData( if (!run_by_executor_ && (platform::is_gpu_place(kernel_type_for_var.place_) || platform::is_gpu_place(expected_kernel_key.place_))) { - new_scope = TryCreateTransferScope(kernel_type_for_var, - expected_kernel_key, &scope); + new_scope = TryCreateTransferScope( + kernel_type_for_var, expected_kernel_key, &scope); enable_cache_transfer_scope_ = true; } if (!new_scope) { @@ -1978,7 +2065,8 @@ Scope* OperatorWithKernel::PrepareData( } void OperatorWithKernel::ParseInputDataType( - const Variable* var, const std::string& name, + const Variable* var, + const std::string& name, proto::VarType::Type* data_type) const { if (var != nullptr) { const Tensor* t = nullptr; @@ -1998,17 +2086,20 @@ void OperatorWithKernel::ParseInputDataType( } if (t != nullptr) { PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " "contains uninitialized Tensor.", - Type(), name)); + Type(), + name)); *data_type = paddle::framework::TransToProtoVarType(t->dtype()); } } } void OperatorWithKernel::ParseMultiInputDataType( - const std::vector& vars, const std::string& name, + const std::vector& vars, + const std::string& name, proto::VarType::Type* data_type) const { proto::VarType::Type default_data_type = static_cast(-1); @@ -2032,10 +2123,12 @@ void OperatorWithKernel::ParseMultiInputDataType( } if (t != nullptr) { PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " "contains uninitialized Tensor.", - Type(), name)); + Type(), + name)); proto::VarType::Type tmp = paddle::framework::TransToProtoVarType(t->dtype()); PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type, @@ -2045,7 +2138,9 @@ void OperatorWithKernel::ParseMultiInputDataType( "consistent or reigster GetExpectedKernelType. The " "current variable type is (%s), but the " "previous variable type is (%s).", - Type(), name, DataTypeToString(tmp), + Type(), + name, + DataTypeToString(tmp), DataTypeToString(*data_type))); *data_type = tmp; } @@ -2066,7 +2161,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( } } PADDLE_ENFORCE_NE( - data_type, dafault_data_type, + data_type, + dafault_data_type, platform::errors::NotFound( "DataType should be indicated by input Variable at %s.", Type())); return data_type; @@ -2083,12 +2179,14 @@ proto::VarType::Type OperatorWithKernel::IndicateVarDataType( ParseMultiInputDataType(ctx.MultiInputVar(name), name, &data_type); } PADDLE_ENFORCE_NE( - data_type, dafault_data_type, + data_type, + dafault_data_type, platform::errors::InvalidArgument( "The Input Variable(%s) of (%s) Operator used to determine kernel " "data type is empty or not LoDTensor or SelectedRows or " "LoDTensorArray.", - name, Type())); + name, + Type())); return data_type; } @@ -2120,11 +2218,14 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( t, platform::errors::InvalidArgument( "The Tensor of variable %s is nullptr when promote complex types.")); - PADDLE_ENFORCE_EQ(t->IsInitialized(), true, + PADDLE_ENFORCE_EQ(t->IsInitialized(), + true, platform::errors::InvalidArgument( "The Tensor in the %s Op's Input Variable %s(%s) is " "not initialized.", - Type(), name, ctx.InputName(name))); + Type(), + name, + ctx.InputName(name))); return t; } @@ -2136,7 +2237,8 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( * the kernel data type. */ proto::VarType::Type OperatorWithKernel::IndicateOrPromoteVarDataTypes( - const ExecutionContext& ctx, const std::string& name1, + const ExecutionContext& ctx, + const std::string& name1, const std::string& name2) const { // 1. Get tensor auto* tensor_a = GetTensorFormInputSafely(ctx, name1); @@ -2158,10 +2260,11 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType( } OpKernelType OperatorWithKernel::GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, + const Tensor& tensor, const OpKernelType& expected_kernel_type) const { - return OpKernelType(expected_kernel_type.data_type_, tensor.place(), - tensor.layout()); + return OpKernelType( + expected_kernel_type.data_type_, tensor.place(), tensor.layout()); } phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( @@ -2172,8 +2275,9 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( if (arg_map_fn) { arg_map_fn_.reset(new phi::ArgumentMappingFn(*arg_map_fn)); } else { - auto func = [this]( - const phi::ArgumentMappingContext& ctx) -> phi::KernelSignature { + auto func = + [this]( + const phi::ArgumentMappingContext& ctx) -> phi::KernelSignature { return phi::DefaultKernelSignatureMap::Instance().Get(type_); }; arg_map_fn_.reset(new phi::ArgumentMappingFn(func)); @@ -2183,16 +2287,19 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( } Scope* OperatorWithKernel::PreparePhiData( - const Scope& scope, const phi::Kernel& pt_kernel, + const Scope& scope, + const phi::Kernel& pt_kernel, const phi::KernelSignature& pt_kernel_signature, RuntimeContext* ctx) const { const auto& input_names = pt_kernel_signature.input_names; auto input_defs = pt_kernel.args_def().input_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(), + PADDLE_ENFORCE_EQ(input_names.size(), + input_defs.size(), platform::errors::InvalidArgument( "The size of inputs_args names (%d) must be equal to " "the size of kernel input_defs (%d).", - input_names.size(), input_defs.size())); + input_names.size(), + input_defs.size())); Scope* new_scope = nullptr; auto& name_map = Inputs(); const std::unordered_set* no_buffer_ins = nullptr; @@ -2279,7 +2386,8 @@ Scope* OperatorWithKernel::PreparePhiData( } void OperatorWithKernel::BuildPhiKernelContext( - const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, + const RuntimeContext& ctx, + platform::DeviceContext* dev_ctx, phi::KernelContext* pt_kernel_context) const { pt_kernel_context->SetDeviceContext(dev_ctx); @@ -2291,23 +2399,29 @@ void OperatorWithKernel::BuildPhiKernelContext( auto attr_defs = pt_kernel_->args_def().attribute_defs(); auto output_defs = pt_kernel_->args_def().output_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(), + PADDLE_ENFORCE_EQ(input_names.size(), + input_defs.size(), platform::errors::InvalidArgument( "The size of inputs_args names (%d) must be equal to " "the size of kernel input_defs (%d).", - input_names.size(), input_defs.size())); + input_names.size(), + input_defs.size())); - PADDLE_ENFORCE_EQ(output_names.size(), output_defs.size(), + PADDLE_ENFORCE_EQ(output_names.size(), + output_defs.size(), platform::errors::InvalidArgument( "The size of outputs_args names (%d) must be equal to " "the size of kernel output_defs (%d).", - output_names.size(), output_defs.size())); + output_names.size(), + output_defs.size())); - PADDLE_ENFORCE_EQ(attr_names.size(), attr_defs.size(), + PADDLE_ENFORCE_EQ(attr_names.size(), + attr_defs.size(), platform::errors::InvalidArgument( "The size of attribute_args names (%d) must be equal " "to the size of kernel attribute_defs (%d).", - attr_names.size(), attr_defs.size())); + attr_names.size(), + attr_defs.size())); for (size_t i = 0; i < input_names.size(); ++i) { auto it = ctx.inputs.find(input_names[i]); @@ -2489,7 +2603,8 @@ void OperatorWithKernel::BuildPhiKernelContext( break; case phi::AttributeType::SCALARS: { PADDLE_ENFORCE_NE( - attr_iter, Attrs().end(), + attr_iter, + Attrs().end(), platform::errors::NotFound("(%s) is not found in AttributeMap when " "buildind static KernelContext.", attr_names[i])); @@ -2553,7 +2668,8 @@ void OperatorWithKernel::BuildPhiKernelContext( } break; default: { PADDLE_ENFORCE_NE( - attr_iter, Attrs().end(), + attr_iter, + Attrs().end(), platform::errors::NotFound("(%s) is not found in AttributeMap when " "buildind static KernelContext.", attr_names[i])); diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 107bbdf09a0..7b0d876e98f 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,65 +1,216 @@ -cc_library(imperative_flag SRCS flags.cc DEPS gflags flags) -cc_library(var_helper SRCS var_helper.cc DEPS tensor phi_api) -IF(WITH_XPU) -cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils phi_api phi phi_utils var_helper) -ELSE() -cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils phi_api phi phi_utils var_helper) -ENDIF() -cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry var_helper phi_api) +cc_library( + imperative_flag + SRCS flags.cc + DEPS gflags flags) +cc_library( + var_helper + SRCS var_helper.cc + DEPS tensor phi_api) +if(WITH_XPU) + cc_library( + prepared_operator + SRCS prepared_operator.cc + DEPS xpu_op_list + proto_desc + operator + device_context + lod_tensor + selected_rows_utils + var_type_traits + op_kernel_type + data_transform + nan_inf_utils + phi_api + phi_utils + var_helper + profiler) +else() + cc_library( + prepared_operator + SRCS prepared_operator.cc + DEPS proto_desc + operator + device_context + lod_tensor + selected_rows_utils + var_type_traits + op_kernel_type + data_transform + nan_inf_utils + phi_api + phi_utils + var_helper + profiler) +endif() +cc_library( + layer + SRCS layer.cc + DEPS prepared_operator + math_function + imperative_flag + variable_helper + op_registry + var_helper + phi_api) add_subdirectory(jit) -if (WITH_GPU) -cc_library(layout_autotune SRCS layout_autotune.cc DEPS op_info phi_gpu_info) +if(WITH_GPU) + cc_library( + layout_autotune + SRCS layout_autotune.cc + DEPS op_info phi_gpu_info) else() -cc_library(layout_autotune SRCS layout_autotune.cc DEPS op_info) + cc_library( + layout_autotune + SRCS layout_autotune.cc + DEPS op_info) endif() -cc_library(amp SRCS amp_auto_cast.cc DEPS layer var_helper) -cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper layout_autotune) -cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator switch_autotune) -cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator switch_autotune) -cc_library(imperative_profiler SRCS profiler.cc DEPS flags) +cc_library( + amp + SRCS amp_auto_cast.cc + DEPS layer var_helper) +cc_library( + tracer + SRCS tracer.cc + DEPS layer + engine + program_desc_tracer + amp + denormal + garbage_collector + var_helper + layout_autotune) +cc_library( + basic_engine + SRCS basic_engine.cc + DEPS layer gradient_accumulator switch_autotune) +cc_library( + engine + SRCS basic_engine.cc partial_grad_engine.cc + DEPS layer gradient_accumulator switch_autotune) +cc_library( + imperative_profiler + SRCS profiler.cc + DEPS flags) if(NOT WIN32) - if(WITH_NCCL OR WITH_RCCL) - cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows_utils tensor) - cc_library(nccl_context SRCS nccl_context.cc DEPS collective_helper device_context imperative_all_reduce var_type_traits) - if(WITH_NCCL) - nv_library(reducer SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce) - endif() - if(WITH_RCCL) - hip_library(reducer SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce) - endif() - endif() - if(WITH_XPU_BKCL) - cc_library(bkcl_context SRCS bkcl_context.cc DEPS collective_helper device_context tensor var_type_traits) - cc_library(reducer SRCS reducer.cc DEPS layer) + if(WITH_NCCL OR WITH_RCCL) + cc_library( + imperative_all_reduce + SRCS all_reduce.cc + DEPS collective_helper device_context selected_rows_utils tensor) + cc_library( + nccl_context + SRCS nccl_context.cc + DEPS collective_helper device_context imperative_all_reduce + var_type_traits) + if(WITH_NCCL) + nv_library( + reducer + SRCS reducer.cc reducer.cu + DEPS layer imperative_all_reduce) endif() - if(WITH_ASCEND_CL) - cc_library(hccl_context SRCS hccl_context.cc DEPS collective_helper device_context tensor var_type_traits) - cc_library(reducer SRCS reducer.cc DEPS layer) + if(WITH_RCCL) + hip_library( + reducer + SRCS reducer.cc reducer.cu + DEPS layer imperative_all_reduce) endif() - if(WITH_CNCL) - cc_library(cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits) - cc_library(reducer SRCS reducer.cc DEPS layer) - endif() - if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) - cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits) - endif() - cc_library(data_loader SRCS data_loader.cc DEPS enforce) + endif() + if(WITH_XPU_BKCL) + cc_library( + bkcl_context + SRCS bkcl_context.cc + DEPS collective_helper device_context tensor var_type_traits) + cc_library( + reducer + SRCS reducer.cc + DEPS layer) + endif() + if(WITH_ASCEND_CL) + cc_library( + hccl_context + SRCS hccl_context.cc + DEPS collective_helper device_context tensor var_type_traits) + cc_library( + reducer + SRCS reducer.cc + DEPS layer) + endif() + if(WITH_CNCL) + cc_library( + cncl_context + SRCS cncl_context.cc + DEPS collective_helper device_context tensor var_type_traits) + cc_library( + reducer + SRCS reducer.cc + DEPS layer) + endif() + if(WITH_NCCL + OR WITH_RCCL + OR WITH_XPU_BKCL + OR WITH_ASCEND_CL) + cc_library( + heter_ccl_context + SRCS heter_ccl_context.cc + DEPS collective_helper device_context tensor var_type_traits) + endif() + cc_library( + data_loader + SRCS data_loader.cc + DEPS enforce) endif(NOT WIN32) if(WITH_GLOO) - cc_library(imperative_gloo_context SRCS gloo_context.cc DEPS collective_helper device_context tensor var_type_traits) - if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL OR WITH_CNCL) )) - cc_library(reducer SRCS reducer.cc DEPS layer) - endif() + cc_library( + imperative_gloo_context + SRCS gloo_context.cc + DEPS collective_helper device_context tensor var_type_traits) + if(WIN32 + OR (NOT + (WITH_NCCL + OR WITH_RCCL + OR WITH_XPU_BKCL + OR WITH_ASCEND_CL + OR WITH_CNCL) + )) + cc_library( + reducer + SRCS reducer.cc + DEPS layer) + endif() endif() if(WITH_MLU) - SET(MLU_DEPS mlu_baseop) + set(MLU_DEPS mlu_baseop) endif() if(NOT WITH_ASCEND_CL) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor ${MLU_DEPS}) + cc_library( + gradient_accumulator + SRCS gradient_accumulator.cc + DEPS blas + operator + lod_tensor + selected_rows_utils + selected_rows_functor + var_type_traits + layer + math_function + phi_tensor + ${MLU_DEPS}) else() -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner phi_tensor) + cc_library( + gradient_accumulator + SRCS gradient_accumulator.cc + DEPS blas + operator + lod_tensor + selected_rows_utils + selected_rows_functor + var_type_traits + layer + math_function + npu_op_runner + phi_tensor) endif() add_subdirectory(tests) diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index bf69f6cf5ac..bcdcf740407 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -28,6 +28,7 @@ #include "paddle/fluid/framework/library_type.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" DECLARE_bool(check_nan_inf); DECLARE_bool(benchmark); @@ -91,8 +92,8 @@ void HandleComplexGradToRealGrad(const NameVarMap& outs) { << framework::DataTypeToString(var->ForwardDataType()) << " real var in dynamic graph."; framework::Tensor out; - framework::TransComplexToReal(var->ForwardDataType(), var->DataType(), - *tensor, &out); + framework::TransComplexToReal( + var->ForwardDataType(), var->DataType(), *tensor, &out); SetTensorToVariable(var->Var(), out, var->MutableVar()); } } @@ -147,8 +148,10 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, template PreparedOp PrepareImpl( - const NameVarMap& ins, const NameVarMap& outs, - const framework::OperatorWithKernel& op, const platform::Place& place, + const NameVarMap& ins, + const NameVarMap& outs, + const framework::OperatorWithKernel& op, + const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const phi::KernelFactory& phi_kernel_factory, @@ -254,7 +257,7 @@ PreparedOp PrepareImpl( #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) && !is_xpu_unsupport #endif - ) { + ) { VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_kernel_key << " | kernel: " << phi_kernel; @@ -263,9 +266,14 @@ PreparedOp PrepareImpl( dev_ctx = pool.Get(expected_kernel_key.place_); } - return PreparedOp(op, empty_ctx, expected_kernel_key, arg_map_fn, - default_kernel_signature, std::move(kernel_signature), - phi_kernel, dev_ctx); + return PreparedOp(op, + empty_ctx, + expected_kernel_key, + arg_map_fn, + default_kernel_signature, + std::move(kernel_signature), + phi_kernel, + dev_ctx); } else { VLOG(6) << "Dynamic mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; @@ -302,7 +310,7 @@ PreparedOp PrepareImpl( #if defined(PADDLE_WITH_XPU_KP) || (is_xpu_unsupport && !is_xpu_kp_support) #endif - ) { + ) { if (has_phi_kernel) { auto pt_cpu_kernel_key = FallBackToCpu(expected_kernel_key, pt_kernel_key, op); @@ -313,15 +321,21 @@ PreparedOp PrepareImpl( << " | kernel key: " << pt_cpu_kernel_key << " | kernel: " << pt_cpu_kernel; auto* cpu_ctx = pool.Get(paddle::platform::CPUPlace()); - return PreparedOp(op, empty_ctx, expected_kernel_key, arg_map_fn, - default_kernel_signature, std::move(kernel_signature), - pt_cpu_kernel, cpu_ctx); + return PreparedOp(op, + empty_ctx, + expected_kernel_key, + arg_map_fn, + default_kernel_signature, + std::move(kernel_signature), + pt_cpu_kernel, + cpu_ctx); } } } PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), + kernels_iter, + all_op_kernels.end(), platform::errors::NotFound( "There are no kernels which are registered in the %s operator.", op.Type())); @@ -397,17 +411,24 @@ PreparedOp PrepareImpl( #endif // TODO(jiabin): Add operator.cc's line 1000 part back when we need that // case - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator %s does not have kernel for %s.", op.Type(), - KernelTypeToString(expected_kernel_key))); + PADDLE_ENFORCE_NE( + kernel_iter, + kernels.end(), + platform::errors::NotFound("Operator %s does not have kernel for %s.", + op.Type(), + KernelTypeToString(expected_kernel_key))); if (!(expected_kernel_key.place_ == place)) { dev_ctx = pool.Get(expected_kernel_key.place_); } - return PreparedOp(op, empty_ctx, expected_kernel_key, kernel_iter->second, - arg_map_fn, default_kernel_signature, dev_ctx); + return PreparedOp(op, + empty_ctx, + expected_kernel_key, + kernel_iter->second, + arg_map_fn, + default_kernel_signature, + dev_ctx); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, @@ -416,8 +437,14 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - return PrepareImpl(ins, outs, op, place, attrs, default_attrs, - phi_kernel_factory, phi_op_utils_map, + return PrepareImpl(ins, + outs, + op, + place, + attrs, + default_attrs, + phi_kernel_factory, + phi_op_utils_map, default_phi_kernel_sig_map); } @@ -427,9 +454,15 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - return PrepareImpl( - ins, outs, op, place, attrs, default_attrs, phi_kernel_factory, - phi_op_utils_map, default_phi_kernel_sig_map); + return PrepareImpl(ins, + outs, + op, + place, + attrs, + default_attrs, + phi_kernel_factory, + phi_op_utils_map, + default_phi_kernel_sig_map); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, @@ -438,39 +471,58 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - return PrepareImpl( - ins, outs, op, place, attrs, default_attrs, phi_kernel_factory, - phi_op_utils_map, default_phi_kernel_sig_map); + return PrepareImpl(ins, + outs, + op, + place, + attrs, + default_attrs, + phi_kernel_factory, + phi_op_utils_map, + default_phi_kernel_sig_map); } template static void PreparedOpRunImpl( - const framework::OperatorBase& op, const framework::RuntimeContext& ctx, + const framework::OperatorBase& op, + const framework::RuntimeContext& ctx, const framework::OpKernelType& kernel_type, const framework::OperatorWithKernel::OpKernelFunc& func, const phi::ArgumentMappingFn* arg_map_fn, const phi::KernelSignature* default_kernel_signature, - platform::DeviceContext* dev_ctx, const NameVarMap& ins, - const NameVarMap& outs, const framework::AttributeMap& attrs, + platform::DeviceContext* dev_ctx, + const NameVarMap& ins, + const NameVarMap& outs, + const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { // TODO(zjl): remove scope in dygraph { platform::RecordEvent record_event("infer_shape", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); - DygraphInferShapeContext infer_shape_ctx( - &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type, - arg_map_fn, default_kernel_signature); + 1, + platform::EventRole::kInnerOp); + DygraphInferShapeContext infer_shape_ctx(&ins, + &outs, + &attrs, + &default_attrs, + op.Type(), + &kernel_type, + arg_map_fn, + default_kernel_signature); op.Info().infer_shape_(&infer_shape_ctx); + record_event.End(); + platform::RecordOpInfoSupplement( + op.Type(), op.Attrs(), infer_shape_ctx, ctx); } { platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); - func(DygraphExecutionContext(op, empty_scope, *dev_ctx, ctx, ins, - outs, attrs, default_attrs)); + func(DygraphExecutionContext( + op, empty_scope, *dev_ctx, ctx, ins, outs, attrs, default_attrs)); } if (FLAGS_check_nan_inf) { @@ -509,30 +561,48 @@ static void PreparedOpRunPtImpl( const framework::OpKernelType& kernel_type, const phi::ArgumentMappingFn* arg_map_fn, const phi::KernelSignature* default_kernel_signature, - const phi::KernelSignature& kernel_signature, const phi::Kernel& phi_kernel, - platform::DeviceContext* dev_ctx, const NameVarMap& ins, - const NameVarMap& outs, const framework::AttributeMap& attrs, + const phi::KernelSignature& kernel_signature, + const phi::Kernel& phi_kernel, + platform::DeviceContext* dev_ctx, + const NameVarMap& ins, + const NameVarMap& outs, + const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { { platform::RecordEvent record_event("infer_shape", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); - DygraphInferShapeContext infer_shape_ctx( - &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type, - arg_map_fn, default_kernel_signature); + 1, + platform::EventRole::kInnerOp); + DygraphInferShapeContext infer_shape_ctx(&ins, + &outs, + &attrs, + &default_attrs, + op.Type(), + &kernel_type, + arg_map_fn, + default_kernel_signature); op.Info().infer_shape_(&infer_shape_ctx); + record_event.End(); + platform::RecordOpInfoSupplement( + op.Type(), op.Attrs(), infer_shape_ctx, kernel_signature); } { platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); PreparePhiData(phi_kernel, kernel_signature, ins); phi::KernelContext pt_kernel_context; - BuildDygraphPhiKernelContext(kernel_signature, phi_kernel, ins, - outs, attrs, default_attrs, dev_ctx, + BuildDygraphPhiKernelContext(kernel_signature, + phi_kernel, + ins, + outs, + attrs, + default_attrs, + dev_ctx, &pt_kernel_context); phi_kernel(&pt_kernel_context); @@ -561,14 +631,29 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_phi_kernel_) { - PreparedOpRunPtImpl(op_, kernel_type_, arg_map_fn_, - default_kernel_signature_, kernel_signature_, - phi_kernel_, dev_ctx_, ins, outs, attrs, + PreparedOpRunPtImpl(op_, + kernel_type_, + arg_map_fn_, + default_kernel_signature_, + kernel_signature_, + phi_kernel_, + dev_ctx_, + ins, + outs, + attrs, default_attrs); } else { - PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, arg_map_fn_, - default_kernel_signature_, dev_ctx_, ins, outs, - attrs, default_attrs); + PreparedOpRunImpl(op_, + ctx_, + kernel_type_, + func_, + arg_map_fn_, + default_kernel_signature_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } } @@ -577,14 +662,29 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_phi_kernel_) { - PreparedOpRunPtImpl( - op_, kernel_type_, arg_map_fn_, default_kernel_signature_, - kernel_signature_, phi_kernel_, dev_ctx_, ins, outs, attrs, - default_attrs); + PreparedOpRunPtImpl(op_, + kernel_type_, + arg_map_fn_, + default_kernel_signature_, + kernel_signature_, + phi_kernel_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } else { - PreparedOpRunImpl( - op_, ctx_, kernel_type_, func_, arg_map_fn_, default_kernel_signature_, - dev_ctx_, ins, outs, attrs, default_attrs); + PreparedOpRunImpl(op_, + ctx_, + kernel_type_, + func_, + arg_map_fn_, + default_kernel_signature_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } } @@ -593,14 +693,29 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_phi_kernel_) { - PreparedOpRunPtImpl( - op_, kernel_type_, arg_map_fn_, default_kernel_signature_, - kernel_signature_, phi_kernel_, dev_ctx_, ins, outs, attrs, - default_attrs); + PreparedOpRunPtImpl(op_, + kernel_type_, + arg_map_fn_, + default_kernel_signature_, + kernel_signature_, + phi_kernel_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } else { - PreparedOpRunImpl( - op_, ctx_, kernel_type_, func_, arg_map_fn_, default_kernel_signature_, - dev_ctx_, ins, outs, attrs, default_attrs); + PreparedOpRunImpl(op_, + ctx_, + kernel_type_, + func_, + arg_map_fn_, + default_kernel_signature_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } } diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 3198b4f8d93..ae2c0aa612e 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/common/place.h" #ifdef PADDLE_WITH_XPU @@ -33,8 +33,12 @@ namespace memory { #ifdef PADDLE_WITH_CUSTOM_DEVICE template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::CustomPlace src_place, - const void* src, size_t num, void* stream) { + platform::CPUPlace dst_place, + void* dst, + platform::CustomPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); @@ -52,8 +56,12 @@ void Copy( template <> void Copy( - platform::CustomPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::CustomPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); @@ -70,8 +78,12 @@ void Copy( template <> void Copy( - platform::CustomPlace dst_place, void* dst, platform::CustomPlace src_place, - const void* src, size_t num, void* stream) { + platform::CustomPlace dst_place, + void* dst, + platform::CustomPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); @@ -102,9 +114,11 @@ void Copy( #endif // PADDLE_WITH_CUSTOM_DEVICE template <> -void Copy(platform::CPUPlace, void* dst, +void Copy(platform::CPUPlace, + void* dst, platform::CPUPlace, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); @@ -115,7 +129,8 @@ template <> void Copy(platform::IPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -123,7 +138,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::IPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -131,15 +147,18 @@ template <> void Copy(platform::IPUPlace dst_place, void* dst, platform::IPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } // NOTE: only for (CPUPlace and IPUPlace) -> (IPUPlace). template <> -void Copy(phi::IPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::IPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (src_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_src; @@ -152,8 +171,10 @@ void Copy(phi::IPUPlace dst_place, void* dst, // NOTE: only for (IPUPlace) -> (CPUPlace and IPUPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::IPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::IPUPlace src_place, + const void* src, size_t num) { if (dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst; @@ -170,7 +191,8 @@ template <> void Copy(platform::XPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; return; @@ -182,7 +204,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::XPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; return; @@ -194,7 +217,8 @@ template <> void Copy(platform::XPUPlace dst_place, void* dst, platform::XPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; return; @@ -204,8 +228,10 @@ void Copy(platform::XPUPlace dst_place, // NOTE: only for (CPUPlace and XPUPlace) -> (XPUPlace). template <> -void Copy(phi::XPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::XPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (src_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_src; @@ -218,8 +244,10 @@ void Copy(phi::XPUPlace dst_place, void* dst, // NOTE: only for (XPUPlace) -> (CPUPlace and XPUPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::XPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::XPUPlace src_place, + const void* src, size_t num) { if (dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst; @@ -236,7 +264,8 @@ template <> void Copy(platform::NPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -248,7 +277,10 @@ void Copy(platform::NPUPlace dst_place, if (stream) { platform::RecordEvent record_event( "NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while sync operation @@ -267,7 +299,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -279,7 +312,10 @@ void Copy(platform::CPUPlace dst_place, if (stream) { platform::RecordEvent record_event( "NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -295,7 +331,8 @@ template <> void Copy(platform::NPUPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -307,7 +344,10 @@ void Copy(platform::NPUPlace dst_place, platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = @@ -329,7 +369,10 @@ void Copy(platform::NPUPlace dst_place, platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = @@ -346,8 +389,11 @@ void Copy(platform::NPUPlace dst_place, template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, - const void* src, size_t num) { + platform::CPUPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -356,8 +402,11 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -366,8 +415,11 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, - platform::NPUPinnedPlace src_place, const void* src, size_t num) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -376,8 +428,12 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::NPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); @@ -389,7 +445,10 @@ void Copy( platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -404,8 +463,12 @@ void Copy( template <> void Copy( - platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, - const void* src, size_t num, void* stream) { + platform::NPUPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); @@ -417,7 +480,10 @@ void Copy( platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while sync operation @@ -435,9 +501,12 @@ void Copy( // NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace. template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -504,52 +573,76 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace). template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + aclrtStream stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace) template <> -void Copy(phi::NPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::NPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::NPUPlace src_place, const void* src, - size_t num, aclrtStream stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::NPUPlace src_place, + const void* src, + size_t num, + aclrtStream stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace) template <> void Copy(phi::NPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num, + void* dst, + phi::Place src_place, + const void* src, + size_t num, aclrtStream stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::NPUPinnedPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, aclrtStream stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -557,16 +650,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace) -> (NPUPinnedPlace) template <> void Copy(phi::NPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num) { + void* dst, + phi::Place src_place, + const void* src, + size_t num) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); } // NOTE: only for (NPUPinnedPlace) -> (CPUPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::NPUPinnedPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr); } #endif @@ -608,8 +705,12 @@ inline void SyncCUDAStream() { template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, void* stream) { + platform::CPUPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); @@ -619,10 +720,16 @@ void Copy( platform::RecordEvent record_event( "GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToHost, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToHost, reinterpret_cast(stream)); #endif } else { @@ -642,8 +749,12 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::CUDAPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); @@ -653,10 +764,16 @@ void Copy( platform::RecordEvent record_event( "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyHostToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyHostToDevice, reinterpret_cast(stream)); #endif } else { @@ -676,8 +793,12 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, void* stream) { + platform::CUDAPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -689,10 +810,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToDevice, reinterpret_cast(stream)); #endif } else { @@ -710,22 +837,29 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU", platform::TracerEventType::UserDefined, 1); - platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, reinterpret_cast(stream)); + platform::GpuMemcpyPeerAsync(dst, + dst_place.device, + src, + src_place.device, + num, + reinterpret_cast(stream)); } else { platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU", platform::TracerEventType::UserDefined, 1); - platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, - num); + platform::GpuMemcpyPeerSync( + dst, dst_place.device, src, src_place.device, num); } } } template <> void Copy( - platform::CPUPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + platform::CPUPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -734,8 +868,11 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CPUPlace src_place, const void* src, size_t num) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -744,8 +881,11 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -754,8 +894,12 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CUDAPlace src_place, const void* src, size_t num, void* stream) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -765,10 +909,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToHost, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToHost, reinterpret_cast(stream)); #endif } else { @@ -785,8 +935,11 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num, + platform::CUDAPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -798,10 +951,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyHostToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyHostToDevice, reinterpret_cast(stream)); #endif } else { @@ -818,9 +977,12 @@ void Copy( // NOTE: only for CPUPlace、CUDAPlace and CUDAPinnedPlace. template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -887,52 +1049,76 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CPUPlace). template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPlace) template <> -void Copy(phi::GPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::GPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (CUDAPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::GPUPlace src_place, const void* src, - size_t num, void* stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::GPUPlace src_place, + const void* src, + size_t num, + void* stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPinnedPlace) template <> void Copy(phi::GPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num, + void* dst, + phi::Place src_place, + const void* src, + size_t num, void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CUDAPinnedPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::GPUPinnedPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -940,16 +1126,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace) -> (CUDAPinnedPlace) template <> void Copy(phi::GPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num) { + void* dst, + phi::Place src_place, + const void* src, + size_t num) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); } // NOTE: only for (CUDAPinnedPlace) -> (CPUPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::GPUPinnedPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr); } #endif @@ -959,7 +1149,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::MLUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -970,8 +1161,8 @@ void Copy(platform::CPUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyD2HAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyD2HAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -988,7 +1179,8 @@ template <> void Copy(platform::MLUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -999,8 +1191,8 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyH2DAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyH2DAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -1017,7 +1209,8 @@ template <> void Copy(platform::MLUPlace dst_place, void* dst, platform::MLUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -1029,8 +1222,8 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyD2DAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyD2DAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -1050,25 +1243,32 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, reinterpret_cast(stream)); + platform::MLUMemcpyPeerAsync(dst, + dst_place.device, + src, + src_place.device, + num, + reinterpret_cast(stream)); } else { VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyPeerSync(dst, dst_place.device, src, src_place.device, - num); + platform::MLUMemcpyPeerSync( + dst, dst_place.device, src, src_place.device, num); } } } // NOTE: only for CPUPlace and MLUPlace. template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -1110,35 +1310,55 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace) template <> -void Copy(phi::MLUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::MLUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::MLUPlace src_place, const void* src, - size_t num, void* stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::MLUPlace src_place, + const void* src, + size_t num, + void* stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream. template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream. template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -1146,8 +1366,10 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: Only for CPUPlace, XPUPlace and PinnedPlace. template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -1224,16 +1446,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num); } // NOTE: Only for (CPUPlace and PinnedPlace) -> (CPUPlace). template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num); } @@ -1243,9 +1469,12 @@ void Copy(phi::CPUPlace dst_place, void* dst, !defined(PADDLE_WITH_MLU) template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT dst_place.GetType() == phi::AllocationType::CUSTOM) { platform::CPUPlace place_src; @@ -1265,17 +1494,23 @@ void Copy(phi::Place dst_place, void* dst, } template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } #endif diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 8fa48ffcfb1..0f68be01a61 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/platform/profiler.h" + #include // NOLINT #include #include @@ -20,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/common_event.h" #include "paddle/fluid/platform/profiler/host_event_recorder.h" #include "paddle/fluid/platform/profiler/host_tracer.h" @@ -29,12 +30,16 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/nvtx.h" #endif +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/os_info.h" -PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false, +PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, + false, "Enable rpc profiler or not."); -DEFINE_bool(enable_host_event_recorder_hook, false, +DEFINE_bool(enable_host_event_recorder_hook, + false, "enable HostEventRecorder, hook Profiler"); namespace paddle { @@ -42,8 +47,11 @@ namespace platform { MemEvenRecorder MemEvenRecorder::recorder; -Event::Event(EventType type, std::string name, uint32_t thread_id, - EventRole role, std::string attr) +Event::Event(EventType type, + std::string name, + uint32_t thread_id, + EventRole role, + std::string attr) : type_(type), name_(name), thread_id_(thread_id), @@ -67,8 +75,10 @@ double Event::CudaElapsedMs(const Event &e) const { #endif } -RecordEvent::RecordEvent(const char *name, const TracerEventType type, - uint32_t level, const EventRole role) { +RecordEvent::RecordEvent(const char *name, + const TracerEventType type, + uint32_t level, + const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -99,8 +109,10 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type, start_ns_ = PosixInNsec(); } -RecordEvent::RecordEvent(const std::string &name, const TracerEventType type, - uint32_t level, const EventRole role) { +RecordEvent::RecordEvent(const std::string &name, + const TracerEventType type, + uint32_t level, + const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -129,8 +141,10 @@ RecordEvent::RecordEvent(const std::string &name, const TracerEventType type, start_ns_ = PosixInNsec(); } -RecordEvent::RecordEvent(const std::string &name, const std::string &attr, - const TracerEventType type, uint32_t level, +RecordEvent::RecordEvent(const std::string &name, + const std::string &attr, + const TracerEventType type, + uint32_t level, const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA @@ -191,15 +205,15 @@ void RecordEvent::End() { if (LIKELY(FLAGS_enable_host_event_recorder_hook && is_enabled_)) { uint64_t end_ns = PosixInNsec(); if (LIKELY(shallow_copy_name_ != nullptr)) { - HostEventRecorder::GetInstance().RecordEvent( + HostEventRecorder::GetInstance().RecordEvent( shallow_copy_name_, start_ns_, end_ns, role_, type_); } else if (name_ != nullptr) { if (attr_ == nullptr) { - HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, - role_, type_); + HostEventRecorder::GetInstance().RecordEvent( + *name_, start_ns_, end_ns, role_, type_); } else { - HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, - role_, type_, *attr_); + HostEventRecorder::GetInstance().RecordEvent( + *name_, start_ns_, end_ns, role_, type_, *attr_); delete attr_; } delete name_; @@ -214,8 +228,8 @@ void RecordEvent::End() { DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { uint64_t end_ns = PosixInNsec(); - tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(), - g_thread_id); + tracer->AddCPURecords( + CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id); } ClearCurAnnotation(); PopEvent(*name_, role_); @@ -225,30 +239,96 @@ void RecordEvent::End() { is_enabled_ = false; } -RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type, +RecordInstantEvent::RecordInstantEvent(const char *name, + TracerEventType type, uint32_t level) { if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) { return; } auto start_end_ns = PosixInNsec(); - HostEventRecorder::GetInstance().RecordEvent(name, start_end_ns, start_end_ns, - EventRole::kOrdinary, type); + HostEventRecorder::GetInstance().RecordEvent( + name, start_end_ns, start_end_ns, EventRole::kOrdinary, type); +} + +RecordOpInfoSupplement::RecordOpInfoSupplement( + const std::string &type, + const framework::AttributeMap &attrs, + const framework::InferShapeContext &shape_ctx, + const framework::RuntimeContext &ctx) { + if (FLAGS_enable_host_event_recorder_hook == false) { + return; + } + std::map> input_shapes; + std::map> dtypes; + for (auto it = ctx.inputs.begin(); it != ctx.inputs.end(); it++) { + input_shapes[it->first] = shape_ctx.GetInputsDim(it->first); + dtypes[it->first] = shape_ctx.GetInputsVarType(it->first); + } + + const std::vector *callstack_ptr = nullptr; + std::vector callstack; + auto iter = attrs.find( + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + if (iter != attrs.end()) { + callstack_ptr = &BOOST_GET_CONST(std::vector, iter->second); + callstack = *callstack_ptr; + } + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), type, input_shapes, dtypes, callstack); +} + +RecordOpInfoSupplement::RecordOpInfoSupplement( + const std::string &type, + const framework::AttributeMap &attrs, + const framework::InferShapeContext &shape_ctx, + const phi::KernelSignature &kernel_signature) { + if (FLAGS_enable_host_event_recorder_hook == false) { + return; + } + std::map> input_shapes; + std::map> dtypes; + for (auto it = kernel_signature.input_names.begin(); + it != kernel_signature.input_names.end(); + it++) { + std::string input_name(*it); + if (shape_ctx.HasInputs(input_name)) { + input_shapes[input_name] = shape_ctx.GetInputsDim(input_name); + dtypes[input_name] = shape_ctx.GetInputsVarType(input_name); + } + } + const std::vector *callstack_ptr = nullptr; + std::vector callstack; + auto iter = attrs.find( + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + if (iter != attrs.end()) { + callstack_ptr = &BOOST_GET_CONST(std::vector, iter->second); + callstack = *callstack_ptr; + } + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), type, input_shapes, dtypes, callstack); } -void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, +void MemEvenRecorder::PushMemRecord(const void *ptr, + const Place &place, size_t size) { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled) { + return; + } std::lock_guard guard(mtx_); auto &events = address_memevent_[place]; - PADDLE_ENFORCE_EQ(events.count(ptr), 0, + PADDLE_ENFORCE_EQ(events.count(ptr), + 0, platform::errors::InvalidArgument( "The Place can't exist in the stage of PushMemRecord")); - events.emplace(ptr, std::unique_ptr( - new MemEvenRecorder::RecordMemEvent(place, size))); + events.emplace(ptr, + std::unique_ptr( + new MemEvenRecorder::RecordMemEvent(place, size))); } void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled) { + return; + } std::lock_guard guard(mtx_); auto &events = address_memevent_[place]; auto iter = events.find(ptr); @@ -278,8 +358,13 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() { auto annotation_free = CurAnnotationName(); if (tracer) { - tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_, - annotation_free, g_mem_thread_id); + tracer->AddMemInfoRecord(start_ns_, + end_ns_, + bytes_, + place_, + alloc_in_, + annotation_free, + g_mem_thread_id); } PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free); } @@ -306,44 +391,62 @@ RecordBlock::~RecordBlock() { if (tracer) { // We try to put all blocks at the same nested depth in the // same timeline lane. and distinguish the using thread_id. - tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(), - g_thread_id); + tracer->AddCPURecords( + name_, start_ns_, PosixInNsec(), BlockDepth(), g_thread_id); } ClearCurBlock(); } -void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place &place, const std::string &annotation) { - GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes, - place, g_mem_thread_id, annotation); -} - -void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place &place, const std::string &annotation) { - GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place, - g_mem_thread_id, annotation); +void PushMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place &place, + const std::string &annotation) { + GetMemEventList().Record(EventType::kPushRange, + start_ns, + end_ns, + bytes, + place, + g_mem_thread_id, + annotation); +} + +void PopMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place &place, + const std::string &annotation) { + GetMemEventList().Record(EventType::kPopRange, + start_ns, + end_ns, + bytes, + place, + g_mem_thread_id, + annotation); } void Mark(const std::string &name) { if (FLAGS_enable_host_event_recorder_hook) { - HostEventRecorder::GetInstance().RecordEvent( + HostEventRecorder::GetInstance().RecordEvent( name, 0, 0, EventRole::kOrdinary, TracerEventType::UserDefined); return; } GetEventList().Record(EventType::kMark, name, g_thread_id); } -Event *PushEvent(const std::string &name, const EventRole role, +Event *PushEvent(const std::string &name, + const EventRole role, std::string attr) { - return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role, - attr); + return GetEventList().Record( + EventType::kPushRange, name, g_thread_id, role, attr); } void PopEvent(const std::string &name, const EventRole role, std::string attr) { GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr); } void EnableProfiler(ProfilerState state) { - PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled, + PADDLE_ENFORCE_NE(state, + ProfilerState::kDisabled, platform::errors::InvalidArgument( "Can't enable profiling, since the input state is" "ProfilerState::kDisabled")); @@ -379,7 +482,8 @@ void ResetProfiler() { (*it)->Clear(); } for (auto it = g_all_mem_event_lists.begin(); - it != g_all_mem_event_lists.end(); ++it) { + it != g_all_mem_event_lists.end(); + ++it) { (*it)->Clear(); } } @@ -521,7 +625,8 @@ void DisableHostEventRecorder() { std::string PrintHostEvents() { std::ostringstream oss; - auto host_evt_sec = HostEventRecorder::GetInstance().GatherEvents(); + auto host_evt_sec = + HostEventRecorder::GetInstance().GatherEvents(); for (const auto &thr_evt_sec : host_evt_sec.thr_sections) { oss << thr_evt_sec.thread_id << std::endl; for (const auto &evt : thr_evt_sec.events) { @@ -533,8 +638,9 @@ std::string PrintHostEvents() { return oss.str(); } -static void EmulateEventPushAndPop(const HostEventSection &host_sec, - std::map *out) { +static void EmulateEventPushAndPop( + const HostEventSection &host_sec, + std::map *out) { for (const auto &thr_sec : host_sec.thr_sections) { uint64_t tid = thr_sec.thread_id; auto cur_thr_list = std::make_shared>(); @@ -573,15 +679,16 @@ static void EmulateEventPushAndPop(const HostEventSection &host_sec, std::string name = prefix_stk.empty() ? evt.name : prefix_stk.top() + "/" + evt.name; const char *attr = (evt.attr == nullptr ? "none" : evt.attr); - Event *orig_evt = cur_thr_list->Record(EventType::kPushRange, name, tid, - evt.role, attr); + Event *orig_evt = cur_thr_list->Record( + EventType::kPushRange, name, tid, evt.role, attr); (*out)[tid][evt.end_ns] = std::make_pair(orig_evt, evt.start_ns); cur_thr_list->Record(EventType::kPopRange, name, tid, evt.role, attr); } } } -static void EmulateCPURecordsAdd(const HostEventSection &host_sec) { +static void EmulateCPURecordsAdd( + const HostEventSection &host_sec) { DeviceTracer *tracer = GetDeviceTracer(); if (tracer == nullptr) { return; @@ -589,8 +696,8 @@ static void EmulateCPURecordsAdd(const HostEventSection &host_sec) { for (const auto &thr_sec : host_sec.thr_sections) { uint64_t tid = thr_sec.thread_id; for (const auto &evt : thr_sec.events) { - tracer->AddCPURecords(evt.name, evt.start_ns, evt.end_ns, BlockDepth(), - tid); + tracer->AddCPURecords( + evt.name, evt.start_ns, evt.end_ns, BlockDepth(), tid); } } } @@ -609,10 +716,11 @@ static std::map DockHostEventRecorderHostPart() { if (FLAGS_enable_host_event_recorder_hook == false) { return thr_events; } - auto host_evt_sec = HostEventRecorder::GetInstance().GatherEvents(); + auto host_evt_sec = + HostEventRecorder::GetInstance().GatherEvents(); EmulateEventPushAndPop(host_evt_sec, &thr_events); EmulateCPURecordsAdd(host_evt_sec); - return std::move(thr_events); + return thr_events; } static void DockHostEventRecorderDevicePart( diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 78275341cbb..e9b33b064a3 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -30,6 +30,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -160,7 +161,8 @@ struct EventList { std::vector Reduce() { std::vector result; for (auto& block : event_blocks) { - result.insert(result.begin(), std::make_move_iterator(block.begin()), + result.insert(result.begin(), + std::make_move_iterator(block.begin()), std::make_move_iterator(block.end())); } event_blocks.clear(); @@ -173,13 +175,21 @@ struct EventList { }; void Mark(const std::string& name); -void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place& place, const std::string& annotation); -void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place& place, const std::string& annotation); -Event* PushEvent(const std::string& name, const EventRole role, +void PushMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place& place, + const std::string& annotation); +void PopMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place& place, + const std::string& annotation); +Event* PushEvent(const std::string& name, + const EventRole role, const std::string attr = "none"); -void PopEvent(const std::string& name, const EventRole role, +void PopEvent(const std::string& name, + const EventRole role, const std::string attr = "none"); // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index 084bc44dbc7..1daed7db1e7 100755 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1,14 +1,52 @@ -cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) -cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) +cc_library( + host_tracer + SRCS host_tracer.cc + DEPS enforce ddim var_type_traits) +cc_library( + cuda_tracer + SRCS cuda_tracer.cc cupti_data_process.cc + DEPS workqueue_utils enforce glog) add_subdirectory(mlu) -cc_library(event_node SRCS event_node.cc DEPS enforce) -cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) +cc_library( + event_node + SRCS event_node.cc + DEPS enforce place) +cc_library( + profiler_utils + SRCS utils.cc + DEPS enforce glog) add_subdirectory(dump) -cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils) -cc_library(event_bind SRCS event_python.cc DEPS profiler_logger) -cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) -cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind mlu_tracer) -cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger) -cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) -cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind) -cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler) +cc_library( + profiler_logger + SRCS chrometracing_logger.cc dump/serialization_logger.cc + dump/deserialization_reader.cc + DEPS nodetreeproto event_node profiler_utils) +cc_library( + event_bind + SRCS event_python.cc + DEPS profiler_logger) +cc_library( + cpu_utilization + SRCS cpu_utilization.cc + DEPS cpu_info os_info enforce glog) +cc_library( + new_profiler + SRCS profiler.cc + DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind + mlu_tracer) +cc_test( + test_event_node + SRCS test_event_node.cc + DEPS event_node profiler_logger) +cc_test( + test_extra_info + SRCS test_extra_info.cc + DEPS profiler_utils) +cc_test( + test_serialization_logger + SRCS dump/test_serialization_logger.cc + DEPS event_bind) +cc_test( + new_profiler_test + SRCS profiler_test.cc + DEPS new_profiler) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 1e26c0a9440..e8fe5412721 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include +#include #include "glog/logging.h" @@ -128,27 +129,32 @@ void ChromeTracingLogger::LogMemTraceEventNode( std::string( R"JSON( { - "name": "[memory]", "pid": %lld, "tid": "%lld", + "name": "[memory]", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, "ph": "i", "cat": "%s", "args": { "place": "%s", "addr": "%llu", + "increase_bytes": %lld, "current_allocated": %llu, "current_reserved": %llu, - "increase_bytes": %lld + "peak_allocated": %llu, + "peak_reserved": %llu } }, )JSON"), mem_node.ProcessId(), mem_node.ThreadId(), - mem_node.TimeStampNs(), + nsToUs(mem_node.TimeStampNs()), StringTracerMemEventType(mem_node.Type()), mem_node.Place().c_str(), mem_node.Addr(), + mem_node.IncreaseBytes(), mem_node.CurrentAllocated(), mem_node.CurrentReserved(), - mem_node.IncreaseBytes()); + mem_node.PeakAllocated(), + mem_node.PeakReserved()); + pid_tid_set_.insert({mem_node.ProcessId(), mem_node.ThreadId()}); } void ChromeTracingLogger::LogHostTraceEventNode( @@ -172,6 +178,8 @@ void ChromeTracingLogger::LogHostTraceEventNode( input_shapes = op_supplement_node->InputShapes(); input_dtypes = op_supplement_node->Dtypes(); callstack = op_supplement_node->CallStack(); + callstack = std::regex_replace(callstack, std::regex("\""), "\'"); + callstack = std::regex_replace(callstack, std::regex("\n"), "\\n"); } switch (host_node.Type()) { case TracerEventType::ProfileStep: diff --git a/paddle/fluid/platform/profiler/common_event.h b/paddle/fluid/platform/profiler/common_event.h index cfdc3be110a..05e7d1dc4f0 100644 --- a/paddle/fluid/platform/profiler/common_event.h +++ b/paddle/fluid/platform/profiler/common_event.h @@ -17,16 +17,22 @@ #include #include #include + +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later #include "paddle/fluid/platform/profiler/trace_event.h" +#include "paddle/phi/core/ddim.h" namespace paddle { namespace platform { struct CommonEvent { public: - CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type) + CommonEvent(const char *name, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type) : name(name), start_ns(start_ns), end_ns(end_ns), @@ -34,8 +40,12 @@ struct CommonEvent { type(type) {} CommonEvent(std::function arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type, const std::string &attr_str) + const std::string &name_str, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type, + const std::string &attr_str) : start_ns(start_ns), end_ns(end_ns), role(role), type(type) { auto buf = static_cast(arena_allocator(name_str.length() + 1)); strncpy(buf, name_str.c_str(), name_str.length() + 1); @@ -46,8 +56,11 @@ struct CommonEvent { } CommonEvent(std::function arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type) + const std::string &name_str, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type) : start_ns(start_ns), end_ns(end_ns), role(role), type(type) { auto buf = static_cast(arena_allocator(name_str.length() + 1)); strncpy(buf, name_str.c_str(), name_str.length() + 1); @@ -62,5 +75,32 @@ struct CommonEvent { const char *attr = nullptr; // not owned, designed for performance }; +struct OperatorSupplementOriginEvent { + public: + OperatorSupplementOriginEvent( + std::function arena_allocator, + uint64_t timestamp_ns, + const std::string &type_name, + const std::map> &input_shapes, + const std::map> + &dtypes, + const std::vector callstack) + : timestamp_ns(timestamp_ns), + input_shapes(input_shapes), + dtypes(dtypes), + callstack(callstack) { + auto buf = static_cast(arena_allocator(type_name.length() + 1)); + strncpy(buf, type_name.c_str(), type_name.length() + 1); + op_type = buf; + } + uint64_t timestamp_ns; + const char *op_type = nullptr; // not owned, designed for performance + // input shapes + std::map> input_shapes; + std::map> dtypes; + // call stack + const std::vector callstack; +}; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index de3411579d3..d17aa9e9ce2 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -45,7 +45,8 @@ std::unique_ptr DeserializationReader::Parse() { ExtraInfo extrainfo; for (auto indx = 0; indx < node_trees_proto_->extra_info_size(); indx++) { ExtraInfoMap extra_info_map = node_trees_proto_->extra_info(indx); - extrainfo.AddExtraInfo(extra_info_map.key(), std::string("%s"), + extrainfo.AddExtraInfo(extra_info_map.key(), + std::string("%s"), extra_info_map.value().c_str()); } // restore NodeTrees @@ -90,6 +91,26 @@ std::unique_ptr DeserializationReader::Parse() { device_node); // insert into runtime_node } } + // handle mem node + for (int mem_node_index = 0; + mem_node_index < host_node_proto.mem_nodes_size(); + mem_node_index++) { + const MemTraceEventNodeProto& mem_node_proto = + host_node_proto.mem_nodes(mem_node_index); + MemTraceEventNode* mem_node = RestoreMemTraceEventNode(mem_node_proto); + host_node->AddMemNode(mem_node); + } + // handle op supplement node + for (int op_supplement_node_index = 0; + op_supplement_node_index < + host_node_proto.op_supplement_nodes_size(); + op_supplement_node_index++) { + const OperatorSupplementEventNodeProto& op_supplement_node_proto = + host_node_proto.op_supplement_nodes(op_supplement_node_index); + OperatorSupplementEventNode* op_supplement_node = + RestoreOperatorSupplementEventNode(op_supplement_node_proto); + host_node->SetOperatorSupplementNode(op_supplement_node); + } } // restore parent-child relationship for (auto it = child_parent_map.begin(); it != child_parent_map.end(); @@ -174,6 +195,64 @@ HostTraceEventNode* DeserializationReader::RestoreHostTraceEventNode( return new HostTraceEventNode(host_event); } +MemTraceEventNode* DeserializationReader::RestoreMemTraceEventNode( + const MemTraceEventNodeProto& mem_node_proto) { + const MemTraceEventProto& mem_event_proto = mem_node_proto.mem_event(); + MemTraceEvent mem_event; + mem_event.timestamp_ns = mem_event_proto.timestamp_ns(); + mem_event.addr = mem_event_proto.addr(); + mem_event.type = static_cast(mem_event_proto.type()); + mem_event.process_id = mem_event_proto.process_id(); + mem_event.thread_id = mem_event_proto.thread_id(); + mem_event.increase_bytes = mem_event_proto.increase_bytes(); + mem_event.place = mem_event_proto.place(); + mem_event.current_allocated = mem_event_proto.current_allocated(); + mem_event.current_reserved = mem_event_proto.current_reserved(); + mem_event.peak_allocated = mem_event_proto.peak_allocated(); + mem_event.peak_reserved = mem_event_proto.peak_reserved(); + return new MemTraceEventNode(mem_event); +} + +OperatorSupplementEventNode* +DeserializationReader::RestoreOperatorSupplementEventNode( + const OperatorSupplementEventNodeProto& op_supplement_node_proto) { + const OperatorSupplementEventProto& op_supplement_event_proto = + op_supplement_node_proto.op_supplement_event(); + OperatorSupplementEvent op_supplement_event; + op_supplement_event.timestamp_ns = op_supplement_event_proto.timestamp_ns(); + op_supplement_event.op_type = op_supplement_event_proto.op_type(); + op_supplement_event.callstack = op_supplement_event_proto.callstack(); + op_supplement_event.process_id = op_supplement_event_proto.process_id(); + op_supplement_event.thread_id = op_supplement_event_proto.thread_id(); + std::map>> input_shapes; + std::map> dtypes; + auto input_shape_proto = op_supplement_event_proto.input_shapes(); + for (int i = 0; i < input_shape_proto.key_size(); i++) { + auto input_shape_vec = input_shapes[input_shape_proto.key(i)]; + auto shape_vectors_proto = input_shape_proto.shape_vecs(i); + for (int j = 0; j < shape_vectors_proto.shapes_size(); j++) { + auto shape_vector_proto = shape_vectors_proto.shapes(j); + std::vector shape; + for (int k = 0; k < shape_vector_proto.size_size(); k++) { + shape.push_back(shape_vector_proto.size(k)); + } + input_shape_vec.push_back(shape); + } + } + op_supplement_event.input_shapes = input_shapes; + auto dtype_proto = op_supplement_event_proto.dtypes(); + for (int i = 0; i < dtype_proto.key_size(); i++) { + auto dtype_vec = dtypes[dtype_proto.key(i)]; + auto dtype_vec_proto = dtype_proto.dtype_vecs(i); + for (int j = 0; j < dtype_vec_proto.dtype_size(); j++) { + auto dtype_string = dtype_vec_proto.dtype(j); + dtype_vec.push_back(dtype_string); + } + } + op_supplement_event.dtypes = dtypes; + return new OperatorSupplementEventNode(op_supplement_event); +} + KernelEventInfo DeserializationReader::HandleKernelEventInfoProto( const DeviceTraceEventProto& device_event_proto) { const KernelEventInfoProto& kernel_info_proto = @@ -203,11 +282,14 @@ MemcpyEventInfo DeserializationReader::HandleMemcpyEventInfoProto( device_event_proto.memcpy_info(); MemcpyEventInfo memcpy_info; memcpy_info.num_bytes = memcpy_info_proto.num_bytes(); - std::strncpy(memcpy_info.copy_kind, memcpy_info_proto.copy_kind().c_str(), + std::strncpy(memcpy_info.copy_kind, + memcpy_info_proto.copy_kind().c_str(), kMemKindMaxLen - 1); - std::strncpy(memcpy_info.src_kind, memcpy_info_proto.src_kind().c_str(), + std::strncpy(memcpy_info.src_kind, + memcpy_info_proto.src_kind().c_str(), kMemKindMaxLen - 1); - std::strncpy(memcpy_info.dst_kind, memcpy_info_proto.dst_kind().c_str(), + std::strncpy(memcpy_info.dst_kind, + memcpy_info_proto.dst_kind().c_str(), kMemKindMaxLen - 1); return memcpy_info; } @@ -218,7 +300,8 @@ MemsetEventInfo DeserializationReader::HandleMemsetEventInfoProto( device_event_proto.memset_info(); MemsetEventInfo memset_info; memset_info.num_bytes = memset_info_proto.num_bytes(); - std::strncpy(memset_info.memory_kind, memset_info_proto.memory_kind().c_str(), + std::strncpy(memset_info.memory_kind, + memset_info_proto.memory_kind().c_str(), kMemKindMaxLen - 1); memset_info.value = memset_info_proto.value(); return memset_info; diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index e6feb4f9489..7df93b7703c 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -36,6 +36,9 @@ class DeserializationReader { KernelEventInfo HandleKernelEventInfoProto(const DeviceTraceEventProto&); MemcpyEventInfo HandleMemcpyEventInfoProto(const DeviceTraceEventProto&); MemsetEventInfo HandleMemsetEventInfoProto(const DeviceTraceEventProto&); + MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&); + OperatorSupplementEventNode* RestoreOperatorSupplementEventNode( + const OperatorSupplementEventNodeProto&); std::string filename_; std::ifstream input_file_stream_; NodeTreesProto* node_trees_proto_; diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto index 7016745059d..4ebfb6e73b3 100644 --- a/paddle/fluid/platform/profiler/dump/nodetree.proto +++ b/paddle/fluid/platform/profiler/dump/nodetree.proto @@ -46,6 +46,19 @@ enum TracerEventTypeProto { PythonOp = 13; // Used to mark python level userdefined PythonUserDefined = 14; + // Used to mark mlu runtime record returned by cnpapi + MluRuntime = 15; +}; + +enum TracerMemEventTypeProto { + // Used to mark memory allocation which is managed by paddle + Allocate = 0; + // Used to mark memory free which is managed by paddle + Free = 1; + // Used to mark reserved memory allocation which is applied from device. + ReservedAllocate = 2; + // Used to mark reserved memory free which is released to device. + ReservedFree = 3; }; message KernelEventInfoProto { @@ -121,6 +134,62 @@ message HostTraceEventProto { required uint64 thread_id = 6; } +message MemTraceEventProto { + // timestamp of the record + required uint64 timestamp_ns = 1; + // memory manipulation type + required TracerMemEventTypeProto type = 2; + // memory addr of allocation or free + required uint64 addr = 3; + // process id of the record + required uint64 process_id = 4; + // thread id of the record + required uint64 thread_id = 5; + // increase bytes after this manipulation, allocation for sign +, free for + // sign - + required int64 increase_bytes = 6; + // place + required string place = 7; + // current total allocated memory + required uint64 current_allocated = 8; + // current total reserved memory + required uint64 current_reserved = 9; + // current peak allocated memory + required uint64 peak_allocated = 10; + // current peak reserved memory + required uint64 peak_reserved = 11; +} + +message OperatorSupplementEventProto { + // timestamp of the record + required uint64 timestamp_ns = 1; + // op type name + required string op_type = 2; + // process id of the record + required uint64 process_id = 3; + // thread id of the record + required uint64 thread_id = 4; + // input shapes + message input_shape_proto { + repeated string key = 1; + message shape_vector { + message shape { repeated uint64 size = 1; } + repeated shape shapes = 1; + } + repeated shape_vector shape_vecs = 2; + } + required input_shape_proto input_shapes = 5; + // dtypes + message dtype_proto { + repeated string key = 1; + message dtype_vector { repeated string dtype = 1; } + repeated dtype_vector dtype_vecs = 2; + } + required dtype_proto dtypes = 6; + // call stack + required string callstack = 7; +} + message CudaRuntimeTraceEventProto { // record name required string name = 1; @@ -166,6 +235,12 @@ message DeviceTraceEventProto { } } +message OperatorSupplementEventNodeProto { + required OperatorSupplementEventProto op_supplement_event = 1; +} + +message MemTraceEventNodeProto { required MemTraceEventProto mem_event = 1; } + message DeviceTraceEventNodeProto { required DeviceTraceEventProto device_event = 1; } @@ -180,6 +255,9 @@ message HostTraceEventNodeProto { required int64 parentid = 2; required HostTraceEventProto host_trace_event = 3; repeated CudaRuntimeTraceEventNodeProto runtime_nodes = 4; + // below is added in version 1.0.1 + repeated MemTraceEventNodeProto mem_nodes = 5; + repeated OperatorSupplementEventNodeProto op_supplement_nodes = 6; } message ThreadNodeTreeProto { diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index 73021f4362a..cbb86e76d3a 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -20,19 +20,19 @@ namespace paddle { namespace platform { static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb"; -static const char* version = "1.0.0"; +static const char* version = "1.0.1"; static uint32_t span_indx = 0; static std::string DefaultFileName() { auto pid = GetProcessId(); - return string_format(std::string(kDefaultFilename), pid, - GetStringFormatLocalTime().c_str()); + return string_format( + std::string(kDefaultFilename), pid, GetStringFormatLocalTime().c_str()); } void SerializationLogger::OpenFile() { - output_file_stream_.open(filename_, std::ofstream::out | - std::ofstream::trunc | - std::ofstream::binary); + output_file_stream_.open( + filename_, + std::ofstream::out | std::ofstream::trunc | std::ofstream::binary); if (!output_file_stream_) { LOG(WARNING) << "Unable to open file for writing profiling data." << std::endl; @@ -50,7 +50,8 @@ void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { thread2host_event_nodes = node_trees.Traverse(true); for (auto it = thread2host_event_nodes.begin(); - it != thread2host_event_nodes.end(); ++it) { + it != thread2host_event_nodes.end(); + ++it) { // 1. order every node an index, every node a parent std::map node_index_map; std::map node_parent_map; @@ -64,7 +65,8 @@ void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { for (auto hostnode = it->second.begin(); hostnode != it->second.end(); ++hostnode) { for (auto childnode = (*hostnode)->GetChildren().begin(); - childnode != (*hostnode)->GetChildren().end(); ++childnode) { + childnode != (*hostnode)->GetChildren().end(); + ++childnode) { node_parent_map[(*childnode)] = node_index_map[(*hostnode)]; // mark each node's parent } @@ -106,10 +108,36 @@ void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { (*devicenode)->LogMe(this); // fill detail information } } + for (auto memnode = (*hostnode)->GetMemTraceEventNodes().begin(); + memnode != (*hostnode)->GetMemTraceEventNodes().end(); + ++memnode) { + MemTraceEventNodeProto* mem_node_proto = + current_host_trace_event_node_proto_->add_mem_nodes(); + current_mem_trace_event_node_proto_ = mem_node_proto; + (*memnode)->LogMe(this); + } } } } +void SerializationLogger::LogMemTraceEventNode( + const MemTraceEventNode& mem_node) { + MemTraceEventProto* mem_trace_event = new MemTraceEventProto(); + mem_trace_event->set_timestamp_ns(mem_node.TimeStampNs()); + mem_trace_event->set_type( + static_cast(mem_node.Type())); + mem_trace_event->set_addr(mem_node.Addr()); + mem_trace_event->set_process_id(mem_node.ProcessId()); + mem_trace_event->set_thread_id(mem_node.ThreadId()); + mem_trace_event->set_increase_bytes(mem_node.IncreaseBytes()); + mem_trace_event->set_place(mem_node.Place()); + mem_trace_event->set_current_allocated(mem_node.CurrentAllocated()); + mem_trace_event->set_current_reserved(mem_node.CurrentReserved()); + mem_trace_event->set_peak_allocated(mem_node.PeakAllocated()); + mem_trace_event->set_peak_reserved(mem_node.PeakReserved()); + current_mem_trace_event_node_proto_->set_allocated_mem_event(mem_trace_event); +} + void SerializationLogger::LogHostTraceEventNode( const HostTraceEventNode& host_node) { HostTraceEventProto* host_trace_event = new HostTraceEventProto(); @@ -122,6 +150,63 @@ void SerializationLogger::LogHostTraceEventNode( host_trace_event->set_thread_id(host_node.ThreadId()); current_host_trace_event_node_proto_->set_allocated_host_trace_event( host_trace_event); + OperatorSupplementEventNode* op_supplement_event_node = + host_node.GetOperatorSupplementEventNode(); + if (op_supplement_event_node != nullptr) { + current_op_supplement_event_node_proto_ = + current_host_trace_event_node_proto_->add_op_supplement_nodes(); + OperatorSupplementEventProto* op_supplement_event_proto = + new OperatorSupplementEventProto(); + op_supplement_event_proto->set_op_type(op_supplement_event_node->Name()); + op_supplement_event_proto->set_timestamp_ns( + op_supplement_event_node->TimeStampNs()); + op_supplement_event_proto->set_process_id( + op_supplement_event_node->ProcessId()); + op_supplement_event_proto->set_thread_id( + op_supplement_event_node->ThreadId()); + op_supplement_event_proto->set_callstack( + op_supplement_event_node->CallStack()); + + OperatorSupplementEventProto::input_shape_proto* input_shape_proto = + op_supplement_event_proto->mutable_input_shapes(); + for (auto it = op_supplement_event_node->InputShapes().begin(); + it != op_supplement_event_node->InputShapes().end(); + it++) { + input_shape_proto->add_key(it->first); + OperatorSupplementEventProto::input_shape_proto::shape_vector* + shape_vectors_proto = input_shape_proto->add_shape_vecs(); + auto shape_vectors = it->second; + for (auto shape_vecs_it = shape_vectors.begin(); + shape_vecs_it != shape_vectors.end(); + shape_vecs_it++) { + auto shape_vector = *shape_vecs_it; + OperatorSupplementEventProto::input_shape_proto::shape_vector::shape* + shape_proto = shape_vectors_proto->add_shapes(); + for (auto shape_it = shape_vector.begin(); + shape_it != shape_vector.end(); + shape_it++) { + shape_proto->add_size(*shape_it); + } + } + } + + OperatorSupplementEventProto::dtype_proto* dtype_proto = + op_supplement_event_proto->mutable_dtypes(); + for (auto it = op_supplement_event_node->Dtypes().begin(); + it != op_supplement_event_node->Dtypes().end(); + it++) { + dtype_proto->add_key(it->first); + OperatorSupplementEventProto::dtype_proto::dtype_vector* + dtype_vector_proto = dtype_proto->add_dtype_vecs(); + auto dtype_vector = it->second; + for (auto dtype_it = dtype_vector.begin(); dtype_it != dtype_vector.end(); + dtype_it++) { + dtype_vector_proto->add_dtype(*dtype_it); + } + } + current_op_supplement_event_node_proto_->set_allocated_op_supplement_event( + op_supplement_event_proto); + } } void SerializationLogger::LogRuntimeTraceEventNode( diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h index 378834cff59..31910cb68c5 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -34,6 +34,7 @@ class SerializationLogger : public BaseLogger { void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogNodeTrees(const NodeTrees&) override; void LogMetaInfo(const std::unordered_map); + void LogMemTraceEventNode(const MemTraceEventNode&) override; private: void OpenFile(); @@ -48,6 +49,8 @@ class SerializationLogger : public BaseLogger { HostTraceEventNodeProto* current_host_trace_event_node_proto_; CudaRuntimeTraceEventNodeProto* current_runtime_trace_event_node_proto_; DeviceTraceEventNodeProto* current_device_trace_event_node_proto_; + MemTraceEventNodeProto* current_mem_trace_event_node_proto_; + OperatorSupplementEventNodeProto* current_op_supplement_event_node_proto_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index 9380a26dbc3..a49d799c785 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -35,6 +35,7 @@ using paddle::platform::ProfilerResult; using paddle::platform::RuntimeTraceEvent; using paddle::platform::SerializationLogger; using paddle::platform::TracerEventType; +using paddle::platform::TracerMemEventType; TEST(SerializationLoggerTest, dump_case0) { std::list host_events; @@ -54,6 +55,36 @@ TEST(SerializationLoggerTest, dump_case0) { std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10)); host_events.push_back(HostTraceEvent( std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11)); + mem_events.push_back(MemTraceEvent(11500, + 0x1000, + TracerMemEventType::Allocate, + 10, + 10, + 50, + "GPU:0", + 50, + 50, + 100, + 100)); + mem_events.push_back(MemTraceEvent(11900, + 0x1000, + TracerMemEventType::Free, + 10, + 10, + -50, + "GPU:0", + 0, + 50, + 100, + 100)); + std::map>> input_shapes; + std::map> dtypes; + input_shapes[std::string("X")].push_back(std::vector{1, 2, 3}); + input_shapes[std::string("X")].push_back(std::vector{4, 5, 6, 7}); + dtypes[std::string("X")].push_back(std::string("int8")); + dtypes[std::string("X")].push_back(std::string("float32")); + op_supplement_events.push_back(OperatorSupplementEvent( + 11600, "op1", input_shapes, dtypes, "op1()", 10, 10)); runtime_events.push_back(RuntimeTraceEvent( std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0)); runtime_events.push_back(RuntimeTraceEvent( @@ -128,6 +159,8 @@ TEST(SerializationLoggerTest, dump_case0) { if ((*it)->Name() == "op1") { EXPECT_EQ((*it)->GetChildren().size(), 0u); EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u); + EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr); } } for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { @@ -137,6 +170,7 @@ TEST(SerializationLoggerTest, dump_case0) { } } tree.LogMe(&logger); + logger.LogMetaInfo(std::unordered_map()); } TEST(SerializationLoggerTest, dump_case1) { @@ -224,6 +258,7 @@ TEST(SerializationLoggerTest, dump_case1) { } } tree.LogMe(&logger); + logger.LogMetaInfo(std::unordered_map()); } TEST(DeserializationReaderTest, restore_case0) { @@ -243,6 +278,8 @@ TEST(DeserializationReaderTest, restore_case0) { if ((*it)->Name() == "op1") { EXPECT_EQ((*it)->GetChildren().size(), 0u); EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u); + EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr); } } for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { diff --git a/paddle/fluid/platform/profiler/event_node.cc b/paddle/fluid/platform/profiler/event_node.cc index dbf132d0669..ec6efc11aca 100644 --- a/paddle/fluid/platform/profiler/event_node.cc +++ b/paddle/fluid/platform/profiler/event_node.cc @@ -92,11 +92,9 @@ void NodeTrees::BuildTrees( ++it) { auto dst_iter = correlation_id2runtime_event_node.find((*it)->CorrelationId()); - PADDLE_ENFORCE_NE( - dst_iter, - correlation_id2runtime_event_node.end(), - platform::errors::NotFound("Unknown device events, " - "no corresponding cuda runtime events")); + if (dst_iter == correlation_id2runtime_event_node.end()) { + continue; + } dst_iter->second->AddDeviceTraceEventNode(*it); } // construct thread2mem_event_nodes @@ -375,22 +373,9 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship( hasenter = true; } (*it)->SetOperatorSupplementNode(*op_supplement_it); - PADDLE_ENFORCE_EQ((*it)->Type(), - TracerEventType::Operator, - platform::errors::PreconditionNotMet( - "Operator supplement events should be embraced " - "by event of type TracerEventType::Operator, " - "but got type TracerEventType::%s", - StringTracerEventType((*it)->Type()))); op_supplement_count += 1; } else { if ((*op_supplement_it)->TimeStampNs() > (*it)->EndNs()) { - PADDLE_ENFORCE_LE(op_supplement_count, - 1, - platform::errors::PreconditionNotMet( - "One event of TracerEventType::Operator has no " - "more than 1 op supplement event, but got %d.", - op_supplement_count)); lastposition = op_supplement_it; break; } diff --git a/paddle/fluid/platform/profiler/event_node.h b/paddle/fluid/platform/profiler/event_node.h index 13ec1151005..34e6556f7f4 100644 --- a/paddle/fluid/platform/profiler/event_node.h +++ b/paddle/fluid/platform/profiler/event_node.h @@ -47,6 +47,8 @@ class MemTraceEventNode { std::string Place() const { return mem_event_.place; } uint64_t CurrentAllocated() const { return mem_event_.current_allocated; } uint64_t CurrentReserved() const { return mem_event_.current_reserved; } + uint64_t PeakAllocated() const { return mem_event_.peak_allocated; } + uint64_t PeakReserved() const { return mem_event_.peak_reserved; } // member function void LogMe(BaseLogger* logger) { logger->LogMemTraceEventNode(*this); } diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index 1a6f19d2f93..028d666f355 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -31,6 +31,9 @@ HostPythonNode::~HostPythonNode() { for (auto it = device_node_ptrs.begin(); it != device_node_ptrs.end(); ++it) { delete *it; } + for (auto it = mem_node_ptrs.begin(); it != mem_node_ptrs.end(); ++it) { + delete *it; + } } HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { @@ -52,7 +55,8 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { } // copy its CudaRuntimeTraceEventNode for (auto runtimenode = root->GetRuntimeTraceEventNodes().begin(); - runtimenode != root->GetRuntimeTraceEventNodes().end(); ++runtimenode) { + runtimenode != root->GetRuntimeTraceEventNodes().end(); + ++runtimenode) { HostPythonNode* runtime_python_node = new HostPythonNode(); runtime_python_node->name = (*runtimenode)->Name(); runtime_python_node->type = (*runtimenode)->Type(); @@ -76,6 +80,32 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { runtime_python_node->device_node_ptrs.push_back(device_python_node); } } + // copy MemTraceEventNode + for (auto memnode = root->GetMemTraceEventNodes().begin(); + memnode != root->GetMemTraceEventNodes().end(); + memnode++) { + MemPythonNode* mem_python_node = new MemPythonNode(); + mem_python_node->timestamp_ns = (*memnode)->TimeStampNs(); + mem_python_node->addr = (*memnode)->Addr(); + mem_python_node->type = (*memnode)->Type(); + mem_python_node->process_id = (*memnode)->ProcessId(); + mem_python_node->thread_id = (*memnode)->ThreadId(); + mem_python_node->increase_bytes = (*memnode)->IncreaseBytes(); + mem_python_node->place = (*memnode)->Place(); + mem_python_node->current_allocated = (*memnode)->CurrentAllocated(); + mem_python_node->current_reserved = (*memnode)->CurrentReserved(); + mem_python_node->peak_allocated = (*memnode)->PeakAllocated(); + mem_python_node->peak_reserved = (*memnode)->PeakReserved(); + host_python_node->mem_node_ptrs.push_back(mem_python_node); + } + // copy OperatorSupplementEventNode's information if exists + OperatorSupplementEventNode* op_supplement_node = + root->GetOperatorSupplementEventNode(); + if (op_supplement_node != nullptr) { + host_python_node->input_shapes = op_supplement_node->InputShapes(); + host_python_node->dtypes = op_supplement_node->Dtypes(); + host_python_node->callstack = op_supplement_node->CallStack(); + } return host_python_node; } @@ -93,7 +123,8 @@ ProfilerResult::ProfilerResult(std::unique_ptr tree, ProfilerResult::~ProfilerResult() { // delete all root nodes for (auto it = thread_event_trees_map_.begin(); - it != thread_event_trees_map_.end(); ++it) { + it != thread_event_trees_map_.end(); + ++it) { delete it->second; } } diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index 12ecb9fde32..44f6e61fd37 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -43,6 +43,35 @@ struct DevicePythonNode { uint64_t stream_id; }; +struct MemPythonNode { + MemPythonNode() = default; + ~MemPythonNode() {} + + // timestamp of the record + uint64_t timestamp_ns; + // memory addr of allocation or free + uint64_t addr; + // memory manipulation type + TracerMemEventType type; + // process id of the record + uint64_t process_id; + // thread id of the record + uint64_t thread_id; + // increase bytes after this manipulation, allocation for sign +, free for + // sign - + int64_t increase_bytes; + // place + std::string place; + // current total allocated memory + uint64_t current_allocated; + // current total reserved memory + uint64_t current_reserved; + // peak allocated memory + uint64_t peak_allocated; + // peak reserved memory + uint64_t peak_reserved; +}; + struct HostPythonNode { HostPythonNode() = default; ~HostPythonNode(); @@ -58,12 +87,19 @@ struct HostPythonNode { uint64_t process_id; // thread id of the record uint64_t thread_id; + // input shapes + std::map>> input_shapes; + std::map> dtypes; + // call stack + std::string callstack; // children node std::vector children_node_ptrs; // runtime node std::vector runtime_node_ptrs; // device node std::vector device_node_ptrs; + // mem node + std::vector mem_node_ptrs; }; class ProfilerResult { diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h index afd41352465..8d9c2a4cfc5 100644 --- a/paddle/fluid/platform/profiler/host_event_recorder.h +++ b/paddle/fluid/platform/profiler/host_event_recorder.h @@ -17,10 +17,10 @@ #include #include #include + #include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/os_info.h" -#include "paddle/fluid/platform/profiler/common_event.h" namespace paddle { namespace platform { @@ -28,9 +28,11 @@ namespace platform { template struct ContainsStdString : std::conditional_t< - std::is_same>>::value, - std::true_type, ContainsStdString> {}; + std::is_same< + std::string, + std::remove_cv_t>>::value, + std::true_type, + ContainsStdString> {}; template struct ContainsStdString @@ -58,7 +60,7 @@ class EventContainer { public: // Record an event template - void Record(Args &&... args) { + void Record(Args &&...args) { DoRecord(ContainsStdString(), std::forward(args)...); } @@ -112,7 +114,7 @@ class EventContainer { // Record an event with string arguments template - void DoRecord(std::true_type, Args &&... args) { + void DoRecord(std::true_type, Args &&...args) { auto *storage = GetEventStorage(); std::function allocator = [this](size_t size) { return GetStrBufFromArena(size); @@ -122,7 +124,7 @@ class EventContainer { // Record an event without any string argument template - void DoRecord(std::false_type, Args &&... args) { + void DoRecord(std::false_type, Args &&...args) { auto *storage = GetEventStorage(); new (storage) EventType(std::forward(args)...); } @@ -181,12 +183,14 @@ char *EventContainer::GetStringStorage(size_t sz) { return storage; } +template struct ThreadEventSection { std::string thread_name; uint64_t thread_id; - std::vector events; + std::vector events; }; +template class ThreadEventRecorder { public: ThreadEventRecorder() { @@ -199,12 +203,12 @@ class ThreadEventRecorder { public: // Forward call to EventContainer::Record template - void RecordEvent(Args &&... args) { + void RecordEvent(Args &&...args) { base_evt_cntr_.Record(std::forward(args)...); } - ThreadEventSection GatherEvents() { - ThreadEventSection thr_sec; + ThreadEventSection GatherEvents() { + ThreadEventSection thr_sec; thr_sec.thread_name = thread_name_; thr_sec.thread_id = thread_id_; thr_sec.events = std::move(base_evt_cntr_.Reduce()); @@ -214,15 +218,17 @@ class ThreadEventRecorder { private: uint64_t thread_id_; std::string thread_name_; - EventContainer base_evt_cntr_; + EventContainer base_evt_cntr_; }; +template struct HostEventSection { std::string process_name; uint64_t process_id; - std::vector thr_sections; + std::vector> thr_sections; }; +template class HostEventRecorder { public: // singleton @@ -237,37 +243,51 @@ class HostEventRecorder { // Do your best to avoid using 'std::string' as the argument type. // It will cause deep-copy to harm performance. template - void RecordEvent(Args &&... args) { - GetThreadLocalRecorder()->RecordEvent(std::forward(args)...); + void RecordEvent(Args &&...args) { + // Get thread local ThreadEventRecorder + // If not exists, we create a new one. + // Both HostEventRecorder and thread-local varibale in + // ThreadEventRecorderRegistry keep the shared pointer. We add this to + // prevent ThreadEventRecorder being destroyed by thread-local variable in + // ThreadEventRecorderRegistry and lose data. + if (GetThreadLocalRecorder()->get() == nullptr) { + std::shared_ptr> + thread_event_recorder_ptr = + std::make_shared>(); + *(GetThreadLocalRecorder()) = thread_event_recorder_ptr; + thr_recorders_.push_back(thread_event_recorder_ptr); + } + (*GetThreadLocalRecorder())->RecordEvent(std::forward(args)...); } // thread-unsafe, make sure make sure there is no running tracing. // Poor performance, call it at the ending - HostEventSection GatherEvents() { - auto thr_recorders = - ThreadEventRecorderRegistry::GetInstance().GetAllThreadDataByRef(); - HostEventSection host_sec; + HostEventSection GatherEvents() { + HostEventSection host_sec; host_sec.process_id = GetProcessId(); - host_sec.thr_sections.reserve(thr_recorders.size()); - for (auto &kv : thr_recorders) { - auto &thr_recorder = kv.second.get(); - host_sec.thr_sections.emplace_back( - std::move(thr_recorder.GatherEvents())); + host_sec.thr_sections.reserve(thr_recorders_.size()); + for (auto &v : thr_recorders_) { + host_sec.thr_sections.emplace_back(std::move(v->GatherEvents())); } return host_sec; } private: - using ThreadEventRecorderRegistry = - framework::ThreadDataRegistry; + using ThreadEventRecorderRegistry = framework::ThreadDataRegistry< + std::shared_ptr>>; HostEventRecorder() = default; DISABLE_COPY_AND_ASSIGN(HostEventRecorder); - ThreadEventRecorder *GetThreadLocalRecorder() { + std::shared_ptr> *GetThreadLocalRecorder() { return ThreadEventRecorderRegistry::GetInstance() .GetMutableCurrentThreadData(); } + // Hold all thread-local ThreadEventRecorders + // ThreadEventRecorderRegistry and HostEventRecorder both take care of this + // shared pointer. We add this to prevent ThreadEventRecorder being destroyed + // by thread-local variable in ThreadEventRecorderRegistry and lose data. + std::vector>> thr_recorders_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index b7eb53331b7..75abdb91673 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -11,8 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/platform/profiler/host_tracer.h" + +#include + #include "glog/logging.h" #include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/common_event.h" @@ -20,7 +22,8 @@ // Used to filter events, works like glog VLOG(level). // RecordEvent will works if host_trace_level >= level. -PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1, +PADDLE_DEFINE_EXPORTED_int64(host_trace_level, + 1, "RecordEvent will works " "if host_trace_level >= level."); @@ -29,7 +32,7 @@ namespace platform { namespace { -void ProcessHostEvents(const HostEventSection& host_events, +void ProcessHostEvents(const HostEventSection& host_events, TraceEventCollector* collector) { for (const auto& thr_sec : host_events.thr_sections) { uint64_t tid = thr_sec.thread_id; @@ -49,6 +52,53 @@ void ProcessHostEvents(const HostEventSection& host_events, } } +void ProcessOperatorSupplementEvents( + const HostEventSection& op_supplement_events, + TraceEventCollector* collector) { + for (const auto& thr_sec : op_supplement_events.thr_sections) { + uint64_t tid = thr_sec.thread_id; + if (thr_sec.thread_name != kDefaultThreadName) { + collector->AddThreadName(tid, thr_sec.thread_name); + } + for (const auto& evt : thr_sec.events) { + OperatorSupplementEvent event; + event.timestamp_ns = evt.timestamp_ns; + event.op_type = evt.op_type; + std::map>> input_shapes; + std::map> dtypes; + std::string callstack; + for (auto it = evt.input_shapes.begin(); it != evt.input_shapes.end(); + it++) { + for (auto idx = 0lu; idx < it->second.size(); idx++) { + input_shapes[it->first].push_back(std::vector()); + for (auto dim_idx = 0; dim_idx < it->second.at(idx).size(); + dim_idx++) { + input_shapes[it->first][idx].push_back( + it->second.at(idx).at(dim_idx)); + } + } + } + for (auto it = evt.dtypes.begin(); it != evt.dtypes.end(); it++) { + for (auto idx = 0lu; idx < it->second.size(); idx++) { + dtypes[it->first].push_back( + framework::proto::VarType::Type_Name(it->second.at(idx))); + } + } + + std::ostringstream result_string; + for (auto it = evt.callstack.begin(); it != evt.callstack.end(); it++) { + result_string << (*it) << std::endl; + } + event.input_shapes = input_shapes; + event.dtypes = dtypes; + event.callstack = result_string.str(); + event.process_id = op_supplement_events.process_id; + event.thread_id = tid; + collector->AddOperatorSupplementEvent(std::move(event)); + } + } +} + } // namespace void HostTracer::PrepareTracing() { @@ -59,16 +109,20 @@ void HostTracer::PrepareTracing() { void HostTracer::StartTracing() { PADDLE_ENFORCE_EQ( - state_ == TracerState::READY || state_ == TracerState::STOPED, true, + state_ == TracerState::READY || state_ == TracerState::STOPED, + true, platform::errors::PreconditionNotMet("TracerState must be READY")); - HostEventRecorder::GetInstance().GatherEvents(); + HostEventRecorder::GetInstance().GatherEvents(); + HostEventRecorder::GetInstance() + .GatherEvents(); HostTraceLevel::GetInstance().SetLevel(options_.trace_level); state_ = TracerState::STARTED; } void HostTracer::StopTracing() { PADDLE_ENFORCE_EQ( - state_, TracerState::STARTED, + state_, + TracerState::STARTED, platform::errors::PreconditionNotMet("TracerState must be STARTED")); HostTraceLevel::GetInstance().SetLevel(HostTraceLevel::kDisabled); state_ = TracerState::STOPED; @@ -76,11 +130,16 @@ void HostTracer::StopTracing() { void HostTracer::CollectTraceData(TraceEventCollector* collector) { PADDLE_ENFORCE_EQ( - state_, TracerState::STOPED, + state_, + TracerState::STOPED, platform::errors::PreconditionNotMet("TracerState must be STOPED")); - HostEventSection host_events = - HostEventRecorder::GetInstance().GatherEvents(); + HostEventSection host_events = + HostEventRecorder::GetInstance().GatherEvents(); ProcessHostEvents(host_events, collector); + HostEventSection op_supplement_events = + HostEventRecorder::GetInstance() + .GatherEvents(); + ProcessOperatorSupplementEvents(op_supplement_events, collector); } } // namespace platform diff --git a/paddle/fluid/platform/profiler/supplement_tracing.h b/paddle/fluid/platform/profiler/supplement_tracing.h new file mode 100644 index 00000000000..270223d13b2 --- /dev/null +++ b/paddle/fluid/platform/profiler/supplement_tracing.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/platform/profiler/trace_event.h" +#include "paddle/phi/core/compat/arg_map_context.h" + +namespace paddle { + +namespace framework { +class RuntimeContext; +} +namespace platform { + +class RecordOpInfoSupplement { + public: + /** + * @param type: Operator type name. + * @param attrs: Attribute map of op. + * @param shape_ctx: Infershape context object. + * @param ctx: Runtime context object. + */ + explicit RecordOpInfoSupplement(const std::string& type, + const framework::AttributeMap& attrs, + const framework::InferShapeContext& shape_ctx, + const framework::RuntimeContext& ctx); + /** + * @param type: Operator type name. + * @param attrs: Attribute map of op. + * @param shape_ctx: Infershape context object. + * @param kernel_signature: KernelSignature object, used in dygraph. + */ + explicit RecordOpInfoSupplement(const std::string& type, + const framework::AttributeMap& attrs, + const framework::InferShapeContext& shape_ctx, + const phi::KernelSignature& kernel_signature); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/test_event_node.cc b/paddle/fluid/platform/profiler/test_event_node.cc index 41a5ebce023..dcf6dd56d74 100644 --- a/paddle/fluid/platform/profiler/test_event_node.cc +++ b/paddle/fluid/platform/profiler/test_event_node.cc @@ -60,9 +60,20 @@ TEST(NodeTreesTest, LogMe_case0) { 50, "GPU:0", 50, - 50)); - mem_events.push_back(MemTraceEvent( - 11900, 0x1000, TracerMemEventType::Free, 10, 10, -50, "GPU:0", 0, 50)); + 50, + 100, + 100)); + mem_events.push_back(MemTraceEvent(11900, + 0x1000, + TracerMemEventType::Free, + 10, + 10, + -50, + "GPU:0", + 0, + 50, + 100, + 100)); std::map>> input_shapes; std::map> dtypes; input_shapes[std::string("X")].push_back(std::vector{1, 2, 3}); @@ -267,9 +278,20 @@ TEST(NodeTreesTest, HandleTrees_case0) { 50, "GPU:0", 50, - 50)); - mem_events.push_back(MemTraceEvent( - 11900, 0x1000, TracerMemEventType::Free, 10, 10, -50, "GPU:0", 0, 50)); + 50, + 100, + 100)); + mem_events.push_back(MemTraceEvent(11900, + 0x1000, + TracerMemEventType::Free, + 10, + 10, + -50, + "GPU:0", + 0, + 50, + 100, + 100)); op_supplement_events.push_back(OperatorSupplementEvent( 11600, "op1", diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index d50c5584f5c..62d82c19d17 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -59,10 +59,14 @@ enum class TracerEventType { }; enum class TracerMemEventType { - // Used to mark memory allocation + // Used to mark memory allocation which is managed by paddle Allocate = 0, - // Used to mark memory free + // Used to mark memory free which is managed by paddle Free = 1, + // Used to mark reserved memory allocation which is applied from device. + ReservedAllocate = 2, + // Used to mark reserved memory free which is released to device. + ReservedFree = 3, // A flag to denote the number of current types NumTypes }; @@ -318,7 +322,9 @@ struct MemTraceEvent { int64_t increase_bytes, const std::string& place, uint64_t current_allocated, - uint64_t current_reserved) + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) : timestamp_ns(timestamp_ns), addr(addr), type(type), @@ -327,7 +333,9 @@ struct MemTraceEvent { increase_bytes(increase_bytes), place(place), current_allocated(current_allocated), - current_reserved(current_reserved) {} + current_reserved(current_reserved), + peak_allocated(peak_allocated), + peak_reserved(peak_reserved) {} // timestamp of the record uint64_t timestamp_ns; @@ -348,6 +356,10 @@ struct MemTraceEvent { uint64_t current_allocated; // current total reserved memory uint64_t current_reserved; + // current peak allocated memory + uint64_t peak_allocated; + // current peak reserved memory + uint64_t peak_reserved; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc index bbfc687738d..11035867416 100644 --- a/paddle/fluid/platform/profiler/utils.cc +++ b/paddle/fluid/platform/profiler/utils.cc @@ -91,7 +91,8 @@ float CalculateEstOccupancy(uint32_t DeviceId, #endif const char* StringTracerMemEventType(TracerMemEventType type) { - static const char* categary_name_[] = {"Allocate", "Free"}; + static const char* categary_name_[] = { + "Allocate", "Free", "ReservedAllocate", "ReservedFree"}; return categary_name_[static_cast(type)]; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a003de812a3..d2d5bbf8d24 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3515,6 +3515,10 @@ All parameter, weight, gradient are variables in Paddle. .def_readwrite("process_id", &paddle::platform::HostPythonNode::process_id) .def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id) + .def_readwrite("input_shapes", + &paddle::platform::HostPythonNode::input_shapes) + .def_readwrite("dtypes", &paddle::platform::HostPythonNode::dtypes) + .def_readwrite("callstack", &paddle::platform::HostPythonNode::callstack) .def_readwrite("children_node", &paddle::platform::HostPythonNode::children_node_ptrs) .def_readwrite("runtime_node", -- GitLab