From ff7da11778aea2a508d6f673ed1f1dd4107043f4 Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 28 Jun 2022 10:52:11 +0800 Subject: [PATCH] Add infer shape in dygraph (#43822) * record memory and op supplement info * update * update * fix a bug * fix memory recording * fix a bug * update * update * fix a bug * update * fix a bug * fix a bug * fix a bug * update dygraph record * add infer shape record * fix * fix * fix * add comments * fix a bug * fix * fix --- paddle/fluid/imperative/CMakeLists.txt | 6 ++-- paddle/fluid/imperative/prepared_operator.cc | 7 ++++ paddle/fluid/platform/profiler.cc | 31 ++++++++++++++++ paddle/fluid/platform/profiler/common_event.h | 2 ++ paddle/fluid/platform/profiler/event_node.cc | 21 ++--------- .../platform/profiler/host_event_recorder.h | 36 +++++++++++++------ .../platform/profiler/supplement_tracing.h | 11 ++++++ 7 files changed, 83 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 1c287449405..2d4a57b82a1 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -22,7 +22,8 @@ if(WITH_XPU) nan_inf_utils phi_api phi_utils - var_helper) + var_helper + profiler) else() cc_library( prepared_operator @@ -38,7 +39,8 @@ else() nan_inf_utils phi_api phi_utils - var_helper) + var_helper + profiler) endif() cc_library( layer diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index c6717857053..950a66d5e6d 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -28,6 +28,7 @@ #include "paddle/fluid/framework/library_type.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" DECLARE_bool(check_nan_inf); DECLARE_bool(benchmark); @@ -514,6 +515,9 @@ static void PreparedOpRunImpl( arg_map_fn, default_kernel_signature); op.Info().infer_shape_(&infer_shape_ctx); + record_event.End(); + platform::RecordOpInfoSupplement( + op.Type(), op.Attrs(), infer_shape_ctx, ctx); } { @@ -583,6 +587,9 @@ static void PreparedOpRunPtImpl( arg_map_fn, default_kernel_signature); op.Info().infer_shape_(&infer_shape_ctx); + record_event.End(); + platform::RecordOpInfoSupplement( + op.Type(), op.Attrs(), infer_shape_ctx, kernel_signature); } { diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 0369202284d..ec33e9e8198 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -277,6 +277,37 @@ RecordOpInfoSupplement::RecordOpInfoSupplement( PosixInNsec(), type, input_shapes, dtypes, callstack); } +RecordOpInfoSupplement::RecordOpInfoSupplement( + const std::string &type, + const framework::AttributeMap &attrs, + const framework::InferShapeContext &shape_ctx, + const phi::KernelSignature &kernel_signature) { + if (FLAGS_enable_host_event_recorder_hook == false) { + return; + } + std::map> input_shapes; + std::map> dtypes; + for (auto it = kernel_signature.input_names.begin(); + it != kernel_signature.input_names.end(); + it++) { + std::string input_name(*it); + if (shape_ctx.HasInputs(input_name)) { + input_shapes[input_name] = shape_ctx.GetInputsDim(input_name); + dtypes[input_name] = shape_ctx.GetInputsVarType(input_name); + } + } + const std::vector *callstack_ptr = nullptr; + std::vector callstack; + auto iter = attrs.find( + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + if (iter != attrs.end()) { + callstack_ptr = &BOOST_GET_CONST(std::vector, iter->second); + callstack = *callstack_ptr; + } + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), type, input_shapes, dtypes, callstack); +} + RecordMemEvent::RecordMemEvent(const void *ptr, const phi::Place &place, size_t size, diff --git a/paddle/fluid/platform/profiler/common_event.h b/paddle/fluid/platform/profiler/common_event.h index 3e166d1d04d..837abbec9ad 100644 --- a/paddle/fluid/platform/profiler/common_event.h +++ b/paddle/fluid/platform/profiler/common_event.h @@ -91,6 +91,8 @@ struct CommonMemEvent { type(type), increase_bytes(increase_bytes), place(place), + current_allocated(current_allocated), + current_reserved(current_reserved), peak_allocated(peak_allocated), peak_reserved(peak_reserved) {} uint64_t timestamp_ns; diff --git a/paddle/fluid/platform/profiler/event_node.cc b/paddle/fluid/platform/profiler/event_node.cc index f0e3ee29d0c..ac340247772 100644 --- a/paddle/fluid/platform/profiler/event_node.cc +++ b/paddle/fluid/platform/profiler/event_node.cc @@ -93,11 +93,9 @@ void NodeTrees::BuildTrees( ++it) { auto dst_iter = correlation_id2runtime_event_node.find((*it)->CorrelationId()); - PADDLE_ENFORCE_NE( - dst_iter, - correlation_id2runtime_event_node.end(), - platform::errors::NotFound("Unknown device events, " - "no corresponding cuda runtime events")); + if (dst_iter == correlation_id2runtime_event_node.end()) { + continue; + } dst_iter->second->AddDeviceTraceEventNode(*it); } // construct thread2mem_event_nodes @@ -376,22 +374,9 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship( hasenter = true; } (*it)->SetOperatorSupplementNode(*op_supplement_it); - PADDLE_ENFORCE_EQ((*it)->Type(), - TracerEventType::Operator, - platform::errors::PreconditionNotMet( - "Operator supplement events should be embraced " - "by event of type TracerEventType::Operator, " - "but got type TracerEventType::%s", - StringTracerEventType((*it)->Type()))); op_supplement_count += 1; } else { if ((*op_supplement_it)->TimeStampNs() > (*it)->EndNs()) { - PADDLE_ENFORCE_LE(op_supplement_count, - 1, - platform::errors::PreconditionNotMet( - "One event of TracerEventType::Operator has no " - "more than 1 op supplement event, but got %d.", - op_supplement_count)); lastposition = op_supplement_it; break; } diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h index bb4c56f5a0a..8d9c2a4cfc5 100644 --- a/paddle/fluid/platform/profiler/host_event_recorder.h +++ b/paddle/fluid/platform/profiler/host_event_recorder.h @@ -244,36 +244,50 @@ class HostEventRecorder { // It will cause deep-copy to harm performance. template void RecordEvent(Args &&...args) { - GetThreadLocalRecorder()->RecordEvent(std::forward(args)...); + // Get thread local ThreadEventRecorder + // If not exists, we create a new one. + // Both HostEventRecorder and thread-local varibale in + // ThreadEventRecorderRegistry keep the shared pointer. We add this to + // prevent ThreadEventRecorder being destroyed by thread-local variable in + // ThreadEventRecorderRegistry and lose data. + if (GetThreadLocalRecorder()->get() == nullptr) { + std::shared_ptr> + thread_event_recorder_ptr = + std::make_shared>(); + *(GetThreadLocalRecorder()) = thread_event_recorder_ptr; + thr_recorders_.push_back(thread_event_recorder_ptr); + } + (*GetThreadLocalRecorder())->RecordEvent(std::forward(args)...); } // thread-unsafe, make sure make sure there is no running tracing. // Poor performance, call it at the ending HostEventSection GatherEvents() { - auto thr_recorders = - ThreadEventRecorderRegistry::GetInstance().GetAllThreadDataByRef(); HostEventSection host_sec; host_sec.process_id = GetProcessId(); - host_sec.thr_sections.reserve(thr_recorders.size()); - for (auto &kv : thr_recorders) { - auto &thr_recorder = kv.second.get(); - host_sec.thr_sections.emplace_back( - std::move(thr_recorder.GatherEvents())); + host_sec.thr_sections.reserve(thr_recorders_.size()); + for (auto &v : thr_recorders_) { + host_sec.thr_sections.emplace_back(std::move(v->GatherEvents())); } return host_sec; } private: - using ThreadEventRecorderRegistry = - framework::ThreadDataRegistry>; + using ThreadEventRecorderRegistry = framework::ThreadDataRegistry< + std::shared_ptr>>; HostEventRecorder() = default; DISABLE_COPY_AND_ASSIGN(HostEventRecorder); - ThreadEventRecorder *GetThreadLocalRecorder() { + std::shared_ptr> *GetThreadLocalRecorder() { return ThreadEventRecorderRegistry::GetInstance() .GetMutableCurrentThreadData(); } + // Hold all thread-local ThreadEventRecorders + // ThreadEventRecorderRegistry and HostEventRecorder both take care of this + // shared pointer. We add this to prevent ThreadEventRecorder being destroyed + // by thread-local variable in ThreadEventRecorderRegistry and lose data. + std::vector>> thr_recorders_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/supplement_tracing.h b/paddle/fluid/platform/profiler/supplement_tracing.h index 46b1616d71c..270223d13b2 100644 --- a/paddle/fluid/platform/profiler/supplement_tracing.h +++ b/paddle/fluid/platform/profiler/supplement_tracing.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/profiler/trace_event.h" +#include "paddle/phi/core/compat/arg_map_context.h" namespace paddle { @@ -39,6 +40,16 @@ class RecordOpInfoSupplement { const framework::AttributeMap& attrs, const framework::InferShapeContext& shape_ctx, const framework::RuntimeContext& ctx); + /** + * @param type: Operator type name. + * @param attrs: Attribute map of op. + * @param shape_ctx: Infershape context object. + * @param kernel_signature: KernelSignature object, used in dygraph. + */ + explicit RecordOpInfoSupplement(const std::string& type, + const framework::AttributeMap& attrs, + const framework::InferShapeContext& shape_ctx, + const phi::KernelSignature& kernel_signature); }; } // namespace platform -- GitLab