diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 1b2b24762894c0d72e75f0c4d20531e21f146cfd..ce471d55b24a18a7f291db0dc6d93026940096a2 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -132,6 +132,9 @@ FetchResultType FastThreadedSSAGraphExecutor::Run( } // Wait FetchOps. if (!fetch_ops.empty()) { + platform::RecordEvent record_wait( + "FastThreadedSSAGraphExecutor::WaitFetchOps", + platform::TracerEventType::Operator, 1); ClearFetchOp(graph_, &fetch_ops); for (auto &place : places_) { @@ -231,8 +234,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( OpHandleBase *op, const std::shared_ptr<BlockingQueue<size_t>> &complete_q) { ++remaining_; - platform::RecordEvent("WorkQueue::AddTask", - platform::TracerEventType::UserDefined, 10 /*level*/); + platform::RecordEvent record("WorkQueue::AddTask", + platform::TracerEventType::UserDefined, + 10 /*level*/); this->pool_->enqueue([=] { std::deque<OpHandleBase *> op_queue; op_queue.push_front(op); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index f951b5d0f507039decfc3b4d0081f17cc9f8f50e..06ce9712f5c52708eeba76c60291af5f91b83b09 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -172,6 +172,8 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars, const std::vector<std::string>& skip_ref_cnt_vars, bool force_disable_gc, bool keep_kid_scopes) { + platform::RecordEvent record_run("Executor::Run", + platform::TracerEventType::UserDefined, 1); platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); @@ -301,6 +303,8 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, bool create_local_scope, bool create_vars, const 
std::string& feed_holder_name, const std::string& fetch_holder_name) { + platform::RecordEvent record_run("Executor::Run", + platform::TracerEventType::UserDefined, 1); platform::RecordBlock b(kProgramId); if (FLAGS_use_mkldnn) EnableMKLDNN(program); #ifdef PADDLE_WITH_MKLDNN @@ -428,6 +432,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, int64_t end_op_index, bool create_local_scope, bool create_vars, bool keep_kids) { + platform::RecordEvent record_run("Executor::RunPartialPreparedContext", + platform::TracerEventType::UserDefined, 1); platform::RecordBlock b(kProgramId); PADDLE_ENFORCE_NOT_NULL( scope, platform::errors::InvalidArgument("Scope shouldn't be null")); @@ -518,6 +524,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, auto& op = ctx->ops_[i]; op->Run(*local_scope, place_); if (gc) { + platform::RecordEvent record("CheckGC", + platform::TracerEventType::UserDefined, 10); DeleteUnusedTensors(*local_scope, op.get(), ctx->unused_vars_, gc.get()); } } diff --git a/paddle/fluid/framework/ir/cost_model.cc b/paddle/fluid/framework/ir/cost_model.cc index 5027c50103a52932bb14fec4677ec48db2cc9a22..6086409ffd971fff2a3cc5432e00e59165464634 100644 --- a/paddle/fluid/framework/ir/cost_model.cc +++ b/paddle/fluid/framework/ir/cost_model.cc @@ -44,6 +44,19 @@ double CostData::GetWholeMemoryBytes() const { return whole_memory_bytes_; } const Graph* CostData::GetGraph() const { return graph_; } const ProgramDesc* CostData::GetProgram() const { return program_; } +static bool StringHasEnding(const std::string& full, + const std::string& ending) { + if (full.length() < ending.length()) { + return false; + } + if (full.length() == ending.length()) { + return full == ending; + } + size_t prefix_len = full.length() - ending.length(); + return 0 == full.compare(prefix_len, ending.length(), ending) && + full[prefix_len - 1] == '/'; +} + bool CostData::SetCostData(const ProgramDesc& program, const std::vector<std::vector<Event>>& time_events) { // 
TODO(zhhsplendid): Make a copy so that CostData can be available even if @@ -77,7 +90,7 @@ bool CostData::SetCostData(const ProgramDesc& program, std::string op_type = op_desc->Type(); while (event_index < main_thread_events.size()) { - if (main_thread_events[event_index].name() == op_type && + if (StringHasEnding(main_thread_events[event_index].name(), op_type) && main_thread_events[event_index].type() == platform::EventType::kPushRange) { break; @@ -97,7 +110,7 @@ bool CostData::SetCostData(const ProgramDesc& program, // ControlFlow Op can be like that, but this version only support global // block // TODO(zhhsplendid): make a more strict mapping between push and pop - if (main_thread_events[event_index].name() == op_type && + if (StringHasEnding(main_thread_events[event_index].name(), op_type) && main_thread_events[event_index].type() == platform::EventType::kPopRange) { break; diff --git a/paddle/fluid/framework/new_executor/event_manager.cc b/paddle/fluid/framework/new_executor/event_manager.cc index cc6fd6e3ed0f94cc75936eb78f2e7ec8fbe48e16..bca2264b66afcc9632689905f41ae0d7757795da 100644 --- a/paddle/fluid/framework/new_executor/event_manager.cc +++ b/paddle/fluid/framework/new_executor/event_manager.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/new_executor/event_manager.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { @@ -24,6 +25,8 @@ void WaitEvent(const Instruction& instruction, const platform::Place& place) { VLOG(3) << "Deal StreamWaitEventOrSync for " << instruction.OpBase()->Type(); for (auto& event_iter : instruction.InputEvents()) { + platform::RecordEvent record("WaitStreamEvent", + platform::TracerEventType::UserDefined, 10); VLOG(3) << "wait var_id: " << event_iter.var_id_ << " 's event with waiter_type: " << event_iter.waiter_type_; event_iter.event_->Wait(event_iter.waiter_type_, @@ -36,6 +39,8 @@ void RecordEvent(const Instruction& instruction, const platform::Place& place) { if (platform::is_cpu_place(place)) return; for (auto& event : instruction.OutputEvents()) { + platform::RecordEvent record("RecordStreamEvent", + platform::TracerEventType::UserDefined, 10); VLOG(3) << "Record event in out_var_id: " << event.var_id_; event.event_->Record(&instruction.DeviceContext()); } @@ -46,6 +51,8 @@ void RecordEvent(const Instruction& instruction) { if (platform::is_cpu_place(instruction.DeviceContext().GetPlace())) return; for (auto& event : instruction.OutputEvents()) { + platform::RecordEvent record("RecordStreamEvent", + platform::TracerEventType::UserDefined, 10); VLOG(3) << "Record event in out_var_id: " << event.var_id_; event.event_->Record(&instruction.DeviceContext()); } diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 29aa7b13a270eb348aa8f603959fb52f4eef677a..20a6e53479323afc5aa153d08f371029dace7f72 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -489,6 +489,8 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { VLOG(4) << "End run " << place << " " << op->DebugStringEx(global_scope_); if 
(!instr_node.InplaceBackMap().empty()) { + platform::RecordEvent inplaceback_event( + "InplaceVarsBack", platform::TracerEventType::UserDefined, 10); auto& m = instr_node.InplaceBackMap(); // NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc for (auto& p : m) { @@ -530,6 +532,8 @@ void InterpreterCore::ExecuteInstructionList( return; } + platform::RecordEvent record_prepare( + "PrepareAtomic", platform::TracerEventType::UserDefined, 1); // NOTE(zhiqiu): get the prepared deps from std::future, and async prepare // those for the next step auto atomic_deps = async_work_queue_->AtomicDeps(); @@ -537,6 +541,7 @@ void InterpreterCore::ExecuteInstructionList( async_work_queue_->PrepareAtomicDeps(dependecy_count_); async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo()); + record_prepare.End(); exception_holder_.Clear(); @@ -573,6 +578,9 @@ void InterpreterCore::RunNextInstructions( const Instruction& instr, std::queue<size_t>* reserved_next_ops, std::vector<std::atomic<size_t>>* atomic_deps, std::vector<std::atomic<size_t>>* atomic_var_ref) { + platform::RecordEvent record("RunNextInstructions", + platform::TracerEventType::UserDefined, 10); + VLOG(4) << "atomic 1:" << atomic_deps; auto& next_instr = instr.NextInstructions(); auto IsReady = [atomic_deps](size_t next_id) { @@ -708,6 +716,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) { instr.KernelType() != OpFuncType::kQueueAsync) { return; } + platform::RecordEvent record("RecordStreamForGC", + platform::TracerEventType::UserDefined, 10); gpuStream_t stream = reinterpret_cast<const platform::CUDADeviceContext&>( instr.DeviceContext()) @@ -799,6 +809,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) { void InterpreterCore::CheckGC( const Instruction& instr, std::vector<std::atomic<size_t>>* atomic_var_ref) { + platform::RecordEvent record("CheckGC", + platform::TracerEventType::UserDefined, 10); size_t instr_id = instr.Id(); auto& var_scope = *global_scope_; diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h 
b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index bc65231abe7371a931f709c9190b55fde24f0543..384498584c66a6c3b8c17089c6936eaa3c4c34d2 100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -408,8 +408,8 @@ class ThreadPoolTempl { ec_.Notify(true); return false; } - platform::RecordEvent("SleepWaitForWork", - platform::TracerEventType::UserDefined, 10); + platform::RecordEvent record("WaitForWork", + platform::TracerEventType::UserDefined, 10); ec_.CommitWait(waiter); blocked_--; return true; diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc index 881878ebb12a721e7b194036b9d36a89c5404365..b8dfcad187ca0b3efbe937490933a1a7de3486ef 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc @@ -55,8 +55,9 @@ class WorkQueueImpl : public WorkQueue { } void AddTask(std::function<void()> fn) override { - platform::RecordEvent("WorkQueue::AddTask", - platform::TracerEventType::UserDefined, 10 /*level*/); + platform::RecordEvent record("WorkQueue::AddTask", + platform::TracerEventType::UserDefined, + 10 /*level*/); if (tracker_ != nullptr) { fn = [ task = std::move(fn), raii = CounterGuard<TaskTracker>(tracker_) @@ -146,8 +147,9 @@ WorkQueueGroupImpl::~WorkQueueGroupImpl() { } void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function<void()> fn) { - platform::RecordEvent("WorkQueue::AddTask", - platform::TracerEventType::UserDefined, 10 /*level*/); + platform::RecordEvent record("WorkQueue::AddTask", + platform::TracerEventType::UserDefined, + 10 /*level*/); assert(queue_idx < queues_.size()); if (queues_options_.at(queue_idx).track_task) { fn = [ diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 
5b913ff2d21de51851f631d080b8c1283f29e54e..b088a535a12329f71d308417bb2d52a54466e967 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -916,6 +916,8 @@ void ParallelExecutor::BCastParamsToDevices( FetchResultType ParallelExecutor::Run( const std::vector<std::string> &fetch_tensors, bool return_merged) { + platform::RecordEvent record_run("ParallelExecutor::Run", + platform::TracerEventType::UserDefined, 1); VLOG(3) << "enter ParallelExecutor Run"; #ifdef PADDLE_WITH_CUDA if (platform::IsCUDAGraphCapturing()) { diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index f5e4941d787097b5e349c0b668d6c95fad137873..782062283e9859b0c5f63203acffd779ae492e39 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -48,8 +48,9 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( size_t unaligned_size) { - platform::RecordEvent("AutoGrowthBestFitAllocator::Allocate", - platform::TracerEventType::UserDefined, 9 /*level*/); + platform::RecordEvent record("AutoGrowthBestFitAllocator::Allocate", + platform::TracerEventType::UserDefined, + 9 /*level*/); size_t size = AlignedSize(unaligned_size, alignment_); VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size; @@ -111,8 +112,9 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( } void AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) { - platform::RecordEvent("AutoGrowthBestFitAllocator::Free", - platform::TracerEventType::UserDefined, 9 /*level*/); + platform::RecordEvent record("AutoGrowthBestFitAllocator::Free", + platform::TracerEventType::UserDefined, + 9 /*level*/); VLOG(10) << "Free " << allocation->size() << " bytes, ptr = " << allocation->ptr(); std::lock_guard<SpinLock> guard(spinlock_); diff 
--git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 82233fd4fe82141bb0ac3f891dfb5fc31c439154..80877cb670ba9cfc78ef872484e0c228313b3c4c 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -163,8 +163,9 @@ void StreamSafeCUDAAllocator::SetDefaultStream(gpuStream_t stream) { } phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { - platform::RecordEvent("StreamSafeCUDAAllocator::Allocate", - platform::TracerEventType::UserDefined, 9 /*level*/); + platform::RecordEvent record("StreamSafeCUDAAllocator::Allocate", + platform::TracerEventType::UserDefined, + 9 /*level*/); ProcessUnfreedAllocations(); VLOG(8) << "Try allocate " << size << " bytes"; AllocationPtr underlying_allocation; @@ -192,8 +193,9 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { } void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) { - platform::RecordEvent("StreamSafeCUDAAllocator::Free", - platform::TracerEventType::UserDefined, 9 /*level*/); + platform::RecordEvent record("StreamSafeCUDAAllocator::Free", + platform::TracerEventType::UserDefined, + 9 /*level*/); StreamSafeCUDAAllocation* stream_safe_cuda_allocation = static_cast<StreamSafeCUDAAllocation*>(allocation); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 96d86ee1a3100457000d410679c2ddff1be1a815..44abf3357d63d0a32722eaae67648c0c2ad2cf55 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2867,7 +2867,7 @@ All parameter, weight, gradient are variables in Paddle. [](StandaloneExecutor &self, std::vector<std::string> feed_names, std::vector<std::string> fetch_names) { platform::RecordEvent record_event( - "StandaloneExecutor:run", + "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1); paddle::framework::FetchList ret; {