diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md
index 5d061e1c00d2ca0194153730a39486b8357fa5b0..faf39f276dbddcd4961407ba2d082c9826051cbe 100644
--- a/doc/fluid/howto/optimization/timeline_cn.md
+++ b/doc/fluid/howto/optimization/timeline_cn.md
@@ -1,21 +1,27 @@
 # How to use the timeline tool for performance profiling
 
-1. Wrap the main training loop with `with profiler.profiler(...)`. After it runs, the code generates a profile record file under `/tmp/profile`.
+1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After it runs, the code generates a profile record file under `/tmp/profile`.
 
 **Tip:** Do not run too many iterations while recording timeline information, because the number of records in the timeline is proportional to the number of iterations.
 
 ```python
-    with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-        for pass_id in range(pass_num):
-            for batch_id, data in enumerate(train_reader()):
-                exe.run(fluid.default_main_program(),
-                        feed=feeder.feed(data),
-                        fetch_list=[])
+    for pass_id in range(pass_num):
+        for batch_id, data in enumerate(train_reader()):
+            if pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile")
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
 ...
 ```
 
 1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`. By default it generates a `/tmp/timeline` file; you can change this path with a command-line parameter, see [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py).
 
+```python
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
 1. Open the Chrome browser, visit <chrome://tracing/>, and use the `load` button to load the generated `timeline` file.
 
diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md
index 96481ae2a6e4442d40803f8d5361e5f942502df3..6f963c6b4da6967fb2f493ada917a4b08917fa4c 100644
--- a/doc/fluid/howto/optimization/timeline_en.md
+++ b/doc/fluid/howto/optimization/timeline_en.md
@@ -1,15 +1,17 @@
 # how to use timeline tool to do profile
 
-1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
+1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, because the profile record grows with the number of batches.
 
 ```python
-    with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-        for pass_id in range(pass_num):
-            for batch_id, data in enumerate(train_reader()):
-                exe.run(fluid.default_main_program(),
-                        feed=feeder.feed(data),
-                        fetch_list=[],
-                        use_program_cache=True)
+    for pass_id in range(pass_num):
+        for batch_id, data in enumerate(train_reader()):
+            if pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile")
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
 ...
 ```
 
@@ -17,6 +19,10 @@
 file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
 [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
 
+```python
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
 1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
 
 ![chrome tracing](./tracing.jpeg)
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 700c73c745bad72637d77385f5cd38c494501c86..bf493a3fa44e48deec734250d04b2a413c3ed9da 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -17,6 +17,7 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 void AllReduceOpHandle::RunImpl() {
+  platform::RecordEvent r("all_reduce", nullptr);
   if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
   } else {
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 7160e346dad0615e2fd32b70c096880af0359e1a..68bdfbaf52120d19d05d156529626f42adda630d 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -16,12 +16,14 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
 void ReduceOpHandle::RunImpl() {
+  platform::RecordEvent r("reduce", nullptr);
   if (places_.size() == 1) return;
   // the input and output may have dummy var.
   auto in_var_handles = DynamicCast<VarHandle>(inputs_);
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 1d80bab90f513139f807b57258177c6b2ac53ac0..5bd974d6b789a2f085c0a69de5e133187342f587 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     eptr = std::current_exception();
   }
 
+  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
   drop_scope_counter_ += 1;
   if (!fetch_tensors.empty() ||
       drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index e556c84b0219eba2b92c456c205e03947171626b..0eaf9a9c951991a5775604eb8d0e7535f81a4ae2 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -34,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
 
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
+  std::unique_ptr<platform::RecordEvent> event(
+      new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
   BlockingQueue<VarHandleBase *> ready_vars;
@@ -84,6 +87,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   // Clean run context
   run_op_futures_.clear();
   exception_holder_.Clear();
+  event.reset(nullptr);
 
   // Step 3. Execution
   while (!pending_vars.empty()) {
diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc
index c9744db3d0654ef63357963d9a9a3cb946f56e2d..916cdad3fd288d1f3ffb19bc769ab827dd1e9103 100644
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -166,8 +165,6 @@ class ParallelDoOp : public framework::OperatorBase {
 
     workers.emplace_back(
         framework::Async([program, cur_scope, place, block, place_idx] {
-          // Give the thread an id to distinguish parallel block with same id.
-          platform::RecordThread rt(static_cast<int>(place_idx) + 1);
          framework::Executor executor(place);
          executor.Run(*program, cur_scope, block->ID(),
                       false /*create_local_scope*/);
@@ -244,8 +241,6 @@ class ParallelDoGradOp : public framework::OperatorBase {
       // execute
       workers.emplace_back(
           framework::Async([program, cur_scope, place, block, i] {
-            // Give the thread an id to distinguish parallel block with same id.
-            platform::RecordThread rt(static_cast<int>(i) + 1);
            framework::Executor executor(place);
            executor.Run(*program, cur_scope, block->ID(),
                         false /*create_local_scope*/);
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index d9e2afadaf8ec439d158e57c94d3e6e684bce116..8fa8dbd67c936439840cffa073b6fa6693dd31a1 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -30,9 +30,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 namespace {
-// Current thread's id. Note, we don't distinguish nested threads
-// for now.
-thread_local int cur_thread_id = 0;
 // Tracking the nested block stacks of each thread.
 thread_local std::deque<int> block_id_stack;
 // Tracking the nested event stacks.
@@ -413,12 +410,5 @@ void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
 void ClearCurBlock() { block_id_stack.pop_back(); }
 
 int BlockDepth() { return block_id_stack.size(); }
-
-void SetCurThread(int thread_id) { cur_thread_id = thread_id; }
-
-void ClearCurThread() { cur_thread_id = 0; }
-
-int CurThread() { return cur_thread_id; }
-
 } // namespace platform
 } // namespace paddle
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index 0375c7439c29d4122e8ff6b58734dad4f504b7a2..d2a571f4345b544ad5e74f4629c3967593d6d628 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -99,9 +99,5 @@ std::string CurAnnotation();
 void SetCurBlock(int block_id);
 void ClearCurBlock();
 int BlockDepth();
-
-void SetCurThread(int thread_id);
-void ClearCurThread();
-int CurThread();
 } // namespace platform
 } // namespace paddle
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 01de9d7041bf3eb40884e2a6295027cccfaebd2a..f4a0e2a86134096985ba169e3e28d9980f21c83c 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -190,7 +190,7 @@ RecordEvent::~RecordEvent() {
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer) {
     tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
-                          BlockDepth(), CurThread());
+                          BlockDepth(), g_thread_id);
   }
   ClearCurAnnotation();
   PopEvent(name_, dev_ctx_);
@@ -211,21 +211,11 @@ RecordBlock::~RecordBlock() {
     // We try to put all blocks at the same nested depth in the
     // same timeline lane. and distinguish the using thread_id.
     tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
-                          CurThread());
+                          g_thread_id);
   }
   ClearCurBlock();
 }
 
-RecordThread::RecordThread(int thread_id) {
-  if (g_state == ProfilerState::kDisabled) return;
-  SetCurThread(thread_id);
-}
-
-RecordThread::~RecordThread() {
-  if (g_state == ProfilerState::kDisabled) return;
-  ClearCurThread();
-}
-
 void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enbale profling, since the input state is ",
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index bf43925373a12cd9ff2155d68c42d0266ba4df60..c99d9c807d1bfb45d1ce0725b84b9fff09049511 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -95,11 +95,6 @@ struct RecordBlock {
   uint64_t start_ns_;
 };
 
-struct RecordThread {
-  explicit RecordThread(int thread_id);
-  ~RecordThread();
-};
-
 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
 std::vector<std::vector<Event>> GetAllEvents();
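
The handle-level changes above all rely on the same scoped-event idiom: constructing a `platform::RecordEvent` marks the start of an interval, and its destructor closes the interval and hands it to the device tracer together with `BlockDepth()` and `g_thread_id` (the `nullptr` argument in the patch is the device context). Below is a minimal, standalone sketch of that idiom; `ScopedEvent` is a hypothetical stand-in for illustration only, not Paddle's actual `RecordEvent`.

```cpp
// Standalone sketch of the scoped-event idiom used by this patch.
// ScopedEvent is a hypothetical stand-in for platform::RecordEvent; the real
// class also records into the device tracer with BlockDepth() and g_thread_id.
#include <chrono>
#include <cstdio>
#include <memory>
#include <string>
#include <utility>

class ScopedEvent {
 public:
  explicit ScopedEvent(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}

  // The destructor closes the interval, much like RecordEvent::~RecordEvent()
  // reporting start_ns_ .. PosixInNsec() when the object goes out of scope.
  ~ScopedEvent() {
    auto end = std::chrono::steady_clock::now();
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(end - start_).count();
    std::printf("[event] %s took %lld us\n", name_.c_str(),
                static_cast<long long>(us));
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  {
    // Plain scoped usage, mirroring `platform::RecordEvent r("all_reduce", nullptr);`
    ScopedEvent r("all_reduce");
    // ... work to be timed ...
  }

  // unique_ptr usage, mirroring ThreadedSSAGraphExecutor::Run: the event is
  // ended early with reset(nullptr) so that only the prepare phase is timed.
  std::unique_ptr<ScopedEvent> event(new ScopedEvent("ThreadedSSAGraphExecutorPrepare"));
  // ... build pending_ops / pending_vars ...
  event.reset(nullptr);  // measurement stops here
  // ... execution continues, unmeasured ...
  return 0;
}
```

Holding the event in a `std::unique_ptr`, as `ThreadedSSAGraphExecutor::Run` does, lets the measurement end as soon as preparation finishes rather than when the whole function returns.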