diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md
index 5d061e1c00d2ca0194153730a39486b8357fa5b0..faf39f276dbddcd4961407ba2d082c9826051cbe 100644
--- a/doc/fluid/howto/optimization/timeline_cn.md
+++ b/doc/fluid/howto/optimization/timeline_cn.md
@@ -1,21 +1,27 @@
 # How to use the timeline tool for performance analysis
 
-1. Wrap the main training loop in `with profiler.profiler(...)`. After running, the code generates a profile record file under `/tmp/profile`.
+1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After running, the code generates a profile record file under `/tmp/profile`.
 
 **Tip:** Do not run too many iterations while the timeline is recording, because the number of records in the timeline grows in proportion to the number of iterations.
 
 ```python
-with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-    for pass_id in range(pass_num):
-        for batch_id, data in enumerate(train_reader()):
-            exe.run(fluid.default_main_program(),
-                    feed=feeder.feed(data),
-                    fetch_list=[])
+for pass_id in range(pass_num):
+    for batch_id, data in enumerate(train_reader()):
+        if pass_id == 0 and batch_id == 5:
+            profiler.start_profiler("All")
+        elif pass_id == 0 and batch_id == 10:
+            profiler.stop_profiler("total", "/tmp/profile")
+        exe.run(fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[])
             ...
 ```
 
 1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`. By default it generates a `/tmp/timeline` file; the path can be changed via command-line arguments, see [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py).
 
+```bash
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
 
 1. Open the Chrome browser, visit <chrome://tracing/>, and use the `load` button to load the generated `timeline` file.
diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md
index 96481ae2a6e4442d40803f8d5361e5f942502df3..6f963c6b4da6967fb2f493ada917a4b08917fa4c 100644
--- a/doc/fluid/howto/optimization/timeline_en.md
+++ b/doc/fluid/howto/optimization/timeline_en.md
@@ -1,15 +1,17 @@
 # how to use timeline tool to do profile
 
-1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
+1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, because the profile record grows with the number of batches.
 
 ```python
-with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-    for pass_id in range(pass_num):
-        for batch_id, data in enumerate(train_reader()):
-            exe.run(fluid.default_main_program(),
-                    feed=feeder.feed(data),
-                    fetch_list=[],
-                    use_program_cache=True)
+for pass_id in range(pass_num):
+    for batch_id, data in enumerate(train_reader()):
+        if pass_id == 0 and batch_id == 5:
+            profiler.start_profiler("All")
+        elif pass_id == 0 and batch_id == 10:
+            profiler.stop_profiler("total", "/tmp/profile")
+        exe.run(fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[])
             ...
 ```
 
@@ -17,6 +19,10 @@ file `/tmp/timeline` by default. You can change the path by cmd parameter,
 please take a look at
 [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)
 for details.
 
+```bash
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
 1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
 
 ![chrome tracing](./tracing.jpeg)
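For readers who want to try the documented pattern outside a full training script, here is a minimal, self-contained sketch of the profiling window described in both docs above. It is an illustration, not part of this patch: the one-layer network, the random feed data, and the batch indices are placeholder assumptions; only the `profiler.start_profiler` / `profiler.stop_profiler` calls mirror the documented API.

```python
import numpy
import paddle.fluid as fluid
from paddle.fluid import profiler

# Toy program standing in for a real model: one fully-connected layer on CPU.
x = fluid.layers.data(name='x', shape=[1], dtype='float32')
y = fluid.layers.fc(input=x, size=1)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

for batch_id in range(20):
    if batch_id == 5:
        # Skip a few warm-up batches, then record both CPU and GPU events.
        profiler.start_profiler("All")
    elif batch_id == 10:
        # Stop after five recorded batches and write /tmp/profile.
        profiler.stop_profiler("total", "/tmp/profile")
    exe.run(fluid.default_main_program(),
            feed={'x': numpy.random.random((4, 1)).astype('float32')},
            fetch_list=[y])
```

Processing the resulting `/tmp/profile` with `timeline.py` and loading the output in `chrome://tracing` then shows only the five recorded batches, which keeps the timeline small as both docs advise.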
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 700c73c745bad72637d77385f5cd38c494501c86..bf493a3fa44e48deec734250d04b2a413c3ed9da 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -17,6 +17,7 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 void AllReduceOpHandle::RunImpl() {
+  platform::RecordEvent r("all_reduce", nullptr);
   if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
   } else {
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 7160e346dad0615e2fd32b70c096880af0359e1a..68bdfbaf52120d19d05d156529626f42adda630d 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -16,12 +16,14 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
 void ReduceOpHandle::RunImpl() {
+  platform::RecordEvent r("reduce", nullptr);
   if (places_.size() == 1) return;
   // the input and output may have dummy var.
   auto in_var_handles = DynamicCast<VarHandle>(inputs_);
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 1d80bab90f513139f807b57258177c6b2ac53ac0..5bd974d6b789a2f085c0a69de5e133187342f587 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     eptr = std::current_exception();
   }
 
+  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
   drop_scope_counter_ += 1;
   if (!fetch_tensors.empty() ||
       drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
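The `platform::RecordEvent` objects added above work by RAII: construction starts the event and the destructor ends it (the `profiler.cc` hunk further down shows the destructor calling `tracer->AddCPURecords(...)`), so a single declaration at the top of `RunImpl` times the entire function on every return path. A simplified stand-in, with plain wall-clock timing instead of the real device tracer, looks like this:

```cpp
#include <chrono>
#include <iostream>
#include <string>
#include <utility>

// Simplified stand-in for platform::RecordEvent: the constructor starts a
// timer, the destructor reports the elapsed time when the object leaves scope.
class ScopedEvent {
 public:
  explicit ScopedEvent(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedEvent() {
    auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    std::cout << name_ << ": " << ns << " ns\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

void RunImpl() {
  ScopedEvent r("all_reduce");  // times everything below, on every return path
  // ... the actual reduction work would go here ...
}

int main() { RunImpl(); }
```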
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index e556c84b0219eba2b92c456c205e03947171626b..0eaf9a9c951991a5775604eb8d0e7535f81a4ae2 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -34,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
 
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
+  std::unique_ptr<platform::RecordEvent> event(
+      new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
   BlockingQueue<VarHandleBase *> ready_vars;
@@ -84,6 +87,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   // Clean run context
   run_op_futures_.clear();
   exception_holder_.Clear();
+  event.reset(nullptr);
 
   // Step 3. Execution
   while (!pending_vars.empty()) {
diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc
index c9744db3d0654ef63357963d9a9a3cb946f56e2d..916cdad3fd288d1f3ffb19bc769ab827dd1e9103 100644
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -166,8 +165,6 @@ class ParallelDoOp : public framework::OperatorBase {
 
       workers.emplace_back(
           framework::Async([program, cur_scope, place, block, place_idx] {
-            // Give the thread an id to distinguish parallel block with same id.
-            platform::RecordThread rt(static_cast<int>(place_idx) + 1);
             framework::Executor executor(place);
             executor.Run(*program, cur_scope, block->ID(),
                          false /*create_local_scope*/);
@@ -244,8 +241,6 @@ class ParallelDoGradOp : public framework::OperatorBase {
     // execute
       workers.emplace_back(
           framework::Async([program, cur_scope, place, block, i] {
-            // Give the thread an id to distinguish parallel block with same id.
-            platform::RecordThread rt(static_cast<int>(i) + 1);
             framework::Executor executor(place);
             executor.Run(*program, cur_scope, block->ID(),
                          false /*create_local_scope*/);
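The `ThreadedSSAGraphExecutor::Run` hunks above use a variation on the same idea: the event is held in a `std::unique_ptr` and ended explicitly with `event.reset(nullptr)` once preparation is done, so only the prepare phase is timed and the long execution loop stays outside the event. A compact sketch of that pattern, again with a stand-in timer rather than the real profiler:

```cpp
#include <chrono>
#include <iostream>
#include <memory>
#include <string>
#include <utility>

// Same kind of stand-in timer as in the previous sketch.
struct ScopedEvent {
  explicit ScopedEvent(std::string n)
      : name(std::move(n)), start(std::chrono::steady_clock::now()) {}
  ~ScopedEvent() {
    std::cout << name << ": "
              << std::chrono::duration_cast<std::chrono::nanoseconds>(
                     std::chrono::steady_clock::now() - start)
                     .count()
              << " ns\n";
  }
  std::string name;
  std::chrono::steady_clock::time_point start;
};

void Run() {
  // Held by pointer so the event can be ended before the function returns.
  std::unique_ptr<ScopedEvent> event(new ScopedEvent("prepare"));
  // ... build pending_ops / pending_vars, i.e. the prepare phase ...
  event.reset(nullptr);  // destructor fires here: only preparation is timed
  // ... the execution loop runs outside the event ...
}

int main() { Run(); }
```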
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index d9e2afadaf8ec439d158e57c94d3e6e684bce116..8fa8dbd67c936439840cffa073b6fa6693dd31a1 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -30,9 +30,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 namespace {
-// Current thread's id. Note, we don't distinguish nested threads
-// for now.
-thread_local int cur_thread_id = 0;
 // Tracking the nested block stacks of each thread.
 thread_local std::deque<int> block_id_stack;
 // Tracking the nested event stacks.
@@ -413,12 +410,5 @@
 void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
 void ClearCurBlock() { block_id_stack.pop_back(); }
 int BlockDepth() { return block_id_stack.size(); }
-
-void SetCurThread(int thread_id) { cur_thread_id = thread_id; }
-
-void ClearCurThread() { cur_thread_id = 0; }
-
-int CurThread() { return cur_thread_id; }
-
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index 0375c7439c29d4122e8ff6b58734dad4f504b7a2..d2a571f4345b544ad5e74f4629c3967593d6d628 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -99,9 +99,5 @@ std::string CurAnnotation();
 void SetCurBlock(int block_id);
 void ClearCurBlock();
 int BlockDepth();
-
-void SetCurThread(int thread_id);
-void ClearCurThread();
-int CurThread();
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 75cae95f83b2b9c50a05acd48d7c93198e3b6b49..fe811e025062eb6b6a0f3c7bcc3b37e55fd26d37 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -192,7 +192,7 @@ RecordEvent::~RecordEvent() {
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer) {
     tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
-                          BlockDepth(), CurThread());
+                          BlockDepth(), g_thread_id);
   }
   ClearCurAnnotation();
   PopEvent(name_, dev_ctx_);
@@ -215,23 +215,11 @@ RecordBlock::~RecordBlock() {
     // We try to put all blocks at the same nested depth in the
     // same timeline lane. and distinguish the using thread_id.
     tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
-                          CurThread());
+                          g_thread_id);
   }
   ClearCurBlock();
 }
 
-RecordThread::RecordThread(int thread_id) {
-  std::lock_guard<std::mutex> l(profiler_mu);
-  if (g_state == ProfilerState::kDisabled) return;
-  SetCurThread(thread_id);
-}
-
-RecordThread::~RecordThread() {
-  std::lock_guard<std::mutex> l(profiler_mu);
-  if (g_state == ProfilerState::kDisabled) return;
-  ClearCurThread();
-}
-
 void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enbale profling, since the input state is ",
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index bf43925373a12cd9ff2155d68c42d0266ba4df60..c99d9c807d1bfb45d1ce0725b84b9fff09049511 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -95,11 +95,6 @@ struct RecordBlock {
   uint64_t start_ns_;
 };
 
-struct RecordThread {
-  explicit RecordThread(int thread_id);
-  ~RecordThread();
-};
-
 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
 std::vector<std::vector<Event>> GetAllEvents();