diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md
index 5d061e1c00d2ca0194153730a39486b8357fa5b0..faf39f276dbddcd4961407ba2d082c9826051cbe 100644
--- a/doc/fluid/howto/optimization/timeline_cn.md
+++ b/doc/fluid/howto/optimization/timeline_cn.md
@@ -1,21 +1,27 @@
# 如何使用timeline工具做性能分析
-1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。
+1. 在训练的主循环中加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后,代码会生成一个profile记录文件`/tmp/profile`。
**提示:**
请不要在timeline记录信息时运行太多次迭代,因为timeline中的记录数量和迭代次数是成正比的。
```python
-    with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-        for pass_id in range(pass_num):
-            for batch_id, data in enumerate(train_reader()):
-                exe.run(fluid.default_main_program(),
-                        feed=feeder.feed(data),
-                        fetch_list=[])
+    for pass_id in range(pass_num):
+        for batch_id, data in enumerate(train_reader()):
+            if pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile")
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
...
```
1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`,这个程序默认会生成一个`/tmp/timeline`文件,你也可以用命令行参数来修改这个路径,请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。
+```bash
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
1. 打开chrome浏览器,访问 chrome://tracing/ ,用`load`按钮来加载生成的`timeline`文件。
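For readers who want to try the instrumented loop end to end, here is a minimal, self-contained sketch of the pattern added in the documentation above. It assumes the Fluid-era Python API (`paddle.fluid` and `paddle.fluid.profiler`); the tiny network, the random input data, and the batch counts are hypothetical placeholders for illustration, and only the `profiler.start_profiler` / `profiler.stop_profiler` calls mirror the documented change.

```python
# Minimal sketch of the start/stop profiler usage described above.
# The network and data are placeholders; only the profiler calls
# mirror the documented pattern.
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

# A tiny model: one fully-connected layer reduced to a scalar "loss".
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(y)
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

for pass_id in range(1):
    for batch_id in range(20):
        # Record only batches [5, 10) of the first pass so the
        # generated /tmp/profile file stays small.
        if pass_id == 0 and batch_id == 5:
            profiler.start_profiler("All")
        elif pass_id == 0 and batch_id == 10:
            profiler.stop_profiler("total", "/tmp/profile")
        data = np.random.random((32, 13)).astype('float32')
        exe.run(fluid.default_main_program(),
                feed={'x': data},
                fetch_list=[loss])
```

The resulting `/tmp/profile` file is the input expected by `tools/timeline.py` in the next step.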
diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md
index 96481ae2a6e4442d40803f8d5361e5f942502df3..6f963c6b4da6967fb2f493ada917a4b08917fa4c 100644
--- a/doc/fluid/howto/optimization/timeline_en.md
+++ b/doc/fluid/howto/optimization/timeline_en.md
@@ -1,15 +1,17 @@
# How to use the timeline tool to do profiling
-1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
+1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, because the size of the profile record grows with the number of batches.
```python
-    with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-        for pass_id in range(pass_num):
-            for batch_id, data in enumerate(train_reader()):
-                exe.run(fluid.default_main_program(),
-                        feed=feeder.feed(data),
-                        fetch_list=[],
-                        use_program_cache=True)
+    for pass_id in range(pass_num):
+        for batch_id, data in enumerate(train_reader()):
+            if pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile")
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
...
```
@@ -17,6 +19,10 @@
file `/tmp/timeline` by default. You can change the path with a command-line parameter; please take a look at
[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
+```bash
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
1. Open Chrome, visit chrome://tracing/, and use the `load` button to load the generated `timeline` file.
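As a quick sanity check before loading the trace in Chrome, the file written by `tools/timeline.py` can be inspected directly. The sketch below assumes the output follows the Chrome trace-event JSON format (a top-level `traceEvents` list) and that it was written to `timeline`, matching the `--timeline_path` value used above.

```python
# Sanity-check the trace file produced by tools/timeline.py.
# Assumes Chrome trace-event JSON with a top-level "traceEvents" list.
import json

with open("timeline") as f:
    trace = json.load(f)

events = trace.get("traceEvents", [])
print("number of trace events:", len(events))
# Print a few events to confirm the profile captured something useful.
for ev in events[:5]:
    print(ev.get("name"), ev.get("ph"), ev.get("ts"))
```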
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 700c73c745bad72637d77385f5cd38c494501c86..bf493a3fa44e48deec734250d04b2a413c3ed9da 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -17,6 +17,7 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
@@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void AllReduceOpHandle::RunImpl() {
+ platform::RecordEvent r("all_reduce", nullptr);
if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1;
} else {
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 7160e346dad0615e2fd32b70c096880af0359e1a..68bdfbaf52120d19d05d156529626f42adda630d 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -16,12 +16,14 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
namespace details {
void ReduceOpHandle::RunImpl() {
+ platform::RecordEvent r("reduce", nullptr);
if (places_.size() == 1) return;
// the input and output may have dummy var.
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 1d80bab90f513139f807b57258177c6b2ac53ac0..5bd974d6b789a2f085c0a69de5e133187342f587 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -17,6 +17,7 @@
#include
#include
#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
@@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
eptr = std::current_exception();
}
+ platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
drop_scope_counter_ += 1;
if (!fetch_tensors.empty() ||
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index e556c84b0219eba2b92c456c205e03947171626b..0eaf9a9c951991a5775604eb8d0e7535f81a4ae2 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
@@ -34,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) {
+  std::unique_ptr<platform::RecordEvent> event(
+      new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
BlockingQueue<VarHandleBase *> ready_vars;
@@ -84,6 +87,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Clean run context
run_op_futures_.clear();
exception_holder_.Clear();
+ event.reset(nullptr);
// Step 3. Execution
while (!pending_vars.empty()) {
diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc
index c9744db3d0654ef63357963d9a9a3cb946f56e2d..916cdad3fd288d1f3ffb19bc769ab827dd1e9103 100644
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -18,7 +18,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
@@ -166,8 +165,6 @@ class ParallelDoOp : public framework::OperatorBase {
workers.emplace_back(
framework::Async([program, cur_scope, place, block, place_idx] {
- // Give the thread an id to distinguish parallel block with same id.
- platform::RecordThread rt(static_cast<int>(place_idx) + 1);
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
@@ -244,8 +241,6 @@ class ParallelDoGradOp : public framework::OperatorBase {
// execute
workers.emplace_back(
framework::Async([program, cur_scope, place, block, i] {
- // Give the thread an id to distinguish parallel block with same id.
- platform::RecordThread rt(static_cast<int>(i) + 1);
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index d9e2afadaf8ec439d158e57c94d3e6e684bce116..8fa8dbd67c936439840cffa073b6fa6693dd31a1 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -30,9 +30,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
namespace {
-// Current thread's id. Note, we don't distinguish nested threads
-// for now.
-thread_local int cur_thread_id = 0;
// Tracking the nested block stacks of each thread.
thread_local std::deque<int> block_id_stack;
// Tracking the nested event stacks.
@@ -413,12 +410,5 @@ void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
void ClearCurBlock() { block_id_stack.pop_back(); }
int BlockDepth() { return block_id_stack.size(); }
-
-void SetCurThread(int thread_id) { cur_thread_id = thread_id; }
-
-void ClearCurThread() { cur_thread_id = 0; }
-
-int CurThread() { return cur_thread_id; }
-
} // namespace platform
} // namespace paddle
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index 0375c7439c29d4122e8ff6b58734dad4f504b7a2..d2a571f4345b544ad5e74f4629c3967593d6d628 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -99,9 +99,5 @@ std::string CurAnnotation();
void SetCurBlock(int block_id);
void ClearCurBlock();
int BlockDepth();
-
-void SetCurThread(int thread_id);
-void ClearCurThread();
-int CurThread();
} // namespace platform
} // namespace paddle
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 75cae95f83b2b9c50a05acd48d7c93198e3b6b49..fe811e025062eb6b6a0f3c7bcc3b37e55fd26d37 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -192,7 +192,7 @@ RecordEvent::~RecordEvent() {
DeviceTracer* tracer = GetDeviceTracer();
if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
- BlockDepth(), CurThread());
+ BlockDepth(), g_thread_id);
}
ClearCurAnnotation();
PopEvent(name_, dev_ctx_);
@@ -215,23 +215,11 @@ RecordBlock::~RecordBlock() {
// We try to put all blocks at the same nested depth in the
// same timeline lane. and distinguish the using thread_id.
tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
- CurThread());
+ g_thread_id);
}
ClearCurBlock();
}
-RecordThread::RecordThread(int thread_id) {
- std::lock_guard<std::mutex> l(profiler_mu);
- if (g_state == ProfilerState::kDisabled) return;
- SetCurThread(thread_id);
-}
-
-RecordThread::~RecordThread() {
- std::lock_guard<std::mutex> l(profiler_mu);
- if (g_state == ProfilerState::kDisabled) return;
- ClearCurThread();
-}
-
void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enbale profling, since the input state is ",
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index bf43925373a12cd9ff2155d68c42d0266ba4df60..c99d9c807d1bfb45d1ce0725b84b9fff09049511 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -95,11 +95,6 @@ struct RecordBlock {
uint64_t start_ns_;
};
-struct RecordThread {
- explicit RecordThread(int thread_id);
- ~RecordThread();
-};
-
// Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> GetAllEvents();