提交 caf10b47 编写于 作者: X Xin Pan

make profiler use thread_id from g_thread_id

Add a few more RecordEvent.
Cleanup
上级 baff71d5
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif #endif
void AllReduceOpHandle::RunImpl() { void AllReduceOpHandle::RunImpl() {
platform::RecordEvent r("all_reduce", nullptr);
if (NoDummyInputSize() == 1) { if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1; return; // No need to all reduce when GPU count = 1;
} else { } else {
......
...@@ -16,12 +16,14 @@ ...@@ -16,12 +16,14 @@
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
void ReduceOpHandle::RunImpl() { void ReduceOpHandle::RunImpl() {
platform::RecordEvent r("reduce", nullptr);
if (places_.size() == 1) return; if (places_.size() == 1) return;
// the input and output may have dummy var. // the input and output may have dummy var.
auto in_var_handles = DynamicCast<VarHandle>(inputs_); auto in_var_handles = DynamicCast<VarHandle>(inputs_);
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( ...@@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
eptr = std::current_exception(); eptr = std::current_exception();
} }
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
drop_scope_counter_ += 1; drop_scope_counter_ += 1;
if (!fetch_tensors.empty() || if (!fetch_tensors.empty() ||
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/ssa_graph_builder.h" #include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -34,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( ...@@ -34,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
FeedFetchList ThreadedSSAGraphExecutor::Run( FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) { const std::vector<std::string> &fetch_tensors) {
std::unique_ptr<platform::RecordEvent> event(
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
std::unordered_map<OpHandleBase *, size_t> pending_ops; std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars; std::unordered_set<VarHandleBase *> pending_vars;
BlockingQueue<VarHandleBase *> ready_vars; BlockingQueue<VarHandleBase *> ready_vars;
...@@ -84,6 +87,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -84,6 +87,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Clean run context // Clean run context
run_op_futures_.clear(); run_op_futures_.clear();
exception_holder_.Clear(); exception_holder_.Clear();
event.reset(nullptr);
// Step 3. Execution // Step 3. Execution
while (!pending_vars.empty()) { while (!pending_vars.empty()) {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
the Apache License, Version 2.0 (the "License"); you may not use this file
except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto3";
package sendrecv;
option cc_generic_services = false;
service SendRecvService {
// For parameter server round-robin like hashing, do not split tensors.
// Send and recv only one tensor
// TODO(typhoonzero): add streaming API
rpc SendVariable(VariableMessage) returns (VoidMessage) {}
// Argument VariableMessage for GetVariable should only contain varname.
rpc GetVariable(VariableMessage) returns (VariableMessage) {}
// pre-fetch variable by given variable name and Ids
rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
}
// VariableMessage is serialized paddle variable message.
// It can be:
// LoDTensor
// SelectedRows
enum VarType {
LOD_TENSOR = 0;
SELECTED_ROWS = 1;
NCCL_ID = 2;
}
// NOTICE(gongwb):don't modify this proto if you are not
// not familar with how we serialize in sendrecvop_utils.h
// and deserilize it in variable_response.h.
message VariableMessage {
enum Type {
// Pod Types
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
message LodData { repeated int64 lod_data = 1; }
string varname = 1;
// TODO(Yancey1989): reference framework::proto::VarDesc::VarType
VarType type = 2;
// bool persistable is not needed for sending.
// tensor info:
Type data_type = 3;
repeated int64 dims = 4;
// lod details:
int64 lod_level = 5;
repeated LodData lod = 6;
// selected_rows height, aka. original dim0
int64 slr_height = 7;
// tensor data
bytes serialized = 8;
// selected_rows data
bytes rows = 9;
// Look up table block execution output variable name.
string out_varname = 10;
// If 1, the ps server will start profiling, the ps
// server stops profiling and generates a profile to /tmp/profile_ps_*
// when profile switches from 1 to 2.
int64 profile = 11;
}
message VoidMessage {}
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -166,8 +165,6 @@ class ParallelDoOp : public framework::OperatorBase { ...@@ -166,8 +165,6 @@ class ParallelDoOp : public framework::OperatorBase {
workers.emplace_back( workers.emplace_back(
framework::Async([program, cur_scope, place, block, place_idx] { framework::Async([program, cur_scope, place, block, place_idx] {
// Give the thread an id to distinguish parallel block with same id.
platform::RecordThread rt(static_cast<int>(place_idx) + 1);
framework::Executor executor(place); framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(), executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/); false /*create_local_scope*/);
...@@ -244,8 +241,6 @@ class ParallelDoGradOp : public framework::OperatorBase { ...@@ -244,8 +241,6 @@ class ParallelDoGradOp : public framework::OperatorBase {
// execute // execute
workers.emplace_back( workers.emplace_back(
framework::Async([program, cur_scope, place, block, i] { framework::Async([program, cur_scope, place, block, i] {
// Give the thread an id to distinguish parallel block with same id.
platform::RecordThread rt(static_cast<int>(i) + 1);
framework::Executor executor(place); framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(), executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/); false /*create_local_scope*/);
......
...@@ -30,9 +30,6 @@ limitations under the License. */ ...@@ -30,9 +30,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace { namespace {
// Current thread's id. Note, we don't distinguish nested threads
// for now.
thread_local int cur_thread_id = 0;
// Tracking the nested block stacks of each thread. // Tracking the nested block stacks of each thread.
thread_local std::deque<int> block_id_stack; thread_local std::deque<int> block_id_stack;
// Tracking the nested event stacks. // Tracking the nested event stacks.
...@@ -413,12 +410,5 @@ void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); } ...@@ -413,12 +410,5 @@ void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
void ClearCurBlock() { block_id_stack.pop_back(); } void ClearCurBlock() { block_id_stack.pop_back(); }
int BlockDepth() { return block_id_stack.size(); } int BlockDepth() { return block_id_stack.size(); }
void SetCurThread(int thread_id) { cur_thread_id = thread_id; }
void ClearCurThread() { cur_thread_id = 0; }
int CurThread() { return cur_thread_id; }
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -99,9 +99,5 @@ std::string CurAnnotation(); ...@@ -99,9 +99,5 @@ std::string CurAnnotation();
void SetCurBlock(int block_id); void SetCurBlock(int block_id);
void ClearCurBlock(); void ClearCurBlock();
int BlockDepth(); int BlockDepth();
void SetCurThread(int thread_id);
void ClearCurThread();
int CurThread();
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -190,7 +190,7 @@ RecordEvent::~RecordEvent() { ...@@ -190,7 +190,7 @@ RecordEvent::~RecordEvent() {
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer* tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
BlockDepth(), CurThread()); BlockDepth(), g_thread_id);
} }
ClearCurAnnotation(); ClearCurAnnotation();
PopEvent(name_, dev_ctx_); PopEvent(name_, dev_ctx_);
...@@ -211,21 +211,11 @@ RecordBlock::~RecordBlock() { ...@@ -211,21 +211,11 @@ RecordBlock::~RecordBlock() {
// We try to put all blocks at the same nested depth in the // We try to put all blocks at the same nested depth in the
// same timeline lane. and distinguish the using thread_id. // same timeline lane. and distinguish the using thread_id.
tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(), tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
CurThread()); g_thread_id);
} }
ClearCurBlock(); ClearCurBlock();
} }
RecordThread::RecordThread(int thread_id) {
if (g_state == ProfilerState::kDisabled) return;
SetCurThread(thread_id);
}
RecordThread::~RecordThread() {
if (g_state == ProfilerState::kDisabled) return;
ClearCurThread();
}
void EnableProfiler(ProfilerState state) { void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled, PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enbale profling, since the input state is ", "Can't enbale profling, since the input state is ",
......
...@@ -95,11 +95,6 @@ struct RecordBlock { ...@@ -95,11 +95,6 @@ struct RecordBlock {
uint64_t start_ns_; uint64_t start_ns_;
}; };
struct RecordThread {
explicit RecordThread(int thread_id);
~RecordThread();
};
// Return the event list of all threads. Assumed the returned value calls // Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> GetAllEvents(); std::vector<std::vector<Event>> GetAllEvents();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册