未验证 提交 a83e4704 编写于 作者: D Dun 提交者: GitHub

Profiler refine and add CUDA runtime api tracer (#15301)

* refine profiler && add runtime tracer

* test=develop

* test=develop

* test=develop

* test=develop

* test=develop

* test=develop

* test=develop

* test=develop

* fix bug && test=develop

* add thread id map && test=develop

* test=develop

* testing

* bug fix

* remove cuda event && refine code && test=develop

* test=develop

* test=develop

* test=develop

* fix windows temp file && test=develop

* test=develop

* fix windows bug && test=develop

* fix start up issue && test=develop

* code polish &&  test=develop

* remove unused code && test=develop

* add some cupti cbid && test=develop

* add FLAGS_multiple_of_cupti_buffer_size && test=develop

* fix compile error && test=develop

* add keyword && test=develop

* fix && test=develop

* code polish && test=develop
上级 c0d33f13
...@@ -53,7 +53,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -53,7 +53,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif #endif
void AllReduceOpHandle::RunImpl() { void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); platform::RecordEvent record_event(Name());
WaitInputVarGenerated(); WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs()); auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
......
...@@ -22,7 +22,7 @@ namespace framework { ...@@ -22,7 +22,7 @@ namespace framework {
namespace details { namespace details {
void BroadcastOpHandle::RunImpl() { void BroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); platform::RecordEvent record_event(Name());
if (places_.size() == 1) return; if (places_.size() == 1) return;
......
...@@ -22,7 +22,7 @@ namespace framework { ...@@ -22,7 +22,7 @@ namespace framework {
namespace details { namespace details {
void FusedBroadcastOpHandle::RunImpl() { void FusedBroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); platform::RecordEvent record_event(Name());
if (places_.size() == 1UL) return; if (places_.size() == 1UL) return;
......
...@@ -139,7 +139,7 @@ void ReduceOpHandle::GatherSelectedRows( ...@@ -139,7 +139,7 @@ void ReduceOpHandle::GatherSelectedRows(
#endif #endif
void ReduceOpHandle::RunImpl() { void ReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); platform::RecordEvent record_event(Name());
if (places_.size() == 1) return; if (places_.size() == 1) return;
// the input and output may have dummy var. // the input and output may have dummy var.
......
...@@ -63,7 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( ...@@ -63,7 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
eptr = std::current_exception(); eptr = std::current_exception();
} }
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun");
++drop_scope_counter_; ++drop_scope_counter_;
bool stream_end = false; bool stream_end = false;
......
...@@ -37,7 +37,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( ...@@ -37,7 +37,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
FeedFetchList ThreadedSSAGraphExecutor::Run( FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) { const std::vector<std::string> &fetch_tensors) {
std::unique_ptr<platform::RecordEvent> event( std::unique_ptr<platform::RecordEvent> event(
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr)); new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
std::unordered_map<OpHandleBase *, size_t> pending_ops; std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars; std::unordered_set<VarHandleBase *> pending_vars;
auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>(); auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>();
......
...@@ -177,9 +177,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -177,9 +177,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
// in concurrency scenerio. Here use an `if` to fix this issue. // in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern. // Please not remove the `if`, ask @Superjomn if there are any concern.
if (platform::IsProfileEnabled()) { if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool = platform::RecordEvent record_event(Type());
platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place); RunImpl(scope, place);
} else { } else {
RunImpl(scope, place); RunImpl(scope, place);
......
...@@ -171,9 +171,7 @@ void TestInference(const std::string& dirname, ...@@ -171,9 +171,7 @@ void TestInference(const std::string& dirname,
// Enable the profiler // Enable the profiler
paddle::platform::EnableProfiler(state); paddle::platform::EnableProfiler(state);
{ {
paddle::platform::RecordEvent record_event( paddle::platform::RecordEvent record_event("init_program");
"init_program",
paddle::platform::DeviceContextPool::Instance().Get(place));
inference_program = InitProgram(&executor, scope, dirname, is_combined); inference_program = InitProgram(&executor, scope, dirname, is_combined);
} }
...@@ -230,9 +228,7 @@ void TestInference(const std::string& dirname, ...@@ -230,9 +228,7 @@ void TestInference(const std::string& dirname,
// Run repeat times to profile the performance // Run repeat times to profile the performance
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
paddle::platform::RecordEvent record_event( paddle::platform::RecordEvent record_event("run_inference");
"run_inference",
paddle::platform::DeviceContextPool::Instance().Get(place));
if (PrepareContext) { if (PrepareContext) {
// Note: if you change the inference_program, you need to call // Note: if you change the inference_program, you need to call
......
...@@ -80,7 +80,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, ...@@ -80,7 +80,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep,
google::protobuf::Closure* done = brpc::NewCallback( google::protobuf::Closure* done = brpc::NewCallback(
&HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
platform::RecordRPCEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method);
ch_ctx->stub->SendVariable(cntl, &request, response, done); ch_ctx->stub->SendVariable(cntl, &request, response, done);
...@@ -184,7 +184,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, ...@@ -184,7 +184,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
google::protobuf::Closure* done = brpc::NewCallback( google::protobuf::Closure* done = brpc::NewCallback(
&HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
platform::RecordRPCEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method);
if (method_name == kGetMonomerRPC) { if (method_name == kGetMonomerRPC) {
ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done);
...@@ -272,7 +272,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, ...@@ -272,7 +272,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
&cntl->request_attachment(), out_var_name_val, &cntl->request_attachment(), out_var_name_val,
false, 0, table_name_val); false, 0, table_name_val);
platform::RecordRPCEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method);
google::protobuf::Closure* done = brpc::NewCallback( google::protobuf::Closure* done = brpc::NewCallback(
&HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
...@@ -311,7 +311,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, ...@@ -311,7 +311,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
VarHandlePtr var_h( VarHandlePtr var_h(
new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
platform::RecordRPCEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method);
google::protobuf::Closure* done = brpc::NewCallback( google::protobuf::Closure* done = brpc::NewCallback(
&HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
...@@ -406,7 +406,7 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage( ...@@ -406,7 +406,7 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage(
sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
cntl->set_timeout_ms(time_out); cntl->set_timeout_ms(time_out);
platform::RecordRPCEvent record_event(method_name, nullptr); platform::RecordRPCEvent record_event(method_name);
VarHandlePtr var_h( VarHandlePtr var_h(
new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); new VarHandle(ep, method_name, req.varname(), nullptr, nullptr));
......
...@@ -89,7 +89,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, ...@@ -89,7 +89,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
// stub context // stub context
s->response_call_back_ = nullptr; s->response_call_back_ = nullptr;
platform::RecordRPCEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method);
auto call = s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_); s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
...@@ -184,7 +184,7 @@ VarHandlePtr GRPCClient::_AsyncGetVar( ...@@ -184,7 +184,7 @@ VarHandlePtr GRPCClient::_AsyncGetVar(
// stub context // stub context
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
platform::RecordRPCEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method);
auto call = auto call =
s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
...@@ -235,7 +235,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, ...@@ -235,7 +235,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
// stub context // stub context
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
platform::RecordRPCEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method);
auto call = s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
...@@ -265,7 +265,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, ...@@ -265,7 +265,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(BATCH_BARRIER_MESSAGE); req.set_varname(BATCH_BARRIER_MESSAGE);
platform::RecordRPCEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
...@@ -290,7 +290,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, ...@@ -290,7 +290,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(FETCH_BARRIER_MESSAGE); req.set_varname(FETCH_BARRIER_MESSAGE);
platform::RecordRPCEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
...@@ -317,7 +317,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, ...@@ -317,7 +317,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(var_name); req.set_varname(var_name);
platform::RecordRPCEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
...@@ -342,7 +342,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, ...@@ -342,7 +342,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(COMPLETE_MESSAGE); req.set_varname(COMPLETE_MESSAGE);
platform::RecordRPCEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
...@@ -372,7 +372,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, ...@@ -372,7 +372,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
req.set_varname(CHECKPOINT_SAVE_MESSAGE); req.set_varname(CHECKPOINT_SAVE_MESSAGE);
req.set_out_varname(dir); req.set_out_varname(dir);
platform::RecordRPCEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
......
...@@ -38,7 +38,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -38,7 +38,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
::grpc::ByteBuffer* msg, const std::string& out_name, ::grpc::ByteBuffer* msg, const std::string& out_name,
const int trainer_id, const int trainer_id,
const std::string& table_name) { const std::string& table_name) {
platform::RecordRPCEvent record_event("serial", &ctx); platform::RecordRPCEvent record_event("serial");
VarMsg request; VarMsg request;
TensorPayload* payload = nullptr; TensorPayload* payload = nullptr;
...@@ -147,7 +147,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, ...@@ -147,7 +147,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope* scope, const framework::Scope* scope,
framework::Variable** var, int* trainer_id) { framework::Variable** var, int* trainer_id) {
platform::RecordRPCEvent record_event("deserial", &ctx); platform::RecordRPCEvent record_event("deserial");
operators::distributed::GRPCVariableResponse resp(scope, &ctx); operators::distributed::GRPCVariableResponse resp(scope, &ctx);
PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
*var = resp.GetVar(); *var = resp.GetVar();
......
...@@ -85,9 +85,7 @@ class ReadOp : public framework::OperatorBase { ...@@ -85,9 +85,7 @@ class ReadOp : public framework::OperatorBase {
std::vector<framework::LoDTensor> ins; std::vector<framework::LoDTensor> ins;
// For profiling // For profiling
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::RecordEvent record_event(Type());
auto& ctx = *pool.Get(dev_place);
platform::RecordEvent record_event(Type(), &ctx);
reader->ReadNext(&ins); reader->ReadNext(&ins);
if (ins.empty()) { if (ins.empty()) {
......
...@@ -88,7 +88,11 @@ cc_library(timer SRCS timer.cc) ...@@ -88,7 +88,11 @@ cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer) cc_test(timer_test SRCS timer_test.cc DEPS timer)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_context device_tracer)
else()
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
endif()
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
......
...@@ -14,17 +14,23 @@ limitations under the License. */ ...@@ -14,17 +14,23 @@ limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/device_tracer.h"
#include <deque> #include <deque>
#include <forward_list>
#include <fstream> #include <fstream>
#include <list>
#include <map> #include <map>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <numeric> #include <numeric>
#include <sstream>
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <unordered_map>
#include <utility>
#include <vector> #include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "google/protobuf/text_format.h" #include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
...@@ -33,17 +39,31 @@ namespace { ...@@ -33,17 +39,31 @@ namespace {
// Tracking the nested block stacks of each thread. // Tracking the nested block stacks of each thread.
thread_local std::deque<int> block_id_stack; thread_local std::deque<int> block_id_stack;
// Tracking the nested event stacks. // Tracking the nested event stacks.
thread_local std::deque<std::string> annotation_stack; thread_local std::deque<Event *> annotation_stack;
std::map<uint32_t, int32_t> system_thread_id_map;
std::once_flag tracer_once_flag; std::once_flag tracer_once_flag;
DeviceTracer *tracer = nullptr; DeviceTracer *tracer = nullptr;
void PrintCuptiHint() {
static bool showed = false;
if (showed) return;
showed = true;
LOG(WARNING) << "Invalid timestamp occured. Please try increasing the "
"FLAGS_multiple_of_cupti_buffer_size.";
}
} // namespace } // namespace
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
namespace { namespace {
// TODO(panyx0718): Revisit the buffer size here. // The experimental best performance is
uint64_t kBufSize = 32 * 1024; // the same size with CUPTI device buffer size(8M)
uint64_t kBufSize = 1024 * 1024 * 8;
uint64_t kAlignSize = 8; uint64_t kAlignSize = 8;
std::unordered_map<CUpti_CallbackId, std::string> runtime_cbid_str,
driver_cbid_str;
#define ALIGN_BUFFER(buffer, align) \ #define ALIGN_BUFFER(buffer, align) \
(((uintptr_t)(buffer) & ((align)-1)) \ (((uintptr_t)(buffer) & ((align)-1)) \
...@@ -92,15 +112,33 @@ std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) { ...@@ -92,15 +112,33 @@ std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
return "MEMCPY"; return "MEMCPY";
} }
std::string DriverKind(CUpti_CallbackId cbid) {
auto iter = driver_cbid_str.find(cbid);
if (iter == driver_cbid_str.end())
return "Driver API " + std::to_string(cbid);
return iter->second;
}
std::string RuntimeKind(CUpti_CallbackId cbid) {
auto iter = runtime_cbid_str.find(cbid);
if (iter == runtime_cbid_str.end())
return "Runtime API " + std::to_string(cbid);
return iter->second;
}
void EnableActivity() { void EnableActivity() {
// Device activity record is created when CUDA initializes, so we // Device activity record is created when CUDA initializes, so we
// want to enable it before cuInit() or any CUDA runtime call. // want to enable it before cuInit() or any CUDA runtime call.
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); CUPTI_CALL(
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
// We don't track these activities for now. // We don't track these activities for now.
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
...@@ -110,16 +148,17 @@ void EnableActivity() { ...@@ -110,16 +148,17 @@ void EnableActivity() {
void DisableActivity() { void DisableActivity() {
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL)); CUPTI_CALL(
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE)); dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
// Disable all other activity record kinds. // Disable all other activity record kinds.
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT)); // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER)); CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME)); // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER)); // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
} }
void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
...@@ -132,6 +171,11 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, ...@@ -132,6 +171,11 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
size_t size, size_t validSize) { size_t size, size_t validSize) {
static std::thread::id cupti_thread_id(0);
if (cupti_thread_id == std::thread::id(0))
cupti_thread_id = std::this_thread::get_id();
PADDLE_ENFORCE_EQ(std::this_thread::get_id(), cupti_thread_id,
"Only one thread is allowed to call bufferCompleted()");
CUptiResult status; CUptiResult status;
CUpti_Activity *record = NULL; CUpti_Activity *record = NULL;
if (validSize > 0) { if (validSize > 0) {
...@@ -168,6 +212,23 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, ...@@ -168,6 +212,23 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
memcpy->correlationId, memcpy->bytes); memcpy->correlationId, memcpy->bytes);
break; break;
} }
case CUPTI_ACTIVITY_KIND_DRIVER: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0)
// -1 device id represents CUDA api call
tracer->AddCPURecords(
DriverKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId));
break;
}
case CUPTI_ACTIVITY_KIND_RUNTIME: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0)
tracer->AddCPURecords(
RuntimeKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId));
break;
}
default: { break; } default: { break; }
} }
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
...@@ -183,21 +244,35 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, ...@@ -183,21 +244,35 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped)); dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
if (dropped != 0) { if (dropped != 0) {
fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped); fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped);
PrintCuptiHint();
} }
} }
free(buffer); free(buffer);
} }
void initCuptiCbidStr();
} // namespace } // namespace
#endif // PADDLE_WITH_CUPTI #endif // PADDLE_WITH_CUPTI
class DeviceTracerImpl : public DeviceTracer { class DeviceTracerImpl : public DeviceTracer {
public: public:
DeviceTracerImpl() : enabled_(false) {} DeviceTracerImpl() : enabled_(false) {
#ifdef PADDLE_WITH_CUPTI
initCuptiCbidStr();
#endif
}
void AddAnnotation(uint64_t id, const std::string &anno) { void AddAnnotation(uint32_t id, Event *event) {
std::lock_guard<std::mutex> l(trace_mu_); thread_local std::forward_list<std::pair<uint32_t, Event *>>
correlations_[id] = anno; *local_correlations_pairs = nullptr;
if (local_correlations_pairs == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
correlations_pairs.emplace_front();
local_correlations_pairs = &correlations_pairs.front();
}
local_correlations_pairs->push_front(std::make_pair(id, event));
} }
void AddCPURecords(const std::string &anno, uint64_t start_ns, void AddCPURecords(const std::string &anno, uint64_t start_ns,
...@@ -206,8 +281,13 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -206,8 +281,13 @@ class DeviceTracerImpl : public DeviceTracer {
VLOG(1) << "Empty timeline annotation."; VLOG(1) << "Empty timeline annotation.";
return; return;
} }
std::lock_guard<std::mutex> l(trace_mu_); thread_local std::forward_list<CPURecord> *local_cpu_records_ = nullptr;
cpu_records_.push_back( if (local_cpu_records_ == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.emplace_front();
local_cpu_records_ = &cpu_records_.front();
}
local_cpu_records_->push_front(
CPURecord{anno, start_ns, end_ns, device_id, thread_id}); CPURecord{anno, start_ns, end_ns, device_id, thread_id});
} }
...@@ -215,25 +295,27 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -215,25 +295,27 @@ class DeviceTracerImpl : public DeviceTracer {
uint64_t end_ns, int64_t device_id, int64_t stream_id, uint64_t end_ns, int64_t device_id, int64_t stream_id,
uint32_t correlation_id, uint64_t bytes) { uint32_t correlation_id, uint64_t bytes) {
// 0 means timestamp information could not be collected for the kernel. // 0 means timestamp information could not be collected for the kernel.
if (start_ns == 0 || end_ns == 0) { if (start_ns == 0 || end_ns == 0 || start_ns == end_ns) {
VLOG(3) << name << " cannot be traced"; VLOG(3) << name << " cannot be traced";
PrintCuptiHint();
return; return;
} }
std::lock_guard<std::mutex> l(trace_mu_); // NOTE(liangdun): lock is not needed, only one thread call this function.
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id, mem_records_.push_front(MemRecord{name, start_ns, end_ns, device_id,
stream_id, correlation_id, bytes}); stream_id, correlation_id, bytes});
} }
void AddKernelRecords(std::string name, uint64_t start, uint64_t end, void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
int64_t device_id, int64_t stream_id, int64_t device_id, int64_t stream_id,
uint32_t correlation_id) { uint32_t correlation_id) {
// 0 means timestamp information could not be collected for the kernel. // 0 means timestamp information could not be collected for the kernel.
if (start == 0 || end == 0) { if (start == 0 || end == 0 || start == end) {
VLOG(3) << correlation_id << " cannot be traced"; VLOG(3) << correlation_id << " cannot be traced";
PrintCuptiHint();
return; return;
} }
std::lock_guard<std::mutex> l(trace_mu_); // NOTE(liangdun): lock is not needed, only one thread call this function.
kernel_records_.push_back( kernel_records_.push_front(
KernelRecord{name, start, end, device_id, stream_id, correlation_id}); KernelRecord{name, start, end, device_id, stream_id, correlation_id});
} }
...@@ -263,25 +345,80 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -263,25 +345,80 @@ class DeviceTracerImpl : public DeviceTracer {
} else if (ret != CUPTI_SUCCESS) { } else if (ret != CUPTI_SUCCESS) {
fprintf(stderr, "Failed to create CUPTI subscriber.\n"); fprintf(stderr, "Failed to create CUPTI subscriber.\n");
} }
CUPTI_CALL( const std::vector<int> cbids {
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)); CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000
#if CUDA_VERSION >= 9000
,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000
#endif
};
for (auto cbid : cbids)
CUPTI_CALL(dynload::cuptiEnableCallback(
1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API, cbid));
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_)); CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
#endif // PADDLE_WITH_CUPTI #endif // PADDLE_WITH_CUPTI
enabled_ = true; enabled_ = true;
} }
void Reset() {
#ifdef PADDLE_WITH_CUPTI
CUPTI_CALL(
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
#endif
std::lock_guard<std::mutex> l(trace_mu_);
kernel_records_.clear();
mem_records_.clear();
correlations_.clear();
for (auto &tmp : correlations_pairs) tmp.clear();
for (auto &tmp : cpu_records_) tmp.clear();
}
void GenEventKernelCudaElapsedTime() {
#ifdef PADDLE_WITH_CUPTI
if (correlations_.empty())
for (auto &tmp : correlations_pairs)
for (auto &pair : tmp) correlations_[pair.first] = pair.second;
for (const KernelRecord &r : kernel_records_) {
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
Event *e = c->second;
e->AddCudaElapsedTime(r.start_ns, r.end_ns);
}
}
for (const auto &r : mem_records_) {
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
Event *e = c->second;
e->AddCudaElapsedTime(r.start_ns, r.end_ns);
}
}
#endif
}
proto::Profile GenProfile(const std::string &profile_path) { proto::Profile GenProfile(const std::string &profile_path) {
int miss = 0, find = 0;
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
proto::Profile profile_pb; proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_); profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_); profile_pb.set_end_ns(end_ns_);
if (correlations_.empty())
for (auto &tmp : correlations_pairs)
for (auto &pair : tmp) correlations_[pair.first] = pair.second;
for (const KernelRecord &r : kernel_records_) { for (const KernelRecord &r : kernel_records_) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel); event->set_type(proto::Event::GPUKernel);
if (correlations_.find(r.correlation_id) != correlations_.end()) { auto c = correlations_.find(r.correlation_id);
event->set_name(correlations_.at(r.correlation_id)); if (c != correlations_.end() && c->second != nullptr) {
event->set_name(c->second->name());
event->set_detail_info(r.name);
find++;
} else { } else {
VLOG(10) << "Missing Kernel Event: " + r.name;
miss++;
event->set_name(r.name); event->set_name(r.name);
} }
event->set_start_ns(r.start_ns); event->set_start_ns(r.start_ns);
...@@ -289,31 +426,41 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -289,31 +426,41 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_sub_device_id(r.stream_id); event->set_sub_device_id(r.stream_id);
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
for (const CPURecord &r : cpu_records_) { for (auto &tmp : cpu_records_)
auto *event = profile_pb.add_events(); for (const CPURecord &r : tmp) {
event->set_type(proto::Event::CPU); auto *event = profile_pb.add_events();
event->set_name(r.name); event->set_type(proto::Event::CPU);
event->set_start_ns(r.start_ns); event->set_name(r.name);
event->set_end_ns(r.end_ns); event->set_start_ns(r.start_ns);
event->set_sub_device_id(r.thread_id); event->set_end_ns(r.end_ns);
event->set_device_id(r.device_id); event->set_sub_device_id(r.thread_id);
} event->set_device_id(r.device_id);
}
miss = find = 0;
for (const MemRecord &r : mem_records_) { for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel); event->set_type(proto::Event::GPUKernel);
event->set_name(r.name); auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
event->set_name(c->second->name());
event->set_detail_info(r.name);
find++;
} else {
miss++;
event->set_name(r.name);
}
event->set_start_ns(r.start_ns); event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns); event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.stream_id); event->set_sub_device_id(r.stream_id);
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes); event->mutable_memcopy()->set_bytes(r.bytes);
} }
VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
std::ofstream profile_f; std::ofstream profile_f;
profile_f.open(profile_path, std::ios::out | std::ios::trunc); profile_f.open(profile_path,
std::string profile_str; std::ios::out | std::ios::trunc | std::ios::binary);
profile_pb.SerializeToString(&profile_str); profile_pb.SerializeToOstream(&profile_f);
profile_f << profile_str;
profile_f.close(); profile_f.close();
return profile_pb; return profile_pb;
} }
...@@ -321,12 +468,13 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -321,12 +468,13 @@ class DeviceTracerImpl : public DeviceTracer {
void Disable() { void Disable() {
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
// flush might cause additional calls to DeviceTracker. // flush might cause additional calls to DeviceTracker.
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED); CUPTI_CALL(
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
#endif // PADDLE_WITH_CUPTI #endif // PADDLE_WITH_CUPTI
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
DisableActivity(); DisableActivity();
dynload::cuptiUnsubscribe(subscriber_); CUPTI_CALL(dynload::cuptiUnsubscribe(subscriber_));
CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_)); CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
#endif // PADDLE_WITH_CUPTI #endif // PADDLE_WITH_CUPTI
enabled_ = false; enabled_ = false;
...@@ -337,18 +485,10 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -337,18 +485,10 @@ class DeviceTracerImpl : public DeviceTracer {
static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain, static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const void *cbdata) { CUpti_CallbackId cbid, const void *cbdata) {
auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata); auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
DeviceTracer *tracer = reinterpret_cast<DeviceTracer *>(userdata); DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) && Event *event = CurAnnotation();
(cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) { tracer->AddAnnotation(cbInfo->correlationId, event);
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
const std::string anno = !annotation_stack.empty()
? annotation_stack.back()
: cbInfo->symbolName;
tracer->AddAnnotation(cbInfo->correlationId, anno);
}
} else {
VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
} }
} }
CUpti_SubscriberHandle subscriber_; CUpti_SubscriberHandle subscriber_;
...@@ -357,10 +497,12 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -357,10 +497,12 @@ class DeviceTracerImpl : public DeviceTracer {
bool enabled_; bool enabled_;
uint64_t start_ns_; uint64_t start_ns_;
uint64_t end_ns_; uint64_t end_ns_;
std::vector<KernelRecord> kernel_records_; std::forward_list<KernelRecord> kernel_records_;
std::vector<MemRecord> mem_records_; std::forward_list<MemRecord> mem_records_;
std::vector<CPURecord> cpu_records_; std::forward_list<std::forward_list<CPURecord>> cpu_records_;
std::unordered_map<uint32_t, std::string> correlations_; std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
correlations_pairs;
std::unordered_map<uint32_t, Event *> correlations_;
}; };
void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); } void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); }
...@@ -370,21 +512,104 @@ DeviceTracer *GetDeviceTracer() { ...@@ -370,21 +512,104 @@ DeviceTracer *GetDeviceTracer() {
return tracer; return tracer;
} }
void SetCurAnnotation(const std::string &anno) { void SetCurAnnotation(Event *event) { annotation_stack.push_back(event); }
annotation_stack.push_back(anno);
}
void ClearCurAnnotation() { annotation_stack.pop_back(); } void ClearCurAnnotation() { annotation_stack.pop_back(); }
std::string CurAnnotation() { Event *CurAnnotation() {
if (annotation_stack.empty()) return ""; if (annotation_stack.empty()) return nullptr;
return annotation_stack.back(); return annotation_stack.back();
} }
std::string CurAnnotationName() {
if (annotation_stack.empty()) return "";
return annotation_stack.back()->name();
}
void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); } void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
void ClearCurBlock() { block_id_stack.pop_back(); } void ClearCurBlock() { block_id_stack.pop_back(); }
int BlockDepth() { return block_id_stack.size(); } int BlockDepth() { return block_id_stack.size(); }
uint32_t GetCurSystemThreadId() {
std::stringstream ss;
ss << std::this_thread::get_id();
uint32_t id = static_cast<uint32_t>(std::stoull(ss.str()));
return id;
}
void RecoreCurThreadId(int32_t id) {
auto gid = GetCurSystemThreadId();
VLOG(1) << "RecoreCurThreadId: " << gid << " -> " << id;
system_thread_id_map[gid] = id;
}
int32_t GetThreadIdFromSystemThreadId(uint32_t id) {
auto it = system_thread_id_map.find(id);
if (it != system_thread_id_map.end()) return it->second;
// return origin id if no event is recorded in this thread.
return static_cast<int32_t>(id);
}
#ifdef PADDLE_WITH_CUPTI
namespace {
void initCuptiCbidStr() {
static bool called = false;
if (called) return;
called = true;
#define REGISTER_RUNTIME_CBID_STR(cbid) \
runtime_cbid_str[CUPTI_RUNTIME_TRACE_CBID_##cbid] = #cbid
REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020);
REGISTER_RUNTIME_CBID_STR(cudaConfigureCall_v3020);
REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000);
REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050);
REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020);
REGISTER_RUNTIME_CBID_STR(cudaDriverGetVersion_v3020);
REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020);
REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020);
REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020);
REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020);
REGISTER_RUNTIME_CBID_STR(cudaFree_v3020);
REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020);
REGISTER_RUNTIME_CBID_STR(cudaGetDeviceCount_v3020);
REGISTER_RUNTIME_CBID_STR(cudaGetDeviceProperties_v3020);
REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020);
REGISTER_RUNTIME_CBID_STR(cudaGetErrorString_v3020);
REGISTER_RUNTIME_CBID_STR(cudaGetLastError_v3020);
REGISTER_RUNTIME_CBID_STR(cudaHostAlloc_v3020);
REGISTER_RUNTIME_CBID_STR(cudaHostGetDevicePointer_v3020);
REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000);
REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020);
REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020);
REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020);
REGISTER_RUNTIME_CBID_STR(cudaMemcpy_v3020);
REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020);
REGISTER_RUNTIME_CBID_STR(cudaMemset_v3020);
REGISTER_RUNTIME_CBID_STR(
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000);
REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020);
REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020);
REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020);
REGISTER_RUNTIME_CBID_STR(cudaStreamCreate_v3020);
REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000);
REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050);
REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050);
REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020);
REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020);
REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
#if CUDA_VERSION >= 9000
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
#endif
#undef REGISTER_RUNTIME_CBID_STR
}
} // namespace
#endif // PADDLE_WITH_CUPTI
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -32,6 +32,8 @@ inline uint64_t PosixInNsec() { ...@@ -32,6 +32,8 @@ inline uint64_t PosixInNsec() {
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec); return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
} }
class Event;
// DeviceTracer performs the following tasks: // DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc. // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc. // 2. Collect cuda statistics: start/end ts, memory, etc.
...@@ -68,11 +70,13 @@ class DeviceTracer { ...@@ -68,11 +70,13 @@ class DeviceTracer {
virtual void Enable() = 0; virtual void Enable() = 0;
// Needs to be called once after use. // Needs to be called once after use.
virtual void Disable() = 0; virtual void Disable() = 0;
// Needs to be called once before reuse.
virtual void Reset() = 0;
// Add a pair to correlate internal cuda id with high level // Add a pair to correlate internal cuda id with high level
// annotation (string). So cuda statistics can be represented by // annotation event(with string). So cuda statistics can be represented by
// human-readable annotations. // human-readable annotations.
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0; virtual void AddAnnotation(uint32_t id, Event* event) = 0;
virtual void AddMemRecords(const std::string& name, uint64_t start_ns, virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, uint64_t end_ns, int64_t device_id,
...@@ -92,6 +96,9 @@ class DeviceTracer { ...@@ -92,6 +96,9 @@ class DeviceTracer {
// Generate a proto after done (Disabled). // Generate a proto after done (Disabled).
virtual proto::Profile GenProfile(const std::string& profile_path) = 0; virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
// generate kernel elapsed time into Event
virtual void GenEventKernelCudaElapsedTime() = 0;
virtual bool IsEnabled() = 0; virtual bool IsEnabled() = 0;
}; };
...@@ -99,14 +106,19 @@ class DeviceTracer { ...@@ -99,14 +106,19 @@ class DeviceTracer {
DeviceTracer* GetDeviceTracer(); DeviceTracer* GetDeviceTracer();
// Set a name for the cuda kernel operation being launched by the thread. // Set a name for the cuda kernel operation being launched by the thread.
void SetCurAnnotation(const std::string& anno); void SetCurAnnotation(Event* event);
// Clear the name after the operation is done. // Clear the name after the operation is done.
void ClearCurAnnotation(); void ClearCurAnnotation();
// Current name of the operation being run in the thread. // Current name of the operation being run in the thread.
std::string CurAnnotation(); std::string CurAnnotationName();
Event* CurAnnotation();
void SetCurBlock(int block_id); void SetCurBlock(int block_id);
void ClearCurBlock(); void ClearCurBlock();
int BlockDepth(); int BlockDepth();
// Set current thread id, so we can map the system thread id to thread id.
void RecoreCurThreadId(int32_t id);
int32_t GetThreadIdFromSystemThreadId(uint32_t id);
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/string/split.h" #include "paddle/fluid/string/split.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/dynload/cupti.h"
#endif #endif
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/init.h"
...@@ -30,6 +31,9 @@ limitations under the License. */ ...@@ -30,6 +31,9 @@ limitations under the License. */
DEFINE_int32(paddle_num_threads, 1, DEFINE_int32(paddle_num_threads, 1,
"Number of threads for each paddle instance."); "Number of threads for each paddle instance.");
DEFINE_int32(multiple_of_cupti_buffer_size, 1,
"Multiple of the CUPTI device buffer size. If the timestamps have "
"been dropped when you are profiling, try increasing this value.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -78,7 +82,32 @@ void InitP2P(std::vector<int> devices) { ...@@ -78,7 +82,32 @@ void InitP2P(std::vector<int> devices) {
#endif #endif
} }
void InitCupti() {
#ifdef PADDLE_WITH_CUPTI
if (FLAGS_multiple_of_cupti_buffer_size == 1) return;
size_t attrValue = 0, attrValueSize = sizeof(size_t);
#define MULTIPLY_ATTR_VALUE(attr) \
{ \
PADDLE_ENFORCE(!platform::dynload::cuptiActivityGetAttribute( \
attr, &attrValueSize, &attrValue)); \
attrValue *= FLAGS_multiple_of_cupti_buffer_size; \
LOG(WARNING) << "Set " #attr " " << attrValue << " byte"; \
PADDLE_ENFORCE(!platform::dynload::cuptiActivitySetAttribute( \
attr, &attrValueSize, &attrValue)); \
}
MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE);
MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP);
#if CUDA_VERSION >= 9000
MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE);
#endif
#undef MULTIPLY_ATTR_VALUE
#endif
}
void InitDevices(bool init_p2p) { void InitDevices(bool init_p2p) {
// CUPTI attribute should be set before any CUDA context is created (see CUPTI
// documentation about CUpti_ActivityAttribute).
InitCupti();
/*Init all available devices by default */ /*Init all available devices by default */
std::vector<int> devices; std::vector<int> devices;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <algorithm> #include <algorithm>
#include <iomanip> #include <iomanip>
#include <limits> #include <limits>
...@@ -27,7 +29,6 @@ limitations under the License. */ ...@@ -27,7 +29,6 @@ limitations under the License. */
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
...@@ -66,12 +67,13 @@ struct EventList { ...@@ -66,12 +67,13 @@ struct EventList {
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args> template <typename... Args>
void Record(Args&&... args) { Event* Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) { if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front(); event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock); event_blocks.front().reserve(kNumBlock);
} }
event_blocks.front().emplace_back(std::forward<Args>(args)...); event_blocks.front().emplace_back(std::forward<Args>(args)...);
return &event_blocks.front().back();
} }
std::vector<Event> Reduce() { std::vector<Event> Reduce() {
...@@ -98,21 +100,8 @@ inline uint64_t GetTimeInNsec() { ...@@ -98,21 +100,8 @@ inline uint64_t GetTimeInNsec() {
.count(); .count();
} }
Event::Event(EventType type, std::string name, uint32_t thread_id, Event::Event(EventType type, std::string name, uint32_t thread_id)
const DeviceContext* dev_ctx) : type_(type), name_(name), thread_id_(thread_id) {
: type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA
has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
if (has_cuda_) {
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
PADDLE_ENFORCE(cudaSetDevice(
boost::get<platform::CUDAPlace>(cuda_dev_ctx->GetPlace()).device));
PADDLE_ENFORCE(cudaGetDevice(&device_));
PADDLE_ENFORCE(cudaEventCreate(&event_));
auto stream = cuda_dev_ctx->stream();
PADDLE_ENFORCE(cudaEventRecord(event_, stream));
}
#endif
cpu_ns_ = GetTimeInNsec(); cpu_ns_ = GetTimeInNsec();
} }
...@@ -124,88 +113,70 @@ double Event::CpuElapsedMs(const Event& e) const { ...@@ -124,88 +113,70 @@ double Event::CpuElapsedMs(const Event& e) const {
double Event::CudaElapsedMs(const Event& e) const { double Event::CudaElapsedMs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (!has_cuda_) return 0.0; #ifdef PADDLE_WITH_CUPTI
PADDLE_ENFORCE(e.has_cuda() && has_cuda()); return gpu_ns_ / 1000000.0;
PADDLE_ENFORCE(e.device() == device()); #endif
PADDLE_ENFORCE(cudaEventSynchronize(event_));
PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
float ms;
PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
return ms;
#else #else
PADDLE_THROW("CUDA is not enabled"); PADDLE_THROW("CUDA is not enabled");
#endif #endif
} }
#ifdef PADDLE_WITH_CUDA
static void ForEachDevice(std::function<void(int)> func) {
auto original_device = GetCurrentDeviceId();
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
SetDeviceId(i);
func(i);
}
SetDeviceId(original_device);
}
#endif
inline EventList& GetEventList() { inline EventList& GetEventList() {
if (!g_event_list) { if (!g_event_list) {
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
g_event_list = std::make_shared<EventList>(); g_event_list = std::make_shared<EventList>();
g_thread_id = g_next_thread_id++; g_thread_id = g_next_thread_id++;
g_all_event_lists.emplace_front(g_event_list); g_all_event_lists.emplace_front(g_event_list);
RecoreCurThreadId(g_thread_id);
} }
return *g_event_list; return *g_event_list;
} }
void Mark(const std::string& name, const DeviceContext* dev_ctx) { void Mark(const std::string& name) {
GetEventList().Record(EventType::kMark, name, g_thread_id, dev_ctx); GetEventList().Record(EventType::kMark, name, g_thread_id);
} }
void PushEvent(const std::string& name, const DeviceContext* dev_ctx) { Event* PushEvent(const std::string& name) {
GetEventList().Record(EventType::kPushRange, name, g_thread_id, dev_ctx); return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
} }
void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { void PopEvent(const std::string& name) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventType::kPopRange, name, g_thread_id);
} }
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) RecordEvent::RecordEvent(const std::string& name)
: is_enabled_(false), start_ns_(PosixInNsec()) { : is_enabled_(false), start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
std::lock_guard<std::mutex> l(profiler_mu); // lock is not needed, the code below is thread-safe
is_enabled_ = true; is_enabled_ = true;
dev_ctx_ = dev_ctx;
name_ = name; name_ = name;
PushEvent(name_, dev_ctx_); Event* e = PushEvent(name_);
// Maybe need the same push/pop behavior. // Maybe need the same push/pop behavior.
SetCurAnnotation(name_); SetCurAnnotation(e);
} }
RecordEvent::~RecordEvent() { RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled || !is_enabled_) return; if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
std::lock_guard<std::mutex> l(profiler_mu); // lock is not needed, the code below is thread-safe
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer* tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
BlockDepth(), g_thread_id); BlockDepth(), g_thread_id);
} }
ClearCurAnnotation(); ClearCurAnnotation();
PopEvent(name_, dev_ctx_); PopEvent(name_);
} }
RecordRPCEvent::RecordRPCEvent(const std::string& name, RecordRPCEvent::RecordRPCEvent(const std::string& name) {
const DeviceContext* dev_ctx) {
if (FLAGS_enable_rpc_profiler) { if (FLAGS_enable_rpc_profiler) {
event_.reset(new platform::RecordEvent(name, dev_ctx)); event_.reset(new platform::RecordEvent(name));
} }
} }
RecordBlock::RecordBlock(int block_id) RecordBlock::RecordBlock(int block_id)
: is_enabled_(false), start_ns_(PosixInNsec()) { : is_enabled_(false), start_ns_(PosixInNsec()) {
std::lock_guard<std::mutex> l(profiler_mu); // lock is not needed, the code below is thread-safe
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
is_enabled_ = true; is_enabled_ = true;
SetCurBlock(block_id); SetCurBlock(block_id);
...@@ -213,7 +184,7 @@ RecordBlock::RecordBlock(int block_id) ...@@ -213,7 +184,7 @@ RecordBlock::RecordBlock(int block_id)
} }
RecordBlock::~RecordBlock() { RecordBlock::~RecordBlock() {
std::lock_guard<std::mutex> l(profiler_mu); // lock is not needed, the code below is thread-safe
if (g_state == ProfilerState::kDisabled || !is_enabled_) return; if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer* tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
...@@ -225,11 +196,21 @@ RecordBlock::~RecordBlock() { ...@@ -225,11 +196,21 @@ RecordBlock::~RecordBlock() {
ClearCurBlock(); ClearCurBlock();
} }
void SynchronizeAllDevice() {
#ifdef PADDLE_WITH_CUDA
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
SetDeviceId(i);
PADDLE_ENFORCE(cudaDeviceSynchronize());
}
#endif
}
void EnableProfiler(ProfilerState state) { void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled, PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enable profiling, since the input state is ", "Can't enable profiling, since the input state is ",
"ProfilerState::kDisabled"); "ProfilerState::kDisabled");
SynchronizeAllDevice();
std::lock_guard<std::mutex> l(profiler_mu); std::lock_guard<std::mutex> l(profiler_mu);
if (state == g_state) { if (state == g_state) {
return; return;
...@@ -238,23 +219,20 @@ void EnableProfiler(ProfilerState state) { ...@@ -238,23 +219,20 @@ void EnableProfiler(ProfilerState state) {
should_send_profile_state = true; should_send_profile_state = true;
GetDeviceTracer()->Enable(); GetDeviceTracer()->Enable();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) { if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
g_state == ProfilerState::kCPU) {
// Generate some dummy events first to reduce the startup overhead. // Generate some dummy events first to reduce the startup overhead.
for (int i = 0; i < 5; i++) { DummyKernelAndEvent();
ForEachDevice([](int d) { GetDeviceTracer()->Reset();
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
Mark("_cuda_startup_", dev_ctx);
dev_ctx->Wait();
delete dev_ctx;
});
}
} }
#endif #endif
// Mark the profiling start. // Mark the profiling start.
Mark("_start_profiler_", nullptr); Mark("_start_profiler_");
} }
void ResetProfiler() { void ResetProfiler() {
SynchronizeAllDevice();
GetDeviceTracer()->Reset();
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) { ++it) {
...@@ -481,20 +459,23 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -481,20 +459,23 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path) { const std::string& profile_path) {
SynchronizeAllDevice();
std::lock_guard<std::mutex> l(profiler_mu); std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
// Mark the profiling stop. // Mark the profiling stop.
Mark("_stop_profiler_", nullptr); Mark("_stop_profiler_");
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, true, sorted_key);
ParseEvents(all_events, false, sorted_key);
ResetProfiler();
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer* tracer = GetDeviceTracer();
if (tracer->IsEnabled()) { if (tracer->IsEnabled()) {
tracer->Disable(); tracer->Disable();
tracer->GenProfile(profile_path); tracer->GenProfile(profile_path);
tracer->GenEventKernelCudaElapsedTime();
} }
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, true, sorted_key);
ParseEvents(all_events, false, sorted_key);
ResetProfiler();
g_state = ProfilerState::kDisabled; g_state = ProfilerState::kDisabled;
should_send_profile_state = true; should_send_profile_state = true;
} }
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <cuda.h>
namespace paddle {
namespace platform {
__global__ void DummyKernel(int *a) { a[0] = 0; }
static void ForEachDevice(std::function<void(int)> func) {
auto original_device = GetCurrentDeviceId();
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
SetDeviceId(i);
func(i);
}
SetDeviceId(original_device);
}
void DummyKernelAndEvent() {
for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) {
CUDADeviceContext *dev_ctx = new CUDADeviceContext(CUDAPlace(d));
Mark("_cuda_startup_");
int *ptr;
PADDLE_ENFORCE(cudaMalloc(&ptr, sizeof(int)));
DummyKernel<<<1, 1, 0, dev_ctx->stream()>>>(ptr);
dev_ctx->Wait();
PADDLE_ENFORCE(cudaFree(ptr));
delete dev_ctx;
});
}
}
} // namespace platform
} // namespace paddle
...@@ -28,17 +28,17 @@ class Event { ...@@ -28,17 +28,17 @@ class Event {
public: public:
// The DeviceContext is used to get the cuda stream. // The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr. // If CPU profiling mode, can pass nullptr.
Event(EventType type, std::string name, uint32_t thread_id, Event(EventType type, std::string name, uint32_t thread_id);
const DeviceContext* dev_ctx);
const EventType& type() const; const EventType& type() const;
std::string name() const { return name_; } std::string name() const { return name_; }
uint32_t thread_id() const { return thread_id_; } uint32_t thread_id() const { return thread_id_; }
bool has_cuda() const { return has_cuda_; }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#ifndef PADDLE_WITH_CUPTI
cudaEvent_t event() const { return event_; } cudaEvent_t event() const { return event_; }
int device() const { return device_; } int device() const { return device_; }
#endif
#endif #endif
double CpuElapsedMs(const Event& e) const; double CpuElapsedMs(const Event& e) const;
...@@ -49,11 +49,21 @@ class Event { ...@@ -49,11 +49,21 @@ class Event {
std::string name_; std::string name_;
uint32_t thread_id_; uint32_t thread_id_;
int64_t cpu_ns_; int64_t cpu_ns_;
bool has_cuda_;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUPTI
int64_t gpu_ns_ = 0;
public:
void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) {
gpu_ns_ += end_ns - start_ns;
}
private:
#else
cudaEvent_t event_ = nullptr; cudaEvent_t event_ = nullptr;
int device_ = -1; int device_ = -1;
#endif #endif
#endif
}; };
enum ProfilerState { enum ProfilerState {
...@@ -63,22 +73,19 @@ enum ProfilerState { ...@@ -63,22 +73,19 @@ enum ProfilerState {
kAll, // Profile both CPU and GPU. (Currently experimental). kAll, // Profile both CPU and GPU. (Currently experimental).
}; };
void Mark(const std::string& name, const DeviceContext* dev_ctx); void Mark(const std::string& name);
void PushEvent(const std::string& name, const DeviceContext* dev_ctx); Event* PushEvent(const std::string& name);
void PopEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name);
struct RecordEvent { struct RecordEvent {
// dev_ctx can be set to nullptr if device is cpu. explicit RecordEvent(const std::string& name);
RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
~RecordEvent(); ~RecordEvent();
bool is_enabled_; bool is_enabled_;
uint64_t start_ns_; uint64_t start_ns_;
// The device context is used by Event to get the current cuda stream.
const DeviceContext* dev_ctx_;
// Event name // Event name
std::string name_; std::string name_;
// Need to distinguish name by op type, block_id, program_id and perhaps // Need to distinguish name by op type, block_id, program_id and perhaps
...@@ -88,8 +95,7 @@ struct RecordEvent { ...@@ -88,8 +95,7 @@ struct RecordEvent {
class RecordRPCEvent { class RecordRPCEvent {
public: public:
// dev_ctx can be set to nullptr if device is cpu. explicit RecordRPCEvent(const std::string& name);
RecordRPCEvent(const std::string& name, const DeviceContext* dev_ctx);
~RecordRPCEvent() {} ~RecordRPCEvent() {}
private: private:
...@@ -132,5 +138,9 @@ bool ShouldSendProfileState(); ...@@ -132,5 +138,9 @@ bool ShouldSendProfileState();
void SetProfileListener(); void SetProfileListener();
int64_t ListenerId(); int64_t ListenerId();
#ifdef PADDLE_WITH_CUDA
void DummyKernelAndEvent();
#endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -31,6 +31,7 @@ message Event { ...@@ -31,6 +31,7 @@ message Event {
optional int64 sub_device_id = 6; optional int64 sub_device_id = 6;
optional MemCopy memcopy = 7; optional MemCopy memcopy = 7;
optional string detail_info = 9;
} }
message Profile { message Profile {
......
...@@ -23,76 +23,49 @@ TEST(Event, CpuElapsedTime) { ...@@ -23,76 +23,49 @@ TEST(Event, CpuElapsedTime) {
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventType; using paddle::platform::EventType;
Event start_event(EventType::kPushRange, "test", 0, nullptr); Event start_event(EventType::kPushRange, "test", 0);
EXPECT_TRUE(start_event.has_cuda() == false);
int counter = 0; int counter = 0;
while (counter != 1000) { while (counter != 1000) {
counter++; counter++;
} }
Event stop_event(EventType::kPopRange, "test", 0, nullptr); Event stop_event(EventType::kPopRange, "test", 0);
EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
} }
#ifdef PADDLE_WITH_CUDA
TEST(Event, CudaElapsedTime) {
using paddle::platform::DeviceContext;
using paddle::platform::CUDADeviceContext;
using paddle::platform::CUDAPlace;
using paddle::platform::Event;
using paddle::platform::EventType;
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
Event start_event(EventType::kPushRange, "test", 0, dev_ctx);
EXPECT_TRUE(start_event.has_cuda() == true);
int counter = 0;
while (counter != 1000) {
counter++;
}
Event stop_event(EventType::kPopRange, "test", 0, dev_ctx);
EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
}
#endif
TEST(RecordEvent, RecordEvent) { TEST(RecordEvent, RecordEvent) {
using paddle::platform::DeviceContext; using paddle::platform::DeviceContext;
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventType; using paddle::platform::EventType;
using paddle::platform::RecordEvent; using paddle::platform::RecordEvent;
using paddle::platform::PushEvent;
using paddle::platform::PopEvent;
using paddle::platform::ProfilerState; using paddle::platform::ProfilerState;
using paddle::platform::EventSortingKey; using paddle::platform::EventSortingKey;
ProfilerState state = ProfilerState::kCPU; ProfilerState state = ProfilerState::kCPU;
DeviceContext* dev_ctx = nullptr;
#ifdef PADDLE_WITH_CUDA
using paddle::platform::CUDADeviceContext;
using paddle::platform::CUDAPlace;
state = ProfilerState::kCUDA;
dev_ctx =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
#endif
EnableProfiler(state); EnableProfiler(state);
/* Usage 1: /* Usage 1:
* PushEvent(evt_name, dev_ctx); * PushEvent(evt_name);
* ... * ...
* code to be analyzed * code to be analyzed
* ... * ...
* PopEvent(evt_name, dev_ctx); * PopEvent(evt_name);
*/ */
LOG(INFO) << "Usage 1: PushEvent & PopEvent"; LOG(INFO) << "Usage 1: PushEvent & PopEvent";
for (int loop = 0; loop < 3; ++loop) { for (int loop = 0; loop < 3; ++loop) {
for (int i = 1; i < 5; ++i) { for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i); std::string name = "op_" + std::to_string(i);
PushEvent(name, dev_ctx); PushEvent(name);
int counter = 1; int counter = 1;
while (counter != i * 1000) counter++; while (counter != i * 1000) counter++;
PopEvent(name, dev_ctx); PopEvent(name);
} }
} }
/* Usage 2: /* Usage 2:
* { * {
* RecordEvent record_event(name, dev_ctx); * RecordEvent record_event(name);
* ... * ...
* code to be analyzed * code to be analyzed
* ... * ...
...@@ -101,7 +74,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -101,7 +74,7 @@ TEST(RecordEvent, RecordEvent) {
LOG(INFO) << "Usage 2: RecordEvent"; LOG(INFO) << "Usage 2: RecordEvent";
for (int i = 1; i < 5; ++i) { for (int i = 1; i < 5; ++i) {
std::string name = "evs_op_" + std::to_string(i); std::string name = "evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx); RecordEvent record_event(name);
int counter = 1; int counter = 1;
while (counter != i * 1000) counter++; while (counter != i * 1000) counter++;
} }
...@@ -123,20 +96,20 @@ TEST(RecordEvent, RecordEvent) { ...@@ -123,20 +96,20 @@ TEST(RecordEvent, RecordEvent) {
LOG(INFO) << "Usage 3: nested RecordEvent"; LOG(INFO) << "Usage 3: nested RecordEvent";
for (int i = 1; i < 5; ++i) { for (int i = 1; i < 5; ++i) {
std::string name = "ano_evs_op_" + std::to_string(i); std::string name = "ano_evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx); RecordEvent record_event(name);
int counter = 1; int counter = 1;
while (counter != i * 100) counter++; while (counter != i * 100) counter++;
{ {
std::string nested_name = "nested_ano_evs_op_" + std::to_string(i); std::string nested_name = "nested_ano_evs_op_" + std::to_string(i);
RecordEvent nested_record_event(nested_name, dev_ctx); RecordEvent nested_record_event(nested_name);
int nested_counter = 1; int nested_counter = 1;
while (nested_counter != i * 100) nested_counter++; while (nested_counter != i * 100) nested_counter++;
} }
} }
// Bad Usage: // Bad Usage:
PushEvent("event_without_pop", dev_ctx); PushEvent("event_without_pop");
PopEvent("event_without_push", dev_ctx); PopEvent("event_without_push");
std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents(); std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
int cuda_startup_count = 0; int cuda_startup_count = 0;
......
...@@ -131,7 +131,8 @@ def __bootstrap__(): ...@@ -131,7 +131,8 @@ def __bootstrap__():
'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
'allocator_strategy', 'reader_queue_speed_test_mode', 'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
'inner_op_parallelism', 'enable_parallel_graph' 'inner_op_parallelism', 'enable_parallel_graph',
'multiple_of_cupti_buffer_size'
] ]
if 'Darwin' not in sysstr: if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory') read_env_flags.append('use_pinned_memory')
......
...@@ -16,15 +16,19 @@ from __future__ import print_function ...@@ -16,15 +16,19 @@ from __future__ import print_function
import unittest import unittest
import os import os
import tempfile
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.profiler as profiler import paddle.fluid.profiler as profiler
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
class TestProfiler(unittest.TestCase): class TestProfiler(unittest.TestCase):
def net_profiler(self, state, profile_path='/tmp/profile'): def net_profiler(self, state, use_parallel_executor=False):
profile_path = os.path.join(tempfile.gettempdir(), "profile")
open(profile_path, "w").write("")
startup_program = fluid.Program() startup_program = fluid.Program()
main_program = fluid.Program() main_program = fluid.Program()
...@@ -60,6 +64,11 @@ class TestProfiler(unittest.TestCase): ...@@ -60,6 +64,11 @@ class TestProfiler(unittest.TestCase):
place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0) place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(startup_program) exe.run(startup_program)
if use_parallel_executor:
pe = fluid.ParallelExecutor(
state != 'CPU',
loss_name=avg_cost.name,
main_program=main_program)
pass_acc_calculator = fluid.average.WeightedAverage() pass_acc_calculator = fluid.average.WeightedAverage()
with profiler.profiler(state, 'total', profile_path) as prof: with profiler.profiler(state, 'total', profile_path) as prof:
...@@ -69,6 +78,9 @@ class TestProfiler(unittest.TestCase): ...@@ -69,6 +78,9 @@ class TestProfiler(unittest.TestCase):
x = np.random.random((32, 784)).astype("float32") x = np.random.random((32, 784)).astype("float32")
y = np.random.randint(0, 10, (32, 1)).astype("int64") y = np.random.randint(0, 10, (32, 1)).astype("int64")
if use_parallel_executor:
pe.run(feed={'x': x, 'y': y}, fetch_list=[avg_cost.name])
continue
outs = exe.run(main_program, outs = exe.run(main_program,
feed={'x': x, feed={'x': x,
'y': y}, 'y': y},
...@@ -77,21 +89,37 @@ class TestProfiler(unittest.TestCase): ...@@ -77,21 +89,37 @@ class TestProfiler(unittest.TestCase):
b_size = np.array(outs[2]) b_size = np.array(outs[2])
pass_acc_calculator.add(value=acc, weight=b_size) pass_acc_calculator.add(value=acc, weight=b_size)
pass_acc = pass_acc_calculator.eval() pass_acc = pass_acc_calculator.eval()
data = open(profile_path, 'rb').read()
self.assertGreater(len(data), 0)
profile_pb = profiler_pb2.Profile()
profile_pb.ParseFromString(data)
self.assertGreater(len(profile_pb.events), 0)
for event in profile_pb.events:
if event.type == profiler_pb2.Event.GPUKernel:
if not event.detail_info and not event.name.startswith("MEM"):
raise Exception(
"Kernel %s missing event. Has this kernel been recorded by RecordEvent?"
% event.name)
elif event.type == profiler_pb2.Event.CPU and (
event.name.startswith("Driver API") or
event.name.startswith("Runtime API")):
print("Warning: unregister", event.name)
def test_cpu_profiler(self): def test_cpu_profiler(self):
self.net_profiler('CPU') self.net_profiler('CPU')
self.net_profiler('CPU', use_parallel_executor=True)
@unittest.skipIf(not core.is_compiled_with_cuda(), @unittest.skipIf(not core.is_compiled_with_cuda(),
"profiler is enabled only with GPU") "profiler is enabled only with GPU")
def test_cuda_profiler(self): def test_cuda_profiler(self):
self.net_profiler('GPU') self.net_profiler('GPU')
self.net_profiler('GPU', use_parallel_executor=True)
@unittest.skipIf(not core.is_compiled_with_cuda(), @unittest.skipIf(not core.is_compiled_with_cuda(),
"profiler is enabled only with GPU") "profiler is enabled only with GPU")
def test_all_profiler(self): def test_all_profiler(self):
self.net_profiler('All', '/tmp/profile_out') self.net_profiler('All')
with open('/tmp/profile_out', 'rb') as f: self.net_profiler('All', use_parallel_executor=True)
self.assertGreater(len(f.read()), 0)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -131,8 +131,12 @@ class Timeline(object): ...@@ -131,8 +131,12 @@ class Timeline(object):
if (k, event.device_id, "CPU") not in self._devices: if (k, event.device_id, "CPU") not in self._devices:
pid = self._allocate_pid() pid = self._allocate_pid()
self._devices[(k, event.device_id, "CPU")] = pid self._devices[(k, event.device_id, "CPU")] = pid
self._chrome_trace.emit_pid("%s:cpu:block:%d" % # -1 device id represents CUDA api call
(k, event.device_id), pid) if event.device_id == -1:
self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
else:
self._chrome_trace.emit_pid(
"%s:cpu:block:%d" % (k, event.device_id), pid)
elif event.type == profiler_pb2.Event.GPUKernel: elif event.type == profiler_pb2.Event.GPUKernel:
if (k, event.device_id, "GPUKernel") not in self._devices: if (k, event.device_id, "GPUKernel") not in self._devices:
pid = self._allocate_pid() pid = self._allocate_pid()
...@@ -150,7 +154,9 @@ class Timeline(object): ...@@ -150,7 +154,9 @@ class Timeline(object):
pid = self._devices[(k, event.device_id, type)] pid = self._devices[(k, event.device_id, type)]
args = {'name': event.name} args = {'name': event.name}
if event.memcopy.bytes > 0: if event.memcopy.bytes > 0:
args = {'mem_bytes': event.memcopy.bytes} args['mem_bytes'] = event.memcopy.bytes
if event.detail_info:
args['detail_info'] = event.detail_info
# TODO(panyx0718): Chrome tracing only handles ms. However, some # TODO(panyx0718): Chrome tracing only handles ms. However, some
# ops takes micro-seconds. Hence, we keep the ns here. # ops takes micro-seconds. Hence, we keep the ns here.
self._chrome_trace.emit_region( self._chrome_trace.emit_region(
...@@ -173,7 +179,7 @@ if args.timeline_path: ...@@ -173,7 +179,7 @@ if args.timeline_path:
profile_paths = profile_path.split(',') profile_paths = profile_path.split(',')
profile_dict = dict() profile_dict = dict()
if len(profile_paths) == 1: if len(profile_paths) == 1:
with open(profile_path, 'r') as f: with open(profile_path, 'rb') as f:
profile_s = f.read() profile_s = f.read()
profile_pb = profiler_pb2.Profile() profile_pb = profiler_pb2.Profile()
profile_pb.ParseFromString(profile_s) profile_pb.ParseFromString(profile_s)
...@@ -181,7 +187,7 @@ if len(profile_paths) == 1: ...@@ -181,7 +187,7 @@ if len(profile_paths) == 1:
else: else:
for profile_path in profile_paths: for profile_path in profile_paths:
k, v = profile_path.split('=') k, v = profile_path.split('=')
with open(v, 'r') as f: with open(v, 'rb') as f:
profile_s = f.read() profile_s = f.read()
profile_pb = profiler_pb2.Profile() profile_pb = profiler_pb2.Profile()
profile_pb.ParseFromString(profile_s) profile_pb.ParseFromString(profile_s)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册