提交 0a5fbb06 编写于 作者: D dangqingqing

Refine code struct.

上级 f03e73c8
...@@ -115,18 +115,6 @@ class CUDNNDeviceContext : public CUDADeviceContext { ...@@ -115,18 +115,6 @@ class CUDNNDeviceContext : public CUDADeviceContext {
cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
}; };
// Scope guard that switches the current device for the lifetime of the
// object. The constructor remembers the device reported by
// platform::GetCurrentDeviceId() and then selects `device`; the destructor
// selects the remembered device again.
class DeviceGuard {
 public:
  explicit DeviceGuard(int device)
      : original_device_(platform::GetCurrentDeviceId()) {
    platform::SetDeviceId(device);
  }
  ~DeviceGuard() { platform::SetDeviceId(original_device_); }

  // Not copyable: a copy would carry the same saved device id and would run
  // the restore a second time when it is destroyed.
  DeviceGuard(const DeviceGuard&) = delete;
  DeviceGuard& operator=(const DeviceGuard&) = delete;

 private:
  // Device that was current when the guard was constructed.
  int original_device_;
};
#endif #endif
/*! \brief device context pool singleton */ /*! \brief device context pool singleton */
......
...@@ -17,34 +17,133 @@ limitations under the License. */ ...@@ -17,34 +17,133 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
ProfilerState kState = ProfilerState::kDisabled; // The profiler state, the initial value is ProfilerState::kDisabled
uint32_t kNextThreadId = 0; static ProfilerState g_state = ProfilerState::kDisabled;
std::mutex kAllEventListsMutex; // The thread local event list only can be accessed by the specific thread
std::list<std::shared_ptr<EventList>> kAllEventLists; // The thread index of each thread
thread_local std::shared_ptr<EventList> kEventList; static thread_local int32_t g_thread_id;
thread_local int32_t kThreadId; // The g_next_thread_id is a global counter for threads, by the g_thread_id and
// g_next_thread_id, we can know how many threads have created EventList.
static uint32_t g_next_thread_id = 0;
// Mutex held by GetEventList() while a thread registers its newly created
// event list, and by DisableProfiler() while all registered lists are read.
static std::mutex g_all_event_lists_mutex;
// Every EventList created so far; GetEventList() adds one entry for each
// thread the first time that thread asks for its list.
static std::list<std::shared_ptr<EventList>> g_all_event_lists;
// The calling thread's own event list. It is empty until the thread's first
// call to GetEventList(), which creates it.
static thread_local std::shared_ptr<EventList> g_event_list;
// Returns the current reading of a steady clock as a nanosecond count
// measured from that clock's epoch. std::chrono::high_resolution_clock is
// used when it is steady; otherwise std::chrono::steady_clock is used.
inline uint64_t GetTimeInNsec() {
  using SteadyClock =
      std::conditional<std::chrono::high_resolution_clock::is_steady,
                       std::chrono::high_resolution_clock,
                       std::chrono::steady_clock>::type;
  const auto since_epoch = SteadyClock::now().time_since_epoch();
  const auto as_nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch);
  return as_nanoseconds.count();
}
// Builds an event of the given kind and name for thread `thread_id`.
//
// When compiled with PADDLE_WITH_CUDA and dev_ctx is non-null, dev_ctx is
// treated as a CUDADeviceContext: the current CUDA device id is stored, a
// CUDA event is created and recorded on the context's stream, and has_cuda_
// is set. With a null dev_ctx (or without CUDA) has_cuda_ stays false.
// The CPU timestamp is taken last, after any CUDA work above.
Event::Event(EventKind kind, std::string name, uint32_t thread_id,
             DeviceContext* dev_ctx)
    : kind_(kind),
      name_(std::move(name)),
      thread_id_(thread_id),
      has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA
  if (dev_ctx != nullptr) {
    const auto* cuda_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
    PADDLE_ENFORCE(cudaGetDevice(&device_));
    PADDLE_ENFORCE(cudaEventCreate(&event_));
    PADDLE_ENFORCE(cudaEventRecord(event_, cuda_ctx->stream()));
    has_cuda_ = true;
  }
#endif
  cpu_ns_ = GetTimeInNsec();
}
// Returns the textual name of this event's kind: "mark", "push" or "pop".
// Any other value of kind_ ends in PADDLE_THROW.
std::string Event::kind() const {
  if (kind_ == EventKind::kMark) return "mark";
  if (kind_ == EventKind::kPushRange) return "push";
  if (kind_ == EventKind::kPopRange) return "pop";
  PADDLE_THROW("Unknown EventKind.");
}
// Returns the CPU time from this event to `e` in microseconds, i.e. the
// difference of the two stored nanosecond timestamps divided by 1000.
double Event::CpuElapsedUs(const Event& e) const {
  const auto elapsed_ns = e.cpu_ns_ - cpu_ns_;
  return elapsed_ns / 1000.0;
}
// Returns the CUDA time from this event to `e` in microseconds.
//
// With PADDLE_WITH_CUDA: both events must carry a CUDA event and must have
// been recorded on the same device (each condition is enforced). Both CUDA
// events are synchronized before cudaEventElapsedTime is queried; its
// millisecond result is scaled to microseconds.
// Without PADDLE_WITH_CUDA the call always ends in PADDLE_THROW.
double Event::CudaElapsedUs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA
  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
  PADDLE_ENFORCE(e.device() == device());
  PADDLE_ENFORCE(cudaEventSynchronize(event_));
  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
  float elapsed_ms;
  PADDLE_ENFORCE(cudaEventElapsedTime(&elapsed_ms, event_, e.event()));
  constexpr double kUsPerMs = 1000.0;
  return elapsed_ms * kUsPerMs;
#else
  PADDLE_THROW("CUDA is not enabled");
#endif
}
#ifdef PADDLE_WITH_CUDA
// Calls func(i) for every device index i in [0, GetCUDADeviceCount()),
// selecting device i with SetDeviceId before each call. Afterwards the device
// that was current on entry is selected again.
//
// The original device is also re-selected when SetDeviceId or func throws;
// the exception is then rethrown to the caller. (If that re-selection itself
// throws, its exception is the one that propagates.)
static void ForEachDevice(std::function<void(int)> func) {
  const auto original_device = GetCurrentDeviceId();
  const int count = GetCUDADeviceCount();
  try {
    for (int i = 0; i < count; i++) {
      SetDeviceId(i);
      func(i);
    }
  } catch (...) {
    // Do not leave the thread bound to whichever device the loop reached.
    SetDeviceId(original_device);
    throw;
  }
  SetDeviceId(original_device);
}
#endif
// Returns the calling thread's event list, creating it on first use.
//
// On a thread's first call, while holding g_all_event_lists_mutex, a new
// EventList is created, the thread takes the next value of g_next_thread_id
// as its g_thread_id, and the list is added to the front of
// g_all_event_lists.
inline EventList& GetEventList() {
  if (g_event_list) return *g_event_list;

  std::lock_guard<std::mutex> lock(g_all_event_lists_mutex);
  g_event_list = std::make_shared<EventList>();
  g_thread_id = g_next_thread_id++;
  g_all_event_lists.emplace_front(g_event_list);
  return *g_event_list;
}
void Mark(const std::string& name, DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id,
dev_ctx);
}
// Opens a profiling range: records a kPushRange event named `name` in the
// calling thread's event list. Does nothing when the profiler is disabled.
//
// dev_ctx_ is always initialized (to nullptr when the profiler is disabled),
// so the destructor never reads an indeterminate pointer even if the
// profiler state changes during the lifetime of this object.
RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx)
    : dev_ctx_(nullptr) {
  if (g_state == ProfilerState::kDisabled) return;
  dev_ctx_ = dev_ctx;
  // Fetch the list first: GetEventList() may assign g_thread_id on this
  // thread's first call, and before C++17 the order between that call and the
  // evaluation of the arguments below was unspecified.
  EventList& events = GetEventList();
  // `name` is a const reference, so std::move on it could never move; it is
  // passed as-is.
  events.Record(EventKind::kPushRange, name, g_thread_id, dev_ctx_);
}
// Closes the profiling range: records a kPopRange event with an empty name in
// the calling thread's event list. Does nothing when the profiler is
// disabled.
RecordEvent::~RecordEvent() {
  if (g_state == ProfilerState::kDisabled) return;
  // Fetch the list first: GetEventList() may assign g_thread_id on this
  // thread's first call, and before C++17 the order between that call and the
  // evaluation of the arguments below was unspecified.
  EventList& events = GetEventList();
  events.Record(EventKind::kPopRange, std::string(), g_thread_id, dev_ctx_);
}
void EnableProfiler(ProfilerState state) { void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled, PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enbale profling, since the input state is ", "Can't enbale profling, since the input state is ",
"ProfilerState::kDisabled"); "ProfilerState::kDisabled");
PADDLE_ENFORCE(kState == ProfilerState::kDisabled, PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
"The profiling state should be disabled when calling ", "The profiling state should be disabled when calling ",
"EnableProfiler."); "EnableProfiler.");
kState = state; g_state = state;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto ForEachDevice = [](std::function<void(int)> op) { if (g_state == ProfilerState::kCUDA) {
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
DeviceGuard dev_guard(i);
op(i);
}
};
if (kState == ProfilerState::kCUDA) {
// Generate some dummy evenets first to reduce the startup overhead. // Generate some dummy evenets first to reduce the startup overhead.
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) { ForEachDevice([](int d) {
DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(d)); DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
Mark("_cuda_startup_", dev_ctx); Mark("_cuda_startup_", dev_ctx);
dev_ctx->Wait(); dev_ctx->Wait();
}); });
...@@ -52,20 +151,20 @@ void EnableProfiler(ProfilerState state) { ...@@ -52,20 +151,20 @@ void EnableProfiler(ProfilerState state) {
} }
#endif #endif
// Mark the profiling start. // Mark the profiling start.
Mark("_start_profiler_"); Mark("_start_profiler_", nullptr);
} }
std::vector<std::vector<Event>> DisableProfiler() { std::vector<std::vector<Event>> DisableProfiler() {
PADDLE_ENFORCE(kState != ProfilerState::kDisabled, PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
"Can't disable profiling, since it's not starting."); "Can't disable profiling, since it's not starting.");
// Mark the profiling stop. // Mark the profiling stop.
Mark("_stop_profiler_"); Mark("_stop_profiler_", nullptr);
kState = ProfilerState::kDisabled; g_state = ProfilerState::kDisabled;
std::vector<std::vector<Event>> result; std::vector<std::vector<Event>> result;
std::lock_guard<std::mutex> guard(kAllEventListsMutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
for (auto it = kAllEventLists.begin(); it != kAllEventLists.end(); ++it) { for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
auto& list = *it; ++it) {
result.emplace_back(list->Reduce()); result.emplace_back((*it)->Reduce());
} }
return result; return result;
} }
......
...@@ -24,76 +24,24 @@ namespace platform { ...@@ -24,76 +24,24 @@ namespace platform {
enum EventKind { kMark, kPushRange, kPopRange }; enum EventKind { kMark, kPushRange, kPopRange };
// Current time of a steady clock, expressed in nanoseconds since the clock's
// epoch. Picks std::chrono::high_resolution_clock if it is steady and falls
// back to std::chrono::steady_clock if it is not.
inline uint64_t GetTimeInNsec() {
  namespace chrono = std::chrono;
  using Clock = std::conditional<chrono::high_resolution_clock::is_steady,
                                 chrono::high_resolution_clock,
                                 chrono::steady_clock>::type;
  const Clock::time_point now = Clock::now();
  return chrono::duration_cast<chrono::nanoseconds>(now.time_since_epoch())
      .count();
}
class Event { class Event {
public: public:
// the DeviceContext is used to get the cuda stream. // The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event(EventKind kind, std::string name, uint32_t thread_id, Event(EventKind kind, std::string name, uint32_t thread_id,
const platform::DeviceContext* dev_ctx = nullptr) DeviceContext* dev_ctx);
: kind_(kind), name_(std::move(name)), thread_id_(thread_id) {
has_cuda_ = false;
#ifdef PADDLE_WITH_CUDA
auto* cuda_dev_ctx =
static_cast<const platform::CUDADeviceContext*>(dev_ctx);
if (cuda_dev_ctx) {
PADDLE_ENFORCE(cudaGetDevice(&device_));
PADDLE_ENFORCE(cudaEventCreate(&event_));
auto stream = cuda_dev_ctx->stream();
PADDLE_ENFORCE(cudaEventRecord(event_, stream));
has_cuda_ = true;
}
#endif
cpu_ns_ = GetTimeInNsec();
}
std::string kind() const {
switch (kind_) {
case EventKind::kMark:
return "mark";
case EventKind::kPushRange:
return "push";
case EventKind::kPopRange:
return "pop";
}
PADDLE_THROW("Unknown EventKind.");
}
std::string kind() const;
std::string name() const { return name_; } std::string name() const { return name_; }
bool has_cuda() const { return has_cuda_; } bool has_cuda() const { return has_cuda_; }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
cudaEvent_t event() const { return event_; } cudaEvent_t event() const { return event_; }
int device() const { return device_; } int device() const { return device_; }
#endif #endif
double CpuElapsedUs(const Event& e) const { double CpuElapsedUs(const Event& e) const;
return (e.cpu_ns_ - cpu_ns_) / (1000.0); double CudaElapsedUs(const Event& e) const;
}
double CudaElapsedUs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(e.has_cuda() && has_cuda());
PADDLE_ENFORCE(e.device() == device());
PADDLE_ENFORCE(cudaEventSynchronize(event_));
PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
float ms;
PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
return ms * 1000.0;
#else
PADDLE_THROW("CUDA is not enabled");
#endif
}
private: private:
EventKind kind_; EventKind kind_;
...@@ -108,11 +56,11 @@ class Event { ...@@ -108,11 +56,11 @@ class Event {
}; };
struct EventList { struct EventList {
constexpr static std::size_t kMB = 1024 * 1024; constexpr static size_t kMB = 1024 * 1024;
constexpr static std::size_t kEventBlockSize = 16 * kMB; constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static std::size_t kEventSize = sizeof(Event); constexpr static size_t kEventSize = sizeof(Event);
constexpr static std::size_t kEventAlign = alignof(Event); constexpr static size_t kEventAlign = alignof(Event);
constexpr static std::size_t kNumBlock = constexpr static size_t kNumBlock =
kEventBlockSize / kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
...@@ -139,58 +87,27 @@ struct EventList { ...@@ -139,58 +87,27 @@ struct EventList {
}; };
enum ProfilerState { enum ProfilerState {
kDisabled, kDisabled, // disabled state
kCPU, kCPU, // CPU profiling state
kCUDA, kCUDA, // GPU profiling state
}; };
// The profiler state, the initial value is ProfilerState::kDisabled void Mark(const std::string& name, DeviceContext* dev_ctx);
// Current profiler state; checked by RecordEvent to decide whether to record.
extern ProfilerState kState;
// Mutex held by GetEventList() while a thread registers its newly created
// event list.
extern std::mutex kAllEventListsMutex;
// Every EventList created so far; GetEventList() adds one entry per thread.
extern std::list<std::shared_ptr<EventList>> kAllEventLists;
// The calling thread's own event list; empty until GetEventList() creates it.
extern thread_local std::shared_ptr<EventList> kEventList;
// Index of the calling thread, taken from kNextThreadId when the thread's
// event list is created.
extern thread_local int32_t kThreadId;
// Global counter of threads: it is incremented each time a thread creates its
// EventList, so together with kThreadId it tells how many threads have
// created one.
extern uint32_t kNextThreadId;
// Returns the calling thread's event list, creating it on first use.
//
// On a thread's first call, while holding kAllEventListsMutex, a new
// EventList is created, the thread takes the next value of kNextThreadId as
// its kThreadId, and the list is added to the front of kAllEventLists.
inline EventList& GetEventList() {
  if (kEventList) return *kEventList;

  std::lock_guard<std::mutex> lock(kAllEventListsMutex);
  kEventList = std::make_shared<EventList>();
  kThreadId = kNextThreadId++;
  kAllEventLists.emplace_front(kEventList);
  return *kEventList;
}
// Records a kMark event named `name` in the calling thread's event list.
// dev_ctx is handed on to the recorded event unchanged; it defaults to
// nullptr.
//
// `name` is taken by non-const value so that std::move below really moves the
// string instead of silently copying it (a top-level const on a by-value
// parameter is not part of the function's signature, so callers are
// unaffected).
inline void Mark(std::string name,
                 const platform::DeviceContext* dev_ctx = nullptr) {
  // Fetch the list first: GetEventList() may assign kThreadId on this
  // thread's first call, and before C++17 the order between that call and the
  // evaluation of the arguments below was unspecified.
  EventList& events = GetEventList();
  events.Record(EventKind::kMark, std::move(name), kThreadId, dev_ctx);
}
struct RecordEvent { struct RecordEvent {
explicit RecordEvent(const std::string name, explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
platform::DeviceContext* dev_ctx = nullptr) {
if (kState == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx;
GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId,
dev_ctx_);
}
~RecordEvent() { ~RecordEvent();
if (kState == ProfilerState::kDisabled) return;
GetEventList().Record(EventKind::kPopRange, std::string(), kThreadId, // The device context is used by Event to get the current cuda stream.
dev_ctx_); DeviceContext* dev_ctx_;
}
platform::DeviceContext* dev_ctx_;
}; };
// Enable the profiling function.
void EnableProfiler(ProfilerState state); void EnableProfiler(ProfilerState state);
// Return the event lists of all threads. Assuming the returned value is called
// event_lists, event_lists[i][j] represents the j-th Event of the i-th thread.
std::vector<std::vector<Event>> DisableProfiler(); std::vector<std::vector<Event>> DisableProfiler();
} // namespace platform } // namespace platform
......
...@@ -19,13 +19,13 @@ TEST(Event, CpuElapsedTime) { ...@@ -19,13 +19,13 @@ TEST(Event, CpuElapsedTime) {
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventKind; using paddle::platform::EventKind;
Event start_event(EventKind::kPushRange, "test", 0); Event start_event(EventKind::kPushRange, "test", 0, nullptr);
EXPECT_TRUE(start_event.has_cuda() == false); EXPECT_TRUE(start_event.has_cuda() == false);
int counter = 0; int counter = 0;
while (counter != 1000) { while (counter != 1000) {
counter++; counter++;
} }
Event stop_event(EventKind::kPopRange, "test", 0); Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0); EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0);
} }
...@@ -33,11 +33,11 @@ TEST(Event, CpuElapsedTime) { ...@@ -33,11 +33,11 @@ TEST(Event, CpuElapsedTime) {
TEST(Event, CudaElapsedTime) { TEST(Event, CudaElapsedTime) {
using paddle::platform::DeviceContext; using paddle::platform::DeviceContext;
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
using paddle::platform::GPUPlace; using paddle::platform::CUDAPlace;
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventKind; using paddle::platform::EventKind;
DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(0)); DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
Event start_event(EventKind::kPushRange, "test", 0, dev_ctx); Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
EXPECT_TRUE(start_event.has_cuda() == true); EXPECT_TRUE(start_event.has_cuda() == true);
int counter = 0; int counter = 0;
...@@ -60,10 +60,10 @@ TEST(RecordEvent, RecordEvent) { ...@@ -60,10 +60,10 @@ TEST(RecordEvent, RecordEvent) {
DeviceContext* dev_ctx = nullptr; DeviceContext* dev_ctx = nullptr;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
using paddle::platform::GPUPlace; using paddle::platform::CUDAPlace;
state = ProfilerState::kCUDA; state = ProfilerState::kCUDA;
dev_ctx = dev_ctx =
new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace(0)); new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
#endif #endif
EnableProfiler(state); EnableProfiler(state);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册