提交 0a5fbb06 编写于 作者: D dangqingqing

Refine code struct.

上级 f03e73c8
......@@ -115,18 +115,6 @@ class CUDNNDeviceContext : public CUDADeviceContext {
cudnnHandle_t cudnn_handle_;
};
class DeviceGuard {
public:
explicit DeviceGuard(int device) {
original_device_ = platform::GetCurrentDeviceId();
platform::SetDeviceId(device);
}
~DeviceGuard() { platform::SetDeviceId(original_device_); }
private:
int original_device_;
};
#endif
/*! \brief device context pool singleton */
......
......@@ -17,34 +17,133 @@ limitations under the License. */
namespace paddle {
namespace platform {
ProfilerState kState = ProfilerState::kDisabled;
uint32_t kNextThreadId = 0;
std::mutex kAllEventListsMutex;
std::list<std::shared_ptr<EventList>> kAllEventLists;
thread_local std::shared_ptr<EventList> kEventList;
thread_local int32_t kThreadId;
// The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled;
// The index (id) assigned to the calling thread; see g_next_thread_id below.
static thread_local int32_t g_thread_id;
// The g_next_thread_id is a global counter for threads, by the g_thread_id and
// g_next_thread_id, we can know how many threads have created EventList.
static uint32_t g_next_thread_id = 0;
// The global mutex
static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads
static std::list<std::shared_ptr<EventList>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread
static thread_local std::shared_ptr<EventList> g_event_list;
// Returns a monotonic timestamp in nanoseconds since the clock's epoch.
// Uses high_resolution_clock when it is steady, otherwise falls back to
// steady_clock, so successive calls never go backwards.
inline uint64_t GetTimeInNsec() {
  using clock_type =
      std::conditional<std::chrono::high_resolution_clock::is_steady,
                       std::chrono::high_resolution_clock,
                       std::chrono::steady_clock>::type;
  auto since_epoch = clock_type::now().time_since_epoch();
  return static_cast<uint64_t>(
      std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch)
          .count());
}
// Constructs a profiling Event of the given kind, tagged with the
// caller-supplied name and thread id.
// When built with CUDA and `dev_ctx` is non-null, a cudaEvent_t is created
// and recorded on that context's stream so GPU elapsed time can later be
// measured via CudaElapsedUs.
// NOTE(review): the cast below is unchecked -- a non-CUDA DeviceContext
// passed here would be treated as a CUDADeviceContext; callers appear to be
// expected to pass nullptr for CPU-only profiling. TODO: confirm.
Event::Event(EventKind kind, std::string name, uint32_t thread_id,
DeviceContext* dev_ctx)
: kind_(kind),
name_(std::move(name)),
thread_id_(thread_id),
has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
if (cuda_dev_ctx) {
PADDLE_ENFORCE(cudaGetDevice(&device_));
PADDLE_ENFORCE(cudaEventCreate(&event_));
auto stream = cuda_dev_ctx->stream();
// The GPU timestamp is taken when the stream reaches this record point,
// not immediately at construction time.
PADDLE_ENFORCE(cudaEventRecord(event_, stream));
has_cuda_ = true;
}
#endif
// The CPU timestamp is always captured, regardless of CUDA availability.
cpu_ns_ = GetTimeInNsec();
}
// Human-readable tag for this event's kind: "mark", "push", or "pop".
// Throws for any value outside the EventKind enumeration.
std::string Event::kind() const {
  if (kind_ == EventKind::kMark) return "mark";
  if (kind_ == EventKind::kPushRange) return "push";
  if (kind_ == EventKind::kPopRange) return "pop";
  PADDLE_THROW("Unknown EventKind.");
}
// Wall-clock (CPU) interval from this event to the later event `e`,
// expressed in microseconds.
double Event::CpuElapsedUs(const Event& e) const {
  const double elapsed_ns = static_cast<double>(e.cpu_ns_ - cpu_ns_);
  return elapsed_ns / 1000.0;
}
// GPU interval from this event to the later event `e`, in microseconds.
// Both events must carry recorded CUDA events on the same device; the call
// blocks (cudaEventSynchronize) until both recorded events have completed.
// Without CUDA support this function always throws.
double Event::CudaElapsedUs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(e.has_cuda() && has_cuda());
PADDLE_ENFORCE(e.device() == device());
PADDLE_ENFORCE(cudaEventSynchronize(event_));
PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
float ms;
PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
// cudaEventElapsedTime reports milliseconds; convert to microseconds.
return ms * 1000.0;
#else
PADDLE_THROW("CUDA is not enabled");
#endif
}
#ifdef PADDLE_WITH_CUDA
// Invokes func(device_id) once per visible CUDA device, making each device
// current before the call, and restores the caller's current device when
// done.
static void ForEachDevice(std::function<void(int)> func) {
  // RAII guard so the original device is restored even if `func` throws
  // (PADDLE_ENFORCE inside the callback may throw); the previous code left
  // the last device current in that case.
  struct DeviceIdRestorer {
    int id;
    ~DeviceIdRestorer() { SetDeviceId(id); }
  } restorer{GetCurrentDeviceId()};
  int count = GetCUDADeviceCount();
  for (int i = 0; i < count; i++) {
    SetDeviceId(i);
    func(i);
  }
}
#endif
// Returns the calling thread's EventList, creating it lazily on first use.
// Creation registers the new list in the global list (under the global
// mutex) and assigns this thread its id from the global counter.
inline EventList& GetEventList() {
  if (g_event_list == nullptr) {
    std::lock_guard<std::mutex> lock(g_all_event_lists_mutex);
    g_event_list = std::make_shared<EventList>();
    g_thread_id = g_next_thread_id++;
    g_all_event_lists.emplace_front(g_event_list);
  }
  return *g_event_list;
}
// Records a standalone "mark" event on the calling thread's event list.
// `dev_ctx` may be nullptr for CPU-only profiling.
void Mark(const std::string& name, DeviceContext* dev_ctx) {
  // NOTE: `name` is a const reference, so the previous std::move(name) was a
  // silent copy anyway (performance-move-const-arg); pass it through as-is.
  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
}
// Opens a profiling range by pushing a kPushRange event. Does nothing while
// the profiler is disabled. The device context is kept so the matching pop
// event (recorded in the destructor) can use the same context.
RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
  if (g_state == ProfilerState::kDisabled) return;
  dev_ctx_ = dev_ctx;
  // std::move on a const reference is a no-op copy; pass `name` directly.
  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx_);
}
// Closes the profiling range opened by the constructor with a kPopRange
// event (empty name). No-op while the profiler is disabled.
RecordEvent::~RecordEvent() {
  if (g_state != ProfilerState::kDisabled) {
    GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id,
                          dev_ctx_);
  }
}
void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enbale profling, since the input state is ",
"ProfilerState::kDisabled");
PADDLE_ENFORCE(kState == ProfilerState::kDisabled,
PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
"The profiling state should be disabled when calling ",
"EnableProfiler.");
kState = state;
g_state = state;
#ifdef PADDLE_WITH_CUDA
auto ForEachDevice = [](std::function<void(int)> op) {
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
DeviceGuard dev_guard(i);
op(i);
}
};
if (kState == ProfilerState::kCUDA) {
if (g_state == ProfilerState::kCUDA) {
// Generate some dummy events first to reduce the startup overhead.
for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) {
DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(d));
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
Mark("_cuda_startup_", dev_ctx);
dev_ctx->Wait();
});
......@@ -52,20 +151,20 @@ void EnableProfiler(ProfilerState state) {
}
#endif
// Mark the profiling start.
Mark("_start_profiler_");
Mark("_start_profiler_", nullptr);
}
std::vector<std::vector<Event>> DisableProfiler() {
PADDLE_ENFORCE(kState != ProfilerState::kDisabled,
PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
"Can't disable profiling, since it's not starting.");
// Mark the profiling stop.
Mark("_stop_profiler_");
kState = ProfilerState::kDisabled;
Mark("_stop_profiler_", nullptr);
g_state = ProfilerState::kDisabled;
std::vector<std::vector<Event>> result;
std::lock_guard<std::mutex> guard(kAllEventListsMutex);
for (auto it = kAllEventLists.begin(); it != kAllEventLists.end(); ++it) {
auto& list = *it;
result.emplace_back(list->Reduce());
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) {
result.emplace_back((*it)->Reduce());
}
return result;
}
......
......@@ -24,76 +24,24 @@ namespace platform {
enum EventKind { kMark, kPushRange, kPopRange };
inline uint64_t GetTimeInNsec() {
// using std::chrono;
using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
std::chrono::high_resolution_clock,
std::chrono::steady_clock>::type;
return std::chrono::duration_cast<std::chrono::nanoseconds>(
clock::now().time_since_epoch())
.count();
}
class Event {
public:
// the DeviceContext is used to get the cuda stream.
// The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event(EventKind kind, std::string name, uint32_t thread_id,
const platform::DeviceContext* dev_ctx = nullptr)
: kind_(kind), name_(std::move(name)), thread_id_(thread_id) {
has_cuda_ = false;
#ifdef PADDLE_WITH_CUDA
auto* cuda_dev_ctx =
static_cast<const platform::CUDADeviceContext*>(dev_ctx);
if (cuda_dev_ctx) {
PADDLE_ENFORCE(cudaGetDevice(&device_));
PADDLE_ENFORCE(cudaEventCreate(&event_));
auto stream = cuda_dev_ctx->stream();
PADDLE_ENFORCE(cudaEventRecord(event_, stream));
has_cuda_ = true;
}
#endif
cpu_ns_ = GetTimeInNsec();
}
std::string kind() const {
switch (kind_) {
case EventKind::kMark:
return "mark";
case EventKind::kPushRange:
return "push";
case EventKind::kPopRange:
return "pop";
}
PADDLE_THROW("Unknown EventKind.");
}
DeviceContext* dev_ctx);
std::string kind() const;
std::string name() const { return name_; }
bool has_cuda() const { return has_cuda_; }
#ifdef PADDLE_WITH_CUDA
cudaEvent_t event() const { return event_; }
int device() const { return device_; }
#endif
double CpuElapsedUs(const Event& e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000.0);
}
double CudaElapsedUs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(e.has_cuda() && has_cuda());
PADDLE_ENFORCE(e.device() == device());
PADDLE_ENFORCE(cudaEventSynchronize(event_));
PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
float ms;
PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
return ms * 1000.0;
#else
PADDLE_THROW("CUDA is not enabled");
#endif
}
double CpuElapsedUs(const Event& e) const;
double CudaElapsedUs(const Event& e) const;
private:
EventKind kind_;
......@@ -108,11 +56,11 @@ class Event {
};
struct EventList {
constexpr static std::size_t kMB = 1024 * 1024;
constexpr static std::size_t kEventBlockSize = 16 * kMB;
constexpr static std::size_t kEventSize = sizeof(Event);
constexpr static std::size_t kEventAlign = alignof(Event);
constexpr static std::size_t kNumBlock =
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(Event);
constexpr static size_t kEventAlign = alignof(Event);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
......@@ -139,58 +87,27 @@ struct EventList {
};
enum ProfilerState {
kDisabled,
kCPU,
kCUDA,
kDisabled, // disabled state
kCPU, // CPU profiling state
kCUDA, // GPU profiling state
};
// The profiler state, the initial value is ProfilerState::kDisabled
extern ProfilerState kState;
// The global mutex
extern std::mutex kAllEventListsMutex;
// The total event lists of all threads
extern std::list<std::shared_ptr<EventList>> kAllEventLists;
// The thread local event list only can be accessed by the specific thread
extern thread_local std::shared_ptr<EventList> kEventList;
// The thread index of each thread
extern thread_local int32_t kThreadId;
// The kNextThreadId is a global counter for threads, by the kThreadId and
// kNextThreadId, we can know how many threads have created EventList.
extern uint32_t kNextThreadId;
inline EventList& GetEventList() {
if (!kEventList) {
std::lock_guard<std::mutex> guard(kAllEventListsMutex);
kEventList = std::make_shared<EventList>();
kThreadId = kNextThreadId++;
kAllEventLists.emplace_front(kEventList);
}
return *kEventList;
}
inline void Mark(const std::string name,
const platform::DeviceContext* dev_ctx = nullptr) {
GetEventList().Record(EventKind::kMark, std::move(name), kThreadId, dev_ctx);
}
void Mark(const std::string& name, DeviceContext* dev_ctx);
struct RecordEvent {
explicit RecordEvent(const std::string name,
platform::DeviceContext* dev_ctx = nullptr) {
if (kState == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx;
GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId,
dev_ctx_);
}
explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
~RecordEvent() {
if (kState == ProfilerState::kDisabled) return;
GetEventList().Record(EventKind::kPopRange, std::string(), kThreadId,
dev_ctx_);
}
platform::DeviceContext* dev_ctx_;
~RecordEvent();
// The device context is used by Event to get the current cuda stream.
DeviceContext* dev_ctx_;
};
// Enable the profiling function.
void EnableProfiler(ProfilerState state);
// Return the event lists of all threads. Assuming the returned value is
// called `event_lists`, then event_lists[i][j] is the j-th Event of the
// i-th thread.
std::vector<std::vector<Event>> DisableProfiler();
} // namespace platform
......
......@@ -19,13 +19,13 @@ TEST(Event, CpuElapsedTime) {
using paddle::platform::Event;
using paddle::platform::EventKind;
Event start_event(EventKind::kPushRange, "test", 0);
Event start_event(EventKind::kPushRange, "test", 0, nullptr);
EXPECT_TRUE(start_event.has_cuda() == false);
int counter = 0;
while (counter != 1000) {
counter++;
}
Event stop_event(EventKind::kPopRange, "test", 0);
Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0);
}
......@@ -33,11 +33,11 @@ TEST(Event, CpuElapsedTime) {
TEST(Event, CudaElapsedTime) {
using paddle::platform::DeviceContext;
using paddle::platform::CUDADeviceContext;
using paddle::platform::GPUPlace;
using paddle::platform::CUDAPlace;
using paddle::platform::Event;
using paddle::platform::EventKind;
DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(0));
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
EXPECT_TRUE(start_event.has_cuda() == true);
int counter = 0;
......@@ -60,10 +60,10 @@ TEST(RecordEvent, RecordEvent) {
DeviceContext* dev_ctx = nullptr;
#ifdef PADDLE_WITH_CUDA
using paddle::platform::CUDADeviceContext;
using paddle::platform::GPUPlace;
using paddle::platform::CUDAPlace;
state = ProfilerState::kCUDA;
dev_ctx =
new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace(0));
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
#endif
EnableProfiler(state);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册