diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index f0a0ea70a0aa14e1db959e4e6ace2a44363d0c35..8c4803b9739bb54cae89de62468a47631a5dde94 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -30,3 +30,6 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
 nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)
+
+cc_library(profiler SRCS profiler.cc DEPS device_context)
+cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e89e5c600bf7f6e23faf8d62bc7c72f7ee8ca7a
--- /dev/null
+++ b/paddle/platform/profiler.cc
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/profiler.h"
+
+namespace paddle {
+namespace platform {
+
+// The profiler state; the initial value is ProfilerState::kDisabled
+static ProfilerState g_state = ProfilerState::kDisabled;
+// The index of the current thread; every thread that creates an EventList
+// is assigned a unique id
+static thread_local int32_t g_thread_id;
+// g_next_thread_id is a global counter of threads; together with g_thread_id
+// it tells how many threads have created an EventList
+static uint32_t g_next_thread_id = 0;
+// The global mutex that guards g_all_event_lists
+static std::mutex g_all_event_lists_mutex;
+// The event lists of all threads
+static std::list<std::shared_ptr<EventList>> g_all_event_lists;
+// The thread local event list; it can only be accessed by its own thread
+static thread_local std::shared_ptr<EventList> g_event_list;
+
+inline uint64_t GetTimeInNsec() {
+  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
+                                 std::chrono::high_resolution_clock,
+                                 std::chrono::steady_clock>::type;
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(
+             clock::now().time_since_epoch())
+      .count();
+}
+
+Event::Event(EventKind kind, std::string name, uint32_t thread_id,
+             DeviceContext* dev_ctx)
+    : kind_(kind),
+      name_(std::move(name)),
+      thread_id_(thread_id),
+      has_cuda_(false) {
+#ifdef PADDLE_WITH_CUDA
+  auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
+  if (cuda_dev_ctx) {
+    PADDLE_ENFORCE(cudaGetDevice(&device_));
+    PADDLE_ENFORCE(cudaEventCreate(&event_));
+    auto stream = cuda_dev_ctx->stream();
+    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
+    has_cuda_ = true;
+  }
+#endif
+  cpu_ns_ = GetTimeInNsec();
+}
+
+std::string Event::kind() const {
+  switch (kind_) {
+    case EventKind::kMark:
+      return "mark";
+    case EventKind::kPushRange:
+      return "push";
+    case EventKind::kPopRange:
+      return "pop";
+  }
+  PADDLE_THROW("Unknown EventKind.");
+}
+
+double Event::CpuElapsedUs(const Event& e) const {
+  return (e.cpu_ns_ - cpu_ns_) / (1000.0);
+}
+
+double Event::CudaElapsedUs(const Event& e) const {
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
+  PADDLE_ENFORCE(e.device() == device());
+  PADDLE_ENFORCE(cudaEventSynchronize(event_));
+  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
+  float ms;
+  PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
+  return ms * 1000.0;
+#else
+  PADDLE_THROW("CUDA is not enabled");
+#endif
+}
+
+#ifdef PADDLE_WITH_CUDA
+static void ForEachDevice(std::function<void(int)> func) {
+  auto original_device = GetCurrentDeviceId();
+  int count = GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    SetDeviceId(i);
+    func(i);
+  }
+  SetDeviceId(original_device);
+}
+#endif
+
+inline EventList& GetEventList() {
+  if (!g_event_list) {
+    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+    g_event_list = std::make_shared<EventList>();
+    g_thread_id = g_next_thread_id++;
+    g_all_event_lists.emplace_front(g_event_list);
+  }
+  return *g_event_list;
+}
+
+void Mark(const std::string& name, DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
+}
+
+RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
+  if (g_state == ProfilerState::kDisabled) return;
+  dev_ctx_ = dev_ctx;
+  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx_);
+}
+
+RecordEvent::~RecordEvent() {
+  if (g_state == ProfilerState::kDisabled) return;
+  GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id,
+                        dev_ctx_);
+}
+
+void EnableProfiler(ProfilerState state) {
+  PADDLE_ENFORCE(state != ProfilerState::kDisabled,
+                 "Can't enable profiling, since the input state is "
+                 "ProfilerState::kDisabled");
+  PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
+                 "The profiling state should be disabled when calling "
+                 "EnableProfiler.");
+  g_state = state;
+#ifdef PADDLE_WITH_CUDA
+  if (g_state == ProfilerState::kCUDA) {
+    // Generate some dummy events first to reduce the startup overhead.
+    for (int i = 0; i < 5; i++) {
+      ForEachDevice([](int d) {
+        DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
+        Mark("_cuda_startup_", dev_ctx);
+        dev_ctx->Wait();
+      });
+    }
+  }
+#endif
+  // Mark the profiling start.
+  Mark("_start_profiler_", nullptr);
+}
+
+std::vector<std::vector<Event>> DisableProfiler() {
+  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
+                 "Can't disable profiling, since it has not been started.");
+  // Mark the profiling stop.
+  Mark("_stop_profiler_", nullptr);
+  g_state = ProfilerState::kDisabled;
+  std::vector<std::vector<Event>> result;
+  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
+       ++it) {
+    result.emplace_back((*it)->Reduce());
+  }
+  return result;
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..47104ea9d08dad373888c0af463d2b1dbe72a269
--- /dev/null
+++ b/paddle/platform/profiler.h
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <forward_list>
+#include <list>
+#include <mutex>
+#include <vector>
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace platform {
+
+enum EventKind { kMark, kPushRange, kPopRange };
+
+class Event {
+ public:
+  // The DeviceContext is used to get the cuda stream.
+  // In CPU profiling mode it can be nullptr.
+  Event(EventKind kind, std::string name, uint32_t thread_id,
+        DeviceContext* dev_ctx);
+
+  std::string kind() const;
+  std::string name() const { return name_; }
+  bool has_cuda() const { return has_cuda_; }
+
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event() const { return event_; }
+  int device() const { return device_; }
+#endif
+
+  double CpuElapsedUs(const Event& e) const;
+  double CudaElapsedUs(const Event& e) const;
+
+ private:
+  EventKind kind_;
+  std::string name_;
+  uint32_t thread_id_;
+  int64_t cpu_ns_;
+  bool has_cuda_;
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event_ = nullptr;
+  int device_ = -1;
+#endif
+};
+
+struct EventList {
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;
+  constexpr static size_t kEventSize = sizeof(Event);
+  constexpr static size_t kEventAlign = alignof(Event);
+  // The number of events that one 16 MB block can hold.
+  constexpr static size_t kNumBlock =
+      kEventBlockSize /
+      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
+
+  template <typename... Args>
+  void Record(Args&&... args) {
+    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
+      event_blocks.emplace_front();
+      event_blocks.front().reserve(kNumBlock);
+    }
+    event_blocks.front().emplace_back(std::forward<Args>(args)...);
+  }
+
+  std::vector<Event> Reduce() {
+    std::vector<Event> result;
+    for (auto& block : event_blocks) {
+      result.insert(result.begin(), std::make_move_iterator(block.begin()),
+                    std::make_move_iterator(block.end()));
+    }
+    event_blocks.clear();
+    return result;
+  }
+
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+
+enum ProfilerState {
+  kDisabled,  // disabled state
+  kCPU,       // CPU profiling state
+  kCUDA,      // GPU profiling state
+};
+
+void Mark(const std::string& name, DeviceContext* dev_ctx);
+
+struct RecordEvent {
+  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
+
+  ~RecordEvent();
+
+  // The device context is used by the Event to get the current cuda stream.
+  DeviceContext* dev_ctx_;
+};
+
+// Enable the profiling function.
+void EnableProfiler(ProfilerState state);
+
+// Return the event lists of all threads. Assuming the returned value is
+// called event_lists, event_lists[i][j] is the j-th Event of the i-th thread.
+std::vector<std::vector<Event>> DisableProfiler();
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..47cf7be146121c54300df928fe329268ed975373
--- /dev/null
+++ b/paddle/platform/profiler_test.cc
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/profiler.h"
+#include "gtest/gtest.h"
+
+TEST(Event, CpuElapsedTime) {
+  using paddle::platform::Event;
+  using paddle::platform::EventKind;
+
+  Event start_event(EventKind::kPushRange, "test", 0, nullptr);
+  EXPECT_FALSE(start_event.has_cuda());
+  int counter = 0;
+  while (counter != 1000) {
+    counter++;
+  }
+  Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
+  EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(Event, CudaElapsedTime) {
+  using paddle::platform::DeviceContext;
+  using paddle::platform::CUDADeviceContext;
+  using paddle::platform::CUDAPlace;
+  using paddle::platform::Event;
+  using paddle::platform::EventKind;
+
+  DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
+  Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
+  EXPECT_TRUE(start_event.has_cuda());
+  int counter = 0;
+  while (counter != 1000) {
+    counter++;
+  }
+  Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
+  EXPECT_GT(start_event.CudaElapsedUs(stop_event), 0);
+}
+#endif
+
+TEST(RecordEvent, RecordEvent) {
+  using paddle::platform::DeviceContext;
+  using paddle::platform::Event;
+  using paddle::platform::EventKind;
+  using paddle::platform::RecordEvent;
+  using paddle::platform::ProfilerState;
+
+  ProfilerState state = ProfilerState::kCPU;
+  DeviceContext* dev_ctx = nullptr;
+#ifdef PADDLE_WITH_CUDA
+  using paddle::platform::CUDADeviceContext;
+  using paddle::platform::CUDAPlace;
+  state = ProfilerState::kCUDA;
+  dev_ctx =
+      new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
+#endif
+  EnableProfiler(state);
+
+  for (int i = 1; i < 5; ++i) {
+    std::string name = "op_" + std::to_string(i);
+    RecordEvent record_event(name, dev_ctx);
+    int counter = 1;
+    while (counter != i * 1000) counter++;
+  }
+  std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler();
+  int cuda_startup_count = 0;
+  int start_profiler_count = 0;
+  int stop_profiler_count = 0;
+  for (size_t i = 0; i < events.size(); ++i) {
+    for (size_t j = 0; j < events[i].size(); ++j) {
+      if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;
+      if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
+      if (events[i][j].name() == "_stop_profiler_") ++stop_profiler_count;
+      // The ranges in this test do not nest, so each push event is
+      // immediately followed by its matching pop event.
+      if (events[i][j].kind() == "push") {
+        EXPECT_EQ(events[i][j + 1].kind(), "pop");
+#ifdef PADDLE_WITH_CUDA
+        EXPECT_GT(events[i][j].CudaElapsedUs(events[i][j + 1]), 0);
+#else
+        EXPECT_GT(events[i][j].CpuElapsedUs(events[i][j + 1]), 0);
+#endif
+      }
+    }
+  }
+  EXPECT_EQ(cuda_startup_count % 5, 0);
+  EXPECT_EQ(start_profiler_count, 1);
+  EXPECT_EQ(stop_profiler_count, 1);
+}
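
Reviewer note, not part of the patch: a minimal sketch of how EnableProfiler, the scoped RecordEvent, and DisableProfiler compose, assuming CPU-only profiling. ProfiledRun and the "op_i" names are hypothetical; the event-walking loop mirrors profiler_test.cc.

#include <cstdio>
#include <string>
#include <vector>
#include "paddle/platform/profiler.h"

void ProfiledRun() {
  namespace plat = paddle::platform;
  // Start collecting events; no DeviceContext is needed in CPU mode.
  plat::EnableProfiler(plat::ProfilerState::kCPU);
  for (int i = 0; i < 4; ++i) {
    // The RecordEvent destructor records the matching pop at scope exit.
    plat::RecordEvent record_event("op_" + std::to_string(i), nullptr);
    // ... run the i-th op here ...
  }
  // event_lists[i][j] is the j-th Event of the i-th thread.
  std::vector<std::vector<plat::Event>> event_lists = plat::DisableProfiler();
  for (auto& events : event_lists) {
    for (size_t j = 0; j + 1 < events.size(); ++j) {
      // Ranges do not nest here, so each push is followed by its pop.
      if (events[j].kind() == "push") {
        std::printf("%s: %f us\n", events[j].name().c_str(),
                    events[j].CpuElapsedUs(events[j + 1]));
      }
    }
  }
}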
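Also not part of the patch: the kNumBlock expression in EventList rounds the event size up to its alignment and then divides it into the 16 MB block. Below is a standalone sketch of the same arithmetic; the 48-byte size and 8-byte alignment are made-up stand-ins for sizeof(Event) and alignof(Event).

#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t kEventSize = 48;  // stand-in for sizeof(Event)
  const std::size_t kEventAlign = 8;  // stand-in for alignof(Event)
  const std::size_t kEventBlockSize = 16 * 1024 * 1024;  // one 16 MB block
  // Round the event size up to a multiple of its alignment ...
  const std::size_t padded =
      (kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign;
  // ... and count how many padded slots fit into one block.
  std::printf("%zu events per block\n", kEventBlockSize / padded);  // 349525
  return 0;
}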