未验证 提交 09799566 编写于 作者: C chengduo 提交者: GitHub

Add memory profiler (#16137)

test=develop
上级 05993c3f
...@@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator) ...@@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator) cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator) cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
if (WITH_GPU) if (WITH_GPU)
......
...@@ -12,8 +12,6 @@ ...@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility> #include <utility>
...@@ -24,9 +22,11 @@ ...@@ -24,9 +22,11 @@
#endif #endif
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h" #include "paddle/fluid/string/split.h"
...@@ -329,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { ...@@ -329,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
} // namespace legacy } // namespace legacy
namespace allocation { namespace allocation {
LegacyMemMonitor GPUMemMonitor; LegacyMemMonitor GPUMemMonitor;
Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
return new Allocation(ptr, size, place_); auto *tmp_alloc = new Allocation(ptr, size, place_);
platform::MemEvenRecorder::Instance().PushMemRecord(
static_cast<void *>(tmp_alloc), place_, size);
return tmp_alloc;
} }
void LegacyAllocator::Free(Allocation *allocation) { void LegacyAllocator::Free(Allocation *allocation) {
boost::apply_visitor( boost::apply_visitor(
legacy::FreeVisitor(allocation->ptr(), allocation->size()), legacy::FreeVisitor(allocation->ptr(), allocation->size()),
allocation->place()); allocation->place());
platform::MemEvenRecorder::Instance().PopMemRecord(
static_cast<void *>(allocation), place_);
delete allocation; delete allocation;
} }
......
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
#include <deque> #include <deque>
#include <forward_list> #include <forward_list>
...@@ -30,6 +29,8 @@ limitations under the License. */ ...@@ -30,6 +29,8 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "google/protobuf/text_format.h" #include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
...@@ -317,6 +318,24 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -317,6 +318,24 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id, correlation_id, bytes}); stream_id, correlation_id, bytes});
} }
void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &alloc_in,
const std::string &free_in, int64_t thread_id) {
if (0 == start_ns || 0 == end_ns) {
VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced.";
return;
}
thread_local std::forward_list<MemInfoRecord> *local_mem_info_record =
nullptr;
if (local_mem_info_record == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
mem_info_record_.emplace_front();
local_mem_info_record = &mem_info_record_.front();
}
local_mem_info_record->emplace_front(MemInfoRecord{
start_ns, end_ns, bytes, place, thread_id, alloc_in, free_in});
}
void AddActiveKindRecords(const std::string &anno, uint64_t start_ns, void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, uint64_t end_ns, int64_t device_id,
int64_t thread_id, uint32_t correlation_id) { int64_t thread_id, uint32_t correlation_id) {
...@@ -409,6 +428,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -409,6 +428,7 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_.clear(); correlations_.clear();
for (auto &tmp : correlations_pairs) tmp.clear(); for (auto &tmp : correlations_pairs) tmp.clear();
for (auto &tmp : cpu_records_) tmp.clear(); for (auto &tmp : cpu_records_) tmp.clear();
for (auto &tmp : mem_info_record_) tmp.clear();
for (auto &tmp : active_kind_records_) tmp.clear(); for (auto &tmp : active_kind_records_) tmp.clear();
} }
...@@ -440,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -440,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer {
proto::Profile profile_pb; proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_); profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_); profile_pb.set_end_ns(end_ns_);
if (correlations_.empty()) if (correlations_.empty()) {
for (auto &tmp : correlations_pairs) for (auto &tmp : correlations_pairs) {
for (auto &pair : tmp) correlations_[pair.first] = pair.second; for (auto &pair : tmp) correlations_[pair.first] = pair.second;
}
}
for (const KernelRecord &r : kernel_records_) { for (const KernelRecord &r : kernel_records_) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel); event->set_type(proto::Event::GPUKernel);
...@@ -462,6 +485,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -462,6 +485,7 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find; VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
for (auto &tmp : cpu_records_) { for (auto &tmp : cpu_records_) {
for (const CPURecord &r : tmp) { for (const CPURecord &r : tmp) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
...@@ -473,6 +497,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -473,6 +497,7 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
} }
for (auto &tmp : active_kind_records_) { for (auto &tmp : active_kind_records_) {
for (const ActiveKindRecord &r : tmp) { for (const ActiveKindRecord &r : tmp) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
...@@ -510,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -510,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer {
event->mutable_memcopy()->set_bytes(r.bytes); event->mutable_memcopy()->set_bytes(r.bytes);
} }
VLOG(1) << "MemRecord event miss: " << miss << " find: " << find; VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
for (auto &tmp : mem_info_record_) {
for (const auto &r : tmp) {
auto *event = profile_pb.add_mem_events();
event->set_device_id(0);
if (platform::is_cpu_place(r.place)) {
event->set_place(proto::MemEvent::CPUPlace);
} else if (platform::is_gpu_place(r.place)) {
event->set_place(proto::MemEvent::CUDAPlace);
event->set_device_id(
boost::get<platform::CUDAPlace>(r.place).GetDeviceId());
} else if (platform::is_cuda_pinned_place(r.place)) {
event->set_place(proto::MemEvent::CUDAPinnedPlace);
} else {
PADDLE_THROW("The current place is not supported.");
}
event->set_alloc_in(r.alloc_in);
event->set_free_in(r.free_in);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_bytes(r.bytes);
event->set_thread_id(r.thread_id);
}
}
std::ofstream profile_f; std::ofstream profile_f;
profile_f.open(profile_path, profile_f.open(profile_path,
std::ios::out | std::ios::trunc | std::ios::binary); std::ios::out | std::ios::trunc | std::ios::binary);
...@@ -553,6 +603,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -553,6 +603,7 @@ class DeviceTracerImpl : public DeviceTracer {
std::forward_list<KernelRecord> kernel_records_; std::forward_list<KernelRecord> kernel_records_;
std::forward_list<MemRecord> mem_records_; std::forward_list<MemRecord> mem_records_;
std::forward_list<std::forward_list<CPURecord>> cpu_records_; std::forward_list<std::forward_list<CPURecord>> cpu_records_;
std::forward_list<std::forward_list<MemInfoRecord>> mem_info_record_;
std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_; std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>> std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
correlations_pairs; correlations_pairs;
...@@ -575,7 +626,7 @@ Event *CurAnnotation() { ...@@ -575,7 +626,7 @@ Event *CurAnnotation() {
return annotation_stack.back(); return annotation_stack.back();
} }
std::string CurAnnotationName() { std::string CurAnnotationName() {
if (annotation_stack.empty()) return ""; if (annotation_stack.empty()) return "Unknown";
return annotation_stack.back()->name(); return annotation_stack.back()->name();
} }
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler.pb.h"
...@@ -47,6 +48,7 @@ class DeviceTracer { ...@@ -47,6 +48,7 @@ class DeviceTracer {
int64_t stream_id; int64_t stream_id;
uint32_t correlation_id; uint32_t correlation_id;
}; };
struct CPURecord { struct CPURecord {
std::string name; std::string name;
uint64_t start_ns; uint64_t start_ns;
...@@ -54,6 +56,7 @@ class DeviceTracer { ...@@ -54,6 +56,7 @@ class DeviceTracer {
int64_t device_id; int64_t device_id;
int64_t thread_id; int64_t thread_id;
}; };
struct MemRecord { struct MemRecord {
std::string name; std::string name;
uint64_t start_ns; uint64_t start_ns;
...@@ -63,6 +66,17 @@ class DeviceTracer { ...@@ -63,6 +66,17 @@ class DeviceTracer {
uint32_t correlation_id; uint32_t correlation_id;
uint64_t bytes; uint64_t bytes;
}; };
// Lifetime record of a single memory allocation, as passed to
// AddMemInfoRecord. NOTE: the implementation fills this struct with a
// positional brace-initializer, so the field order below must not change.
struct MemInfoRecord {
uint64_t start_ns;  // timestamp (ns) taken when the allocation was recorded
uint64_t end_ns;  // timestamp (ns) taken when the allocation was released
size_t bytes;  // size of the allocation
Place place;  // place (CPU / CUDA / CUDA pinned) the memory belongs to
int64_t thread_id;  // id of the thread that reported the record
std::string alloc_in;  // annotation that was current at allocation time
std::string free_in;  // annotation that was current at free time
};
struct ActiveKindRecord { struct ActiveKindRecord {
std::string name; std::string name;
uint64_t start_ns; uint64_t start_ns;
...@@ -71,6 +85,7 @@ class DeviceTracer { ...@@ -71,6 +85,7 @@ class DeviceTracer {
int64_t thread_id; int64_t thread_id;
uint32_t correlation_id; uint32_t correlation_id;
}; };
virtual ~DeviceTracer() {} virtual ~DeviceTracer() {}
// Needs to be called once before use. // Needs to be called once before use.
virtual void Enable() = 0; virtual void Enable() = 0;
...@@ -97,6 +112,12 @@ class DeviceTracer { ...@@ -97,6 +112,12 @@ class DeviceTracer {
int64_t thread_id, int64_t thread_id,
uint32_t correlation_id) = 0; uint32_t correlation_id) = 0;
// Add the lifetime record of one memory allocation: `start_ns`/`end_ns` are
// the allocation and release timestamps, `bytes` and `place` describe the
// memory, and `alloc_in`/`free_in` name the annotations under which it was
// allocated and freed. `thread_id` identifies the reporting thread.
virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns,
size_t bytes, const Place& place,
const std::string& alloc_in,
const std::string& free_in,
int64_t thread_id) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability. // added before for human readability.
virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end, virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
......
...@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
#endif #endif
#include "paddle/fluid/platform/place.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -64,5 +66,36 @@ class Event { ...@@ -64,5 +66,36 @@ class Event {
#endif #endif
#endif #endif
}; };
// Value object describing one memory event: its type, the time range it
// covers, the number of bytes involved, where the memory lives, which thread
// recorded it and the annotation it is attributed to. All state is set by the
// constructor and is only exposed through const accessors.
class MemEvent {
 public:
  MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes,
           Place place, int64_t thread_id, const std::string& annotation)
      : type_(type),
        start_ns_(start_ns),
        end_ns_(end_ns),
        bytes_(bytes),
        place_(place),
        thread_id_(thread_id),
        annotation_(annotation) {}

  // Kind of event, as supplied by the caller.
  const EventType& type() const { return type_; }

  // Time range of the event, in nanoseconds.
  uint64_t start_ns() const { return start_ns_; }
  uint64_t end_ns() const { return end_ns_; }

  // Size and location of the memory the event refers to.
  size_t bytes() const { return bytes_; }
  Place place() const { return place_; }

  // Recording thread and the annotation attached to the event.
  int64_t thread_id() const { return thread_id_; }
  const std::string& annotation() const { return annotation_; }

 private:
  // Declaration order matches the constructor's initializer list.
  EventType type_;
  uint64_t start_ns_ = 0;
  uint64_t end_ns_ = 0;
  size_t bytes_;
  Place place_;
  int64_t thread_id_;
  std::string annotation_;
};
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <algorithm> #include <algorithm>
#include <iomanip> #include <iomanip>
#include <limits> #include <limits>
...@@ -21,6 +20,8 @@ limitations under the License. */ ...@@ -21,6 +20,8 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <random> #include <random>
#include <string> #include <string>
#include <vector>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
...@@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); ...@@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
struct EventList;
static int64_t profiler_lister_id = 0; static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false; static bool should_send_profile_state = false;
std::mutex profiler_mu; std::mutex profiler_mu;
...@@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0; ...@@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0;
// The global mutex // The global mutex
static std::mutex g_all_event_lists_mutex; static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads // The total event lists of all threads
static std::list<std::shared_ptr<EventList>> g_all_event_lists; static std::list<std::shared_ptr<EventList<Event>>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread // The thread local event list only can be accessed by the specific thread
static thread_local std::shared_ptr<EventList> g_event_list; static thread_local std::shared_ptr<EventList<Event>> g_event_list;
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(Event);
constexpr static size_t kEventAlign = alignof(Event);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
Event* Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
return &event_blocks.front().back();
}
std::vector<Event> Reduce() {
std::vector<Event> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); } static std::list<std::shared_ptr<EventList<MemEvent>>> g_all_mem_event_lists;
static thread_local std::shared_ptr<EventList<MemEvent>> g_mem_event_list;
std::forward_list<std::vector<Event>> event_blocks; static std::mutex g_all_mem_event_lists_mutex;
}; static thread_local int32_t g_mem_thread_id;
static uint32_t g_mem_next_thread_id = 0;
inline uint64_t GetTimeInNsec() { inline uint64_t GetTimeInNsec() {
using clock = std::conditional<std::chrono::high_resolution_clock::is_steady, using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
...@@ -105,13 +76,13 @@ Event::Event(EventType type, std::string name, uint32_t thread_id) ...@@ -105,13 +76,13 @@ Event::Event(EventType type, std::string name, uint32_t thread_id)
cpu_ns_ = GetTimeInNsec(); cpu_ns_ = GetTimeInNsec();
} }
const EventType& Event::type() const { return type_; } const EventType &Event::type() const { return type_; }
double Event::CpuElapsedMs(const Event& e) const { double Event::CpuElapsedMs(const Event &e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000000.0); return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
} }
double Event::CudaElapsedMs(const Event& e) const { double Event::CudaElapsedMs(const Event &e) const {
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
return gpu_ns_ / 1000000.0; return gpu_ns_ / 1000000.0;
#else #else
...@@ -120,10 +91,32 @@ double Event::CudaElapsedMs(const Event& e) const { ...@@ -120,10 +91,32 @@ double Event::CudaElapsedMs(const Event& e) const {
#endif #endif
} }
inline EventList& GetEventList() { inline EventList<MemEvent> &GetMemEventList() {
if (!g_mem_event_list) {
g_mem_event_list = std::make_shared<EventList<MemEvent>>();
std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
g_mem_thread_id = g_mem_next_thread_id++;
g_all_mem_event_lists.emplace_front(g_mem_event_list);
}
return *g_mem_event_list;
}
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &annotation) {
GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes,
place, g_mem_thread_id, annotation);
}
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &annotation) {
GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place,
g_mem_thread_id, annotation);
}
inline EventList<Event> &GetEventList() {
if (!g_event_list) { if (!g_event_list) {
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
g_event_list = std::make_shared<EventList>(); g_event_list = std::make_shared<EventList<Event>>();
g_thread_id = g_next_thread_id++; g_thread_id = g_next_thread_id++;
g_all_event_lists.emplace_front(g_event_list); g_all_event_lists.emplace_front(g_event_list);
RecoreCurThreadId(g_thread_id); RecoreCurThreadId(g_thread_id);
...@@ -131,26 +124,26 @@ inline EventList& GetEventList() { ...@@ -131,26 +124,26 @@ inline EventList& GetEventList() {
return *g_event_list; return *g_event_list;
} }
void Mark(const std::string& name) { void Mark(const std::string &name) {
GetEventList().Record(EventType::kMark, name, g_thread_id); GetEventList().Record(EventType::kMark, name, g_thread_id);
} }
Event* PushEvent(const std::string& name) { Event *PushEvent(const std::string &name) {
return GetEventList().Record(EventType::kPushRange, name, g_thread_id); return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
} }
void PopEvent(const std::string& name) { void PopEvent(const std::string &name) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id); GetEventList().Record(EventType::kPopRange, name, g_thread_id);
} }
RecordEvent::RecordEvent(const std::string& name) RecordEvent::RecordEvent(const std::string &name)
: is_enabled_(false), start_ns_(PosixInNsec()) { : is_enabled_(false), start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
// lock is not needed, the code below is thread-safe // lock is not needed, the code below is thread-safe
is_enabled_ = true; is_enabled_ = true;
name_ = name; name_ = name;
Event* e = PushEvent(name_); Event *e = PushEvent(name_);
// Maybe need the same push/pop behavior. // Maybe need the same push/pop behavior.
SetCurAnnotation(e); SetCurAnnotation(e);
} }
...@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name) ...@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name)
RecordEvent::~RecordEvent() { RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled || !is_enabled_) return; if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
// lock is not needed, the code below is thread-safe // lock is not needed, the code below is thread-safe
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(), tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
BlockDepth(), g_thread_id); BlockDepth(), g_thread_id);
...@@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() { ...@@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() {
PopEvent(name_); PopEvent(name_);
} }
RecordRPCEvent::RecordRPCEvent(const std::string& name) { MemEvenRecorder MemEvenRecorder::recorder;
// Starts tracking the allocation identified by `ptr` on `place`.
//
// Does nothing while the profiler is disabled. Otherwise a RecordMemEvent is
// created for the allocation and stored under (place, ptr) until
// PopMemRecord() or Flush() removes it. Pushing an address that is still
// tracked for the same place is a usage error and fails the enforce check.
void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
                                    size_t size) {
  if (g_state == ProfilerState::kDisabled) return;
  std::lock_guard<std::mutex> guard(mtx_);
  auto &events = address_memevent_[place];
  // Checked before the RecordMemEvent is constructed so that a duplicate push
  // does not emit a spurious push event.
  PADDLE_ENFORCE(events.count(ptr) == 0,
                 "The address is already tracked by MemEvenRecorder; it must "
                 "be popped before it can be pushed again.");
  events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
                          new MemEvenRecorder::RecordMemEvent(place, size)));
}
// Stops tracking the allocation identified by `ptr` on `place`.
//
// Does nothing while the profiler is disabled. Erasing the entry destroys its
// RecordMemEvent. An address that was never pushed (or was already removed)
// is silently ignored.
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
  if (g_state == ProfilerState::kDisabled) return;
  std::lock_guard<std::mutex> guard(mtx_);
  // Look the place up with find() rather than operator[] so that popping for
  // a place that has no records does not insert an empty table for it.
  auto place_iter = address_memevent_.find(place);
  if (place_iter == address_memevent_.end()) return;
  auto &events = place_iter->second;
  auto iter = events.find(ptr);
  // The ptr may not be in address_memevent_.
  if (iter != events.end()) {
    events.erase(iter);
  }
}
// Discards every tracked record for every place while holding mtx_.
void MemEvenRecorder::Flush() {
  std::lock_guard<std::mutex> lock(mtx_);
  address_memevent_.clear();
}
// Captures the start of an allocation's lifetime: remembers its place and
// size, timestamps it, records the annotation that is current at allocation
// time, and emits the matching push event.
MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place &place,
                                                size_t bytes)
    : place_(place),
      bytes_(bytes),
      start_ns_(PosixInNsec()),
      alloc_in_(CurAnnotationName()) {
  // end_ns_ is not set by the initializer list above, yet it is passed to
  // PushMemEvent below. Give it a defined value so the push event never
  // carries an indeterminate end time; the destructor stores the real one.
  end_ns_ = 0;
  PushMemEvent(start_ns_, end_ns_, bytes_, place_, alloc_in_);
}
// Closes the allocation's lifetime: timestamps the release, reports the full
// record to the device tracer when one is available, and emits the matching
// pop event attributed to the annotation that is current at free time.
MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
  DeviceTracer *device_tracer = GetDeviceTracer();
  end_ns_ = PosixInNsec();
  const std::string free_in = CurAnnotationName();
  if (device_tracer != nullptr) {
    device_tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_,
                                    alloc_in_, free_in, g_mem_thread_id);
  }
  PopMemEvent(start_ns_, end_ns_, bytes_, place_, free_in);
}
RecordRPCEvent::RecordRPCEvent(const std::string &name) {
if (FLAGS_enable_rpc_profiler) { if (FLAGS_enable_rpc_profiler) {
event_.reset(new platform::RecordEvent(name)); event_.reset(new platform::RecordEvent(name));
} }
...@@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id) ...@@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id)
RecordBlock::~RecordBlock() { RecordBlock::~RecordBlock() {
// lock is not needed, the code below is thread-safe // lock is not needed, the code below is thread-safe
if (g_state == ProfilerState::kDisabled || !is_enabled_) return; if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
// We try to put all blocks at the same nested depth in the // We try to put all blocks at the same nested depth in the
// same timeline lane. and distinguish the using thread_id. // same timeline lane. and distinguish the using thread_id.
...@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) { ...@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) {
void ResetProfiler() { void ResetProfiler() {
SynchronizeAllDevice(); SynchronizeAllDevice();
GetDeviceTracer()->Reset(); GetDeviceTracer()->Reset();
MemEvenRecorder::Instance().Flush();
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) { ++it) {
(*it)->Clear(); (*it)->Clear();
} }
for (auto it = g_all_mem_event_lists.begin();
it != g_all_mem_event_lists.end(); ++it) {
(*it)->Clear();
}
} }
std::vector<std::vector<Event>> GetAllEvents() { std::vector<std::vector<Event>> GetAllEvents() {
...@@ -249,6 +296,15 @@ std::vector<std::vector<Event>> GetAllEvents() { ...@@ -249,6 +296,15 @@ std::vector<std::vector<Event>> GetAllEvents() {
return result; return result;
} }
std::vector<std::vector<MemEvent>> GetMemEvents() {
std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
std::vector<std::vector<MemEvent>> result;
for (auto &it : g_all_mem_event_lists) {
result.emplace_back((*it).Reduce());
}
return result;
}
// The information of each event given in the profiling report // The information of each event given in the profiling report
struct EventItem { struct EventItem {
std::string name; std::string name;
...@@ -263,8 +319,8 @@ struct EventItem { ...@@ -263,8 +319,8 @@ struct EventItem {
}; };
// Print results // Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
const std::string& sorted_domain, const size_t name_width, const std::string &sorted_domain, const size_t name_width,
const size_t data_width, bool merge_thread) { const size_t data_width, bool merge_thread) {
// Output header information // Output header information
std::cout << "\n------------------------->" std::cout << "\n------------------------->"
...@@ -302,7 +358,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -302,7 +358,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
<< std::setw(data_width) << "Ratio." << std::endl; << std::setw(data_width) << "Ratio." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) { for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) { for (size_t j = 0; j < events_table[i].size(); ++j) {
const EventItem& event_item = events_table[i][j]; const EventItem &event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls << std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time; << std::setw(data_width) << event_item.total_time;
...@@ -326,54 +382,54 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -326,54 +382,54 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
} }
// Parse the event list and output the profiling report // Parse the event list and output the profiling report
void ParseEvents(const std::vector<std::vector<Event>>& events, void ParseEvents(const std::vector<std::vector<Event>> &events,
bool merge_thread, bool merge_thread,
EventSortingKey sorted_by = EventSortingKey::kDefault) { EventSortingKey sorted_by = EventSortingKey::kDefault) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
if (merge_thread && events.size() < 2) return; if (merge_thread && events.size() < 2) return;
std::string sorted_domain; std::string sorted_domain;
std::function<bool(const EventItem&, const EventItem&)> sorted_func; std::function<bool(const EventItem &, const EventItem &)> sorted_func;
switch (sorted_by) { switch (sorted_by) {
case EventSortingKey::kCalls: case EventSortingKey::kCalls:
sorted_domain = "number of calls"; sorted_domain = "number of calls";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.calls > b.calls; return a.calls > b.calls;
}; };
break; break;
case EventSortingKey::kTotal: case EventSortingKey::kTotal:
sorted_domain = "total time"; sorted_domain = "total time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.total_time > b.total_time; return a.total_time > b.total_time;
}; };
break; break;
case EventSortingKey::kMin: case EventSortingKey::kMin:
sorted_domain = "minimum time"; sorted_domain = "minimum time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.min_time > b.min_time; return a.min_time > b.min_time;
}; };
break; break;
case EventSortingKey::kMax: case EventSortingKey::kMax:
sorted_domain = "maximum time"; sorted_domain = "maximum time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.max_time > b.max_time; return a.max_time > b.max_time;
}; };
break; break;
case EventSortingKey::kAve: case EventSortingKey::kAve:
sorted_domain = "average time"; sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.ave_time > b.ave_time; return a.ave_time > b.ave_time;
}; };
break; break;
case EventSortingKey::kGPUTime: case EventSortingKey::kGPUTime:
sorted_domain = "average time"; sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.gpu_time > b.gpu_time; return a.gpu_time > b.gpu_time;
}; };
break; break;
case EventSortingKey::kCPUTime: case EventSortingKey::kCPUTime:
sorted_domain = "average time"; sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.cpu_time > b.cpu_time; return a.cpu_time > b.cpu_time;
}; };
break; break;
...@@ -381,7 +437,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -381,7 +437,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
sorted_domain = "event first end time"; sorted_domain = "event first end time";
} }
const std::vector<std::vector<Event>>* analyze_events; const std::vector<std::vector<Event>> *analyze_events;
std::vector<std::vector<Event>> merged_events_list; std::vector<std::vector<Event>> merged_events_list;
if (merge_thread) { if (merge_thread) {
std::vector<Event> merged_events; std::vector<Event> merged_events;
...@@ -469,7 +525,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -469,7 +525,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
} }
} }
// average time // average time
for (auto& item : event_items) { for (auto &item : event_items) {
item.ave_time = item.total_time / item.calls; item.ave_time = item.total_time / item.calls;
item.ratio = item.total_time / total; item.ratio = item.total_time / total;
} }
...@@ -493,15 +549,77 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -493,15 +549,77 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
merge_thread); merge_thread);
} }
// Running totals for one (place, annotation) entry of the memory report.
// ParseMemEvents fills these in and PrintMemProfiler prints them; the sizes
// are byte counts (the printer converts them to MB).
struct MemoryProfierReport {
  // Number of allocation events seen.
  size_t alloc_times = 0;
  // Sum of the byte counts of those allocation events.
  size_t alloc_size = 0;
  // Number of free events seen.
  size_t free_times = 0;
  // Sum of the byte counts of those free events.
  size_t free_size = 0;
};
// Print results
void PrintMemProfiler(
const std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
&annotation_report,
const size_t name_width, const size_t data_width) {
// Output header information
std::cout << "\n------------------------->"
<< " Memory Profiling Report "
<< "<-------------------------\n\n";
// Output events table
std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Alloc Calls" << std::setw(data_width) << "Size(MB)"
<< std::setw(data_width) << "Free Calls" << std::setw(data_width)
<< "Size(MB)" << std::endl;
for (auto &tmp : annotation_report) {
for (auto &e : tmp.second) {
auto event_name = string::Sprintf("%s:%s", tmp.first, e.first);
std::cout << std::setw(name_width) << event_name;
std::cout << std::setw(data_width) << e.second.alloc_times;
std::cout << std::setw(data_width)
<< e.second.alloc_size / (1024.0 * 1024.0);
std::cout << std::setw(data_width) << e.second.free_times;
std::cout << std::setw(data_width)
<< e.second.free_size / (1024.0 * 1024.0) << std::endl;
}
}
std::cout << std::endl;
}
// parse memory events
void ParseMemEvents(const std::vector<std::vector<MemEvent>> &events) {
if (g_state == ProfilerState::kDisabled) return;
// place, annotation, alloc times, alloc size
std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
annotation_report;
for (auto &tmp : events) {
for (auto &e : tmp) {
if (e.type() == EventType::kPushRange) {
annotation_report[e.place()][e.annotation()].alloc_times += 1;
annotation_report[e.place()][e.annotation()].alloc_size += e.bytes();
} else if (e.type() == EventType::kPopRange) {
annotation_report[e.place()][e.annotation()].free_times += 1;
annotation_report[e.place()][e.annotation()].free_size += e.bytes();
}
}
}
PrintMemProfiler(annotation_report, 55, 18);
}
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path) { const std::string &profile_path) {
SynchronizeAllDevice(); SynchronizeAllDevice();
MemEvenRecorder::Instance().Flush();
std::lock_guard<std::mutex> l(profiler_mu); std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
// Mark the profiling stop. // Mark the profiling stop.
Mark("_stop_profiler_"); Mark("_stop_profiler_");
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer->IsEnabled()) { if (tracer->IsEnabled()) {
tracer->Disable(); tracer->Disable();
tracer->GenProfile(profile_path); tracer->GenProfile(profile_path);
...@@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key, ...@@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key,
std::vector<std::vector<Event>> all_events = GetAllEvents(); std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, true, sorted_key); ParseEvents(all_events, true, sorted_key);
ParseEvents(all_events, false, sorted_key); ParseEvents(all_events, false, sorted_key);
if (VLOG_IS_ON(5)) {
std::vector<std::vector<MemEvent>> all_mem_events = GetMemEvents();
ParseMemEvents(all_mem_events);
}
ResetProfiler(); ResetProfiler();
g_state = ProfilerState::kDisabled; g_state = ProfilerState::kDisabled;
should_send_profile_state = true; should_send_profile_state = true;
......
...@@ -15,10 +15,17 @@ limitations under the License. */ ...@@ -15,10 +15,17 @@ limitations under the License. */
#pragma once #pragma once
#include <forward_list> #include <forward_list>
#include <list> #include <list>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#endif #endif
...@@ -34,8 +41,41 @@ enum ProfilerState { ...@@ -34,8 +41,41 @@ enum ProfilerState {
void Mark(const std::string& name); void Mark(const std::string& name);
Event* PushEvent(const std::string& name); void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place);
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place);
// Process-wide recorder for memory events. The only instance is the static
// member `recorder`, reached through Instance(); the constructor is private.
// Only declarations are visible here, so the notes on the member functions
// below are assumptions to confirm against the implementation file.
struct MemEvenRecorder {
public:
// Presumably registers an allocation of `size` bytes at `ptr` on `place`
// -- TODO confirm in the .cc.
void PushMemRecord(const void* ptr, const Place& place, size_t size);
// Presumably marks the allocation at `ptr` on `place` as freed
// -- TODO confirm in the .cc.
void PopMemRecord(const void* ptr, const Place& place);
// Called by DisableProfiler before the profile is generated; presumably
// drops/finishes all still-open records -- TODO confirm in the .cc.
void Flush();
// Returns the single static recorder instance.
static MemEvenRecorder& Instance() { return recorder; }
private:
// Per-allocation record. Constructor and destructor are defined elsewhere;
// NOTE(review): looks like the lifetime of this object brackets the
// allocation (start at construction, end at destruction) -- confirm.
struct RecordMemEvent {
RecordMemEvent(const Place& place, size_t bytes);
~RecordMemEvent();
Place place_;
size_t bytes_;
uint64_t start_ns_;
uint64_t end_ns_;
std::string alloc_in_;
std::string free_in_;
};
static MemEvenRecorder recorder;
// place -> (address -> owned record for that address).
std::map<Place,
std::unordered_map<const void*, std::unique_ptr<RecordMemEvent>>>
address_memevent_;
// Presumably guards address_memevent_ -- verify against the .cc.
std::mutex mtx_;
MemEvenRecorder() {}
DISABLE_COPY_AND_ASSIGN(MemEvenRecorder);
};
Event* PushEvent(const std::string& name);
void PopEvent(const std::string& name); void PopEvent(const std::string& name);
struct RecordEvent { struct RecordEvent {
...@@ -87,6 +127,41 @@ enum EventSortingKey { ...@@ -87,6 +127,41 @@ enum EventSortingKey {
kGPUTime kGPUTime
}; };
// Append-only event storage that grows in fixed-capacity blocks.
//
// Every block is a std::vector<T> whose capacity is reserved up front and
// never exceeded, and blocks live in a std::forward_list, so a pointer
// returned by Record() stays valid until Reduce() or Clear() is called.
// The newest block is always at the front of `event_blocks`.
template <typename T>
struct EventList {
  constexpr static size_t kMB = 1024 * 1024;
  // Target size of one block in bytes.
  constexpr static size_t kEventBlockSize = 16 * kMB;
  constexpr static size_t kEventSize = sizeof(T);
  constexpr static size_t kEventAlign = alignof(T);
  // Number of events per block: block size divided by the size of T rounded
  // up to a multiple of its alignment.
  constexpr static size_t kNumBlock =
      kEventBlockSize /
      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);

  // Constructs a new event in place from `args` and returns a pointer to it.
  // A fresh block is started when there is none yet or the current one is
  // full, so existing events are never relocated.
  template <typename... Args>
  T* Record(Args&&... args) {
    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
      event_blocks.emplace_front();
      event_blocks.front().reserve(kNumBlock);
    }
    event_blocks.front().emplace_back(std::forward<Args>(args)...);
    return &event_blocks.front().back();
  }

  // Moves all recorded events into one vector, in the order they were
  // recorded, and leaves the list empty.
  std::vector<T> Reduce() {
    size_t total = 0;
    for (const auto& block : event_blocks) {
      total += block.size();
    }

    std::vector<T> result;
    result.reserve(total);

    // Blocks are stored newest-first. Reversing the list lets each block be
    // appended at the end, instead of inserting at the front and shifting
    // everything already collected once per block.
    event_blocks.reverse();
    for (auto& block : event_blocks) {
      result.insert(result.end(), std::make_move_iterator(block.begin()),
                    std::make_move_iterator(block.end()));
    }
    event_blocks.clear();
    return result;
  }

  // Discards all recorded events.
  void Clear() { event_blocks.clear(); }

  std::forward_list<std::vector<T>> event_blocks;
};
// Enable the profiling function. // Enable the profiling function.
void EnableProfiler(ProfilerState state); void EnableProfiler(ProfilerState state);
......
...@@ -34,8 +34,25 @@ message Event { ...@@ -34,8 +34,25 @@ message Event {
optional string detail_info = 9; optional string detail_info = 9;
} }
// One memory allocation, described by a time interval and a size.
// The timeline tool counts `bytes` as live from `start_ns` until `end_ns`.
message MemEvent {
// Kind of device the memory lives on.
enum Place {
CUDAPlace = 0;
CPUPlace = 1;
CUDAPinnedPlace = 2;
}
// Timestamp at which the timeline tool adds `bytes` to the live total.
optional uint64 start_ns = 1;
// Timestamp at which the timeline tool subtracts `bytes` again.
optional uint64 end_ns = 2;
// Size of the allocation.
optional uint64 bytes = 3;
optional Place place = 4;
optional uint64 thread_id = 5;
// Together with `place`, selects the per-device track in the timeline.
optional uint32 device_id = 6;
// NOTE(review): presumably the annotation (e.g. operator/event name) active
// when the memory was allocated -- confirm against the producer.
optional string alloc_in = 7;
// NOTE(review): presumably the annotation active when the memory was freed
// -- confirm against the producer.
optional string free_in = 8;
}
message Profile { message Profile {
repeated Event events = 1; repeated Event events = 1;
optional uint64 start_ns = 2; optional uint64 start_ns = 2;
optional uint64 end_ns = 3; optional uint64 end_ns = 3;
repeated MemEvent mem_events = 4;
} }
\ No newline at end of file
...@@ -95,6 +95,22 @@ class _ChromeTraceFormatter(object): ...@@ -95,6 +95,22 @@ class _ChromeTraceFormatter(object):
event['args'] = args event['args'] = args
self._events.append(event) self._events.append(event)
def emit_counter(self, category, name, pid, timestamp, counter, value, tid=0):
    """Emits a record for a single counter.

    Builds a 'C' (counter) event through ``self._create_event``, attaches
    ``{counter: value}`` as its ``args`` and appends it to ``self._events``.

    Args:
        category: The event category as string
        name: The event name as string
        pid: Identifier of the process generating this event as integer
        timestamp: The timestamps of this event as long integer
        counter: Name of the counter as string
        value: Value of the counter as integer
        tid: Thread id of the allocation as integer. Optional; defaults to
            0, which is the value that was previously always used.
    """
    event = self._create_event('C', category, name, pid, tid, timestamp)
    event['args'] = {counter: value}
    self._events.append(event)
def format_to_string(self, pretty=False): def format_to_string(self, pretty=False):
"""Formats the chrome trace to a string. """Formats the chrome trace to a string.
...@@ -117,6 +133,7 @@ class Timeline(object): ...@@ -117,6 +133,7 @@ class Timeline(object):
self._profile_dict = profile_dict self._profile_dict = profile_dict
self._pid = 0 self._pid = 0
self._devices = dict() self._devices = dict()
self._mem_devices = dict()
self._chrome_trace = _ChromeTraceFormatter() self._chrome_trace = _ChromeTraceFormatter()
def _allocate_pid(self): def _allocate_pid(self):
...@@ -143,6 +160,45 @@ class Timeline(object): ...@@ -143,6 +160,45 @@ class Timeline(object):
self._devices[(k, event.device_id, "GPUKernel")] = pid self._devices[(k, event.device_id, "GPUKernel")] = pid
self._chrome_trace.emit_pid("%s:gpu:%d" % self._chrome_trace.emit_pid("%s:gpu:%d" %
(k, event.device_id), pid) (k, event.device_id), pid)
for mevent in profile_pb.mem_events:
if mevent.place == profiler_pb2.MemEvent.CUDAPlace:
if (k, mevent.device_id, "GPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id, "GPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:gpu:%d" % (k, mevent.device_id),
pid)
elif mevent.place == profiler_pb2.MemEvent.CPUPlace:
if (k, mevent.device_id, "CPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id, "CPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cpu:%d" % (k, mevent.device_id),
pid)
elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace:
if (k, mevent.device_id, "CUDAPinnedPlace"
) not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id,
"CUDAPinnedPlace")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" %
(k, mevent.device_id), pid)
if (k, 0, "CPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "CPU")] = pid
self._chrome_trace.emit_pid("memory usage on %s:cpu:%d" %
(k, 0), pid)
if (k, 0, "GPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "GPU")] = pid
self._chrome_trace.emit_pid("memory usage on %s:gpu:%d" %
(k, 0), pid)
if (k, 0, "CUDAPinnedPlace") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)
def _allocate_events(self): def _allocate_events(self):
for k, profile_pb in six.iteritems(self._profile_dict): for k, profile_pb in six.iteritems(self._profile_dict):
...@@ -163,9 +219,57 @@ class Timeline(object): ...@@ -163,9 +219,57 @@ class Timeline(object):
event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid, event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
event.sub_device_id, 'Op', event.name, args) event.sub_device_id, 'Op', event.name, args)
def _allocate_memory_event(self):
    """Emits counter events that track live memory for each memory track.

    Every memory event contributes ``+bytes`` at ``start_ns`` and ``-bytes``
    at ``end_ns``. Changes are accumulated separately for each pid (one pid
    per profile/device/place, looked up in ``self._mem_devices``), so each
    "memory usage on ..." track shows only the memory of its own device
    rather than a total mixed across all devices. Changes that share a
    timestamp on the same pid are merged into a single counter sample.

    Raises:
        KeyError: if an event's (profile key, device_id, place) has no pid in
            ``self._mem_devices``; this includes events whose place is not
            one of the known places (mapped to "UnDefine").
    """
    place_to_str = {
        profiler_pb2.MemEvent.CPUPlace: "CPU",
        profiler_pb2.MemEvent.CUDAPlace: "GPU",
        profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
    }
    for k, profile_pb in six.iteritems(self._profile_dict):
        # (timestamp, pid) -> net change in live bytes at that instant.
        net_change = {}
        for mevent in profile_pb.mem_events:
            place = place_to_str.get(mevent.place, "UnDefine")
            pid = self._mem_devices[(k, mevent.device_id, place)]
            for stamp, delta in ((mevent.start_ns, mevent.bytes),
                                 (mevent.end_ns, -mevent.bytes)):
                key = (stamp, pid)
                net_change[key] = net_change.get(key, 0) + delta

        # Walk the samples in time order, keeping one running total per pid.
        live_bytes = {}
        for stamp, pid in sorted(net_change):
            live_bytes[pid] = live_bytes.get(pid, 0) + net_change[(stamp, pid)]
            self._chrome_trace.emit_counter("Memory", "Memory", pid, stamp, 0,
                                            live_bytes[pid])
def generate_chrome_trace(self): def generate_chrome_trace(self):
self._allocate_pids() self._allocate_pids()
self._allocate_events() self._allocate_events()
self._allocate_memory_event()
return self._chrome_trace.format_to_string() return self._chrome_trace.format_to_string()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册