未验证 提交 8dd0a3b9 编写于 作者: C chenjian 提交者: GitHub

record memory and op supplement info (#43550)

* record memory and op supplement info

* update

* update

* fix a bug

* fix memory recording

* fix a bug

* update

* update

* fix a bug

* update

* fix a bug

* fix a bug

* fix a bug

* Revert "fix a bug"

This reverts commit c1d4df52762ba9ae7c7e27cd2ba4fc3a7ed9c7a5.

* fix a bug

* fix format

* fix
上级 e64823c1
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_context.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
...@@ -558,6 +559,11 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { ...@@ -558,6 +559,11 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
op_with_kernel->Info().infer_shape_( op_with_kernel->Info().infer_shape_(
instr_node.InnerInferShapeContext().get()); instr_node.InnerInferShapeContext().get());
} }
infershape_event.End();
platform::RecordOpInfoSupplement(op->Type(),
op->Attrs(),
*(instr_node.InnerInferShapeContext()),
*(instr_node.InnerRuntimeContext()));
} }
} }
......
此差异已折叠。
cc_library( cc_library(
allocator allocator
SRCS allocator.cc SRCS allocator.cc
DEPS place stats) DEPS place stats profiler)
cc_library( cc_library(
cpu_allocator cpu_allocator
SRCS cpu_allocator.cc SRCS cpu_allocator.cc
...@@ -21,7 +21,7 @@ cc_library( ...@@ -21,7 +21,7 @@ cc_library(
cc_library( cc_library(
naive_best_fit_allocator naive_best_fit_allocator
SRCS naive_best_fit_allocator.cc SRCS naive_best_fit_allocator.cc
DEPS allocator buddy_allocator profiler) DEPS allocator buddy_allocator)
cc_test( cc_test(
naive_best_fit_allocator_test naive_best_fit_allocator_test
SRCS naive_best_fit_allocator_test.cc SRCS naive_best_fit_allocator_test.cc
......
...@@ -32,7 +32,8 @@ ...@@ -32,7 +32,8 @@
#endif #endif
PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_bool(
init_allocated_mem, false, init_allocated_mem,
false,
"It is a mistake that the values of the memory allocated by " "It is a mistake that the values of the memory allocated by "
"BuddyAllocator are always zeroed in some op's implementation. " "BuddyAllocator are always zeroed in some op's implementation. "
"To find this error in time, we use init_allocated_mem to indicate " "To find this error in time, we use init_allocated_mem to indicate "
...@@ -77,7 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() { ...@@ -77,7 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
std::call_once(init_flag, []() { std::call_once(init_flag, []() {
a = new detail::BuddyAllocator( a = new detail::BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator), std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize()); platform::CpuMinChunkSize(),
platform::CpuMaxChunkSize());
}); });
return a; return a;
...@@ -95,7 +97,8 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) { ...@@ -95,7 +97,8 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
} }
template <> template <>
void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p, void Free<platform::CPUPlace>(const platform::CPUPlace &place,
void *p,
size_t size) { size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p); GetCPUBuddyAllocator()->Free(p);
...@@ -125,7 +128,8 @@ void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) { ...@@ -125,7 +128,8 @@ void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) {
return p; return p;
} }
template <> template <>
void Free<platform::IPUPlace>(const platform::IPUPlace &place, void *p, void Free<platform::IPUPlace>(const platform::IPUPlace &place,
void *p,
size_t size) { size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p); GetCPUBuddyAllocator()->Free(p);
...@@ -154,7 +158,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) { ...@@ -154,7 +158,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
ret = xpu_malloc(reinterpret_cast<void **>(&p), size); ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
} }
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS, ret,
XPU_SUCCESS,
platform::errors::External( platform::errors::External(
"XPU API return wrong value[%d], no enough memory", ret)); "XPU API return wrong value[%d], no enough memory", ret));
if (FLAGS_init_allocated_mem) { if (FLAGS_init_allocated_mem) {
...@@ -171,7 +176,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) { ...@@ -171,7 +176,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
} }
template <> template <>
void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p, void Free<platform::XPUPlace>(const platform::XPUPlace &place,
void *p,
size_t size) { size_t size) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
...@@ -234,11 +240,13 @@ class NPUBuddyAllocatorList { ...@@ -234,11 +240,13 @@ class NPUBuddyAllocatorList {
BuddyAllocator *Get(int npu_id) { BuddyAllocator *Get(int npu_id) {
auto pos = std::distance( auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id)); devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(), PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange( platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of " "The index exceeds the size of devices, the size of "
"devices is %d, the index is %d", "devices is %d, the index is %d",
devices_.size(), pos)); devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] { std::call_once(*init_flags_[pos], [this, pos] {
platform::SetNPUDeviceId(devices_[pos]); platform::SetNPUDeviceId(devices_[pos]);
...@@ -246,7 +254,8 @@ class NPUBuddyAllocatorList { ...@@ -246,7 +254,8 @@ class NPUBuddyAllocatorList {
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>( new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])), new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(), platform::NPUMinChunkSize(),
platform::NPUMaxChunkSize(), EXTRA_PADDING_SIZE)); platform::NPUMaxChunkSize(),
EXTRA_PADDING_SIZE));
VLOG(10) << "\n\nNOTE:\n" VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable " << "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' " << "'FLAGS_fraction_of_gpu_memory_to_use' "
...@@ -312,8 +321,10 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) { ...@@ -312,8 +321,10 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
PADDLE_THROW(platform::errors::ResourceExhausted( PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize " "Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize "
"%s, NpuMaxChunkSize %s, NPU memory used: %s.", "%s, NpuMaxChunkSize %s, NPU memory used: %s.",
string::HumanReadableSize(size), place.device, string::HumanReadableSize(size),
string::HumanReadableSize(avail), string::HumanReadableSize(total), place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::NPUPlace>(place)))); string::HumanReadableSize(Used<platform::NPUPlace>(place))));
...@@ -331,7 +342,8 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) { ...@@ -331,7 +342,8 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
} }
template <> template <>
void Free<platform::NPUPlace>(const platform::NPUPlace &place, void *p, void Free<platform::NPUPlace>(const platform::NPUPlace &place,
void *p,
size_t size) { size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
...@@ -384,7 +396,8 @@ void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place, ...@@ -384,7 +396,8 @@ void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
template <> template <>
void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place, void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
void *p, size_t size) { void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
GetNPUPinnedBuddyAllocator()->Free(p); GetNPUPinnedBuddyAllocator()->Free(p);
#else #else
...@@ -430,18 +443,21 @@ class GPUBuddyAllocatorList { ...@@ -430,18 +443,21 @@ class GPUBuddyAllocatorList {
BuddyAllocator *Get(int gpu_id) { BuddyAllocator *Get(int gpu_id) {
auto pos = std::distance( auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), gpu_id)); devices_.begin(), std::find(devices_.begin(), devices_.end(), gpu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(), PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange( platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of " "The index exceeds the size of devices, the size of "
"devices is %d, the index is %d", "devices is %d, the index is %d",
devices_.size(), pos)); devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] { std::call_once(*init_flags_[pos], [this, pos] {
platform::SetDeviceId(devices_[pos]); platform::SetDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator( allocators_[pos].reset(
std::unique_ptr<detail::SystemAllocator>( new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(devices_[pos])), new detail::GPUAllocator(devices_[pos])),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize())); platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n" VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable " << "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' " << "'FLAGS_fraction_of_gpu_memory_to_use' "
...@@ -493,8 +509,10 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place, ...@@ -493,8 +509,10 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
PADDLE_THROW(platform::errors::ResourceExhausted( PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize " "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize "
"%s, GpuMaxChunkSize %s, GPU memory used: %s.", "%s, GpuMaxChunkSize %s, GPU memory used: %s.",
string::HumanReadableSize(size), place.device, string::HumanReadableSize(size),
string::HumanReadableSize(avail), string::HumanReadableSize(total), place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::CUDAPlace>(place)))); string::HumanReadableSize(Used<platform::CUDAPlace>(place))));
...@@ -515,7 +533,8 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place, ...@@ -515,7 +533,8 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
} }
template <> template <>
void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p, void Free<platform::CUDAPlace>(const platform::CUDAPlace &place,
void *p,
size_t size) { size_t size) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
GetGPUBuddyAllocator(place.device)->Free(p); GetGPUBuddyAllocator(place.device)->Free(p);
...@@ -584,7 +603,8 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place, ...@@ -584,7 +603,8 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
template <> template <>
void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place, void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
void *p, size_t size) { void *p,
size_t size) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
GetCUDAPinnedBuddyAllocator()->Free(p); GetCUDAPinnedBuddyAllocator()->Free(p);
#else #else
...@@ -630,18 +650,21 @@ class MLUBuddyAllocatorList { ...@@ -630,18 +650,21 @@ class MLUBuddyAllocatorList {
BuddyAllocator *Get(int mlu_id) { BuddyAllocator *Get(int mlu_id) {
auto pos = std::distance( auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id)); devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(), PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange( platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of " "The index exceeds the size of devices, the size of "
"devices is %d, the index is %d", "devices is %d, the index is %d",
devices_.size(), pos)); devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] { std::call_once(*init_flags_[pos], [this, pos] {
platform::SetMLUDeviceId(devices_[pos]); platform::SetMLUDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator( allocators_[pos].reset(
std::unique_ptr<detail::SystemAllocator>( new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::MLUAllocator(devices_[pos])), new detail::MLUAllocator(devices_[pos])),
platform::MLUMinChunkSize(), platform::MLUMaxChunkSize())); platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n" VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable " << "You can set GFlags environment variable "
<< "(mlu reuse gpu GFlags) " << "(mlu reuse gpu GFlags) "
...@@ -693,8 +716,10 @@ void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) { ...@@ -693,8 +716,10 @@ void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
PADDLE_THROW(platform::errors::ResourceExhausted( PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize " "Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize "
"%s, MLUMinChunkSize %s, MLU memory used: %s.", "%s, MLUMinChunkSize %s, MLU memory used: %s.",
string::HumanReadableSize(size), place.device, string::HumanReadableSize(size),
string::HumanReadableSize(avail), string::HumanReadableSize(total), place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::MLUPlace>(place)))); string::HumanReadableSize(Used<platform::MLUPlace>(place))));
...@@ -711,7 +736,8 @@ void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) { ...@@ -711,7 +736,8 @@ void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
} }
template <> template <>
void Free<platform::MLUPlace>(const platform::MLUPlace &place, void *p, void Free<platform::MLUPlace>(const platform::MLUPlace &place,
void *p,
size_t size) { size_t size) {
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
...@@ -759,10 +785,12 @@ class BuddyAllocatorList { ...@@ -759,10 +785,12 @@ class BuddyAllocatorList {
} }
BuddyAllocator *Get(int dev_id) { BuddyAllocator *Get(int dev_id) {
PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(), PADDLE_ENFORCE_NE(init_flags_.find(dev_id),
init_flags_.end(),
platform::errors::OutOfRange( platform::errors::OutOfRange(
"Cannot find %s %d, please check visible devices.", "Cannot find %s %d, please check visible devices.",
device_type_, dev_id)); device_type_,
dev_id));
std::call_once(*init_flags_[dev_id], [this, dev_id] { std::call_once(*init_flags_[dev_id], [this, dev_id] {
phi::DeviceManager::SetDevice(device_type_, dev_id); phi::DeviceManager::SetDevice(device_type_, dev_id);
...@@ -773,7 +801,8 @@ class BuddyAllocatorList { ...@@ -773,7 +801,8 @@ class BuddyAllocatorList {
new detail::CustomAllocator(device_type_, dev_id)), new detail::CustomAllocator(device_type_, dev_id)),
phi::DeviceManager::GetMinChunkSize(place), phi::DeviceManager::GetMinChunkSize(place),
phi::DeviceManager::GetMaxChunkSize(place), phi::DeviceManager::GetMaxChunkSize(place),
phi::DeviceManager::GetExtraPaddingSize(place), device_type_)); phi::DeviceManager::GetExtraPaddingSize(place),
device_type_));
}); });
return allocators_[dev_id].get(); return allocators_[dev_id].get();
...@@ -813,8 +842,11 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place, ...@@ -813,8 +842,11 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
PADDLE_THROW(platform::errors::ResourceExhausted( PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in %s:%d, avaliable %s, total %s, used " "Cannot allocate %s in %s:%d, avaliable %s, total %s, used "
"%s. ", "%s. ",
string::HumanReadableSize(size), place.GetDeviceType(), place.device, string::HumanReadableSize(size),
string::HumanReadableSize(avail), string::HumanReadableSize(total), place.GetDeviceType(),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(total - avail))); string::HumanReadableSize(total - avail)));
} else { } else {
if (FLAGS_init_allocated_mem) { if (FLAGS_init_allocated_mem) {
...@@ -830,7 +862,8 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place, ...@@ -830,7 +862,8 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
} }
template <> template <>
void Free<platform::CustomPlace>(const platform::CustomPlace &place, void *p, void Free<platform::CustomPlace>(const platform::CustomPlace &place,
void *p,
size_t size) { size_t size) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
...@@ -922,8 +955,6 @@ namespace allocation { ...@@ -922,8 +955,6 @@ namespace allocation {
phi::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { phi::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size)); void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size));
auto *tmp_alloc = new Allocation(ptr, size, place_); auto *tmp_alloc = new Allocation(ptr, size, place_);
platform::MemEvenRecorder::Instance().PushMemRecord(
static_cast<void *>(tmp_alloc), place_, size);
return tmp_alloc; return tmp_alloc;
} }
...@@ -931,8 +962,6 @@ void NaiveBestFitAllocator::FreeImpl(phi::Allocation *allocation) { ...@@ -931,8 +962,6 @@ void NaiveBestFitAllocator::FreeImpl(phi::Allocation *allocation) {
paddle::platform::VisitPlace( paddle::platform::VisitPlace(
allocation->place(), allocation->place(),
legacy::FreeVisitor(allocation->ptr(), allocation->size())); legacy::FreeVisitor(allocation->ptr(), allocation->size()));
platform::MemEvenRecorder::Instance().PopMemRecord(
static_cast<void *>(allocation), place_);
delete allocation; delete allocation;
} }
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/memory/allocation/pinned_allocator.h" #include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/stats.h" #include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
...@@ -26,6 +27,10 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { ...@@ -26,6 +27,10 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr()));
#endif #endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size()); HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size());
platform::RecordMemEvent(allocation->ptr(),
allocation->place(),
allocation->size(),
platform::TracerMemEventType::ReservedFree);
delete allocation; delete allocation;
} }
phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
...@@ -36,6 +41,10 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { ...@@ -36,6 +41,10 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
#endif #endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
platform::RecordMemEvent(ptr,
platform::CUDAPinnedPlace(),
size,
platform::TracerMemEventType::ReservedAllocate);
return new Allocation(ptr, size, platform::CUDAPinnedPlace()); return new Allocation(ptr, size, platform::CUDAPinnedPlace());
} }
} // namespace allocation } // namespace allocation
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/stats.h" #include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
...@@ -30,14 +31,18 @@ class StatAllocator : public Allocator { ...@@ -30,14 +31,18 @@ class StatAllocator : public Allocator {
protected: protected:
void FreeImpl(phi::Allocation* allocation) override { void FreeImpl(phi::Allocation* allocation) override {
if (platform::is_cpu_place(allocation->place())) { if (platform::is_cpu_place(allocation->place()) ||
HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), platform::is_cuda_pinned_place(allocation->place())) {
-allocation->size()); HOST_MEMORY_STAT_UPDATE(
Allocated, allocation->place().GetDeviceId(), -allocation->size());
} else { } else {
DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), DEVICE_MEMORY_STAT_UPDATE(
-allocation->size()); Allocated, allocation->place().GetDeviceId(), -allocation->size());
} }
platform::RecordMemEvent(allocation->ptr(),
allocation->place(),
allocation->size(),
platform::TracerMemEventType::Free);
underlying_allocator_->Free(allocation); underlying_allocator_->Free(allocation);
} }
...@@ -48,12 +53,16 @@ class StatAllocator : public Allocator { ...@@ -48,12 +53,16 @@ class StatAllocator : public Allocator {
const platform::Place& place = allocation->place(); const platform::Place& place = allocation->place();
if (platform::is_cpu_place(place) || if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) { platform::is_cuda_pinned_place(place)) {
HOST_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(), HOST_MEMORY_STAT_UPDATE(
allocation->size()); Allocated, place.GetDeviceId(), allocation->size());
} else { } else {
DEVICE_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(), DEVICE_MEMORY_STAT_UPDATE(
allocation->size()); Allocated, place.GetDeviceId(), allocation->size());
} }
platform::RecordMemEvent(allocation->ptr(),
allocation->place(),
allocation->size(),
platform::TracerMemEventType::Allocate);
return allocation.release(); return allocation.release();
} }
......
...@@ -41,6 +41,7 @@ limitations under the License. */ ...@@ -41,6 +41,7 @@ limitations under the License. */
#endif #endif
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
DECLARE_bool(use_pinned_memory); DECLARE_bool(use_pinned_memory);
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
...@@ -64,11 +65,13 @@ void* AlignedMalloc(size_t size) { ...@@ -64,11 +65,13 @@ void* AlignedMalloc(size_t size) {
#else #else
int error = posix_memalign(&p, alignment, size); int error = posix_memalign(&p, alignment, size);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
error, 0, error,
0,
platform::errors::ResourceExhausted( platform::errors::ResourceExhausted(
"Fail to alloc memory of %ld size, error code is %d.", size, error)); "Fail to alloc memory of %ld size, error code is %d.", size, error));
#endif #endif
PADDLE_ENFORCE_NOT_NULL(p, platform::errors::ResourceExhausted( PADDLE_ENFORCE_NOT_NULL(p,
platform::errors::ResourceExhausted(
"Fail to alloc memory of %ld size.", size)); "Fail to alloc memory of %ld size.", size));
return p; return p;
} }
...@@ -95,7 +98,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) { ...@@ -95,7 +98,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
} }
HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate);
return p; return p;
} }
...@@ -114,6 +118,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { ...@@ -114,6 +118,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
#endif #endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree);
} }
bool CPUAllocator::UseGpu() const { return false; } bool CPUAllocator::UseGpu() const { return false; }
...@@ -146,7 +152,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { ...@@ -146,7 +152,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n" "maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size); limit_size,
limit_size);
} }
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
...@@ -161,21 +168,29 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { ...@@ -161,21 +168,29 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
"please set it to a higher value but less than 1.0.\n" "please set it to a higher value but less than 1.0.\n"
" The command is " " The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
gpu_id_, string::HumanReadableSize(size), gpu_id_, gpu_id_,
string::HumanReadableSize(allocated), string::HumanReadableSize(avail), string::HumanReadableSize(size),
gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); gpu_id_,
string::HumanReadableSize(allocated),
string::HumanReadableSize(avail),
gpu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
} }
} }
void GPUAllocator::Free(void* p, size_t size, size_t index) { void GPUAllocator::Free(void* p, size_t size, size_t index) {
PADDLE_ENFORCE_EQ(index, 0, PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The index should be 0, index is %d", index)); "The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(gpu_alloc_size_, size, PADDLE_ENFORCE_GE(gpu_alloc_size_,
size,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of " "The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)", "allocated gpu memory (%d)",
size, gpu_alloc_size_)); size,
gpu_alloc_size_));
gpu_alloc_size_ -= size; gpu_alloc_size_ -= size;
platform::RecordedGpuFree(p, size, gpu_id_); platform::RecordedGpuFree(p, size, gpu_id_);
...@@ -213,6 +228,8 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { ...@@ -213,6 +228,8 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
*index = 1; // PINNED memory *index = 1; // PINNED memory
cuda_pinnd_alloc_size_ += size; cuda_pinnd_alloc_size_ += size;
HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate);
return p; return p;
} else { } else {
LOG(WARNING) << "cudaHostAlloc failed."; LOG(WARNING) << "cudaHostAlloc failed.";
...@@ -224,21 +241,25 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { ...@@ -224,21 +241,25 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
gpuError_t err; gpuError_t err;
PADDLE_ENFORCE_EQ(index, 1, PADDLE_ENFORCE_EQ(index,
1,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The index should be 1, but got %d", index)); "The index should be 1, but got %d", index));
PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size, PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_,
size,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of " "The size of memory (%d) to free exceeds the size of "
"allocated cuda pinned memory (%d)", "allocated cuda pinned memory (%d)",
size, cuda_pinnd_alloc_size_)); size,
cuda_pinnd_alloc_size_));
cuda_pinnd_alloc_size_ -= size; cuda_pinnd_alloc_size_ -= size;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
err = hipHostFree(p); err = hipHostFree(p);
if (err != hipErrorDeinitialized) { if (err != hipErrorDeinitialized) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
err, hipSuccess, err,
hipSuccess,
platform::errors::Fatal( platform::errors::Fatal(
"hipFreeHost failed in GPUPinnedAllocator, error code is %d", err)); "hipFreeHost failed in GPUPinnedAllocator, error code is %d", err));
} }
...@@ -252,13 +273,16 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { ...@@ -252,13 +273,16 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
// cudaFreeHost succeeds. // cudaFreeHost succeeds.
if (err != cudaErrorCudartUnloading) { if (err != cudaErrorCudartUnloading) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
err, 0, err,
0,
platform::errors::Fatal( platform::errors::Fatal(
"cudaFreeHost failed in GPUPinnedAllocator, error code is %d", "cudaFreeHost failed in GPUPinnedAllocator, error code is %d",
err)); err));
} }
#endif #endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree);
} }
bool CUDAPinnedAllocator::UseGpu() const { return false; } bool CUDAPinnedAllocator::UseGpu() const { return false; }
...@@ -289,7 +313,8 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) { ...@@ -289,7 +313,8 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) {
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n" "maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size); limit_size,
limit_size);
} }
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
...@@ -304,22 +329,29 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) { ...@@ -304,22 +329,29 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) {
"please set it to a higher value but less than 1.0.\n" "please set it to a higher value but less than 1.0.\n"
" The command is " " The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
npu_id_, string::HumanReadableSize(size), npu_id_, npu_id_,
string::HumanReadableSize(avail), npu_id_, string::HumanReadableSize(size),
FLAGS_fraction_of_gpu_memory_to_use, err_msg)); npu_id_,
string::HumanReadableSize(avail),
npu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
} }
} }
void NPUAllocator::Free(void* p, size_t size, size_t index) { void NPUAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "Free " << p << " size " << size; VLOG(4) << "Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0, PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The index should be 0, index is %d", index)); "The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(npu_alloc_size_, size, PADDLE_ENFORCE_GE(npu_alloc_size_,
size,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of " "The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)", "allocated gpu memory (%d)",
size, npu_alloc_size_)); size,
npu_alloc_size_));
npu_alloc_size_ -= size; npu_alloc_size_ -= size;
platform::RecordedNPUFree(p, size, npu_id_); platform::RecordedNPUFree(p, size, npu_id_);
...@@ -358,21 +390,25 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) { ...@@ -358,21 +390,25 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) { void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
aclError err; aclError err;
PADDLE_ENFORCE_EQ(index, 1, PADDLE_ENFORCE_EQ(index,
1,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The index should be 1, but got %d", index)); "The index should be 1, but got %d", index));
PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size, PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_,
size,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of " "The size of memory (%d) to free exceeds the size of "
"allocated npu pinned memory (%d)", "allocated npu pinned memory (%d)",
size, npu_pinnd_alloc_size_)); size,
npu_pinnd_alloc_size_));
npu_pinnd_alloc_size_ -= size; npu_pinnd_alloc_size_ -= size;
err = platform::NPUHostFree(p); err = platform::NPUHostFree(p);
if (err != ACL_ERROR_NONE) { if (err != ACL_ERROR_NONE) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
err, 0, err,
0,
platform::errors::Fatal( platform::errors::Fatal(
"NPUHostFree failed in NPUPinnedAllocator, error code is %d", err)); "NPUHostFree failed in NPUPinnedAllocator, error code is %d", err));
} }
...@@ -407,7 +443,8 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) { ...@@ -407,7 +443,8 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) {
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum MLU memory usage is limited to %d MB.\n" "maximum MLU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size); limit_size,
limit_size);
} }
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
...@@ -422,21 +459,29 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) { ...@@ -422,21 +459,29 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) {
"please set it to a higher value but less than 1.0.\n" "please set it to a higher value but less than 1.0.\n"
" The command is " " The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
mlu_id_, string::HumanReadableSize(size), mlu_id_, mlu_id_,
string::HumanReadableSize(allocated), string::HumanReadableSize(avail), string::HumanReadableSize(size),
mlu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); mlu_id_,
string::HumanReadableSize(allocated),
string::HumanReadableSize(avail),
mlu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
} }
} }
void MLUAllocator::Free(void* p, size_t size, size_t index) { void MLUAllocator::Free(void* p, size_t size, size_t index) {
PADDLE_ENFORCE_EQ(index, 0, PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The index should be 0, index is %d", index)); "The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(mlu_alloc_size_, size, PADDLE_ENFORCE_GE(mlu_alloc_size_,
size,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of " "The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)", "allocated gpu memory (%d)",
size, mlu_alloc_size_)); size,
mlu_alloc_size_));
mlu_alloc_size_ -= size; mlu_alloc_size_ -= size;
platform::RecordedMLUFree(p, size, mlu_id_); platform::RecordedMLUFree(p, size, mlu_id_);
...@@ -465,7 +510,9 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { ...@@ -465,7 +510,9 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {
"\n\nOut of memory error on %s %d. " "\n\nOut of memory error on %s %d. "
"total memory is %s, used memory is %s, " "total memory is %s, used memory is %s, "
"available memory is only %s.\n\n", "available memory is only %s.\n\n",
dev_type_, dev_id_, string::HumanReadableSize(total), dev_type_,
dev_id_,
string::HumanReadableSize(total),
string::HumanReadableSize(total - avail), string::HumanReadableSize(total - avail),
string::HumanReadableSize(avail))); string::HumanReadableSize(avail)));
} }
...@@ -474,14 +521,17 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { ...@@ -474,14 +521,17 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {
void CustomAllocator::Free(void* p, size_t size, size_t index) { void CustomAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "CustomAllocator::Free " << p << " size " << size; VLOG(4) << "CustomAllocator::Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0, PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The index should be 0, index is %d", index)); "The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(plug_alloc_size, size, PADDLE_ENFORCE_GE(plug_alloc_size,
size,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of " "The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)", "allocated gpu memory (%d)",
size, plug_alloc_size)); size,
plug_alloc_size));
plug_alloc_size -= size; plug_alloc_size -= size;
auto place = platform::CustomPlace(dev_type_, dev_id_); auto place = platform::CustomPlace(dev_type_, dev_id_);
auto device = phi::DeviceManager::GetDeviceWithPlace(place); auto device = phi::DeviceManager::GetDeviceWithPlace(place);
......
此差异已折叠。
...@@ -354,7 +354,9 @@ if(WITH_GPU) ...@@ -354,7 +354,9 @@ if(WITH_GPU)
enforce enforce
dynload_cuda dynload_cuda
new_profiler new_profiler
stats) stats
op_proto_maker
shape_inference)
nv_library( nv_library(
device_memory_aligment device_memory_aligment
SRCS device_memory_aligment.cc SRCS device_memory_aligment.cc
...@@ -363,7 +365,14 @@ elseif(WITH_ROCM) ...@@ -363,7 +365,14 @@ elseif(WITH_ROCM)
hip_library( hip_library(
profiler profiler
SRCS profiler.cc profiler.cu SRCS profiler.cc profiler.cu
DEPS os_info device_tracer gpu_info enforce new_profiler stats) DEPS os_info
device_tracer
gpu_info
enforce
new_profiler
stats
op_proto_maker
shape_inference)
hip_library( hip_library(
device_memory_aligment device_memory_aligment
SRCS device_memory_aligment.cc SRCS device_memory_aligment.cc
...@@ -372,7 +381,13 @@ else() ...@@ -372,7 +381,13 @@ else()
cc_library( cc_library(
profiler profiler
SRCS profiler.cc SRCS profiler.cc
DEPS os_info device_tracer enforce new_profiler stats) DEPS os_info
device_tracer
enforce
new_profiler
stats
op_proto_maker
shape_inference)
cc_library( cc_library(
device_memory_aligment device_memory_aligment
SRCS device_memory_aligment.cc SRCS device_memory_aligment.cc
......
...@@ -29,6 +29,7 @@ limitations under the License. */ ...@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
#include "paddle/fluid/string/split.h" #include "paddle/fluid/string/split.h"
#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_info.h"
...@@ -51,10 +52,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); ...@@ -51,10 +52,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math); DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_uint64(gpu_memory_limit_mb); DECLARE_uint64(gpu_memory_limit_mb);
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false, PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log,
false,
"Whether to print the message of gpu memory usage " "Whether to print the message of gpu memory usage "
"at exit, mainly used for UT and CI."); "at exit, mainly used for UT and CI.");
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, true, PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb,
true,
"Whether to print the message of gpu memory usage " "Whether to print the message of gpu memory usage "
"MB as a unit of measurement."); "MB as a unit of measurement.");
...@@ -66,7 +69,10 @@ namespace platform { ...@@ -66,7 +69,10 @@ namespace platform {
void GpuMemoryUsage(size_t *available, size_t *total) { void GpuMemoryUsage(size_t *available, size_t *total) {
size_t actual_available, actual_total; size_t actual_available, actual_total;
RecordedGpuMemGetInfo(available, total, &actual_available, &actual_total, RecordedGpuMemGetInfo(available,
total,
&actual_available,
&actual_total,
platform::GetCurrentDeviceId()); platform::GetCurrentDeviceId());
} }
...@@ -94,7 +100,8 @@ size_t GpuMaxAllocSize() { ...@@ -94,7 +100,8 @@ size_t GpuMaxAllocSize() {
static size_t GpuAllocSize(bool realloc) { static size_t GpuAllocSize(bool realloc) {
size_t available_to_alloc = GpuAvailableMemToAlloc(); size_t available_to_alloc = GpuAvailableMemToAlloc();
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
available_to_alloc, 0, available_to_alloc,
0,
platform::errors::ResourceExhausted("Not enough available GPU memory.")); platform::errors::ResourceExhausted("Not enough available GPU memory."));
// If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
// allocated by fraction // allocated by fraction
...@@ -105,7 +112,8 @@ static size_t GpuAllocSize(bool realloc) { ...@@ -105,7 +112,8 @@ static size_t GpuAllocSize(bool realloc) {
? flag_mb << 20 ? flag_mb << 20
: available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use); : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_GE(
available_to_alloc, alloc_bytes, available_to_alloc,
alloc_bytes,
platform::errors::ResourceExhausted("Not enough available GPU memory.")); platform::errors::ResourceExhausted("Not enough available GPU memory."));
VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
<< " MiB, is it Re-alloc: " << realloc; << " MiB, is it Re-alloc: " << realloc;
...@@ -192,13 +200,16 @@ class RecordedGpuMallocHelper { ...@@ -192,13 +200,16 @@ class RecordedGpuMallocHelper {
}); });
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_GE(
dev_id, 0, dev_id,
0,
platform::errors::OutOfRange( platform::errors::OutOfRange(
"Device id must be not less than 0, but got %d.", dev_id)); "Device id must be not less than 0, but got %d.", dev_id));
PADDLE_ENFORCE_LT( PADDLE_ENFORCE_LT(
dev_id, instances_.size(), dev_id,
instances_.size(),
platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.", platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.",
dev_id, instances_.size())); dev_id,
instances_.size()));
return instances_[dev_id].get(); return instances_[dev_id].get();
} }
...@@ -207,7 +218,8 @@ class RecordedGpuMallocHelper { ...@@ -207,7 +218,8 @@ class RecordedGpuMallocHelper {
* or cudaSuccess would be returned, and the cudaGetLastError() flag * or cudaSuccess would be returned, and the cudaGetLastError() flag
* would be clear. * would be clear.
*/ */
gpuError_t Malloc(void **ptr, size_t size, gpuError_t Malloc(void **ptr,
size_t size,
bool malloc_managed_memory = false) { bool malloc_managed_memory = false) {
LockGuardPtr<std::mutex> lock(mtx_); LockGuardPtr<std::mutex> lock(mtx_);
if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) {
...@@ -236,7 +248,10 @@ class RecordedGpuMallocHelper { ...@@ -236,7 +248,10 @@ class RecordedGpuMallocHelper {
cur_size_.fetch_add(size); cur_size_.fetch_add(size);
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size); DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size);
platform::RecordMemEvent(ptr,
GPUPlace(dev_id_),
size,
platform::TracerMemEventType::ReservedAllocate);
#ifdef PADDLE_WITH_TESTING #ifdef PADDLE_WITH_TESTING
gpu_ptrs.insert(*ptr); gpu_ptrs.insert(*ptr);
#endif #endif
...@@ -275,6 +290,10 @@ class RecordedGpuMallocHelper { ...@@ -275,6 +290,10 @@ class RecordedGpuMallocHelper {
cur_size_.fetch_sub(size); cur_size_.fetch_sub(size);
STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size);
platform::RecordMemEvent(ptr,
GPUPlace(dev_id_),
size,
platform::TracerMemEventType::ReservedFree);
} else { } else {
platform::GpuGetLastError(); // clear the error flag when platform::GpuGetLastError(); // clear the error flag when
// cudaErrorCudartUnloading / // cudaErrorCudartUnloading /
...@@ -300,7 +319,9 @@ class RecordedGpuMallocHelper { ...@@ -300,7 +319,9 @@ class RecordedGpuMallocHelper {
#endif #endif
} }
bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, bool GetMemInfo(size_t *avail,
size_t *total,
size_t *actual_avail,
size_t *actual_total) { size_t *actual_total) {
{ {
CUDADeviceGuard guard(dev_id_); CUDADeviceGuard guard(dev_id_);
...@@ -335,7 +356,8 @@ class RecordedGpuMallocHelper { ...@@ -335,7 +356,8 @@ class RecordedGpuMallocHelper {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10020 #if CUDA_VERSION >= 10020
CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size, CUresult MemCreate(CUmemGenericAllocationHandle *handle,
size_t size,
const CUmemAllocationProp *prop, const CUmemAllocationProp *prop,
unsigned long long flags) { // NOLINT unsigned long long flags) { // NOLINT
auto result = auto result =
...@@ -371,7 +393,9 @@ class RecordedGpuMallocHelper { ...@@ -371,7 +393,9 @@ class RecordedGpuMallocHelper {
std::once_flag RecordedGpuMallocHelper::once_flag_; std::once_flag RecordedGpuMallocHelper::once_flag_;
gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id, gpuError_t RecordedGpuMalloc(void **ptr,
size_t size,
int dev_id,
bool malloc_managed_memory) { bool malloc_managed_memory) {
return RecordedGpuMallocHelper::Instance(dev_id)->Malloc( return RecordedGpuMallocHelper::Instance(dev_id)->Malloc(
ptr, size, malloc_managed_memory); ptr, size, malloc_managed_memory);
...@@ -383,22 +407,28 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) { ...@@ -383,22 +407,28 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10020 #if CUDA_VERSION >= 10020
CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
size_t size,
const CUmemAllocationProp *prop, const CUmemAllocationProp *prop,
unsigned long long flags, int dev_id) { // NOLINT unsigned long long flags,
return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(handle, size, int dev_id) { // NOLINT
prop, flags); return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(
handle, size, prop, flags);
} }
CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size, CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle,
size_t size,
int dev_id) { int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size); return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size);
} }
#endif #endif
#endif #endif
bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, bool RecordedGpuMemGetInfo(size_t *avail,
size_t *actual_total, int dev_id) { size_t *total,
size_t *actual_avail,
size_t *actual_total,
int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->GetMemInfo( return RecordedGpuMallocHelper::Instance(dev_id)->GetMemInfo(
avail, total, actual_avail, actual_total); avail, total, actual_avail, actual_total);
} }
...@@ -493,26 +523,35 @@ void GpuDestroyStream(gpuStream_t stream) { ...@@ -493,26 +523,35 @@ void GpuDestroyStream(gpuStream_t stream) {
void GpuDeviceSync() { phi::backends::gpu::GpuDeviceSync(); } void GpuDeviceSync() { phi::backends::gpu::GpuDeviceSync(); }
void GpuMemcpyAsync(void *dst, const void *src, size_t count, void GpuMemcpyAsync(void *dst,
gpuMemcpyKind kind, gpuStream_t stream) { const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream) {
phi::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream); phi::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream);
} }
void GpuMemcpySync(void *dst, const void *src, size_t count, void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind) { gpuMemcpyKind kind) {
phi::backends::gpu::GpuMemcpySync(dst, src, count, kind); phi::backends::gpu::GpuMemcpySync(dst, src, count, kind);
} }
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, void GpuMemcpyPeerAsync(void *dst,
int src_device, size_t count, gpuStream_t stream) { int dst_device,
phi::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device, const void *src,
count, stream); int src_device,
size_t count,
gpuStream_t stream) {
phi::backends::gpu::GpuMemcpyPeerAsync(
dst, dst_device, src, src_device, count, stream);
} }
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, void GpuMemcpyPeerSync(
int src_device, size_t count) { void *dst, int dst_device, const void *src, int src_device, size_t count) {
phi::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device, phi::backends::gpu::GpuMemcpyPeerSync(
count); dst, dst_device, src, src_device, count);
} }
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
......
...@@ -30,12 +30,16 @@ limitations under the License. */ ...@@ -30,12 +30,16 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nvtx.h" #include "paddle/fluid/platform/dynload/nvtx.h"
#endif #endif
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/os_info.h"
PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false, PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler,
false,
"Enable rpc profiler or not."); "Enable rpc profiler or not.");
DEFINE_bool(enable_host_event_recorder_hook, false, DEFINE_bool(enable_host_event_recorder_hook,
false,
"enable HostEventRecorder, hook Profiler"); "enable HostEventRecorder, hook Profiler");
namespace paddle { namespace paddle {
...@@ -43,8 +47,11 @@ namespace platform { ...@@ -43,8 +47,11 @@ namespace platform {
MemEvenRecorder MemEvenRecorder::recorder; MemEvenRecorder MemEvenRecorder::recorder;
Event::Event(EventType type, std::string name, uint32_t thread_id, Event::Event(EventType type,
EventRole role, std::string attr) std::string name,
uint32_t thread_id,
EventRole role,
std::string attr)
: type_(type), : type_(type),
name_(name), name_(name),
thread_id_(thread_id), thread_id_(thread_id),
...@@ -68,8 +75,10 @@ double Event::CudaElapsedMs(const Event &e) const { ...@@ -68,8 +75,10 @@ double Event::CudaElapsedMs(const Event &e) const {
#endif #endif
} }
RecordEvent::RecordEvent(const char *name, const TracerEventType type, RecordEvent::RecordEvent(const char *name,
uint32_t level, const EventRole role) { const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32 #ifndef _WIN32
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook) { if (g_enable_nvprof_hook) {
...@@ -100,8 +109,10 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type, ...@@ -100,8 +109,10 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type,
start_ns_ = PosixInNsec(); start_ns_ = PosixInNsec();
} }
RecordEvent::RecordEvent(const std::string &name, const TracerEventType type, RecordEvent::RecordEvent(const std::string &name,
uint32_t level, const EventRole role) { const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32 #ifndef _WIN32
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook) { if (g_enable_nvprof_hook) {
...@@ -130,8 +141,10 @@ RecordEvent::RecordEvent(const std::string &name, const TracerEventType type, ...@@ -130,8 +141,10 @@ RecordEvent::RecordEvent(const std::string &name, const TracerEventType type,
start_ns_ = PosixInNsec(); start_ns_ = PosixInNsec();
} }
RecordEvent::RecordEvent(const std::string &name, const std::string &attr, RecordEvent::RecordEvent(const std::string &name,
const TracerEventType type, uint32_t level, const std::string &attr,
const TracerEventType type,
uint32_t level,
const EventRole role) { const EventRole role) {
#ifndef _WIN32 #ifndef _WIN32
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -215,8 +228,8 @@ void RecordEvent::End() { ...@@ -215,8 +228,8 @@ void RecordEvent::End() {
DeviceTracer *tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
uint64_t end_ns = PosixInNsec(); uint64_t end_ns = PosixInNsec();
tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(), tracer->AddCPURecords(
g_thread_id); CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id);
} }
ClearCurAnnotation(); ClearCurAnnotation();
PopEvent(*name_, role_); PopEvent(*name_, role_);
...@@ -226,7 +239,8 @@ void RecordEvent::End() { ...@@ -226,7 +239,8 @@ void RecordEvent::End() {
is_enabled_ = false; is_enabled_ = false;
} }
RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type, RecordInstantEvent::RecordInstantEvent(const char *name,
TracerEventType type,
uint32_t level) { uint32_t level) {
if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) { if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) {
return; return;
...@@ -236,21 +250,242 @@ RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type, ...@@ -236,21 +250,242 @@ RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type,
name, start_end_ns, start_end_ns, EventRole::kOrdinary, type); name, start_end_ns, start_end_ns, EventRole::kOrdinary, type);
} }
void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, RecordOpInfoSupplement::RecordOpInfoSupplement(
const std::string &type,
const framework::AttributeMap &attrs,
const framework::InferShapeContext &shape_ctx,
const framework::RuntimeContext &ctx) {
if (FLAGS_enable_host_event_recorder_hook == false) {
return;
}
std::map<std::string, std::vector<framework::DDim>> input_shapes;
std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
for (auto it = ctx.inputs.begin(); it != ctx.inputs.end(); it++) {
input_shapes[it->first] = shape_ctx.GetInputsDim(it->first);
dtypes[it->first] = shape_ctx.GetInputsVarType(it->first);
}
const std::vector<std::string> *callstack_ptr = nullptr;
std::vector<std::string> callstack;
auto iter = attrs.find(
framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (iter != attrs.end()) {
callstack_ptr = &BOOST_GET_CONST(std::vector<std::string>, iter->second);
callstack = *callstack_ptr;
}
HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance().RecordEvent(
PosixInNsec(), type, input_shapes, dtypes, callstack);
}
// Records a single memory event for the profiler.
//
// ptr   - address the event refers to.
// place - place the memory belongs to; CPU and CUDA-pinned places are sampled
//         from the host memory statistics, every other place from the device
//         statistics.
// size  - number of bytes involved in the event.
// type  - kind of event. Allocate / Free sample the "Allocated" statistic,
//         ReservedAllocate / ReservedFree sample the "Reserved" statistic.
//         Any other value is ignored.
//
// Allocate-like events are forwarded to MemEvenRecorder::PushMemRecord and
// free-like events to MemEvenRecorder::PopMemRecord.
RecordMemEvent::RecordMemEvent(const void *ptr,
                               const phi::Place &place,
                               size_t size,
                               const TracerMemEventType type) {
  // Nothing to do unless either the legacy profiler or the host event
  // recorder hook is active.
  if (g_state == ProfilerState::kDisabled &&
      FLAGS_enable_host_event_recorder_hook == false) {
    return;
  }

  const bool is_allocated_event = type == TracerMemEventType::Allocate ||
                                  type == TracerMemEventType::Free;
  const bool is_reserved_event =
      type == TracerMemEventType::ReservedAllocate ||
      type == TracerMemEventType::ReservedFree;
  if (!is_allocated_event && !is_reserved_event) {
    return;  // event types outside these four are not recorded here
  }

  const bool on_host = platform::is_cpu_place(place) ||
                       platform::is_cuda_pinned_place(place);

  // Only the statistic matching the event kind is sampled; for the other
  // pair, 0 means "keep the same as before".
  uint64_t current_allocated = 0;
  uint64_t peak_allocated = 0;
  uint64_t current_reserved = 0;
  uint64_t peak_reserved = 0;

  if (is_allocated_event) {
    if (on_host) {
      current_allocated =
          HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
      peak_allocated =
          HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
    } else {
      current_allocated =
          DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
      peak_allocated =
          DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
    }
  } else {
    if (on_host) {
      current_reserved =
          HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
      peak_reserved =
          HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
    } else {
      current_reserved =
          DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
      peak_reserved =
          DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
    }
  }

  const bool is_acquire = type == TracerMemEventType::Allocate ||
                          type == TracerMemEventType::ReservedAllocate;
  if (is_acquire) {
    platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
                                                        place,
                                                        size,
                                                        type,
                                                        current_allocated,
                                                        current_reserved,
                                                        peak_allocated,
                                                        peak_reserved);
  } else {
    platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
                                                       place,
                                                       size,
                                                       type,
                                                       current_allocated,
                                                       current_reserved,
                                                       peak_allocated,
                                                       peak_reserved);
  }
}
void MemEvenRecorder::PushMemRecord(const void *ptr,
const Place &place,
size_t size) { size_t size) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) {
return;
}
std::lock_guard<std::mutex> guard(mtx_); std::lock_guard<std::mutex> guard(mtx_);
auto &events = address_memevent_[place]; auto &events = address_memevent_[place];
PADDLE_ENFORCE_EQ(events.count(ptr), 0, PADDLE_ENFORCE_EQ(events.count(ptr),
0,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The Place can't exist in the stage of PushMemRecord")); "The Place can't exist in the stage of PushMemRecord"));
events.emplace(ptr, std::unique_ptr<RecordMemEvent>( events.emplace(ptr,
std::unique_ptr<RecordMemEvent>(
new MemEvenRecorder::RecordMemEvent(place, size))); new MemEvenRecorder::RecordMemEvent(place, size)));
} }
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) { void MemEvenRecorder::PushMemRecord(const void *ptr,
const Place &place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved) {
std::lock_guard<std::mutex> guard(mtx_);
if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord
HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
PosixInNsec(),
reinterpret_cast<uint64_t>(ptr),
type,
size,
place,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
return;
}
if (type == TracerMemEventType::ReservedAllocate) {
// old profiler only analyse memory managed by paddle.
return;
}
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
auto &events = address_memevent_[place];
PADDLE_ENFORCE_EQ(events.count(ptr),
0,
platform::errors::InvalidArgument(
"The Place can't exist in the stage of PushMemRecord"));
events.emplace(ptr,
std::unique_ptr<RecordMemEvent>(
new MemEvenRecorder::RecordMemEvent(place, size)));
}
// Drops the record stored for `ptr` under `place`, if there is one.
// Does nothing while the profiler is disabled.
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
  if (g_state != ProfilerState::kDisabled) {
    std::lock_guard<std::mutex> lock(mtx_);
    auto &records = address_memevent_[place];
    auto pos = records.find(ptr);
    // The pointer may never have been pushed; that is not an error.
    if (pos == records.end()) {
      return;
    }
    records.erase(pos);
  }
}
void MemEvenRecorder::PopMemRecord(const void *ptr,
const Place &place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved) {
std::lock_guard<std::mutex> guard(mtx_); std::lock_guard<std::mutex> guard(mtx_);
if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord
HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
PosixInNsec(),
reinterpret_cast<uint64_t>(ptr),
type,
-size,
place,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
return;
}
if (type == TracerMemEventType::ReservedFree) {
// old profiler only analyse memory managed by paddle.
return;
}
if (g_state == ProfilerState::kDisabled) return;
auto &events = address_memevent_[place]; auto &events = address_memevent_[place];
auto iter = events.find(ptr); auto iter = events.find(ptr);
// The ptr maybe not in address_memevent // The ptr maybe not in address_memevent
...@@ -279,8 +514,13 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() { ...@@ -279,8 +514,13 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
auto annotation_free = CurAnnotationName(); auto annotation_free = CurAnnotationName();
if (tracer) { if (tracer) {
tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_, tracer->AddMemInfoRecord(start_ns_,
annotation_free, g_mem_thread_id); end_ns_,
bytes_,
place_,
alloc_in_,
annotation_free,
g_mem_thread_id);
} }
PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free); PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
} }
...@@ -307,22 +547,38 @@ RecordBlock::~RecordBlock() { ...@@ -307,22 +547,38 @@ RecordBlock::~RecordBlock() {
if (tracer) { if (tracer) {
// We try to put all blocks at the same nested depth in the // We try to put all blocks at the same nested depth in the
// same timeline lane. and distinguish the using thread_id. // same timeline lane. and distinguish the using thread_id.
tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(), tracer->AddCPURecords(
g_thread_id); name_, start_ns_, PosixInNsec(), BlockDepth(), g_thread_id);
} }
ClearCurBlock(); ClearCurBlock();
} }
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, void PushMemEvent(uint64_t start_ns,
const Place &place, const std::string &annotation) { uint64_t end_ns,
GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes, size_t bytes,
place, g_mem_thread_id, annotation); const Place &place,
} const std::string &annotation) {
GetMemEventList().Record(EventType::kPushRange,
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, start_ns,
const Place &place, const std::string &annotation) { end_ns,
GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place, bytes,
g_mem_thread_id, annotation); place,
g_mem_thread_id,
annotation);
}
void PopMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place &place,
const std::string &annotation) {
GetMemEventList().Record(EventType::kPopRange,
start_ns,
end_ns,
bytes,
place,
g_mem_thread_id,
annotation);
} }
void Mark(const std::string &name) { void Mark(const std::string &name) {
...@@ -334,17 +590,19 @@ void Mark(const std::string &name) { ...@@ -334,17 +590,19 @@ void Mark(const std::string &name) {
GetEventList().Record(EventType::kMark, name, g_thread_id); GetEventList().Record(EventType::kMark, name, g_thread_id);
} }
Event *PushEvent(const std::string &name, const EventRole role, Event *PushEvent(const std::string &name,
const EventRole role,
std::string attr) { std::string attr) {
return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role, return GetEventList().Record(
attr); EventType::kPushRange, name, g_thread_id, role, attr);
} }
void PopEvent(const std::string &name, const EventRole role, std::string attr) { void PopEvent(const std::string &name, const EventRole role, std::string attr) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr); GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr);
} }
void EnableProfiler(ProfilerState state) { void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled, PADDLE_ENFORCE_NE(state,
ProfilerState::kDisabled,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Can't enable profiling, since the input state is" "Can't enable profiling, since the input state is"
"ProfilerState::kDisabled")); "ProfilerState::kDisabled"));
...@@ -380,7 +638,8 @@ void ResetProfiler() { ...@@ -380,7 +638,8 @@ void ResetProfiler() {
(*it)->Clear(); (*it)->Clear();
} }
for (auto it = g_all_mem_event_lists.begin(); for (auto it = g_all_mem_event_lists.begin();
it != g_all_mem_event_lists.end(); ++it) { it != g_all_mem_event_lists.end();
++it) {
(*it)->Clear(); (*it)->Clear();
} }
} }
...@@ -576,8 +835,8 @@ static void EmulateEventPushAndPop( ...@@ -576,8 +835,8 @@ static void EmulateEventPushAndPop(
std::string name = std::string name =
prefix_stk.empty() ? evt.name : prefix_stk.top() + "/" + evt.name; prefix_stk.empty() ? evt.name : prefix_stk.top() + "/" + evt.name;
const char *attr = (evt.attr == nullptr ? "none" : evt.attr); const char *attr = (evt.attr == nullptr ? "none" : evt.attr);
Event *orig_evt = cur_thr_list->Record(EventType::kPushRange, name, tid, Event *orig_evt = cur_thr_list->Record(
evt.role, attr); EventType::kPushRange, name, tid, evt.role, attr);
(*out)[tid][evt.end_ns] = std::make_pair(orig_evt, evt.start_ns); (*out)[tid][evt.end_ns] = std::make_pair(orig_evt, evt.start_ns);
cur_thr_list->Record(EventType::kPopRange, name, tid, evt.role, attr); cur_thr_list->Record(EventType::kPopRange, name, tid, evt.role, attr);
} }
...@@ -593,8 +852,8 @@ static void EmulateCPURecordsAdd( ...@@ -593,8 +852,8 @@ static void EmulateCPURecordsAdd(
for (const auto &thr_sec : host_sec.thr_sections) { for (const auto &thr_sec : host_sec.thr_sections) {
uint64_t tid = thr_sec.thread_id; uint64_t tid = thr_sec.thread_id;
for (const auto &evt : thr_sec.events) { for (const auto &evt : thr_sec.events) {
tracer->AddCPURecords(evt.name, evt.start_ns, evt.end_ns, BlockDepth(), tracer->AddCPURecords(
tid); evt.name, evt.start_ns, evt.end_ns, BlockDepth(), tid);
} }
} }
} }
......
...@@ -30,6 +30,8 @@ limitations under the License. */ ...@@ -30,6 +30,8 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler.pb.h"
#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif #endif
...@@ -102,6 +104,22 @@ struct MemEvenRecorder { ...@@ -102,6 +104,22 @@ struct MemEvenRecorder {
public: public:
void PushMemRecord(const void* ptr, const Place& place, size_t size); void PushMemRecord(const void* ptr, const Place& place, size_t size);
void PopMemRecord(const void* ptr, const Place& place); void PopMemRecord(const void* ptr, const Place& place);
void PushMemRecord(const void* ptr,
const Place& place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved);
void PopMemRecord(const void* ptr,
const Place& place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved);
void Flush(); void Flush();
static MemEvenRecorder& Instance() { return recorder; } static MemEvenRecorder& Instance() { return recorder; }
...@@ -160,7 +178,8 @@ struct EventList { ...@@ -160,7 +178,8 @@ struct EventList {
std::vector<T> Reduce() { std::vector<T> Reduce() {
std::vector<T> result; std::vector<T> result;
for (auto& block : event_blocks) { for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()), result.insert(result.begin(),
std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end())); std::make_move_iterator(block.end()));
} }
event_blocks.clear(); event_blocks.clear();
...@@ -173,13 +192,21 @@ struct EventList { ...@@ -173,13 +192,21 @@ struct EventList {
}; };
void Mark(const std::string& name); void Mark(const std::string& name);
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, void PushMemEvent(uint64_t start_ns,
const Place& place, const std::string& annotation); uint64_t end_ns,
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, size_t bytes,
const Place& place, const std::string& annotation); const Place& place,
Event* PushEvent(const std::string& name, const EventRole role, const std::string& annotation);
void PopMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place& place,
const std::string& annotation);
Event* PushEvent(const std::string& name,
const EventRole role,
const std::string attr = "none"); const std::string attr = "none");
void PopEvent(const std::string& name, const EventRole role, void PopEvent(const std::string& name,
const EventRole role,
const std::string attr = "none"); const std::string attr = "none");
// Return the event list of all threads. Assumed the returned value calls // Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
......
cc_library( cc_library(
host_tracer host_tracer
SRCS host_tracer.cc SRCS host_tracer.cc
DEPS enforce) DEPS enforce ddim var_type_traits)
cc_library( cc_library(
cuda_tracer cuda_tracer
SRCS cuda_tracer.cc cupti_data_process.cc SRCS cuda_tracer.cc cupti_data_process.cc
...@@ -10,7 +10,7 @@ add_subdirectory(mlu) ...@@ -10,7 +10,7 @@ add_subdirectory(mlu)
cc_library( cc_library(
event_node event_node
SRCS event_node.cc SRCS event_node.cc
DEPS enforce) DEPS enforce place)
cc_library( cc_library(
profiler_utils profiler_utils
SRCS utils.cc SRCS utils.cc
......
...@@ -18,16 +18,21 @@ ...@@ -18,16 +18,21 @@
#include <functional> #include <functional>
#include <string> #include <string>
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later #include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later
#include "paddle/fluid/platform/profiler/trace_event.h" #include "paddle/fluid/platform/profiler/trace_event.h"
#include "paddle/phi/core/ddim.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
struct CommonEvent { struct CommonEvent {
public: public:
CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns, CommonEvent(const char *name,
EventRole role, TracerEventType type) uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type)
: name(name), : name(name),
start_ns(start_ns), start_ns(start_ns),
end_ns(end_ns), end_ns(end_ns),
...@@ -35,8 +40,12 @@ struct CommonEvent { ...@@ -35,8 +40,12 @@ struct CommonEvent {
type(type) {} type(type) {}
CommonEvent(std::function<void *(size_t)> arena_allocator, CommonEvent(std::function<void *(size_t)> arena_allocator,
const std::string &name_str, uint64_t start_ns, uint64_t end_ns, const std::string &name_str,
EventRole role, TracerEventType type, const std::string &attr_str) uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type,
const std::string &attr_str)
: start_ns(start_ns), end_ns(end_ns), role(role), type(type) { : start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1)); auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
strncpy(buf, name_str.c_str(), name_str.length() + 1); strncpy(buf, name_str.c_str(), name_str.length() + 1);
...@@ -47,8 +56,11 @@ struct CommonEvent { ...@@ -47,8 +56,11 @@ struct CommonEvent {
} }
CommonEvent(std::function<void *(size_t)> arena_allocator, CommonEvent(std::function<void *(size_t)> arena_allocator,
const std::string &name_str, uint64_t start_ns, uint64_t end_ns, const std::string &name_str,
EventRole role, TracerEventType type) uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type)
: start_ns(start_ns), end_ns(end_ns), role(role), type(type) { : start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1)); auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
strncpy(buf, name_str.c_str(), name_str.length() + 1); strncpy(buf, name_str.c_str(), name_str.length() + 1);
...@@ -63,5 +75,61 @@ struct CommonEvent { ...@@ -63,5 +75,61 @@ struct CommonEvent {
const char *attr = nullptr; // not owned, designed for performance const char *attr = nullptr; // not owned, designed for performance
}; };
// Raw host-side record of a single memory event (allocation or free) as it is
// captured at runtime. Every field is a plain copy of the matching constructor
// argument.
struct CommonMemEvent {
 public:
  /**
   * @param timestamp_ns: Time of the event, in nanoseconds.
   * @param addr: Address involved in the event, stored as an integer.
   * @param type: Kind of memory manipulation this event denotes.
   * @param increase_bytes: Signed size change caused by the event
   *        (NOTE(review): callers appear to pass a negative value on free --
   *        confirm against the recorder).
   * @param place: Device on which the event happened.
   * @param current_allocated: Allocated amount reported by the caller at the
   *        time of the event.
   * @param current_reserved: Reserved amount reported by the caller at the
   *        time of the event.
   * @param peak_allocated: Peak allocated amount reported by the caller.
   * @param peak_reserved: Peak reserved amount reported by the caller.
   */
  CommonMemEvent(uint64_t timestamp_ns,
                 uint64_t addr,
                 TracerMemEventType type,
                 int64_t increase_bytes,
                 const Place &place,
                 uint64_t current_allocated,
                 uint64_t current_reserved,
                 uint64_t peak_allocated,
                 uint64_t peak_reserved)
      : timestamp_ns(timestamp_ns),
        addr(addr),
        type(type),
        increase_bytes(increase_bytes),
        place(place),
        // These two were previously left out of the initializer list, so
        // readers of the event saw indeterminate values.
        current_allocated(current_allocated),
        current_reserved(current_reserved),
        peak_allocated(peak_allocated),
        peak_reserved(peak_reserved) {}

  uint64_t timestamp_ns;
  uint64_t addr;
  TracerMemEventType type;
  int64_t increase_bytes;
  Place place;
  uint64_t current_allocated;
  uint64_t current_reserved;
  uint64_t peak_allocated;
  uint64_t peak_reserved;
};
struct OperatorSupplementOriginEvent {
public:
OperatorSupplementOriginEvent(
std::function<void *(size_t)> arena_allocator,
uint64_t timestamp_ns,
const std::string &type_name,
const std::map<std::string, std::vector<framework::DDim>> &input_shapes,
const std::map<std::string, std::vector<framework::proto::VarType::Type>>
&dtypes,
const std::vector<std::string> callstack)
: timestamp_ns(timestamp_ns),
input_shapes(input_shapes),
dtypes(dtypes),
callstack(callstack) {
auto buf = static_cast<char *>(arena_allocator(type_name.length() + 1));
strncpy(buf, type_name.c_str(), type_name.length() + 1);
op_type = buf;
}
uint64_t timestamp_ns;
const char *op_type = nullptr; // not owned, designed for performance
// input shapes
std::map<std::string, std::vector<framework::DDim>> input_shapes;
std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
// call stack
const std::vector<std::string> callstack;
};
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -11,9 +11,10 @@ ...@@ -11,9 +11,10 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/platform/profiler/host_tracer.h" #include "paddle/fluid/platform/profiler/host_tracer.h"
#include <sstream>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/flags.h"
#include "paddle/fluid/platform/profiler/common_event.h" #include "paddle/fluid/platform/profiler/common_event.h"
...@@ -21,7 +22,8 @@ ...@@ -21,7 +22,8 @@
// Used to filter events, works like glog VLOG(level). // Used to filter events, works like glog VLOG(level).
// RecordEvent will works if host_trace_level >= level. // RecordEvent will works if host_trace_level >= level.
PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1, PADDLE_DEFINE_EXPORTED_int64(host_trace_level,
1,
"RecordEvent will works " "RecordEvent will works "
"if host_trace_level >= level."); "if host_trace_level >= level.");
...@@ -50,6 +52,79 @@ void ProcessHostEvents(const HostEventSection<CommonEvent>& host_events, ...@@ -50,6 +52,79 @@ void ProcessHostEvents(const HostEventSection<CommonEvent>& host_events,
} }
} }
void ProcessHostMemEvents(
const HostEventSection<CommonMemEvent>& host_mem_events,
TraceEventCollector* collector) {
for (const auto& thr_sec : host_mem_events.thr_sections) {
uint64_t tid = thr_sec.thread_id;
if (thr_sec.thread_name != kDefaultThreadName) {
collector->AddThreadName(tid, thr_sec.thread_name);
}
for (const auto& evt : thr_sec.events) {
MemTraceEvent event;
event.timestamp_ns = evt.timestamp_ns;
event.addr = evt.addr;
event.type = evt.type;
event.increase_bytes = evt.increase_bytes;
event.place = evt.place.DebugString();
event.current_allocated = evt.current_allocated;
event.current_reserved = evt.current_reserved;
event.peak_allocated = evt.peak_allocated;
event.peak_reserved = evt.peak_reserved;
event.process_id = host_mem_events.process_id;
event.thread_id = tid;
collector->AddMemEvent(std::move(event));
}
}
}
// Converts every recorded OperatorSupplementOriginEvent into an
// OperatorSupplementEvent and hands it to `collector`:
//   - each DDim is flattened into a vector of int64_t dimensions,
//   - each dtype enum is replaced by its protobuf name,
//   - the call stack entries are joined into one string, one entry per line.
// Threads whose name differs from kDefaultThreadName are also registered with
// the collector by name.
void ProcessOperatorSupplementEvents(
    const HostEventSection<OperatorSupplementOriginEvent>& op_supplement_events,
    TraceEventCollector* collector) {
  for (const auto& thr_sec : op_supplement_events.thr_sections) {
    uint64_t tid = thr_sec.thread_id;
    if (thr_sec.thread_name != kDefaultThreadName) {
      collector->AddThreadName(tid, thr_sec.thread_name);
    }
    for (const auto& evt : thr_sec.events) {
      OperatorSupplementEvent event;
      event.timestamp_ns = evt.timestamp_ns;
      event.op_type = evt.op_type;

      std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
      for (const auto& name_and_ddims : evt.input_shapes) {
        const auto& ddims = name_and_ddims.second;
        // Names without any shape get no entry in the output map.
        if (ddims.empty()) {
          continue;
        }
        // Look the output slot up once instead of on every push_back.
        auto& shapes = input_shapes[name_and_ddims.first];
        shapes.reserve(ddims.size());
        for (const auto& ddim : ddims) {
          std::vector<int64_t> dims;
          for (auto dim_idx = 0; dim_idx < ddim.size(); dim_idx++) {
            dims.push_back(ddim.at(dim_idx));
          }
          shapes.push_back(std::move(dims));
        }
      }

      std::map<std::string, std::vector<std::string>> dtypes;
      for (const auto& name_and_types : evt.dtypes) {
        const auto& types = name_and_types.second;
        // Names without any dtype get no entry in the output map.
        if (types.empty()) {
          continue;
        }
        auto& type_names = dtypes[name_and_types.first];
        type_names.reserve(types.size());
        for (const auto& type : types) {
          type_names.push_back(framework::proto::VarType::Type_Name(type));
        }
      }

      // One call stack entry per line; a plain '\n' avoids the pointless
      // flush that std::endl performs.
      std::string callstack;
      for (const auto& frame : evt.callstack) {
        callstack += frame;
        callstack += '\n';
      }

      // The temporaries are not used again, so move them into the event.
      event.input_shapes = std::move(input_shapes);
      event.dtypes = std::move(dtypes);
      event.callstack = std::move(callstack);
      event.process_id = op_supplement_events.process_id;
      event.thread_id = tid;
      collector->AddOperatorSupplementEvent(std::move(event));
    }
  }
}
} // namespace } // namespace
void HostTracer::PrepareTracing() { void HostTracer::PrepareTracing() {
...@@ -60,16 +135,21 @@ void HostTracer::PrepareTracing() { ...@@ -60,16 +135,21 @@ void HostTracer::PrepareTracing() {
void HostTracer::StartTracing() { void HostTracer::StartTracing() {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
state_ == TracerState::READY || state_ == TracerState::STOPED, true, state_ == TracerState::READY || state_ == TracerState::STOPED,
true,
platform::errors::PreconditionNotMet("TracerState must be READY")); platform::errors::PreconditionNotMet("TracerState must be READY"));
HostEventRecorder<CommonEvent>::GetInstance().GatherEvents(); HostEventRecorder<CommonEvent>::GetInstance().GatherEvents();
HostEventRecorder<CommonMemEvent>::GetInstance().GatherEvents();
HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance()
.GatherEvents();
HostTraceLevel::GetInstance().SetLevel(options_.trace_level); HostTraceLevel::GetInstance().SetLevel(options_.trace_level);
state_ = TracerState::STARTED; state_ = TracerState::STARTED;
} }
void HostTracer::StopTracing() { void HostTracer::StopTracing() {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
state_, TracerState::STARTED, state_,
TracerState::STARTED,
platform::errors::PreconditionNotMet("TracerState must be STARTED")); platform::errors::PreconditionNotMet("TracerState must be STARTED"));
HostTraceLevel::GetInstance().SetLevel(HostTraceLevel::kDisabled); HostTraceLevel::GetInstance().SetLevel(HostTraceLevel::kDisabled);
state_ = TracerState::STOPED; state_ = TracerState::STOPED;
...@@ -77,11 +157,19 @@ void HostTracer::StopTracing() { ...@@ -77,11 +157,19 @@ void HostTracer::StopTracing() {
void HostTracer::CollectTraceData(TraceEventCollector* collector) { void HostTracer::CollectTraceData(TraceEventCollector* collector) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
state_, TracerState::STOPED, state_,
TracerState::STOPED,
platform::errors::PreconditionNotMet("TracerState must be STOPED")); platform::errors::PreconditionNotMet("TracerState must be STOPED"));
HostEventSection<CommonEvent> host_events = HostEventSection<CommonEvent> host_events =
HostEventRecorder<CommonEvent>::GetInstance().GatherEvents(); HostEventRecorder<CommonEvent>::GetInstance().GatherEvents();
ProcessHostEvents(host_events, collector); ProcessHostEvents(host_events, collector);
HostEventSection<CommonMemEvent> host_mem_events =
HostEventRecorder<CommonMemEvent>::GetInstance().GatherEvents();
ProcessHostMemEvents(host_mem_events, collector);
HostEventSection<OperatorSupplementOriginEvent> op_supplement_events =
HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance()
.GatherEvents();
ProcessOperatorSupplementEvents(op_supplement_events, collector);
} }
} // namespace platform } // namespace platform
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle {
namespace platform {
// Memory event tracing. A trace marks a memory manipulation such as an
// allocation or a free.
// The events can be used to draw a memory variation curve.
// Only the constructor is declared here; its definition lives elsewhere.
// NOTE(review): presumably the event is recorded as a side effect of
// construction, since the class declares no other members -- confirm against
// the implementation.
class RecordMemEvent {
public:
/**
* @param ptr: Address of the memory that was allocated or freed.
* @param place: Device on which this memory event happened.
* @param size: Size of the memory that was allocated or freed.
* @param type: Kind of manipulation this memory event denotes; defaults to
*              TracerMemEventType::Allocate.
*/
explicit RecordMemEvent(
const void* ptr,
const Place& place,
size_t size,
const TracerMemEventType type = TracerMemEventType::Allocate);
};
} // namespace platform
} // namespace paddle
...@@ -23,6 +23,8 @@ ...@@ -23,6 +23,8 @@
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#endif #endif
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/profiler.h" #include "paddle/fluid/platform/profiler/profiler.h"
...@@ -41,10 +43,10 @@ TEST(ProfilerTest, TestHostTracer) { ...@@ -41,10 +43,10 @@ TEST(ProfilerTest, TestHostTracer) {
profiler->Prepare(); profiler->Prepare();
profiler->Start(); profiler->Start();
{ {
RecordInstantEvent("TestTraceLevel_record1", TracerEventType::UserDefined, RecordInstantEvent(
2); "TestTraceLevel_record1", TracerEventType::UserDefined, 2);
RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined, RecordInstantEvent(
3); "TestTraceLevel_record2", TracerEventType::UserDefined, 3);
} }
auto profiler_result = profiler->Stop(); auto profiler_result = profiler->Stop();
auto nodetree = profiler_result->GetNodeTrees(); auto nodetree = profiler_result->GetNodeTrees();
...@@ -93,3 +95,49 @@ TEST(ProfilerTest, TestCudaTracer) { ...@@ -93,3 +95,49 @@ TEST(ProfilerTest, TestCudaTracer) {
EXPECT_GT(runtime_events.size(), 0u); EXPECT_GT(runtime_events.size(), 0u);
#endif #endif
} }
// Smoke test: runs two user-defined phases, each recording one Allocate and
// one Free memory event on the CPU, then stops the profiler and requests the
// node trees. No assertions are made on the collected events.
TEST(ProfilerTest, TestHostTracerForMem) {
  using paddle::platform::CPUPlace;
  using paddle::platform::EnableHostEventRecorder;
  using paddle::platform::MemTraceEventNode;
  using paddle::platform::Profiler;
  using paddle::platform::ProfilerOptions;
  using paddle::platform::ProfilerResult;
  using paddle::platform::RecordEvent;
  using paddle::platform::RecordInstantEvent;
  using paddle::platform::RecordMemEvent;
  using paddle::platform::TracerEventType;
  using paddle::platform::TracerMemEventType;

  // One phase: an enclosing RecordEvent plus an Allocate/Free pair of 1024
  // bytes at the given address.
  auto run_phase = [](const char* phase_name, void* addr) {
    RecordEvent phase(phase_name, TracerEventType::UserDefined, 1);
    RecordMemEvent(addr, CPUPlace(), 1024, TracerMemEventType::Allocate);
    RecordMemEvent(addr, CPUPlace(), 1024, TracerMemEventType::Free);
  };

  ProfilerOptions options;
  options.trace_level = 1;
  options.trace_switch = 3;
  auto profiler = Profiler::Create(options);
  EXPECT_TRUE(profiler);
  EnableHostEventRecorder();
  profiler->Prepare();
  profiler->Start();

  run_phase("TestTracerForMem_phase1", reinterpret_cast<void*>(0));
  run_phase("TestTracerForMem_phase2", reinterpret_cast<void*>(1024));

  auto profiler_result = profiler->Stop();
  auto nodetree = profiler_result->GetNodeTrees();
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle {
namespace framework {
class RuntimeContext;
}
namespace platform {
// Supplementary operator information tracing.
// Only the constructor is declared here; its definition lives elsewhere.
// NOTE(review): presumably the supplement info is recorded as a side effect
// of construction, since the class declares no other members -- confirm
// against the implementation.
class RecordOpInfoSupplement {
public:
/**
* @param type: Operator type name.
* @param attrs: Attribute map of the operator.
* @param shape_ctx: Infershape context object of the operator.
* @param ctx: Runtime context object of the operator.
*/
explicit RecordOpInfoSupplement(const std::string& type,
const framework::AttributeMap& attrs,
const framework::InferShapeContext& shape_ctx,
const framework::RuntimeContext& ctx);
};
} // namespace platform
} // namespace paddle
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册