未验证 提交 8dd0a3b9 编写于 作者: C chenjian 提交者: GitHub

record memory and op supplement info (#43550)

* record memory and op supplement info

* update

* update

* fix a bug

* fix memory recording

* fix a bug

* update

* update

* fix a bug

* update

* fix a bug

* fix a bug

* fix a bug

* Revert "fix a bug"

This reverts commit c1d4df52762ba9ae7c7e27cd2ba4fc3a7ed9c7a5.

* fix a bug

* fix format

* fix
上级 e64823c1
......@@ -24,6 +24,7 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/core/kernel_context.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
......@@ -558,6 +559,11 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
op_with_kernel->Info().infer_shape_(
instr_node.InnerInferShapeContext().get());
}
infershape_event.End();
platform::RecordOpInfoSupplement(op->Type(),
op->Attrs(),
*(instr_node.InnerInferShapeContext()),
*(instr_node.InnerRuntimeContext()));
}
}
......
此差异已折叠。
cc_library(
allocator
SRCS allocator.cc
DEPS place stats)
DEPS place stats profiler)
cc_library(
cpu_allocator
SRCS cpu_allocator.cc
......@@ -21,7 +21,7 @@ cc_library(
cc_library(
naive_best_fit_allocator
SRCS naive_best_fit_allocator.cc
DEPS allocator buddy_allocator profiler)
DEPS allocator buddy_allocator)
cc_test(
naive_best_fit_allocator_test
SRCS naive_best_fit_allocator_test.cc
......
......@@ -32,7 +32,8 @@
#endif
PADDLE_DEFINE_EXPORTED_bool(
init_allocated_mem, false,
init_allocated_mem,
false,
"It is a mistake that the values of the memory allocated by "
"BuddyAllocator are always zeroed in some op's implementation. "
"To find this error in time, we use init_allocated_mem to indicate "
......@@ -77,7 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
std::call_once(init_flag, []() {
a = new detail::BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
platform::CpuMinChunkSize(),
platform::CpuMaxChunkSize());
});
return a;
......@@ -95,7 +97,8 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
}
template <>
void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
void Free<platform::CPUPlace>(const platform::CPUPlace &place,
void *p,
size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
......@@ -125,7 +128,8 @@ void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) {
return p;
}
template <>
void Free<platform::IPUPlace>(const platform::IPUPlace &place, void *p,
void Free<platform::IPUPlace>(const platform::IPUPlace &place,
void *p,
size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
......@@ -154,7 +158,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
}
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
ret,
XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], no enough memory", ret));
if (FLAGS_init_allocated_mem) {
......@@ -171,7 +176,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
}
template <>
void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
void Free<platform::XPUPlace>(const platform::XPUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
......@@ -234,11 +240,13 @@ class NPUBuddyAllocatorList {
BuddyAllocator *Get(int npu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(),
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(), pos));
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetNPUDeviceId(devices_[pos]);
......@@ -246,7 +254,8 @@ class NPUBuddyAllocatorList {
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(),
platform::NPUMaxChunkSize(), EXTRA_PADDING_SIZE));
platform::NPUMaxChunkSize(),
EXTRA_PADDING_SIZE));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
......@@ -312,8 +321,10 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize "
"%s, NpuMaxChunkSize %s, NPU memory used: %s.",
string::HumanReadableSize(size), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::NPUPlace>(place))));
......@@ -331,7 +342,8 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
}
template <>
void Free<platform::NPUPlace>(const platform::NPUPlace &place, void *p,
void Free<platform::NPUPlace>(const platform::NPUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
......@@ -384,7 +396,8 @@ void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
template <>
void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
void *p, size_t size) {
void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
GetNPUPinnedBuddyAllocator()->Free(p);
#else
......@@ -430,18 +443,21 @@ class GPUBuddyAllocatorList {
BuddyAllocator *Get(int gpu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), gpu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(),
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(), pos));
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
allocators_[pos].reset(
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(devices_[pos])),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
......@@ -493,8 +509,10 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize "
"%s, GpuMaxChunkSize %s, GPU memory used: %s.",
string::HumanReadableSize(size), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::CUDAPlace>(place))));
......@@ -515,7 +533,8 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
}
template <>
void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
void Free<platform::CUDAPlace>(const platform::CUDAPlace &place,
void *p,
size_t size) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
GetGPUBuddyAllocator(place.device)->Free(p);
......@@ -584,7 +603,8 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
template <>
void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
void *p, size_t size) {
void *p,
size_t size) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
GetCUDAPinnedBuddyAllocator()->Free(p);
#else
......@@ -630,18 +650,21 @@ class MLUBuddyAllocatorList {
BuddyAllocator *Get(int mlu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(),
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(), pos));
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetMLUDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
allocators_[pos].reset(
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::MLUAllocator(devices_[pos])),
platform::MLUMinChunkSize(), platform::MLUMaxChunkSize()));
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "(mlu reuse gpu GFlags) "
......@@ -693,8 +716,10 @@ void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize "
"%s, MLUMinChunkSize %s, MLU memory used: %s.",
string::HumanReadableSize(size), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::MLUPlace>(place))));
......@@ -711,7 +736,8 @@ void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
}
template <>
void Free<platform::MLUPlace>(const platform::MLUPlace &place, void *p,
void Free<platform::MLUPlace>(const platform::MLUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_MLU
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
......@@ -759,10 +785,12 @@ class BuddyAllocatorList {
}
BuddyAllocator *Get(int dev_id) {
PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(),
PADDLE_ENFORCE_NE(init_flags_.find(dev_id),
init_flags_.end(),
platform::errors::OutOfRange(
"Cannot find %s %d, please check visible devices.",
device_type_, dev_id));
device_type_,
dev_id));
std::call_once(*init_flags_[dev_id], [this, dev_id] {
phi::DeviceManager::SetDevice(device_type_, dev_id);
......@@ -773,7 +801,8 @@ class BuddyAllocatorList {
new detail::CustomAllocator(device_type_, dev_id)),
phi::DeviceManager::GetMinChunkSize(place),
phi::DeviceManager::GetMaxChunkSize(place),
phi::DeviceManager::GetExtraPaddingSize(place), device_type_));
phi::DeviceManager::GetExtraPaddingSize(place),
device_type_));
});
return allocators_[dev_id].get();
......@@ -813,8 +842,11 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in %s:%d, avaliable %s, total %s, used "
"%s. ",
string::HumanReadableSize(size), place.GetDeviceType(), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(size),
place.GetDeviceType(),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(total - avail)));
} else {
if (FLAGS_init_allocated_mem) {
......@@ -830,7 +862,8 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
}
template <>
void Free<platform::CustomPlace>(const platform::CustomPlace &place, void *p,
void Free<platform::CustomPlace>(const platform::CustomPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
......@@ -922,8 +955,6 @@ namespace allocation {
// Allocates `size` bytes for place_ and returns them wrapped in a newly
// created Allocation object.
// NOTE(review): the hunk header above (8 lines -> 6 lines) suggests this commit
// removes the PushMemRecord call shown below -- confirm against the
// post-commit file before relying on it.
phi::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
// VisitPlace presumably dispatches on the concrete type of place_; the
// visitor carries the requested byte count.
void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size));
auto *tmp_alloc = new Allocation(ptr, size, place_);
// The Allocation object's own address (not `ptr`) is what is handed to the
// recorder.
platform::MemEvenRecorder::Instance().PushMemRecord(
static_cast<void *>(tmp_alloc), place_, size);
return tmp_alloc;
}
......@@ -931,8 +962,6 @@ void NaiveBestFitAllocator::FreeImpl(phi::Allocation *allocation) {
paddle::platform::VisitPlace(
allocation->place(),
legacy::FreeVisitor(allocation->ptr(), allocation->size()));
platform::MemEvenRecorder::Instance().PopMemRecord(
static_cast<void *>(allocation), place_);
delete allocation;
}
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
namespace paddle {
namespace memory {
namespace allocation {
......@@ -26,6 +27,10 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr()));
#endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size());
platform::RecordMemEvent(allocation->ptr(),
allocation->place(),
allocation->size(),
platform::TracerMemEventType::ReservedFree);
delete allocation;
}
phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
......@@ -36,6 +41,10 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
#endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
platform::RecordMemEvent(ptr,
platform::CUDAPinnedPlace(),
size,
platform::TracerMemEventType::ReservedAllocate);
return new Allocation(ptr, size, platform::CUDAPinnedPlace());
}
} // namespace allocation
......
......@@ -16,6 +16,7 @@
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
namespace paddle {
namespace memory {
......@@ -30,14 +31,18 @@ class StatAllocator : public Allocator {
protected:
void FreeImpl(phi::Allocation* allocation) override {
if (platform::is_cpu_place(allocation->place())) {
HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-allocation->size());
if (platform::is_cpu_place(allocation->place()) ||
platform::is_cuda_pinned_place(allocation->place())) {
HOST_MEMORY_STAT_UPDATE(
Allocated, allocation->place().GetDeviceId(), -allocation->size());
} else {
DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-allocation->size());
DEVICE_MEMORY_STAT_UPDATE(
Allocated, allocation->place().GetDeviceId(), -allocation->size());
}
platform::RecordMemEvent(allocation->ptr(),
allocation->place(),
allocation->size(),
platform::TracerMemEventType::Free);
underlying_allocator_->Free(allocation);
}
......@@ -48,12 +53,16 @@ class StatAllocator : public Allocator {
const platform::Place& place = allocation->place();
if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
HOST_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(),
allocation->size());
HOST_MEMORY_STAT_UPDATE(
Allocated, place.GetDeviceId(), allocation->size());
} else {
DEVICE_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(),
allocation->size());
DEVICE_MEMORY_STAT_UPDATE(
Allocated, place.GetDeviceId(), allocation->size());
}
platform::RecordMemEvent(allocation->ptr(),
allocation->place(),
allocation->size(),
platform::TracerMemEventType::Allocate);
return allocation.release();
}
......
......@@ -41,6 +41,7 @@ limitations under the License. */
#endif
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
DECLARE_bool(use_pinned_memory);
DECLARE_double(fraction_of_gpu_memory_to_use);
......@@ -64,11 +65,13 @@ void* AlignedMalloc(size_t size) {
#else
int error = posix_memalign(&p, alignment, size);
PADDLE_ENFORCE_EQ(
error, 0,
error,
0,
platform::errors::ResourceExhausted(
"Fail to alloc memory of %ld size, error code is %d.", size, error));
#endif
PADDLE_ENFORCE_NOT_NULL(p, platform::errors::ResourceExhausted(
PADDLE_ENFORCE_NOT_NULL(p,
platform::errors::ResourceExhausted(
"Fail to alloc memory of %ld size.", size));
return p;
}
......@@ -95,7 +98,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
}
HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate);
return p;
}
......@@ -114,6 +118,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
#endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree);
}
// Always returns false for CPUAllocator.
bool CPUAllocator::UseGpu() const {
  return false;
}
......@@ -146,7 +152,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
......@@ -161,21 +168,29 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
gpu_id_, string::HumanReadableSize(size), gpu_id_,
string::HumanReadableSize(allocated), string::HumanReadableSize(avail),
gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg));
gpu_id_,
string::HumanReadableSize(size),
gpu_id_,
string::HumanReadableSize(allocated),
string::HumanReadableSize(avail),
gpu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void GPUAllocator::Free(void* p, size_t size, size_t index) {
PADDLE_ENFORCE_EQ(index, 0,
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(gpu_alloc_size_, size,
PADDLE_ENFORCE_GE(gpu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, gpu_alloc_size_));
size,
gpu_alloc_size_));
gpu_alloc_size_ -= size;
platform::RecordedGpuFree(p, size, gpu_id_);
......@@ -213,6 +228,8 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
*index = 1; // PINNED memory
cuda_pinnd_alloc_size_ += size;
HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate);
return p;
} else {
LOG(WARNING) << "cudaHostAlloc failed.";
......@@ -224,21 +241,25 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
gpuError_t err;
PADDLE_ENFORCE_EQ(index, 1,
PADDLE_ENFORCE_EQ(index,
1,
platform::errors::InvalidArgument(
"The index should be 1, but got %d", index));
PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size,
PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated cuda pinned memory (%d)",
size, cuda_pinnd_alloc_size_));
size,
cuda_pinnd_alloc_size_));
cuda_pinnd_alloc_size_ -= size;
#ifdef PADDLE_WITH_HIP
err = hipHostFree(p);
if (err != hipErrorDeinitialized) {
PADDLE_ENFORCE_EQ(
err, hipSuccess,
err,
hipSuccess,
platform::errors::Fatal(
"hipFreeHost failed in GPUPinnedAllocator, error code is %d", err));
}
......@@ -252,13 +273,16 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
// cudaFreeHost succeeds.
if (err != cudaErrorCudartUnloading) {
PADDLE_ENFORCE_EQ(
err, 0,
err,
0,
platform::errors::Fatal(
"cudaFreeHost failed in GPUPinnedAllocator, error code is %d",
err));
}
#endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree);
}
// Always returns false for CUDAPinnedAllocator.
bool CUDAPinnedAllocator::UseGpu() const {
  return false;
}
......@@ -289,7 +313,8 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) {
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
......@@ -304,22 +329,29 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) {
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
npu_id_, string::HumanReadableSize(size), npu_id_,
string::HumanReadableSize(avail), npu_id_,
FLAGS_fraction_of_gpu_memory_to_use, err_msg));
npu_id_,
string::HumanReadableSize(size),
npu_id_,
string::HumanReadableSize(avail),
npu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void NPUAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0,
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(npu_alloc_size_, size,
PADDLE_ENFORCE_GE(npu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, npu_alloc_size_));
size,
npu_alloc_size_));
npu_alloc_size_ -= size;
platform::RecordedNPUFree(p, size, npu_id_);
......@@ -358,21 +390,25 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
aclError err;
PADDLE_ENFORCE_EQ(index, 1,
PADDLE_ENFORCE_EQ(index,
1,
platform::errors::InvalidArgument(
"The index should be 1, but got %d", index));
PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size,
PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated npu pinned memory (%d)",
size, npu_pinnd_alloc_size_));
size,
npu_pinnd_alloc_size_));
npu_pinnd_alloc_size_ -= size;
err = platform::NPUHostFree(p);
if (err != ACL_ERROR_NONE) {
PADDLE_ENFORCE_EQ(
err, 0,
err,
0,
platform::errors::Fatal(
"NPUHostFree failed in NPUPinnedAllocator, error code is %d", err));
}
......@@ -407,7 +443,8 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) {
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum MLU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
......@@ -422,21 +459,29 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) {
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
mlu_id_, string::HumanReadableSize(size), mlu_id_,
string::HumanReadableSize(allocated), string::HumanReadableSize(avail),
mlu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg));
mlu_id_,
string::HumanReadableSize(size),
mlu_id_,
string::HumanReadableSize(allocated),
string::HumanReadableSize(avail),
mlu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void MLUAllocator::Free(void* p, size_t size, size_t index) {
PADDLE_ENFORCE_EQ(index, 0,
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(mlu_alloc_size_, size,
PADDLE_ENFORCE_GE(mlu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, mlu_alloc_size_));
size,
mlu_alloc_size_));
mlu_alloc_size_ -= size;
platform::RecordedMLUFree(p, size, mlu_id_);
......@@ -465,7 +510,9 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {
"\n\nOut of memory error on %s %d. "
"total memory is %s, used memory is %s, "
"available memory is only %s.\n\n",
dev_type_, dev_id_, string::HumanReadableSize(total),
dev_type_,
dev_id_,
string::HumanReadableSize(total),
string::HumanReadableSize(total - avail),
string::HumanReadableSize(avail)));
}
......@@ -474,14 +521,17 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {
void CustomAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "CustomAllocator::Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0,
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(plug_alloc_size, size,
PADDLE_ENFORCE_GE(plug_alloc_size,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, plug_alloc_size));
size,
plug_alloc_size));
plug_alloc_size -= size;
auto place = platform::CustomPlace(dev_type_, dev_id_);
auto device = phi::DeviceManager::GetDeviceWithPlace(place);
......
此差异已折叠。
......@@ -354,7 +354,9 @@ if(WITH_GPU)
enforce
dynload_cuda
new_profiler
stats)
stats
op_proto_maker
shape_inference)
nv_library(
device_memory_aligment
SRCS device_memory_aligment.cc
......@@ -363,7 +365,14 @@ elseif(WITH_ROCM)
hip_library(
profiler
SRCS profiler.cc profiler.cu
DEPS os_info device_tracer gpu_info enforce new_profiler stats)
DEPS os_info
device_tracer
gpu_info
enforce
new_profiler
stats
op_proto_maker
shape_inference)
hip_library(
device_memory_aligment
SRCS device_memory_aligment.cc
......@@ -372,7 +381,13 @@ else()
cc_library(
profiler
SRCS profiler.cc
DEPS os_info device_tracer enforce new_profiler stats)
DEPS os_info
device_tracer
enforce
new_profiler
stats
op_proto_maker
shape_inference)
cc_library(
device_memory_aligment
SRCS device_memory_aligment.cc
......
......@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
#include "paddle/fluid/string/split.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
......@@ -51,10 +52,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_uint64(gpu_memory_limit_mb);
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false,
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log,
false,
"Whether to print the message of gpu memory usage "
"at exit, mainly used for UT and CI.");
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, true,
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb,
true,
"Whether to print the message of gpu memory usage "
"MB as a unit of measurement.");
......@@ -66,7 +69,10 @@ namespace platform {
void GpuMemoryUsage(size_t *available, size_t *total) {
size_t actual_available, actual_total;
RecordedGpuMemGetInfo(available, total, &actual_available, &actual_total,
RecordedGpuMemGetInfo(available,
total,
&actual_available,
&actual_total,
platform::GetCurrentDeviceId());
}
......@@ -94,7 +100,8 @@ size_t GpuMaxAllocSize() {
static size_t GpuAllocSize(bool realloc) {
size_t available_to_alloc = GpuAvailableMemToAlloc();
PADDLE_ENFORCE_GT(
available_to_alloc, 0,
available_to_alloc,
0,
platform::errors::ResourceExhausted("Not enough available GPU memory."));
// If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
// allocated by fraction
......@@ -105,7 +112,8 @@ static size_t GpuAllocSize(bool realloc) {
? flag_mb << 20
: available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
PADDLE_ENFORCE_GE(
available_to_alloc, alloc_bytes,
available_to_alloc,
alloc_bytes,
platform::errors::ResourceExhausted("Not enough available GPU memory."));
VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
<< " MiB, is it Re-alloc: " << realloc;
......@@ -192,13 +200,16 @@ class RecordedGpuMallocHelper {
});
PADDLE_ENFORCE_GE(
dev_id, 0,
dev_id,
0,
platform::errors::OutOfRange(
"Device id must be not less than 0, but got %d.", dev_id));
PADDLE_ENFORCE_LT(
dev_id, instances_.size(),
dev_id,
instances_.size(),
platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.",
dev_id, instances_.size()));
dev_id,
instances_.size()));
return instances_[dev_id].get();
}
......@@ -207,7 +218,8 @@ class RecordedGpuMallocHelper {
* or cudaSuccess would be returned, and the cudaGetLastError() flag
* would be clear.
*/
gpuError_t Malloc(void **ptr, size_t size,
gpuError_t Malloc(void **ptr,
size_t size,
bool malloc_managed_memory = false) {
LockGuardPtr<std::mutex> lock(mtx_);
if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) {
......@@ -236,7 +248,10 @@ class RecordedGpuMallocHelper {
// Bookkeeping for the newly reserved block: running total, exported stat
// counter, and the per-device reserved-memory statistic.
cur_size_.fetch_add(size);
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size);
// `ptr` is the void** out-parameter; the reserved block itself lives at *ptr.
// Pass *ptr so the event carries the block's address rather than the address
// of the caller's pointer variable (a void** converts silently to a void
// pointer, so passing `ptr` compiles but records the wrong address).
platform::RecordMemEvent(*ptr,
GPUPlace(dev_id_),
size,
platform::TracerMemEventType::ReservedAllocate);
#ifdef PADDLE_WITH_TESTING
gpu_ptrs.insert(*ptr);
#endif
......@@ -275,6 +290,10 @@ class RecordedGpuMallocHelper {
cur_size_.fetch_sub(size);
STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size);
platform::RecordMemEvent(ptr,
GPUPlace(dev_id_),
size,
platform::TracerMemEventType::ReservedFree);
} else {
platform::GpuGetLastError(); // clear the error flag when
// cudaErrorCudartUnloading /
......@@ -300,7 +319,9 @@ class RecordedGpuMallocHelper {
#endif
}
bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
bool GetMemInfo(size_t *avail,
size_t *total,
size_t *actual_avail,
size_t *actual_total) {
{
CUDADeviceGuard guard(dev_id_);
......@@ -335,7 +356,8 @@ class RecordedGpuMallocHelper {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10020
CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size,
CUresult MemCreate(CUmemGenericAllocationHandle *handle,
size_t size,
const CUmemAllocationProp *prop,
unsigned long long flags) { // NOLINT
auto result =
......@@ -371,7 +393,9 @@ class RecordedGpuMallocHelper {
std::once_flag RecordedGpuMallocHelper::once_flag_;
gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id,
gpuError_t RecordedGpuMalloc(void **ptr,
size_t size,
int dev_id,
bool malloc_managed_memory) {
return RecordedGpuMallocHelper::Instance(dev_id)->Malloc(
ptr, size, malloc_managed_memory);
......@@ -383,22 +407,28 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10020
CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
size_t size,
const CUmemAllocationProp *prop,
unsigned long long flags, int dev_id) { // NOLINT
return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(handle, size,
prop, flags);
unsigned long long flags,
int dev_id) { // NOLINT
return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(
handle, size, prop, flags);
}
CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size,
CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle,
size_t size,
int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size);
}
#endif
#endif
bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
size_t *actual_total, int dev_id) {
bool RecordedGpuMemGetInfo(size_t *avail,
size_t *total,
size_t *actual_avail,
size_t *actual_total,
int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->GetMemInfo(
avail, total, actual_avail, actual_total);
}
......@@ -493,26 +523,35 @@ void GpuDestroyStream(gpuStream_t stream) {
// Forwarding wrapper: delegates directly to phi::backends::gpu::GpuDeviceSync().
void GpuDeviceSync() {
  phi::backends::gpu::GpuDeviceSync();
}
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
void GpuMemcpyAsync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream) {
phi::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream);
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind) {
phi::backends::gpu::GpuMemcpySync(dst, src, count, kind);
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
phi::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device,
count, stream);
void GpuMemcpyPeerAsync(void *dst,
int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream) {
phi::backends::gpu::GpuMemcpyPeerAsync(
dst, dst_device, src, src_device, count, stream);
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
phi::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device,
count);
void GpuMemcpyPeerSync(
void *dst, int dst_device, const void *src, int src_device, size_t count) {
phi::backends::gpu::GpuMemcpyPeerSync(
dst, dst_device, src, src_device, count);
}
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
......
......@@ -30,12 +30,16 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nvtx.h"
#endif
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h"
PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false,
PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler,
false,
"Enable rpc profiler or not.");
DEFINE_bool(enable_host_event_recorder_hook, false,
DEFINE_bool(enable_host_event_recorder_hook,
false,
"enable HostEventRecorder, hook Profiler");
namespace paddle {
......@@ -43,8 +47,11 @@ namespace platform {
MemEvenRecorder MemEvenRecorder::recorder;
Event::Event(EventType type, std::string name, uint32_t thread_id,
EventRole role, std::string attr)
Event::Event(EventType type,
std::string name,
uint32_t thread_id,
EventRole role,
std::string attr)
: type_(type),
name_(name),
thread_id_(thread_id),
......@@ -68,8 +75,10 @@ double Event::CudaElapsedMs(const Event &e) const {
#endif
}
RecordEvent::RecordEvent(const char *name, const TracerEventType type,
uint32_t level, const EventRole role) {
RecordEvent::RecordEvent(const char *name,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook) {
......@@ -100,8 +109,10 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type,
start_ns_ = PosixInNsec();
}
RecordEvent::RecordEvent(const std::string &name, const TracerEventType type,
uint32_t level, const EventRole role) {
RecordEvent::RecordEvent(const std::string &name,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook) {
......@@ -130,8 +141,10 @@ RecordEvent::RecordEvent(const std::string &name, const TracerEventType type,
start_ns_ = PosixInNsec();
}
RecordEvent::RecordEvent(const std::string &name, const std::string &attr,
const TracerEventType type, uint32_t level,
RecordEvent::RecordEvent(const std::string &name,
const std::string &attr,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
......@@ -215,8 +228,8 @@ void RecordEvent::End() {
DeviceTracer *tracer = GetDeviceTracer();
if (tracer) {
uint64_t end_ns = PosixInNsec();
tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(),
g_thread_id);
tracer->AddCPURecords(
CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id);
}
ClearCurAnnotation();
PopEvent(*name_, role_);
......@@ -226,7 +239,8 @@ void RecordEvent::End() {
is_enabled_ = false;
}
RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type,
RecordInstantEvent::RecordInstantEvent(const char *name,
TracerEventType type,
uint32_t level) {
if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) {
return;
......@@ -236,21 +250,242 @@ RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type,
name, start_end_ns, start_end_ns, EventRole::kOrdinary, type);
}
void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
// Records supplementary information about one operator execution into the
// host event recorder: the operator type, the dims and var types of every
// input slot listed in the runtime context, and the op-creation callstack
// attribute when the operator carries one.
//
// @param type       Operator type name.
// @param attrs      Attribute map of the operator.
// @param shape_ctx  Infer-shape context queried for input dims / var types.
// @param ctx        Runtime context whose `inputs` keys name the input slots.
RecordOpInfoSupplement::RecordOpInfoSupplement(
    const std::string &type,
    const framework::AttributeMap &attrs,
    const framework::InferShapeContext &shape_ctx,
    const framework::RuntimeContext &ctx) {
  // Supplement events are only produced when the host-event-recorder hook is
  // switched on.
  if (!FLAGS_enable_host_event_recorder_hook) {
    return;
  }

  // Collect dims and var types for each input slot, keyed by slot name.
  std::map<std::string, std::vector<framework::DDim>> input_shapes;
  std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
  for (const auto &slot : ctx.inputs) {
    const auto &slot_name = slot.first;
    input_shapes[slot_name] = shape_ctx.GetInputsDim(slot_name);
    dtypes[slot_name] = shape_ctx.GetInputsVarType(slot_name);
  }

  // The callstack stays empty when the attribute is absent.
  std::vector<std::string> callstack;
  const auto callstack_attr = attrs.find(
      framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
  if (callstack_attr != attrs.end()) {
    callstack =
        BOOST_GET_CONST(std::vector<std::string>, callstack_attr->second);
  }

  HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance().RecordEvent(
      PosixInNsec(), type, input_shapes, dtypes, callstack);
}
// Records one memory event together with a snapshot of the memory statistics
// of the device the event belongs to.
//
// Allocate / Free events snapshot the "Allocated" statistics, while
// ReservedAllocate / ReservedFree events snapshot the "Reserved" statistics.
// The pair that is not sampled is passed as 0, which means "keep the same as
// before". Allocate-style events are pushed to MemEvenRecorder, free-style
// events are popped from it. Any other event type is ignored.
//
// @param ptr    Address being allocated or freed.
// @param place  Device the memory belongs to.
// @param size   Size of the memory being allocated or freed.
// @param type   Kind of memory manipulation.
RecordMemEvent::RecordMemEvent(const void *ptr,
                               const phi::Place &place,
                               size_t size,
                               const TracerMemEventType type) {
  // Nothing to do unless the legacy profiler or the host-event-recorder hook
  // is active.
  if (g_state == ProfilerState::kDisabled &&
      !FLAGS_enable_host_event_recorder_hook) {
    return;
  }

  const bool samples_allocated = (type == TracerMemEventType::Allocate ||
                                  type == TracerMemEventType::Free);
  const bool samples_reserved = (type == TracerMemEventType::ReservedAllocate ||
                                 type == TracerMemEventType::ReservedFree);
  if (!samples_allocated && !samples_reserved) {
    return;
  }

  // CPU and CUDA-pinned places are tracked by the host statistics, every
  // other place by the device statistics.
  const bool on_host =
      platform::is_cpu_place(place) || platform::is_cuda_pinned_place(place);

  // 0 means keep the same as before.
  uint64_t current_allocated = 0;
  uint64_t peak_allocated = 0;
  uint64_t current_reserved = 0;
  uint64_t peak_reserved = 0;

  if (samples_allocated) {
    if (on_host) {
      current_allocated =
          HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
      peak_allocated =
          HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
    } else {
      current_allocated =
          DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
      peak_allocated =
          DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
    }
  } else {
    if (on_host) {
      current_reserved =
          HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
      peak_reserved =
          HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
    } else {
      current_reserved =
          DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
      peak_reserved =
          DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
    }
  }

  const bool is_push = (type == TracerMemEventType::Allocate ||
                        type == TracerMemEventType::ReservedAllocate);
  if (is_push) {
    platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
                                                        place,
                                                        size,
                                                        type,
                                                        current_allocated,
                                                        current_reserved,
                                                        peak_allocated,
                                                        peak_reserved);
  } else {
    platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
                                                       place,
                                                       size,
                                                       type,
                                                       current_allocated,
                                                       current_reserved,
                                                       peak_allocated,
                                                       peak_reserved);
  }
}
void MemEvenRecorder::PushMemRecord(const void *ptr,
const Place &place,
size_t size) {
if (g_state == ProfilerState::kDisabled) return;
if (g_state == ProfilerState::kDisabled) {
return;
}
std::lock_guard<std::mutex> guard(mtx_);
auto &events = address_memevent_[place];
PADDLE_ENFORCE_EQ(events.count(ptr), 0,
PADDLE_ENFORCE_EQ(events.count(ptr),
0,
platform::errors::InvalidArgument(
"The Place can't exist in the stage of PushMemRecord"));
events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
events.emplace(ptr,
std::unique_ptr<RecordMemEvent>(
new MemEvenRecorder::RecordMemEvent(place, size)));
}
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
// Records an allocation-style memory event.
//
// When the host-event-recorder hook is enabled, the event (with the supplied
// statistics snapshot) goes to HostEventRecorder<CommonMemEvent> and nothing
// else happens. Otherwise the legacy profiler path is used: ReservedAllocate
// events are dropped, nothing is recorded while the profiler is disabled, and
// a RecordMemEvent is registered under `ptr` for `place`. Registering the same
// `ptr` twice for one place trips the PADDLE_ENFORCE_EQ check.
void MemEvenRecorder::PushMemRecord(const void *ptr,
                                    const Place &place,
                                    size_t size,
                                    TracerMemEventType type,
                                    uint64_t current_allocated,
                                    uint64_t current_reserved,
                                    uint64_t peak_allocated,
                                    uint64_t peak_reserved) {
  std::lock_guard<std::mutex> guard(mtx_);

  // New-style memory record.
  if (FLAGS_enable_host_event_recorder_hook) {
    HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
        PosixInNsec(),
        reinterpret_cast<uint64_t>(ptr),
        type,
        size,
        place,
        current_allocated,
        current_reserved,
        peak_allocated,
        peak_reserved);
    return;
  }

  // The old profiler only analyses memory managed by paddle, so reserved
  // allocations are skipped; it also records nothing while disabled.
  if (type == TracerMemEventType::ReservedAllocate ||
      g_state == ProfilerState::kDisabled) {
    return;
  }

  auto &records_for_place = address_memevent_[place];
  PADDLE_ENFORCE_EQ(records_for_place.count(ptr),
                    0,
                    platform::errors::InvalidArgument(
                        "The Place can't exist in the stage of PushMemRecord"));
  std::unique_ptr<RecordMemEvent> record(
      new MemEvenRecorder::RecordMemEvent(place, size));
  records_for_place.emplace(ptr, std::move(record));
}
// Removes the legacy-profiler memory record registered for `ptr` on `place`.
// Does nothing while the profiler is disabled, and silently ignores pointers
// that were never registered.
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
  if (g_state == ProfilerState::kDisabled) {
    return;
  }

  std::lock_guard<std::mutex> guard(mtx_);
  auto &records_for_place = address_memevent_[place];
  const auto found = records_for_place.find(ptr);
  if (found == records_for_place.end()) {
    // The ptr may not be in address_memevent_.
    return;
  }
  records_for_place.erase(found);
}
void MemEvenRecorder::PopMemRecord(const void *ptr,
const Place &place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved) {
std::lock_guard<std::mutex> guard(mtx_);
if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord
HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
PosixInNsec(),
reinterpret_cast<uint64_t>(ptr),
type,
-size,
place,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
return;
}
if (type == TracerMemEventType::ReservedFree) {
// old profiler only analyse memory managed by paddle.
return;
}
if (g_state == ProfilerState::kDisabled) return;
auto &events = address_memevent_[place];
auto iter = events.find(ptr);
// The ptr maybe not in address_memevent
......@@ -279,8 +514,13 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
auto annotation_free = CurAnnotationName();
if (tracer) {
tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_,
annotation_free, g_mem_thread_id);
tracer->AddMemInfoRecord(start_ns_,
end_ns_,
bytes_,
place_,
alloc_in_,
annotation_free,
g_mem_thread_id);
}
PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
}
......@@ -307,22 +547,38 @@ RecordBlock::~RecordBlock() {
if (tracer) {
// We try to put all blocks at the same nested depth in the
// same timeline lane. and distinguish the using thread_id.
tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
g_thread_id);
tracer->AddCPURecords(
name_, start_ns_, PosixInNsec(), BlockDepth(), g_thread_id);
}
ClearCurBlock();
}
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &annotation) {
GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes,
place, g_mem_thread_id, annotation);
}
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &annotation) {
GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place,
g_mem_thread_id, annotation);
void PushMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place &place,
const std::string &annotation) {
GetMemEventList().Record(EventType::kPushRange,
start_ns,
end_ns,
bytes,
place,
g_mem_thread_id,
annotation);
}
void PopMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place &place,
const std::string &annotation) {
GetMemEventList().Record(EventType::kPopRange,
start_ns,
end_ns,
bytes,
place,
g_mem_thread_id,
annotation);
}
void Mark(const std::string &name) {
......@@ -334,17 +590,19 @@ void Mark(const std::string &name) {
GetEventList().Record(EventType::kMark, name, g_thread_id);
}
Event *PushEvent(const std::string &name, const EventRole role,
Event *PushEvent(const std::string &name,
const EventRole role,
std::string attr) {
return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role,
attr);
return GetEventList().Record(
EventType::kPushRange, name, g_thread_id, role, attr);
}
// Appends a kPopRange event named `name` to the event list returned by
// GetEventList(), tagged with g_thread_id, `role` and `attr`.
void PopEvent(const std::string &name, const EventRole role, std::string attr) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr);
}
void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled,
PADDLE_ENFORCE_NE(state,
ProfilerState::kDisabled,
platform::errors::InvalidArgument(
"Can't enable profiling, since the input state is"
"ProfilerState::kDisabled"));
......@@ -380,7 +638,8 @@ void ResetProfiler() {
(*it)->Clear();
}
for (auto it = g_all_mem_event_lists.begin();
it != g_all_mem_event_lists.end(); ++it) {
it != g_all_mem_event_lists.end();
++it) {
(*it)->Clear();
}
}
......@@ -576,8 +835,8 @@ static void EmulateEventPushAndPop(
std::string name =
prefix_stk.empty() ? evt.name : prefix_stk.top() + "/" + evt.name;
const char *attr = (evt.attr == nullptr ? "none" : evt.attr);
Event *orig_evt = cur_thr_list->Record(EventType::kPushRange, name, tid,
evt.role, attr);
Event *orig_evt = cur_thr_list->Record(
EventType::kPushRange, name, tid, evt.role, attr);
(*out)[tid][evt.end_ns] = std::make_pair(orig_evt, evt.start_ns);
cur_thr_list->Record(EventType::kPopRange, name, tid, evt.role, attr);
}
......@@ -593,8 +852,8 @@ static void EmulateCPURecordsAdd(
for (const auto &thr_sec : host_sec.thr_sections) {
uint64_t tid = thr_sec.thread_id;
for (const auto &evt : thr_sec.events) {
tracer->AddCPURecords(evt.name, evt.start_ns, evt.end_ns, BlockDepth(),
tid);
tracer->AddCPURecords(
evt.name, evt.start_ns, evt.end_ns, BlockDepth(), tid);
}
}
}
......
......@@ -30,6 +30,8 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.pb.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
......@@ -102,6 +104,22 @@ struct MemEvenRecorder {
public:
void PushMemRecord(const void* ptr, const Place& place, size_t size);
void PopMemRecord(const void* ptr, const Place& place);
void PushMemRecord(const void* ptr,
const Place& place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved);
void PopMemRecord(const void* ptr,
const Place& place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved);
void Flush();
static MemEvenRecorder& Instance() { return recorder; }
......@@ -160,7 +178,8 @@ struct EventList {
std::vector<T> Reduce() {
std::vector<T> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
result.insert(result.begin(),
std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
......@@ -173,13 +192,21 @@ struct EventList {
};
void Mark(const std::string& name);
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place, const std::string& annotation);
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place, const std::string& annotation);
Event* PushEvent(const std::string& name, const EventRole role,
void PushMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place& place,
const std::string& annotation);
void PopMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place& place,
const std::string& annotation);
Event* PushEvent(const std::string& name,
const EventRole role,
const std::string attr = "none");
void PopEvent(const std::string& name, const EventRole role,
void PopEvent(const std::string& name,
const EventRole role,
const std::string attr = "none");
// Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
......
cc_library(
host_tracer
SRCS host_tracer.cc
DEPS enforce)
DEPS enforce ddim var_type_traits)
cc_library(
cuda_tracer
SRCS cuda_tracer.cc cupti_data_process.cc
......@@ -10,7 +10,7 @@ add_subdirectory(mlu)
cc_library(
event_node
SRCS event_node.cc
DEPS enforce)
DEPS enforce place)
cc_library(
profiler_utils
SRCS utils.cc
......
......@@ -18,16 +18,21 @@
#include <functional>
#include <string>
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later
#include "paddle/fluid/platform/profiler/trace_event.h"
#include "paddle/phi/core/ddim.h"
namespace paddle {
namespace platform {
struct CommonEvent {
public:
CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns,
EventRole role, TracerEventType type)
CommonEvent(const char *name,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type)
: name(name),
start_ns(start_ns),
end_ns(end_ns),
......@@ -35,8 +40,12 @@ struct CommonEvent {
type(type) {}
CommonEvent(std::function<void *(size_t)> arena_allocator,
const std::string &name_str, uint64_t start_ns, uint64_t end_ns,
EventRole role, TracerEventType type, const std::string &attr_str)
const std::string &name_str,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type,
const std::string &attr_str)
: start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
strncpy(buf, name_str.c_str(), name_str.length() + 1);
......@@ -47,8 +56,11 @@ struct CommonEvent {
}
CommonEvent(std::function<void *(size_t)> arena_allocator,
const std::string &name_str, uint64_t start_ns, uint64_t end_ns,
EventRole role, TracerEventType type)
const std::string &name_str,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type)
: start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
strncpy(buf, name_str.c_str(), name_str.length() + 1);
......@@ -63,5 +75,61 @@ struct CommonEvent {
const char *attr = nullptr; // not owned, designed for performance
};
// One host-side memory event: an allocation or free at a given address,
// together with a snapshot of the current / peak allocated and reserved
// statistics taken when the event was recorded.
struct CommonMemEvent {
 public:
  // @param timestamp_ns       Time the event was recorded, in nanoseconds.
  // @param addr               Address the event refers to.
  // @param type               Kind of memory manipulation.
  // @param increase_bytes     Signed size change caused by the event.
  // @param place              Device the memory belongs to.
  // @param current_allocated  Current allocated statistic at record time.
  // @param current_reserved   Current reserved statistic at record time.
  // @param peak_allocated     Peak allocated statistic at record time.
  // @param peak_reserved      Peak reserved statistic at record time.
  CommonMemEvent(uint64_t timestamp_ns,
                 uint64_t addr,
                 TracerMemEventType type,
                 int64_t increase_bytes,
                 const Place &place,
                 uint64_t current_allocated,
                 uint64_t current_reserved,
                 uint64_t peak_allocated,
                 uint64_t peak_reserved)
      : timestamp_ns(timestamp_ns),
        addr(addr),
        type(type),
        increase_bytes(increase_bytes),
        place(place),
        // These two were previously missing from the initializer list, which
        // left the members uninitialized even though consumers read them.
        current_allocated(current_allocated),
        current_reserved(current_reserved),
        peak_allocated(peak_allocated),
        peak_reserved(peak_reserved) {}

  uint64_t timestamp_ns;
  uint64_t addr;
  TracerMemEventType type;
  int64_t increase_bytes;
  Place place;
  uint64_t current_allocated;
  uint64_t current_reserved;
  uint64_t peak_allocated;
  uint64_t peak_reserved;
};
struct OperatorSupplementOriginEvent {
public:
OperatorSupplementOriginEvent(
std::function<void *(size_t)> arena_allocator,
uint64_t timestamp_ns,
const std::string &type_name,
const std::map<std::string, std::vector<framework::DDim>> &input_shapes,
const std::map<std::string, std::vector<framework::proto::VarType::Type>>
&dtypes,
const std::vector<std::string> callstack)
: timestamp_ns(timestamp_ns),
input_shapes(input_shapes),
dtypes(dtypes),
callstack(callstack) {
auto buf = static_cast<char *>(arena_allocator(type_name.length() + 1));
strncpy(buf, type_name.c_str(), type_name.length() + 1);
op_type = buf;
}
uint64_t timestamp_ns;
const char *op_type = nullptr; // not owned, designed for performance
// input shapes
std::map<std::string, std::vector<framework::DDim>> input_shapes;
std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
// call stack
const std::vector<std::string> callstack;
};
} // namespace platform
} // namespace paddle
......@@ -11,9 +11,10 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/host_tracer.h"
#include <sstream>
#include "glog/logging.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/fluid/platform/profiler/common_event.h"
......@@ -21,7 +22,8 @@
// Used to filter events, works like glog VLOG(level).
// RecordEvent will works if host_trace_level >= level.
PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1,
PADDLE_DEFINE_EXPORTED_int64(host_trace_level,
1,
"RecordEvent will works "
"if host_trace_level >= level.");
......@@ -50,6 +52,79 @@ void ProcessHostEvents(const HostEventSection<CommonEvent>& host_events,
}
}
void ProcessHostMemEvents(
const HostEventSection<CommonMemEvent>& host_mem_events,
TraceEventCollector* collector) {
for (const auto& thr_sec : host_mem_events.thr_sections) {
uint64_t tid = thr_sec.thread_id;
if (thr_sec.thread_name != kDefaultThreadName) {
collector->AddThreadName(tid, thr_sec.thread_name);
}
for (const auto& evt : thr_sec.events) {
MemTraceEvent event;
event.timestamp_ns = evt.timestamp_ns;
event.addr = evt.addr;
event.type = evt.type;
event.increase_bytes = evt.increase_bytes;
event.place = evt.place.DebugString();
event.current_allocated = evt.current_allocated;
event.current_reserved = evt.current_reserved;
event.peak_allocated = evt.peak_allocated;
event.peak_reserved = evt.peak_reserved;
event.process_id = host_mem_events.process_id;
event.thread_id = tid;
collector->AddMemEvent(std::move(event));
}
}
}
// Converts every gathered operator supplement event into an
// OperatorSupplementEvent and hands it to `collector`:
//   - each input DDim is flattened into a vector of int64_t,
//   - each var type is replaced by its protobuf enum name,
//   - the callstack lines are joined, each followed by a newline.
// Input slots with no entries produce no key in the converted maps. Threads
// whose name differs from kDefaultThreadName are also registered with the
// collector under their thread id.
void ProcessOperatorSupplementEvents(
    const HostEventSection<OperatorSupplementOriginEvent>& op_supplement_events,
    TraceEventCollector* collector) {
  for (const auto& thr_sec : op_supplement_events.thr_sections) {
    const uint64_t tid = thr_sec.thread_id;
    if (thr_sec.thread_name != kDefaultThreadName) {
      collector->AddThreadName(tid, thr_sec.thread_name);
    }
    for (const auto& evt : thr_sec.events) {
      OperatorSupplementEvent event;
      event.timestamp_ns = evt.timestamp_ns;
      event.op_type = evt.op_type;

      // Flatten the dims of every input into plain integer vectors.
      std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
      for (const auto& slot : evt.input_shapes) {
        if (slot.second.empty()) {
          continue;  // do not create a key for an input without dims
        }
        auto& shapes = input_shapes[slot.first];
        shapes.reserve(slot.second.size());
        for (const auto& ddim : slot.second) {
          std::vector<int64_t> dims;
          for (auto dim_idx = 0; dim_idx < ddim.size(); dim_idx++) {
            dims.push_back(ddim.at(dim_idx));
          }
          shapes.push_back(std::move(dims));
        }
      }

      // Translate every var type into its enum name.
      std::map<std::string, std::vector<std::string>> dtypes;
      for (const auto& slot : evt.dtypes) {
        if (slot.second.empty()) {
          continue;  // do not create a key for an input without var types
        }
        auto& type_names = dtypes[slot.first];
        type_names.reserve(slot.second.size());
        for (const auto dtype : slot.second) {
          type_names.push_back(framework::proto::VarType::Type_Name(dtype));
        }
      }

      // Join the callstack; '\n' avoids the pointless flush std::endl implies.
      std::ostringstream result_string;
      for (const auto& frame : evt.callstack) {
        result_string << frame << '\n';
      }

      // The converted maps are not needed afterwards, so move them instead of
      // deep-copying.
      event.input_shapes = std::move(input_shapes);
      event.dtypes = std::move(dtypes);
      event.callstack = result_string.str();
      event.process_id = op_supplement_events.process_id;
      event.thread_id = tid;
      collector->AddOperatorSupplementEvent(std::move(event));
    }
  }
}
} // namespace
void HostTracer::PrepareTracing() {
......@@ -60,16 +135,21 @@ void HostTracer::PrepareTracing() {
void HostTracer::StartTracing() {
PADDLE_ENFORCE_EQ(
state_ == TracerState::READY || state_ == TracerState::STOPED, true,
state_ == TracerState::READY || state_ == TracerState::STOPED,
true,
platform::errors::PreconditionNotMet("TracerState must be READY"));
HostEventRecorder<CommonEvent>::GetInstance().GatherEvents();
HostEventRecorder<CommonMemEvent>::GetInstance().GatherEvents();
HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance()
.GatherEvents();
HostTraceLevel::GetInstance().SetLevel(options_.trace_level);
state_ = TracerState::STARTED;
}
void HostTracer::StopTracing() {
PADDLE_ENFORCE_EQ(
state_, TracerState::STARTED,
state_,
TracerState::STARTED,
platform::errors::PreconditionNotMet("TracerState must be STARTED"));
HostTraceLevel::GetInstance().SetLevel(HostTraceLevel::kDisabled);
state_ = TracerState::STOPED;
......@@ -77,11 +157,19 @@ void HostTracer::StopTracing() {
void HostTracer::CollectTraceData(TraceEventCollector* collector) {
PADDLE_ENFORCE_EQ(
state_, TracerState::STOPED,
state_,
TracerState::STOPED,
platform::errors::PreconditionNotMet("TracerState must be STOPED"));
HostEventSection<CommonEvent> host_events =
HostEventRecorder<CommonEvent>::GetInstance().GatherEvents();
ProcessHostEvents(host_events, collector);
HostEventSection<CommonMemEvent> host_mem_events =
HostEventRecorder<CommonMemEvent>::GetInstance().GatherEvents();
ProcessHostMemEvents(host_mem_events, collector);
HostEventSection<OperatorSupplementOriginEvent> op_supplement_events =
HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance()
.GatherEvents();
ProcessOperatorSupplementEvents(op_supplement_events, collector);
}
} // namespace platform
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle {
namespace platform {
// Memory event tracing. A trace marks a memory manipulation such as an
// allocation or a free.
// The class declares no data members: all work is done by the constructor.
// The events can be used to draw a memory variation curve.
class RecordMemEvent {
public:
/**
* @param ptr: Address of the memory being allocated or freed.
* @param place: Device on which this memory event happens.
* @param size: Size of the memory being allocated or freed.
* @param type: Kind of manipulation this memory event denotes; defaults to
*              TracerMemEventType::Allocate.
*/
explicit RecordMemEvent(
const void* ptr,
const Place& place,
size_t size,
const TracerMemEventType type = TracerMemEventType::Allocate);
};
} // namespace platform
} // namespace paddle
......@@ -23,6 +23,8 @@
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/profiler.h"
......@@ -41,10 +43,10 @@ TEST(ProfilerTest, TestHostTracer) {
profiler->Prepare();
profiler->Start();
{
RecordInstantEvent("TestTraceLevel_record1", TracerEventType::UserDefined,
2);
RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined,
3);
RecordInstantEvent(
"TestTraceLevel_record1", TracerEventType::UserDefined, 2);
RecordInstantEvent(
"TestTraceLevel_record2", TracerEventType::UserDefined, 3);
}
auto profiler_result = profiler->Stop();
auto nodetree = profiler_result->GetNodeTrees();
......@@ -93,3 +95,49 @@ TEST(ProfilerTest, TestCudaTracer) {
EXPECT_GT(runtime_events.size(), 0u);
#endif
}
// Smoke test for memory-event recording through the host tracer: with the
// host event recorder enabled, an allocate/free pair is recorded inside each
// of two user-defined RecordEvent scopes, then the profiler is stopped and
// its node trees are fetched.
TEST(ProfilerTest, TestHostTracerForMem) {
  using paddle::platform::CPUPlace;
  using paddle::platform::EnableHostEventRecorder;
  using paddle::platform::Profiler;
  using paddle::platform::ProfilerOptions;
  using paddle::platform::RecordEvent;
  using paddle::platform::RecordMemEvent;
  using paddle::platform::TracerEventType;
  using paddle::platform::TracerMemEventType;

  ProfilerOptions options;
  options.trace_level = 1;
  options.trace_switch = 3;
  auto profiler = Profiler::Create(options);
  EXPECT_TRUE(profiler);
  EnableHostEventRecorder();
  profiler->Prepare();
  profiler->Start();

  // Phase 1: allocate and free at address 0.
  {
    RecordEvent phase1_scope(
        "TestTracerForMem_phase1", TracerEventType::UserDefined, 1);
    void* const phase1_addr = reinterpret_cast<void*>(0);
    RecordMemEvent(
        phase1_addr, CPUPlace(), 1024, TracerMemEventType::Allocate);
    RecordMemEvent(phase1_addr, CPUPlace(), 1024, TracerMemEventType::Free);
  }

  // Phase 2: allocate and free at address 1024.
  {
    RecordEvent phase2_scope(
        "TestTracerForMem_phase2", TracerEventType::UserDefined, 1);
    void* const phase2_addr = reinterpret_cast<void*>(1024);
    RecordMemEvent(
        phase2_addr, CPUPlace(), 1024, TracerMemEventType::Allocate);
    RecordMemEvent(phase2_addr, CPUPlace(), 1024, TracerMemEventType::Free);
  }

  auto profiler_result = profiler->Stop();
  auto nodetree = profiler_result->GetNodeTrees();
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle {
namespace framework {
class RuntimeContext;
}
namespace platform {
// Operator supplement tracing. The class declares no data members: all work
// is done by the constructor, which takes the operator's type name, its
// attributes, and its infer-shape and runtime contexts.
class RecordOpInfoSupplement {
public:
/**
* @param type: Operator type name.
* @param attrs: Attribute map of the operator.
* @param shape_ctx: Infer-shape context object of the operator.
* @param ctx: Runtime context object of the operator.
*/
explicit RecordOpInfoSupplement(const std::string& type,
const framework::AttributeMap& attrs,
const framework::InferShapeContext& shape_ctx,
const framework::RuntimeContext& ctx);
};
} // namespace platform
} // namespace paddle
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册