未验证 提交 1fcaab45 编写于 作者: C chenjian 提交者: GitHub

Update RecordEvent interface usage, part 3 (#39695)

* fix RecordEvent interface

* modify default level to 4

* update interface use

* add const default trace level

* update RecordEvent interface usage

* update RecordEvent interface usage

* update RecordEvent interface usage

* update operator.cc

* update part2

* update part1

* update part3

* fix include profiler.h header in ps server

* fix include profiler.h header in ps server

* fix profiler.h header

* fix profiler.h header

* fix merge buf

* update

* fix bug

* fix bug
上级 94243789
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h" #include "paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
DECLARE_bool(sync_nccl_allreduce); DECLARE_bool(sync_nccl_allreduce);
...@@ -47,6 +48,8 @@ GradMergeAllReduceOpHandle::GradMergeAllReduceOpHandle( ...@@ -47,6 +48,8 @@ GradMergeAllReduceOpHandle::GradMergeAllReduceOpHandle(
#endif #endif
void GradMergeAllReduceOpHandle::RunImpl() { void GradMergeAllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(
Name(), platform::TracerEventType::Communication, 1);
PADDLE_ENFORCE_GT(local_scopes_.size(), 0, PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.", "The number of local scope should be > 0, but got %zu.",
...@@ -96,6 +99,8 @@ FusedGradMergeAllReduceOpHandle::FusedGradMergeAllReduceOpHandle( ...@@ -96,6 +99,8 @@ FusedGradMergeAllReduceOpHandle::FusedGradMergeAllReduceOpHandle(
#endif #endif
void FusedGradMergeAllReduceOpHandle::RunImpl() { void FusedGradMergeAllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(
Name(), platform::TracerEventType::Communication, 1);
PADDLE_ENFORCE_GT(local_scopes_.size(), 0, PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.", "The number of local scope should be > 0, but got %zu.",
......
...@@ -246,7 +246,8 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place, ...@@ -246,7 +246,8 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
<< dst_place << " by thream(" << stream << ")"; << dst_place << " by thream(" << stream << ")";
if (stream) { if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); platform::RecordEvent record_event(
"NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream)); reinterpret_cast<aclrtStream>(stream));
} else { } else {
...@@ -256,7 +257,8 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place, ...@@ -256,7 +257,8 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait(); static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU"); platform::RecordEvent record_event(
"NpuMemcpySync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
} }
} }
...@@ -275,14 +277,16 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place, ...@@ -275,14 +277,16 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
<< dst_place << " by thream(" << stream << ")"; << dst_place << " by thream(" << stream << ")";
if (stream) { if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); platform::RecordEvent record_event(
"NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST,
reinterpret_cast<aclrtStream>(stream)); reinterpret_cast<aclrtStream>(stream));
} else { } else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait(); static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU"); platform::RecordEvent record_event(
"NpuMemcpySync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
} }
} }
...@@ -300,7 +304,9 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place, ...@@ -300,7 +304,9 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
if (dst_place == src_place) { if (dst_place == src_place) {
platform::SetNPUDeviceId(src_place.device); platform::SetNPUDeviceId(src_place.device);
if (stream) { if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU"); platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream)); reinterpret_cast<aclrtStream>(stream));
} else { } else {
...@@ -308,7 +314,9 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place, ...@@ -308,7 +314,9 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait(); static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU"); platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
} }
} else { } else {
...@@ -318,7 +326,9 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place, ...@@ -318,7 +326,9 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
} }
if (stream) { if (stream) {
// TODO(zhiqiu): support peer access? // TODO(zhiqiu): support peer access?
platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU"); platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream)); reinterpret_cast<aclrtStream>(stream));
} else { } else {
...@@ -326,7 +336,9 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place, ...@@ -326,7 +336,9 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait(); static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU"); platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
} }
} }
...@@ -374,14 +386,18 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPlace>( ...@@ -374,14 +386,18 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
<< dst_place << " by thream(" << stream << ")"; << dst_place << " by thream(" << stream << ")";
if (stream) { if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned"); platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST,
reinterpret_cast<aclrtStream>(stream)); reinterpret_cast<aclrtStream>(stream));
} else { } else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait(); static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned"); platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
} }
} }
...@@ -398,7 +414,9 @@ void Copy<platform::NPUPlace, platform::NPUPinnedPlace>( ...@@ -398,7 +414,9 @@ void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
<< dst_place << " by thream(" << stream << ")"; << dst_place << " by thream(" << stream << ")";
if (stream) { if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU"); platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream)); reinterpret_cast<aclrtStream>(stream));
} else { } else {
...@@ -408,7 +426,9 @@ void Copy<platform::NPUPlace, platform::NPUPinnedPlace>( ...@@ -408,7 +426,9 @@ void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait(); static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU"); platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
} }
} }
...@@ -596,7 +616,8 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>( ...@@ -596,7 +616,8 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by stream(" << stream << ")"; << dst_place << " by stream(" << stream << ")";
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); platform::RecordEvent record_event(
"GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost,
reinterpret_cast<gpuStream_t>(stream)); reinterpret_cast<gpuStream_t>(stream));
...@@ -605,7 +626,8 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>( ...@@ -605,7 +626,8 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
reinterpret_cast<gpuStream_t>(stream)); reinterpret_cast<gpuStream_t>(stream));
#endif #endif
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU"); platform::RecordEvent record_event(
"GpuMemcpySync:GPU->CPU", platform::TracerEventType::UserDefined, 1);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost);
#else #else
...@@ -628,7 +650,8 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>( ...@@ -628,7 +650,8 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")"; << dst_place << " by thream(" << stream << ")";
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); platform::RecordEvent record_event(
"GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice,
reinterpret_cast<gpuStream_t>(stream)); reinterpret_cast<gpuStream_t>(stream));
...@@ -637,7 +660,8 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>( ...@@ -637,7 +660,8 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
reinterpret_cast<gpuStream_t>(stream)); reinterpret_cast<gpuStream_t>(stream));
#endif #endif
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU"); platform::RecordEvent record_event(
"GpuMemcpySync:CPU->GPU", platform::TracerEventType::UserDefined, 1);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice);
#else #else
...@@ -661,7 +685,9 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>( ...@@ -661,7 +685,9 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
if (dst_place == src_place) { if (dst_place == src_place) {
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU"); platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU",
platform::TracerEventType::UserDefined,
1);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice,
reinterpret_cast<gpuStream_t>(stream)); reinterpret_cast<gpuStream_t>(stream));
...@@ -670,7 +696,9 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>( ...@@ -670,7 +696,9 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
reinterpret_cast<gpuStream_t>(stream)); reinterpret_cast<gpuStream_t>(stream));
#endif #endif
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU"); platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU",
platform::TracerEventType::UserDefined,
1);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice); platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice);
#else #else
...@@ -679,11 +707,15 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>( ...@@ -679,11 +707,15 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
} }
} else { } else {
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU"); platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU",
platform::TracerEventType::UserDefined,
1);
platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, reinterpret_cast<gpuStream_t>(stream)); num, reinterpret_cast<gpuStream_t>(stream));
} else { } else {
platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU"); platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU",
platform::TracerEventType::UserDefined,
1);
platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
num); num);
} }
...@@ -729,7 +761,9 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>( ...@@ -729,7 +761,9 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")"; << dst_place << " by thream(" << stream << ")";
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned",
platform::TracerEventType::UserDefined,
1);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost,
reinterpret_cast<gpuStream_t>(stream)); reinterpret_cast<gpuStream_t>(stream));
...@@ -738,7 +772,9 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>( ...@@ -738,7 +772,9 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
reinterpret_cast<gpuStream_t>(stream)); reinterpret_cast<gpuStream_t>(stream));
#endif #endif
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned"); platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned",
platform::TracerEventType::UserDefined,
1);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost);
#else #else
...@@ -758,7 +794,9 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>( ...@@ -758,7 +794,9 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")"; << dst_place << " by thream(" << stream << ")";
if (stream) { if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU",
platform::TracerEventType::UserDefined,
1);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice,
reinterpret_cast<gpuStream_t>(stream)); reinterpret_cast<gpuStream_t>(stream));
...@@ -767,7 +805,9 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>( ...@@ -767,7 +805,9 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
reinterpret_cast<gpuStream_t>(stream)); reinterpret_cast<gpuStream_t>(stream));
#endif #endif
} else { } else {
platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU"); platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU",
platform::TracerEventType::UserDefined,
1);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice);
#else #else
...@@ -927,7 +967,9 @@ void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place, ...@@ -927,7 +967,9 @@ void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
if (stream) { if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")"; << " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU"); platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2HAsync(dst, src, num, platform::MLUMemcpyD2HAsync(dst, src, num,
reinterpret_cast<mluStream>(stream)); reinterpret_cast<mluStream>(stream));
} else { } else {
...@@ -936,7 +978,8 @@ void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place, ...@@ -936,7 +978,8 @@ void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place; << " to " << dst_place;
platform::RecordEvent record_event("MLUMemcpyD2HSync:MLU->CPU"); platform::RecordEvent record_event(
"MLUMemcpyD2HSync:MLU->CPU", platform::TracerEventType::UserDefined, 1);
platform::MLUMemcpyD2HSync(dst, src, num); platform::MLUMemcpyD2HSync(dst, src, num);
} }
} }
...@@ -953,7 +996,9 @@ void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place, ...@@ -953,7 +996,9 @@ void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
if (stream) { if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")"; << " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU"); platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyH2DAsync(dst, src, num, platform::MLUMemcpyH2DAsync(dst, src, num,
reinterpret_cast<mluStream>(stream)); reinterpret_cast<mluStream>(stream));
} else { } else {
...@@ -962,7 +1007,8 @@ void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place, ...@@ -962,7 +1007,8 @@ void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place; << " to " << dst_place;
platform::RecordEvent record_event("MLUMemcpyH2DSync:CPU->MLU"); platform::RecordEvent record_event(
"MLUMemcpyH2DSync:CPU->MLU", platform::TracerEventType::UserDefined, 1);
platform::MLUMemcpyH2DSync(dst, src, num); platform::MLUMemcpyH2DSync(dst, src, num);
} }
} }
...@@ -980,8 +1026,9 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place, ...@@ -980,8 +1026,9 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
if (stream) { if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")"; << " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event( platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU",
"MLUMemcpyD2DAsync(same_mlu):MLU->MLU"); platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2DAsync(dst, src, num, platform::MLUMemcpyD2DAsync(dst, src, num,
reinterpret_cast<mluStream>(stream)); reinterpret_cast<mluStream>(stream));
} else { } else {
...@@ -991,20 +1038,26 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place, ...@@ -991,20 +1038,26 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place; << " to " << dst_place;
platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU"); platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2DSync(dst, src, num); platform::MLUMemcpyD2DSync(dst, src, num);
} }
} else { } else {
if (stream) { if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")"; << " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU"); platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, reinterpret_cast<mluStream>(stream)); num, reinterpret_cast<mluStream>(stream));
} else { } else {
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place; << " to " << dst_place;
platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU"); platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyPeerSync(dst, dst_place.device, src, src_place.device, platform::MLUMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
num); num);
} }
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -28,7 +28,7 @@ limitations under the License. */ ...@@ -28,7 +28,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/operators/math/padding.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h"
DECLARE_bool(cudnn_deterministic); DECLARE_bool(cudnn_deterministic);
DECLARE_uint64(conv_workspace_size_limit); DECLARE_uint64(conv_workspace_size_limit);
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/operators/reader/buffered_reader.h"
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -115,7 +116,9 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -115,7 +116,9 @@ void BufferedReader::ReadAsync(size_t i) {
platform::CUDAPinnedPlace cuda_pinned_place; platform::CUDAPinnedPlace cuda_pinned_place;
std::vector<void *> cuda_pinned_ptrs; std::vector<void *> cuda_pinned_ptrs;
cuda_pinned_ptrs.reserve(cpu.size()); cuda_pinned_ptrs.reserve(cpu.size());
platform::RecordEvent record_event("BufferedReader:MemoryCopy"); platform::RecordEvent record_event(
"BufferedReader:MemoryCopy", platform::TracerEventType::UserDefined,
1);
// NODE(chenweihang): When we use CUDAPinned Memory, we need call // NODE(chenweihang): When we use CUDAPinned Memory, we need call
// cudaHostAlloc, that is a CUDA API, calling CUDA API need load // cudaHostAlloc, that is a CUDA API, calling CUDA API need load
// cuda lib into device, it will cost hundreds of MB of GPU memory. // cuda lib into device, it will cost hundreds of MB of GPU memory.
...@@ -170,7 +173,9 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -170,7 +173,9 @@ void BufferedReader::ReadAsync(size_t i) {
cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0));
#endif #endif
platform::RecordEvent record_event("BufferedReader:MemoryCopy"); platform::RecordEvent record_event(
"BufferedReader:MemoryCopy", platform::TracerEventType::UserDefined,
1);
for (size_t i = 0; i < cpu.size(); ++i) { for (size_t i = 0; i < cpu.size(); ++i) {
auto cpu_place = cpu[i].place(); auto cpu_place = cpu[i].place();
auto cpu_ptr = cpu[i].data(); auto cpu_ptr = cpu[i].data();
...@@ -229,7 +234,9 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -229,7 +234,9 @@ void BufferedReader::ReadAsync(size_t i) {
platform::NPUEventRecord(events_[i].get(), compute_stream_); platform::NPUEventRecord(events_[i].get(), compute_stream_);
platform::NPUStreamWaitEvent(stream_.get(), events_[i].get()); platform::NPUStreamWaitEvent(stream_.get(), events_[i].get());
platform::RecordEvent record_event("BufferedReader:MemoryCopy"); platform::RecordEvent record_event("BufferedReader:MemoryCopy",
platform::TracerEventType::UserDefined,
1);
for (size_t i = 0; i < cpu.size(); ++i) { for (size_t i = 0; i < cpu.size(); ++i) {
auto cpu_place = cpu[i].place(); auto cpu_place = cpu[i].place();
auto cpu_ptr = cpu[i].data(); auto cpu_ptr = cpu[i].data();
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -106,7 +106,8 @@ class ReadOp : public framework::OperatorBase { ...@@ -106,7 +106,8 @@ class ReadOp : public framework::OperatorBase {
std::vector<framework::LoDTensor> ins; std::vector<framework::LoDTensor> ins;
// For profiling // For profiling
platform::RecordEvent record_event(Type()); platform::RecordEvent record_event(
Type().c_str(), platform::TracerEventType::UserDefined, 1);
reader->ReadNext(&ins); reader->ReadNext(&ins);
if (ins.empty()) { if (ins.empty()) {
......
...@@ -32,6 +32,7 @@ limitations under the License. */ ...@@ -32,6 +32,7 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
...@@ -322,7 +323,8 @@ NPUDeviceContext::~NPUDeviceContext() { ...@@ -322,7 +323,8 @@ NPUDeviceContext::~NPUDeviceContext() {
} }
void NPUDeviceContext::Wait() const { void NPUDeviceContext::Wait() const {
platform::RecordEvent record_event("NPUDeviceContext/wait"); platform::RecordEvent record_event("NPUDeviceContext/wait",
platform::TracerEventType::UserDefined, 2);
VLOG(4) << "NPU context(" << this << ") Wait"; VLOG(4) << "NPU context(" << this << ") Wait";
stream_->Wait(); stream_->Wait();
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册