未验证 提交 90773473 编写于 作者: W wanghuancoder 提交者: GitHub

use nvtx push pop in timeline (#30567)

* delete empty line of pybing.cc, test=develop

* use nvtx push pop in timeline, test=develop

* change year, test=develop

* add #ifdef PADDLE_WITH_CUDA, test=develop

* add #ifndef WIN32, test=develop

* is_pushed to is_pushed_, test=develop
上级 358106fc
......@@ -124,7 +124,7 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
else()
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/dynload/nvtx.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......@@ -38,5 +39,13 @@ void CudaProfilerStart() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStart()); }
void CudaProfilerStop() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStop()); }
#ifndef _WIN32
void CudaNvtxRangePush(std::string name) {
dynload::nvtxRangePushA(name.c_str());
}
void CudaNvtxRangePop() { dynload::nvtxRangePop(); }
#endif
} // namespace platform
} // namespace paddle
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc)
#hip
if (WITH_ROCM_PLATFORM)
list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc)
......
......@@ -416,6 +416,19 @@ void* GetOpDsoHandle(const std::string& dso_name) {
#endif
}
void* GetNvtxDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Apple."));
#elif defined(_WIN32)
PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Windows."));
#elif !defined(PADDLE_WITH_CUDA)
PADDLE_THROW(
platform::errors::Unimplemented("Nvtx do not support without CUDA."));
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so");
#endif
}
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -37,6 +37,7 @@ void* GetNCCLDsoHandle();
void* GetTensorRtDsoHandle();
void* GetMKLMLDsoHandle();
void* GetOpDsoHandle(const std::string& dso_name);
void* GetNvtxDsoHandle();
void SetPaddleLibPath(const std::string&);
} // namespace dynload
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include "paddle/fluid/platform/dynload/nvtx.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag nvtx_dso_flag;
void *nvtx_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
NVTX_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef _WIN32
#include <cuda.h>
#include <nvToolsExt.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag nvtx_dso_flag;
extern void *nvtx_dso_handle;
#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
int operator()(Args... args) { \
using nvtxFunc = decltype(&::__name); \
std::call_once(nvtx_dso_flag, []() { \
nvtx_dso_handle = paddle::platform::dynload::GetNvtxDsoHandle(); \
}); \
static void *p_##__name = dlsym(nvtx_dso_handle, #__name); \
return reinterpret_cast<nvtxFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define NVTX_ROUTINE_EACH(__macro) \
__macro(nvtxRangePushA); \
__macro(nvtxRangePop);
NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif
......@@ -21,6 +21,9 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler_helper.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nvtx.h"
#endif
DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
......@@ -51,6 +54,14 @@ double Event::CudaElapsedMs(const Event &e) const {
}
RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook) {
dynload::nvtxRangePushA(name.c_str());
is_pushed_ = true;
}
#endif
#endif
if (g_state == ProfilerState::kDisabled || name.empty()) return;
// do some initialization
......@@ -65,6 +76,13 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
}
RecordEvent::~RecordEvent() {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook && is_pushed_) {
dynload::nvtxRangePop();
}
#endif
#endif
if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
// lock is not needed, the code below is thread-safe
DeviceTracer *tracer = GetDeviceTracer();
......@@ -299,5 +317,12 @@ void SetProfileListener() {
int64_t ListenerId() { return profiler_lister_id; }
void NvprofEnableRecordEvent() {
SynchronizeAllDevice();
g_enable_nvprof_hook = true;
}
void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; }
} // namespace platform
} // namespace paddle
......@@ -131,6 +131,7 @@ struct RecordEvent {
~RecordEvent();
bool is_enabled_{false};
bool is_pushed_{false};
uint64_t start_ns_;
// Event name
std::string name_;
......@@ -227,5 +228,8 @@ void DummyKernelAndEvent();
void SetProfileListener();
int64_t ListenerId();
void NvprofEnableRecordEvent();
void NvprofDisableRecordEvent();
} // namespace platform
} // namespace paddle
......@@ -42,6 +42,8 @@ std::mutex profiler_mu;
static TracerOption g_tracer_option = TracerOption::kDefault;
// The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled;
// To hook RecordEvent's events, use it to nvtx timeline
static bool g_enable_nvprof_hook = false;
// The thread local event list only can be accessed by the specific thread
// The thread index of each thread
static thread_local int32_t g_thread_id;
......
......@@ -3,6 +3,10 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp
analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper)
if (WITH_GPU)
set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda)
endif()
if (WITH_NCCL)
set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
......
......@@ -1951,6 +1951,10 @@ All parameter, weight, gradient are variables in Paddle.
m.def("nvprof_init", platform::CudaProfilerInit);
m.def("nvprof_start", platform::CudaProfilerStart);
m.def("nvprof_stop", platform::CudaProfilerStop);
m.def("nvprof_nvtx_push", platform::CudaNvtxRangePush);
m.def("nvprof_nvtx_pop", platform::CudaNvtxRangePop);
m.def("nvprof_enable_record_event", platform::NvprofEnableRecordEvent);
m.def("nvprof_disable_record_event", platform::NvprofDisableRecordEvent);
#endif
#endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册