diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index f2a8309f00c67e7694f81ac95db3656ab0a37863..73add8ea06f060949bf6084408600a1d38c274fd 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -124,7 +124,7 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri
 
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 if(WITH_GPU)
-    nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
+    nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda)
     nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
     nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
 else()
diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h
index a9382f2c8adcb18e320ef44086a312f89c03ad09..6edc141205a95252b2d1811b839d53da67a456c5 100644
--- a/paddle/fluid/platform/cuda_profiler.h
+++ b/paddle/fluid/platform/cuda_profiler.h
@@ -17,6 +17,7 @@ limitations under the License. */
 
 #include <cuda_profiler_api.h>
 
+#include "paddle/fluid/platform/dynload/nvtx.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -38,5 +39,13 @@ void CudaProfilerStart() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStart()); }
 
 void CudaProfilerStop() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStop()); }
 
+#ifndef _WIN32
+void CudaNvtxRangePush(std::string name) {
+  dynload::nvtxRangePushA(name.c_str());
+}
+
+void CudaNvtxRangePop() { dynload::nvtxRangePop(); }
+#endif
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 647bff93122b13117ea83dd616f1c744264aeb1a..725b7fcf9dde9809fab9ead225206cf239a0df54 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -1,6 +1,6 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 
-list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc)
+list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc)
 #hip
 if (WITH_ROCM_PLATFORM)
   list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc)
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 4c39a35030b3f47627135bed7b7cc8e8b714480c..e71305446890501908ecf4431da81bb765fce4f8 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -416,6 +416,19 @@ void* GetOpDsoHandle(const std::string& dso_name) {
 #endif
 }
 
+void* GetNvtxDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Apple."));
+#elif defined(_WIN32)
+  PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Windows."));
+#elif !defined(PADDLE_WITH_CUDA)
+  PADDLE_THROW(
+      platform::errors::Unimplemented("Nvtx do not support without CUDA."));
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so");
+#endif
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index 1136184ce1fc9a2d1506cc0a424b2c94d0da4ee9..c3f5953c7857913980bca07d9fcffe459e483cfa 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -37,6 +37,7 @@ void* GetNCCLDsoHandle();
 void* GetTensorRtDsoHandle();
 void* GetMKLMLDsoHandle();
 void* GetOpDsoHandle(const std::string& dso_name);
+void* GetNvtxDsoHandle();
 void SetPaddleLibPath(const std::string&);
 
 }  // namespace dynload
diff --git a/paddle/fluid/platform/dynload/nvtx.cc b/paddle/fluid/platform/dynload/nvtx.cc
new file mode 100644
index 0000000000000000000000000000000000000000..372f8500e54dda0ee43c4129f9697c7e48d529a0
--- /dev/null
+++ b/paddle/fluid/platform/dynload/nvtx.cc
@@ -0,0 +1,31 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef _WIN32
+#include "paddle/fluid/platform/dynload/nvtx.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag nvtx_dso_flag;
+void *nvtx_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+NVTX_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/platform/dynload/nvtx.h b/paddle/fluid/platform/dynload/nvtx.h
new file mode 100644
index 0000000000000000000000000000000000000000..b696bbf91816aa286f113cae70afe1f7683d24db
--- /dev/null
+++ b/paddle/fluid/platform/dynload/nvtx.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#ifndef _WIN32
+#include <cuda.h>
+#include <nvToolsExt.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+extern std::once_flag nvtx_dso_flag;
+extern void *nvtx_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name)                            \
+  struct DynLoad__##__name {                                              \
+    template <typename... Args>                                           \
+    int operator()(Args... args) {                                        \
+      using nvtxFunc = decltype(&::__name);                               \
+      std::call_once(nvtx_dso_flag, []() {                                \
+        nvtx_dso_handle = paddle::platform::dynload::GetNvtxDsoHandle();  \
+      });                                                                 \
+      static void *p_##__name = dlsym(nvtx_dso_handle, #__name);          \
+      return reinterpret_cast<nvtxFunc>(p_##__name)(args...);             \
+    }                                                                     \
+  };                                                                      \
+  extern DynLoad__##__name __name
+
+#define NVTX_ROUTINE_EACH(__macro) \
+  __macro(nvtxRangePushA);         \
+  __macro(nvtxRangePop);
+
+NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
+
+#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 56a6275b582d753c0fa579d624b7f50c16f579b8..c8e8e68dcda4c81a7dc13a70373200e370ed5a15 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -21,6 +21,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/platform/profiler_helper.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/dynload/nvtx.h"
+#endif
 
 DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
 
@@ -51,6 +54,14 @@ double Event::CudaElapsedMs(const Event &e) const {
 }
 
 RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
+#ifndef _WIN32
+#ifdef PADDLE_WITH_CUDA
+  if (g_enable_nvprof_hook) {
+    dynload::nvtxRangePushA(name.c_str());
+    is_pushed_ = true;
+  }
+#endif
+#endif
   if (g_state == ProfilerState::kDisabled || name.empty()) return;
 
   // do some initialization
@@ -65,6 +76,13 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
 }
 
 RecordEvent::~RecordEvent() {
+#ifndef _WIN32
+#ifdef PADDLE_WITH_CUDA
+  if (g_enable_nvprof_hook && is_pushed_) {
+    dynload::nvtxRangePop();
+  }
+#endif
+#endif
   if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
   // lock is not needed, the code below is thread-safe
   DeviceTracer *tracer = GetDeviceTracer();
@@ -299,5 +317,12 @@ void SetProfileListener() {
 
 int64_t ListenerId() { return profiler_lister_id; }
 
+void NvprofEnableRecordEvent() {
+  SynchronizeAllDevice();
+  g_enable_nvprof_hook = true;
+}
+
+void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; }
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 0185328ff32004a35283c1d2883c2834b49b473b..66a102a3d58636ee3e2913d1cd0d91533684d505 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -131,6 +131,7 @@ struct RecordEvent {
   ~RecordEvent();
 
   bool is_enabled_{false};
+  bool is_pushed_{false};
   uint64_t start_ns_;
   // Event name
   std::string name_;
@@ -227,5 +228,8 @@ void DummyKernelAndEvent();
 void SetProfileListener();
 int64_t ListenerId();
 
+void NvprofEnableRecordEvent();
+void NvprofDisableRecordEvent();
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h
index 96296861322101ee166bd880d31903862e44e556..66595aa651a5450221c16e3e44982b3cdd16134b 100644
--- a/paddle/fluid/platform/profiler_helper.h
+++ b/paddle/fluid/platform/profiler_helper.h
@@ -42,6 +42,8 @@ std::mutex profiler_mu;
 static TracerOption g_tracer_option = TracerOption::kDefault;
 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
+// To hook RecordEvent's events, use it to nvtx timeline
+static bool g_enable_nvprof_hook = false;
 // The thread local event list only can be accessed by the specific thread
 // The thread index of each thread
 static thread_local int32_t g_thread_id;
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 0f52d7344c87fac8397aa2fc84e7b19672bc1280..39e83ab12d56daebe4b2d095b0939274c664d1db 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -3,6 +3,10 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp
   analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils
   heter_wrapper generator op_version_registry ps_gpu_wrapper)
 
+if (WITH_GPU)
+  set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda)
+endif()
+
 if (WITH_NCCL)
   set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
   set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 72b3c9645ba2d5571fd815e64b7af74e24e29948..03a21b29921de17799da580e31130c5ca9134729 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1951,6 +1951,10 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("nvprof_init", platform::CudaProfilerInit);
   m.def("nvprof_start", platform::CudaProfilerStart);
   m.def("nvprof_stop", platform::CudaProfilerStop);
+  m.def("nvprof_nvtx_push", platform::CudaNvtxRangePush);
+  m.def("nvprof_nvtx_pop", platform::CudaNvtxRangePop);
+  m.def("nvprof_enable_record_event", platform::NvprofEnableRecordEvent);
+  m.def("nvprof_disable_record_event", platform::NvprofDisableRecordEvent);
 #endif
 #endif
 
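
For review context, here is a minimal usage sketch (not part of the patch) showing how the pieces above fit together from C++. It assumes a CUDA build on a non-Windows platform, that `RecordEvent`'s `role` argument keeps a default value so the one-argument form compiles, and that the process runs under an NVTX-aware profiler such as Nsight Systems; `ProfiledStep` is a made-up function name used only for illustration.

```cpp
// Illustrative only: exercises the hooks introduced in this patch.
#include "paddle/fluid/platform/cuda_profiler.h"  // CudaNvtxRangePush / CudaNvtxRangePop
#include "paddle/fluid/platform/profiler.h"       // RecordEvent, Nvprof*RecordEvent

void ProfiledStep() {
  // Synchronizes all devices, then makes every subsequent RecordEvent also
  // emit an NVTX range (sets g_enable_nvprof_hook = true).
  paddle::platform::NvprofEnableRecordEvent();

  {
    // The constructor calls dynload::nvtxRangePushA("forward"); the
    // destructor pops the range via dynload::nvtxRangePop().
    paddle::platform::RecordEvent record("forward");
    // ... launch forward kernels here ...
  }

  // Explicit ranges are also available through the cuda_profiler.h helpers,
  // which pybind.cc exposes as nvprof_nvtx_push / nvprof_nvtx_pop.
  paddle::platform::CudaNvtxRangePush("custom_range");
  // ... work attributed to "custom_range" ...
  paddle::platform::CudaNvtxRangePop();

  paddle::platform::NvprofDisableRecordEvent();
}
```

From Python, the same flow is reachable through the bindings registered in pybind.cc (`nvprof_enable_record_event`, `nvprof_nvtx_push`, `nvprof_nvtx_pop`, `nvprof_disable_record_event`), so a training script can bracket only the iterations it wants captured in the NVTX timeline.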