From 90773473a06b3762376a51b5878abb8a626ba78c Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Wed, 20 Jan 2021 18:29:40 +0800 Subject: [PATCH] use nvtx push pop in timeline (#30567) * delete empty line of pybing.cc, test=develop * use nvtx push pop in timeline, test=develop * change year, test=develop * add #ifdef PADDLE_WITH_CUDA, test=develop * add #ifndef WIN32, test=develop * is_pushed to is_pushed_, test=develop --- paddle/fluid/platform/CMakeLists.txt | 2 +- paddle/fluid/platform/cuda_profiler.h | 9 ++++ paddle/fluid/platform/dynload/CMakeLists.txt | 2 +- .../fluid/platform/dynload/dynamic_loader.cc | 13 +++++ .../fluid/platform/dynload/dynamic_loader.h | 1 + paddle/fluid/platform/dynload/nvtx.cc | 31 +++++++++++ paddle/fluid/platform/dynload/nvtx.h | 53 +++++++++++++++++++ paddle/fluid/platform/profiler.cc | 25 +++++++++ paddle/fluid/platform/profiler.h | 4 ++ paddle/fluid/platform/profiler_helper.h | 2 + paddle/fluid/pybind/CMakeLists.txt | 4 ++ paddle/fluid/pybind/pybind.cc | 4 ++ 12 files changed, 148 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/platform/dynload/nvtx.cc create mode 100644 paddle/fluid/platform/dynload/nvtx.h diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index f2a8309f00c..73add8ea06f 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -124,7 +124,7 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) - nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce) + nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda) nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) else() diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h index a9382f2c8ad..6edc141205a 100644 --- a/paddle/fluid/platform/cuda_profiler.h +++ b/paddle/fluid/platform/cuda_profiler.h @@ -17,6 +17,7 @@ limitations under the License. */ #include +#include "paddle/fluid/platform/dynload/nvtx.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -38,5 +39,13 @@ void CudaProfilerStart() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStart()); } void CudaProfilerStop() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStop()); } +#ifndef _WIN32 +void CudaNvtxRangePush(std::string name) { + dynload::nvtxRangePushA(name.c_str()); +} + +void CudaNvtxRangePop() { dynload::nvtxRangePop(); } +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 647bff93122..725b7fcf9dd 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) -list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc) +list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc) #hip if (WITH_ROCM_PLATFORM) list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc) diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 4c39a35030b..e7130544689 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -416,6 +416,19 @@ void* GetOpDsoHandle(const std::string& dso_name) { #endif } +void* GetNvtxDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Apple.")); +#elif defined(_WIN32) + PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Windows.")); +#elif !defined(PADDLE_WITH_CUDA) + PADDLE_THROW( + platform::errors::Unimplemented("Nvtx do not support without CUDA.")); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so"); +#endif +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 1136184ce1f..c3f5953c785 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -37,6 +37,7 @@ void* GetNCCLDsoHandle(); void* GetTensorRtDsoHandle(); void* GetMKLMLDsoHandle(); void* GetOpDsoHandle(const std::string& dso_name); +void* GetNvtxDsoHandle(); void SetPaddleLibPath(const std::string&); } // namespace dynload diff --git a/paddle/fluid/platform/dynload/nvtx.cc b/paddle/fluid/platform/dynload/nvtx.cc new file mode 100644 index 00000000000..372f8500e54 --- /dev/null +++ b/paddle/fluid/platform/dynload/nvtx.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifndef _WIN32 +#include "paddle/fluid/platform/dynload/nvtx.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag nvtx_dso_flag; +void *nvtx_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +NVTX_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/dynload/nvtx.h b/paddle/fluid/platform/dynload/nvtx.h new file mode 100644 index 00000000000..b696bbf9181 --- /dev/null +++ b/paddle/fluid/platform/dynload/nvtx.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#ifndef _WIN32 +#include +#include +#include // NOLINT + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { +extern std::once_flag nvtx_dso_flag; +extern void *nvtx_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + int operator()(Args... args) { \ + using nvtxFunc = decltype(&::__name); \ + std::call_once(nvtx_dso_flag, []() { \ + nvtx_dso_handle = paddle::platform::dynload::GetNvtxDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(nvtx_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define NVTX_ROUTINE_EACH(__macro) \ + __macro(nvtxRangePushA); \ + __macro(nvtxRangePop); + +NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 56a6275b582..c8e8e68dcda 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -21,6 +21,9 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler_helper.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/dynload/nvtx.h" +#endif DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); @@ -51,6 +54,14 @@ double Event::CudaElapsedMs(const Event &e) const { } RecordEvent::RecordEvent(const std::string &name, const EventRole role) { +#ifndef _WIN32 +#ifdef PADDLE_WITH_CUDA + if (g_enable_nvprof_hook) { + dynload::nvtxRangePushA(name.c_str()); + is_pushed_ = true; + } +#endif +#endif if (g_state == ProfilerState::kDisabled || name.empty()) return; // do some initialization @@ -65,6 +76,13 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) { } RecordEvent::~RecordEvent() { +#ifndef _WIN32 +#ifdef PADDLE_WITH_CUDA + if (g_enable_nvprof_hook && is_pushed_) { + dynload::nvtxRangePop(); + } +#endif +#endif if (g_state == ProfilerState::kDisabled || !is_enabled_) return; // lock is not needed, the code below is thread-safe DeviceTracer *tracer = GetDeviceTracer(); @@ -299,5 +317,12 @@ void SetProfileListener() { int64_t ListenerId() { return profiler_lister_id; } +void NvprofEnableRecordEvent() { + SynchronizeAllDevice(); + g_enable_nvprof_hook = true; +} + +void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; } + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 0185328ff32..66a102a3d58 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -131,6 +131,7 @@ struct RecordEvent { ~RecordEvent(); bool is_enabled_{false}; + bool is_pushed_{false}; uint64_t start_ns_; // Event name std::string name_; @@ -227,5 +228,8 @@ void DummyKernelAndEvent(); void SetProfileListener(); int64_t ListenerId(); +void NvprofEnableRecordEvent(); +void NvprofDisableRecordEvent(); + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 96296861322..66595aa651a 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -42,6 +42,8 @@ std::mutex profiler_mu; static TracerOption g_tracer_option = TracerOption::kDefault; // The profiler state, the initial value is ProfilerState::kDisabled static ProfilerState g_state = ProfilerState::kDisabled; +// To hook RecordEvent's events, use it to nvtx timeline +static bool g_enable_nvprof_hook = false; // The thread local event list only can be accessed by the specific thread // The thread index of each thread static thread_local int32_t g_thread_id; diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 0f52d7344c8..39e83ab12d5 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -3,6 +3,10 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper) +if (WITH_GPU) + set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda) +endif() + if (WITH_NCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 72b3c9645ba..03a21b29921 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1951,6 +1951,10 @@ All parameter, weight, gradient are variables in Paddle. m.def("nvprof_init", platform::CudaProfilerInit); m.def("nvprof_start", platform::CudaProfilerStart); m.def("nvprof_stop", platform::CudaProfilerStop); + m.def("nvprof_nvtx_push", platform::CudaNvtxRangePush); + m.def("nvprof_nvtx_pop", platform::CudaNvtxRangePop); + m.def("nvprof_enable_record_event", platform::NvprofEnableRecordEvent); + m.def("nvprof_disable_record_event", platform::NvprofDisableRecordEvent); #endif #endif -- GitLab