From de6e7431a2f31244307ac97e0894996141e22f98 Mon Sep 17 00:00:00 2001
From: Yuanle Liu
Date: Wed, 19 Oct 2022 15:15:43 +0800
Subject: [PATCH] add nvtxRangePush/Pop for naive_executor and refine some
 code (#47139)

---
 CMakeLists.txt                                |  1 +
 cmake/inference_lib.cmake                     |  4 ++++
 paddle/fluid/framework/naive_executor.cc      | 19 +++++++++++++++++--
 paddle/fluid/inference/CMakeLists.txt         |  4 ++++
 paddle/fluid/inference/api/analysis_config.cc |  2 +-
 .../fluid/inference/api/analysis_predictor.cc | 12 ++++++------
 .../fluid/inference/api/analysis_predictor.h  |  2 +-
 .../platform/device/gpu/cuda/cuda_profiler.cc | 18 +++++++++++++-----
 .../platform/device/gpu/cuda/cuda_profiler.h  | 18 ++++++++++++++----
 paddle/fluid/platform/dynload/nvtx.h          | 12 ++++--------
 paddle/phi/backends/dynload/nvtx.h            |  1 +
 11 files changed, 66 insertions(+), 27 deletions(-)
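
A minimal caller-side sketch of the NVTX helpers this patch extends (a note
for reviewers, not part of the diff: MyTimedRegion() is a hypothetical
function, and this assumes a non-Windows CUDA build configured with
-DWITH_INFERENCE_NVTX=ON, the flag added below, which also makes
NaiveExecutor::Run() emit ranges):

    #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"

    // Brackets a region of work with a named, colored NVTX range so it
    // shows up as a labeled span in an Nsight Systems timeline.
    void MyTimedRegion() {
      paddle::platform::CudaNvtxRangePush(
          "my_region", paddle::platform::NvtxRangeColor::Red);
      // ... work to attribute to this range ...
      paddle::platform::CudaNvtxRangePop();  // must balance the push
    }

The color argument defaults to NvtxRangeColor::Blue, and ranges nest: that
is how Run() wraps a Yellow "model" range around one Green range per op.
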
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a4c8a52486..3cb05a98da 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -268,6 +268,7 @@ option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE})
 option(WITH_HETERPS "Compile with heterps" OFF)
 option(WITH_INFERENCE_API_TEST
        "Test fluid inference C++ high-level api interface" OFF)
+option(WITH_INFERENCE_NVTX "Paddle inference with nvtx for profiler" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
 option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
 option(
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 2fc1be2545..6ab33109c9 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -356,6 +356,10 @@ else()
   )
 endif()

+if(WITH_INFERENCE_NVTX AND NOT WIN32)
+  add_definitions(-DPADDLE_WITH_INFERENCE_NVTX)
+endif()
+
 copy(
   inference_lib_dist
   SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib}
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 651ac23e52..ee04cef32e 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -22,9 +22,12 @@
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 #include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
 #endif
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
+#endif

 namespace paddle {
 namespace framework {
@@ -48,12 +51,24 @@ void NaiveExecutor::Run() {
   platform::RegisterModelLayout(ops_, place_);
 #endif
   platform::ScopedFlushDenormal flush;
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+  platform::CudaNvtxRangePush("model", platform::NvtxRangeColor::Yellow);
+#endif
   for (auto &op : ops_) {
     VLOG(4) << std::this_thread::get_id() << " run "
             << op->DebugStringEx(scope_) << " on scope " << scope_;
     op->SetIsCalledByExecutor(false);
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+    platform::CudaNvtxRangePush(op->Type(), platform::NvtxRangeColor::Green);
+#endif
     op->Run(*scope_, place_);
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+    platform::CudaNvtxRangePop();
+#endif
   }
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+  platform::CudaNvtxRangePop();
+#endif
 }

 void NaiveExecutor::CreateVariables(const ProgramDesc &desc,
@@ -146,7 +161,7 @@ NaiveExecutor::~NaiveExecutor() {
 }

 void NaiveExecutor::ResetTrtOps(int num) {
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
   for (auto &op : ops_) {
     if (op->Type() == "tensorrt_engine") {
       operators::TensorRTEngineOp *trtop =
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 0b20dbfde7..7e14a3b264 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -108,6 +108,10 @@ if(WITH_PSCORE)
       tensor_table)
 endif()

+if(WITH_INFERENCE_NVTX AND NOT WIN32)
+  set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} cuda_profiler)
+endif()
+
 if(WITH_ONNXRUNTIME)
   set(SHARED_INFERENCE_SRCS
       ${SHARED_INFERENCE_SRCS}
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 68864ad152..2af92c7e14 100755
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -655,7 +655,7 @@ void AnalysisConfig::EnableTensorRtEngine(
   }

   use_tensorrt_ = true;
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
   // https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
   // when trt version less than 7.2,
   // createExecutionContextWithoutDeviceMemory() has bug.
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index f4107975cb..79527c97d4 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -79,7 +79,7 @@
 #include "paddle/fluid/inference/api/onnxruntime_predictor.h"
 #endif

-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
@@ -92,7 +92,7 @@
 namespace paddle {

 using inference::Singleton;
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 using inference::tensorrt::TRTCalibratorEngine;
 using inference::tensorrt::TRTCalibratorEngineManager;
 using inference::tensorrt::TRTInt8Calibrator;
@@ -1271,7 +1271,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
       [](framework::ProgramDesc *prog) {
 // Note, please do NOT use any member variables, because member variables may
 // have been destructed in multiple threads.
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
         auto &block = prog->Block(0);
         for (auto &op_desc : block.AllOps()) {
           if (op_desc->Type() == "tensorrt_engine") {
@@ -1977,7 +1977,7 @@ void AnalysisPredictor::ClearIntermediateTensor() {
   }
 }

-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 bool AnalysisPredictor::SaveTrtCalibToDisk() {
   PADDLE_ENFORCE_EQ(config_.tensorrt_engine_enabled(),
                     true,
@@ -2033,7 +2033,7 @@ bool AnalysisPredictor::SaveTrtCalibToDisk() {
 #endif

 AnalysisPredictor::~AnalysisPredictor() {
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
   if (config_.tensorrt_engine_enabled() &&
       config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 &&
       Singleton<TRTCalibratorEngineManager>::Global().Has()) {
@@ -2157,7 +2157,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(

 }  // namespace paddle

-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 USE_TRT_CONVERTER(elementwise_add_weight);
 USE_TRT_CONVERTER(elementwise_sub_weight);
 USE_TRT_CONVERTER(elementwise_mul_weight);
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index ff34bac545..bb196e2ef5 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -382,7 +382,7 @@ class AnalysisPredictor : public PaddlePredictor {
   ///
   void MkldnnPostReset();

-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
   ///
   /// \brief save calibration table
   ///
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc
index 291dd6c7ce..cebb36cbc6 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc
@@ -17,9 +17,9 @@
 namespace paddle {
 namespace platform {

-void CudaProfilerInit(std::string output_file,
-                      std::string output_mode,
-                      std::string config_file) {
+void CudaProfilerInit(const std::string& output_file,
+                      const std::string& output_mode,
+                      const std::string& config_file) {
   PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv",
                  platform::errors::InvalidArgument(
                      "Unsupported cuda profiler output mode, expect `kvp` or "
@@ -35,8 +35,16 @@ void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); }
 void CudaProfilerStop() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStop()); }

 #ifndef _WIN32
-void CudaNvtxRangePush(std::string name) {
-  dynload::nvtxRangePushA(name.c_str());
+void CudaNvtxRangePush(const std::string& name, const NvtxRangeColor color) {
+  nvtxEventAttributes_t eventAttrib;
+  eventAttrib.version = NVTX_VERSION;
+  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+  eventAttrib.colorType = NVTX_COLOR_ARGB;
+  eventAttrib.color = static_cast<uint32_t>(color);
+  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+  eventAttrib.message.ascii = name.c_str();
+
+  dynload::nvtxRangePushEx(&eventAttrib);
 }

 void CudaNvtxRangePop() { dynload::nvtxRangePop(); }
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h
index 6c7cf0fd8d..193e08bdde 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h
@@ -23,16 +23,26 @@ limitations under the License. */
 namespace paddle {
 namespace platform {

-void CudaProfilerInit(std::string output_file,
-                      std::string output_mode,
-                      std::string config_file);
+void CudaProfilerInit(const std::string& output_file,
+                      const std::string& output_mode,
+                      const std::string& config_file);

 void CudaProfilerStart();

 void CudaProfilerStop();

 #ifndef _WIN32
-void CudaNvtxRangePush(std::string name);
+enum class NvtxRangeColor : uint32_t {
+  Black = 0x00000000,
+  Red = 0x00ff0000,
+  Green = 0x0000ff00,
+  Blue = 0x000000ff,
+  White = 0x00ffffff,
+  Yellow = 0x00ffff00,
+};
+
+void CudaNvtxRangePush(const std::string& name,
+                       const NvtxRangeColor color = NvtxRangeColor::Blue);
 void CudaNvtxRangePop();

 #endif
diff --git a/paddle/fluid/platform/dynload/nvtx.h b/paddle/fluid/platform/dynload/nvtx.h
index c3dc9e31df..e5816e240e 100644
--- a/paddle/fluid/platform/dynload/nvtx.h
+++ b/paddle/fluid/platform/dynload/nvtx.h
@@ -13,11 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #ifndef _WIN32
-#include <cuda.h>
-#include <nvToolsExt.h>
-
-#include <mutex>  // NOLINT
-
 #include "paddle/phi/backends/dynload/nvtx.h"

 namespace paddle {
@@ -28,11 +23,12 @@ namespace dynload {
   using DynLoad__##__name = phi::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name

-#define NVTX_ROUTINE_EACH(__macro) \
-  __macro(nvtxRangePushA);         \
+#define PLATFORM_NVTX_ROUTINE_EACH(__macro) \
+  __macro(nvtxRangePushA);                  \
+  __macro(nvtxRangePushEx);                 \
   __macro(nvtxRangePop);

-NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
+PLATFORM_NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);

 #undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP
 }  // namespace dynload
diff --git a/paddle/phi/backends/dynload/nvtx.h b/paddle/phi/backends/dynload/nvtx.h
index a9a166b289..e51bbf2154 100644
--- a/paddle/phi/backends/dynload/nvtx.h
+++ b/paddle/phi/backends/dynload/nvtx.h
@@ -42,6 +42,7 @@ extern void *nvtx_dso_handle;

 #define NVTX_ROUTINE_EACH(__macro) \
   __macro(nvtxRangePushA);         \
+  __macro(nvtxRangePushEx);        \
   __macro(nvtxRangePop);

 NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
--
GitLab