diff --git a/CMakeLists.txt b/CMakeLists.txt
index a4c8a52486f5daf16d97f8ccb9a28d8857e669db..3cb05a98da01f2c24f9cfff402be401f69905733 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -268,6 +268,7 @@ option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE})
 option(WITH_HETERPS "Compile with heterps" OFF)
 option(WITH_INFERENCE_API_TEST
        "Test fluid inference C++ high-level api interface" OFF)
+option(WITH_INFERENCE_NVTX "Paddle inference with nvtx for profiler" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
 option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
 option(
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 2fc1be2545ddce3af943a5db6a1980dc65bd77f3..6ab33109c93e0cc28720d0632825b194077f47c3 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -356,6 +356,10 @@ else()
   )
 endif()
 
+if(WITH_INFERENCE_NVTX AND NOT WIN32)
+  add_definitions(-DPADDLE_WITH_INFERENCE_NVTX)
+endif()
+
 copy(
   inference_lib_dist
   SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib}
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 651ac23e52fe1ca4df66076e9f36246d5272583e..ee04cef32edcc634fd8157d34023715c51f5c848 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -22,9 +22,12 @@
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 #include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
 #endif
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
+#endif
 
 namespace paddle {
 namespace framework {
@@ -48,12 +51,24 @@ void NaiveExecutor::Run() {
   platform::RegisterModelLayout(ops_, place_);
 #endif
   platform::ScopedFlushDenormal flush;
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+  platform::CudaNvtxRangePush("model", platform::NvtxRangeColor::Yellow);
+#endif
   for (auto &op : ops_) {
     VLOG(4) << std::this_thread::get_id() << " run "
             << op->DebugStringEx(scope_) << " on scope " << scope_;
     op->SetIsCalledByExecutor(false);
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+    platform::CudaNvtxRangePush(op->Type(), platform::NvtxRangeColor::Green);
+#endif
     op->Run(*scope_, place_);
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+    platform::CudaNvtxRangePop();
+#endif
   }
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+  platform::CudaNvtxRangePop();
+#endif
 }
 
 void NaiveExecutor::CreateVariables(const ProgramDesc &desc,
@@ -146,7 +161,7 @@ NaiveExecutor::~NaiveExecutor() {
 }
 
 void NaiveExecutor::ResetTrtOps(int num) {
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
   for (auto &op : ops_) {
     if (op->Type() == "tensorrt_engine") {
       operators::TensorRTEngineOp *trtop =
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 0b20dbfde7f38bd169a71c06da7e5ee8597ac20a..7e14a3b264d65d9ede997643e4d3490575fa2929 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -108,6 +108,10 @@ if(WITH_PSCORE)
       tensor_table)
 endif()
 
+if(WITH_INFERENCE_NVTX AND NOT WIN32)
+  set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} cuda_profiler)
+endif()
+
 if(WITH_ONNXRUNTIME)
   set(SHARED_INFERENCE_SRCS
       ${SHARED_INFERENCE_SRCS}
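Note (illustrative, not part of the patch): the NaiveExecutor changes above wrap the whole forward pass in one "model" NVTX range and each operator in its own nested range, so a timeline viewer such as Nsight Systems shows every op as a child of the model-level span. A minimal standalone sketch of the same push/pop nesting against the raw nvToolsExt C API (hypothetical op names; link with -lnvToolsExt):

    // Sketch of the nesting pattern used in NaiveExecutor::Run above.
    #include <nvToolsExt.h>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> ops = {"conv2d", "relu", "fc"};  // hypothetical
      nvtxRangePushA("model");             // outer range: whole forward pass
      for (const auto& op : ops) {
        nvtxRangePushA(op.c_str());        // inner range: one per operator
        // ... op->Run(*scope_, place_) executes here in the real code ...
        nvtxRangePop();                    // close the per-op range
      }
      nvtxRangePop();                      // close the "model" range
      return 0;
    }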
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 68864ad152833e9befe0201744b322a3fbfe772f..2af92c7e1480d277484ee3fd52865a59d35d3344 100755
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -655,7 +655,7 @@ void AnalysisConfig::EnableTensorRtEngine(
   }
 
   use_tensorrt_ = true;
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
   // https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
   // when trt version less than 7.2,
   // createExecutionContextWithoutDeviceMemory() has bug.
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index f4107975cba7000c12cb2b826d2cbacbec018e1c..79527c97d45fe0407212b9a4c44bee2757a924b4 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -79,7 +79,7 @@
 #include "paddle/fluid/inference/api/onnxruntime_predictor.h"
 #endif
 
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
@@ -92,7 +92,7 @@ namespace paddle {
 
 using inference::Singleton;
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 using inference::tensorrt::TRTCalibratorEngine;
 using inference::tensorrt::TRTCalibratorEngineManager;
 using inference::tensorrt::TRTInt8Calibrator;
@@ -1271,7 +1271,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
       [](framework::ProgramDesc *prog) {
 // Note, please do NOT use any member variables, because member variables may
 // have been destructed in multiple threads.
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
         auto &block = prog->Block(0);
         for (auto &op_desc : block.AllOps()) {
           if (op_desc->Type() == "tensorrt_engine") {
@@ -1977,7 +1977,7 @@ void AnalysisPredictor::ClearIntermediateTensor() {
   }
 }
 
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 bool AnalysisPredictor::SaveTrtCalibToDisk() {
   PADDLE_ENFORCE_EQ(config_.tensorrt_engine_enabled(),
                     true,
@@ -2033,7 +2033,7 @@ bool AnalysisPredictor::SaveTrtCalibToDisk() {
 #endif
 
 AnalysisPredictor::~AnalysisPredictor() {
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
   if (config_.tensorrt_engine_enabled() &&
       config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 &&
      Singleton<TRTCalibratorEngineManager>::Global().Has()) {
@@ -2157,7 +2157,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
 }  // namespace paddle
 
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 USE_TRT_CONVERTER(elementwise_add_weight);
 USE_TRT_CONVERTER(elementwise_sub_weight);
 USE_TRT_CONVERTER(elementwise_mul_weight);
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index ff34bac545d80ecb7f65f7b4b17a700113f8c2b5..bb196e2ef54db843717de77a12adb3a128f624b9 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -382,7 +382,7 @@ class AnalysisPredictor : public PaddlePredictor {
   ///
   void MkldnnPostReset();
 
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
   ///
   /// \brief save calibration table
   ///
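Note (illustrative, not part of the patch): the recurring edit in the files above, `#if PADDLE_WITH_TENSORRT` -> `#ifdef PADDLE_WITH_TENSORRT`, is a robustness fix. `#if` evaluates the macro's value: it happens to work when the macro is undefined (undefined identifiers evaluate to 0) or defined as 1, but it is a preprocessor error when the macro is defined with no value. `#ifdef` only tests whether the macro is defined at all. A generic illustration:

    #define FEATURE_EMPTY        // defined, but expands to nothing

    #ifdef FEATURE_EMPTY         // OK: tests definedness only
    static const int kEnabled = 1;
    #endif

    // #if FEATURE_EMPTY         // error: "#if with no expression",
    // #endif                    // because the macro expands to nothing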
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc
index 291dd6c7ce1c75d7d2464de2eec2aaea07767895..cebb36cbc6462ba05c484d84b843b3377a2af149 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc
@@ -17,9 +17,9 @@
 namespace paddle {
 namespace platform {
 
-void CudaProfilerInit(std::string output_file,
-                      std::string output_mode,
-                      std::string config_file) {
+void CudaProfilerInit(const std::string& output_file,
+                      const std::string& output_mode,
+                      const std::string& config_file) {
   PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv",
                  platform::errors::InvalidArgument(
                      "Unsupported cuda profiler output mode, expect `kvp` or "
@@ -35,8 +35,16 @@ void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); }
 void CudaProfilerStop() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStop()); }
 
 #ifndef _WIN32
-void CudaNvtxRangePush(std::string name) {
-  dynload::nvtxRangePushA(name.c_str());
+void CudaNvtxRangePush(const std::string& name, const NvtxRangeColor color) {
+  nvtxEventAttributes_t eventAttrib;
+  eventAttrib.version = NVTX_VERSION;
+  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+  eventAttrib.colorType = NVTX_COLOR_ARGB;
+  eventAttrib.color = static_cast<uint32_t>(color);
+  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+  eventAttrib.message.ascii = name.c_str();
+
+  dynload::nvtxRangePushEx(&eventAttrib);
 }
 
 void CudaNvtxRangePop() { dynload::nvtxRangePop(); }
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h
index 6c7cf0fd8dd94eb503b4605bf9c0b44871fa0cce..193e08bdde558a616897cfef38fb446cd25ac6a2 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h
@@ -23,16 +23,26 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-void CudaProfilerInit(std::string output_file,
-                      std::string output_mode,
-                      std::string config_file);
+void CudaProfilerInit(const std::string& output_file,
+                      const std::string& output_mode,
+                      const std::string& config_file);
 
 void CudaProfilerStart();
 
 void CudaProfilerStop();
 
 #ifndef _WIN32
-void CudaNvtxRangePush(std::string name);
+enum class NvtxRangeColor : uint32_t {
+  Black = 0x00000000,
+  Red = 0x00ff0000,
+  Green = 0x0000ff00,
+  Blue = 0x000000ff,
+  White = 0x00ffffff,
+  Yellow = 0x00ffff00,
+};
+
+void CudaNvtxRangePush(const std::string& name,
+                       const NvtxRangeColor color = NvtxRangeColor::Blue);
 
 void CudaNvtxRangePop();
 #endif
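Note (illustrative, not part of the patch): the NvtxRangeColor values above are 32-bit ARGB words (0x00RRGGBB, with the alpha byte left at zero), matching the NVTX_COLOR_ARGB color type set in CudaNvtxRangePush. A hedged usage sketch of the patched interface, assuming a non-Windows build configured with -DWITH_INFERENCE_NVTX=ON:

    #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"

    void TimedSection() {
      // Named, colored range; appears on the NVTX row in Nsight Systems.
      paddle::platform::CudaNvtxRangePush(
          "preprocess", paddle::platform::NvtxRangeColor::Red);
      // ... work to be measured ...
      paddle::platform::CudaNvtxRangePop();  // ranges must nest strictly

      // The color argument defaults to NvtxRangeColor::Blue when omitted.
      paddle::platform::CudaNvtxRangePush("postprocess");
      // ...
      paddle::platform::CudaNvtxRangePop();
    }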
 */
#pragma once
#ifndef _WIN32
-#include <cuda.h>
-#include <nvToolsExt.h>
-
-#include <mutex>  // NOLINT
-
 #include "paddle/phi/backends/dynload/nvtx.h"
 
 namespace paddle {
@@ -28,11 +23,12 @@ namespace dynload {
   using DynLoad__##__name = phi::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name
 
-#define NVTX_ROUTINE_EACH(__macro) \
-  __macro(nvtxRangePushA);         \
+#define PLATFORM_NVTX_ROUTINE_EACH(__macro) \
+  __macro(nvtxRangePushA);                  \
+  __macro(nvtxRangePushEx);                 \
   __macro(nvtxRangePop);
 
-NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
+PLATFORM_NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
 
 #undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP
 }  // namespace dynload
diff --git a/paddle/phi/backends/dynload/nvtx.h b/paddle/phi/backends/dynload/nvtx.h
index a9a166b289e3320c27f93d21a5a1daf2dfec821a..e51bbf2154a1781710ef62f29b7274fbe7c3934e 100644
--- a/paddle/phi/backends/dynload/nvtx.h
+++ b/paddle/phi/backends/dynload/nvtx.h
@@ -42,6 +42,7 @@ extern void *nvtx_dso_handle;
 
 #define NVTX_ROUTINE_EACH(__macro) \
   __macro(nvtxRangePushA);         \
+  __macro(nvtxRangePushEx);        \
   __macro(nvtxRangePop);
 
 NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
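Note (illustrative, not part of the patch): to exercise the feature, configure a GPU build with the new flag (for example `cmake .. -DWITH_GPU=ON -DWITH_INFERENCE_NVTX=ON`, alongside the usual inference options) and capture a timeline with something like `nsys profile --trace=cuda,nvtx <your_inference_binary>`. Because CudaNvtxRangePush/Pop must stay balanced even on early returns or exceptions, callers may want an RAII wrapper; a hypothetical sketch (NvtxRangeGuard is not part of the patch):

    #include <string>
    #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"

    // Pairs every push with exactly one pop, even on early exit.
    class NvtxRangeGuard {  // hypothetical helper
     public:
      explicit NvtxRangeGuard(const std::string& name,
                              paddle::platform::NvtxRangeColor color =
                                  paddle::platform::NvtxRangeColor::Blue) {
        paddle::platform::CudaNvtxRangePush(name, color);
      }
      ~NvtxRangeGuard() { paddle::platform::CudaNvtxRangePop(); }
      NvtxRangeGuard(const NvtxRangeGuard&) = delete;
      NvtxRangeGuard& operator=(const NvtxRangeGuard&) = delete;
    };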