未验证 提交 de6e7431 编写于 作者: Y Yuanle Liu 提交者: GitHub

add nvtxRangePush/Pop for naive_executor and refine some code (#47139)

上级 065608dd
...@@ -268,6 +268,7 @@ option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) ...@@ -268,6 +268,7 @@ option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE})
option(WITH_HETERPS "Compile with heterps" OFF}) option(WITH_HETERPS "Compile with heterps" OFF})
option(WITH_INFERENCE_API_TEST option(WITH_INFERENCE_API_TEST
"Test fluid inference C++ high-level api interface" OFF) "Test fluid inference C++ high-level api interface" OFF)
option(WITH_INFERENCE_NVTX "Paddle inference with nvtx for profiler" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
option( option(
......
...@@ -356,6 +356,10 @@ else() ...@@ -356,6 +356,10 @@ else()
) )
endif() endif()
# Define PADDLE_WITH_INFERENCE_NVTX so C++ sources compile the NVTX
# range-annotation code paths (profiling via Nsight tools).
# NVTX dynamic loading is not wired up on Windows, hence the NOT WIN32 guard.
if(WITH_INFERENCE_NVTX AND NOT WIN32)
add_definitions(-DPADDLE_WITH_INFERENCE_NVTX)
endif()
copy( copy(
inference_lib_dist inference_lib_dist
SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib} SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib}
......
...@@ -22,9 +22,12 @@ ...@@ -22,9 +22,12 @@
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
#if PADDLE_WITH_TENSORRT #ifdef PADDLE_WITH_TENSORRT
#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h" #include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
#endif #endif
#ifdef PADDLE_WITH_INFERENCE_NVTX
#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
#endif
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -48,12 +51,24 @@ void NaiveExecutor::Run() { ...@@ -48,12 +51,24 @@ void NaiveExecutor::Run() {
platform::RegisterModelLayout(ops_, place_); platform::RegisterModelLayout(ops_, place_);
#endif #endif
platform::ScopedFlushDenormal flush; platform::ScopedFlushDenormal flush;
#ifdef PADDLE_WITH_INFERENCE_NVTX
platform::CudaNvtxRangePush("model", platform::NvtxRangeColor::Yellow);
#endif
for (auto &op : ops_) { for (auto &op : ops_) {
VLOG(4) << std::this_thread::get_id() << " run " VLOG(4) << std::this_thread::get_id() << " run "
<< op->DebugStringEx(scope_) << " on scope " << scope_; << op->DebugStringEx(scope_) << " on scope " << scope_;
op->SetIsCalledByExecutor(false); op->SetIsCalledByExecutor(false);
#ifdef PADDLE_WITH_INFERENCE_NVTX
platform::CudaNvtxRangePush(op->Type(), platform::NvtxRangeColor::Green);
#endif
op->Run(*scope_, place_); op->Run(*scope_, place_);
#ifdef PADDLE_WITH_INFERENCE_NVTX
platform::CudaNvtxRangePop();
#endif
} }
#ifdef PADDLE_WITH_INFERENCE_NVTX
platform::CudaNvtxRangePop();
#endif
} }
void NaiveExecutor::CreateVariables(const ProgramDesc &desc, void NaiveExecutor::CreateVariables(const ProgramDesc &desc,
...@@ -146,7 +161,7 @@ NaiveExecutor::~NaiveExecutor() { ...@@ -146,7 +161,7 @@ NaiveExecutor::~NaiveExecutor() {
} }
void NaiveExecutor::ResetTrtOps(int num) { void NaiveExecutor::ResetTrtOps(int num) {
#if PADDLE_WITH_TENSORRT #ifdef PADDLE_WITH_TENSORRT
for (auto &op : ops_) { for (auto &op : ops_) {
if (op->Type() == "tensorrt_engine") { if (op->Type() == "tensorrt_engine") {
operators::TensorRTEngineOp *trtop = operators::TensorRTEngineOp *trtop =
......
...@@ -108,6 +108,10 @@ if(WITH_PSCORE) ...@@ -108,6 +108,10 @@ if(WITH_PSCORE)
tensor_table) tensor_table)
endif() endif()
# Link the cuda_profiler target into the shared inference library when NVTX
# profiling is enabled; matches the NOT WIN32 guard used where
# PADDLE_WITH_INFERENCE_NVTX is defined.
if(WITH_INFERENCE_NVTX AND NOT WIN32)
set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} cuda_profiler)
endif()
if(WITH_ONNXRUNTIME) if(WITH_ONNXRUNTIME)
set(SHARED_INFERENCE_SRCS set(SHARED_INFERENCE_SRCS
${SHARED_INFERENCE_SRCS} ${SHARED_INFERENCE_SRCS}
......
...@@ -655,7 +655,7 @@ void AnalysisConfig::EnableTensorRtEngine( ...@@ -655,7 +655,7 @@ void AnalysisConfig::EnableTensorRtEngine(
} }
use_tensorrt_ = true; use_tensorrt_ = true;
#if PADDLE_WITH_TENSORRT #ifdef PADDLE_WITH_TENSORRT
// https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2 // https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
// when trt version less than 7.2, // when trt version less than 7.2,
// createExecutionContextWithoutDeviceMemory() has bug. // createExecutionContextWithoutDeviceMemory() has bug.
......
...@@ -79,7 +79,7 @@ ...@@ -79,7 +79,7 @@
#include "paddle/fluid/inference/api/onnxruntime_predictor.h" #include "paddle/fluid/inference/api/onnxruntime_predictor.h"
#endif #endif
#if PADDLE_WITH_TENSORRT #ifdef PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
...@@ -92,7 +92,7 @@ ...@@ -92,7 +92,7 @@
namespace paddle { namespace paddle {
using inference::Singleton; using inference::Singleton;
#if PADDLE_WITH_TENSORRT #ifdef PADDLE_WITH_TENSORRT
using inference::tensorrt::TRTCalibratorEngine; using inference::tensorrt::TRTCalibratorEngine;
using inference::tensorrt::TRTCalibratorEngineManager; using inference::tensorrt::TRTCalibratorEngineManager;
using inference::tensorrt::TRTInt8Calibrator; using inference::tensorrt::TRTInt8Calibrator;
...@@ -1271,7 +1271,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -1271,7 +1271,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
[](framework::ProgramDesc *prog) { [](framework::ProgramDesc *prog) {
// Note, please do NOT use any member variables, because member variables may // Note, please do NOT use any member variables, because member variables may
// have been destructed in multiple threads. // have been destructed in multiple threads.
#if PADDLE_WITH_TENSORRT #ifdef PADDLE_WITH_TENSORRT
auto &block = prog->Block(0); auto &block = prog->Block(0);
for (auto &op_desc : block.AllOps()) { for (auto &op_desc : block.AllOps()) {
if (op_desc->Type() == "tensorrt_engine") { if (op_desc->Type() == "tensorrt_engine") {
...@@ -1977,7 +1977,7 @@ void AnalysisPredictor::ClearIntermediateTensor() { ...@@ -1977,7 +1977,7 @@ void AnalysisPredictor::ClearIntermediateTensor() {
} }
} }
#if PADDLE_WITH_TENSORRT #ifdef PADDLE_WITH_TENSORRT
bool AnalysisPredictor::SaveTrtCalibToDisk() { bool AnalysisPredictor::SaveTrtCalibToDisk() {
PADDLE_ENFORCE_EQ(config_.tensorrt_engine_enabled(), PADDLE_ENFORCE_EQ(config_.tensorrt_engine_enabled(),
true, true,
...@@ -2033,7 +2033,7 @@ bool AnalysisPredictor::SaveTrtCalibToDisk() { ...@@ -2033,7 +2033,7 @@ bool AnalysisPredictor::SaveTrtCalibToDisk() {
#endif #endif
AnalysisPredictor::~AnalysisPredictor() { AnalysisPredictor::~AnalysisPredictor() {
#if PADDLE_WITH_TENSORRT #ifdef PADDLE_WITH_TENSORRT
if (config_.tensorrt_engine_enabled() && if (config_.tensorrt_engine_enabled() &&
config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 && config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 &&
Singleton<TRTCalibratorEngineManager>::Global().Has()) { Singleton<TRTCalibratorEngineManager>::Global().Has()) {
...@@ -2157,7 +2157,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>( ...@@ -2157,7 +2157,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>(
} // namespace paddle } // namespace paddle
#if PADDLE_WITH_TENSORRT #ifdef PADDLE_WITH_TENSORRT
USE_TRT_CONVERTER(elementwise_add_weight); USE_TRT_CONVERTER(elementwise_add_weight);
USE_TRT_CONVERTER(elementwise_sub_weight); USE_TRT_CONVERTER(elementwise_sub_weight);
USE_TRT_CONVERTER(elementwise_mul_weight); USE_TRT_CONVERTER(elementwise_mul_weight);
......
...@@ -382,7 +382,7 @@ class AnalysisPredictor : public PaddlePredictor { ...@@ -382,7 +382,7 @@ class AnalysisPredictor : public PaddlePredictor {
/// ///
void MkldnnPostReset(); void MkldnnPostReset();
#if PADDLE_WITH_TENSORRT #ifdef PADDLE_WITH_TENSORRT
/// ///
/// \brief save calibration table /// \brief save calibration table
/// ///
......
...@@ -17,9 +17,9 @@ ...@@ -17,9 +17,9 @@
namespace paddle { namespace paddle {
namespace platform { namespace platform {
void CudaProfilerInit(std::string output_file, void CudaProfilerInit(const std::string& output_file,
std::string output_mode, const std::string& output_mode,
std::string config_file) { const std::string& config_file) {
PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv", PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv",
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Unsupported cuda profiler output mode, expect `kvp` or " "Unsupported cuda profiler output mode, expect `kvp` or "
...@@ -35,8 +35,16 @@ void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); } ...@@ -35,8 +35,16 @@ void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); }
void CudaProfilerStop() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStop()); } void CudaProfilerStop() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStop()); }
#ifndef _WIN32 #ifndef _WIN32
void CudaNvtxRangePush(std::string name) { void CudaNvtxRangePush(const std::string& name, const NvtxRangeColor color) {
dynload::nvtxRangePushA(name.c_str()); nvtxEventAttributes_t eventAttrib;
eventAttrib.version = NVTX_VERSION;
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
eventAttrib.colorType = NVTX_COLOR_ARGB;
eventAttrib.color = static_cast<uint32_t>(color);
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
eventAttrib.message.ascii = name.c_str();
dynload::nvtxRangePushEx(&eventAttrib);
} }
void CudaNvtxRangePop() { dynload::nvtxRangePop(); } void CudaNvtxRangePop() { dynload::nvtxRangePop(); }
......
...@@ -23,16 +23,26 @@ limitations under the License. */ ...@@ -23,16 +23,26 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
void CudaProfilerInit(std::string output_file, void CudaProfilerInit(const std::string& output_file,
std::string output_mode, const std::string& output_mode,
std::string config_file); const std::string& config_file);
void CudaProfilerStart(); void CudaProfilerStart();
void CudaProfilerStop(); void CudaProfilerStop();
#ifndef _WIN32 #ifndef _WIN32
// ARGB colors used to tint NVTX ranges in profiler timelines (alpha byte is
// left 0; NVTX ignores it for NVTX_COLOR_ARGB display purposes).
enum class NvtxRangeColor : uint32_t {
  Black = 0x00000000,
  Red = 0x00ff0000,
  Green = 0x0000ff00,
  Blue = 0x000000ff,
  White = 0x00ffffff,
  Yellow = 0x00ffff00,
};

// Pushes a named, colored NVTX range onto the calling thread's range stack;
// pair every call with a CudaNvtxRangePop() on the same thread.
void CudaNvtxRangePush(const std::string& name,
                       const NvtxRangeColor color = NvtxRangeColor::Blue);

// Pops the innermost NVTX range previously pushed on this thread.
void CudaNvtxRangePop();
#endif #endif
......
...@@ -13,11 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,11 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#ifndef _WIN32 #ifndef _WIN32
#include <cuda.h>
#include <nvToolsExt.h>
#include <mutex> // NOLINT
#include "paddle/phi/backends/dynload/nvtx.h" #include "paddle/phi/backends/dynload/nvtx.h"
namespace paddle { namespace paddle {
...@@ -28,11 +23,12 @@ namespace dynload { ...@@ -28,11 +23,12 @@ namespace dynload {
using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ using DynLoad__##__name = phi::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define NVTX_ROUTINE_EACH(__macro) \ #define PLATFORM_NVTX_ROUTINE_EACH(__macro) \
__macro(nvtxRangePushA); \ __macro(nvtxRangePushA); \
__macro(nvtxRangePushEx); \
__macro(nvtxRangePop); __macro(nvtxRangePop);
NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP); PLATFORM_NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP #undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP
} // namespace dynload } // namespace dynload
......
...@@ -42,6 +42,7 @@ extern void *nvtx_dso_handle; ...@@ -42,6 +42,7 @@ extern void *nvtx_dso_handle;
#define NVTX_ROUTINE_EACH(__macro) \ #define NVTX_ROUTINE_EACH(__macro) \
__macro(nvtxRangePushA); \ __macro(nvtxRangePushA); \
__macro(nvtxRangePushEx); \
__macro(nvtxRangePop); __macro(nvtxRangePop);
NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP); NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册