未验证 提交 de6e7431 编写于 作者: Y Yuanle Liu 提交者: GitHub

add nvtxRangePush/Pop for naive_executor and refine some code (#47139)

上级 065608dd
......@@ -268,6 +268,7 @@ option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE})
option(WITH_HETERPS "Compile with heterps" OFF)
option(WITH_INFERENCE_API_TEST
"Test fluid inference C++ high-level api interface" OFF)
option(WITH_INFERENCE_NVTX "Paddle inference with nvtx for profiler" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
option(
......
......@@ -356,6 +356,10 @@ else()
)
endif()
if(WITH_INFERENCE_NVTX AND NOT WIN32)
add_definitions(-DPADDLE_WITH_INFERENCE_NVTX)
endif()
copy(
inference_lib_dist
SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib}
......
......@@ -22,9 +22,12 @@
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
#if PADDLE_WITH_TENSORRT
#ifdef PADDLE_WITH_TENSORRT
#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
#endif
#ifdef PADDLE_WITH_INFERENCE_NVTX
#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
#endif
namespace paddle {
namespace framework {
......@@ -48,12 +51,24 @@ void NaiveExecutor::Run() {
platform::RegisterModelLayout(ops_, place_);
#endif
platform::ScopedFlushDenormal flush;
#ifdef PADDLE_WITH_INFERENCE_NVTX
platform::CudaNvtxRangePush("model", platform::NvtxRangeColor::Yellow);
#endif
for (auto &op : ops_) {
VLOG(4) << std::this_thread::get_id() << " run "
<< op->DebugStringEx(scope_) << " on scope " << scope_;
op->SetIsCalledByExecutor(false);
#ifdef PADDLE_WITH_INFERENCE_NVTX
platform::CudaNvtxRangePush(op->Type(), platform::NvtxRangeColor::Green);
#endif
op->Run(*scope_, place_);
#ifdef PADDLE_WITH_INFERENCE_NVTX
platform::CudaNvtxRangePop();
#endif
}
#ifdef PADDLE_WITH_INFERENCE_NVTX
platform::CudaNvtxRangePop();
#endif
}
void NaiveExecutor::CreateVariables(const ProgramDesc &desc,
......@@ -146,7 +161,7 @@ NaiveExecutor::~NaiveExecutor() {
}
void NaiveExecutor::ResetTrtOps(int num) {
#if PADDLE_WITH_TENSORRT
#ifdef PADDLE_WITH_TENSORRT
for (auto &op : ops_) {
if (op->Type() == "tensorrt_engine") {
operators::TensorRTEngineOp *trtop =
......
......@@ -108,6 +108,10 @@ if(WITH_PSCORE)
tensor_table)
endif()
if(WITH_INFERENCE_NVTX AND NOT WIN32)
set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} cuda_profiler)
endif()
if(WITH_ONNXRUNTIME)
set(SHARED_INFERENCE_SRCS
${SHARED_INFERENCE_SRCS}
......
......@@ -655,7 +655,7 @@ void AnalysisConfig::EnableTensorRtEngine(
}
use_tensorrt_ = true;
#if PADDLE_WITH_TENSORRT
#ifdef PADDLE_WITH_TENSORRT
// https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
// when trt version less than 7.2,
// createExecutionContextWithoutDeviceMemory() has bug.
......
......@@ -79,7 +79,7 @@
#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
#endif
#if PADDLE_WITH_TENSORRT
#ifdef PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
......@@ -92,7 +92,7 @@
namespace paddle {
using inference::Singleton;
#if PADDLE_WITH_TENSORRT
#ifdef PADDLE_WITH_TENSORRT
using inference::tensorrt::TRTCalibratorEngine;
using inference::tensorrt::TRTCalibratorEngineManager;
using inference::tensorrt::TRTInt8Calibrator;
......@@ -1271,7 +1271,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
[](framework::ProgramDesc *prog) {
// Note, please do NOT use any member variables, because member variables may
// have been destructed in multiple threads.
#if PADDLE_WITH_TENSORRT
#ifdef PADDLE_WITH_TENSORRT
auto &block = prog->Block(0);
for (auto &op_desc : block.AllOps()) {
if (op_desc->Type() == "tensorrt_engine") {
......@@ -1977,7 +1977,7 @@ void AnalysisPredictor::ClearIntermediateTensor() {
}
}
#if PADDLE_WITH_TENSORRT
#ifdef PADDLE_WITH_TENSORRT
bool AnalysisPredictor::SaveTrtCalibToDisk() {
PADDLE_ENFORCE_EQ(config_.tensorrt_engine_enabled(),
true,
......@@ -2033,7 +2033,7 @@ bool AnalysisPredictor::SaveTrtCalibToDisk() {
#endif
AnalysisPredictor::~AnalysisPredictor() {
#if PADDLE_WITH_TENSORRT
#ifdef PADDLE_WITH_TENSORRT
if (config_.tensorrt_engine_enabled() &&
config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 &&
Singleton<TRTCalibratorEngineManager>::Global().Has()) {
......@@ -2157,7 +2157,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>(
} // namespace paddle
#if PADDLE_WITH_TENSORRT
#ifdef PADDLE_WITH_TENSORRT
USE_TRT_CONVERTER(elementwise_add_weight);
USE_TRT_CONVERTER(elementwise_sub_weight);
USE_TRT_CONVERTER(elementwise_mul_weight);
......
......@@ -382,7 +382,7 @@ class AnalysisPredictor : public PaddlePredictor {
///
void MkldnnPostReset();
#if PADDLE_WITH_TENSORRT
#ifdef PADDLE_WITH_TENSORRT
///
/// \brief save calibration table
///
......
......@@ -17,9 +17,9 @@
namespace paddle {
namespace platform {
void CudaProfilerInit(std::string output_file,
std::string output_mode,
std::string config_file) {
void CudaProfilerInit(const std::string& output_file,
const std::string& output_mode,
const std::string& config_file) {
PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv",
platform::errors::InvalidArgument(
"Unsupported cuda profiler output mode, expect `kvp` or "
......@@ -35,8 +35,16 @@ void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); }
void CudaProfilerStop() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStop()); }
#ifndef _WIN32
void CudaNvtxRangePush(std::string name) {
dynload::nvtxRangePushA(name.c_str());
// Opens an NVTX range named `name`, colored `color`, on the calling thread.
// The range stays open until a matching CudaNvtxRangePop(); ranges nest.
// Note: `name` must outlive the nvtxRangePushEx call (it does — the string
// is copied by the NVTX implementation at push time).
void CudaNvtxRangePush(const std::string& name, const NvtxRangeColor color) {
  // Zero-initialize the whole struct: NVTX requires unused attribute fields
  // (category, payload, etc.) to be zero, and reads `size` to know which
  // fields are present. Leaving them indeterminate is undefined behavior.
  nvtxEventAttributes_t eventAttrib = {};
  eventAttrib.version = NVTX_VERSION;
  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  eventAttrib.colorType = NVTX_COLOR_ARGB;
  eventAttrib.color = static_cast<uint32_t>(color);
  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
  eventAttrib.message.ascii = name.c_str();
  dynload::nvtxRangePushEx(&eventAttrib);
}
// Closes the innermost NVTX range previously opened by CudaNvtxRangePush
// on this thread.
void CudaNvtxRangePop() {
  dynload::nvtxRangePop();
}
......
......@@ -23,16 +23,26 @@ limitations under the License. */
namespace paddle {
namespace platform {
void CudaProfilerInit(std::string output_file,
std::string output_mode,
std::string config_file);
void CudaProfilerInit(const std::string& output_file,
const std::string& output_mode,
const std::string& config_file);
void CudaProfilerStart();
void CudaProfilerStop();
#ifndef _WIN32
void CudaNvtxRangePush(std::string name);
// ARGB color values used to tint NVTX profiler ranges (as passed to
// CudaNvtxRangePush and rendered by tools such as Nsight Systems).
// NOTE(review): the alpha byte is 0x00 in every entry; profiler tools
// generally ignore alpha, but confirm ranges render as expected.
enum class NvtxRangeColor : uint32_t {
  Black = 0x00000000,
  Red = 0x00ff0000,
  Green = 0x0000ff00,
  Blue = 0x000000ff,
  White = 0x00ffffff,
  Yellow = 0x00ffff00,
};
void CudaNvtxRangePush(const std::string& name,
const NvtxRangeColor color = NvtxRangeColor::Blue);
void CudaNvtxRangePop();
#endif
......
......@@ -13,11 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef _WIN32
#include <cuda.h>
#include <nvToolsExt.h>
#include <mutex> // NOLINT
#include "paddle/phi/backends/dynload/nvtx.h"
namespace paddle {
......@@ -28,11 +23,12 @@ namespace dynload {
using DynLoad__##__name = phi::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#define NVTX_ROUTINE_EACH(__macro) \
__macro(nvtxRangePushA); \
#define PLATFORM_NVTX_ROUTINE_EACH(__macro) \
__macro(nvtxRangePushA); \
__macro(nvtxRangePushEx); \
__macro(nvtxRangePop);
NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
PLATFORM_NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP
} // namespace dynload
......
......@@ -42,6 +42,7 @@ extern void *nvtx_dso_handle;
#define NVTX_ROUTINE_EACH(__macro) \
__macro(nvtxRangePushA); \
__macro(nvtxRangePushEx); \
__macro(nvtxRangePop);
NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册