Unverified commit 1fd0ca82, authored by wopeizl, committed by GitHub

merge from develop: add tensorrt support for win test=develop (#19172)

* merge from develop: add tensorrt support for win test=develop
Parent 5a86891f
@@ -88,6 +88,11 @@ if(WITH_GPU)
     include_directories(${CUDA_TOOLKIT_INCLUDE})
     if(TENSORRT_FOUND)
+      if(WIN32)
+        if(${CUDA_VERSION_MAJOR} VERSION_LESS 9)
+          message(FATAL_ERROR "TensorRT needs CUDA >= 9.0 to compile on Windows")
+        endif()
+      else()
       if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
         message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
       endif()
@@ -97,6 +102,7 @@ if(WITH_GPU)
       if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
         message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
       endif()
+      endif()
       include_directories(${TENSORRT_INCLUDE_DIR})
     endif()
     if(WITH_ANAKIN)
......
@@ -218,7 +218,7 @@ endif ()
 if (TENSORRT_FOUND)
   copy(tensorrt_lib DEPS ${inference_deps}
-    SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer*
+    SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/*nvinfer*
     DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib)
 endif ()
......
@@ -2,14 +2,28 @@ if(NOT WITH_GPU)
   return()
 endif()
-set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
+if(WIN32)
+  if("${TENSORRT_ROOT}" STREQUAL "")
+    message(WARNING "Please specify the TensorRT root path: TENSORRT_ROOT.")
+  endif()
+  string(REPLACE "\\" "/" TENSORRT_ROOT "${TENSORRT_ROOT}")
+  set(TR_INFER_LIB nvinfer.lib)
+  set(TR_INFER_RT nvinfer.dll)
+  set(TR_INFER_PLUGIN_RT nvinfer_plugin.dll)
+else()
+  set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
+  set(TR_INFER_LIB libnvinfer.a)
+  set(TR_INFER_RT libnvinfer.so)
+  set(TR_INFER_PLUGIN_RT libnvinfer_plugin.so)
+endif()
 find_path(TENSORRT_INCLUDE_DIR NvInfer.h
   PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
   $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
   NO_DEFAULT_PATH
 )
-find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
+find_library(TENSORRT_LIBRARY NAMES ${TR_INFER_LIB} ${TR_INFER_RT}
   PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
   $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
   NO_DEFAULT_PATH
......
@@ -219,7 +219,7 @@ template class AnakinOpConverter<::anakin::saber::X86,
 #define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__)   \
   extern int Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); \
   int use_converter_anakin_##op_type__##_##place_type__##_##precision_type__   \
-      __attribute__((unused)) = \
+      UNUSED = \
       Touch_anakin_##op_type__##_##place_type__##_##precision_type__();
 #if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE)
......
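This hunk, and the two similar ones further down, replace `__attribute__((unused))` with `UNUSED` because the attribute is a GCC/Clang extension that MSVC does not accept. Below is a minimal sketch of how such a portability macro is commonly defined; it is illustrative only and may not match Paddle's actual definition of `UNUSED`.

```cpp
// Illustrative sketch only; Paddle's real UNUSED macro may differ. The point
// is to hide the GCC-specific attribute behind a name that also compiles
// under MSVC.
#if defined(__GNUC__) || defined(__clang__)
#define UNUSED __attribute__((unused))
#else
#define UNUSED  // MSVC and others: expand to nothing
#endif

// Usage mirrors the converter-registration macros in the diff: a static
// variable that exists only to trigger registration at load time.
static int use_converter_example UNUSED = 0;

int main() { return 0; }
```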
@@ -141,6 +141,10 @@ if(WITH_GPU)
   endif()
   set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
 else()
+  if (USE_TENSORRT)
+    set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
+  endif()
   set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX})
   set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX})
   set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX})
@@ -150,6 +154,14 @@ endif()
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 target_link_libraries(${DEMO_NAME} ${DEPS})
 if(WIN32)
+  if(USE_TENSORRT)
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}
+        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+      COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}
+        ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+    )
+  endif()
   if(WITH_MKL)
     add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
......
@@ -227,5 +227,5 @@ class OpConverter {
 #define USE_TRT_CONVERTER(op_type__)               \
   extern int TouchConverterRegister_##op_type__(); \
-  static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
+  static int use_op_converter_trt_##op_type__ UNUSED = \
       TouchConverterRegister_##op_type__();
@@ -216,8 +216,8 @@ class TensorRTEngine {
 // TensorRT has too many layers, so that is not wise to add member functions for
 // them, and an macro like this is more extensible when underlying TensorRT
 // library add new layer supports.
-#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
-  engine__->network()->add##layer__(ARGS);
+#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ...) \
+  engine__->network()->add##layer__(__VA_ARGS__);
 class TRTEngineManager {
  public:
......
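The `TRT_ENGINE_ADD_LAYER` change swaps the GNU-specific named variadic form (`ARGS...`) for standard C++11 `__VA_ARGS__`, which MSVC also accepts. A self-contained sketch of the same pattern, using hypothetical names rather than Paddle code:

```cpp
#include <cstdio>

// GNU-only named variadic macro (what the old code used):
//   #define CALL(fn, ARGS...) fn(ARGS)
// Standard form accepted by GCC, Clang, and MSVC (what the diff switches to):
#define CALL(fn, ...) fn(__VA_ARGS__)

static int Sum3(int a, int b, int c) { return a + b + c; }

int main() {
  // CALL(Sum3, 1, 2, 3) expands to Sum3(1, 2, 3), forwarding the whole
  // argument pack just as TRT_ENGINE_ADD_LAYER forwards layer arguments.
  std::printf("%d\n", CALL(Sum3, 1, 2, 3));
  return 0;
}
```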
@@ -34,6 +34,7 @@ int PReluPlugin::initialize() {
   cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
   cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
              cudaMemcpyHostToDevice);
+  return 0;
 }
 nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
......
@@ -68,7 +68,7 @@ class TrtPluginRegistrar {
 #define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func)    \
   static paddle::inference::tensorrt::plugin::TrtPluginRegistrar \
-      trt_plugin_registrar##ctr __attribute__((unused)) = \
+      trt_plugin_registrar##ctr UNUSED = \
       paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \
           name, deserialize_func)
......
@@ -247,6 +247,8 @@ void* GetNCCLDsoHandle() {
 void* GetTensorRtDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll");
 #else
   return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
 #endif
......
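The new `#elif defined(_WIN32)` branch looks up `nvinfer.dll` instead of `libnvinfer.so`. A minimal sketch of that platform split, assuming nothing about Paddle's `GetDsoHandleFromSearchPath` helper:

```cpp
#include <cstdio>

// Minimal sketch (not Paddle's implementation): Windows has no <dlfcn.h>, so
// LoadLibrary/GetProcAddress take the place of dlopen/dlsym, and the runtime
// library name changes accordingly.
#if defined(_WIN32)
#include <windows.h>
static void* OpenTensorRtRuntime() {
  return static_cast<void*>(::LoadLibraryA("nvinfer.dll"));
}
#else
#include <dlfcn.h>
static void* OpenTensorRtRuntime() {
  return ::dlopen("libnvinfer.so", RTLD_NOW | RTLD_GLOBAL);
}
#endif

int main() {
  void* handle = OpenTensorRtRuntime();
  std::printf("TensorRT runtime %s\n", handle ? "loaded" : "not found");
  return 0;
}
```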
@@ -14,7 +14,9 @@ limitations under the License. */
 #pragma once
 #include <NvInfer.h>
+#if !defined(_WIN32)
 #include <dlfcn.h>
+#endif
 #include <mutex>  // NOLINT
@@ -34,7 +36,7 @@ extern void* tensorrt_dso_handle;
   struct DynLoad__##__name { \
     template <typename... Args> \
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
-      using tensorrt_func = decltype(__name(args...)) (*)(Args...); \
+      using tensorrt_func = decltype(&::__name); \
       std::call_once(tensorrt_dso_flag, []() { \
         tensorrt_dso_handle = \
             paddle::platform::dynload::GetTensorRtDsoHandle(); \
......
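The `using tensorrt_func = decltype(&::__name);` line names the wrapped function's pointer type directly from its declaration, rather than rebuilding it from the call-site arguments as the old `decltype(__name(args...)) (*)(Args...)` form did. A standalone sketch with a hypothetical symbol (not an actual TensorRT entry point):

```cpp
#include <cstdio>

// Hypothetical stand-in for a TensorRT API function; only a declaration is
// needed, because decltype does not evaluate (or link against) its operand.
extern "C" int tensorrt_like_symbol(int device_id);

int main() {
  // decltype(&::name) yields the exact pointer-to-function type of the
  // declaration, here int (*)(int). In the dynload wrapper this is the type
  // the handle returned by dlsym()/GetProcAddress() is cast to.
  using symbol_ptr = decltype(&::tensorrt_like_symbol);
  symbol_ptr fn = nullptr;  // in real code: looked up from the loaded DSO
  std::printf("resolved symbol: %p\n", reinterpret_cast<void*>(fn));
  return 0;
}
```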
@@ -166,6 +166,11 @@ package_data['paddle.libs']= []
 package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
 shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
+if '${TENSORRT_FOUND}' == 'ON' and os.name == 'nt':
+    shutil.copy(os.path.join('${TENSORRT_ROOT}', 'lib', '${TR_INFER_RT}'), libs_path)
+    shutil.copy(os.path.join('${TENSORRT_ROOT}', 'lib', '${TR_INFER_PLUGIN_RT}'), libs_path)
+    package_data['paddle.libs'] += ['${TR_INFER_RT}', '${TR_INFER_PLUGIN_RT}']
 if '${WITH_MKL}' == 'ON':
     shutil.copy('${MKLML_SHARED_LIB}', libs_path)
     shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)
......