Commit 4031781a authored by MRXLT, committed by GitHub

Merge pull request #795 from MRXLT/trt-fixed

support TensorRT
......@@ -54,6 +54,7 @@ option(SERVER "Compile Paddle Serving Server" OFF)
option(APP "Compile Paddle Serving App package" OFF)
option(WITH_ELASTIC_CTR "Compile ELASITC-CTR solution" OFF)
option(PACK "Compile for whl" OFF)
option(WITH_TRT "Compile Paddle Serving with TRT" OFF)
set(WITH_MKLML ${WITH_MKL})
if (NOT DEFINED WITH_MKLDNN)
......
......@@ -128,6 +128,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
| `mem_optim_off` | - | - | Disable memory / graphic memory optimization |
| `ir_optim` | - | - | Enable analysis and optimization of calculation graph |
| `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |
| `use_trt` (Only for trt version) | - | - | Run inference with TensorRT |
Here, we use `curl` to send an HTTP POST request to the service we just started. Users can use any Python library to send HTTP POST requests as well, e.g., [requests](https://requests.readthedocs.io/en/master/).
</center>
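As a quick illustration, the same request can be sent from Python with `requests`. This is a minimal sketch: the URL, service name (`uci`), port (9292), and field names are assumptions based on the uci_housing example above, so adjust them to your own service.

```python
# Minimal sketch of the HTTP prediction request using the `requests` library.
# URL and payload layout assume the uci_housing example (port 9292, name "uci").
import requests

payload = {
    "feed": [{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727,
                    -0.1583, -0.0584, 0.6283, 0.4919, 0.1856, 0.0795,
                    -0.0332]}],
    "fetch": ["price"],
}
resp = requests.post("http://127.0.0.1:9292/uci/prediction", json=payload)
print(resp.json())
```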
......
......@@ -124,6 +124,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
| `mem_optim_off` | - | - | Disable memory optimization |
| `ir_optim` | - | - | Enable analysis and optimization of calculation graph |
| `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |
| `use_trt` (Only for trt version) | - | - | Run inference with TensorRT |
We use the `curl` command to send an HTTP POST request to the service we just started. Users can also call a Python library to send HTTP POST requests; see [requests](https://requests.readthedocs.io/en/master/).
</center>
......
......@@ -34,7 +34,11 @@ message( "WITH_GPU = ${WITH_GPU}")
SET(PADDLE_VERSION "1.8.4")
if (WITH_GPU)
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda${CUDA_VERSION_MAJOR}-cudnn7-avx-mkl")
if (WITH_TRT)
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7.6-avx-mkl-trt6")
else()
SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10-cudnn7-avx-mkl")
endif()
else()
if (WITH_AVX)
if (WITH_MKLML)
......@@ -50,21 +54,38 @@ endif()
SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/fluid_inference.tgz")
MESSAGE(STATUS "PADDLE_LIB_PATH=${PADDLE_LIB_PATH}")
if (WITH_GPU OR WITH_MKLML)
ExternalProject_Add(
"extern_paddle"
${EXTERNAL_PROJECT_LOG_ARGS}
URL "${PADDLE_LIB_PATH}"
PREFIX "${PADDLE_SOURCES_DIR}"
DOWNLOAD_DIR "${PADDLE_DOWNLOAD_DIR}"
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND
${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/include ${PADDLE_INSTALL_DIR}/include &&
${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/lib ${PADDLE_INSTALL_DIR}/lib &&
${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/third_party ${PADDLE_INSTALL_DIR}/third_party &&
${CMAKE_COMMAND} -E copy ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so.0 ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so
)
if (WITH_TRT)
ExternalProject_Add(
"extern_paddle"
${EXTERNAL_PROJECT_LOG_ARGS}
URL "${PADDLE_LIB_PATH}"
PREFIX "${PADDLE_SOURCES_DIR}"
DOWNLOAD_DIR "${PADDLE_DOWNLOAD_DIR}"
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND
${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/include ${PADDLE_INSTALL_DIR}/include &&
${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/lib ${PADDLE_INSTALL_DIR}/lib &&
${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/third_party ${PADDLE_INSTALL_DIR}/third_party
)
else()
ExternalProject_Add(
"extern_paddle"
${EXTERNAL_PROJECT_LOG_ARGS}
URL "${PADDLE_LIB_PATH}"
PREFIX "${PADDLE_SOURCES_DIR}"
DOWNLOAD_DIR "${PADDLE_DOWNLOAD_DIR}"
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND
${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/include ${PADDLE_INSTALL_DIR}/include &&
${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/lib ${PADDLE_INSTALL_DIR}/lib &&
${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/third_party ${PADDLE_INSTALL_DIR}/third_party &&
${CMAKE_COMMAND} -E copy ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so.0 ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so
)
endif()
else()
ExternalProject_Add(
"extern_paddle"
......@@ -92,8 +113,16 @@ LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib)
ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/openblas/lib/libopenblas.a)
ADD_LIBRARY(paddle_fluid STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.a)
ADD_LIBRARY(paddle_fluid SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.so)
if (WITH_TRT)
ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET nvinfer PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer.so)
ADD_LIBRARY(nvinfer_plugin SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET nvinfer_plugin PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer_plugin.so)
endif()
ADD_LIBRARY(xxhash STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET xxhash PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xxhash/lib/libxxhash.a)
......@@ -101,4 +130,9 @@ SET_PROPERTY(TARGET xxhash PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/thir
LIST(APPEND external_project_dependencies paddle)
LIST(APPEND paddle_depend_libs
xxhash)
xxhash)
if(WITH_TRT)
LIST(APPEND paddle_depend_libs
nvinfer nvinfer_plugin)
endif()
......@@ -44,6 +44,7 @@ message EngineDesc {
optional bool static_optimization = 14;
optional bool force_update_static_cache = 15;
optional bool enable_ir_optimization = 16;
optional bool use_trt = 17;
};
// model_toolkit conf
......
......@@ -9,7 +9,7 @@ endif()
target_include_directories(serving PUBLIC
${CMAKE_CURRENT_BINARY_DIR}/../../core/predictor
)
include_directories(${CUDNN_ROOT}/include/)
if(WITH_GPU)
target_link_libraries(serving -Wl,--whole-archive fluid_gpu_engine
-Wl,--no-whole-archive)
......@@ -29,7 +29,11 @@ if(WITH_GPU)
endif()
if(WITH_MKL OR WITH_GPU)
if (WITH_TRT)
target_link_libraries(serving -liomp5 -lmklml_intel -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -lbz2)
else()
target_link_libraries(serving -liomp5 -lmklml_intel -lmkldnn -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -lbz2)
endif()
else()
target_link_libraries(serving openblas -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -lbz2)
endif()
......
......@@ -13,7 +13,9 @@ set_source_files_properties(
PROPERTIES
COMPILE_FLAGS "-Wno-strict-aliasing -Wno-unused-variable -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure)
if (WITH_TRT)
add_definitions(-DWITH_TRT)
endif()
target_link_libraries(pdserving
brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
......
......@@ -38,6 +38,7 @@ class InferEngineCreationParams {
_enable_ir_optimization = false;
_static_optimization = false;
_force_update_static_cache = false;
_use_trt = false;
}
void set_path(const std::string& path) { _path = path; }
......@@ -50,12 +51,16 @@ class InferEngineCreationParams {
_enable_ir_optimization = enable_ir_optimization;
}
void set_use_trt(bool use_trt) { _use_trt = use_trt; }
bool enable_memory_optimization() const {
return _enable_memory_optimization;
}
bool enable_ir_optimization() const { return _enable_ir_optimization; }
bool use_trt() const { return _use_trt; }
void set_static_optimization(bool static_optimization = false) {
_static_optimization = static_optimization;
}
......@@ -86,6 +91,7 @@ class InferEngineCreationParams {
bool _enable_ir_optimization;
bool _static_optimization;
bool _force_update_static_cache;
bool _use_trt;
};
class InferEngine {
......@@ -172,6 +178,10 @@ class ReloadableInferEngine : public InferEngine {
force_update_static_cache);
}
if (conf.has_use_trt()) {
_infer_engine_params.set_use_trt(conf.use_trt());
}
if (!check_need_reload() || load(_infer_engine_params) != 0) {
LOG(ERROR) << "Failed load model_data_path" << _model_data_path;
return -1;
......@@ -553,8 +563,12 @@ class CloneDBReloadableInferEngine
};
template <typename FluidFamilyCore>
#ifdef WITH_TRT
class FluidInferEngine : public DBReloadableInferEngine<FluidFamilyCore> {
#else
class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
public:
#endif
public: // NOLINT
FluidInferEngine() {}
~FluidInferEngine() {}
......
......@@ -51,8 +51,8 @@ class WeightedRandomRender : public EndpointRouterBase {
new (std::nothrow) Factory<WeightedRandomRender, EndpointRouterBase>();
if (factory == NULL) {
RAW_LOG(ERROR,
"Failed regist factory: WeightedRandomRender->EndpointRouterBase \
in macro!");
"Failed regist factory: WeightedRandomRender->EndpointRouterBase "
"in macro!");
return -1;
}
......@@ -63,8 +63,8 @@ class WeightedRandomRender : public EndpointRouterBase {
if (FactoryPool<EndpointRouterBase>::instance().register_factory(
"WeightedRandomRender", factory) != 0) {
RAW_LOG(INFO,
"Factory has been registed: \
WeightedRandomRender->EndpointRouterBase.");
"Factory has been registed: "
"WeightedRandomRender->EndpointRouterBase.");
}
return 0;
......
......@@ -77,10 +77,10 @@ export PATH=$PATH:$GOPATH/bin
```shell
go env -w GO111MODULE=on
go env -w GOPROXY=https://goproxy.cn,direct
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger
go get -u github.com/golang/protobuf/protoc-gen-go
go get -u google.golang.org/grpc
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2
go get -u github.com/golang/protobuf/protoc-gen-go@v1.4.3
go get -u google.golang.org/grpc@v1.33.0
```
......@@ -91,9 +91,9 @@ go get -u google.golang.org/grpc
``` shell
mkdir server-build-cpu && cd server-build-cpu
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DSERVER=ON ..
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DSERVER=ON ..
make -j10
```
......@@ -104,10 +104,28 @@ you can execute `make install` to put targets under directory `./output`, you ne
``` shell
mkdir server-build-gpu && cd server-build-gpu
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DSERVER=ON \
-DWITH_GPU=ON ..
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
-DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
-DSERVER=ON \
-DWITH_GPU=ON ..
make -j10
```
### Integrated TensorRT version Paddle inference library
``` shell
mkdir server-build-trt && cd server-build-trt
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \
-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
-DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
-DSERVER=ON \
-DWITH_GPU=ON \
-DWITH_TRT=ON ..
make -j10
```
......@@ -136,7 +154,10 @@ execute `make install` to put targets under directory `./output`
```bash
mkdir app-build && cd app-build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DAPP=ON ..
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DAPP=ON ..
make
```
......@@ -167,7 +188,9 @@ Please use the example under `python/examples` to verify.
| WITH_AVX | Compile Paddle Serving with AVX intrinsics | OFF |
| WITH_MKL | Compile Paddle Serving with MKL support | OFF |
| WITH_GPU | Compile Paddle Serving with NVIDIA GPU | OFF |
| CUDNN_ROOT | Define CuDNN library and header path | |
| CUDNN_LIBRARY | Define CuDNN library and header path | |
| CUDA_TOOLKIT_ROOT_DIR | Define CUDA PATH | |
| TENSORRT_ROOT | Define TensorRT PATH | |
| CLIENT | Compile Paddle Serving Client | OFF |
| SERVER | Compile Paddle Serving Server | OFF |
| APP | Compile Paddle Serving App package | OFF |
......@@ -182,7 +205,8 @@ To compile the Paddle Serving GPU version on bare metal, you need to install the
- CUDA
- CuDNN
- NCCL2
To compile the TensorRT version, you need to install the TensorRT library.
Note here:
......@@ -192,21 +216,12 @@ Note here:
The following is the base library version matching relationship used by the PaddlePaddle release version for reference:
| | CUDA | CuDNN | NCCL2 |
| :----: | :-----: | :----------------------: | :----: |
| CUDA 8 | 8.0.61 | CuDNN 7.1.2 for CUDA 8.0 | 2.1.4 |
| CUDA 9 | 9.0.176 | CuDNN 7.3.1 for CUDA 9.0 | 2.2.12 |
| | CUDA | CuDNN | TensorRT |
| :----: | :-----: | :----------------------: | :----: |
| post9 | 9.0 | CuDNN 7.3.1 for CUDA 9.0 | |
| post10 | 10.0 | CuDNN 7.5.1 for CUDA 10.0| |
| trt | 10.1 | CuDNN 7.5.1 for CUDA 10.1| 6.0.1.5 |
### How to make the compiler detect the CuDNN library
Download the corresponding CuDNN version from the NVIDIA developer website and decompress it, then add `-DCUDNN_ROOT` to the cmake command to specify the CuDNN path.
### How to make the compiler detect the nccl library
After downloading the corresponding version of the nccl2 library from the NVIDIA developer official website and decompressing it, add the following environment variables (take nccl2.1.4 as an example):
```shell
export C_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$CPLUS_INCLUDE_PATH
export LD_LIBRARY_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/lib/:$LD_LIBRARY_PATH
```
......@@ -74,10 +74,10 @@ export PATH=$PATH:$GOPATH/bin
```shell
go env -w GO111MODULE=on
go env -w GOPROXY=https://goproxy.cn,direct
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger
go get -u github.com/golang/protobuf/protoc-gen-go
go get -u google.golang.org/grpc
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2
go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2
go get -u github.com/golang/protobuf/protoc-gen-go@v1.4.3
go get -u google.golang.org/grpc@v1.33.0
```
......@@ -87,7 +87,10 @@ go get -u google.golang.org/grpc
``` shell
mkdir server-build-cpu && cd server-build-cpu
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON ..
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DSERVER=ON ..
make -j10
```
......@@ -97,21 +100,44 @@ make -j10
``` shell
mkdir server-build-gpu && cd server-build-gpu
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON -DWITH_GPU=ON ..
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
-DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
-DSERVER=ON \
-DWITH_GPU=ON ..
make -j10
```
Running `make install` will place the build outputs under the `./output` directory.
### Integrated TensorRT version Paddle inference library
**Note:** After the build succeeds, you need to set the `SERVING_BIN` path; see the [Notes](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE_CN.md#注意事项) section below.
``` shell
mkdir server-build-trt && cd server-build-trt
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \
-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
-DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
-DSERVER=ON \
-DWITH_GPU=ON \
-DWITH_TRT=ON ..
make -j10
```
Running `make install` will place the build outputs under the `./output` directory.
**Note:** After the build succeeds, you need to set the `SERVING_BIN` path; see the [Notes](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE_CN.md#注意事项) section below.
## Compile the Client
``` shell
mkdir client-build && cd client-build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCLIENT=ON ..
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DCLIENT=ON ..
make -j10
```
......@@ -123,7 +149,11 @@ make -j10
```bash
mkdir app-build && cd app-build
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCMAKE_INSTALL_PREFIX=./output -DAPP=ON ..
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
-DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
-DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
-DCMAKE_INSTALL_PREFIX=./output \
-DAPP=ON ..
make
```
......@@ -154,7 +184,10 @@ make
| WITH_AVX | Compile Paddle Serving with AVX intrinsics | OFF |
| WITH_MKL | Compile Paddle Serving with MKL support | OFF |
| WITH_GPU | Compile Paddle Serving with NVIDIA GPU | OFF |
| CUDNN_ROOT | Define CuDNN library and header path | |
| WITH_TRT | Compile Paddle Serving with TensorRT | OFF |
| CUDNN_LIBRARY | Define CuDNN library and header path | |
| CUDA_TOOLKIT_ROOT_DIR | Define CUDA PATH | |
| TENSORRT_ROOT | Define TensorRT PATH | |
| CLIENT | Compile Paddle Serving Client | OFF |
| SERVER | Compile Paddle Serving Server | OFF |
| APP | Compile Paddle Serving App package | OFF |
......@@ -169,7 +202,8 @@ Paddle Serving通过PaddlePaddle预测库支持在GPU上做预测。WITH_GPU选
- CUDA
- CuDNN
- NCCL2
To compile the TensorRT version, you also need to install the TensorRT library.
Note the following:
......@@ -178,21 +212,12 @@ Paddle Serving通过PaddlePaddle预测库支持在GPU上做预测。WITH_GPU选
The following is the base library version matching relationship used by the PaddlePaddle release versions, for reference:
| | CUDA | CuDNN | NCCL2 |
| :----: | :-----: | :----------------------: | :----: |
| CUDA 8 | 8.0.61 | CuDNN 7.1.2 for CUDA 8.0 | 2.1.4 |
| CUDA 9 | 9.0.176 | CuDNN 7.3.1 for CUDA 9.0 | 2.2.12 |
| | CUDA | CuDNN | TensorRT |
| :----: | :-----: | :----------------------: | :----: |
| post9 | 9.0 | CuDNN 7.3.1 for CUDA 9.0 | |
| post10 | 10.0 | CuDNN 7.5.1 for CUDA 10.0| |
| trt | 10.1 | CuDNN 7.5.1 for CUDA 10.1| 6.0.1.5 |
### How to make the Paddle Serving build system detect the CuDNN library
After downloading the corresponding version of CuDNN from the NVIDIA developer website and decompressing it locally, add the `-DCUDNN_ROOT` parameter to the cmake command to specify the path of the CuDNN library.
### How to make the Paddle Serving build system detect the nccl library
After downloading the corresponding version of the nccl2 library from the NVIDIA developer website and decompressing it, add the following environment variables (taking nccl2.1.4 as an example):
```shell
export C_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$CPLUS_INCLUDE_PATH
export LD_LIBRARY_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/lib/:$LD_LIBRARY_PATH
```
After downloading the corresponding version of CuDNN from the NVIDIA developer website and decompressing it locally, add the `-DCUDNN_LIBRARY` parameter to the cmake command to specify the path of the CuDNN library.
......@@ -18,6 +18,8 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.0.0-py2-none-an
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py3-none-any.whl
#cuda 10.0
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py3-none-any.whl
#cuda10.1 with TensorRT 6
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py3-none-any.whl
```
### Python 2
```
......@@ -25,6 +27,8 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py2-none-any.whl
#cuda 10.0
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py2-none-any.whl
#cuda10.1 with TensorRT 6
https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py2-none-any.whl
```
## Client
......
......@@ -23,7 +23,6 @@
#include "core/configure/inferencer_configure.pb.h"
#include "core/predictor/framework/infer.h"
#include "paddle_inference_api.h" // NOLINT
//#include "predictor/framework/infer.h"
namespace baidu {
namespace paddle_serving {
......
......@@ -2,6 +2,7 @@ FILE(GLOB fluid_gpu_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
add_library(fluid_gpu_engine ${fluid_gpu_engine_srcs})
target_include_directories(fluid_gpu_engine PUBLIC
${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
add_dependencies(fluid_gpu_engine pdserving extern_paddle configure)
target_link_libraries(fluid_gpu_engine pdserving paddle_fluid iomp5 mklml_intel -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
......
......@@ -190,7 +190,7 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
paddle::AnalysisConfig analysis_config;
analysis_config.SetModel(data_path);
analysis_config.EnableUseGpu(100, FLAGS_gpuid);
analysis_config.EnableUseGpu(1500, FLAGS_gpuid);
analysis_config.SwitchSpecifyInputNames(true);
analysis_config.SetCpuMathLibraryNumThreads(1);
......@@ -198,12 +198,68 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
analysis_config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
analysis_config.SwitchIrOptim(true);
#if 0 // todo: support flexible shape
int min_seq_len = 1;
int max_seq_len = 512;
int opt_seq_len = 128;
int head_number = 12;
int batch = 50;
std::vector<int> min_in_shape = {batch, min_seq_len, 1};
std::vector<int> max_in_shape = {batch, max_seq_len, 1};
std::vector<int> opt_in_shape = {batch, opt_seq_len, 1};
std::string input1_name = "src_text_a_ids";
std::string input2_name = "pos_text_a_ids";
std::string input3_name = "sent_text_a_ids";
std::string input4_name = "stack_0.tmp_0";
std::map<std::string, std::vector<int>> min_input_shape = {
{input1_name, min_in_shape},
{input2_name, min_in_shape},
{input3_name, min_in_shape},
{input4_name, {batch, head_number, min_seq_len, min_seq_len}},
};
std::map<std::string, std::vector<int>> max_input_shape = {
{input1_name, max_in_shape},
{input2_name, max_in_shape},
{input3_name, max_in_shape},
{input4_name, {batch, head_number, max_seq_len, max_seq_len}},
};
std::map<std::string, std::vector<int>> opt_input_shape = {
{input1_name, opt_in_shape},
{input2_name, opt_in_shape},
{input3_name, opt_in_shape},
{input4_name, {batch, head_number, opt_seq_len, opt_seq_len}},
};
analysis_config.SetTRTDynamicShapeInfo(
min_input_shape, max_input_shape, opt_input_shape);
#endif
int max_batch = 32;
int min_subgraph_size = 3;
if (params.use_trt()) {
analysis_config.EnableTensorRtEngine(
1 << 20,
max_batch,
min_subgraph_size,
paddle::AnalysisConfig::Precision::kFloat32,
false,
false);
LOG(INFO) << "create TensorRT predictor";
} else {
analysis_config.SwitchIrOptim(false);
}
if (params.enable_memory_optimization()) {
analysis_config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
analysis_config.SwitchIrOptim(true);
} else {
analysis_config.SwitchIrOptim(false);
}
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
......
......@@ -80,6 +80,16 @@ if (SERVER)
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
elseif(WITH_TRT)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py
"server_gpu" trt
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
else()
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
......
......@@ -73,6 +73,8 @@ def serve_args():
default=False,
action="store_true",
help="Use Multi-language-service")
parser.add_argument(
"--use_trt", default=False, action="store_true", help="Use TensorRT")
parser.add_argument(
"--product_name",
type=str,
......@@ -205,6 +207,7 @@ class Server(object):
self.cur_path = os.getcwd()
self.use_local_bin = False
self.gpuid = 0
self.use_trt = False
self.model_config_paths = None # for multi-model in a workflow
self.product_name = None
self.container_id = None
......@@ -271,6 +274,9 @@ class Server(object):
def set_gpuid(self, gpuid=0):
self.gpuid = gpuid
def set_trt(self):
self.use_trt = True
def _prepare_engine(self, model_config_paths, device):
if self.model_toolkit_conf == None:
self.model_toolkit_conf = server_sdk.ModelToolkitConf()
......@@ -290,6 +296,7 @@ class Server(object):
engine.enable_ir_optimization = self.ir_optimization
engine.static_optimization = False
engine.force_update_static_cache = False
engine.use_trt = self.use_trt
if device == "cpu":
engine.type = "FLUID_CPU_ANALYSIS_DIR"
......@@ -396,7 +403,10 @@ class Server(object):
for line in version_file.readlines():
if re.match("cuda_version", line):
cuda_version = line.split("\"")[1]
device_version = "serving-gpu-cuda" + cuda_version + "-"
if cuda_version != "trt":
device_version = "serving-gpu-cuda" + cuda_version + "-"
else:
device_version = "serving-gpu-" + cuda_version + "-"
folder_name = device_version + serving_server_version
tar_name = folder_name + ".tar.gz"
......
......@@ -64,6 +64,8 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss
server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim)
server.set_max_body_size(max_body_size)
if args.use_trt:
server.set_trt()
if args.product_name != None:
server.set_product_name(args.product_name)
......
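Putting the Python-side changes together: the `--use_trt` flag handled in `serve.py` above simply calls `Server.set_trt()`, which flips `engine.use_trt` in the generated engine configuration. A hand-written launcher could therefore look like the sketch below; the op sequence and the `uci_housing_model` directory are assumptions taken from the standard examples, not part of this diff.

```python
# Sketch: enabling TensorRT via the Python API added in this PR.
# Assumes the usual paddle_serving_server_gpu workflow and a local
# uci_housing_model directory; set_trt()/set_gpuid() come from this change set.
from paddle_serving_server_gpu import OpMaker, OpSeqMaker, Server

op_maker = OpMaker()
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(op_maker.create('general_reader'))
op_seq_maker.add_op(op_maker.create('general_infer'))
op_seq_maker.add_op(op_maker.create('general_response'))

server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(4)
server.set_gpuid(0)      # pick the GPU card
server.set_trt()         # sets engine.use_trt = True in the engine config
server.load_model_config("uci_housing_model")
server.prepare_server(workdir="workdir", port=9292, device="gpu")
server.run_server()
```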
......@@ -19,11 +19,13 @@ from __future__ import print_function
from setuptools import setup, Distribution, Extension
from setuptools import find_packages
from setuptools import setup
from paddle_serving_server_gpu.version import serving_server_version
from paddle_serving_server_gpu.version import serving_server_version, cuda_version
import util
max_version, mid_version, min_version = util.python_version()
if cuda_version != "trt":
cuda_version = "post" + cuda_version
max_version, mid_version, min_version = util.python_version()
# gen pipeline proto code
util.gen_pipeline_code("paddle_serving_server_gpu")
......@@ -56,7 +58,7 @@ package_data={'paddle_serving_server_gpu': ['pipeline/gateway/libproxy_server.so
setup(
name='paddle-serving-server-gpu',
version=serving_server_version.replace('-', '') + '.post@CUDA_VERSION_MAJOR@',
version=serving_server_version.replace('-', '') + "." + cuda_version,
description=
('Paddle Serving Package for saved model with PaddlePaddle'),
url='https://github.com/PaddlePaddle/Serving',
......
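For clarity, the combined effect of passing `trt` to `gen_version.py` and the `setup.py` change above is that the GPU wheel gets a `.trt` version suffix instead of `.postN`, matching the wheel URLs listed earlier. A plain-Python sketch of that suffix logic follows; the `0.0.0` base version is the development build shown in those URLs, used here only as an example.

```python
# Illustration of the wheel version suffix logic from setup.py above.
serving_server_version = "0.0.0"  # development base version, as in the wheel URLs
for cuda_version in ("9", "10", "trt"):
    if cuda_version != "trt":
        suffix = "post" + cuda_version
    else:
        suffix = cuda_version
    print("paddle-serving-server-gpu==" + serving_server_version + "." + suffix)
# -> 0.0.0.post9, 0.0.0.post10, 0.0.0.trt
```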