diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c497e3e048c4dd8d5c1291286de2ab9d218b914..59d6fcb07d27e1f3ab259e69d36708b775c1852a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,7 @@ option(SERVER "Compile Paddle Serving Server" OFF) option(APP "Compile Paddle Serving App package" OFF) option(WITH_ELASTIC_CTR "Compile ELASITC-CTR solution" OFF) option(PACK "Compile for whl" OFF) +option(WITH_TRT "Compile Paddle Serving with TRT" OFF) set(WITH_MKLML ${WITH_MKL}) if (NOT DEFINED WITH_MKLDNN) diff --git a/README.md b/README.md index 44cee7bac8087a60e08754f007ad33bdebad98e3..fb537b65db83d013f570c8208f21c219ca5084a3 100644 --- a/README.md +++ b/README.md @@ -128,6 +128,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po | `mem_optim_off` | - | - | Disable memory / graphic memory optimization | | `ir_optim` | - | - | Enable analysis and optimization of calculation graph | | `use_mkl` (Only for cpu version) | - | - | Run inference with MKL | +| `use_trt` (Only for trt version) | - | - | Run inference with TensorRT | Here, we use `curl` to send a HTTP POST request to the service we just started. Users can use any python library to send HTTP POST as well, e.g, [requests](https://requests.readthedocs.io/en/master/). diff --git a/README_CN.md b/README_CN.md index 8bdc2702a68ed88437495fe8b4ced3817742d13a..2c37a26681d4291adcf7e8e70d3392772fabbe6b 100644 --- a/README_CN.md +++ b/README_CN.md @@ -124,6 +124,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po | `mem_optim_off` | - | - | Disable memory optimization | | `ir_optim` | - | - | Enable analysis and optimization of calculation graph | | `use_mkl` (Only for cpu version) | - | - | Run inference with MKL | +| `use_trt` (Only for trt version) | - | - | Run inference with TensorRT | 我们使用 `curl` 命令来发送HTTP POST请求给刚刚启动的服务。用户也可以调用python库来发送HTTP POST请求,请参考英文文档 [requests](https://requests.readthedocs.io/en/master/)。 diff --git a/cmake/paddlepaddle.cmake b/cmake/paddlepaddle.cmake index 5a164e93437e59e9b93ad6472755adffea8421ae..4b7d3ed1f620bfcd2e1e214c49c57ee3848129e7 100644 --- a/cmake/paddlepaddle.cmake +++ b/cmake/paddlepaddle.cmake @@ -34,7 +34,11 @@ message( "WITH_GPU = ${WITH_GPU}") SET(PADDLE_VERSION "1.8.4") if (WITH_GPU) - SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda${CUDA_VERSION_MAJOR}-cudnn7-avx-mkl") + if (WITH_TRT) + SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10.1-cudnn7.6-avx-mkl-trt6") + else() + SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda10-cudnn7-avx-mkl") + endif() else() if (WITH_AVX) if (WITH_MKLML) @@ -50,21 +54,38 @@ endif() SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/fluid_inference.tgz") MESSAGE(STATUS "PADDLE_LIB_PATH=${PADDLE_LIB_PATH}") if (WITH_GPU OR WITH_MKLML) -ExternalProject_Add( - "extern_paddle" - ${EXTERNAL_PROJECT_LOG_ARGS} - URL "${PADDLE_LIB_PATH}" - PREFIX "${PADDLE_SOURCES_DIR}" - DOWNLOAD_DIR "${PADDLE_DOWNLOAD_DIR}" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND - ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/include ${PADDLE_INSTALL_DIR}/include && - ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/lib ${PADDLE_INSTALL_DIR}/lib && - ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/third_party ${PADDLE_INSTALL_DIR}/third_party && - ${CMAKE_COMMAND} -E copy ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so.0 ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so -) + if 
(WITH_TRT) + ExternalProject_Add( + "extern_paddle" + ${EXTERNAL_PROJECT_LOG_ARGS} + URL "${PADDLE_LIB_PATH}" + PREFIX "${PADDLE_SOURCES_DIR}" + DOWNLOAD_DIR "${PADDLE_DOWNLOAD_DIR}" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/include ${PADDLE_INSTALL_DIR}/include && + ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/lib ${PADDLE_INSTALL_DIR}/lib && + ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/third_party ${PADDLE_INSTALL_DIR}/third_party + ) + else() + ExternalProject_Add( + "extern_paddle" + ${EXTERNAL_PROJECT_LOG_ARGS} + URL "${PADDLE_LIB_PATH}" + PREFIX "${PADDLE_SOURCES_DIR}" + DOWNLOAD_DIR "${PADDLE_DOWNLOAD_DIR}" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/include ${PADDLE_INSTALL_DIR}/include && + ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/lib ${PADDLE_INSTALL_DIR}/lib && + ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/third_party ${PADDLE_INSTALL_DIR}/third_party && + ${CMAKE_COMMAND} -E copy ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so.0 ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so + ) + endif() else() ExternalProject_Add( "extern_paddle" @@ -92,8 +113,16 @@ LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib) ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/openblas/lib/libopenblas.a) -ADD_LIBRARY(paddle_fluid STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.a) +ADD_LIBRARY(paddle_fluid SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.so) + +if (WITH_TRT) +ADD_LIBRARY(nvinfer SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET nvinfer PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer.so) + +ADD_LIBRARY(nvinfer_plugin SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET nvinfer_plugin PROPERTY IMPORTED_LOCATION ${TENSORRT_ROOT}/lib/libnvinfer_plugin.so) +endif() ADD_LIBRARY(xxhash STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET xxhash PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xxhash/lib/libxxhash.a) @@ -101,4 +130,9 @@ SET_PROPERTY(TARGET xxhash PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/thir LIST(APPEND external_project_dependencies paddle) LIST(APPEND paddle_depend_libs - xxhash) + xxhash) + +if(WITH_TRT) +LIST(APPEND paddle_depend_libs + nvinfer nvinfer_plugin) +endif() diff --git a/core/configure/proto/server_configure.proto b/core/configure/proto/server_configure.proto index de32637b2a523df1a8d8cd2e28dcf29e79ff96dc..c008ee857bb7c69672e399ce44b2420d5db7fb3c 100644 --- a/core/configure/proto/server_configure.proto +++ b/core/configure/proto/server_configure.proto @@ -44,6 +44,7 @@ message EngineDesc { optional bool static_optimization = 14; optional bool force_update_static_cache = 15; optional bool enable_ir_optimization = 16; + optional bool use_trt = 17; }; // model_toolkit conf diff --git a/core/general-server/CMakeLists.txt b/core/general-server/CMakeLists.txt index 9056e229a51f56463dc2eec5629f219d00dc6a38..aa1b7badc9140301d84bdbd94b3324b52176e837 100644 --- a/core/general-server/CMakeLists.txt +++ b/core/general-server/CMakeLists.txt @@ -9,7 +9,7 @@ endif() 
target_include_directories(serving PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/../../core/predictor ) - + include_directories(${CUDNN_ROOT}/include/) if(WITH_GPU) target_link_libraries(serving -Wl,--whole-archive fluid_gpu_engine -Wl,--no-whole-archive) @@ -29,7 +29,11 @@ if(WITH_GPU) endif() if(WITH_MKL OR WITH_GPU) + if (WITH_TRT) + target_link_libraries(serving -liomp5 -lmklml_intel -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -lbz2) + else() target_link_libraries(serving -liomp5 -lmklml_intel -lmkldnn -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -lbz2) +endif() else() target_link_libraries(serving openblas -lpthread -lcrypto -lm -lrt -lssl -ldl -lz -lbz2) endif() diff --git a/core/predictor/CMakeLists.txt b/core/predictor/CMakeLists.txt index 6b5013c3edadb4592df40db539fa75fb9364d02f..637c7c15530273bc908ec2f8693a3d66989eebd2 100644 --- a/core/predictor/CMakeLists.txt +++ b/core/predictor/CMakeLists.txt @@ -13,7 +13,9 @@ set_source_files_properties( PROPERTIES COMPILE_FLAGS "-Wno-strict-aliasing -Wno-unused-variable -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure) - +if (WITH_TRT) + add_definitions(-DWITH_TRT) +endif() target_link_libraries(pdserving brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz) diff --git a/core/predictor/framework/infer.h b/core/predictor/framework/infer.h index 1cff7647e2dbbcc8df4d144f81488fde35aeb798..431bc456326c1714dce48e2f6321bf58f3e021ce 100644 --- a/core/predictor/framework/infer.h +++ b/core/predictor/framework/infer.h @@ -38,6 +38,7 @@ class InferEngineCreationParams { _enable_ir_optimization = false; _static_optimization = false; _force_update_static_cache = false; + _use_trt = false; } void set_path(const std::string& path) { _path = path; } @@ -50,12 +51,16 @@ class InferEngineCreationParams { _enable_ir_optimization = enable_ir_optimization; } + void set_use_trt(bool use_trt) { _use_trt = use_trt; } + bool enable_memory_optimization() const { return _enable_memory_optimization; } bool enable_ir_optimization() const { return _enable_ir_optimization; } + bool use_trt() const { return _use_trt; } + void set_static_optimization(bool static_optimization = false) { _static_optimization = static_optimization; } @@ -86,6 +91,7 @@ class InferEngineCreationParams { bool _enable_ir_optimization; bool _static_optimization; bool _force_update_static_cache; + bool _use_trt; }; class InferEngine { @@ -172,6 +178,10 @@ class ReloadableInferEngine : public InferEngine { force_update_static_cache); } + if (conf.has_use_trt()) { + _infer_engine_params.set_use_trt(conf.use_trt()); + } + if (!check_need_reload() || load(_infer_engine_params) != 0) { LOG(ERROR) << "Failed load model_data_path" << _model_data_path; return -1; @@ -553,8 +563,12 @@ class CloneDBReloadableInferEngine }; template +#ifdef WITH_TRT +class FluidInferEngine : public DBReloadableInferEngine { +#else class FluidInferEngine : public CloneDBReloadableInferEngine { - public: +#endif + public: // NOLINT FluidInferEngine() {} ~FluidInferEngine() {} diff --git a/core/sdk-cpp/include/abtest.h b/core/sdk-cpp/include/abtest.h index a4275fcf52413bb85bcca1fcb470ea2360fdf174..47a502745ae8aa6297729a0a3695600402cf5cfe 100644 --- a/core/sdk-cpp/include/abtest.h +++ b/core/sdk-cpp/include/abtest.h @@ -51,8 +51,8 @@ class WeightedRandomRender : public EndpointRouterBase { new (std::nothrow) Factory(); if (factory == NULL) { RAW_LOG(ERROR, - "Failed regist factory: 
WeightedRandomRender->EndpointRouterBase \ - in macro!"); + "Failed regist factory: WeightedRandomRender->EndpointRouterBase " + "in macro!"); return -1; } @@ -63,8 +63,8 @@ class WeightedRandomRender : public EndpointRouterBase { if (FactoryPool::instance().register_factory( "WeightedRandomRender", factory) != 0) { RAW_LOG(INFO, - "Factory has been registed: \ - WeightedRandomRender->EndpointRouterBase."); + "Factory has been registed: " + "WeightedRandomRender->EndpointRouterBase."); } return 0; diff --git a/doc/COMPILE.md b/doc/COMPILE.md index 640d599dda555a8fef70ee0c42de29eae022c720..cf0bfdf2593ff0274e4bec20d3b1524f2e61241a 100644 --- a/doc/COMPILE.md +++ b/doc/COMPILE.md @@ -77,10 +77,10 @@ export PATH=$PATH:$GOPATH/bin ```shell go env -w GO111MODULE=on go env -w GOPROXY=https://goproxy.cn,direct -go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway -go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger -go get -u github.com/golang/protobuf/protoc-gen-go -go get -u google.golang.org/grpc +go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2 +go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2 +go get -u github.com/golang/protobuf/protoc-gen-go@v1.4.3 +go get -u google.golang.org/grpc@v1.33.0 ``` @@ -91,9 +91,9 @@ go get -u google.golang.org/grpc ``` shell mkdir server-build-cpu && cd server-build-cpu cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ - -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ - -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ - -DSERVER=ON .. + -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ + -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ + -DSERVER=ON .. make -j10 ``` @@ -104,10 +104,28 @@ you can execute `make install` to put targets under directory `./output`, you ne ``` shell mkdir server-build-gpu && cd server-build-gpu cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ - -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ - -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ - -DSERVER=ON \ - -DWITH_GPU=ON .. + -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ + -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ + -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \ + -DCUDNN_LIBRARY=${CUDNN_LIBRARY} \ + -DSERVER=ON \ + -DWITH_GPU=ON .. +make -j10 +``` + +### Integrated TRT version paddle inference library + +``` +mkdir server-build-trt && cd server-build-trt +cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ + -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ + -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ + -DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \ + -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \ + -DCUDNN_LIBRARY=${CUDNN_LIBRARY} \ + -DSERVER=ON \ + -DWITH_GPU=ON \ + -DWITH_TRT=ON .. make -j10 ``` @@ -136,7 +154,10 @@ execute `make install` to put targets under directory `./output` ```bash mkdir app-build && cd app-build -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DAPP=ON .. +cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ + -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ + -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ + -DAPP=ON .. make ``` @@ -167,7 +188,9 @@ Please use the example under `python/examples` to verify. 
| WITH_AVX | Compile Paddle Serving with AVX intrinsics | OFF | | WITH_MKL | Compile Paddle Serving with MKL support | OFF | | WITH_GPU | Compile Paddle Serving with NVIDIA GPU | OFF | -| CUDNN_ROOT | Define CuDNN library and header path | | +| CUDNN_LIBRARY | Define CuDNN library and header path | | +| CUDA_TOOLKIT_ROOT_DIR | Define CUDA PATH | | +| TENSORRT_ROOT | Define TensorRT PATH | | | CLIENT | Compile Paddle Serving Client | OFF | | SERVER | Compile Paddle Serving Server | OFF | | APP | Compile Paddle Serving App package | OFF | @@ -182,7 +205,8 @@ To compile the Paddle Serving GPU version on bare metal, you need to install the - CUDA - CuDNN -- NCCL2 + +To compile the TensorRT version, you need to install the TensorRT library. Note here: @@ -192,21 +216,12 @@ Note here: The following is the base library version matching relationship used by the PaddlePaddle release version for reference: -| | CUDA | CuDNN | NCCL2 | -| :----: | :-----: | :----------------------: | :----: | -| CUDA 8 | 8.0.61 | CuDNN 7.1.2 for CUDA 8.0 | 2.1.4 | -| CUDA 9 | 9.0.176 | CuDNN 7.3.1 for CUDA 9.0 | 2.2.12 | +| | CUDA | CuDNN | TensorRT | +| :----: | :-----: | :----------------------: | :----: | +| post9 | 9.0 | CuDNN 7.3.1 for CUDA 9.0 | | +| post10 | 10.0 | CuDNN 7.5.1 for CUDA 10.0| | +| trt | 10.1 | CuDNN 7.5.1 for CUDA 10.1| 6.0.1.5 | ### How to make the compiler detect the CuDNN library Download the corresponding CUDNN version from NVIDIA developer official website and decompressing it, add `-DCUDNN_ROOT` to cmake command, to specify the path of CUDNN. - -### How to make the compiler detect the nccl library - -After downloading the corresponding version of the nccl2 library from the NVIDIA developer official website and decompressing it, add the following environment variables (take nccl2.1.4 as an example): - -```shell -export C_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$C_INCLUDE_PATH -export CPLUS_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$CPLUS_INCLUDE_PATH -export LD_LIBRARY_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/lib/:$LD_LIBRARY_PATH -``` diff --git a/doc/COMPILE_CN.md b/doc/COMPILE_CN.md index 392da7ed64bc88a8b92294f2a1f805522433cad1..b3619d9a38e967a139f850e7a605f713b1a57f95 100644 --- a/doc/COMPILE_CN.md +++ b/doc/COMPILE_CN.md @@ -74,10 +74,10 @@ export PATH=$PATH:$GOPATH/bin ```shell go env -w GO111MODULE=on go env -w GOPROXY=https://goproxy.cn,direct -go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway -go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger -go get -u github.com/golang/protobuf/protoc-gen-go -go get -u google.golang.org/grpc +go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2 +go get -u github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2 +go get -u github.com/golang/protobuf/protoc-gen-go@v1.4.3 +go get -u google.golang.org/grpc@v1.33.0 ``` @@ -87,7 +87,10 @@ go get -u google.golang.org/grpc ``` shell mkdir server-build-cpu && cd server-build-cpu -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON .. +cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ + -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ + -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ + -DSERVER=ON .. 
make -j10 ``` @@ -97,21 +100,44 @@ make -j10 ``` shell mkdir server-build-gpu && cd server-build-gpu -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON -DWITH_GPU=ON .. +cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ + -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ + -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ + -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \ + -DCUDNN_LIBRARY=${CUDNN_LIBRARY} \ + -DSERVER=ON \ + -DWITH_GPU=ON .. make -j10 ``` -执行`make install`可以把目标产出放在`./output`目录下。 +### 集成TensorRT版本Paddle Inference Library -**注意:** 编译成功后,需要设置`SERVING_BIN`路径,详见后面的[注意事项](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE_CN.md#注意事项)。 +``` +mkdir server-build-trt && cd server-build-trt +cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ + -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ + -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ + -DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \ + -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \ + -DCUDNN_LIBRARY=${CUDNN_LIBRARY} \ + -DSERVER=ON \ + -DWITH_GPU=ON \ + -DWITH_TRT=ON .. +make -j10 +``` +执行`make install`可以把目标产出放在`./output`目录下。 +**注意:** 编译成功后,需要设置`SERVING_BIN`路径,详见后面的[注意事项](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE_CN.md#注意事项)。 ## 编译Client部分 ``` shell mkdir client-build && cd client-build -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCLIENT=ON .. +cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ + -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ + -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ + -DCLIENT=ON .. make -j10 ``` @@ -123,7 +149,11 @@ make -j10 ```bash mkdir app-build && cd app-build -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DCMAKE_INSTALL_PREFIX=./output -DAPP=ON .. +cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \ + -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \ + -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \ + -DCMAKE_INSTALL_PREFIX=./output \ + -DAPP=ON .. 
make ``` @@ -154,7 +184,10 @@ make | WITH_AVX | Compile Paddle Serving with AVX intrinsics | OFF | | WITH_MKL | Compile Paddle Serving with MKL support | OFF | | WITH_GPU | Compile Paddle Serving with NVIDIA GPU | OFF | -| CUDNN_ROOT | Define CuDNN library and header path | | +| WITH_TRT | Compile Paddle Serving with TensorRT | OFF | +| CUDNN_LIBRARY | Define CuDNN library and header path | | +| CUDA_TOOLKIT_ROOT_DIR | Define CUDA PATH | | +| TENSORRT_ROOT | Define TensorRT PATH | | | CLIENT | Compile Paddle Serving Client | OFF | | SERVER | Compile Paddle Serving Server | OFF | | APP | Compile Paddle Serving App package | OFF | @@ -169,7 +202,8 @@ Paddle Serving通过PaddlePaddle预测库支持在GPU上做预测。WITH_GPU选 - CUDA - CuDNN -- NCCL2 + +编译TensorRT版本,需要安装TensorRT库。 这里要注意的是: @@ -178,21 +212,12 @@ Paddle Serving通过PaddlePaddle预测库支持在GPU上做预测。WITH_GPU选 以下是PaddlePaddle发布版本所使用的基础库版本匹配关系,供参考: -| | CUDA | CuDNN | NCCL2 | -| :----: | :-----: | :----------------------: | :----: | -| CUDA 8 | 8.0.61 | CuDNN 7.1.2 for CUDA 8.0 | 2.1.4 | -| CUDA 9 | 9.0.176 | CuDNN 7.3.1 for CUDA 9.0 | 2.2.12 | +| | CUDA | CuDNN | TensorRT | +| :----: | :-----: | :----------------------: | :----: | +| post9 | 9.0 | CuDNN 7.3.1 for CUDA 9.0 | | +| post10 | 10.0 | CuDNN 7.5.1 for CUDA 10.0| | +| trt | 10.1 | CuDNN 7.5.1 for CUDA 10.1| 6.0.1.5 | ### 如何让Paddle Serving编译系统探测到CuDNN库 -从NVIDIA developer官网下载对应版本CuDNN并在本地解压后,在cmake编译命令中增加`-DCUDNN_ROOT`参数,指定CuDNN库所在路径。 - -### 如何让Paddle Serving编译系统探测到nccl库 - -从NVIDIA developer官网下载对应版本nccl2库并解压后,增加如下环境变量 (以nccl2.1.4为例): - -```shell -export C_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$C_INCLUDE_PATH -export CPLUS_INCLUDE_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/include:$CPLUS_INCLUDE_PATH -export LD_LIBRARY_PATH=/path/to/nccl2/cuda8/nccl_2.1.4-1+cuda8.0_x86_64/lib/:$LD_LIBRARY_PATH -``` +从NVIDIA developer官网下载对应版本CuDNN并在本地解压后,在cmake编译命令中增加`-DCUDNN_LIBRARY`参数,指定CuDNN库所在路径。 diff --git a/doc/LATEST_PACKAGES.md b/doc/LATEST_PACKAGES.md index 247c04c000404944e7021093ff8bf3280c2f2539..dc72421ef5b1766955a67814b83071f591700f9c 100644 --- a/doc/LATEST_PACKAGES.md +++ b/doc/LATEST_PACKAGES.md @@ -18,6 +18,8 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.0.0-py2-none-an https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py3-none-any.whl #cuda 10.0 https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py3-none-any.whl +#cuda10.1 with TensorRT 6 +https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py3-none-any.whl ``` ### Python 2 ``` @@ -25,6 +27,8 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10- https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py2-none-any.whl #cuda 10.0 https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py2-none-any.whl +##cuda10.1 with TensorRT 6 +https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py2-none-any.whl ``` ## Client diff --git a/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h b/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h index f65711e04cf601e40f693b045adbaba0cf7ada71..a4d8dda71a7977185106bb1552cb8f39ef6bc50e 100644 --- a/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h +++ b/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h @@ -23,7 +23,6 @@ #include "core/configure/inferencer_configure.pb.h" #include "core/predictor/framework/infer.h" #include 
"paddle_inference_api.h" // NOLINT -//#include "predictor/framework/infer.h" namespace baidu { namespace paddle_serving { diff --git a/paddle_inference/inferencer-fluid-gpu/CMakeLists.txt b/paddle_inference/inferencer-fluid-gpu/CMakeLists.txt index 725da85b45ca1070badf5343f340e49dce6b936f..6ba3ddd6ba5d80f7b987b7c0dbbbebfdaaf37e46 100644 --- a/paddle_inference/inferencer-fluid-gpu/CMakeLists.txt +++ b/paddle_inference/inferencer-fluid-gpu/CMakeLists.txt @@ -2,6 +2,7 @@ FILE(GLOB fluid_gpu_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp) add_library(fluid_gpu_engine ${fluid_gpu_engine_srcs}) target_include_directories(fluid_gpu_engine PUBLIC ${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/) + add_dependencies(fluid_gpu_engine pdserving extern_paddle configure) target_link_libraries(fluid_gpu_engine pdserving paddle_fluid iomp5 mklml_intel -lpthread -lcrypto -lm -lrt -lssl -ldl -lz) diff --git a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h index 2fc6ae587ff26f5f05ff9332f08067ab49d06254..3782c967823d07c23ba02e5ce0f388dc6b46e181 100644 --- a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h +++ b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h @@ -190,7 +190,7 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore { paddle::AnalysisConfig analysis_config; analysis_config.SetModel(data_path); - analysis_config.EnableUseGpu(100, FLAGS_gpuid); + analysis_config.EnableUseGpu(1500, FLAGS_gpuid); analysis_config.SwitchSpecifyInputNames(true); analysis_config.SetCpuMathLibraryNumThreads(1); @@ -198,12 +198,68 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore { analysis_config.EnableMemoryOptim(); } - if (params.enable_ir_optimization()) { - analysis_config.SwitchIrOptim(true); +#if 0 // todo: support flexible shape + + int min_seq_len = 1; + int max_seq_len = 512; + int opt_seq_len = 128; + int head_number = 12; + int batch = 50; + + std::vector min_in_shape = {batch, min_seq_len, 1}; + std::vector max_in_shape = {batch, max_seq_len, 1}; + std::vector opt_in_shape = {batch, opt_seq_len, 1}; + + std::string input1_name = "src_text_a_ids"; + std::string input2_name = "pos_text_a_ids"; + std::string input3_name = "sent_text_a_ids"; + std::string input4_name = "stack_0.tmp_0"; + + std::map> min_input_shape = { + {input1_name, min_in_shape}, + {input2_name, min_in_shape}, + {input3_name, min_in_shape}, + {input4_name, {batch, head_number, min_seq_len, min_seq_len}}, + }; + + std::map> max_input_shape = { + {input1_name, max_in_shape}, + {input2_name, max_in_shape}, + {input3_name, max_in_shape}, + {input4_name, {batch, head_number, max_seq_len, max_seq_len}}, + }; + std::map> opt_input_shape = { + {input1_name, opt_in_shape}, + {input2_name, opt_in_shape}, + {input3_name, opt_in_shape}, + {input4_name, {batch, head_number, opt_seq_len, opt_seq_len}}, + }; + + analysis_config.SetTRTDynamicShapeInfo( + min_input_shape, max_input_shape, opt_input_shape); +#endif + int max_batch = 32; + int min_subgraph_size = 3; + if (params.use_trt()) { + analysis_config.EnableTensorRtEngine( + 1 << 20, + max_batch, + min_subgraph_size, + paddle::AnalysisConfig::Precision::kFloat32, + false, + false); + LOG(INFO) << "create TensorRT predictor"; } else { - analysis_config.SwitchIrOptim(false); - } + if (params.enable_memory_optimization()) { + analysis_config.EnableMemoryOptim(); + } + if (params.enable_ir_optimization()) { + analysis_config.SwitchIrOptim(true); + } else { + 
analysis_config.SwitchIrOptim(false); + } + } AutoLock lock(GlobalPaddleCreateMutex::instance()); _core = paddle::CreatePaddlePredictor(analysis_config); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 4b20cb2001ebb595601f22fa6e4aab8dd5df18f4..23e0b6b507f53f1ab60a32854891b79b377638ce 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -80,6 +80,16 @@ if (SERVER) COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES}) add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp) + elseif(WITH_TRT) + add_custom_command( + OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp + COMMAND cp -r + ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_server_gpu/ ${PADDLE_SERVING_BINARY_DIR}/python/ + COMMAND env ${py_env} ${PYTHON_EXECUTABLE} gen_version.py + "server_gpu" trt + COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel + DEPENDS ${SERVING_SERVER_CORE} server_config_py_proto ${PY_FILES}) + add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp) else() add_custom_command( OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp diff --git a/python/paddle_serving_server_gpu/__init__.py b/python/paddle_serving_server_gpu/__init__.py index 042027066df7a094dca784722345937608ac4099..1c02ead52df49e8b3c4495c0c52f55e13fb18db3 100644 --- a/python/paddle_serving_server_gpu/__init__.py +++ b/python/paddle_serving_server_gpu/__init__.py @@ -73,6 +73,8 @@ def serve_args(): default=False, action="store_true", help="Use Multi-language-service") + parser.add_argument( + "--use_trt", default=False, action="store_true", help="Use TensorRT") parser.add_argument( "--product_name", type=str, @@ -205,6 +207,7 @@ class Server(object): self.cur_path = os.getcwd() self.use_local_bin = False self.gpuid = 0 + self.use_trt = False self.model_config_paths = None # for multi-model in a workflow self.product_name = None self.container_id = None @@ -271,6 +274,9 @@ class Server(object): def set_gpuid(self, gpuid=0): self.gpuid = gpuid + def set_trt(self): + self.use_trt = True + def _prepare_engine(self, model_config_paths, device): if self.model_toolkit_conf == None: self.model_toolkit_conf = server_sdk.ModelToolkitConf() @@ -290,6 +296,7 @@ class Server(object): engine.enable_ir_optimization = self.ir_optimization engine.static_optimization = False engine.force_update_static_cache = False + engine.use_trt = self.use_trt if device == "cpu": engine.type = "FLUID_CPU_ANALYSIS_DIR" @@ -396,7 +403,10 @@ class Server(object): for line in version_file.readlines(): if re.match("cuda_version", line): cuda_version = line.split("\"")[1] - device_version = "serving-gpu-cuda" + cuda_version + "-" + if cuda_version != "trt": + device_version = "serving-gpu-cuda" + cuda_version + "-" + else: + device_version = "serving-gpu-" + cuda_version + "-" folder_name = device_version + serving_server_version tar_name = folder_name + ".tar.gz" diff --git a/python/paddle_serving_server_gpu/serve.py b/python/paddle_serving_server_gpu/serve.py index 9755b188aa2ddee57c9875610b1bee6ac2b8eb2d..c2b170fbeb3f9ee772e86c216fe3776f34187743 100644 --- a/python/paddle_serving_server_gpu/serve.py +++ b/python/paddle_serving_server_gpu/serve.py @@ -64,6 +64,8 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss server.set_memory_optimize(mem_optim) server.set_ir_optimize(ir_optim) server.set_max_body_size(max_body_size) + if args.use_trt: + server.set_trt() if args.product_name != None: 
server.set_product_name(args.product_name) diff --git a/python/setup.py.server_gpu.in b/python/setup.py.server_gpu.in index 7379bcb83a4e48913d22442543e0f65f93d2fa5f..1303e0404eb9b557dbfb6232ef391aa89c97747a 100644 --- a/python/setup.py.server_gpu.in +++ b/python/setup.py.server_gpu.in @@ -19,11 +19,13 @@ from __future__ import print_function from setuptools import setup, Distribution, Extension from setuptools import find_packages from setuptools import setup -from paddle_serving_server_gpu.version import serving_server_version +from paddle_serving_server_gpu.version import serving_server_version, cuda_version import util -max_version, mid_version, min_version = util.python_version() +if cuda_version != "trt": + cuda_version = "post" + cuda_version +max_version, mid_version, min_version = util.python_version() # gen pipeline proto code util.gen_pipeline_code("paddle_serving_server_gpu") @@ -56,7 +58,7 @@ package_data={'paddle_serving_server_gpu': ['pipeline/gateway/libproxy_server.so setup( name='paddle-serving-server-gpu', - version=serving_server_version.replace('-', '') + '.post@CUDA_VERSION_MAJOR@', + version=serving_server_version.replace('-', '') + "." + cuda_version, description= ('Paddle Serving Package for saved model with PaddlePaddle'), url='https://github.com/PaddlePaddle/Serving',
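
Taken together, these changes add a `WITH_TRT` build path and a `--use_trt` serving switch. Below is a minimal end-to-end sketch of how they are meant to be used, assuming the environment variables from the new COMPILE.md section (`PYTHONROOT`, `TENSORRT_LIBRARY_PATH`, `CUDA_PATH`, `CUDNN_LIBRARY`) are already exported, and that the model directory, port, and thread count are placeholders:

```shell
# Option 1: build the TRT-enabled server from source (mirrors the new COMPILE.md section)
mkdir server-build-trt && cd server-build-trt
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
      -DPYTHON_LIBRARIES=$PYTHONROOT/lib/libpython2.7.so \
      -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python \
      -DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \
      -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
      -DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
      -DSERVER=ON -DWITH_GPU=ON -DWITH_TRT=ON ..
make -j10

# Option 2: install the prebuilt CUDA 10.1 / TensorRT 6 wheel listed in LATEST_PACKAGES.md
pip install https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py3-none-any.whl

# Start the GPU server with TensorRT inference enabled (model path and port are placeholders)
python -m paddle_serving_server_gpu.serve \
    --model uci_housing_model --port 9292 --thread 10 --use_trt
```

With `--use_trt` set, `serve.py` calls `server.set_trt()`, which writes `use_trt` into the engine config (`EngineDesc.use_trt`); the C++ side reads it via `InferEngineCreationParams::use_trt()` and, in `FluidGpuAnalysisDirCore`, calls `EnableTensorRtEngine` instead of the plain IR-optimization path.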