Commit 2a8dac26 authored by MRXLT

bug fix

Parent 6e5a02fa
...
@@ -128,6 +128,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 | `mem_optim_off` | - | - | Disable memory / graphic memory optimization |
 | `ir_optim` | - | - | Enable analysis and optimization of calculation graph |
 | `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |
+| `use_trt` (Only for trt version) | - | - | Run inference with TensorRT |

 Here, we use `curl` to send an HTTP POST request to the service we just started. Users can use any Python library to send an HTTP POST request as well, e.g., [requests](https://requests.readthedocs.io/en/master/).
 </center>
...
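The prose above leans on `curl` for the smoke test; here is a minimal sketch of that request against the uci_housing demo service. Port 9292, the `/uci/prediction` route, and the feed/fetch names are assumptions taken from the quick-start example, not from this commit:

```
# Hedged example: send the HTTP POST described above to the demo service.
# Port 9292, the /uci/prediction route and the feed/fetch names are taken
# from the uci_housing quick-start, not from this diff.
curl -H "Content-Type:application/json" -X POST \
  -d '{"feed":[{"x":[0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0501, -0.2015, -0.0832, 0.0320, 0.0208, 0.0792, -0.0469, -0.0795]}], "fetch":["price"]}' \
  http://127.0.0.1:9292/uci/prediction
```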
...
@@ -124,6 +124,7 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
 | `mem_optim_off` | - | - | Disable memory optimization |
 | `ir_optim` | - | - | Enable analysis and optimization of calculation graph |
 | `use_mkl` (Only for cpu version) | - | - | Run inference with MKL |
+| `use_trt` (Only for trt version) | - | - | Run inference with TensorRT |

 We use the `curl` command to send an HTTP POST request to the service we just started. Users can also send HTTP POST requests from any Python library; see the [requests](https://requests.readthedocs.io/en/master/) documentation.
 </center>
...
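The `use_trt` row added to both tables is a launch flag for the serve entry point. A hedged sketch of turning it on: the module name `paddle_serving_server_gpu`, the model directory, and the port are assumptions (the CPU command in the hunk headers suggests the pattern); only `--use_trt` itself is documented by this commit:

```
# Hedged example: start the HTTP service with TensorRT inference enabled.
# Module name, model directory and port are assumptions; --use_trt is the
# flag documented in the tables above.
python -m paddle_serving_server_gpu.serve --model uci_housing_model --port 9292 --use_trt
```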
...
@@ -13,7 +13,9 @@ set_source_files_properties(
     PROPERTIES
     COMPILE_FLAGS "-Wno-strict-aliasing -Wno-unused-variable -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
 add_dependencies(pdserving protobuf boost brpc leveldb pdcodegen configure)
+if (WITH_TRT)
+    add_definitions(-DWITH_TRT)
+endif()
 target_link_libraries(pdserving
                       brpc protobuf boost leveldb configure -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)
...
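This `WITH_TRT` option defines the preprocessor macro that the `infer.h` change below keys on. A minimal sketch of a configure line that enables it, assuming the project's usual out-of-source CMake flow and omitting the CUDA/cuDNN/TensorRT path flags a real GPU build would also need:

```
# Hedged example: enable the WITH_TRT branch at configure time.
# Other flags required for a GPU/TensorRT build are omitted here.
cmake -DWITH_TRT=ON ..
make -j
```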
...
@@ -563,10 +563,12 @@ class CloneDBReloadableInferEngine
 };

 template <typename FluidFamilyCore>
-// class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore>
-// {
+#ifdef WITH_TRT
 class FluidInferEngine : public DBReloadableInferEngine<FluidFamilyCore> {
- public:
+#else
+class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
+#endif
+ public:  // NOLINT
   FluidInferEngine() {}
   ~FluidInferEngine() {}
...
@@ -622,7 +624,7 @@ class VersionedInferEngine : public InferEngine {
       LOG(ERROR) << "Failed initialize engine, type:" << engine_type;
       return -1;
     }
-    VLOG(2) << "FLGS_logtostderr " << FLAGS_logtostderr;
+    VLOG(2) << "FLAGS_logtostderr " << FLAGS_logtostderr;
     FLAGS_logtostderr = tmp;
 #else
     if (engine->proc_initialize(conf, version) != 0) {
...
...
@@ -18,6 +18,8 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server-0.0.0-py2-none-an
 https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py3-none-any.whl
 #cuda 10.0
 https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py3-none-any.whl
+#cuda 10.1 with TensorRT 6
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py3-none-any.whl
 ```
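Installing the new TensorRT build is a plain wheel install from the URL just added; a sketch, assuming a Python 3 environment matching the wheel tag:

```
# Hedged example: install the TensorRT (cuda 10.1) server wheel listed above.
pip3 install https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py3-none-any.whl
```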
 ### Python 2
 ```
...
@@ -25,6 +27,8 @@ https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-
 https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post9-py2-none-any.whl
 #cuda 10.0
 https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.post10-py2-none-any.whl
+#cuda 10.1 with TensorRT 6
+https://paddle-serving.bj.bcebos.com/whl/paddle_serving_server_gpu-0.0.0.trt-py2-none-any.whl
 ```
 ## Client
...
...
@@ -198,7 +198,7 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
       analysis_config.EnableMemoryOptim();
     }

-#if 0
+#if 0  // todo: support flexible shape
     int min_seq_len = 1;
     int max_seq_len = 512;
...
@@ -238,7 +238,7 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
     analysis_config.SetTRTDynamicShapeInfo(
         min_input_shape, max_input_shape, opt_input_shape);
 #endif
-    int max_batch = 256;
+    int max_batch = 32;
     int min_subgraph_size = 3;
     if (params.use_trt()) {
       analysis_config.EnableTensorRtEngine(
...
@@ -246,8 +246,8 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
         max_batch,
         min_subgraph_size,
         paddle::AnalysisConfig::Precision::kFloat32,
-        true,
-        true);
+        false,
+        false);
     LOG(INFO) << "create TensorRT predictor";
   } else {
     if (params.enable_memory_optimization()) {
...
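One hedged reading of the last hunk: in the Paddle `AnalysisConfig::EnableTensorRtEngine` API of this generation, the parameters following the workspace size (elided above) are max_batch_size, min_subgraph_size, precision, use_static, and use_calib_mode. If that signature holds here, the commit caps the TensorRT batch size at 32 and turns off engine serialization and INT8 calibration. That interpretation is drawn from the public Paddle API, not from anything stated in the commit itself.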
...
@@ -403,7 +403,10 @@ class Server(object):
             for line in version_file.readlines():
                 if re.match("cuda_version", line):
                     cuda_version = line.split("\"")[1]
-                    device_version = "serving-gpu-cuda" + cuda_version + "-"
+                    if cuda_version != "trt":
+                        device_version = "serving-gpu-cuda" + cuda_version + "-"
+                    else:
+                        device_version = "serving-gpu-" + cuda_version + "-"
         folder_name = device_version + serving_server_version
         tar_name = folder_name + ".tar.gz"
...
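A quick trace of the new branch: when the version file records `cuda_version` as "trt", `device_version` becomes "serving-gpu-trt-", so with a hypothetical `serving_server_version` of 0.0.0 the server now resolves `serving-gpu-trt-0.0.0.tar.gz` rather than the malformed `serving-gpu-cudatrt-0.0.0.tar.gz` that the old line would have produced (the version number here is illustrative).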