Commit f30447d6 authored by: H HexToString

update doc and comment

Parent 8c834fba
......@@ -133,6 +133,8 @@ struct Task {
int element_num = 1;
if ((*inVectorT_ptr)[feedvar_index].shape.size() == 1) {
// because shape[0] is the batch_size.
// [10, 1] is equivalent to [10], so if shape[1] does not exist,
// we should return 1.
return 1;
}
// start from shape[1], because shape[0] is the batch_size.
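
As a quick illustration of the shape arithmetic described in the comment above, here is a minimal, self-contained sketch (the function name and the plain `std::vector<int>` shape are placeholders, not the actual Task members):

```cpp
#include <cstddef>
#include <vector>

// Count the elements of one sample: skip shape[0] (the batch_size) and
// multiply the remaining dimensions. A 1-D shape such as [10] has no
// per-sample dimensions, so the count falls back to 1.
int per_sample_element_num(const std::vector<int>& shape) {
  if (shape.size() <= 1) {
    return 1;
  }
  int element_num = 1;
  for (std::size_t i = 1; i < shape.size(); ++i) {
    element_num *= shape[i];
  }
  return element_num;
}
```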
......@@ -516,6 +518,13 @@ class BatchTasks {
};
// BSF task handle
// TaskHandler is the handle of a Task.
// `read_fd` is used to receive the signal in the brpc Thread.
// `write_fd` is used to write the signal in the bsf Thread.
// when a TaskMeta is done, the bsf Thread writes to `write_fd`.
// the brpc Thread keeps reading `read_fd` in a while loop,
// so it receives the signal as soon as the TaskMeta is done.
// in short, `read_fd` and `write_fd` are used to communicate between
// the two kinds of Threads.
template <typename TaskT>
struct TaskHandler {
int read_fd;
......@@ -538,9 +547,11 @@ struct TaskHandler {
}
};
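
The `read_fd` / `write_fd` pair described in the TaskHandler comment is the classic pipe-notification pattern between threads. A self-contained sketch of that pattern, using plain POSIX `pipe()` and `std::thread` rather than the real brpc/bsf types:

```cpp
#include <unistd.h>

#include <cstdio>
#include <thread>

int main() {
  int fds[2];  // fds[0] = read_fd, fds[1] = write_fd
  if (pipe(fds) != 0) {
    return 1;
  }

  // "bsf" worker thread: write one byte to write_fd once the work is done.
  std::thread worker([write_fd = fds[1]]() {
    char done = 1;
    (void)write(write_fd, &done, 1);
  });

  // "brpc" thread: block on read_fd until the completion byte arrives.
  char signal = 0;
  (void)read(fds[0], &signal, 1);
  std::printf("task finished, signal = %d\n", signal);

  worker.join();
  close(fds[0]);
  close(fds[1]);
  return 0;
}
```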
// TaskExecutor is a Thread pool.
template <typename TaskT>
class TaskExecutor;
// ThreadContext is used to start a bsf Thread.
template <typename TaskT>
struct ThreadContext {
TaskExecutor<TaskT>* executor;
......@@ -561,6 +572,15 @@ struct ThreadContext {
}
};
// TaskExecutor is a Thread pool.
// Each TaskExecutor corresponds to one Model.
// TaskT is actually a Request preprocessed by the ReaderOp.
// A TaskT is divided into TaskMetas, which are put into
// _task_queue by schedule() in the brpc Thread.
// A TaskHandler is returned to the brpc Thread.
// The start() function creates `thread_num` bsf Threads.
// Every bsf Thread checks _task_queue and takes TaskMetas from it.
// When all TaskMetas of a Task are done, the TaskHandler is notified.
template <typename TaskT>
class TaskExecutor {
public:
......@@ -595,12 +615,6 @@ class TaskExecutor {
TaskExecutor();
}
}
/*
static TaskExecutor<TaskT>* instance() {
static TaskExecutor<TaskT> singleton;
return &singleton;
}
*/
void set_batch_size(size_t batch_size) { _batch_size = batch_size; }
......@@ -661,6 +675,9 @@ class TaskExecutor {
boost::function<void(const void*, void*)> _fn;
};
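
A hedged sketch of the splitting step described in the TaskExecutor comment: a request of `batch_size` samples is cut into TaskMeta-like slices before being queued (the struct and field names below are illustrative, not the real TaskMeta layout):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Illustrative slice: [begin, end) indexes into the request's batch dimension.
struct TaskMetaSketch {
  std::size_t begin;
  std::size_t end;
};

// Divide a task of `batch_size` samples into slices of at most `max_batch`.
std::vector<TaskMetaSketch> divide_task(std::size_t batch_size,
                                        std::size_t max_batch) {
  std::vector<TaskMetaSketch> metas;
  for (std::size_t begin = 0; begin < batch_size; begin += max_batch) {
    metas.push_back({begin, std::min(begin + max_batch, batch_size)});
  }
  return metas;
}
// e.g. divide_task(10, 4) -> {[0, 4), [4, 8), [8, 10)}
```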
// TaskExecutorVector is a Singleton class.
// Each Model corresponds to one TaskExecutor,
// so we need several TaskExecutors when there is more than one Model.
template <typename TaskT>
class TaskExecutorVector {
public:
......@@ -689,6 +706,11 @@ class TaskExecutorVector {
std::vector<TaskExecutor<TaskT>> _vector_executor;
};
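
A minimal Meyers-singleton sketch of the idea behind TaskExecutorVector: one executor per Model, all held by a single process-wide vector. The real class handles initialization and resizing differently; the names here are illustrative only:

```cpp
#include <cstddef>
#include <vector>

struct ExecutorSketch { /* stands in for one per-model thread pool */ };

class ExecutorVectorSketch {
 public:
  static ExecutorVectorSketch& instance() {
    static ExecutorVectorSketch singleton;  // constructed once, process-wide
    return singleton;
  }
  void resize(std::size_t model_num) { _executors.resize(model_num); }
  ExecutorSketch& operator[](std::size_t model_index) {
    return _executors[model_index];
  }

 private:
  ExecutorVectorSketch() = default;  // no public construction
  std::vector<ExecutorSketch> _executors;
};
```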
// TaskManager is actually a wrapper of a Request in bsf.
// TaskManager's schedule() converts the Request into a TaskT
// and divides the TaskT into several TaskMetas, which are put into the
// TaskExecutor's task_queue.
// wait() loops until it receives the signal that the whole Task is done.
template <typename InItemT, typename OutItemT>
class TaskManager {
public:
......
......@@ -89,7 +89,7 @@ class InferEngine {
void* out,
uint32_t batch_size = -1) = 0;
virtual int task_infer_impl(const void* in, void* out) = 0; // NOLINT
protected:
uint32_t _model_index;
// end: framework inner call
......@@ -260,17 +260,27 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
}
int thrd_clear_impl() {
// for bsf-Task-threads
// actually, there are 2 kinds of multi-threading here:
// 1. brpc threads  2. bsf Task threads
// each request is handled in one single brpc thread.
// IF the bsf Task threads are NOT used:
// every single brpc thread corresponds to all the DBReloadableInferEngines.
// each request runs all models in its own brpc thread.
// every single brpc thread creates or clones N predictors,
// where N = the number of Models.
// so if there are 2 models and --thread 10,
// each brpc thread creates a predictor of Model-1 and one of Model-2;
// there are 10 predictors of Model-1 and 10 predictors of Model-2 in total,
// because there are 10 brpc threads.
// IF the bsf Task threads ARE used:
// there is a ThreadPool called the bsf TaskExecutor.
// in a TaskExecutor, 1 bsf thread corresponds to 1 EngineCore.
// TaskExecutorVector is the vector of TaskExecutors;
// the number of TaskExecutors equals the number of Models,
// i.e. 1 TaskExecutor corresponds to 1 Model.
// 1 TaskExecutor has N bsf threads, and 1 bsf thread corresponds to
// 1 predictor of the Model that the TaskExecutor belongs to.
// the brpc thread only puts the data into the task_queue (which lives in
// the TaskExecutor); EngineCore->infer() runs in the bsf Task threads.
......@@ -335,8 +345,8 @@ class CloneDBReloadableInferEngine
gpu_ids_num);
}
// gpu_index will be set to 0 when load() or proc_initial() is called.
// gpu_index < gpu_ids_num means there are predictors still not created
// on some GPU card, so we need to create the predictor.
// gpu_index >= gpu_ids_num means each GPU card has already created one,
// so we need to clone the predictor.
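
A hedged sketch of the create-or-clone decision the comment describes (the function and the printf placeholders are illustrative, not the real engine calls):

```cpp
#include <cstdio>

// gpu_index is reset to 0 on load()/proc_initial(); gpu_ids_num is the
// number of GPU cards configured for this engine. The two printf calls
// stand in for the real create / clone operations.
void create_or_clone_sketch(int gpu_index, int gpu_ids_num) {
  if (gpu_index < gpu_ids_num) {
    // Some card still has no predictor: create a fresh one on it.
    std::printf("create predictor on card %d\n", gpu_index);
  } else {
    // Every card already owns one: clone from that card's template predictor.
    std::printf("clone predictor on card %d\n", gpu_index % gpu_ids_num);
  }
}
```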
......@@ -356,6 +366,10 @@ class CloneDBReloadableInferEngine
}
} else {
// gpu_id = -1 means we use the CPU, but the index should still be 0:
// _cloneTemplate[-1] would be an error.
// actually, when gpu_id = -1, there is only 1 predictor in
// _cloneTemplate, so the index should always be 0 in that case.
if (gpu_id == -1) gpu_id = 0;
if (!md->cores[next_idx] ||
md->cores[next_idx]->clone(_cloneTemplate[gpu_id]->get()) != 0) {
......
......@@ -13,7 +13,8 @@ tar xf faster_rcnn_hrnetv2p_w18_1x.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
```
This model supports TensorRT. If you want faster inference, please use `--use_trt`, but you also need to set the min/max/optimal shapes of the variable-length inputs for the TensorRT subgraph.
Please refer to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
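
The linked demo configures the minimum/maximum/optimal shapes of the variable-length inputs for the TensorRT subgraph through the Paddle Inference `Config`. A hedged sketch of that configuration (the input name `image` and the shape values are placeholders; where to hook this into your Serving deployment depends on your setup):

```cpp
#include <map>
#include <string>
#include <vector>

#include "paddle_inference_api.h"

void enable_trt_dynamic_shape(paddle_infer::Config* config) {
  // Turn on the TensorRT subgraph engine first.
  config->EnableTensorRtEngine(1 << 30 /* workspace_size */,
                               1 /* max_batch_size */,
                               3 /* min_subgraph_size */,
                               paddle_infer::PrecisionType::kFloat32,
                               false /* use_static */,
                               false /* use_calib_mode */);
  // Placeholder feed name and shapes -- replace them with your model's
  // real variable-length inputs.
  std::map<std::string, std::vector<int>> min_shape{{"image", {1, 3, 320, 320}}};
  std::map<std::string, std::vector<int>> max_shape{{"image", {1, 3, 1333, 1333}}};
  std::map<std::string, std::vector<int>> opt_shape{{"image", {1, 3, 800, 800}}};
  config->SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape);
}
```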
### Prediction
......
......@@ -13,7 +13,8 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
tar xf faster_rcnn_hrnetv2p_w18_1x.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
```
This model supports TensorRT. If you want faster inference, you can enable the `--use_trt` option, but you also need to set the min/max/optimal shapes of the variable-length inputs for the TensorRT subgraph.
Please refer to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
### Perform prediction
```
......
......@@ -13,7 +13,8 @@ tar xf faster_rcnn_r50_fpn_1x_coco.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
```
This model supports TensorRT. If you want faster inference, please use `--use_trt`, but you also need to set the min/max/optimal shapes of the variable-length inputs for the TensorRT subgraph.
Please refer to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
### Perform prediction
......
......@@ -13,7 +13,8 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
tar xf faster_rcnn_r50_fpn_1x_coco.tar
python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
```
This model supports TensorRT. If you want faster inference, you can enable the `--use_trt` option, but you also need to set the min/max/optimal shapes of the variable-length inputs for the TensorRT subgraph.
Please refer to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
### Perform prediction
```
......