From f30447d615a59a1a47945b4200eeb65689e0ec56 Mon Sep 17 00:00:00 2001
From: HexToString <506181616@qq.com>
Date: Fri, 2 Jul 2021 08:45:21 +0000
Subject: [PATCH] update doc and comment

---
 core/predictor/framework/bsf.h                 | 34 +++++++++++++++----
 core/predictor/framework/infer.h               | 32 ++++++++++++-----
 .../faster_rcnn_hrnetv2p_w18_1x/README.md      |  3 +-
 .../faster_rcnn_hrnetv2p_w18_1x/README_CN.md   |  3 +-
 .../faster_rcnn_r50_fpn_1x_coco/README.md      |  3 +-
 .../faster_rcnn_r50_fpn_1x_coco/README_CN.md   |  3 +-
 6 files changed, 59 insertions(+), 19 deletions(-)
 mode change 100755 => 100644 core/predictor/framework/infer.h

diff --git a/core/predictor/framework/bsf.h b/core/predictor/framework/bsf.h
index 642c54d3..75cce300 100644
--- a/core/predictor/framework/bsf.h
+++ b/core/predictor/framework/bsf.h
@@ -133,6 +133,8 @@ struct Task {
     int element_num = 1;
     if ((*inVectorT_ptr)[feedvar_index].shape.size() == 1) {
       // cause shape[0] is batch_size.
+      // e.g. shape [10, 1] is equivalent to [10], so when shape[1] does not
+      // exist we simply return 1.
       return 1;
     }
     // start from shape[1], cause shape[0] = batch_size.
@@ -516,6 +518,13 @@ class BatchTasks {
 };
 
 // BSF task handle
+// TaskHandler is the handle of a Task.
+// `read_fd` is used to receive the completion signal in the brpc thread.
+// `write_fd` is used to send that signal from the bsf thread.
+// when a TaskMeta is done, the bsf thread writes to `write_fd`.
+// the brpc thread keeps reading `read_fd` in a loop, so it is notified
+// as soon as the whole Task is finished.
+// in short, `read_fd`/`write_fd` form a pipe for cross-thread signalling.
 template <typename TaskT>
 struct TaskHandler {
   int read_fd;
@@ -538,9 +547,11 @@ struct TaskHandler {
   }
 };
 
+// TaskExecutor is a thread pool.
 template <typename TaskT>
 class TaskExecutor;
 
+// ThreadContext is used to start a bsf thread.
 template <typename TaskT>
 struct ThreadContext {
   TaskExecutor<TaskT>* executor;
@@ -561,6 +572,15 @@ struct ThreadContext {
   }
 };
 
+// TaskExecutor is a thread pool.
+// each TaskExecutor serves exactly one model.
+// TaskT is a Request that has been preprocessed by ReaderOp.
+// schedule() splits a TaskT into several TaskMeta and puts them into
+// _task_queue; this happens in the brpc thread, which gets a TaskHandler
+// back so that it can wait for completion.
+// start() creates `thread_num` bsf threads.
+// every bsf thread polls _task_queue and takes TaskMeta from it.
+// when all TaskMeta of a Task are done, the TaskHandler is notified.
 template <typename TaskT>
 class TaskExecutor {
  public:
@@ -595,12 +615,6 @@ class TaskExecutor {
       TaskExecutor();
     }
   }
-  /*
-  static TaskExecutor<TaskT>* instance() {
-    static TaskExecutor<TaskT> singleton;
-    return &singleton;
-  }
-  */
 
   void set_batch_size(size_t batch_size) { _batch_size = batch_size; }
 
@@ -661,6 +675,9 @@ class TaskExecutor {
   boost::function<void(const void*, void*)> _fn;
 };
 
+// TaskExecutorVector is a singleton.
+// each model corresponds to one TaskExecutor, so several TaskExecutor
+// instances are needed when more than one model is served.
 template <typename TaskT>
 class TaskExecutorVector {
  public:
@@ -689,6 +706,11 @@ class TaskExecutorVector {
   std::vector<TaskExecutor<TaskT>> _vector_executor;
 };
 
+// TaskManager is a thin wrapper around a Request in bsf.
+// TaskManager::schedule() converts the Request into a TaskT, which is then
+// divided into several TaskMeta and pushed into the TaskExecutor's
+// task_queue.
+// wait() blocks (reading the pipe) until the whole Task is done.
 template <typename InItemT, typename OutItemT>
 class TaskManager {
  public:
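The TaskHandler comment above describes a pipe: the bsf worker thread writes to `write_fd` when its work is finished, and the brpc thread blocks on `read_fd` until that signal arrives. The standalone C++ sketch below only illustrates that signalling pattern; it is not the Paddle Serving implementation, and the `bsf_worker` name and single-byte payload are illustrative choices.

```cpp
// Minimal sketch of pipe-based completion signalling between two threads.
#include <unistd.h>

#include <cstdio>
#include <thread>

int main() {
  int fds[2];
  if (pipe(fds) != 0) return 1;  // fds[0] plays read_fd, fds[1] plays write_fd

  std::thread bsf_worker([write_fd = fds[1]]() {
    // ... the worker would run EngineCore->infer() on its TaskMeta here ...
    char done = 1;
    ssize_t n = write(write_fd, &done, sizeof(done));  // signal completion
    (void)n;
  });

  // The requesting thread blocks on the pipe until the worker writes to it,
  // which is what TaskHandler's read_fd is used for.
  char ack = 0;
  if (read(fds[0], &ack, sizeof(ack)) == 1) {
    std::printf("worker signalled completion: %d\n", ack);
  }

  bsf_worker.join();
  close(fds[0]);
  close(fds[1]);
  return 0;
}
```

In the real framework the worker writes once per finished TaskMeta, and the brpc thread keeps reading until every TaskMeta of its Task has been reported.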
diff --git a/core/predictor/framework/infer.h b/core/predictor/framework/infer.h
old mode 100755
new mode 100644
index 672e03b8..3cdef9dc
--- a/core/predictor/framework/infer.h
+++ b/core/predictor/framework/infer.h
@@ -89,7 +89,7 @@ class InferEngine {
                           void* out,
                           uint32_t batch_size = -1) = 0;
   virtual int task_infer_impl(const void* in, void* out) = 0;  // NOLINT
-
+
  protected:
   uint32_t _model_index;
   // end: framework inner call
@@ -260,17 +260,27 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
   }
 
   int thrd_clear_impl() {
-    // for bsf-Task-threads
     // actually, there are 2 kinds of multi-thread.
     // 1. brpc thread 2. bsf Task thread
     // each request is in 1-single brpc thread.
     // IF (bsf Task thread is not used)
-    // every single brpc thread thread corresponds to all the EngineCores.
-    // each request runs all models in 1-single thread brpc thread.
-
-    // IF (bsf Task thread is used)
+    // every single brpc thread corresponds to all the DBReloadableInferEngines.
+    // each request runs all models in 1 single brpc thread.
+    // every single brpc thread creates or clones N predictors,
+    // where N = the number of models.
+    // so with 2 models and --thread 10, each brpc thread creates one
+    // predictor of Model-1 and one of Model-2, giving 10 predictors of
+    // Model-1 and 10 predictors of Model-2 in total, because there are
+    // 10 brpc threads.
+
+    // IF (bsf Task thread is used)
     // there will be a ThreadPool called bsf TaskExecutor.
-    // in TaskExecutor, 1 bsf thread corresponds to 1 EngineCore.
+    // TaskExecutorVector is the vector of TaskExecutor.
+    // the number of TaskExecutors equals the number of models:
+    // 1 TaskExecutor corresponds to 1 model,
+    // 1 TaskExecutor owns N bsf threads, and
+    // 1 bsf thread corresponds to 1 predictor of
+    // the model served by that TaskExecutor.
     // brpc thread only put the data into the task_queue(which is in
     // TaskExecutor)
     // EngineCore->infer() is running in bsf Task thread.
@@ -335,8 +345,8 @@ class CloneDBReloadableInferEngine
                                                        gpu_ids_num);
     }
     // gpu_index will be set to be 0, when load() or proc_initial() is called.
-    // gpu_index < gpu_ids_num, means there are still not create on some GPU
-    // card.
+    // gpu_index < gpu_ids_num means some GPU cards still do not have a
+    // predictor created on them.
     // so we need to create the predictor.
     // gpu_index >= gpu_ids_num, means each GPU card has already create one.
     // so we need to clone the predictor.
@@ -356,6 +366,10 @@ class CloneDBReloadableInferEngine
       }
     } else {
       // when gpu_id = -1, means we use cpu, but the index should be 0.
+      // _cloneTemplate[-1] would cause an error.
+      // actually, when gpu_id = -1 there is only 1 predictor in
+      // _cloneTemplate,
+      // so the index should always be 0 when gpu_id = -1.
       if (gpu_id == -1) gpu_id = 0;
       if (!md->cores[next_idx] ||
           md->cores[next_idx]->clone(_cloneTemplate[gpu_id]->get()) != 0) {
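The CloneDBReloadableInferEngine comments above describe a create-then-clone policy: the first predictor on each GPU card is created (the model is loaded), and every later predictor for that card is cloned from it. The sketch below illustrates that policy under simplified assumptions; `Predictor`, `create_predictor()` and `clone_template` are placeholders, not the real Paddle Serving or Paddle Inference types.

```cpp
// Hedged sketch of the "create the first predictor per GPU, then clone" idea.
#include <cstdio>
#include <memory>
#include <vector>

struct Predictor {
  // cloning reuses the already-loaded model, so it is much cheaper than creating
  std::shared_ptr<Predictor> clone() const { return std::make_shared<Predictor>(); }
};

std::shared_ptr<Predictor> create_predictor(int gpu_id) {
  std::printf("loading model on GPU %d\n", gpu_id);  // the expensive path
  return std::make_shared<Predictor>();
}

int main() {
  const std::vector<int> gpu_ids = {0, 1};  // e.g. --gpu_ids 0,1
  const int thread_num = 6;                 // worker threads for this model
  std::vector<std::shared_ptr<Predictor>> clone_template(gpu_ids.size());
  std::vector<std::shared_ptr<Predictor>> cores;

  for (int gpu_index = 0; gpu_index < thread_num; ++gpu_index) {
    const size_t slot = gpu_index % gpu_ids.size();  // round-robin over cards
    if (gpu_index < static_cast<int>(gpu_ids.size())) {
      // this card has no predictor yet: create (load) it
      clone_template[slot] = create_predictor(gpu_ids[slot]);
      cores.push_back(clone_template[slot]);
    } else {
      // this card already has one: clone the template instead of reloading
      cores.push_back(clone_template[slot]->clone());
    }
  }
  std::printf("%zu cores created for %zu GPU cards\n", cores.size(), gpu_ids.size());
  return 0;
}
```

Cloning avoids loading the model again, so only as many expensive loads happen as there are GPU cards, no matter how many worker threads are configured.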
diff --git a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md
index f25bd277..ff4eb101 100644
--- a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md
+++ b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md
@@ -13,7 +13,8 @@
 tar xf faster_rcnn_hrnetv2p_w18_1x.tar
 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
-This model support TensorRT, if you want a faster inference, please use `--use_trt`.
+This model supports TensorRT. If you want faster inference, please add `--use_trt`, but some extra work is needed: the min/max/optimal shapes of the TensorRT subgraph inputs must be set.
+Please refer to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
 
 ### Prediction

diff --git a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md
index 2c9048e1..4bd51128 100644
--- a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md
+++ b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md
@@ -13,7 +13,8 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 tar xf faster_rcnn_hrnetv2p_w18_1x.tar
 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
-This model supports TensorRT; for faster inference you can enable the `--use_trt` option.
+This model supports TensorRT; for faster inference you can enable the `--use_trt` option, but you then also need to set the min/max/optimal shapes of the variable-length TensorRT subgraph inputs.
+Please refer to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
 
 ### Perform prediction
 ```

diff --git a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md
index a755b33c..1fb0dfee 100644
--- a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md
+++ b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md
@@ -13,7 +13,8 @@
 tar xf faster_rcnn_r50_fpn_1x_coco.tar
 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
-This model support TensorRT, if you want a faster inference, please use `--use_trt`.
+This model supports TensorRT. If you want faster inference, please add `--use_trt`, but some extra work is needed: the min/max/optimal shapes of the TensorRT subgraph inputs must be set.
+Please refer to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
 
 ### Perform prediction

diff --git a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md
index 47f0aca1..7617df7a 100644
--- a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md
+++ b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md
@@ -13,7 +13,8 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 tar xf faster_rcnn_r50_fpn_1x_coco.tar
 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
-This model supports TensorRT; for faster inference you can enable the `--use_trt` option.
+This model supports TensorRT; for faster inference you can enable the `--use_trt` option, but you then also need to set the min/max/optimal shapes of the variable-length TensorRT subgraph inputs.
+Please refer to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
 
 ### Perform prediction
 ```
--
GitLab
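For the `--use_trt` notes above: the linked trt_dynamic_shape_test.cc sets the minimum, maximum and optimal shapes of the TensorRT subgraph inputs through the native Paddle Inference C++ API. The sketch below shows roughly what that looks like; the model paths, the input name "image" and the concrete shape values are illustrative assumptions, and the exact calls should be verified against the linked demo for your Paddle version.

```cpp
// Hedged sketch of TensorRT dynamic-shape configuration with Paddle Inference.
#include <map>
#include <string>
#include <vector>

#include "paddle_inference_api.h"  // paddle_infer::Config / CreatePredictor

int main() {
  paddle_infer::Config config;
  config.SetModel("serving_server/__model__", "serving_server/__params__");
  config.EnableUseGpu(2000 /* MB */, 0 /* GPU id */);
  config.EnableTensorRtEngine(1 << 30 /* workspace */, 1 /* max batch */,
                              5 /* min subgraph size */,
                              paddle_infer::PrecisionType::kFloat32,
                              false /* use_static */, false /* calib */);

  // TensorRT needs min/max/optimal shapes for every variable-shaped input.
  std::map<std::string, std::vector<int>> min_shape = {{"image", {1, 3, 320, 320}}};
  std::map<std::string, std::vector<int>> max_shape = {{"image", {1, 3, 1333, 1333}}};
  std::map<std::string, std::vector<int>> opt_shape = {{"image", {1, 3, 800, 1333}}};
  config.SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape);

  auto predictor = paddle_infer::CreatePredictor(config);
  return predictor ? 0 : 1;
}
```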