From f30447d615a59a1a47945b4200eeb65689e0ec56 Mon Sep 17 00:00:00 2001
From: HexToString <506181616@qq.com>
Date: Fri, 2 Jul 2021 08:45:21 +0000
Subject: [PATCH] update doc and comment

---
 core/predictor/framework/bsf.h                 | 34 +++++++++++++++----
 core/predictor/framework/infer.h               | 32 ++++++++++++-----
 .../faster_rcnn_hrnetv2p_w18_1x/README.md      |  3 +-
 .../faster_rcnn_hrnetv2p_w18_1x/README_CN.md   |  3 +-
 .../faster_rcnn_r50_fpn_1x_coco/README.md      |  3 +-
 .../faster_rcnn_r50_fpn_1x_coco/README_CN.md   |  3 +-
 6 files changed, 59 insertions(+), 19 deletions(-)
 mode change 100755 => 100644 core/predictor/framework/infer.h

diff --git a/core/predictor/framework/bsf.h b/core/predictor/framework/bsf.h
index 642c54d3..75cce300 100644
--- a/core/predictor/framework/bsf.h
+++ b/core/predictor/framework/bsf.h
@@ -133,6 +133,8 @@ struct Task {
     int element_num = 1;
     if ((*inVectorT_ptr)[feedvar_index].shape.size() == 1) {
       // cause shape[0] is batch_size.
+      // e.g. shape [10, 1] is equivalent to [10], so when shape[1] does not
+      // exist we simply return 1.
       return 1;
     }
     // start from shape[1], cause shape[0] = batch_size.
@@ -516,6 +518,13 @@ class BatchTasks {
 };
 
 // BSF task handle
+// TaskHandler is the handle of a Task.
+// `read_fd` is used to receive the completion signal in the brpc thread.
+// `write_fd` is used to send that signal from the bsf thread.
+// when a TaskMeta is done, the bsf thread writes to `write_fd`.
+// the brpc thread keeps reading `read_fd` in a loop, so it is notified
+// as soon as the whole Task is finished.
+// in short, `read_fd`/`write_fd` form a pipe for cross-thread signalling.
 template <typename TaskT>
 struct TaskHandler {
   int read_fd;
@@ -538,9 +547,11 @@ struct TaskHandler {
   }
 };
 
+// TaskExecutor is a thread pool.
 template <typename TaskT>
 class TaskExecutor;
 
+// ThreadContext is used to start a bsf thread.
 template <typename TaskT>
 struct ThreadContext {
   TaskExecutor<TaskT>* executor;
@@ -561,6 +572,15 @@ struct ThreadContext {
   }
 };
 
+// TaskExecutor is a thread pool.
+// each TaskExecutor serves exactly one model.
+// TaskT is a Request that has been preprocessed by ReaderOp.
+// schedule() splits a TaskT into several TaskMeta and puts them into
+// _task_queue; this happens in the brpc thread, which gets a TaskHandler
+// back so that it can wait for completion.
+// start() creates `thread_num` bsf threads.
+// every bsf thread polls _task_queue and takes TaskMeta from it.
+// when all TaskMeta of a Task are done, the TaskHandler is notified.
 template <typename TaskT>
 class TaskExecutor {
  public:
@@ -595,12 +615,6 @@ class TaskExecutor {
       TaskExecutor();
     }
   }
-  /*
-  static TaskExecutor<TaskT>* instance() {
-    static TaskExecutor<TaskT> singleton;
-    return &singleton;
-  }
-  */
 
   void set_batch_size(size_t batch_size) { _batch_size = batch_size; }
 
@@ -661,6 +675,9 @@ class TaskExecutor {
   boost::function<void(const void*, void*)> _fn;
 };
 
+// TaskExecutorVector is a singleton.
+// each model corresponds to one TaskExecutor, so several TaskExecutor
+// instances are needed when more than one model is served.
 template <typename TaskT>
 class TaskExecutorVector {
  public:
@@ -689,6 +706,11 @@ class TaskExecutorVector {
   std::vector<TaskExecutor<TaskT>> _vector_executor;
 };
 
+// TaskManager is a thin wrapper around a Request in bsf.
+// TaskManager::schedule() converts the Request into a TaskT, which is then
+// divided into several TaskMeta and pushed into the TaskExecutor's
+// task_queue.
+// wait() blocks (reading the pipe) until the whole Task is done.
 template <typename InItemT, typename OutItemT>
 class TaskManager {
  public:
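The TaskHandler comment above describes a pipe: the bsf worker thread writes to `write_fd` when its work is finished, and the brpc thread blocks on `read_fd` until that signal arrives. The standalone C++ sketch below only illustrates that signalling pattern; it is not the Paddle Serving implementation, and the `bsf_worker` name and single-byte payload are illustrative choices.

```cpp
// Minimal sketch of pipe-based completion signalling between two threads.
#include <unistd.h>

#include <cstdio>
#include <thread>

int main() {
  int fds[2];
  if (pipe(fds) != 0) return 1;  // fds[0] plays read_fd, fds[1] plays write_fd

  std::thread bsf_worker([write_fd = fds[1]]() {
    // ... the worker would run EngineCore->infer() on its TaskMeta here ...
    char done = 1;
    ssize_t n = write(write_fd, &done, sizeof(done));  // signal completion
    (void)n;
  });

  // The requesting thread blocks on the pipe until the worker writes to it,
  // which is what TaskHandler's read_fd is used for.
  char ack = 0;
  if (read(fds[0], &ack, sizeof(ack)) == 1) {
    std::printf("worker signalled completion: %d\n", ack);
  }

  bsf_worker.join();
  close(fds[0]);
  close(fds[1]);
  return 0;
}
```

In the real framework the worker writes once per finished TaskMeta, and the brpc thread keeps reading until every TaskMeta of its Task has been reported.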
diff --git a/core/predictor/framework/infer.h b/core/predictor/framework/infer.h
old mode 100755
new mode 100644
index 672e03b8..3cdef9dc
--- a/core/predictor/framework/infer.h
+++ b/core/predictor/framework/infer.h
@@ -89,7 +89,7 @@ class InferEngine {
                           void* out,
                           uint32_t batch_size = -1) = 0;
   virtual int task_infer_impl(const void* in, void* out) = 0;  // NOLINT
-
+
  protected:
   uint32_t _model_index;
   // end: framework inner call
@@ -260,17 +260,27 @@ class DBReloadableInferEngine : public ReloadableInferEngine {
   }
 
   int thrd_clear_impl() {
-    // for bsf-Task-threads
     // actually, there are 2 kinds of multi-thread.
     // 1. brpc thread 2. bsf Task thread
     // each request is in 1-single brpc thread.
     // IF (bsf Task thread is not used)
-    // every single brpc thread thread corresponds to all the EngineCores.
-    // each request runs all models in 1-single thread brpc thread.
-
-    // IF (bsf Task thread is used)
+    // every single brpc thread corresponds to all the DBReloadableInferEngines.
+    // each request runs all models in 1 single brpc thread.
+    // every single brpc thread creates or clones N predictors,
+    // where N = the number of models.
+    // so with 2 models and --thread 10, each brpc thread creates one
+    // predictor of Model-1 and one of Model-2, giving 10 predictors of
+    // Model-1 and 10 predictors of Model-2 in total, because there are
+    // 10 brpc threads.
+
+    // IF (bsf Task thread is used)
     // there will be a ThreadPool called bsf TaskExecutor.
-    // in TaskExecutor, 1 bsf thread corresponds to 1 EngineCore.
+    // TaskExecutorVector is the vector of TaskExecutor.
+    // the number of TaskExecutors equals the number of models:
+    // 1 TaskExecutor corresponds to 1 model,
+    // 1 TaskExecutor owns N bsf threads, and
+    // 1 bsf thread corresponds to 1 predictor of
+    // the model served by that TaskExecutor.
     // brpc thread only put the data into the task_queue(which is in
     // TaskExecutor)
     // EngineCore->infer() is running in bsf Task thread.
@@ -335,8 +345,8 @@ class CloneDBReloadableInferEngine
                                                        gpu_ids_num);
     }
     // gpu_index will be set to be 0, when load() or proc_initial() is called.
-    // gpu_index < gpu_ids_num, means there are still not create on some GPU
-    // card.
+    // gpu_index < gpu_ids_num means some GPU cards still do not have a
+    // predictor created on them.
     // so we need to create the predictor.
     // gpu_index >= gpu_ids_num, means each GPU card has already create one.
     // so we need to clone the predictor.
@@ -356,6 +366,10 @@ class CloneDBReloadableInferEngine
       }
     } else {
       // when gpu_id = -1, means we use cpu, but the index should be 0.
+      // _cloneTemplate[-1] would cause an error.
+      // actually, when gpu_id = -1 there is only 1 predictor in
+      // _cloneTemplate,
+      // so the index should always be 0 when gpu_id = -1.
       if (gpu_id == -1) gpu_id = 0;
       if (!md->cores[next_idx] ||
           md->cores[next_idx]->clone(_cloneTemplate[gpu_id]->get()) != 0) {
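The CloneDBReloadableInferEngine comments above describe a create-then-clone policy: the first predictor on each GPU card is created (the model is loaded), and every later predictor for that card is cloned from it. The sketch below illustrates that policy under simplified assumptions; `Predictor`, `create_predictor()` and `clone_template` are placeholders, not the real Paddle Serving or Paddle Inference types.

```cpp
// Hedged sketch of the "create the first predictor per GPU, then clone" idea.
#include <cstdio>
#include <memory>
#include <vector>

struct Predictor {
  // cloning reuses the already-loaded model, so it is much cheaper than creating
  std::shared_ptr<Predictor> clone() const { return std::make_shared<Predictor>(); }
};

std::shared_ptr<Predictor> create_predictor(int gpu_id) {
  std::printf("loading model on GPU %d\n", gpu_id);  // the expensive path
  return std::make_shared<Predictor>();
}

int main() {
  const std::vector<int> gpu_ids = {0, 1};  // e.g. --gpu_ids 0,1
  const int thread_num = 6;                 // worker threads for this model
  std::vector<std::shared_ptr<Predictor>> clone_template(gpu_ids.size());
  std::vector<std::shared_ptr<Predictor>> cores;

  for (int gpu_index = 0; gpu_index < thread_num; ++gpu_index) {
    const size_t slot = gpu_index % gpu_ids.size();  // round-robin over cards
    if (gpu_index < static_cast<int>(gpu_ids.size())) {
      // this card has no predictor yet: create (load) it
      clone_template[slot] = create_predictor(gpu_ids[slot]);
      cores.push_back(clone_template[slot]);
    } else {
      // this card already has one: clone the template instead of reloading
      cores.push_back(clone_template[slot]->clone());
    }
  }
  std::printf("%zu cores created for %zu GPU cards\n", cores.size(), gpu_ids.size());
  return 0;
}
```

Cloning avoids loading the model again, so only as many expensive loads happen as there are GPU cards, no matter how many worker threads are configured.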
diff --git a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md
index f25bd277..ff4eb101 100644
--- a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md
+++ b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README.md
@@ -13,7 +13,8 @@
 tar xf faster_rcnn_hrnetv2p_w18_1x.tar
 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
-This model support TensorRT, if you want a faster inference, please use `--use_trt`.
+This model supports TensorRT. If you want faster inference, please add `--use_trt`, but some extra work is needed: the min/max/optimal shapes of the TensorRT subgraph inputs must be set.
+Please refer to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
 
 ### Prediction

diff --git a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md
index 2c9048e1..4bd51128 100644
--- a/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md
+++ b/python/examples/detection/faster_rcnn_hrnetv2p_w18_1x/README_CN.md
@@ -13,7 +13,8 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 tar xf faster_rcnn_hrnetv2p_w18_1x.tar
 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
-This model supports TensorRT; for faster inference you can enable the `--use_trt` option.
+This model supports TensorRT; for faster inference you can enable the `--use_trt` option, but you then also need to set the min/max/optimal shapes of the variable-length TensorRT subgraph inputs.
+Please refer to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
 
 ### Perform prediction
 ```

diff --git a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md
index a755b33c..1fb0dfee 100644
--- a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md
+++ b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README.md
@@ -13,7 +13,8 @@
 tar xf faster_rcnn_r50_fpn_1x_coco.tar
 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
-This model support TensorRT, if you want a faster inference, please use `--use_trt`.
+This model supports TensorRT. If you want faster inference, please add `--use_trt`, but some extra work is needed: the min/max/optimal shapes of the TensorRT subgraph inputs must be set.
+Please refer to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
 
 ### Perform prediction

diff --git a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md
index 47f0aca1..7617df7a 100644
--- a/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md
+++ b/python/examples/detection/faster_rcnn_r50_fpn_1x_coco/README_CN.md
@@ -13,7 +13,8 @@ wget --no-check-certificate https://paddle-serving.bj.bcebos.com/pddet_demo/2.0/
 tar xf faster_rcnn_r50_fpn_1x_coco.tar
 python -m paddle_serving_server.serve --model serving_server --port 9494 --gpu_ids 0
 ```
-This model supports TensorRT; for faster inference you can enable the `--use_trt` option.
+This model supports TensorRT; for faster inference you can enable the `--use_trt` option, but you then also need to set the min/max/optimal shapes of the variable-length TensorRT subgraph inputs.
+Please refer to https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/c%2B%2B/paddle-trt/trt_dynamic_shape_test.cc#L40
 
 ### Perform prediction
 ```
--
GitLab
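For the `--use_trt` notes above: the linked trt_dynamic_shape_test.cc sets the minimum, maximum and optimal shapes of the TensorRT subgraph inputs through the native Paddle Inference C++ API. The sketch below shows roughly what that looks like; the model paths, the input name "image" and the concrete shape values are illustrative assumptions, and the exact calls should be verified against the linked demo for your Paddle version.

```cpp
// Hedged sketch of TensorRT dynamic-shape configuration with Paddle Inference.
#include <map>
#include <string>
#include <vector>

#include "paddle_inference_api.h"  // paddle_infer::Config / CreatePredictor

int main() {
  paddle_infer::Config config;
  config.SetModel("serving_server/__model__", "serving_server/__params__");
  config.EnableUseGpu(2000 /* MB */, 0 /* GPU id */);
  config.EnableTensorRtEngine(1 << 30 /* workspace */, 1 /* max batch */,
                              5 /* min subgraph size */,
                              paddle_infer::PrecisionType::kFloat32,
                              false /* use_static */, false /* calib */);

  // TensorRT needs min/max/optimal shapes for every variable-shaped input.
  std::map<std::string, std::vector<int>> min_shape = {{"image", {1, 3, 320, 320}}};
  std::map<std::string, std::vector<int>> max_shape = {{"image", {1, 3, 1333, 1333}}};
  std::map<std::string, std::vector<int>> opt_shape = {{"image", {1, 3, 800, 1333}}};
  config.SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape);

  auto predictor = paddle_infer::CreatePredictor(config);
  return predictor ? 0 : 1;
}
```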