Commit 8c834fba authored by: H HexToString

add multi-TaskExecutor

Parent 391d9794
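This commit replaces the process-wide TaskExecutor&lt;TaskT&gt; singleton with a per-model vector of executors, so each served model gets its own batching queue and worker threads. A minimal sketch of the new access pattern (names are taken from the diff below; the surrounding setup code and `engine_num`/`model_index` values are assumed):

    // One executor per model: the vector is sized once at startup, then
    // each engine addresses its own executor by model index (request
    // scheduling itself goes through TaskManager, shown further below).
    im::bsf::TaskExecutorVector<TaskT>::instance().resize(engine_num);
    im::bsf::TaskExecutorVector<TaskT>::instance()[model_index]
        .set_batch_size(batch_size);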
......@@ -181,7 +181,10 @@ int GeneralReaderOp::inference() {
VLOG(2) << "(logid=" << log_id << ") tensor size for var[" << i
<< "]: " << data_len;
databuf_size = data_len * elem_size;
out->at(i).data.Resize(databuf_size);
void *databuf_char = MempoolWrapper::instance().malloc(databuf_size);
paddle::PaddleBuf paddleBuf(databuf_char, databuf_size);
out->at(i).data = paddleBuf;
// out->at(i).data.Resize(databuf_size);
if (out->at(i).lod.size() > 0) {
VLOG(2) << "(logid=" << log_id << ") var[" << i
<< "] has lod_tensor and len=" << out->at(i).lod[0].back();
......
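The reader op now draws tensor storage from the request-level memory pool instead of letting each PaddleBuf allocate on the heap. The key point is that the two-argument PaddleBuf constructor wraps externally managed memory without taking ownership, so the pool can reclaim everything at once when the request ends. A hedged sketch of the pattern (error handling added here; MempoolWrapper semantics assumed from the diff):

    // Pool-backed tensor buffer: the mempool owns the bytes, the
    // PaddleBuf only references them, so there is no per-tensor heap
    // allocation/free round-trip on the request path.
    void* buf = MempoolWrapper::instance().malloc(databuf_size);
    if (buf != nullptr) {
      paddle::PaddleBuf pbuf(buf, databuf_size);  // non-owning wrapper
      out->at(i).data = pbuf;
    }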
......@@ -36,7 +36,7 @@ void* TaskExecutor<TaskT>::thread_entry(void* args) {
static_cast<TaskExecutor<TaskT>*>(context->executor);
executor->work(context);
return NULL;
return nullptr;
}
template <typename TaskT>
......@@ -256,7 +256,8 @@ int TaskExecutor<TaskT>::work(ThreadContext<TaskT>* context) {
template <typename InItemT, typename OutItemT>
bool TaskManager<InItemT, OutItemT>::schedule(const void* in,
void* out) { // NOLINT
TaskHandler<TaskT> handler = _executor.schedule(in, out);
TaskHandler<TaskT> handler =
TaskExecutorVector<TaskT>::instance()[_model_index].schedule(in, out);
if (handler.valid()) {
_task_owned = handler;
......
......@@ -16,7 +16,7 @@
#include <errno.h>
#include <algorithm>
#include <deque>
#include <list>
#include <vector>
#ifdef BCLOUD
......@@ -220,7 +220,8 @@ struct TaskMeta {
// each TaskT already includes a batch in itself
// BatchTasks needs to combine several `small TaskMeta`s into a new `big TaskT`.
// The only difference between the `big TaskT` and `small TaskT` is that
// the TaskT.inVectorT_ptr->[feedvar_index].shape[0] is different.
// the TaskT.inVectorT_ptr->[feedvar_index].shape[0]
// (which is actually the batch_size) is different.
template <typename TaskT>
class BatchTasks {
public:
......@@ -540,9 +541,6 @@ struct TaskHandler {
template <typename TaskT>
class TaskExecutor;
template <typename InItemT, typename OutItemT>
class TaskManager;
template <typename TaskT>
struct ThreadContext {
TaskExecutor<TaskT>* executor;
......@@ -591,10 +589,18 @@ class TaskExecutor {
THREAD_COND_DESTROY(&_cond);
}
static TaskExecutor<TaskT>* instance() {
static TaskExecutor<TaskT> singleton;
return &singleton;
// because vector::resize uses the copy or move constructor.
TaskExecutor(TaskExecutor<TaskT>&& other) noexcept : TaskExecutor() {
// Delegate to the default constructor: a moved-in executor is simply
// re-initialized from scratch; `other` itself is never read.
(void)other;
}
/*
static TaskExecutor<TaskT>* instance() {
static TaskExecutor<TaskT> singleton;
return &singleton;
}
*/
void set_batch_size(size_t batch_size) { _batch_size = batch_size; }
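The move constructor above exists because TaskExecutorVector stores executors by value: std::vector::resize value-initializes the new elements and, when the buffer grows, relocates existing ones via the move constructor as long as it is noexcept (move_if_noexcept). A minimal illustration of the requirement (`engine_num` and the batch size are placeholders):

    std::vector<TaskExecutor<TaskT>> executors;
    executors.resize(engine_num);   // needs default ctor + noexcept move ctor
    executors[0].set_batch_size(32);  // elements remain usable after resize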
......@@ -619,30 +625,35 @@ class TaskExecutor {
static void* thread_entry(void* args);
private:
TaskExecutor(TaskExecutor<TaskT> const& other);
TaskExecutor* operator=(TaskExecutor<TaskT> const& other);
int work(ThreadContext<TaskT>* context);
TaskHandler<TaskT> schedule(const void*, void*);
bool move_task_to_batch(BatchTasks<TaskT>& batch); // NOLINT
private:
TaskExecutor(TaskExecutor<TaskT> const& other) = delete;
TaskExecutor& operator=(TaskExecutor<TaskT> const& other) = delete;
/*
TaskExecutor(TaskExecutor<TaskT> && other) = delete;
TaskExecutor& operator=(TaskExecutor<TaskT> && other) = delete;
*/
bool _stop;
// can't use boost::mutex because of a conflicting macro
THREAD_MUTEX_T _mut;
THREAD_COND_T _cond;
std::deque<TaskT*> _task_queue;
std::list<TaskT*> _task_queue;
boost::function<int(void*)> _thread_init_fn;
boost::function<int(void*)> _thread_reset_fn;
void** _user_thread_contexts;
std::vector<ThreadContext<TaskT>*> _thread_contexts;
friend class TaskManager<InType, OutType>;
size_t _batch_size;
bool _batch_align;
......@@ -650,6 +661,34 @@ class TaskExecutor {
boost::function<void(const void*, void*)> _fn;
};
template <typename TaskT>
class TaskExecutorVector {
public:
static TaskExecutorVector<TaskT>& instance() {
static TaskExecutorVector<TaskT> singleton;
return singleton;
}
void resize(int size) { _vector_executor.resize(size); }
TaskExecutor<TaskT>& operator[](int index) {
if (index < 0 || static_cast<size_t>(index) >= _vector_executor.size()) {
LOG(ERROR) << "TaskExecutorVector index " << index << " is out of range";
throw "TaskExecutorVector index out of range";
}
return _vector_executor[index];
}
private:
TaskExecutorVector() = default;
TaskExecutorVector(const TaskExecutorVector<TaskT>& other) = delete;
TaskExecutorVector& operator=(const TaskExecutorVector<TaskT>& other) =
delete;
TaskExecutorVector(TaskExecutorVector<TaskT>&& other) = delete;
TaskExecutorVector& operator=(TaskExecutorVector<TaskT>&& other) = delete;
std::vector<TaskExecutor<TaskT>> _vector_executor;
};
template <typename InItemT, typename OutItemT>
class TaskManager {
public:
......@@ -657,10 +696,8 @@ class TaskManager {
typedef typename TaskT::InVectorT InVectorT;
typedef typename TaskT::OutVectorT OutVectorT;
explicit TaskManager(TaskExecutor<TaskT>& exe, size_t batch_size) // NOLINT
: _executor(exe) {}
TaskManager() : _executor(*TaskExecutor<TaskT>::instance()) {}
explicit TaskManager(uint32_t index) // NOLINT
: _model_index(index) {}
~TaskManager() { wait(); }
......@@ -670,8 +707,8 @@ class TaskManager {
inline void clear() { wait(); }
private:
TaskExecutor<TaskT>& _executor;
TaskHandler<TaskT> _task_owned;
uint32_t _model_index;
}; // class TaskManager
class AutoMutex {
......
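Putting the bsf.h pieces together: a TaskManager now carries only its model index and resolves the right executor at schedule time. A rough sketch of the per-request path, assuming the executor vector was resized during startup and `model_index` is valid:

    // Per-request scheduling: the manager looks up its executor in the
    // singleton vector, so concurrent models never share a task queue.
    im::bsf::TaskManager<paddle::PaddleTensor, paddle::PaddleTensor>
        task_manager(model_index);
    task_manager.schedule(in, out);  // enqueues into executor[model_index]
    task_manager.wait();             // blocks until the batched task completes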
......@@ -56,15 +56,23 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf,
}
// init bsf framework
im::bsf::TaskExecutor<TaskT>::instance()->set_thread_init_fn(
boost::bind(&InferEngine::thrd_initialize_impl, this));
im::bsf::TaskExecutor<TaskT>::instance()->set_thread_reset_fn(
boost::bind(&InferEngine::thrd_clear_impl, this));
im::bsf::TaskExecutor<TaskT>::instance()->set_thread_callback_fn(
boost::bind(&InferEngine::task_infer_impl, this, _1, _2));
im::bsf::TaskExecutor<TaskT>::instance()->set_batch_size(_infer_batch_size);
im::bsf::TaskExecutor<TaskT>::instance()->set_batch_align(_infer_batch_align);
if (im::bsf::TaskExecutor<TaskT>::instance()->start(_infer_thread_num) != 0) {
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index]
.set_thread_init_fn(
boost::bind(&InferEngine::thrd_initialize_impl, this));
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index]
.set_thread_reset_fn(boost::bind(&InferEngine::thrd_clear_impl, this));
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index]
.set_thread_callback_fn(
boost::bind(&InferEngine::task_infer_impl, this, _1, _2));
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].set_batch_size(
_infer_batch_size);
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].set_batch_align(
_infer_batch_align);
if (im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].start(
_infer_thread_num) != 0) {
LOG(ERROR) << "Failed start bsf executor, threads:" << _infer_thread_num;
return -1;
}
......@@ -75,6 +83,11 @@ int ReloadableInferEngine::proc_initialize(const configure::EngineDesc& conf,
return 0;
}
// Multiple threads may enter this method of the same object.
// One model corresponds to one ReloadableInferEngine object,
// which is a process-level object. One ReloadableInferEngine object
// can own several ModelData<EngineCore> objects, each of which is a
// thread-level object.
int ReloadableInferEngine::infer(const void* in,
void* out,
uint32_t batch_size) {
......@@ -82,7 +95,8 @@ int ReloadableInferEngine::infer(const void* in,
return infer_impl(in, out, batch_size);
}
im::bsf::TaskManager<paddle::PaddleTensor, paddle::PaddleTensor> task_manager;
im::bsf::TaskManager<paddle::PaddleTensor, paddle::PaddleTensor> task_manager(
_model_index);
task_manager.schedule(in, out);
task_manager.wait();
......@@ -110,7 +124,7 @@ int ReloadableInferEngine::proc_finalize() {
}
if (_infer_thread_num > 0) {
im::bsf::TaskExecutor<TaskT>::instance()->stop();
im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index].stop();
}
return 0;
}
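Condensing the three hunks above, the per-model executor lifecycle inside ReloadableInferEngine is roughly the following (a sketch: the repeated instance()[...] lookups are folded into one reference and error handling is abbreviated):

    auto& exec = im::bsf::TaskExecutorVector<TaskT>::instance()[_model_index];
    // proc_initialize: wire this engine's callbacks, then start workers.
    exec.set_thread_init_fn(
        boost::bind(&InferEngine::thrd_initialize_impl, this));
    exec.set_thread_reset_fn(boost::bind(&InferEngine::thrd_clear_impl, this));
    exec.set_thread_callback_fn(
        boost::bind(&InferEngine::task_infer_impl, this, _1, _2));
    exec.set_batch_size(_infer_batch_size);
    exec.set_batch_align(_infer_batch_align);
    if (exec.start(_infer_thread_num) != 0) return -1;
    // ... per request: TaskManager(_model_index) schedules and waits ...
    exec.stop();  // proc_finalize: stop the workers for this model only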
......@@ -191,6 +205,7 @@ int VersionedInferEngine::proc_initialize(const configure::EngineDesc& conf,
std::string engine_type = conf.type();
InferEngine* engine =
StaticInferFactory::instance().generate_object(engine_type);
if (engine) engine->set_model_index(_model_index);  // guard against null
if (!engine) {
LOG(ERROR) << "Failed generate engine with type:" << engine_type;
return -1;
......@@ -373,12 +388,14 @@ int InferManager::proc_initialize(const char* path, const char* file) {
LOG(ERROR) << "failed load infer config, path: " << path << "/" << file;
return -1;
}
size_t engine_num = model_toolkit_conf.engines_size();
for (size_t ei = 0; ei < engine_num; ++ei) {
uint32_t engine_num = model_toolkit_conf.engines_size();
im::bsf::TaskExecutorVector<TaskT>::instance().resize(engine_num);
for (uint32_t ei = 0; ei < engine_num; ++ei) {
LOG(INFO) << "model_toolkit_conf.engines(" << ei
<< ").name: " << model_toolkit_conf.engines(ei).name();
std::string engine_name = model_toolkit_conf.engines(ei).name();
VersionedInferEngine* engine = new (std::nothrow) VersionedInferEngine();
if (engine) engine->set_model_index(ei);  // guard against null
if (!engine) {
LOG(ERROR) << "Failed generate versioned engine: " << engine_name;
return -1;
......
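One ordering constraint in InferManager::proc_initialize is worth spelling out: the executor vector must be resized before any engine's proc_initialize touches its slot, because operator[] rejects out-of-range indices. A sketch of the required startup order (`create_engine` is a hypothetical helper standing in for the factory code above):

    uint32_t engine_num = model_toolkit_conf.engines_size();
    im::bsf::TaskExecutorVector<TaskT>::instance().resize(engine_num);  // first
    for (uint32_t ei = 0; ei < engine_num; ++ei) {
      VersionedInferEngine* engine = create_engine(ei);  // hypothetical helper
      engine->set_model_index(ei);  // the recorded index selects executor[ei]
    }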
......@@ -17,11 +17,11 @@
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <functional>
#include <numeric>
#include <string>
#include <utility>
#include <vector>
#include <functional>
#include "core/predictor/common/inner_common.h"
#include "core/predictor/framework/bsf.h"
#include "core/predictor/framework/factory.h"
......@@ -73,7 +73,7 @@ class InferEngine {
virtual int infer(const void* in, void* out, uint32_t batch_size = -1) {
return infer_impl(in, out, batch_size);
}
virtual void set_model_index(uint32_t index) { _model_index = index; }
virtual int reload() = 0;
virtual uint64_t version() const = 0;
......@@ -89,10 +89,12 @@ class InferEngine {
void* out,
uint32_t batch_size = -1) = 0;
virtual int task_infer_impl(const void* in, void* out) = 0; // NOLINT
protected:
uint32_t _model_index;
// end: framework inner call
};
typedef im::bsf::Task<paddle::PaddleTensor, paddle::PaddleTensor> TaskT;
class ReloadableInferEngine : public InferEngine {
public:
virtual ~ReloadableInferEngine() {}
......@@ -105,7 +107,6 @@ class ReloadableInferEngine : public InferEngine {
};
virtual int load(const configure::EngineDesc& conf) = 0;
typedef im::bsf::Task<paddle::PaddleTensor, paddle::PaddleTensor> TaskT;
int proc_initialize_impl(const configure::EngineDesc& conf, bool version);
......@@ -372,8 +373,7 @@ class CloneDBReloadableInferEngine
protected:
// Template EngineCore: once created, multiple thread-level EngineCores share this object's model data
std::vector<ModelData<EngineCore>*>
_cloneTemplate;
std::vector<ModelData<EngineCore>*> _cloneTemplate;
};
template <typename EngineCore>
......
......@@ -24,7 +24,7 @@ namespace fugue {
namespace memory {
void Region::init() {
_big_mem_capacity = 64 * 1024 * 1024; // 64MB
_big_mem_capacity = 128 * 1024 * 1024;  // 128MB
_big_mem_start = new char[_big_mem_capacity];
}
......
......@@ -345,9 +345,10 @@ class Region {
2 * 1024 *
1024;  // 2MB: requests smaller than 2MB get memory from a Block.
// 64MB,means when you need less than 64MB, get memory from BigMemory instead
// 128MB: requests smaller than 128MB get memory from BigMemory instead
// of BigNode
static const int BIGNODE_MEM_THRESHOLD = (64 * 1024 * 1024 + 1);
static const int BIGNODE_MEM_THRESHOLD = (128 * 1024 * 1024 + 1);
static const int COUNTER_SIZE =
BIGNODE_MEM_THRESHOLD / BIG_MEM_THRESHOLD + 1; // this is not used
......
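The last two hunks double the Region's big-memory arena and the BigNode cutoff from 64MB to 128MB. The thresholds imply a three-tier allocation policy; the sketch below is a hypothetical rendering of that decision (the helper names are invented and the real Region code differs in detail):

    void* region_alloc(size_t n) {
      if (n < BIG_MEM_THRESHOLD)      // < 2MB: serve from a pooled Block
        return block_alloc(n);
      if (n < BIGNODE_MEM_THRESHOLD)  // < 128MB: serve from the BigMemory arena
        return big_mem_alloc(n);
      return bignode_alloc(n);        // otherwise: a standalone BigNode
    }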