Commit 81cd31a8 authored by wangjiawei04

merge with master 0624

Change-Id: I09b028bf244e63654da0cab154766856f94742d1
@@ -41,7 +41,7 @@ cpp_source_dirs.append('proto/*.proto')
# DELETE_AUTO_LIBS('$OUT_ROOT/third-64/glog/output/lib/libglog.a')
# DELETE_AUTO_LIBS('$OUT_ROOT/third-64/gflags/output/lib/libgflags.a')
# DELETE_AUTO_LIBS('$OUT_ROOT/third-64/gflags/output/lib/libgflags_nothreads.a')
-# DELETE_AUTO_LIBS('$OUT_ROOT/baidu/paddlepaddle/paddle/output/lib/libpaddle_fluid_noavx_openblas.a')
+DELETE_AUTO_LIBS('$OUT_ROOT/baidu/paddlepaddle/paddle/output/lib/libpaddle_fluid_noavx_openblas.a')
PROTOFLAGS(
'--plugin=protoc-gen-pdcodegen=../pdcodegen/plugin/pdcodegen',
......
@@ -10,7 +10,7 @@
The figure below is a partial breakdown of where time goes in a serving request. For brpc overhead, only the bthread creation and startup costs are listed.
-![](http://paddle-serving.bj.bcebos.com/doc/serving-timings.png)
+![](serving-timings.png)
(Right-click to open the full-size image in a new window)
@@ -280,7 +280,7 @@ The Serving scalability test means that, on different models:
The figure below shows how Paddle Serving QPS on the BOW model changes as the number of serving-side threads grows. With few threads (4/8/12), the QPS pattern is quite erratic; with more threads, the QPS curves largely converge and show essentially no linear growth.
-![](https://paddle-serving.bj.bcebos.com/doc/qps-threads-bow.png)
+![](qps-threads-bow.png)
(Right-click to open the full-size image in a new window)
@@ -438,7 +438,7 @@ The Serving scalability test means that, on different models:
The figure below shows how Paddle Serving QPS on the CNN model changes as the number of serving-side threads grows. As the thread count increases, Serving QPS scales roughly linearly. Read the chart like this: with 16 threads, maximum QPS is reached at about 20 concurrent requests, and QPS stays roughly flat as concurrency rises further; with 24 threads, maximum QPS is reached at about 28 concurrent requests, after which QPS likewise stays roughly flat.
-![](https://paddle-serving.bj.bcebos.com/doc/qps-threads-cnn.png)
+![](qps-threads-cnn.png)
(Right-click to open the full-size image in a new window)
@@ -596,7 +596,7 @@ The Serving scalability test means that, on different models:
The figure below shows how Paddle Serving QPS on the LSTM model changes as the number of serving-side threads grows. As the thread count increases, Serving QPS scales roughly linearly. Read the chart like this: with 16 threads, maximum QPS is reached at about 20 concurrent requests, and QPS stays roughly flat as concurrency rises further; with 24 threads, maximum QPS is reached at about 28 concurrent requests, after which QPS likewise stays roughly flat.
-![](https://paddle-serving.bj.bcebos.com/doc/qps-threads-lstm.png)
+![](qps-threads-lstm.png)
(Right-click to open the full-size image in a new window)
......
This diff is collapsed.
@@ -16,4 +16,6 @@
[Benchmarking](BENCHMARKING.md)
[GPU Benchmarking](GPU_BENCHMARKING.md)
[FAQ](FAQ.md)
@@ -165,7 +165,7 @@ reloadable_type: reload trigger check: timestamp_ne/timestamp_gt/md5sum/revision/
model_data_path: path to the model files
-runtime_thread_num: if greater than 0, enables the bsf multi-threaded scheduling framework and launches multi-threaded inference inside each inference bthread worker.
+runtime_thread_num: if greater than 0, enables the bsf multi-threaded scheduling framework and launches multi-threaded inference inside each inference bthread worker. Note that when in-worker multi-threaded inference is enabled, OPs in the workflow must use the Serving framework's BatchTensor class as the input and output of inference (predictor/framework/infer_data.h, `class BatchTensor`).
batch_infer_size: batch size of each inference thread when bsf multi-threaded inference is enabled
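
For orientation, the four options documented above would sit together in one model toolkit configuration entry. The sketch below is illustrative only: the field names come from the documentation above, but the values and surrounding file layout are assumptions, not taken from this commit.

```
# Illustrative sketch only -- values are made up, not recommendations.
reloadable_type: timestamp_ne   # reload when the model file timestamp changes
model_data_path: ./data/model   # path to the model files
runtime_thread_num: 4           # >0 enables bsf in-worker multi-threaded inference
batch_infer_size: 32            # per-inference-thread batch size when bsf is on
```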
......
@@ -19,6 +19,27 @@ namespace baidu {
namespace paddle_serving {
namespace predictor {
+struct MempoolRegion {
+  MempoolRegion(im::fugue::memory::Region *region, im::Mempool *mempool)
+      : _region(region), _mempool(mempool) {}
+  im::fugue::memory::Region *region() { return _region; }
+  im::Mempool *mempool() { return _mempool; }
+  im::fugue::memory::Region *_region;
+  im::Mempool *_mempool;
+  ~MempoolRegion() {
+    if (_region) {
+      delete _region;
+      _region = NULL;
+    }
+    if (_mempool) {
+      delete _mempool;
+      _mempool = NULL;
+    }
+  }
+};
int MempoolWrapper::initialize() {
  if (THREAD_KEY_CREATE(&_bspec_key, NULL) != 0) {
    LOG(ERROR) << "unable to create thread_key of thrd_data";
@@ -33,16 +54,20 @@ int MempoolWrapper::initialize() {
}

int MempoolWrapper::thread_initialize() {
-  _region.init();
-  im::Mempool* p_mempool = new (std::nothrow) im::Mempool(&_region);
-  if (p_mempool == NULL) {
+  im::fugue::memory::Region* region = new im::fugue::memory::Region();
+  region->init();
+  im::Mempool* mempool = new (std::nothrow) im::Mempool(region);
+  MempoolRegion* mempool_region = new MempoolRegion(region, mempool);
+  if (mempool == NULL) {
    LOG(ERROR) << "Failed create thread mempool";
    return -1;
  }

-  if (THREAD_SETSPECIFIC(_bspec_key, p_mempool) != 0) {
+  if (THREAD_SETSPECIFIC(_bspec_key, mempool_region) != 0) {
    LOG(ERROR) << "unable to set the thrd_data";
-    delete p_mempool;
+    delete mempool_region;  // ~MempoolRegion also frees region and mempool exactly once
    return -1;
  }
@@ -51,23 +76,34 @@ int MempoolWrapper::thread_initialize() {
}

int MempoolWrapper::thread_clear() {
-  im::Mempool* p_mempool = (im::Mempool*)THREAD_GETSPECIFIC(_bspec_key);
-  if (p_mempool) {
-    p_mempool->release_block();
-    _region.reset();
+  MempoolRegion* mempool_region = (MempoolRegion*)THREAD_GETSPECIFIC(_bspec_key);
+  if (mempool_region == NULL) {
+    LOG(WARNING) << "THREAD_GETSPECIFIC() returned NULL";
+    return -1;
+  }
+  im::Mempool* mempool = mempool_region->mempool();
+  im::fugue::memory::Region* region = mempool_region->region();
+  if (mempool) {
+    mempool->release_block();
+    region->reset();
  }

  return 0;
}

void* MempoolWrapper::malloc(size_t size) {
-  im::Mempool* p_mempool = (im::Mempool*)THREAD_GETSPECIFIC(_bspec_key);
-  if (!p_mempool) {
+  MempoolRegion* mempool_region = (MempoolRegion*)THREAD_GETSPECIFIC(_bspec_key);
+  if (mempool_region == NULL) {
+    LOG(WARNING) << "THREAD_GETSPECIFIC() returned NULL";
+    return NULL;
+  }
+  im::Mempool* mempool = mempool_region->mempool();
+  if (!mempool) {
    LOG(WARNING) << "Cannot malloc memory:" << size
                 << ", since mempool is not thread initialized";
    return NULL;
  }

-  return p_mempool->malloc(size);
+  return mempool->malloc(size);
}
} // namespace predictor
......
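
Taken together, the mempool changes swap the single shared `_region` member for a per-thread pair: each thread allocates its own `Region` and `Mempool`, bundles them into one heap-allocated `MempoolRegion`, and stores the bundle under the thread-specific key, so `thread_clear()` and `malloc()` can recover both objects from a single TLS slot. Below is a minimal, self-contained sketch of the same pattern using plain pthreads; all type and function names are illustrative stand-ins, not the Serving types (compile with `-pthread`):

```cpp
#include <pthread.h>
#include <cstdio>

// Illustrative stand-ins for im::fugue::memory::Region / im::Mempool.
struct Region { void init() {} void reset() {} };
struct Pool {
  explicit Pool(Region* r) : _r(r) {}
  void release_block() { _r->reset(); }
  Region* _r;
};

// One TLS slot owns both objects, mirroring MempoolRegion above.
struct PoolRegion {
  PoolRegion(Region* r, Pool* p) : region(r), pool(p) {}
  ~PoolRegion() { delete pool; delete region; }
  Region* region;
  Pool* pool;
};

static pthread_key_t g_key;

// Runs automatically when a thread that set the slot exits.
static void destroy_slot(void* p) { delete static_cast<PoolRegion*>(p); }

static int thread_initialize() {
  Region* region = new Region();
  region->init();
  Pool* pool = new Pool(region);
  PoolRegion* slot = new PoolRegion(region, pool);
  if (pthread_setspecific(g_key, slot) != 0) {
    delete slot;  // ~PoolRegion frees pool and region exactly once
    return -1;
  }
  return 0;
}

static void thread_clear() {
  PoolRegion* slot = static_cast<PoolRegion*>(pthread_getspecific(g_key));
  if (slot == NULL) return;
  slot->pool->release_block();  // recycle memory, keep the pair for reuse
}

int main() {
  pthread_key_create(&g_key, destroy_slot);
  if (thread_initialize() == 0) {
    thread_clear();
    std::puts("per-thread pool installed and cleared");
  }
  // The key destructor does not run for the main thread at exit,
  // so free its slot explicitly.
  destroy_slot(pthread_getspecific(g_key));
  pthread_setspecific(g_key, NULL);
  pthread_key_delete(g_key);
  return 0;
}
```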
@@ -39,7 +39,7 @@ class MempoolWrapper {
  void* malloc(size_t size);

 private:
-  im::fugue::memory::Region _region;
+  // im::fugue::memory::Region _region;
  THREAD_KEY_T _bspec_key;
};
......
@@ -143,7 +143,7 @@ int main(int argc, char** argv) {
  std::string filename(argv[0]);
  filename = filename.substr(filename.find_last_of('/') + 1);
-  settings.log_file = (std::string("./log/") + filename + ".log").c_str();
+  settings.log_file = strdup((std::string("./log/") + filename + ".log").c_str());
  settings.delete_old = logging::DELETE_OLD_LOG_FILE;
  logging::InitLogging(settings);
......
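
The `strdup` fix above addresses a dangling pointer: `c_str()` returns a pointer into a temporary `std::string` that is destroyed at the end of the full expression, so `settings.log_file` was left pointing at freed storage. `strdup` (POSIX) copies the characters into `malloc`'d memory that outlives the temporary. A minimal sketch of the hazard and the fix, with illustrative names:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>

int main() {
  // BAD: the temporary std::string dies at the ';', leaving a dangling pointer.
  // const char* log_file = (std::string("./log/") + "app" + ".log").c_str();

  // OK: strdup copies the bytes into malloc'd storage that outlives the temporary.
  const char* log_file = strdup((std::string("./log/") + "app" + ".log").c_str());
  std::printf("%s\n", log_file);
  std::free(const_cast<char*>(log_file));  // the caller owns the strdup'd buffer
  return 0;
}
```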