PaddlePaddle / Serving · Commit 0ab4636c

Author: wangguibao
Authored on: May 20, 2019

Merge branch 'gpu'

Parents: 0a138614, a5cd92a7

Showing 12 changed files with 790 additions and 5 deletions (+790 / -5)
BCLOUD.gpu                                         +138   -0
demo-client/BCLOUD                                   +0   -1
demo-serving/op/classify_op.h                        +4   -0
demo-serving/op/reader_op.h                          +4   -0
demo-serving/op/text_classification_op.h             +4   -0
doc/SERVING_CONFIGURE.md                             +4   -2
inferencer-fluid-gpu/BCLOUD                         +26   -0
inferencer-fluid-gpu/CMakeLists.txt                 +10   -0
inferencer-fluid-gpu/include/fluid_gpu_engine.h    +518   -0
inferencer-fluid-gpu/src/fluid_gpu_engine.cpp       +59   -0
predictor/framework/infer.h                          +1   -1
predictor/src/pdserving.cpp                         +22   -1
BCLOUD.gpu (new file, mode 100644)

```python
#edit-mode: -*- python -*-
#coding:utf-8

WORKROOT('../../../')

CUDA_VERSION("8.0")

#gcc version, default 'gcc'
COMPILER('gcc482')

#Preprocessor flags.
CPPFLAGS(r'-D_GNU_SOURCE -D__STDC_LIMIT_MACROS -D__const__= -Dtypeof=__typeof__ -DUSE_PTHREAD -DWITH_GPU')
#CPPFLAGS(r'-DVERSION=\"%s\"' % SVN_LAST_CHANGED_REV())

#C++ flags.
CXXFLAGS('-g -O2 -pipe -W -Wall -fPIC -fno-omit-frame-pointer -Wno-deprecated -Wno-unused-parameter -Wno-unused-variable -Wno-unused-local-typedefs -Wno-sign-compare -std=c++11')

#link flags
LDFLAGS(' -lpthread -lcrypto -lrt -lssl -ldl -lz')
LDFLAGS(' -L/opt/compiler/cuda-8.0/lib64 -lcudart')

INCPATHS('$INC')
INCPATHS('./')
INCPATHS('$OUT_ROOT/third-64/protobuf/output/include')

CONFIGS('third-64/protobuf@protobuf_3-1-0-6209_PD_BL')
CONFIGS('baidu/base/baidu-rpc@ci-base')
CONFIGS('baidu/third-party/gflags@v2.0.0@git_branch')
CONFIGS('baidu/third-party/glog@v0.3.x@git_branch')
CONFIGS('baidu/third-party/opencv@master@git_branch')
CONFIGS('baidu/third-party/mklml@v20180406@git_branch')
NEED_OUTPUT('baidu/third-party/mklml')

#CONFIGS('baidu/paddlepaddle/paddle@paddle_prebuilt_cuda-1-0-0-1_PD_BL@git_tag')
#LIBS('$WORK_ROOT/baidu/paddlepaddle/paddle/third_party/install/mklml/lib/libiomp5.so')
#LIBS('$WORK_ROOT/baidu/paddlepaddle/paddle/third_party/install/mklml/lib/libmklml_intel.so')
# LIBS('$WORK_ROOT/baidu/paddlepaddle/paddle/third_party/install/protobuf/lib/libprotobuf.a')
#LIBS('$WORK_ROOT/baidu/paddlepaddle/paddle/third_party/install/snappy/lib/libsnappy.a')
#LIBS('$WORK_ROOT/baidu/paddlepaddle/paddle/third_party/install/snappystream/lib/libsnappystream.a')
#LIBS('$WORK_ROOT/baidu/paddlepaddle/paddle/third_party/install/xxhash/lib/libxxhash.a')
#LIBS('$WORK_ROOT/baidu/paddlepaddle/paddle/third_party/install/zlib/lib/libz.a')
#
CONFIGS('baidu/paddlepaddle/gpu-infer-temporary@release_cuda8_cudnn7@git_branch')
LIBS('$WORK_ROOT/baidu/paddlepaddle/gpu-infer-temporary/third_party/install/mklml/lib/libiomp5.so')
LIBS('$WORK_ROOT/baidu/paddlepaddle/gpu-infer-temporary/third_party/install/mklml/lib/libmklml_intel.so')
# LIBS('$WORK_ROOT/baidu/paddlepaddle/gpu-infer-temporary/third_party/install/protobuf/lib/libprotobuf.a')
LIBS('$WORK_ROOT/baidu/paddlepaddle/gpu-infer-temporary/third_party/install/snappy/lib/libsnappy.a')
LIBS('$WORK_ROOT/baidu/paddlepaddle/gpu-infer-temporary/third_party/install/snappystream/lib/libsnappystream.a')
LIBS('$WORK_ROOT/baidu/paddlepaddle/gpu-infer-temporary/third_party/install/xxhash/lib/libxxhash.a')
LIBS('$WORK_ROOT/baidu/paddlepaddle/gpu-infer-temporary/third_party/install/zlib/lib/libz.a')

DELETE_AUTO_LIBS('$OUT_ROOT/third-64/gflags/output/lib/libgflags.a')
DELETE_AUTO_LIBS('$OUT_ROOT/third-64/gflags/output/lib/libgflags_nothreads.a')
#DELETE_AUTO_LIBS('$OUT_ROOT/baidu/third-party/gflags/output/lib/libgflags.a')

#pdcodegen
HEADERS(GLOB_GEN_SRCS('predictor/proto/pds_option.pb.h'), '$INC/pdcodegen')
pdcodegen_sources = GLOB('pdcodegen/plugin/*.cc')
pdcodegen_sources += ' predictor/proto/pds_option.proto'
pdcodegen_sources += ' pdcodegen/src/pdcodegen.cpp'
# DELETE_AUTO_LIBS('$OUT_ROOT/third-64/protobuf/output/lib/libprotobuf.a')
# DELETE_AUTO_LIBS('$OUT_ROOT/third-64/protobuf/output/lib/libprotobuf-lite.a')
Application('pdcodegen', Sources(pdcodegen_sources))

#sub directory
Directory('configure')

# predictor
predictor_sources = []
predictor_sources.append('predictor/common/*.cpp')
predictor_sources.append('predictor/op/*.cpp')
predictor_sources.append('predictor/framework/*.cpp')
predictor_sources.append('predictor/mempool/*.cpp')
predictor_sources.append('predictor/proto/*.proto')

HEADERS(GLOB_GEN_SRCS('predictor/proto/*.pb.h'), '$INC/predictor/')
HEADERS('predictor/common/*.h', '$INC/predictor/common')
HEADERS('predictor/framework/*.h', '$INC/predictor/framework')
HEADERS('predictor/mempool/*.h', '$INC/predictor/mempool')
HEADERS('predictor/op/*.h', '$INC/predictor/op')

StaticLibrary('pdserving',
              Sources(GLOB(' '.join(predictor_sources)),
                      'predictor/src/pdserving.cpp'))

# Sub directory
Directory('inferencer-fluid-gpu')
Directory('sdk-cpp')

# demo-client
Directory('sdk-cpp')
INCPATHS('$INC')
INCPATHS('$INC/sdk-cpp')
INCPATHS('$INC/sdk-cpp/include')
HEADERS(GLOB_GEN_SRCS('configure/proto/sdk_configure.pb.h'), '$INC/sdk-cpp')
HEADERS(GLOB_GEN_SRCS('predictor/proto/builtin_format.pb.h'), '$INC/sdk-cpp')
HEADERS(GLOB_GEN_SRCS('predictor/proto/pds_option.pb.h'), '$INC/sdk-cpp')
HEADERS(GLOB_GEN_SRCS('demo-client/proto/*.pb.h'), '$INC/sdk-cpp')
#Application('ximage', Sources('demo-client/src/ximage.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a'))
#Application('echo', Sources('demo-client/src/echo.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a'))
#Application('dense_format', Sources('demo-client/src/dense_format.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a'))
#Application('sparse_format', Sources('demo-client/src/sparse_format.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a'))
#Application('int64tensor_format', Sources('demo-client/src/int64tensor_format.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a'))
#Application('text_classification', Sources('demo-client/src/text_classification.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a'))
#Application('text_classification_press', Sources('demo-client/src/text_classification_press.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a'))
#OUTPUT('demo-client/conf', '$OUT/demo/client')
#OUTPUT('demo-client/data', '$OUT/demo/client')

# demo-serving
INCPATHS('$INC')
INCPATHS('$INC/predictor')
INCPATHS('./')
INCPATHS('./predictor/')
INCPATHS('$OUT_ROOT/third-64/protobuf/output/include')
INCPATHS('$OUT_ROOT/baidu/third-party/glog/output/include')
INCPATHS('$INC/predictor/proto')
PROTOFLAGS('--plugin=protoc-gen-pdcodegen=./pdcodegen/plugin/pdcodegen',
           '--pdcodegen_out demo-serving/proto --proto_path=.',
           '--proto_path=./predictor/proto')
serving_sources = []
serving_sources.append('demo-serving/op/*.cpp')
serving_sources.append('demo-serving/proto/*.proto')
HEADERS(GLOB_GEN_SRCS('demo-serving/proto/*.pb.h'), '$INC/demo-serving')
Application('serving',
            Sources(GLOB(' '.join(serving_sources))),
            WholeArchives('$OUT/lib/libinferencer-fluid-gpu.a $OUT/lib/libpdserving.a $OUT/lib/libpdconfigure.a'))
OUTPUT('demo-serving/conf', '$OUT/demo/serving/')
OUTPUT('demo-serving/data', '$OUT/demo/serving')
#OUTPUT('$WORK_ROOT/baidu/paddlepaddle/gpu-infer-temporary/third_party/install/mklml/lib/libiomp5.so', '$OUT/demo/serving/bin')
#OUTPUT('$WORK_ROOT/baidu/paddlepaddle/gpu-infer-temporary/third_party/install/mklml/lib/libmklml_intel.so', '$OUT/demo/serving/bin')
```
demo-client/BCLOUD

```
@@ -25,7 +25,6 @@ LIBS('$OUT/lib/libpdconfigure.a')
 CONFIGS('baidu/base/baidu-rpc@ci-base')
 CONFIGS('baidu/third-party/gflags@v2.0.0@git_branch')
 CONFIGS('baidu/third-party/glog@v0.3.x@git_branch')
 CONFIGS('baidu/third-party/opencv@master@git_branch')
 # DELETE_AUTO_LIBS('$OUT_ROOT/third-64/glog/output/lib/libglog.a')
 DELETE_AUTO_LIBS('$OUT_ROOT/third-64/gflags/output/lib/libgflags.a')
```
demo-serving/op/classify_op.h

Under BCLOUD builds, the `-DWITH_GPU` now set in BCLOUD.gpu's CPPFLAGS selects the GPU layout of the Paddle inference API header (the same guard is added to the other op headers below):

```diff
@@ -15,7 +15,11 @@
 #pragma once
 #include <vector>
 #ifdef BCLOUD
+#ifdef WITH_GPU
+#include "paddle/paddle_inference_api.h"
+#else
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#endif
 #else
 #include "paddle/fluid/inference/paddle_inference_api.h"
 #endif
```
demo-serving/op/reader_op.h

```diff
@@ -29,7 +29,11 @@
 #include "opencv/highgui.h"
 #ifdef BCLOUD
+#ifdef WITH_GPU
+#include "paddle/paddle_inference_api.h"
+#else
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#endif
 #else
 #include "paddle/fluid/inference/paddle_inference_api.h"
 #endif
```
demo-serving/op/text_classification_op.h

```diff
@@ -15,7 +15,11 @@
 #pragma once
 #include <vector>
 #ifdef BCLOUD
+#ifdef WITH_GPU
+#include "paddle/paddle_inference_api.h"
+#else
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#endif
 #else
 #include "paddle/fluid/inference/paddle_inference_api.h"
 #endif
```
doc/SERVING_CONFIGURE.md

The flags table gains the missing "default" column for two rows and documents the new port and gpuid flags (cell text translated from Chinese):

````diff
@@ -182,8 +182,10 @@ enable_batch_align:
 |enable_model_toolkit|true|enable model management|
 |enable_protocol_list|baidu_std|list of brpc communication protocols|
 |log_dir|./log|log dir|
-|num_threads|number of system threads used by the brpc server; defaults to the number of CPU cores|
-|max_concurrency|number of requests processed concurrently; <=0 means no limit, >0 caps the number of requests the brpc server handles at the same time|
+|num_threads||number of system threads used by the brpc server; defaults to the number of CPU cores|
+|max_concurrency||number of requests processed concurrently; <=0 means no limit, >0 caps the number of requests the brpc server handles at the same time|
+|port|8010|port on which the Serving process listens for requests|
+|gpuid|0|GPU device id used by the Serving process for GPU inference; only one GPU card may be bound|
 Defaults can be overridden in serving/conf/gflags.conf, for example:
 ```
````
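The example under that sentence was truncated in this capture; a plausible gflags.conf sketch, using flag names from the table above (values purely illustrative):

```
--port=8010
--gpuid=0
--num_threads=8
--max_concurrency=64
```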
inferencer-fluid-gpu/BCLOUD (new file, mode 100644)

```python
#edit-mode: -*- python -*-
#coding:utf-8

WORKROOT('../../../../')

#Preprocessor flags.
CPPFLAGS(r'-D_GNU_SOURCE -D__STDC_LIMIT_MACROS -D__const__= -Dtypeof=__typeof__ -DUSE_PTHREAD -DPYBIND_AVX_MKLML -DWITH_GPU')
#CPPFLAGS(r'-DVERSION=\"%s\"' % SVN_LAST_CHANGED_REV())

#C++ flags.
CXXFLAGS('-g -O2 -pipe -W -Wall -fPIC -fno-omit-frame-pointer -Wno-deprecated -Wno-unused-parameter -Wno-unused-variable -Wno-unused-local-typedefs -Wno-sign-compare -std=c++11')

ImportConfigsFrom('..')

INCPATHS('$INC')
INCPATHS('../')
INCPATHS('$OUT_ROOT/third-64/protobuf/output/include')

HEADERS('include/fluid_gpu_engine.h', '$INC/inferencer-fluid-gpu/include')

inferencer_fluid_gpu_sources = 'src/fluid_gpu_engine.cpp'

#.a
StaticLibrary('inferencer-fluid-gpu', Sources(inferencer_fluid_gpu_sources))
```
inferencer-fluid-gpu/CMakeLists.txt (new file, mode 100644)

```cmake
FILE(GLOB fluid_cpu_engine_srcs ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
add_library(fluid_cpu_engine ${fluid_cpu_engine_srcs})
target_include_directories(fluid_cpu_engine PUBLIC
                           ${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir/)
add_dependencies(fluid_cpu_engine pdserving extern_paddle configure)
target_link_libraries(fluid_cpu_engine pdserving paddle_fluid -liomp5
                      -lmklml_intel -lpthread -lcrypto -lm -lrt -lssl -ldl -lz)

install(TARGETS fluid_cpu_engine
        ARCHIVE DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/lib)
```
inferencer-fluid-gpu/include/fluid_gpu_engine.h (new file, mode 100644)

```cpp
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <pthread.h>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include "configure/include/configure_parser.h"
#include "configure/inferencer_configure.pb.h"
#ifdef BCLOUD
#ifdef WITH_GPU
#include "paddle/paddle_inference_api.h"
#else
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#endif
#else
#include "paddle/fluid/inference/paddle_inference_api.h"
#endif
#include "predictor/framework/infer.h"

DECLARE_int32(gpuid);

namespace baidu {
namespace paddle_serving {
namespace fluid_gpu {

using configure::SigmoidConf;

// RAII guard around a pthread mutex.
class AutoLock {
 public:
  explicit AutoLock(pthread_mutex_t& mutex) : _mut(mutex) {
    pthread_mutex_lock(&mutex);
  }
  ~AutoLock() { pthread_mutex_unlock(&_mut); }

 private:
  pthread_mutex_t& _mut;
};

// Process-wide mutex serializing CreatePaddlePredictor calls.
class GlobalPaddleCreateMutex {
 public:
  pthread_mutex_t& mutex() { return _mut; }

  static pthread_mutex_t& instance() {
    static GlobalPaddleCreateMutex gmutex;
    return gmutex.mutex();
  }

 private:
  GlobalPaddleCreateMutex() { pthread_mutex_init(&_mut, NULL); }
  pthread_mutex_t _mut;
};

// Process-wide mutex serializing sigmoid-model loading.
class GlobalSigmoidCreateMutex {
 public:
  pthread_mutex_t& mutex() { return _mut; }

  static pthread_mutex_t& instance() {
    static GlobalSigmoidCreateMutex gmutex;
    return gmutex.mutex();
  }

 private:
  GlobalSigmoidCreateMutex() { pthread_mutex_init(&_mut, NULL); }
  pthread_mutex_t _mut;
};

// data interface
class FluidFamilyCore {
 public:
  virtual ~FluidFamilyCore() {}
  virtual bool Run(const void* in_data, void* out_data) {
    if (!_core->Run(*(std::vector<paddle::PaddleTensor>*)in_data,
                    (std::vector<paddle::PaddleTensor>*)out_data)) {
      LOG(ERROR) << "Failed call Run with paddle predictor";
      return false;
    }
    return true;
  }

  virtual int create(const std::string& data_path) = 0;

  virtual int clone(void* origin_core) {
    if (origin_core == NULL) {
      LOG(ERROR) << "origin paddle Predictor is null.";
      return -1;
    }
    paddle::PaddlePredictor* p_predictor =
        (paddle::PaddlePredictor*)origin_core;
    _core = p_predictor->Clone();
    if (_core.get() == NULL) {
      LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
      return -1;
    }
    return 0;
  }

  virtual void* get() { return _core.get(); }

 protected:
  std::unique_ptr<paddle::PaddlePredictor> _core;
};

// infer interface
class FluidGpuAnalysisCore : public FluidFamilyCore {
 public:
  int create(const std::string& data_path) {
    if (access(data_path.c_str(), F_OK) == -1) {
      LOG(ERROR) << "create paddle predictor failed, path does not exist: "
                 << data_path;
      return -1;
    }

    paddle::AnalysisConfig analysis_config;
    analysis_config.SetParamsFile(data_path + "/__params__");
    analysis_config.SetProgFile(data_path + "/__model__");
    analysis_config.EnableUseGpu(100, FLAGS_gpuid);
    analysis_config.SetCpuMathLibraryNumThreads(1);
    analysis_config.EnableMemoryOptim(false, false);
    analysis_config.SwitchSpecifyInputNames(true);

    AutoLock lock(GlobalPaddleCreateMutex::instance());
    _core =
        paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
    if (NULL == _core.get()) {
      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
      return -1;
    }

    LOG(WARNING) << "create paddle predictor success, path: " << data_path;
    return 0;
  }
};

class FluidGpuNativeCore : public FluidFamilyCore {
 public:
  int create(const std::string& data_path) {
    if (access(data_path.c_str(), F_OK) == -1) {
      LOG(ERROR) << "create paddle predictor failed, path does not exist: "
                 << data_path;
      return -1;
    }

    paddle::NativeConfig native_config;
    native_config.param_file = data_path + "/__params__";
    native_config.prog_file = data_path + "/__model__";
    native_config.use_gpu = true;
    native_config.fraction_of_gpu_memory = 0.01;
    native_config.device = FLAGS_gpuid;

    AutoLock lock(GlobalPaddleCreateMutex::instance());
    _core = paddle::CreatePaddlePredictor<paddle::NativeConfig,
                                          paddle::PaddleEngineKind::kNative>(
        native_config);
    if (NULL == _core.get()) {
      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
      return -1;
    }

    LOG(WARNING) << "create paddle predictor success, path: " << data_path;
    return 0;
  }
};

class FluidGpuAnalysisDirCore : public FluidFamilyCore {
 public:
  int create(const std::string& data_path) {
    if (access(data_path.c_str(), F_OK) == -1) {
      LOG(ERROR) << "create paddle predictor failed, path does not exist: "
                 << data_path;
      return -1;
    }

    paddle::AnalysisConfig analysis_config;
    analysis_config.SetModel(data_path);
    analysis_config.EnableUseGpu(100, FLAGS_gpuid);
    analysis_config.SwitchSpecifyInputNames(true);
    analysis_config.SetCpuMathLibraryNumThreads(1);
    analysis_config.EnableMemoryOptim(false, false);

    AutoLock lock(GlobalPaddleCreateMutex::instance());
    _core =
        paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
    if (NULL == _core.get()) {
      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
      return -1;
    }

    LOG(WARNING) << "create paddle predictor success, path: " << data_path;
    return 0;
  }
};

class FluidGpuNativeDirCore : public FluidFamilyCore {
 public:
  int create(const std::string& data_path) {
    if (access(data_path.c_str(), F_OK) == -1) {
      LOG(ERROR) << "create paddle predictor failed, path does not exist: "
                 << data_path;
      return -1;
    }

    paddle::NativeConfig native_config;
    native_config.model_dir = data_path;
    native_config.use_gpu = true;
    native_config.fraction_of_gpu_memory = 0.01;
    native_config.device = FLAGS_gpuid;

    AutoLock lock(GlobalPaddleCreateMutex::instance());
    _core = paddle::CreatePaddlePredictor<paddle::NativeConfig,
                                          paddle::PaddleEngineKind::kNative>(
        native_config);
    if (NULL == _core.get()) {
      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
      return -1;
    }

    LOG(WARNING) << "create paddle predictor success, path: " << data_path;
    return 0;
  }
};

// Row-major float matrix loaded from a binary parameter file.
class Parameter {
 public:
  Parameter() : _row(0), _col(0), _params(NULL) {}
  ~Parameter() {
    LOG(INFO) << "before destroy Parameter, file_name[" << _file_name << "]";
    destroy();
  }

  int init(int row, int col, const char* file_name) {
    destroy();
    _file_name = file_name;
    _row = row;
    _col = col;
    _params = reinterpret_cast<float*>(malloc(_row * _col * sizeof(float)));
    if (_params == NULL) {
      LOG(ERROR) << "Load " << _file_name << " malloc error.";
      return -1;
    }
    LOG(WARNING) << "Load parameter file[" << _file_name << "] success.";
    return 0;
  }

  void destroy() {
    _row = 0;
    _col = 0;
    if (_params != NULL) {
      free(_params);
      _params = NULL;
    }
  }

  int load() {
    if (_params == NULL || _row <= 0 || _col <= 0) {
      LOG(ERROR) << "load parameter error [not inited].";
      return -1;
    }

    FILE* fs = fopen(_file_name.c_str(), "rb");
    if (fs == NULL) {
      LOG(ERROR) << "load " << _file_name << " fopen error.";
      return -1;
    }

    // Skip the fixed-size file header preceding the matrix payload.
    static const uint32_t MODEL_FILE_HEAD_LEN = 16;
    char head[MODEL_FILE_HEAD_LEN] = {0};
    if (fread(head, 1, MODEL_FILE_HEAD_LEN, fs) != MODEL_FILE_HEAD_LEN) {
      destroy();
      LOG(ERROR) << "Load " << _file_name << " read head error.";
      if (fs != NULL) {
        fclose(fs);
        fs = NULL;
      }
      return -1;
    }

    uint32_t matrix_size = _row * _col;
    if (matrix_size == fread(_params, sizeof(float), matrix_size, fs)) {
      if (fs != NULL) {
        fclose(fs);
        fs = NULL;
      }
      LOG(INFO) << "load " << _file_name << " read ok.";
      return 0;
    } else {
      LOG(ERROR) << "load " << _file_name << " read error.";
      destroy();
      if (fs != NULL) {
        fclose(fs);
        fs = NULL;
      }
      return -1;
    }
    return 0;
  }

 public:
  std::string _file_name;
  int _row;
  int _col;
  float* _params;
};

// Two-logit sigmoid head applied on top of the fluid model output.
class SigmoidModel {
 public:
  ~SigmoidModel() {}
  int load(const char* sigmoid_w_file,
           const char* sigmoid_b_file,
           float exp_max,
           float exp_min) {
    AutoLock lock(GlobalSigmoidCreateMutex::instance());
    if (0 != _sigmoid_w.init(2, 1, sigmoid_w_file) || 0 != _sigmoid_w.load()) {
      LOG(ERROR) << "load params sigmoid_w failed.";
      return -1;
    }
    LOG(WARNING) << "load sigmoid_w [" << _sigmoid_w._params[0] << "] ["
                 << _sigmoid_w._params[1] << "].";
    if (0 != _sigmoid_b.init(2, 1, sigmoid_b_file) || 0 != _sigmoid_b.load()) {
      LOG(ERROR) << "load params sigmoid_b failed.";
      return -1;
    }
    LOG(WARNING) << "load sigmoid_b [" << _sigmoid_b._params[0] << "] ["
                 << _sigmoid_b._params[1] << "].";
    _exp_max_input = exp_max;
    _exp_min_input = exp_min;
    return 0;
  }

  int softmax(float x, double& o) {  // NOLINT
    float _y0 = x * _sigmoid_w._params[0] + _sigmoid_b._params[0];
    float _y1 = x * _sigmoid_w._params[1] + _sigmoid_b._params[1];
    _y0 = (_y0 > _exp_max_input)
              ? _exp_max_input
              : ((_y0 < _exp_min_input) ? _exp_min_input : _y0);
    _y1 = (_y1 > _exp_max_input)
              ? _exp_max_input
              : ((_y1 < _exp_min_input) ? _exp_min_input : _y1);
    o = 1.0f / (1.0f + exp(_y0 - _y1));
    return 0;
  }

 public:
  Parameter _sigmoid_w;
  Parameter _sigmoid_b;
  float _exp_max_input;
  float _exp_min_input;
};
```
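The two-class "softmax" above collapses to a sigmoid of the logit difference; with the clamped logits $y_0, y_1$:

$$
o = \frac{e^{y_1}}{e^{y_0} + e^{y_1}} = \frac{1}{1 + e^{y_0 - y_1}},
$$

which is exactly the `1.0f / (1.0f + exp(_y0 - _y1))` expression, with both logits first clamped to `[_exp_min_input, _exp_max_input]` so `exp` cannot overflow.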
fluid_gpu_engine.h, continued:

```cpp
class SigmoidFluidModel {
 public:
  int softmax(float x, double& o) {  // NOLINT
    return _sigmoid_core->softmax(x, o);
  }

  std::unique_ptr<SigmoidFluidModel> Clone() {
    std::unique_ptr<SigmoidFluidModel> clone_model;
    clone_model.reset(new SigmoidFluidModel());
    // The sigmoid head is shared; the fluid predictor is cloned per instance.
    clone_model->_sigmoid_core = _sigmoid_core;
    clone_model->_fluid_core = _fluid_core->Clone();
    return std::move(clone_model);
  }

 public:
  std::unique_ptr<paddle::PaddlePredictor> _fluid_core;
  std::shared_ptr<SigmoidModel> _sigmoid_core;
};

class FluidGpuWithSigmoidCore : public FluidFamilyCore {
 public:
  virtual ~FluidGpuWithSigmoidCore() {}

 public:
  int create(const std::string& model_path) {
    size_t pos = model_path.find_last_of("/\\");
    std::string conf_path = model_path.substr(0, pos);
    std::string conf_file = model_path.substr(pos);
    configure::SigmoidConf conf;
    if (configure::read_proto_conf(conf_path, conf_file, &conf) != 0) {
      LOG(ERROR) << "failed load model path: " << model_path;
      return -1;
    }

    _core.reset(new SigmoidFluidModel);

    std::string fluid_model_data_path = conf.dnn_model_path();
    int ret = load_fluid_model(fluid_model_data_path);
    if (ret < 0) {
      LOG(ERROR) << "fail to load fluid model.";
      return -1;
    }
    const char* sigmoid_w_file = conf.sigmoid_w_file().c_str();
    const char* sigmoid_b_file = conf.sigmoid_b_file().c_str();
    float exp_max = conf.exp_max_input();
    float exp_min = conf.exp_min_input();
    _core->_sigmoid_core.reset(new SigmoidModel);
    LOG(INFO) << "create sigmoid core[" << _core->_sigmoid_core.get()
              << "], use count[" << _core->_sigmoid_core.use_count() << "].";
    ret = _core->_sigmoid_core->load(
        sigmoid_w_file, sigmoid_b_file, exp_max, exp_min);
    if (ret < 0) {
      LOG(ERROR) << "fail to load sigmoid model.";
      return -1;
    }
    return 0;
  }

  virtual bool Run(const void* in_data, void* out_data) {
    if (!_core->_fluid_core->Run(
            *(std::vector<paddle::PaddleTensor>*)in_data,
            (std::vector<paddle::PaddleTensor>*)out_data)) {
      LOG(ERROR) << "Failed call Run with paddle predictor";
      return false;
    }
    return true;
  }

  virtual int clone(SigmoidFluidModel* origin_core) {
    if (origin_core == NULL) {
      LOG(ERROR) << "origin paddle Predictor is null.";
      return -1;
    }
    _core = origin_core->Clone();
    if (_core.get() == NULL) {
      LOG(ERROR) << "fail to clone paddle predictor: " << origin_core;
      return -1;
    }
    LOG(INFO) << "clone sigmoid core[" << _core->_sigmoid_core.get()
              << "] use count[" << _core->_sigmoid_core.use_count() << "].";
    return 0;
  }

  virtual SigmoidFluidModel* get() { return _core.get(); }

  virtual int load_fluid_model(const std::string& data_path) = 0;

  int softmax(float x, double& o) {  // NOLINT
    return _core->_sigmoid_core->softmax(x, o);
  }

 protected:
  std::unique_ptr<SigmoidFluidModel> _core;
};

class FluidGpuNativeDirWithSigmoidCore : public FluidGpuWithSigmoidCore {
 public:
  int load_fluid_model(const std::string& data_path) {
    if (access(data_path.c_str(), F_OK) == -1) {
      LOG(ERROR) << "create paddle predictor failed, path does not exist: "
                 << data_path;
      return -1;
    }

    paddle::NativeConfig native_config;
    native_config.model_dir = data_path;
    native_config.use_gpu = true;
    native_config.fraction_of_gpu_memory = 0.01;
    native_config.device = FLAGS_gpuid;

    AutoLock lock(GlobalPaddleCreateMutex::instance());
    _core->_fluid_core =
        paddle::CreatePaddlePredictor<paddle::NativeConfig,
                                      paddle::PaddleEngineKind::kNative>(
            native_config);
    // Note: checks the wrapper, not the freshly assigned _fluid_core.
    if (NULL == _core.get()) {
      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
      return -1;
    }

    LOG(WARNING) << "create paddle predictor success, path: " << data_path;
    return 0;
  }
};

class FluidGpuAnalysisDirWithSigmoidCore : public FluidGpuWithSigmoidCore {
 public:
  int load_fluid_model(const std::string& data_path) {
    if (access(data_path.c_str(), F_OK) == -1) {
      LOG(ERROR) << "create paddle predictor failed, path does not exist: "
                 << data_path;
      return -1;
    }

    paddle::AnalysisConfig analysis_config;
    analysis_config.SetModel(data_path);
    analysis_config.EnableUseGpu(100, FLAGS_gpuid);
    analysis_config.SwitchSpecifyInputNames(true);
    analysis_config.SetCpuMathLibraryNumThreads(1);
    analysis_config.EnableMemoryOptim(false, false);

    AutoLock lock(GlobalPaddleCreateMutex::instance());
    _core->_fluid_core =
        paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
    if (NULL == _core.get()) {
      LOG(ERROR) << "create paddle predictor failed, path: " << data_path;
      return -1;
    }

    LOG(WARNING) << "create paddle predictor success, path: " << data_path;
    return 0;
  }
};

}  // namespace fluid_gpu
}  // namespace paddle_serving
}  // namespace baidu
```
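For orientation, a minimal caller-side sketch (not part of this commit) of how one of the cores above is driven through the type-erased `create`/`Run` interface; the tensor setup is left illustrative:

```cpp
// Hypothetical driver for FluidGpuAnalysisDirCore; assumes the header above
// is on the include path and FLAGS_gpuid has already been parsed.
#include <string>
#include <vector>
#include "inferencer-fluid-gpu/include/fluid_gpu_engine.h"

int infer_once(const std::string& model_dir) {
  baidu::paddle_serving::fluid_gpu::FluidGpuAnalysisDirCore core;
  if (core.create(model_dir) != 0) {  // builds the GPU AnalysisConfig predictor
    return -1;
  }
  std::vector<paddle::PaddleTensor> inputs;   // fill with real input tensors
  std::vector<paddle::PaddleTensor> outputs;  // populated by the predictor
  return core.Run(&inputs, &outputs) ? 0 : -1;  // void* in/out, cast inside
}
```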
inferencer-fluid-gpu/src/fluid_gpu_engine.cpp (new file, mode 100644)

```cpp
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "inferencer-fluid-gpu/include/fluid_gpu_engine.h"
#include "predictor/framework/factory.h"

DEFINE_int32(gpuid, 0, "GPU device id to use");

namespace baidu {
namespace paddle_serving {
namespace fluid_gpu {

REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
    ::baidu::paddle_serving::predictor::FluidInferEngine<FluidGpuAnalysisCore>,
    ::baidu::paddle_serving::predictor::InferEngine,
    "FLUID_GPU_ANALYSIS");

REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
    ::baidu::paddle_serving::predictor::FluidInferEngine<
        FluidGpuAnalysisDirCore>,
    ::baidu::paddle_serving::predictor::InferEngine,
    "FLUID_GPU_ANALYSIS_DIR");

REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
    ::baidu::paddle_serving::predictor::FluidInferEngine<
        FluidGpuAnalysisDirWithSigmoidCore>,
    ::baidu::paddle_serving::predictor::InferEngine,
    "FLUID_GPU_ANALYSIS_DIR_SIGMOID");

REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
    ::baidu::paddle_serving::predictor::FluidInferEngine<FluidGpuNativeCore>,
    ::baidu::paddle_serving::predictor::InferEngine,
    "FLUID_GPU_NATIVE");

REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
    ::baidu::paddle_serving::predictor::FluidInferEngine<
        FluidGpuNativeDirCore>,
    ::baidu::paddle_serving::predictor::InferEngine,
    "FLUID_GPU_NATIVE_DIR");

REGIST_FACTORY_OBJECT_IMPL_WITH_NAME(
    ::baidu::paddle_serving::predictor::FluidInferEngine<
        FluidGpuNativeDirWithSigmoidCore>,
    ::baidu::paddle_serving::predictor::InferEngine,
    "FLUID_GPU_NATIVE_DIR_SIGMOID");

}  // namespace fluid_gpu
}  // namespace paddle_serving
}  // namespace baidu
```
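The string names registered above are what a model toolkit configuration refers to when selecting an engine implementation; a hedged sketch (field names taken from contemporaneous Serving model_toolkit.prototxt examples, not from this commit):

```
engines {
  name: "text_classification"        # arbitrary engine instance name
  type: "FLUID_GPU_ANALYSIS_DIR"     # one of the factory names above
  model_data_path: "./data/model/paddle/fluid/text_classification"
}
```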
predictor/framework/infer.h

```diff
@@ -461,7 +461,7 @@ class CloneDBReloadableInferEngine
 };

 template <typename FluidFamilyCore>
-class FluidInferEngine : public DBReloadableInferEngine<FluidFamilyCore> {
+class FluidInferEngine : public CloneDBReloadableInferEngine<FluidFamilyCore> {
  public:
   FluidInferEngine() {}
   ~FluidInferEngine() {}
```
predictor/src/pdserving.cpp

The worker-start hook is rewritten: per-thread resource initialization is now serialized by a dedicated mutex, and, apparently to sidestep a lock-ordering deadlock in bthread, `g_task_control_mutex` is released around `thread_initialize()` when it is already held and re-acquired afterwards:

```diff
@@ -69,7 +69,28 @@ DEFINE_bool(V, false, "print version, bool");
 DEFINE_bool(g, false, "user defined gflag path");
 DECLARE_string(flagfile);

-void pthread_worker_start_fn() { Resource::instance().thread_initialize(); }
+namespace bthread {
+extern pthread_mutex_t g_task_control_mutex;
+}
+pthread_mutex_t g_worker_start_fn_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+void pthread_worker_start_fn() {
+  while (pthread_mutex_lock(&g_worker_start_fn_mutex) != 0) {
+  }
+
+  // Try to avoid deadlock in bthread
+  int lock_status = pthread_mutex_trylock(&bthread::g_task_control_mutex);
+  if (lock_status == EBUSY || lock_status == EAGAIN) {
+    pthread_mutex_unlock(&bthread::g_task_control_mutex);
+  }
+
+  Resource::instance().thread_initialize();
+
+  // Try to avoid deadlock in bthread
+  if (lock_status == EBUSY || lock_status == EAGAIN) {
+    while (pthread_mutex_lock(&bthread::g_task_control_mutex) != 0) {
+    }
+  }
+
+  pthread_mutex_unlock(&g_worker_start_fn_mutex);
+}

 static void g_change_server_port() {
   InferServiceConf conf;
```