diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8dab01f14a7a82213ae92d5fbcfce619e9939a96..83f9ca4c7dd09b428863f5492996f355fa4b0f07 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,6 +75,7 @@ include(generic)
 include(flags)
 
 if (NOT CLIENT_ONLY)
+include(external/cudnn)
 include(paddlepaddle)
 include(external/opencv)
 endif()
diff --git a/cmake/external/cudnn.cmake b/cmake/external/cudnn.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..98466d44fc0dd91ef0cc8e8eac2660c42a19267c
--- /dev/null
+++ b/cmake/external/cudnn.cmake
@@ -0,0 +1,102 @@
+if(NOT WITH_GPU)
+  return()
+endif()
+
+if(WIN32)
+  set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
+else(WIN32)
+  set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT")
+endif(WIN32)
+
+find_path(CUDNN_INCLUDE_DIR cudnn.h
+  PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
+  $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
+  NO_DEFAULT_PATH
+)
+
+get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
+
+set(TARGET_ARCH "x86_64")
+if(NOT ${CMAKE_SYSTEM_PROCESSOR})
+  set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
+endif()
+
+list(APPEND CUDNN_CHECK_LIBRARY_DIRS
+  ${CUDNN_ROOT}
+  ${CUDNN_ROOT}/lib64
+  ${CUDNN_ROOT}/lib
+  ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
+  ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
+  $ENV{CUDNN_ROOT}
+  $ENV{CUDNN_ROOT}/lib64
+  $ENV{CUDNN_ROOT}/lib
+  /usr/lib
+  ${CUDA_TOOLKIT_ROOT_DIR}
+  ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+  )
+set(CUDNN_LIB_NAME "")
+if (LINUX)
+set(CUDNN_LIB_NAME "libcudnn.so")
+endif(LINUX)
+
+if(WIN32)
+# only support cudnn7
+set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
+endif(WIN32)
+
+if(APPLE)
+set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
+endif(APPLE)
+
+find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
+  PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
+  NO_DEFAULT_PATH
+  DOC "Path to cuDNN library.")
+
+
+if(CUDNN_INCLUDE_DIR AND CUDNN_LIBRARY)
+  set(CUDNN_FOUND ON)
+else()
+  set(CUDNN_FOUND OFF)
+endif()
+
+if(CUDNN_FOUND)
+  file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
+
+  get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY)
+
+  string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)"
+    CUDNN_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
+  string(REGEX REPLACE "define CUDNN_VERSION +([0-9]+)" "\\1"
+    CUDNN_VERSION "${CUDNN_VERSION}")
+
+  if("${CUDNN_VERSION}" STREQUAL "2000")
+    message(STATUS "Current cuDNN version is v2. ")
+  else()
+    string(REGEX MATCH "define CUDNN_MAJOR +([0-9]+)" CUDNN_MAJOR_VERSION
+      "${CUDNN_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define CUDNN_MAJOR +([0-9]+)" "\\1"
+      CUDNN_MAJOR_VERSION "${CUDNN_MAJOR_VERSION}")
+    string(REGEX MATCH "define CUDNN_MINOR +([0-9]+)" CUDNN_MINOR_VERSION
+      "${CUDNN_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define CUDNN_MINOR +([0-9]+)" "\\1"
+      CUDNN_MINOR_VERSION "${CUDNN_MINOR_VERSION}")
+    string(REGEX MATCH "define CUDNN_PATCHLEVEL +([0-9]+)"
+      CUDNN_PATCHLEVEL_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define CUDNN_PATCHLEVEL +([0-9]+)" "\\1"
+      CUDNN_PATCHLEVEL_VERSION "${CUDNN_PATCHLEVEL_VERSION}")
+
+    if(NOT CUDNN_MAJOR_VERSION)
+      set(CUDNN_VERSION "???")
+    else()
+      add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"")
+      math(EXPR CUDNN_VERSION
+        "${CUDNN_MAJOR_VERSION} * 1000 +
+         ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
+    endif()
+
+    message(STATUS "Current cuDNN header is ${CUDNN_INCLUDE_DIR}/cudnn.h. "
" + "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ") + + endif() +endif() diff --git a/cmake/paddlepaddle.cmake b/cmake/paddlepaddle.cmake index 3e2d0f742a07a59986a2441d3d56c4202e866961..1cf2c0c867b2ae4b9d8144ebbb25f724882fa3a1 100644 --- a/cmake/paddlepaddle.cmake +++ b/cmake/paddlepaddle.cmake @@ -15,71 +15,70 @@ INCLUDE(ExternalProject) SET(PADDLE_SOURCES_DIR ${THIRD_PARTY_PATH}/Paddle) +SET(PADDLE_DOWNLOAD_DIR ${PADDLE_SOURCES_DIR}/src/extern_paddle) SET(PADDLE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/Paddle/) SET(PADDLE_INCLUDE_DIR "${PADDLE_INSTALL_DIR}/include" CACHE PATH "PaddlePaddle include directory." FORCE) SET(PADDLE_LIBRARIES "${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.a" CACHE FILEPATH "Paddle library." FORCE) -INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir) # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog") message( "WITH_GPU = ${WITH_GPU}") -# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF + +# Paddle Version should be one of: +# latest: latest develop build +# version number like 1.5.2 +SET(PADDLE_VERSION "latest") + +if (WITH_GPU) + SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-gpu-cuda${CUDA_VERSION_MAJOR}-cudnn7-avx-mkl") +else() + if (AVX_FOUND) + if (WITH_MKLML) + SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-cpu-avx-mkl") + else() + SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-cpu-avx-openblas") + endif() + else() + SET(PADDLE_LIB_VERSION "${PADDLE_VERSION}-cpu-noavx-openblas") + endif() +endif() + +SET(PADDLE_LIB_PATH "http://paddle-inference-lib.bj.bcebos.com/${PADDLE_LIB_VERSION}/fluid_inference.tgz") +MESSAGE(STATUS "PADDLE_LIB_PATH=${PADDLE_LIB_PATH}") + ExternalProject_Add( - extern_paddle + "extern_paddle" ${EXTERNAL_PROJECT_LOG_ARGS} - # TODO(wangguibao): change to de newst repo when they changed. 
- GIT_REPOSITORY "https://github.com/PaddlePaddle/Paddle" - GIT_TAG "v1.5.1" - PREFIX ${PADDLE_SOURCES_DIR} - UPDATE_COMMAND "" - BINARY_DIR ${CMAKE_BINARY_DIR}/Paddle - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_INSTALL_PREFIX=${PADDLE_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${PADDLE_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_PREFIX_PATH=${prefix_path} - -DCMAKE_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR} - -DWITH_SWIG_PY=OFF - -DWITH_PYTHON=OFF - -DWITH_MKL=${WITH_MKL} - -DWITH_AVX=${WITH_AVX} - -DWITH_MKLDNN=OFF - -DWITH_GPU=${WITH_GPU} - -DWITH_FLUID_ONLY=ON - -DWITH_TESTING=OFF - -DWITH_DISTRIBUTE=OFF - -DON_INFER=ON - ${EXTERNAL_OPTIONAL_ARGS} - LIST_SEPARATOR | - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${PADDLE_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_COMMAND $(MAKE) - INSTALL_COMMAND $(MAKE) fluid_lib_dist + URL "${PADDLE_LIB_PATH}" + PREFIX "${PADDLE_SOURCES_DIR}" + DOWNLOAD_DIR "${PADDLE_DOWNLOAD_DIR}" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/include ${PADDLE_INSTALL_DIR}/include && + ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/paddle/lib ${PADDLE_INSTALL_DIR}/lib && + ${CMAKE_COMMAND} -E copy_directory ${PADDLE_DOWNLOAD_DIR}/third_party ${PADDLE_INSTALL_DIR}/third_party && + ${CMAKE_COMMAND} -E copy ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so.0 ${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib/libmkldnn.so ) -ExternalProject_Get_Property(extern_paddle BINARY_DIR) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${BINARY_DIR}/fluid_install_dir/third_party/install/mklml/lib") -LINK_DIRECTORIES(${BINARY_DIR}/fluid_install_dir/third_party/install/mklml/lib) - -ADD_LIBRARY(paddle_fluid STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${BINARY_DIR}/fluid_install_dir/paddle/fluid/inference/libpaddle_fluid.a) +INCLUDE_DIRECTORIES(${PADDLE_INCLUDE_DIR}) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/mklml/lib") +LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mklml/lib) -LIST(APPEND external_project_dependencies paddle) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib") +LINK_DIRECTORIES(${PADDLE_INSTALL_DIR}/third_party/install/mkldnn/lib) -ADD_LIBRARY(snappystream STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET snappystream PROPERTY IMPORTED_LOCATION ${BINARY_DIR}/fluid_install_dir/third_party/install/snappystream/lib/libsnappystream.a) +ADD_LIBRARY(paddle_fluid STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET paddle_fluid PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/lib/libpaddle_fluid.a) ADD_LIBRARY(xxhash STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET xxhash PROPERTY IMPORTED_LOCATION ${BINARY_DIR}/fluid_install_dir/third_party/install/xxhash/lib/libxxhash.a) +SET_PROPERTY(TARGET xxhash PROPERTY IMPORTED_LOCATION ${PADDLE_INSTALL_DIR}/third_party/install/xxhash/lib/libxxhash.a) + +LIST(APPEND external_project_dependencies paddle) LIST(APPEND paddle_depend_libs - snappystream - snappy xxhash) diff --git a/cube/cube-api/include/cube_api.h b/cube/cube-api/include/cube_api.h index 
diff --git a/cube/cube-api/include/cube_api.h b/cube/cube-api/include/cube_api.h
index aee82f6dd1d009d5b5b53b3f6fe8de0d591e5d51..c5059b9fea3a1d03cf4e50f8cb8dc994307d31ad 100644
--- a/cube/cube-api/include/cube_api.h
+++ b/cube/cube-api/include/cube_api.h
@@ -18,7 +18,12 @@
 #include
 #include
 
+#ifdef BCLOUD
+#include "baidu/rpc/server.h"
+#else
 #include "brpc/server.h"
+#endif
+
 #include "cube/cube-api/cube.pb.h"
 #include "cube/cube-api/include/meta.h"
diff --git a/cube/cube-api/include/meta.h b/cube/cube-api/include/meta.h
index ec891720c55503ad77de24cd178db53b2170023e..69bbb8ccc12e423d286183ed5dd87e90bf2e59de 100644
--- a/cube/cube-api/include/meta.h
+++ b/cube/cube-api/include/meta.h
@@ -19,11 +19,25 @@
 #include
 #include
 
+#ifdef BCLOUD
+#include "baidu/rpc/channel.h"
+#include "baidu/rpc/parallel_channel.h"
+#include "rapidjson/document.h"
+#else
 #include "brpc/channel.h"
 #include "brpc/parallel_channel.h"
 #include "butil/third_party/rapidjson/document.h"
+#endif
+
 #include "bvar/bvar.h"
 
+#ifdef BCLOUD
+namespace brpc = baidu::rpc;
+#ifndef BUTIL_RAPIDJSON_NAMESPACE
+#define BUTIL_RAPIDJSON_NAMESPACE RAPIDJSON_NAMESPACE
+#endif
+#endif
+
 namespace rec {
 namespace mcube {
diff --git a/cube/cube-api/src/cube_api.cpp b/cube/cube-api/src/cube_api.cpp
index 8a9cebb9aa92f1bdb13c47cb7c065eaf2738c73f..c481effae450889bbcf25c4e315edca3e8d88e6b 100644
--- a/cube/cube-api/src/cube_api.cpp
+++ b/cube/cube-api/src/cube_api.cpp
@@ -13,8 +13,14 @@
 // limitations under the License.
 
 #include "cube/cube-api/include/cube_api.h"
+#ifdef BCLOUD
+#include
+#include
+#else
 #include
 #include
+#endif
+
 #include
 #include "cube/cube-api/include/cube_api_bvar.h"
@@ -25,6 +31,10 @@ namespace {
 static ::rec::mcube::CubeAPI* g_ins = NULL;
 }
 
+#ifdef BCLOUD
+namespace brpc = baidu::rpc;
+#endif
+
 namespace rec {
 namespace mcube {
diff --git a/cube/cube-api/src/meta.cpp b/cube/cube-api/src/meta.cpp
index 69ce43a08e0f5460dfa4e440958ff247458f6140..06911e8828685abbe3b0912f22f8dd7a0f807e50 100644
--- a/cube/cube-api/src/meta.cpp
+++ b/cube/cube-api/src/meta.cpp
@@ -26,6 +26,10 @@ namespace {
 static ::rec::mcube::Meta* g_ins = NULL;
 }
 
+#ifdef BCLOUD
+namespace brpc = baidu::rpc;
+#endif
+
 namespace rec {
 namespace mcube {
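Note on the cube-api changes above: the same client sources now build against either open-source brpc or the internal baidu-rpc, selected by the BCLOUD macro and unified through a namespace alias. A condensed sketch of the idiom (assumes BCLOUD is defined by the internal build system; the channel options below are illustrative, not taken from the patch):

```cpp
// Condensed sketch of the BCLOUD portability idiom used in cube-api.
#ifdef BCLOUD
#include "baidu/rpc/channel.h"
namespace brpc = baidu::rpc;  // internal rpc lives in baidu::rpc
#else
#include "brpc/channel.h"     // open-source brpc already provides namespace brpc
#endif

// Everything below is written once against the brpc:: spelling and compiles
// in both environments.
static int init_cube_channel(brpc::Channel* channel, const char* server) {
  brpc::ChannelOptions options;
  options.timeout_ms = 200;  // illustrative values
  options.max_retry = 3;
  return channel->Init(server, &options);
}
```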
diff --git a/demo-client/src/ctr_prediction.cpp b/demo-client/src/ctr_prediction.cpp
index 70b0c841227e411b70ef8c7a6263837804a83b55..92e82a36203e3f39ed871e9f5afc47b619527e90 100644
--- a/demo-client/src/ctr_prediction.cpp
+++ b/demo-client/src/ctr_prediction.cpp
@@ -30,11 +30,17 @@ using baidu::paddle_serving::predictor::ctr_prediction::Response;
 using baidu::paddle_serving::predictor::ctr_prediction::CTRReqInstance;
 using baidu::paddle_serving::predictor::ctr_prediction::CTRResInstance;
 
-int batch_size = 16;
 int sparse_num = 26;
 int dense_num = 13;
-int thread_num = 1;
 int hash_dim = 1000001;
+
+DEFINE_int32(batch_size, 50, "Set the batch size of test file.");
+DEFINE_int32(concurrency, 1, "Set the max concurrency of requests");
+DEFINE_int32(repeat, 1, "Number of data samples iteration count. Default 1");
+DEFINE_bool(enable_profiling,
+            false,
+            "Enable profiling. Will suppress a lot of normal output");
+
 std::vector cont_min = {0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 std::vector cont_diff = {
     20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50};
@@ -86,7 +92,7 @@ int64_t hash(std::string str) {
 
 int create_req(Request* req,
                const std::vector& data_list,
-               int data_index,
+               int start_index,
               int batch_size) {
   for (int i = 0; i < batch_size; ++i) {
     CTRReqInstance* ins = req->add_instances();
@@ -94,12 +100,14 @@
     if (!ins) {
       LOG(ERROR) << "Failed create req instance";
       return -1;
     }
+
     // add data
     // avoid out of boundary
-    int cur_index = data_index + i;
+    int cur_index = start_index + i;
     if (cur_index >= data_list.size()) {
       cur_index = cur_index % data_list.size();
     }
+
     std::vector feature_list = split(data_list[cur_index], "\t");
     for (int fi = 0; fi < dense_num; fi++) {
       if (feature_list[fi] == "") {
@@ -122,10 +130,10 @@
   }
   return 0;
 }
+
 void print_res(const Request& req,
                const Response& res,
                std::string route_tag,
-               uint64_t mid_ms,
               uint64_t elapse_ms) {
   if (res.err_code() != 0) {
     LOG(ERROR) << "Get result fail :" << res.err_msg();
@@ -138,72 +146,90 @@ void print_res(const Request& req,
     LOG(INFO) << "Receive result " << oss.str();
   }
   LOG(INFO) << "Succ call predictor[ctr_prediction_service], the tag is: "
-            << route_tag << ", mid_ms: " << mid_ms
-            << ", elapse_ms: " << elapse_ms;
+            << route_tag << ", elapse_ms: " << elapse_ms;
 }
 
 void thread_worker(PredictorApi* api,
                    int thread_id,
-                   int batch_size,
-                   int server_concurrency,
                    const std::vector& data_list) {
   // init
   Request req;
   Response res;
-  api->thrd_initialize();
   std::string line;
-  int turns = 0;
-  while (turns < 1000) {
-    timeval start;
-    gettimeofday(&start, NULL);
-    api->thrd_clear();
-    Predictor* predictor = api->fetch_predictor("ctr_prediction_service");
-    if (!predictor) {
-      LOG(ERROR) << "Failed fetch predictor: ctr_prediction_service";
-      return;
-    }
-    req.Clear();
-    res.Clear();
-    timeval mid;
-    gettimeofday(&mid, NULL);
-    uint64_t mid_ms = (mid.tv_sec * 1000 + mid.tv_usec / 1000) -
-                      (start.tv_sec * 1000 + start.tv_usec / 1000);
-    // wait for other thread
-    while (g_concurrency.load() >= server_concurrency) {
-    }
-    g_concurrency++;
-    LOG(INFO) << "Current concurrency " << g_concurrency.load();
-    int data_index = turns * batch_size;
-    if (create_req(&req, data_list, data_index, batch_size) != 0) {
-      return;
-    }
-    timeval start_run;
-    gettimeofday(&start_run, NULL);
-    if (predictor->inference(&req, &res) != 0) {
-      LOG(ERROR) << "failed call predictor with req:" << req.ShortDebugString();
-      return;
-    }
-    timeval end;
-    gettimeofday(&end, NULL);
-    uint64_t elapse_ms = (end.tv_sec * 1000 + end.tv_usec / 1000) -
-                         (start_run.tv_sec * 1000 + start_run.tv_usec / 1000);
-    response_time[thread_id].push_back(elapse_ms);
-    print_res(req, res, predictor->tag(), mid_ms, elapse_ms);
-    g_concurrency--;
-    LOG(INFO) << "Done. Current concurrency " << g_concurrency.load();
-    turns++;
-  }
-  //
+
+  api->thrd_initialize();
+
+  for (int i = 0; i < FLAGS_repeat; ++i) {
+    int start_index = 0;
+
+    while (true) {
+      if (start_index >= data_list.size()) {
+        break;
+      }
+
+      api->thrd_clear();
+
+      Predictor* predictor = api->fetch_predictor("ctr_prediction_service");
+      if (!predictor) {
+        LOG(ERROR) << "Failed fetch predictor: ctr_prediction_service";
+        return;
+      }
+
+      req.Clear();
+      res.Clear();
+
+      // wait for other thread
+      while (g_concurrency.load() >= FLAGS_concurrency) {
+      }
+      g_concurrency++;
+      LOG(INFO) << "Current concurrency " << g_concurrency.load();
+
+      if (create_req(&req, data_list, start_index, FLAGS_batch_size) != 0) {
+        return;
+      }
+      start_index += FLAGS_batch_size;
+      LOG(INFO) << "start_index = " << start_index;
+
+      timeval start;
+      gettimeofday(&start, NULL);
+
+      if (predictor->inference(&req, &res) != 0) {
+        LOG(ERROR) << "failed call predictor with req:"
+                   << req.ShortDebugString();
+        return;
+      }
+      g_concurrency--;
+
+      timeval end;
+      gettimeofday(&end, NULL);
+      uint64_t elapse_ms = (end.tv_sec * 1000 + end.tv_usec / 1000) -
+                           (start.tv_sec * 1000 + start.tv_usec / 1000);
+
+      response_time[thread_id].push_back(elapse_ms);
+
+      if (!FLAGS_enable_profiling) {
+        print_res(req, res, predictor->tag(), elapse_ms);
+      }
+
+      LOG(INFO) << "Done. Current concurrency " << g_concurrency.load();
+    }  // end while
+  }    // end for
+
   api->thrd_finalize();
 }
-void calc_time(int server_concurrency, int batch_size) {
+
+void calc_time() {
   std::vector time_list;
   for (auto a : response_time) {
     time_list.insert(time_list.end(), a.begin(), a.end());
   }
+
   LOG(INFO) << "Total request : " << (time_list.size());
-  LOG(INFO) << "Batch size : " << batch_size;
-  LOG(INFO) << "Max concurrency : " << server_concurrency;
+  LOG(INFO) << "Batch size : " << FLAGS_batch_size;
+  LOG(INFO) << "Max concurrency : " << FLAGS_concurrency;
+  LOG(INFO) << "enable_profiling: " << FLAGS_enable_profiling;
+  LOG(INFO) << "repeat count: " << FLAGS_repeat;
+
   float total_time = 0;
   float max_time = 0;
   float min_time = 1000000;
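Note on the rewritten thread_worker above: the fixed 1000-iteration loop is replaced by FLAGS_repeat passes over the data file, and only the inference() call itself is timed with a pair of gettimeofday() samples. A small helper equivalent to that inline arithmetic (sketch only; the client keeps the arithmetic inline rather than using a helper):

```cpp
// Sketch: millisecond timestamp helper matching the timing code above.
#include <sys/time.h>
#include <stdint.h>

static inline uint64_t now_ms() {
  timeval tv;
  gettimeofday(&tv, NULL);
  return static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
}

// Usage mirroring the loop body:
//   uint64_t start = now_ms();
//   predictor->inference(&req, &res);
//   uint64_t elapse_ms = now_ms() - start;
```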
Current concurrency " << g_concurrency.load(); - turns++; - } - // + + api->thrd_initialize(); + + for (int i = 0; i < FLAGS_repeat; ++i) { + int start_index = 0; + + while (true) { + if (start_index >= data_list.size()) { + break; + } + + api->thrd_clear(); + + Predictor* predictor = api->fetch_predictor("ctr_prediction_service"); + if (!predictor) { + LOG(ERROR) << "Failed fetch predictor: ctr_prediction_service"; + return; + } + + req.Clear(); + res.Clear(); + + // wait for other thread + while (g_concurrency.load() >= FLAGS_concurrency) { + } + g_concurrency++; + LOG(INFO) << "Current concurrency " << g_concurrency.load(); + + if (create_req(&req, data_list, start_index, FLAGS_batch_size) != 0) { + return; + } + start_index += FLAGS_batch_size; + LOG(INFO) << "start_index = " << start_index; + + timeval start; + gettimeofday(&start, NULL); + + if (predictor->inference(&req, &res) != 0) { + LOG(ERROR) << "failed call predictor with req:" + << req.ShortDebugString(); + return; + } + g_concurrency--; + + timeval end; + gettimeofday(&end, NULL); + uint64_t elapse_ms = (end.tv_sec * 1000 + end.tv_usec / 1000) - + (start.tv_sec * 1000 + start.tv_usec / 1000); + + response_time[thread_id].push_back(elapse_ms); + + if (!FLAGS_enable_profiling) { + print_res(req, res, predictor->tag(), elapse_ms); + } + + LOG(INFO) << "Done. Current concurrency " << g_concurrency.load(); + } // end while + } // end for + api->thrd_finalize(); } -void calc_time(int server_concurrency, int batch_size) { + +void calc_time() { std::vector time_list; for (auto a : response_time) { time_list.insert(time_list.end(), a.begin(), a.end()); } + LOG(INFO) << "Total request : " << (time_list.size()); - LOG(INFO) << "Batch size : " << batch_size; - LOG(INFO) << "Max concurrency : " << server_concurrency; + LOG(INFO) << "Batch size : " << FLAGS_batch_size; + LOG(INFO) << "Max concurrency : " << FLAGS_concurrency; + LOG(INFO) << "enable_profiling: " << FLAGS_enable_profiling; + LOG(INFO) << "repeat count: " << FLAGS_repeat; + float total_time = 0; float max_time = 0; float min_time = 1000000; @@ -212,21 +238,28 @@ void calc_time(int server_concurrency, int batch_size) { if (time_list[i] > max_time) max_time = time_list[i]; if (time_list[i] < min_time) min_time = time_list[i]; } + float mean_time = total_time / (time_list.size()); float var_time; for (int i = 0; i < time_list.size(); ++i) { var_time += (time_list[i] - mean_time) * (time_list[i] - mean_time); } var_time = var_time / time_list.size(); - LOG(INFO) << "Total time : " << total_time / server_concurrency - << " Variance : " << var_time << " Max time : " << max_time - << " Min time : " << min_time; + + LOG(INFO) << "Total time : " << total_time / FLAGS_concurrency << "ms"; + LOG(INFO) << "Variance : " << var_time << "ms"; + LOG(INFO) << "Max time : " << max_time << "ms"; + LOG(INFO) << "Min time : " << min_time << "ms"; + float qps = 0.0; - if (total_time > 0) - qps = (time_list.size() * 1000) / (total_time / server_concurrency); + if (total_time > 0) { + qps = (time_list.size() * 1000) / (total_time / FLAGS_concurrency); + } LOG(INFO) << "QPS: " << qps << "/s"; + LOG(INFO) << "Latency statistics: "; sort(time_list.begin(), time_list.end()); + int percent_pos_50 = time_list.size() * 0.5; int percent_pos_80 = time_list.size() * 0.8; int percent_pos_90 = time_list.size() * 0.9; @@ -244,11 +277,12 @@ void calc_time(int server_concurrency, int batch_size) { } } int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + // initialize 
diff --git a/demo-serving/CMakeLists.txt b/demo-serving/CMakeLists.txt
index 93650cf13a4e8c7fe3077e1780e15074081b2de0..82875d23d566cb97205cc2acb009f8da2642e460 100644
--- a/demo-serving/CMakeLists.txt
+++ b/demo-serving/CMakeLists.txt
@@ -59,7 +59,7 @@ target_link_libraries(serving kvdb rocksdb)
 if(WITH_GPU)
     target_link_libraries(serving ${CUDA_LIBRARIES})
 endif()
-target_link_libraries(serving -liomp5 -lmklml_intel -lpthread
+target_link_libraries(serving -liomp5 -lmklml_intel -lmkldnn -lpthread
         -lcrypto -lm -lrt -lssl -ldl -lz -lbz2)
 
 install(TARGETS serving
@@ -75,7 +75,10 @@
 install(FILES ${inc}
         DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/include/serving)
 
 if (${WITH_MKL})
-  install(FILES ${CMAKE_BINARY_DIR}/Paddle/third_party/install/mklml/lib/libmklml_intel.so
-        ${CMAKE_BINARY_DIR}/Paddle/third_party/install/mklml/lib/libiomp5.so DESTINATION
+  install(FILES
+        ${CMAKE_BINARY_DIR}/third_party/install/Paddle/third_party/install/mklml/lib/libmklml_intel.so
+        ${CMAKE_BINARY_DIR}/third_party/install/Paddle/third_party/install/mklml/lib/libiomp5.so
+        ${CMAKE_BINARY_DIR}/third_party/install/Paddle/third_party/install/mkldnn/lib/libmkldnn.so
+        DESTINATION
         ${PADDLE_SERVING_INSTALL_DIR}/demo/serving/bin)
 endif()
diff --git a/demo-serving/op/bert_service_op.h b/demo-serving/op/bert_service_op.h
index a0002aac65a5d1531e5eb023805635602a4dbb07..82c366237c2c263a85c2f7728301d8ec317594d3 100644
--- a/demo-serving/op/bert_service_op.h
+++ b/demo-serving/op/bert_service_op.h
@@ -21,7 +21,7 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #endif
 #else
-#include "paddle/fluid/inference/paddle_inference_api.h"
+#include "./paddle_inference_api.h"
 #endif
 
 #include "demo-serving/bert_service.pb.h"
diff --git a/demo-serving/op/classify_op.h b/demo-serving/op/classify_op.h
index 366793cc7c1ec38197912399b06b6e9e8db8e996..c381f032d0e3a7e19e1a711b1ebe1747ee3145d8 100644
--- a/demo-serving/op/classify_op.h
+++ b/demo-serving/op/classify_op.h
@@ -21,7 +21,7 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #endif
 #else
-#include "paddle/fluid/inference/paddle_inference_api.h"
+#include "paddle_inference_api.h"  // NOLINT
 #endif
 
 #include "demo-serving/image_class.pb.h"
"paddle_inference_api.h" // NOLINT #endif #include "demo-serving/image_class.pb.h" diff --git a/demo-serving/op/ctr_prediction_op.cpp b/demo-serving/op/ctr_prediction_op.cpp index a904562b6b303134d5198fbbe01ad2cb79c4ba97..b2166819a2a6b213ae008580349e870e97797984 100644 --- a/demo-serving/op/ctr_prediction_op.cpp +++ b/demo-serving/op/ctr_prediction_op.cpp @@ -23,6 +23,9 @@ #include "predictor/framework/kv_manager.h" #include "predictor/framework/memory.h" +// Flag where enable profiling mode +DECLARE_bool(enable_ctr_profiling); + namespace baidu { namespace paddle_serving { namespace serving { @@ -46,6 +49,11 @@ const int CTR_PREDICTION_DENSE_SLOT_ID = 26; const int CTR_PREDICTION_DENSE_DIM = 13; const int CTR_PREDICTION_EMBEDDING_SIZE = 10; +bthread::Mutex CTRPredictionOp::mutex_; +int64_t CTRPredictionOp::cube_time_us_ = 0; +int32_t CTRPredictionOp::cube_req_num_ = 0; +int32_t CTRPredictionOp::cube_req_key_num_ = 0; + void fill_response_with_message(Response *response, int err_code, std::string err_msg) { @@ -135,7 +143,41 @@ int CTRPredictionOp::inference() { return 0; } else if (kvinfo->sparse_param_service_type == configure::EngineDesc::REMOTE) { - int ret = cube->seek(table_name, keys, &values); + struct timeval start; + struct timeval end; + + int ret; + + gettimeofday(&start, NULL); + ret = cube->seek(table_name, keys, &values); + gettimeofday(&end, NULL); + uint64_t usec = + end.tv_sec * 1e6 + end.tv_usec - start.tv_sec * 1e6 - start.tv_usec; + + // Statistics + mutex_.lock(); + cube_time_us_ += usec; + ++cube_req_num_; + cube_req_key_num_ += keys.size(); + + if (cube_req_num_ >= 1000) { + LOG(INFO) << "Cube request count: " << cube_req_num_; + LOG(INFO) << "Cube request key count: " << cube_req_key_num_; + LOG(INFO) << "Cube request total time: " << cube_time_us_ << "us"; + LOG(INFO) << "Average " + << static_cast(cube_time_us_) / cube_req_num_ + << "us/req"; + LOG(INFO) << "Average " + << static_cast(cube_time_us_) / cube_req_key_num_ + << "us/key"; + + cube_time_us_ = 0; + cube_req_num_ = 0; + cube_req_key_num_ = 0; + } + mutex_.unlock(); + // Statistics end + if (ret != 0) { fill_response_with_message(res, -1, "Query cube for embeddings error"); LOG(ERROR) << "Query cube for embeddings error"; diff --git a/demo-serving/op/ctr_prediction_op.h b/demo-serving/op/ctr_prediction_op.h index a12cccab68c06c2238e7205b90b095318b28f3f0..ee648151b4ecf4611502798308c2cd81db923bb3 100644 --- a/demo-serving/op/ctr_prediction_op.h +++ b/demo-serving/op/ctr_prediction_op.h @@ -21,7 +21,7 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #endif #else -#include "paddle/fluid/inference/paddle_inference_api.h" +#include "paddle_inference_api.h" // NOLINT #endif #include "demo-serving/ctr_prediction.pb.h" @@ -55,6 +55,7 @@ static const char* CTR_PREDICTION_MODEL_NAME = "ctr_prediction"; * and modifications we made * */ + class CTRPredictionOp : public baidu::paddle_serving::predictor::OpWithChannel< baidu::paddle_serving::predictor::ctr_prediction::Response> { @@ -64,6 +65,12 @@ class CTRPredictionOp DECLARE_OP(CTRPredictionOp); int inference(); + + private: + static bthread::Mutex mutex_; + static int64_t cube_time_us_; + static int32_t cube_req_num_; + static int32_t cube_req_key_num_; }; } // namespace serving diff --git a/demo-serving/op/reader_op.h b/demo-serving/op/reader_op.h index 484d6f62d0f64bd3efbf7f8de3b4068a344ae048..546ca19667af0161ddb62f354e32791d15d8ae4b 100644 --- a/demo-serving/op/reader_op.h +++ b/demo-serving/op/reader_op.h @@ -35,7 +35,7 @@ #include 
"paddle/fluid/inference/api/paddle_inference_api.h" #endif #else -#include "paddle/fluid/inference/paddle_inference_api.h" +#include "paddle_inference_api.h" // NOLINT #endif namespace baidu { diff --git a/demo-serving/op/text_classification_op.h b/demo-serving/op/text_classification_op.h index bef8ec520dc45e97def913715d714e2c46067429..21ac6991be1b47654618c52c4123a5f99f4bc185 100644 --- a/demo-serving/op/text_classification_op.h +++ b/demo-serving/op/text_classification_op.h @@ -21,7 +21,7 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #endif #else -#include "paddle/fluid/inference/paddle_inference_api.h" +#include "paddle_inference_api.h" // NOLINT #endif #include "demo-serving/text_classification.pb.h" diff --git a/doc/CTR_PREDICTION.md b/doc/CTR_PREDICTION.md index 746b6360f3fb326f3a63561bab1a2163bf30da67..513b4560f025a08f3fc2ffe9a7fb96ada0b076c5 100755 --- a/doc/CTR_PREDICTION.md +++ b/doc/CTR_PREDICTION.md @@ -320,7 +320,7 @@ def prune_program(): ### 2.5 裁剪过程串到一起 -我们提供了完整的裁剪CTR预估模型的脚本文件save_program.py,同[CTR分布式训练任务](doc/DISTRIBUTED_TRAINING_AND_SERVING.md)一起发布,可以在trainer和pserver容器的训练脚本目录下找到 +我们提供了完整的裁剪CTR预估模型的脚本文件save_program.py,同[CTR分布式训练和Serving流程化部署](https://github.com/PaddlePaddle/Serving/blob/master/doc/DEPLOY.md)一起发布,可以在trainer和pserver容器的训练脚本目录下找到,也可以在[这里](https://github.com/PaddlePaddle/Serving/tree/master/doc/resource)下载。 ## 3. 整个预测计算流程 diff --git a/inferencer-fluid-cpu/include/fluid_cpu_engine.h b/inferencer-fluid-cpu/include/fluid_cpu_engine.h index 41ab63992361f70f1434efe2c4982342e34b5525..a7ba3ae13bfa040f3a6f4b05d8d5cd6b93ae36b5 100644 --- a/inferencer-fluid-cpu/include/fluid_cpu_engine.h +++ b/inferencer-fluid-cpu/include/fluid_cpu_engine.h @@ -28,7 +28,7 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #endif #else -#include "paddle/fluid/inference/paddle_inference_api.h" +#include "paddle_inference_api.h" // NOLINT #endif #include "predictor/framework/infer.h" @@ -134,8 +134,7 @@ class FluidCpuAnalysisCore : public FluidFamilyCore { analysis_config.SetCpuMathLibraryNumThreads(1); if (params.enable_memory_optimization()) { - analysis_config.EnableMemoryOptim(params.static_optimization(), - params.force_update_static_cache()); + analysis_config.EnableMemoryOptim(); } analysis_config.SwitchSpecifyInputNames(true); @@ -200,8 +199,7 @@ class FluidCpuAnalysisDirCore : public FluidFamilyCore { analysis_config.SetCpuMathLibraryNumThreads(1); if (params.enable_memory_optimization()) { - analysis_config.EnableMemoryOptim(params.static_optimization(), - params.force_update_static_cache()); + analysis_config.EnableMemoryOptim(); } AutoLock lock(GlobalPaddleCreateMutex::instance()); @@ -519,8 +517,7 @@ class FluidCpuAnalysisDirWithSigmoidCore : public FluidCpuWithSigmoidCore { analysis_config.SetCpuMathLibraryNumThreads(1); if (params.enable_memory_optimization()) { - analysis_config.EnableMemoryOptim(params.static_optimization(), - params.force_update_static_cache()); + analysis_config.EnableMemoryOptim(); } AutoLock lock(GlobalPaddleCreateMutex::instance()); diff --git a/inferencer-fluid-gpu/include/fluid_gpu_engine.h b/inferencer-fluid-gpu/include/fluid_gpu_engine.h index 81c20d69121b19e0f43b03630c476dc8c2ae3d4f..667380f7205655a06bedde899168bda6c37d10b1 100644 --- a/inferencer-fluid-gpu/include/fluid_gpu_engine.h +++ b/inferencer-fluid-gpu/include/fluid_gpu_engine.h @@ -28,7 +28,7 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #endif #else -#include "paddle/fluid/inference/paddle_inference_api.h" +#include "paddle_inference_api.h" 
diff --git a/inferencer-fluid-gpu/include/fluid_gpu_engine.h b/inferencer-fluid-gpu/include/fluid_gpu_engine.h
index 81c20d69121b19e0f43b03630c476dc8c2ae3d4f..667380f7205655a06bedde899168bda6c37d10b1 100644
--- a/inferencer-fluid-gpu/include/fluid_gpu_engine.h
+++ b/inferencer-fluid-gpu/include/fluid_gpu_engine.h
@@ -28,7 +28,7 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #endif
 #else
-#include "paddle/fluid/inference/paddle_inference_api.h"
+#include "paddle_inference_api.h"  // NOLINT
 #endif
 
 #include "predictor/framework/infer.h"
@@ -136,8 +136,7 @@ class FluidGpuAnalysisCore : public FluidFamilyCore {
     analysis_config.SetCpuMathLibraryNumThreads(1);
 
     if (params.enable_memory_optimization()) {
-      analysis_config.EnableMemoryOptim(params.static_optimization(),
-                                        params.force_update_static_cache());
+      analysis_config.EnableMemoryOptim();
     }
 
     analysis_config.SwitchSpecifyInputNames(true);
@@ -203,8 +202,7 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
     analysis_config.SwitchIrOptim(true);
 
     if (params.enable_memory_optimization()) {
-      analysis_config.EnableMemoryOptim(params.static_optimization(),
-                                        params.force_update_static_cache());
+      analysis_config.EnableMemoryOptim();
     }
 
     AutoLock lock(GlobalPaddleCreateMutex::instance());
@@ -522,8 +520,7 @@ class FluidGpuAnalysisDirWithSigmoidCore : public FluidGpuWithSigmoidCore {
     analysis_config.SetCpuMathLibraryNumThreads(1);
 
     if (params.enable_memory_optimization()) {
-      analysis_config.EnableMemoryOptim(params.static_optimization(),
-                                        params.force_update_static_cache());
+      analysis_config.EnableMemoryOptim();
     }
 
     AutoLock lock(GlobalPaddleCreateMutex::instance());
diff --git a/predictor/common/constant.h b/predictor/common/constant.h
index da44103eb8e6d064a642520bb90dd2c9df293889..72509c8d9187f817cf4dd0dfef1bff06370ce537 100644
--- a/predictor/common/constant.h
+++ b/predictor/common/constant.h
@@ -40,8 +40,6 @@ DECLARE_int32(reload_interval_s);
 DECLARE_bool(enable_model_toolkit);
 DECLARE_string(enable_protocol_list);
 DECLARE_bool(enable_cube);
-DECLARE_string(cube_config_path);
-DECLARE_string(cube_config_file);
 
 // STATIC Variables
 extern const char* START_OP_NAME;
diff --git a/predictor/framework/infer.h b/predictor/framework/infer.h
index c2823f5e3d8cbd2484f02053ffd36e6a3a275846..c479479a271601b0d197d7f4fc4672ccc54c3801 100644
--- a/predictor/framework/infer.h
+++ b/predictor/framework/infer.h
@@ -632,7 +632,6 @@ class VersionedInferEngine : public InferEngine {
       LOG(ERROR) << "Failed thrd clear version engine: " << iter->first;
       return -1;
     }
-    LOG(INFO) << "Succ thrd clear version engine: " << iter->first;
   }
   return 0;
 }
diff --git a/predictor/framework/resource.cpp b/predictor/framework/resource.cpp
index 74e3c95204dfb4fb0dcf32201c244550b6df08c2..15a5022d69458eae76c6b3f75ab3076d365ed333 100644
--- a/predictor/framework/resource.cpp
+++ b/predictor/framework/resource.cpp
@@ -208,7 +208,6 @@ int Resource::thread_clear() {
     return -1;
   }
 
-  LOG(INFO) << bthread_self() << "Resource::thread_clear success";
   // ...
   return 0;
 }
diff --git a/predictor/src/pdserving.cpp b/predictor/src/pdserving.cpp
index a86b39abac7bd007a8fd401bd9a0b8aaaa5c5114..56ffee84aba6338bcd082d12e6bd4c304fe8ca80 100644
--- a/predictor/src/pdserving.cpp
+++ b/predictor/src/pdserving.cpp
@@ -51,8 +51,6 @@ using baidu::paddle_serving::predictor::FLAGS_port;
 using baidu::paddle_serving::configure::InferServiceConf;
 using baidu::paddle_serving::configure::read_proto_conf;
 
-DECLARE_bool(logtostderr);
-
 void print_revision(std::ostream& os, void*) {
 #if defined(PDSERVING_VERSION)
   os << PDSERVING_VERSION;
@@ -217,7 +215,8 @@ int main(int argc, char** argv) {
   }
   LOG(INFO) << "Succ initialize cube";
 
-  FLAGS_logtostderr = false;
+  // FATAL messages are output to stderr
+  FLAGS_stderrthreshold = 3;
 
   if (ServerManager::instance().start_and_wait() != 0) {
     LOG(ERROR) << "Failed start server and wait!";
diff --git a/sdk-cpp/src/endpoint.cpp b/sdk-cpp/src/endpoint.cpp
index d1c66124c6e7657db23905eb681bfa0b957be9d2..3a30a0de6465512e647321c07637692599f1890b 100644
--- a/sdk-cpp/src/endpoint.cpp
+++ b/sdk-cpp/src/endpoint.cpp
@@ -64,7 +64,6 @@ int Endpoint::thrd_clear() {
       return -1;
     }
   }
-  LOG(INFO) << "Succ thrd clear all vars: " << var_size;
   return 0;
 }
diff --git a/sdk-cpp/src/predictor_sdk.cpp b/sdk-cpp/src/predictor_sdk.cpp
index 214473f64204866febb7d842b53551aa1cfe225d..246ac66f2d07f3c1becd7ab6c05be929c5003a03 100644
--- a/sdk-cpp/src/predictor_sdk.cpp
+++ b/sdk-cpp/src/predictor_sdk.cpp
@@ -94,8 +94,6 @@ int PredictorApi::thrd_clear() {
       LOG(ERROR) << "Failed thrd clear endpoint:" << it->first;
       return -1;
     }
-
-    LOG(INFO) << "Succ thrd clear endpoint:" << it->first;
   }
   return 0;
 }
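Note on the pdserving.cpp change above: instead of forcing FLAGS_logtostderr = false, the server now raises the stderr threshold so only FATAL messages reach the terminal while lower severities still go to the log files. Assuming glog-style logging flags (which is where FLAGS_stderrthreshold and the removed FLAGS_logtostderr come from), the literal 3 maps to the following severity scale:

```cpp
// Sketch: the severity scale behind FLAGS_stderrthreshold = 3.
#include <glog/logging.h>

void quiet_stderr_logging() {
  // 0 = INFO, 1 = WARNING, 2 = ERROR, 3 = FATAL. Messages below the
  // threshold still go to the log files, just not to stderr.
  FLAGS_stderrthreshold = google::GLOG_FATAL;  // same effect as the literal 3
}
```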