diff --git a/BCLOUD.gpu b/BCLOUD.gpu index 3d4fd0a7eb030cf98ae1329639bc179633abe90a..52338adbee72e01683e8d72b5ff72e26161650cc 100644 --- a/BCLOUD.gpu +++ b/BCLOUD.gpu @@ -87,6 +87,12 @@ HEADERS('predictor/op/*.h', '$INC/predictor/op') StaticLibrary('pdserving', Sources(GLOB(' '.join(predictor_sources)), 'predictor/src/pdserving.cpp')) # Sub directory + +# inferencer-fluid-cpu +INCPATHS('.') +inferencer_fluid_cpu_sources = 'inferencer-fluid-cpu/src/fluid_cpu_engine.cpp' +StaticLibrary('inferencer-fluid-cpu', Sources(inferencer_fluid_cpu_sources)) + Directory('inferencer-fluid-gpu') Directory('sdk-cpp') @@ -100,16 +106,17 @@ HEADERS(GLOB_GEN_SRCS('predictor/proto/builtin_format.pb.h'), '$INC/sdk-cpp') HEADERS(GLOB_GEN_SRCS('predictor/proto/pds_option.pb.h'), '$INC/sdk-cpp') HEADERS(GLOB_GEN_SRCS('demo-client/proto/*.pb.h'), '$INC/sdk-cpp') -#Application('ximage', Sources('demo-client/src/ximage.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a')) -#Application('echo', Sources('demo-client/src/echo.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a')) -#Application('dense_format', Sources('demo-client/src/dense_format.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a')) -#Application('sparse_format', Sources('demo-client/src/sparse_format.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a')) -#Application('int64tensor_format', Sources('demo-client/src/int64tensor_format.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a')) -#Application('text_classification', Sources('demo-client/src/text_classification.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a')) -#Application('text_classification_press', Sources('demo-client/src/text_classification_press.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a')) - -#OUTPUT('demo-client/conf', '$OUT/demo/client') -#OUTPUT('demo-client/data', '$OUT/demo/client') +LIBS('$OUT/lib/libpdconfigure.a') +Application('ximage', Sources('demo-client/src/ximage.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a'), Libs('$OUT/lib/libpdconfigure.a')) +Application('ximage_press', Sources('demo-client/src/ximage_press.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a'), Libs('$OUT/lib/libpdconfigure.a')) +Application('echo', Sources('demo-client/src/echo.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a'), Libs('$OUT/lib/libpdconfigure.a')) +Application('dense_format', Sources('demo-client/src/dense_format.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a'), Libs('$OUT/lib/libpdconfigure.a')) +Application('sparse_format', Sources('demo-client/src/sparse_format.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a'), Libs('$OUT/lib/libpdconfigure.a')) +Application('int64tensor_format', Sources('demo-client/src/int64tensor_format.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a'), Libs('$OUT/lib/libpdconfigure.a')) +Application('text_classification', Sources('demo-client/src/text_classification.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a')) +Application('text_classification_press', Sources('demo-client/src/text_classification_press.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a'), Libs('$OUT/lib/libpdconfigure.a')) +OUTPUT('demo-client/conf', '$OUT/demo/client') +OUTPUT('demo-client/data', '$OUT/demo/client') # demo-serving INCPATHS('$INC') @@ -130,7 +137,7 @@ serving_sources.append('demo-serving/op/*.cpp') serving_sources.append('demo-serving/proto/*.proto') HEADERS(GLOB_GEN_SRCS('demo-serving/proto/*.pb.h'), '$INC/demo-serving') -Application('serving', Sources(GLOB(' '.join(serving_sources))), WholeArchives('$OUT/lib/libinferencer-fluid-gpu.a $OUT/lib/libpdserving.a $OUT/lib/libpdconfigure.a')) +Application('serving', 
Sources(GLOB(' '.join(serving_sources))), WholeArchives('$OUT/lib/libinferencer-fluid-gpu.a $OUT/lib/libinferencer-fluid-cpu.a $OUT/lib/libpdserving.a $OUT/lib/libpdconfigure.a')) OUTPUT('demo-serving/conf', '$OUT/demo/serving/') OUTPUT('demo-serving/data', '$OUT/demo/serving') diff --git a/CMakeLists.txt b/CMakeLists.txt index 33465d55a70ca00ac08f251d316fa45be69ca202..d01be6d74f8c0b12b8b50e5ec3e63795a9ef4a09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " find_package(Git REQUIRED) find_package(Threads REQUIRED) +find_package(CUDA QUIET) include(simd) @@ -43,10 +44,10 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING set(THIRD_PARTY_BUILD_TYPE Release) -option(WITH_AVX "Compile Paddle Serving with AVX intrinsics" ${AVX_FOUND}) -option(WITH_MKL "Compile Paddle Serving with MKL support." ${AVX_FOUND}) -option(CLIENT_ONLY "Compile client libraries and demos only" - FALSE) +option(WITH_AVX "Compile Paddle Serving with AVX intrinsics" ${AVX_FOUND}) +option(WITH_MKL "Compile Paddle Serving with MKL support." ${AVX_FOUND}) +option(WITH_GPU "Compile Paddle Serving with NVIDIA GPU" ${CUDA_FOUND}) +option(CLIENT_ONLY "Compile client libraries and demos only" FALSE) set(WITH_MKLML ${WITH_MKL}) if (NOT DEFINED WITH_MKLDNN) @@ -108,5 +109,8 @@ add_subdirectory(demo-client) if (NOT CLIENT_ONLY) add_subdirectory(predictor) add_subdirectory(inferencer-fluid-cpu) +if (WITH_GPU) +add_subdirectory(inferencer-fluid-gpu) +endif() add_subdirectory(demo-serving) endif() diff --git a/README.md b/README.md index aa06fa20dab45df342326669de758727e1404ca6..5e3093c05627d9ff2108cb6cadeec5d9599fc89f 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ # 概述 -PaddlePaddle是公司开源的机器学习框架,广泛支持各种深度学习模型的定制化开发; Paddle serving是Paddle的在线预测部分,与Paddle模型训练环节无缝衔接,提供机器学习预测云服务。 +PaddlePaddle是百度开源的机器学习框架,广泛支持各种深度学习模型的定制化开发; Paddle serving是Paddle的在线预测部分,与Paddle模型训练环节无缝衔接,提供机器学习预测云服务。 # 框架简介 -![图片](https://paddle-serving.bj.bcebos.com/doc/framework.png) +![图片](doc/framework.png) - 基础框架:屏蔽一个RPC服务所需的所有元素,让用户只关注自己的业务算子的开发; - 业务框架:基于Protobuf定制请求接口,基于有限DAG定制业务逻辑,并行化调度; diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake new file mode 100644 index 0000000000000000000000000000000000000000..ef4192ecc98ea6de0c81c1f33320528d547b818a --- /dev/null +++ b/cmake/cuda.cmake @@ -0,0 +1,231 @@ +if(NOT WITH_GPU) + return() +endif() + +set(paddle_known_gpu_archs "30 35 50 52 60 61 70") +set(paddle_known_gpu_archs7 "30 35 50 52") +set(paddle_known_gpu_archs8 "30 35 50 52 60 61") +set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70") +set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75") + +###################################################################################### +# A function for automatic detection of GPUs installed (if autodetection is enabled) +# Usage: +# detect_installed_gpus(out_variable) +function(detect_installed_gpus out_variable) + if(NOT CUDA_gpu_detect_output) + set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) + + file(WRITE ${cufile} "" + "#include \n" + "int main() {\n" + " int count = 0;\n" + " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" + " if (count == 0) return -1;\n" + " for (int device = 0; device < count; ++device) {\n" + " cudaDeviceProp prop;\n" + " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" + " std::printf(\"%d.%d \", prop.major, prop.minor);\n" + " }\n" + " return 0;\n" + "}\n") + + execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}" + "--run" "${cufile}" 
+ WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(nvcc_res EQUAL 0) + # only keep the last line of nvcc_out + STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}") + STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}") + list(GET nvcc_out -1 nvcc_out) + string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}") + set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE) + endif() + endif() + + if(NOT CUDA_gpu_detect_output) + message(STATUS "Automatic GPU detection failed. Building for all known architectures.") + set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE) + else() + set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) + endif() +endfunction() + + +######################################################################## +# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME +# Usage: +# select_nvcc_arch_flags(out_variable) +function(select_nvcc_arch_flags out_variable) + # List of arch names + set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") + set(archs_name_default "All") + list(APPEND archs_names "Auto") + + # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui) + set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.") + set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} ) + mark_as_advanced(CUDA_ARCH_NAME) + + # verify CUDA_ARCH_NAME value + if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};") + string(REPLACE ";" ", " archs_names "${archs_names}") + message(FATAL_ERROR "Only ${archs_names} architeture names are supported.") + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Manual") + set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) + else() + unset(CUDA_ARCH_BIN CACHE) + unset(CUDA_ARCH_PTX CACHE) + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Kepler") + set(cuda_arch_bin "30 35") + elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") + set(cuda_arch_bin "50") + elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") + set(cuda_arch_bin "60 61") + elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") + set(cuda_arch_bin "70") + elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") + set(cuda_arch_bin "75") + elseif(${CUDA_ARCH_NAME} STREQUAL "All") + set(cuda_arch_bin ${paddle_known_gpu_archs}) + elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") + detect_installed_gpus(cuda_arch_bin) + else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") + set(cuda_arch_bin ${CUDA_ARCH_BIN}) + endif() + + # remove dots and convert to lists + string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX REPLACE "\\." 
"" cuda_arch_ptx "${CUDA_ARCH_PTX}") + string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") + list(REMOVE_DUPLICATES cuda_arch_bin) + list(REMOVE_DUPLICATES cuda_arch_ptx) + + set(nvcc_flags "") + set(nvcc_archs_readable "") + + # Tell NVCC to add binaries for the specified GPUs + foreach(arch ${cuda_arch_bin}) + if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)") + # User explicitly specified PTX for the concrete BIN + list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) + list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1}) + else() + # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch}) + list(APPEND nvcc_archs_readable sm_${arch}) + endif() + endforeach() + + # Tell NVCC to add PTX intermediate code for the specified architectures + foreach(arch ${cuda_arch_ptx}) + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch}) + list(APPEND nvcc_archs_readable compute_${arch}) + endforeach() + + string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}") + set(${out_variable} ${nvcc_flags} PARENT_SCOPE) + set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE) +endfunction() + +message(STATUS "CUDA detected: " ${CUDA_VERSION}) +if (${CUDA_VERSION} LESS 7.0) + set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) + add_definitions("-DPADDLE_CUDA_BINVER=\"60\"") +elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"70\"") +elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + # CUDA 8 may complain that sm_20 is no longer supported. Suppress the + # warning for now. + list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") + add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") +elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs9}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"90\"") +elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"100\"") +endif() + +include_directories(${CUDA_INCLUDE_DIRS}) +list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) +if(NOT WITH_DSO) + # TODO(panyx0718): CUPTI only allows DSO? + list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) + if(WIN32) + set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) + endif(WIN32) +endif(NOT WITH_DSO) + +# setting nvcc arch flags +select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) +list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) +message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") + +# Set C++11 support +set(CUDA_PROPAGATE_HOST_FLAGS OFF) + +# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. +# So, don't set these flags here. 
+if (NOT WIN32) # windows msvc2015 support c++11 natively. +# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake. +list(APPEND CUDA_NVCC_FLAGS "-std=c++11") +list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") +endif(NOT WIN32) + +if(WITH_FAST_MATH) + # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") +endif() +# in cuda9, suppress cuda warning on eigen +list(APPEND CUDA_NVCC_FLAGS "-w") +# Set :expt-relaxed-constexpr to suppress Eigen warnings +list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") + +if (NOT WIN32) + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) + elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) + elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) + elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + # nvcc 9 does not support -Os. Use Release flags instead + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) + endif() +else(NOT WIN32) + list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"") + list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS "-g -G") + # match the cl's _ITERATOR_DEBUG_LEVEL + list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG") + elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") + else() + message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.") +endif() +endif(NOT WIN32) + +mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) +mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) diff --git a/cmake/paddlepaddle.cmake b/cmake/paddlepaddle.cmake index 5a490c8eddb6b7fbb0dc1490769ea2db7f0f50b2..4caed8c2494338667d03c08d2a62ccaf3577cc7c 100644 --- a/cmake/paddlepaddle.cmake +++ b/cmake/paddlepaddle.cmake @@ -24,13 +24,15 @@ INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/Paddle/fluid_install_dir) # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog") +message( "WITH_GPU = ${WITH_GPU}") + # If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF ExternalProject_Add( extern_paddle ${EXTERNAL_PROJECT_LOG_ARGS} # TODO(wangguibao): change to de newst repo when they changed. 
GIT_REPOSITORY "https://github.com/PaddlePaddle/Paddle" - GIT_TAG "v1.3.0" + GIT_TAG "v1.4.1" PREFIX ${PADDLE_SOURCES_DIR} UPDATE_COMMAND "" BINARY_DIR ${CMAKE_BINARY_DIR}/Paddle @@ -47,7 +49,7 @@ ExternalProject_Add( -DWITH_MKL=${WITH_MKL} -DWITH_AVX=${WITH_AVX} -DWITH_MKLDNN=OFF - -DWITH_GPU=OFF + -DWITH_GPU=${WITH_GPU} -DWITH_FLUID_ONLY=ON -DWITH_TESTING=OFF -DWITH_DISTRIBUTE=OFF diff --git a/demo-client/BCLOUD b/demo-client/BCLOUD index 308556585ed9a8811e95c7d47623e3279e0e0e9e..7f208dea6795c3cae11d4cdea84079c44cff758f 100644 --- a/demo-client/BCLOUD +++ b/demo-client/BCLOUD @@ -46,6 +46,7 @@ HEADERS('include/*.hpp', '$INC/sdk-cpp/include') # Application #bin Application('ximage', Sources('src/ximage.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a')) +Application('ximage_press', Sources('src/ximage_press.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a')) Application('echo', Sources('src/echo.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a')) Application('dense_format', Sources('src/dense_format.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a')) Application('sparse_format', Sources('src/sparse_format.cpp'), WholeArchives('$OUT/lib/libpdsdk-cpp.a')) diff --git a/demo-client/CMakeLists.txt b/demo-client/CMakeLists.txt index e30ca0ead42d93f3613f8702f36013316499047f..5e7208090ca4c47f724be38e92b8685684367501 100644 --- a/demo-client/CMakeLists.txt +++ b/demo-client/CMakeLists.txt @@ -20,6 +20,11 @@ target_link_libraries(ximage -Wl,--whole-archive sdk-cpp -Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl -lz) +add_executable(ximage_press ${CMAKE_CURRENT_LIST_DIR}/src/ximage_press.cpp) +target_link_libraries(ximage_press -Wl,--whole-archive sdk-cpp + -Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl + -lz) + add_executable(echo ${CMAKE_CURRENT_LIST_DIR}/src/echo.cpp) target_link_libraries(echo -Wl,--whole-archive sdk-cpp -Wl,--no-whole-archive -lpthread -lcrypto -lm -lrt -lssl -ldl @@ -51,6 +56,9 @@ target_link_libraries(text_classification_press -Wl,--whole-archive sdk-cpp -Wl, install(TARGETS ximage RUNTIME DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/demo/client/image_classification/bin) +install(TARGETS ximage_press + RUNTIME DESTINATION + ${PADDLE_SERVING_INSTALL_DIR}/demo/client/image_classification/bin) install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/conf DESTINATION ${PADDLE_SERVING_INSTALL_DIR}/demo/client/image_classification/) install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/data/images DESTINATION diff --git a/demo-client/data/images/ILSVRC2012_val_00000001.jpeg b/demo-client/data/images/ILSVRC2012_val_00000001.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..fcd0de8866f404b87c2881435efe8aa2372f7b36 Binary files /dev/null and b/demo-client/data/images/ILSVRC2012_val_00000001.jpeg differ diff --git a/demo-client/data/images/ILSVRC2012_val_00000002.jpeg b/demo-client/data/images/ILSVRC2012_val_00000002.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..34476b6e5b2a1ef892f4957e8a5d7b54449aceb1 Binary files /dev/null and b/demo-client/data/images/ILSVRC2012_val_00000002.jpeg differ diff --git a/demo-client/data/images/ILSVRC2012_val_00000003.jpeg b/demo-client/data/images/ILSVRC2012_val_00000003.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..95438c5f18dace758c6a62cd0955c4212cfb1c1e Binary files /dev/null and b/demo-client/data/images/ILSVRC2012_val_00000003.jpeg differ diff --git a/demo-client/data/images/ILSVRC2012_val_00000004.jpeg b/demo-client/data/images/ILSVRC2012_val_00000004.jpeg new file mode 100644 
index 0000000000000000000000000000000000000000..d6c3df86e0a15baba0833b4b7a9c996526e5ac79 Binary files /dev/null and b/demo-client/data/images/ILSVRC2012_val_00000004.jpeg differ diff --git a/demo-client/data/images/ILSVRC2012_val_00000005.jpeg b/demo-client/data/images/ILSVRC2012_val_00000005.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..166f2897e3965f86a0711bce1f929e63c6dd900e Binary files /dev/null and b/demo-client/data/images/ILSVRC2012_val_00000005.jpeg differ diff --git a/demo-client/data/images/ILSVRC2012_val_00000006.jpeg b/demo-client/data/images/ILSVRC2012_val_00000006.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..2f589369058e002d823c8d9c783311a576f9ebdb Binary files /dev/null and b/demo-client/data/images/ILSVRC2012_val_00000006.jpeg differ diff --git a/demo-client/data/images/ILSVRC2012_val_00000007.jpeg b/demo-client/data/images/ILSVRC2012_val_00000007.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..a043ebf28db967c6e1aefb0913fb1540762e3ad0 Binary files /dev/null and b/demo-client/data/images/ILSVRC2012_val_00000007.jpeg differ diff --git a/demo-client/data/images/ILSVRC2012_val_00000008.jpeg b/demo-client/data/images/ILSVRC2012_val_00000008.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..4844c969cc8f4c16adc0809e566f80af62048ed6 Binary files /dev/null and b/demo-client/data/images/ILSVRC2012_val_00000008.jpeg differ diff --git a/demo-client/data/images/ILSVRC2012_val_00000009.jpeg b/demo-client/data/images/ILSVRC2012_val_00000009.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..592eb3a5022ad5df1333dac2935a4cc4ff156aaa Binary files /dev/null and b/demo-client/data/images/ILSVRC2012_val_00000009.jpeg differ diff --git a/demo-client/data/images/ILSVRC2012_val_00000010.jpeg b/demo-client/data/images/ILSVRC2012_val_00000010.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..6cad056bcf3cd66b586248455073b66af0e9afb5 Binary files /dev/null and b/demo-client/data/images/ILSVRC2012_val_00000010.jpeg differ diff --git a/demo-client/data/images/val.txt b/demo-client/data/images/val.txt new file mode 100644 index 0000000000000000000000000000000000000000..da362d6145f3ad9afb5ba2c02b6dd4f6b758ac55 --- /dev/null +++ b/demo-client/data/images/val.txt @@ -0,0 +1,10 @@ +ILSVRC2012_val_00000001.JPEG 65 +ILSVRC2012_val_00000002.JPEG 970 +ILSVRC2012_val_00000003.JPEG 230 +ILSVRC2012_val_00000004.JPEG 809 +ILSVRC2012_val_00000005.JPEG 516 +ILSVRC2012_val_00000006.JPEG 57 +ILSVRC2012_val_00000007.JPEG 334 +ILSVRC2012_val_00000008.JPEG 415 +ILSVRC2012_val_00000009.JPEG 674 +ILSVRC2012_val_00000010.JPEG 332 diff --git a/demo-client/src/ximage_press.cpp b/demo-client/src/ximage_press.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5a71508e79d8d32598b88be4ad26373ef743850a --- /dev/null +++ b/demo-client/src/ximage_press.cpp @@ -0,0 +1,300 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include +#include // NOLINT +#include "sdk-cpp/builtin_format.pb.h" +#include "sdk-cpp/image_class.pb.h" +#include "sdk-cpp/include/common.h" +#include "sdk-cpp/include/predictor_sdk.h" + +#ifndef BCLOUD +using json2pb::JsonToProtoMessage; +#endif + +using baidu::paddle_serving::sdk_cpp::Predictor; +using baidu::paddle_serving::sdk_cpp::PredictorApi; +using baidu::paddle_serving::predictor::format::XImageReqInstance; +using baidu::paddle_serving::predictor::format::DensePrediction; +using baidu::paddle_serving::predictor::image_classification::Request; +using baidu::paddle_serving::predictor::image_classification::Response; + +DEFINE_int32(concurrency, 1, "Set the max concurrent number of requests"); +DEFINE_int32(requests, 100, "Number of requests to send per thread"); +DEFINE_int32(batch_size, 1, "Batch size"); + +std::atomic g_concurrency(0); + +std::vector> g_round_time; +std::vector g_image_data; +std::vector g_image_lengths; + +const std::vector g_image_paths{ + "./data/images/ILSVRC2012_val_00000001.jpeg", + "./data/images/ILSVRC2012_val_00000002.jpeg", + "./data/images/ILSVRC2012_val_00000003.jpeg", + "./data/images/ILSVRC2012_val_00000004.jpeg", + "./data/images/ILSVRC2012_val_00000005.jpeg", + "./data/images/ILSVRC2012_val_00000006.jpeg", + "./data/images/ILSVRC2012_val_00000007.jpeg", + "./data/images/ILSVRC2012_val_00000008.jpeg", + "./data/images/ILSVRC2012_val_00000009.jpeg", + "./data/images/ILSVRC2012_val_00000010.jpeg"}; + +int prepare_data() { + for (auto x : g_image_paths) { + FILE* fp = fopen(x.c_str(), "rb"); + if (!fp) { + LOG(ERROR) << "Failed open image: " << x.c_str(); + continue; + } + + fseek(fp, 0L, SEEK_END); + size_t isize = ftell(fp); + char* ibuf = new (std::nothrow) char[isize]; + if (!ibuf) { + LOG(ERROR) << "Failed malloc image buffer"; + fclose(fp); + return -1; + } + + fseek(fp, 0, SEEK_SET); + fread(ibuf, sizeof(ibuf[0]), isize, fp); + g_image_data.push_back(ibuf); + g_image_lengths.push_back(isize); + + fclose(fp); + } + + return 0; +} + +int create_req(Request& req) { // NOLINT + for (int i = 0; i < FLAGS_batch_size; ++i) { + XImageReqInstance* ins = req.add_instances(); + if (!ins) { + LOG(ERROR) << "Failed create req instance"; + return -1; + } + + int id = i % g_image_data.size(); + ins->set_image_binary(g_image_data[id], g_image_lengths[id]); + ins->set_image_length(g_image_lengths[id]); + } + + return 0; +} + +void extract_res(const Request& req, const Response& res) { + uint32_t sample_size = res.predictions_size(); + std::string err_string; + for (uint32_t si = 0; si < sample_size; ++si) { + DensePrediction json_msg; + std::string json = res.predictions(si).response_json(); + butil::IOBuf buf; + buf.clear(); + buf.append(json); + butil::IOBufAsZeroCopyInputStream wrapper(buf); + if (!JsonToProtoMessage(&wrapper, &json_msg, &err_string)) { + LOG(ERROR) << "Failed parse json from str:" << json; + return; + } + + uint32_t csize = json_msg.categories_size(); + if (csize <= 0) { + LOG(ERROR) << "sample-" << si << "has no" + << "categories props"; + continue; + } + float max_prop = json_msg.categories(0); + uint32_t max_idx = 0; + for (uint32_t ci = 1; ci < csize; ++ci) { + if (json_msg.categories(ci) > max_prop) { + max_prop = json_msg.categories(ci); + max_idx = ci; + } + } + + LOG(INFO) << "instance " << si << "has class " << max_idx; + } // end for +} + +void thread_worker(PredictorApi* api, int 
thread_id) {
+  Request req;
+  Response res;
+
+  api->thrd_initialize();
+
+  for (int i = 0; i < FLAGS_requests; ++i) {
+    api->thrd_clear();
+
+    Predictor* predictor = api->fetch_predictor("ximage");
+    if (!predictor) {
+      LOG(ERROR) << "Failed fetch predictor: ximage";
+      return;
+    }
+
+    req.Clear();
+    res.Clear();
+
+    if (create_req(req) != 0) {
+      return;
+    }
+
+    while (g_concurrency.load() >= FLAGS_concurrency) {
+    }
+    g_concurrency++;
+#if 1
+    LOG(INFO) << "Current concurrency " << g_concurrency.load();
+#endif
+
+    timeval start;
+    timeval end;
+
+    gettimeofday(&start, NULL);
+    if (predictor->inference(&req, &res) != 0) {
+      LOG(ERROR) << "failed call predictor with req:" << req.ShortDebugString();
+      return;
+    }
+    gettimeofday(&end, NULL);
+
+    g_round_time[thread_id].push_back(end.tv_sec * 1000 + end.tv_usec / 1000 -
+                                      start.tv_sec * 1000 -
+                                      start.tv_usec / 1000);
+
+    extract_res(req, res);
+    res.Clear();
+
+    g_concurrency--;
+#if 1
+    LOG(INFO) << "Done. Current concurrency " << g_concurrency.load();
+#endif
+  }  // for (int i = 0; i < FLAGS_requests; ++i)
+
+  api->thrd_finalize();
+}
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+
+  PredictorApi api;
+
+// initialize logger instance
+#ifdef BCLOUD
+  logging::LoggingSettings settings;
+  settings.logging_dest = logging::LOG_TO_FILE;
+
+  std::string filename(argv[0]);
+  filename = filename.substr(filename.find_last_of('/') + 1);
+  settings.log_file = (std::string("./log/") + filename + ".log").c_str();
+  settings.delete_old = logging::DELETE_OLD_LOG_FILE;
+  logging::InitLogging(settings);
+
+  logging::ComlogSinkOptions cso;
+  cso.process_name = filename;
+  cso.enable_wf_device = true;
+  logging::ComlogSink::GetInstance()->Setup(&cso);
+#else
+  struct stat st_buf;
+  int ret = 0;
+  if ((ret = stat("./log", &st_buf)) != 0) {
+    mkdir("./log", 0777);
+    ret = stat("./log", &st_buf);
+    if (ret != 0) {
+      LOG(WARNING) << "Log path ./log not exist, and create fail";
+      return -1;
+    }
+  }
+  FLAGS_log_dir = "./log";
+  google::InitGoogleLogging(strdup(argv[0]));
+#endif
+
+  g_round_time.resize(FLAGS_concurrency);
+
+  if (api.create("./conf", "predictors.prototxt") != 0) {
+    LOG(ERROR) << "Failed create predictors api!";
+    return -1;
+  }
+
+  if (prepare_data() != 0) {
+    LOG(ERROR) << "Prepare data fail";
+    return -1;
+  }
+
+  std::vector<std::thread*> worker_threads;
+  int i = 0;
+  for (; i < FLAGS_concurrency; ++i) {
+    worker_threads.push_back(new std::thread(thread_worker, &api, i));
+  }
+
+  for (i = 0; i < FLAGS_concurrency; ++i) {
+    worker_threads[i]->join();
+    delete worker_threads[i];
+  }
+
+  api.destroy();
+
+  std::vector<int> round_times;
+  for (auto x : g_round_time) {
+    round_times.insert(round_times.end(), x.begin(), x.end());
+  }
+
+  std::sort(round_times.begin(), round_times.end());
+
+  int percent_pos_50 = round_times.size() * 0.5;
+  int percent_pos_80 = round_times.size() * 0.8;
+  int percent_pos_90 = round_times.size() * 0.9;
+  int percent_pos_99 = round_times.size() * 0.99;
+  int percent_pos_999 = round_times.size() * 0.999;
+
+  uint64_t total_ms = 0;
+  for (auto x : round_times) {
+    total_ms += x;
+  }
+
+  LOG(INFO) << "Batch size: " << FLAGS_batch_size;
+  LOG(INFO) << "Total requests: " << round_times.size();
+  LOG(INFO) << "Max concurrency: " << FLAGS_concurrency;
+  LOG(INFO) << "Total ms (absolute time): " << total_ms / FLAGS_concurrency;
+
+  double qps = 0.0;
+  if (total_ms != 0) {
+    qps = (static_cast<double>(FLAGS_concurrency * FLAGS_requests) /
+           (total_ms / FLAGS_concurrency)) *
+          1000;
+  }
+
+  LOG(INFO) << "QPS: " 
<< qps << "/s"; + + LOG(INFO) << "Latency statistics: "; + if (round_times.size() != 0) { + LOG(INFO) << "Average ms: " + << static_cast(total_ms) / round_times.size(); + LOG(INFO) << "50 percent ms: " << round_times[percent_pos_50]; + LOG(INFO) << "80 percent ms: " << round_times[percent_pos_80]; + LOG(INFO) << "90 percent ms: " << round_times[percent_pos_90]; + LOG(INFO) << "99 percent ms: " << round_times[percent_pos_99]; + LOG(INFO) << "99.9 percent ms: " << round_times[percent_pos_999]; + } else { + LOG(INFO) << "N/A"; + } + + return 0; +} + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/demo-serving/CMakeLists.txt b/demo-serving/CMakeLists.txt index f9c6cab63a6c7e1fe04253e8aae6cbd8d8478e06..238fadf8934ca6bc8051e3f2d662c6fbe99170db 100644 --- a/demo-serving/CMakeLists.txt +++ b/demo-serving/CMakeLists.txt @@ -18,9 +18,17 @@ include(proto/CMakeLists.txt) add_executable(serving ${serving_srcs}) add_dependencies(serving pdcodegen fluid_cpu_engine pdserving paddle_fluid opencv_imgcodecs) +if (WITH_GPU) + add_dependencies(serving fluid_gpu_engine) +endif() target_include_directories(serving PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/../predictor ) + +if(WITH_GPU) + target_link_libraries(serving ${CUDA_LIBRARIES} -Wl,--whole-archive fluid_gpu_engine + -Wl,--no-whole-archive) +endif() target_link_libraries(serving opencv_imgcodecs ${opencv_depend_libs} -Wl,--whole-archive fluid_cpu_engine -Wl,--no-whole-archive pdserving paddle_fluid ${paddle_depend_libs} diff --git a/demo-serving/op/classify_op.cpp b/demo-serving/op/classify_op.cpp index a714e7cd820113280f3deabbe0e95922cdee8d9a..22c3393866c002985d307168fb64b4cd88d4974e 100644 --- a/demo-serving/op/classify_op.cpp +++ b/demo-serving/op/classify_op.cpp @@ -35,7 +35,6 @@ int ClassifyOp::inference() { } const TensorVector* in = &reader_out->tensors; - uint32_t sample_size = in->size(); TensorVector* out = butil::get_object(); if (!out) { @@ -43,20 +42,21 @@ int ClassifyOp::inference() { return -1; } - if (sample_size <= 0) { - LOG(INFO) << "No samples need to to predicted"; - return 0; + if (in->size() != 1) { + LOG(ERROR) << "Samples should have been packed into a single tensor"; + return -1; } + int batch_size = in->at(0).shape[0]; // call paddle fluid model for inferencing if (InferManager::instance().infer( - IMAGE_CLASSIFICATION_MODEL_NAME, in, out, sample_size)) { + IMAGE_CLASSIFICATION_MODEL_NAME, in, out, batch_size)) { LOG(ERROR) << "Failed do infer in fluid model: " << IMAGE_CLASSIFICATION_MODEL_NAME; return -1; } - if (out->size() != sample_size) { + if (out->size() != in->size()) { LOG(ERROR) << "Output size not eq input size: " << in->size() << out->size(); return -1; @@ -64,24 +64,35 @@ int ClassifyOp::inference() { // copy output tensor into response ClassifyResponse* res = mutable_data(); + const paddle::PaddleTensor& out_tensor = (*out)[0]; + +#if 0 + int out_shape_size = out_tensor.shape.size(); + LOG(ERROR) << "out_tensor.shpae"; + for (int i = 0; i < out_shape_size; ++i) { + LOG(ERROR) << out_tensor.shape[i] << ":"; + } + + if (out_shape_size != 2) { + return -1; + } +#endif + + int sample_size = out_tensor.shape[0]; +#if 0 + LOG(ERROR) << "Output sample size " << sample_size; +#endif for (uint32_t si = 0; si < sample_size; si++) { - const paddle::PaddleTensor& out_tensor = (*out)[si]; DensePrediction* ins = res->add_predictions(); if (!ins) { LOG(ERROR) << "Failed append new out tensor"; return -1; } - uint32_t shape_size = out_tensor.shape.size(); - if (out_tensor.shape.size() != 2 || out_tensor.shape[0] != 1) { - 
LOG(ERROR) << "Not valid classification out shape"
-                 << ", shape size: " << out_tensor.shape.size();
-      return -1;
-    }
-    // assign output data
-    uint32_t data_size = out_tensor.data.length() / sizeof(float);
-    float* data = reinterpret_cast<float*>(out_tensor.data.data());
+    uint32_t data_size = out_tensor.shape[1];
+    float* data = reinterpret_cast<float*>(out_tensor.data.data() +
+                                           si * sizeof(float) * data_size);
     for (uint32_t di = 0; di < data_size; ++di) {
       ins->add_categories(data[di]);
     }
@@ -95,10 +106,6 @@ int ClassifyOp::inference() {
   out->clear();
   butil::return_object<TensorVector>(out);
 
-  LOG(INFO) << "Response in image classification:"
-            << "length:" << res->ByteSize() << ","
-            << "data:" << res->ShortDebugString();
-
   return 0;
 }
 
diff --git a/demo-serving/op/reader_op.cpp b/demo-serving/op/reader_op.cpp
index 8c97702de33215f9b33e201c0b02de6a2a8d7d08..d50c9ebd24be48c32d59aa641bace52cf556a337 100644
--- a/demo-serving/op/reader_op.cpp
+++ b/demo-serving/op/reader_op.cpp
@@ -51,6 +51,26 @@ int ReaderOp::inference() {
   resize.height = iresize[0];
   resize.width = iresize[1];
 
+  paddle::PaddleTensor in_tensor;
+  in_tensor.name = "tensor";
+  in_tensor.dtype = paddle::FLOAT32;
+  // shape assignment
+  in_tensor.shape.push_back(sample_size);  // batch_size
+  in_tensor.shape.push_back(3);
+  in_tensor.shape.push_back(resize.width);
+  in_tensor.shape.push_back(resize.height);
+
+  // tls resource assignment
+  size_t dense_capacity = 3 * resize.width * resize.height;
+  size_t len = dense_capacity * sizeof(float) * sample_size;
+  float* data =
+      reinterpret_cast<float*>(MempoolWrapper::instance().malloc(len));
+  if (data == NULL) {
+    LOG(ERROR) << "Failed create temp float array, "
+               << "size=" << dense_capacity * sample_size * sizeof(float);
+    return -1;
+  }
+
   for (uint32_t si = 0; si < sample_size; si++) {
     // parse image object from x-image
     const XImageReqInstance& ins = req->instances(si);
@@ -103,50 +123,31 @@ int ReaderOp::inference() {
     const int H = _image_8u_rgb.rows;
     const int W = _image_8u_rgb.cols;
     const int C = _image_8u_rgb.channels();
-    size_t dense_capacity = H * W * C;
-
-    paddle::PaddleTensor in_tensor;
-    in_tensor.name = "tensor";
-    in_tensor.dtype = paddle::FLOAT32;
-
-    // shape assignment
-    in_tensor.shape.push_back(1);  // batch_size
-
-    // accoreding to training stage, the instance shape should be
-    // in order of C-W-H. 
-    in_tensor.shape.push_back(C);
-    in_tensor.shape.push_back(W);
-    in_tensor.shape.push_back(H);
+    if (H != resize.height || W != resize.width || C != 3) {
+      LOG(ERROR) << "Image " << si << " has incompatible size";
+      return -1;
+    }
 
     LOG(INFO) << "Succ read one image, C: " << C << ", W: " << W << ", H: " << H;
 
-    // tls resource assignment
-    size_t len = dense_capacity * sizeof(float);
-    float* data =
-        reinterpret_cast<float*>(MempoolWrapper::instance().malloc(len));
-    if (data == NULL) {
-      LOG(ERROR) << "Failed create temp float array, "
-                 << "size=" << dense_capacity;
-      return -1;
-    }
-
+    float* data_ptr = data + dense_capacity * si;
     for (int h = 0; h < H; h++) {
       // p points to a new line
       unsigned char* p = _image_8u_rgb.ptr(h);
       for (int w = 0; w < W; w++) {
         for (int c = 0; c < C; c++) {
           // HWC(row,column,channel) -> CWH
-          data[W * H * c + W * h + w] = (p[C * w + c] - pmean[c]) * scale[c];
+          data_ptr[W * H * c + W * h + w] =
+              (p[C * w + c] - pmean[c]) * scale[c];
         }
       }
     }
-
-    paddle::PaddleBuf pbuf(data, len);
-    in_tensor.data = pbuf;
-
-    in->push_back(in_tensor);
   }
+  paddle::PaddleBuf pbuf(data, len);
+  in_tensor.data = pbuf;
+
+  in->push_back(in_tensor);
 
   return 0;
 }
 
diff --git a/demo-serving/op/write_json_op.cpp b/demo-serving/op/write_json_op.cpp
index 25aece0e4533ab5a535e86d2c4cbff259754436c..e0b372d7c8fce86ced556ea397c9dcbb05111f66 100644
--- a/demo-serving/op/write_json_op.cpp
+++ b/demo-serving/op/write_json_op.cpp
@@ -16,7 +16,7 @@ #include
 #ifdef BCLOUD
-#include "pb_to_json.h"
+#include "pb_to_json.h"  // NOLINT
 #else
 #include "json2pb/pb_to_json.h"
 #endif
@@ -70,7 +70,7 @@ int WriteJsonOp::inference() {
     }
   }
 
-  LOG(INFO) << "Succ write json:" << classify_out->ShortDebugString();
+  LOG(INFO) << "Succ write json";
 
   return 0;
 }
 
diff --git a/doc/DESIGN.md b/doc/DESIGN.md
index ce89854006ac389d954cb2bda89c54e6f1a2e6d2..12dce781a2ba19b678ea488a1b45841d1b611cc2 100644
--- a/doc/DESIGN.md
+++ b/doc/DESIGN.md
@@ -26,7 +26,7 @@ PaddlePaddle是公司开源的机器学习框架,广泛支持各种深度学
 ## 3. 
Paddle Serving总体框架 -![Paddle-Serging总体框图](https://paddle-serving.bj.bcebos.com/doc/framework.png) +![Paddle-Serging总体框图](framework.png) **模型管理框架**:对接多种机器学习平台的模型文件,向上提供统一的inference接口 **业务调度框架**:对各种不同预测模型的计算逻辑进行抽象,提供通用的DAG调度框架,通过DAG图串联不同的算子,共同完成一次预测服务。该抽象模型使用户可以方便的实现自己的计算逻辑,同时便于算子共用。(用户搭建自己的预测服务,很大一部分工作是搭建DAG和提供算子的实现) @@ -62,31 +62,31 @@ class FluidFamilyCore { 参考TF框架的模型计算的抽象思想,将业务逻辑抽象成DAG图,由配置驱动,生成workflow,跳过C++代码编译。业务的每个具体步骤,对应一个具体的OP,OP可配置自己依赖的上游OP。OP之间消息传递统一由线程级Bus和channel机制实现。例如,一个简单的预测服务的服务过程,可以抽象成读请求数据->调用预测接口->写回预测结果等3个步骤,相应的实现到3个OP: ReaderOp->ClassifyOp->WriteOp -![预测服务Service](https://paddle-serving.bj.bcebos.com/doc/predict-service.png) +![预测服务Service](predict-service.png) 关于OP之间的依赖关系,以及通过OP组建workflow,可以参考[从零开始写一个预测服务](CREATING.md)的相关章节 服务端实例透视图 -![服务端实例透视图](https://paddle-serving.bj.bcebos.com/doc/server-side.png) +![服务端实例透视图](server-side.png) #### 3.2.2 Paddle Serving的多服务机制 -![Paddle Serving的多服务机制](https://paddle-serving.bj.bcebos.com/doc/multi-service.png) +![Paddle Serving的多服务机制](multi-service.png) -Paddle Serving实例可以同时加载多个模型,每个模型用一个Service(以及其所配置的workflow)承接服务。可以参考[Demo例子中的service配置文件](../serving/conf/service.prototxt)了解如何为serving实例配置多个service +Paddle Serving实例可以同时加载多个模型,每个模型用一个Service(以及其所配置的workflow)承接服务。可以参考[Demo例子中的service配置文件](../demo-serving/conf/service.prototxt)了解如何为serving实例配置多个service #### 3.2.3 业务调度层级关系 从客户端看,一个Paddle Serving service从顶向下可分为Service, Endpoint, Variant等3个层级 -![调用层级关系](https://paddle-serving.bj.bcebos.com/doc/multi-variants.png) +![调用层级关系](multi-variants.png) 一个Service对应一个预测模型,模型下有1个endpoint。模型的不同版本,通过endpoint下多个variant概念实现: 同一个模型预测服务,可以配置多个variant,每个variant有自己的下游IP列表。客户端代码可以对各个variant配置相对权重,以达到调节流量比例的关系(参考[客户端配置](CLIENT_CONFIGURE.md)第3.2节中关于variant_weight_list的说明)。 -![Client端proxy功能](https://paddle-serving.bj.bcebos.com/doc/client-side-proxy.png) +![Client端proxy功能](client-side-proxy.png) ## 4. 用户接口 diff --git a/doc/FAQ.md b/doc/FAQ.md index a48e5c4dbc6562a8035d4ea0731b06183e2e8f3c..2ba9ec9d5e0a5d7c8f0ccc3ebfc480f21170751d 100644 --- a/doc/FAQ.md +++ b/doc/FAQ.md @@ -5,3 +5,22 @@ - 如果在inferservice_file里指定了port:xxx,那么就去申请该端口号; - 否则,如果在gflags.conf里指定了--port:xxx,那就去申请该端口号; - 否则,使用程序里指定的默认端口号:8010。 + +## 2. GPU预测中为何请求的响应时间波动会非常大? +PaddleServing依托PaddlePaddle预测库执行预测计算;在GPU设备上,由于同一个进程内目前共用1个GPU stream,进程内的多个请求的预测计算会被严格串行。所以如果有2个请求同时到达某个Serving实例,不管该实例启动时创建了多少个worker线程,都不能起到加速作用,后到的请求会被排队,直到前面请求计算完成。 + +## 3. 如何充分利用GPU卡的计算能力? +如问题2所说,由于预测库的限制,单个Serving进程只能绑定单张GPU卡,且进程内共用1个GPU stream,所有请求必须串行计算。 + +为提高GPU卡使用率,目前可以想到的方法是:在单张GPU卡上启动多个Serving进程,每个进程绑定一个GPU stream,多个stream并行计算。这种方法是否能起到加速作用,受限于多个因素,主要有: + +1. 单个stream占用GPU算力;假如单个stream已经将GPU算力占用超过50%,那么增加stream很可能会导致2个stream的job分别排队,拖慢各自的响应时间 +2. GPU显存:Serving进程需要将模型参数加载到显存中,并且计算时要在GPU显存池分配临时变量;假如单个Serving进程已经用掉超过50%的显存,则增加Serving进程会造成显存不足,导致进程报错退出 + +为此,可采用如下步骤,进行测试: + +1. 加载模型时,在model_toolkit.prototxt中,model type选择FLUID_GPU_ANALYSIS或FLUID_GPU_ANALYSIS_DIR;会对模型进行静态分析,进行一定程度显存优化 +2. 在步骤1完成后,启动单个Serving进程,启动参数:`--gpuid=N --bthread_concurrency=4 --bthread_min_concurrency=4`;启动一个client,进行并发度为1的压力测试,batch size从小到大,记下平响;由于算力的限制,当batch size增大到一定程度,应该会出现响应时间明显变大;或虽然没有明显变大,但已经不满足系统需求 +3. 再启动1个Serving进程,与步骤2启动时使用相同的参数略有不同: `--gpuid=N --bthread_concurrency=4 --bthread_min_concurrency=4 --port=8011` 其中--port=8011用来让新启动的进程使用一个新的服务端口;然后同时对这2个Serving进程进行压测,继续观察batch size从小到大时平均响应时间的变化,直到取得batch size和响应时间的折中 +4. 重复步骤2-3 +5. 
以2-4步的测试,来决定:单张GPU卡可以由多少个Serving进程共用; 实际部署时,就在一张GPU卡上启动这么多个Serving进程同时提供服务 diff --git a/doc/INDEX.md b/doc/INDEX.md index c4e78be00435e3b936b9fa08785098028e53de17..c9399b2f5cf50cda10062023f009b92bfc0c8158 100644 --- a/doc/INDEX.md +++ b/doc/INDEX.md @@ -1,16 +1,19 @@ +[Design](DESIGN.md) -[Client Configure](CLIENT_CONFIGURE.md) +[Installation](INSTALL.md) -[How to Configure a Clustered Service](CLUSTERING.md) +[Getting Started](GETTING_STARTED.md) [Creating a Prediction Service](CREATING.md) -[Design](DESIGN.md) +[Client Configure](CLIENT_CONFIGURE.md) -[FAQ](FAQ.md) +[Server Side Configuration](SERVING_CONFIGURE.md) -[Getting Started](GETTING_STARTED.md) +[How to Configure a Clustered Service](CLUSTERING.md) -[Installation](INSTALL.md) +[Multiple Serving Instances over Single GPU Card](MULTI_SERVING_OVER_SINGLE_GPU_CARD.md) -[Server Side Configuration](SERVING_CONFIGURE.md) +[Benchmarking](BENCHMARKING.md) + +[FAQ](FAQ.md) diff --git a/doc/INSTALL.md b/doc/INSTALL.md index 64e482d6438f2d6e5da8ea6ce98e0933b18a0f7b..d3114e86efbf2cd5985811d4d39ec7e8069e3534 100644 --- a/doc/INSTALL.md +++ b/doc/INSTALL.md @@ -58,10 +58,9 @@ $ make install # CMake编译选项说明 -因Paddle Serving依托于PaddlePaddle项目进行构建,以下编译选项其实是传递给PaddlePaddle的编译选项: - | 编译选项 | 说明 | |----------|------| -| WITH_AVX | Compile PaddlePaddle with AVX intrinsics | -| WITH_MKL | Compile PaddlePaddle with MKLML library | +| WITH_AVX | For configuring PaddlePaddle. Compile PaddlePaddle with AVX intrinsics | +| WITH_MKL | For configuring PaddlePaddle. Compile PaddlePaddle with MKLML library | +| WITH_GPU | For configuring PaddlePaddle. Compile PaddlePaddle with NVIDIA GPU | | CLINET_ONLY | Compile client libraries and demos only | diff --git a/doc/MULTI_SERVING_OVER_SINGLE_GPU_CARD.md b/doc/MULTI_SERVING_OVER_SINGLE_GPU_CARD.md new file mode 100644 index 0000000000000000000000000000000000000000..1bbcaf16a38d551998402c0f52df581dc9dc9866 --- /dev/null +++ b/doc/MULTI_SERVING_OVER_SINGLE_GPU_CARD.md @@ -0,0 +1,32 @@ +# Multiple Serving Instances over Single GPU Card + +Paddle Serving依托PaddlePaddle预测库执行实际的预测计算。由于当前GPU预测库的限制,单个Serving实例只可以绑定1张GPU卡,且进程内所有worker线程共用1个GPU stream。也就是说,不管Serving启动多少个worker线程,所有的请求在GPU是严格串行计算的,起不到加速作用。这会带来一个问题,就是如果模型计算量不大,那么Serving进程实际上不会用满GPU的算力。 + +为了充分利用GPU卡的算力,考虑在单张卡上启动多个Serving实例,通过多个GPU stream,力争用满GPU的算力。启动命令可以如下所示: + +``` +bin/serving --gpuid=0 --bthread_concurrency=4 --bthread_min_concurrency=4 --port=8010& +bin/serving --gpuid=0 --bthread_concurrency=4 --bthread_min_concurrency=4 --port=8011& +``` + +上述2条命令,启动2个Serving实例,分别监听8010端口和8011端口。但他们都绑定同一张卡 (gpuid = 0)。 + +命令行参数含义: +``` +-gpuid=N:用于指定所绑定的GPU卡ID +-bthread_concurrency和bthread_min_concurrency共同限制该进程启动的worker数:由于在GPU预测模式下,增加worker线程数并不能提高并发能力,为了节省部分资源,干脆将他们限制掉;均设为4,是因为这是bthread允许的最小值。 +-port xxx:Serving实例监听的端口 +``` + +但是,上述方式究竟是否能在不影响响应时间等其他指标的前提下,起到提高GPU使用率作用,受到多个限制因素的制约,具体的: + +1. 单个stream占用GPU算力;假如单个stream已经将GPU算力占用超过50%,那么增加stream很可能会导致2个stream的job分别排队,拖慢各自的响应时间 +2. GPU显存:Serving进程需要将模型参数加载到显存中,并且计算时要在GPU显存池分配临时变量;假如单个Serving进程已经用掉超过50%的显存,则增加Serving进程会造成显存不足,导致进程报错退出 + +为此,可采用如下步骤,进行测试: + +1. 加载模型时,在model_toolkit.prototxt中,model type选择FLUID_GPU_ANALYSIS或FLUID_GPU_ANALYSIS_DIR;会对模型进行静态分析,进行一定程度显存优化 +2. 在步骤1完成后,启动单个Serving进程,启动参数:`--gpuid=N --bthread_concurrency=4 --bthread_min_concurrency=4`;启动一个client,进行并发度为1的压力测试,batch size从小到大,记下平响;由于算力的限制,当batch size增大到一定程度,应该会出现响应时间明显变大;或虽然没有明显变大,但已经不满足系统需求 +3. 
再启动1个Serving进程,与步骤2启动时使用相同的参数略有不同: `--gpuid=N --bthread_concurrency=4 --bthread_min_concurrency=4 --port=8011` 其中--port=8011用来让新启动的进程使用一个新的服务端口;然后同时对这2个Serving进程进行压测,继续观察batch size从小到大时平均响应时间的变化,直到取得batch size和响应时间的折中 +4. 重复步骤2-3 +5. 以2-4步的测试,来决定:单张GPU卡可以由多少个Serving进程共用; 实际部署时,就在一张GPU卡上启动这么多个Serving进程同时提供服务 diff --git a/doc/SERVING_CONFIGURE.md b/doc/SERVING_CONFIGURE.md index 9cb4a149294b66e2ef977d8529f54cea74a81c23..f5887f5cd139b1e70b49f4eee2e2552658692103 100644 --- a/doc/SERVING_CONFIGURE.md +++ b/doc/SERVING_CONFIGURE.md @@ -142,6 +142,11 @@ type: 预测引擎的类型。可在inferencer-fluid-cpu/src/fluid_cpu_engine.cp |FLUID_CPU_ANALYSIS_DIR|使用fluid Analysis API;模型所有参数分开保存为独立的文件,整个模型放到一个目录中| |FLUID_CPU_NATIVE|使用fluid Native API;模型所有参数保存在一个文件| |FLUID_CPU_NATIVE_DIR|使用fluid Native API;模型所有参数分开保存为独立的文件,整个模型放到一个目录中| +|FLUID_GPU_ANALYSIS|GPU预测,使用fluid Analysis API;模型所有参数保存在一个文件| +|FLUID_GPU_ANALYSIS_DIR|GPU预测,使用fluid Analysis API;模型所有参数分开保存为独立的文件,整个模型放到一个目录中| +|FLUID_GPU_NATIVE|GPU预测,使用fluid Native API;模型所有参数保存在一个文件| +|FLUID_GPU_NATIVE_DIR|GPU预测,使用fluid Native API;模型所有参数分开保存为独立的文件,整个模型放到一个目录中| + **fluid Analysis API和fluid Native API的区别** @@ -183,9 +188,10 @@ enable_batch_align: |enable_protocol_list|baidu_std|brpc 通信协议列表| |log_dir|./log|log dir| |num_threads||brpc server使用的系统线程数,默认为CPU核数| -|max_concurrency||并发处理的请求数,设为<=0则为不予限制,若大于0则限定brpc server端同时处理的请求数上限| |port|8010|Serving进程接收请求监听端口| |gpuid|0|GPU预测时指定Serving进程使用的GPU device id。只允许绑定1张GPU卡| +|bthread_concurrency|9|BRPC底层bthread的concurrency。在使用GPU预测引擎时,为了限制并发worker数,可使用此参数| +|bthread_min_concurrency|4|BRPC底层bthread的min concurrency。在使用GPU预测引擎时,为限制并发worker数,可使用此参数。与bthread_concurrency结合使用| 可以通过在serving/conf/gflags.conf覆盖默认值,例如 ``` diff --git a/doc/client-side-proxy.png b/doc/client-side-proxy.png new file mode 100755 index 0000000000000000000000000000000000000000..1e7639ac401955d9b7c2761820f3c3cdc7fbf8fd Binary files /dev/null and b/doc/client-side-proxy.png differ diff --git a/doc/framework.png b/doc/framework.png new file mode 100755 index 0000000000000000000000000000000000000000..676d35bed06893d0f6247561756c4595f48f1698 Binary files /dev/null and b/doc/framework.png differ diff --git a/doc/multi-service.png b/doc/multi-service.png new file mode 100755 index 0000000000000000000000000000000000000000..629024e58f58299d16fb133601c09e673746d560 Binary files /dev/null and b/doc/multi-service.png differ diff --git a/doc/multi-variants.png b/doc/multi-variants.png new file mode 100755 index 0000000000000000000000000000000000000000..c3d141b14712b4853629f9119d60347a20779268 Binary files /dev/null and b/doc/multi-variants.png differ diff --git a/doc/predict-service.png b/doc/predict-service.png new file mode 100755 index 0000000000000000000000000000000000000000..ccd92e4bb1b5c58787118b564cc6a776d648be01 Binary files /dev/null and b/doc/predict-service.png differ diff --git a/doc/server-side.png b/doc/server-side.png new file mode 100755 index 0000000000000000000000000000000000000000..7a96996c2a4d14832c9c2177a09e78181b1a551c Binary files /dev/null and b/doc/server-side.png differ diff --git a/inferencer-fluid-cpu/include/fluid_cpu_engine.h b/inferencer-fluid-cpu/include/fluid_cpu_engine.h index f01f7fce418278ea45eee4cc3558a1da0b0dd094..24109ef0226a510d48e0cade4d9bc6039d7d5754 100644 --- a/inferencer-fluid-cpu/include/fluid_cpu_engine.h +++ b/inferencer-fluid-cpu/include/fluid_cpu_engine.h @@ -22,7 +22,11 @@ #include "configure/include/configure_parser.h" #include "configure/inferencer_configure.pb.h" #ifdef BCLOUD +#ifdef WITH_GPU +#include 
"paddle/paddle_inference_api.h" +#else #include "paddle/fluid/inference/api/paddle_inference_api.h" +#endif #else #include "paddle/fluid/inference/paddle_inference_api.h" #endif @@ -155,6 +159,8 @@ class FluidCpuNativeCore : public FluidFamilyCore { native_config.prog_file = data_path + "/__model__"; native_config.use_gpu = false; native_config.device = 0; + native_config.fraction_of_gpu_memory = 0; + AutoLock lock(GlobalPaddleCreateMutex::instance()); _core = paddle::CreatePaddlePredictor( @@ -209,6 +215,7 @@ class FluidCpuNativeDirCore : public FluidFamilyCore { native_config.model_dir = data_path; native_config.use_gpu = false; native_config.device = 0; + native_config.fraction_of_gpu_memory = 0; AutoLock lock(GlobalPaddleCreateMutex::instance()); _core = paddle::CreatePaddlePredictor( @@ -458,6 +465,7 @@ class FluidCpuNativeDirWithSigmoidCore : public FluidCpuWithSigmoidCore { native_config.model_dir = data_path; native_config.use_gpu = false; native_config.device = 0; + native_config.fraction_of_gpu_memory = 0; AutoLock lock(GlobalPaddleCreateMutex::instance()); _core->_fluid_core = paddle::CreatePaddlePredictor(analysis_config); @@ -192,6 +193,7 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore { analysis_config.SwitchSpecifyInputNames(true); analysis_config.SetCpuMathLibraryNumThreads(1); analysis_config.EnableMemoryOptim(false, false); + AutoLock lock(GlobalPaddleCreateMutex::instance()); _core = paddle::CreatePaddlePredictor(analysis_config); @@ -214,7 +216,6 @@ class FluidGpuNativeDirCore : public FluidFamilyCore { return -1; } - paddle::NativeConfig native_config; native_config.model_dir = data_path; native_config.use_gpu = true; diff --git a/predictor/common/inner_common.h b/predictor/common/inner_common.h index a10ce38ea039b734b62dcef5981d46293057f94d..9f9b3b2f3607578348dcca9253d6451f8fa205d6 100644 --- a/predictor/common/inner_common.h +++ b/predictor/common/inner_common.h @@ -32,24 +32,26 @@ #include "gflags/gflags.h" #ifdef BCLOUD -#include "bthread.h" #include "baidu/rpc/channel.h" #include "baidu/rpc/policy/giano_authenticator.h" #include "baidu/rpc/server.h" -#include "base/logging.h" #include "base/comlog_sink.h" +#include "base/logging.h" #include "base/object_pool.h" #include "base/time.h" +#include "bthread.h" // NOLINT #else -#include "bthread/bthread.h" #include "brpc/channel.h" #include "brpc/policy/giano_authenticator.h" #include "brpc/server.h" +#include "bthread/bthread.h" #include "butil/logging.h" #include "butil/object_pool.h" #include "butil/time.h" #endif +#include "glog/raw_logging.h" + #include "configure/include/configure_parser.h" #include "configure/server_configure.pb.h" diff --git a/predictor/framework/op_repository.h b/predictor/framework/op_repository.h index dca8f129c55fac39e24bb2c03a400d49d727c809..045912945232f9a36f40a941bc77041fa5ca08e2 100644 --- a/predictor/framework/op_repository.h +++ b/predictor/framework/op_repository.h @@ -62,7 +62,7 @@ class OpRepository { template void regist_op(std::string op_type) { _repository[op_type] = &OpFactory::instance(); - LOG(INFO) << "Succ regist op: " << op_type << "!"; + RAW_LOG_INFO("Succ regist op: %s", op_type.c_str()); } Op* get_op(std::string op_type); diff --git a/predictor/framework/service_manager.h b/predictor/framework/service_manager.h index e456c5cdcd0eb93d91a33efa93db0f71cd92bcc9..1b339c3742ef7302d5ce82704dd70d0ad6f84e7b 100644 --- a/predictor/framework/service_manager.h +++ b/predictor/framework/service_manager.h @@ -27,13 +27,13 @@ namespace predictor { 
::baidu::paddle_serving::predictor::FormatServiceManager::instance() \ .regist_service(svr_name, svr); \ if (ret != 0) { \ - LOG(ERROR) << "Failed regist service[" << svr_name << "]" \ - << "[" << typeid(svr).name() << "]" \ - << "!"; \ + RAW_LOG_ERROR("Failed regist service[%s][%s]", \ + svr_name.c_str(), \ + typeid(svr).name()); \ } else { \ - LOG(INFO) << "Success regist service[" << svr_name << "][" \ - << typeid(svr).name() << "]" \ - << "!"; \ + RAW_LOG_INFO("Success regist service[%s][%s]", \ + svr_name.c_str(), \ + typeid(svr).name()); \ } \ } while (0) @@ -43,29 +43,30 @@ class FormatServiceManager { int regist_service(const std::string& svr_name, Service* svr) { if (_service_map.find(svr_name) != _service_map.end()) { - LOG(ERROR) << "Service[" << svr_name << "][" << typeid(svr).name() << "]" - << " already exist!"; + RAW_LOG_ERROR("Service[%s][%s] already exist!", + svr_name.c_str(), + typeid(svr).name()); return -1; } std::pair::iterator, bool> ret; ret = _service_map.insert(std::make_pair(svr_name, svr)); if (ret.second == false) { - LOG(ERROR) << "Service[" << svr_name << "][" << typeid(svr).name() << "]" - << " insert failed!"; + RAW_LOG_ERROR("Service[%s][%s] insert failed!", + svr_name.c_str(), + typeid(svr).name()); return -1; } - LOG(INFO) << "Service[" << svr_name << "] insert successfully!"; + RAW_LOG_INFO("Service[%s] insert successfully!", svr_name.c_str()); return 0; } Service* get_service(const std::string& svr_name) { boost::unordered_map::iterator res; if ((res = _service_map.find(svr_name)) == _service_map.end()) { - LOG(WARNING) << "Service[" << svr_name << "] " - << "not found in service manager" - << "!"; + RAW_LOG_WARNING("Service[%s] not found in service manager!", + svr_name.c_str()); return NULL; } return (*res).second; diff --git a/predictor/src/pdserving.cpp b/predictor/src/pdserving.cpp index 0897039c79e4c576fd39cdea4bc21934a1ceed9c..be7f988744b6ef0530c8b725cb3d6275725831ec 100644 --- a/predictor/src/pdserving.cpp +++ b/predictor/src/pdserving.cpp @@ -51,6 +51,8 @@ using baidu::paddle_serving::predictor::FLAGS_port; using baidu::paddle_serving::configure::InferServiceConf; using baidu::paddle_serving::configure::read_proto_conf; +DECLARE_bool(logtostderr); + void print_revision(std::ostream& os, void*) { #if defined(PDSERVING_VERSION) os << PDSERVING_VERSION; @@ -70,12 +72,13 @@ DEFINE_bool(g, false, "user defined gflag path"); DECLARE_string(flagfile); namespace bthread { - extern pthread_mutex_t g_task_control_mutex; +extern pthread_mutex_t g_task_control_mutex; } pthread_mutex_t g_worker_start_fn_mutex = PTHREAD_MUTEX_INITIALIZER; void pthread_worker_start_fn() { - while (pthread_mutex_lock(&g_worker_start_fn_mutex) != 0) {} + while (pthread_mutex_lock(&g_worker_start_fn_mutex) != 0) { + } // Try to avoid deadlock in bthread int lock_status = pthread_mutex_trylock(&bthread::g_task_control_mutex); @@ -86,7 +89,8 @@ void pthread_worker_start_fn() { // Try to avoid deadlock in bthread if (lock_status == EBUSY || lock_status == EAGAIN) { - while (pthread_mutex_lock(&bthread::g_task_control_mutex) != 0) {} + while (pthread_mutex_lock(&bthread::g_task_control_mutex) != 0) { + } } pthread_mutex_unlock(&g_worker_start_fn_mutex); @@ -132,7 +136,7 @@ int main(int argc, char** argv) { g_change_server_port(); - // initialize logger instance +// initialize logger instance #ifdef BCLOUD logging::LoggingSettings settings; settings.logging_dest = logging::LOG_TO_FILE; @@ -204,6 +208,8 @@ int main(int argc, char** argv) { } LOG(INFO) << "Succ call pthread worker 
start function"; + FLAGS_logtostderr = false; + if (ServerManager::instance().start_and_wait() != 0) { LOG(ERROR) << "Failed start server and wait!"; return -1; diff --git a/release.bcloud b/release.bcloud index 3f76ab210331d9118393eb555fad9d1c0d61f487..85454b5da574a5023a30048e7aa3bdb11d352f16 100644 --- a/release.bcloud +++ b/release.bcloud @@ -9,6 +9,7 @@ mv bin/sparse_format demo/client/bin mv bin/text_classification demo/client/bin mv bin/text_classification_press demo/client/bin mv bin/ximage demo/client/bin +mv bin/ximage_press demo/client/bin cp baidu_third-party_mklml/so/* demo/serving/bin/ rm -rf baidu_third-party_mklml
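
The reader_op.cpp change above packs every image of a request into a single FLOAT32 tensor of shape [batch, 3, W, H], writing sample si at offset dense_capacity * si, instead of emitting one [1, C, W, H] tensor per image. Below is a minimal standalone sketch of that packing scheme, with a plain std::vector<float> standing in for the MempoolWrapper buffer and a raw RGB pointer standing in for the decoded OpenCV image; the helper name is illustrative only.

```cpp
#include <cstddef>
#include <vector>

// Pack N images (8-bit RGB, HWC layout) into one float buffer laid out as
// [N][C][W][H], mirroring the index  W * H * c + W * h + w  used in reader_op.cpp.
std::vector<float> pack_batch(const std::vector<const unsigned char*>& images,
                              int W, int H, int C,
                              const float* pmean, const float* scale) {
  const std::size_t dense_capacity = static_cast<std::size_t>(C) * W * H;
  std::vector<float> data(dense_capacity * images.size());
  for (std::size_t si = 0; si < images.size(); ++si) {
    float* data_ptr = data.data() + dense_capacity * si;  // per-sample offset
    const unsigned char* img = images[si];
    for (int h = 0; h < H; ++h) {
      const unsigned char* p = img + static_cast<std::size_t>(h) * W * C;  // one row
      for (int w = 0; w < W; ++w) {
        for (int c = 0; c < C; ++c) {
          // HWC (row, column, channel) -> CWH, normalized as in the op
          data_ptr[W * H * c + W * h + w] = (p[C * w + c] - pmean[c]) * scale[c];
        }
      }
    }
  }
  return data;  // interpreted as shape [images.size(), C, W, H]
}
```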
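On the output side, classify_op.cpp now assumes the model returns a single tensor of shape [batch_size, num_classes] and slices it per sample at offset si * num_classes. The sketch below combines that slicing with the arg-max the demo client's extract_res() applies to each sample; the helper is illustrative and not part of the serving API.

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// Slice a [batch_size, num_classes] float blob per sample and take the arg-max,
// mirroring classify_op.cpp (per-sample pointer offset) and the client's extract_res().
std::vector<std::pair<uint32_t, float>> top1_per_sample(const float* blob,
                                                        uint32_t batch_size,
                                                        uint32_t num_classes) {
  std::vector<std::pair<uint32_t, float>> result;
  if (num_classes == 0) return result;
  for (uint32_t si = 0; si < batch_size; ++si) {
    const float* sample = blob + si * num_classes;  // per-sample slice
    uint32_t max_idx = 0;
    float max_prop = sample[0];
    for (uint32_t ci = 1; ci < num_classes; ++ci) {
      if (sample[ci] > max_prop) {
        max_prop = sample[ci];
        max_idx = ci;
      }
    }
    result.emplace_back(max_idx, max_prop);
  }
  return result;
}
```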
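ximage_press.cpp reports QPS and latency percentiles by merging the per-thread latency vectors, sorting them, and approximating per-worker wall time as total_ms / concurrency. A condensed sketch of that bookkeeping, assuming latencies are collected in milliseconds as in the client:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct BenchStats {
  double avg_ms = 0, qps = 0;
  int p50 = 0, p90 = 0, p99 = 0;
};

// round_time[t] holds the per-request latencies (ms) recorded by worker thread t.
BenchStats summarize(const std::vector<std::vector<int>>& round_time, int concurrency) {
  std::vector<int> all;
  for (const auto& t : round_time) all.insert(all.end(), t.begin(), t.end());
  std::sort(all.begin(), all.end());

  uint64_t total_ms = 0;
  for (int x : all) total_ms += x;

  BenchStats s;
  if (all.empty() || concurrency <= 0) return s;

  s.avg_ms = static_cast<double>(total_ms) / all.size();
  s.p50 = all[static_cast<std::size_t>(all.size() * 0.5)];
  s.p90 = all[static_cast<std::size_t>(all.size() * 0.9)];
  s.p99 = all[static_cast<std::size_t>(all.size() * 0.99)];

  // total_ms / concurrency approximates the wall time spent by one worker.
  uint64_t per_worker_ms = total_ms / concurrency;
  if (per_worker_ms > 0) {
    s.qps = static_cast<double>(all.size()) / per_worker_ms * 1000;
  }
  return s;
}
```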