Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into unsqueeze_op

b8ea7a08 · chenweihang · fbef49e7 · f9202447 · b8ea7a08 · b8ea7a08
18 changed file
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -45,6 +45,10 @@ endfunction(inference_api_test)
 cc_library(paddle_inference_api
    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+if(NOT APPLE)
+  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_api.sym")
+  set_target_properties(paddle_inference_api PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+endif()
 # Here the shared library doesn't depend on other fluid libraries, or double free will occur.
 cc_library(paddle_inference_api_shared SHARED
@@ -53,8 +57,19 @@ add_dependencies(paddle_inference_api_shared ${FLUID_CORE_MODULES} ${GLOB_OP_LIB
 set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api)
 if(NOT APPLE)
-  set(LINK_FLAGS "-fPIC -fvisibility=hidden")
+  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_api.map")
  set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  # Generate a helper script that runs check_symbol.sh on the built shared
+  # library and aborts with FATAL_ERROR when the script exits non-zero.
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
+    "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
+    " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference_api.so\" RESULT_VARIABLE symbol_res)\n"
+    "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
+    "  message(FATAL_ERROR \"Check symbol failed.\")\n"
+    "endif()\n")
+  # The stamp file is touched only after the check succeeds. Without it the
+  # declared OUTPUT would never exist and the check would re-run on every
+  # build; with it the check re-runs only when the library is relinked or the
+  # checking script changes.
+  add_custom_command(
+    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
+    COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
+    COMMAND ${CMAKE_COMMAND} -E touch "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
+    DEPENDS paddle_inference_api_shared
+            "${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
+    VERBATIM)
+  add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
 endif()
 cc_test(test_paddle_inference_api

--- a/paddle/contrib/inference/check_symbol.sh
+++ b/paddle/contrib/inference/check_symbol.sh
+#!/bin/bash
+lib=$1
+if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi
+num_paddle_syms=$(nm -D --defined-only ${lib} | grep paddle | wc -l)
+num_google_syms=$(nm -D --defined-only ${lib} | grep google | wc -l)
+if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi
+if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi
+exit 0
--- a/paddle/contrib/inference/demo/CMakeLists.txt
+++ b/paddle/contrib/inference/demo/CMakeLists.txt
@@ -13,8 +13,6 @@
 # limitations under the License.
 #
-inference_api_test(simple_on_word2vec ARGS test_word2vec)
 option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
 if(NOT WITH_INFERENCE_DEMO)
  return()

--- a/paddle/contrib/inference/demo_ci/CMakeLists.txt
+++ b/paddle/contrib/inference/demo_ci/CMakeLists.txt
+cmake_minimum_required(VERSION 3.0)
+project(cpp_inference_demo CXX C)
+# Build script for a single inference demo executable linked against an
+# installed Paddle tree.
+#   PADDLE_LIB      (required) root of the installed Paddle library tree
+#   DEMO_NAME       (required) demo to build; ${DEMO_NAME}.cc is the only source
+#   WITH_MKL        link MKLML (and MKLDNN when present) instead of OpenBLAS
+#   WITH_GPU        additionally link libcudart.so from CUDA_LIB
+#   WITH_STATIC_LIB link the static Paddle archives instead of the shared ones
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+# Refuse to configure without the two mandatory inputs.
+if(NOT DEFINED PADDLE_LIB)
+  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
+endif()
+if(NOT DEFINED DEMO_NAME)
+  message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
+endif()
+option(WITH_MKL        "Compile demo with MKL/OpenBlas support, default use MKL."       ON)
+option(WITH_GPU        "Compile demo with GPU/CPU, default use CPU."                    OFF)
+option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static."   ON)
+if(WITH_GPU)
+  set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
+endif()
+# Header search paths: the Paddle root first, then every bundled third-party
+# package, in the same order as before.
+set(third_party_install "${PADDLE_LIB}/third_party/install")
+include_directories("${PADDLE_LIB}")
+foreach(pkg protobuf glog gflags snappy snappystream zlib)
+  include_directories("${third_party_install}/${pkg}/include")
+endforeach()
+foreach(header_only boost eigen3)
+  include_directories("${PADDLE_LIB}/third_party/${header_only}")
+endforeach()
+# Library search paths for the third-party libraries linked by bare name below.
+foreach(pkg snappy snappystream protobuf glog gflags zlib)
+  link_directories("${third_party_install}/${pkg}/lib")
+endforeach()
+add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
+# Math backend: OpenBLAS unless MKL was requested.
+if(NOT WITH_MKL)
+  set(MATH_LIB ${third_party_install}/openblas/lib/libopenblas.a)
+else()
+  include_directories("${third_party_install}/mklml/include")
+  set(MATH_LIB ${third_party_install}/mklml/lib/libmklml_intel.so
+               ${third_party_install}/mklml/lib/libiomp5.so)
+  set(MKLDNN_PATH "${third_party_install}/mkldnn")
+  # MKLDNN is only used when its install directory exists.
+  if(EXISTS ${MKLDNN_PATH})
+    include_directories("${MKLDNN_PATH}/include")
+    set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
+  endif()
+endif()
+# Paddle libraries come first on the link line.
+if(NOT WITH_STATIC_LIB)
+  # Note: libpaddle_inference_api.so must be listed before libpaddle_fluid.so.
+  set(DEPS
+      ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.so
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
+else()
+  # libpaddle_fluid.a is wrapped in --whole-archive / --no-whole-archive.
+  set(DEPS
+      "-Wl,--whole-archive"
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a
+      "-Wl,--no-whole-archive"
+      ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.a)
+endif()
+set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+# Then the math backend, the third-party libraries and the system libraries.
+list(APPEND DEPS
+     ${MATH_LIB} ${MKLDNN_LIB}
+     glog gflags protobuf snappystream snappy z
+     ${EXTERNAL_LIB})
+if(WITH_GPU)
+  list(APPEND DEPS ${CUDA_LIB}/libcudart.so)
+endif()
+target_link_libraries(${DEMO_NAME} ${DEPS})
--- a/paddle/contrib/inference/demo_ci/run.sh
+++ b/paddle/contrib/inference/demo_ci/run.sh
+set -x
+PADDLE_ROOT=$1
+WITH_MKL=$2
+WITH_GPU=$3
+if [ $3 == "ON" ]; then
+  use_gpu_list='true false'
+else    
+  use_gpu_list='false'
+fi
+mkdir -p build
+cd build
+for WITH_STATIC_LIB in false; do
+  rm -rf *
+  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+    -DWITH_MKL=$WITH_MKL \
+    -DDEMO_NAME=simple_on_word2vec \
+    -DWITH_GPU=$WITH_GPU \
+    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
+  make
+  for use_gpu in $use_gpu_list; do
+    ./simple_on_word2vec \
+      --dirname=${PADDLE_ROOT}/build/python/paddle/fluid/tests/book/word2vec.inference.model \
+      --use_gpu=$use_gpu
+  done
+done
+if [ $? -eq 0 ]; then
+  exit 0
+else
+  echo "inference demo runs fail."
+  exit 1
+fi
+set +x
--- a/paddle/contrib/inference/demo/simple_on_word2vec.cc
+++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc
@@ -16,21 +16,27 @@ limitations under the License. */
 * This file contains a simple demo for how to take a model for inference.
 */
+#include <gflags/gflags.h>
 #include <glog/logging.h>
-#include <gtest/gtest.h>
 #include <memory>
 #include <thread>
-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include "contrib/inference/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"
+DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_bool(use_gpu, false, "Whether use gpu.");
 namespace paddle {
 namespace demo {
-DEFINE_string(dirname, "", "Directory of the inference model.");
 void Main(bool use_gpu) {
  //# 1. Create PaddlePredictor with a config.
  NativeConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  if (FLAGS_dirname.empty()) {
+    LOG(INFO) << "Usage: ./simple_on_word2vec --dirname=path/to/your/model";
+    exit(1);
+  }
+  config.model_dir = FLAGS_dirname;
  config.use_gpu = use_gpu;
  config.fraction_of_gpu_memory = 0.15;
  config.device = 0;
@@ -54,12 +60,16 @@ void Main(bool use_gpu) {
    CHECK(predictor->Run(slots, &outputs));
    //# 4. Get output.
-    ASSERT_EQ(outputs.size(), 1UL);
+    PADDLE_ENFORCE(outputs.size(), 1UL);
-    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
+    // Check the output buffer size and result of each tid.
+    PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
+    float result[5] = {
+        0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706};
    const size_t num_elements = outputs.front().data.length() / sizeof(float);
    // The outputs' buffers are in CPU memory.
    for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+      PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
+                     result[i]);
    }
  }
 }
@@ -68,7 +78,7 @@ void MainThreads(int num_threads, bool use_gpu) {
  // Multi-threads only support on CPU
  // 0. Create PaddlePredictor with a config.
  NativeConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.model_dir = FLAGS_dirname;
  config.use_gpu = use_gpu;
  config.fraction_of_gpu_memory = 0.15;
  config.device = 0;
@@ -94,14 +104,17 @@ void MainThreads(int num_threads, bool use_gpu) {
        CHECK(predictor->Run(inputs, &outputs));
        // 4. Get output.
-        ASSERT_EQ(outputs.size(), 1UL);
+        PADDLE_ENFORCE(outputs.size(), 1UL);
-        LOG(INFO) << "TID: " << tid << ", "
+        // Check the output buffer size and result of each tid.
-                  << "output buffer size: " << outputs.front().data.length();
+        PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
+        float result[5] = {
+            0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706};
        const size_t num_elements =
            outputs.front().data.length() / sizeof(float);
        // The outputs' buffers are in CPU memory.
        for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-          LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+          PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
+                         result[i]);
        }
      }
    });
@@ -111,15 +124,18 @@ void MainThreads(int num_threads, bool use_gpu) {
  }
 }
-TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }
-#ifdef PADDLE_WITH_CUDA
-TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
-#endif
 }  // namespace demo
 }  // namespace paddle
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  paddle::demo::Main(false /* use_gpu*/);
+  paddle::demo::MainThreads(1, false /* use_gpu*/);
+  paddle::demo::MainThreads(4, false /* use_gpu*/);
+  if (FLAGS_use_gpu) {
+    paddle::demo::Main(true /*use_gpu*/);
+    paddle::demo::MainThreads(1, true /*use_gpu*/);
+    paddle::demo::MainThreads(4, true /*use_gpu*/);
+  }
+  return 0;
+}
--- a/paddle/contrib/inference/paddle_inference_api.map
+++ b/paddle/contrib/inference/paddle_inference_api.map
+/* Linker version script, passed with -Wl,--version-script when linking the
+ * paddle_inference_api shared library. Symbols whose names match *paddle*
+ * are listed as global; every other symbol falls under the local catch-all.
+ */
+{
+	global:
+		*paddle*;
+	local:
+		*;
+};
--- a/paddle/contrib/inference/paddle_inference_api.sym
+++ b/paddle/contrib/inference/paddle_inference_api.sym
+*paddle*
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -13,6 +13,12 @@ endif()
 # Create static library
 cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
+if(NOT APPLE)
+  # TODO(liuyiqu): Temporarily disable the link flag because it is not supported on Mac.
+  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
+  set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+endif()
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
    SRCS io.cc

--- a/paddle/fluid/inference/paddle_fluid.sym
+++ b/paddle/fluid/inference/paddle_fluid.sym
+*paddle*
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -29,6 +29,79 @@ using mkldnn::stream;
 using platform::to_void_cast;
 using platform::GetMKLDNNFormat;
+// Helper that acquires the MKLDNN memories and the forward convolution
+// primitive for one convolution, delegating to the platform::MKLDNNHandler
+// base class and to the device context's blob storage (GetBlob/SetBlob),
+// keyed by strings derived from the base key.
+class ConvMKLDNNHandler : public platform::MKLDNNHandler {
+ public:
+  // conv_pd:  forward convolution primitive descriptor used to query the
+  //           src/weights/dst memory descriptors and to build the primitive.
+  // dev_ctx, engine, base_key: forwarded unchanged to MKLDNNHandler.
+  ConvMKLDNNHandler(
+      std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
+        conv_pd_(conv_pd) {}
+  // Returns the destination memory described by the primitive descriptor,
+  // using `ptr` as its data handle.
+  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
+    return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr,
+                                            "@dst_mem_p");
+  }
+  // Returns the source memory in the layout required by the primitive
+  // descriptor; `pipeline` is handed to AcquireMemory together with the user
+  // memory and both primitive descriptors.
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {
+    auto src_pd = conv_pd_->src_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
+                               pipeline);
+  }
+  // Same as AcquireSrcMemoryFromPrimitive, for the weights.
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {
+    auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
+    auto weights_pd = conv_pd_->weights_primitive_desc();
+    return this->AcquireMemory(weights_pd, user_weights_pd,
+                               user_weights_memory_p, "@weights_mem_p",
+                               pipeline);
+  }
+  // Returns the convolution primitive stored in the device context under
+  // key_ + "@conv_p", creating and storing it first when it is absent.
+  std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
+      std::shared_ptr<mkldnn::memory> src_memory_p,
+      std::shared_ptr<mkldnn::memory> weights_memory_p,
+      std::shared_ptr<mkldnn::memory> dst_memory_p) {
+    auto prim_key = key_ + "@conv_p";
+    auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
+        dev_ctx_.GetBlob(prim_key));
+    // If an earlier Acquire* call already reused stored data, the primitive
+    // is required to be stored as well.
+    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find convolution primitive in device context");
+    if (conv_p == nullptr) {
+      conv_p = std::make_shared<mkldnn::convolution_forward>(
+          *conv_pd_, *src_memory_p, *weights_memory_p, *dst_memory_p);
+      dev_ctx_.SetBlob(prim_key, conv_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_p;
+  }
+  // Generate keys for storing/retrieving primitives for this operator.
+  // The key concatenates the string forms of all dimension vectors, the
+  // group count and the caller-supplied suffix.
+  // TODO(jczaja): Make hashing function more optimal
+  static std::string GetHash(const memory::dims& input_dims,
+                             const memory::dims& weights_dims,
+                             const std::vector<int>& strides,
+                             const std::vector<int>& paddings,
+                             const std::vector<int>& dilations, int groups,
+                             const std::string& suffix) {
+    return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
+           dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
+           suffix;
+  }
+ private:
+  // Forward convolution primitive descriptor shared with the caller.
+  std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd_;
+};
 template <typename T>
 class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
@@ -36,10 +109,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");
-    // Get unique name for index
-    const std::string key = ctx.op().Output("Output");
-    const std::string key_conv_pd = key + "@conv_pd";
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -80,68 +149,62 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
        paddle::framework::vectorize2int(filter->dims());
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
-    // create mkldnn memory from input tensors (data/weights)
+    // Get unique name for storing MKLDNN primitives
-    auto user_src_memory = memory(
+    const std::string key = ConvMKLDNNHandler::GetHash(
-        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
+        src_tz, weights_tz, strides, paddings, dilations, groups,
-        to_void_cast(input_data));
+        ctx.op().Output("Output"));
-    auto user_weights_memory =
+    const std::string key_conv_pd = key + "@conv_pd";
-        memory({{{weights_tz}, memory::data_type::f32, filter->format()},
-                mkldnn_engine},
+    std::vector<primitive> pipeline;
-               to_void_cast(filter_data));
+    auto user_src_md = platform::MKLDNNMemDesc(
+        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+    auto user_weights_md = platform::MKLDNNMemDesc(
+        {weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
    /* create memory descriptor for convolution without specified format
     * ('any') which lets a primitive (convolution in this case) choose
     * the memory format preferred for best performance
     */
-    auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
+    auto src_md = platform::MKLDNNMemDesc(
-                                          memory::format::any);
+        src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
    auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, memory::data_type::f32, memory::format::any);
+        weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
-    auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
+    auto dst_md = platform::MKLDNNMemDesc(
-                                          memory::format::any);
+        dst_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
    // create a conv primitive descriptor and save it for usage in backward
    std::shared_ptr<conv_fwd::primitive_desc> conv_pd = ConvFwdPrimitiveDesc(
        src_md, weights_md, dst_md, strides, paddings, mkldnn_engine);
+    // Save conv_pd/src_memory/weights_memory for backward pass
+    dev_ctx.SetBlob(key_conv_pd, conv_pd);
-    // create reorder primitive if the input format is not the preferred one
+    ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
-    auto src_memory = user_src_memory;
-    primitive reorder_src;
+    // create mkldnn memory from input tensors (data/weights)
-    bool is_src_reordered = false;
+    auto user_src_memory_p =
-    if (memory::primitive_desc(conv_pd->src_primitive_desc()) !=
+        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
-        user_src_memory.get_primitive_desc()) {
+    auto user_weights_memory_p = handler.AcquireWeightsMemory(
-      src_memory = memory(conv_pd->src_primitive_desc());
+        user_weights_md, to_void_cast<T>(filter_data));
-      reorder_src = reorder(user_src_memory, src_memory);
-      is_src_reordered = true;
-    }
-    auto weights_memory = user_weights_memory;
-    primitive reorder_weights;
-    bool is_weights_reordered = false;
-    if (memory::primitive_desc(conv_pd->weights_primitive_desc()) !=
-        user_weights_memory.get_primitive_desc()) {
-      weights_memory = memory(conv_pd->weights_primitive_desc());
-      reorder_weights = reorder(user_weights_memory, weights_memory);
-      is_weights_reordered = true;
-    }
-    // create memory primitive for conv dst
+    // create reorder primitive if the input format is not the preferred one
-    auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data);
+    auto src_memory_p =
+        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
+    auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
+        user_weights_memory_p, pipeline);
+    auto dst_memory_p =
+        handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
    // create convolution op primitive
-    auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory);
+    auto conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                             dst_memory_p);
    // push primitive to stream and wait until it's executed
-    std::vector<primitive> pipeline;
+    pipeline.push_back(*conv_p);
-    if (is_src_reordered) pipeline.push_back(reorder_src);
-    if (is_weights_reordered) pipeline.push_back(reorder_weights);
-    pipeline.push_back(conv_prim);
    stream(stream::kind::eager).submit(pipeline).wait();
-    // Save conv_pd/src_memory/weights_memory for backward pass
-    dev_ctx.SetBlob(key_conv_pd, conv_pd);
    output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(GetMKLDNNFormat(dst_memory));
+    output->set_format(GetMKLDNNFormat(*dst_memory_p));
  }
 private:
@@ -197,13 +260,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    if (!input_grad && !filter_grad) return;
-    // Get an unique name from "argument" name of "Output" variable
-    // This name will be used as key when saving info into device context
-    const std::string key = ctx.op().Input("Output");
-    const std::string key_conv_pd = key + "@conv_pd";
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
    const T* input_data = input->data<T>();
    const T* filter_data = filter->data<T>();
@@ -223,6 +283,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
        paddle::framework::vectorize2int(filter->dims());
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    // Get an unique name from "argument" name of "Output" variable
+    // This name will be used as key when saving info into device context
+    const std::string key =
+        ConvMKLDNNHandler::GetHash(src_tz, weights_tz, strides, paddings,
+                                   dilations, groups, ctx.op().Input("Output"));
+    const std::string key_conv_pd = key + "@conv_pd";
    // create mkldnn memory from input tensors (input/weights/output_grad)
    auto user_src_memory = memory(
        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},

--- a/paddle/fluid/operators/detection/prior_box_op.cc
+++ b/paddle/fluid/operators/detection/prior_box_op.cc
@@ -149,6 +149,13 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(float) "
                   "Prior boxes center offset.")
        .SetDefault(0.5);
+    AddAttr<bool>(
+        "min_max_aspect_ratios_order",
+        "(bool) If set True, the output prior box is in order of"
+        "[min, max, aspect_ratios], which is consistent with Caffe."
+        "Please note, this order affects the weights order of convolution layer"
+        "followed by and does not affect the final detection results.")
+        .SetDefault(false);
    AddComment(R"DOC(
 Prior box operator
 Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.

--- a/paddle/fluid/operators/detection/prior_box_op.cu
+++ b/paddle/fluid/operators/detection/prior_box_op.cu
@@ -28,8 +28,8 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
                            const int im_width, const int as_num,
                            const T offset, const T step_width,
                            const T step_height, const T* min_sizes,
-                            const T* max_sizes, const int min_num,
+                            const T* max_sizes, const int min_num, bool is_clip,
-                            bool is_clip) {
+                            bool min_max_aspect_ratios_order) {
  int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num;
  int box_num = height * width * num_priors;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
@@ -44,6 +44,7 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
    T min_size = min_sizes[m];
    if (max_sizes) {
      int s = p % (as_num + 1);
+      if (!min_max_aspect_ratios_order) {
        if (s < as_num) {
          T ar = aspect_ratios[s];
          bw = min_size * sqrt(ar) / 2.;
@@ -53,6 +54,19 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
          bw = sqrt(min_size * max_size) / 2.;
          bh = bw;
        }
+      } else {
+        if (s == 0) {
+          bw = bh = min_size / 2.;
+        } else if (s == 1) {
+          T max_size = max_sizes[m];
+          bw = sqrt(min_size * max_size) / 2.;
+          bh = bw;
+        } else {
+          T ar = aspect_ratios[s - 1];
+          bw = min_size * sqrt(ar) / 2.;
+          bh = min_size / sqrt(ar) / 2.;
+        }
+      }
    } else {
      int s = p % as_num;
      T ar = aspect_ratios[s];
@@ -94,6 +108,8 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
    auto variances = ctx.Attr<std::vector<float>>("variances");
    auto flip = ctx.Attr<bool>("flip");
    auto clip = ctx.Attr<bool>("clip");
+    auto min_max_aspect_ratios_order =
+        ctx.Attr<bool>("min_max_aspect_ratios_order");
    std::vector<float> aspect_ratios;
    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
@@ -149,7 +165,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
    GenPriorBox<T><<<grid, block, 0, stream>>>(
        boxes->data<T>(), r.data<T>(), height, width, im_height, im_width,
        aspect_ratios.size(), offset, step_width, step_height, min.data<T>(),
-        max_data, min_num, clip);
+        max_data, min_num, clip, min_max_aspect_ratios_order);
    framework::Tensor v;
    framework::TensorFromVector(variances, ctx.device_context(), &v);

--- a/paddle/fluid/operators/detection/prior_box_op.h
+++ b/paddle/fluid/operators/detection/prior_box_op.h
@@ -68,6 +68,8 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
    auto variances = ctx.Attr<std::vector<float>>("variances");
    auto flip = ctx.Attr<bool>("flip");
    auto clip = ctx.Attr<bool>("clip");
+    auto min_max_aspect_ratios_order =
+        ctx.Attr<bool>("min_max_aspect_ratios_order");
    std::vector<float> aspect_ratios;
    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
@@ -108,6 +110,38 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
        int idx = 0;
        for (size_t s = 0; s < min_sizes.size(); ++s) {
          auto min_size = min_sizes[s];
+          if (min_max_aspect_ratios_order) {
+            box_width = box_height = min_size / 2.;
+            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
+            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
+            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
+            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
+            idx++;
+            if (max_sizes.size() > 0) {
+              auto max_size = max_sizes[s];
+              // square prior with size sqrt(minSize * maxSize)
+              box_width = box_height = sqrt(min_size * max_size) / 2.;
+              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
+              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
+              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
+              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
+              idx++;
+            }
+            // priors with different aspect ratios
+            for (size_t r = 0; r < aspect_ratios.size(); ++r) {
+              float ar = aspect_ratios[r];
+              if (fabs(ar - 1.) < 1e-6) {
+                continue;
+              }
+              box_width = min_size * sqrt(ar) / 2.;
+              box_height = min_size / sqrt(ar) / 2.;
+              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
+              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
+              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
+              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
+              idx++;
+            }
+          } else {
            // priors with different aspect ratios
            for (size_t r = 0; r < aspect_ratios.size(); ++r) {
              float ar = aspect_ratios[r];
@@ -132,6 +166,7 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
          }
        }
      }
+    }
    if (clip) {
      platform::Transform<platform::CPUDeviceContext> trans;

--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -222,15 +222,16 @@ class MKLDNNHandler {
  static std::string GetHash(mkldnn::memory::dims& operand_dims,  // NOLINT
                             const std::string& suffix) {
-    auto dims2str = [](const mkldnn::memory::dims& operand_dims) {
+    return dims2str(operand_dims) + suffix;
+  };
+ protected:
+  static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
    std::string dstr = "";
    for (size_t i = 0; i < operand_dims.size(); ++i) {
      dstr += std::to_string(operand_dims[i]) + "-";
    }
    return dstr;
-    };
-    return dims2str(operand_dims) + suffix;
  }
 protected:

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -510,11 +510,23 @@ function gen_fluid_inference_lib() {
 EOF
        make -j `nproc` inference_lib_dist
        cd ${PADDLE_ROOT}/build
-        mv fluid_install_dir fluid
+        cp -r fluid_install_dir fluid
        tar -cf fluid.tgz fluid
      fi
 }
+function test_fluid_inference_lib() {
+    if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
+        cat <<EOF
+    ========================================
+    Testing fluid inference library ...
+    ========================================
+EOF
+        cd ${PADDLE_ROOT}/paddle/contrib/inference/demo_ci
+        sh run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF}
+      fi
+}
 function main() {
    set -e
    local CMD=$1
@@ -568,6 +580,7 @@ function main() {
        run_test
        gen_capi_package
        gen_fluid_inference_lib
+        test_fluid_inference_lib
        ;;
      *)
        print_usage

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -789,7 +789,8 @@ def prior_box(input,
              clip=False,
              steps=[0.0, 0.0],
              offset=0.5,
-              name=None):
+              name=None,
+              min_max_aspect_ratios_order=False):
    """
    **Prior Box Operator**
@@ -818,6 +819,11 @@ def prior_box(input,
            Default: [0., 0.]
       offset(float): Prior boxes center offset. Default: 0.5
       name(str): Name of the prior box op. Default: None.
+       min_max_aspect_ratios_order(bool): If set True, the output prior box is
+            in order of [min, max, aspect_ratios], which is consistent with 
+            Caffe. Please note, this order affects the weights order of
+            convolution layer followed by and does not affect the final
+            detection results. Default: False.
    Returns:
        tuple: A tuple with two Variable (boxes, variances)
@@ -871,7 +877,8 @@ def prior_box(input,
        'clip': clip,
        'step_w': steps[0],
        'step_h': steps[1],
-        'offset': offset
+        'offset': offset,
+        'min_max_aspect_ratios_order': min_max_aspect_ratios_order
    }
    if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0:
        if not _is_list_or_tuple_(max_sizes):
@@ -911,7 +918,8 @@ def multi_box_head(inputs,
                   kernel_size=1,
                   pad=0,
                   stride=1,
-                   name=None):
+                   name=None,
+                   min_max_aspect_ratios_order=False):
    """
    Generate prior boxes for SSD(Single Shot MultiBox Detector)
    algorithm. The details of this algorithm, please refer the
@@ -954,6 +962,11 @@ def multi_box_head(inputs,
       pad(int|list|tuple): The padding of conv2d. Default:0.
       stride(int|list|tuple): The stride of conv2d. Default:1,
       name(str): Name of the prior box layer. Default: None.
+       min_max_aspect_ratios_order(bool): If set True, the output prior box is
+            in order of [min, max, aspect_ratios], which is consistent with
+            Caffe. Please note, this order affects the weight order of the
+            convolution layer that follows, but does not affect the final
+            detection results. Default: False.
    Returns:
        tuple: A tuple with four Variables. (mbox_loc, mbox_conf, boxes, variances)
@@ -1068,7 +1081,8 @@ def multi_box_head(inputs,
        step = [step_w[i] if step_w else 0.0, step_h[i] if step_w else 0.0]
        box, var = prior_box(input, image, min_size, max_size, aspect_ratio,
-                             variance, flip, clip, step, offset)
+                             variance, flip, clip, step, offset, None,
+                             min_max_aspect_ratios_order)
        box_results.append(box)
        var_results.append(var)

--- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
@@ -32,6 +32,7 @@ class TestPriorBoxOp(OpTest):
            'variances': self.variances,
            'flip': self.flip,
            'clip': self.clip,
+            'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order,
            'step_w': self.step_w,
            'step_h': self.step_h,
            'offset': self.offset
@@ -52,6 +53,9 @@ class TestPriorBoxOp(OpTest):
        max_sizes = [5, 10]
        self.max_sizes = np.array(max_sizes).astype('float32').tolist()
+    # Sets the value this test passes as the op's
+    # 'min_max_aspect_ratios_order' attribute; the base test uses False.
+    def set_min_max_aspect_ratios_order(self):
+        self.min_max_aspect_ratios_order = False
    def init_test_params(self):
        self.layer_w = 32
        self.layer_h = 32
@@ -71,6 +75,7 @@ class TestPriorBoxOp(OpTest):
        self.set_max_sizes()
        self.aspect_ratios = [2.0, 3.0]
        self.flip = True
+        self.set_min_max_aspect_ratios_order()
        self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
        self.aspect_ratios = np.array(
            self.aspect_ratios, dtype=np.float).flatten()
@@ -78,7 +83,6 @@ class TestPriorBoxOp(OpTest):
        self.variances = np.array(self.variances, dtype=np.float).flatten()
        self.clip = True
        self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes)
        if len(self.max_sizes) > 0:
            self.num_priors += len(self.max_sizes)
@@ -106,26 +110,60 @@ class TestPriorBoxOp(OpTest):
                idx = 0
                for s in range(len(self.min_sizes)):
                    min_size = self.min_sizes[s]
+                    if not self.min_max_aspect_ratios_order:
                        # rest of priors
                        for r in range(len(self.real_aspect_ratios)):
                            ar = self.real_aspect_ratios[r]
                            c_w = min_size * math.sqrt(ar) / 2
                            c_h = (min_size / math.sqrt(ar)) / 2
-                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
+                            out_boxes[h, w, idx, :] = [
-                                                   (c_y - c_h) / self.image_h,
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                                   (c_x + c_w) / self.image_w,
+                                self.image_h, (c_x + c_w) / self.image_w,
-                                                   (c_y + c_h) / self.image_h]
+                                (c_y + c_h) / self.image_h
+                            ]
                            idx += 1
                        if len(self.max_sizes) > 0:
                            max_size = self.max_sizes[s]
                            # second prior: aspect_ratio = 1,
                            c_w = c_h = math.sqrt(min_size * max_size) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1
+                    else:
+                        c_w = c_h = min_size / 2.
                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
                                                   (c_y - c_h) / self.image_h,
                                                   (c_x + c_w) / self.image_w,
                                                   (c_y + c_h) / self.image_h]
                        idx += 1
+                        if len(self.max_sizes) > 0:
+                            max_size = self.max_sizes[s]
+                            # second prior: aspect_ratio = 1,
+                            c_w = c_h = math.sqrt(min_size * max_size) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1
+                        # rest of priors
+                        for r in range(len(self.real_aspect_ratios)):
+                            ar = self.real_aspect_ratios[r]
+                            if abs(ar - 1.) < 1e-6:
+                                continue
+                            c_w = min_size * math.sqrt(ar) / 2
+                            c_h = (min_size / math.sqrt(ar)) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1
        # clip the prior's coordinate such that it is within [0, 1]
        if self.clip:
@@ -137,10 +175,15 @@ class TestPriorBoxOp(OpTest):
        self.out_var = out_var.astype('float32')
-class TestPriorBoxOpWithMaxSize(TestPriorBoxOp):
+class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp):
    def set_max_sizes(self):
        self.max_sizes = []
+# Variant of TestPriorBoxOp that sets min_max_aspect_ratios_order to True,
+# so the expected boxes are built by the [min, max, aspect_ratios] branch.
+class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp):
+    def set_min_max_aspect_ratios_order(self):
+        self.min_max_aspect_ratios_order = True
 if __name__ == '__main__':
    unittest.main()