diff --git a/README.md b/README.md
index 8d89c6b1ec9e4aefbd64328dedb4e8c7cc50c21b..63abca069a6629ac59739224ded9cd9f06207d0a 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,6 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
 [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
 [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
-[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 9c42044ec163e9db1dd21d5c3915b010c30fdf1c..c4deef6f579a64c911b734699d4e9b221fa578a6 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID)
     set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
 endif(NOT APPLE AND NOT ANDROID)
 
+set_property(GLOBAL PROPERTY FLUID_MODULES "")
+# Record TARGET_NAME in the global FLUID_MODULES property when its absolute
+# path, with the leading PADDLE_SOURCE_DIR removed, contains "fluid" past
+# index 1. The list feeds the paddle fluid static / inference libraries.
+function(find_fluid_modules TARGET_NAME)
+  get_filename_component(__target_path "${TARGET_NAME}" ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path "${__target_path}")
+  string(FIND "${__target_path}" "fluid" pos)
+  if(pos GREATER 1)
+    # APPEND onto the initially empty property adds no leading ";".
+    set_property(GLOBAL APPEND PROPERTY FLUID_MODULES "${TARGET_NAME}")
+  endif()
+endfunction(find_fluid_modules)
+
 function(merge_static_libs TARGET_NAME)
   set(libs ${ARGN})
   list(REMOVE_DUPLICATES libs)
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 1231a111f091b3c965a07114ba6496602c16dba8..0c720faa353438b76a72e1574cb90931ddd0cf73 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -12,19 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set_property(GLOBAL PROPERTY FLUID_MODULES "")
-# find all fluid modules is used for paddle fluid static library
-function(find_fluid_modules TARGET_NAME)
-  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(FIND "${__target_path}" "fluid" pos)
-  if(pos GREATER 1)
-    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
-  endif()
-endfunction(find_fluid_modules)
-
 # make package for paddle fluid shared and static library
 function(copy TARGET)
     set(options "")
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index bc48fd3b479157d4aea390cd5f4dc61ea46dca4b..cd00b7de7338982308acfa1f1e8c38e010c6a43b 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -147,9 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                  "Input tensor type is not supported: ", in.type().name());
   memory::data_type out_type = in_type;
 
-  auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format());
+  auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
   auto out_format =
-      MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
+      platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
 
   void* in_data = GetDataFromTensor(in, in_type);
 
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
index 67f91e4e48d3e11ed493c5e6943cb9071aff60c4..90bb206ec6b698bc23ad1a5c9609a25186ec6de8 100644
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -62,12 +62,6 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
   return MKLDNNDataType::data_undef;
 }
 
-inline MKLDNNFormat MKLDNNFormatForSize(size_t dims_size,
-                                        MKLDNNFormat default_format) {
-  return (dims_size == 1
-              ? mkldnn::memory::format::x
-              : dims_size == 2 ? mkldnn::memory::format::nc : default_format);
-}
 #endif
 
 void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc
index 635eb404d4a94e9a765f511bb249ce0cce22c477..82872224501709080ff02a13464d58543a0abda8 100644
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace framework {
 
@@ -48,8 +52,8 @@ void TransformData(const OpKernelType &expected_kernel_type,
         // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
         // Just set layout/format. No real transform occur
 
-        auto out_format =
-            MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin));
+        auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
+                                                        ToMKLDNNFormat(lin));
 
         out.ShareDataWith(input_tensor);
         out.set_layout(DataLayout::kMKLDNN);
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 31516a884baefc2e50b7774a7993740f6930212f..f98011e896f4033ef210e0eb69f93ce7800a3cd6 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -73,18 +73,12 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
       memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
                    stream);
     } else {
-      // NOTE(zcd): Because TensorCopy is an async operation, when the src_place
-      // and dst_place are two different GPU, to ensure that the operation can
-      // be carried out correctly, we should make ctx wait.
-      // If ctx_place and src_place are the same, we should add ctx.Wait()
-      // after memory::Copy; if ctx_place and dst_place are the same, we should
-      // add ctx.Wait() before memory::Copy.
       if (platform::is_same_place(ctx_place, src_place)) {
         memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
                      stream);
-        ctx.Wait();
+        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
       } else if (platform::is_same_place(ctx_place, dst_place)) {
-        ctx.Wait();
+        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
         memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
                      stream);
       } else {
@@ -97,13 +91,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
 
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst) {
-  // NOTE(zcd): If the src.place() and dst_place are two different GPU,
-  // the copy operation is carried out on the dst_place's stream. This is
-  // very important, because TensorCopy is an async operator, and in most
-  // case, once this copy operator returns, dst is to be used in dst_place's
-  // stream, if this copy operation is carried out on the src_place's stream,
-  // when dst is used in dst_place's stream the copy operation may be
-  // not completed.
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   const platform::DeviceContext* dev_ctx;
   if (platform::is_gpu_place(dst_place)) {
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index dca279b69382b80e055f661cefe84b81326704b5..4457382ade37a12f5f3613fc4113fbf1f6f91124 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -23,10 +23,25 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+// NOTE(zcd): TensorCopy is an asynchronous operation. When src_place and
+// dst_place are two different GPUs, TensorCopy waits on the source device
+// context (src_ctx) so that the copy is carried out correctly.
+// If ctx_place and src_place are the same, src_ctx.Wait() is called
+// after memory::Copy; if ctx_place and dst_place are the same,
+// src_ctx.Wait() is called before memory::Copy.
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 const platform::DeviceContext& ctx, Tensor* dst);
+
+// NOTE(zcd): If src.place() and dst_place are two different GPUs,
+// the copy is carried out on dst_place's stream. This is very
+// important because TensorCopy is an asynchronous operation, and in most
+// cases dst is used on dst_place's stream as soon as the copy call
+// returns. If the copy were carried out on src_place's stream instead,
+// it might not have completed by the time dst is used on dst_place's
+// stream.
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst);
+
 void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                     Tensor* dst);
 
diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
index 6ecb43c49c30f9da2a273d506f7b85c0a4f5fa2c..9ab2179b5fe689762704039c5f67dd080e530aa5 100644
--- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -115,9 +115,12 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
 
     // create mkldnn memory from input x tensor
-    auto src_memory =
-        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
-               to_void_cast(x_data));
+    mkldnn::memory::format input_format =
+        platform::MKLDNNFormatForSize(src_tz.size(), x->format());
+
+    auto src_memory = memory(
+        {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
+        to_void_cast(x_data));
 
     // create primitive descriptor for batch norm forward
     using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
@@ -251,15 +254,21 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
 
     // create mkldnn memory from input diff_y tensor
-    auto user_diff_dst_memory =
-        memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()},
-                mkldnn_engine},
-               to_void_cast(diff_y_data));
+    // Take the rank from diff_dst_tz, the dims this diff_y memory is built on.
+    mkldnn::memory::format dst_format =
+        platform::MKLDNNFormatForSize(diff_dst_tz.size(), diff_y->format());
+
+    auto user_diff_dst_memory = memory(
+        {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
+        to_void_cast(diff_y_data));
 
     // create mkldnn memory from input x tensor
-    auto src_memory =
-        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
-               to_void_cast(x_data));
+    mkldnn::memory::format input_format =
+        platform::MKLDNNFormatForSize(src_tz.size(), x->format());
+
+    auto src_memory = memory(
+        {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
+        to_void_cast(x_data));
 
     // for diff_dst, try to use same format as dst in forward pass
     auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc();
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index ed99932546446eb877c9701de15e2d37d29b5f88..a6cccc31219104767ac38bdebeb1d4c0e8c2ac01 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -228,7 +228,7 @@ class MKLDNNHandler {
       return dstr;
     };
     return dims2str(operand_dims) + suffix;
-  };
+  }
 
  protected:
   const MKLDNNDeviceContext& dev_ctx_;
@@ -237,5 +237,15 @@ class MKLDNNHandler {
   bool is_reusing_;
 };
 
+// Selects a memory format from the number of dimensions: format::x for
+// one dimension, format::nc for two dimensions; for any other count the
+// caller-supplied data_format is returned unchanged.
+inline mkldnn::memory::format MKLDNNFormatForSize(
+    size_t dims_size, mkldnn::memory::format data_format) {
+  if (dims_size == 1) return mkldnn::memory::format::x;
+  if (dims_size == 2) return mkldnn::memory::format::nc;
+  return data_format;
+}
+
 }  // namespace platform
 }  // namespace paddle