Commit a1e7f2d5 authored by chenweihang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into unsqueeze_op

@@ -4,7 +4,6 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
 [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
 [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
......
@@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID)
     set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
 endif(NOT APPLE AND NOT ANDROID)
+set_property(GLOBAL PROPERTY FLUID_MODULES "")
+# find all fluid modules used for the paddle fluid static library,
+# and for building inference libs
+function(find_fluid_modules TARGET_NAME)
+  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
+  string(FIND "${__target_path}" "fluid" pos)
+  if(pos GREATER 1)
+    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
+    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
+  endif()
+endfunction(find_fluid_modules)
+
 function(merge_static_libs TARGET_NAME)
   set(libs ${ARGN})
   list(REMOVE_DUPLICATES libs)
......
@@ -12,19 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set_property(GLOBAL PROPERTY FLUID_MODULES "")
-# find all fluid modules is used for paddle fluid static library
-function(find_fluid_modules TARGET_NAME)
-  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(FIND "${__target_path}" "fluid" pos)
-  if(pos GREATER 1)
-    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
-  endif()
-endfunction(find_fluid_modules)
-
 # make package for paddle fluid shared and static library
 function(copy TARGET)
   set(options "")
......
@@ -147,9 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                  "Input tensor type is not supported: ", in.type().name());
   memory::data_type out_type = in_type;
-  auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format());
+  auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
   auto out_format =
-      MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
+      platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
   void* in_data = GetDataFromTensor(in, in_type);
......
@@ -62,12 +62,6 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
   return MKLDNNDataType::data_undef;
 }
-inline MKLDNNFormat MKLDNNFormatForSize(size_t dims_size,
-                                        MKLDNNFormat default_format) {
-  return (dims_size == 1
-              ? mkldnn::memory::format::x
-              : dims_size == 2 ? mkldnn::memory::format::nc : default_format);
-}
 #endif
 void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
......
@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/data_type_transform.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace framework {
@@ -48,8 +52,8 @@ void TransformData(const OpKernelType &expected_kernel_type,
       // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
       // Just set layout/format. No real transform occur
-      auto out_format =
-          MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin));
+      auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
+                                                      ToMKLDNNFormat(lin));
       out.ShareDataWith(input_tensor);
       out.set_layout(DataLayout::kMKLDNN);
......
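The Case1 branch above is a layout-only transform: the output tensor aliases the input buffer via ShareDataWith and only the layout tag changes, so no bytes are copied. A minimal self-contained sketch of that idea, using a hypothetical MiniTensor type rather than Paddle's Tensor:

```cpp
#include <cassert>
#include <memory>
#include <vector>

enum class Layout { kNCHW, kMKLDNN };

struct MiniTensor {
  std::shared_ptr<std::vector<float>> data;  // shared, refcounted storage
  Layout layout = Layout::kNCHW;
  void ShareDataWith(const MiniTensor& other) { data = other.data; }
  void set_layout(Layout l) { layout = l; }
};

int main() {
  MiniTensor in;
  in.data = std::make_shared<std::vector<float>>(8, 1.0f);

  MiniTensor out;
  out.ShareDataWith(in);            // no copy: alias the same buffer
  out.set_layout(Layout::kMKLDNN);  // only the layout tag changes

  assert(out.data.get() == in.data.get());  // same storage, new metadata
  return 0;
}
```

Because the storage is shared, this "transform" costs O(1) regardless of tensor size.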
@@ -73,18 +73,12 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
       memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
                    stream);
     } else {
-      // NOTE(zcd): Because TensorCopy is an async operation, when the src_place
-      // and dst_place are two different GPU, to ensure that the operation can
-      // be carried out correctly, we should make ctx wait.
-      // If ctx_place and src_place are the same, we should add ctx.Wait()
-      // after memory::Copy; if ctx_place and dst_place are the same, we should
-      // add ctx.Wait() before memory::Copy.
       if (platform::is_same_place(ctx_place, src_place)) {
         memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
                      stream);
-        ctx.Wait();
+        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
       } else if (platform::is_same_place(ctx_place, dst_place)) {
-        ctx.Wait();
+        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
         memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
                      stream);
       } else {
@@ -97,13 +91,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst) {
-  // NOTE(zcd): If the src.place() and dst_place are two different GPU,
-  // the copy operation is carried out on the dst_place's stream. This is
-  // very important, because TensorCopy is an async operator, and in most
-  // case, once this copy operator returns, dst is to be used in dst_place's
-  // stream, if this copy operation is carried out on the src_place's stream,
-  // when dst is used in dst_place's stream the copy operation may be
-  // not completed.
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   const platform::DeviceContext* dev_ctx;
   if (platform::is_gpu_place(dst_place)) {
......
@@ -23,10 +23,25 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
+
+// NOTE(zcd): Because TensorCopy is an async operation, when the src_place
+// and dst_place are two different GPUs, to ensure that the operation can
+// be carried out correctly, there is a src_ctx wait operation in TensorCopy.
+// If ctx_place and src_place are the same, src_ctx.Wait() is added
+// after memory::Copy; if ctx_place and dst_place are the same,
+// src_ctx.Wait() is added before memory::Copy.
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 const platform::DeviceContext& ctx, Tensor* dst);
+
+// NOTE(zcd): If src.place() and dst_place are two different GPUs,
+// the copy operation is carried out on the dst_place's stream. This is
+// very important, because TensorCopy is an async operator, and in most
+// cases, once this copy operator returns, dst is to be used on dst_place's
+// stream. If the copy were carried out on the src_place's stream instead,
+// the copy might not yet be completed when dst is used on dst_place's
+// stream.
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst);
 void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                     Tensor* dst);
......
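The two NOTEs above encode a stream-ordering rule for cross-GPU copies. As a hedged illustration only, written against the raw CUDA runtime rather than Paddle's DeviceContext API (the function name and parameters are assumptions for the sketch), the rule looks like this:

```cpp
#include <cuda_runtime.h>

// Sketch of the wait placement for a cross-GPU async copy. `copy_stream` is
// the stream the copy is enqueued on; `src_stream` is where the writes that
// produced `src` were enqueued. Both names are illustrative, not Paddle's.
void CopyPeerOrdered(void* dst, int dst_dev, const void* src, int src_dev,
                     size_t bytes, cudaStream_t copy_stream,
                     cudaStream_t src_stream) {
  if (copy_stream == src_stream) {
    // Copy runs on the source stream: producers of `src` are already
    // ordered before it, so wait *after* the copy, ensuring `dst` is
    // complete before the destination side touches it.
    cudaMemcpyPeerAsync(dst, dst_dev, src, src_dev, bytes, copy_stream);
    cudaStreamSynchronize(src_stream);
  } else {
    // Copy runs on the destination stream: wait on the source stream
    // *before* copying, so every pending write to `src` has landed.
    cudaStreamSynchronize(src_stream);
    cudaMemcpyPeerAsync(dst, dst_dev, src, src_dev, bytes, copy_stream);
  }
}
```

The separate TensorCopySync entry point avoids the question entirely by blocking until the copy completes.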
@@ -115,9 +115,12 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
     // create mkldnn memory from input x tensor
-    auto src_memory =
-        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
-               to_void_cast(x_data));
+    mkldnn::memory::format input_format =
+        platform::MKLDNNFormatForSize(src_tz.size(), x->format());
+    auto src_memory = memory(
+        {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
+        to_void_cast(x_data));
     // create primitive descriptor for batch norm forward
     using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
@@ -251,15 +254,21 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
     // create mkldnn memory from input diff_y tensor
-    auto user_diff_dst_memory =
-        memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()},
-                mkldnn_engine},
-               to_void_cast(diff_y_data));
+    mkldnn::memory::format dst_format =
+        platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
+    auto user_diff_dst_memory = memory(
+        {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
+        to_void_cast(diff_y_data));
     // create mkldnn memory from input x tensor
-    auto src_memory =
-        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
-               to_void_cast(x_data));
+    mkldnn::memory::format input_format =
+        platform::MKLDNNFormatForSize(src_tz.size(), x->format());
+    auto src_memory = memory(
+        {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
+        to_void_cast(x_data));
     // for diff_dst, try to use same format as dst in forward pass
     auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc();
......
@@ -228,7 +228,7 @@ class MKLDNNHandler {
       return dstr;
     };
     return dims2str(operand_dims) + suffix;
-  };
+  }
  protected:
   const MKLDNNDeviceContext& dev_ctx_;
@@ -237,5 +237,15 @@ class MKLDNNHandler {
   bool is_reusing_;
 };
+inline mkldnn::memory::format MKLDNNFormatForSize(
+    size_t dims_size, mkldnn::memory::format data_format) {
+  if (dims_size == 1) {
+    return mkldnn::memory::format::x;
+  } else if (dims_size == 2) {
+    return mkldnn::memory::format::nc;
+  }
+  return data_format;
+}
 }  // namespace platform
 }  // namespace paddle
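A small usage sketch of the relocated helper (assuming an MKL-DNN build of Paddle; `Demo` is an illustrative function, not part of the codebase):

```cpp
#include <cassert>
#include "paddle/fluid/platform/mkldnn_helper.h"

void Demo() {
  namespace pf = paddle::platform;
  // 1-D tensors (e.g. batch-norm mean/variance) always map to format::x.
  assert(pf::MKLDNNFormatForSize(1, mkldnn::memory::format::nchw) ==
         mkldnn::memory::format::x);
  // 2-D tensors map to nc.
  assert(pf::MKLDNNFormatForSize(2, mkldnn::memory::format::nchw) ==
         mkldnn::memory::format::nc);
  // Higher ranks keep whatever format the tensor already carries.
  assert(pf::MKLDNNFormatForSize(4, mkldnn::memory::format::nchw) ==
         mkldnn::memory::format::nchw);
}
```

Centralizing this mapping in paddle::platform lets the framework-level layout transforms and the MKLDNN operator kernels in this commit share a single definition.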