Merge branch 'develop' of https://github.com/paddlepaddle/paddle into add_benchmark_for_trt

test=develop

Merge branch 'develop' of https://github.com/paddlepaddle/paddle into add_benchmark_for_trt
test=develop
a5bfed37 · nhzlx · afc51e6f · bc6d0a34 · a5bfed37 · a5bfed37
4 changed file
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
-### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1)
+### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -27,9 +27,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.1.0.post87
+pip install paddlepaddle-gpu==1.2.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.1.0.post85
+pip install paddlepaddle-gpu==1.2.0.post85
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.1.0.post85
 ## Installation
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
 ## Documentation
-We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation.
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation.
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
  You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
  You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
   Our new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
   We appreciate your contributions!

--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -151,19 +151,22 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
  auto out_format =
      platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
-  void* in_data = GetDataFromTensor(in, in_type);
  // output tensor has the same dims as input. Reorder don't change dims
  out->Resize(in.dims());
+  if (in_format != out_format) {
+    void* in_data = GetDataFromTensor(in, in_type);
    auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
-  auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
+    auto in_memory =
+        memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
    auto out_memory =
        memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
    platform::Reorder(in_memory, out_memory);
+  } else {
+    out->ShareDataWith(in);
+  }
  out->set_layout(out_layout);
  // reset format since the out tensor will be feed to non-MKLDNN OPkernel
  out->set_format(memory::format::format_undef);

--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -109,8 +109,12 @@ class Pool2dOpConverter : public OpConverter {
    }
    if (pool_type == "max") {
-      nvinfer1::DimsHW pre_pad(paddings[0], paddings[1]);
+      // Under ceil mode, the pre_pad and post_pad are used to
-      nvinfer1::DimsHW post_pad(paddings[0], paddings[1]);
+      // record the the padding size. In some ceil mode cases,
+      // we do not need padding, so we initialize the two vars to 0.
+      nvinfer1::DimsHW pre_pad(0, 0);
+      nvinfer1::DimsHW post_pad(0, 0);
      if (ceil_mode) {
        // If ceil mode is true, we will pad the appropriate size to the input.
        DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad,

--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -378,6 +378,18 @@ class TestDistBase(unittest.TestCase):
            stderr=tr1_pipe,
            env=env1)
+        # Wait until trainer process terminate
+        while True:
+            stat0 = tr0_proc.poll()
+            time.sleep(0.1)
+            if stat0 is not None:
+                break
+        while True:
+            stat1 = tr1_proc.poll()
+            time.sleep(0.1)
+            if stat1 is not None:
+                break
        tr0_out, tr0_err = tr0_proc.communicate()
        tr1_out, tr1_err = tr1_proc.communicate()
@@ -390,11 +402,21 @@ class TestDistBase(unittest.TestCase):
        ps0.terminate()
        ps1.terminate()
+        # print server log
+        with open("/tmp/ps0_err.log", "r") as fn:
+            sys.stderr.write("ps0 stderr: %s\n" % fn.read())
+        with open("/tmp/ps1_err.log", "r") as fn:
+            sys.stderr.write("ps1 stderr: %s\n" % fn.read())
        # print log
+        if stat0 == 0:
            sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
-        sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
+        with open("/tmp/tr0_err.log", "r") as fn:
+            sys.stderr.write('trainer 0 stderr: %s\n' % fn.read())
+        if stat1 == 0:
            sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
-        sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
+        with open("/tmp/tr1_err.log", "r") as fn:
+            sys.stderr.write('trainer 1 stderr: %s\n' % fn.read())
        return pickle.loads(tr0_out), pickle.loads(tr1_out)
@@ -474,6 +496,7 @@ class TestDistBase(unittest.TestCase):
            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
+            "FLAGS_rpc_deadline": "5000",  # 5sec to fail fast
            "FLAGS_cudnn_deterministic": "1",
            "http_proxy": "",
            "NCCL_P2P_DISABLE": "1"