Merge branch 'release/1.0.0' of https://github.com/PaddlePaddle/Paddle into...

Merge branch 'release/1.0.0' of https://github.com/PaddlePaddle/Paddle into 10_fix_mac_14_py3_install test=release/1.0.0

Merge branch 'release/1.0.0' of https://github.com/PaddlePaddle/Paddle into...
Merge branch 'release/1.0.0' of https://github.com/PaddlePaddle/Paddle into 10_fix_mac_14_py3_install test=release/1.0.0
e9fc61d5 · minqiyang · 2ff9f106 · b2e6e5f2 · e9fc61d5 · e9fc61d5
21 changed file
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,6 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/

 RUN apt-get update && \
    apt-get install -y --allow-downgrades patchelf \
+    python3 python3-dev python3-pip \
    git python-pip python-dev python-opencv openssh-server bison \
    libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
@@ -70,24 +71,33 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN easy_install -U pip && \
+RUN pip3 install -U wheel && \
+    pip3 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    easy_install -U pip && \
    pip install -U wheel && \
    pip install -U docopt PyYAML sphinx==1.5.6 && \
    pip install sphinx-rtd-theme==0.1.9 recommonmark

-RUN pip install pre-commit 'ipython==5.3.0' && \
+RUN pip3 install pre-commit 'ipython==5.3.0' && \
+    pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3 install opencv-python && \
+    pip install pre-commit 'ipython==5.3.0' && \
    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
    pip install opencv-python

 #For docstring checker
+RUN pip3 install pylint pytest astroid isort
 RUN pip install pylint pytest astroid isort LinkChecker

 COPY ./python/requirements.txt /root/
+RUN pip3 install -r /root/requirements.txt
 RUN pip install -r /root/requirements.txt

 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
 RUN apt-get install -y libssl-dev libffi-dev
+RUN pip3 install certifi urllib3[secure]
 RUN pip install certifi urllib3[secure]



--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -40,7 +40,7 @@ set(OPENBLAS_LIB_SEARCH_PATHS
        /usr/local/opt/openblas/lib)

 find_path(OPENBLAS_INC_DIR NAMES cblas.h
-  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH)
 find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
 find_library(OPENBLAS_LIB NAMES openblas

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -27,7 +27,7 @@ IF(NOT ${CBLAS_FOUND})

    SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
    SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
-    SET(CBLAS_INCLUDE_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
+    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)

    SET(CBLAS_LIBRARIES
        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
@@ -96,7 +96,7 @@ IF(NOT ${CBLAS_FOUND})
    ENDIF(NOT WIN32)
    SET(CBLAS_PROVIDER openblas)
    IF(WITH_C_API)
-        INSTALL(DIRECTORY ${CBLAS_INCLUDE_DIR} DESTINATION third_party/openblas)
+        INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
        # Because libopenblas.a is a symbolic link of another library, thus need to
        # install the whole directory.
        IF(ANDROID)
@@ -117,8 +117,8 @@ IF(NOT ${CBLAS_FOUND})
 ENDIF(NOT ${CBLAS_FOUND})

 MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
-MESSAGE(STATUS "BLAS Include: ${CBLAS_INCLUDE_DIR}")
-INCLUDE_DIRECTORIES(${CBLAS_INCLUDE_DIR})
+MESSAGE(STATUS "BLAS Include: ${CBLAS_INC_DIR}")
+INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})

 # FIXME(gangliao): generate cblas target to track all high performance
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -138,13 +138,17 @@ if (APPLE)
        # On Mac OS X build fat binaries with x86_64 architectures by default.
        set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
    endif()
-else()
+    # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0
+    set (COMMON_FLAGS -Wno-deprecated-register)
+endif(APPLE)
+
+if(LINUX)
    set(GPU_COMMON_FLAGS
        -Wall
        -Wextra
        -Werror
        ${GPU_COMMON_FLAGS})
-endif()
+endif(LINUX)

 if(UNIX AND NOT APPLE)
  # except apple from nix*Os family

--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -80,16 +80,16 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
      // This is weird but there is really some variables without var_desc
      // in computation_op
      if (var_desc == nullptr) {
-        if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr)
-          continue;
-      } else {
+        var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name);
+        if (var_desc == nullptr) continue;
+      }
+
      if (var_desc->Persistable()) continue;
      auto var_type = var_desc->Proto()->type().type();
      if (var_type != proto::VarType::LOD_TENSOR &&
          var_type != proto::VarType::SELECTED_ROWS) {
        continue;
      }
-      }

      // compute op only runs in one device
      if (ref_cnts[place.device]->count(var_name))

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -319,6 +319,13 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
 #ifdef PADDLE_WITH_CUDA
  if (!gcs_.empty()) {
    ResetReferenceCount();
+    for (auto &pair : cur_ref_cnts_) {
+      auto &name_map = *(pair.second);
+      for (auto &fetch_name : fetch_tensors) {
+        name_map.erase(fetch_name);
+      }
+      name_map.erase(fetched_var_name);
+    }
  }
 #endif
  auto fetch_data = member_->executor_->Run(fetch_tensors);

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -301,7 +301,6 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute)
 if (WITH_GPU)
    op_library(conv_op DEPS vol2col depthwise_conv im2col)
    op_library(layer_norm_op DEPS cub)
-    op_library(reduce_mean_op DEPS cub)
 else()
    op_library(conv_op DEPS vol2col im2col)
 endif()

--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -380,8 +380,7 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
    math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;

    auto& dev_ctx = context.template device_context<DeviceContext>();
-    depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
-                  output);
+    depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
  }
 };

@@ -416,14 +415,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
      input_grad->mutable_data<T>(context.GetPlace());
      set_zero(dev_ctx, input_grad, static_cast<T>(0));
      depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
-                             paddings, dilations, input_grad);
+                             paddings, input_grad);
    }

    if (filter_grad) {
      filter_grad->mutable_data<T>(context.GetPlace());
      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
      depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
-                              dilations, filter_grad);
+                              filter_grad);
    }
  }
 };

--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
    math::DepthwiseConvInputGradFunctor<DeviceContext, T>
        depthwiseConvInputGrad;
    depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
-                           dilations, output);
+                           output);
  }
 };

@@ -367,11 +367,10 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
    auto& dev_ctx = context.template device_context<DeviceContext>();
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");

    if (input_grad) {
      math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
-      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations,
+      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings,
                    input_grad);
    }

@@ -383,7 +382,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
      math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
          depthwiseConvFilterGrad;
      depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
-                              dilations, filter_grad);
+                              filter_grad);
    }
  }
 };

--- a/paddle/fluid/operators/cub_reduce.h
+++ b/paddle/fluid/operators/cub_reduce.h
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <cmath>
-#include <numeric>
-#include <set>
-#include <vector>
-
-#include <cub/cub.cuh>  // NOLINT
-#include "paddle/fluid/framework/tensor.h"
-
-namespace paddle {
-namespace operators {
-
-namespace detail {
-template <typename T, size_t ElementCount>
-struct Array {
- public:
-  HOSTDEVICE inline Array() {}
-
-  HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; }
-
-  HOSTDEVICE inline const T& operator[](size_t index) const {
-    return data_[index];
-  }
-
-  HOSTDEVICE constexpr inline size_t size() const { return ElementCount; }
-
-  template <typename VectorLikeType>
-  static inline Array<T, ElementCount> From(const VectorLikeType& vec) {
-    PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match");
-    size_t n = static_cast<size_t>(vec.size());
-    Array<T, ElementCount> ret;
-    for (size_t i = 0; i < n; ++i) ret[i] = vec[i];
-    return ret;
-  }
-
- private:
-  T data_[ElementCount];
-};
-
-// reduce the last axis of 2d array
-template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
-          int BlockDim>
-__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
-                               TransformOp transformer, Ty init,
-                               int reduce_num) {
-  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-  int idx_x = blockIdx.x * reduce_num;
-  int idx_y = threadIdx.x;
-  Ty reduce_var = init;
-  for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim)
-    reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y]));
-
-  reduce_var =
-      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
-
-  if (threadIdx.x == 0) {
-    y[blockIdx.x] = reduce_var;
-  }
-}
-
-template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
-          int BlockDim, int Rank, int ReduceRank>
-__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
-                             TransformOp transformer, Ty init, int reduce_num,
-                             Array<int, Rank> x_strides,
-                             Array<int, ReduceRank> reduce_dim,
-                             Array<int, ReduceRank> reduce_strides,
-                             Array<int, Rank - ReduceRank> left_dim,
-                             Array<int, Rank - ReduceRank> left_strides) {
-  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-  Array<int, Rank> sub_index;
-  int left_idx = blockIdx.x;
-  for (int i = 0; i < Rank - ReduceRank; ++i) {
-    sub_index[left_dim[i]] = left_idx / left_strides[i];
-    left_idx %= left_strides[i];
-  }
-
-  int reduce_idx = threadIdx.x;
-  for (int j = 0; j < ReduceRank; ++j) {
-    sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
-    reduce_idx %= reduce_strides[j];
-  }
-
-  int idx_x = 0;
-  for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
-  Ty reduce_var = static_cast<Ty>(transformer(x[idx_x]));
-
-  for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) {
-    int reduce_idx = i;
-    for (int j = 0; j < ReduceRank; ++j) {
-      sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
-      reduce_idx %= reduce_strides[j];
-    }
-
-    int idx_x = 0;
-    for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
-    reduce_var = static_cast<Ty>(reducer(reduce_var, transformer(x[idx_x])));
-  }
-
-  reduce_var =
-      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
-
-  if (threadIdx.x == 0) {
-    y[blockIdx.x] = reduce_var;
-  }
-}
-
-static inline std::vector<int> GetStrides(const std::vector<int>& dims) {
-  int n = static_cast<int>(dims.size());
-  if (n == 0) return std::vector<int>();
-  std::vector<int> strides(n);
-  strides.back() = 1;
-  for (int i = n - 2; i >= 0; --i) {
-    strides[i] = strides[i + 1] * dims[i + 1];
-  }
-  return strides;
-}
-
-static inline std::vector<int> GetStrides(const std::vector<int>& dims,
-                                          const std::vector<int>& idx) {
-  int n = static_cast<int>(idx.size());
-  if (n == 0) return std::vector<int>();
-  std::vector<int> strides(n);
-  strides.back() = 1;
-  for (int i = n - 2; i >= 0; --i) {
-    strides[i] = strides[i + 1] * dims[idx[i + 1]];
-  }
-  return strides;
-}
-
-constexpr int kMaxBlockDim = 512;
-
-static inline int GetDesiredBlockDim(int block_dim) {
-  return block_dim >= kMaxBlockDim
-             ? kMaxBlockDim
-             : (1 << static_cast<int>(std::log2(block_dim)));
-}
-
-template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
-          typename TransformOp>
-static void TensorReduceImpl(
-    const Tx* x_data, Ty* y_data, const platform::Place& place,
-    const ReduceOp& reducer, const TransformOp& transformer, const Ty& init,
-    int left_num, int reduce_num, const std::vector<int>& x_strides,
-    const std::vector<int>& reduce_dim, const std::vector<int>& reduce_strides,
-    const std::vector<int>& left_dim, const std::vector<int>& left_strides,
-    cudaStream_t stream) {
-#define CUB_RANK_CASE(i, ...)             \
-  case i: {                               \
-    constexpr auto kRank = i;             \
-    switch (reduce_rank) { __VA_ARGS__; } \
-  } break
-
-#define CUB_REDUCE_RANK_CASE(i, ...)                              \
-  case i: {                                                       \
-    constexpr auto kReduceRank = i;                               \
-    ReduceKernel<Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank,  \
-                 kReduceRank><<<left_num, BlockDim, 0, stream>>>( \
-        x_data, y_data, reducer, transformer, init, reduce_num,   \
-        Array<int, kRank>::From(x_strides),                       \
-        Array<int, kReduceRank>::From(reduce_dim),                \
-        Array<int, kReduceRank>::From(reduce_strides),            \
-        Array<int, kRank - kReduceRank>::From(left_dim),          \
-        Array<int, kRank - kReduceRank>::From(left_strides));     \
-  } break
-
-  int rank = x_strides.size();
-  int reduce_rank = reduce_strides.size();
-  if (rank == reduce_rank) {
-    cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
-        x_data, transformer);
-    size_t temp_storage_bytes = 0;
-    cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
-                              reduce_num, reducer, init, stream);
-    framework::Tensor tmp;
-    auto* temp_storage = tmp.mutable_data<uint8_t>(
-        framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
-        place);
-    cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
-                              reduce_num, reducer, init, stream);
-    return;
-  }
-  if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
-    ReduceKernel2D<Tx, Ty, ReduceOp, TransformOp,
-                   BlockDim><<<left_num, BlockDim, 0, stream>>>(
-        x_data, y_data, reducer, transformer, init, reduce_num);
-    return;
-  }
-  /*
-  if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) {
-    // TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced.
-    // Currently, it is handled by code below, but inefficient
-    return;
-  }
-  */
-
-  switch (rank) {
-    CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););
-
-    CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2););
-
-    CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
-                  CUB_REDUCE_RANK_CASE(3););
-
-    CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
-                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4););
-
-    CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
-                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
-                  CUB_REDUCE_RANK_CASE(5););
-
-    CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
-                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
-                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
-
-    CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
-                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
-                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
-
-    CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
-                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
-                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);
-                  CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8););
-  }
-
-#undef CUB_REDUCE_RANK_CASE
-#undef CUB_RANK_CASE
-}
-
-}  // namespace detail
-
-template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
-void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
-                  std::vector<int> origin_reduce_dims, const Ty& init,
-                  const ReduceOp& reducer, const TransformOp& transformer,
-                  cudaStream_t stream) {
-  auto x_dim = framework::vectorize2int(x.dims());
-  std::vector<int> new_x_dim, new_reduce_dims;
-  int is_reduced = 0;
-  for (auto e : origin_reduce_dims) {
-    auto pos = e >= 0 ? e : e + x_dim.size();
-    is_reduced |= 1 << e;
-  }
-  for (int i = 0; i < x_dim.size(); i++) {
-    if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
-      new_x_dim.push_back(x_dim[i]);
-      if ((is_reduced >> i) & 1)
-        new_reduce_dims.push_back(new_x_dim.size() - 1);
-    } else {
-      new_x_dim[new_x_dim.size() - 1] *= x_dim[i];
-    }
-  }
-  x_dim = new_x_dim;
-  origin_reduce_dims = new_reduce_dims;
-  int x_rank = static_cast<int>(x_dim.size());
-  std::set<int> left_set, reduce_set;
-  for (int i = 0; i < x_rank; ++i) left_set.insert(i);
-
-  for (auto e : origin_reduce_dims) {
-    left_set.erase(e);
-    reduce_set.insert(e);
-  }
-
-  std::vector<int> reduce_dim(reduce_set.begin(), reduce_set.end());
-  std::vector<int> left_dim(left_set.begin(), left_set.end());
-
-  std::vector<int> x_strides = detail::GetStrides(x_dim);
-  std::vector<int> reduce_strides = detail::GetStrides(x_dim, reduce_dim);
-  std::vector<int> left_strides = detail::GetStrides(x_dim, left_dim);
-  int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
-  int left_num = 1;
-  if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]];
-
-  std::vector<int> y_dim(left_dim.size());
-  for (int i = 0; i < left_dim.size(); ++i) {
-    y_dim[i] = x_dim[left_dim[i]];
-  }
-  auto x_data = x.data<Tx>();
-  auto y_data = y->mutable_data<Ty>(x.place());
-  if (reduce_num == 1) return;
-
-#define CUB_BLOCK_DIM_CASE(block_dim)                                    \
-  case block_dim: {                                                      \
-    constexpr auto kBlockDim = block_dim;                                \
-    detail::TensorReduceImpl<Tx, Ty, block_dim, ReduceOp, TransformOp>(  \
-        x_data, y_data, x.place(), reducer, transformer, init, left_num, \
-        reduce_num, x_strides, reduce_dim, reduce_strides, left_dim,     \
-        left_strides, stream);                                           \
-  } break
-
-  switch (detail::GetDesiredBlockDim(reduce_num)) {
-    CUB_BLOCK_DIM_CASE(512);
-    CUB_BLOCK_DIM_CASE(256);
-    CUB_BLOCK_DIM_CASE(128);
-    CUB_BLOCK_DIM_CASE(64);
-    CUB_BLOCK_DIM_CASE(32);
-    CUB_BLOCK_DIM_CASE(16);
-    CUB_BLOCK_DIM_CASE(8);
-    CUB_BLOCK_DIM_CASE(4);
-    CUB_BLOCK_DIM_CASE(2);
-  }
-#undef CUB_BLOCK_DIM_CASE
-}
-
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -89,7 +89,7 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.")
        .SetDefault(false);
    AddComment(string::Sprintf(R"DOC(
-Limited Elementwise %s Operator
+Elementwise %s Operator

 The equation is:


--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -290,12 +290,13 @@ class FusionGRUKernel : public framework::OpKernel<T> {
  void BatchCompute(const framework::ExecutionContext& ctx) const {
    using DeviceContext = paddle::platform::CPUDeviceContext;
    auto* x = ctx.Input<LoDTensor>("X");
+    INIT_BASE_INPUT_OUTPUT
+    INIT_BASE_SIZES
    if (x->lod()[0].size() == 2) {
+      xx->Resize({total_T, D3});
      SeqCompute(ctx);
      return;
    }
-    INIT_BASE_INPUT_OUTPUT
-    INIT_BASE_SIZES
    INIT_VEC_FUNC

    auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");

--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -424,11 +424,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
  void BatchCompute(const framework::ExecutionContext& ctx) const {
    using DeviceContext = platform::CPUDeviceContext;
    INIT_BASE_INPUT_OUTPUT
+    INIT_BASE_SIZES
    if (x->lod()[0].size() == 2) {
+      xx->Resize({x_dims[0], D4});
      SeqCompute(ctx);
      return;
    }
-    INIT_BASE_SIZES
    INIT_VEC_FUNC
    INIT_BASE_INPUT_DATAS


--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ b/paddle/fluid/operators/math/depthwise_conv.cu
--- a/paddle/fluid/operators/math/depthwise_conv.h
+++ b/paddle/fluid/operators/math/depthwise_conv.h
@@ -32,8 +32,7 @@ class DepthwiseConvFunctor {
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations, framework::Tensor* output);
+                  const std::vector<int>& paddings, framework::Tensor* output);
 };

 template <typename DeviceContext, typename T>
@@ -44,7 +43,6 @@ class DepthwiseConvInputGradFunctor {
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
                  framework::Tensor* input_grad);
 };

@@ -55,7 +53,6 @@ class DepthwiseConvFilterGradFunctor {
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
-                  const std::vector<int>& dilations,
                  framework::Tensor* filter_grad);
 };


--- a/paddle/fluid/operators/reduce_mean_op.cu
+++ b/paddle/fluid/operators/reduce_mean_op.cu
@@ -12,64 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <vector>
-#include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_mean_op.h"

-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct DivideFunctor {
-  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
-
-  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
-
- private:
-  T n_inv;
-};
-
-template <typename T>
-class ReduceMeanKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-
-    auto dims = context.Attr<std::vector<int>>("dim");
-    bool keep_dim = context.Attr<bool>("keep_dim");
-
-    std::vector<int> reduce_dims;
-    if (reduce_all) {
-      reduce_dims.resize(input->dims().size());
-      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
-    } else {
-      for (auto e : dims) {
-        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
-      }
-    }
-
-    int reduce_num = 1;
-    for (int i = 0; i < reduce_dims.size(); ++i) {
-      reduce_num *= input->dims()[reduce_dims[i]];
-    }
-
-    auto stream = context.cuda_device_context().stream();
-    TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
-        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
-        DivideFunctor<T>(reduce_num), stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
-                        ops::ReduceMeanKernel<double>,
-                        ops::ReduceMeanKernel<int>,
-                        ops::ReduceMeanKernel<int64_t>);
-
+REGISTER_OP_CUDA_KERNEL(reduce_mean,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MeanFunctor>);
 REGISTER_OP_CUDA_KERNEL(
    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                            float, ops::MeanGradFunctor>,

--- a/paddle/fluid/operators/reduce_sum_op.cu
+++ b/paddle/fluid/operators/reduce_sum_op.cu
@@ -12,59 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_sum_op.h"

-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct IdentityFunctor {
-  HOSTDEVICE explicit inline IdentityFunctor() {}
-
-  HOSTDEVICE inline T operator()(const T& x) const { return x; }
-};
-
-template <typename T>
-class ReduceSumKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-
-    auto dims = context.Attr<std::vector<int>>("dim");
-    bool keep_dim = context.Attr<bool>("keep_dim");
-
-    std::vector<int> reduce_dims;
-    if (reduce_all) {
-      reduce_dims.resize(input->dims().size());
-      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
-    } else {
-      for (auto e : dims) {
-        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
-      }
-    }
-
-    int reduce_num = 1;
-    for (int i = 0; i < reduce_dims.size(); ++i) {
-      reduce_num *= input->dims()[reduce_dims[i]];
-    }
-
-    auto stream = context.cuda_device_context().stream();
-    TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
-        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
-        IdentityFunctor<T>(), stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
-                        ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
-                        ops::ReduceSumKernel<int64_t>);
-
+REGISTER_OP_CUDA_KERNEL(reduce_sum,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::SumFunctor>);
 REGISTER_OP_CUDA_KERNEL(
    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::SumGradFunctor>,

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -395,10 +395,11 @@ EOF
        ctest --output-on-failure -j $1     
        # make install should also be test when unittest 
        make install -j 8
-        pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+        pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
        if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
            paddle version
        fi
+        pip uninstall -y paddlepaddle
    fi
 }

@@ -597,9 +598,9 @@ EOF
 EOF

    if [[ ${WITH_GPU} == "ON"  ]]; then
-        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} &&"
+        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} || true"
    else
-        NCCL_DEPS=""
+        NCCL_DEPS="true"
    fi

    if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
@@ -613,9 +614,8 @@ EOF
    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
    ADD python/dist/*.whl /
    # run paddle version to install python packages first
-    RUN apt-get update &&\
-        ${NCCL_DEPS}\
-        apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
+    RUN apt-get update && ${NCCL_DEPS}
+    RUN apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
        pip install /*.whl; apt-get install -f -y && \
        apt-get clean -y && \
        rm -f /*.whl && \

--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -842,6 +842,13 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
 def shuffle(reader, buffer_size):
    """
    Shuffle the reader.
+
+    Args:
+        reader(Variable): The reader to be decorated with 'shuffling'.
+        buffer_size(int): The pre-read number of data in :code:`reader`.
+
+    Returns:
+        Variable: The reader which has been decorated with 'shuffling'.
    """
    return __create_unshared_decorated_reader__(
        'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -6367,7 +6367,7 @@ def stack(x, axis=0):
    if not isinstance(x, list) and not isinstance(x, tuple):
        x = [x]

-    out = helper.create_tmp_variable(x[0].dtype)
+    out = helper.create_tmp_variable(dtype=x[0].dtype)
    helper.append_op(
        type='stack', inputs={'X': x}, outputs={'Y': out},
        attrs={'axis': axis})
@@ -6404,8 +6404,8 @@ def unstack(x, axis=0, num=None):
            num = x.shape[axis]

    outs = []
-    for _ in num:
-        outs.append(helper.create_tmp_variable(x.dtype))
+    for _ in xrange(num):
+        outs.append(helper.create_tmp_variable(dtype=x.dtype))

    helper.append_op(
        type='unstack',

--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -67,7 +67,6 @@ class TestConv2dOp(OpTest):
    def setUp(self):
        self.op_type = "conv2d"
        self.use_cudnn = False
-        self.use_cuda = False
        self.use_mkldnn = False
        self.data_format = "AnyLayout"
        self.dtype = np.float32
@@ -102,25 +101,24 @@ class TestConv2dOp(OpTest):
        }
        self.outputs = {'Output': output}

-    def testcuda(self):
-        return core.is_compiled_with_cuda() and (self.use_cudnn or
-                                                 self.use_cuda)
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn

    def test_check_output(self):
-        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
        self.check_output_with_place(place, atol=1e-5)

    def test_check_grad(self):
        if self.dtype == np.float16:
            return
-        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
        self.check_grad_with_place(
            place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02)

    def test_check_grad_no_filter(self):
        if self.dtype == np.float16:
            return
-        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
        self.check_grad_with_place(
            place, ['Input'],
            'Output',
@@ -130,7 +128,7 @@ class TestConv2dOp(OpTest):
    def test_check_grad_no_input(self):
        if self.dtype == np.float16:
            return
-        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
        self.check_grad_with_place(
            place, ['Filter'],
            'Output',
@@ -327,65 +325,22 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):

 class TestDepthwiseConv(TestConv2dOp):
    def init_test_case(self):
-        self.use_cuda = True
        self.pad = [1, 1]
        self.stride = [2, 2]
        self.input_size = [2, 3, 5, 5]  # NCHW
        self.groups = 3
        assert np.mod(self.input_size[1], self.groups) == 0
        f_c = self.input_size[1] // self.groups
-        self.filter_size = [3, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConv2(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [3, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConv3(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
        self.filter_size = [6, f_c, 3, 3]
        self.op_type = "depthwise_conv2d"


-class TestDepthwiseConvWithDilation(TestConv2dOp):
-    def init_test_case(self):
-        self.use_cuda = True
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        self.groups = 3
-        self.dilations = [2, 2]
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv2d"
-
-
-class TestDepthwiseConvWithDilation2(TestConv2dOp):
+class TestDepthwiseConv2(TestConv2dOp):
    def init_test_case(self):
-        self.use_cuda = True
        self.pad = [1, 1]
        self.stride = [1, 1]
        self.input_size = [2, 3, 5, 5]  # NCHW
        self.groups = 3
-        self.dilations = [2, 2]
        assert np.mod(self.input_size[1], self.groups) == 0
        f_c = self.input_size[1] // self.groups
        self.filter_size = [6, f_c, 3, 3]