Commit 773c6557 authored by _青葱

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into ini

......@@ -119,7 +119,7 @@ An actual Fluid example is described [here](https://github.com/PaddlePaddle/Pad
From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid.
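The loop structure mentioned above can be illustrated with a minimal sketch of Fluid's `While` block (this is an illustrative example assuming the `fluid.layers` API of this era; the names `i`, `limit`, and `cond` are placeholders, not taken from the diff):

```python
import paddle.fluid as fluid

# Counter and loop bound live in the program as 1-D tensors.
i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
limit = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)

# The loop condition is itself a tensor that the body must refresh.
cond = fluid.layers.less_than(x=i, y=limit)
while_op = fluid.layers.While(cond=cond)
with while_op.block():
    i = fluid.layers.increment(x=i, value=1, in_place=True)
    fluid.layers.less_than(x=i, y=limit, cond=cond)  # update the condition
```

Because the `with` block only records operators into the program description, the loop body is executed by the C++ `While` operator at run time rather than by the Python interpreter, which is why it can run much faster than a plain Python loop.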
## Turing Completeness
......
......@@ -114,23 +114,11 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
const auto *bias = ctx.Input<Tensor>("Bias");
auto *y = ctx.Output<Tensor>("Y");
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
// alloc memory
y->mutable_data<T>(ctx.GetPlace());
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor;
functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
auto handle = dev_ctx.cudnn_handle();
......@@ -159,6 +147,21 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
// Run training mode.
// obtain running mean and running inv var, and see if we need to
// initialize them.
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor;
functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
double this_factor = 1. - momentum;
CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
......
......@@ -288,9 +288,14 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53");
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "HgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
template <>
......@@ -310,9 +315,13 @@ void batched_gemm<platform::CUDADeviceContext, float>(
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int strideC = M * N;
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "SgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
template <>
......@@ -332,9 +341,13 @@ void batched_gemm<platform::CUDADeviceContext, double>(
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int strideC = M * N;
#if CUDA_VERSION >= 8000
PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "DgemmStridedBatched is not supported on cuda <= 7.5");
#endif
}
template <>
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
......
......@@ -83,9 +83,11 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
mkldnn::memory({src_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(input_data)));
auto dst_memory =
mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data);
mkldnn::memory({dst_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(output_data)));
auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory,
*workspace_memory);
......@@ -195,9 +197,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
pool_bwd_desc, mkldnn_engine, *pool_pd);
auto diff_src_memory =
mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data);
mkldnn::memory({diff_src_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(in_x_grad_data)));
auto diff_dst_memory =
mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data);
mkldnn::memory({diff_dst_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(out_grad_data)));
auto bwd_prim = mkldnn::pooling_backward(
pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory);
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/prelu_op.h"
#include <string>
namespace paddle {
......
......@@ -45,7 +45,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
bool flip = ctx->Attrs().Get<bool>("flip");
std::vector<float> aspect_ratios_vec;
ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec);
size_t num_priors = aspect_ratios_vec.size() * min_sizes.size();
if (max_sizes.size() > 0) {
......
......@@ -96,7 +96,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
auto clip = ctx.Attr<bool>("clip");
std::vector<float> aspect_ratios;
ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
......
......@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/transform.h"
......@@ -22,23 +24,23 @@ namespace operators {
inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
bool flip,
std::vector<float>& output_aspect_ratior) {
std::vector<float>* output_aspect_ratior) {
constexpr float epsilon = 1e-6;
output_aspect_ratior.clear();
output_aspect_ratior.push_back(1.0f);
output_aspect_ratior->clear();
output_aspect_ratior->push_back(1.0f);
for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
float ar = input_aspect_ratior[i];
bool already_exist = false;
for (size_t j = 0; j < output_aspect_ratior.size(); ++j) {
if (fabs(ar - output_aspect_ratior[j]) < epsilon) {
for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
already_exist = true;
break;
}
}
if (!already_exist) {
output_aspect_ratior.push_back(ar);
output_aspect_ratior->push_back(ar);
if (flip) {
output_aspect_ratior.push_back(1.0f / ar);
output_aspect_ratior->push_back(1.0f / ar);
}
}
}
......@@ -68,7 +70,7 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
auto clip = ctx.Attr<bool>("clip");
std::vector<float> aspect_ratios;
ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/rank_loss_op.h"
#include <string>
namespace paddle {
namespace operators {
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
......@@ -19,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h"
namespace paddle {
......
......@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <limits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......
......@@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
const framework::DDim& src_stride,
const framework::DDim& dst_dim,
const framework::DDim& dst_stride, T* dst) {
using namespace detail;
StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
paddle::operators::detail::StridedCopyDimVisitor<T> func(
dev_ctx, src, src_stride, dst_stride, dst);
boost::apply_visitor(func, dst_dim);
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h"
namespace paddle {
......@@ -133,71 +134,71 @@ __device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
int beam_size, const T* src,
bool& firstStep, bool& is_empty,
Pair<T>& max, int dim,
bool* firstStep, bool* is_empty,
Pair<T>* max, int dim,
const int tid) {
if (beam > 0) {
int length = beam < beam_size ? beam : beam_size;
if (firstStep) {
firstStep = false;
if (*beam > 0) {
int length = (*beam) < beam_size ? *beam : beam_size;
if (*firstStep) {
*firstStep = false;
GetTopK<T, BlockSize>(topk, src, tid, dim, length);
} else {
for (int k = 0; k < MaxLength; k++) {
if (k < MaxLength - beam) {
topk[k] = topk[k + beam];
if (k < MaxLength - (*beam)) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-INFINITY, -1);
}
}
if (!is_empty) {
GetTopK<T, BlockSize>(topk + MaxLength - beam, src, tid, dim, max,
if (!(*is_empty)) {
GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
length);
}
}
max = topk[MaxLength - 1];
if (max.v == -1) is_empty = true;
beam = 0;
*max = topk[MaxLength - 1];
if ((*max).v == -1) *is_empty = true;
*beam = 0;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
int beam_size, const T* val,
int* col, bool& firstStep,
bool& is_empty, Pair<T>& max,
int* col, bool* firstStep,
bool* is_empty, Pair<T>* max,
int dim, const int tid) {
if (beam > 0) {
int length = beam < beam_size ? beam : beam_size;
if (firstStep) {
firstStep = false;
if (*beam > 0) {
int length = (*beam) < beam_size ? *beam : beam_size;
if (*firstStep) {
*firstStep = false;
GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
} else {
for (int k = 0; k < MaxLength; k++) {
if (k < MaxLength - beam) {
topk[k] = topk[k + beam];
if (k < MaxLength - *beam) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-INFINITY, -1);
}
}
if (!is_empty) {
GetTopK<T, BlockSize>(topk + MaxLength - beam, val, col, tid, dim, max,
if (!(*is_empty)) {
GetTopK<T, BlockSize>(topk + MaxLength - *beam, val, col, tid, dim, max,
length);
}
}
max = topk[MaxLength - 1];
if (max.v == -1) is_empty = true;
beam = 0;
*max = topk[MaxLength - 1];
if ((*max).v == -1) *is_empty = true;
*beam = 0;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
Pair<T> topk[], T** topVal,
int64_t** topIds, int& beam, int& k,
int64_t** topIds, int* beam, int* k,
const int tid, const int warp) {
while (true) {
__syncthreads();
......@@ -225,17 +226,17 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
(*topVal)++;
(*topIds)++;
}
if (tid == maxid[0]) beam++;
if (--k == 0) break;
if (tid == maxid[0]) (*beam)++;
if (--(*k) == 0) break;
__syncthreads();
if (tid == maxid[0]) {
if (beam < MaxLength) {
sh_topk[tid] = topk[beam];
if (*beam < MaxLength) {
sh_topk[tid] = topk[*beam];
}
}
if (maxid[0] / 32 == warp) {
if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break;
if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break;
}
}
}
......@@ -268,13 +269,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
topk[k].set(-INFINITY, -1);
}
while (k) {
ThreadGetTopK<T, MaxLength, BlockSize>(topk, beam, k,
src + blockIdx.x * lds, firststep,
is_empty, max, dim, tid);
ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
src + blockIdx.x * lds, &firststep,
&is_empty, &max, dim, tid);
sh_topk[tid] = topk[0];
BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
&indices, beam, k, tid, warp);
&indices, &beam, &k, tid, warp);
}
}
......@@ -308,9 +309,9 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
KeMatrixTopK<T, 5, 256><<<
grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
ctx.device_context())
.stream()>>>(output_data, output->dims()[1],
indices_data, input_data,
input_width, input_width, int(k));
.stream()>>>(
output_data, output->dims()[1], indices_data, input_data, input_width,
input_width, static_cast<int>(k));
}
};
......
......@@ -37,7 +37,7 @@ depth = 8
mix_hidden_lr = 1e-3
IS_SPARSE = True
PASS_NUM = 10
PASS_NUM = 100
BATCH_SIZE = 10
embedding_name = 'emb'
......@@ -77,7 +77,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
emb_layers.append(mark_embedding)
hidden_0_layers = [
fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
for emb in emb_layers
]
hidden_0 = fluid.layers.sums(input=hidden_0_layers)
......@@ -94,8 +95,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
for i in range(1, depth):
mix_hidden = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
])
lstm = fluid.layers.dynamic_lstm(
......@@ -109,8 +110,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
input_tmp = [mix_hidden, lstm]
feature_out = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
])
return feature_out
......@@ -171,7 +172,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
# check other optimizers and check why out will be NAN
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.0001,
learning_rate=0.01,
decay_steps=100000,
decay_rate=0.5,
staircase=True))
......@@ -233,7 +234,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
print("second per batch: " + str((time.time(
) - start_time) / batch_id))
# Set the threshold low to speed up the CI test
if float(pass_precision) > 0.05:
if float(pass_precision) > 0.01:
if save_dirname is not None:
# TODO(liuyiqun): Change the target to crf_decode
fluid.io.save_inference_model(save_dirname, [
......