Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_bn_eq

9580c450 · peterzhang2029 · 27d7b2cb · c808fbbf · 9580c450 · 9580c450
19 changed files
--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
@@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
 # Set the architecture for iOS
 if(NOT DEFINED IOS_ARCH)
  if(IOS_PLATFORM STREQUAL "OS")
-    # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
-    set(IOS_ARCH "arm64")
+    set(IOS_ARCH "armv7;armv7s;arm64")
  elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
-    # FIXME(liuyiqun): support "i386;x86_64" future
-    set(IOS_ARCH "x86_64")
+    set(IOS_ARCH "i386;x86_64")
  endif()
 endif()
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string  "Build architecture for iOS")
@@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_

 # Hidden visibilty is required for cxx on iOS 
 set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
-set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
+set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")

 set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")


--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND})
                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
            ENDIF()
        ELSEIF(IOS)
-            # FIXME(liuyiqun): support multiple architectures
+            IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
                SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
                SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
-            IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
-                SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
-                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
-            ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
                SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
+            ELSE()
+                MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
+                       "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
            ENDIF()
        ELSEIF(RPI)
            # use hardfp

--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+
 INCLUDE(ExternalProject)

 SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)

--- a/doc/mobile/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
-# 构建Android平台上的PaddlePaddle库
+# Android平台编译指南

 用户可通过如下两种方式，交叉编译Android平台上适用的PaddlePaddle库：
 - 基于Docker容器的编译方式

--- a/doc/mobile/cross_compiling_for_ios_cn.md
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
-# 构建iOS平台上的PaddlePaddle库
+# iOS平台编译指南
 交叉编译iOS平台上适用的PaddlePaddle库，需要在MacOS系统上进行。本文的将介绍在MacOS上，从源码交叉编译iOS平台上适用的PaddlePaddle库。

 ## 准备交叉编译环境
@@ -25,7 +25,7 @@ iOS平台可选配置参数：
 - `IOS_PLATFORM`，可设置为`OS/SIMULATOR`，默认值为`OS`。
  - `OS`，构建目标为`arm`架构的iPhone或者iPad等物理设备。
  - `SIMULATOR`，构建目标为`x86`架构的模拟器平台。
- `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示：
+- `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示，默认编译所有架构：

    <table class="docutils">
    <colgroup>
@@ -41,11 +41,11 @@ iOS平台可选配置参数：
    <tbody valign="top">
      <tr class="row-even">
      <td>OS</td>
-      <td>armv7, armv7s, arm64 (默认)</td>
+      <td>armv7, armv7s, arm64 </td>
    </tr>
    <tr class="row-odd">
      <td>SIMULATOR</td>
-      <td>i386, x86_64 (默认)</td>
+      <td>i386, x86_64 </td>
    </tr>
    </tbody>
    </table>
@@ -66,7 +66,7 @@ iOS平台可选配置参数：
 ```bash
 cmake -DCMAKE_SYSTEM_NAME=iOS \
      -DIOS_PLATFORM=OS \
-      -DIOS_ARCH="arm64" \
+      -DIOS_ARCH="armv7;arm64" \
      -DIOS_ENABLE_BITCODE=ON \
      -DIOS_USE_VECLIB_FOR_BLAS=ON \
      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
@@ -112,6 +112,6 @@ $ make install
 - `lib`目录，其中包含PaddlePaddle的C-API静态库
 - `third_party`目录，其中包含所依赖的所有第三方库

-注意，不同架构的PaddlePaddle库建议安装到不同的目录下，然后使用`lipo`工具将多个静态库合并成一个支持多个架构的fat库。
+注意，如果PaddlePaddle库需要同时支持真机和模拟器，则需要分别编译真机和模拟器版本，然后使用`lipo`工具合并fat库。

 自此，PaddlePaddle库已经安装完成，用户可将合成的fat库用于深度学习相关的iOS App中，调用方法见C-API文档。
--- a/doc/mobile/cross_compiling_for_raspberry_cn.md
+++ b/doc/mobile/cross_compiling_for_raspberry_cn.md
-# 构建Raspberry Pi平台上的PaddlePaddle库
+# Raspberry Pi平台编译指南

 通常有两个方法来构建基于 Rasspberry Pi 的版本：


--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -25,7 +25,9 @@ limitations under the License. */
 #include "hl_matrix.h"
 #include "hl_sequence.h"
 #include "hl_sparse.h"
+#ifndef PADDLE_MOBILE_INFERENCE
 #include "hl_warpctc_wrap.h"
+#endif

 #ifdef HPPL_STUB_FUNC
 #include "stub/hl_aggregate_stub.h"

--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -41,7 +41,7 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
    useGlobalStats_ = config_.use_global_stats();
  }
  movingAvgFraction_ = config_.moving_average_fraction();
-  EPS = config_.epsilon();
+  epsilon_ = config_.epsilon();

  weight_.reset(new Weight(1, channels_, parameters_[0]));
  movingMean_.reset(new Weight(1, channels_, parameters_[1]));

--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -94,8 +94,8 @@ protected:
  bool useGlobalStats_;
  // use to compute moving mean and variance.
  real movingAvgFraction_;
-  // Epsilon value used in the batch normalization formula.
-  real EPS;
+  // Epsilon is a small constant added to the variance for numerical stability.
+  real epsilon_;
 };

 }  // namespace paddle
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -51,7 +51,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {

  calMovingMeanAndVar();

-  savedInvVar_->subScalar(-EPS);
+  savedInvVar_->subScalar(-epsilon_);
  savedInvVar_->sqrt2(*savedInvVar_);
 }

@@ -72,7 +72,7 @@ void BatchNormalizationLayer::setMeanAndStd() {
  savedInvVar_->copyFrom(*(movingVar_->getW()));
  savedInvVar_->downClip(real(0.0));

-  savedInvVar_->subScalar(-EPS);
+  savedInvVar_->subScalar(-epsilon_);
  savedInvVar_->sqrt2(*savedInvVar_);
 }


--- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
@@ -60,7 +60,15 @@ void CudnnBatchNormLayer::forward(PassType passType) {
  real* beta = biases_->getW()->getData();
  real* movingMean = movingMean_->getW()->getData();
  real* movingVar = movingVar_->getW()->getData();
-  EPS_ = std::max(MIN_EPS, static_cast<double>(EPS));
+
+  /**
+  * If epsilon_ equals 1e-5 and eps_ is assigned the value of
+  * static_cast<double>(epsilon_), a CUDNN_STATUS_BAD_PARAM error
+  * will occur because the eps_ value is less than
+  * CUDNN_BN_MIN_EPSILON.
+  * The following code ensures that eps_ meets the requirement.
+  */
+  eps_ = std::max(MIN_EPS, static_cast<double>(epsilon_));

  if (!useGlobalStats_) {
    REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str());
@@ -76,7 +84,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                   1.0 - movingAvgFraction_,
                                   movingMean,
                                   movingVar,
-                                   EPS_,
+                                   eps_,
                                   savedMean,
                                   savedInvVar);
  } else {
@@ -91,7 +99,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                      beta,
                                      movingMean,
                                      movingVar,
-                                      EPS_);
+                                      eps_);
    } else {
      // There is a limitation in cudnn library.
      // When the batch size is larger than 1024 in cuDNN v5.1,
@@ -102,7 +110,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                   beta,
                                   movingMean,
                                   movingVar,
-                                   EPS_,
+                                   eps_,
                                   batchSize,
                                   channels_,
                                   imageH_ * imageD_,
@@ -128,7 +136,15 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
  real* gamma = weight_->getW()->getData();
  real* savedMean = savedMean_->getData();
  real* savedInvVar = savedInvVar_->getData();
-  EPS_ = std::max(MIN_EPS, static_cast<double>(EPS));
+
+  /**
+  * If epsilon_ equals 1e-5 and eps_ is assigned the value of
+  * static_cast<double>(epsilon_), a CUDNN_STATUS_BAD_PARAM error
+  * will occur because the eps_ value is less than
+  * CUDNN_BN_MIN_EPSILON.
+  * The following code ensures that eps_ meets the requirement.
+  */
+  eps_ = std::max(MIN_EPS, static_cast<double>(epsilon_));

  auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
    Matrix::resizeOrCreate(m, h, w, false, true);
@@ -159,7 +175,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
                         gamma,
                         gammaGrad,
                         betaGrad,
-                         EPS_,
+                         eps_,
                         savedMean,
                         savedInvVar);


--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -46,15 +46,12 @@ public:
  void backward(const UpdateCallback& callback = nullptr) override;

 protected:
-  /**
-   * Minimum allowed value is CUDNN_BN_MIN_EPSILON defined in cudnn.h.
-   * Same epsilon value should be used in forward and backward functions.
-   */
+  /// Minimum allowed value is CUDNN_BN_MIN_EPSILON defined in cudnn.h.
  static const double MIN_EPS;

  /// Epsilon value used in the batch normalization formula.
-  /// If EPS_ is smaller than MIN_EPS, MIN_EPS will be used.
-  double EPS_;
+  /// Same epsilon value should be used in forward and backward functions.
+  double eps_;

  /// Input/output tensor descriptor desc
  hl_tensor_descriptor ioDesc_;

--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@@ -48,7 +48,7 @@ bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
    useGlobalStats_ = config_.use_global_stats();
  }
  movingAvgFraction_ = config_.moving_average_fraction();
-  EPS = config_.epsilon();
+  epsilon_ = config_.epsilon();

  VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use")
                    << " --- global stats";
@@ -213,7 +213,7 @@ void MKLDNNBatchNormLayer::resetFwdPD(
  if (wgt) {
    flags_ = (flags_ | batch_normalization_flag::use_scale_shift);
  }
-  auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), EPS, flags_);
+  auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), epsilon_, flags_);
  pd.reset(new bn_fwd::primitive_desc(fwdDesc, engine_));
  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
  if (wgt) {
@@ -280,7 +280,7 @@ void MKLDNNBatchNormLayer::resetBwdPD(
  }
  CHECK_PRIMITIVE_DESC_EQ(out, in->getPrimitiveDesc());
  auto md = in->getMemoryDesc();
-  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, EPS, flags_);
+  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, epsilon_, flags_);
  pd.reset(new bn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
  CHECK(pd->weights_primitive_desc() == fwdPD_->weights_primitive_desc());
  CHECK_PRIMITIVE_DESC_EQ(wgt, pd->diff_weights_primitive_desc());

--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.h
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
@@ -32,7 +32,7 @@ protected:
  std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;

  // Epsilon value used in the batch normalization formula.
-  real EPS;
+  real epsilon_;

  // weight and bias in paddle
  std::unique_ptr<Weight> weight_;

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2483,7 +2483,7 @@ class BatchNormLayer(LayerBase):
            self.config.use_global_stats = use_global_stats
        if moving_average_fraction is not None:
            self.config.moving_average_fraction = moving_average_fraction
-        if epsilon is not None:
+
        self.config.epsilon = epsilon

        input_layer = self.get_input_layer(0)

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -3127,7 +3127,7 @@ def batch_norm_layer(input,
           (batch_norm_type == "mkldnn_batch_norm") or \
           (batch_norm_type == "cudnn_batch_norm")

-    assert epsilon >= 1e-5, "Parameter epsilon must be no less than 1e-5."
+    assert epsilon >= 1e-5, "epsilon must be no less than 1e-5."

    l = Layer(
        name=name,

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
@@ -65,6 +65,7 @@ layers {
  height: 227
  width: 227
  depth: 1
+  epsilon: 1e-05
 }
 layers {
  name: "__crmnorm_0__"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
@@ -65,6 +65,7 @@ layers {
  height: 256
  width: 256
  depth: 1
+  epsilon: 1e-05
 }
 layers {
  name: "__crmnorm_0__"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
@@ -36,6 +36,7 @@ layers {
  height: 6
  width: 20
  depth: 3
+  epsilon: 1e-05
 }
 parameters {
  name: "___batch_norm_0__.w0"