add epsilon in bn

8a49f7f1 · peterzhang2029 · 08bc08d6 · 8a49f7f1 · 8a49f7f1 · 8a49f7f1
11 changed files
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -41,6 +41,7 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
    useGlobalStats_ = config_.use_global_stats();
  }
  movingAvgFraction_ = config_.moving_average_fraction();
+  EPS = config_.epsilon();

  weight_.reset(new Weight(1, channels_, parameters_[0]));
  movingMean_.reset(new Weight(1, channels_, parameters_[1]));

--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -94,6 +94,8 @@ protected:
  bool useGlobalStats_;
  // use to compute moving mean and variance.
  real movingAvgFraction_;
+  // Epsilon value used in the batch normalization formula.
+  real EPS;
 };

 }  // namespace paddle
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -22,8 +22,6 @@ namespace paddle {

 REGISTER_LAYER(batch_norm, BatchNormalizationLayer);

-const real BatchNormalizationLayer::EPS = 1E-5;
-
 bool BatchNormalizationLayer::init(const LayerMap& layerMap,
                                   const ParameterMap& parameterMap) {
  /* Initialize the basic parent class */

--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -39,9 +39,6 @@ public:
  void backward(const UpdateCallback& callback = nullptr) override;

 protected:
-  /// Epsilon value used in the batch normalization formula.
-  static const real EPS;
-
  /// Load pre-calculated mean and std.
  void setMeanAndStd();


--- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
@@ -21,7 +21,7 @@ namespace paddle {

 REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer);

-const double CudnnBatchNormLayer::EPS = 1E-5;
+const double CudnnBatchNormLayer::MIN_EPS = 1E-5;

 bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
                               const ParameterMap& parameterMap) {
@@ -60,6 +60,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
  real* beta = biases_->getW()->getData();
  real* movingMean = movingMean_->getW()->getData();
  real* movingVar = movingVar_->getW()->getData();
+  EPS_ = std::max(MIN_EPS, static_cast<double>(EPS));

  if (!useGlobalStats_) {
    REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str());
@@ -75,7 +76,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                   1.0 - movingAvgFraction_,
                                   movingMean,
                                   movingVar,
-                                   EPS,
+                                   EPS_,
                                   savedMean,
                                   savedInvVar);
  } else {
@@ -90,7 +91,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                      beta,
                                      movingMean,
                                      movingVar,
-                                      EPS);
+                                      EPS_);
    } else {
      // There is a limitation in cudnn library.
      // When the batch size is larger than 1024 in cuDNN v5.1,
@@ -101,7 +102,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                   beta,
                                   movingMean,
                                   movingVar,
-                                   EPS,
+                                   EPS_,
                                   batchSize,
                                   channels_,
                                   imageH_ * imageD_,
@@ -127,6 +128,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
  real* gamma = weight_->getW()->getData();
  real* savedMean = savedMean_->getData();
  real* savedInvVar = savedInvVar_->getData();
+  EPS_ = std::max(MIN_EPS, static_cast<double>(EPS));

  auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
    Matrix::resizeOrCreate(m, h, w, false, true);
@@ -157,7 +159,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
                         gamma,
                         gammaGrad,
                         betaGrad,
-                         EPS,
+                         EPS_,
                         savedMean,
                         savedInvVar);


--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -47,11 +47,14 @@ public:

 protected:
  /**
-   * Epsilon value used in the batch normalization formula.
   * Minimum allowed value is CUDNN_BN_MIN_EPSILON defined in cudnn.h.
   * Same epsilon value should be used in forward and backward functions.
   */
-  static const double EPS;
+  static const double MIN_EPS;
+
+  /// Epsilon value used in the batch normalization formula.
+  /// It is set to the configured EPS, or to MIN_EPS if EPS is smaller than MIN_EPS.
+  double EPS_;

  /// Input/output tensor descriptor desc
  hl_tensor_descriptor ioDesc_;

--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@@ -21,8 +21,6 @@ namespace paddle {

 REGISTER_LAYER(mkldnn_batch_norm, MKLDNNBatchNormLayer);

-const real MKLDNNBatchNormLayer::EPS = 1E-5;
-
 bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
                                const ParameterMap& parameterMap) {
  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
@@ -50,6 +48,8 @@ bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
    useGlobalStats_ = config_.use_global_stats();
  }
  movingAvgFraction_ = config_.moving_average_fraction();
+  EPS = config_.epsilon();
+
  VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use")
                    << " --- global stats";
  VLOG(MKLDNN_BASE) << "Moving average fraction: " << movingAvgFraction_;

--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.h
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
@@ -32,7 +32,8 @@ protected:
  std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;

  // Epsilon value used in the batch normalization formula.
-  static const real EPS;
+  real EPS;
+
  // weight and bias in paddle
  std::unique_ptr<Weight> weight_;
  std::unique_ptr<Weight> biases_;

--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -540,6 +540,10 @@ message LayerConfig {

  // for switch order layer
  optional ReshapeConfig reshape_conf = 59;
+
+  // for batch normalization layer
+  // small constant added to the variance to avoid numerical problems.
+  optional double epsilon = 60 [ default = 0.00001 ];
 }

 message EvaluatorConfig {

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2434,6 +2434,7 @@ class BatchNormLayer(LayerBase):
                 bias=True,
                 img3D=False,
                 use_global_stats=True,
+                 epsilon=1e-5,
                 moving_average_fraction=0.9,
                 batch_norm_type=None,
                 mean_var_names=None,
@@ -2482,6 +2483,8 @@ class BatchNormLayer(LayerBase):
            self.config.use_global_stats = use_global_stats
        if moving_average_fraction is not None:
            self.config.moving_average_fraction = moving_average_fraction
+        if epsilon is not None:
+            self.config.epsilon = epsilon

        input_layer = self.get_input_layer(0)
        image_conf = self.config.inputs[0].image_conf

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -3036,6 +3036,7 @@ def batch_norm_layer(input,
                     param_attr=None,
                     layer_attr=None,
                     batch_norm_type=None,
+                     epsilon=1e-5,
                     moving_average_fraction=0.9,
                     use_global_stats=None,
                     mean_var_names=None):
@@ -3106,6 +3107,8 @@ def batch_norm_layer(input,
                             will use the mean and variance of the current batch
                             of test data.
    :type use_global_stats: bool | None.
+    :param epsilon: Small constant added to the variance to avoid numerical problems. Must be no less than 1e-5.
+    :type epsilon: float.
    :param moving_average_fraction: Factor used in the moving average computation.
                                   :math:`runningMean = newMean*(1-factor) + runningMean*factor`
    :type moving_average_fraction: float.
@@ -3123,6 +3126,9 @@ def batch_norm_layer(input,
    assert (batch_norm_type is None) or (batch_norm_type == "batch_norm") or \
           (batch_norm_type == "mkldnn_batch_norm") or \
           (batch_norm_type == "cudnn_batch_norm")
+
+    assert epsilon >= 1e-5, "Parameter epsilon must be no less than 1e-5."
+
    l = Layer(
        name=name,
        img3D=img3D,
@@ -3132,6 +3138,7 @@ def batch_norm_layer(input,
        type=LayerType.BATCH_NORM_LAYER,
        batch_norm_type=batch_norm_type,
        bias=ParamAttr.to_bias(bias_attr),
+        epsilon=epsilon,
        moving_average_fraction=moving_average_fraction,
        use_global_stats=use_global_stats,
        mean_var_names=mean_var_names,