From 8a49f7f16bf1147611e13f1d32ede953b2c48ac9 Mon Sep 17 00:00:00 2001 From: peterzhang2029 Date: Thu, 16 Nov 2017 15:15:19 +0800 Subject: [PATCH] add epsilon in bn --- paddle/gserver/layers/BatchNormBaseLayer.cpp | 1 + paddle/gserver/layers/BatchNormBaseLayer.h | 2 ++ paddle/gserver/layers/BatchNormalizationLayer.cpp | 2 -- paddle/gserver/layers/BatchNormalizationLayer.h | 3 --- paddle/gserver/layers/CudnnBatchNormLayer.cpp | 12 +++++++----- paddle/gserver/layers/CudnnBatchNormLayer.h | 7 +++++-- paddle/gserver/layers/MKLDNNBatchNormLayer.cpp | 4 ++-- paddle/gserver/layers/MKLDNNBatchNormLayer.h | 3 ++- proto/ModelConfig.proto | 4 ++++ python/paddle/trainer/config_parser.py | 3 +++ python/paddle/trainer_config_helpers/layers.py | 7 +++++++ 11 files changed, 33 insertions(+), 15 deletions(-) diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp index bc7d1c83a48..d56f70ada3b 100644 --- a/paddle/gserver/layers/BatchNormBaseLayer.cpp +++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp @@ -41,6 +41,7 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap, useGlobalStats_ = config_.use_global_stats(); } movingAvgFraction_ = config_.moving_average_fraction(); + EPS = config_.epsilon(); weight_.reset(new Weight(1, channels_, parameters_[0])); movingMean_.reset(new Weight(1, channels_, parameters_[1])); diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h index e721d2d267a..78f476024ad 100644 --- a/paddle/gserver/layers/BatchNormBaseLayer.h +++ b/paddle/gserver/layers/BatchNormBaseLayer.h @@ -94,6 +94,8 @@ protected: bool useGlobalStats_; // use to compute moving mean and variance. real movingAvgFraction_; + // Epsilon value used in the batch normalization formula. 
+ real EPS; }; } // namespace paddle diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp index dacff25e592..aaf59b05061 100644 --- a/paddle/gserver/layers/BatchNormalizationLayer.cpp +++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp @@ -22,8 +22,6 @@ namespace paddle { REGISTER_LAYER(batch_norm, BatchNormalizationLayer); -const real BatchNormalizationLayer::EPS = 1E-5; - bool BatchNormalizationLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { /* Initialize the basic parent class */ diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h index f6115801fc6..1fdb5e20702 100644 --- a/paddle/gserver/layers/BatchNormalizationLayer.h +++ b/paddle/gserver/layers/BatchNormalizationLayer.h @@ -39,9 +39,6 @@ public: void backward(const UpdateCallback& callback = nullptr) override; protected: - /// Epsilon value used in the batch normalization formula. - static const real EPS; - /// Load pre-calculated mean and std. 
void setMeanAndStd(); diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp index 49a9540c0b6..5b3d07eed1c 100644 --- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp +++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp @@ -21,7 +21,7 @@ namespace paddle { REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer); -const double CudnnBatchNormLayer::EPS = 1E-5; +const double CudnnBatchNormLayer::MIN_EPS = 1E-5; bool CudnnBatchNormLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { @@ -60,6 +60,7 @@ void CudnnBatchNormLayer::forward(PassType passType) { real* beta = biases_->getW()->getData(); real* movingMean = movingMean_->getW()->getData(); real* movingVar = movingVar_->getW()->getData(); + EPS_ = std::max(MIN_EPS, static_cast<double>(EPS)); if (!useGlobalStats_) { REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str()); @@ -75,7 +76,7 @@ void CudnnBatchNormLayer::forward(PassType passType) { 1.0 - movingAvgFraction_, movingMean, movingVar, - EPS, + EPS_, savedMean, savedInvVar); } else { @@ -90,7 +91,7 @@ void CudnnBatchNormLayer::forward(PassType passType) { beta, movingMean, movingVar, - EPS); + EPS_); } else { // There is a limitation in cudnn library. 
// When the batch size is larger than 1024 in cuDNN v5.1, @@ -101,7 +102,7 @@ void CudnnBatchNormLayer::forward(PassType passType) { beta, movingMean, movingVar, - EPS, + EPS_, batchSize, channels_, imageH_ * imageD_, @@ -127,6 +128,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) { real* gamma = weight_->getW()->getData(); real* savedMean = savedMean_->getData(); real* savedInvVar = savedInvVar_->getData(); + EPS_ = std::max(MIN_EPS, static_cast<double>(EPS)); auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) { Matrix::resizeOrCreate(m, h, w, false, true); @@ -157,7 +159,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) { gamma, gammaGrad, betaGrad, - EPS, + EPS_, savedMean, savedInvVar); diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h index 413efd4d3ec..4916a9ce80a 100644 --- a/paddle/gserver/layers/CudnnBatchNormLayer.h +++ b/paddle/gserver/layers/CudnnBatchNormLayer.h @@ -47,11 +47,14 @@ public: protected: /** - * Epsilon value used in the batch normalization formula. * Minimum allowed value is CUDNN_BN_MIN_EPSILON defined in cudnn.h. * Same epsilon value should be used in forward and backward functions. */ - static const double EPS; + static const double MIN_EPS; + + /// Epsilon value used in the batch normalization formula. /// If EPS_ is smaller than MIN_EPS, MIN_EPS will be used. 
+ double EPS_; /// Input/output tensor descriptor desc hl_tensor_descriptor ioDesc_; diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp index 071bdf54d5d..f5bd4300982 100644 --- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp +++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp @@ -21,8 +21,6 @@ namespace paddle { REGISTER_LAYER(mkldnn_batch_norm, MKLDNNBatchNormLayer); -const real MKLDNNBatchNormLayer::EPS = 1E-5; bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { if (!MKLDNNLayer::init(layerMap, parameterMap)) { @@ -50,6 +48,8 @@ bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap, useGlobalStats_ = config_.use_global_stats(); } movingAvgFraction_ = config_.moving_average_fraction(); + EPS = config_.epsilon(); + VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use") << " --- global stats"; VLOG(MKLDNN_BASE) << "Moving average fraction: " << movingAvgFraction_; diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.h b/paddle/gserver/layers/MKLDNNBatchNormLayer.h index 456c0424ecb..769af2dfc71 100644 --- a/paddle/gserver/layers/MKLDNNBatchNormLayer.h +++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.h @@ -32,7 +32,8 @@ protected: std::shared_ptr<bn_fwd::primitive_desc> fwdPD_; // Epsilon value used in the batch normalization formula. - static const real EPS; + real EPS; + // weight and bias in paddle std::unique_ptr<Weight> weight_; std::unique_ptr<Weight> biases_; diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 2c2cc624593..ad1251e3192 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -540,6 +540,10 @@ message LayerConfig { // for switch order layer optional ReshapeConfig reshape_conf = 59; + + // for batch normalization layer // small constant added to the variance to avoid numerical problems. 
+ optional double epsilon = 60 [ default = 0.00001 ]; } message EvaluatorConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 5bd68e211ac..da768ee547c 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2434,6 +2434,7 @@ class BatchNormLayer(LayerBase): bias=True, img3D=False, use_global_stats=True, + epsilon=1e-5, moving_average_fraction=0.9, batch_norm_type=None, mean_var_names=None, @@ -2482,6 +2483,8 @@ class BatchNormLayer(LayerBase): self.config.use_global_stats = use_global_stats if moving_average_fraction is not None: self.config.moving_average_fraction = moving_average_fraction + if epsilon is not None: + self.config.epsilon = epsilon input_layer = self.get_input_layer(0) image_conf = self.config.inputs[0].image_conf diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index a02eba007dd..77fa5f8640b 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -3036,6 +3036,7 @@ def batch_norm_layer(input, param_attr=None, layer_attr=None, batch_norm_type=None, + epsilon=1e-5, moving_average_fraction=0.9, use_global_stats=None, mean_var_names=None): @@ -3106,6 +3107,8 @@ def batch_norm_layer(input, will use the mean and variance of the current batch of test data. :type use_global_stats: bool | None. + :param epsilon: Small constant added to the variance to avoid numerical problems. + :type epsilon: float. :param moving_average_fraction: Factor used in the moving average computation. :math:`runningMean = newMean*(1-factor) + runningMean*factor` :type moving_average_fraction: float. 
@@ -3123,6 +3126,9 @@ def batch_norm_layer(input, assert (batch_norm_type is None) or (batch_norm_type == "batch_norm") or \ (batch_norm_type == "mkldnn_batch_norm") or \ (batch_norm_type == "cudnn_batch_norm") + + assert epsilon >= 1e-5, "Parameter epsilon must be no less than 1e-5." + l = Layer( name=name, img3D=img3D, @@ -3132,6 +3138,7 @@ def batch_norm_layer(input, type=LayerType.BATCH_NORM_LAYER, batch_norm_type=batch_norm_type, bias=ParamAttr.to_bias(bias_attr), + epsilon=epsilon, moving_average_fraction=moving_average_fraction, use_global_stats=use_global_stats, mean_var_names=mean_var_names, -- GitLab