diff --git a/paddle/parameter/FirstOrderOptimizer.cpp b/paddle/parameter/FirstOrderOptimizer.cpp
index dbb738e98b5874f5bb33026ad585a6c3ef327d1d..5938b2210c7174c9a0ce659220825b74af007db5 100644
--- a/paddle/parameter/FirstOrderOptimizer.cpp
+++ b/paddle/parameter/FirstOrderOptimizer.cpp
@@ -161,6 +161,7 @@ void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
                                         const ParameterConfig& config,
                                         size_t sparseId) const {
   CHECK(sparseId == -1LU) << "Sparse update is not supported";
+
   BaseMatrix& value = *vecs[PARAMETER_VALUE];
   BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
   BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
@@ -265,6 +266,7 @@ void AdamParameterOptimizer::update(const VectorPtr vecs[],
                                     const ParameterConfig& config,
                                     size_t sparseId) const {
   CHECK(sparseId == -1UL) << "Sparse update is not supported";
+
   real beta1_power = std::pow(beta1_, step_);
   real beta2_power = std::pow(beta2_, step_);
   real learningRate = config.learning_rate() * learningRate_;
@@ -303,18 +305,25 @@ void AdamaxParameterOptimizer::update(const VectorPtr vecs[],
 void OptimizerWithGradientClipping::update(const VectorPtr vecs[],
                                            const ParameterConfig& config,
                                            size_t sparseId) const {
+  real globalThreshold = optConfig_.gradient_clipping_threshold();
+  real localThreshold = config.gradient_clipping_threshold();
+
+  // Use the local gradient clipping threshold if it's enabled,
+  // otherwise use the global one.
+  real threshold = localThreshold > 0.0f ? localThreshold : globalThreshold;
+  std::string field = localThreshold > 0.0f ? "local" : "global";
+
   real maxAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsMax();
-  if (maxAbsGrad > config.gradient_clipping_threshold()) {
+  if (maxAbsGrad > threshold) {
     if (FLAGS_log_clipping) {
       real avgAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsSum() /
                         vecs[PARAMETER_GRADIENT]->getSize();
-      LOG(INFO) << "parameter=" << config.name() << " need clipping,"
-                << " max grad=" << maxAbsGrad << " avg grad=" << avgAbsGrad;
+      LOG(INFO) << "parameter=" << config.name() << " need clipping by "
+                << field << " threshold=" << threshold
+                << ", max grad=" << maxAbsGrad << ", avg grad=" << avgAbsGrad;
     }
-    vecs[PARAMETER_GRADIENT]->clip(-config.gradient_clipping_threshold(),
-                                   config.gradient_clipping_threshold());
+    vecs[PARAMETER_GRADIENT]->clip(-threshold, threshold);
   }
-
   optimizer_->update(vecs, config, sparseId);
 }
 
diff --git a/paddle/parameter/OptimizerWithRegularizer.cpp b/paddle/parameter/OptimizerWithRegularizer.cpp
index 85f13c8bc08c534224a1a8365d541737980b439f..7910b12444938a0555c211bb3dfd0f4209e480ec 100644
--- a/paddle/parameter/OptimizerWithRegularizer.cpp
+++ b/paddle/parameter/OptimizerWithRegularizer.cpp
@@ -131,7 +131,8 @@ ParameterOptimizer* OptimizerWithRegularizer::create(
     bool inPserver) {
   ParameterOptimizer* optimizer =
       ParameterOptimizer::create(optConfig, inPserver);
-  if (paraConfig.gradient_clipping_threshold() > 0.0f &&
+  if ((optConfig.gradient_clipping_threshold() > 0.0f ||
+       paraConfig.gradient_clipping_threshold() > 0.0f) &&
       !dynamic_cast<OptimizerWithGradientClipping*>(optimizer)) {
     optimizer = new OptimizerWithGradientClipping(optConfig, optimizer);
   }
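Note on the clipping change above: OptimizerWithGradientClipping::update now prefers a positive per-parameter ("local") threshold and falls back to the global OptimizationConfig value only when no local one is set, and the wrapper itself is installed whenever either threshold is positive. The following standalone Python sketch (illustrative helper names, not Paddle APIs) mirrors that precedence rule and the element-wise clip:

    # Standalone sketch of the threshold-precedence rule; resolve_threshold and
    # clip_gradient are illustrative names, not part of Paddle.
    def resolve_threshold(local_threshold, global_threshold):
        """A positive local (per-parameter) threshold wins; otherwise use the global one."""
        if local_threshold > 0.0:
            return local_threshold, "local"
        return global_threshold, "global"

    def clip_gradient(grad, local_threshold, global_threshold):
        """Clip every element into [-t, t], but only if the max |g| exceeds t."""
        t, source = resolve_threshold(local_threshold, global_threshold)
        if max(abs(g) for g in grad) > t:
            grad = [min(max(g, -t), t) for g in grad]
        return grad, source

    # Example: no local threshold, so the global threshold 1.0 clips the outlier.
    print(clip_gradient([0.5, -3.0], local_threshold=0.0, global_threshold=1.0))
    # -> ([0.5, -1.0], 'global')

As in the C++ code, clipping is skipped entirely when the largest absolute gradient stays within the resolved threshold.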
diff --git a/paddle/parameter/ParameterOptimizer.h b/paddle/parameter/ParameterOptimizer.h
index 2bdc793d605e01f8e055087bb3e0973168cb0213..f98ba569b569379b30d034739a7f84aaf97108db 100644
--- a/paddle/parameter/ParameterOptimizer.h
+++ b/paddle/parameter/ParameterOptimizer.h
@@ -167,6 +167,7 @@ public:
     }
     parameterTypes_.push_back(type);
   }
+  real getLearningRate() const { return learningRate_; }
 
   virtual void setNoDecay() { applyDecay_ = false; }
 
@@ -201,6 +202,7 @@ protected:
    * so, if lr change in StartBatch, please assign to learningRate_
    */
   real learningRate_;
+
   std::unique_ptr<LearningRateScheduler> learningRateScheduler_;
   int64_t pass_;  // current training pass (starting from 0)
   bool firstTime_;
diff --git a/proto/TrainerConfig.proto b/proto/TrainerConfig.proto
index a334e07b6282a6ff9867482e0c3a299df2a78d1d..a819d20d11ff3932d331801007b8cfb9c77a3f2b 100644
--- a/proto/TrainerConfig.proto
+++ b/proto/TrainerConfig.proto
@@ -128,6 +128,9 @@ message OptimizationConfig {
   // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed,
   // current async gradient will be discard silently.
   optional double async_lagged_grad_discard_ratio = 37 [default = 1.5];
+
+  // global threshold for gradient clipping
+  optional double gradient_clipping_threshold = 38 [default = 0.0];
 };
 
 message TrainerConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 9135f38719a44e3070f42e478d0fc6b0004227b5..9fe8794691e5f742b3c290850d7f2f4db4862cf4 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -3377,6 +3377,7 @@ settings = dict(
     algorithm='async_sgd',
     async_lagged_grad_discard_ratio=1.5,
     learning_method='momentum',
+    gradient_clipping_threshold=None,
     num_batches_per_send_parameter=None,
     num_batches_per_get_parameter=None,
     center_parameter_update_method=None,
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
index a53ebe160be3b5d6d115e3e15d059d3d87e80942..c3495ee110bfaf91a47637a52e88b3bb56dce7a9 100644
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -408,7 +408,8 @@ def settings(batch_size,
 
     args = [
         'batch_size', 'learning_rate', 'learning_rate_decay_a',
-        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args'
+        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args',
+        'gradient_clipping_threshold'
     ]
     kwargs = dict()
     kwargs['algorithm'] = algorithm
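With the Python-side plumbing above, the global threshold can be set once in the trainer config instead of on every parameter. A minimal usage sketch, assuming the v1-style trainer_config_helpers API and that settings() accepts the keyword the extended args list implies:

    # Hypothetical config snippet; only gradient_clipping_threshold is new here,
    # the rest is a standard v1 settings() call.
    from paddle.trainer_config_helpers import *

    settings(
        batch_size=128,
        learning_rate=1e-3,
        learning_method=AdamOptimizer(),
        # Global clipping threshold stored in OptimizationConfig; a parameter
        # that sets its own (local) gradient_clipping_threshold still overrides it.
        gradient_clipping_threshold=10.0)

Per the OptimizerWithRegularizer::create change, the clipping wrapper is now created whenever either the global or a per-parameter threshold is positive, so the global value takes effect without touching existing parameter attributes.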