diff --git a/doc/doc_ch/config.md b/doc/doc_ch/config.md
index 03fe1b3280881472c830cf5ac57dee183a94b373..fe8db9c893cf0e6190111de5fe7627d2fe52a4fd 100644
--- a/doc/doc_ch/config.md
+++ b/doc/doc_ch/config.md
@@ -63,8 +63,9 @@
 | beta1 | 设置一阶矩估计的指数衰减率 | 0.9 | \ |
 | beta2 | 设置二阶矩估计的指数衰减率 | 0.999 | \ |
 | decay | 是否使用decay | \ | \ |
-| function(decay) | 设置decay方式 | - | 目前支持cosine_decay与piecewise_decay |
-| step_each_epoch | 每个epoch包含多少次迭代, cosine_decay时有效 | 20 | 计算方式:total_image_num / (batch_size_per_card * card_size) |
-| total_epoch | 总共迭代多少个epoch, cosine_decay时有效 | 1000 | 与Global.epoch_num 一致 |
+| function(decay) | 设置decay方式 | - | 目前支持cosine_decay, cosine_decay_warmup与piecewise_decay |
+| step_each_epoch | 每个epoch包含多少次迭代, cosine_decay/cosine_decay_warmup时有效 | 20 | 计算方式:total_image_num / (batch_size_per_card * card_size) |
+| total_epoch | 总共迭代多少个epoch, cosine_decay/cosine_decay_warmup时有效 | 1000 | 与Global.epoch_num 一致 |
+| warmup_minibatch | 线性warmup的迭代次数, cosine_decay_warmup时有效 | 1000 | \ |
 | boundaries | 学习率下降时的迭代次数间隔, piecewise_decay时有效 | - | 参数为列表形式 |
 | decay_rate | 学习率衰减系数, piecewise_decay时有效 | - | \ |
diff --git a/doc/doc_en/config_en.md b/doc/doc_en/config_en.md
index 66578424a60488a986eaff6fe937e4ffbc1bf59e..b54def895f0758df7cdbd089253d6acd712d2b8e 100644
--- a/doc/doc_en/config_en.md
+++ b/doc/doc_en/config_en.md
@@ -60,8 +60,9 @@ Take `rec_icdar15_train.yml` as an example:
 | beta1 | Set the exponential decay rate for the 1st moment estimates | 0.9 | \ |
 | beta2 | Set the exponential decay rate for the 2nd moment estimates | 0.999 | \ |
 | decay | Whether to use decay | \ | \ |
-| function(decay) | Set the decay function | cosine_decay | Support cosine_decay and piecewise_decay |
-| step_each_epoch | The number of steps in an epoch. Used in cosine_decay | 20 | Calculation :total_image_num / (batch_size_per_card * card_size) |
-| total_epoch | The number of epochs. Used in cosine_decay | 1000 | Consistent with Global.epoch_num |
+| function(decay) | Set the decay function | cosine_decay | Support cosine_decay, cosine_decay_warmup and piecewise_decay |
+| step_each_epoch | The number of steps in an epoch. Used in cosine_decay/cosine_decay_warmup | 20 | Calculation: total_image_num / (batch_size_per_card * card_size) |
+| total_epoch | The number of epochs. Used in cosine_decay/cosine_decay_warmup | 1000 | Consistent with Global.epoch_num |
+| warmup_minibatch | Number of steps for linear warmup. Used in cosine_decay_warmup | 1000 | \ |
 | boundaries | The step intervals to reduce learning rate. Used in piecewise_decay | - | The format is list |
 | decay_rate | Learning rate decay rate. Used in piecewise_decay | - | \ |
diff --git a/ppocr/optimizer.py b/ppocr/optimizer.py
index 55f2eba14c4be738c0dbc686cd32afbcff62f874..fd315cd1319d4925e893705957a42f931a39076e 100644
--- a/ppocr/optimizer.py
+++ b/ppocr/optimizer.py
@@ -14,14 +14,50 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import math
 import paddle.fluid as fluid
 from paddle.fluid.regularizer import L2Decay
+from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
+import paddle.fluid.layers.ops as ops

 from ppocr.utils.utility import initial_logger
 logger = initial_logger()


+def cosine_decay_with_warmup(learning_rate,
+                             step_each_epoch,
+                             epochs=500,
+                             warmup_minibatch=1000):
+    """Applies linear warmup followed by cosine decay to the learning rate.
+    lr = learning_rate * (cos((step - warmup_minibatch) * pi / (epochs * step_each_epoch)) + 1) / 2
+    The learning rate is updated every mini-batch, starting with a linear warmup.
+    """
+    global_step = _decay_step_counter()
+    lr = fluid.layers.tensor.create_global_var(
+        shape=[1],
+        value=0.0,
+        dtype='float32',
+        persistable=True,
+        name="learning_rate")
+
+    warmup_minibatch = fluid.layers.fill_constant(
+        shape=[1],
+        dtype='float32',
+        value=float(warmup_minibatch),
+        force_cpu=True)
+
+    with fluid.layers.control_flow.Switch() as switch:
+        with switch.case(global_step < warmup_minibatch):
+            decayed_lr = learning_rate * (1.0 * global_step / warmup_minibatch)
+            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
+        with switch.default():
+            decayed_lr = learning_rate * \
+                (ops.cos((global_step - warmup_minibatch) * (math.pi / (epochs * step_each_epoch))) + 1) / 2
+            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
+    return lr
+
+
 def AdamDecay(params, parameter_list=None):
     """
     define optimizer function
@@ -36,7 +72,9 @@ def AdamDecay(params, parameter_list=None):
     l2_decay = params.get("l2_decay", 0.0)

     if 'decay' in params:
-        supported_decay_mode = ["cosine_decay", "piecewise_decay"]
+        supported_decay_mode = [
+            "cosine_decay", "cosine_decay_warmup", "piecewise_decay"
+        ]
         params = params['decay']
         decay_mode = params['function']
         assert decay_mode in supported_decay_mode, "Supported decay mode is {}, but got {}".format(
@@ -49,6 +87,15 @@ def AdamDecay(params, parameter_list=None):
             learning_rate=base_lr,
             step_each_epoch=step_each_epoch,
             epochs=total_epoch)
+        elif decay_mode == "cosine_decay_warmup":
+            step_each_epoch = params['step_each_epoch']
+            total_epoch = params['total_epoch']
+            warmup_minibatch = params.get("warmup_minibatch", 1000)
+            base_lr = cosine_decay_with_warmup(
+                learning_rate=base_lr,
+                step_each_epoch=step_each_epoch,
+                epochs=total_epoch,
+                warmup_minibatch=warmup_minibatch)
         elif decay_mode == "piecewise_decay":
             boundaries = params["boundaries"]
             decay_rate = params["decay_rate"]
@@ -104,5 +151,5 @@ def RMSProp(params, parameter_list=None):
     optimizer = fluid.optimizer.RMSProp(
         learning_rate=base_lr,
         regularization=fluid.regularizer.L2Decay(regularization_coeff=l2_decay))
-
-    return optimizer
\ No newline at end of file
+
+    return optimizer