From ccd7c40be670b1ae63f16169b12be4e218408696 Mon Sep 17 00:00:00 2001
From: WenmuZhou
Date: Mon, 14 Dec 2020 12:09:25 +0800
Subject: [PATCH] add grad clip

---
 doc/doc_ch/config.md         |  5 +++--
 doc/doc_en/config_en.md      |  5 +++--
 ppocr/optimizer/__init__.py  |  8 +++++++-
 ppocr/optimizer/optimizer.py | 18 ++++++++++++++----
 4 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/doc/doc_ch/config.md b/doc/doc_ch/config.md
index 2cc502ca..af5b6e51 100644
--- a/doc/doc_ch/config.md
+++ b/doc/doc_ch/config.md
@@ -11,7 +11,7 @@
 ## 配置文件参数介绍
 
 以 `rec_chinese_lite_train_v1.1.yml ` 为例
-### Global 
+### Global
 
 | 字段 | 用途 | 默认值 | 备注 |
 | :----------------------: | :---------------------: | :--------------: | :--------------------: |
@@ -42,6 +42,7 @@
 | name | 优化器类名 | Adam | 目前支持`Momentum`,`Adam`,`RMSProp`, 见[ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py) |
 | beta1 | 设置一阶矩估计的指数衰减率 | 0.9 | \ |
 | beta2 | 设置二阶矩估计的指数衰减率 | 0.999 | \ |
+| clip_norm | 所允许的二范数最大值 | - | \ |
 | **lr** | 设置学习率decay方式 | - | \ |
 | name | 学习率decay类名 | Cosine | 目前支持`Linear`,`Cosine`,`Step`,`Piecewise`, 见[ppocr/optimizer/learning_rate.py](../../ppocr/optimizer/learning_rate.py) |
 | learning_rate | 基础学习率 | 0.001 | \ |
@@ -119,4 +120,4 @@
 | shuffle | 每个epoch是否将数据集顺序打乱 | True | \ |
 | batch_size_per_card | 训练时单卡batch size | 256 | \ |
 | drop_last | 是否丢弃因数据集样本数不能被 batch_size 整除而产生的最后一个不完整的mini-batch | True | \ |
-| num_workers | 用于加载数据的子进程个数,若为0即为不开启子进程,在主进程中进行数据加载 | 8 | \ |
\ No newline at end of file
+| num_workers | 用于加载数据的子进程个数,若为0即为不开启子进程,在主进程中进行数据加载 | 8 | \ |
diff --git a/doc/doc_en/config_en.md b/doc/doc_en/config_en.md
index 574bb41b..b8f638a6 100644
--- a/doc/doc_en/config_en.md
+++ b/doc/doc_en/config_en.md
@@ -10,7 +10,7 @@ The following list can be viewed through `--help`
 ## INTRODUCTION TO GLOBAL PARAMETERS OF CONFIGURATION FILE
 
 Take rec_chinese_lite_train_v1.1.yml as an example
-### Global 
+### Global
 
 | Parameter | Use | Defaults | Note |
 | :----------------------: | :---------------------: | :--------------: | :--------------------: |
@@ -41,6 +41,7 @@ Take rec_chinese_lite_train_v1.1.yml as an example
 | name | Optimizer class name | Adam | Currently supports`Momentum`,`Adam`,`RMSProp`, see [ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py) |
 | beta1 | Set the exponential decay rate for the 1st moment estimates | 0.9 | \ |
 | beta2 | Set the exponential decay rate for the 2nd moment estimates | 0.999 | \ |
+| clip_norm | The maximum L2 norm used for gradient clipping | - | \ |
 | **lr** | Set the learning rate decay method | - | \ |
 | name | Learning rate decay class name | Cosine | Currently supports`Linear`,`Cosine`,`Step`,`Piecewise`, see[ppocr/optimizer/learning_rate.py](../../ppocr/optimizer/learning_rate.py) |
 | learning_rate | Set the base learning rate | 0.001 | \ |
@@ -118,4 +119,4 @@ In ppocr, the network is divided into four stages: Transform, Backbone, Neck and
 | shuffle | Does each epoch disrupt the order of the data set | True | \ |
 | batch_size_per_card | Single card batch size during training | 256 | \ |
 | drop_last | Whether to discard the last incomplete mini-batch because the number of samples in the data set cannot be divisible by batch_size | True | \ |
-| num_workers | The number of sub-processes used to load data, if it is 0, the sub-process is not started, and the data is loaded in the main process | 8 | \ |
\ No newline at end of file
+| num_workers | The number of sub-processes used to load data, if it is 0, the sub-process is not started, and the data is loaded in the main process | 8 | \ |
diff --git a/ppocr/optimizer/__init__.py b/ppocr/optimizer/__init__.py
index 6413ae95..c729103a 100644
--- a/ppocr/optimizer/__init__.py
+++ b/ppocr/optimizer/__init__.py
@@ -16,8 +16,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
-
 import copy
+import paddle
 
 __all__ = ['build_optimizer']
 
@@ -49,7 +49,13 @@ def build_optimizer(config, epochs, step_each_epoch, parameters):
 
     # step3 build optimizer
     optim_name = config.pop('name')
+    if 'clip_norm' in config:
+        clip_norm = config.pop('clip_norm')
+        grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm)
+    else:
+        grad_clip = None
     optim = getattr(optimizer, optim_name)(learning_rate=lr,
                                            weight_decay=reg,
+                                           grad_clip=grad_clip,
                                            **config)
     return optim(parameters), lr
diff --git a/ppocr/optimizer/optimizer.py b/ppocr/optimizer/optimizer.py
index 2519e4e3..8215b92d 100644
--- a/ppocr/optimizer/optimizer.py
+++ b/ppocr/optimizer/optimizer.py
@@ -30,18 +30,25 @@ class Momentum(object):
         regularization (WeightDecayRegularizer, optional) - The strategy of regularization.
     """
 
-    def __init__(self, learning_rate, momentum, weight_decay=None, **args):
+    def __init__(self,
+                 learning_rate,
+                 momentum,
+                 weight_decay=None,
+                 grad_clip=None,
+                 **args):
         super(Momentum, self).__init__()
         self.learning_rate = learning_rate
         self.momentum = momentum
         self.weight_decay = weight_decay
+        self.grad_clip = grad_clip
 
     def __call__(self, parameters):
         opt = optim.Momentum(
             learning_rate=self.learning_rate,
             momentum=self.momentum,
-            parameters=parameters,
-            weight_decay=self.weight_decay)
+            weight_decay=self.weight_decay,
+            grad_clip=self.grad_clip,
+            parameters=parameters)
         return opt
 
 
@@ -96,10 +103,11 @@ class RMSProp(object):
 
     def __init__(self,
                  learning_rate,
-                 momentum,
+                 momentum=0.0,
                  rho=0.95,
                  epsilon=1e-6,
                  weight_decay=None,
+                 grad_clip=None,
                  **args):
         super(RMSProp, self).__init__()
         self.learning_rate = learning_rate
@@ -107,6 +115,7 @@
         self.rho = rho
         self.epsilon = epsilon
         self.weight_decay = weight_decay
+        self.grad_clip = grad_clip
 
     def __call__(self, parameters):
         opt = optim.RMSProp(
@@ -115,5 +124,6 @@
             rho=self.rho,
             epsilon=self.epsilon,
             weight_decay=self.weight_decay,
+            grad_clip=self.grad_clip,
             parameters=parameters)
         return opt
--
GitLab
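
For reference, a minimal sketch of how the new `clip_norm` option behaves once this patch is applied: `build_optimizer` pops `clip_norm` from the `Optimizer` section of the config, wraps it in `paddle.nn.ClipGradByNorm`, and forwards it to the optimizer as `grad_clip`. The config dict, model, and values below are illustrative only, not taken from a shipped PaddleOCR config.

import paddle

# Illustrative Optimizer section of a config, already parsed into a dict;
# the clip_norm value is an example, not a recommended default.
optim_config = {'beta1': 0.9, 'beta2': 0.999, 'clip_norm': 10.0}

# Same logic as the change in build_optimizer: pop clip_norm if present and
# turn it into a ClipGradByNorm instance, otherwise leave grad_clip as None.
grad_clip = None
if 'clip_norm' in optim_config:
    grad_clip = paddle.nn.ClipGradByNorm(clip_norm=optim_config.pop('clip_norm'))

model = paddle.nn.Linear(10, 10)  # stand-in model for demonstration
opt = paddle.optimizer.Adam(
    learning_rate=0.001,
    beta1=optim_config['beta1'],
    beta2=optim_config['beta2'],
    grad_clip=grad_clip,  # each gradient's L2 norm is clipped to clip_norm
    parameters=model.parameters())

Note that `ClipGradByNorm` clips each parameter's gradient by its own L2 norm; clipping by the global norm across all parameters would instead use `paddle.nn.ClipGradByGlobalNorm`.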