From 7608f16397d232e2ac5429d4b2d5bdcc4e6a9e93 Mon Sep 17 00:00:00 2001
From: littletomatodonkey
Date: Thu, 1 Jul 2021 17:52:16 +0800
Subject: [PATCH] add mv1 enhance backbone (#3228)

* add mv1 enhance backbone

* add knowledge doc
---
 ...c_chinese_lite_train_distillation_v2.1.yml |  31 +--
 doc/doc_ch/knowledge_distillation.md          | 251 +++++++++++++++++
 ppocr/modeling/backbones/__init__.py          |  23 +-
 ppocr/modeling/backbones/rec_mv1_enhance.py   | 256 ++++++++++++++++++
 4 files changed, 535 insertions(+), 26 deletions(-)
 create mode 100644 doc/doc_ch/knowledge_distillation.md
 create mode 100644 ppocr/modeling/backbones/rec_mv1_enhance.py

diff --git a/configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml b/configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml
index 791b34cf..27ba4fd7 100644
--- a/configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml
+++ b/configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml
@@ -17,7 +17,7 @@ Global:
   character_type: ch
   max_text_length: 25
   infer_mode: false
-  use_space_char: false
+  use_space_char: true
   distributed: true
   save_res_path: ./output/rec/predicts_chinese_lite_distillation_v2.1.txt
 
@@ -27,28 +27,29 @@ Optimizer:
   beta1: 0.9
   beta2: 0.999
   lr:
-    name: Cosine
-    learning_rate: 0.0005
+    name: Piecewise
+    decay_epochs : [700, 800]
+    values : [0.001, 0.0001]
     warmup_epoch: 5
   regularizer:
     name: L2
-    factor: 1.0e-05
+    factor: 2.0e-05
+
 Architecture:
+  model_type: &model_type "rec"
   name: DistillationModel
   algorithm: Distillation
   Models:
-    Student:
+    Teacher:
       pretrained:
       freeze_params: false
       return_all_feats: true
-      model_type: rec
+      model_type: *model_type
       algorithm: CRNN
       Transform:
       Backbone:
-        name: MobileNetV3
+        name: MobileNetV1Enhance
         scale: 0.5
-        model_name: small
-        small_stride: [1, 2, 2, 2]
       Neck:
         name: SequenceEncoder
         encoder_type: rnn
@@ -56,19 +57,17 @@ Architecture:
       Head:
         name: CTCHead
         mid_channels: 96
-        fc_decay: 0.00001
-    Teacher:
+        fc_decay: 0.00002
+    Student:
       pretrained:
       freeze_params: false
       return_all_feats: true
-      model_type: rec
+      model_type: *model_type
       algorithm: CRNN
       Transform:
       Backbone:
-        name: MobileNetV3
+        name: MobileNetV1Enhance
         scale: 0.5
-        model_name: small
-        small_stride: [1, 2, 2, 2]
       Neck:
         name: SequenceEncoder
         encoder_type: rnn
@@ -76,7 +75,7 @@ Architecture:
       Head:
         name: CTCHead
         mid_channels: 96
-        fc_decay: 0.00001
+        fc_decay: 0.00002
 
 
 Loss:
diff --git a/doc/doc_ch/knowledge_distillation.md b/doc/doc_ch/knowledge_distillation.md
new file mode 100644
index 00000000..b561f718
--- /dev/null
+++ b/doc/doc_ch/knowledge_distillation.md
@@ -0,0 +1,251 @@
+# Knowledge Distillation
+
+
+## 1. Introduction
+
+### 1.1 What is knowledge distillation?
+
+In recent years, deep neural networks have proven to be extremely effective for problems in computer vision, natural language processing, and other fields. With a suitably designed and trained network, the resulting model generally outperforms traditional algorithms.
+
+When enough data is available, enlarging a well-designed network usually improves accuracy further, but it also sharply increases model complexity, and large models are expensive to use in real-world scenarios.
+
+Deep neural networks typically carry a lot of parameter redundancy, and several families of methods exist to compress models and reduce their parameter count, such as pruning, quantization, and knowledge distillation. In knowledge distillation, a teacher model guides a student model on a specific task, so that the small model obtains a sizeable accuracy gain without adding any parameters.
+
+In addition, mutual learning has emerged as a related training scheme: the paper [Deep Mutual Learning](https://arxiv.org/abs/1706.00384) shows that two identical models supervising each other during training can achieve better results than either model trained alone.
+
+### 1.2 Knowledge distillation in PaddleOCR
+
+Whether a large model distills a small one, or several small models learn from each other and update their parameters, the essence is the same: different models supervise each other through their outputs or feature maps. The only differences are (1) whether a model's parameters are frozen and (2) whether a model loads a pretrained checkpoint.
+
+When a large model distills a small one, the large model usually loads a pretrained checkpoint and has its parameters frozen; when small models distill each other, they usually do not load pretrained checkpoints and all parameters remain trainable.
+
+Knowledge distillation is not limited to two models; mutual learning among more than two models is also very common, so the distillation framework needs to support this case as well.
+
+The knowledge distillation algorithms integrated into PaddleOCR have the following main features:
+- Any networks can learn from each other: the sub-networks do not need identical structures or pretrained models, and there is no limit on the number of sub-networks; adding one only requires an extra entry in the configuration file.
+- The loss functions are fully configurable through the configuration file: either a single loss or a combination of several losses can be used.
+- Training, evaluation, prediction, and export all support knowledge distillation models, which makes them easy to use and deploy.
+
+
+On the general Chinese and English text recognition task, knowledge distillation improves model accuracy by more than 3% without adding any inference cost; combined with the learning-rate schedule and the model-structure adjustments, the final improvement exceeds 5%.
+
+
+
+## 2. Configuration file walkthrough
+
+During knowledge distillation training, the data preprocessing, optimizer, learning rate, and global settings are unchanged; only the configurations of the model architecture, loss function, post-processing, and metric modules need to be adjusted.
+
+The recognition and detection distillation configuration files are used below to explain distillation training and configuration.
+
+### 2.1 Recognition configuration file
+
+The configuration file is [rec_chinese_lite_train_distillation_v2.1.yml](../../configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml).
+
+#### 2.1.1 Model architecture
+
+For the knowledge distillation task, the model architecture is configured as follows.
+
+```yaml
+Architecture:
+  model_type: &model_type "rec"    # model type (rec, det, ...); every sub-network shares this model type
+  name: DistillationModel          # architecture name; for distillation it is DistillationModel, which builds the corresponding structure
+  algorithm: Distillation          # algorithm name
+  Models:                          # models, containing the configuration of each sub-network
+    Teacher:                       # sub-network name; it must contain at least `pretrained` and `freeze_params`, the remaining fields are the sub-network's constructor arguments
+      pretrained:                  # whether this sub-network loads a pretrained model
+      freeze_params: false         # whether the parameters are frozen
+      return_all_feats: true       # whether to return all features; if false, only the final output is returned
+      model_type: *model_type      # model type
+      algorithm: CRNN              # algorithm name of the sub-network; the remaining fields are constructor arguments, identical to a normal (non-distillation) training config
+      Transform:
+      Backbone:
+        name: MobileNetV1Enhance
+        scale: 0.5
+      Neck:
+        name: SequenceEncoder
+        encoder_type: rnn
+        hidden_size: 64
+      Head:
+        name: CTCHead
+        mid_channels: 96
+        fc_decay: 0.00002
+    Student:                       # the other sub-network; this is a DML example, so the two sub-networks share the same structure and both are trainable
+      pretrained:                  # the fields below are the same as above
+      freeze_params: false
+      return_all_feats: true
+      model_type: *model_type
+      algorithm: CRNN
+      Transform:
+      Backbone:
+        name: MobileNetV1Enhance
+        scale: 0.5
+      Neck:
+        name: SequenceEncoder
+        encoder_type: rnn
+        hidden_size: 64
+      Head:
+        name: CTCHead
+        mid_channels: 96
+        fc_decay: 0.00002
+```
+
+To train with more sub-networks, simply add the corresponding entries to the configuration file in the same way as `Student` and `Teacher`. For example, if three models should supervise each other and train jointly, `Architecture` can be written as follows.
+
+```yaml
+Architecture:
+  model_type: &model_type "rec"
+  name: DistillationModel
+  algorithm: Distillation
+  Models:
+    Teacher:
+      pretrained:
+      freeze_params: false
+      return_all_feats: true
+      model_type: *model_type
+      algorithm: CRNN
+      Transform:
+      Backbone:
+        name: MobileNetV1Enhance
+        scale: 0.5
+      Neck:
+        name: SequenceEncoder
+        encoder_type: rnn
+        hidden_size: 64
+      Head:
+        name: CTCHead
+        mid_channels: 96
+        fc_decay: 0.00002
+    Student:
+      pretrained:
+      freeze_params: false
+      return_all_feats: true
+      model_type: *model_type
+      algorithm: CRNN
+      Transform:
+      Backbone:
+        name: MobileNetV1Enhance
+        scale: 0.5
+      Neck:
+        name: SequenceEncoder
+        encoder_type: rnn
+        hidden_size: 64
+      Head:
+        name: CTCHead
+        mid_channels: 96
+        fc_decay: 0.00002
+    Student2:                      # a new sub-network introduced for the distillation task; the remaining fields are the same as above
+      pretrained:
+      freeze_params: false
+      return_all_feats: true
+      model_type: *model_type
+      algorithm: CRNN
+      Transform:
+      Backbone:
+        name: MobileNetV1Enhance
+        scale: 0.5
+      Neck:
+        name: SequenceEncoder
+        encoder_type: rnn
+        hidden_size: 64
+      Head:
+        name: CTCHead
+        mid_channels: 96
+        fc_decay: 0.00002
+```
+
+The trained model then contains three sub-networks: `Teacher`, `Student`, and `Student2`.
+
+The implementation of the `DistillationModel` class can be found in [distillation_model.py](../../ppocr/modeling/architectures/distillation_model.py).
+
+The model's `forward` output is a dictionary whose keys are the sub-network names, here `Student` and `Teacher`, and whose values are the outputs of the corresponding sub-networks. Each value can be a `Tensor` (only the final output of that network is returned) or a `dict` (intermediate features are returned as well).
+
+In the recognition task, to allow more loss functions and keep the distillation method extensible, the output of each sub-network is saved as a `dict` containing the outputs of its sub-modules. For this recognition model, the output of every sub-network is a `dict` with the keys `backbone_out`, `neck_out`, and `head_out`, whose values are the tensors of the corresponding modules. With the configuration above, the output of `DistillationModel` therefore has the following format (a minimal sketch of consuming this dictionary is given in the appendix at the end of this document).
+
+```json
+{
+  "Teacher": {
+    "backbone_out": tensor,
+    "neck_out": tensor,
+    "head_out": tensor,
+  },
+  "Student": {
+    "backbone_out": tensor,
+    "neck_out": tensor,
+    "head_out": tensor,
+  }
+}
+```
+
+#### 2.1.2 Loss function
+
+For the knowledge distillation task, the loss function is configured as follows.
+
+```yaml
+Loss:
+  name: CombinedLoss                           # loss name; the loss class is built from this name
+  loss_config_list:                            # list of loss configurations, a required field of CombinedLoss
+  - DistillationCTCLoss:                       # distillation CTC loss, inherited from the standard CTC loss
+      weight: 1.0                              # loss weight; every loss in loss_config_list must contain this field
+      model_name_list: ["Student", "Teacher"]  # extract the outputs of these two sub-networks from the distillation model prediction and compute the CTC loss against the ground truth
+      key: head_out                            # take the tensor under this key from each sub-network's output dict
+  - DistillationDMLLoss:                       # distillation DML loss, inherited from the standard DMLLoss
+      weight: 1.0                              # weight
+      act: "softmax"                           # activation applied to the inputs; can be softmax, sigmoid or None (default None)
+      model_name_pairs:                        # pairs of sub-networks used to compute the DML loss; add more pairs here to compute the DML loss between other sub-networks
+      - ["Student", "Teacher"]
+      key: head_out                            # take the tensor under this key from each sub-network's output dict
+  - DistillationDistanceLoss:                  # distillation distance loss
+      weight: 1.0                              # weight
+      mode: "l2"                               # distance metric; currently l1, l2 and smooth_l1 are supported
+      model_name_pairs:                        # pairs of sub-networks used to compute the distance loss
+      - ["Student", "Teacher"]
+      key: backbone_out                        # take the tensor under this key from each sub-network's output dict
+```
+
+All of the distillation losses above inherit from the corresponding standard loss classes. Their main job is to parse the distillation model's output, locate the intermediate tensors used for the loss, and then compute the loss with the standard loss class.
+
+With the configuration above, the final distillation training loss contains the following three parts.
+
+- The CTC loss between the ground truth and the final outputs (`head_out`) of `Student` and `Teacher`, with weight 1. Because both sub-networks update their parameters, both need a loss against the ground truth.
+- The DML loss between the final outputs (`head_out`) of `Student` and `Teacher`, with weight 1.
+- The l2 loss between the backbone outputs (`backbone_out`) of `Student` and `Teacher`, with weight 1.
+
+For the implementation of `CombinedLoss`, see [combined_loss.py](../../ppocr/losses/combined_loss.py#L23). For the implementation of `DistillationCTCLoss` and the other distillation losses, see [distillation_loss.py](../../ppocr/losses/distillation_loss.py).
+
+
+#### 2.1.3 Post-processing
+
+For the knowledge distillation task, post-processing is configured as follows.
+
+```yaml
+PostProcess:
+  name: DistillationCTCLabelDecode       # CTC decoding post-processing for distillation, inherited from the standard CTCLabelDecode class
+  model_name: ["Student", "Teacher"]     # extract and decode the outputs of these two sub-networks from the distillation model prediction
+  key: head_out                          # take the tensor under this key from each sub-network's output dict
+```
+
+With the configuration above, the CTC decoding results of both the `Student` and the `Teacher` sub-network are computed, and a `dict` is returned whose keys are the names of the processed sub-networks and whose values are the corresponding decoded results.
+
+For the implementation of `DistillationCTCLabelDecode`, see [rec_postprocess.py](../../ppocr/postprocess/rec_postprocess.py#L128).
+
+
+#### 2.1.4 Metric computation
+
+For the knowledge distillation task, the metric is configured as follows.
+
+```yaml
+Metric:
+  name: DistillationMetric       # metric class for the distillation task
+  base_metric_name: RecMetric    # base metric class; the metric of each sub-network output is computed with this class
+  main_indicator: acc            # name of the main indicator
+  key: "Student"                 # use the main_indicator of this sub-network as the criterion for saving the best model
+```
+
+With the configuration above, the acc of the `Student` sub-network decides when to save the best model, while the acc of every sub-network is still printed in the training log.
+
+For the implementation of `DistillationMetric`, see [distillation_metric.py](../../ppocr/metrics/distillation_metric.py#L24).
+
+
+### 2.2 Detection configuration file
+
+* coming soon!
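+
+### Appendix: consuming the distillation output dict (illustrative sketch)
+
+The snippet below is only an illustrative sketch of how the nested output dict from Section 2.1.1 can be consumed by a DML-style loss; it is not the PaddleOCR implementation (see [distillation_loss.py](../../ppocr/losses/distillation_loss.py) for that). The names `Student`, `Teacher`, and `head_out` follow the configuration above, while the helper `dml_loss_from_outputs` is made up for this example.
+
+```python
+import paddle
+import paddle.nn.functional as F
+
+
+def dml_loss_from_outputs(outputs, model_name_pairs, key="head_out"):
+    """Symmetric KL (DML-style) loss between the `key` tensors of sub-network pairs.
+
+    `outputs` is the nested dict produced by the distillation model, e.g.
+    {"Student": {"head_out": ...}, "Teacher": {"head_out": ...}}.
+    """
+    losses = {}
+    for name1, name2 in model_name_pairs:
+        # act: "softmax" in the config: turn logits into probability distributions
+        p1 = F.softmax(outputs[name1][key], axis=-1)
+        p2 = F.softmax(outputs[name2][key], axis=-1)
+        # symmetric KL divergence between the two predictions
+        loss = (F.kl_div(paddle.log(p1), p2, reduction="mean") +
+                F.kl_div(paddle.log(p2), p1, reduction="mean")) / 2
+        losses["dml_{}_{}".format(name1, name2)] = loss
+    return losses
+
+
+# outputs = distillation_model(batch_images)
+# loss_dict = dml_loss_from_outputs(outputs, [["Student", "Teacher"]])
+```
+
+Adding another sub-network (for example `Student2`) only requires extending `model_name_pairs` in the loss configuration; a sketch like the one above would simply receive the extra pairs unchanged.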
diff --git a/ppocr/modeling/backbones/__init__.py b/ppocr/modeling/backbones/__init__.py index 13b70b20..f4fe8c76 100755 --- a/ppocr/modeling/backbones/__init__.py +++ b/ppocr/modeling/backbones/__init__.py @@ -12,33 +12,36 @@ # See the License for the specific language governing permissions and # limitations under the License. -__all__ = ['build_backbone'] +__all__ = ["build_backbone"] def build_backbone(config, model_type): - if model_type == 'det': + if model_type == "det": from .det_mobilenet_v3 import MobileNetV3 from .det_resnet_vd import ResNet from .det_resnet_vd_sast import ResNet_SAST - support_dict = ['MobileNetV3', 'ResNet', 'ResNet_SAST'] - elif model_type == 'rec' or model_type == 'cls': + support_dict = ["MobileNetV3", "ResNet", "ResNet_SAST"] + elif model_type == "rec" or model_type == "cls": from .rec_mobilenet_v3 import MobileNetV3 from .rec_resnet_vd import ResNet from .rec_resnet_fpn import ResNetFPN - support_dict = ['MobileNetV3', 'ResNet', 'ResNetFPN'] - elif model_type == 'e2e': + from .rec_mv1_enhance import MobileNetV1Enhance + support_dict = [ + "MobileNetV1Enhance", "MobileNetV3", "ResNet", "ResNetFPN" + ] + elif model_type == "e2e": from .e2e_resnet_vd_pg import ResNet - support_dict = ['ResNet'] + support_dict = ["ResNet"] elif model_type == "table": from .table_resnet_vd import ResNet from .table_mobilenet_v3 import MobileNetV3 - support_dict = ['ResNet', 'MobileNetV3'] + support_dict = ["ResNet", "MobileNetV3"] else: raise NotImplementedError - module_name = config.pop('name') + module_name = config.pop("name") assert module_name in support_dict, Exception( - 'when model typs is {}, backbone only support {}'.format(model_type, + "when model typs is {}, backbone only support {}".format(model_type, support_dict)) module_class = eval(module_name)(**config) return module_class diff --git a/ppocr/modeling/backbones/rec_mv1_enhance.py b/ppocr/modeling/backbones/rec_mv1_enhance.py new file mode 100644 index 00000000..fe874fac --- /dev/null +++ b/ppocr/modeling/backbones/rec_mv1_enhance.py @@ -0,0 +1,256 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import KaimingNormal +import math +import numpy as np +import paddle +from paddle import ParamAttr, reshape, transpose, concat, split +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import KaimingNormal +import math +from paddle.nn.functional import hardswish, hardsigmoid +from paddle.regularizer import L2Decay + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + act='hard_swish'): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class DepthwiseSeparable(nn.Layer): + def __init__(self, + num_channels, + num_filters1, + num_filters2, + num_groups, + stride, + scale, + dw_size=3, + padding=1, + use_se=False): + super(DepthwiseSeparable, self).__init__() + self.use_se = use_se + self._depthwise_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=int(num_filters1 * scale), + filter_size=dw_size, + stride=stride, + padding=padding, + num_groups=int(num_groups * scale)) + if use_se: + self._se = SEModule(int(num_filters1 * scale)) + self._pointwise_conv = ConvBNLayer( + num_channels=int(num_filters1 * scale), + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + + def forward(self, inputs): + y = self._depthwise_conv(inputs) + if self.use_se: + y = self._se(y) + y = self._pointwise_conv(y) + return y + + +class MobileNetV1Enhance(nn.Layer): + def __init__(self, in_channels=3, scale=0.5, **kwargs): + super().__init__() + self.scale = scale + self.block_list = [] + + self.conv1 = ConvBNLayer( + num_channels=3, + filter_size=3, + channels=3, + num_filters=int(32 * scale), + stride=2, + padding=1) + + conv2_1 = DepthwiseSeparable( + num_channels=int(32 * scale), + num_filters1=32, + num_filters2=64, + num_groups=32, + stride=1, + scale=scale) + self.block_list.append(conv2_1) + + conv2_2 = DepthwiseSeparable( + num_channels=int(64 * scale), + num_filters1=64, + num_filters2=128, + num_groups=64, + stride=1, + scale=scale) + self.block_list.append(conv2_2) + + conv3_1 = DepthwiseSeparable( + num_channels=int(128 * scale), + num_filters1=128, + num_filters2=128, + num_groups=128, + stride=1, + scale=scale) + self.block_list.append(conv3_1) + + conv3_2 = DepthwiseSeparable( + num_channels=int(128 * scale), + num_filters1=128, + num_filters2=256, + num_groups=128, + stride=(2, 1), + scale=scale) + self.block_list.append(conv3_2) + + conv4_1 = DepthwiseSeparable( + num_channels=int(256 * scale), + num_filters1=256, + num_filters2=256, + 
num_groups=256, + stride=1, + scale=scale) + self.block_list.append(conv4_1) + + conv4_2 = DepthwiseSeparable( + num_channels=int(256 * scale), + num_filters1=256, + num_filters2=512, + num_groups=256, + stride=(2, 1), + scale=scale) + self.block_list.append(conv4_2) + + for _ in range(5): + conv5 = DepthwiseSeparable( + num_channels=int(512 * scale), + num_filters1=512, + num_filters2=512, + num_groups=512, + stride=1, + dw_size=5, + padding=2, + scale=scale, + use_se=False) + self.block_list.append(conv5) + + conv5_6 = DepthwiseSeparable( + num_channels=int(512 * scale), + num_filters1=512, + num_filters2=1024, + num_groups=512, + stride=(2, 1), + dw_size=5, + padding=2, + scale=scale, + use_se=True) + self.block_list.append(conv5_6) + + conv6 = DepthwiseSeparable( + num_channels=int(1024 * scale), + num_filters1=1024, + num_filters2=1024, + num_groups=1024, + stride=1, + dw_size=5, + padding=2, + use_se=True, + scale=scale) + self.block_list.append(conv6) + + self.block_list = nn.Sequential(*self.block_list) + + self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self.out_channels = int(1024 * scale) + + def forward(self, inputs): + y = self.conv1(inputs) + y = self.block_list(y) + y = self.pool(y) + return y + + +class SEModule(nn.Layer): + def __init__(self, channel, reduction=4): + super(SEModule, self).__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(), + bias_attr=ParamAttr()) + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(), + bias_attr=ParamAttr()) + + def forward(self, inputs): + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = hardsigmoid(outputs) + return paddle.multiply(x=inputs, y=outputs) -- GitLab
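As a quick sanity check of the `MobileNetV1Enhance` backbone added in this patch, the hedged sketch below (not part of the patch itself) runs a dummy recognition-sized input through it. It assumes PaddlePaddle 2.x is installed, that the PaddleOCR repository root is on `PYTHONPATH`, and that `[1, 3, 32, 320]` is a typical CRNN-style recognition input shape.

```python
# Illustrative usage sketch; the input shape is an assumption, not mandated by the patch.
import paddle

from ppocr.modeling.backbones.rec_mv1_enhance import MobileNetV1Enhance

model = MobileNetV1Enhance(in_channels=3, scale=0.5)
x = paddle.randn([1, 3, 32, 320])
y = model(x)

# scale=0.5 gives out_channels = int(1024 * 0.5) = 512. Height shrinks 32x
# (conv1 stride 2, three stride-(2, 1) blocks, and the final 2x2 max-pool) and
# width 4x (conv1 and the max-pool), so the output should be [1, 512, 1, 80].
print(model.out_channels)  # 512
print(y.shape)             # [1, 512, 1, 80]
```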