Unverified commit 095a6f72, authored by ceci3, committed by GitHub

add ofa api docs (#576)

* add ofa api docs
Parent 77bfa3ad
@@ -231,27 +231,6 @@ def soft_cross_entropy(inp, target):
     return -1. * paddle.mean(paddle.sum(inp_likelihood * target_prob, axis=-1))
 
-### get certain config
-def apply_config(model, width_mult):
-    new_config = dict()
-
-    def fix_exp(idx):
-        if (idx - 3) % 6 == 0 or (idx - 5) % 6 == 0:
-            return True
-        return False
-
-    for idx, (block_k, block_v) in enumerate(model.layers.items()):
-        if len(block_v.keys()) != 0:
-            name, name_idx = block_k.split('_'), int(block_k.split('_')[1])
-            if fix_exp(name_idx) or 'emb' in block_k or idx == (
-                    len(model.layers.items()) - 2):
-                block_v['expand_ratio'] = 1.0
-            else:
-                block_v['expand_ratio'] = width_mult
-        new_config[block_k] = block_v
-    return new_config
-
 def convert_example(example,
                     tokenizer,
                     label_list,
@@ -487,7 +466,7 @@ def do_train(args):
             for width_mult in args.width_mult_list:
                 # Step8: Broadcast supernet config from width_mult,
                 # and use this config in supernet training.
-                net_config = apply_config(ofa_model, width_mult)
+                net_config = utils.dynabert_config(ofa_model, width_mult)
                 ofa_model.set_net_config(net_config)
                 logits, teacher_logits = ofa_model(
                     input_ids, segment_ids, attention_mask=[None, None])
...
@@ -20,8 +20,7 @@ import paddle.fluid.dygraph as FD
 import paddle.fluid.layers as L
 
-def compute_neuron_head_importance(args, model, tokenizer, dev_ds, place,
-                                   model_cfg):
+def compute_neuron_head_importance(args, model, dev_ds, place, model_cfg):
     n_layers, n_heads = model_cfg['num_hidden_layers'], model_cfg[
         'num_attention_heads']
     head_importance = L.zeros(shape=[n_layers, n_heads], dtype='float32')
...
@@ -18,6 +18,7 @@ from __future__ import print_function
 from __future__ import unicode_literals
 from __future__ import absolute_import
+import re
 import paddle.fluid as F
 import paddle.fluid.layers as L
 import paddle.fluid.dygraph as D
@@ -39,4 +40,4 @@ class AdamW(F.optimizer.AdamOptimizer):
         for p, g in params_grads:
             if not self.pat.match(p.name):
                 with D.no_grad():
-                    L.assign(p * (20 - self.wd * self.current_step_lr()), p)
+                    L.assign(p * (1. - self.wd * self.current_step_lr()), p)
@@ -49,31 +49,6 @@ def soft_cross_entropy(inp, target):
     return -1. * L.mean(L.reduce_sum(inp_likelihood * target_prob, dim=-1))
 
-### get certain config
-def apply_config(model, width_mult, depth_mult):
-    new_config = dict()
-
-    def fix_exp(idx):
-        if (idx - 3) % 6 == 0 or (idx - 5) % 6 == 0:
-            return True
-        return False
-
-    for idx, (block_k, block_v) in enumerate(model.layers.items()):
-        if isinstance(block_v, dict) and len(block_v.keys()) != 0:
-            name, name_idx = block_k.split('_'), int(block_k.split('_')[1])
-            if fix_exp(name_idx) or 'emb' in block_k or idx == (
-                    len(model.layers.items()) - 2):
-                block_v['expand_ratio'] = 1.0
-            else:
-                block_v['expand_ratio'] = width_mult
-        if block_k == 'depth':
-            block_v = depth_mult
-        new_config[block_k] = block_v
-    return new_config
-
 if __name__ == '__main__':
     parser = argparse.ArgumentParser('classify model with ERNIE')
     parser.add_argument(
@@ -93,7 +68,7 @@ if __name__ == '__main__':
         type=str,
         required=True,
         help='data directory includes train / develop data')
-    parser.add_argument('--task', type=str, default='mnli', help='task name')
+    parser.add_argument('--task', type=str, default='xnli', help='task name')
     parser.add_argument(
         '--use_lr_decay',
         action='store_true',
@@ -159,7 +134,7 @@ if __name__ == '__main__':
         '--width_mult_list',
         nargs='+',
         type=float,
-        default=[1.0, 0.75, 0.5, 0.5],
+        default=[1.0, 0.75, 0.5, 0.25],
         help="width mult in compress")
     parser.add_argument(
         '--depth_mult_list',
@@ -259,7 +234,7 @@ if __name__ == '__main__':
     ### suppose elastic width first
     if args.reorder_weight:
         head_importance, neuron_importance = compute_neuron_head_importance(
-            args, ofa_model.model, tokenizer, dev_ds, place, model_cfg)
+            args, ofa_model.model, dev_ds, place, model_cfg)
         reorder_neuron_head(ofa_model.model, head_importance,
                             neuron_importance)
     #################
@@ -304,7 +279,7 @@ if __name__ == '__main__':
         for depth_mult in depth_mult_list:
             for width_mult in args.width_mult_list:
-                net_config = apply_config(
+                net_config = utils.dynabert_config(
                     ofa_model, width_mult, depth_mult=depth_mult)
                 ofa_model.set_net_config(net_config)
@@ -380,7 +355,7 @@ if __name__ == '__main__':
                 if step % 100 == 0:
                     for depth_mult in depth_mult_list:
                         for width_mult in args.width_mult_list:
-                            net_config = apply_config(
+                            net_config = utils.dynabert_config(
                                 ofa_model, width_mult, depth_mult=depth_mult)
                             ofa_model.set_net_config(net_config)
...
docs/images/algo/ofa_bert.jpg (binary image replaced: 364.6 KB → 990.1 KB)
Convert SuperNet
================
Before Once-For-All training, a plain model must first be converted into a supernet built from dynamic OPs. While converting the plain network into a supernet, the conversion also turns the largest sub-network of the supernet into the largest network in the search space.

.. note::
  - If the kernel_size of an original convolution is 1, its kernel_size is left unchanged.
..

API reference
------------------
.. py:class:: paddleslim.nas.ofa.supernet(kernel_size=None, expand_ratio=None, channel=None)
`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/nas/ofa/convert_super.py#L643>`_

The search space is passed in as key-value pairs.

**Parameters:**
- **kernel_size(list|tuple, optional):** search space for the kernel_size of the Conv2D layers in the network.
- **expand_ratio(list|tuple, optional):** search space for the number of channels of Conv2D and for the output dimension of the Embedding and Linear parameters. The candidate channel number of each OP in the converted supernet is derived as a ratio of the channel number of the corresponding OP in the original model, so a single list is shared by all OPs. Set only one of this parameter and ``channel``.
- **channel(list(list)|tuple(tuple), optional):** search space for the number of channels of Conv2D and for the output dimension of the Embedding and Linear parameters. It directly sets the candidate channel numbers of each OP in the supernet, so its length must equal the total number of Conv2D, Embedding and Linear layers in the network. Set only one of this parameter and ``expand_ratio``.

**Returns:**
A supernet configuration.
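A short sketch (candidate values and layer counts are illustrative assumptions) contrasting the two ways of describing the channel search space; only one of ``expand_ratio`` and ``channel`` should be passed:

.. code-block:: python
from paddleslim.nas.ofa.convert_super import supernet

# candidates given as ratios of the original channel numbers, shared by all OPs
config_by_ratio = supernet(kernel_size=(3, 5, 7), expand_ratio=[0.5, 1.0])

# candidates listed per layer: one tuple for every Conv2D/Embedding/Linear in the model
config_by_channel = supernet(kernel_size=(3, 5, 7), channel=((4, 8, 16), (8, 16, 32)))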
.. py:class:: paddleslim.nas.ofa.Convert(context)
`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/nas/ofa/convert_super.py#L45>`_

Converts a plain network into a supernet according to the custom search space passed in.

**Parameters:**
- **context(paddleslim.nas.ofa.supernet):** the user-defined search space.

**Returns:**
A Convert instance.

.. py:method:: convert(network)

Performs the actual supernet conversion.

**Parameters:**
- **network(paddle.nn.Layer):** instance of the original model to be converted into a supernet.

**Returns:**
The converted supernet instance.
PaddleSlim provides three ways to build a supernet; they are described below.

Approach 1
------------------
Call the search-space definition API and the conversion API directly on an instantiated network. The advantage is that the network does not have to be redefined; the drawback is that only the whole network can be converted, not just part of it.

**Example:**

.. code-block:: python
from paddle.vision.models import mobilenet_v1
from paddleslim.nas.ofa.convert_super import Convert, supernet

model = mobilenet_v1()
sp_net_config = supernet(kernel_size=(3, 5, 7), expand_ratio=[1, 2, 4])
sp_model = Convert(sp_net_config).convert(model)
Approach 2
------------------
Convert with a context manager. The advantage is that only part of the network, or different parts with different search spaces, can be converted; the drawback is that the original network definition must be available and modified.

**Example:**

.. code-block:: python
import paddle
import paddle.nn as nn
from paddleslim.nas.ofa.convert_super import supernet

class Net(nn.Layer):
    def __init__(self):
        super(Net, self).__init__()
        models = []
        with supernet(kernel_size=(3, 5, 7), expand_ratio=(1, 2, 4)) as ofa_super:
            models += [nn.Conv2D(3, 4, 3, padding=1)]
            models += [nn.InstanceNorm2D(4)]
            models = ofa_super.convert(models)
        models += [nn.Conv2D(4, 4, 3, groups=4)]
        self.models = paddle.nn.Sequential(*models)

    def forward(self, inputs):
        return self.models(inputs)
Approach 3
------------------
Build the network directly from dynamic OPs, in the same way as a normal model. See `dynamic OPs <>`_ for the dynamic OPs supported by PaddleSlim. The advantage is more freedom in defining the network; the drawback is a more complex usage.

.. note::
  - The dynamic OPs in paddleslim.nas.ofa.layers are implemented against Paddle 2.0-beta and later; the dynamic OPs in paddleslim.nas.ofa.layers_old are implemented against versions before Paddle 2.0-beta.
  - The Block API adds the search space of the wrapped dynamic OP to the overall search space used during OFA training. Because the output part of the parameters of Conv2D, Embedding and Linear can be changed freely, the dynamic OPs corresponding to these three OPs need to be wrapped with Block. Norm-related dynamic OPs do not need to be wrapped, since their parameter sizes depend on the input size.
..

**Example:**

.. code-block:: python
import paddle
import paddle.nn as nn
from paddleslim.nas.ofa.layers import Block, SuperConv2D, SuperBatchNorm2D

class Net(nn.Layer):
    def __init__(self):
        super(Net, self).__init__()
        models = [Block(SuperConv2D(3, 16, 3, candidate_config={'kernel_size': (3, 5, 7), 'channel': (4, 8, 16)}))]
        models += [SuperBatchNorm2D(16)]
        self.models = paddle.nn.Sequential(*models)

    def forward(self, inputs):
        return self.models(inputs)
Once-For-All
============
Before Once-For-All training, a plain model must first be converted into a supernet built from dynamic OPs; see `Convert SuperNet <>`_ for how to do this.
Once-For-All training configuration
-----------------------------------
RunConfig
>>>>>>>>>
Run-time configuration and hyperparameters of the supernet, passed as a dict. This item is required if you want to train the supernet with the default ``Progressive Shrinking`` strategy from the paper; otherwise the current training stage can be set manually with ``paddleslim.nas.ofa.OFA().set_epoch(epoch)`` and ``paddleslim.nas.ofa.OFA().set_task(task, phase=None)``. Default: None.

**Parameters:**
- **train_batch_size(int, optional):** batch size used in training, used to compute the number of iterations per epoch. Default: None.
- **n_epochs(list, optional):** the epoch up to which each stage runs, used to decide which stage of supernet training the current epoch belongs to. Default: None.
- **total_images(int, optional):** number of images in the training set, used to compute the number of iterations per epoch. Default: None.
- **elastic_depth(list/tuple, optional):** if None, depth is not part of the search; otherwise the sampled configs contain depth. Changing the model depth must be handled in the model's forward; see the `example <>`_. Default: None.
- **dynamic_batch_size(list, optional):** how many sub-networks each batch of data is used to train in each stage; its shape must match that of n_epochs. Default: None.

**Returns:**
A training configuration.
**Example:**
.. code-block:: python
from paddleslim.nas.ofa import RunConfig
default_run_config = {
'train_batch_size': 1,
'n_epochs': [[1], [2, 3], [4, 5]],
'total_images': 12,
'elastic_depth': (5, 15, 24),
'dynamic_batch_size': [1, 1, 1],
}
run_config = RunConfig(**default_run_config)
DistillConfig
>>>>>>>>>
Configuration and hyperparameters of the distillation process, passed as a dict, if distillation is to be added during training. Default: None.

**Parameters:**
- **lambda_distill(float, optional):** scale factor of the distillation loss. Default: None.
- **teacher_model(instance of paddle.nn.Layer, optional):** instance of the teacher network. Default: None.
- **mapping_layers(list[str], optional):** names of the intermediate layers to which distillation is added, if intermediate-layer distillation is wanted. Default: None.
- **teacher_model_path(str, optional):** path to the pretrained weights of the teacher network. Default: None.
- **distill_fn(instance of paddle.nn.Layer, optional):** instance of a custom distillation loss; if None, mse_loss is used as the distillation loss. Default: None.
- **mapping_op(str, optional):** if the shapes of the teacher's and the student's intermediate layers differ when adding intermediate-layer distillation, an op is added to the student's intermediate layer so that both shapes match when the distillation loss is computed. Valid values are ``["conv", "linear", None]``: 'conv' adds a Conv2D, 'linear' adds a Linear, None adds nothing. If extra ops are added this way, their parameters can be obtained through ``paddleslim.nas.ofa.OFA().netAs_param`` and should be added to the optimizer's parameter list (see the sketch after the example below). Default: None.

**Returns:**
A distillation configuration.
**Example:**
.. code-block:: python
from paddleslim.nas.ofa import DistillConfig
default_distill_config = {
'lambda_distill': 0.01,
'teacher_model': teacher_model,
'mapping_layers': ['models.0.fn'],
'teacher_model_path': None,
'distill_fn': None,
'mapping_op': 'conv2d'
}
distill_config = DistillConfig(**default_distill_config)
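A hedged sketch (the optimizer choice and learning rate are assumptions) of the point made for ``mapping_op`` above: when extra mapping ops are inserted for distillation, fetch their parameters through ``netAs_param`` and hand them to the optimizer together with the model parameters:

.. code-block:: python
import paddle

# ofa_model is an OFA instance built with the DistillConfig above
params = ofa_model.model.parameters() + ofa_model.netAs_param
optimizer = paddle.optimizer.Adam(learning_rate=2e-5, parameters=params)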
OFA
------------------
Trains the supernet in the Once-For-All fashion. The `Once-For-All paper <>`_ proposes the ``Progressive Shrinking`` training strategy: training proceeds in stages in the order ``elastic kernel_size``, ``elastic width``, ``elastic depth``, and the search space is enlarged step by step. For example, with a search space of ``kernel_size=(3,5,7), expand_ratio=(0.5, 1.0, 2.0), depth=(0.5, 0.75, 1.0)``, the kernel size is trained dynamically first, in two phases: in the first phase the kernel_size search space is ``[5, 7]``, in the second it is ``[3, 5, 7]``. Dynamic training of expand_ratio is then added in the same way, again in two phases: first ``[1.0, 2.0]``, then ``[0.5, 1.0, 2.0]``. Finally depth is trained dynamically, with the same phasing as kernel_size.
.. py:class:: paddleslim.nas.ofa.OFA(model, run_config=None, distill_config=None, elastic_order=None, train_full=False)
`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/nas/ofa/ofa.py#L91>`_

**Parameters:**
- **model(paddle.nn.Layer):** the supernet instance whose training is converted to the default strategy recommended in the Once-For-All paper.
- **run_config(paddleslim.ofa.RunConfig, optional):** run-time configuration of the model. Default: None.
- **distill_config(paddleslim.ofa.DistillConfig, optional):** distillation-related configuration if distillation is added during training; see `DistillConfig <>`_ for the configurable options. If None, no distillation is added. Default: None.
- **elastic_order(list, optional):** training order; if None, the default ``Progressive Shrinking`` schedule is used. Default: None.
- **train_full(bool, optional):** whether to train only the largest sub-network of the supernet. Default: False.

**Returns:**
An OFA instance.
**Example:**

.. code-block:: python
from paddleslim.nas.ofa import OFA
ofa_model = OFA(model)
..
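A minimal end-to-end sketch (the model and all hyperparameter values are assumptions) showing how the ``RunConfig`` and ``DistillConfig`` described above plug into ``OFA``:

.. code-block:: python
from paddle.vision.models import mobilenet_v1
from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig
from paddleslim.nas.ofa.convert_super import Convert, supernet

teacher_model = mobilenet_v1()   # in practice, load pretrained weights here
model = Convert(supernet(expand_ratio=[0.5, 1.0])).convert(mobilenet_v1())

run_config = RunConfig(**{
    'train_batch_size': 64,
    'n_epochs': [[1], [2, 3]],
    'dynamic_batch_size': [1, 1],
    'total_images': 1281167})
distill_config = DistillConfig(**{'teacher_model': teacher_model})

ofa_model = OFA(model, run_config=run_config, distill_config=distill_config)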
.. py:method:: set_epoch(epoch)
Manually set the epoch that OFA training is currently in.

**Parameters:**
- **epoch(int):** the current epoch of model training.

**Returns:**
None

**Example:**
.. code-block:: python
ofa_model.set_epoch(3)
.. py:method:: set_task(task, phase=None)
Manually set the stage that OFA supernet training is currently in.

**Parameters:**
- **task(list(str)|str):** name(s) of the task(s) currently being trained; one of ``"kernel_size", "width", "depth"``.
- **phase(int, optional):** phase of the current task. A phase refers to the step-by-step enlargement of each task's search space in ``Progressive Shrinking``; different phases correspond to search spaces of different sizes. If None, the whole search space of the task is used. Default: None.

**Returns:**
None

**Example:**
.. code-block:: python
ofa_model.set_task('width')
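A hedged sketch (the epoch boundaries and ``num_epochs`` are assumptions) of driving the stages manually with ``set_epoch`` and ``set_task`` instead of relying on the default ``Progressive Shrinking`` schedule:

.. code-block:: python
for epoch in range(num_epochs):
    ofa_model.set_epoch(epoch)
    if epoch < 10:
        ofa_model.set_task('kernel_size')
    elif epoch < 20:
        ofa_model.set_task('width', phase=0)   # smaller width search space first
    else:
        ofa_model.set_task('width', phase=1)   # then the full width search space
    # ... run one epoch of supernet training here ...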
.. py:method:: set_net_config(config)
Manually select the sub-network with the given configuration; used when training one or a few specific sub-networks of the supernet.

**Parameters:**
- **config(dict):** per-layer training configuration of the sub-network.

**Returns:**
None

**Example:**
.. code-block:: python
config = ofa_model.current_config
ofa_model.set_net_config(config)
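Besides ``current_config``, a per-layer config can also be built with the DynaBERT helper used in the tutorials below (the width multiplier here is an assumption):

.. code-block:: python
from paddleslim.nas.ofa.utils import dynabert_config

net_config = dynabert_config(ofa_model, 0.75)   # keep 75% of the original width
ofa_model.set_net_config(net_config)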
.. py:method:: calc_distill_loss()
If intermediate-layer distillation is used during OFA training, call this method to get the intermediate distillation loss.

**Returns:**
The intermediate-layer distillation loss.

**Example:**
.. code-block:: python
distill_loss = ofa_model.calc_distill_loss()
.. py:method:: search()
### TODO
.. py:method:: export(config)
Export the parameters of the sub-network described by the given configuration.

**Parameters:**
- **config(dict):** per-layer configuration of the sub-network.

**Returns:**
TODO

**Example:**
TODO
SuperOP
========
PaddleSlim provides dynamic versions of some APIs. 'Dynamic' means that the parameter sizes of these OPs can change at run time according to the arguments passed in; in terms of usage, the difference is that some extra run-time arguments are passed to forward. `layers_old.py <>`_ corresponds to the Paddle 2.0-alpha and earlier API, and `layers.py <>`_ corresponds to the API after Paddle 2.0-alpha.
.. py:class:: paddleslim.nas.ofa.layers.Block(fn, fixed=False, key=None)
`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L64>`_

Wraps a layer; the wrapped layer is used like a normal layer. It merges the search space defined for each layer into one overall search space, from which a per-layer choice is made during training. Only APIs whose parameter sizes can be changed actively at run time need to be wrapped, i.e. only layers built from ``Conv2D``, ``Linear`` and ``Embedding`` may need to be wrapped.

**Parameters:**
- **fn(paddle.nn.Layer):** instance of the layer to be wrapped.
- **fixed(bool, optional):** whether the parameter shape of this layer stays unchanged during OFA training; False means it is searched normally, True means its parameter shape is kept fixed during OFA training. Default: False.
- **key(string, optional):** name of this layer in the overall search space. Default: None.

**Returns:**
A Block instance.

**Example:**
.. code-block:: python
from paddleslim.nas.ofa.layers import Block
block_layer = Block(SuperConv2D(3, 4, 3, candidate_config={'kernel_size': (3, 5, 7)}))
.. py:class:: paddleslim.nas.ofa.layers.SuperConv2D(in_channels, out_channels, kernel_size, candidate_config={}, transform_kernel=False, stride=1, padding=0, dilation=1, groups=1, padding_mode='zeros', weight_attr=None, bias_attr=None, data_format='NCHW')
`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L85>`_

Constructs a callable object of the SuperConv2D class.

**Parameters:**
- **in_channels** (int): number of channels of the input image.
- **out_channels** (int): number of channels produced by the convolution.
- **kernel_size** (int): kernel size. It can be a single integer or a tuple/list of two integers giving the height and width of the kernel; a single integer means the height and the width are both equal to it.
- **candidate_config** (dict, optional): search space of this convolution, passed as a dict whose valid keys are ``kernel_size``, ``expand_ratio`` and ``channel``; ``expand_ratio`` and ``channel`` have the same meaning (both search the number of channels) and cannot be set at the same time. Default: {}.
- **transform_kernel** (bool, optional): whether to use a transformation matrix to turn large kernels into small kernels. Default: False.
- **stride** (int|list|tuple, optional): stride. A single integer or a tuple/list of two integers for the stride along height and width; a single integer means both strides are equal to it. Default: 1.
- **padding** (int|list|tuple|str, optional): padding. If it is a string it can be "VALID" or "SAME", selecting the padding algorithm. If it is a tuple or list it can take three forms: (1) four pairs, i.e. [[0,0], [0,0], [padding_height_top, padding_height_bottom], [padding_width_left, padding_width_right]] when ``data_format`` is "NCHW", or [[0,0], [padding_height_top, padding_height_bottom], [padding_width_left, padding_width_right], [0,0]] when ``data_format`` is "NHWC"; (2) four integers, [padding_height_top, padding_height_bottom, padding_width_left, padding_width_right]; (3) two integers, [padding_height, padding_width], meaning top = bottom = padding_height and left = right = padding_width. A single integer means padding_height = padding_width = padding. Default: 0.
- **dilation** (int|list|tuple, optional): dilation. A single integer or a tuple/list of two integers for the dilation along height and width; a single integer means both are equal to it. Default: 1.
- **groups** (int, optional): number of groups of the 2D convolution, following the grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=n, the input and the kernels are split evenly into n groups along the channel dimension, and the i-th group of kernels is convolved with the i-th group of inputs. Default: 1.
- **padding_mode** (str, optional): padding mode, one of ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``.
- **weight_attr** (ParamAttr, optional): attribute object of the weight parameter. None means the default weight attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **bias_attr** (ParamAttr|bool, optional): attribute object of the bias parameter. As a bool only False is supported, meaning no bias. None means the default bias attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **data_format** (str, optional): data format of the input; the output uses the same format. "NCHW" or "NHWC", where N is the batch size, C the number of channels, H the height and W the width. Default: "NCHW".
.. py:method:: forward(input, kernel_size=None, expand_ratio=None, channel=None)
**Parameters:**
- **input** (Tensor): the actual input.
- **kernel_size** (int, optional): kernel size used in this call; None means the kernel size defined at construction. Default: None.
- **expand_ratio** (int|float, optional): expansion ratio of the number of output channels in this call; None means the channel number defined at construction. This parameter and ``channel`` cannot both be set. Default: None.
- **channel** (int, optional): number of output channels used in this call; None means the channel number defined at construction. This parameter and ``expand_ratio`` cannot both be set. Default: None.

**Example:**
.. code-block:: python
import paddle
from paddleslim.nas.ofa.layers import SuperConv2D
import numpy as np
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
super_conv2d = SuperConv2D(3, 10, 3)
config = {'channel': 5}
data = paddle.to_tensor(data)
conv = super_conv2d(data, **config)
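A further hedged sketch (shapes and candidate values are assumptions) of the elastic-kernel behaviour enabled by ``transform_kernel``: the largest kernel is defined at construction and forward selects a smaller one at run time.

.. code-block:: python
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperConv2D

x = paddle.to_tensor(np.random.uniform(-1, 1, [2, 3, 32, 32]).astype('float32'))
conv = SuperConv2D(3, 8, 7, candidate_config={'kernel_size': (3, 5, 7)}, transform_kernel=True)
out = conv(x, kernel_size=3)  # run with a 3x3 kernel derived from the 7x7 weight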
.. py:class:: paddleslim.nas.ofa.layers.SuperConv2DTranspose(in_channels, out_channels, kernel_size, candidate_config={}, transform_kernel=False, stride=1, padding=0, output_padding=0, dilation=1, groups=1, padding_mode='zeros', weight_attr=None, bias_attr=None, data_format='NCHW')
`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L381>`_

Constructs a callable object of the SuperConv2DTranspose class.

**Parameters:**
- **in_channels** (int): number of channels of the input image.
- **out_channels** (int): number of kernels, equal to the number of channels of the output feature map.
- **kernel_size** (int|list|tuple): kernel size. It can be a single integer or a tuple/list of two integers giving the height and width of the kernel; a single integer means both are equal to it.
- **candidate_config** (dict, optional): search space of this transposed convolution, passed as a dict whose valid keys are ``kernel_size``, ``expand_ratio`` and ``channel``; ``expand_ratio`` and ``channel`` have the same meaning (both search the number of channels) and cannot be set at the same time. Default: {}.
- **transform_kernel** (bool, optional): whether to use a transformation matrix to turn large kernels into small kernels. Default: False.
- **stride** (int|tuple, optional): stride. If it is a tuple or list it must contain two integers, the vertical and horizontal stride; otherwise both strides equal ``stride``. Default: 1.
- **padding** (int|tuple, optional): padding. If it is a tuple or list it must contain two integers, the vertical and horizontal padding; otherwise both paddings equal ``padding``. If it is a string, it can be "VALID" or "SAME", selecting the padding algorithm. Default: 0.
- **output_padding** (int|list|tuple, optional): extra size added to one side of the output shape. Default: 0.
- **groups** (int, optional): number of groups of the 2D convolution, following the grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the kernels is connected only to the first half of the input feature map, and the second half of the kernels only to the second half. Default: 1.
- **dilation** (int|tuple, optional): dilation. A single integer or a tuple/list of two integers for the dilation along height and width; a single integer means both are equal to it. Default: 1.
- **weight_attr** (ParamAttr, optional): attribute object of the weight parameter. None means the default weight attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **bias_attr** (ParamAttr|bool, optional): attribute object of the bias parameter. None means the default bias attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **data_format** (str, optional): data format of the input; the output uses the same format. "NCHW" or "NHWC". Default: "NCHW".
.. py:method:: forward(input, kernel_size=None, expand_ratio=None, channel=None)
**Parameters:**
- **input** (Tensor): the actual input.
- **kernel_size** (int, optional): kernel size used in this call; None means the kernel size defined at construction. Default: None.
- **expand_ratio** (int|float, optional): expansion ratio of the number of output channels in this call; None means the channel number defined at construction. This parameter and ``channel`` cannot both be set. Default: None.
- **channel** (int, optional): number of output channels used in this call; None means the channel number defined at construction. This parameter and ``expand_ratio`` cannot both be set. Default: None.

**Example:**
.. code-block:: python
import paddle
from paddleslim.nas.ofa.layers import SuperConv2DTranspose
import numpy as np
data = np.random.uniform(-1, 1, [10, 32, 32, 32]).astype('float32')
config = {'channel': 5}
data = paddle.to_tensor(data)
super_convtranspose = SuperConv2DTranspose(32, 10, 3)
ret = super_convtranspose(data, **config)
.. py:class:: paddleslim.nas.ofa.layers.SuperLinear(in_features, out_features, candidate_config={}, weight_attr=None, bias_attr=None, name=None)

`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L828>`_

Constructs a callable object of the SuperLinear class.

**Parameters:**
- **in_features** (int): size of the input of the linear transformation.
- **out_features** (int): size of the output of the linear transformation.
- **candidate_config** (dict, optional): search space of this Linear layer, passed as a dict whose valid keys are ``expand_ratio`` and ``channel``; the two have the same meaning (both search the output dimension) and cannot be set at the same time. Default: {}.
- **weight_attr** (ParamAttr, optional): attribute object of the weight parameter. None means the default weight attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **bias_attr** (ParamAttr|bool, optional): attribute object of the bias parameter. If it is a bool, False means no bias is added and True means the default bias attribute (bias initialized to 0) is used. None means the default bias attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **name** (string, optional): name of the Linear layer. Default: None. See :ref:`api_guide_Name`.
.. py:method:: forward(input, expand_ratio=None, channel=None)
**Parameters:**
- **input** (Tensor): the actual input.
- **expand_ratio** (int|float, optional): expansion ratio of the output dimension in this call; None means the output dimension defined at construction. This parameter and ``channel`` cannot both be set. Default: None.
- **channel** (int, optional): output dimension used in this call; None means the output dimension defined at construction. This parameter and ``expand_ratio`` cannot both be set. Default: None.

**Example:**
.. code-block:: python
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperLinear
data = np.random.uniform(-1, 1, [32, 64] ).astype('float32')
config = {'channel': 16}
linear = SuperLinear(64, 64)
data = paddle.to_tensor(data)
res = linear(data, **config)
.. py:class:: paddleslim.nas.ofa.layers.SuperEmbedding(num_embeddings, embedding_dim, candidate_config={}, padding_idx=None, sparse=False, weight_attr=None, name=None)

`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L1126>`_

Constructs a callable object of the SuperEmbedding class.

**Parameters:**
- **num_embeddings** (int): size of the embedding dictionary (vocabulary size).
- **embedding_dim** (int): dimension of each embedding vector.
- **candidate_config** (dict, optional): search space of this Embedding layer, passed as a dict whose valid keys are ``expand_ratio`` and ``channel``; the two have the same meaning (both search the embedding dimension) and cannot be set at the same time. Default: {}.
- **padding_idx** (int|long|None): must lie in [-vocab_size, vocab_size) to take effect; a negative padding_idx is converted to vocab_size + padding_idx. The embedding of ids equal to padding_idx is set to 0 and is not updated during training. None means no padding is applied. Default: None.
- **sparse** (bool): whether to use sparse updates. This only affects the performance of the backward gradient update; sparse updates are faster and recommended, but some optimizers do not support them (for example :ref:`cn_api_fluid_optimizer_AdadeltaOptimizer`, :ref:`cn_api_fluid_optimizer_AdamaxOptimizer`, :ref:`cn_api_fluid_optimizer_DecayedAdagradOptimizer`, :ref:`cn_api_fluid_optimizer_FtrlOptimizer`, :ref:`cn_api_fluid_optimizer_LambOptimizer`, :ref:`cn_api_fluid_optimizer_LarsMomentumOptimizer`), in which case sparse must be False. Default: False.
- **weight_attr** (ParamAttr): attribute object of the weight parameter. None means the default weight attribute is used. See :ref:`cn_api_fluid_ParamAttr`. ``weight_attr`` can also be used to load custom or pretrained word vectors: convert the local word vectors to numpy format with a shape consistent with ``num_embeddings`` and ``embedding_dim``, and initialize them with :ref:`cn_api_fluid_initializer_NumpyArrayInitializer`.
- **name** (string, optional): name of the Embedding layer. Default: None. See :ref:`api_guide_Name`.
.. py:method:: forward(input, expand_ratio=None, channel=None)

**Parameters:**
- **input** (Tensor): the actual input (integer ids).
- **expand_ratio** (int|float, optional): expansion ratio of the embedding dimension in this call; None means the dimension defined at construction. This parameter and ``channel`` cannot both be set. Default: None.
- **channel** (int, optional): embedding dimension used in this call; None means the dimension defined at construction. This parameter and ``expand_ratio`` cannot both be set. Default: None.

**Example:**
.. code-block:: python
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperEmbedding
data = np.random.randint(0, 32, size=(2, 8)).astype('int64')
config = {'channel': 16}
emb = SuperEmbedding(32, 64)
data = paddle.to_tensor(data)
res = emb(data, **config)
.. py:class:: paddleslim.nas.ofa.layers.SuperBatchNorm2D(num_features, momentum=0.9, epsilon=1e-05, weight_attr=None, bias_attr=None, data_format='NCHW', name=None)

`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L932>`_

Constructs a callable object of the SuperBatchNorm2D class.

**Parameters:**
- **num_features** (int): number of channels of the input ``Tensor``.
- **epsilon** (float, optional): value added to the denominator for numerical stability. Default: 1e-05.
- **momentum** (float, optional): value used to compute ``moving_mean`` and ``moving_var``. Default: 0.9.
- **weight_attr** (ParamAttr|bool, optional): attribute object of the scale parameter. False means the per-channel scale is fixed to 1 and cannot change. None means the default weight attribute is used. See :ref:`cn_api_ParamAttr`.
- **bias_attr** (ParamAttr, optional): attribute object of the bias parameter. False means the per-channel shift is fixed to 0 and cannot change. None means the default bias attribute is used. See :ref:`cn_api_ParamAttr`.
- **data_format** (string, optional): data format of the input; only "NCHW" is supported. Default: "NCHW".
- **name** (string, optional): name of the BatchNorm layer. Default: None. See :ref:`api_guide_Name`.

**Example:**
.. code-block:: python
import paddle
import numpy as np
from paddleslim.nas.ofa.layers import SuperBatchNorm2D
np.random.seed(123)
x_data = np.random.random(size=(2, 5, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
batch_norm = SuperBatchNorm2D(5)
batch_norm_out = batch_norm(x)
.. py:class:: paddleslim.nas.ofa.layers.SuperInstanceNorm2D(num_features, momentum=0.9, epsilon=1e-05, weight_attr=None, bias_attr=None, data_format='NCHW', name=None)

`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L999>`_

Constructs a callable object of the SuperInstanceNorm2D class.

**Parameters:**
- **num_features** (int): number of channels of the input ``Tensor``.
- **epsilon** (float, optional): value added to the denominator for numerical stability. Default: 1e-05.
- **momentum** (float, optional): currently has no effect for ``InstanceNorm2D`` and does not need to be set. Default: 0.9.
- **weight_attr** (ParamAttr|bool, optional): attribute object of the scale parameter. False means the per-channel scale is fixed to 1 and cannot change. None means the default weight attribute is used. See :ref:`cn_api_ParamAttr`.
- **bias_attr** (ParamAttr, optional): attribute object of the bias parameter. False means the per-channel shift is fixed to 0 and cannot change. None means the default bias attribute is used. See :ref:`cn_api_ParamAttr`.
- **data_format** (string, optional): data format of the input; only "NCHW" is supported. Default: "NCHW".
- **name** (string, optional): name of the InstanceNorm layer. Default: None. See :ref:`api_guide_Name`.

**Example:**
.. code-block:: python
import paddle
import numpy as np
from paddleslim.nas.ofa.layers import SuperInstanceNorm2D
np.random.seed(123)
x_data = np.random.random(size=(2, 5, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
instance_norm = SuperInstanceNorm2D(5)
out = instance_norm(x)
.. py:class:: paddleslim.nas.ofa.layers.SuperLayerNorm(normalized_shape, epsilon=1e-05, weight_attr=None, bias_attr=None, name=None)

`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L1057>`_

Constructs a callable object of the SuperLayerNorm class.

**Parameters:**
- **normalized_shape** (int|list|tuple): the shape to normalize over; the expected input shape is ``[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]``. If it is a single integer, normalization is performed over the last dimension, which must equal this value.
- **epsilon** (float, optional): small value added to the variance to avoid division by zero. Default: 1e-05.
- **weight_attr** (ParamAttr|bool, optional): attribute object of the scale parameter. False means the scale is fixed to 1 and not learned. None means the default weight attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **bias_attr** (ParamAttr, optional): attribute object of the bias parameter. False means the bias is fixed to 0 and not learned. None means the default bias attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **name** (string, optional): name of the LayerNorm layer. Default: None. See :ref:`api_guide_Name`.

**Example:**
.. code-block:: python
import paddle
import numpy as np
from paddleslim.nas.ofa.layers import SuperLayerNorm
np.random.seed(123)
x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
layer_norm = SuperLayerNorm(x_data.shape[1:])
layer_norm_out = layer_norm(x)
# TinyERNIE model compression tutorial

1. This tutorial explains the principle behind compressing the TinyERNIE model, and uses the TinyERNIE model from the ERNIE repo as an example to show how to quickly migrate the whole compression pipeline to other NLP models.

2. The tutorial uses the training strategy from [DynaBERT-Dynamic BERT with Adaptive Width and Depth](https://arxiv.org/abs/2004.04037). The original model is taken as the largest sub-model of the supernet; it consists of several Transformer Blocks of the same size. Before each training step, the sub-model to train in the current round is selected. Every sub-model consists of several Sub Transformer Blocks of the same size, each obtained by selecting a different width of a Transformer Block (which contains one Multi-Head Attention and one Feed-Forward Network), as follows (a tiny worked example is given after this list):<br/>
&emsp;&emsp;a. A Multi-Head Attention layer contains multiple heads. When a narrower sub-model is selected, the number of heads is reduced proportionally. For example, if the original model has 12 heads and the selected sub-model has 75% of the original width, every Transformer Block uses 9 heads in this round.<br/>
&emsp;&emsp;b. The Linear parameters in the Feed-Forward Network are reduced proportionally. For example, if the FFN feature dimension of the original model is 3072 and the selected sub-model has 75% of the original width, the FFN feature dimension of every Transformer Block is 2304 in this round.
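A tiny worked illustration of this scaling rule (the original sizes are the ones quoted above):

```python
num_heads, ffn_dim = 12, 3072        # sizes of the original model
width_mult = 0.75                    # width chosen for this training round
print(int(num_heads * width_mult))   # 9 heads per Multi-Head Attention
print(int(ffn_dim * width_mult))     # 2304 FFN feature dimension
```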
## Overview of the approach

1. First, the parameters and heads of the pretrained model are reordered by importance, so that the important parameters and heads sit at the front and are not removed when parameters are pruned during training. Parameter importance is computed by running the dev data once to obtain the gradient of every parameter and then combining the gradient with the overall magnitude of the parameter. Head importance is computed by feeding an all-ones mask over the heads, computing the gradient of that mask, and using the gradient to judge the importance of every head in every Multi-Head Attention layer (a sketch follows this list).
2. The original pretrained model is used as the teacher network during distillation. A supernet is defined whose largest sub-network has the same structure as the teacher; the other, smaller sub-networks are obtained from the largest network by selecting different widths, where selecting a width means pruning the parameters of the network. All sub-networks share parameters throughout training.
3. The supernet is initialized with the reordered pretrained parameters and used as the student network. Distillation losses are added for the embedding layer, for every transformer block, and for the final logits.
4. Before each batch is trained, the configuration of the sub-network to train is selected (currently the configuration only covers the width of the whole model); during the parameter update, only the parameters used by the current sub-network are updated.
5. The whole supernet is optimized in this way; after training, a sub-model that meets the speed-up and accuracy requirements is selected.
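A highly simplified sketch (PaddlePaddle 2.x style; `model`, `dev_loader` and the way the mask enters the attention computation are assumptions, not the repo's actual implementation) of the head-importance idea in step 1:

```python
import numpy as np
import paddle
import paddle.nn.functional as F

n_layers, n_heads = 12, 12
head_mask = paddle.ones([n_layers, n_heads])
head_mask.stop_gradient = False

head_importance = np.zeros([n_layers, n_heads], dtype='float32')
for input_ids, segment_ids, labels in dev_loader():
    # model is assumed to be patched so that attention weights are multiplied by head_mask
    logits = model(input_ids, segment_ids, head_mask=head_mask)
    loss = F.cross_entropy(logits, labels)
    loss.backward()
    head_importance += np.abs(head_mask.gradient())  # accumulate |dL/dmask| per head
    head_mask.clear_gradient()
```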
<p align="center">
<img src="../../images/algo/ofa_bert.jpg" width="950"/><br />
Overall pipeline
</p>
## Compression based on the ERNIE repo

This tutorial requires PaddleSlim 2.0 or later, Paddle 1.8.5 and ERNIE 0.0.4dev or later; please make sure Paddle, PaddleSlim and ERNIE are installed correctly.

For the complete example based on TinyERNIE in the ERNIE repo, see [TinyERNIE](../../../demo/ofa/ernie/README.md).

### 1. Define the initial network
Define the original TinyERNIE model and a dict that keeps the original parameters. After a plain model is converted into a supernet, the change of its OPs invalidates the parameters loaded into the original model, so a dict holding the original parameters is needed to initialize the supernet. Set the 'return_additional_info' attribute to True so that intermediate results are returned, which makes it easy to add distillation.
```python
model = ErnieModelForSequenceClassification.from_pretrained(args.from_pretrained, num_labels=3, name='')
setattr(model, 'return_additional_info', True)
origin_weights = {}
for name, param in model.named_parameters():
    origin_weights[name] = param
```
### 2. Build the supernet
Define the search space and convert the plain network into a supernet accordingly.
```python
# define the search space
sp_config = supernet(expand_ratio=[0.25, 0.5, 0.75, 1.0])
# convert the model into a supernet
model = Convert(sp_config).convert(model)
paddleslim.nas.ofa.utils.set_state_dict(model, origin_weights)
```
### 3. Define the teacher network
Construct the teacher network directly from the pretrained ERNIE model API. Set the 'return_additional_info' attribute to True so that intermediate results are returned for distillation.
```python
teacher_model = ErnieModelForSequenceClassification.from_pretrained(args.from_pretrained, num_labels=3, name='teacher')
setattr(teacher_model, 'return_additional_info', True)
```
### 4. Configure the distillation parameters
The parameter that needs to be configured is the teacher model instance. The TinyERNIE model returns the hidden layers and the Embedding output, so these return values are used directly for distillation.
```python
default_distill_config = {
'teacher_model': teacher_model
}
distill_config = DistillConfig(**default_distill_config)
```
### 5. Define the Once-For-All model
Pass the plain model and the distillation configuration to the OFA API; it automatically adds the distillation process and switches supernet training to the OFA scheme.
```python
ofa_model = paddleslim.nas.ofa.OFA(model, distill_config=distill_config)
```
### 6. Compute the importance of neurons and heads and reorder the parameters accordingly
The importance computation implemented on Paddle 1.8.5 is in [importance.py](../../../demo/ofa/ernie/ernie_supernet/importance.py).
```python
head_importance, neuron_importance = compute_neuron_head_importance(
args,
ofa_model.model,
dev_ds,
place,
model_cfg)
reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)
```
### 7. Set the current stage of OFA training
```python
ofa_model.set_epoch(epoch)
ofa_model.set_task('width')
```
### 8. Pass in the network configuration and start training
This example trains the supernet in the DynaBERT way.
```python
width_mult_list = [1.0, 0.75, 0.5, 0.25]
lambda_logit = 0.1
# dygraph models before paddle 2.0rc1 do not accumulate gradients automatically,
# so keep a dict that stores and accumulates the gradient of every parameter
accumulate_gradients = dict()
for param in opt._parameter_list:
    accumulate_gradients[param.name] = 0.0

for width_mult in width_mult_list:
    net_config = paddleslim.nas.ofa.utils.dynabert_config(ofa_model, width_mult)
    ofa_model.set_net_config(net_config)
    student_output, teacher_output = ofa_model(ids, sids, labels=label,
                                               num_layers=model_cfg['num_hidden_layers'])
    loss, student_logit, student_reps = student_output[
        0], student_output[1], student_output[2]['hiddens']
    teacher_logit, teacher_reps = teacher_output[
        1], teacher_output[2]['hiddens']

    logit_loss = soft_cross_entropy(student_logit, teacher_logit.detach())
    rep_loss = 0.0
    for stu_rep, tea_rep in zip(student_reps, teacher_reps):
        tmp_loss = L.mse_loss(stu_rep, tea_rep.detach())
        rep_loss += tmp_loss
    loss = rep_loss + lambda_logit * logit_loss
    loss.backward()
    param_grads = opt.backward(loss)
    # accumulate the gradients
    for param in opt._parameter_list:
        accumulate_gradients[param.name] += param.gradient()

# update the model with the accumulated gradients
for k, v in param_grads:
    assert k.name in accumulate_gradients.keys(
    ), "{} not in accumulate_gradients".format(k.name)
    v.set_value(accumulate_gradients[k.name])
opt.apply_optimize(
    loss, startup_program=None, params_grads=param_grads)
ofa_model.model.clear_gradients()
```
---
**NOTE**
Because an all-ones mask is used to collect gradients when computing head importance, the forward functions of some related TinyERNIE classes need to be re-implemented via monkey patching. See [modeling_ernie_supernet.py](../../../demo/ofa/ernie/ernie_supernet/modeling_ernie_supernet.py) for the concrete forward implementations.
---
# BERT model compression tutorial

1. This tutorial explains the principle behind compressing the BERT model, and uses the BERT-base model from the PaddleNLP repo as an example to show how to quickly migrate the whole compression pipeline to other NLP models.

2. The tutorial uses the training strategy from [DynaBERT-Dynamic BERT with Adaptive Width and Depth](https://arxiv.org/abs/2004.04037). The original model is taken as the largest sub-model of the supernet; it consists of several Transformer Blocks of the same size. Before each training step, the sub-model to train in the current round is selected. Every sub-model consists of several Sub Transformer Blocks of the same size, each obtained by selecting a different width of a Transformer Block (which contains one Multi-Head Attention and one Feed-Forward Network), as follows:<br/>
&emsp;&emsp;a. A Multi-Head Attention layer contains multiple heads. When a narrower sub-model is selected, the number of heads is reduced proportionally. For example, if the original model has 12 heads and the selected sub-model has 75% of the original width, every Transformer Block uses 9 heads in this round.<br/>
&emsp;&emsp;b. The Linear parameters in the Feed-Forward Network are reduced proportionally. For example, if the FFN feature dimension of the original model is 3072 and the selected sub-model has 75% of the original width, the FFN feature dimension of every Transformer Block is 2304 in this round.

## Overview of the approach

1. First, the parameters and heads of the pretrained model are reordered by importance, so that the important parameters and heads sit at the front and are not removed when parameters are pruned during training. Parameter importance is computed by running the dev data once to obtain the gradient of every parameter and then combining the gradient with the overall magnitude of the parameter. Head importance is computed by feeding an all-ones mask over the heads, computing the gradient of that mask, and using the gradient to judge the importance of every head in every Multi-Head Attention layer.
2. The original pretrained model is used as the teacher network during distillation. A supernet is defined whose largest sub-network has the same structure as the teacher; the other, smaller sub-networks are obtained from the largest network by selecting different widths, where selecting a width means pruning the parameters of the network. All sub-networks share parameters throughout training.
3. The supernet is initialized with the reordered pretrained parameters and used as the student network. Distillation losses are added for the embedding layer, for every transformer block, and for the final logits.
4. Before each batch is trained, the configuration of the sub-network to train is selected (currently the configuration only covers the width of the whole model); during the parameter update, only the parameters used by the current sub-network are updated.
5. The whole supernet is optimized in this way; after training, a sub-model that meets the speed-up and accuracy requirements is selected.

<p align="center">
<img src="../../images/algo/ofa_bert.jpg" width="950"/><br />
Overall pipeline
</p>
## Compression based on the PaddleNLP repo

This tutorial requires PaddleSlim 2.0 or later, Paddle 2.0rc1 or later and PaddleNLP 2.0beta or later; please make sure Paddle, PaddleSlim and PaddleNLP are installed correctly.

For the complete example based on BERT-base in the PaddleNLP repo, see [BERT-base](../../../demo/ofa/bert/README.md).

### 1. Define the initial network
Define the original BERT-base model and a dict that keeps the original parameters. After a plain model is converted into a supernet, the change of its OPs invalidates the parameters loaded into the original model, so a dict holding the original parameters is needed to initialize the supernet.
```python
model = BertForSequenceClassification.from_pretrained('bert', num_classes=2)
origin_weights = {}
for name, param in model.named_parameters():
    origin_weights[name] = param
```
### 2. Build the supernet
Define the search space and convert the plain network into a supernet accordingly.
```python
# define the search space
sp_config = supernet(expand_ratio=[0.25, 0.5, 0.75, 1.0])
# convert the model into a supernet
model = Convert(sp_config).convert(model)
paddleslim.nas.ofa.utils.set_state_dict(model, origin_weights)
```
### 3. Define the teacher network
Build the teacher network directly with the PaddleNLP API.
```python
teacher_model = BertForSequenceClassification.from_pretrained('bert', num_classes=2)
```
### 4. Configure the distillation parameters
The parameters to configure include the teacher model instance and the layers to distill: distillation losses are added between the Embedding layers and between every Transformer Block of the teacher and the student, using the default MSE loss for the intermediate layers; 'lambda_distill' scales the overall distillation loss.
```python
mapping_layers = ['bert.embeddings']
for idx in range(model.bert.config['num_hidden_layers']):
    mapping_layers.append('bert.encoder.layers.{}'.format(idx))
default_distill_config = {
'lambda_distill': 0.1,
'teacher_model': teacher_model,
'mapping_layers': mapping_layers,
}
distill_config = DistillConfig(**default_distill_config)
```
### 5. Define the Once-For-All model
Pass the plain model and the distillation configuration to the OFA API; it automatically adds the distillation process and switches supernet training to the OFA scheme.
```python
ofa_model = paddleslim.nas.ofa.OFA(model, distill_config=distill_config)
```
### 6. Compute the importance of neurons and heads and reorder the parameters accordingly
```python
head_importance, neuron_importance = utils.compute_neuron_head_importance(
'sst-2',
ofa_model.model,
dev_data_loader,
num_layers=model.bert.config['num_hidden_layers'],
num_heads=model.bert.config['num_attention_heads'])
reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)
```
### 7. Set the current stage of OFA training
```python
ofa_model.set_epoch(epoch)
ofa_model.set_task('width')
```
### 8. Pass in the network configuration and start training
This example trains the supernet in the DynaBERT way.
```python
width_mult_list = [1.0, 0.75, 0.5, 0.25]
lambda_logit = 0.1
for width_mult in width_mult_list:
    net_config = paddleslim.nas.ofa.utils.dynabert_config(ofa_model, width_mult)
    ofa_model.set_net_config(net_config)
    logits, teacher_logits = ofa_model(input_ids, segment_ids, attention_mask=[None, None])
    rep_loss = ofa_model.calc_distill_loss()
    logit_loss = soft_cross_entropy(logits, teacher_logits.detach())
    loss = rep_loss + lambda_logit * logit_loss
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    ofa_model.model.clear_gradients()
```
---
**NOTE**
Because an all-ones mask is used to collect gradients when computing head importance, BERT's forward needs to be re-implemented via monkey patching. For example:
```python
from paddlenlp.transformers import BertModel
def bert_forward(self,
                 input_ids,
                 token_type_ids=None,
                 position_ids=None,
                 attention_mask=[None, None]):
    wtype = self.pooler.dense.fn.weight.dtype if hasattr(
        self.pooler.dense, 'fn') else self.pooler.dense.weight.dtype
    if attention_mask[0] is None:
        attention_mask[0] = paddle.unsqueeze(
            (input_ids == self.pad_token_id).astype(wtype) * -1e9, axis=[1, 2])
    embedding_output = self.embeddings(
        input_ids=input_ids,
        position_ids=position_ids,
        token_type_ids=token_type_ids)
    encoder_outputs = self.encoder(embedding_output, attention_mask)
    sequence_output = encoder_outputs
    pooled_output = self.pooler(sequence_output)
    return sequence_output, pooled_output
BertModel.forward = bert_forward
```
---
@@ -18,6 +18,6 @@ from .convert_super import supernet
 from .utils.utils import get_paddle_version
 pd_ver = get_paddle_version()
 if pd_ver == 185:
-    from .layers import *
+    from .layers_old import *
 else:
-    from .layers_new import *
+    from .layers import *
@@ -24,15 +24,15 @@ if pd_ver == 185:
     import paddle.fluid.dygraph.nn as nn
     from paddle.fluid.dygraph.nn import Conv2D, Conv2DTranspose, Linear, LayerNorm, Embedding
     from paddle.fluid import ParamAttr
-    from .layers import *
-    from . import layers
+    from .layers_old import *
+    from . import layers_old as layers
     Layer = paddle.fluid.dygraph.Layer
 else:
     import paddle.nn as nn
     from paddle.nn import Conv2D, Conv2DTranspose, Linear, LayerNorm, Embedding
     from paddle import ParamAttr
-    from .layers_new import *
-    from . import layers_new as layers
+    from .layers import *
+    from . import layers
     Layer = paddle.nn.Layer
 _logger = get_logger(__name__, level=logging.INFO)
@@ -43,6 +43,17 @@ WEIGHT_LAYER = ['conv', 'linear', 'embedding']
 class Convert:
+    """
+    Convert network to the supernet according to the search space.
+    Parameters:
+        context(paddleslim.nas.ofa.supernet): search space defined by the user.
+    Examples:
+        .. code-block:: python
+          from paddleslim.nas.ofa import supernet, Convert
+          sp_net_config = supernet(kernel_size=(3, 5, 7), expand_ratio=[1, 2, 4])
+          convert = Convert(sp_net_config)
+    """
     def __init__(self, context):
         self.context = context
@@ -63,6 +74,17 @@ class Convert:
                     layer._bias_attr.name = 'super_' + layer._bias_attr.name
 
     def convert(self, network):
+        """
+        The function to convert the network to a supernet.
+        Parameters:
+            network(paddle.nn.Layer|list(paddle.nn.Layer)): instance of the model or a list of layer instances.
+        Examples:
+            .. code-block:: python
+              from paddle.vision.models import mobilenet_v1
+              from paddleslim.nas.ofa import supernet, Convert
+              sp_net_config = supernet(kernel_size=(3, 5, 7), expand_ratio=[1, 2, 4])
+              convert = Convert(sp_net_config).convert(mobilenet_v1())
+        """
         # search the first and last weight layer, don't change out channel of the last weight layer
         # don't change in channel of the first weight layer
         model = []
@@ -641,6 +663,14 @@ class Convert:
 class supernet:
+    """
+    Search space of the network.
+    Parameters:
+        kernel_size(list|tuple, optional): search space for the kernel size of the Conv2D.
+        expand_ratio(list|tuple, optional): search space for the expand ratio of the number of channels of Conv2D and of the output dimensions of Embedding and Linear. The candidate channel number of each OP in the converted supernet is derived as a ratio of the channel number of the corresponding OP in the original model, so a single list is shared by all OPs. Set only one of this parameter and ``channel``.
+        channel(list|tuple, optional): search space for the number of channels of Conv2D and the output dimensions of Embedding and Linear. It directly sets the channel number of each OP in the supernet, so its length must equal the total number of Conv2D, Embedding and Linear layers in the network. Set only one of this parameter and ``expand_ratio``.
+    """
     def __init__(self, **kwargs):
         for key, value in kwargs.items():
             setattr(self, key, value)
...
...@@ -12,21 +12,21 @@ ...@@ -12,21 +12,21 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
### NOTE: the API of this file is based on Paddle2.0, the API in layers_old.py is based on Paddle1.8
import numpy as np import numpy as np
import logging import logging
import paddle.fluid as fluid import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid.dygraph_utils as dygraph_utils
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import in_dygraph_mode, _varbase_creator
from paddle.fluid.dygraph.nn import InstanceNorm, Conv2D, Conv2DTranspose, BatchNorm
from ...common import get_logger from ...common import get_logger
from .utils.utils import compute_start_end, get_same_padding, convert_to_list from .utils.utils import compute_start_end, get_same_padding, convert_to_list
__all__ = [ __all__ = [
'SuperConv2D', 'SuperConv2DTranspose', 'SuperSeparableConv2D', 'SuperConv2D', 'SuperConv2DTranspose', 'SuperSeparableConv2D',
'SuperBatchNorm', 'SuperLinear', 'SuperInstanceNorm', 'Block', 'SuperBatchNorm2D', 'SuperLinear', 'SuperInstanceNorm2D', 'Block',
'SuperGroupConv2D', 'SuperDepthwiseConv2D', 'SuperGroupConv2DTranspose', 'SuperGroupConv2D', 'SuperDepthwiseConv2D', 'SuperGroupConv2DTranspose',
'SuperDepthwiseConv2DTranspose', 'SuperLayerNorm', 'SuperEmbedding' 'SuperDepthwiseConv2DTranspose', 'SuperLayerNorm', 'SuperEmbedding'
] ]
...@@ -44,7 +44,7 @@ def counter(): ...@@ -44,7 +44,7 @@ def counter():
return _cnt return _cnt
class BaseBlock(fluid.dygraph.Layer): class BaseBlock(paddle.nn.Layer):
def __init__(self, key=None): def __init__(self, key=None):
super(BaseBlock, self).__init__() super(BaseBlock, self).__init__()
if key is not None: if key is not None:
...@@ -66,7 +66,8 @@ class Block(BaseBlock): ...@@ -66,7 +66,8 @@ class Block(BaseBlock):
Model is composed of nest blocks. Model is composed of nest blocks.
Parameters: Parameters:
fn(Layer): instance of super layers, such as: SuperConv2D(3, 5, 3). fn(paddle.nn.Layer): instance of super layers, such as: SuperConv2D(3, 5, 3).
fixed(bool, optional): whether to fix the shape of the weight in this layer. Default: False.
key(str, optional): key of this layer, one-to-one correspondence between key and candidate config. Default: None. key(str, optional): key of this layer, one-to-one correspondence between key and candidate config. Default: None.
""" """
...@@ -81,13 +82,9 @@ class Block(BaseBlock): ...@@ -81,13 +82,9 @@ class Block(BaseBlock):
return out return out
class SuperConv2D(fluid.dygraph.Conv2D): class SuperConv2D(nn.Conv2D):
""" """
This interface is used to construct a callable object of the ``SuperConv2D`` class. This interface is used to construct a callable object of the ``SuperConv2D`` class.
The difference between ```SuperConv2D``` and ```Conv2D``` is: ```SuperConv2D``` need
to feed a config dictionary with the format of {'channel', num_of_channel} represents
the channels of the outputs, used to change the first dimension of weight and bias,
only train the first channels of the weight and bias.
Note: the channel in config need to less than first defined. Note: the channel in config need to less than first defined.
...@@ -179,42 +176,44 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -179,42 +176,44 @@ class SuperConv2D(fluid.dygraph.Conv2D):
ValueError: if ``use_cudnn`` is not a bool value. ValueError: if ``use_cudnn`` is not a bool value.
Examples: Examples:
.. code-block:: python .. code-block:: python
from paddle.fluid.dygraph.base import to_variable import paddle
import paddle.fluid as fluid from paddleslim.nas.ofa.layers import SuperConv2D
from paddleslim.core.layers import SuperConv2D
import numpy as np import numpy as np
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard(): super_conv2d = SuperConv2D(3, 10, 3)
super_conv2d = SuperConv2D(3, 10, 3) config = {'channel': 5}
config = {'channel': 5} data = paddle.to_variable(data)
data = to_variable(data) conv = super_conv2d(data, config)
conv = super_conv2d(data, config)
""" """
### NOTE: filter_size, num_channels and num_filters must be the max of candidate to define a largest network. ### NOTE: filter_size, num_channels and num_filters must be the max of candidate to define a largest network.
def __init__(self, def __init__(self,
num_channels, in_channels,
num_filters, out_channels,
filter_size, kernel_size,
candidate_config={}, candidate_config={},
transform_kernel=False, transform_kernel=False,
stride=1, stride=1,
dilation=1,
padding=0, padding=0,
groups=None, dilation=1,
param_attr=None, groups=1,
padding_mode='zeros',
weight_attr=None,
bias_attr=None, bias_attr=None,
use_cudnn=True, data_format='NCHW'):
act=None,
dtype='float32'):
### NOTE: padding always is 0, add padding in forward because of kernel size is uncertain
super(SuperConv2D, self).__init__( super(SuperConv2D, self).__init__(
num_channels, num_filters, filter_size, stride, padding, dilation, in_channels,
groups, param_attr, bias_attr, use_cudnn, act, dtype) out_channels,
kernel_size,
if isinstance(self._filter_size, int): stride=stride,
self._filter_size = convert_to_list(self._filter_size, 2) padding=padding,
padding_mode=padding_mode,
dilation=dilation,
groups=groups,
weight_attr=weight_attr,
bias_attr=bias_attr,
data_format=data_format)
self.candidate_config = candidate_config self.candidate_config = candidate_config
if len(candidate_config.items()) != 0: if len(candidate_config.items()) != 0:
...@@ -228,9 +227,9 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -228,9 +227,9 @@ class SuperConv2D(fluid.dygraph.Conv2D):
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.channel = candidate_config[ self.channel = candidate_config[
'channel'] if 'channel' in candidate_config else None 'channel'] if 'channel' in candidate_config else None
self.base_channel = self._num_filters self.base_channel = self._out_channels
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_channel = int(self._num_filters / max(self.expand_ratio)) self.base_channel = int(self._out_channels / max(self.expand_ratio))
self.transform_kernel = transform_kernel self.transform_kernel = transform_kernel
if self.ks_set != None: if self.ks_set != None:
...@@ -244,10 +243,9 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -244,10 +243,9 @@ class SuperConv2D(fluid.dygraph.Conv2D):
param_name = '%dto%d_matrix' % (ks_large, ks_small) param_name = '%dto%d_matrix' % (ks_large, ks_small)
ks_t = ks_small**2 ks_t = ks_small**2
scale_param[param_name] = self.create_parameter( scale_param[param_name] = self.create_parameter(
attr=fluid.ParamAttr( attr=paddle.ParamAttr(
name=self._full_name + param_name, name=self._full_name + param_name,
initializer=fluid.initializer.NumpyArrayInitializer( initializer=nn.initializer.Assign(np.eye(ks_t))),
np.eye(ks_t))),
shape=(ks_t, ks_t), shape=(ks_t, ks_t),
dtype=self._dtype) dtype=self._dtype)
...@@ -255,10 +253,10 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -255,10 +253,10 @@ class SuperConv2D(fluid.dygraph.Conv2D):
setattr(self, name, param) setattr(self, name, param)
def get_active_filter(self, in_nc, out_nc, kernel_size): def get_active_filter(self, in_nc, out_nc, kernel_size):
start, end = compute_start_end(self._filter_size[0], kernel_size) start, end = compute_start_end(self._kernel_size[0], kernel_size)
### if NOT transform kernel, intercept a center filter with kernel_size from largest filter ### if NOT transform kernel, intercept a center filter with kernel_size from largest filter
filters = self.weight[:out_nc, :in_nc, start:end, start:end] filters = self.weight[:out_nc, :in_nc, start:end, start:end]
if self.transform_kernel != False and kernel_size < self._filter_size[ if self.transform_kernel != False and kernel_size < self._kernel_size[
0]: 0]:
### if transform kernel, then use matrix to transform ### if transform kernel, then use matrix to transform
start_filter = self.weight[:out_nc, :in_nc, :, :] start_filter = self.weight[:out_nc, :in_nc, :, :]
...@@ -269,16 +267,15 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -269,16 +267,15 @@ class SuperConv2D(fluid.dygraph.Conv2D):
target_ks = self.ks_set[i - 1] target_ks = self.ks_set[i - 1]
start, end = compute_start_end(src_ks, target_ks) start, end = compute_start_end(src_ks, target_ks)
_input_filter = start_filter[:, :, start:end, start:end] _input_filter = start_filter[:, :, start:end, start:end]
_input_filter = fluid.layers.reshape( _input_filter = paddle.reshape(
_input_filter, _input_filter,
shape=[(_input_filter.shape[0] * _input_filter.shape[1]), shape=[(_input_filter.shape[0] * _input_filter.shape[1]),
-1]) -1])
core.ops.matmul(_input_filter, _input_filter = paddle.matmul(
self.__getattr__('%dto%d_matrix' % _input_filter,
(src_ks, target_ks)), self.__getattr__('%dto%d_matrix' %
_input_filter, 'transpose_X', False, (src_ks, target_ks)), False, False)
'transpose_Y', False, "alpha", 1) _input_filter = paddle.reshape(
_input_filter = fluid.layers.reshape(
_input_filter, _input_filter,
shape=[ shape=[
filters.shape[0], filters.shape[1], target_ks, target_ks filters.shape[0], filters.shape[1], target_ks, target_ks
...@@ -288,14 +285,33 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -288,14 +285,33 @@ class SuperConv2D(fluid.dygraph.Conv2D):
return filters return filters
def get_groups_in_out_nc(self, in_nc, out_nc): def get_groups_in_out_nc(self, in_nc, out_nc):
### standard conv if self._groups == 1:
return self._groups, in_nc, out_nc ### standard conv
return self._groups, in_nc, out_nc
elif self._groups == self._in_channels:
### depthwise convolution
if in_nc != out_nc:
_logger.debug(
"input channel and output channel in depthwise conv is different, change output channel to input channel! origin channel:(in_nc {}, out_nc {}): ".
format(in_nc, out_nc))
groups = in_nc
out_nc = in_nc
return groups, in_nc, out_nc
else:
### groups convolution
### conv: weight: (Cout, Cin/G, Kh, Kw)
groups = self._groups
in_nc = int(in_nc // groups)
return groups, in_nc, out_nc
def forward(self, input, kernel_size=None, expand_ratio=None, channel=None): def forward(self, input, kernel_size=None, expand_ratio=None, channel=None):
"""
if not in_dygraph_mode(): Parameters:
_logger.error("NOT support static graph") input(Tensor): Input tensor.
kernel_size(int, optional): the kernel size of the filter in actual calculation. Default: None.
expand_ratio(int|float, optional): the expansion ratio of filter's channel number in actual calculation. Default: None.
channel(int, optional): the expansion ratio of filter's channel number in actual calculation. Default: None.
"""
self.cur_config = { self.cur_config = {
'kernel_size': kernel_size, 'kernel_size': kernel_size,
'expand_ratio': expand_ratio, 'expand_ratio': expand_ratio,
...@@ -310,8 +326,8 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -310,8 +326,8 @@ class SuperConv2D(fluid.dygraph.Conv2D):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self._num_filters out_nc = self._out_channels
ks = int(self._filter_size[0]) if kernel_size == None else int( ks = int(self._kernel_size[0]) if kernel_size == None else int(
kernel_size) kernel_size)
groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc, groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc,
...@@ -324,28 +340,21 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -324,28 +340,21 @@ class SuperConv2D(fluid.dygraph.Conv2D):
else: else:
padding = self._padding padding = self._padding
if self._l_type == 'conv2d':
attrs = ('strides', self._stride, 'paddings', padding, 'dilations',
self._dilation, 'groups', groups
if groups else 1, 'use_cudnn', self._use_cudnn)
out = core.ops.conv2d(input, weight, *attrs)
elif self._l_type == 'depthwise_conv2d':
attrs = ('strides', self._stride, 'paddings', padding, 'dilations',
self._dilation, 'groups', groups
if groups else self._groups, 'use_cudnn', self._use_cudnn)
out = core.ops.depthwise_conv2d(input, weight, *attrs)
else:
raise ValueError("conv type error")
pre_bias = out
out_nc = int(pre_bias.shape[1])
if self.bias is not None: if self.bias is not None:
bias = self.bias[:out_nc] bias = self.bias[:out_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
pre_act = pre_bias bias = self.bias
return dygraph_utils._append_activation_in_dygraph(pre_act, self._act) out = F.conv2d(
input,
weight,
bias=bias,
stride=self._stride,
padding=padding,
dilation=self._dilation,
groups=self._groups,
data_format=self._data_format)
return out
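### A usage sketch (not from the patch above) for the Paddle 2.0 based SuperConv2D defined here,
### assuming it is exposed as paddleslim.nas.ofa.layers.SuperConv2D: the forward arguments pick a
### (transformed) sub-kernel and a subset of the output filters at run time.
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperConv2D

x = paddle.to_tensor(np.random.uniform(-1, 1, [2, 3, 32, 32]).astype('float32'))
conv = SuperConv2D(
    3, 16, 7,
    candidate_config={'kernel_size': (3, 5, 7), 'channel': (4, 8, 16)},
    transform_kernel=True)
y = conv(x, kernel_size=5, channel=8)   # 5x5 transformed kernel, first 8 output filters
print(y.shape)                          # [2, 8, 32, 32]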
class SuperGroupConv2D(SuperConv2D): class SuperGroupConv2D(SuperConv2D):
@@ -369,15 +378,10 @@ class SuperDepthwiseConv2D(SuperConv2D):
return groups, in_nc, out_nc return groups, in_nc, out_nc
class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose): class SuperConv2DTranspose(nn.Conv2DTranspose):
""" """
This interface is used to construct a callable object of the ``SuperConv2DTranspose`` This interface is used to construct a callable object of the ``SuperConv2DTranspose``
class. class.
The difference between ```SuperConv2DTranspose``` and ```Conv2DTranspose``` is:
```SuperConv2DTranspose``` needs to be fed a config dictionary in the format of
{'channel': num_of_channel}, which gives the number of output channels and is used to change
the first dimension of the weight and bias, so that only the first channels of the weight
and bias are trained.
Note: the channel in config needs to be less than the channel first defined.
@@ -471,53 +475,55 @@ class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
None None
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle
from paddleslim.core.layers import SuperConv2DTranspose
import numpy as np import numpy as np
with fluid.dygraph.guard(): from paddleslim.nas.ofa.layers import SuperConv2DTranspose
data = np.random.random((3, 32, 32, 5)).astype('float32') data = np.random.random((3, 32, 32, 5)).astype('float32')
config = {'channel': 5 config = {'channel': 5}
super_convtranspose = SuperConv2DTranspose(num_channels=32, num_filters=10, filter_size=3) super_convtranspose = SuperConv2DTranspose(num_channels=32, num_filters=10, filter_size=3)
ret = super_convtranspose(fluid.dygraph.base.to_variable(data), config) ret = super_convtranspose(paddle.to_variable(data), config)
""" """
def __init__(self, def __init__(self,
num_channels, in_channels,
num_filters, out_channels,
filter_size, kernel_size,
output_size=None,
candidate_config={}, candidate_config={},
transform_kernel=False, transform_kernel=False,
stride=1, stride=1,
dilation=1,
padding=0, padding=0,
groups=None, output_padding=0,
param_attr=None, dilation=1,
groups=1,
weight_attr=None,
bias_attr=None, bias_attr=None,
use_cudnn=True, data_format="NCHW"):
act=None,
dtype='float32'):
super(SuperConv2DTranspose, self).__init__( super(SuperConv2DTranspose, self).__init__(
num_channels, num_filters, filter_size, output_size, padding, in_channels,
stride, dilation, groups, param_attr, bias_attr, use_cudnn, act, out_channels,
dtype) kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
output_padding=output_padding,
groups=groups,
weight_attr=weight_attr,
bias_attr=bias_attr,
data_format=data_format)
self.candidate_config = candidate_config self.candidate_config = candidate_config
if len(self.candidate_config.items()) != 0: if len(self.candidate_config.items()) != 0:
for k, v in candidate_config.items(): for k, v in candidate_config.items():
candidate_config[k] = list(set(v)) candidate_config[k] = list(set(v))
self.ks_set = candidate_config[ self.ks_set = candidate_config[
'kernel_size'] if 'kernel_size' in candidate_config else None 'kernel_size'] if 'kernel_size' in candidate_config else None
if isinstance(self._filter_size, int):
self._filter_size = convert_to_list(self._filter_size, 2)
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.channel = candidate_config[ self.channel = candidate_config[
'channel'] if 'channel' in candidate_config else None 'channel'] if 'channel' in candidate_config else None
self.base_channel = self._num_filters self.base_channel = self._out_channels
if self.expand_ratio: if self.expand_ratio:
self.base_channel = int(self._num_filters / max(self.expand_ratio)) self.base_channel = int(self._out_channels / max(self.expand_ratio))
self.transform_kernel = transform_kernel self.transform_kernel = transform_kernel
if self.ks_set != None: if self.ks_set != None:
@@ -531,10 +537,9 @@ class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
param_name = '%dto%d_matrix' % (ks_large, ks_small) param_name = '%dto%d_matrix' % (ks_large, ks_small)
ks_t = ks_small**2 ks_t = ks_small**2
scale_param[param_name] = self.create_parameter( scale_param[param_name] = self.create_parameter(
attr=fluid.ParamAttr( attr=paddle.ParamAttr(
name=self._full_name + param_name, name=self._full_name + param_name,
initializer=fluid.initializer.NumpyArrayInitializer( initializer=nn.initializer.Assign(np.eye(ks_t))),
np.eye(ks_t))),
shape=(ks_t, ks_t), shape=(ks_t, ks_t),
dtype=self._dtype) dtype=self._dtype)
@@ -542,9 +547,9 @@ class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
setattr(self, name, param) setattr(self, name, param)
def get_active_filter(self, in_nc, out_nc, kernel_size): def get_active_filter(self, in_nc, out_nc, kernel_size):
start, end = compute_start_end(self._filter_size[0], kernel_size) start, end = compute_start_end(self._kernel_size[0], kernel_size)
filters = self.weight[:in_nc, :out_nc, start:end, start:end] filters = self.weight[:in_nc, :out_nc, start:end, start:end]
if self.transform_kernel != False and kernel_size < self._filter_size[ if self.transform_kernel != False and kernel_size < self._kernel_size[
0]: 0]:
start_filter = self.weight[:in_nc, :out_nc, :, :] start_filter = self.weight[:in_nc, :out_nc, :, :]
for i in range(len(self.ks_set) - 1, 0, -1): for i in range(len(self.ks_set) - 1, 0, -1):
@@ -554,16 +559,15 @@ class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
target_ks = self.ks_set[i - 1] target_ks = self.ks_set[i - 1]
start, end = compute_start_end(src_ks, target_ks) start, end = compute_start_end(src_ks, target_ks)
_input_filter = start_filter[:, :, start:end, start:end] _input_filter = start_filter[:, :, start:end, start:end]
_input_filter = fluid.layers.reshape( _input_filter = paddle.reshape(
_input_filter, _input_filter,
shape=[(_input_filter.shape[0] * _input_filter.shape[1]), shape=[(_input_filter.shape[0] * _input_filter.shape[1]),
-1]) -1])
core.ops.matmul(_input_filter, _input_filter = paddle.matmul(
self.__getattr__('%dto%d_matrix' % _input_filter,
(src_ks, target_ks)), self.__getattr__('%dto%d_matrix' %
_input_filter, 'transpose_X', False, (src_ks, target_ks)), False, False)
'transpose_Y', False, "alpha", 1) _input_filter = paddle.reshape(
_input_filter = fluid.layers.reshape(
_input_filter, _input_filter,
shape=[ shape=[
filters.shape[0], filters.shape[1], target_ks, target_ks filters.shape[0], filters.shape[1], target_ks, target_ks
@@ -573,13 +577,39 @@ class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
return filters return filters
def get_groups_in_out_nc(self, in_nc, out_nc): def get_groups_in_out_nc(self, in_nc, out_nc):
### standard conv if self._groups == 1:
return self._groups, in_nc, out_nc ### standard conv
return self._groups, in_nc, out_nc
def forward(self, input, kernel_size=None, expand_ratio=None, channel=None): elif self._groups == self._in_channels:
if not in_dygraph_mode(): ### depthwise convolution
_logger.error("NOT support static graph") if in_nc != out_nc:
_logger.debug(
"input channel and output channel in depthwise conv is different, change output channel to input channel! origin channel:(in_nc {}, out_nc {}): ".
format(in_nc, out_nc))
groups = in_nc
out_nc = in_nc
return groups, in_nc, out_nc
else:
### groups convolution
### groups conv transpose: weight: (Cin, Cout/G, Kh, Kw)
groups = self._groups
out_nc = int(out_nc // groups)
return groups, in_nc, out_nc
def forward(self,
input,
output_size=None,
kernel_size=None,
expand_ratio=None,
channel=None):
"""
Parameters:
input(Tensor): input tensor.
output_size(int, optional): the size of the feature map after transpose convolution. Default: None.
kernel_size(int, optional): the kernel size of the filter in actual calculation. Default: None.
expand_ratio(int|float, optional): the expansion ratio of filter's channel number in actual calculation. Default: None.
channel(int, optional): the number of output channels used in the actual calculation. Default: None.
"""
self.cur_config = { self.cur_config = {
'kernel_size': kernel_size, 'kernel_size': kernel_size,
'expand_ratio': expand_ratio, 'expand_ratio': expand_ratio,
@@ -594,34 +624,43 @@ class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self._num_filters out_nc = self._out_channels
ks = int(self._filter_size[0]) if kernel_size == None else int( ks = int(self._kernel_size[0]) if kernel_size == None else int(
kernel_size) kernel_size)
groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc, groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc,
out_nc) out_nc)
weight = self.get_active_filter(weight_in_nc, weight_out_nc, ks) weight = self.get_active_filter(weight_in_nc, weight_out_nc, ks)
if kernel_size != None or 'kernel_size' in self.candidate_config.keys(): if kernel_size != None or 'kernel_size' in self.candidate_config.keys():
padding = convert_to_list(get_same_padding(ks), 2) padding = convert_to_list(get_same_padding(ks), 2)
else: else:
padding = self._padding padding = self._padding
op = getattr(core.ops, self._op_type) if output_size is None:
out = op(input, weight, 'output_size', self._output_size, 'strides', output_padding = self.output_padding
self._stride, 'paddings', padding, 'dilations', self._dilation, else:
'groups', groups, 'use_cudnn', self._use_cudnn) output_padding = 0
pre_bias = out
out_nc = int(pre_bias.shape[1])
if self.bias is not None: if self.bias is not None:
bias = self.bias[:out_nc] bias = self.bias[:out_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
pre_act = pre_bias bias = self.bias
return dygraph_utils._append_activation_in_dygraph( out = F.conv2d_transpose(
pre_act, act=self._act) input,
weight,
bias=bias,
padding=padding,
output_padding=output_padding,
stride=self._stride,
dilation=self._dilation,
groups=self._groups,
output_size=output_size,
data_format=self._data_format)
return out
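### A usage sketch (not from the patch above) for the Paddle 2.0 based SuperConv2DTranspose with
### an 'expand_ratio' search space; base_channel = out_channels / max(expand_ratio), and the ratio
### passed to forward decides how many output filters are actually used.
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperConv2DTranspose

x = paddle.to_tensor(np.random.random([2, 8, 16, 16]).astype('float32'))
deconv = SuperConv2DTranspose(8, 16, 3, candidate_config={'expand_ratio': (0.5, 1.0)})
y = deconv(x, expand_ratio=0.5)   # int(0.5 * 16) = 8 output channels
print(y.shape)                    # [2, 8, 18, 18]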
class SuperGroupConv2DTranspose(SuperConv2DTranspose): class SuperGroupConv2DTranspose(SuperConv2DTranspose):
@@ -645,7 +684,7 @@ class SuperDepthwiseConv2DTranspose(SuperConv2DTranspose):
### NOTE: only search channel, write for GAN-compression, maybe change to SuperDepthwiseConv and SuperConv after. ### NOTE: only search channel, write for GAN-compression, maybe change to SuperDepthwiseConv and SuperConv after.
class SuperSeparableConv2D(fluid.dygraph.Layer): class SuperSeparableConv2D(nn.Layer):
""" """
This interface is used to construct a callable object of the ``SuperSeparableConv2D`` This interface is used to construct a callable object of the ``SuperSeparableConv2D``
class. class.
@@ -655,8 +694,8 @@ class SuperSeparableConv2D(fluid.dygraph.Layer):
the second conv's inputs, used to change the first dimension of weight and bias, the second conv's inputs, used to change the first dimension of weight and bias,
only train the first channels of the weight and bias. only train the first channels of the weight and bias.
The architecture of super separable convolution2D op is [Conv2D, norm layer(may be BatchNorm The architecture of super separable convolution2D op is [Conv2D, norm layer(may be BatchNorm2D
or InstanceNorm), Conv2D]. The first conv is depthwise conv, the filter number is input channel or InstanceNorm2D), Conv2D]. The first conv is depthwise conv, the filter number is input channel
multiply scale_factor, the group is equal to the number of input channel. The second conv multiply scale_factor, the group is equal to the number of input channel. The second conv
is standard conv, which filter size and stride size are 1. is standard conv, which filter size and stride size are 1.
@@ -676,68 +715,66 @@ class SuperSeparableConv2D(fluid.dygraph.Layer):
dilation(int or tuple, optional): The first conv's dilation size. If dilation is a tuple, dilation(int or tuple, optional): The first conv's dilation size. If dilation is a tuple,
it must contain two integers, (dilation_H, dilation_W). Otherwise, the it must contain two integers, (dilation_H, dilation_W). Otherwise, the
dilation_H = dilation_W = dilation. Default: 1. dilation_H = dilation_W = dilation. Default: 1.
norm_layer(class): The normalization layer between two convolution. Default: InstanceNorm. norm_layer(class): The normalization layer between two convolution. Default: InstanceNorm2D.
bias_attr (ParamAttr or bool, optional): The attribute for the bias of convolution. bias_attr (ParamAttr or bool, optional): The attribute for the bias of convolution.
If it is set to False, no bias will be added to the output units. If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, convolution If it is set to None or one attribute of ParamAttr, convolution
will create ParamAttr as bias_attr. If the Initializer of the bias_attr will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None. is not set, the bias is initialized zero. Default: None.
scale_factor(float): The scale factor of the first conv's output channel. Default: 1. scale_factor(float): The scale factor of the first conv's output channel. Default: 1.
use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True.
Returns: Returns:
None None
""" """
def __init__(self, def __init__(self,
num_channels, in_channels,
num_filters, out_channels,
filter_size, kernel_size,
candidate_config={}, candidate_config={},
stride=1, stride=1,
padding=0, padding=0,
dilation=1, dilation=1,
norm_layer=InstanceNorm, norm_layer=nn.InstanceNorm2D,
bias_attr=None, bias_attr=None,
scale_factor=1, scale_factor=1):
use_cudnn=False):
super(SuperSeparableConv2D, self).__init__() super(SuperSeparableConv2D, self).__init__()
self.conv = fluid.dygraph.LayerList([ self.conv = nn.LayerList([
fluid.dygraph.nn.Conv2D( nn.Conv2D(
num_channels=num_channels, in_channels=in_channels,
num_filters=num_channels * scale_factor, out_channels=in_channels * scale_factor,
filter_size=filter_size, kernel_size=kernel_size,
stride=stride, stride=stride,
padding=padding, padding=padding,
use_cudnn=False, groups=in_channels,
groups=num_channels,
bias_attr=bias_attr) bias_attr=bias_attr)
]) ])
self.conv.extend([norm_layer(num_channels * scale_factor)]) self.conv.extend([norm_layer(in_channels * scale_factor)])
self.conv.extend([ self.conv.extend([
fluid.dygraph.nn.Conv2D( nn.Conv2D(
num_channels=num_channels * scale_factor, in_channels=in_channels * scale_factor,
num_filters=num_filters, out_channels=out_channels,
filter_size=1, kernel_size=1,
stride=1, stride=1,
use_cudnn=use_cudnn,
bias_attr=bias_attr) bias_attr=bias_attr)
]) ])
self.candidate_config = candidate_config self.candidate_config = candidate_config
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.base_output_dim = self.conv[0]._num_filters self.base_output_dim = self.conv[0]._out_channels
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_output_dim = int(self.conv[0]._num_filters / self.base_output_dim = int(self.conv[0]._out_channels /
max(self.expand_ratio)) max(self.expand_ratio))
def forward(self, input, expand_ratio=None, channel=None): def forward(self, input, expand_ratio=None, channel=None):
if not in_dygraph_mode(): """
_logger.error("NOT support static graph") Parameters:
input(Tensor): input tensor.
expand_ratio(int|float, optional): the expansion ratio of filter's channel number in actual calculation. Default: None.
channel(int, optional): the number of output channels used in the actual calculation. Default: None.
"""
self.cur_config = {'expand_ratio': expand_ratio, 'channel': channel} self.cur_config = {'expand_ratio': expand_ratio, 'channel': channel}
in_nc = int(input.shape[1]) in_nc = int(input.shape[1])
assert ( assert (
@@ -748,93 +785,127 @@ class SuperSeparableConv2D(fluid.dygraph.Layer):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self.conv[0]._num_filters out_nc = self.conv[0]._out_channels
weight = self.conv[0].weight[:in_nc] weight = self.conv[0].weight[:in_nc]
### conv1 ### conv1
if self.conv[0]._l_type == 'conv2d':
attrs = ('strides', self.conv[0]._stride, 'paddings',
self.conv[0]._padding, 'dilations', self.conv[0]._dilation,
'groups', in_nc, 'use_cudnn', self.conv[0]._use_cudnn)
out = core.ops.conv2d(input, weight, *attrs)
elif self.conv[0]._l_type == 'depthwise_conv2d':
attrs = ('strides', self.conv[0]._stride, 'paddings',
self.conv[0]._padding, 'dilations', self.conv[0]._dilation,
'groups', in_nc, 'use_cudnn', self.conv[0]._use_cudnn)
out = core.ops.depthwise_conv2d(input, weight, *attrs)
else:
raise ValueError("conv type error")
pre_bias = out
if self.conv[0].bias is not None: if self.conv[0].bias is not None:
bias = self.conv[0].bias[:in_nc] bias = self.conv[0].bias[:in_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
pre_act = pre_bias bias = self.conv[0].bias
conv0_out = dygraph_utils._append_activation_in_dygraph( conv0_out = F.conv2d(
pre_act, self.conv[0]._act) input,
weight,
bias,
stride=self.conv[0]._stride,
padding=self.conv[0]._padding,
dilation=self.conv[0]._dilation,
groups=in_nc,
data_format=self.conv[0]._data_format)
norm_out = self.conv[1](conv0_out) norm_out = self.conv[1](conv0_out)
weight = self.conv[2].weight[:out_nc, :in_nc, :, :] weight = self.conv[2].weight[:out_nc, :in_nc, :, :]
if self.conv[2]._l_type == 'conv2d':
attrs = ('strides', self.conv[2]._stride, 'paddings',
self.conv[2]._padding, 'dilations', self.conv[2]._dilation,
'groups', self.conv[2]._groups if self.conv[2]._groups else
1, 'use_cudnn', self.conv[2]._use_cudnn)
out = core.ops.conv2d(norm_out, weight, *attrs)
elif self.conv[2]._l_type == 'depthwise_conv2d':
attrs = ('strides', self.conv[2]._stride, 'paddings',
self.conv[2]._padding, 'dilations', self.conv[2]._dilation,
'groups', self.conv[2]._groups, 'use_cudnn',
self.conv[2]._use_cudnn)
out = core.ops.depthwise_conv2d(norm_out, weight, *attrs)
else:
raise ValueError("conv type error")
pre_bias = out
if self.conv[2].bias is not None: if self.conv[2].bias is not None:
bias = self.conv[2].bias[:out_nc] bias = self.conv[2].bias[:out_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
pre_act = pre_bias bias = self.conv[2].bias
conv1_out = dygraph_utils._append_activation_in_dygraph( conv1_out = F.conv2d(
pre_act, self.conv[2]._act) norm_out,
weight,
bias,
stride=self.conv[2]._stride,
padding=self.conv[2]._padding,
dilation=self.conv[2]._dilation,
groups=self.conv[2]._groups,
data_format=self.conv[2]._data_format)
return conv1_out return conv1_out
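### A usage sketch (not from the patch above) for SuperSeparableConv2D (depthwise conv ->
### InstanceNorm2D -> 1x1 conv); `channel` slices both convs, so the block emits that many
### channels, bounded by the out_channels it was built with.
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperSeparableConv2D

x = paddle.to_tensor(np.random.random([2, 6, 32, 32]).astype('float32'))
sep = SuperSeparableConv2D(6, 12, 3, padding=1, candidate_config={'channel': (3, 6, 12)})
y = sep(x, channel=6)   # the 1x1 conv keeps only its first 6 filters
print(y.shape)          # [2, 6, 32, 32]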
class SuperLinear(fluid.dygraph.Linear): class SuperLinear(nn.Linear):
""" """
Super Fully-connected linear transformation layer.
For each input :math:`X` , the equation is:
.. math::
Out = XW + b
where :math:`W` is the weight and :math:`b` is the bias.
Linear layer takes only one multi-dimensional tensor as input with the
shape :math:`[batch\_size, *, in\_features]` , where :math:`*` means any
number of additional dimensions. It multiplies input tensor with the weight
(a 2-D tensor of shape :math:`[in\_features, out\_features]` ) and produces
an output tensor of shape :math:`[batch\_size, *, out\_features]` .
If :math:`bias\_attr` is not False, the bias (a 1-D tensor of
shape :math:`[out\_features]` ) will be created and added to the output.
Parameters:
in_features (int): The number of input units.
out_features (int): The number of output units.
candidate_config(dict, optional): Dictionary that describes the candidate config of this layer,
such as {'channel': (4, 6, 8)}, the key of candidate_config
only can be 'channel' and 'expand_ratio', 'channel' and 'expand_ratio'
CANNOT be set at the same time. Default: None.
weight_attr (ParamAttr, optional): The attribute for the learnable
weight of this layer. The default value is None and the weight will be
initialized to zero. For detailed information, please refer to
paddle.ParamAttr.
bias_attr (ParamAttr|bool, optional): The attribute for the learnable bias
of this layer. If it is set to False, no bias will be added to the output.
If it is set to None or one kind of ParamAttr, a bias parameter will
be created according to ParamAttr. For detailed information, please refer
to paddle.ParamAttr. The default value is None and the bias will be
initialized to zero.
name (str, optional): Normally there is no need for user to set this parameter.
For detailed information, please refer to :ref:`api_guide_Name` .
Attribute:
**weight** (Parameter): the learnable weight of this layer.
**bias** (Parameter): the learnable bias of this layer.
Shape:
- input: Multi-dimensional tensor with shape :math:`[batch\_size, *, in\_features]` .
- output: Multi-dimensional tensor with shape :math:`[batch\_size, *, out\_features]` .
Examples:
.. code-block:: python
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperLinear
data = np.random.uniform(-1, 1, [32, 64] ).astype('float32')
config = {'channel': 16}
linear = SuperLinear(32, 64)
data = paddle.to_variable(data)
res = linear(data, **config)
""" """
def __init__(self, def __init__(self,
input_dim, in_features,
output_dim, out_features,
candidate_config={}, candidate_config={},
param_attr=None, weight_attr=None,
bias_attr=None, bias_attr=None,
act=None, name=None):
dtype="float32"): super(SuperLinear, self).__init__(in_features, out_features,
super(SuperLinear, self).__init__(input_dim, output_dim, param_attr, weight_attr, bias_attr, name)
bias_attr, act, dtype) self._weight_attr = weight_attr
self._param_attr = param_attr
self._bias_attr = bias_attr self._bias_attr = bias_attr
self.output_dim = output_dim self._in_features = in_features
self._out_features = out_features
self.candidate_config = candidate_config self.candidate_config = candidate_config
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.base_output_dim = self.output_dim self.base_output_dim = self._out_features
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_output_dim = int(self.output_dim / max(self.expand_ratio)) self.base_output_dim = int(self._out_features /
max(self.expand_ratio))
def forward(self, input, expand_ratio=None, channel=None): def forward(self, input, expand_ratio=None, channel=None):
if not in_dygraph_mode(): """
_logger.error("NOT support static graph") Parameters:
input(Tensor): input tensor.
expand_ratio(int|float, optional): the expansion ratio of filter's channel number in actual calculation. Default: None.
channel(int, optional): the number of output features used in the actual calculation. Default: None.
"""
self.cur_config = {'expand_ratio': expand_ratio, 'channel': channel} self.cur_config = {'expand_ratio': expand_ratio, 'channel': channel}
### weight: (Cin, Cout) ### weight: (Cin, Cout)
in_nc = int(input.shape[-1]) in_nc = int(input.shape[-1])
@@ -846,55 +917,65 @@ class SuperLinear(fluid.dygraph.Linear):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self.output_dim out_nc = self._out_features
weight = self.weight[:in_nc, :out_nc] weight = self.weight[:in_nc, :out_nc]
if self._bias_attr != False: if self._bias_attr != False:
bias = self.bias[:out_nc] bias = self.bias[:out_nc]
use_bias = True
pre_bias = _varbase_creator(dtype=input.dtype)
core.ops.matmul(input, weight, pre_bias, 'transpose_X', False,
'transpose_Y', False, "alpha", 1)
if self._bias_attr != False:
pre_act = dygraph_utils._append_bias_in_dygraph(
pre_bias, bias, axis=len(input.shape) - 1)
else: else:
pre_act = pre_bias bias = self.bias
return dygraph_utils._append_activation_in_dygraph(pre_act, self._act) out = F.linear(x=input, weight=weight, bias=bias, name=self.name)
return out
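### A usage sketch (not from the patch above) for SuperLinear with an 'expand_ratio' space:
### the layer is built with the largest out_features, and a smaller ratio keeps only the first
### int(ratio * base_output_dim) columns of the weight (and entries of the bias).
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperLinear

x = paddle.to_tensor(np.random.uniform(-1, 1, [4, 64]).astype('float32'))
fc = SuperLinear(64, 128, candidate_config={'expand_ratio': (0.25, 0.5, 1.0)})
y = fc(x, expand_ratio=0.5)   # int(0.5 * 128) = 64 output features
print(y.shape)                # [4, 64]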
class SuperBatchNorm(fluid.dygraph.BatchNorm): class SuperBatchNorm2D(nn.BatchNorm2D):
""" """
add comment This interface is used to construct a callable object of the ``SuperBatchNorm2D`` class.
Parameters:
num_features(int): Indicate the number of channels of the input ``Tensor``.
epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable.
If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable.
If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
Examples:
.. code-block:: python
import paddle
import numpy as np
from paddleslim.nas.ofa.layers import SuperBatchNorm2D
np.random.seed(123)
x_data = np.random.random(size=(2, 5, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
batch_norm = SuperBatchNorm2D(5)
batch_norm_out = batch_norm(x)
""" """
def __init__(self, def __init__(self,
num_channels, num_features,
act=None,
is_test=False,
momentum=0.9, momentum=0.9,
epsilon=1e-05, epsilon=1e-05,
param_attr=None, weight_attr=None,
bias_attr=None, bias_attr=None,
dtype='float32', data_format='NCHW',
data_layout='NCHW', name=None):
in_place=False, super(SuperBatchNorm2D, self).__init__(num_features, momentum, epsilon,
moving_mean_name=None, weight_attr, bias_attr,
moving_variance_name=None, data_format, name)
do_model_average_for_mean_and_var=True,
use_global_stats=False,
trainable_statistics=False):
super(SuperBatchNorm, self).__init__(
num_channels, act, is_test, momentum, epsilon, param_attr,
bias_attr, dtype, data_layout, in_place, moving_mean_name,
moving_variance_name, do_model_average_for_mean_and_var,
use_global_stats, trainable_statistics)
def forward(self, input): def forward(self, input):
if not in_dygraph_mode(): self._check_data_format(self._data_format)
_logger.error("NOT support static graph") self._check_input_dim(input)
feature_dim = int(input.shape[1]) feature_dim = int(input.shape[1])
@@ -903,108 +984,217 @@ class SuperBatchNorm(fluid.dygraph.BatchNorm):
mean = self._mean[:feature_dim] mean = self._mean[:feature_dim]
variance = self._variance[:feature_dim] variance = self._variance[:feature_dim]
mean_out = mean return F.batch_norm(
variance_out = variance input,
mean,
attrs = ("momentum", self._momentum, "epsilon", self._epsilon, variance,
"is_test", not self.training, "data_layout", self._data_layout, weight=weight,
"use_mkldnn", False, "fuse_with_relu", self._fuse_with_relu, bias=bias,
"use_global_stats", self._use_global_stats, training=self.training,
'trainable_statistics', self._trainable_statistics) momentum=self._momentum,
batch_norm_out, _, _, _, _, _ = core.ops.batch_norm( epsilon=self._epsilon,
input, weight, bias, mean, variance, mean_out, variance_out, *attrs) data_format=self._data_format)
return dygraph_utils._append_activation_in_dygraph(
batch_norm_out, act=self._act)
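### A usage sketch (not from the patch above) for SuperBatchNorm2D: a layer built for 10
### channels also accepts the 6-channel output of a sliced conv, using only the first 6
### entries of its scale, shift and running statistics.
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperBatchNorm2D

bn = SuperBatchNorm2D(10)
x = paddle.to_tensor(np.random.random([2, 6, 8, 8]).astype('float32'))
y = bn(x)
print(y.shape)   # [2, 6, 8, 8]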
class SuperInstanceNorm(fluid.dygraph.InstanceNorm): class SuperInstanceNorm2D(nn.InstanceNorm2D):
""" """
This interface is used to construct a callable object of the ``SuperInstanceNorm2D`` class.
Parameters:
num_features(int): Indicate the number of channels of the input ``Tensor``.
epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable.
If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable.
If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
Examples:
.. code-block:: python
import paddle
import numpy as np
from paddleslim.nas.ofa.layers import SuperInstanceNorm2D
np.random.seed(123)
x_data = np.random.random(size=(2, 5, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
instance_norm = SuperInstanceNorm2D(5)
out = instance_norm(x)
""" """
def __init__(self, def __init__(self,
num_channels, num_features,
epsilon=1e-05, epsilon=1e-05,
param_attr=None, momentum=0.9,
weight_attr=None,
bias_attr=None, bias_attr=None,
dtype='float32'): data_format='NCHW',
super(SuperInstanceNorm, self).__init__(num_channels, epsilon, name=None):
param_attr, bias_attr, dtype) super(SuperInstanceNorm2D, self).__init__(num_features, epsilon,
momentum, weight_attr,
bias_attr, data_format, name)
def forward(self, input): def forward(self, input):
if not in_dygraph_mode(): self._check_input_dim(input)
_logger.error("NOT support static graph")
feature_dim = int(input.shape[1]) feature_dim = int(input.shape[1])
if self._weight_attr == False and self._bias_attr == False:
if self._param_attr == False and self._bias_attr == False:
scale = None scale = None
bias = None bias = None
else: else:
scale = self.scale[:feature_dim] scale = self.scale[:feature_dim]
bias = self.bias[:feature_dim] bias = self.bias[:feature_dim]
out, _, _ = core.ops.instance_norm(input, scale, bias, 'epsilon', return F.instance_norm(input, scale, bias, eps=self._epsilon)
self._epsilon)
return out
class SuperLayerNorm(nn.LayerNorm):
"""
This interface is used to construct a callable object of the ``SuperLayerNorm`` class.
The difference between ```SuperLayerNorm``` and ```LayerNorm``` is:
the trained weight and bias in ```SuperLayerNorm``` can be sliced according to the shape of the input,
so that only the first channels of the weight and bias are trained.
Parameters:
normalized_shape(int|list|tuple): Input shape from an expected input of
size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`.
If it is a single integer, this module will normalize over the last dimension
which is expected to be of that specific size.
epsilon(float, optional): The small value added to the variance to prevent
division by zero. Default: 1e-05.
weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
gain :math:`g`. If it is False, weight is None. If it is None, a default :code:`ParamAttr` would be added as scale. The
:attr:`weight_attr` is initialized as 1 if it is added. Default: None.
bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
bias :math:`b`. If it is False, bias is None. If it is None, a default :code:`ParamAttr` would be added as bias. The
:attr:`bias_attr` is initialized as 0 if it is added. Default: None.
name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
Shape:
- x: 2-D, 3-D, 4-D or 5-D tensor.
- output: same shape as input x.
Returns:
None
Examples:
.. code-block:: python
import paddle
import numpy as np
from paddleslim.nas.ofa.layers import SuperLayerNorm
np.random.seed(123)
x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
layer_norm = SuperLayerNorm(x_data.shape[1:])
layer_norm_out = layer_norm(x)
"""
class SuperLayerNorm(fluid.dygraph.LayerNorm):
def __init__(self, def __init__(self,
normalized_shape, normalized_shape,
candidate_config={},
scale=True,
shift=True,
epsilon=1e-05, epsilon=1e-05,
param_attr=None, weight_attr=None,
bias_attr=None, bias_attr=None,
act=None, name=None):
dtype='float32'): super(SuperLayerNorm, self).__init__(normalized_shape, epsilon,
super(SuperLayerNorm, weight_attr, bias_attr, name)
self).__init__(normalized_shape, scale, shift, epsilon,
param_attr, bias_attr, act, dtype)
def forward(self, input): def forward(self, input):
if not in_dygraph_mode():
_logger.error("NOT support static graph")
input_shape = list(input.shape)
input_ndim = len(input_shape)
normalized_ndim = len(self._normalized_shape)
self._begin_norm_axis = input_ndim - normalized_ndim
### TODO(ceci3): fix if normalized_shape is not a single number ### TODO(ceci3): fix if normalized_shape is not a single number
input_ndim = len(list(input.shape))
normalized_ndim = len(self._normalized_shape)
begin_norm_axis = input_ndim - normalized_ndim
feature_dim = int(input.shape[-1]) feature_dim = int(input.shape[-1])
weight = self.weight[:feature_dim] if self._weight_attr != False:
bias = self.bias[:feature_dim] weight = self.weight[:feature_dim]
pre_act, _, _ = core.ops.layer_norm(input, weight, bias, 'epsilon', else:
self._epsilon, 'begin_norm_axis', weight = None
self._begin_norm_axis) if self._bias_attr != False:
return dygraph_utils._append_activation_in_dygraph( bias = self.bias[:feature_dim]
pre_act, act=self._act) else:
bias = None
out, _, _ = core.ops.layer_norm(input, weight, bias, 'epsilon',
self._epsilon, 'begin_norm_axis',
begin_norm_axis)
return out
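### A usage sketch (not from the patch above) for SuperLayerNorm, assuming a single-int
### normalized_shape (the TODO above notes the multi-dim case is not handled yet): weight
### and bias are sliced to the last dimension of the incoming tensor.
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperLayerNorm

ln = SuperLayerNorm(128)
x = paddle.to_tensor(np.random.random([4, 10, 64]).astype('float32'))
y = ln(x)
print(y.shape)   # [4, 10, 64]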
class SuperEmbedding(fluid.dygraph.Embedding): class SuperEmbedding(nn.Embedding):
"""
This interface is used to construct a callable object of the ``SuperEmbedding`` class.
Parameters:
num_embeddings (int): Just one element which indicates the size
of the dictionary of embeddings.
embedding_dim (int): Just one element which indicates the size of each embedding vector.
padding_idx(int|long|None): padding_idx needs to be in the interval [-num_embeddings, num_embeddings).
If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted
to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup
encounters :math:`padding\_idx` in id. And the padding data will not be updated while training.
If set None, it makes no effect to output. Default: None.
sparse(bool): The flag indicating whether to use sparse update. This parameter only
affects the performance of the backwards gradient update. It is recommended to set
True because sparse update is faster. But some optimizers do not support sparse update,
such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` ,
:ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` ,
:ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` .
In these cases, sparse must be False. Default: False.
weight_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the
default weight parameter property is used. See usage for details in :ref:`api_ParamAttr` . In addition,
user-defined or pre-trained word vectors can be loaded with the :attr:`weight_attr` parameter.
The local word vector needs to be transformed into numpy format, and the shape of local word
vector should be consistent with :attr:`num_embeddings` . Then :ref:`api_initializer_NumpyArrayInitializer`
is used to load custom or pre-trained word vectors. See code example for details.
name(str|None): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Attribute:
**weight** (Parameter): the learnable weights of this layer.
Returns:
None
Examples:
.. code-block:: python
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperEmbedding
data = np.random.uniform(-1, 1, [32, 64]).astype('float32')
config = {'channel': 16}
emb = SuperEmbedding(32, 64)
data = paddle.to_variable(data)
res = emb(data, **config)
"""
def __init__(self, def __init__(self,
size, num_embeddings,
embedding_dim,
candidate_config={}, candidate_config={},
is_sparse=False,
is_distributed=False,
padding_idx=None, padding_idx=None,
param_attr=None, sparse=False,
dtype='float32'): weight_attr=None,
super(SuperEmbedding, self).__init__(size, is_sparse, is_distributed, name=None):
padding_idx, param_attr, dtype) super(SuperEmbedding, self).__init__(num_embeddings, embedding_dim,
padding_idx, sparse, weight_attr,
name)
self.candidate_config = candidate_config self.candidate_config = candidate_config
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.base_output_dim = self._size[-1] self.base_output_dim = self._embedding_dim
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_output_dim = int(self._size[-1] / max(self.expand_ratio)) self.base_output_dim = int(self._embedding_dim /
max(self.expand_ratio))
def forward(self, input, expand_ratio=None, channel=None): def forward(self, input, expand_ratio=None, channel=None):
if not in_dygraph_mode(): """
_logger.error("NOT support static graph") Parameters:
input(Tensor): input tensor.
expand_ratio(int|float, optional): the expansion ratio of filter's channel number in actual calculation. Default: None.
channel(int, optional): the embedding dimension used in the actual calculation. Default: None.
"""
assert ( assert (
expand_ratio == None or channel == None expand_ratio == None or channel == None
), "expand_ratio and channel CANNOT be NOT None at the same time." ), "expand_ratio and channel CANNOT be NOT None at the same time."
...@@ -1013,10 +1203,12 @@ class SuperEmbedding(fluid.dygraph.Embedding): ...@@ -1013,10 +1203,12 @@ class SuperEmbedding(fluid.dygraph.Embedding):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self._size[-1] out_nc = self._embedding_dim
weight = self.weight[:, :out_nc] weight = self.weight[:, :out_nc]
return core.ops.lookup_table_v2( return F.embedding(
weight, input, 'is_sparse', self._is_sparse, 'is_distributed', input,
self._is_distributed, 'remote_prefetch', self._remote_prefetch, weight=weight,
'padding_idx', self._padding_idx) padding_idx=self._padding_idx,
sparse=self._sparse,
name=self._name)
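### A usage sketch (not from the patch above) for SuperEmbedding: with an 'expand_ratio'
### space, a smaller ratio keeps only the first int(ratio * embedding_dim) columns of the
### embedding table.
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperEmbedding

ids = paddle.to_tensor(np.random.randint(0, 100, [4, 16]).astype('int64'))
emb = SuperEmbedding(100, 64, candidate_config={'expand_ratio': (0.5, 1.0)})
y = emb(ids, expand_ratio=0.5)   # embedding dim becomes int(0.5 * 64) = 32
print(y.shape)                   # [4, 16, 32]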
@@ -12,19 +12,23 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
### NOTE: the API of this file is based on Paddle1.8, the API in layers.py is based on Paddle2.0
import numpy as np import numpy as np
import logging import logging
import paddle import paddle.fluid as fluid
import paddle.nn as nn
import paddle.nn.functional as F
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid.dygraph_utils as dygraph_utils
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import _varbase_creator
from paddle.fluid.dygraph.nn import InstanceNorm, Conv2D, Conv2DTranspose, BatchNorm
from ...common import get_logger from ...common import get_logger
from .utils.utils import compute_start_end, get_same_padding, convert_to_list from .utils.utils import compute_start_end, get_same_padding, convert_to_list
__all__ = [ __all__ = [
'SuperConv2D', 'SuperConv2DTranspose', 'SuperSeparableConv2D', 'SuperConv2D', 'SuperConv2DTranspose', 'SuperSeparableConv2D',
'SuperBatchNorm2D', 'SuperLinear', 'SuperInstanceNorm2D', 'Block', 'SuperBatchNorm', 'SuperLinear', 'SuperInstanceNorm', 'Block',
'SuperGroupConv2D', 'SuperDepthwiseConv2D', 'SuperGroupConv2DTranspose', 'SuperGroupConv2D', 'SuperDepthwiseConv2D', 'SuperGroupConv2DTranspose',
'SuperDepthwiseConv2DTranspose', 'SuperLayerNorm', 'SuperEmbedding' 'SuperDepthwiseConv2DTranspose', 'SuperLayerNorm', 'SuperEmbedding'
] ]
@@ -42,7 +46,7 @@ def counter():
return _cnt return _cnt
class BaseBlock(paddle.nn.Layer): class BaseBlock(fluid.dygraph.Layer):
def __init__(self, key=None): def __init__(self, key=None):
super(BaseBlock, self).__init__() super(BaseBlock, self).__init__()
if key is not None: if key is not None:
@@ -79,7 +83,7 @@ class Block(BaseBlock):
return out return out
class SuperConv2D(nn.Conv2D): class SuperConv2D(fluid.dygraph.Conv2D):
""" """
This interface is used to construct a callable object of the ``SuperConv2D`` class. This interface is used to construct a callable object of the ``SuperConv2D`` class.
The difference between ```SuperConv2D``` and ```Conv2D``` is: ```SuperConv2D``` need The difference between ```SuperConv2D``` and ```Conv2D``` is: ```SuperConv2D``` need
@@ -177,44 +181,42 @@ class SuperConv2D(nn.Conv2D):
ValueError: if ``use_cudnn`` is not a bool value. ValueError: if ``use_cudnn`` is not a bool value.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle from paddle.fluid.dygraph.base import to_variable
from paddleslim.nas.ofa.layers import SuperConv2D import paddle.fluid as fluid
from paddleslim.core.layers import SuperConv2D
import numpy as np import numpy as np
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
super_conv2d = SuperConv2D(3, 10, 3) with fluid.dygraph.guard():
config = {'channel': 5} super_conv2d = SuperConv2D(3, 10, 3)
data = paddle.to_variable(data) config = {'channel': 5}
conv = super_conv2d(data, config) data = to_variable(data)
conv = super_conv2d(data, config)
""" """
### NOTE: filter_size, num_channels and num_filters must be the max of candidate to define a largest network. ### NOTE: filter_size, num_channels and num_filters must be the max of candidate to define a largest network.
def __init__(self, def __init__(self,
in_channels, num_channels,
out_channels, num_filters,
kernel_size, filter_size,
candidate_config={}, candidate_config={},
transform_kernel=False, transform_kernel=False,
stride=1, stride=1,
padding=0,
dilation=1, dilation=1,
groups=1, padding=0,
padding_mode='zeros', groups=None,
weight_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
data_format='NCHW'): use_cudnn=True,
act=None,
dtype='float32'):
### NOTE: padding always is 0, add padding in forward because of kernel size is uncertain
super(SuperConv2D, self).__init__( super(SuperConv2D, self).__init__(
in_channels, num_channels, num_filters, filter_size, stride, padding, dilation,
out_channels, groups, param_attr, bias_attr, use_cudnn, act, dtype)
kernel_size,
stride=stride, if isinstance(self._filter_size, int):
padding=padding, self._filter_size = convert_to_list(self._filter_size, 2)
padding_mode=padding_mode,
dilation=dilation,
groups=groups,
weight_attr=weight_attr,
bias_attr=bias_attr,
data_format=data_format)
self.candidate_config = candidate_config self.candidate_config = candidate_config
if len(candidate_config.items()) != 0: if len(candidate_config.items()) != 0:
@@ -228,9 +230,9 @@ class SuperConv2D(nn.Conv2D):
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.channel = candidate_config[ self.channel = candidate_config[
'channel'] if 'channel' in candidate_config else None 'channel'] if 'channel' in candidate_config else None
self.base_channel = self._out_channels self.base_channel = self._num_filters
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_channel = int(self._out_channels / max(self.expand_ratio)) self.base_channel = int(self._num_filters / max(self.expand_ratio))
self.transform_kernel = transform_kernel self.transform_kernel = transform_kernel
if self.ks_set != None: if self.ks_set != None:
@@ -244,9 +246,10 @@ class SuperConv2D(nn.Conv2D):
param_name = '%dto%d_matrix' % (ks_large, ks_small) param_name = '%dto%d_matrix' % (ks_large, ks_small)
ks_t = ks_small**2 ks_t = ks_small**2
scale_param[param_name] = self.create_parameter( scale_param[param_name] = self.create_parameter(
attr=paddle.ParamAttr( attr=fluid.ParamAttr(
name=self._full_name + param_name, name=self._full_name + param_name,
initializer=nn.initializer.Assign(np.eye(ks_t))), initializer=fluid.initializer.NumpyArrayInitializer(
np.eye(ks_t))),
shape=(ks_t, ks_t), shape=(ks_t, ks_t),
dtype=self._dtype) dtype=self._dtype)
@@ -254,10 +257,10 @@ class SuperConv2D(nn.Conv2D):
setattr(self, name, param) setattr(self, name, param)
def get_active_filter(self, in_nc, out_nc, kernel_size): def get_active_filter(self, in_nc, out_nc, kernel_size):
start, end = compute_start_end(self._kernel_size[0], kernel_size) start, end = compute_start_end(self._filter_size[0], kernel_size)
### if NOT transform kernel, intercept a center filter with kernel_size from largest filter ### if NOT transform kernel, intercept a center filter with kernel_size from largest filter
filters = self.weight[:out_nc, :in_nc, start:end, start:end] filters = self.weight[:out_nc, :in_nc, start:end, start:end]
if self.transform_kernel != False and kernel_size < self._kernel_size[ if self.transform_kernel != False and kernel_size < self._filter_size[
0]: 0]:
### if transform kernel, then use matrix to transform ### if transform kernel, then use matrix to transform
start_filter = self.weight[:out_nc, :in_nc, :, :] start_filter = self.weight[:out_nc, :in_nc, :, :]
@@ -268,15 +271,16 @@ class SuperConv2D(nn.Conv2D):
target_ks = self.ks_set[i - 1] target_ks = self.ks_set[i - 1]
start, end = compute_start_end(src_ks, target_ks) start, end = compute_start_end(src_ks, target_ks)
_input_filter = start_filter[:, :, start:end, start:end] _input_filter = start_filter[:, :, start:end, start:end]
_input_filter = paddle.reshape( _input_filter = fluid.layers.reshape(
_input_filter, _input_filter,
shape=[(_input_filter.shape[0] * _input_filter.shape[1]), shape=[(_input_filter.shape[0] * _input_filter.shape[1]),
-1]) -1])
_input_filter = paddle.matmul( core.ops.matmul(_input_filter,
_input_filter, self.__getattr__('%dto%d_matrix' %
self.__getattr__('%dto%d_matrix' % (src_ks, target_ks)),
(src_ks, target_ks)), False, False) _input_filter, 'transpose_X', False,
_input_filter = paddle.reshape( 'transpose_Y', False, "alpha", 1)
_input_filter = fluid.layers.reshape(
_input_filter, _input_filter,
shape=[ shape=[
filters.shape[0], filters.shape[1], target_ks, target_ks filters.shape[0], filters.shape[1], target_ks, target_ks
@@ -286,8 +290,24 @@ class SuperConv2D(nn.Conv2D):
return filters return filters
def get_groups_in_out_nc(self, in_nc, out_nc): def get_groups_in_out_nc(self, in_nc, out_nc):
### standard conv if self._groups == 1 or self._groups == None:
return self._groups, in_nc, out_nc ### standard conv
return self._groups, in_nc, out_nc
elif self._groups == self._num_channels:
### depthwise convolution
if in_nc != out_nc:
_logger.debug(
"input channel and output channel in depthwise conv is different, change output channel to input channel! origin channel:(in_nc {}, out_nc {}): ".
format(in_nc, out_nc))
groups = in_nc
out_nc = in_nc
return groups, in_nc, out_nc
else:
### groups convolution
### conv: weight: (Cout, Cin/G, Kh, Kw)
groups = self._groups
in_nc = int(in_nc // groups)
return groups, in_nc, out_nc
def forward(self, input, kernel_size=None, expand_ratio=None, channel=None): def forward(self, input, kernel_size=None, expand_ratio=None, channel=None):
self.cur_config = { self.cur_config = {
@@ -304,8 +324,8 @@ class SuperConv2D(nn.Conv2D):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self._out_channels out_nc = self._num_filters
ks = int(self._kernel_size[0]) if kernel_size == None else int( ks = int(self._filter_size[0]) if kernel_size == None else int(
kernel_size) kernel_size)
groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc, groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc,
@@ -318,21 +338,28 @@ class SuperConv2D(nn.Conv2D):
else: else:
padding = self._padding padding = self._padding
if self._l_type == 'conv2d':
attrs = ('strides', self._stride, 'paddings', padding, 'dilations',
self._dilation, 'groups', groups
if groups else 1, 'use_cudnn', self._use_cudnn)
out = core.ops.conv2d(input, weight, *attrs)
elif self._l_type == 'depthwise_conv2d':
attrs = ('strides', self._stride, 'paddings', padding, 'dilations',
self._dilation, 'groups', groups
if groups else self._groups, 'use_cudnn', self._use_cudnn)
out = core.ops.depthwise_conv2d(input, weight, *attrs)
else:
raise ValueError("conv type error")
pre_bias = out
out_nc = int(pre_bias.shape[1])
if self.bias is not None: if self.bias is not None:
bias = self.bias[:out_nc] bias = self.bias[:out_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
bias = self.bias pre_act = pre_bias
out = F.conv2d( return dygraph_utils._append_activation_in_dygraph(pre_act, self._act)
input,
weight,
bias=bias,
stride=self._stride,
padding=padding,
dilation=self._dilation,
groups=self._groups,
data_format=self._data_format)
return out
class SuperGroupConv2D(SuperConv2D): class SuperGroupConv2D(SuperConv2D):
@@ -356,7 +383,7 @@ class SuperDepthwiseConv2D(SuperConv2D):
return groups, in_nc, out_nc return groups, in_nc, out_nc
class SuperConv2DTranspose(nn.Conv2DTranspose): class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
""" """
This interface is used to construct a callable object of the ``SuperConv2DTranspose`` This interface is used to construct a callable object of the ``SuperConv2DTranspose``
class. class.
@@ -458,55 +485,53 @@ class SuperConv2DTranspose(nn.Conv2DTranspose):
None None
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle.fluid as fluid
from paddleslim.core.layers import SuperConv2DTranspose
import numpy as np import numpy as np
from paddleslim.nas.ofa.layers import SuperConv2DTranspose with fluid.dygraph.guard():
data = np.random.random((3, 32, 32, 5)).astype('float32') data = np.random.random((3, 32, 32, 5)).astype('float32')
config = {'channel': 5} config = {'channel': 5
super_convtranspose = SuperConv2DTranspose(num_channels=32, num_filters=10, filter_size=3) super_convtranspose = SuperConv2DTranspose(num_channels=32, num_filters=10, filter_size=3)
ret = super_convtranspose(paddle.to_variable(data), config) ret = super_convtranspose(fluid.dygraph.base.to_variable(data), config)
""" """
def __init__(self, def __init__(self,
in_channels, num_channels,
out_channels, num_filters,
kernel_size, filter_size,
output_size=None,
candidate_config={}, candidate_config={},
transform_kernel=False, transform_kernel=False,
stride=1, stride=1,
padding=0,
output_padding=0,
dilation=1, dilation=1,
groups=1, padding=0,
weight_attr=None, groups=None,
param_attr=None,
bias_attr=None, bias_attr=None,
data_format="NCHW"): use_cudnn=True,
act=None,
dtype='float32'):
super(SuperConv2DTranspose, self).__init__( super(SuperConv2DTranspose, self).__init__(
in_channels, num_channels, num_filters, filter_size, output_size, padding,
out_channels, stride, dilation, groups, param_attr, bias_attr, use_cudnn, act,
kernel_size, dtype)
stride=stride,
padding=padding,
dilation=dilation,
output_padding=output_padding,
groups=groups,
weight_attr=weight_attr,
bias_attr=bias_attr,
data_format=data_format)
self.candidate_config = candidate_config self.candidate_config = candidate_config
if len(self.candidate_config.items()) != 0: if len(self.candidate_config.items()) != 0:
for k, v in candidate_config.items(): for k, v in candidate_config.items():
candidate_config[k] = list(set(v)) candidate_config[k] = list(set(v))
self.ks_set = candidate_config[ self.ks_set = candidate_config[
'kernel_size'] if 'kernel_size' in candidate_config else None 'kernel_size'] if 'kernel_size' in candidate_config else None
if isinstance(self._filter_size, int):
self._filter_size = convert_to_list(self._filter_size, 2)
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.channel = candidate_config[ self.channel = candidate_config[
'channel'] if 'channel' in candidate_config else None 'channel'] if 'channel' in candidate_config else None
self.base_channel = self._out_channels self.base_channel = self._num_filters
if self.expand_ratio: if self.expand_ratio:
self.base_channel = int(self._out_channels / max(self.expand_ratio)) self.base_channel = int(self._num_filters / max(self.expand_ratio))
self.transform_kernel = transform_kernel self.transform_kernel = transform_kernel
if self.ks_set != None: if self.ks_set != None:
@@ -520,9 +545,10 @@ class SuperConv2DTranspose(nn.Conv2DTranspose):
param_name = '%dto%d_matrix' % (ks_large, ks_small) param_name = '%dto%d_matrix' % (ks_large, ks_small)
ks_t = ks_small**2 ks_t = ks_small**2
scale_param[param_name] = self.create_parameter( scale_param[param_name] = self.create_parameter(
attr=paddle.ParamAttr( attr=fluid.ParamAttr(
name=self._full_name + param_name, name=self._full_name + param_name,
initializer=nn.initializer.Assign(np.eye(ks_t))), initializer=fluid.initializer.NumpyArrayInitializer(
np.eye(ks_t))),
shape=(ks_t, ks_t), shape=(ks_t, ks_t),
dtype=self._dtype) dtype=self._dtype)
@@ -530,9 +556,9 @@ class SuperConv2DTranspose(nn.Conv2DTranspose):
setattr(self, name, param) setattr(self, name, param)
def get_active_filter(self, in_nc, out_nc, kernel_size): def get_active_filter(self, in_nc, out_nc, kernel_size):
start, end = compute_start_end(self._kernel_size[0], kernel_size) start, end = compute_start_end(self._filter_size[0], kernel_size)
filters = self.weight[:in_nc, :out_nc, start:end, start:end] filters = self.weight[:in_nc, :out_nc, start:end, start:end]
if self.transform_kernel != False and kernel_size < self._kernel_size[ if self.transform_kernel != False and kernel_size < self._filter_size[
0]: 0]:
start_filter = self.weight[:in_nc, :out_nc, :, :] start_filter = self.weight[:in_nc, :out_nc, :, :]
for i in range(len(self.ks_set) - 1, 0, -1): for i in range(len(self.ks_set) - 1, 0, -1):
...@@ -542,15 +568,16 @@ class SuperConv2DTranspose(nn.Conv2DTranspose): ...@@ -542,15 +568,16 @@ class SuperConv2DTranspose(nn.Conv2DTranspose):
target_ks = self.ks_set[i - 1] target_ks = self.ks_set[i - 1]
start, end = compute_start_end(src_ks, target_ks) start, end = compute_start_end(src_ks, target_ks)
_input_filter = start_filter[:, :, start:end, start:end] _input_filter = start_filter[:, :, start:end, start:end]
_input_filter = paddle.reshape( _input_filter = fluid.layers.reshape(
_input_filter, _input_filter,
shape=[(_input_filter.shape[0] * _input_filter.shape[1]), shape=[(_input_filter.shape[0] * _input_filter.shape[1]),
-1]) -1])
_input_filter = paddle.matmul( core.ops.matmul(_input_filter,
_input_filter, self.__getattr__('%dto%d_matrix' %
self.__getattr__('%dto%d_matrix' % (src_ks, target_ks)),
(src_ks, target_ks)), False, False) _input_filter, 'transpose_X', False,
_input_filter = paddle.reshape( 'transpose_Y', False, "alpha", 1)
_input_filter = fluid.layers.reshape(
_input_filter, _input_filter,
shape=[ shape=[
filters.shape[0], filters.shape[1], target_ks, target_ks filters.shape[0], filters.shape[1], target_ks, target_ks
...@@ -560,15 +587,26 @@ class SuperConv2DTranspose(nn.Conv2DTranspose): ...@@ -560,15 +587,26 @@ class SuperConv2DTranspose(nn.Conv2DTranspose):
return filters return filters
def get_groups_in_out_nc(self, in_nc, out_nc): def get_groups_in_out_nc(self, in_nc, out_nc):
### standard conv if self._groups == 1 or self._groups == None:
return self._groups, in_nc, out_nc ### standard conv
return self._groups, in_nc, out_nc
def forward(self, elif self._groups == self._num_channels:
input, ### depthwise convolution
output_size=None, if in_nc != out_nc:
kernel_size=None, _logger.debug(
expand_ratio=None, "input channel and output channel in depthwise conv is different, change output channel to input channel! origin channel:(in_nc {}, out_nc {}): ".
channel=None): format(in_nc, out_nc))
groups = in_nc
out_nc = in_nc
return groups, in_nc, out_nc
else:
### groups convolution
### groups conv transpose: weight: (Cin, Cout/G, Kh, Kw)
groups = self._groups
out_nc = int(out_nc // groups)
return groups, in_nc, out_nc
def forward(self, input, kernel_size=None, expand_ratio=None, channel=None):
self.cur_config = { self.cur_config = {
'kernel_size': kernel_size, 'kernel_size': kernel_size,
'expand_ratio': expand_ratio, 'expand_ratio': expand_ratio,
...@@ -583,43 +621,34 @@ class SuperConv2DTranspose(nn.Conv2DTranspose): ...@@ -583,43 +621,34 @@ class SuperConv2DTranspose(nn.Conv2DTranspose):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self._out_channels out_nc = self._num_filters
ks = int(self._kernel_size[0]) if kernel_size == None else int( ks = int(self._filter_size[0]) if kernel_size == None else int(
kernel_size) kernel_size)
groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc, groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc,
out_nc) out_nc)
weight = self.get_active_filter(weight_in_nc, weight_out_nc, ks) weight = self.get_active_filter(weight_in_nc, weight_out_nc, ks)
if kernel_size != None or 'kernel_size' in self.candidate_config.keys(): if kernel_size != None or 'kernel_size' in self.candidate_config.keys():
padding = convert_to_list(get_same_padding(ks), 2) padding = convert_to_list(get_same_padding(ks), 2)
else: else:
padding = self._padding padding = self._padding
if output_size is None: op = getattr(core.ops, self._op_type)
output_padding = self.output_padding out = op(input, weight, 'output_size', self._output_size, 'strides',
else: self._stride, 'paddings', padding, 'dilations', self._dilation,
output_padding = 0 'groups', groups, 'use_cudnn', self._use_cudnn)
pre_bias = out
out_nc = int(pre_bias.shape[1])
if self.bias is not None: if self.bias is not None:
bias = self.bias[:out_nc] bias = self.bias[:out_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
bias = self.bias pre_act = pre_bias
out = F.conv2d_transpose( return dygraph_utils._append_activation_in_dygraph(
input, pre_act, act=self._act)
weight,
bias=bias,
padding=padding,
output_padding=output_padding,
stride=self._stride,
dilation=self._dilation,
groups=self._groups,
output_size=output_size,
data_format=self._data_format)
return out
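As a usage sketch of the elastic transposed convolution above (mirroring the unit-test cases further down in this patch; the `paddleslim.nas.ofa.layers` import path and the input sizes are assumptions for illustration): forward can be called with a smaller `kernel_size`, and when `transform_kernel` is set the larger kernel's centre is mapped through the learned `%dto%d_matrix` parameters built in `_make_scale_param` style code above.

    import numpy as np
    import paddle
    from paddleslim.nas.ofa.layers import SuperConv2DTranspose

    convt = SuperConv2DTranspose(
        4, 4, 7,
        candidate_config={'kernel_size': (3, 5, 7)},
        transform_kernel=True)
    x = paddle.to_tensor(np.random.random((1, 4, 16, 16)).astype('float32'))
    y_full = convt(x)                  # uses the full 7x7 kernel
    y_small = convt(x, kernel_size=3)  # 7x7 -> 5x5 -> 3x3 via the transform matrices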
class SuperGroupConv2DTranspose(SuperConv2DTranspose): class SuperGroupConv2DTranspose(SuperConv2DTranspose):
...@@ -643,7 +672,7 @@ class SuperDepthwiseConv2DTranspose(SuperConv2DTranspose): ...@@ -643,7 +672,7 @@ class SuperDepthwiseConv2DTranspose(SuperConv2DTranspose):
### NOTE: only search channel, write for GAN-compression, maybe change to SuperDepthwiseConv and SuperConv after. ### NOTE: only search channel, write for GAN-compression, maybe change to SuperDepthwiseConv and SuperConv after.
class SuperSeparableConv2D(nn.Layer): class SuperSeparableConv2D(fluid.dygraph.Layer):
""" """
This interface is used to construct a callable object of the ``SuperSeparableConv2D`` This interface is used to construct a callable object of the ``SuperSeparableConv2D``
class. class.
...@@ -653,8 +682,8 @@ class SuperSeparableConv2D(nn.Layer): ...@@ -653,8 +682,8 @@ class SuperSeparableConv2D(nn.Layer):
the second conv's inputs, used to change the first dimension of weight and bias, the second conv's inputs, used to change the first dimension of weight and bias,
only train the first channels of the weight and bias. only train the first channels of the weight and bias.
The architecture of super separable convolution2D op is [Conv2D, norm layer(may be BatchNorm2D The architecture of super separable convolution2D op is [Conv2D, norm layer(may be BatchNorm
or InstanceNorm2D), Conv2D]. The first conv is depthwise conv, the filter number is input channel or InstanceNorm), Conv2D]. The first conv is depthwise conv, the filter number is input channel
multiply scale_factor, the group is equal to the number of input channel. The second conv multiply scale_factor, the group is equal to the number of input channel. The second conv
is standard conv, which filter size and stride size are 1. is standard conv, which filter size and stride size are 1.
...@@ -674,57 +703,62 @@ class SuperSeparableConv2D(nn.Layer): ...@@ -674,57 +703,62 @@ class SuperSeparableConv2D(nn.Layer):
dilation(int or tuple, optional): The first conv's dilation size. If dilation is a tuple, dilation(int or tuple, optional): The first conv's dilation size. If dilation is a tuple,
it must contain two integers, (dilation_H, dilation_W). Otherwise, the it must contain two integers, (dilation_H, dilation_W). Otherwise, the
dilation_H = dilation_W = dilation. Default: 1. dilation_H = dilation_W = dilation. Default: 1.
norm_layer(class): The normalization layer between two convolution. Default: InstanceNorm2D. norm_layer(class): The normalization layer between two convolution. Default: InstanceNorm.
bias_attr (ParamAttr or bool, optional): The attribute for the bias of convolution. bias_attr (ParamAttr or bool, optional): The attribute for the bias of convolution.
If it is set to False, no bias will be added to the output units. If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, convolution If it is set to None or one attribute of ParamAttr, convolution
will create ParamAttr as bias_attr. If the Initializer of the bias_attr will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None. is not set, the bias is initialized zero. Default: None.
scale_factor(float): The scale factor of the first conv's output channel. Default: 1. scale_factor(float): The scale factor of the first conv's output channel. Default: 1.
use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True.
Returns: Returns:
None None
""" """
def __init__(self, def __init__(self,
in_channels, num_channels,
out_channels, num_filters,
kernel_size, filter_size,
candidate_config={}, candidate_config={},
stride=1, stride=1,
padding=0, padding=0,
dilation=1, dilation=1,
norm_layer=nn.InstanceNorm2D, norm_layer=InstanceNorm,
bias_attr=None, bias_attr=None,
scale_factor=1): scale_factor=1,
use_cudnn=False):
super(SuperSeparableConv2D, self).__init__() super(SuperSeparableConv2D, self).__init__()
self.conv = nn.LayerList([ self.conv = fluid.dygraph.LayerList([
nn.Conv2D( fluid.dygraph.nn.Conv2D(
in_channels=in_channels, num_channels=num_channels,
out_channels=in_channels * scale_factor, num_filters=num_channels * scale_factor,
kernel_size=kernel_size, filter_size=filter_size,
stride=stride, stride=stride,
padding=padding, padding=padding,
groups=in_channels, use_cudnn=False,
groups=num_channels,
bias_attr=bias_attr) bias_attr=bias_attr)
]) ])
self.conv.extend([norm_layer(in_channels * scale_factor)]) self.conv.extend([norm_layer(num_channels * scale_factor)])
self.conv.extend([ self.conv.extend([
nn.Conv2D( fluid.dygraph.nn.Conv2D(
in_channels=in_channels * scale_factor, num_channels=num_channels * scale_factor,
out_channels=out_channels, num_filters=num_filters,
kernel_size=1, filter_size=1,
stride=1, stride=1,
use_cudnn=use_cudnn,
bias_attr=bias_attr) bias_attr=bias_attr)
]) ])
self.candidate_config = candidate_config self.candidate_config = candidate_config
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.base_output_dim = self.conv[0]._out_channels self.base_output_dim = self.conv[0]._num_filters
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_output_dim = int(self.conv[0]._out_channels / self.base_output_dim = int(self.conv[0]._num_filters /
max(self.expand_ratio)) max(self.expand_ratio))
def forward(self, input, expand_ratio=None, channel=None): def forward(self, input, expand_ratio=None, channel=None):
...@@ -738,70 +772,88 @@ class SuperSeparableConv2D(nn.Layer): ...@@ -738,70 +772,88 @@ class SuperSeparableConv2D(nn.Layer):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self.conv[0]._out_channels out_nc = self.conv[0]._num_filters
weight = self.conv[0].weight[:in_nc] weight = self.conv[0].weight[:in_nc]
### conv1 ### conv1
if self.conv[0]._l_type == 'conv2d':
attrs = ('strides', self.conv[0]._stride, 'paddings',
self.conv[0]._padding, 'dilations', self.conv[0]._dilation,
'groups', in_nc, 'use_cudnn', self.conv[0]._use_cudnn)
out = core.ops.conv2d(input, weight, *attrs)
elif self.conv[0]._l_type == 'depthwise_conv2d':
attrs = ('strides', self.conv[0]._stride, 'paddings',
self.conv[0]._padding, 'dilations', self.conv[0]._dilation,
'groups', in_nc, 'use_cudnn', self.conv[0]._use_cudnn)
out = core.ops.depthwise_conv2d(input, weight, *attrs)
else:
raise ValueError("conv type error")
pre_bias = out
if self.conv[0].bias is not None: if self.conv[0].bias is not None:
bias = self.conv[0].bias[:in_nc] bias = self.conv[0].bias[:in_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
bias = self.conv[0].bias pre_act = pre_bias
conv0_out = F.conv2d( conv0_out = dygraph_utils._append_activation_in_dygraph(
input, pre_act, self.conv[0]._act)
weight,
bias,
stride=self.conv[0]._stride,
padding=self.conv[0]._padding,
dilation=self.conv[0]._dilation,
groups=in_nc,
data_format=self.conv[0]._data_format)
norm_out = self.conv[1](conv0_out) norm_out = self.conv[1](conv0_out)
weight = self.conv[2].weight[:out_nc, :in_nc, :, :] weight = self.conv[2].weight[:out_nc, :in_nc, :, :]
if self.conv[2]._l_type == 'conv2d':
attrs = ('strides', self.conv[2]._stride, 'paddings',
self.conv[2]._padding, 'dilations', self.conv[2]._dilation,
'groups', self.conv[2]._groups if self.conv[2]._groups else
1, 'use_cudnn', self.conv[2]._use_cudnn)
out = core.ops.conv2d(norm_out, weight, *attrs)
elif self.conv[2]._l_type == 'depthwise_conv2d':
attrs = ('strides', self.conv[2]._stride, 'paddings',
self.conv[2]._padding, 'dilations', self.conv[2]._dilation,
'groups', self.conv[2]._groups, 'use_cudnn',
self.conv[2]._use_cudnn)
out = core.ops.depthwise_conv2d(norm_out, weight, *attrs)
else:
raise ValueError("conv type error")
pre_bias = out
if self.conv[2].bias is not None: if self.conv[2].bias is not None:
bias = self.conv[2].bias[:out_nc] bias = self.conv[2].bias[:out_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
bias = self.conv[2].bias pre_act = pre_bias
conv1_out = F.conv2d( conv1_out = dygraph_utils._append_activation_in_dygraph(
norm_out, pre_act, self.conv[2]._act)
weight,
bias,
stride=self.conv[2]._stride,
padding=self.conv[2]._padding,
dilation=self.conv[2]._dilation,
groups=self.conv[2]._groups,
data_format=self.conv[2]._data_format)
return conv1_out return conv1_out
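A small usage sketch of the separable block described in the docstring above (depthwise conv, norm layer, then 1x1 conv); the constructor arguments mirror the test cases later in this patch, and the import path plus tensor sizes are assumptions. Passing expand_ratio to forward shrinks the pointwise conv's output width.

    import numpy as np
    import paddle
    from paddleslim.nas.ofa.layers import SuperSeparableConv2D

    sep = SuperSeparableConv2D(
        4, 4, 3, padding=1,
        candidate_config={'expand_ratio': (0.5, 1.0)})
    x = paddle.to_tensor(np.random.random((1, 4, 32, 32)).astype('float32'))
    y_full = sep(x)                    # largest sub-layer, 4 output channels
    y_half = sep(x, expand_ratio=0.5)  # pointwise conv keeps only 2 output channels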
class SuperLinear(nn.Linear): class SuperLinear(fluid.dygraph.Linear):
""" """
""" """
def __init__(self, def __init__(self,
in_features, input_dim,
out_features, output_dim,
candidate_config={}, candidate_config={},
weight_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
name=None): act=None,
super(SuperLinear, self).__init__(in_features, out_features, dtype="float32"):
weight_attr, bias_attr, name) super(SuperLinear, self).__init__(input_dim, output_dim, param_attr,
self._weight_attr = weight_attr bias_attr, act, dtype)
self._param_attr = param_attr
self._bias_attr = bias_attr self._bias_attr = bias_attr
self._in_features = in_features self.output_dim = output_dim
self._out_features = out_features
self.candidate_config = candidate_config self.candidate_config = candidate_config
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.base_output_dim = self._out_features self.base_output_dim = self.output_dim
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_output_dim = int(self._out_features / self.base_output_dim = int(self.output_dim / max(self.expand_ratio))
max(self.expand_ratio))
def forward(self, input, expand_ratio=None, channel=None): def forward(self, input, expand_ratio=None, channel=None):
self.cur_config = {'expand_ratio': expand_ratio, 'channel': channel} self.cur_config = {'expand_ratio': expand_ratio, 'channel': channel}
...@@ -815,39 +867,53 @@ class SuperLinear(nn.Linear): ...@@ -815,39 +867,53 @@ class SuperLinear(nn.Linear):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self._out_features out_nc = self.output_dim
weight = self.weight[:in_nc, :out_nc] weight = self.weight[:in_nc, :out_nc]
if self._bias_attr != False: if self._bias_attr != False:
bias = self.bias[:out_nc] bias = self.bias[:out_nc]
use_bias = True
pre_bias = _varbase_creator(dtype=input.dtype)
core.ops.matmul(input, weight, pre_bias, 'transpose_X', False,
'transpose_Y', False, "alpha", 1)
if self._bias_attr != False:
pre_act = dygraph_utils._append_bias_in_dygraph(
pre_bias, bias, axis=len(input.shape) - 1)
else: else:
bias = self.bias pre_act = pre_bias
out = F.linear(x=input, weight=weight, bias=bias, name=self.name) return dygraph_utils._append_activation_in_dygraph(pre_act, self._act)
return out
class SuperBatchNorm2D(nn.BatchNorm2D): class SuperBatchNorm(fluid.dygraph.BatchNorm):
""" """
add comment add comment
""" """
def __init__(self, def __init__(self,
num_features, num_channels,
act=None,
is_test=False,
momentum=0.9, momentum=0.9,
epsilon=1e-05, epsilon=1e-05,
weight_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
data_format='NCHW', dtype='float32',
name=None): data_layout='NCHW',
super(SuperBatchNorm2D, self).__init__(num_features, momentum, epsilon, in_place=False,
weight_attr, bias_attr, moving_mean_name=None,
data_format, name) moving_variance_name=None,
do_model_average_for_mean_and_var=True,
use_global_stats=False,
trainable_statistics=False):
super(SuperBatchNorm, self).__init__(
num_channels, act, is_test, momentum, epsilon, param_attr,
bias_attr, dtype, data_layout, in_place, moving_mean_name,
moving_variance_name, do_model_average_for_mean_and_var,
use_global_stats, trainable_statistics)
def forward(self, input): def forward(self, input):
self._check_data_format(self._data_format)
self._check_input_dim(input)
feature_dim = int(input.shape[1]) feature_dim = int(input.shape[1])
weight = self.weight[:feature_dim] weight = self.weight[:feature_dim]
...@@ -855,97 +921,96 @@ class SuperBatchNorm2D(nn.BatchNorm2D): ...@@ -855,97 +921,96 @@ class SuperBatchNorm2D(nn.BatchNorm2D):
mean = self._mean[:feature_dim] mean = self._mean[:feature_dim]
variance = self._variance[:feature_dim] variance = self._variance[:feature_dim]
return F.batch_norm( mean_out = mean
input, variance_out = variance
mean,
variance, attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
weight=weight, "is_test", not self.training, "data_layout", self._data_layout,
bias=bias, "use_mkldnn", False, "fuse_with_relu", self._fuse_with_relu,
training=self.training, "use_global_stats", self._use_global_stats,
momentum=self._momentum, 'trainable_statistics', self._trainable_statistics)
epsilon=self._epsilon, batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
data_format=self._data_format) input, weight, bias, mean, variance, mean_out, variance_out, *attrs)
return dygraph_utils._append_activation_in_dygraph(
batch_norm_out, act=self._act)
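The elastic norm layers simply follow whatever channel count arrives at runtime: forward slices the weight, bias and running statistics to input.shape[1]. A minimal sketch, assuming the 2.0-style class name shown on the left of this diff and a made-up input size:

    import numpy as np
    import paddle
    from paddleslim.nas.ofa.layers import SuperBatchNorm2D

    bn = SuperBatchNorm2D(4)
    x = paddle.to_tensor(np.random.random((2, 2, 8, 8)).astype('float32'))
    y = bn(x)   # only the first 2 of the 4 affine/statistics entries are used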
class SuperInstanceNorm2D(nn.InstanceNorm2D): class SuperInstanceNorm(fluid.dygraph.InstanceNorm):
""" """
""" """
def __init__(self, def __init__(self,
num_features, num_channels,
epsilon=1e-05, epsilon=1e-05,
momentum=0.9, param_attr=None,
weight_attr=None,
bias_attr=None, bias_attr=None,
data_format='NCHW', dtype='float32'):
name=None): super(SuperInstanceNorm, self).__init__(num_channels, epsilon,
super(SuperInstanceNorm2D, self).__init__(num_features, epsilon, param_attr, bias_attr, dtype)
momentum, weight_attr,
bias_attr, data_format, name)
def forward(self, input): def forward(self, input):
self._check_input_dim(input)
feature_dim = int(input.shape[1]) feature_dim = int(input.shape[1])
if self._weight_attr == False and self._bias_attr == False:
if self._param_attr == False and self._bias_attr == False:
scale = None scale = None
bias = None bias = None
else: else:
scale = self.scale[:feature_dim] scale = self.scale[:feature_dim]
bias = self.bias[:feature_dim] bias = self.bias[:feature_dim]
return F.instance_norm(input, scale, bias, eps=self._epsilon) out, _, _ = core.ops.instance_norm(input, scale, bias, 'epsilon',
self._epsilon)
return out
class SuperLayerNorm(nn.LayerNorm): class SuperLayerNorm(fluid.dygraph.LayerNorm):
def __init__(self, def __init__(self,
normalized_shape, normalized_shape,
scale=True,
shift=True,
epsilon=1e-05, epsilon=1e-05,
weight_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
name=None): act=None,
super(SuperLayerNorm, self).__init__(normalized_shape, epsilon, dtype='float32'):
weight_attr, bias_attr, name) super(SuperLayerNorm,
self).__init__(normalized_shape, scale, shift, epsilon,
param_attr, bias_attr, act, dtype)
def forward(self, input): def forward(self, input):
### TODO(ceci3): fix if normalized_shape is not a single number input_shape = list(input.shape)
input_ndim = len(list(input.shape)) input_ndim = len(input_shape)
normalized_ndim = len(self._normalized_shape) normalized_ndim = len(self._normalized_shape)
begin_norm_axis = input_ndim - normalized_ndim self._begin_norm_axis = input_ndim - normalized_ndim
### TODO(ceci3): fix if normalized_shape is not a single number
feature_dim = int(input.shape[-1]) feature_dim = int(input.shape[-1])
if self._weight_attr != False: weight = self.weight[:feature_dim]
weight = self.weight[:feature_dim] bias = self.bias[:feature_dim]
else: pre_act, _, _ = core.ops.layer_norm(input, weight, bias, 'epsilon',
weight = None self._epsilon, 'begin_norm_axis',
if self._bias_attr != False: self._begin_norm_axis)
bias = self.bias[:feature_dim] return dygraph_utils._append_activation_in_dygraph(
else: pre_act, act=self._act)
bias = None
out, _, _ = core.ops.layer_norm(input, weight, bias, 'epsilon',
self._epsilon, 'begin_norm_axis',
begin_norm_axis)
return out
class SuperEmbedding(nn.Embedding): class SuperEmbedding(fluid.dygraph.Embedding):
def __init__(self, def __init__(self,
num_embeddings, size,
embedding_dim,
candidate_config={}, candidate_config={},
is_sparse=False,
is_distributed=False,
padding_idx=None, padding_idx=None,
sparse=False, param_attr=None,
weight_attr=None, dtype='float32'):
name=None): super(SuperEmbedding, self).__init__(size, is_sparse, is_distributed,
super(SuperEmbedding, self).__init__(num_embeddings, embedding_dim, padding_idx, param_attr, dtype)
padding_idx, sparse, weight_attr,
name)
self.candidate_config = candidate_config self.candidate_config = candidate_config
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.base_output_dim = self._embedding_dim self.base_output_dim = self._size[-1]
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_output_dim = int(self._embedding_dim / self.base_output_dim = int(self._size[-1] / max(self.expand_ratio))
max(self.expand_ratio))
def forward(self, input, expand_ratio=None, channel=None): def forward(self, input, expand_ratio=None, channel=None):
assert ( assert (
...@@ -956,12 +1021,10 @@ class SuperEmbedding(nn.Embedding): ...@@ -956,12 +1021,10 @@ class SuperEmbedding(nn.Embedding):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self._embedding_dim out_nc = self._size[-1]
weight = self.weight[:, :out_nc] weight = self.weight[:, :out_nc]
return F.embedding( return core.ops.lookup_table_v2(
input, weight, input, 'is_sparse', self._is_sparse, 'is_distributed',
weight=weight, self._is_distributed, 'remote_prefetch', self._remote_prefetch,
padding_idx=self._padding_idx, 'padding_idx', self._padding_idx)
sparse=self._sparse,
name=self._name)
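To close out the elastic layers, a hedged sketch of a tiny width-elastic stack built from SuperEmbedding, SuperLinear and SuperLayerNorm; the sizes and candidate_config values mirror the unit tests further down, and the 2.0-style constructors (num_embeddings/embedding_dim, in_features/out_features) are assumed.

    import numpy as np
    import paddle
    import paddle.nn as nn
    from paddleslim.nas.ofa.layers import SuperEmbedding, SuperLinear, SuperLayerNorm

    net = nn.Sequential(
        SuperEmbedding(64, 64, candidate_config={'expand_ratio': (0.5, 1.0)}),
        SuperLinear(64, 64, candidate_config={'expand_ratio': (0.5, 1.0)}),
        SuperLayerNorm(64))
    ids = paddle.to_tensor(np.random.randint(0, 64, (4, 16)).astype('int64'))
    out = net(ids)   # without an explicit expand_ratio each layer runs at full width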
...@@ -20,10 +20,10 @@ import paddle.fluid as fluid ...@@ -20,10 +20,10 @@ import paddle.fluid as fluid
from .utils.utils import get_paddle_version from .utils.utils import get_paddle_version
pd_ver = get_paddle_version() pd_ver = get_paddle_version()
if pd_ver == 185: if pd_ver == 185:
from .layers import BaseBlock, SuperConv2D, SuperLinear from .layers_old import BaseBlock, SuperConv2D, SuperLinear
Layer = paddle.fluid.dygraph.Layer Layer = paddle.fluid.dygraph.Layer
else: else:
from .layers_new import BaseBlock, SuperConv2D, SuperLinear from .layers import BaseBlock, SuperConv2D, SuperLinear
Layer = paddle.nn.Layer Layer = paddle.nn.Layer
from .utils.utils import search_idx from .utils.utils import search_idx
from ...common import get_logger from ...common import get_logger
...@@ -32,16 +32,40 @@ _logger = get_logger(__name__, level=logging.INFO) ...@@ -32,16 +32,40 @@ _logger = get_logger(__name__, level=logging.INFO)
__all__ = ['OFA', 'RunConfig', 'DistillConfig'] __all__ = ['OFA', 'RunConfig', 'DistillConfig']
RunConfig = namedtuple('RunConfig', [ RunConfig = namedtuple(
'train_batch_size', 'n_epochs', 'save_frequency', 'eval_frequency', 'RunConfig',
'init_learning_rate', 'total_images', 'elastic_depth', 'dynamic_batch_size' [
]) # int, batch_size in training, used to get current epoch, default: None
'train_batch_size',
# list, the number of epochs of every task in training, default: None
'n_epochs',
# list, initial learning rate of every task in training, NOT used now. Default: None.
'init_learning_rate',
# int, total images of train dataset, used to get current epoch, default: None
'total_images',
# list, elastic depth of the model in training, default: None
'elastic_depth',
# list, the number of sub-networks to train per mini-batch of data, used to get current epoch, default: None
'dynamic_batch_size'
])
RunConfig.__new__.__defaults__ = (None, ) * len(RunConfig._fields) RunConfig.__new__.__defaults__ = (None, ) * len(RunConfig._fields)
DistillConfig = namedtuple('DistillConfig', [ DistillConfig = namedtuple(
'lambda_distill', 'teacher_model', 'mapping_layers', 'teacher_model_path', 'DistillConfig',
'distill_fn', 'mapping_op' [
]) # float, lambda scale of distillation loss, default: None.
'lambda_distill',
# instance of model, the teacher model, default: None.
'teacher_model',
# list(str), names of the layers which need distillation, default: None.
'mapping_layers',
# str, the path of teacher pretrained model, default: None.
'teacher_model_path',
# instance of loss layer, the loss function used in distillation; if set to None, mse_loss is used by default. Default: None.
'distill_fn',
# str, define which op to append between the teacher model and the student model in distillation, chosen from ['conv', 'linear', None], default: None.
'mapping_op'
])
DistillConfig.__new__.__defaults__ = (None, ) * len(DistillConfig._fields) DistillConfig.__new__.__defaults__ = (None, ) * len(DistillConfig._fields)
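Since every field defaults to None, only the entries a run actually needs have to be filled in. A hedged construction sketch of the two configs (the numbers and the teacher model are placeholders for illustration, not values from this patch):

    from paddle.vision.models import mobilenet_v1
    from paddleslim.nas.ofa import RunConfig, DistillConfig

    teacher_model = mobilenet_v1()          # any trained paddle.nn.Layer works here

    run_config = RunConfig(
        train_batch_size=256,
        n_epochs=[[1], [2, 3]],
        total_images=1281167,
        elastic_depth=(2, 5, 8),
        dynamic_batch_size=[1, 1])

    distill_config = DistillConfig(
        lambda_distill=0.01,
        teacher_model=teacher_model,
        teacher_model_path=None,
        distill_fn=None)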
...@@ -89,15 +113,31 @@ class OFABase(Layer): ...@@ -89,15 +113,31 @@ class OFABase(Layer):
class OFA(OFABase): class OFA(OFABase):
"""
Convert the training process to the Once-For-All training process; a detailed description can be found in the paper: `Once-for-All: Train One Network and Specialize it for Efficient Deployment <https://arxiv.org/abs/1908.09791>`_ . This paper proposes a training process named progressive shrinking (PS), which means we start by training the largest neural network with the maximum kernel size (i.e., 7), depth (i.e., 4), and width (i.e., 6). Next, we progressively fine-tune the network to support smaller sub-networks by gradually adding them into the sampling space (larger sub-networks may also be sampled). Specifically, after training the largest network, we first support elastic kernel size, which can be chosen from {3, 5, 7} at each layer, while the depth and width remain at the maximum values. Then, we support elastic depth and elastic width sequentially.
Parameters:
model(paddle.nn.Layer): instance of model.
run_config(paddleslim.ofa.RunConfig, optional): config used in OFA training, can reference `<>`_ . Default: None.
distill_config(paddleslim.ofa.DistillConfig, optional): config of distillation in OFA training, can reference `<>`_. Default: None.
elastic_order(list, optional): define the training order; if set to None, the default order in the paper is used. Default: None.
train_full(bool, optional): whether to train the largest sub-network only. Default: False.
Examples:
.. code-block:: python
from paddleslim.nas.ofa import OFA
ofa_model = OFA(model)
"""
def __init__(self, def __init__(self,
model, model,
run_config=None, run_config=None,
net_config=None,
distill_config=None, distill_config=None,
elastic_order=None, elastic_order=None,
train_full=False): train_full=False):
super(OFA, self).__init__(model) super(OFA, self).__init__(model)
self.net_config = net_config self.net_config = None
self.run_config = run_config self.run_config = run_config
self.distill_config = distill_config self.distill_config = distill_config
self.elastic_order = elastic_order self.elastic_order = elastic_order
...@@ -278,12 +318,29 @@ class OFA(OFABase): ...@@ -278,12 +318,29 @@ class OFA(OFABase):
self.layers, sample_type=sample_type, task=task, phase=phase) self.layers, sample_type=sample_type, task=task, phase=phase)
return config return config
def set_task(self, task=None, phase=None): def set_task(self, task, phase=None):
"""
Set the task in the OFA training process.
Parameters:
task(list(str)|str): the task to specify in the training process.
phase(int, optional): the search space is increased gradually; use this parameter to specify the phase of the current task. If set to None, the whole search space of the task is used in training. Default: None.
Examples:
.. code-block:: python
ofa_model.set_task('width')
"""
self.manual_set_task = True self.manual_set_task = True
self.task = task self.task = task
self.phase = phase self.phase = phase
def set_epoch(self, epoch): def set_epoch(self, epoch):
"""
Set the current epoch in the OFA training process.
Parameters:
epoch(int): the epoch to set in the training process.
Examples:
.. code-block:: python
ofa_model.set_epoch(3)
"""
self.epoch = epoch self.epoch = epoch
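Taken together, set_epoch and set_task let a training script either follow the built-in progressive-shrinking schedule or pin one elastic dimension. A hedged sketch of the outer loop (model, run_config, distill_config, num_epochs and train_loader are placeholders built elsewhere):

    ofa_model = OFA(model, run_config=run_config, distill_config=distill_config)
    for epoch in range(num_epochs):
        ofa_model.set_epoch(epoch)      # keep the internal schedule in sync with the real epoch counter
        ofa_model.set_task('width')     # optionally restrict training to one elastic dimension
        for batch in train_loader():
            ...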
def _progressive_shrinking(self): def _progressive_shrinking(self):
...@@ -302,6 +359,12 @@ class OFA(OFABase): ...@@ -302,6 +359,12 @@ class OFA(OFABase):
return self._sample_config(task=self.task, phase=phase_idx) return self._sample_config(task=self.task, phase=phase_idx)
def calc_distill_loss(self): def calc_distill_loss(self):
"""
Calculate the distillation loss if distillation is configured.
Examples:
.. code-block:: python
dis_loss = ofa_model.calc_distill_loss()
"""
losses = [] losses = []
assert len(self.netAs) > 0 assert len(self.netAs) > 0
for i, netA in enumerate(self.netAs): for i, netA in enumerate(self.netAs):
...@@ -319,6 +382,8 @@ class OFA(OFABase): ...@@ -319,6 +382,8 @@ class OFA(OFABase):
else: else:
Sact = Sact Sact = Sact
Sact = Sact[0] if isinstance(Sact, tuple) else Sact
Tact = Tact[0] if isinstance(Tact, tuple) else Tact
if self.distill_config.distill_fn == None: if self.distill_config.distill_fn == None:
loss = fluid.layers.mse_loss(Sact, Tact.detach()) loss = fluid.layers.mse_loss(Sact, Tact.detach())
else: else:
...@@ -337,6 +402,15 @@ class OFA(OFABase): ...@@ -337,6 +402,15 @@ class OFA(OFABase):
pass pass
def set_net_config(self, net_config): def set_net_config(self, net_config):
"""
Set the config of the specific sub-network to be trained.
Parameters:
net_config(dict): the config of the sub-network to be trained.
Examples:
.. code-block:: python
config = ofa_model.current_config
ofa_model.set_net_config(config)
"""
self.net_config = net_config self.net_config = net_config
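Putting the pieces together, one supernet training step typically broadcasts a sub-network config, runs the forward pass and, when distillation is configured, adds calc_distill_loss to the task loss. A hedged sketch (criterion, model_inputs and labels are placeholders; with a teacher configured, the wrapped forward is assumed to return both student and teacher outputs):

    net_config = ofa_model.current_config        # or a dict built by hand / via dynabert_config
    ofa_model.set_net_config(net_config)

    student_out, teacher_out = ofa_model(*model_inputs)
    loss = criterion(student_out, labels)
    loss = loss + ofa_model.calc_distill_loss()  # mse_loss over mapped layers unless distill_fn is set
    loss.backward()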
def forward(self, *inputs, **kwargs): def forward(self, *inputs, **kwargs):
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
from .utils import * from .utils import *
from .special_config import *
from .utils import get_paddle_version from .utils import get_paddle_version
pd_ver = get_paddle_version() pd_ver = get_paddle_version()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
__all__ = ['dynabert_config']
def dynabert_config(model, width_mult, depth_mult=1.0):
new_config = dict()
block_num = np.floor((len(model.layers.items()) - 3) / 6)
block_name = block_num * 6 + 2
def fix_exp(idx):
if (idx - 3) % 6 == 0 or (idx - 5) % 6 == 0:
return True
return False
for idx, (block_k, block_v) in enumerate(model.layers.items()):
if isinstance(block_v, dict) and len(block_v.keys()) != 0:
name, name_idx = block_k.split('_'), int(block_k.split('_')[1])
if fix_exp(name_idx) or 'emb' in block_k or idx >= block_name:
block_v['expand_ratio'] = 1.0
else:
block_v['expand_ratio'] = width_mult
if block_k == 'depth':
block_v = depth_mult
new_config[block_k] = block_v
return new_config
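A hedged usage sketch of dynabert_config: it takes an OFA-wrapped supernet plus a width multiplier and returns a per-block config in which the fixed blocks (embeddings, the blocks matched by fix_exp, and the trailing blocks) keep expand_ratio 1.0 while the rest get width_mult. This mirrors the TestSpecialConfig case at the end of this patch; the base model is any paddle.nn.Layer built elsewhere.

    from paddleslim.nas.ofa import OFA
    from paddleslim.nas.ofa.convert_super import Convert, supernet
    from paddleslim.nas.ofa.utils import dynabert_config

    sp_net_config = supernet(expand_ratio=[0.5, 1.0])
    sp_model = Convert(sp_net_config).convert(model)   # model: the base paddle.nn.Layer
    ofa_model = OFA(sp_model)

    net_config = dynabert_config(ofa_model, width_mult=0.5)
    ofa_model.set_net_config(net_config)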
...@@ -22,7 +22,7 @@ from paddle.nn import ReLU ...@@ -22,7 +22,7 @@ from paddle.nn import ReLU
from paddleslim.nas import ofa from paddleslim.nas import ofa
from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig
from paddleslim.nas.ofa.convert_super import supernet from paddleslim.nas.ofa.convert_super import supernet
from paddleslim.nas.ofa.layers_new import Block, SuperSeparableConv2D from paddleslim.nas.ofa.layers import Block, SuperSeparableConv2D
class ModelConv(nn.Layer): class ModelConv(nn.Layer):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("../")
import numpy as np
import unittest
import paddle
import paddle.nn as nn
from paddle.nn import ReLU
from paddleslim.nas import ofa
from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig
from paddleslim.nas.ofa.convert_super import supernet
from paddleslim.nas.ofa.layers import *
class ModelCase1(nn.Layer):
def __init__(self):
super(ModelCase1, self).__init__()
models = [SuperConv2D(3, 4, 3, bias_attr=False)]
models += [SuperConv2D(4, 4, 3, groups=4)]
models += [SuperConv2D(4, 4, 3, groups=2)]
models += [SuperConv2DTranspose(4, 4, 3, bias_attr=False)]
models += [SuperConv2DTranspose(4, 4, 3, groups=4)]
models += [nn.Conv2DTranspose(4, 4, 3, groups=2)]
models += [SuperConv2DTranspose(4, 4, 3, groups=2)]
models += [
SuperSeparableConv2D(
4,
4,
1,
padding=1,
bias_attr=False,
candidate_config={'expand_ratio': (1.0, 2.0)}),
]
self.models = paddle.nn.Sequential(*models)
def forward(self, inputs):
return self.models(inputs)
class TestCase(unittest.TestCase):
def setUp(self):
self.model = ModelCase1()
data_np = np.random.random((1, 3, 64, 64)).astype(np.float32)
self.data = paddle.to_tensor(data_np)
def test_ofa(self):
ofa_model = OFA(self.model)
out = self.model(self.data)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("../")
import numpy as np
import unittest
import paddle
import paddle.nn as nn
from paddleslim.nas import ofa
from paddleslim.nas.ofa import OFA
from paddleslim.nas.ofa.layers_old import *
class ModelCase1(nn.Layer):
def __init__(self):
super(ModelCase1, self).__init__()
models = [SuperConv2D(3, 4, 3, bias_attr=False)]
models += [
SuperConv2D(
4,
4,
7,
candidate_config={
'expand_ratio': (0.5, 1.0),
'kernel_size': (3, 5, 7)
},
transform_kernel=True)
]
models += [SuperConv2D(4, 4, 3, groups=4)]
models += [SuperConv2D(4, 4, 3, groups=2)]
models += [SuperBatchNorm(4)]
models += [SuperConv2DTranspose(4, 4, 3, bias_attr=False)]
models += [
SuperConv2DTranspose(
4,
4,
7,
candidate_config={
'expand_ratio': (0.5, 1.0),
'kernel_size': (3, 5, 7)
},
transform_kernel=True)
]
models += [SuperConv2DTranspose(4, 4, 3, groups=4)]
models += [SuperInstanceNorm(4)]
models += [nn.Conv2DTranspose(4, 4, 3, groups=2)]
models += [SuperConv2DTranspose(4, 4, 3, groups=2)]
models += [
SuperSeparableConv2D(
4,
4,
1,
padding=1,
bias_attr=False,
candidate_config={'expand_ratio': (0.5, 1.0)}),
]
models += [
SuperSeparableConv2D(
4, 4, 1, padding=1, candidate_config={'channel': (2, 4)}),
]
self.models = paddle.nn.Sequential(*models)
def forward(self, inputs):
return self.models(inputs)
class ModelCase2(nn.Layer):
def __init__(self):
super(ModelCase2, self).__init__()
models = [
SuperEmbedding(
size=(64, 64), candidate_config={'expand_ratio': (0.5, 1.0)})
]
models += [
SuperLinear(
64, 64, candidate_config={'expand_ratio': (0.5, 1.0)})
]
models += [SuperLayerNorm(64)]
models += [SuperLinear(64, 64, candidate_config={'channel': (32, 64)})]
models += [
SuperLinear(
64, 64, bias_attr=False,
candidate_config={'channel': (32, 64)})
]
self.models = paddle.nn.Sequential(*models)
def forward(self, inputs):
return self.models(inputs)
class ModelCase3(nn.Layer):
def __init__(self):
super(ModelCase3, self).__init__()
self.conv1 = SuperConv2D(
3,
4,
7,
candidate_config={'kernel_size': (3, 5, 7)},
transform_kernel=True)
self.conv2 = SuperConv2DTranspose(
4,
4,
7,
candidate_config={'kernel_size': (3, 5, 7)},
transform_kernel=True)
def forward(self, inputs):
inputs = self.conv1(inputs, kernel_size=3)
inputs = self.conv2(inputs, kernel_size=3)
return inputs
class TestCase(unittest.TestCase):
def setUp(self):
self.model = ModelCase1()
data_np = np.random.random((1, 3, 64, 64)).astype(np.float32)
self.data = paddle.to_tensor(data_np)
def test_ofa(self):
ofa_model = OFA(self.model)
out = self.model(self.data)
class TestCase2(TestCase):
def setUp(self):
self.model = ModelCase2()
data_np = np.random.random((64, 64)).astype(np.int64)
self.data = paddle.to_tensor(data_np)
class TestCase3(TestCase):
def setUp(self):
self.model = ModelCase3()
data_np = np.random.random((1, 3, 64, 64)).astype(np.float32)
self.data = paddle.to_tensor(data_np)
if __name__ == '__main__':
unittest.main()
...@@ -20,40 +20,36 @@ import paddle ...@@ -20,40 +20,36 @@ import paddle
import paddle.nn as nn import paddle.nn as nn
from paddle.vision.models import mobilenet_v1 from paddle.vision.models import mobilenet_v1
from paddleslim.nas.ofa.convert_super import Convert, supernet from paddleslim.nas.ofa.convert_super import Convert, supernet
from paddleslim.nas.ofa.utils import compute_neuron_head_importance, reorder_head, reorder_neuron, set_state_dict from paddleslim.nas.ofa.utils import compute_neuron_head_importance, reorder_head, reorder_neuron, set_state_dict, dynabert_config
from paddleslim.nas.ofa import OFA
class TestModel(nn.Layer):
def __init__(self):
super(TestModel, self).__init__()
encoder_layer = nn.TransformerEncoderLayer(
312,
12,
1024,
dropout=0.1,
activation='gelu',
attn_dropout=0.1,
act_dropout=0)
self.encoder = nn.TransformerEncoder(encoder_layer, 3)
self.fc = nn.Linear(312, 3)
def forward(self, input_ids, segment_ids, attention_mask=[None, None]):
src = input_ids + segment_ids
out = self.encoder(src, attention_mask)
out = self.fc(out[:, 0])
return out
class TestComputeImportance(unittest.TestCase): class TestComputeImportance(unittest.TestCase):
def setUp(self): def setUp(self):
self.model = self.init_model() self.model = TestModel()
self.data_loader = self.init_data() self.data_loader = self.init_data()
def init_model(self):
class TestModel(nn.Layer):
def __init__(self):
super(TestModel, self).__init__()
encoder_layer = nn.TransformerEncoderLayer(
312,
12,
1024,
dropout=0.1,
activation='gelu',
attn_dropout=0.1,
act_dropout=0)
self.encoder = nn.TransformerEncoder(encoder_layer, 3)
self.fc = nn.Linear(312, 3)
def forward(self,
input_ids,
segment_ids,
attention_mask=[None, None]):
src = input_ids + segment_ids
out = self.encoder(src, attention_mask)
out = self.fc(out[:, 0])
return out
return TestModel()
def init_data(self): def init_data(self):
batch_size = 16 batch_size = 16
hidden_size = 312 hidden_size = 312
...@@ -67,8 +63,7 @@ class TestComputeImportance(unittest.TestCase): ...@@ -67,8 +63,7 @@ class TestComputeImportance(unittest.TestCase):
paddle.to_tensor(labels)), ) paddle.to_tensor(labels)), )
return data return data
def reorder_reorder_neuron_head(self, model, head_importance, def reorder_neuron_head(self, model, head_importance, neuron_importance):
neuron_importance):
# reorder heads and ffn neurons # reorder heads and ffn neurons
for layer, current_importance in enumerate(neuron_importance): for layer, current_importance in enumerate(neuron_importance):
# reorder heads # reorder heads
...@@ -89,8 +84,7 @@ class TestComputeImportance(unittest.TestCase): ...@@ -89,8 +84,7 @@ class TestComputeImportance(unittest.TestCase):
num_heads=12) num_heads=12)
assert (len(head_importance) == 3) assert (len(head_importance) == 3)
assert (len(neuron_importance) == 3) assert (len(neuron_importance) == 3)
self.reorder_reorder_neuron_head(self.model, head_importance, self.reorder_neuron_head(self.model, head_importance, neuron_importance)
neuron_importance)
class TestComputeImportanceCase1(TestComputeImportance): class TestComputeImportanceCase1(TestComputeImportance):
...@@ -125,5 +119,14 @@ class TestSetStateDict(unittest.TestCase): ...@@ -125,5 +119,14 @@ class TestSetStateDict(unittest.TestCase):
set_state_dict(sp_model, self.origin_weights) set_state_dict(sp_model, self.origin_weights)
class TestSpecialConfig(unittest.TestCase):
def test_dynabert(self):
self.model = TestModel()
sp_net_config = supernet(expand_ratio=[0.5, 1.0])
self.model = Convert(sp_net_config).convert(self.model)
ofa_model = OFA(self.model)
config = dynabert_config(ofa_model, 0.5)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()