Unverified commit 095a6f72, authored by ceci3, committed by GitHub

add ofa api docs (#576)

* add ofa api docs
Parent 77bfa3ad
@@ -231,27 +231,6 @@ def soft_cross_entropy(inp, target):
     return -1. * paddle.mean(paddle.sum(inp_likelihood * target_prob, axis=-1))
 
-### get certain config
-def apply_config(model, width_mult):
-    new_config = dict()
-
-    def fix_exp(idx):
-        if (idx - 3) % 6 == 0 or (idx - 5) % 6 == 0:
-            return True
-        return False
-
-    for idx, (block_k, block_v) in enumerate(model.layers.items()):
-        if len(block_v.keys()) != 0:
-            name, name_idx = block_k.split('_'), int(block_k.split('_')[1])
-            if fix_exp(name_idx) or 'emb' in block_k or idx == (
-                    len(model.layers.items()) - 2):
-                block_v['expand_ratio'] = 1.0
-            else:
-                block_v['expand_ratio'] = width_mult
-        new_config[block_k] = block_v
-    return new_config
-
 def convert_example(example,
                     tokenizer,
                     label_list,
@@ -487,7 +466,7 @@ def do_train(args):
             for width_mult in args.width_mult_list:
                 # Step8: Broadcast supernet config from width_mult,
                 # and use this config in supernet training.
-                net_config = apply_config(ofa_model, width_mult)
+                net_config = utils.dynabert_config(ofa_model, width_mult)
                 ofa_model.set_net_config(net_config)
                 logits, teacher_logits = ofa_model(
                     input_ids, segment_ids, attention_mask=[None, None])
...
@@ -20,8 +20,7 @@ import paddle.fluid.dygraph as FD
 import paddle.fluid.layers as L
 
-def compute_neuron_head_importance(args, model, tokenizer, dev_ds, place,
-                                   model_cfg):
+def compute_neuron_head_importance(args, model, dev_ds, place, model_cfg):
     n_layers, n_heads = model_cfg['num_hidden_layers'], model_cfg[
         'num_attention_heads']
     head_importance = L.zeros(shape=[n_layers, n_heads], dtype='float32')
...
@@ -18,6 +18,7 @@ from __future__ import print_function
 from __future__ import unicode_literals
 from __future__ import absolute_import
+import re
 import paddle.fluid as F
 import paddle.fluid.layers as L
 import paddle.fluid.dygraph as D
@@ -39,4 +40,4 @@ class AdamW(F.optimizer.AdamOptimizer):
         for p, g in params_grads:
             if not self.pat.match(p.name):
                 with D.no_grad():
-                    L.assign(p * (20 - self.wd * self.current_step_lr()), p)
+                    L.assign(p * (1. - self.wd * self.current_step_lr()), p)
@@ -49,31 +49,6 @@ def soft_cross_entropy(inp, target):
     return -1. * L.mean(L.reduce_sum(inp_likelihood * target_prob, dim=-1))
 
-### get certain config
-def apply_config(model, width_mult, depth_mult):
-    new_config = dict()
-
-    def fix_exp(idx):
-        if (idx - 3) % 6 == 0 or (idx - 5) % 6 == 0:
-            return True
-        return False
-
-    for idx, (block_k, block_v) in enumerate(model.layers.items()):
-        if isinstance(block_v, dict) and len(block_v.keys()) != 0:
-            name, name_idx = block_k.split('_'), int(block_k.split('_')[1])
-            if fix_exp(name_idx) or 'emb' in block_k or idx == (
-                    len(model.layers.items()) - 2):
-                block_v['expand_ratio'] = 1.0
-            else:
-                block_v['expand_ratio'] = width_mult
-        if block_k == 'depth':
-            block_v = depth_mult
-        new_config[block_k] = block_v
-    return new_config
-
 if __name__ == '__main__':
     parser = argparse.ArgumentParser('classify model with ERNIE')
     parser.add_argument(
@@ -93,7 +68,7 @@ if __name__ == '__main__':
         type=str,
         required=True,
         help='data directory includes train / develop data')
-    parser.add_argument('--task', type=str, default='mnli', help='task name')
+    parser.add_argument('--task', type=str, default='xnli', help='task name')
     parser.add_argument(
         '--use_lr_decay',
         action='store_true',
@@ -159,7 +134,7 @@ if __name__ == '__main__':
         '--width_mult_list',
         nargs='+',
         type=float,
-        default=[1.0, 0.75, 0.5, 0.5],
+        default=[1.0, 0.75, 0.5, 0.25],
         help="width mult in compress")
     parser.add_argument(
         '--depth_mult_list',
@@ -259,7 +234,7 @@ if __name__ == '__main__':
     ### suppose elastic width first
     if args.reorder_weight:
         head_importance, neuron_importance = compute_neuron_head_importance(
-            args, ofa_model.model, tokenizer, dev_ds, place, model_cfg)
+            args, ofa_model.model, dev_ds, place, model_cfg)
         reorder_neuron_head(ofa_model.model, head_importance,
                             neuron_importance)
     #################
@@ -304,7 +279,7 @@ if __name__ == '__main__':
         for depth_mult in depth_mult_list:
             for width_mult in args.width_mult_list:
-                net_config = apply_config(
+                net_config = utils.dynabert_config(
                     ofa_model, width_mult, depth_mult=depth_mult)
                 ofa_model.set_net_config(net_config)
@@ -380,7 +355,7 @@ if __name__ == '__main__':
                 if step % 100 == 0:
                     for depth_mult in depth_mult_list:
                         for width_mult in args.width_mult_list:
-                            net_config = apply_config(
+                            net_config = utils.dynabert_config(
                                 ofa_model, width_mult, depth_mult=depth_mult)
                             ofa_model.set_net_config(net_config)
...
docs/images/algo/ofa_bert.jpg (binary image replaced: 364.6 KB → 990.1 KB)
Convert SuperNet
================
Before Once-For-All training, a plain model must first be converted into a supernet built from dynamic OPs. While converting the plain network into a supernet, the conversion also turns the largest sub-network of the supernet into the largest network in the search space.

.. note::
  - If the kernel_size of an original convolution is 1, its kernel_size is left unchanged.
..

API reference
------------------
.. py:class:: paddleslim.nas.ofa.supernet(kernel_size=None, expand_ratio=None, channel=None)
`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/nas/ofa/convert_super.py#L643>`_

The search space is passed in as key-value pairs.

**Parameters:**
- **kernel_size(list|tuple, optional):** search space for the kernel_size of the Conv2D layers in the network.
- **expand_ratio(list|tuple, optional):** search space for the number of channels of Conv2D and for the output dimension of the Embedding and Linear parameters. The candidate channel number of each OP in the converted supernet is derived as a ratio of the channel number of the corresponding OP in the original model, so a single list is shared by all OPs. Set only one of this parameter and ``channel``.
- **channel(list(list)|tuple(tuple), optional):** search space for the number of channels of Conv2D and for the output dimension of the Embedding and Linear parameters. It directly sets the candidate channel numbers of each OP in the supernet, so its length must equal the total number of Conv2D, Embedding and Linear layers in the network. Set only one of this parameter and ``expand_ratio``.

**Returns:**
A supernet configuration.
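A short sketch (candidate values and layer counts are illustrative assumptions) contrasting the two ways of describing the channel search space; only one of ``expand_ratio`` and ``channel`` should be passed:

.. code-block:: python
from paddleslim.nas.ofa.convert_super import supernet

# candidates given as ratios of the original channel numbers, shared by all OPs
config_by_ratio = supernet(kernel_size=(3, 5, 7), expand_ratio=[0.5, 1.0])

# candidates listed per layer: one tuple for every Conv2D/Embedding/Linear in the model
config_by_channel = supernet(kernel_size=(3, 5, 7), channel=((4, 8, 16), (8, 16, 32)))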
.. py:class:: paddleslim.nas.ofa.Convert(context)
`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/nas/ofa/convert_super.py#L45>`_

Converts a plain network into a supernet according to the custom search space passed in.

**Parameters:**
- **context(paddleslim.nas.ofa.supernet):** the user-defined search space.

**Returns:**
A Convert instance.

.. py:method:: convert(network)

Performs the actual supernet conversion.

**Parameters:**
- **network(paddle.nn.Layer):** instance of the original model to be converted into a supernet.

**Returns:**
The converted supernet instance.
PaddleSlim provides three ways to build a supernet; they are described below.

Approach 1
------------------
Call the search-space definition API and the conversion API directly on an instantiated network. The advantage is that the network does not have to be redefined; the drawback is that only the whole network can be converted, not just part of it.

**Example:**

.. code-block:: python
from paddle.vision.models import mobilenet_v1
from paddleslim.nas.ofa.convert_super import Convert, supernet

model = mobilenet_v1()
sp_net_config = supernet(kernel_size=(3, 5, 7), expand_ratio=[1, 2, 4])
sp_model = Convert(sp_net_config).convert(model)
Approach 2
------------------
Convert with a context manager. The advantage is that only part of the network, or different parts with different search spaces, can be converted; the drawback is that the original network definition must be available and modified.

**Example:**

.. code-block:: python
import paddle
import paddle.nn as nn
from paddleslim.nas.ofa.convert_super import supernet

class Net(nn.Layer):
    def __init__(self):
        super(Net, self).__init__()
        models = []
        with supernet(kernel_size=(3, 5, 7), expand_ratio=(1, 2, 4)) as ofa_super:
            models += [nn.Conv2D(3, 4, 3, padding=1)]
            models += [nn.InstanceNorm2D(4)]
            models = ofa_super.convert(models)
        models += [nn.Conv2D(4, 4, 3, groups=4)]
        self.models = paddle.nn.Sequential(*models)

    def forward(self, inputs):
        return self.models(inputs)
Approach 3
------------------
Build the network directly from dynamic OPs, in the same way as a normal model. See `dynamic OPs <>`_ for the dynamic OPs supported by PaddleSlim. The advantage is more freedom in defining the network; the drawback is a more complex usage.

.. note::
  - The dynamic OPs in paddleslim.nas.ofa.layers are implemented against Paddle 2.0-beta and later; the dynamic OPs in paddleslim.nas.ofa.layers_old are implemented against versions before Paddle 2.0-beta.
  - The Block API adds the search space of the wrapped dynamic OP to the overall search space used during OFA training. Because the output part of the parameters of Conv2D, Embedding and Linear can be changed freely, the dynamic OPs corresponding to these three OPs need to be wrapped with Block. Norm-related dynamic OPs do not need to be wrapped, since their parameter sizes depend on the input size.
..

**Example:**

.. code-block:: python
import paddle
import paddle.nn as nn
from paddleslim.nas.ofa.layers import Block, SuperConv2D, SuperBatchNorm2D

class Net(nn.Layer):
    def __init__(self):
        super(Net, self).__init__()
        models = [Block(SuperConv2D(3, 16, 3, candidate_config={'kernel_size': (3, 5, 7), 'channel': (4, 8, 16)}))]
        models += [SuperBatchNorm2D(16)]
        self.models = paddle.nn.Sequential(*models)

    def forward(self, inputs):
        return self.models(inputs)
Once-For-All
============
Before Once-For-All training, a plain model must first be converted into a supernet built from dynamic OPs; see `Convert SuperNet <>`_ for how to do this.
Once-For-All training configuration
-----------------------------------
RunConfig
>>>>>>>>>
Run-time configuration and hyperparameters of the supernet, passed as a dict. This item is required if you want to train the supernet with the default ``Progressive Shrinking`` strategy from the paper; otherwise the current training stage can be set manually with ``paddleslim.nas.ofa.OFA().set_epoch(epoch)`` and ``paddleslim.nas.ofa.OFA().set_task(task, phase=None)``. Default: None.

**Parameters:**
- **train_batch_size(int, optional):** batch size used in training, used to compute the number of iterations per epoch. Default: None.
- **n_epochs(list, optional):** the epoch up to which each stage runs, used to decide which stage of supernet training the current epoch belongs to. Default: None.
- **total_images(int, optional):** number of images in the training set, used to compute the number of iterations per epoch. Default: None.
- **elastic_depth(list/tuple, optional):** if None, depth is not part of the search; otherwise the sampled configs contain depth. Changing the model depth must be handled in the model's forward; see the `example <>`_. Default: None.
- **dynamic_batch_size(list, optional):** how many sub-networks each batch of data is used to train in each stage; its shape must match that of n_epochs. Default: None.

**Returns:**
A training configuration.
**Example:**
.. code-block:: python
from paddleslim.nas.ofa import RunConfig
default_run_config = {
'train_batch_size': 1,
'n_epochs': [[1], [2, 3], [4, 5]],
'total_images': 12,
'elastic_depth': (5, 15, 24),
'dynamic_batch_size': [1, 1, 1],
}
run_config = RunConfig(**default_run_config)
DistillConfig
>>>>>>>>>
Configuration and hyperparameters of the distillation process, passed as a dict, if distillation is to be added during training. Default: None.

**Parameters:**
- **lambda_distill(float, optional):** scale factor of the distillation loss. Default: None.
- **teacher_model(instance of paddle.nn.Layer, optional):** instance of the teacher network. Default: None.
- **mapping_layers(list[str], optional):** names of the intermediate layers to which distillation is added, if intermediate-layer distillation is wanted. Default: None.
- **teacher_model_path(str, optional):** path to the pretrained weights of the teacher network. Default: None.
- **distill_fn(instance of paddle.nn.Layer, optional):** instance of a custom distillation loss; if None, mse_loss is used as the distillation loss. Default: None.
- **mapping_op(str, optional):** if the shapes of the teacher's and the student's intermediate layers differ when adding intermediate-layer distillation, an op is added to the student's intermediate layer so that both shapes match when the distillation loss is computed. Valid values are ``["conv", "linear", None]``: 'conv' adds a Conv2D, 'linear' adds a Linear, None adds nothing. If extra ops are added this way, their parameters can be obtained through ``paddleslim.nas.ofa.OFA().netAs_param`` and should be added to the optimizer's parameter list (see the sketch after the example below). Default: None.

**Returns:**
A distillation configuration.
**Example:**
.. code-block:: python
from paddleslim.nas.ofa import DistillConfig
default_distill_config = {
'lambda_distill': 0.01,
'teacher_model': teacher_model,
'mapping_layers': ['models.0.fn'],
'teacher_model_path': None,
'distill_fn': None,
'mapping_op': 'conv2d'
}
distill_config = DistillConfig(**default_distill_config)
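A hedged sketch (the optimizer choice and learning rate are assumptions) of the point made for ``mapping_op`` above: when extra mapping ops are inserted for distillation, fetch their parameters through ``netAs_param`` and hand them to the optimizer together with the model parameters:

.. code-block:: python
import paddle

# ofa_model is an OFA instance built with the DistillConfig above
params = ofa_model.model.parameters() + ofa_model.netAs_param
optimizer = paddle.optimizer.Adam(learning_rate=2e-5, parameters=params)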
OFA
------------------
Trains the supernet in the Once-For-All fashion. The `Once-For-All paper <>`_ proposes the ``Progressive Shrinking`` training strategy: training proceeds in stages in the order ``elastic kernel_size``, ``elastic width``, ``elastic depth``, and the search space is enlarged step by step. For example, with a search space of ``kernel_size=(3,5,7), expand_ratio=(0.5, 1.0, 2.0), depth=(0.5, 0.75, 1.0)``, the kernel size is trained dynamically first, in two phases: in the first phase the kernel_size search space is ``[5, 7]``, in the second it is ``[3, 5, 7]``. Dynamic training of expand_ratio is then added in the same way, again in two phases: first ``[1.0, 2.0]``, then ``[0.5, 1.0, 2.0]``. Finally depth is trained dynamically, with the same phasing as kernel_size.
.. py:class:: paddleslim.nas.ofa.OFA(model, run_config=None, distill_config=None, elastic_order=None, train_full=False)
`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/nas/ofa/ofa.py#L91>`_

**Parameters:**
- **model(paddle.nn.Layer):** the supernet instance whose training is converted to the default strategy recommended in the Once-For-All paper.
- **run_config(paddleslim.ofa.RunConfig, optional):** run-time configuration of the model. Default: None.
- **distill_config(paddleslim.ofa.DistillConfig, optional):** distillation-related configuration if distillation is added during training; see `DistillConfig <>`_ for the configurable options. If None, no distillation is added. Default: None.
- **elastic_order(list, optional):** training order; if None, the default ``Progressive Shrinking`` schedule is used. Default: None.
- **train_full(bool, optional):** whether to train only the largest sub-network of the supernet. Default: False.

**Returns:**
An OFA instance.
**Example:**

.. code-block:: python
from paddleslim.nas.ofa import OFA
ofa_model = OFA(model)
..
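A minimal end-to-end sketch (the model and all hyperparameter values are assumptions) showing how the ``RunConfig`` and ``DistillConfig`` described above plug into ``OFA``:

.. code-block:: python
from paddle.vision.models import mobilenet_v1
from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig
from paddleslim.nas.ofa.convert_super import Convert, supernet

teacher_model = mobilenet_v1()   # in practice, load pretrained weights here
model = Convert(supernet(expand_ratio=[0.5, 1.0])).convert(mobilenet_v1())

run_config = RunConfig(**{
    'train_batch_size': 64,
    'n_epochs': [[1], [2, 3]],
    'dynamic_batch_size': [1, 1],
    'total_images': 1281167})
distill_config = DistillConfig(**{'teacher_model': teacher_model})

ofa_model = OFA(model, run_config=run_config, distill_config=distill_config)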
.. py:method:: set_epoch(epoch)
Manually set the epoch that OFA training is currently in.

**Parameters:**
- **epoch(int):** the current epoch of model training.

**Returns:**
None

**Example:**
.. code-block:: python
ofa_model.set_epoch(3)
.. py:method:: set_task(task, phase=None)
Manually set the stage that OFA supernet training is currently in.

**Parameters:**
- **task(list(str)|str):** name(s) of the task(s) currently being trained; one of ``"kernel_size", "width", "depth"``.
- **phase(int, optional):** phase of the current task. A phase refers to the step-by-step enlargement of each task's search space in ``Progressive Shrinking``; different phases correspond to search spaces of different sizes. If None, the whole search space of the task is used. Default: None.

**Returns:**
None

**Example:**
.. code-block:: python
ofa_model.set_task('width')
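A hedged sketch (the epoch boundaries and ``num_epochs`` are assumptions) of driving the stages manually with ``set_epoch`` and ``set_task`` instead of relying on the default ``Progressive Shrinking`` schedule:

.. code-block:: python
for epoch in range(num_epochs):
    ofa_model.set_epoch(epoch)
    if epoch < 10:
        ofa_model.set_task('kernel_size')
    elif epoch < 20:
        ofa_model.set_task('width', phase=0)   # smaller width search space first
    else:
        ofa_model.set_task('width', phase=1)   # then the full width search space
    # ... run one epoch of supernet training here ...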
.. py:method:: set_net_config(config)
Manually select the sub-network with the given configuration; used when training one or a few specific sub-networks of the supernet.

**Parameters:**
- **config(dict):** per-layer training configuration of the sub-network.

**Returns:**
None

**Example:**
.. code-block:: python
config = ofa_model.current_config
ofa_model.set_net_config(config)
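Besides ``current_config``, a per-layer config can also be built with the DynaBERT helper used in the tutorials below (the width multiplier here is an assumption):

.. code-block:: python
from paddleslim.nas.ofa.utils import dynabert_config

net_config = dynabert_config(ofa_model, 0.75)   # keep 75% of the original width
ofa_model.set_net_config(net_config)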
.. py:method:: calc_distill_loss()
If intermediate-layer distillation is used during OFA training, call this method to get the intermediate distillation loss.

**Returns:**
The intermediate-layer distillation loss.

**Example:**
.. code-block:: python
distill_loss = ofa_model.calc_distill_loss()
.. py:method:: search()
### TODO
.. py:method:: export(config)
Export the parameters of the sub-network described by the given configuration.

**Parameters:**
- **config(dict):** per-layer configuration of the sub-network.

**Returns:**
TODO

**Example:**
TODO
SuperOP
========
PaddleSlim provides dynamic versions of some APIs. 'Dynamic' means that the parameter sizes of these OPs can change at run time according to the arguments passed in; in terms of usage, the difference is that some extra run-time arguments are passed to forward. `layers_old.py <>`_ corresponds to the Paddle 2.0-alpha and earlier API, and `layers.py <>`_ corresponds to the API after Paddle 2.0-alpha.
.. py:class:: paddleslim.nas.ofa.layers.Block(fn, fixed=False, key=None)
`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L64>`_

Wraps a layer; the wrapped layer is used like a normal layer. It merges the search space defined for each layer into one overall search space, from which a per-layer choice is made during training. Only APIs whose parameter sizes can be changed actively at run time need to be wrapped, i.e. only layers built from ``Conv2D``, ``Linear`` and ``Embedding`` may need to be wrapped.

**Parameters:**
- **fn(paddle.nn.Layer):** instance of the layer to be wrapped.
- **fixed(bool, optional):** whether the parameter shape of this layer stays unchanged during OFA training; False means it is searched normally, True means its parameter shape is kept fixed during OFA training. Default: False.
- **key(string, optional):** name of this layer in the overall search space. Default: None.

**Returns:**
A Block instance.

**Example:**
.. code-block:: python
from paddleslim.nas.ofa.layers import Block
block_layer = Block(SuperConv2D(3, 4, 3, candidate_config={'kernel_size': (3, 5, 7)}))
.. py:class:: paddleslim.nas.ofa.layers.SuperConv2D(in_channels, out_channels, kernel_size, candidate_config={}, transform_kernel=False, stride=1, padding=0, dilation=1, groups=1, padding_mode='zeros', weight_attr=None, bias_attr=None, data_format='NCHW')
`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L85>`_

Constructs a callable object of the SuperConv2D class.

**Parameters:**
- **in_channels** (int): number of channels of the input image.
- **out_channels** (int): number of channels produced by the convolution.
- **kernel_size** (int): kernel size. It can be a single integer or a tuple/list of two integers giving the height and width of the kernel; a single integer means the height and the width are both equal to it.
- **candidate_config** (dict, optional): search space of this convolution, passed as a dict whose valid keys are ``kernel_size``, ``expand_ratio`` and ``channel``; ``expand_ratio`` and ``channel`` have the same meaning (both search the number of channels) and cannot be set at the same time. Default: {}.
- **transform_kernel** (bool, optional): whether to use a transformation matrix to turn large kernels into small kernels. Default: False.
- **stride** (int|list|tuple, optional): stride. A single integer or a tuple/list of two integers for the stride along height and width; a single integer means both strides are equal to it. Default: 1.
- **padding** (int|list|tuple|str, optional): padding. If it is a string it can be "VALID" or "SAME", selecting the padding algorithm. If it is a tuple or list it can take three forms: (1) four pairs, i.e. [[0,0], [0,0], [padding_height_top, padding_height_bottom], [padding_width_left, padding_width_right]] when ``data_format`` is "NCHW", or [[0,0], [padding_height_top, padding_height_bottom], [padding_width_left, padding_width_right], [0,0]] when ``data_format`` is "NHWC"; (2) four integers, [padding_height_top, padding_height_bottom, padding_width_left, padding_width_right]; (3) two integers, [padding_height, padding_width], meaning top = bottom = padding_height and left = right = padding_width. A single integer means padding_height = padding_width = padding. Default: 0.
- **dilation** (int|list|tuple, optional): dilation. A single integer or a tuple/list of two integers for the dilation along height and width; a single integer means both are equal to it. Default: 1.
- **groups** (int, optional): number of groups of the 2D convolution, following the grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=n, the input and the kernels are split evenly into n groups along the channel dimension, and the i-th group of kernels is convolved with the i-th group of inputs. Default: 1.
- **padding_mode** (str, optional): padding mode, one of ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``.
- **weight_attr** (ParamAttr, optional): attribute object of the weight parameter. None means the default weight attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **bias_attr** (ParamAttr|bool, optional): attribute object of the bias parameter. As a bool only False is supported, meaning no bias. None means the default bias attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **data_format** (str, optional): data format of the input; the output uses the same format. "NCHW" or "NHWC", where N is the batch size, C the number of channels, H the height and W the width. Default: "NCHW".
.. py:method:: forward(input, kernel_size=None, expand_ratio=None, channel=None)
**Parameters:**
- **input** (Tensor): the actual input.
- **kernel_size** (int, optional): kernel size used in this call; None means the kernel size defined at construction. Default: None.
- **expand_ratio** (int|float, optional): expansion ratio of the number of output channels in this call; None means the channel number defined at construction. This parameter and ``channel`` cannot both be set. Default: None.
- **channel** (int, optional): number of output channels used in this call; None means the channel number defined at construction. This parameter and ``expand_ratio`` cannot both be set. Default: None.

**Example:**
.. code-block:: python
import paddle
from paddleslim.nas.ofa.layers import SuperConv2D
import numpy as np
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
super_conv2d = SuperConv2D(3, 10, 3)
config = {'channel': 5}
data = paddle.to_tensor(data)
conv = super_conv2d(data, **config)
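A further hedged sketch (shapes and candidate values are assumptions) of the elastic-kernel behaviour enabled by ``transform_kernel``: the largest kernel is defined at construction and forward selects a smaller one at run time.

.. code-block:: python
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperConv2D

x = paddle.to_tensor(np.random.uniform(-1, 1, [2, 3, 32, 32]).astype('float32'))
conv = SuperConv2D(3, 8, 7, candidate_config={'kernel_size': (3, 5, 7)}, transform_kernel=True)
out = conv(x, kernel_size=3)  # run with a 3x3 kernel derived from the 7x7 weight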
.. py:class:: paddleslim.nas.ofa.layers.SuperConv2DTranspose(in_channels, out_channels, kernel_size, candidate_config={}, transform_kernel=False, stride=1, padding=0, output_padding=0, dilation=1, groups=1, padding_mode='zeros', weight_attr=None, bias_attr=None, data_format='NCHW')
`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L381>`_

Constructs a callable object of the SuperConv2DTranspose class.

**Parameters:**
- **in_channels** (int): number of channels of the input image.
- **out_channels** (int): number of kernels, equal to the number of channels of the output feature map.
- **kernel_size** (int|list|tuple): kernel size. It can be a single integer or a tuple/list of two integers giving the height and width of the kernel; a single integer means both are equal to it.
- **candidate_config** (dict, optional): search space of this transposed convolution, passed as a dict whose valid keys are ``kernel_size``, ``expand_ratio`` and ``channel``; ``expand_ratio`` and ``channel`` have the same meaning (both search the number of channels) and cannot be set at the same time. Default: {}.
- **transform_kernel** (bool, optional): whether to use a transformation matrix to turn large kernels into small kernels. Default: False.
- **stride** (int|tuple, optional): stride. If it is a tuple or list it must contain two integers, the vertical and horizontal stride; otherwise both strides equal ``stride``. Default: 1.
- **padding** (int|tuple, optional): padding. If it is a tuple or list it must contain two integers, the vertical and horizontal padding; otherwise both paddings equal ``padding``. If it is a string, it can be "VALID" or "SAME", selecting the padding algorithm. Default: 0.
- **output_padding** (int|list|tuple, optional): extra size added to one side of the output shape. Default: 0.
- **groups** (int, optional): number of groups of the 2D convolution, following the grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the kernels is connected only to the first half of the input feature map, and the second half of the kernels only to the second half. Default: 1.
- **dilation** (int|tuple, optional): dilation. A single integer or a tuple/list of two integers for the dilation along height and width; a single integer means both are equal to it. Default: 1.
- **weight_attr** (ParamAttr, optional): attribute object of the weight parameter. None means the default weight attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **bias_attr** (ParamAttr|bool, optional): attribute object of the bias parameter. None means the default bias attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **data_format** (str, optional): data format of the input; the output uses the same format. "NCHW" or "NHWC". Default: "NCHW".
.. py:method:: forward(input, kernel_size=None, expand_ratio=None, channel=None)
**Parameters:**
- **input** (Tensor): the actual input.
- **kernel_size** (int, optional): kernel size used in this call; None means the kernel size defined at construction. Default: None.
- **expand_ratio** (int|float, optional): expansion ratio of the number of output channels in this call; None means the channel number defined at construction. This parameter and ``channel`` cannot both be set. Default: None.
- **channel** (int, optional): number of output channels used in this call; None means the channel number defined at construction. This parameter and ``expand_ratio`` cannot both be set. Default: None.

**Example:**
.. code-block:: python
import paddle
from paddleslim.nas.ofa.layers import SuperConv2DTranspose
import numpy as np
data = np.random.uniform(-1, 1, [10, 32, 32, 32]).astype('float32')
config = {'channel': 5}
data = paddle.to_tensor(data)
super_convtranspose = SuperConv2DTranspose(32, 10, 3)
ret = super_convtranspose(data, **config)
.. py:class:: paddleslim.nas.ofa.layers.SuperLinear(in_features, out_features, candidate_config={}, weight_attr=None, bias_attr=None, name=None)

`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L828>`_

Constructs a callable object of the SuperLinear class.

**Parameters:**
- **in_features** (int): size of the input of the linear transformation.
- **out_features** (int): size of the output of the linear transformation.
- **candidate_config** (dict, optional): search space of this Linear layer, passed as a dict whose valid keys are ``expand_ratio`` and ``channel``; the two have the same meaning (both search the output dimension) and cannot be set at the same time. Default: {}.
- **weight_attr** (ParamAttr, optional): attribute object of the weight parameter. None means the default weight attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **bias_attr** (ParamAttr|bool, optional): attribute object of the bias parameter. If it is a bool, False means no bias is added and True means the default bias attribute (bias initialized to 0) is used. None means the default bias attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **name** (string, optional): name of the Linear layer. Default: None. See :ref:`api_guide_Name`.
.. py:method:: forward(input, expand_ratio=None, channel=None)
**Parameters:**
- **input** (Tensor): the actual input.
- **expand_ratio** (int|float, optional): expansion ratio of the output dimension in this call; None means the output dimension defined at construction. This parameter and ``channel`` cannot both be set. Default: None.
- **channel** (int, optional): output dimension used in this call; None means the output dimension defined at construction. This parameter and ``expand_ratio`` cannot both be set. Default: None.

**Example:**
.. code-block:: python
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperLinear
data = np.random.uniform(-1, 1, [32, 64] ).astype('float32')
config = {'channel': 16}
linear = SuperLinear(64, 64)
data = paddle.to_tensor(data)
res = linear(data, **config)
.. py:class:: paddleslim.nas.ofa.layers.SuperEmbedding(num_embeddings, embedding_dim, candidate_config={}, padding_idx=None, sparse=False, weight_attr=None, name=None)

`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L1126>`_

Constructs a callable object of the SuperEmbedding class.

**Parameters:**
- **num_embeddings** (int): size of the embedding dictionary (vocabulary size).
- **embedding_dim** (int): dimension of each embedding vector.
- **candidate_config** (dict, optional): search space of this Embedding layer, passed as a dict whose valid keys are ``expand_ratio`` and ``channel``; the two have the same meaning (both search the embedding dimension) and cannot be set at the same time. Default: {}.
- **padding_idx** (int|long|None): must lie in [-vocab_size, vocab_size) to take effect; a negative padding_idx is converted to vocab_size + padding_idx. The embedding of ids equal to padding_idx is set to 0 and is not updated during training. None means no padding is applied. Default: None.
- **sparse** (bool): whether to use sparse updates. This only affects the performance of the backward gradient update; sparse updates are faster and recommended, but some optimizers do not support them (for example :ref:`cn_api_fluid_optimizer_AdadeltaOptimizer`, :ref:`cn_api_fluid_optimizer_AdamaxOptimizer`, :ref:`cn_api_fluid_optimizer_DecayedAdagradOptimizer`, :ref:`cn_api_fluid_optimizer_FtrlOptimizer`, :ref:`cn_api_fluid_optimizer_LambOptimizer`, :ref:`cn_api_fluid_optimizer_LarsMomentumOptimizer`), in which case sparse must be False. Default: False.
- **weight_attr** (ParamAttr): attribute object of the weight parameter. None means the default weight attribute is used. See :ref:`cn_api_fluid_ParamAttr`. ``weight_attr`` can also be used to load custom or pretrained word vectors: convert the local word vectors to numpy format with a shape consistent with ``num_embeddings`` and ``embedding_dim``, and initialize them with :ref:`cn_api_fluid_initializer_NumpyArrayInitializer`.
- **name** (string, optional): name of the Embedding layer. Default: None. See :ref:`api_guide_Name`.
.. py:method:: forward(input, expand_ratio=None, channel=None)

**Parameters:**
- **input** (Tensor): the actual input (integer ids).
- **expand_ratio** (int|float, optional): expansion ratio of the embedding dimension in this call; None means the dimension defined at construction. This parameter and ``channel`` cannot both be set. Default: None.
- **channel** (int, optional): embedding dimension used in this call; None means the dimension defined at construction. This parameter and ``expand_ratio`` cannot both be set. Default: None.

**Example:**
.. code-block:: python
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperEmbedding
data = np.random.randint(0, 32, size=(2, 8)).astype('int64')
config = {'channel': 16}
emb = SuperEmbedding(32, 64)
data = paddle.to_tensor(data)
res = emb(data, **config)
.. py:class:: paddleslim.nas.ofa.layers.SuperBatchNorm2D(num_features, momentum=0.9, epsilon=1e-05, weight_attr=None, bias_attr=None, data_format='NCHW', name=None)

`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L932>`_

Constructs a callable object of the SuperBatchNorm2D class.

**Parameters:**
- **num_features** (int): number of channels of the input ``Tensor``.
- **epsilon** (float, optional): value added to the denominator for numerical stability. Default: 1e-05.
- **momentum** (float, optional): value used to compute ``moving_mean`` and ``moving_var``. Default: 0.9.
- **weight_attr** (ParamAttr|bool, optional): attribute object of the scale parameter. False means the per-channel scale is fixed to 1 and cannot change. None means the default weight attribute is used. See :ref:`cn_api_ParamAttr`.
- **bias_attr** (ParamAttr, optional): attribute object of the bias parameter. False means the per-channel shift is fixed to 0 and cannot change. None means the default bias attribute is used. See :ref:`cn_api_ParamAttr`.
- **data_format** (string, optional): data format of the input; only "NCHW" is supported. Default: "NCHW".
- **name** (string, optional): name of the BatchNorm layer. Default: None. See :ref:`api_guide_Name`.

**Example:**
.. code-block:: python
import paddle
import numpy as np
from paddleslim.nas.ofa.layers import SuperBatchNorm2D
np.random.seed(123)
x_data = np.random.random(size=(2, 5, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
batch_norm = SuperBatchNorm2D(5)
batch_norm_out = batch_norm(x)
.. py:class:: paddleslim.nas.ofa.layers.SuperInstanceNorm2D(num_features, momentum=0.9, epsilon=1e-05, weight_attr=None, bias_attr=None, data_format='NCHW', name=None)

`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L999>`_

Constructs a callable object of the SuperInstanceNorm2D class.

**Parameters:**
- **num_features** (int): number of channels of the input ``Tensor``.
- **epsilon** (float, optional): value added to the denominator for numerical stability. Default: 1e-05.
- **momentum** (float, optional): currently has no effect for ``InstanceNorm2D`` and does not need to be set. Default: 0.9.
- **weight_attr** (ParamAttr|bool, optional): attribute object of the scale parameter. False means the per-channel scale is fixed to 1 and cannot change. None means the default weight attribute is used. See :ref:`cn_api_ParamAttr`.
- **bias_attr** (ParamAttr, optional): attribute object of the bias parameter. False means the per-channel shift is fixed to 0 and cannot change. None means the default bias attribute is used. See :ref:`cn_api_ParamAttr`.
- **data_format** (string, optional): data format of the input; only "NCHW" is supported. Default: "NCHW".
- **name** (string, optional): name of the InstanceNorm layer. Default: None. See :ref:`api_guide_Name`.

**Example:**
.. code-block:: python
import paddle
import numpy as np
from paddleslim.nas.ofa.layers import SuperInstanceNorm2D
np.random.seed(123)
x_data = np.random.random(size=(2, 5, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
instance_norm = SuperInstanceNorm2D(5)
out = instance_norm(x)
.. py:class:: paddleslim.nas.ofa.layers.SuperLayerNorm(normalized_shape, epsilon=1e-05, weight_attr=None, bias_attr=None, name=None)

`Source code <https://github.com/PaddlePaddle/PaddleSlim/blob/74db974b6f0187e22bbaf340381a63b7d687a7d4/paddleslim/nas/ofa/layers.py#L1057>`_

Constructs a callable object of the SuperLayerNorm class.

**Parameters:**
- **normalized_shape** (int|list|tuple): the shape to normalize over; the expected input shape is ``[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]``. If it is a single integer, normalization is performed over the last dimension, which must equal this value.
- **epsilon** (float, optional): small value added to the variance to avoid division by zero. Default: 1e-05.
- **weight_attr** (ParamAttr|bool, optional): attribute object of the scale parameter. False means the scale is fixed to 1 and not learned. None means the default weight attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **bias_attr** (ParamAttr, optional): attribute object of the bias parameter. False means the bias is fixed to 0 and not learned. None means the default bias attribute is used. See :ref:`cn_api_fluid_ParamAttr`.
- **name** (string, optional): name of the LayerNorm layer. Default: None. See :ref:`api_guide_Name`.

**Example:**
.. code-block:: python
import paddle
import numpy as np
from paddleslim.nas.ofa.layers import SuperLayerNorm
np.random.seed(123)
x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
layer_norm = SuperLayerNorm(x_data.shape[1:])
layer_norm_out = layer_norm(x)
# TinyERNIE model compression tutorial

1. This tutorial explains the principle behind compressing the TinyERNIE model, and uses the TinyERNIE model from the ERNIE repo as an example to show how to quickly migrate the whole compression pipeline to other NLP models.

2. The tutorial uses the training strategy from [DynaBERT-Dynamic BERT with Adaptive Width and Depth](https://arxiv.org/abs/2004.04037). The original model is taken as the largest sub-model of the supernet; it consists of several Transformer Blocks of the same size. Before each training step, the sub-model to train in the current round is selected. Every sub-model consists of several Sub Transformer Blocks of the same size, each obtained by selecting a different width of a Transformer Block (which contains one Multi-Head Attention and one Feed-Forward Network), as follows (a tiny worked example is given after this list):<br/>
&emsp;&emsp;a. A Multi-Head Attention layer contains multiple heads. When a narrower sub-model is selected, the number of heads is reduced proportionally. For example, if the original model has 12 heads and the selected sub-model has 75% of the original width, every Transformer Block uses 9 heads in this round.<br/>
&emsp;&emsp;b. The Linear parameters in the Feed-Forward Network are reduced proportionally. For example, if the FFN feature dimension of the original model is 3072 and the selected sub-model has 75% of the original width, the FFN feature dimension of every Transformer Block is 2304 in this round.
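A tiny worked illustration of this scaling rule (the original sizes are the ones quoted above):

```python
num_heads, ffn_dim = 12, 3072        # sizes of the original model
width_mult = 0.75                    # width chosen for this training round
print(int(num_heads * width_mult))   # 9 heads per Multi-Head Attention
print(int(ffn_dim * width_mult))     # 2304 FFN feature dimension
```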
## Overview of the approach

1. First, the parameters and heads of the pretrained model are reordered by importance, so that the important parameters and heads sit at the front and are not removed when parameters are pruned during training. Parameter importance is computed by running the dev data once to obtain the gradient of every parameter and then combining the gradient with the overall magnitude of the parameter. Head importance is computed by feeding an all-ones mask over the heads, computing the gradient of that mask, and using the gradient to judge the importance of every head in every Multi-Head Attention layer (a sketch follows this list).
2. The original pretrained model is used as the teacher network during distillation. A supernet is defined whose largest sub-network has the same structure as the teacher; the other, smaller sub-networks are obtained from the largest network by selecting different widths, where selecting a width means pruning the parameters of the network. All sub-networks share parameters throughout training.
3. The supernet is initialized with the reordered pretrained parameters and used as the student network. Distillation losses are added for the embedding layer, for every transformer block, and for the final logits.
4. Before each batch is trained, the configuration of the sub-network to train is selected (currently the configuration only covers the width of the whole model); during the parameter update, only the parameters used by the current sub-network are updated.
5. The whole supernet is optimized in this way; after training, a sub-model that meets the speed-up and accuracy requirements is selected.
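A highly simplified sketch (PaddlePaddle 2.x style; `model`, `dev_loader` and the way the mask enters the attention computation are assumptions, not the repo's actual implementation) of the head-importance idea in step 1:

```python
import numpy as np
import paddle
import paddle.nn.functional as F

n_layers, n_heads = 12, 12
head_mask = paddle.ones([n_layers, n_heads])
head_mask.stop_gradient = False

head_importance = np.zeros([n_layers, n_heads], dtype='float32')
for input_ids, segment_ids, labels in dev_loader():
    # model is assumed to be patched so that attention weights are multiplied by head_mask
    logits = model(input_ids, segment_ids, head_mask=head_mask)
    loss = F.cross_entropy(logits, labels)
    loss.backward()
    head_importance += np.abs(head_mask.gradient())  # accumulate |dL/dmask| per head
    head_mask.clear_gradient()
```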
<p align="center">
<img src="../../images/algo/ofa_bert.jpg" width="950"/><br />
Overall pipeline
</p>
## Compression based on the ERNIE repo

This tutorial requires PaddleSlim 2.0 or later, Paddle 1.8.5 and ERNIE 0.0.4dev or later; please make sure Paddle, PaddleSlim and ERNIE are installed correctly.

For the complete example based on TinyERNIE in the ERNIE repo, see [TinyERNIE](../../../demo/ofa/ernie/README.md).

### 1. Define the initial network
Define the original TinyERNIE model and a dict that keeps the original parameters. After a plain model is converted into a supernet, the change of its OPs invalidates the parameters loaded into the original model, so a dict holding the original parameters is needed to initialize the supernet. Set the 'return_additional_info' attribute to True so that intermediate results are returned, which makes it easy to add distillation.
```python
model = ErnieModelForSequenceClassification.from_pretrained(args.from_pretrained, num_labels=3, name='')
setattr(model, 'return_additional_info', True)
origin_weights = {}
for name, param in model.named_parameters():
    origin_weights[name] = param
```
### 2. Build the supernet
Define the search space and convert the plain network into a supernet accordingly.
```python
# define the search space
sp_config = supernet(expand_ratio=[0.25, 0.5, 0.75, 1.0])
# convert the model into a supernet
model = Convert(sp_config).convert(model)
paddleslim.nas.ofa.utils.set_state_dict(model, origin_weights)
```
### 3. Define the teacher network
Construct the teacher network directly from the pretrained ERNIE model API. Set the 'return_additional_info' attribute to True so that intermediate results are returned for distillation.
```python
teacher_model = ErnieModelForSequenceClassification.from_pretrained(args.from_pretrained, num_labels=3, name='teacher')
setattr(teacher_model, 'return_additional_info', True)
```
### 4. Configure the distillation parameters
The parameter that needs to be configured is the teacher model instance. The TinyERNIE model returns the hidden layers and the Embedding output, so these return values are used directly for distillation.
```python
default_distill_config = {
'teacher_model': teacher_model
}
distill_config = DistillConfig(**default_distill_config)
```
### 5. Define the Once-For-All model
Pass the plain model and the distillation configuration to the OFA API; it automatically adds the distillation process and switches supernet training to the OFA scheme.
```python
ofa_model = paddleslim.nas.ofa.OFA(model, distill_config=distill_config)
```
### 6. Compute the importance of neurons and heads and reorder the parameters accordingly
The importance computation implemented on Paddle 1.8.5 is in [importance.py](../../../demo/ofa/ernie/ernie_supernet/importance.py).
```python
head_importance, neuron_importance = compute_neuron_head_importance(
args,
ofa_model.model,
dev_ds,
place,
model_cfg)
reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)
```
### 7. Set the current stage of OFA training
```python
ofa_model.set_epoch(epoch)
ofa_model.set_task('width')
```
### 8. Pass in the network configuration and start training
This example trains the supernet in the DynaBERT way.
```python
width_mult_list = [1.0, 0.75, 0.5, 0.25]
lambda_logit = 0.1
# dygraph models before paddle 2.0rc1 do not accumulate gradients automatically,
# so keep a dict that stores and accumulates the gradient of every parameter
accumulate_gradients = dict()
for param in opt._parameter_list:
    accumulate_gradients[param.name] = 0.0

for width_mult in width_mult_list:
    net_config = paddleslim.nas.ofa.utils.dynabert_config(ofa_model, width_mult)
    ofa_model.set_net_config(net_config)
    student_output, teacher_output = ofa_model(ids, sids, labels=label,
                                               num_layers=model_cfg['num_hidden_layers'])
    loss, student_logit, student_reps = student_output[
        0], student_output[1], student_output[2]['hiddens']
    teacher_logit, teacher_reps = teacher_output[
        1], teacher_output[2]['hiddens']

    logit_loss = soft_cross_entropy(student_logit, teacher_logit.detach())
    rep_loss = 0.0
    for stu_rep, tea_rep in zip(student_reps, teacher_reps):
        tmp_loss = L.mse_loss(stu_rep, tea_rep.detach())
        rep_loss += tmp_loss
    loss = rep_loss + lambda_logit * logit_loss
    loss.backward()
    param_grads = opt.backward(loss)
    # accumulate the gradients
    for param in opt._parameter_list:
        accumulate_gradients[param.name] += param.gradient()

# update the model with the accumulated gradients
for k, v in param_grads:
    assert k.name in accumulate_gradients.keys(
    ), "{} not in accumulate_gradients".format(k.name)
    v.set_value(accumulate_gradients[k.name])
opt.apply_optimize(
    loss, startup_program=None, params_grads=param_grads)
ofa_model.model.clear_gradients()
```
---
**NOTE**
Because an all-ones mask is used to collect gradients when computing head importance, the forward functions of some related TinyERNIE classes need to be re-implemented via monkey patching. See [modeling_ernie_supernet.py](../../../demo/ofa/ernie/ernie_supernet/modeling_ernie_supernet.py) for the concrete forward implementations.
---
# BERT model compression tutorial

1. This tutorial explains the principle behind compressing the BERT model, and uses the BERT-base model from the PaddleNLP repo as an example to show how to quickly migrate the whole compression pipeline to other NLP models.

2. The tutorial uses the training strategy from [DynaBERT-Dynamic BERT with Adaptive Width and Depth](https://arxiv.org/abs/2004.04037). The original model is taken as the largest sub-model of the supernet; it consists of several Transformer Blocks of the same size. Before each training step, the sub-model to train in the current round is selected. Every sub-model consists of several Sub Transformer Blocks of the same size, each obtained by selecting a different width of a Transformer Block (which contains one Multi-Head Attention and one Feed-Forward Network), as follows:<br/>
&emsp;&emsp;a. A Multi-Head Attention layer contains multiple heads. When a narrower sub-model is selected, the number of heads is reduced proportionally. For example, if the original model has 12 heads and the selected sub-model has 75% of the original width, every Transformer Block uses 9 heads in this round.<br/>
&emsp;&emsp;b. The Linear parameters in the Feed-Forward Network are reduced proportionally. For example, if the FFN feature dimension of the original model is 3072 and the selected sub-model has 75% of the original width, the FFN feature dimension of every Transformer Block is 2304 in this round.

## Overview of the approach

1. First, the parameters and heads of the pretrained model are reordered by importance, so that the important parameters and heads sit at the front and are not removed when parameters are pruned during training. Parameter importance is computed by running the dev data once to obtain the gradient of every parameter and then combining the gradient with the overall magnitude of the parameter. Head importance is computed by feeding an all-ones mask over the heads, computing the gradient of that mask, and using the gradient to judge the importance of every head in every Multi-Head Attention layer.
2. The original pretrained model is used as the teacher network during distillation. A supernet is defined whose largest sub-network has the same structure as the teacher; the other, smaller sub-networks are obtained from the largest network by selecting different widths, where selecting a width means pruning the parameters of the network. All sub-networks share parameters throughout training.
3. The supernet is initialized with the reordered pretrained parameters and used as the student network. Distillation losses are added for the embedding layer, for every transformer block, and for the final logits.
4. Before each batch is trained, the configuration of the sub-network to train is selected (currently the configuration only covers the width of the whole model); during the parameter update, only the parameters used by the current sub-network are updated.
5. The whole supernet is optimized in this way; after training, a sub-model that meets the speed-up and accuracy requirements is selected.

<p align="center">
<img src="../../images/algo/ofa_bert.jpg" width="950"/><br />
Overall pipeline
</p>
## Compression based on the PaddleNLP repo

This tutorial requires PaddleSlim 2.0 or later, Paddle 2.0rc1 or later and PaddleNLP 2.0beta or later; please make sure Paddle, PaddleSlim and PaddleNLP are installed correctly.

For the complete example based on BERT-base in the PaddleNLP repo, see [BERT-base](../../../demo/ofa/bert/README.md).

### 1. Define the initial network
Define the original BERT-base model and a dict that keeps the original parameters. After a plain model is converted into a supernet, the change of its OPs invalidates the parameters loaded into the original model, so a dict holding the original parameters is needed to initialize the supernet.
```python
model = BertForSequenceClassification.from_pretrained('bert', num_classes=2)
origin_weights = {}
for name, param in model.named_parameters():
    origin_weights[name] = param
```
### 2. Build the supernet
Define the search space and convert the plain network into a supernet accordingly.
```python
# define the search space
sp_config = supernet(expand_ratio=[0.25, 0.5, 0.75, 1.0])
# convert the model into a supernet
model = Convert(sp_config).convert(model)
paddleslim.nas.ofa.utils.set_state_dict(model, origin_weights)
```
### 3. Define the teacher network
Build the teacher network directly with the PaddleNLP API.
```python
teacher_model = BertForSequenceClassification.from_pretrained('bert', num_classes=2)
```
### 4. Configure the distillation parameters
The parameters to configure include the teacher model instance and the layers to distill: distillation losses are added between the Embedding layers and between every Transformer Block of the teacher and the student, using the default MSE loss for the intermediate layers; 'lambda_distill' scales the overall distillation loss.
```python
mapping_layers = ['bert.embeddings']
for idx in range(model.bert.config['num_hidden_layers']):
    mapping_layers.append('bert.encoder.layers.{}'.format(idx))
default_distill_config = {
'lambda_distill': 0.1,
'teacher_model': teacher_model,
'mapping_layers': mapping_layers,
}
distill_config = DistillConfig(**default_distill_config)
```
### 5. Define the Once-For-All model
Pass the plain model and the distillation configuration to the OFA API; it automatically adds the distillation process and switches supernet training to the OFA scheme.
```python
ofa_model = paddleslim.nas.ofa.OFA(model, distill_config=distill_config)
```
### 6. Compute the importance of neurons and heads and reorder the parameters accordingly
```python
head_importance, neuron_importance = utils.compute_neuron_head_importance(
'sst-2',
ofa_model.model,
dev_data_loader,
num_layers=model.bert.config['num_hidden_layers'],
num_heads=model.bert.config['num_attention_heads'])
reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)
```
### 7. Set the current stage of OFA training
```python
ofa_model.set_epoch(epoch)
ofa_model.set_task('width')
```
### 8. Pass in the network configuration and start training
This example trains the supernet in the DynaBERT way.
```python
width_mult_list = [1.0, 0.75, 0.5, 0.25]
lambda_logit = 0.1
for width_mult in width_mult_list:
    net_config = paddleslim.nas.ofa.utils.dynabert_config(ofa_model, width_mult)
    ofa_model.set_net_config(net_config)
    logits, teacher_logits = ofa_model(input_ids, segment_ids, attention_mask=[None, None])
    rep_loss = ofa_model.calc_distill_loss()
    logit_loss = soft_cross_entropy(logits, teacher_logits.detach())
    loss = rep_loss + lambda_logit * logit_loss
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    ofa_model.model.clear_gradients()
```
---
**NOTE**
Because an all-ones mask is used to collect gradients when computing head importance, BERT's forward needs to be re-implemented via monkey patching. For example:
```python
from paddlenlp.transformers import BertModel
def bert_forward(self,
                 input_ids,
                 token_type_ids=None,
                 position_ids=None,
                 attention_mask=[None, None]):
    wtype = self.pooler.dense.fn.weight.dtype if hasattr(
        self.pooler.dense, 'fn') else self.pooler.dense.weight.dtype
    if attention_mask[0] is None:
        attention_mask[0] = paddle.unsqueeze(
            (input_ids == self.pad_token_id).astype(wtype) * -1e9, axis=[1, 2])
    embedding_output = self.embeddings(
        input_ids=input_ids,
        position_ids=position_ids,
        token_type_ids=token_type_ids)
    encoder_outputs = self.encoder(embedding_output, attention_mask)
    sequence_output = encoder_outputs
    pooled_output = self.pooler(sequence_output)
    return sequence_output, pooled_output
BertModel.forward = bert_forward
```
---
@@ -18,6 +18,6 @@ from .convert_super import supernet
 from .utils.utils import get_paddle_version
 pd_ver = get_paddle_version()
 if pd_ver == 185:
-    from .layers import *
+    from .layers_old import *
 else:
-    from .layers_new import *
+    from .layers import *
@@ -24,15 +24,15 @@ if pd_ver == 185:
     import paddle.fluid.dygraph.nn as nn
     from paddle.fluid.dygraph.nn import Conv2D, Conv2DTranspose, Linear, LayerNorm, Embedding
     from paddle.fluid import ParamAttr
-    from .layers import *
-    from . import layers
+    from .layers_old import *
+    from . import layers_old as layers
     Layer = paddle.fluid.dygraph.Layer
 else:
     import paddle.nn as nn
     from paddle.nn import Conv2D, Conv2DTranspose, Linear, LayerNorm, Embedding
     from paddle import ParamAttr
-    from .layers_new import *
-    from . import layers_new as layers
+    from .layers import *
+    from . import layers
     Layer = paddle.nn.Layer
 _logger = get_logger(__name__, level=logging.INFO)
@@ -43,6 +43,17 @@ WEIGHT_LAYER = ['conv', 'linear', 'embedding']
 class Convert:
+    """
+    Convert network to the supernet according to the search space.
+    Parameters:
+        context(paddleslim.nas.ofa.supernet): search space defined by the user.
+    Examples:
+        .. code-block:: python
+          from paddleslim.nas.ofa import supernet, Convert
+          sp_net_config = supernet(kernel_size=(3, 5, 7), expand_ratio=[1, 2, 4])
+          convert = Convert(sp_net_config)
+    """
     def __init__(self, context):
         self.context = context
@@ -63,6 +74,17 @@ class Convert:
                     layer._bias_attr.name = 'super_' + layer._bias_attr.name
 
     def convert(self, network):
+        """
+        The function to convert the network to a supernet.
+        Parameters:
+            network(paddle.nn.Layer|list(paddle.nn.Layer)): instance of the model or a list of layer instances.
+        Examples:
+            .. code-block:: python
+              from paddle.vision.models import mobilenet_v1
+              from paddleslim.nas.ofa import supernet, Convert
+              sp_net_config = supernet(kernel_size=(3, 5, 7), expand_ratio=[1, 2, 4])
+              convert = Convert(sp_net_config).convert(mobilenet_v1())
+        """
         # search the first and last weight layer, don't change out channel of the last weight layer
         # don't change in channel of the first weight layer
         model = []
@@ -641,6 +663,14 @@ class Convert:
 class supernet:
+    """
+    Search space of the network.
+    Parameters:
+        kernel_size(list|tuple, optional): search space for the kernel size of the Conv2D.
+        expand_ratio(list|tuple, optional): search space for the expand ratio of the number of channels of Conv2D and of the output dimensions of Embedding and Linear. The candidate channel number of each OP in the converted supernet is derived as a ratio of the channel number of the corresponding OP in the original model, so a single list is shared by all OPs. Set only one of this parameter and ``channel``.
+        channel(list|tuple, optional): search space for the number of channels of Conv2D and the output dimensions of Embedding and Linear. It directly sets the channel number of each OP in the supernet, so its length must equal the total number of Conv2D, Embedding and Linear layers in the network. Set only one of this parameter and ``expand_ratio``.
+    """
     def __init__(self, **kwargs):
         for key, value in kwargs.items():
             setattr(self, key, value)
...
...@@ -12,21 +12,21 @@ ...@@ -12,21 +12,21 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
### NOTE: the API of this file is based on Paddle2.0, the API in layers_old.py is based on Paddle1.8
import numpy as np import numpy as np
import logging import logging
import paddle.fluid as fluid import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid.dygraph_utils as dygraph_utils
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import in_dygraph_mode, _varbase_creator
from paddle.fluid.dygraph.nn import InstanceNorm, Conv2D, Conv2DTranspose, BatchNorm
from ...common import get_logger from ...common import get_logger
from .utils.utils import compute_start_end, get_same_padding, convert_to_list from .utils.utils import compute_start_end, get_same_padding, convert_to_list
__all__ = [ __all__ = [
'SuperConv2D', 'SuperConv2DTranspose', 'SuperSeparableConv2D', 'SuperConv2D', 'SuperConv2DTranspose', 'SuperSeparableConv2D',
'SuperBatchNorm', 'SuperLinear', 'SuperInstanceNorm', 'Block', 'SuperBatchNorm2D', 'SuperLinear', 'SuperInstanceNorm2D', 'Block',
'SuperGroupConv2D', 'SuperDepthwiseConv2D', 'SuperGroupConv2DTranspose', 'SuperGroupConv2D', 'SuperDepthwiseConv2D', 'SuperGroupConv2DTranspose',
'SuperDepthwiseConv2DTranspose', 'SuperLayerNorm', 'SuperEmbedding' 'SuperDepthwiseConv2DTranspose', 'SuperLayerNorm', 'SuperEmbedding'
] ]
...@@ -44,7 +44,7 @@ def counter(): ...@@ -44,7 +44,7 @@ def counter():
return _cnt return _cnt
class BaseBlock(fluid.dygraph.Layer): class BaseBlock(paddle.nn.Layer):
def __init__(self, key=None): def __init__(self, key=None):
super(BaseBlock, self).__init__() super(BaseBlock, self).__init__()
if key is not None: if key is not None:
...@@ -66,7 +66,8 @@ class Block(BaseBlock): ...@@ -66,7 +66,8 @@ class Block(BaseBlock):
Model is composed of nest blocks. Model is composed of nest blocks.
Parameters: Parameters:
fn(Layer): instance of super layers, such as: SuperConv2D(3, 5, 3). fn(paddle.nn.Layer): instance of super layers, such as: SuperConv2D(3, 5, 3).
fixed(bool, optional): whether to fix the shape of the weight in this layer. Default: False.
key(str, optional): key of this layer, one-to-one correspondence between key and candidate config. Default: None. key(str, optional): key of this layer, one-to-one correspondence between key and candidate config. Default: None.
""" """
...@@ -81,13 +82,9 @@ class Block(BaseBlock): ...@@ -81,13 +82,9 @@ class Block(BaseBlock):
return out return out
class SuperConv2D(fluid.dygraph.Conv2D): class SuperConv2D(nn.Conv2D):
""" """
This interface is used to construct a callable object of the ``SuperConv2D`` class. This interface is used to construct a callable object of the ``SuperConv2D`` class.
The difference between ```SuperConv2D``` and ```Conv2D``` is: ```SuperConv2D``` need
to feed a config dictionary with the format of {'channel', num_of_channel} represents
the channels of the outputs, used to change the first dimension of weight and bias,
only train the first channels of the weight and bias.
Note: the channel in config need to less than first defined. Note: the channel in config need to less than first defined.
...@@ -179,42 +176,44 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -179,42 +176,44 @@ class SuperConv2D(fluid.dygraph.Conv2D):
ValueError: if ``use_cudnn`` is not a bool value. ValueError: if ``use_cudnn`` is not a bool value.
Examples: Examples:
.. code-block:: python .. code-block:: python
from paddle.fluid.dygraph.base import to_variable import paddle
import paddle.fluid as fluid from paddleslim.nas.ofa.layers import SuperConv2D
from paddleslim.core.layers import SuperConv2D
import numpy as np import numpy as np
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard(): super_conv2d = SuperConv2D(3, 10, 3)
super_conv2d = SuperConv2D(3, 10, 3) config = {'channel': 5}
config = {'channel': 5} data = paddle.to_variable(data)
data = to_variable(data) conv = super_conv2d(data, config)
conv = super_conv2d(data, config)
""" """
### NOTE: filter_size, num_channels and num_filters must be the max of candidate to define a largest network. ### NOTE: filter_size, num_channels and num_filters must be the max of candidate to define a largest network.
def __init__(self, def __init__(self,
num_channels, in_channels,
num_filters, out_channels,
filter_size, kernel_size,
candidate_config={}, candidate_config={},
transform_kernel=False, transform_kernel=False,
stride=1, stride=1,
dilation=1,
padding=0, padding=0,
groups=None, dilation=1,
param_attr=None, groups=1,
padding_mode='zeros',
weight_attr=None,
bias_attr=None, bias_attr=None,
use_cudnn=True, data_format='NCHW'):
act=None,
dtype='float32'):
### NOTE: padding always is 0, add padding in forward because of kernel size is uncertain
super(SuperConv2D, self).__init__( super(SuperConv2D, self).__init__(
num_channels, num_filters, filter_size, stride, padding, dilation, in_channels,
groups, param_attr, bias_attr, use_cudnn, act, dtype) out_channels,
kernel_size,
if isinstance(self._filter_size, int): stride=stride,
self._filter_size = convert_to_list(self._filter_size, 2) padding=padding,
padding_mode=padding_mode,
dilation=dilation,
groups=groups,
weight_attr=weight_attr,
bias_attr=bias_attr,
data_format=data_format)
self.candidate_config = candidate_config self.candidate_config = candidate_config
if len(candidate_config.items()) != 0: if len(candidate_config.items()) != 0:
...@@ -228,9 +227,9 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -228,9 +227,9 @@ class SuperConv2D(fluid.dygraph.Conv2D):
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.channel = candidate_config[ self.channel = candidate_config[
'channel'] if 'channel' in candidate_config else None 'channel'] if 'channel' in candidate_config else None
self.base_channel = self._num_filters self.base_channel = self._out_channels
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_channel = int(self._num_filters / max(self.expand_ratio)) self.base_channel = int(self._out_channels / max(self.expand_ratio))
self.transform_kernel = transform_kernel self.transform_kernel = transform_kernel
if self.ks_set != None: if self.ks_set != None:
...@@ -244,10 +243,9 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -244,10 +243,9 @@ class SuperConv2D(fluid.dygraph.Conv2D):
param_name = '%dto%d_matrix' % (ks_large, ks_small) param_name = '%dto%d_matrix' % (ks_large, ks_small)
ks_t = ks_small**2 ks_t = ks_small**2
scale_param[param_name] = self.create_parameter( scale_param[param_name] = self.create_parameter(
attr=fluid.ParamAttr( attr=paddle.ParamAttr(
name=self._full_name + param_name, name=self._full_name + param_name,
initializer=fluid.initializer.NumpyArrayInitializer( initializer=nn.initializer.Assign(np.eye(ks_t))),
np.eye(ks_t))),
shape=(ks_t, ks_t), shape=(ks_t, ks_t),
dtype=self._dtype) dtype=self._dtype)
...@@ -255,10 +253,10 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -255,10 +253,10 @@ class SuperConv2D(fluid.dygraph.Conv2D):
setattr(self, name, param) setattr(self, name, param)
def get_active_filter(self, in_nc, out_nc, kernel_size): def get_active_filter(self, in_nc, out_nc, kernel_size):
start, end = compute_start_end(self._filter_size[0], kernel_size) start, end = compute_start_end(self._kernel_size[0], kernel_size)
### if NOT transform kernel, intercept a center filter with kernel_size from largest filter ### if NOT transform kernel, intercept a center filter with kernel_size from largest filter
filters = self.weight[:out_nc, :in_nc, start:end, start:end] filters = self.weight[:out_nc, :in_nc, start:end, start:end]
if self.transform_kernel != False and kernel_size < self._filter_size[ if self.transform_kernel != False and kernel_size < self._kernel_size[
0]: 0]:
### if transform kernel, then use matrix to transform ### if transform kernel, then use matrix to transform
start_filter = self.weight[:out_nc, :in_nc, :, :] start_filter = self.weight[:out_nc, :in_nc, :, :]
...@@ -269,16 +267,15 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -269,16 +267,15 @@ class SuperConv2D(fluid.dygraph.Conv2D):
target_ks = self.ks_set[i - 1] target_ks = self.ks_set[i - 1]
start, end = compute_start_end(src_ks, target_ks) start, end = compute_start_end(src_ks, target_ks)
_input_filter = start_filter[:, :, start:end, start:end] _input_filter = start_filter[:, :, start:end, start:end]
_input_filter = fluid.layers.reshape( _input_filter = paddle.reshape(
_input_filter, _input_filter,
shape=[(_input_filter.shape[0] * _input_filter.shape[1]), shape=[(_input_filter.shape[0] * _input_filter.shape[1]),
-1]) -1])
core.ops.matmul(_input_filter, _input_filter = paddle.matmul(
self.__getattr__('%dto%d_matrix' % _input_filter,
(src_ks, target_ks)), self.__getattr__('%dto%d_matrix' %
_input_filter, 'transpose_X', False, (src_ks, target_ks)), False, False)
'transpose_Y', False, "alpha", 1) _input_filter = paddle.reshape(
_input_filter = fluid.layers.reshape(
_input_filter, _input_filter,
shape=[ shape=[
filters.shape[0], filters.shape[1], target_ks, target_ks filters.shape[0], filters.shape[1], target_ks, target_ks
...@@ -288,14 +285,33 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -288,14 +285,33 @@ class SuperConv2D(fluid.dygraph.Conv2D):
return filters return filters
def get_groups_in_out_nc(self, in_nc, out_nc): def get_groups_in_out_nc(self, in_nc, out_nc):
### standard conv if self._groups == 1:
return self._groups, in_nc, out_nc ### standard conv
return self._groups, in_nc, out_nc
elif self._groups == self._in_channels:
### depthwise convolution
if in_nc != out_nc:
_logger.debug(
"input channel and output channel in depthwise conv is different, change output channel to input channel! origin channel:(in_nc {}, out_nc {}): ".
format(in_nc, out_nc))
groups = in_nc
out_nc = in_nc
return groups, in_nc, out_nc
else:
### groups convolution
### conv: weight: (Cout, Cin/G, Kh, Kw)
groups = self._groups
in_nc = int(in_nc // groups)
return groups, in_nc, out_nc
def forward(self, input, kernel_size=None, expand_ratio=None, channel=None): def forward(self, input, kernel_size=None, expand_ratio=None, channel=None):
"""
if not in_dygraph_mode(): Parameters:
_logger.error("NOT support static graph") input(Tensor): Input tensor.
kernel_size(int, optional): the kernel size of the filter in actual calculation. Default: None.
expand_ratio(int|float, optional): the expansion ratio of filter's channel number in actual calculation. Default: None.
channel(int, optional): the expansion ratio of filter's channel number in actual calculation. Default: None.
"""
self.cur_config = { self.cur_config = {
'kernel_size': kernel_size, 'kernel_size': kernel_size,
'expand_ratio': expand_ratio, 'expand_ratio': expand_ratio,
...@@ -310,8 +326,8 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -310,8 +326,8 @@ class SuperConv2D(fluid.dygraph.Conv2D):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self._num_filters out_nc = self._out_channels
ks = int(self._filter_size[0]) if kernel_size == None else int( ks = int(self._kernel_size[0]) if kernel_size == None else int(
kernel_size) kernel_size)
groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc, groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc,
...@@ -324,28 +340,21 @@ class SuperConv2D(fluid.dygraph.Conv2D): ...@@ -324,28 +340,21 @@ class SuperConv2D(fluid.dygraph.Conv2D):
else: else:
padding = self._padding padding = self._padding
if self._l_type == 'conv2d':
attrs = ('strides', self._stride, 'paddings', padding, 'dilations',
self._dilation, 'groups', groups
if groups else 1, 'use_cudnn', self._use_cudnn)
out = core.ops.conv2d(input, weight, *attrs)
elif self._l_type == 'depthwise_conv2d':
attrs = ('strides', self._stride, 'paddings', padding, 'dilations',
self._dilation, 'groups', groups
if groups else self._groups, 'use_cudnn', self._use_cudnn)
out = core.ops.depthwise_conv2d(input, weight, *attrs)
else:
raise ValueError("conv type error")
pre_bias = out
out_nc = int(pre_bias.shape[1])
if self.bias is not None: if self.bias is not None:
bias = self.bias[:out_nc] bias = self.bias[:out_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
pre_act = pre_bias bias = self.bias
return dygraph_utils._append_activation_in_dygraph(pre_act, self._act) out = F.conv2d(
input,
weight,
bias=bias,
stride=self._stride,
padding=padding,
dilation=self._dilation,
groups=self._groups,
data_format=self._data_format)
return out
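### A usage sketch (not from the patch above) for the Paddle 2.0 based SuperConv2D defined here,
### assuming it is exposed as paddleslim.nas.ofa.layers.SuperConv2D: the forward arguments pick a
### (transformed) sub-kernel and a subset of the output filters at run time.
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperConv2D

x = paddle.to_tensor(np.random.uniform(-1, 1, [2, 3, 32, 32]).astype('float32'))
conv = SuperConv2D(
    3, 16, 7,
    candidate_config={'kernel_size': (3, 5, 7), 'channel': (4, 8, 16)},
    transform_kernel=True)
y = conv(x, kernel_size=5, channel=8)   # 5x5 transformed kernel, first 8 output filters
print(y.shape)                          # [2, 8, 32, 32]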
class SuperGroupConv2D(SuperConv2D): class SuperGroupConv2D(SuperConv2D):
@@ -369,15 +378,10 @@ class SuperDepthwiseConv2D(SuperConv2D):
return groups, in_nc, out_nc return groups, in_nc, out_nc
class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose): class SuperConv2DTranspose(nn.Conv2DTranspose):
""" """
This interface is used to construct a callable object of the ``SuperConv2DTranspose`` This interface is used to construct a callable object of the ``SuperConv2DTranspose``
class. class.
The difference between ```SuperConv2DTranspose``` and ```Conv2DTranspose``` is:
```SuperConv2DTranspose``` needs to be fed a config dictionary in the format of
{'channel': num_of_channel}, which gives the number of output channels and is used to change
the first dimension of the weight and bias, so that only the first channels of the weight
and bias are trained.
Note: the channel in config needs to be less than the channel first defined.
@@ -471,53 +475,55 @@ class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
None None
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle
from paddleslim.core.layers import SuperConv2DTranspose
import numpy as np import numpy as np
with fluid.dygraph.guard(): from paddleslim.nas.ofa.layers import SuperConv2DTranspose
data = np.random.random((3, 32, 32, 5)).astype('float32') data = np.random.random((3, 32, 32, 5)).astype('float32')
config = {'channel': 5 config = {'channel': 5}
super_convtranspose = SuperConv2DTranspose(num_channels=32, num_filters=10, filter_size=3) super_convtranspose = SuperConv2DTranspose(num_channels=32, num_filters=10, filter_size=3)
ret = super_convtranspose(fluid.dygraph.base.to_variable(data), config) ret = super_convtranspose(paddle.to_variable(data), config)
""" """
def __init__(self, def __init__(self,
num_channels, in_channels,
num_filters, out_channels,
filter_size, kernel_size,
output_size=None,
candidate_config={}, candidate_config={},
transform_kernel=False, transform_kernel=False,
stride=1, stride=1,
dilation=1,
padding=0, padding=0,
groups=None, output_padding=0,
param_attr=None, dilation=1,
groups=1,
weight_attr=None,
bias_attr=None, bias_attr=None,
use_cudnn=True, data_format="NCHW"):
act=None,
dtype='float32'):
super(SuperConv2DTranspose, self).__init__( super(SuperConv2DTranspose, self).__init__(
num_channels, num_filters, filter_size, output_size, padding, in_channels,
stride, dilation, groups, param_attr, bias_attr, use_cudnn, act, out_channels,
dtype) kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
output_padding=output_padding,
groups=groups,
weight_attr=weight_attr,
bias_attr=bias_attr,
data_format=data_format)
self.candidate_config = candidate_config self.candidate_config = candidate_config
if len(self.candidate_config.items()) != 0: if len(self.candidate_config.items()) != 0:
for k, v in candidate_config.items(): for k, v in candidate_config.items():
candidate_config[k] = list(set(v)) candidate_config[k] = list(set(v))
self.ks_set = candidate_config[ self.ks_set = candidate_config[
'kernel_size'] if 'kernel_size' in candidate_config else None 'kernel_size'] if 'kernel_size' in candidate_config else None
if isinstance(self._filter_size, int):
self._filter_size = convert_to_list(self._filter_size, 2)
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.channel = candidate_config[ self.channel = candidate_config[
'channel'] if 'channel' in candidate_config else None 'channel'] if 'channel' in candidate_config else None
self.base_channel = self._num_filters self.base_channel = self._out_channels
if self.expand_ratio: if self.expand_ratio:
self.base_channel = int(self._num_filters / max(self.expand_ratio)) self.base_channel = int(self._out_channels / max(self.expand_ratio))
self.transform_kernel = transform_kernel self.transform_kernel = transform_kernel
if self.ks_set != None: if self.ks_set != None:
@@ -531,10 +537,9 @@ class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
param_name = '%dto%d_matrix' % (ks_large, ks_small) param_name = '%dto%d_matrix' % (ks_large, ks_small)
ks_t = ks_small**2 ks_t = ks_small**2
scale_param[param_name] = self.create_parameter( scale_param[param_name] = self.create_parameter(
attr=fluid.ParamAttr( attr=paddle.ParamAttr(
name=self._full_name + param_name, name=self._full_name + param_name,
initializer=fluid.initializer.NumpyArrayInitializer( initializer=nn.initializer.Assign(np.eye(ks_t))),
np.eye(ks_t))),
shape=(ks_t, ks_t), shape=(ks_t, ks_t),
dtype=self._dtype) dtype=self._dtype)
@@ -542,9 +547,9 @@ class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
setattr(self, name, param) setattr(self, name, param)
def get_active_filter(self, in_nc, out_nc, kernel_size): def get_active_filter(self, in_nc, out_nc, kernel_size):
start, end = compute_start_end(self._filter_size[0], kernel_size) start, end = compute_start_end(self._kernel_size[0], kernel_size)
filters = self.weight[:in_nc, :out_nc, start:end, start:end] filters = self.weight[:in_nc, :out_nc, start:end, start:end]
if self.transform_kernel != False and kernel_size < self._filter_size[ if self.transform_kernel != False and kernel_size < self._kernel_size[
0]: 0]:
start_filter = self.weight[:in_nc, :out_nc, :, :] start_filter = self.weight[:in_nc, :out_nc, :, :]
for i in range(len(self.ks_set) - 1, 0, -1): for i in range(len(self.ks_set) - 1, 0, -1):
@@ -554,16 +559,15 @@ class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
target_ks = self.ks_set[i - 1] target_ks = self.ks_set[i - 1]
start, end = compute_start_end(src_ks, target_ks) start, end = compute_start_end(src_ks, target_ks)
_input_filter = start_filter[:, :, start:end, start:end] _input_filter = start_filter[:, :, start:end, start:end]
_input_filter = fluid.layers.reshape( _input_filter = paddle.reshape(
_input_filter, _input_filter,
shape=[(_input_filter.shape[0] * _input_filter.shape[1]), shape=[(_input_filter.shape[0] * _input_filter.shape[1]),
-1]) -1])
core.ops.matmul(_input_filter, _input_filter = paddle.matmul(
self.__getattr__('%dto%d_matrix' % _input_filter,
(src_ks, target_ks)), self.__getattr__('%dto%d_matrix' %
_input_filter, 'transpose_X', False, (src_ks, target_ks)), False, False)
'transpose_Y', False, "alpha", 1) _input_filter = paddle.reshape(
_input_filter = fluid.layers.reshape(
_input_filter, _input_filter,
shape=[ shape=[
filters.shape[0], filters.shape[1], target_ks, target_ks filters.shape[0], filters.shape[1], target_ks, target_ks
@@ -573,13 +577,39 @@ class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
return filters return filters
def get_groups_in_out_nc(self, in_nc, out_nc): def get_groups_in_out_nc(self, in_nc, out_nc):
### standard conv if self._groups == 1:
return self._groups, in_nc, out_nc ### standard conv
return self._groups, in_nc, out_nc
def forward(self, input, kernel_size=None, expand_ratio=None, channel=None): elif self._groups == self._in_channels:
if not in_dygraph_mode(): ### depthwise convolution
_logger.error("NOT support static graph") if in_nc != out_nc:
_logger.debug(
"input channel and output channel in depthwise conv is different, change output channel to input channel! origin channel:(in_nc {}, out_nc {}): ".
format(in_nc, out_nc))
groups = in_nc
out_nc = in_nc
return groups, in_nc, out_nc
else:
### groups convolution
### groups conv transpose: weight: (Cin, Cout/G, Kh, Kw)
groups = self._groups
out_nc = int(out_nc // groups)
return groups, in_nc, out_nc
def forward(self,
input,
output_size=None,
kernel_size=None,
expand_ratio=None,
channel=None):
"""
Parameters:
input(Tensor): input tensor.
output_size(int, optional): the size of the feature map after transpose convolution. Default: None.
kernel_size(int, optional): the kernel size of the filter in actual calculation. Default: None.
expand_ratio(int|float, optional): the expansion ratio of filter's channel number in actual calculation. Default: None.
channel(int, optional): the number of output channels used in the actual calculation. Default: None.
"""
self.cur_config = { self.cur_config = {
'kernel_size': kernel_size, 'kernel_size': kernel_size,
'expand_ratio': expand_ratio, 'expand_ratio': expand_ratio,
@@ -594,34 +624,43 @@ class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self._num_filters out_nc = self._out_channels
ks = int(self._filter_size[0]) if kernel_size == None else int( ks = int(self._kernel_size[0]) if kernel_size == None else int(
kernel_size) kernel_size)
groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc, groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc,
out_nc) out_nc)
weight = self.get_active_filter(weight_in_nc, weight_out_nc, ks) weight = self.get_active_filter(weight_in_nc, weight_out_nc, ks)
if kernel_size != None or 'kernel_size' in self.candidate_config.keys(): if kernel_size != None or 'kernel_size' in self.candidate_config.keys():
padding = convert_to_list(get_same_padding(ks), 2) padding = convert_to_list(get_same_padding(ks), 2)
else: else:
padding = self._padding padding = self._padding
op = getattr(core.ops, self._op_type) if output_size is None:
out = op(input, weight, 'output_size', self._output_size, 'strides', output_padding = self.output_padding
self._stride, 'paddings', padding, 'dilations', self._dilation, else:
'groups', groups, 'use_cudnn', self._use_cudnn) output_padding = 0
pre_bias = out
out_nc = int(pre_bias.shape[1])
if self.bias is not None: if self.bias is not None:
bias = self.bias[:out_nc] bias = self.bias[:out_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
pre_act = pre_bias bias = self.bias
return dygraph_utils._append_activation_in_dygraph( out = F.conv2d_transpose(
pre_act, act=self._act) input,
weight,
bias=bias,
padding=padding,
output_padding=output_padding,
stride=self._stride,
dilation=self._dilation,
groups=self._groups,
output_size=output_size,
data_format=self._data_format)
return out
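### A usage sketch (not from the patch above) for the Paddle 2.0 based SuperConv2DTranspose with
### an 'expand_ratio' search space; base_channel = out_channels / max(expand_ratio), and the ratio
### passed to forward decides how many output filters are actually used.
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperConv2DTranspose

x = paddle.to_tensor(np.random.random([2, 8, 16, 16]).astype('float32'))
deconv = SuperConv2DTranspose(8, 16, 3, candidate_config={'expand_ratio': (0.5, 1.0)})
y = deconv(x, expand_ratio=0.5)   # int(0.5 * 16) = 8 output channels
print(y.shape)                    # [2, 8, 18, 18]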
class SuperGroupConv2DTranspose(SuperConv2DTranspose): class SuperGroupConv2DTranspose(SuperConv2DTranspose):
@@ -645,7 +684,7 @@ class SuperDepthwiseConv2DTranspose(SuperConv2DTranspose):
### NOTE: only search channel, write for GAN-compression, maybe change to SuperDepthwiseConv and SuperConv after. ### NOTE: only search channel, write for GAN-compression, maybe change to SuperDepthwiseConv and SuperConv after.
class SuperSeparableConv2D(fluid.dygraph.Layer): class SuperSeparableConv2D(nn.Layer):
""" """
This interface is used to construct a callable object of the ``SuperSeparableConv2D`` This interface is used to construct a callable object of the ``SuperSeparableConv2D``
class. class.
@@ -655,8 +694,8 @@ class SuperSeparableConv2D(fluid.dygraph.Layer):
the second conv's inputs, used to change the first dimension of weight and bias, the second conv's inputs, used to change the first dimension of weight and bias,
only train the first channels of the weight and bias. only train the first channels of the weight and bias.
The architecture of super separable convolution2D op is [Conv2D, norm layer(may be BatchNorm The architecture of super separable convolution2D op is [Conv2D, norm layer(may be BatchNorm2D
or InstanceNorm), Conv2D]. The first conv is depthwise conv, the filter number is input channel or InstanceNorm2D), Conv2D]. The first conv is depthwise conv, the filter number is input channel
multiply scale_factor, the group is equal to the number of input channel. The second conv multiply scale_factor, the group is equal to the number of input channel. The second conv
is standard conv, which filter size and stride size are 1. is standard conv, which filter size and stride size are 1.
@@ -676,68 +715,66 @@ class SuperSeparableConv2D(fluid.dygraph.Layer):
dilation(int or tuple, optional): The first conv's dilation size. If dilation is a tuple, dilation(int or tuple, optional): The first conv's dilation size. If dilation is a tuple,
it must contain two integers, (dilation_H, dilation_W). Otherwise, the it must contain two integers, (dilation_H, dilation_W). Otherwise, the
dilation_H = dilation_W = dilation. Default: 1. dilation_H = dilation_W = dilation. Default: 1.
norm_layer(class): The normalization layer between two convolution. Default: InstanceNorm. norm_layer(class): The normalization layer between two convolution. Default: InstanceNorm2D.
bias_attr (ParamAttr or bool, optional): The attribute for the bias of convolution. bias_attr (ParamAttr or bool, optional): The attribute for the bias of convolution.
If it is set to False, no bias will be added to the output units. If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, convolution If it is set to None or one attribute of ParamAttr, convolution
will create ParamAttr as bias_attr. If the Initializer of the bias_attr will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None. is not set, the bias is initialized zero. Default: None.
scale_factor(float): The scale factor of the first conv's output channel. Default: 1. scale_factor(float): The scale factor of the first conv's output channel. Default: 1.
use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True.
Returns: Returns:
None None
""" """
def __init__(self, def __init__(self,
num_channels, in_channels,
num_filters, out_channels,
filter_size, kernel_size,
candidate_config={}, candidate_config={},
stride=1, stride=1,
padding=0, padding=0,
dilation=1, dilation=1,
norm_layer=InstanceNorm, norm_layer=nn.InstanceNorm2D,
bias_attr=None, bias_attr=None,
scale_factor=1, scale_factor=1):
use_cudnn=False):
super(SuperSeparableConv2D, self).__init__() super(SuperSeparableConv2D, self).__init__()
self.conv = fluid.dygraph.LayerList([ self.conv = nn.LayerList([
fluid.dygraph.nn.Conv2D( nn.Conv2D(
num_channels=num_channels, in_channels=in_channels,
num_filters=num_channels * scale_factor, out_channels=in_channels * scale_factor,
filter_size=filter_size, kernel_size=kernel_size,
stride=stride, stride=stride,
padding=padding, padding=padding,
use_cudnn=False, groups=in_channels,
groups=num_channels,
bias_attr=bias_attr) bias_attr=bias_attr)
]) ])
self.conv.extend([norm_layer(num_channels * scale_factor)]) self.conv.extend([norm_layer(in_channels * scale_factor)])
self.conv.extend([ self.conv.extend([
fluid.dygraph.nn.Conv2D( nn.Conv2D(
num_channels=num_channels * scale_factor, in_channels=in_channels * scale_factor,
num_filters=num_filters, out_channels=out_channels,
filter_size=1, kernel_size=1,
stride=1, stride=1,
use_cudnn=use_cudnn,
bias_attr=bias_attr) bias_attr=bias_attr)
]) ])
self.candidate_config = candidate_config self.candidate_config = candidate_config
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.base_output_dim = self.conv[0]._num_filters self.base_output_dim = self.conv[0]._out_channels
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_output_dim = int(self.conv[0]._num_filters / self.base_output_dim = int(self.conv[0]._out_channels /
max(self.expand_ratio)) max(self.expand_ratio))
def forward(self, input, expand_ratio=None, channel=None): def forward(self, input, expand_ratio=None, channel=None):
if not in_dygraph_mode(): """
_logger.error("NOT support static graph") Parameters:
input(Tensor): input tensor.
expand_ratio(int|float, optional): the expansion ratio of filter's channel number in actual calculation. Default: None.
channel(int, optional): the number of output channels used in the actual calculation. Default: None.
"""
self.cur_config = {'expand_ratio': expand_ratio, 'channel': channel} self.cur_config = {'expand_ratio': expand_ratio, 'channel': channel}
in_nc = int(input.shape[1]) in_nc = int(input.shape[1])
assert ( assert (
@@ -748,93 +785,127 @@ class SuperSeparableConv2D(fluid.dygraph.Layer):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self.conv[0]._num_filters out_nc = self.conv[0]._out_channels
weight = self.conv[0].weight[:in_nc] weight = self.conv[0].weight[:in_nc]
### conv1 ### conv1
if self.conv[0]._l_type == 'conv2d':
attrs = ('strides', self.conv[0]._stride, 'paddings',
self.conv[0]._padding, 'dilations', self.conv[0]._dilation,
'groups', in_nc, 'use_cudnn', self.conv[0]._use_cudnn)
out = core.ops.conv2d(input, weight, *attrs)
elif self.conv[0]._l_type == 'depthwise_conv2d':
attrs = ('strides', self.conv[0]._stride, 'paddings',
self.conv[0]._padding, 'dilations', self.conv[0]._dilation,
'groups', in_nc, 'use_cudnn', self.conv[0]._use_cudnn)
out = core.ops.depthwise_conv2d(input, weight, *attrs)
else:
raise ValueError("conv type error")
pre_bias = out
if self.conv[0].bias is not None: if self.conv[0].bias is not None:
bias = self.conv[0].bias[:in_nc] bias = self.conv[0].bias[:in_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
pre_act = pre_bias bias = self.conv[0].bias
conv0_out = dygraph_utils._append_activation_in_dygraph( conv0_out = F.conv2d(
pre_act, self.conv[0]._act) input,
weight,
bias,
stride=self.conv[0]._stride,
padding=self.conv[0]._padding,
dilation=self.conv[0]._dilation,
groups=in_nc,
data_format=self.conv[0]._data_format)
norm_out = self.conv[1](conv0_out) norm_out = self.conv[1](conv0_out)
weight = self.conv[2].weight[:out_nc, :in_nc, :, :] weight = self.conv[2].weight[:out_nc, :in_nc, :, :]
if self.conv[2]._l_type == 'conv2d':
attrs = ('strides', self.conv[2]._stride, 'paddings',
self.conv[2]._padding, 'dilations', self.conv[2]._dilation,
'groups', self.conv[2]._groups if self.conv[2]._groups else
1, 'use_cudnn', self.conv[2]._use_cudnn)
out = core.ops.conv2d(norm_out, weight, *attrs)
elif self.conv[2]._l_type == 'depthwise_conv2d':
attrs = ('strides', self.conv[2]._stride, 'paddings',
self.conv[2]._padding, 'dilations', self.conv[2]._dilation,
'groups', self.conv[2]._groups, 'use_cudnn',
self.conv[2]._use_cudnn)
out = core.ops.depthwise_conv2d(norm_out, weight, *attrs)
else:
raise ValueError("conv type error")
pre_bias = out
if self.conv[2].bias is not None: if self.conv[2].bias is not None:
bias = self.conv[2].bias[:out_nc] bias = self.conv[2].bias[:out_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
pre_act = pre_bias bias = self.conv[2].bias
conv1_out = dygraph_utils._append_activation_in_dygraph( conv1_out = F.conv2d(
pre_act, self.conv[2]._act) norm_out,
weight,
bias,
stride=self.conv[2]._stride,
padding=self.conv[2]._padding,
dilation=self.conv[2]._dilation,
groups=self.conv[2]._groups,
data_format=self.conv[2]._data_format)
return conv1_out return conv1_out
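### A usage sketch (not from the patch above) for SuperSeparableConv2D (depthwise conv ->
### InstanceNorm2D -> 1x1 conv); `channel` slices both convs, so the block emits that many
### channels, bounded by the out_channels it was built with.
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperSeparableConv2D

x = paddle.to_tensor(np.random.random([2, 6, 32, 32]).astype('float32'))
sep = SuperSeparableConv2D(6, 12, 3, padding=1, candidate_config={'channel': (3, 6, 12)})
y = sep(x, channel=6)   # the 1x1 conv keeps only its first 6 filters
print(y.shape)          # [2, 6, 32, 32]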
class SuperLinear(fluid.dygraph.Linear): class SuperLinear(nn.Linear):
""" """
Super Fully-connected linear transformation layer.
For each input :math:`X` , the equation is:
.. math::
Out = XW + b
where :math:`W` is the weight and :math:`b` is the bias.
Linear layer takes only one multi-dimensional tensor as input with the
shape :math:`[batch\_size, *, in\_features]` , where :math:`*` means any
number of additional dimensions. It multiplies input tensor with the weight
(a 2-D tensor of shape :math:`[in\_features, out\_features]` ) and produces
an output tensor of shape :math:`[batch\_size, *, out\_features]` .
If :math:`bias\_attr` is not False, the bias (a 1-D tensor of
shape :math:`[out\_features]` ) will be created and added to the output.
Parameters:
in_features (int): The number of input units.
out_features (int): The number of output units.
candidate_config(dict, optional): Dictionary that describes the candidate config of this layer,
such as {'channel': (4, 6, 8)}, the key of candidate_config
only can be 'channel' and 'expand_ratio', 'channel' and 'expand_ratio'
CANNOT be set at the same time. Default: None.
weight_attr (ParamAttr, optional): The attribute for the learnable
weight of this layer. The default value is None and the weight will be
initialized to zero. For detailed information, please refer to
paddle.ParamAttr.
bias_attr (ParamAttr|bool, optional): The attribute for the learnable bias
of this layer. If it is set to False, no bias will be added to the output.
If it is set to None or one kind of ParamAttr, a bias parameter will
be created according to ParamAttr. For detailed information, please refer
to paddle.ParamAttr. The default value is None and the bias will be
initialized to zero.
name (str, optional): Normally there is no need for user to set this parameter.
For detailed information, please refer to :ref:`api_guide_Name` .
Attribute:
**weight** (Parameter): the learnable weight of this layer.
**bias** (Parameter): the learnable bias of this layer.
Shape:
- input: Multi-dimensional tensor with shape :math:`[batch\_size, *, in\_features]` .
- output: Multi-dimensional tensor with shape :math:`[batch\_size, *, out\_features]` .
Examples:
.. code-block:: python
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperLinear
data = np.random.uniform(-1, 1, [32, 64] ).astype('float32')
config = {'channel': 16}
linear = SuperLinear(32, 64)
data = paddle.to_variable(data)
res = linear(data, **config)
""" """
def __init__(self, def __init__(self,
input_dim, in_features,
output_dim, out_features,
candidate_config={}, candidate_config={},
param_attr=None, weight_attr=None,
bias_attr=None, bias_attr=None,
act=None, name=None):
dtype="float32"): super(SuperLinear, self).__init__(in_features, out_features,
super(SuperLinear, self).__init__(input_dim, output_dim, param_attr, weight_attr, bias_attr, name)
bias_attr, act, dtype) self._weight_attr = weight_attr
self._param_attr = param_attr
self._bias_attr = bias_attr self._bias_attr = bias_attr
self.output_dim = output_dim self._in_features = in_features
self._out_features = out_features
self.candidate_config = candidate_config self.candidate_config = candidate_config
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.base_output_dim = self.output_dim self.base_output_dim = self._out_features
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_output_dim = int(self.output_dim / max(self.expand_ratio)) self.base_output_dim = int(self._out_features /
max(self.expand_ratio))
def forward(self, input, expand_ratio=None, channel=None): def forward(self, input, expand_ratio=None, channel=None):
if not in_dygraph_mode(): """
_logger.error("NOT support static graph") Parameters:
input(Tensor): input tensor.
expand_ratio(int|float, optional): the expansion ratio of filter's channel number in actual calculation. Default: None.
channel(int, optional): the number of output features used in the actual calculation. Default: None.
"""
self.cur_config = {'expand_ratio': expand_ratio, 'channel': channel} self.cur_config = {'expand_ratio': expand_ratio, 'channel': channel}
### weight: (Cin, Cout) ### weight: (Cin, Cout)
in_nc = int(input.shape[-1]) in_nc = int(input.shape[-1])
@@ -846,55 +917,65 @@ class SuperLinear(fluid.dygraph.Linear):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self.output_dim out_nc = self._out_features
weight = self.weight[:in_nc, :out_nc] weight = self.weight[:in_nc, :out_nc]
if self._bias_attr != False: if self._bias_attr != False:
bias = self.bias[:out_nc] bias = self.bias[:out_nc]
use_bias = True
pre_bias = _varbase_creator(dtype=input.dtype)
core.ops.matmul(input, weight, pre_bias, 'transpose_X', False,
'transpose_Y', False, "alpha", 1)
if self._bias_attr != False:
pre_act = dygraph_utils._append_bias_in_dygraph(
pre_bias, bias, axis=len(input.shape) - 1)
else: else:
pre_act = pre_bias bias = self.bias
return dygraph_utils._append_activation_in_dygraph(pre_act, self._act) out = F.linear(x=input, weight=weight, bias=bias, name=self.name)
return out
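### A usage sketch (not from the patch above) for SuperLinear with an 'expand_ratio' space:
### the layer is built with the largest out_features, and a smaller ratio keeps only the first
### int(ratio * base_output_dim) columns of the weight (and entries of the bias).
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperLinear

x = paddle.to_tensor(np.random.uniform(-1, 1, [4, 64]).astype('float32'))
fc = SuperLinear(64, 128, candidate_config={'expand_ratio': (0.25, 0.5, 1.0)})
y = fc(x, expand_ratio=0.5)   # int(0.5 * 128) = 64 output features
print(y.shape)                # [4, 64]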
class SuperBatchNorm(fluid.dygraph.BatchNorm): class SuperBatchNorm2D(nn.BatchNorm2D):
""" """
add comment This interface is used to construct a callable object of the ``SuperBatchNorm2D`` class.
Parameters:
num_features(int): Indicate the number of channels of the input ``Tensor``.
epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable.
If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable.
If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
Examples:
.. code-block:: python
import paddle
import numpy as np
from paddleslim.nas.ofa.layers import SuperBatchNorm2D
np.random.seed(123)
x_data = np.random.random(size=(2, 5, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
batch_norm = SuperBatchNorm2D(5)
batch_norm_out = batch_norm(x)
""" """
def __init__(self, def __init__(self,
num_channels, num_features,
act=None,
is_test=False,
momentum=0.9, momentum=0.9,
epsilon=1e-05, epsilon=1e-05,
param_attr=None, weight_attr=None,
bias_attr=None, bias_attr=None,
dtype='float32', data_format='NCHW',
data_layout='NCHW', name=None):
in_place=False, super(SuperBatchNorm2D, self).__init__(num_features, momentum, epsilon,
moving_mean_name=None, weight_attr, bias_attr,
moving_variance_name=None, data_format, name)
do_model_average_for_mean_and_var=True,
use_global_stats=False,
trainable_statistics=False):
super(SuperBatchNorm, self).__init__(
num_channels, act, is_test, momentum, epsilon, param_attr,
bias_attr, dtype, data_layout, in_place, moving_mean_name,
moving_variance_name, do_model_average_for_mean_and_var,
use_global_stats, trainable_statistics)
def forward(self, input): def forward(self, input):
if not in_dygraph_mode(): self._check_data_format(self._data_format)
_logger.error("NOT support static graph") self._check_input_dim(input)
feature_dim = int(input.shape[1]) feature_dim = int(input.shape[1])
@@ -903,108 +984,217 @@ class SuperBatchNorm(fluid.dygraph.BatchNorm):
mean = self._mean[:feature_dim] mean = self._mean[:feature_dim]
variance = self._variance[:feature_dim] variance = self._variance[:feature_dim]
mean_out = mean return F.batch_norm(
variance_out = variance input,
mean,
attrs = ("momentum", self._momentum, "epsilon", self._epsilon, variance,
"is_test", not self.training, "data_layout", self._data_layout, weight=weight,
"use_mkldnn", False, "fuse_with_relu", self._fuse_with_relu, bias=bias,
"use_global_stats", self._use_global_stats, training=self.training,
'trainable_statistics', self._trainable_statistics) momentum=self._momentum,
batch_norm_out, _, _, _, _, _ = core.ops.batch_norm( epsilon=self._epsilon,
input, weight, bias, mean, variance, mean_out, variance_out, *attrs) data_format=self._data_format)
return dygraph_utils._append_activation_in_dygraph(
batch_norm_out, act=self._act)
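### A usage sketch (not from the patch above) for SuperBatchNorm2D: a layer built for 10
### channels also accepts the 6-channel output of a sliced conv, using only the first 6
### entries of its scale, shift and running statistics.
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperBatchNorm2D

bn = SuperBatchNorm2D(10)
x = paddle.to_tensor(np.random.random([2, 6, 8, 8]).astype('float32'))
y = bn(x)
print(y.shape)   # [2, 6, 8, 8]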
class SuperInstanceNorm(fluid.dygraph.InstanceNorm): class SuperInstanceNorm2D(nn.InstanceNorm2D):
""" """
This interface is used to construct a callable object of the ``SuperInstanceNorm2D`` class.
Parameters:
num_features(int): Indicate the number of channels of the input ``Tensor``.
epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable.
If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable.
If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
Examples:
.. code-block:: python
import paddle
import numpy as np
from paddleslim.nas.ofa.layers import SuperInstanceNorm2D
np.random.seed(123)
x_data = np.random.random(size=(2, 5, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
instance_norm = SuperInstanceNorm2D(5)
out = instance_norm(x)
""" """
def __init__(self, def __init__(self,
num_channels, num_features,
epsilon=1e-05, epsilon=1e-05,
param_attr=None, momentum=0.9,
weight_attr=None,
bias_attr=None, bias_attr=None,
dtype='float32'): data_format='NCHW',
super(SuperInstanceNorm, self).__init__(num_channels, epsilon, name=None):
param_attr, bias_attr, dtype) super(SuperInstanceNorm2D, self).__init__(num_features, epsilon,
momentum, weight_attr,
bias_attr, data_format, name)
def forward(self, input): def forward(self, input):
if not in_dygraph_mode(): self._check_input_dim(input)
_logger.error("NOT support static graph")
feature_dim = int(input.shape[1]) feature_dim = int(input.shape[1])
if self._weight_attr == False and self._bias_attr == False:
if self._param_attr == False and self._bias_attr == False:
scale = None scale = None
bias = None bias = None
else: else:
scale = self.scale[:feature_dim] scale = self.scale[:feature_dim]
bias = self.bias[:feature_dim] bias = self.bias[:feature_dim]
out, _, _ = core.ops.instance_norm(input, scale, bias, 'epsilon', return F.instance_norm(input, scale, bias, eps=self._epsilon)
self._epsilon)
return out
class SuperLayerNorm(nn.LayerNorm):
"""
This interface is used to construct a callable object of the ``SuperLayerNorm`` class.
The difference between ```SuperLayerNorm``` and ```LayerNorm``` is:
the trained weight and bias in ```SuperLayerNorm``` can be sliced according to the shape of the input,
so that only the first channels of the weight and bias are trained.
Parameters:
normalized_shape(int|list|tuple): Input shape from an expected input of
size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`.
If it is a single integer, this module will normalize over the last dimension
which is expected to be of that specific size.
epsilon(float, optional): The small value added to the variance to prevent
division by zero. Default: 1e-05.
weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
gain :math:`g`. If it is False, weight is None. If it is None, a default :code:`ParamAttr` would be added as scale. The
:attr:`weight_attr` is initialized as 1 if it is added. Default: None.
bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
bias :math:`b`. If it is False, bias is None. If it is None, a default :code:`ParamAttr` would be added as bias. The
:attr:`bias_attr` is initialized as 0 if it is added. Default: None.
name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
Shape:
- x: 2-D, 3-D, 4-D or 5-D tensor.
- output: same shape as input x.
Returns:
None
Examples:
.. code-block:: python
import paddle
import numpy as np
from paddleslim.nas.ofa.layers import SuperLayerNorm
np.random.seed(123)
x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
layer_norm = SuperLayerNorm(x_data.shape[1:])
layer_norm_out = layer_norm(x)
"""
class SuperLayerNorm(fluid.dygraph.LayerNorm):
def __init__(self, def __init__(self,
normalized_shape, normalized_shape,
candidate_config={},
scale=True,
shift=True,
epsilon=1e-05, epsilon=1e-05,
param_attr=None, weight_attr=None,
bias_attr=None, bias_attr=None,
act=None, name=None):
dtype='float32'): super(SuperLayerNorm, self).__init__(normalized_shape, epsilon,
super(SuperLayerNorm, weight_attr, bias_attr, name)
self).__init__(normalized_shape, scale, shift, epsilon,
param_attr, bias_attr, act, dtype)
def forward(self, input): def forward(self, input):
if not in_dygraph_mode():
_logger.error("NOT support static graph")
input_shape = list(input.shape)
input_ndim = len(input_shape)
normalized_ndim = len(self._normalized_shape)
self._begin_norm_axis = input_ndim - normalized_ndim
### TODO(ceci3): fix if normalized_shape is not a single number ### TODO(ceci3): fix if normalized_shape is not a single number
input_ndim = len(list(input.shape))
normalized_ndim = len(self._normalized_shape)
begin_norm_axis = input_ndim - normalized_ndim
feature_dim = int(input.shape[-1]) feature_dim = int(input.shape[-1])
weight = self.weight[:feature_dim] if self._weight_attr != False:
bias = self.bias[:feature_dim] weight = self.weight[:feature_dim]
pre_act, _, _ = core.ops.layer_norm(input, weight, bias, 'epsilon', else:
self._epsilon, 'begin_norm_axis', weight = None
self._begin_norm_axis) if self._bias_attr != False:
return dygraph_utils._append_activation_in_dygraph( bias = self.bias[:feature_dim]
pre_act, act=self._act) else:
bias = None
out, _, _ = core.ops.layer_norm(input, weight, bias, 'epsilon',
self._epsilon, 'begin_norm_axis',
begin_norm_axis)
return out
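### A usage sketch (not from the patch above) for SuperLayerNorm, assuming a single-int
### normalized_shape (the TODO above notes the multi-dim case is not handled yet): weight
### and bias are sliced to the last dimension of the incoming tensor.
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperLayerNorm

ln = SuperLayerNorm(128)
x = paddle.to_tensor(np.random.random([4, 10, 64]).astype('float32'))
y = ln(x)
print(y.shape)   # [4, 10, 64]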
class SuperEmbedding(fluid.dygraph.Embedding): class SuperEmbedding(nn.Embedding):
"""
This interface is used to construct a callable object of the ``SuperEmbedding`` class.
Parameters:
num_embeddings (int): Just one element which indicates the size
of the dictionary of embeddings.
embedding_dim (int): Just one element which indicates the size of each embedding vector.
padding_idx(int|long|None): padding_idx needs to be in the interval [-num_embeddings, num_embeddings).
If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted
to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup
encounters :math:`padding\_idx` in id. And the padding data will not be updated while training.
If set None, it makes no effect to output. Default: None.
sparse(bool): The flag indicating whether to use sparse update. This parameter only
affects the performance of the backwards gradient update. It is recommended to set
True because sparse update is faster. But some optimizers do not support sparse update,
such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` ,
:ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` ,
:ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` .
In these cases, sparse must be False. Default: False.
weight_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the
default weight parameter property is used. See usage for details in :ref:`api_ParamAttr` . In addition,
user-defined or pre-trained word vectors can be loaded with the :attr:`weight_attr` parameter.
The local word vector needs to be transformed into numpy format, and the shape of local word
vector should be consistent with :attr:`num_embeddings` . Then :ref:`api_initializer_NumpyArrayInitializer`
is used to load custom or pre-trained word vectors. See code example for details.
name(str|None): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Attribute:
**weight** (Parameter): the learnable weights of this layer.
Returns:
None
Examples:
.. code-block:: python
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperEmbedding
data = np.random.uniform(-1, 1, [32, 64]).astype('float32')
config = {'channel': 16}
emb = SuperEmbedding(32, 64)
data = paddle.to_variable(data)
res = emb(data, **config)
"""
def __init__(self, def __init__(self,
size, num_embeddings,
embedding_dim,
candidate_config={}, candidate_config={},
is_sparse=False,
is_distributed=False,
padding_idx=None, padding_idx=None,
param_attr=None, sparse=False,
dtype='float32'): weight_attr=None,
super(SuperEmbedding, self).__init__(size, is_sparse, is_distributed, name=None):
padding_idx, param_attr, dtype) super(SuperEmbedding, self).__init__(num_embeddings, embedding_dim,
padding_idx, sparse, weight_attr,
name)
self.candidate_config = candidate_config self.candidate_config = candidate_config
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.base_output_dim = self._size[-1] self.base_output_dim = self._embedding_dim
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_output_dim = int(self._size[-1] / max(self.expand_ratio)) self.base_output_dim = int(self._embedding_dim /
max(self.expand_ratio))
def forward(self, input, expand_ratio=None, channel=None): def forward(self, input, expand_ratio=None, channel=None):
if not in_dygraph_mode(): """
_logger.error("NOT support static graph") Parameters:
input(Tensor): input tensor.
expand_ratio(int|float, optional): the expansion ratio of filter's channel number in actual calculation. Default: None.
channel(int, optional): the embedding dimension used in the actual calculation. Default: None.
"""
assert ( assert (
expand_ratio == None or channel == None expand_ratio == None or channel == None
), "expand_ratio and channel CANNOT be NOT None at the same time." ), "expand_ratio and channel CANNOT be NOT None at the same time."
...@@ -1013,10 +1203,12 @@ class SuperEmbedding(fluid.dygraph.Embedding): ...@@ -1013,10 +1203,12 @@ class SuperEmbedding(fluid.dygraph.Embedding):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self._size[-1] out_nc = self._embedding_dim
weight = self.weight[:, :out_nc] weight = self.weight[:, :out_nc]
return core.ops.lookup_table_v2( return F.embedding(
weight, input, 'is_sparse', self._is_sparse, 'is_distributed', input,
self._is_distributed, 'remote_prefetch', self._remote_prefetch, weight=weight,
'padding_idx', self._padding_idx) padding_idx=self._padding_idx,
sparse=self._sparse,
name=self._name)
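### A usage sketch (not from the patch above) for SuperEmbedding: with an 'expand_ratio'
### space, a smaller ratio keeps only the first int(ratio * embedding_dim) columns of the
### embedding table.
import numpy as np
import paddle
from paddleslim.nas.ofa.layers import SuperEmbedding

ids = paddle.to_tensor(np.random.randint(0, 100, [4, 16]).astype('int64'))
emb = SuperEmbedding(100, 64, candidate_config={'expand_ratio': (0.5, 1.0)})
y = emb(ids, expand_ratio=0.5)   # embedding dim becomes int(0.5 * 64) = 32
print(y.shape)                   # [4, 16, 32]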
@@ -12,19 +12,23 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
### NOTE: the API of this file is based on Paddle1.8, the API in layers.py is based on Paddle2.0
import numpy as np import numpy as np
import logging import logging
import paddle import paddle.fluid as fluid
import paddle.nn as nn
import paddle.nn.functional as F
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid.dygraph_utils as dygraph_utils
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import _varbase_creator
from paddle.fluid.dygraph.nn import InstanceNorm, Conv2D, Conv2DTranspose, BatchNorm
from ...common import get_logger from ...common import get_logger
from .utils.utils import compute_start_end, get_same_padding, convert_to_list from .utils.utils import compute_start_end, get_same_padding, convert_to_list
__all__ = [ __all__ = [
'SuperConv2D', 'SuperConv2DTranspose', 'SuperSeparableConv2D', 'SuperConv2D', 'SuperConv2DTranspose', 'SuperSeparableConv2D',
'SuperBatchNorm2D', 'SuperLinear', 'SuperInstanceNorm2D', 'Block', 'SuperBatchNorm', 'SuperLinear', 'SuperInstanceNorm', 'Block',
'SuperGroupConv2D', 'SuperDepthwiseConv2D', 'SuperGroupConv2DTranspose', 'SuperGroupConv2D', 'SuperDepthwiseConv2D', 'SuperGroupConv2DTranspose',
'SuperDepthwiseConv2DTranspose', 'SuperLayerNorm', 'SuperEmbedding' 'SuperDepthwiseConv2DTranspose', 'SuperLayerNorm', 'SuperEmbedding'
] ]
@@ -42,7 +46,7 @@ def counter():
return _cnt return _cnt
class BaseBlock(paddle.nn.Layer): class BaseBlock(fluid.dygraph.Layer):
def __init__(self, key=None): def __init__(self, key=None):
super(BaseBlock, self).__init__() super(BaseBlock, self).__init__()
if key is not None: if key is not None:
@@ -79,7 +83,7 @@ class Block(BaseBlock):
return out return out
class SuperConv2D(nn.Conv2D): class SuperConv2D(fluid.dygraph.Conv2D):
""" """
This interface is used to construct a callable object of the ``SuperConv2D`` class. This interface is used to construct a callable object of the ``SuperConv2D`` class.
The difference between ```SuperConv2D``` and ```Conv2D``` is: ```SuperConv2D``` need The difference between ```SuperConv2D``` and ```Conv2D``` is: ```SuperConv2D``` need
@@ -177,44 +181,42 @@ class SuperConv2D(nn.Conv2D):
ValueError: if ``use_cudnn`` is not a bool value. ValueError: if ``use_cudnn`` is not a bool value.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle from paddle.fluid.dygraph.base import to_variable
from paddleslim.nas.ofa.layers import SuperConv2D import paddle.fluid as fluid
from paddleslim.core.layers import SuperConv2D
import numpy as np import numpy as np
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
super_conv2d = SuperConv2D(3, 10, 3) with fluid.dygraph.guard():
config = {'channel': 5} super_conv2d = SuperConv2D(3, 10, 3)
data = paddle.to_variable(data) config = {'channel': 5}
conv = super_conv2d(data, config) data = to_variable(data)
conv = super_conv2d(data, config)
""" """
### NOTE: filter_size, num_channels and num_filters must be the max of candidate to define a largest network. ### NOTE: filter_size, num_channels and num_filters must be the max of candidate to define a largest network.
def __init__(self, def __init__(self,
in_channels, num_channels,
out_channels, num_filters,
kernel_size, filter_size,
candidate_config={}, candidate_config={},
transform_kernel=False, transform_kernel=False,
stride=1, stride=1,
padding=0,
dilation=1, dilation=1,
groups=1, padding=0,
padding_mode='zeros', groups=None,
weight_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
data_format='NCHW'): use_cudnn=True,
act=None,
dtype='float32'):
### NOTE: padding always is 0, add padding in forward because of kernel size is uncertain
super(SuperConv2D, self).__init__( super(SuperConv2D, self).__init__(
in_channels, num_channels, num_filters, filter_size, stride, padding, dilation,
out_channels, groups, param_attr, bias_attr, use_cudnn, act, dtype)
kernel_size,
stride=stride, if isinstance(self._filter_size, int):
padding=padding, self._filter_size = convert_to_list(self._filter_size, 2)
padding_mode=padding_mode,
dilation=dilation,
groups=groups,
weight_attr=weight_attr,
bias_attr=bias_attr,
data_format=data_format)
self.candidate_config = candidate_config self.candidate_config = candidate_config
if len(candidate_config.items()) != 0: if len(candidate_config.items()) != 0:
@@ -228,9 +230,9 @@ class SuperConv2D(nn.Conv2D):
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.channel = candidate_config[ self.channel = candidate_config[
'channel'] if 'channel' in candidate_config else None 'channel'] if 'channel' in candidate_config else None
self.base_channel = self._out_channels self.base_channel = self._num_filters
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_channel = int(self._out_channels / max(self.expand_ratio)) self.base_channel = int(self._num_filters / max(self.expand_ratio))
self.transform_kernel = transform_kernel self.transform_kernel = transform_kernel
if self.ks_set != None: if self.ks_set != None:
@@ -244,9 +246,10 @@ class SuperConv2D(nn.Conv2D):
param_name = '%dto%d_matrix' % (ks_large, ks_small) param_name = '%dto%d_matrix' % (ks_large, ks_small)
ks_t = ks_small**2 ks_t = ks_small**2
scale_param[param_name] = self.create_parameter( scale_param[param_name] = self.create_parameter(
attr=paddle.ParamAttr( attr=fluid.ParamAttr(
name=self._full_name + param_name, name=self._full_name + param_name,
initializer=nn.initializer.Assign(np.eye(ks_t))), initializer=fluid.initializer.NumpyArrayInitializer(
np.eye(ks_t))),
shape=(ks_t, ks_t), shape=(ks_t, ks_t),
dtype=self._dtype) dtype=self._dtype)
@@ -254,10 +257,10 @@ class SuperConv2D(nn.Conv2D):
setattr(self, name, param) setattr(self, name, param)
def get_active_filter(self, in_nc, out_nc, kernel_size): def get_active_filter(self, in_nc, out_nc, kernel_size):
start, end = compute_start_end(self._kernel_size[0], kernel_size) start, end = compute_start_end(self._filter_size[0], kernel_size)
### if NOT transform kernel, intercept a center filter with kernel_size from largest filter ### if NOT transform kernel, intercept a center filter with kernel_size from largest filter
filters = self.weight[:out_nc, :in_nc, start:end, start:end] filters = self.weight[:out_nc, :in_nc, start:end, start:end]
if self.transform_kernel != False and kernel_size < self._kernel_size[ if self.transform_kernel != False and kernel_size < self._filter_size[
0]: 0]:
### if transform kernel, then use matrix to transform ### if transform kernel, then use matrix to transform
start_filter = self.weight[:out_nc, :in_nc, :, :] start_filter = self.weight[:out_nc, :in_nc, :, :]
@@ -268,15 +271,16 @@ class SuperConv2D(nn.Conv2D):
target_ks = self.ks_set[i - 1] target_ks = self.ks_set[i - 1]
start, end = compute_start_end(src_ks, target_ks) start, end = compute_start_end(src_ks, target_ks)
_input_filter = start_filter[:, :, start:end, start:end] _input_filter = start_filter[:, :, start:end, start:end]
_input_filter = paddle.reshape( _input_filter = fluid.layers.reshape(
_input_filter, _input_filter,
shape=[(_input_filter.shape[0] * _input_filter.shape[1]), shape=[(_input_filter.shape[0] * _input_filter.shape[1]),
-1]) -1])
_input_filter = paddle.matmul( core.ops.matmul(_input_filter,
_input_filter, self.__getattr__('%dto%d_matrix' %
self.__getattr__('%dto%d_matrix' % (src_ks, target_ks)),
(src_ks, target_ks)), False, False) _input_filter, 'transpose_X', False,
_input_filter = paddle.reshape( 'transpose_Y', False, "alpha", 1)
_input_filter = fluid.layers.reshape(
_input_filter, _input_filter,
shape=[ shape=[
filters.shape[0], filters.shape[1], target_ks, target_ks filters.shape[0], filters.shape[1], target_ks, target_ks
@@ -286,8 +290,24 @@ class SuperConv2D(nn.Conv2D):
return filters return filters
def get_groups_in_out_nc(self, in_nc, out_nc): def get_groups_in_out_nc(self, in_nc, out_nc):
### standard conv if self._groups == 1 or self._groups == None:
return self._groups, in_nc, out_nc ### standard conv
return self._groups, in_nc, out_nc
elif self._groups == self._num_channels:
### depthwise convolution
if in_nc != out_nc:
_logger.debug(
"input channel and output channel in depthwise conv is different, change output channel to input channel! origin channel:(in_nc {}, out_nc {}): ".
format(in_nc, out_nc))
groups = in_nc
out_nc = in_nc
return groups, in_nc, out_nc
else:
### groups convolution
### conv: weight: (Cout, Cin/G, Kh, Kw)
groups = self._groups
in_nc = int(in_nc // groups)
return groups, in_nc, out_nc
def forward(self, input, kernel_size=None, expand_ratio=None, channel=None): def forward(self, input, kernel_size=None, expand_ratio=None, channel=None):
self.cur_config = { self.cur_config = {
@@ -304,8 +324,8 @@ class SuperConv2D(nn.Conv2D):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self._out_channels out_nc = self._num_filters
ks = int(self._kernel_size[0]) if kernel_size == None else int( ks = int(self._filter_size[0]) if kernel_size == None else int(
kernel_size) kernel_size)
groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc, groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc,
@@ -318,21 +338,28 @@ class SuperConv2D(nn.Conv2D):
else: else:
padding = self._padding padding = self._padding
if self._l_type == 'conv2d':
attrs = ('strides', self._stride, 'paddings', padding, 'dilations',
self._dilation, 'groups', groups
if groups else 1, 'use_cudnn', self._use_cudnn)
out = core.ops.conv2d(input, weight, *attrs)
elif self._l_type == 'depthwise_conv2d':
attrs = ('strides', self._stride, 'paddings', padding, 'dilations',
self._dilation, 'groups', groups
if groups else self._groups, 'use_cudnn', self._use_cudnn)
out = core.ops.depthwise_conv2d(input, weight, *attrs)
else:
raise ValueError("conv type error")
pre_bias = out
out_nc = int(pre_bias.shape[1])
if self.bias is not None: if self.bias is not None:
bias = self.bias[:out_nc] bias = self.bias[:out_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
bias = self.bias pre_act = pre_bias
out = F.conv2d( return dygraph_utils._append_activation_in_dygraph(pre_act, self._act)
input,
weight,
bias=bias,
stride=self._stride,
padding=padding,
dilation=self._dilation,
groups=self._groups,
data_format=self._data_format)
return out
class SuperGroupConv2D(SuperConv2D): class SuperGroupConv2D(SuperConv2D):
@@ -356,7 +383,7 @@ class SuperDepthwiseConv2D(SuperConv2D):
return groups, in_nc, out_nc return groups, in_nc, out_nc
class SuperConv2DTranspose(nn.Conv2DTranspose): class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
""" """
This interface is used to construct a callable object of the ``SuperConv2DTranspose`` This interface is used to construct a callable object of the ``SuperConv2DTranspose``
class. class.
@@ -458,55 +485,53 @@ class SuperConv2DTranspose(nn.Conv2DTranspose):
None None
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle.fluid as fluid
from paddleslim.core.layers import SuperConv2DTranspose
import numpy as np import numpy as np
from paddleslim.nas.ofa.layers import SuperConv2DTranspose with fluid.dygraph.guard():
data = np.random.random((3, 32, 32, 5)).astype('float32') data = np.random.random((3, 32, 32, 5)).astype('float32')
config = {'channel': 5} config = {'channel': 5
super_convtranspose = SuperConv2DTranspose(num_channels=32, num_filters=10, filter_size=3) super_convtranspose = SuperConv2DTranspose(num_channels=32, num_filters=10, filter_size=3)
ret = super_convtranspose(paddle.to_variable(data), config) ret = super_convtranspose(fluid.dygraph.base.to_variable(data), config)
""" """
def __init__(self, def __init__(self,
in_channels, num_channels,
out_channels, num_filters,
kernel_size, filter_size,
output_size=None,
candidate_config={}, candidate_config={},
transform_kernel=False, transform_kernel=False,
stride=1, stride=1,
padding=0,
output_padding=0,
dilation=1, dilation=1,
groups=1, padding=0,
weight_attr=None, groups=None,
param_attr=None,
bias_attr=None, bias_attr=None,
data_format="NCHW"): use_cudnn=True,
act=None,
dtype='float32'):
super(SuperConv2DTranspose, self).__init__( super(SuperConv2DTranspose, self).__init__(
in_channels, num_channels, num_filters, filter_size, output_size, padding,
out_channels, stride, dilation, groups, param_attr, bias_attr, use_cudnn, act,
kernel_size, dtype)
stride=stride,
padding=padding,
dilation=dilation,
output_padding=output_padding,
groups=groups,
weight_attr=weight_attr,
bias_attr=bias_attr,
data_format=data_format)
self.candidate_config = candidate_config self.candidate_config = candidate_config
if len(self.candidate_config.items()) != 0: if len(self.candidate_config.items()) != 0:
for k, v in candidate_config.items(): for k, v in candidate_config.items():
candidate_config[k] = list(set(v)) candidate_config[k] = list(set(v))
self.ks_set = candidate_config[ self.ks_set = candidate_config[
'kernel_size'] if 'kernel_size' in candidate_config else None 'kernel_size'] if 'kernel_size' in candidate_config else None
if isinstance(self._filter_size, int):
self._filter_size = convert_to_list(self._filter_size, 2)
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.channel = candidate_config[ self.channel = candidate_config[
'channel'] if 'channel' in candidate_config else None 'channel'] if 'channel' in candidate_config else None
self.base_channel = self._out_channels self.base_channel = self._num_filters
if self.expand_ratio: if self.expand_ratio:
self.base_channel = int(self._out_channels / max(self.expand_ratio)) self.base_channel = int(self._num_filters / max(self.expand_ratio))
self.transform_kernel = transform_kernel self.transform_kernel = transform_kernel
if self.ks_set != None: if self.ks_set != None:
@@ -520,9 +545,10 @@ class SuperConv2DTranspose(nn.Conv2DTranspose):
param_name = '%dto%d_matrix' % (ks_large, ks_small) param_name = '%dto%d_matrix' % (ks_large, ks_small)
ks_t = ks_small**2 ks_t = ks_small**2
scale_param[param_name] = self.create_parameter( scale_param[param_name] = self.create_parameter(
attr=paddle.ParamAttr( attr=fluid.ParamAttr(
name=self._full_name + param_name, name=self._full_name + param_name,
initializer=nn.initializer.Assign(np.eye(ks_t))), initializer=fluid.initializer.NumpyArrayInitializer(
np.eye(ks_t))),
shape=(ks_t, ks_t), shape=(ks_t, ks_t),
dtype=self._dtype) dtype=self._dtype)
@@ -530,9 +556,9 @@ class SuperConv2DTranspose(nn.Conv2DTranspose):
setattr(self, name, param) setattr(self, name, param)
def get_active_filter(self, in_nc, out_nc, kernel_size): def get_active_filter(self, in_nc, out_nc, kernel_size):
start, end = compute_start_end(self._kernel_size[0], kernel_size) start, end = compute_start_end(self._filter_size[0], kernel_size)
filters = self.weight[:in_nc, :out_nc, start:end, start:end] filters = self.weight[:in_nc, :out_nc, start:end, start:end]
if self.transform_kernel != False and kernel_size < self._kernel_size[ if self.transform_kernel != False and kernel_size < self._filter_size[
0]: 0]:
start_filter = self.weight[:in_nc, :out_nc, :, :] start_filter = self.weight[:in_nc, :out_nc, :, :]
for i in range(len(self.ks_set) - 1, 0, -1): for i in range(len(self.ks_set) - 1, 0, -1):
...@@ -542,15 +568,16 @@ class SuperConv2DTranspose(nn.Conv2DTranspose): ...@@ -542,15 +568,16 @@ class SuperConv2DTranspose(nn.Conv2DTranspose):
target_ks = self.ks_set[i - 1] target_ks = self.ks_set[i - 1]
start, end = compute_start_end(src_ks, target_ks) start, end = compute_start_end(src_ks, target_ks)
_input_filter = start_filter[:, :, start:end, start:end] _input_filter = start_filter[:, :, start:end, start:end]
_input_filter = paddle.reshape( _input_filter = fluid.layers.reshape(
_input_filter, _input_filter,
shape=[(_input_filter.shape[0] * _input_filter.shape[1]), shape=[(_input_filter.shape[0] * _input_filter.shape[1]),
-1]) -1])
_input_filter = paddle.matmul( core.ops.matmul(_input_filter,
_input_filter, self.__getattr__('%dto%d_matrix' %
self.__getattr__('%dto%d_matrix' % (src_ks, target_ks)),
(src_ks, target_ks)), False, False) _input_filter, 'transpose_X', False,
_input_filter = paddle.reshape( 'transpose_Y', False, "alpha", 1)
_input_filter = fluid.layers.reshape(
_input_filter, _input_filter,
shape=[ shape=[
filters.shape[0], filters.shape[1], target_ks, target_ks filters.shape[0], filters.shape[1], target_ks, target_ks
...@@ -560,15 +587,26 @@ class SuperConv2DTranspose(nn.Conv2DTranspose): ...@@ -560,15 +587,26 @@ class SuperConv2DTranspose(nn.Conv2DTranspose):
return filters return filters
def get_groups_in_out_nc(self, in_nc, out_nc): def get_groups_in_out_nc(self, in_nc, out_nc):
### standard conv if self._groups == 1 or self._groups == None:
return self._groups, in_nc, out_nc ### standard conv
return self._groups, in_nc, out_nc
def forward(self, elif self._groups == self._num_channels:
input, ### depthwise convolution
output_size=None, if in_nc != out_nc:
kernel_size=None, _logger.debug(
expand_ratio=None, "input channel and output channel in depthwise conv is different, change output channel to input channel! origin channel:(in_nc {}, out_nc {}): ".
channel=None): format(in_nc, out_nc))
groups = in_nc
out_nc = in_nc
return groups, in_nc, out_nc
else:
### groups convolution
### groups conv transpose: weight: (Cin, Cout/G, Kh, Kw)
groups = self._groups
out_nc = int(out_nc // groups)
return groups, in_nc, out_nc
def forward(self, input, kernel_size=None, expand_ratio=None, channel=None):
self.cur_config = { self.cur_config = {
'kernel_size': kernel_size, 'kernel_size': kernel_size,
'expand_ratio': expand_ratio, 'expand_ratio': expand_ratio,
...@@ -583,43 +621,34 @@ class SuperConv2DTranspose(nn.Conv2DTranspose): ...@@ -583,43 +621,34 @@ class SuperConv2DTranspose(nn.Conv2DTranspose):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self._out_channels out_nc = self._num_filters
ks = int(self._kernel_size[0]) if kernel_size == None else int( ks = int(self._filter_size[0]) if kernel_size == None else int(
kernel_size) kernel_size)
groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc, groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc,
out_nc) out_nc)
weight = self.get_active_filter(weight_in_nc, weight_out_nc, ks) weight = self.get_active_filter(weight_in_nc, weight_out_nc, ks)
if kernel_size != None or 'kernel_size' in self.candidate_config.keys(): if kernel_size != None or 'kernel_size' in self.candidate_config.keys():
padding = convert_to_list(get_same_padding(ks), 2) padding = convert_to_list(get_same_padding(ks), 2)
else: else:
padding = self._padding padding = self._padding
if output_size is None: op = getattr(core.ops, self._op_type)
output_padding = self.output_padding out = op(input, weight, 'output_size', self._output_size, 'strides',
else: self._stride, 'paddings', padding, 'dilations', self._dilation,
output_padding = 0 'groups', groups, 'use_cudnn', self._use_cudnn)
pre_bias = out
out_nc = int(pre_bias.shape[1])
if self.bias is not None: if self.bias is not None:
bias = self.bias[:out_nc] bias = self.bias[:out_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
bias = self.bias pre_act = pre_bias
out = F.conv2d_transpose( return dygraph_utils._append_activation_in_dygraph(
input, pre_act, act=self._act)
weight,
bias=bias,
padding=padding,
output_padding=output_padding,
stride=self._stride,
dilation=self._dilation,
groups=self._groups,
output_size=output_size,
data_format=self._data_format)
return out
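As a usage sketch of the elastic transposed convolution above (mirroring the unit-test cases further down in this patch; the `paddleslim.nas.ofa.layers` import path and the input sizes are assumptions for illustration): forward can be called with a smaller `kernel_size`, and when `transform_kernel` is set the larger kernel's centre is mapped through the learned `%dto%d_matrix` parameters built in `_make_scale_param` style code above.

    import numpy as np
    import paddle
    from paddleslim.nas.ofa.layers import SuperConv2DTranspose

    convt = SuperConv2DTranspose(
        4, 4, 7,
        candidate_config={'kernel_size': (3, 5, 7)},
        transform_kernel=True)
    x = paddle.to_tensor(np.random.random((1, 4, 16, 16)).astype('float32'))
    y_full = convt(x)                  # uses the full 7x7 kernel
    y_small = convt(x, kernel_size=3)  # 7x7 -> 5x5 -> 3x3 via the transform matrices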
class SuperGroupConv2DTranspose(SuperConv2DTranspose): class SuperGroupConv2DTranspose(SuperConv2DTranspose):
...@@ -643,7 +672,7 @@ class SuperDepthwiseConv2DTranspose(SuperConv2DTranspose): ...@@ -643,7 +672,7 @@ class SuperDepthwiseConv2DTranspose(SuperConv2DTranspose):
### NOTE: only search channel, write for GAN-compression, maybe change to SuperDepthwiseConv and SuperConv after. ### NOTE: only search channel, write for GAN-compression, maybe change to SuperDepthwiseConv and SuperConv after.
class SuperSeparableConv2D(nn.Layer): class SuperSeparableConv2D(fluid.dygraph.Layer):
""" """
This interface is used to construct a callable object of the ``SuperSeparableConv2D`` This interface is used to construct a callable object of the ``SuperSeparableConv2D``
class. class.
...@@ -653,8 +682,8 @@ class SuperSeparableConv2D(nn.Layer): ...@@ -653,8 +682,8 @@ class SuperSeparableConv2D(nn.Layer):
the second conv's inputs, used to change the first dimension of weight and bias, the second conv's inputs, used to change the first dimension of weight and bias,
only train the first channels of the weight and bias. only train the first channels of the weight and bias.
The architecture of super separable convolution2D op is [Conv2D, norm layer(may be BatchNorm2D The architecture of super separable convolution2D op is [Conv2D, norm layer(may be BatchNorm
or InstanceNorm2D), Conv2D]. The first conv is depthwise conv, the filter number is input channel or InstanceNorm), Conv2D]. The first conv is depthwise conv, the filter number is input channel
multiply scale_factor, the group is equal to the number of input channel. The second conv multiply scale_factor, the group is equal to the number of input channel. The second conv
is standard conv, which filter size and stride size are 1. is standard conv, which filter size and stride size are 1.
...@@ -674,57 +703,62 @@ class SuperSeparableConv2D(nn.Layer): ...@@ -674,57 +703,62 @@ class SuperSeparableConv2D(nn.Layer):
dilation(int or tuple, optional): The first conv's dilation size. If dilation is a tuple, dilation(int or tuple, optional): The first conv's dilation size. If dilation is a tuple,
it must contain two integers, (dilation_H, dilation_W). Otherwise, the it must contain two integers, (dilation_H, dilation_W). Otherwise, the
dilation_H = dilation_W = dilation. Default: 1. dilation_H = dilation_W = dilation. Default: 1.
norm_layer(class): The normalization layer between two convolution. Default: InstanceNorm2D. norm_layer(class): The normalization layer between two convolution. Default: InstanceNorm.
bias_attr (ParamAttr or bool, optional): The attribute for the bias of convolution. bias_attr (ParamAttr or bool, optional): The attribute for the bias of convolution.
If it is set to False, no bias will be added to the output units. If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, convolution If it is set to None or one attribute of ParamAttr, convolution
will create ParamAttr as bias_attr. If the Initializer of the bias_attr will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None. is not set, the bias is initialized zero. Default: None.
scale_factor(float): The scale factor of the first conv's output channel. Default: 1. scale_factor(float): The scale factor of the first conv's output channel. Default: 1.
use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True.
Returns: Returns:
None None
""" """
def __init__(self, def __init__(self,
in_channels, num_channels,
out_channels, num_filters,
kernel_size, filter_size,
candidate_config={}, candidate_config={},
stride=1, stride=1,
padding=0, padding=0,
dilation=1, dilation=1,
norm_layer=nn.InstanceNorm2D, norm_layer=InstanceNorm,
bias_attr=None, bias_attr=None,
scale_factor=1): scale_factor=1,
use_cudnn=False):
super(SuperSeparableConv2D, self).__init__() super(SuperSeparableConv2D, self).__init__()
self.conv = nn.LayerList([ self.conv = fluid.dygraph.LayerList([
nn.Conv2D( fluid.dygraph.nn.Conv2D(
in_channels=in_channels, num_channels=num_channels,
out_channels=in_channels * scale_factor, num_filters=num_channels * scale_factor,
kernel_size=kernel_size, filter_size=filter_size,
stride=stride, stride=stride,
padding=padding, padding=padding,
groups=in_channels, use_cudnn=False,
groups=num_channels,
bias_attr=bias_attr) bias_attr=bias_attr)
]) ])
self.conv.extend([norm_layer(in_channels * scale_factor)]) self.conv.extend([norm_layer(num_channels * scale_factor)])
self.conv.extend([ self.conv.extend([
nn.Conv2D( fluid.dygraph.nn.Conv2D(
in_channels=in_channels * scale_factor, num_channels=num_channels * scale_factor,
out_channels=out_channels, num_filters=num_filters,
kernel_size=1, filter_size=1,
stride=1, stride=1,
use_cudnn=use_cudnn,
bias_attr=bias_attr) bias_attr=bias_attr)
]) ])
self.candidate_config = candidate_config self.candidate_config = candidate_config
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.base_output_dim = self.conv[0]._out_channels self.base_output_dim = self.conv[0]._num_filters
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_output_dim = int(self.conv[0]._out_channels / self.base_output_dim = int(self.conv[0]._num_filters /
max(self.expand_ratio)) max(self.expand_ratio))
def forward(self, input, expand_ratio=None, channel=None): def forward(self, input, expand_ratio=None, channel=None):
...@@ -738,70 +772,88 @@ class SuperSeparableConv2D(nn.Layer): ...@@ -738,70 +772,88 @@ class SuperSeparableConv2D(nn.Layer):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self.conv[0]._out_channels out_nc = self.conv[0]._num_filters
weight = self.conv[0].weight[:in_nc] weight = self.conv[0].weight[:in_nc]
### conv1 ### conv1
if self.conv[0]._l_type == 'conv2d':
attrs = ('strides', self.conv[0]._stride, 'paddings',
self.conv[0]._padding, 'dilations', self.conv[0]._dilation,
'groups', in_nc, 'use_cudnn', self.conv[0]._use_cudnn)
out = core.ops.conv2d(input, weight, *attrs)
elif self.conv[0]._l_type == 'depthwise_conv2d':
attrs = ('strides', self.conv[0]._stride, 'paddings',
self.conv[0]._padding, 'dilations', self.conv[0]._dilation,
'groups', in_nc, 'use_cudnn', self.conv[0]._use_cudnn)
out = core.ops.depthwise_conv2d(input, weight, *attrs)
else:
raise ValueError("conv type error")
pre_bias = out
if self.conv[0].bias is not None: if self.conv[0].bias is not None:
bias = self.conv[0].bias[:in_nc] bias = self.conv[0].bias[:in_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
bias = self.conv[0].bias pre_act = pre_bias
conv0_out = F.conv2d( conv0_out = dygraph_utils._append_activation_in_dygraph(
input, pre_act, self.conv[0]._act)
weight,
bias,
stride=self.conv[0]._stride,
padding=self.conv[0]._padding,
dilation=self.conv[0]._dilation,
groups=in_nc,
data_format=self.conv[0]._data_format)
norm_out = self.conv[1](conv0_out) norm_out = self.conv[1](conv0_out)
weight = self.conv[2].weight[:out_nc, :in_nc, :, :] weight = self.conv[2].weight[:out_nc, :in_nc, :, :]
if self.conv[2]._l_type == 'conv2d':
attrs = ('strides', self.conv[2]._stride, 'paddings',
self.conv[2]._padding, 'dilations', self.conv[2]._dilation,
'groups', self.conv[2]._groups if self.conv[2]._groups else
1, 'use_cudnn', self.conv[2]._use_cudnn)
out = core.ops.conv2d(norm_out, weight, *attrs)
elif self.conv[2]._l_type == 'depthwise_conv2d':
attrs = ('strides', self.conv[2]._stride, 'paddings',
self.conv[2]._padding, 'dilations', self.conv[2]._dilation,
'groups', self.conv[2]._groups, 'use_cudnn',
self.conv[2]._use_cudnn)
out = core.ops.depthwise_conv2d(norm_out, weight, *attrs)
else:
raise ValueError("conv type error")
pre_bias = out
if self.conv[2].bias is not None: if self.conv[2].bias is not None:
bias = self.conv[2].bias[:out_nc] bias = self.conv[2].bias[:out_nc]
pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
else: else:
bias = self.conv[2].bias pre_act = pre_bias
conv1_out = F.conv2d( conv1_out = dygraph_utils._append_activation_in_dygraph(
norm_out, pre_act, self.conv[2]._act)
weight,
bias,
stride=self.conv[2]._stride,
padding=self.conv[2]._padding,
dilation=self.conv[2]._dilation,
groups=self.conv[2]._groups,
data_format=self.conv[2]._data_format)
return conv1_out return conv1_out
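A small usage sketch of the separable block described in the docstring above (depthwise conv, norm layer, then 1x1 conv); the constructor arguments mirror the test cases later in this patch, and the import path plus tensor sizes are assumptions. Passing expand_ratio to forward shrinks the pointwise conv's output width.

    import numpy as np
    import paddle
    from paddleslim.nas.ofa.layers import SuperSeparableConv2D

    sep = SuperSeparableConv2D(
        4, 4, 3, padding=1,
        candidate_config={'expand_ratio': (0.5, 1.0)})
    x = paddle.to_tensor(np.random.random((1, 4, 32, 32)).astype('float32'))
    y_full = sep(x)                    # largest sub-layer, 4 output channels
    y_half = sep(x, expand_ratio=0.5)  # pointwise conv keeps only 2 output channels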
class SuperLinear(nn.Linear): class SuperLinear(fluid.dygraph.Linear):
""" """
""" """
def __init__(self, def __init__(self,
in_features, input_dim,
out_features, output_dim,
candidate_config={}, candidate_config={},
weight_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
name=None): act=None,
super(SuperLinear, self).__init__(in_features, out_features, dtype="float32"):
weight_attr, bias_attr, name) super(SuperLinear, self).__init__(input_dim, output_dim, param_attr,
self._weight_attr = weight_attr bias_attr, act, dtype)
self._param_attr = param_attr
self._bias_attr = bias_attr self._bias_attr = bias_attr
self._in_features = in_features self.output_dim = output_dim
self._out_features = out_features
self.candidate_config = candidate_config self.candidate_config = candidate_config
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.base_output_dim = self._out_features self.base_output_dim = self.output_dim
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_output_dim = int(self._out_features / self.base_output_dim = int(self.output_dim / max(self.expand_ratio))
max(self.expand_ratio))
def forward(self, input, expand_ratio=None, channel=None): def forward(self, input, expand_ratio=None, channel=None):
self.cur_config = {'expand_ratio': expand_ratio, 'channel': channel} self.cur_config = {'expand_ratio': expand_ratio, 'channel': channel}
...@@ -815,39 +867,53 @@ class SuperLinear(nn.Linear): ...@@ -815,39 +867,53 @@ class SuperLinear(nn.Linear):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self._out_features out_nc = self.output_dim
weight = self.weight[:in_nc, :out_nc] weight = self.weight[:in_nc, :out_nc]
if self._bias_attr != False: if self._bias_attr != False:
bias = self.bias[:out_nc] bias = self.bias[:out_nc]
use_bias = True
pre_bias = _varbase_creator(dtype=input.dtype)
core.ops.matmul(input, weight, pre_bias, 'transpose_X', False,
'transpose_Y', False, "alpha", 1)
if self._bias_attr != False:
pre_act = dygraph_utils._append_bias_in_dygraph(
pre_bias, bias, axis=len(input.shape) - 1)
else: else:
bias = self.bias pre_act = pre_bias
out = F.linear(x=input, weight=weight, bias=bias, name=self.name) return dygraph_utils._append_activation_in_dygraph(pre_act, self._act)
return out
class SuperBatchNorm2D(nn.BatchNorm2D): class SuperBatchNorm(fluid.dygraph.BatchNorm):
""" """
add comment add comment
""" """
def __init__(self, def __init__(self,
num_features, num_channels,
act=None,
is_test=False,
momentum=0.9, momentum=0.9,
epsilon=1e-05, epsilon=1e-05,
weight_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
data_format='NCHW', dtype='float32',
name=None): data_layout='NCHW',
super(SuperBatchNorm2D, self).__init__(num_features, momentum, epsilon, in_place=False,
weight_attr, bias_attr, moving_mean_name=None,
data_format, name) moving_variance_name=None,
do_model_average_for_mean_and_var=True,
use_global_stats=False,
trainable_statistics=False):
super(SuperBatchNorm, self).__init__(
num_channels, act, is_test, momentum, epsilon, param_attr,
bias_attr, dtype, data_layout, in_place, moving_mean_name,
moving_variance_name, do_model_average_for_mean_and_var,
use_global_stats, trainable_statistics)
def forward(self, input): def forward(self, input):
self._check_data_format(self._data_format)
self._check_input_dim(input)
feature_dim = int(input.shape[1]) feature_dim = int(input.shape[1])
weight = self.weight[:feature_dim] weight = self.weight[:feature_dim]
...@@ -855,97 +921,96 @@ class SuperBatchNorm2D(nn.BatchNorm2D): ...@@ -855,97 +921,96 @@ class SuperBatchNorm2D(nn.BatchNorm2D):
mean = self._mean[:feature_dim] mean = self._mean[:feature_dim]
variance = self._variance[:feature_dim] variance = self._variance[:feature_dim]
return F.batch_norm( mean_out = mean
input, variance_out = variance
mean,
variance, attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
weight=weight, "is_test", not self.training, "data_layout", self._data_layout,
bias=bias, "use_mkldnn", False, "fuse_with_relu", self._fuse_with_relu,
training=self.training, "use_global_stats", self._use_global_stats,
momentum=self._momentum, 'trainable_statistics', self._trainable_statistics)
epsilon=self._epsilon, batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
data_format=self._data_format) input, weight, bias, mean, variance, mean_out, variance_out, *attrs)
return dygraph_utils._append_activation_in_dygraph(
batch_norm_out, act=self._act)
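The elastic norm layers simply follow whatever channel count arrives at runtime: forward slices the weight, bias and running statistics to input.shape[1]. A minimal sketch, assuming the 2.0-style class name shown on the left of this diff and a made-up input size:

    import numpy as np
    import paddle
    from paddleslim.nas.ofa.layers import SuperBatchNorm2D

    bn = SuperBatchNorm2D(4)
    x = paddle.to_tensor(np.random.random((2, 2, 8, 8)).astype('float32'))
    y = bn(x)   # only the first 2 of the 4 affine/statistics entries are used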
class SuperInstanceNorm2D(nn.InstanceNorm2D): class SuperInstanceNorm(fluid.dygraph.InstanceNorm):
""" """
""" """
def __init__(self, def __init__(self,
num_features, num_channels,
epsilon=1e-05, epsilon=1e-05,
momentum=0.9, param_attr=None,
weight_attr=None,
bias_attr=None, bias_attr=None,
data_format='NCHW', dtype='float32'):
name=None): super(SuperInstanceNorm, self).__init__(num_channels, epsilon,
super(SuperInstanceNorm2D, self).__init__(num_features, epsilon, param_attr, bias_attr, dtype)
momentum, weight_attr,
bias_attr, data_format, name)
def forward(self, input): def forward(self, input):
self._check_input_dim(input)
feature_dim = int(input.shape[1]) feature_dim = int(input.shape[1])
if self._weight_attr == False and self._bias_attr == False:
if self._param_attr == False and self._bias_attr == False:
scale = None scale = None
bias = None bias = None
else: else:
scale = self.scale[:feature_dim] scale = self.scale[:feature_dim]
bias = self.bias[:feature_dim] bias = self.bias[:feature_dim]
return F.instance_norm(input, scale, bias, eps=self._epsilon) out, _, _ = core.ops.instance_norm(input, scale, bias, 'epsilon',
self._epsilon)
return out
class SuperLayerNorm(nn.LayerNorm): class SuperLayerNorm(fluid.dygraph.LayerNorm):
def __init__(self, def __init__(self,
normalized_shape, normalized_shape,
scale=True,
shift=True,
epsilon=1e-05, epsilon=1e-05,
weight_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
name=None): act=None,
super(SuperLayerNorm, self).__init__(normalized_shape, epsilon, dtype='float32'):
weight_attr, bias_attr, name) super(SuperLayerNorm,
self).__init__(normalized_shape, scale, shift, epsilon,
param_attr, bias_attr, act, dtype)
def forward(self, input): def forward(self, input):
### TODO(ceci3): fix if normalized_shape is not a single number input_shape = list(input.shape)
input_ndim = len(list(input.shape)) input_ndim = len(input_shape)
normalized_ndim = len(self._normalized_shape) normalized_ndim = len(self._normalized_shape)
begin_norm_axis = input_ndim - normalized_ndim self._begin_norm_axis = input_ndim - normalized_ndim
### TODO(ceci3): fix if normalized_shape is not a single number
feature_dim = int(input.shape[-1]) feature_dim = int(input.shape[-1])
if self._weight_attr != False: weight = self.weight[:feature_dim]
weight = self.weight[:feature_dim] bias = self.bias[:feature_dim]
else: pre_act, _, _ = core.ops.layer_norm(input, weight, bias, 'epsilon',
weight = None self._epsilon, 'begin_norm_axis',
if self._bias_attr != False: self._begin_norm_axis)
bias = self.bias[:feature_dim] return dygraph_utils._append_activation_in_dygraph(
else: pre_act, act=self._act)
bias = None
out, _, _ = core.ops.layer_norm(input, weight, bias, 'epsilon',
self._epsilon, 'begin_norm_axis',
begin_norm_axis)
return out
class SuperEmbedding(nn.Embedding): class SuperEmbedding(fluid.dygraph.Embedding):
def __init__(self, def __init__(self,
num_embeddings, size,
embedding_dim,
candidate_config={}, candidate_config={},
is_sparse=False,
is_distributed=False,
padding_idx=None, padding_idx=None,
sparse=False, param_attr=None,
weight_attr=None, dtype='float32'):
name=None): super(SuperEmbedding, self).__init__(size, is_sparse, is_distributed,
super(SuperEmbedding, self).__init__(num_embeddings, embedding_dim, padding_idx, param_attr, dtype)
padding_idx, sparse, weight_attr,
name)
self.candidate_config = candidate_config self.candidate_config = candidate_config
self.expand_ratio = candidate_config[ self.expand_ratio = candidate_config[
'expand_ratio'] if 'expand_ratio' in candidate_config else None 'expand_ratio'] if 'expand_ratio' in candidate_config else None
self.base_output_dim = self._embedding_dim self.base_output_dim = self._size[-1]
if self.expand_ratio != None: if self.expand_ratio != None:
self.base_output_dim = int(self._embedding_dim / self.base_output_dim = int(self._size[-1] / max(self.expand_ratio))
max(self.expand_ratio))
def forward(self, input, expand_ratio=None, channel=None): def forward(self, input, expand_ratio=None, channel=None):
assert ( assert (
...@@ -956,12 +1021,10 @@ class SuperEmbedding(nn.Embedding): ...@@ -956,12 +1021,10 @@ class SuperEmbedding(nn.Embedding):
elif channel != None: elif channel != None:
out_nc = int(channel) out_nc = int(channel)
else: else:
out_nc = self._embedding_dim out_nc = self._size[-1]
weight = self.weight[:, :out_nc] weight = self.weight[:, :out_nc]
return F.embedding( return core.ops.lookup_table_v2(
input, weight, input, 'is_sparse', self._is_sparse, 'is_distributed',
weight=weight, self._is_distributed, 'remote_prefetch', self._remote_prefetch,
padding_idx=self._padding_idx, 'padding_idx', self._padding_idx)
sparse=self._sparse,
name=self._name)
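To close out the elastic layers, a hedged sketch of a tiny width-elastic stack built from SuperEmbedding, SuperLinear and SuperLayerNorm; the sizes and candidate_config values mirror the unit tests further down, and the 2.0-style constructors (num_embeddings/embedding_dim, in_features/out_features) are assumed.

    import numpy as np
    import paddle
    import paddle.nn as nn
    from paddleslim.nas.ofa.layers import SuperEmbedding, SuperLinear, SuperLayerNorm

    net = nn.Sequential(
        SuperEmbedding(64, 64, candidate_config={'expand_ratio': (0.5, 1.0)}),
        SuperLinear(64, 64, candidate_config={'expand_ratio': (0.5, 1.0)}),
        SuperLayerNorm(64))
    ids = paddle.to_tensor(np.random.randint(0, 64, (4, 16)).astype('int64'))
    out = net(ids)   # without an explicit expand_ratio each layer runs at full width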
...@@ -20,10 +20,10 @@ import paddle.fluid as fluid ...@@ -20,10 +20,10 @@ import paddle.fluid as fluid
from .utils.utils import get_paddle_version from .utils.utils import get_paddle_version
pd_ver = get_paddle_version() pd_ver = get_paddle_version()
if pd_ver == 185: if pd_ver == 185:
from .layers import BaseBlock, SuperConv2D, SuperLinear from .layers_old import BaseBlock, SuperConv2D, SuperLinear
Layer = paddle.fluid.dygraph.Layer Layer = paddle.fluid.dygraph.Layer
else: else:
from .layers_new import BaseBlock, SuperConv2D, SuperLinear from .layers import BaseBlock, SuperConv2D, SuperLinear
Layer = paddle.nn.Layer Layer = paddle.nn.Layer
from .utils.utils import search_idx from .utils.utils import search_idx
from ...common import get_logger from ...common import get_logger
...@@ -32,16 +32,40 @@ _logger = get_logger(__name__, level=logging.INFO) ...@@ -32,16 +32,40 @@ _logger = get_logger(__name__, level=logging.INFO)
__all__ = ['OFA', 'RunConfig', 'DistillConfig'] __all__ = ['OFA', 'RunConfig', 'DistillConfig']
RunConfig = namedtuple('RunConfig', [ RunConfig = namedtuple(
'train_batch_size', 'n_epochs', 'save_frequency', 'eval_frequency', 'RunConfig',
'init_learning_rate', 'total_images', 'elastic_depth', 'dynamic_batch_size' [
]) # int, batch_size in training, used to get current epoch, default: None
'train_batch_size',
# list, the number of epochs of every task in training, default: None
'n_epochs',
# list, initial learning rate of every task in training, NOT used now. Default: None.
'init_learning_rate',
# int, total images of train dataset, used to get current epoch, default: None
'total_images',
# list, elastic depth of the model in training, default: None
'elastic_depth',
# list, the number of sub-networks to train per mini-batch of data, used to get current epoch, default: None
'dynamic_batch_size'
])
RunConfig.__new__.__defaults__ = (None, ) * len(RunConfig._fields) RunConfig.__new__.__defaults__ = (None, ) * len(RunConfig._fields)
DistillConfig = namedtuple('DistillConfig', [ DistillConfig = namedtuple(
'lambda_distill', 'teacher_model', 'mapping_layers', 'teacher_model_path', 'DistillConfig',
'distill_fn', 'mapping_op' [
]) # float, lambda scale of distillation loss, default: None.
'lambda_distill',
# instance of model, the teacher model, default: None.
'teacher_model',
# list(str), names of the layers which need distillation, default: None.
'mapping_layers',
# str, the path of teacher pretrained model, default: None.
'teacher_model_path',
# instance of loss layer, the loss function used in distillation; if set to None, mse_loss is used by default. Default: None.
'distill_fn',
# str, define which op to append between the teacher model and the student model in distillation, chosen from ['conv', 'linear', None], default: None.
'mapping_op'
])
DistillConfig.__new__.__defaults__ = (None, ) * len(DistillConfig._fields) DistillConfig.__new__.__defaults__ = (None, ) * len(DistillConfig._fields)
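Since every field defaults to None, only the entries a run actually needs have to be filled in. A hedged construction sketch of the two configs (the numbers and the teacher model are placeholders for illustration, not values from this patch):

    from paddle.vision.models import mobilenet_v1
    from paddleslim.nas.ofa import RunConfig, DistillConfig

    teacher_model = mobilenet_v1()          # any trained paddle.nn.Layer works here

    run_config = RunConfig(
        train_batch_size=256,
        n_epochs=[[1], [2, 3]],
        total_images=1281167,
        elastic_depth=(2, 5, 8),
        dynamic_batch_size=[1, 1])

    distill_config = DistillConfig(
        lambda_distill=0.01,
        teacher_model=teacher_model,
        teacher_model_path=None,
        distill_fn=None)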
...@@ -89,15 +113,31 @@ class OFABase(Layer): ...@@ -89,15 +113,31 @@ class OFABase(Layer):
class OFA(OFABase): class OFA(OFABase):
"""
Convert the training process to the Once-For-All training process; a detailed description can be found in the paper: `Once-for-All: Train One Network and Specialize it for Efficient Deployment <https://arxiv.org/abs/1908.09791>`_ . This paper proposes a training process named progressive shrinking (PS), which means we start by training the largest neural network with the maximum kernel size (i.e., 7), depth (i.e., 4), and width (i.e., 6). Next, we progressively fine-tune the network to support smaller sub-networks by gradually adding them into the sampling space (larger sub-networks may also be sampled). Specifically, after training the largest network, we first support elastic kernel size, which can be chosen from {3, 5, 7} at each layer, while the depth and width remain at the maximum values. Then, we support elastic depth and elastic width sequentially.
Parameters:
model(paddle.nn.Layer): instance of model.
run_config(paddleslim.ofa.RunConfig, optional): config used in OFA training, can reference `<>`_ . Default: None.
distill_config(paddleslim.ofa.DistillConfig, optional): config of distillation in OFA training, can reference `<>`_. Default: None.
elastic_order(list, optional): define the training order; if set to None, the default order in the paper is used. Default: None.
train_full(bool, optional): whether to train the largest sub-network only. Default: False.
Examples:
.. code-block:: python
from paddleslim.nas.ofa import OFA
ofa_model = OFA(model)
"""
def __init__(self, def __init__(self,
model, model,
run_config=None, run_config=None,
net_config=None,
distill_config=None, distill_config=None,
elastic_order=None, elastic_order=None,
train_full=False): train_full=False):
super(OFA, self).__init__(model) super(OFA, self).__init__(model)
self.net_config = net_config self.net_config = None
self.run_config = run_config self.run_config = run_config
self.distill_config = distill_config self.distill_config = distill_config
self.elastic_order = elastic_order self.elastic_order = elastic_order
...@@ -278,12 +318,29 @@ class OFA(OFABase): ...@@ -278,12 +318,29 @@ class OFA(OFABase):
self.layers, sample_type=sample_type, task=task, phase=phase) self.layers, sample_type=sample_type, task=task, phase=phase)
return config return config
def set_task(self, task=None, phase=None): def set_task(self, task, phase=None):
"""
Set the task in the OFA training process.
Parameters:
task(list(str)|str): the task to specify in the training process.
phase(int, optional): the search space is increased gradually; use this parameter to specify the phase of the current task. If set to None, the whole search space of the task is used in training. Default: None.
Examples:
.. code-block:: python
ofa_model.set_task('width')
"""
self.manual_set_task = True self.manual_set_task = True
self.task = task self.task = task
self.phase = phase self.phase = phase
def set_epoch(self, epoch): def set_epoch(self, epoch):
"""
Set the current epoch in the OFA training process.
Parameters:
epoch(int): the epoch to set in the training process.
Examples:
.. code-block:: python
ofa_model.set_epoch(3)
"""
self.epoch = epoch self.epoch = epoch
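Taken together, set_epoch and set_task let a training script either follow the built-in progressive-shrinking schedule or pin one elastic dimension. A hedged sketch of the outer loop (model, run_config, distill_config, num_epochs and train_loader are placeholders built elsewhere):

    ofa_model = OFA(model, run_config=run_config, distill_config=distill_config)
    for epoch in range(num_epochs):
        ofa_model.set_epoch(epoch)      # keep the internal schedule in sync with the real epoch counter
        ofa_model.set_task('width')     # optionally restrict training to one elastic dimension
        for batch in train_loader():
            ...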
def _progressive_shrinking(self): def _progressive_shrinking(self):
...@@ -302,6 +359,12 @@ class OFA(OFABase): ...@@ -302,6 +359,12 @@ class OFA(OFABase):
return self._sample_config(task=self.task, phase=phase_idx) return self._sample_config(task=self.task, phase=phase_idx)
def calc_distill_loss(self): def calc_distill_loss(self):
"""
Calculate the distillation loss if distillation is configured.
Examples:
.. code-block:: python
dis_loss = ofa_model.calc_distill_loss()
"""
losses = [] losses = []
assert len(self.netAs) > 0 assert len(self.netAs) > 0
for i, netA in enumerate(self.netAs): for i, netA in enumerate(self.netAs):
...@@ -319,6 +382,8 @@ class OFA(OFABase): ...@@ -319,6 +382,8 @@ class OFA(OFABase):
else: else:
Sact = Sact Sact = Sact
Sact = Sact[0] if isinstance(Sact, tuple) else Sact
Tact = Tact[0] if isinstance(Tact, tuple) else Tact
if self.distill_config.distill_fn == None: if self.distill_config.distill_fn == None:
loss = fluid.layers.mse_loss(Sact, Tact.detach()) loss = fluid.layers.mse_loss(Sact, Tact.detach())
else: else:
...@@ -337,6 +402,15 @@ class OFA(OFABase): ...@@ -337,6 +402,15 @@ class OFA(OFABase):
pass pass
def set_net_config(self, net_config): def set_net_config(self, net_config):
"""
Set the config of the specific sub-network to be trained.
Parameters:
net_config(dict): the config of the sub-network to be trained.
Examples:
.. code-block:: python
config = ofa_model.current_config
ofa_model.set_net_config(config)
"""
self.net_config = net_config self.net_config = net_config
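Putting the pieces together, one supernet training step typically broadcasts a sub-network config, runs the forward pass and, when distillation is configured, adds calc_distill_loss to the task loss. A hedged sketch (criterion, model_inputs and labels are placeholders; with a teacher configured, the wrapped forward is assumed to return both student and teacher outputs):

    net_config = ofa_model.current_config        # or a dict built by hand / via dynabert_config
    ofa_model.set_net_config(net_config)

    student_out, teacher_out = ofa_model(*model_inputs)
    loss = criterion(student_out, labels)
    loss = loss + ofa_model.calc_distill_loss()  # mse_loss over mapped layers unless distill_fn is set
    loss.backward()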
def forward(self, *inputs, **kwargs): def forward(self, *inputs, **kwargs):
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
from .utils import * from .utils import *
from .special_config import *
from .utils import get_paddle_version from .utils import get_paddle_version
pd_ver = get_paddle_version() pd_ver = get_paddle_version()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
__all__ = ['dynabert_config']
def dynabert_config(model, width_mult, depth_mult=1.0):
new_config = dict()
block_num = np.floor((len(model.layers.items()) - 3) / 6)
block_name = block_num * 6 + 2
def fix_exp(idx):
if (idx - 3) % 6 == 0 or (idx - 5) % 6 == 0:
return True
return False
for idx, (block_k, block_v) in enumerate(model.layers.items()):
if isinstance(block_v, dict) and len(block_v.keys()) != 0:
name, name_idx = block_k.split('_'), int(block_k.split('_')[1])
if fix_exp(name_idx) or 'emb' in block_k or idx >= block_name:
block_v['expand_ratio'] = 1.0
else:
block_v['expand_ratio'] = width_mult
if block_k == 'depth':
block_v = depth_mult
new_config[block_k] = block_v
return new_config
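A hedged usage sketch of dynabert_config: it takes an OFA-wrapped supernet plus a width multiplier and returns a per-block config in which the fixed blocks (embeddings, the blocks matched by fix_exp, and the trailing blocks) keep expand_ratio 1.0 while the rest get width_mult. This mirrors the TestSpecialConfig case at the end of this patch; the base model is any paddle.nn.Layer built elsewhere.

    from paddleslim.nas.ofa import OFA
    from paddleslim.nas.ofa.convert_super import Convert, supernet
    from paddleslim.nas.ofa.utils import dynabert_config

    sp_net_config = supernet(expand_ratio=[0.5, 1.0])
    sp_model = Convert(sp_net_config).convert(model)   # model: the base paddle.nn.Layer
    ofa_model = OFA(sp_model)

    net_config = dynabert_config(ofa_model, width_mult=0.5)
    ofa_model.set_net_config(net_config)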
...@@ -22,7 +22,7 @@ from paddle.nn import ReLU ...@@ -22,7 +22,7 @@ from paddle.nn import ReLU
from paddleslim.nas import ofa from paddleslim.nas import ofa
from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig
from paddleslim.nas.ofa.convert_super import supernet from paddleslim.nas.ofa.convert_super import supernet
from paddleslim.nas.ofa.layers_new import Block, SuperSeparableConv2D from paddleslim.nas.ofa.layers import Block, SuperSeparableConv2D
class ModelConv(nn.Layer): class ModelConv(nn.Layer):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("../")
import numpy as np
import unittest
import paddle
import paddle.nn as nn
from paddle.nn import ReLU
from paddleslim.nas import ofa
from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig
from paddleslim.nas.ofa.convert_super import supernet
from paddleslim.nas.ofa.layers import *
class ModelCase1(nn.Layer):
def __init__(self):
super(ModelCase1, self).__init__()
models = [SuperConv2D(3, 4, 3, bias_attr=False)]
models += [SuperConv2D(4, 4, 3, groups=4)]
models += [SuperConv2D(4, 4, 3, groups=2)]
models += [SuperConv2DTranspose(4, 4, 3, bias_attr=False)]
models += [SuperConv2DTranspose(4, 4, 3, groups=4)]
models += [nn.Conv2DTranspose(4, 4, 3, groups=2)]
models += [SuperConv2DTranspose(4, 4, 3, groups=2)]
models += [
SuperSeparableConv2D(
4,
4,
1,
padding=1,
bias_attr=False,
candidate_config={'expand_ratio': (1.0, 2.0)}),
]
self.models = paddle.nn.Sequential(*models)
def forward(self, inputs):
return self.models(inputs)
class TestCase(unittest.TestCase):
def setUp(self):
self.model = ModelCase1()
data_np = np.random.random((1, 3, 64, 64)).astype(np.float32)
self.data = paddle.to_tensor(data_np)
def test_ofa(self):
ofa_model = OFA(self.model)
out = self.model(self.data)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("../")
import numpy as np
import unittest
import paddle
import paddle.nn as nn
from paddleslim.nas import ofa
from paddleslim.nas.ofa import OFA
from paddleslim.nas.ofa.layers_old import *
class ModelCase1(nn.Layer):
def __init__(self):
super(ModelCase1, self).__init__()
models = [SuperConv2D(3, 4, 3, bias_attr=False)]
models += [
SuperConv2D(
4,
4,
7,
candidate_config={
'expand_ratio': (0.5, 1.0),
'kernel_size': (3, 5, 7)
},
transform_kernel=True)
]
models += [SuperConv2D(4, 4, 3, groups=4)]
models += [SuperConv2D(4, 4, 3, groups=2)]
models += [SuperBatchNorm(4)]
models += [SuperConv2DTranspose(4, 4, 3, bias_attr=False)]
models += [
SuperConv2DTranspose(
4,
4,
7,
candidate_config={
'expand_ratio': (0.5, 1.0),
'kernel_size': (3, 5, 7)
},
transform_kernel=True)
]
models += [SuperConv2DTranspose(4, 4, 3, groups=4)]
models += [SuperInstanceNorm(4)]
models += [nn.Conv2DTranspose(4, 4, 3, groups=2)]
models += [SuperConv2DTranspose(4, 4, 3, groups=2)]
models += [
SuperSeparableConv2D(
4,
4,
1,
padding=1,
bias_attr=False,
candidate_config={'expand_ratio': (0.5, 1.0)}),
]
models += [
SuperSeparableConv2D(
4, 4, 1, padding=1, candidate_config={'channel': (2, 4)}),
]
self.models = paddle.nn.Sequential(*models)
def forward(self, inputs):
return self.models(inputs)
class ModelCase2(nn.Layer):
def __init__(self):
super(ModelCase2, self).__init__()
models = [
SuperEmbedding(
size=(64, 64), candidate_config={'expand_ratio': (0.5, 1.0)})
]
models += [
SuperLinear(
64, 64, candidate_config={'expand_ratio': (0.5, 1.0)})
]
models += [SuperLayerNorm(64)]
models += [SuperLinear(64, 64, candidate_config={'channel': (32, 64)})]
models += [
SuperLinear(
64, 64, bias_attr=False,
candidate_config={'channel': (32, 64)})
]
self.models = paddle.nn.Sequential(*models)
def forward(self, inputs):
return self.models(inputs)
class ModelCase3(nn.Layer):
def __init__(self):
super(ModelCase3, self).__init__()
self.conv1 = SuperConv2D(
3,
4,
7,
candidate_config={'kernel_size': (3, 5, 7)},
transform_kernel=True)
self.conv2 = SuperConv2DTranspose(
4,
4,
7,
candidate_config={'kernel_size': (3, 5, 7)},
transform_kernel=True)
def forward(self, inputs):
inputs = self.conv1(inputs, kernel_size=3)
inputs = self.conv2(inputs, kernel_size=3)
return inputs
class TestCase(unittest.TestCase):
def setUp(self):
self.model = ModelCase1()
data_np = np.random.random((1, 3, 64, 64)).astype(np.float32)
self.data = paddle.to_tensor(data_np)
def test_ofa(self):
ofa_model = OFA(self.model)
out = self.model(self.data)
class TestCase2(TestCase):
def setUp(self):
self.model = ModelCase2()
data_np = np.random.random((64, 64)).astype(np.int64)
self.data = paddle.to_tensor(data_np)
class TestCase3(TestCase):
def setUp(self):
self.model = ModelCase3()
data_np = np.random.random((1, 3, 64, 64)).astype(np.float32)
self.data = paddle.to_tensor(data_np)
if __name__ == '__main__':
unittest.main()
...@@ -20,40 +20,36 @@ import paddle ...@@ -20,40 +20,36 @@ import paddle
import paddle.nn as nn import paddle.nn as nn
from paddle.vision.models import mobilenet_v1 from paddle.vision.models import mobilenet_v1
from paddleslim.nas.ofa.convert_super import Convert, supernet from paddleslim.nas.ofa.convert_super import Convert, supernet
from paddleslim.nas.ofa.utils import compute_neuron_head_importance, reorder_head, reorder_neuron, set_state_dict from paddleslim.nas.ofa.utils import compute_neuron_head_importance, reorder_head, reorder_neuron, set_state_dict, dynabert_config
from paddleslim.nas.ofa import OFA
class TestModel(nn.Layer):
def __init__(self):
super(TestModel, self).__init__()
encoder_layer = nn.TransformerEncoderLayer(
312,
12,
1024,
dropout=0.1,
activation='gelu',
attn_dropout=0.1,
act_dropout=0)
self.encoder = nn.TransformerEncoder(encoder_layer, 3)
self.fc = nn.Linear(312, 3)
def forward(self, input_ids, segment_ids, attention_mask=[None, None]):
src = input_ids + segment_ids
out = self.encoder(src, attention_mask)
out = self.fc(out[:, 0])
return out
class TestComputeImportance(unittest.TestCase): class TestComputeImportance(unittest.TestCase):
def setUp(self): def setUp(self):
self.model = self.init_model() self.model = TestModel()
self.data_loader = self.init_data() self.data_loader = self.init_data()
def init_model(self):
class TestModel(nn.Layer):
def __init__(self):
super(TestModel, self).__init__()
encoder_layer = nn.TransformerEncoderLayer(
312,
12,
1024,
dropout=0.1,
activation='gelu',
attn_dropout=0.1,
act_dropout=0)
self.encoder = nn.TransformerEncoder(encoder_layer, 3)
self.fc = nn.Linear(312, 3)
def forward(self,
input_ids,
segment_ids,
attention_mask=[None, None]):
src = input_ids + segment_ids
out = self.encoder(src, attention_mask)
out = self.fc(out[:, 0])
return out
return TestModel()
def init_data(self): def init_data(self):
batch_size = 16 batch_size = 16
hidden_size = 312 hidden_size = 312
...@@ -67,8 +63,7 @@ class TestComputeImportance(unittest.TestCase): ...@@ -67,8 +63,7 @@ class TestComputeImportance(unittest.TestCase):
paddle.to_tensor(labels)), ) paddle.to_tensor(labels)), )
return data return data
def reorder_reorder_neuron_head(self, model, head_importance, def reorder_neuron_head(self, model, head_importance, neuron_importance):
neuron_importance):
# reorder heads and ffn neurons # reorder heads and ffn neurons
for layer, current_importance in enumerate(neuron_importance): for layer, current_importance in enumerate(neuron_importance):
# reorder heads # reorder heads
...@@ -89,8 +84,7 @@ class TestComputeImportance(unittest.TestCase): ...@@ -89,8 +84,7 @@ class TestComputeImportance(unittest.TestCase):
num_heads=12) num_heads=12)
assert (len(head_importance) == 3) assert (len(head_importance) == 3)
assert (len(neuron_importance) == 3) assert (len(neuron_importance) == 3)
self.reorder_reorder_neuron_head(self.model, head_importance, self.reorder_neuron_head(self.model, head_importance, neuron_importance)
neuron_importance)
class TestComputeImportanceCase1(TestComputeImportance): class TestComputeImportanceCase1(TestComputeImportance):
...@@ -125,5 +119,14 @@ class TestSetStateDict(unittest.TestCase): ...@@ -125,5 +119,14 @@ class TestSetStateDict(unittest.TestCase):
set_state_dict(sp_model, self.origin_weights) set_state_dict(sp_model, self.origin_weights)
class TestSpecialConfig(unittest.TestCase):
def test_dynabert(self):
self.model = TestModel()
sp_net_config = supernet(expand_ratio=[0.5, 1.0])
self.model = Convert(sp_net_config).convert(self.model)
ofa_model = OFA(self.model)
config = dynabert_config(ofa_model, 0.5)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()