From 31cbd12fc18eedde40a3d5d04d4a4deed8df5a85 Mon Sep 17 00:00:00 2001
From: ceci3 <ceci3@users.noreply.github.com>
Date: Tue, 2 Feb 2021 12:51:09 +0800
Subject: [PATCH] add ofa docs (#636) (#643)

---
 demo/ofa/bert/README.md                       | 24 +++++
 demo/ofa/bert/export_model.py                 |  8 ++
 demo/ofa/bert/run_glue_ofa.py                 |  8 +-
 docs/zh_cn/api_cn/convert_supernet_api.rst    |  2 +-
 docs/zh_cn/api_cn/nas_api.rst                 | 37 +++++---
 docs/zh_cn/api_cn/ofa_api.rst                 | 22 ++---
 docs/zh_cn/api_cn/ofa_layer_api.rst           | 41 ++++----
 .../ernie_slim_ofa_tutorial.md                |  0
 docs/zh_cn/tutorials/nas/nas_ofa.md           | 93 +++++++++++++++++++
 docs/zh_cn/tutorials/nas/nas_overview.md      | 20 ++++
 .../paddlenlp_slim_ofa_tutorial.md            |  0
 paddleslim/nas/ofa/layers.py                  | 31 ++++---
 paddleslim/nas/ofa/layers_old.py              |  4 +-
 13 files changed, 222 insertions(+), 68 deletions(-)
 rename docs/zh_cn/tutorials/{static => nas}/ernie_slim_ofa_tutorial.md (100%)
 create mode 100644 docs/zh_cn/tutorials/nas/nas_ofa.md
 create mode 100644 docs/zh_cn/tutorials/nas/nas_overview.md
 rename docs/zh_cn/tutorials/{static => nas}/paddlenlp_slim_ofa_tutorial.md (100%)

diff --git a/demo/ofa/bert/README.md b/demo/ofa/bert/README.md
index 7ac94833..83054a4c 100644
--- a/demo/ofa/bert/README.md
+++ b/demo/ofa/bert/README.md
@@ -222,3 +222,27 @@ python -u ./run_glue_ofa.py --model_type bert \
                          --n_gpu 1 \
                          --width_mult_list 1.0 0.8333333333333334 0.6666666666666666 0.5
 ```
+
+# 导出子模型
+根据传入的config导出相应的子模型并转为静态图模型。
+
+## 启动命令
+
+```shell
+python3.7 -u ./export_model.py --model_type bert \
+                             --model_name_or_path ${PATH_OF_QQP_MODEL_AFTER_OFA} \
+                             --max_seq_length 128     \
+                             --sub_model_output_dir ./tmp/$TASK_NAME/dynamic_model \
+                             --static_sub_model ./tmp/$TASK_NAME/static_model \
+                             --n_gpu 1 \
+                             --width_mult 0.6666666666666666
+```
+
+其中参数释义如下:
+- `model_type` 指示了模型类型,当前仅支持BERT模型。
+- `model_name_or_path` 指示了某种特定配置的经过OFA训练后保存的模型,对应有其预训练模型和预训练时使用的tokenizer。若模型相关内容保存在本地,这里也可以提供相应目录地址。
+- `max_seq_length` 表示最大句子长度,超过该长度将被截断。默认:128.
+- `sub_model_output_dir` 指示了导出子模型动态图参数的目录。
+- `static_sub_model` 指示了导出子模型静态图模型及参数的目录,设置为None,则表示不导出静态图模型。默认:None。
+- `n_gpu` 表示使用的 GPU 卡数。若希望使用多卡训练,将其设置为指定数目即可;若为0,则使用CPU。默认:1.
+- `width_mult` 表示导出子模型的宽度。默认:1.0.
diff --git a/demo/ofa/bert/export_model.py b/demo/ofa/bert/export_model.py
index 9763ef14..0a23fa9e 100644
--- a/demo/ofa/bert/export_model.py
+++ b/demo/ofa/bert/export_model.py
@@ -60,6 +60,7 @@ def parse_args():
         "--sub_model_output_dir",
         default=None,
         type=str,
+        required=True,
         help="The output directory where the sub model predictions and checkpoints will be written.",
     )
     parser.add_argument(
@@ -131,6 +132,13 @@ def do_train(args):
         if isinstance(sublayer, paddle.nn.MultiHeadAttention):
             sublayer.num_heads = int(args.width_mult * sublayer.num_heads)
 
+    output_dir = os.path.join(args.sub_model_output_dir,
+                              "model_width_%.5f" % args.width_mult)
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    model_to_save = origin_model
+    model_to_save.save_pretrained(output_dir)
+
     if args.static_sub_model != None:
         export_static_model(origin_model, args.static_sub_model,
                             args.max_seq_length)
diff --git a/demo/ofa/bert/run_glue_ofa.py b/demo/ofa/bert/run_glue_ofa.py
index bc581f2f..9553bfeb 100644
--- a/demo/ofa/bert/run_glue_ofa.py
+++ b/demo/ofa/bert/run_glue_ofa.py
@@ -179,11 +179,9 @@ def evaluate(model, criterion, metric, data_loader, epoch, step,
             correct = metric.compute(logits, labels)
             metric.update(correct)
         results = metric.accumulate()
-        print(
-            "epoch: %d, batch: %d, width_mult: %s, eval loss: %f, %s: %s\n" %
-            (epoch, step, 'teacher' if width_mult == 100 else str(width_mult),
-             loss.numpy(), metric.name(), results),
-            end='')
+        print("epoch: %d, batch: %d, width_mult: %s, eval loss: %f, %s: %s\n" %
+              (epoch, step, 'teacher' if width_mult == 100 else str(width_mult),
+               loss.numpy(), metric.name(), results))
         model.train()
 
 
diff --git a/docs/zh_cn/api_cn/convert_supernet_api.rst b/docs/zh_cn/api_cn/convert_supernet_api.rst
index ba730dee..e596f5d8 100644
--- a/docs/zh_cn/api_cn/convert_supernet_api.rst
+++ b/docs/zh_cn/api_cn/convert_supernet_api.rst
@@ -90,7 +90,7 @@ PaddleSlim提供了三种方式构造超网络,下面分别介绍这三种方
 
 方式三
 ------------------
-直接调用动态OP组网,组网方式和普通模型相同。PaddleSlim支持的动态OP请参考 `动态OP <>`_ 。这种方式的优点是组网更自由,缺点是用法更复杂。
+直接调用动态OP组网,组网方式和普通模型相同。PaddleSlim支持的动态OP请参考 `动态OP <./ofa_layer_api.rst>`_ 。这种方式的优点是组网更自由,缺点是用法更复杂。
 
 .. note::
   - paddleslim.nas.ofa.layers 文件中的动态OP是基于Paddle 2.0beta及其之后的版本实现的。paddleslim.nas.ofa.layers_old文件中的动态OP是基于Paddle 2.0beta之前的版本实现的。
diff --git a/docs/zh_cn/api_cn/nas_api.rst b/docs/zh_cn/api_cn/nas_api.rst
index 9cb0938f..f970380c 100644
--- a/docs/zh_cn/api_cn/nas_api.rst
+++ b/docs/zh_cn/api_cn/nas_api.rst
@@ -45,8 +45,10 @@ SANAS(Simulated Annealing Neural Architecture Search)是基于模拟退火
 
 .. code-block:: python
 
+   import paddle
    from paddleslim.nas import SANAS
    config = [('MobileNetV2Space')]
+   paddle.enable_static()
    sanas = SANAS(configs=config)
 
 .. note::
@@ -82,11 +84,12 @@ SANAS(Simulated Annealing Neural Architecture Search)是基于模拟退火
 
    .. code-block:: python
 
-      import paddle.fluid as fluid
+      import paddle
       from paddleslim.nas import SANAS
       config = [('MobileNetV2Space')]
+      paddle.enable_static()
       sanas = SANAS(configs=config)
-      input = fluid.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
+      input = paddle.static.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
       archs = sanas.next_archs()
       for arch in archs:
           output = arch(input)
@@ -108,9 +111,10 @@ SANAS(Simulated Annealing Neural Architecture Search)是基于模拟退火
 
    .. code-block:: python
 
-      import paddle.fluid as fluid
+      import paddle
       from paddleslim.nas import SANAS
       config = [('MobileNetV2Space')]
+      paddle.enable_static()
       sanas = SANAS(configs=config)
       archs = sanas.next_archs()
       
@@ -134,11 +138,12 @@ SANAS(Simulated Annealing Neural Architecture Search)是基于模拟退火
 
    .. code-block:: python
 
-      import paddle.fluid as fluid
+      import paddle
       from paddleslim.nas import SANAS
       config = [('MobileNetV2Space')]
+      paddle.enable_static()
       sanas = SANAS(configs=config)
-      input = fluid.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
+      input = paddle.static.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
       tokens = ([0] * 25)
       archs = sanas.tokens2arch(tokens)[0]
       print(archs(input))
@@ -154,9 +159,10 @@ SANAS(Simulated Annealing Neural Architecture Search)是基于模拟退火
 
    .. code-block:: python
 
-      import paddle.fluid as fluid
+      import paddle
       from paddleslim.nas import SANAS
       config = [('MobileNetV2Space')]
+      paddle.enable_static()
       sanas = SANAS(configs=config)
       print(sanas.current_info())
 
@@ -222,8 +228,11 @@ RLNAS (Reinforcement Learning Neural Architecture Search)是基于强化学习
 
 .. code-block:: python
 
+   import paddle
    from paddleslim.nas import RLNAS
    config = [('MobileNetV2Space')]
+
+   paddle.enable_static()
    rlnas = RLNAS(key='lstm', configs=config)
 
 
@@ -242,11 +251,12 @@ RLNAS (Reinforcement Learning Neural Architecture Search)是基于强化学习
 
 .. code-block:: python
 
-  import paddle.fluid as fluid
+  import paddle
   from paddleslim.nas import RLNAS
   config = [('MobileNetV2Space')]
+  paddle.enable_static()
   rlnas = RLNAS(key='lstm', configs=config)
-  input = fluid.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
+  input = paddle.static.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
   archs = rlnas.next_archs(1)[0]
   for arch in archs:
       output = arch(input)
@@ -266,9 +276,10 @@ RLNAS (Reinforcement Learning Neural Architecture Search)是基于强化学习
 
 .. code-block:: python
 
-  import paddle.fluid as fluid
+  import paddle
   from paddleslim.nas import RLNAS
   config = [('MobileNetV2Space')]
+  paddle.enable_static()
   rlnas = RLNAS(key='lstm', configs=config)
   rlnas.next_archs(1)
   rlnas.reward(1.0)
@@ -292,9 +303,10 @@ RLNAS (Reinforcement Learning Neural Architecture Search)是基于强化学习
 
 .. code-block:: python
 
-  import paddle.fluid as fluid
+  import paddle
   from paddleslim.nas import RLNAS
   config = [('MobileNetV2Space')]
+  paddle.enable_static()
   rlnas = RLNAS(key='lstm', configs=config)
   archs = rlnas.final_archs(1)
   print(archs)
@@ -314,11 +326,12 @@ RLNAS (Reinforcement Learning Neural Architecture Search)是基于强化学习
 
 .. code-block:: python
 
-  import paddle.fluid as fluid
+  import paddle
   from paddleslim.nas import RLNAS
   config = [('MobileNetV2Space')]
+  paddle.enable_static()
   rlnas = RLNAS(key='lstm', configs=config)
-  input = fluid.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
+  input = paddle.static.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
   tokens = ([0] * 25)
   archs = rlnas.tokens2arch(tokens)[0]
   print(archs(input))
diff --git a/docs/zh_cn/api_cn/ofa_api.rst b/docs/zh_cn/api_cn/ofa_api.rst
index 304cbb04..24c189fc 100644
--- a/docs/zh_cn/api_cn/ofa_api.rst
+++ b/docs/zh_cn/api_cn/ofa_api.rst
@@ -1,7 +1,7 @@
 Once-For-All
 ============
 
-在进行Once-For-All训练之前,需要把普通的模型先转换为由动态OP组网的超网络。超网络转换方式可以参考 `超网络转换 <>`_ 。
+在进行Once-For-All训练之前,需要把普通的模型先转换为由动态OP组网的超网络。超网络转换方式可以参考 `超网络转换 <./convert_supernet_api.rst>`_ 。
 
 Once-For-All 训练参数配置
 ------------------
@@ -14,7 +14,7 @@ RunConfig
   - **train_batch_size:(int, 可选):** 训练时的batch size,用来计算每个epoch包括的iteration数量。默认:None。
   - **n_epochs(list, 可选):** 包含每个阶段运行到多少epochs,用来判断当前epoch在超网训练中所处的阶段,默认:None。
   - **total_images(int, 可选):**  训练集图片数量,用来计算每个epoch包括的iteration数量。默认:None。
-  - **elastic_depth(list/tuple, 可选):** 如果设置为None,则不把depth作为搜索的一部分,否则,采样到的config中会包含depth。对模型depth的改变需要在模型定义中的forward部分配合使用,具体示例可以参考 `示例 <>`_ ,默认:None。
+  - **elastic_depth(list/tuple, 可选):** 如果设置为None,则不把depth作为搜索的一部分,否则,采样到的config中会包含depth。对模型depth的改变需要在模型定义中的forward部分配合使用,具体示例可以参考 `示例 <../tutorials/nas/nas_ofa.md>`_ ,默认:None。
   - **dynamic_batch_size(list, 可选):** 代表每个阶段每个batch数据应该参与几个子网络的训练,shape应该和n_epochs的shape保持一致。默认:None。
 
 **返回:**
@@ -29,7 +29,7 @@ RunConfig
       'train_batch_size': 1,
       'n_epochs': [[1], [2, 3], [4, 5]],
       'total_images': 12,
-      'elastic_depth': (5, 15, 24)
+      'elastic_depth': (5, 15, 24),
       'dynamic_batch_size': [1, 1, 1],
   }
   run_config = RunConfig(**default_run_config)
@@ -67,7 +67,7 @@ DistillConfig
 
 OFA
 ------------------
-把超网络训练方式转换为Once-For-All的方式训练。在 `Once-For-All论文 <>`_ 中,提出 ``Progressive Shrinking`` 的超网络训练方式,具体原理是在训练过程中按照 ``elastic kernel_size`` 、 ``elastic width`` 、 ``elactic depth`` 的顺序分阶段进行训练,并且在训练过程中逐步扩大搜索空间,例如:搜索空间为 ``kernel_size=(3,5,7), expand_ratio=(0.5, 1.0, 2.0), depth=(0.5, 0.75, 1.0)`` ,则在训练过程中首先对kernel size的大小进行动态训练,并把kernel_size的动态训练分为两个阶段,第一阶段kernel_size的搜索空间为 ``[5, 7]`` ,第二阶段kernel_size的搜索空间为 ``[3, 5, 7]`` ;之后把expand_ratio的动态训练加入到超网络训练中,和对kernel_size的训练方式相同,对expand_ratio的动态训练也分为两个阶段,第一阶段expand_ratio的搜索空间为 ``[1.0, 2.0]`` ,第二阶段expand_ratio的搜索空间为 ``[0.5, 1.0, 2.0]`` ;最后对depth进行动态训练,训练阶段和kernel_size相同。
+把超网络训练方式转换为Once-For-All的方式训练。在 `Once-For-All论文 <https://arxiv.org/abs/1908.09791>`_ 中,提出 ``Progressive Shrinking`` 的超网络训练方式,具体原理是在训练过程中按照 ``elastic kernel_size`` 、 ``elastic width`` 、 ``elastic depth`` 的顺序分阶段进行训练,并且在训练过程中逐步扩大搜索空间,例如:搜索空间为 ``kernel_size=(3,5,7), expand_ratio=(0.5, 1.0, 2.0), depth=(0.5, 0.75, 1.0)`` ,则在训练过程中首先对kernel size的大小进行动态训练,并把kernel_size的动态训练分为两个阶段,第一阶段kernel_size的搜索空间为 ``[5, 7]`` ,第二阶段kernel_size的搜索空间为 ``[3, 5, 7]`` ;之后把expand_ratio的动态训练加入到超网络训练中,和对kernel_size的训练方式相同,对expand_ratio的动态训练也分为两个阶段,第一阶段expand_ratio的搜索空间为 ``[1.0, 2.0]`` ,第二阶段expand_ratio的搜索空间为 ``[0.5, 1.0, 2.0]`` ;最后对depth进行动态训练,训练阶段和kernel_size相同。
 
 .. py:class:: paddleslim.nas.ofa.OFA(model, run_config=None, distill_config=None, elastic_order=None, train_full=False)
 
@@ -96,6 +96,7 @@ OFA实例
    sp_net_config = supernet(kernel_size=(3, 5, 7), expand_ratio=[1, 2, 4])
    sp_model = Convert(sp_net_config).convert(model)
    ofa_model = OFA(sp_model)
+
 ..
 
   .. py:method:: set_epoch(epoch)
@@ -110,9 +111,7 @@ OFA实例
 
   **示例代码:**
 
-  .. code-block:: python
-
-    ofa_model.set_epoch(3)
+   ofa_model.set_epoch(3)
 
   .. py:method:: set_task(task, phase=None)
 
@@ -127,9 +126,7 @@ OFA实例
 
   **示例代码:**
 
-  .. code-block:: python
-
-    ofa_model.set_task('width')
+   ofa_model.set_task('width')
 
   .. py:method:: set_net_config(config)
 
@@ -143,8 +140,6 @@ OFA实例
 
   **示例代码:**
 
-  .. code-block:: python
-
     config = {'conv2d_0': {'expand_ratio': 2}, 'conv2d_1': {'expand_ratio': 2}}
     ofa_model.set_net_config(config)
 
@@ -157,8 +152,6 @@ OFA实例
 
   **示例代码:**
 
-  .. code-block:: python
-
     distill_loss = ofa_model.calc_distill_loss()
 
   .. py:method:: search()
@@ -180,7 +173,6 @@ OFA实例
 
   **示例代码:**
 
-  .. code-block:: python
     from paddle.vision.models import mobilenet_v1     
     origin_model = mobilenet_v1()
 
diff --git a/docs/zh_cn/api_cn/ofa_layer_api.rst b/docs/zh_cn/api_cn/ofa_layer_api.rst
index dc6a7a0d..14b34c9a 100644
--- a/docs/zh_cn/api_cn/ofa_layer_api.rst
+++ b/docs/zh_cn/api_cn/ofa_layer_api.rst
@@ -1,7 +1,7 @@
 SuperOP
 ========
 
-PaddleSlim提供了一些API的动态版本,动态API指的是这些OP的参数大小可以在实际运行过程中根据传入的参数进行改变,用法上的差别具体是forward时候需要额外传一些实际运行相关的参数。其中 `layers_old.py <>`_ 对应的是Paddle 2.0alpha及之前版本的API, `layers.py <>`_ 对应的是Paddle 2.0alpha之后版本的API。
+PaddleSlim提供了一些API的动态版本,动态API指的是这些OP的参数大小可以在实际运行过程中根据传入的参数进行改变,用法上的差别具体是forward时候需要额外传一些实际运行相关的参数。其中 `layers_old.py <../../../paddleslim/nas/ofa/layers_old.py>`_ 对应的是Paddle 2.0alpha及之前版本的API, `layers.py <../../../paddleslim/nas/ofa/layers.py>`_ 对应的是Paddle 2.0alpha之后版本的API。
 
 .. py:class:: paddleslim.nas.ofa.layers.Block(fn, fixed=False, key=None)
 
@@ -21,8 +21,9 @@ Block实例
 
 .. code-block:: python
 
-  from paddleslim.nas.ofa.layers import Block
-  block_layer = Block(SuperConv2D(3, 4, 3, candidate_config={'kerne_size': (3, 5, 7)})
+  from paddleslim.nas.ofa.layers import Block, SuperConv2D
+  
+  block_layer = Block(SuperConv2D(3, 4, 3, candidate_config={'kernel_size': (3, 5, 7)}))
 
 .. py:class:: paddleslim.nas.ofa.layers.SuperConv2D(in_channels, out_channels, kernel_size, candidate_config={}, transform_kernel=False, stride=1, padding=0, dilation=1, groups=1, padding_mode='zeros', weight_attr=None, bias_attr=None, data_format='NCHW')
 
@@ -63,7 +64,7 @@ Block实例
    data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
    super_conv2d = SuperConv2D(3, 10, 3)
    config = {'channel': 5}
-   data = paddle.to_variable(data)
+   data = paddle.to_tensor(data)
    conv = super_conv2d(data, **config)
 
 .. py:class:: paddleslim.nas.ofa.layers.SuperConv2DTranspose(in_channels, out_channels, kernel_size, candidate_config={}, transform_kernel=False, stride=1, padding=0, output_padding=0, dilation=1, groups=1, padding_mode='zeros', weight_attr=None, bias_attr=None, data_format='NCHW')
@@ -99,14 +100,14 @@ Block实例
 
 .. code-block:: python
 
-   import paddle 
-   from paddleslim.nas.ofa.layers import SuperConv2D
-   import numpy as np
-   data = np.random.uniform(-1, 1, [32, 10, 32, 32]).astype('float32')
-   config = {'channel': 5}
-   data = paddle.to_variable(data)
-   super_convtranspose = SuperConv2DTranspose(num_channels=32, num_filters=10, filter_size=3)
-   ret = super_convtranspose(paddle.to_variable(data), **config)
+  import paddle 
+  from paddleslim.nas.ofa.layers import SuperConv2DTranspose
+  import numpy as np
+  data = np.random.uniform(-1, 1, [10, 32, 32, 32]).astype('float32')
+  config = {'channel': 5}
+  data = paddle.to_tensor(data)
+  super_convtranspose = SuperConv2DTranspose(32, 10, 3)
+  ret = super_convtranspose(paddle.to_tensor(data), **config)
 
 
 .. py:class:: paddleslim.nas.ofa.layers.SuperLinear(in_features, out_features, candidate_config={}, weight_attr=None, bias_attr=None, name=None):
@@ -138,10 +139,10 @@ Block实例
   import paddle
   from paddleslim.nas.ofa.layers import SuperLinear
 
-  data = np.random.uniform(-1, 1, [32, 64] ).astype('float32')
+  data = np.random.uniform(-1, 1, [32, 64]).astype('float32')
   config = {'channel': 16}
-  linear = SuperLinear(32, 64)
-  data = paddle.to_variable(data)
+  linear = SuperLinear(64, 64)
+  data = paddle.to_tensor(data)
   res = linear(data, **config)
 
 
@@ -175,10 +176,10 @@ Block实例
   import paddle
   from paddleslim.nas.ofa.layers import SuperEmbedding
 
-  data = np.random.uniform(-1, 1, [32, 64]).astype('float32')
+  data = np.random.randint(0, 64, [32, 64]).astype('int64')
   config = {'channel': 16}
-  emb = SuperEmbedding(32, 64)
-  data = paddle.to_variable(data)
+  emb = SuperEmbedding(64, 64)
+  data = paddle.to_tensor(data)
   res = emb(data, **config)
 
 .. py:class:: paddleslim.nas.ofa.layers.SuperBatchNorm2D(num_features, momentum=0.9, epsilon=1e-05, weight_attr=None, bias_attr=None, data_format='NCHW', name=None):
@@ -261,8 +262,8 @@ Block实例
     from paddleslim.nas.ofa.layers import SuperLayerNorm
 
     np.random.seed(123)
-    x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
+    x_data = np.random.random(size=(2, 3)).astype('float32')
     x = paddle.to_tensor(x_data) 
-    layer_norm = SuperLayerNorm(x_data.shape[1:])
+    layer_norm = SuperLayerNorm(x_data.shape[1])
     layer_norm_out = layer_norm(x)
 
diff --git a/docs/zh_cn/tutorials/static/ernie_slim_ofa_tutorial.md b/docs/zh_cn/tutorials/nas/ernie_slim_ofa_tutorial.md
similarity index 100%
rename from docs/zh_cn/tutorials/static/ernie_slim_ofa_tutorial.md
rename to docs/zh_cn/tutorials/nas/ernie_slim_ofa_tutorial.md
diff --git a/docs/zh_cn/tutorials/nas/nas_ofa.md b/docs/zh_cn/tutorials/nas/nas_ofa.md
new file mode 100644
index 00000000..7df1d6f2
--- /dev/null
+++ b/docs/zh_cn/tutorials/nas/nas_ofa.md
@@ -0,0 +1,93 @@
+# Once-For-All
+
+&emsp;&emsp;[Once-For-All(以下简称OFA)](https://arxiv.org/abs/1908.09791)主要的目的是训练一个超网络,根据不同的硬件从超网络中选择满足时延要求和精度要求的小模型。可以基于已有的预训练模型进行压缩也是OFA一个很大的优势。  
+&emsp;&emsp;为了防止子网络之间互相干扰,本论文提出了一种Progressive Shrinking(PS)的模式进行超网络训练,逐步从大型子网络到小型子网络进行训练。首先是从最大的子网络开始训练,例如:超网络包含可变的卷积核大小 kernel_size = {3, 5, 7},可变的网络结构深度 depth = {2, 3, 4} 和可变的网络的宽度 expand_ratio = {2, 4, 6},则训练卷积核为7、深度为4,宽度为6的网络。之后逐步将其添加到搜索空间中来逐步调整网络以支持较小的子网络。具体来说,在训练了最大的网络之后,我们首先支持可变卷积核大小,可以在{3,5,7}中进行选择,而深度和宽度则保持最大值。然后,我们依次支持可变深度和可变宽度。
+
+## 使用方法
+
+OFA的基本流程分为以下步骤:
+1. 定义超网络
+2. 训练配置
+3. 蒸馏配置
+4. 传入模型和相应配置
+
+### 1. 定义超网络
+   这里的超网络指的是用[动态OP](../../api_cn/ofa_layer_api.rst)组网的网络。
+   PaddleSlim提供了三种获得超网络的方式,具体可以参考[超网络转换](../../api_cn/convert_supernet_api.rst)。
+
+```python
+  import paddle
+  from paddle.vision.models import mobilenet_v1
+  from paddleslim.nas.ofa.convert_super import Convert, supernet
+
+  model = mobilenet_v1()
+  sp_net_config = supernet(kernel_size=(3, 5, 7), expand_ratio=[1, 2, 4])
+  sp_model = Convert(sp_net_config).convert(model)
+```
+
+### 2. 训练配置
+   训练配置默认根据论文中PS的训练模式进行配置,可进行配置的参数和含义可以参考: [RunConfig](../../api_cn/ofa_api.rst)
+
+```python
+  from paddleslim.nas.ofa import RunConfig
+  default_run_config = {
+      'train_batch_size': 256,
+      'n_epochs': [[1], [2, 3], [4, 5]],
+      'init_learning_rate': [[0.001], [0.003, 0.001], [0.003, 0.001]],
+      'dynamic_batch_size': [1, 1, 1],
+      'total_images': 1281167,
+      'elastic_depth': (2, 5, 8)
+  }
+  run_config = RunConfig(**default_run_config)
+```
+
+### 3. 蒸馏配置
+  为OFA训练过程添加蒸馏配置,可进行配置的参数和含义可以参考: [DistillConfig](../../api_cn/ofa_api.rst)
+
+```python
+  from paddle.vision.models import mobilenet_v1
+  from paddleslim.nas.ofa import DistillConfig
+  teacher_model = mobilenet_v1()
+
+  default_distill_config = {
+      'teacher_model': teacher_model
+  }
+  distill_config = DistillConfig(**default_distill_config)
+```
+
+### 4. 传入模型和相应配置
+  用OFA封装模型、训练配置和蒸馏配置。配置完模型和正常模型训练流程相同。如果添加了蒸馏,则OFA封装后的模型会比原始模型多返回一组教师网络的输出。
+```python
+  from paddleslim.nas.ofa import OFA
+
+  ofa_model = OFA(sp_model, run_config=run_config, distill_config=distill_config)
+```
+
+## 实验效果
+
+目前我们仅在BERT-base、TinyBERT和TinyERNIE上进行了压缩实验,其他CV任务的压缩效果之后会进行补充。BERT和TinyBERT的压缩结果如下表所示。
+
+&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;<strong>表1: BERT-base上GLUE数据集精度对比</strong>
+| Task  | Metric                       | BERT-base         | Result with PaddleSlim |
+|:-----:|:----------------------------:|:-----------------:|:----------------------:|
+| SST-2 | Accuracy                     |      0.93005      |     [0.931193]()       |
+| QNLI  | Accuracy                     |      0.91781      |     [0.920740]()       |
+| CoLA  | Matthews corr                |      0.59557      |     [0.601244]()       |
+| MRPC  | F1/Accuracy                  |  0.91667/0.88235  |  [0.91740/0.88480]()   |
+| STS-B | Pearson/Spearman corr        |  0.88847/0.88350  |  [0.89271/0.88958]()   |
+| QQP   | Accuracy/F1                  |  0.90581/0.87347  |  [0.90994/0.87947]()   |
+| MNLI  | Matched acc/MisMatched acc   |  0.84422/0.84825  |  [0.84687/0.85242]()   |
+| RTE   | Accuracy                     |      0.711191     |     [0.718412]()       |
+
+
+&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;<strong>表2: TinyBERT上GLUE数据集精度对比</strong>
+| Task  | Metric                       | TinyBERT(L=4, D=312) |     Result with OFA    |
+|:-----:|:----------------------------:|:--------------------:|:----------------------:|
+| SST-2 | Accuracy                     |     [0.9234]()       |      [0.9220]()        |
+| QNLI  | Accuracy                     |     [0.8746]()       |      [0.8720]()        |
+| CoLA  | Matthews corr                |     [0.4961]()       |      [0.5048]()        |
+| MRPC  | F1/Accuracy                  |  [0.8998/0.8554]()   |   [0.9003/0.8578]()    |
+| STS-B | Pearson/Spearman corr        |  [0.8635/0.8631]()   |   [0.8717/0.8706]()    |
+| QQP   | Accuracy/F1                  |  [0.9047/0.8751]()   |   [0.9034/0.8733]()    |
+| MNLI  | Matched acc/MisMatched acc   |  [0.8256/0.8294]()   |   [0.8211/0.8261]()    |
+| RTE   | Accuracy                     |     [0.6534]()       |      [0.6787]()        |
diff --git a/docs/zh_cn/tutorials/nas/nas_overview.md b/docs/zh_cn/tutorials/nas/nas_overview.md
new file mode 100644
index 00000000..b15fd63d
--- /dev/null
+++ b/docs/zh_cn/tutorials/nas/nas_overview.md
@@ -0,0 +1,20 @@
+# PaddleSlim模型结构搜索总览
+
+PaddleSlim提供了4种网络结构搜索的方法:基于模拟退火进行网络结构搜索、基于强化学习进行网络结构搜索、基于梯度进行网络结构搜索和Once-For-All。
+
+| 算法名称  |   算法简介   | 代表模型 |
+|:---------:|:------------:|:--------:|
+| [Once-For-All](nas_ofa.md)    | OFA是一种基于One-Shot NAS的压缩方案。这种方式比较高效,其优势是只需要训练一个超网络就可以从中选择满足不同延时要求的子模型。 | Once-For-All   |
+| [SANAS](../../quick_start/static/nas_tutorial.md)            | SANAS是基于模拟退火的方式进行网络结构搜索,在机器资源不多的情况下,选择这种方式一般能得到比强化学习更好的模型。             | \              |
+| [RLNAS](../../api_cn/nas_api.rst)            | RLNAS是基于强化学习的方式进行网络结构搜索,这种方式需要耗费大量机器资源。 | ENAS、NasNet、MNasNet  |
+| [DARTS/PCDARTS](../../api_cn/darts.rst) | DARTS是基于梯度进行网络结构搜索,这种方式比较高效,大大减少了搜索时间和所需要的机器资源。 |DARTS、PCDARTS、ProxylessNAS|
+
+
+# 参考文献
+[1] H. Cai, C. Gan, T. Wang, Z. Zhang, and S. Han. Once for all: Train one network and specialize it for efficient deployment. In International Conference on Learning Representations, 2020.  
+[2] Pham, H.; Guan, M. Y.; Zoph, B.; Le, Q. V.; and Dean, J. 2018. Efficient neural architecture search via parameter sharing. arXiv preprint arXiv:1802.03268.  
+[3] Zoph B, Vasudevan V, Shlens J, et al. Learning transferable architectures for scalable image recognition[J]. arXiv preprint arXiv:1707.07012, 2017, 2(6).  
+[4] Mingxing Tan, Bo Chen, Ruoming Pang, Vijay Vasudevan, and Quoc V Le. Mnasnet: Platform-aware neural architecture search for mobile. arXiv preprint arXiv:1807.11626, 2018.  
+[5] H Liu, K Simonyan, Y Yang. Darts: Differentiable architecture search. arXiv preprint arXiv:1806.09055, 2018.  
+[6] Xu, Y., Xie, L., Zhang, X., Chen, X., Qi, G.J., Tian, Q., Xiong, H.: PCDARTS: Partial Channel Connections for Memory-efficient Differentiable Architecture Search. In: International Conference on Learning Representations (2020)  
+[7] Han Cai, Ligeng Zhu, and Song Han. ProxylessNAS: Direct neural architecture search on target task and hardware. In ICLR, 2019. URL https://arxiv.org/pdf/1812.00332.pdf. 3, 5, 6, 7, 8  
diff --git a/docs/zh_cn/tutorials/static/paddlenlp_slim_ofa_tutorial.md b/docs/zh_cn/tutorials/nas/paddlenlp_slim_ofa_tutorial.md
similarity index 100%
rename from docs/zh_cn/tutorials/static/paddlenlp_slim_ofa_tutorial.md
rename to docs/zh_cn/tutorials/nas/paddlenlp_slim_ofa_tutorial.md
diff --git a/paddleslim/nas/ofa/layers.py b/paddleslim/nas/ofa/layers.py
index 34b930e1..a4b63913 100644
--- a/paddleslim/nas/ofa/layers.py
+++ b/paddleslim/nas/ofa/layers.py
@@ -104,7 +104,9 @@ class SuperConv2D(nn.Conv2D):
     applied to the final result.
     For each input :math:`X`, the equation is:
     .. math::
-        Out = \\sigma (W \\ast X + b)
+
+        Out = \\sigma (W \\ast X + b)
+
     Where:
     * :math:`X`: Input value, a ``Tensor`` with NCHW format.
     * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
@@ -121,8 +123,11 @@ class SuperConv2D(nn.Conv2D):
           Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
         Where
         .. math::
-            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1   
+
             W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
     Parameters:
         num_channels(int): The number of channels in the input image.
         num_filters(int): The number of filter. It is as same as the output
@@ -182,7 +187,7 @@ class SuperConv2D(nn.Conv2D):
           data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
           super_conv2d = SuperConv2D(3, 10, 3)
           config = {'channel': 5}
-          data = paddle.to_variable(data)
+          data = paddle.to_tensor(data)
           conv = super_conv2d(data, config)
 
     """
@@ -480,8 +485,8 @@ class SuperConv2DTranspose(nn.Conv2DTranspose):
           from paddleslim.nas.ofa.layers import SuperConv2DTranspose
           data = np.random.random((3, 32, 32, 5)).astype('float32')
           config = {'channel': 5}
-          super_convtranspose = SuperConv2DTranspose(num_channels=32, num_filters=10, filter_size=3)
-          ret = super_convtranspose(paddle.to_variable(data), config)
+          super_convtranspose = SuperConv2DTranspose(32, 10, 3)
+          ret = super_convtranspose(paddle.to_tensor(data), config)
     """
 
     def __init__(self,
@@ -871,10 +876,10 @@ class SuperLinear(nn.Linear):
           import paddle
           from paddleslim.nas.ofa.layers import SuperLinear
           
-          data = np.random.uniform(-1, 1, [32, 64] ).astype('float32')
+          data = np.random.uniform(-1, 1, [32, 64]).astype('float32')
           config = {'channel': 16}
-          linear = SuperLinear(32, 64)
-          data = paddle.to_variable(data)
+          linear = SuperLinear(64, 64)
+          data = paddle.to_tensor(data)
           res = linear(data, **config)
     """
 
@@ -1088,9 +1093,9 @@ class SuperLayerNorm(nn.LayerNorm):
           from paddleslim.nas.ofa.layers import SuperLayerNorm
           
           np.random.seed(123)
-          x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
+          x_data = np.random.random(size=(2, 3)).astype('float32')
           x = paddle.to_tensor(x_data)
-          layer_norm = SuperLayerNorm(x_data.shape[1:])
+          layer_norm = SuperLayerNorm(x_data.shape[1])
           layer_norm_out = layer_norm(x)
     """
 
@@ -1162,10 +1167,10 @@ class SuperEmbedding(nn.Embedding):
           import paddle
           from paddleslim.nas.ofa.layers import SuperEmbedding
           
-          data = np.random.uniform(-1, 1, [32, 64]).astype('float32')
+          data = np.random.randint(0, 64, [32, 64]).astype('int64')
           config = {'channel': 16}
-          emb = SuperEmbedding(32, 64)
-          data = paddle.to_variable(data)
+          emb = SuperEmbedding(64, 64)
+          data = paddle.to_tensor(data)
           res = emb(data, **config)
     """
 
diff --git a/paddleslim/nas/ofa/layers_old.py b/paddleslim/nas/ofa/layers_old.py
index fa136875..ef53a428 100644
--- a/paddleslim/nas/ofa/layers_old.py
+++ b/paddleslim/nas/ofa/layers_old.py
@@ -930,10 +930,10 @@ class SuperBatchNorm(fluid.dygraph.BatchNorm):
                  "use_mkldnn", False, "fuse_with_relu", self._fuse_with_relu,
                  "use_global_stats", self._use_global_stats,
                  'trainable_statistics', self._trainable_statistics)
-        batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
+        batch_norm_out = core.ops.batch_norm(
             input, weight, bias, mean, variance, mean_out, variance_out, *attrs)
         return dygraph_utils._append_activation_in_dygraph(
-            batch_norm_out, act=self._act)
+            batch_norm_out[0], act=self._act)
 
 
 class SuperInstanceNorm(fluid.dygraph.InstanceNorm):
-- 
GitLab