diff --git a/demo/ofa/bert/README.md b/demo/ofa/bert/README.md
index 7ac94833786e2b3307e588a867574c6521839df5..83054a4c01bf78c5ff089d469823bf1920bc0d31 100644
--- a/demo/ofa/bert/README.md
+++ b/demo/ofa/bert/README.md
@@ -222,3 +222,27 @@ python -u ./run_glue_ofa.py --model_type bert \
     --n_gpu 1 \
     --width_mult_list 1.0 0.8333333333333334 0.6666666666666666 0.5
 ```
+
+# Exporting sub-models
+Export the sub-model matching the given config and convert it to a static-graph model.
+
+## Launch command
+
+```shell
+python3.7 -u ./export_model.py --model_type bert \
+    --model_name_or_path ${PATH_OF_QQP_MODEL_AFTER_OFA} \
+    --max_seq_length 128 \
+    --sub_model_output_dir ./tmp/$TASK_NAME/dynamic_model \
+    --static_sub_model ./tmp/$TASK_NAME/static_model \
+    --n_gpu 1 \
+    --width_mult 0.6666666666666666
+```
+
+The parameters are as follows:
+- `model_type`: the model type; currently only BERT is supported.
+- `model_name_or_path`: a model of a specific configuration saved after OFA training, together with its pretrained model and the tokenizer used during pretraining. If the model files are stored locally, a local directory path can also be given here.
+- `max_seq_length`: the maximum sequence length; longer inputs are truncated. Default: 128.
+- `sub_model_output_dir`: the directory to which the dynamic-graph parameters of the exported sub-model are written.
+- `static_sub_model`: the directory to which the static-graph model and parameters of the exported sub-model are written; if set to None, no static-graph model is exported. Default: None.
+- `n_gpu`: the number of GPUs to use. Set it to the desired count for multi-card training, or to 0 to use the CPU. Default: 1.
+- `width_mult`: the width of the exported sub-model. Default: 1.0.
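+
+Below is a minimal sketch of loading the exported static-graph sub-model for inference. The path prefix and the input shapes are illustrative (they depend on the values passed above), and the feed order is determined by the export step, so check it against `feed_target_names`:
+
+```python
+import numpy as np
+import paddle
+
+paddle.enable_static()
+exe = paddle.static.Executor(paddle.CPUPlace())
+# Load the exported program together with its feed/fetch targets.
+program, feed_target_names, fetch_targets = paddle.static.load_inference_model(
+    './tmp/QQP/static_model', exe)
+
+# BERT consumes token ids and segment ids as int64 tensors.
+input_ids = np.zeros([1, 128], dtype='int64')
+segment_ids = np.zeros([1, 128], dtype='int64')
+logits = exe.run(program,
+                 feed={feed_target_names[0]: input_ids,
+                       feed_target_names[1]: segment_ids},
+                 fetch_list=fetch_targets)
+```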
diff --git a/demo/ofa/bert/export_model.py b/demo/ofa/bert/export_model.py
index 9763ef14fc75bb53f87737e13eb00956fa1d559c..0a23fa9ed897acaba977aa7483be16ee17bb52bb 100644
--- a/demo/ofa/bert/export_model.py
+++ b/demo/ofa/bert/export_model.py
@@ -60,6 +60,7 @@ def parse_args():
         "--sub_model_output_dir",
         default=None,
         type=str,
+        required=True,
         help="The output directory where the sub model predictions and checkpoints will be written.",
     )
     parser.add_argument(
@@ -131,6 +132,13 @@ def do_train(args):
         if isinstance(sublayer, paddle.nn.MultiHeadAttention):
             sublayer.num_heads = int(args.width_mult * sublayer.num_heads)

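+    # Save the dynamic-graph parameters of the selected sub-network; the
+    # directory name encodes the chosen width multiplier so several exports
+    # can live side by side.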
+    output_dir = os.path.join(args.sub_model_output_dir,
+                              "model_width_%.5f" % args.width_mult)
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    model_to_save = origin_model
+    model_to_save.save_pretrained(output_dir)
+
     if args.static_sub_model != None:
         export_static_model(origin_model, args.static_sub_model,
                             args.max_seq_length)
diff --git a/demo/ofa/bert/run_glue_ofa.py b/demo/ofa/bert/run_glue_ofa.py
index bc581f2ff89027b9c09b6484a3ff228ed8bdfc4b..9553bfebae1027b97fb4f97aadafb99d11c18448 100644
--- a/demo/ofa/bert/run_glue_ofa.py
+++ b/demo/ofa/bert/run_glue_ofa.py
@@ -179,11 +179,9 @@ def evaluate(model, criterion, metric, data_loader, epoch, step,
         correct = metric.compute(logits, labels)
         metric.update(correct)
     results = metric.accumulate()
-    print(
-        "epoch: %d, batch: %d, width_mult: %s, eval loss: %f, %s: %s\n" %
-        (epoch, step, 'teacher' if width_mult == 100 else str(width_mult),
-         loss.numpy(), metric.name(), results),
-        end='')
+    print("epoch: %d, batch: %d, width_mult: %s, eval loss: %f, %s: %s\n" %
+          (epoch, step, 'teacher' if width_mult == 100 else str(width_mult),
+           loss.numpy(), metric.name(), results))
     model.train()
diff --git a/docs/zh_cn/api_cn/convert_supernet_api.rst b/docs/zh_cn/api_cn/convert_supernet_api.rst
index ba730dee12ec0e12d9e1d0386dfa0bb0321b5a13..e596f5d8f268aa9cb35604862ac647dc4dba63b8 100644
--- a/docs/zh_cn/api_cn/convert_supernet_api.rst
+++ b/docs/zh_cn/api_cn/convert_supernet_api.rst
@@ -90,7 +90,7 @@ PaddleSlim provides three ways to construct a supernet; each is described below.

 Approach 3
 ------------------
-Build the network by calling dynamic OPs directly, in the same way an ordinary model is written. See `dynamic OPs <>`_ for the dynamic OPs PaddleSlim supports. This approach offers the most freedom in network construction, at the cost of more complex usage.
+Build the network by calling dynamic OPs directly, in the same way an ordinary model is written. See `dynamic OPs <./ofa_layer_api.rst>`_ for the dynamic OPs PaddleSlim supports. This approach offers the most freedom in network construction, at the cost of more complex usage.
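+
+A minimal sketch of this approach — the layer sizes and candidate configs below are illustrative, not part of any fixed API:
+
+.. code-block:: python
+
+    import paddle.nn as nn
+    from paddleslim.nas.ofa.layers import SuperConv2D, SuperBatchNorm2D
+
+    class SuperNet(nn.Layer):
+        def __init__(self):
+            super(SuperNet, self).__init__()
+            self.conv = SuperConv2D(3, 8, 3, candidate_config={'channel': (4, 8)})
+            self.bn = SuperBatchNorm2D(8)
+
+        def forward(self, x, config):
+            # dynamic OPs take extra run-time arguments in forward
+            x = self.conv(x, **config)
+            return self.bn(x)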

 .. note::
   - The dynamic OPs in paddleslim.nas.ofa.layers are implemented for Paddle 2.0beta and later. The dynamic OPs in paddleslim.nas.ofa.layers_old are implemented for versions before Paddle 2.0beta.
diff --git a/docs/zh_cn/api_cn/nas_api.rst b/docs/zh_cn/api_cn/nas_api.rst
index 9cb0938f1406b7ee70c22730d879eec3f49b992e..f970380c62ec487acc5df4dd759fd06b210c4235 100644
--- a/docs/zh_cn/api_cn/nas_api.rst
+++ b/docs/zh_cn/api_cn/nas_api.rst
@@ -45,8 +45,10 @@ SANAS (Simulated Annealing Neural Architecture Search)
   .. code-block:: python

+    import paddle
     from paddleslim.nas import SANAS
     config = [('MobileNetV2Space')]
+    paddle.enable_static()
     sanas = SANAS(configs=config)

   .. note::
@@ -82,11 +84,12 @@ SANAS (Simulated Annealing Neural Architecture Search)
   .. code-block:: python

-    import paddle.fluid as fluid
+    import paddle
     from paddleslim.nas import SANAS
     config = [('MobileNetV2Space')]
+    paddle.enable_static()
     sanas = SANAS(configs=config)
-    input = fluid.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
+    input = paddle.static.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
     archs = sanas.next_archs()
     for arch in archs:
         output = arch(input)
@@ -108,9 +111,10 @@ SANAS (Simulated Annealing Neural Architecture Search)
   .. code-block:: python

-    import paddle.fluid as fluid
+    import paddle
     from paddleslim.nas import SANAS
     config = [('MobileNetV2Space')]
+    paddle.enable_static()
     sanas = SANAS(configs=config)
     archs = sanas.next_archs()
@@ -134,11 +138,12 @@ SANAS (Simulated Annealing Neural Architecture Search)
   .. code-block:: python

-    import paddle.fluid as fluid
+    import paddle
     from paddleslim.nas import SANAS
     config = [('MobileNetV2Space')]
+    paddle.enable_static()
     sanas = SANAS(configs=config)
-    input = fluid.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
+    input = paddle.static.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
     tokens = ([0] * 25)
     archs = sanas.tokens2arch(tokens)[0]
     print(archs(input))
@@ -154,9 +159,10 @@ SANAS (Simulated Annealing Neural Architecture Search)
   .. code-block:: python

-    import paddle.fluid as fluid
+    import paddle
     from paddleslim.nas import SANAS
     config = [('MobileNetV2Space')]
+    paddle.enable_static()
     sanas = SANAS(configs=config)
     print(sanas.current_info())
@@ -222,8 +228,11 @@ RLNAS (Reinforcement Learning Neural Architecture Search)
   .. code-block:: python

+    import paddle
     from paddleslim.nas import RLNAS
     config = [('MobileNetV2Space')]
+
+    paddle.enable_static()
     rlnas = RLNAS(key='lstm', configs=config)
@@ -242,11 +251,12 @@ RLNAS (Reinforcement Learning Neural Architecture Search)
   .. code-block:: python

-    import paddle.fluid as fluid
+    import paddle
     from paddleslim.nas import RLNAS
     config = [('MobileNetV2Space')]
+    paddle.enable_static()
     rlnas = RLNAS(key='lstm', configs=config)
-    input = fluid.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
+    input = paddle.static.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
     archs = rlnas.next_archs(1)[0]
     for arch in archs:
         output = arch(input)
@@ -266,9 +276,10 @@ RLNAS (Reinforcement Learning Neural Architecture Search)
   .. code-block:: python

-    import paddle.fluid as fluid
+    import paddle
     from paddleslim.nas import RLNAS
     config = [('MobileNetV2Space')]
+    paddle.enable_static()
     rlnas = RLNAS(key='lstm', configs=config)
     rlnas.next_archs(1)
     rlnas.reward(1.0)
@@ -292,9 +303,10 @@ RLNAS (Reinforcement Learning Neural Architecture Search)
   .. code-block:: python

-    import paddle.fluid as fluid
+    import paddle
     from paddleslim.nas import RLNAS
     config = [('MobileNetV2Space')]
+    paddle.enable_static()
     rlnas = RLNAS(key='lstm', configs=config)
     archs = rlnas.final_archs(1)
     print(archs)
@@ -314,11 +326,12 @@ RLNAS (Reinforcement Learning Neural Architecture Search)
   .. code-block:: python

-    import paddle.fluid as fluid
+    import paddle
     from paddleslim.nas import RLNAS
     config = [('MobileNetV2Space')]
+    paddle.enable_static()
     rlnas = RLNAS(key='lstm', configs=config)
-    input = fluid.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
+    input = paddle.static.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
     tokens = ([0] * 25)
     archs = rlnas.tokens2arch(tokens)[0]
     print(archs(input))
diff --git a/docs/zh_cn/api_cn/ofa_api.rst b/docs/zh_cn/api_cn/ofa_api.rst
index 304cbb040cc7f84a36946d92e4ca5ca7ba70c198..24c189fc22ba3000104acd58e24f32f09a0af940 100644
--- a/docs/zh_cn/api_cn/ofa_api.rst
+++ b/docs/zh_cn/api_cn/ofa_api.rst
@@ -1,7 +1,7 @@
 Once-For-All
 ============

-Before Once-For-All training, an ordinary model must first be converted into a supernet built from dynamic OPs; see `supernet conversion <>`_ for how to convert it.
+Before Once-For-All training, an ordinary model must first be converted into a supernet built from dynamic OPs; see `supernet conversion <./convert_supernet_api.rst>`_ for how to convert it.

 Once-For-All training configuration
 ------------------
@@ -14,7 +14,7 @@ RunConfig
   - **train_batch_size(int, optional):** batch size used in training, needed to compute the number of iterations per epoch. Default: None.
   - **n_epochs(list, optional):** how many epochs each stage runs for, needed to decide which stage of supernet training the current epoch belongs to. Default: None.
   - **total_images(int, optional):** number of images in the training set, needed to compute the number of iterations per epoch. Default: None.
-  - **elastic_depth(list/tuple, optional):** if None, depth is not part of the search; otherwise sampled configs contain a depth. Changing the model depth must be matched by the forward part of the model definition; see the `example <>`_ . Default: None.
+  - **elastic_depth(list/tuple, optional):** if None, depth is not part of the search; otherwise sampled configs contain a depth. Changing the model depth must be matched by the forward part of the model definition; see the `example <../tutorials/nas/nas_ofa.md>`_ . Default: None.
   - **dynamic_batch_size(list, optional):** how many sub-networks each batch of data takes part in training in each stage; its shape must match that of n_epochs. Default: None.

 **Returns:**
@@ -29,7 +29,7 @@ RunConfig
     'train_batch_size': 1,
     'n_epochs': [[1], [2, 3], [4, 5]],
     'total_images': 12,
-    'elastic_depth': (5, 15, 24)
+    'elastic_depth': (5, 15, 24),
     'dynamic_batch_size': [1, 1, 1],
   }
   run_config = RunConfig(**default_run_config)
@@ -67,7 +67,7 @@ DistillConfig

 OFA
 ------------------
-Trains the supernet in the Once-For-All fashion. The `Once-For-All paper <>`_ proposes the ``Progressive Shrinking`` supernet training scheme, which trains stage by stage in the order ``elastic kernel_size``, ``elastic width``, ``elastic depth``, gradually enlarging the search space. For example, with the search space ``kernel_size=(3,5,7), expand_ratio=(0.5, 1.0, 2.0), depth=(0.5, 0.75, 1.0)``, the kernel size is trained dynamically first, in two phases: the kernel_size search space is ``[5, 7]`` in the first phase and ``[3, 5, 7]`` in the second. Dynamic training of expand_ratio is then added to supernet training in the same way, also in two phases: the expand_ratio search space is ``[1.0, 2.0]`` in the first phase and ``[0.5, 1.0, 2.0]`` in the second. Finally depth is trained dynamically, with the same phasing as kernel_size.
+Trains the supernet in the Once-For-All fashion. The `Once-For-All paper <https://arxiv.org/abs/1908.09791>`_ proposes the ``Progressive Shrinking`` supernet training scheme, which trains stage by stage in the order ``elastic kernel_size``, ``elastic width``, ``elastic depth``, gradually enlarging the search space. For example, with the search space ``kernel_size=(3,5,7), expand_ratio=(0.5, 1.0, 2.0), depth=(0.5, 0.75, 1.0)``, the kernel size is trained dynamically first, in two phases: the kernel_size search space is ``[5, 7]`` in the first phase and ``[3, 5, 7]`` in the second. Dynamic training of expand_ratio is then added to supernet training in the same way, also in two phases: the expand_ratio search space is ``[1.0, 2.0]`` in the first phase and ``[0.5, 1.0, 2.0]`` in the second. Finally depth is trained dynamically, with the same phasing as kernel_size.

 .. py:class:: paddleslim.nas.ofa.OFA(model, run_config=None, distill_config=None, elastic_order=None, train_full=False)
@@ -96,6 +96,7 @@ OFA instance
     sp_net_config = supernet(kernel_size=(3, 5, 7), expand_ratio=[1, 2, 4])
     sp_model = Convert(sp_net_config).convert(model)
     ofa_model = OFA(sp_model)
+  ..

 .. py:method:: set_epoch(epoch)
@@ -110,9 +111,7 @@ OFA instance

   **Example:**

-  .. code-block:: python
-
-    ofa_model.set_epoch(3)
+    ofa_model.set_epoch(3)

 .. py:method:: set_task(task, phase=None)
@@ -127,9 +126,7 @@ OFA instance

   **Example:**

-  .. code-block:: python
-
-    ofa_model.set_task('width')
+    ofa_model.set_task('width')

 .. py:method:: set_net_config(config)
@@ -143,8 +140,6 @@ OFA instance

   **Example:**

-  .. code-block:: python
-
     config = {'conv2d_0': {'expand_ratio': 2}, 'conv2d_1': {'expand_ratio': 2}}
     ofa_model.set_net_config(config)
@@ -157,8 +152,6 @@ OFA instance

   **Example:**

-  .. code-block:: python
-
     distill_loss = ofa_model.calc_distill_loss()

 .. py:method:: search()
@@ -180,7 +173,6 @@ OFA instance

   **Example:**

-  .. code-block:: python
     from paddle.vision.models import mobilenet_v1
     origin_model = mobilenet_v1()
diff --git a/docs/zh_cn/api_cn/ofa_layer_api.rst b/docs/zh_cn/api_cn/ofa_layer_api.rst
index dc6a7a0d363051481a8aaac6ec2d8c6b16cdb597..14b34c9a06c66a0b8d9521f6cb8731ccc25aa969 100644
--- a/docs/zh_cn/api_cn/ofa_layer_api.rst
+++ b/docs/zh_cn/api_cn/ofa_layer_api.rst
@@ -1,7 +1,7 @@
 SuperOP
 ========

-PaddleSlim provides dynamic versions of some APIs. "Dynamic" means that the parameter sizes of these OPs can change at run time according to the arguments passed in; in terms of usage, the difference is that some extra run-time arguments have to be passed to forward. `layers_old.py <>`_ corresponds to the APIs of Paddle 2.0alpha and earlier; `layers.py <>`_ corresponds to the APIs after Paddle 2.0alpha.
+PaddleSlim provides dynamic versions of some APIs. "Dynamic" means that the parameter sizes of these OPs can change at run time according to the arguments passed in; in terms of usage, the difference is that some extra run-time arguments have to be passed to forward. `layers_old.py <../../../paddleslim/nas/ofa/layers_old.py>`_ corresponds to the APIs of Paddle 2.0alpha and earlier; `layers.py <../../../paddleslim/nas/ofa/layers.py>`_ corresponds to the APIs after Paddle 2.0alpha.

 .. py:class:: paddleslim.nas.ofa.layers.Block(fn, fixed=False, key=None)
@@ -21,8 +21,9 @@ Block instance
   .. code-block:: python

-      from paddleslim.nas.ofa.layers import Block
-      block_layer = Block(SuperConv2D(3, 4, 3, candidate_config={'kerne_size': (3, 5, 7)})
+      from paddleslim.nas.ofa.layers import Block, SuperConv2D
+
+      block_layer = Block(SuperConv2D(3, 4, 3, candidate_config={'kernel_size': (3, 5, 7)}))

 .. py:class:: paddleslim.nas.ofa.layers.SuperConv2D(in_channels, out_channels, kernel_size, candidate_config={}, transform_kernel=False, stride=1, padding=0, dilation=1, groups=1, padding_mode='zeros', weight_attr=None, bias_attr=None, data_format='NCHW')
@@ -63,7 +64,7 @@ Block instance
     data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
     super_conv2d = SuperConv2D(3, 10, 3)
     config = {'channel': 5}
-    data = paddle.to_variable(data)
+    data = paddle.to_tensor(data)
     conv = super_conv2d(data, **config)

 .. py:class:: paddleslim.nas.ofa.layers.SuperConv2DTranspose(in_channels, out_channels, kernel_size, candidate_config={}, transform_kernel=False, stride=1, padding=0, output_padding=0, dilation=1, groups=1, padding_mode='zeros', weight_attr=None, bias_attr=None, data_format='NCHW')
@@ -99,14 +100,14 @@ Block instance
   .. code-block:: python

-    import paddle
-    from paddleslim.nas.ofa.layers import SuperConv2D
-    import numpy as np
-    data = np.random.uniform(-1, 1, [32, 10, 32, 32]).astype('float32')
-    config = {'channel': 5}
-    data = paddle.to_variable(data)
-    super_convtranspose = SuperConv2DTranspose(num_channels=32, num_filters=10, filter_size=3)
-    ret = super_convtranspose(paddle.to_variable(data), **config)
+    import paddle
+    from paddleslim.nas.ofa.layers import SuperConv2DTranspose
+    import numpy as np
+    data = np.random.uniform(-1, 1, [32, 10, 32, 32]).astype('float32')
+    config = {'channel': 5}
+    data = paddle.to_tensor(data)
+    super_convtranspose = SuperConv2DTranspose(32, 10, 3)
+    ret = super_convtranspose(paddle.to_tensor(data), **config)

 .. py:class:: paddleslim.nas.ofa.layers.SuperLinear(in_features, out_features, candidate_config={}, weight_attr=None, bias_attr=None, name=None):
@@ -138,10 +139,10 @@ Block instance
     import paddle
     from paddleslim.nas.ofa.layers import SuperLinear

-    data = np.random.uniform(-1, 1, [32, 64] ).astype('float32')
+    data = np.random.uniform(-1, 1, [32, 64]).astype('float32')
     config = {'channel': 16}
-    linear = SuperLinear(32, 64)
-    data = paddle.to_variable(data)
+    linear = SuperLinear(64, 64)
+    data = paddle.to_tensor(data)
     res = linear(data, **config)

@@ -175,10 +176,10 @@ Block instance
     import paddle
     from paddleslim.nas.ofa.layers import SuperEmbedding

-    data = np.random.uniform(-1, 1, [32, 64]).astype('float32')
+    data = np.random.uniform(-1, 1, [32, 64]).astype('int64')
     config = {'channel': 16}
-    emb = SuperEmbedding(32, 64)
-    data = paddle.to_variable(data)
+    emb = SuperEmbedding(64, 64)
+    data = paddle.to_tensor(data)
     res = emb(data, **config)

 .. py:class:: paddleslim.nas.ofa.layers.SuperBatchNorm2D(num_features, momentum=0.9, epsilon=1e-05, weight_attr=None, bias_attr=None, data_format='NCHW', name=None):
@@ -261,8 +262,8 @@ Block instance
     from paddleslim.nas.ofa.layers import SuperLayerNorm

     np.random.seed(123)
-    x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
+    x_data = np.random.random(size=(2, 3)).astype('float32')
     x = paddle.to_tensor(x_data)
-    layer_norm = SuperLayerNorm(x_data.shape[1:])
+    layer_norm = SuperLayerNorm(x_data.shape[1])
     layer_norm_out = layer_norm(x)
diff --git a/docs/zh_cn/tutorials/static/ernie_slim_ofa_tutorial.md b/docs/zh_cn/tutorials/nas/ernie_slim_ofa_tutorial.md
similarity index 100%
rename from docs/zh_cn/tutorials/static/ernie_slim_ofa_tutorial.md
rename to docs/zh_cn/tutorials/nas/ernie_slim_ofa_tutorial.md
diff --git a/docs/zh_cn/tutorials/nas/nas_ofa.md b/docs/zh_cn/tutorials/nas/nas_ofa.md
new file mode 100644
index 0000000000000000000000000000000000000000..7df1d6f2af1eaaabf882a0e40dd9e1f834917772
--- /dev/null
+++ b/docs/zh_cn/tutorials/nas/nas_ofa.md
@@ -0,0 +1,93 @@
+# Once-For-All
+
+The main goal of [Once-For-All (OFA)](https://arxiv.org/abs/1908.09791) is to train a single supernet from which, for different hardware, a small model meeting the latency and accuracy requirements can be selected. Another major advantage of OFA is that it can compress an existing pretrained model.
+To keep sub-networks from interfering with each other, the paper proposes a Progressive Shrinking (PS) scheme for supernet training that proceeds from large sub-networks to small ones. Training starts with the largest sub-network: for example, if the supernet has variable kernel sizes kernel_size = {3, 5, 7}, variable depths depth = {2, 3, 4} and variable widths expand_ratio = {2, 4, 6}, the network with kernel size 7, depth 4 and width 6 is trained first. Smaller settings are then gradually added to the search space so the network adapts to smaller sub-networks: after the largest network has been trained, variable kernel size (chosen from {3, 5, 7}) is supported first, while depth and width stay at their maximum values; variable depth and variable width are then supported in turn.
+
+## Usage
+
+An OFA run consists of the following steps:
+1. Define the supernet
+2. Configure training
+3. Configure distillation
+4. Pass the model and the configurations to OFA
+
+### 1. Define the supernet
+The supernet here is a network built from [dynamic OPs](../../api_cn/ofa_layer_api.rst). PaddleSlim provides three ways to obtain a supernet; see [supernet conversion](../../api_cn/convert_supernet_api.rst).
+
+```python
+import paddle
+from paddle.vision.models import mobilenet_v1
+from paddleslim.nas.ofa.convert_super import Convert, supernet
+
+model = mobilenet_v1()
+sp_net_config = supernet(kernel_size=(3, 5, 7), expand_ratio=[1, 2, 4])
+sp_model = Convert(sp_net_config).convert(model)
+```
+
+### 2. Configure training
+By default the training configuration follows the PS scheme from the paper; see [RunConfig](../../api_cn/ofa_api.rst) for the configurable parameters and their meaning.
+
+```python
+from paddleslim.nas.ofa import RunConfig
+default_run_config = {
+    'train_batch_size': 256,
+    'n_epochs': [[1], [2, 3], [4, 5]],
+    'init_learning_rate': [[0.001], [0.003, 0.001], [0.003, 0.001]],
+    'dynamic_batch_size': [1, 1, 1],
+    'total_images': 1281167,
+    'elastic_depth': (2, 5, 8)
+}
+run_config = RunConfig(**default_run_config)
+```
+
+### 3. Configure distillation
+Add a distillation configuration to the OFA training process; see [DistillConfig](../../api_cn/ofa_api.rst) for the configurable parameters and their meaning.
+
+```python
+from paddle.vision.models import mobilenet_v1
+from paddleslim.nas.ofa import DistillConfig
+teacher_model = mobilenet_v1()
+
+default_distill_config = {
+    'teacher_model': teacher_model
+}
+distill_config = DistillConfig(**default_distill_config)
+```
+
+### 4. Pass the model and the configurations to OFA
+Wrap the model, the training configuration and the distillation configuration with OFA. Once configured, training proceeds the same way as ordinary model training, as sketched below. If distillation is enabled, the OFA-wrapped model returns an extra set of teacher-network outputs compared with the original model.
+```python
+from paddleslim.nas.ofa import OFA
+
+ofa_model = OFA(model, run_config=run_config, distill_config=distill_config)
+```
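+
+A rough sketch of one training step with the wrapped model. The `set_epoch`/`set_task` values, the loss weighting, and the `train_loader`/`optimizer` objects are illustrative placeholders; real training scripts drive the stages from RunConfig:
+
+```python
+import paddle
+
+ofa_model.set_epoch(0)       # tell OFA which epoch (and hence stage) we are in
+ofa_model.set_task('width')  # which elastic dimension is being trained
+
+for data, label in train_loader:
+    # with distillation configured, the wrapped model also returns teacher logits
+    logits, teacher_logits = ofa_model(data)
+    loss = paddle.nn.functional.cross_entropy(logits, label)
+    # distance between student and teacher intermediate layers
+    distill_loss = ofa_model.calc_distill_loss()
+    total_loss = loss + distill_loss
+    total_loss.backward()
+    optimizer.step()
+    optimizer.clear_grad()
+```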
+
+## Experimental results
+
+So far we have run compression experiments only on BERT-base, TinyBERT and TinyERNIE; compression results for other CV tasks will be added later. The compression results for BERT and TinyBERT are shown in the tables below.
+
+<strong>Table 1: Accuracy on the GLUE datasets compared with BERT-base</strong>
+| Task  | Metric                     | BERT-base       | Result with PaddleSlim |
+|:-----:|:--------------------------:|:---------------:|:----------------------:|
+| SST-2 | Accuracy                   | 0.93005         | [0.931193]()           |
+| QNLI  | Accuracy                   | 0.91781         | [0.920740]()           |
+| CoLA  | Matthew's corr             | 0.59557         | [0.601244]()           |
+| MRPC  | F1/Accuracy                | 0.91667/0.88235 | [0.91740/0.88480]()    |
+| STS-B | Pearson/Spearman corr      | 0.88847/0.88350 | [0.89271/0.88958]()    |
+| QQP   | Accuracy/F1                | 0.90581/0.87347 | [0.90994/0.87947]()    |
+| MNLI  | Matched acc/MisMatched acc | 0.84422/0.84825 | [0.84687/0.85242]()    |
+| RTE   | Accuracy                   | 0.711191        | [0.718412]()           |
+
+<strong>Table 2: Accuracy on the GLUE datasets compared with TinyBERT</strong>
+| Task  | Metric                     | TinyBERT(L=4, D=312) | Result with OFA     |
+|:-----:|:--------------------------:|:--------------------:|:-------------------:|
+| SST-2 | Accuracy                   | [0.9234]()           | [0.9220]()          |
+| QNLI  | Accuracy                   | [0.8746]()           | [0.8720]()          |
+| CoLA  | Matthew's corr             | [0.4961]()           | [0.5048]()          |
+| MRPC  | F1/Accuracy                | [0.8998/0.8554]()    | [0.9003/0.8578]()   |
+| STS-B | Pearson/Spearman corr      | [0.8635/0.8631]()    | [0.8717/0.8706]()   |
+| QQP   | Accuracy/F1                | [0.9047/0.8751]()    | [0.9034/0.8733]()   |
+| MNLI  | Matched acc/MisMatched acc | [0.8256/0.8294]()    | [0.8211/0.8261]()   |
+| RTE   | Accuracy                   | [0.6534]()           | [0.6787]()          |
diff --git a/docs/zh_cn/tutorials/nas/nas_overview.md b/docs/zh_cn/tutorials/nas/nas_overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..b15fd63d1105016343e3ef5c364d105bdaf5bef7
--- /dev/null
+++ b/docs/zh_cn/tutorials/nas/nas_overview.md
@@ -0,0 +1,20 @@
+# Overview of PaddleSlim network architecture search
+
+PaddleSlim provides four network architecture search methods: search based on simulated annealing, search based on reinforcement learning, search based on gradients, and Once-For-All.
+
+| Method | Summary | Representative models |
+|:---------:|:------------:|:--------:|
+| [Once-For-All](nas_ofa.md) | OFA is a compression scheme based on one-shot NAS. It is quite efficient: only one supernet has to be trained, and sub-models meeting different latency requirements can then be selected from it. | Once-For-All |
+| [SANAS](../../quick_start/static/nas_tutorial.md) | SANAS searches network architectures by simulated annealing; with limited machine resources it usually finds better models than reinforcement learning does. | \ |
+| [RLNAS](./../api_cn/nas_api.rst) | RLNAS searches network architectures by reinforcement learning, which consumes a large amount of machine resources. | ENAS, NasNet, MNasNet |
+| [DARTS/PCDARTS](../../api_cn/darts.rst) | DARTS searches network architectures by gradients; it is quite efficient and greatly reduces the search time and the machine resources required. | DARTS, PCDARTS, ProxylessNAS |
+
+
+# References
+
+[1] H. Cai, C. Gan, T. Wang, Z. Zhang, and S. Han. Once for all: Train one network and specialize it for efficient deployment. In International Conference on Learning Representations, 2020.
+[2] H. Pham, M. Y. Guan, B. Zoph, Q. V. Le, and J. Dean. Efficient neural architecture search via parameter sharing. arXiv preprint arXiv:1802.03268, 2018.
+[3] B. Zoph, V. Vasudevan, J. Shlens, et al. Learning transferable architectures for scalable image recognition. arXiv preprint arXiv:1707.07012, 2017.
+[4] Mingxing Tan, Bo Chen, Ruoming Pang, Vijay Vasudevan, and Quoc V. Le. MnasNet: Platform-aware neural architecture search for mobile. arXiv preprint arXiv:1807.11626, 2018.
+[5] H. Liu, K. Simonyan, and Y. Yang. DARTS: Differentiable architecture search. arXiv preprint arXiv:1806.09055, 2018.
+[6] Y. Xu, L. Xie, X. Zhang, X. Chen, G.-J. Qi, Q. Tian, and H. Xiong. PC-DARTS: Partial channel connections for memory-efficient differentiable architecture search. In International Conference on Learning Representations, 2020.
+[7] Han Cai, Ligeng Zhu, and Song Han. ProxylessNAS: Direct neural architecture search on target task and hardware. In ICLR, 2019. URL https://arxiv.org/pdf/1812.00332.pdf.
diff --git a/docs/zh_cn/tutorials/static/paddlenlp_slim_ofa_tutorial.md b/docs/zh_cn/tutorials/nas/paddlenlp_slim_ofa_tutorial.md
similarity index 100%
rename from docs/zh_cn/tutorials/static/paddlenlp_slim_ofa_tutorial.md
rename to docs/zh_cn/tutorials/nas/paddlenlp_slim_ofa_tutorial.md
diff --git a/paddleslim/nas/ofa/layers.py b/paddleslim/nas/ofa/layers.py
index 34b930e16af5549744ed3875d83622e4f23e32c0..a4b63913bcc6320887d691b0fffcb47017f2ef68 100644
--- a/paddleslim/nas/ofa/layers.py
+++ b/paddleslim/nas/ofa/layers.py
@@ -104,7 +104,9 @@ class SuperConv2D(nn.Conv2D):
     applied to the final result. For each input :math:`X`, the equation is:

     .. math::
-        Out = \\sigma (W \\ast X + b)
+
+        Out = \\sigma (W \\ast X + b)
+
     Where:
     * :math:`X`: Input value, a ``Tensor`` with NCHW format.
     * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
@@ -121,8 +123,11 @@ class SuperConv2D(nn.Conv2D):
     Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
     Where
     .. math::
-        H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+
+        H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1
+
         W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
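+        For example, with :math:`H_{in} = 32`, padding 1, dilation 1,
+        filter size 3 and stride 1: :math:`H_{out} = (32 + 2 - 3) / 1 + 1 = 32`.
+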
     Parameters:
         num_channels(int): The number of channels in the input image.
         num_filters(int): The number of filter. It is as same as the output
@@ -182,7 +187,7 @@ class SuperConv2D(nn.Conv2D):
             data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
             super_conv2d = SuperConv2D(3, 10, 3)
             config = {'channel': 5}
-            data = paddle.to_variable(data)
+            data = paddle.to_tensor(data)
             conv = super_conv2d(data, config)
     """
@@ -480,8 +485,8 @@ class SuperConv2DTranspose(nn.Conv2DTranspose):
             from paddleslim.nas.ofa.layers import SuperConv2DTranspose
             data = np.random.random((3, 32, 32, 5)).astype('float32')
             config = {'channel': 5}
-            super_convtranspose = SuperConv2DTranspose(num_channels=32, num_filters=10, filter_size=3)
-            ret = super_convtranspose(paddle.to_variable(data), config)
+            super_convtranspose = SuperConv2DTranspose(32, 10, 3)
+            ret = super_convtranspose(paddle.to_tensor(data), config)
     """

     def __init__(self,
@@ -871,10 +876,10 @@ class SuperLinear(nn.Linear):
             import paddle
             from paddleslim.nas.ofa.layers import SuperLinear

-            data = np.random.uniform(-1, 1, [32, 64] ).astype('float32')
+            data = np.random.uniform(-1, 1, [32, 64]).astype('float32')
             config = {'channel': 16}
-            linear = SuperLinear(32, 64)
-            data = paddle.to_variable(data)
+            linear = SuperLinear(64, 64)
+            data = paddle.to_tensor(data)
             res = linear(data, **config)
     """
@@ -1088,9 +1093,9 @@ class SuperLayerNorm(nn.LayerNorm):
             from paddleslim.nas.ofa.layers import SuperLayerNorm

             np.random.seed(123)
-            x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
+            x_data = np.random.random(size=(2, 3)).astype('float32')
             x = paddle.to_tensor(x_data)
-            layer_norm = SuperLayerNorm(x_data.shape[1:])
+            layer_norm = SuperLayerNorm(x_data.shape[1])
             layer_norm_out = layer_norm(x)
     """
@@ -1162,10 +1167,10 @@ class SuperEmbedding(nn.Embedding):
             import paddle
             from paddleslim.nas.ofa.layers import SuperEmbedding

-            data = np.random.uniform(-1, 1, [32, 64]).astype('float32')
+            data = np.random.uniform(-1, 1, [32, 64]).astype('int64')
             config = {'channel': 16}
-            emb = SuperEmbedding(32, 64)
-            data = paddle.to_variable(data)
+            emb = SuperEmbedding(64, 64)
+            data = paddle.to_tensor(data)
             res = emb(data, **config)
     """
diff --git a/paddleslim/nas/ofa/layers_old.py b/paddleslim/nas/ofa/layers_old.py
index fa136875719d83c956717763077bee60ce622184..ef53a4285b8883d484a7fd028a380d87d3a110a8 100644
--- a/paddleslim/nas/ofa/layers_old.py
+++ b/paddleslim/nas/ofa/layers_old.py
@@ -930,10 +930,10 @@ class SuperBatchNorm(fluid.dygraph.BatchNorm):
             "use_mkldnn", False, "fuse_with_relu", self._fuse_with_relu,
             "use_global_stats", self._use_global_stats,
             'trainable_statistics', self._trainable_statistics)
-        batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
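+        # core.ops.batch_norm returns a tuple whose length differs across
+        # Paddle versions, so bind the whole tuple and index the output below
+        # instead of unpacking a fixed number of values.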
+        batch_norm_out = core.ops.batch_norm(
             input, weight, bias, mean, variance, mean_out, variance_out,
             *attrs)
         return dygraph_utils._append_activation_in_dygraph(
-            batch_norm_out, act=self._act)
+            batch_norm_out[0], act=self._act)


 class SuperInstanceNorm(fluid.dygraph.InstanceNorm):