Add ppyoloe distillation modelzoo (#7694)

* fix tal distill and single scale training * add modelzoo fix configs * fix docs typos, test=document_fix

Add ppyoloe distillation modelzoo (#7694)
* fix tal distill and single scale training * add modelzoo fix configs * fix docs typos, test=document_fix
a496c2dd · Feng Ni · GitHub · 93ea350c · a496c2dd · a496c2dd
9 changed files
--- a/configs/ppyoloe/distill/README.md
+++ b/configs/ppyoloe/distill/README.md
@@ -2,9 +2,16 @@

 PaddleDetection提供了对PPYOLOE+ 进行模型蒸馏的方案，结合了logits蒸馏和feature蒸馏。

-
 ## 模型库

+| 模型               |    方案     | 输入尺寸 | epochs |    Box mAP    |       配置文件    |     下载链接    |
+| ----------------- | ----------- | ------ | :----: | :-----------: | :--------------: | :------------: |
+|   PP-YOLOE+_x     |  teacher   |  640     | 80e   |      54.7     | [config](../ppyoloe_plus_crn_x_80e_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_plus_crn_x_80e_coco.pdparams) |
+|   PP-YOLOE+_l     |  student   |  640     | 80e   |      52.9     | [config](../ppyoloe_plus_crn_l_80e_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_plus_crn_l_80e_coco.pdparams) |
+|   PP-YOLOE+_l     |  distill   |  640     | 80e   |   53.9(+1.0)  | [config](./ppyoloe_plus_crn_l_80e_coco_distill.yml),[slim_config](../../slim/distill/ppyoloe_plus_distill_x_distill_l.yml)  | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_plus_crn_l_80e_coco_distill.pdparams) |
+|   PP-YOLOE+_l     |  teacher   |  640     | 80e   |      52.9     | [config](../ppyoloe_plus_crn_l_80e_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_plus_crn_l_80e_coco.pdparams) |
+|   PP-YOLOE+_m     |  student   |  640     | 80e   |      49.8     | [config](../ppyoloe_plus_crn_m_80e_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_plus_crn_m_80e_coco.pdparams) |
+|   PP-YOLOE+_m     |  distill   |  640     | 80e   |    50.7(+0.9)    | [config](./ppyoloe_plus_crn_m_80e_coco_distill.yml),[slim_config](../../slim/distill/ppyoloe_plus_distill_l_distill_m.yml)  | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_plus_crn_m_80e_coco_distill.pdparams) |


 ## 快速开始
@@ -12,9 +19,9 @@ PaddleDetection提供了对PPYOLOE+ 进行模型蒸馏的方案，结合了logit
 ### 训练
 ```shell
 # 单卡
-python tools/train.py -c configs/ppyoloe/distill/ppyoloe_plus_crn_l_80e_coco_distill.yml --slim_config configs/slim/distill/ppyoloe_plus_distill_x_to_l.yml
+python tools/train.py -c configs/ppyoloe/distill/ppyoloe_plus_crn_l_80e_coco_distill.yml --slim_config configs/slim/distill/ppyoloe_plus_distill_x_distill_l.yml
 # 多卡
-python3.7 -m paddle.distributed.launch --log_dir=ppyoloe_plus_distill_x_to_l/ --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/ppyoloe/distill/ppyoloe_plus_crn_l_80e_coco_distill.yml --slim_config configs/slim/distill/ppyoloe_plus_distill_x_to_l.yml
+python -m paddle.distributed.launch --log_dir=ppyoloe_plus_distill_x_distill_l/ --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/ppyoloe/distill/ppyoloe_plus_crn_l_80e_coco_distill.yml --slim_config configs/slim/distill/ppyoloe_plus_distill_x_distill_l.yml
 ```

 - `-c`: 指定模型配置文件，也是student配置文件。

--- a/configs/ppyoloe/distill/ppyoloe_plus_crn_l_80e_coco_distill.yml
+++ b/configs/ppyoloe/distill/ppyoloe_plus_crn_l_80e_coco_distill.yml
@@ -10,9 +10,29 @@ PPYOLOE:
  post_process: ~


+worker_num: 4
+TrainReader:
+  sample_transforms:
+    - Decode: {}
+    - RandomDistort: {}
+    - RandomExpand: {fill_value: [123.675, 116.28, 103.53]}
+    - RandomCrop: {}
+    - RandomFlip: {}
+  batch_transforms:
+    - BatchRandomResize: {target_size: [640], random_size: True, random_interp: True, keep_ratio: False}
+    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
+    - Permute: {}
+    - PadGT: {}
+  batch_size: 8
+  shuffle: True
+  drop_last: True
+  use_shared_memory: True
+  collate_batch: True
+
+
 log_iter: 100
 snapshot_epoch: 5
-weights: output/ppyoloe_plus_crn_l_80e_coco/model_final
+weights: output/ppyoloe_plus_crn_l_80e_coco_distill/model_final

 pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/ppyoloe_crn_l_obj365_pretrained.pdparams
 depth_mult: 1.0

--- a/configs/ppyoloe/distill/ppyoloe_plus_crn_m_80e_coco_distill.yml
+++ b/configs/ppyoloe/distill/ppyoloe_plus_crn_m_80e_coco_distill.yml
@@ -10,9 +10,29 @@ PPYOLOE:
  post_process: ~


+worker_num: 4
+TrainReader:
+  sample_transforms:
+    - Decode: {}
+    - RandomDistort: {}
+    - RandomExpand: {fill_value: [123.675, 116.28, 103.53]}
+    - RandomCrop: {}
+    - RandomFlip: {}
+  batch_transforms:
+    - BatchRandomResize: {target_size: [640], random_size: True, random_interp: True, keep_ratio: False}
+    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
+    - Permute: {}
+    - PadGT: {}
+  batch_size: 8
+  shuffle: True
+  drop_last: True
+  use_shared_memory: True
+  collate_batch: True
+
+
 log_iter: 100
 snapshot_epoch: 5
-weights: output/ppyoloe_plus_crn_m_80e_coco/model_final
+weights: output/ppyoloe_plus_crn_m_80e_coco_distill/model_final

 pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/ppyoloe_crn_m_obj365_pretrained.pdparams
 depth_mult: 0.67

--- a/configs/ppyoloe/distill/ppyoloe_plus_crn_s_80e_coco_distill.yml
+++ b/configs/ppyoloe/distill/ppyoloe_plus_crn_s_80e_coco_distill.yml
@@ -10,9 +10,29 @@ PPYOLOE:
  post_process: ~


+worker_num: 4
+TrainReader:
+  sample_transforms:
+    - Decode: {}
+    - RandomDistort: {}
+    - RandomExpand: {fill_value: [123.675, 116.28, 103.53]}
+    - RandomCrop: {}
+    - RandomFlip: {}
+  batch_transforms:
+    - BatchRandomResize: {target_size: [640], random_size: True, random_interp: True, keep_ratio: False}
+    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
+    - Permute: {}
+    - PadGT: {}
+  batch_size: 8
+  shuffle: True
+  drop_last: True
+  use_shared_memory: True
+  collate_batch: True
+
+
 log_iter: 100
 snapshot_epoch: 5
-weights: output/ppyoloe_plus_crn_s_80e_coco/model_final
+weights: output/ppyoloe_plus_crn_s_80e_coco_distill/model_final

 pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/ppyoloe_crn_s_obj365_pretrained.pdparams
 depth_mult: 0.33

--- a/configs/slim/distill/README.md
+++ b/configs/slim/distill/README.md
 # Distillation(蒸馏)

 ## YOLOv3模型蒸馏
+
 以YOLOv3-MobileNetV1为例，使用YOLOv3-ResNet34作为蒸馏训练的teacher网络, 对YOLOv3-MobileNetV1结构的student网络进行蒸馏。
 COCO数据集作为目标检测任务的训练目标难度更大，意味着teacher网络会预测出更多的背景bbox，如果直接用teacher的预测输出作为student学习的`soft label`会有严重的类别不均衡问题。解决这个问题需要引入新的方法，详细背景请参考论文:[Object detection at 200 Frames Per Second](https://arxiv.org/abs/1805.06361)。
-为了确定蒸馏的对象，我们首先需要找到student和teacher网络得到的`x,y,w,h,cls,objness`等Tensor，用teacher得到的结果指导student训练。具体实现可参考[代码](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/ppdet/slim/distill.py)
+为了确定蒸馏的对象，我们首先需要找到student和teacher网络得到的`x,y,w,h,cls,objectness`等Tensor，用teacher得到的结果指导student训练。具体实现可参考[代码](../../../ppdet/slim/distill_loss.py)
+
+| 模型               |    方案     | 输入尺寸 | epochs |   Box mAP    |       配置文件    |     下载链接    |
+| :---------------: | :---------: | :----: | :----: |:-----------: | :--------------: | :------------: |
+| YOLOv3-ResNet34    | teacher     | 608   |  270e  |     36.2     | [config](../../yolov3/yolov3_r34_270e_coco.yml) | [download](https://paddledet.bj.bcebos.com/models/yolov3_r34_270e_coco.pdparams) |
+| YOLOv3-MobileNetV1 | student     | 608   |  270e  |     29.4     | [config](../../yolov3/yolov3_mobilenet_v1_270e_coco.yml) | [download](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_coco.pdparams) |
+| YOLOv3-MobileNetV1 | distill     | 608   |  270e  |  31.0(+1.6)  | [config](../../yolov3/yolov3_mobilenet_v1_270e_coco.yml),[slim_config](./yolov3_mobilenet_v1_coco_distill.yml) | [download](https://paddledet.bj.bcebos.com/models/slim/yolov3_mobilenet_v1_coco_distill.pdparams) |


 ## FGD模型蒸馏

 FGD全称为[Focal and Global Knowledge Distillation for Detectors](https://arxiv.org/abs/2111.11837v1)，是目标检测任务的一种蒸馏方法，FGD蒸馏分为两个部分`Focal`和`Global`。`Focal`蒸馏分离图像的前景和背景，让学生模型分别关注教师模型的前景和背景部分特征的关键像素；`Global`蒸馏部分重建不同像素之间的关系并将其从教师转移到学生，以补偿`Focal`蒸馏中丢失的全局信息。试验结果表明，FGD蒸馏算法在基于anchor和anchor free的方法上能有效提升模型精度。
-在PaddleDetection中，我们实现了FGD算法，并基于retinaNet算法进行验证，实验结果如下：
-| algorithm | model | AP | download|
-|:-:| :-: | :-: | :-:|
-|retinaNet_r101_fpn_2x | teacher | 40.6 | [download](https://paddledet.bj.bcebos.com/models/retinanet_r101_fpn_2x_coco.pdparams) |
-|retinaNet_r50_fpn_1x| student | 37.5 |[download](https://paddledet.bj.bcebos.com/models/retinanet_r50_fpn_1x_coco.pdparams) |
-|retinaNet_r50_fpn_2x + FGD| student | 40.8 |[download](https://paddledet.bj.bcebos.com/models/retinanet_r101_distill_r50_2x_coco.pdparams) |
+在PaddleDetection中，我们实现了FGD算法，并基于RetinaNet算法进行验证，实验结果如下：
+
+| 模型               |    方案     | 输入尺寸 | epochs |    Box mAP    |       配置文件    |     下载链接    |
+| ----------------- | ----------- | ------ | :----: | :-----------: | :--------------: | :------------: |
+| RetinaNet-ResNet101| teacher    | 1333x800 |  2x  |     40.6     | [config](../../retinanet/retinanet_r101_fpn_2x_coco.yml) | [download](https://paddledet.bj.bcebos.com/models/retinanet_r101_fpn_2x_coco.pdparams) |
+| RetinaNet-ResNet50 | student    | 1333x800 |  2x  |      39.1     | [config](../../retinanet/retinanet_r50_fpn_2x_coco.yml) | [download](https://paddledet.bj.bcebos.com/models/retinanet_r50_fpn_2x_coco.pdparams) |
+| RetinaNet-ResNet50 | FGD        | 1333x800 |  2x  |   40.8(+1.7)  | [config](../../retinanet/retinanet_r50_fpn_2x_coco.yml),[slim_config](./retinanet_resnet101_coco_distill.yml) | [download](https://paddledet.bj.bcebos.com/models/retinanet_r101_distill_r50_2x_coco.pdparams) |


 ## LD模型蒸馏

 LD全称为[Localization Distillation for Dense Object Detection](https://arxiv.org/abs/2102.12252)，将回归框表示为概率分布，把分类任务的KD用在定位任务上，并且使用因地制宜、分而治之的策略，在不同的区域分别学习分类知识与定位知识。在PaddleDetection中，我们实现了LD算法，并基于GFL模型进行验证，实验结果如下：
-| algorithm | model | AP | download|
-|:-:| :-: | :-: | :-:|
-| GFL_ResNet101-vd   | teacher          | 46.8  | [model](https://paddledet.bj.bcebos.com/models/gfl_r101vd_fpn_mstrain_2x_coco.pdparams), [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/gfl/gfl_r101vd_fpn_mstrain_2x_coco.yml) |
-| GFL_ResNet18-vd   | student          | 36.6  | [model](https://paddledet.bj.bcebos.com/models/gfl_r18vd_1x_coco.pdparams), [config](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/gfl/gfl_r18vd_1x_coco.yml) |
-| GFL_ResNet18-vd + LD   | student          | 38.2  | [model](https://bj.bcebos.com/v1/paddledet/models/gfl_slim_ld_r18vd_1x_coco.pdparams), [config1](../../gfl/gfl_slim_ld_r18vd_1x_coco.yml), [config2](./gfl_ld_distill.yml) |
+
+| 模型               |    方案     | 输入尺寸 | epochs |    Box mAP    |       配置文件    |     下载链接    |
+| ----------------- | ----------- | ------ | :----: | :-----------: | :--------------: | :------------: |
+| GFL_ResNet101-vd| teacher    | 1333x800 |  2x  |     46.8     | [config](../../gfl/gfl_r101vd_fpn_mstrain_2x_coco.yml) | [download](https://paddledet.bj.bcebos.com/models/gfl_r101vd_fpn_mstrain_2x_coco.pdparams) |
+| GFL_ResNet18-vd | student    | 1333x800 |  1x  |     36.6     | [config](../../gfl/gfl_r18vd_1x_coco.yml) | [download](https://paddledet.bj.bcebos.com/models/gfl_r18vd_1x_coco.pdparams) |
+| GFL_ResNet18-vd | LD         | 1333x800 |  1x  |   38.2(+1.6) | [config](../../gfl/gfl_slim_ld_r18vd_1x_coco.yml),[slim_config](./gfl_ld_distill.yml) | [download](https://bj.bcebos.com/v1/paddledet/models/gfl_slim_ld_r18vd_1x_coco.pdparams) |
+

 ## CWD模型蒸馏

 CWD全称为[Channel-wise Knowledge Distillation for Dense Prediction](https://arxiv.org/pdf/2011.13256.pdf)，通过最小化教师网络与学生网络的通道概率图之间的 Kullback-Leibler (KL) 散度，使得在蒸馏过程更加关注每个通道的最显著的区域，进而提升文本检测与图像分割任务的精度。在PaddleDetection中，我们实现了CWD算法，并基于GFL和RetinaNet模型进行验证，实验结果如下：
-| algorithm | model | AP | download|
-|:-:| :-: | :-: | :-:|
-|retinaNet_r101_fpn_2x | teacher | 40.6 | [download](https://paddledet.bj.bcebos.com/models/retinanet_r101_fpn_2x_coco.pdparams) |
-|retinaNet_r50_fpn_1x| student | 37.5 |[download](https://paddledet.bj.bcebos.com/models/retinanet_r50_fpn_1x_coco.pdparams) |
-|retinaNet_r50_fpn_2x + CWD| student | 40.5 |[download](https://paddledet.bj.bcebos.com/models/retinanet_r50_fpn_2x_coco_cwd.pdparams) |
-|gfl_r101_fpn_2x | teacher | 46.8 | [download](https://paddledet.bj.bcebos.com/models/gfl_r101vd_fpn_mstrain_2x_coco.pdparams) |
-|gfl_r50_fpn_1x| student | 41.0 |[download](https://paddledet.bj.bcebos.com/models/gfl_r50_fpn_1x_coco.pdparams) |
-|gfl_r50_fpn_2x + CWD| student | 44.0 |[download](https://paddledet.bj.bcebos.com/models/gfl_r50_fpn_2x_coco_cwd.pdparams) |

-## PPYOLOE+模型蒸馏
+| 模型               |    方案     | 输入尺寸 | epochs |    Box mAP    |       配置文件    |     下载链接    |
+| ----------------- | ----------- | ------ | :----: | :-----------: | :--------------: | :------------: |
+| RetinaNet-ResNet101| teacher    | 1333x800 |  2x  |     40.6     | [config](../../retinanet/retinanet_r101_fpn_2x_coco.yml) | [download](https://paddledet.bj.bcebos.com/models/retinanet_r101_fpn_2x_coco.pdparams) |
+| RetinaNet-ResNet50 | student    | 1333x800 |  2x  |     39.1     | [config](../../retinanet/retinanet_r50_fpn_2x_coco.yml) | [download](https://paddledet.bj.bcebos.com/models/retinanet_r50_fpn_2x_coco.pdparams)  |
+| RetinaNet-ResNet50 | CWD        | 1333x800 |  2x  |   40.5(+1.4) | [config](../../retinanet/retinanet_r50_fpn_2x_coco_cwd.yml),[slim_config](./retinanet_resnet101_coco_distill_cwd.yml) | [download](https://paddledet.bj.bcebos.com/models/retinanet_r50_fpn_2x_coco_cwd.pdparams) |
+| GFL_ResNet101-vd| teacher    | 1333x800 |  2x  |     46.8     | [config](../../gfl/gfl_r101vd_fpn_mstrain_2x_coco.yml) | [download](https://paddledet.bj.bcebos.com/models/gfl_r101vd_fpn_mstrain_2x_coco.pdparams) |
+| GFL_ResNet50    | student    | 1333x800 |  1x  |     41.0     | [config](../../gfl/gfl_r50_fpn_1x_coco.yml) | [download](https://paddledet.bj.bcebos.com/models/gfl_r50_fpn_1x_coco.pdparams) |
+| GFL_ResNet50    | CWD        | 1333x800 |  2x  |   44.0(+3.0) | [config](../../gfl/gfl_r50_fpn_2x_coco_cwd.yml),[slim_config](./gfl_r101vd_fpn_coco_distill_cwd.yml) | [download](https://bj.bcebos.com/v1/paddledet/models/gfl_r50_fpn_2x_coco_cwd.pdparams) |
+
+
+## PPYOLOE+ 模型蒸馏
+
+PaddleDetection提供了对PPYOLOE+ 进行模型蒸馏的方案，结合了logits蒸馏和feature蒸馏。

+| 模型               |    方案     | 输入尺寸 | epochs |    Box mAP    |       配置文件    |     下载链接    |
+| ----------------- | ----------- | ------ | :----: | :-----------: | :--------------: | :------------: |
+|   PP-YOLOE+_x     |  teacher   |  640     | 80e   |      54.7     | [config](../../ppyoloe/ppyoloe_plus_crn_x_80e_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_plus_crn_x_80e_coco.pdparams) |
+|   PP-YOLOE+_l     |  student   |  640     | 80e   |      52.9     | [config](../../ppyoloe/ppyoloe_plus_crn_l_80e_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_plus_crn_l_80e_coco.pdparams) |
+|   PP-YOLOE+_l     |  distill   |  640     | 80e   |   53.9(+1.0)  | [config](../../ppyoloe/distill/ppyoloe_plus_crn_l_80e_coco_distill.yml),[slim_config](./ppyoloe_plus_distill_x_distill_l.yml)  | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_plus_crn_l_80e_coco_distill.pdparams) |
+|   PP-YOLOE+_l     |  teacher   |  640     | 80e   |      52.9     | [config](../../ppyoloe/ppyoloe_plus_crn_l_80e_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_plus_crn_l_80e_coco.pdparams) |
+|   PP-YOLOE+_m     |  student   |  640     | 80e   |      49.8     | [config](../../ppyoloe/ppyoloe_plus_crn_m_80e_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_plus_crn_m_80e_coco.pdparams) |
+|   PP-YOLOE+_m     |  distill   |  640     | 80e   |    50.7(+0.9)    | [config](../../ppyoloe/distill/ppyoloe_plus_crn_m_80e_coco_distill.yml),[slim_config](./ppyoloe_plus_distill_l_distill_m.yml)  | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_plus_crn_m_80e_coco_distill.pdparams) |


 ## 快速开始
@@ -47,9 +69,9 @@ CWD全称为[Channel-wise Knowledge Distillation for Dense Prediction*](https://
 ### 训练
 ```shell
 # 单卡
-python tools/train.py -c configs/ppyoloe/distill/ppyoloe_plus_crn_l_80e_coco_distill.yml --slim_config configs/slim/distill/ppyoloe_plus_distill_x_to_l.yml
+python tools/train.py -c configs/ppyoloe/distill/ppyoloe_plus_crn_l_80e_coco_distill.yml --slim_config configs/slim/distill/ppyoloe_plus_distill_x_distill_l.yml
 # 多卡
-python3.7 -m paddle.distributed.launch --log_dir=ppyoloe_plus_distill_x_to_l/ --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/ppyoloe/distill/ppyoloe_plus_crn_l_80e_coco_distill.yml --slim_config configs/slim/distill/ppyoloe_plus_distill_x_to_l.yml
+python -m paddle.distributed.launch --log_dir=ppyoloe_plus_distill_x_distill_l/ --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/ppyoloe/distill/ppyoloe_plus_crn_l_80e_coco_distill.yml --slim_config configs/slim/distill/ppyoloe_plus_distill_x_distill_l.yml
 ```

 - `-c`: 指定模型配置文件，也是student配置文件。

--- a/configs/slim/distill/ppyoloe_plus_distill_l_to_m.yml
+++ b/configs/slim/distill/ppyoloe_plus_distill_l_to_m.yml
@@ -4,18 +4,35 @@ _BASE_: [
 ]
 depth_mult: 1.0
 width_mult: 1.0
-
+for_distill: True
 architecture: PPYOLOE
 PPYOLOE:
  backbone: CSPResNet
  neck: CustomCSPPAN
  yolo_head: PPYOLOEHead
  post_process: ~
-  for_distill: True

 pretrain_weights: https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_l_80e_coco.pdparams
 find_unused_parameters: True
-for_distill: True
+
+worker_num: 4
+TrainReader:
+  sample_transforms:
+    - Decode: {}
+    - RandomDistort: {}
+    - RandomExpand: {fill_value: [123.675, 116.28, 103.53]}
+    - RandomCrop: {}
+    - RandomFlip: {}
+  batch_transforms:
+    - BatchRandomResize: {target_size: [640], random_size: True, random_interp: True, keep_ratio: False}
+    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
+    - Permute: {}
+    - PadGT: {}
+  batch_size: 8
+  shuffle: True
+  drop_last: True
+  use_shared_memory: True
+  collate_batch: True


 slim: Distill

--- a/configs/slim/distill/ppyoloe_plus_distill_m_to_s.yml
+++ b/configs/slim/distill/ppyoloe_plus_distill_m_to_s.yml
 # teacher and slim config
 _BASE_: [
-  '../../ppyoloe/ppyoloe_plus_crn_l_80e_coco.yml',
+  '../../ppyoloe/ppyoloe_plus_crn_m_80e_coco.yml',
 ]
 depth_mult: 0.67
 width_mult: 0.75
-
+for_distill: True
 architecture: PPYOLOE
 PPYOLOE:
  backbone: CSPResNet
  neck: CustomCSPPAN
  yolo_head: PPYOLOEHead
  post_process: ~
-  for_distill: True

 pretrain_weights: https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_m_80e_coco.pdparams
 find_unused_parameters: True
-for_distill: True
+
+worker_num: 4
+TrainReader:
+  sample_transforms:
+    - Decode: {}
+    - RandomDistort: {}
+    - RandomExpand: {fill_value: [123.675, 116.28, 103.53]}
+    - RandomCrop: {}
+    - RandomFlip: {}
+  batch_transforms:
+    - BatchRandomResize: {target_size: [640], random_size: True, random_interp: True, keep_ratio: False}
+    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
+    - Permute: {}
+    - PadGT: {}
+  batch_size: 8
+  shuffle: True
+  drop_last: True
+  use_shared_memory: True
+  collate_batch: True


 slim: Distill

--- a/configs/slim/distill/ppyoloe_plus_distill_x_to_l.yml
+++ b/configs/slim/distill/ppyoloe_plus_distill_x_to_l.yml
@@ -4,18 +4,35 @@ _BASE_: [
 ]
 depth_mult: 1.33
 width_mult: 1.25
-
+for_distill: True
 architecture: PPYOLOE
 PPYOLOE:
  backbone: CSPResNet
  neck: CustomCSPPAN
  yolo_head: PPYOLOEHead
  post_process: ~
-  for_distill: True

 pretrain_weights: https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_x_80e_coco.pdparams
 find_unused_parameters: True
-for_distill: True
+
+worker_num: 4
+TrainReader:
+  sample_transforms:
+    - Decode: {}
+    - RandomDistort: {}
+    - RandomExpand: {fill_value: [123.675, 116.28, 103.53]}
+    - RandomCrop: {}
+    - RandomFlip: {}
+  batch_transforms:
+    - BatchRandomResize: {target_size: [640], random_size: True, random_interp: True, keep_ratio: False}
+    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
+    - Permute: {}
+    - PadGT: {}
+  batch_size: 8
+  shuffle: True
+  drop_last: True
+  use_shared_memory: True
+  collate_batch: True


 slim: Distill

--- a/ppdet/slim/distill_model.py
+++ b/ppdet/slim/distill_model.py
@@ -330,6 +330,8 @@ class PPYOLOEDistillModel(DistillModel):

    def forward(self, inputs, alpha=0.125):
        if self.training:
+            with paddle.no_grad():
+                teacher_loss = self.teacher_model(inputs)
            if hasattr(self.teacher_model.yolo_head, "assigned_labels"):
                self.student_model.yolo_head.assigned_labels, self.student_model.yolo_head.assigned_bboxes, self.student_model.yolo_head.assigned_scores, self.student_model.yolo_head.mask_positive = \
                    self.teacher_model.yolo_head.assigned_labels, self.teacher_model.yolo_head.assigned_bboxes, self.teacher_model.yolo_head.assigned_scores, self.teacher_model.yolo_head.mask_positive
@@ -338,8 +340,6 @@ class PPYOLOEDistillModel(DistillModel):
                delattr(self.teacher_model.yolo_head, "assigned_scores")
                delattr(self.teacher_model.yolo_head, "mask_positive")
            student_loss = self.student_model(inputs)
-            with paddle.no_grad():
-                teacher_loss = self.teacher_model(inputs)

            logits_loss, feat_loss = self.distill_loss(self.teacher_model,
                                                       self.student_model)