Merge branch 'PaddlePaddle:develop' into develop

06767545 · Wei Shengyu · GitHub · 84d1f9e0 · ba1043e4 · 06767545
83 changed file
--- a/MANIFEST.in
+++ b/MANIFEST.in
 include LICENSE.txt
 include README.md
 include docs/en/whl_en.md
-recursive-include deploy/python predict_cls.py preprocess.py postprocess.py det_preprocess.py
+recursive-include deploy/python *.py
+recursive-include deploy/configs *.yaml
 recursive-include deploy/utils get_image_list.py config.py logger.py predictor.py

 recursive-include ppcls/ *.py *.txt
\ No newline at end of file
--- a/deploy/configs/PULC/language_classification/inference_language_classification.yaml
+++ b/deploy/configs/PULC/language_classification/inference_language_classification.yaml
+Global:
+  infer_imgs: "./images/PULC/language_classification/word_35404.png"
+  inference_model_dir: "./models/language_classification_infer"
+  batch_size: 1
+  use_gpu: True
+  enable_mkldnn: False
+  cpu_num_threads: 10
+  enable_benchmark: True
+  use_fp16: False
+  ir_optim: True
+  use_tensorrt: False
+  gpu_mem: 8000
+  enable_profile: False
+
+PreProcess:
+  transform_ops:
+    - ResizeImage:
+        size: [160, 80]
+    - NormalizeImage:
+        scale: 0.00392157
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+        channel_num: 3
+    - ToCHWImage:
+
+PostProcess:
+  main_indicator: Topk
+  Topk:
+    topk: 2
+    class_id_map_file: "../dataset/language_classification/label_list.txt"
+  SavePreLabel:
+    save_dir: ./pre_label/
--- a/deploy/configs/PULC/safety_helmet/inference_safety_helmet.yaml
+++ b/deploy/configs/PULC/safety_helmet/inference_safety_helmet.yaml
+Global:
+  infer_imgs: "./images/PULC/safety_helmet/safety_helmet_test_1.png"
+  inference_model_dir: "./models/safety_helmet_infer"
+  batch_size: 1
+  use_gpu: True
+  enable_mkldnn: False
+  cpu_num_threads: 10
+  enable_benchmark: True
+  use_fp16: False
+  ir_optim: True
+  use_tensorrt: False
+  gpu_mem: 8000
+  enable_profile: False
+
+PreProcess:
+  transform_ops:
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 0.00392157
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+        channel_num: 3
+    - ToCHWImage:
+
+PostProcess:
+  main_indicator: ThreshOutput
+  ThreshOutput:
+    threshold: 0.5
+    label_0: wearing_helmet
+    label_1: unwearing_helmet
+  SavePreLabel:
+    save_dir: ./pre_label/
--- a/deploy/configs/PULC/text_image_orientation/inference_text_image_orientation.yaml
+++ b/deploy/configs/PULC/text_image_orientation/inference_text_image_orientation.yaml
+Global:
+  infer_imgs: "./images/PULC/text_image_orientation/img_rot0_demo.jpg"
+  inference_model_dir: "./models/text_image_orientation_infer"
+  batch_size: 1
+  use_gpu: True
+  enable_mkldnn: False
+  cpu_num_threads: 10
+  enable_benchmark: True
+  use_fp16: False
+  ir_optim: True
+  use_tensorrt: False
+  gpu_mem: 8000
+  enable_profile: False
+
+PreProcess:
+  transform_ops:
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 0.00392157
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+        channel_num: 3
+    - ToCHWImage:
+
+PostProcess:
+  main_indicator: Topk
+  Topk:
+    topk: 2
+    class_id_map_file: "../dataset/text_image_orientation/label_list.txt"
+  SavePreLabel:
+    save_dir: ./pre_label/
--- a/deploy/configs/PULC/textline_orientation/inference_textline_orientation.yaml
+++ b/deploy/configs/PULC/textline_orientation/inference_textline_orientation.yaml
+Global:
+  infer_imgs: "./images/PULC/textline_orientation/textline_orientation_test_0_0.png"
+  inference_model_dir: "./models/textline_orientation_infer"
+  batch_size: 1
+  use_gpu: True
+  enable_mkldnn: True
+  cpu_num_threads: 10
+  enable_benchmark: True
+  use_fp16: False
+  ir_optim: True
+  use_tensorrt: False
+  gpu_mem: 8000
+  enable_profile: False
+
+PreProcess:
+  transform_ops:
+    - ResizeImage:
+        size: [160, 80]
+    - NormalizeImage:
+        scale: 0.00392157
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+        channel_num: 3
+    - ToCHWImage:
+
+PostProcess:
+  main_indicator: Topk
+  Topk:
+    topk: 1
+    class_id_map_file: "../ppcls/utils/PULC/textline_orientation_label_list.txt"
+  SavePreLabel:
+    save_dir: ./pre_label/
--- a/deploy/configs/PULC/traffic_sign/inference_traffic_sign.yaml
+++ b/deploy/configs/PULC/traffic_sign/inference_traffic_sign.yaml
@@ -30,6 +30,6 @@ PostProcess:
  main_indicator: Topk
  Topk:
    topk: 5
-    class_id_map_file: "../dataset/traffic_sign/label_name_id.txt"
+    class_id_map_file: "../ppcls/utils/PULC_label_list/traffic_sign_label_list.txt"
  SavePreLabel:
    save_dir: ./pre_label/
--- a/deploy/configs/PULC/vehicle_attr/inference_vehicle_attr.yaml
+++ b/deploy/configs/PULC/vehicle_attr/inference_vehicle_attr.yaml
 Global:
  infer_imgs: "./images/PULC/vehicle_attr/0002_c002_00030670_0.jpg"
-  inference_model_dir: "./models/vehicle_attr_infer"
+  inference_model_dir: "./models/vehicle_attribute_infer"
  batch_size: 1
  use_gpu: True
  enable_mkldnn: True

--- a/deploy/images/PULC/language_classification/word_17.png
+++ b/deploy/images/PULC/language_classification/word_17.png
--- a/deploy/images/PULC/language_classification/word_20.png
+++ b/deploy/images/PULC/language_classification/word_20.png
--- a/deploy/images/PULC/language_classification/word_35404.png
+++ b/deploy/images/PULC/language_classification/word_35404.png
--- a/deploy/images/PULC/safety_helmet/safety_helmet_test_1.png
+++ b/deploy/images/PULC/safety_helmet/safety_helmet_test_1.png
--- a/deploy/images/PULC/safety_helmet/safety_helmet_test_2.png
+++ b/deploy/images/PULC/safety_helmet/safety_helmet_test_2.png
--- a/deploy/images/PULC/text_image_orientation/img_rot0_demo.jpg
+++ b/deploy/images/PULC/text_image_orientation/img_rot0_demo.jpg
--- a/deploy/images/PULC/text_image_orientation/img_rot180_demo.jpg
+++ b/deploy/images/PULC/text_image_orientation/img_rot180_demo.jpg
--- a/deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png
+++ b/deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png
--- a/deploy/images/PULC/textline_orientation/textline_orientation_test_0_1.png
+++ b/deploy/images/PULC/textline_orientation/textline_orientation_test_0_1.png
--- a/deploy/images/PULC/textline_orientation/textline_orientation_test_1_0.png
+++ b/deploy/images/PULC/textline_orientation/textline_orientation_test_1_0.png
--- a/deploy/images/PULC/textline_orientation/textline_orientation_test_1_1.png
+++ b/deploy/images/PULC/textline_orientation/textline_orientation_test_1_1.png
--- a/deploy/images/PULC/vehicle_attr/0002_c002_00030670_0.jpg
+++ b/deploy/images/PULC/vehicle_attr/0002_c002_00030670_0.jpg
--- a/deploy/images/PULC/vehicle_attr/0014_c012_00040750_0.jpg
+++ b/deploy/images/PULC/vehicle_attr/0014_c012_00040750_0.jpg
--- a/docs/en/inference_deployment/whl_deploy_en.md
+++ b/docs/en/inference_deployment/whl_deploy_en.md
@@ -212,14 +212,14 @@ You can save the prediction result(s) as pre-label, only need to use `pre_label_
 ```python
 from paddleclas import PaddleClas
 clas = PaddleClas(model_name='ResNet50', save_dir='./output_pre_label/')
-infer_imgs = 'docs/images/inference_deployment/whl_' # it can be infer_imgs folder path which contains all of images you want to predict.
+infer_imgs = 'docs/images/' # it can be infer_imgs folder path which contains all of images you want to predict.
 result=clas.predict(infer_imgs)
 print(next(result))
 ```

 * CLI
 ```bash
-paddleclas --model_name='ResNet50' --infer_imgs='docs/images/inference_deployment/whl_' --save_dir='./output_pre_label/'
+paddleclas --model_name='ResNet50' --infer_imgs='docs/images/' --save_dir='./output_pre_label/'
 ```

 <a name="4.8"></a>

--- a/docs/images/PULC/docs/language_classification_original_data.png
+++ b/docs/images/PULC/docs/language_classification_original_data.png
--- a/docs/images/PULC/docs/safety_helmet_data_demo.png
+++ b/docs/images/PULC/docs/safety_helmet_data_demo.png
--- a/docs/images/PULC/docs/text_image_orientation_data_demo.png
+++ b/docs/images/PULC/docs/text_image_orientation_data_demo.png
--- a/docs/images/PULC/docs/text_image_orientation_original_data.png
+++ b/docs/images/PULC/docs/text_image_orientation_original_data.png
--- a/docs/images/PULC/docs/textline_orientation_data_demo.png
+++ b/docs/images/PULC/docs/textline_orientation_data_demo.png
--- a/docs/images/algorithm_introduction/hnsw.png
+++ b/docs/images/algorithm_introduction/hnsw.png
--- a/docs/zh_CN/PULC/PULC_language_classification.md
+++ b/docs/zh_CN/PULC/PULC_language_classification.md
--- a/docs/zh_CN/PULC/PULC_safety_helmet.md
+++ b/docs/zh_CN/PULC/PULC_safety_helmet.md
+# PULC 佩戴安全帽分类模型
+
+------
+
+## 目录
+
+- [1. 模型和应用场景介绍](#1)
+- [2. 模型快速体验](#2)
+- [3. 模型训练、评估和预测](#3)
+    - [3.1 环境配置](#3.1)
+    - [3.2 数据准备](#3.2)
+      - [3.2.1 数据集来源](#3.2.1)
+      - [3.2.2 数据集获取](#3.2.2)
+    - [3.3 模型训练](#3.3)
+    - [3.4 模型评估](#3.4)
+    - [3.5 模型预测](#3.5)
+- [4. 模型压缩](#4)
+  - [4.1 UDML 知识蒸馏](#4.1)
+    - [4.1.1 蒸馏训练](#4.1.1)
+- [5. 超参搜索](#5)
+- [6. 模型推理部署](#6)
+  - [6.1 推理模型准备](#6.1)
+    - [6.1.1 基于训练得到的权重导出 inference 模型](#6.1.1)
+    - [6.1.2 直接下载 inference 模型](#6.1.2)
+  - [6.2 基于 Python 预测引擎推理](#6.2)
+    - [6.2.1 预测单张图像](#6.2.1)
+    - [6.2.2 基于文件夹的批量预测](#6.2.2)
+  - [6.3 基于 C++ 预测引擎推理](#6.3)
+  - [6.4 服务化部署](#6.4)
+  - [6.5 端侧部署](#6.5)
+  - [6.6 Paddle2ONNX 模型转换与预测](#6.6)
+
+
+<a name="1"></a>
+
+## 1. 模型和应用场景介绍
+
+该案例提供了用户使用 PaddleClas 的超轻量图像分类方案（PULC，Practical Ultra Lightweight Classification）快速构建轻量级、高精度、可落地的“是否佩戴安全帽”的二分类模型。该模型可以广泛应用于如建筑施工场景、工厂车间场景、交通场景等。
+
+下表列出了判断图片中是否佩戴安全帽的二分类模型的相关指标，前三行展现了使用 SwinTranformer_tiny、Res2Net200_vd_26w_4s 和 MobileNetV3_small_x0_35 作为 backbone 训练得到的模型的相关指标，第四行至第七行依次展现了替换 backbone 为 PPLCNet_x1_0、使用 SSLD 预训练模型、使用 SSLD 预训练模型 + EDA 策略、使用 SSLD 预训练模型 + EDA 策略 + UDML 知识蒸馏策略训练得到的模型的相关指标。
+
+| 模型 | Tpr（%） | 延时（ms） | 存储（M） | 策略 |
+|-------|-----------|----------|---------------|---------------|
+| SwinTranformer_tiny  | 93.57 | 91.32  | 107 | 使用ImageNet预训练模型 |
+| Res2Net200_vd_26w_4s  | 98.92 | 80.99 | 284 | 使用ImageNet预训练模型 |
+| MobileNetV3_small_x0_35  | 96.50 | 2.85 | 1.6 | 使用ImageNet预训练模型 |
+| PPLCNet_x1_0  | 93.29 | 2.03  | 6.5 | 使用ImageNet预训练模型 |
+| PPLCNet_x1_0  | 98.07 | 2.03  | 6.5 | 使用SSLD预训练模型 |
+| PPLCNet_x1_0  | 99.30 | 2.03  | 6.5 | 使用SSLD预训练模型+EDA策略|
+| <b>PPLCNet_x1_0</b>  | <b>99.38</b> | <b>2.03</b>  | <b>6.5</b> | 使用SSLD预训练模型+EDA策略+UDML知识蒸馏策略|
+
+从表中可以看出，在使用服务器端大模型作为 backbone 时，SwinTranformer_tiny 精度较低，Res2Net200_vd_26w_4s 精度较高，但服务器端大模型推理速度普遍较慢。将 backbone 替换为轻量级模型 MobileNetV3_small_x0_35 后，速度可以大幅提升，但是精度显著降低。将 backbone 替换为 PPLCNet_x1_0 后，精度较 MobileNetV3_small_x0_35 提高约 30 个百分点，与此同时速度快 20% 以上。在此基础上，将 PPLCNet_x1_0 的预训练模型替换为 SSLD 预训练模型后，在对推理速度无影响的前提下，精度提升约 4.8 个百分点，进一步地使用 EDA 策略后，精度可以再提升 0.7 个百分点。此时，PPLCNet_x1_0 已经接近了 Res2Net200_vd_26w_4s 模型的精度，但是速度快 70+ 倍。最后，在使用 UDML 知识蒸馏后，精度可以再提升 0.5 个百分点。此时，PPLCNet_x1_0 已经超过了 Res2Net200_vd_26w_4s 模型的精度，但速度是其 70 余倍。下面详细介绍关于 PULC 安全帽模型的训练方法和推理部署方法。
+
+**备注：**
+
+* `Tpr`指标的介绍可以参考 [3.3小节](#3.3)的备注部分，延时是基于 Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz 测试得到，开启MKLDNN加速策略，线程数为10。
+* 关于PPLCNet的介绍可以参考[PPLCNet介绍](../models/PP-LCNet.md)，相关论文可以查阅[PPLCNet paper](https://arxiv.org/abs/2109.15099)。
+
+<a name="2"></a>
+
+## 2. 模型快速体验
+
+<a name="2.1"></a>  
+
+### 2.1 安装 paddleclas
+
+使用如下命令快速安装 paddleclas
+
+```  
+pip3 install paddlepaddle paddleclas
+```
+
+<a name="2.2"></a>
+
+### 2.2 预测
+
+* 使用命令行快速预测
+
+```bash
+paddleclas --model_name=safety_helmet --infer_imgs=deploy/images/PULC/safety_helmet/safety_helmet_test_1.png
+```
+
+结果如下：
+```
+>>> result
+class_ids: [1], scores: [0.9986255], label_names: ['unwearing_helmet'], filename: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png
+Predict complete!
+```
+
+**备注**： 更换其他预测的数据时，只需要改变 `--infer_imgs=xxx` 中的字段即可，支持传入整个文件夹。
+
+* 在 Python 代码中预测
+```python
+import paddleclas
+model = paddleclas.PaddleClas(model_name="safety_helmet")
+result = model.predict(input_data="deploy/images/PULC/safety_helmet/safety_helmet_test_1.png")
+print(next(result))
+```
+
+**备注**：`model.predict()` 为可迭代对象（`generator`），因此需要使用 `next()` 函数或 `for` 循环对其迭代调用。每次调用将以 `batch_size` 为单位进行一次预测，并返回预测结果, 默认 `batch_size` 为 1，如果需要更改 `batch_size`，实例化模型时，需要指定 `batch_size`，如 `model = paddleclas.PaddleClas(model_name="safety_helmet",  batch_size=2)`, 使用上述测试代码返回结果示例如下：
+
+```
+>>> result
+[{'class_ids': [1], 'scores': [0.9986255], 'label_names': ['unwearing_helmet'], 'filename': 'deploy/images/PULC/safety_helmet/safety_helmet_test_1.png'}]
+```
+
+<a name="3"></a>
+
+## 3. 模型训练、评估和预测
+
+<a name="3.1"></a>  
+
+### 3.1 环境配置
+
+* 安装：请先参考 [Paddle 安装教程](../installation/install_paddle.md) 以及 [PaddleClas 安装教程](../installation/install_paddleclas.md) 配置 PaddleClas 运行环境。
+
+<a name="3.2"></a>
+
+### 3.2 数据准备
+
+<a name="3.2.1"></a>
+
+#### 3.2.1 数据集来源
+
+本案例中所使用的所有数据集均为开源数据，数据集基于[Safety-Helmet-Wearing-Dataset](https://github.com/njvisionpower/Safety-Helmet-Wearing-Dataset)、[hard-hat-detection](https://www.kaggle.com/datasets/andrewmvd/hard-hat-detection)与[Large-scale CelebFaces Attributes (CelebA) Dataset](https://mmlab.ie.cuhk.edu.hk/projects/CelebA.html)处理整合而来。
+
+<a name="3.2.2"></a>  
+
+#### 3.2.2 数据集获取
+
+在公开数据集的基础上经过后处理即可得到本案例需要的数据，具体处理方法如下：
+
+* 对于 Safety-Helmet-Wearing-Dataset 数据集：根据 bbox 标签数据，对其宽、高放大 3 倍作为 bbox 对图像进行裁剪，其中带有安全帽的图像类别为0，不戴安全帽的图像类别为1；
+* 对于 hard-hat-detection 数据集：仅使用其中类别标签为 “hat” 的图像，并使用 bbox 标签进行裁剪，图像类别为0；
+* 对于 CelebA 数据集：仅使用其中类别标签为 “Wearing_Hat” 的图像，并使用 bbox 标签进行裁剪，图像类别为0。
+
+在整合上述数据后，可得到共约 15 万数据，其中戴安全帽与不戴安全帽的图像数量分别约为 2.8 万与 12.1 万，然后在两个类别上分别随机选取 0.56 万张图像作为测试集，共约 1.12 万张图像，其他约 13.8 万张图像作为训练集。
+
+处理后的数据集部分数据可视化如下：
+
+![](../../images/PULC/docs/safety_helmet_data_demo.png)
+
+此处提供了经过上述方法处理好的数据，可以直接下载得到。
+
+进入 PaddleClas 目录。
+
+```
+cd path_to_PaddleClas
+```
+
+进入 `dataset/` 目录，下载并解压安全帽场景的数据。
+
+```shell
+cd dataset
+wget https://paddleclas.bj.bcebos.com/data/PULC/safety_helmet.tar
+tar -xf safety_helmet.tar
+cd ../
+```
+
+执行上述命令后，`dataset/` 下存在 `safety_helmet` 目录，该目录中具有以下数据：
+
+```
+├── images
+│   ├── VOC2028_part2_001209_1.jpg
+│   ├── HHD_hard_hat_workers23_1.jpg
+│   ├── CelebA_077809.jpg
+│   ├── ...
+│   └── ...
+├── train_list.txt
+└── val_list.txt
+```
+
+其中，`train_list.txt` 和 `val_list.txt` 分别为训练集和验证集的标签文件，所有的图像数据在 `images/` 目录下。
+
+**备注：**
+
+* 关于 `train_list.txt`、`val_list.txt`的格式说明，可以参考[PaddleClas分类数据集格式说明](../data_preparation/classification_dataset.md#1-数据集格式说明) 。
+
+<a name="3.3"></a>
+
+### 3.3 模型训练
+
+在 `ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml` 中提供了基于该场景的训练配置，可以通过如下脚本启动训练：
+
+```shell
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python3 -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    tools/train.py \
+    -c ./ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml
+```
+
+验证集的最佳指标在 `0.975-0.985` 之间（数据集较小，容易造成波动）。
+
+**备注：**
+
+* 此时使用的指标为Tpr，该指标描述了在假正类率（Fpr）小于某一个指标时的真正类率（Tpr），是产业中二分类问题常用的指标之一。在本案例中，Fpr 为万分之一。关于 Fpr 和 Tpr 的更多介绍，可以参考[这里](https://baike.baidu.com/item/AUC/19282953)。
+
+* 在eval时，会打印出来当前最佳的 TprAtFpr 指标，具体地，其会打印当前的 `Fpr`、`Tpr` 值，以及当前的 `threshold`值，`Tpr` 值反映了在当前 `Fpr` 值下的召回率，该值越高，代表模型越好。`threshold` 表示当前最佳 `Fpr` 所对应的分类阈值，可用于后续模型部署落地等。
+
+<a name="3.4"></a>
+
+### 3.4 模型评估
+
+训练好模型之后，可以通过以下命令实现对模型指标的评估。
+
+```bash
+python3 tools/eval.py \
+    -c ./ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml \
+    -o Global.pretrained_model=output/PPLCNet_x1_0/best_model
+```
+
+其中 `-o Global.pretrained_model="output/PPLCNet_x1_0/best_model"` 指定了训练过程中的最佳参数权重文件所在的路径，如需指定其他权重文件，只需替换对应的路径即可。
+
+<a name="3.5"></a>
+
+### 3.5 模型预测
+
+模型训练完成之后，可以加载训练得到的预训练模型，进行模型预测。在模型库的 `tools/infer.py` 中提供了完整的示例，只需执行下述命令即可完成模型预测：
+
+```bash
+python3 tools/infer.py \
+    -c ./ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml \
+    -o Global.pretrained_model=output/PPLCNet_x1_0/best_model
+```
+
+输出结果如下：
+
+```
+[{'class_ids': [1], 'scores': [0.9524797], 'label_names': ['unwearing_helmet'], 'file_name': 'deploy/images/PULC/safety_helmet/safety_helmet_test_1.png'}]
+```
+
+**备注：**
+
+* 这里`-o Global.pretrained_model="output/PPLCNet_x1_0/best_model"` 指定了当前最佳权重所在的路径，如果指定其他权重，只需替换对应的路径即可。
+
+* 默认是对 `deploy/images/PULC/safety_helmet/safety_helmet_test_1.png` 进行预测，此处也可以通过增加字段 `-o Infer.infer_imgs=xxx` 对其他图片预测。
+
+* 二分类默认的阈值为0.5， 如果需要指定阈值，可以重写 `Infer.PostProcess.threshold` ，如 `-o Infer.PostProcess.threshold=0.9167`，该值需要根据实际应用场景来确定，在 safety_helmet 数据集的 val 验证集上，在万分之一 Fpr 下得到的最佳 Tpr 时，该值为 0.9167。
+
+<a name="4"></a>
+
+## 4. 模型压缩
+
+<a name="4.1"></a>
+
+### 4.1 UDML 知识蒸馏
+
+UDML 知识蒸馏是一种简单有效的知识蒸馏方法，关于该方法的介绍，可以参考[UDML 知识蒸馏](@ruoyu)。
+
+<a name="4.1.1"></a>
+
+#### 4.1.1 蒸馏训练
+
+配置文件 `ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_distillation.yaml` 提供了 `UDML知识蒸馏策略` 的配置。训练脚本如下：
+
+```shell
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python3 -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    tools/train.py \
+    -c ./ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_distillation.yaml
+```
+
+验证集的最佳指标为 `0.990-0.993` 之间，当前模型最好的权重保存在 `output/DistillationModel/best_model_student.pdparams`。
+
+<a name="5"></a>
+
+## 5. 超参搜索
+
+在 [3.3 节](#3.3)和 [4.1 节](#4.1)所使用的超参数是根据 PaddleClas 提供的 `SHAS 超参数搜索策略` 搜索得到的，如果希望在自己的数据集上得到更好的结果，可以参考[SHAS 超参数搜索策略](#TODO)来获得更好的训练超参数。
+
+**备注**：此部分内容是可选内容，搜索过程需要较长的时间，您可以根据自己的硬件情况来选择执行。如果没有更换数据集，可以忽略此节内容。
+
+<a name="6"></a>
+
+## 6. 模型推理部署
+
+<a name="6.1"></a>
+
+### 6.1 推理模型准备
+
+Paddle Inference 是飞桨的原生推理库， 作用于服务器端和云端，提供高性能的推理能力。相比于直接基于预训练模型进行预测，Paddle Inference 可使用 MKLDNN、CUDNN、TensorRT 进行预测加速，从而实现更优的推理性能。更多关于 Paddle Inference 推理引擎的介绍，可以参考[Paddle Inference官网教程](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/infer/inference/inference_cn.html)。
+
+当使用 Paddle Inference 推理时，加载的模型类型为 inference 模型。本案例提供了两种获得 inference 模型的方法，如果希望得到和文档相同的结果，请选择[直接下载 inference 模型](#6.1.2)的方式。
+
+<a name="6.1.1"></a>
+
+#### 6.1.1 基于训练得到的权重导出 inference 模型
+
+此处，我们提供了将权重和模型转换的脚本，执行该脚本可以得到对应的 inference 模型：
+
+```bash
+python3 tools/export_model.py \
+    -c ./ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml \
+    -o Global.pretrained_model=output/DistillationModel/best_model_student \
+    -o Global.save_inference_dir=deploy/models/PPLCNet_x1_0_safety_helmet_infer
+```
+
+执行完该脚本后会在 `deploy/models/` 下生成 `PPLCNet_x1_0_safety_helmet_infer` 目录，该目录下有如下文件结构：
+
+```
+├── PPLCNet_x1_0_safety_helmet_infer
+│   ├── inference.pdiparams
+│   ├── inference.pdiparams.info
+│   └── inference.pdmodel
+```
+
+**备注：** 此处的最佳权重是经过知识蒸馏后的权重路径，如果没有执行知识蒸馏的步骤，最佳模型保存在 `output/PPLCNet_x1_0/best_model.pdparams` 中。
+
+<a name="6.1.2"></a>
+
+#### 6.1.2 直接下载 inference 模型
+
+[6.1.1 小节](#6.1.1)提供了导出 inference 模型的方法，此处也提供了该场景可以下载的 inference 模型，可以直接下载体验。
+
+```
+cd deploy/models
+# 下载 inference 模型并解压
+wget https://paddleclas.bj.bcebos.com/models/PULC/safety_helmet_infer.tar && tar -xf safety_helmet_infer.tar
+```
+
+解压完毕后，`models` 文件夹下应有如下文件结构：
+
+```
+├── safety_helmet_infer
+│   ├── inference.pdiparams
+│   ├── inference.pdiparams.info
+│   └── inference.pdmodel
+```
+
+<a name="6.2"></a>
+
+### 6.2 基于 Python 预测引擎推理
+
+<a name="6.2.1"></a>  
+
+#### 6.2.1 预测单张图像
+
+返回 `deploy` 目录：
+
+```
+cd ../
+```
+
+运行下面的命令，对图像 `./images/PULC/safety_helmet/safety_helmet_test_1.png` 进行是否佩戴安全帽分类。
+
+```shell
+# 使用下面的命令使用 GPU 进行预测
+python3.7 python/predict_cls.py -c configs/PULC/safety_helmet/inference_safety_helmet.yaml
+# 使用下面的命令使用 CPU 进行预测
+python3.7 python/predict_cls.py -c configs/PULC/safety_helmet/inference_safety_helmet.yaml -o Global.use_gpu=False
+```
+
+输出结果如下。
+
+```
+safety_helmet_test_1.png:       class id(s): [1], score(s): [1.00], label_name(s): ['unwearing_helmet']
+```
+
+**备注：** 二分类默认的阈值为0.5， 如果需要指定阈值，可以重写 `PostProcess.ThreshOutput.threshold` ，如 `-o PostProcess.ThreshOutput.threshold=0.9167`，该值需要根据实际应用场景来确定，在 safety_helmet 数据集的 val 验证集上，在万分之一 Fpr 下得到的最佳 Tpr 时，该值为 0.9167。该阈值的确定方法可以参考[3.3节](#3.3)备注部分。
+
+<a name="6.2.2"></a>  
+
+#### 6.2.2 基于文件夹的批量预测
+
+如果希望预测文件夹内的图像，可以直接修改配置文件中的 `Global.infer_imgs` 字段，也可以通过下面的 `-o` 参数修改对应的配置。
+
+```shell
+# 使用下面的命令使用 GPU 进行预测，如果希望使用 CPU 预测，可以在命令后面添加 -o Global.use_gpu=False
+python3.7 python/predict_cls.py -c configs/PULC/safety_helmet/inference_safety_helmet.yaml -o Global.infer_imgs="./images/PULC/safety_helmet/"
+```
+
+终端中会输出该文件夹内所有图像的分类结果，如下所示。
+
+```
+safety_helmet_test_1.png:       class id(s): [1], score(s): [1.00], label_name(s): ['unwearing_helmet']
+safety_helmet_test_2.png:       class id(s): [0], score(s): [1.00], label_name(s): ['wearing_helmet']
+```
+
+其中，`wearing_helmet` 表示该图中的人佩戴了安全帽，`unwearing_helmet` 表示该图中的人未佩戴安全帽。
+
+<a name="6.3"></a>
+
+### 6.3 基于 C++ 预测引擎推理
+
+PaddleClas 提供了基于 C++ 预测引擎推理的示例，您可以参考[服务器端 C++ 预测](../inference_deployment/cpp_deploy.md)来完成相应的推理部署。如果您使用的是 Windows 平台，可以参考[基于 Visual Studio 2019 Community CMake 编译指南](../inference_deployment/cpp_deploy_on_windows.md)完成相应的预测库编译和模型预测工作。
+
+<a name="6.4"></a>
+
+### 6.4 服务化部署
+
+Paddle Serving 提供高性能、灵活易用的工业级在线推理服务。Paddle Serving 支持 RESTful、gRPC、bRPC 等多种协议，提供多种异构硬件和多种操作系统环境下推理解决方案。更多关于Paddle Serving 的介绍，可以参考[Paddle Serving 代码仓库](https://github.com/PaddlePaddle/Serving)。
+
+PaddleClas 提供了基于 Paddle Serving 来完成模型服务化部署的示例，您可以参考[模型服务化部署](../inference_deployment/paddle_serving_deploy.md)来完成相应的部署工作。
+
+<a name="6.5"></a>
+
+### 6.5 端侧部署
+
+Paddle Lite 是一个高性能、轻量级、灵活性强且易于扩展的深度学习推理框架，定位于支持包括移动端、嵌入式以及服务器端在内的多硬件平台。更多关于 Paddle Lite 的介绍，可以参考[Paddle Lite 代码仓库](https://github.com/PaddlePaddle/Paddle-Lite)。
+
+PaddleClas 提供了基于 Paddle Lite 来完成模型端侧部署的示例，您可以参考[端侧部署](../inference_deployment/paddle_lite_deploy.md)来完成相应的部署工作。
+
+<a name="6.6"></a>
+
+### 6.6 Paddle2ONNX 模型转换与预测
+
+Paddle2ONNX 支持将 PaddlePaddle 模型格式转化到 ONNX 模型格式。通过 ONNX 可以完成将 Paddle 模型到多种推理引擎的部署，包括TensorRT/OpenVINO/MNN/TNN/NCNN，以及其它对 ONNX 开源格式进行支持的推理引擎或硬件。更多关于 Paddle2ONNX 的介绍，可以参考[Paddle2ONNX 代码仓库](https://github.com/PaddlePaddle/Paddle2ONNX)。
+
+PaddleClas 提供了基于 Paddle2ONNX 来完成 inference 模型转换 ONNX 模型并作推理预测的示例，您可以参考[Paddle2ONNX 模型转换与预测](@shuilong)来完成相应的部署工作。
--- a/docs/zh_CN/PULC/PULC_text_image_orientation.md
+++ b/docs/zh_CN/PULC/PULC_text_image_orientation.md
--- a/docs/zh_CN/PULC/PULC_textline_orientation.md
+++ b/docs/zh_CN/PULC/PULC_textline_orientation.md
+# PULC 文本行方向分类模型
+
+------
+
+
+## 目录
+
+- [1. 模型和应用场景介绍](#1)
+- [2. 模型快速体验](#2) 
+- [3. 模型训练、评估和预测](#3)
+    - [3.1 环境配置](#3.1)
+    - [3.2 数据准备](#3.2)
+      - [3.2.1 数据集来源](#3.2.1)
+      - [3.2.2 数据集获取](#3.2.2)
+    - [3.3 模型训练](#3.3)
+    - [3.4 模型评估](#3.4)
+    - [3.5 模型预测](#3.5)
+- [4. 模型压缩](#4)
+  - [4.1 SKL-UGI 知识蒸馏](#4.1)
+    - [4.1.1 教师模型训练](#4.1.1)
+    - [4.1.2 蒸馏训练](#4.1.2)
+- [5. 超参搜索](#5)
+- [6. 模型推理部署](#6)
+  - [6.1 推理模型准备](#6.1)
+    - [6.1.1 基于训练得到的权重导出 inference 模型](#6.1.1)
+    - [6.1.2 直接下载 inference 模型](#6.1.2)
+  - [6.2 基于 Python 预测引擎推理](#6.2)
+    - [6.2.1 预测单张图像](#6.2.1)
+    - [6.2.2 基于文件夹的批量预测](#6.2.2)
+  - [6.3 基于 C++ 预测引擎推理](#6.3)
+  - [6.4 服务化部署](#6.4)
+  - [6.5 端侧部署](#6.5)
+  - [6.6 Paddle2ONNX 模型转换与预测](#6.6)
+
+
+<a name="1"></a>
+
+## 1. 模型和应用场景介绍
+
+该案例提供了用户使用 PaddleClas 的超轻量图像分类方案（PULC，Practical Ultra Lightweight Classification）快速构建轻量级、高精度、可落地的文本行方向分类模型。该模型可以广泛应用于如文字矫正、文字识别等场景。
+
+下表列出了文本行方向分类模型的相关指标，前两行展现了使用 SwinTranformer_tiny 和 MobileNetV3_small_x0_35 作为 backbone 训练得到的模型的相关指标，第三行至第七行依次展现了替换 backbone 为 PPLCNet_x1_0、更改分辨率和 stride、使用 SSLD 预训练模型、使用 SSLD 预训练模型 + EDA 策略、使用 SSLD 预训练模型 + EDA 策略 + SKL-UGI 知识蒸馏策略训练得到的模型的相关指标。
+
+
+| 模型 | Top-1 Acc（%） | 延时（ms） | 存储（M） | 策略 |
+|-------|-----------|----------|---------------|---------------|
+| SwinTranformer_tiny  | 93.61 | 89.64  | 107 | 使用 ImageNet 预训练模型 |
+| MobileNetV3_small_x0_35  | 81.40 | 2.96  | 17 | 使用 ImageNet 预训练模型 |
+| PPLCNet_x1_0  | 89.99 | 2.11  | 6.5 | 使用 ImageNet 预训练模型 |
+| PPLCNet_x1_0*  | 94.06 | 2.68  | 6.5 | 使用 ImageNet 预训练模型 |
+| PPLCNet_x1_0*  | 94.11 | 2.68  | 6.5 | 使用 SSLD 预训练模型 |
+| <b>PPLCNet_x1_0**</b>  | <b>96.01</b> | <b>2.72</b>  | <b>6.5</b> | 使用 SSLD 预训练模型+EDA 策略|
+| PPLCNet_x1_0**  | 95.86 | 2.72  | 6.5 | 使用 SSLD 预训练模型+EDA 策略+SKL-UGI 知识蒸馏策略|
+     
+从表中可以看出，backbone 为 SwinTranformer_tiny 时精度较高，但是推理速度较慢。将 backbone 替换为轻量级模型 MobileNetV3_small_x0_35 后，速度可以大幅提升，精度下降也比较明显。将 backbone 替换为 PPLCNet_x1_0 时，精度较 MobileNetV3_small_x0_35 高 8.6 个百分点，速度快10%左右。在此基础上，更改分辨率和stride， 速度变慢 27%，但是精度可以提升 4.5%（采用[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)的方案），使用 SSLD 预训练模型后，精度可以继续提升约 0.05% ，进一步地，当融合EDA策略后，精度可以再提升 1.9 个百分点。最后，融合 SKL-UGI 知识蒸馏策略后，精度没有进一步提升，该策略在此场景下无效。关于 PULC 的训练方法和推理部署方法将在下面详细介绍。
+    
+**备注：** 
+
+* 其中不带\*的模型表示分辨率为224x224，带\*的模型表示分辨率为48x192（h\*w），网络中的 stride 改为 `[2, [2, 1], [2, 1], [2, 1], [2, 1]]`，其中，外层列表中的每一个元素代表网络结构下采样层的stride，该策略为 [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) 提供的文本行方向分类器方案。带\*\*的模型表示分辨率为80x160（h\*w），网络中的 stride 改为 `[2, [2, 1], [2, 1], [2, 1], [2, 1]]`，其中，外层列表中的每一个元素代表网络结构下采样层的stride，此分辨率是经过[SHAS 超参数搜索策略](#TODO)搜索得到的。
+* 延时是基于 Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz 测试得到，开启 MKLDNN 加速策略，线程数为10。
+* 关于PPLCNet的介绍可以参考[PPLCNet介绍](../models/PP-LCNet.md)，相关论文可以查阅[PPLCNet paper](https://arxiv.org/abs/2109.15099)。
+
+<a name="2"></a>
+
+## 2. 模型快速体验
+  
+  
+<a name="2.1"></a>   
+
+### 2.1 安装 paddleclas
+
+使用如下命令快速安装 paddlepaddle, paddleclas
+
+```    
+pip3 install paddlepaddle paddleclas
+```
+<a name="2.2"></a> 
+
+### 2.2 预测
+
+* 使用命令行快速预测
+
+```bash
+paddleclas --model_name=textline_orientation --infer_imgs=deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png
+```
+
+结果如下：
+```
+>>> result
+class_ids: [0], scores: [1.00], label_names: ['0_degree'], filename: deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png
+Predict complete!
+```
+
+**备注**： 更换其他预测的数据时，只需要改变 `--infer_imgs=xx` 中的字段即可，支持传入整个文件夹。
+
+
+* 在 Python 代码中预测
+```python
+import paddleclas
+model = paddleclas.PaddleClas(model_name="textline_orientation")
+result = model.predict(input_data="deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png")
+print(next(result))
+```
+
+**备注**：`model.predict()` 为可迭代对象（`generator`），因此需要使用 `next()` 函数或 `for` 循环对其迭代调用。每次调用将以 `batch_size` 为单位进行一次预测，并返回预测结果, 默认 `batch_size` 为 1，如果需要更改 `batch_size`，实例化模型时，需要指定 `batch_size`，如 `model = paddleclas.PaddleClas(model_name="textline_orientation",  batch_size=2)`, 使用默认的代码返回结果示例如下：
+
+```
+>>> result
+[{'class_ids': [0], 'scores': [1.00], 'label_names': ['0_degree'], 'filename': 'deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png'}]
+```
+    
+    
+<a name="3"></a> 
+
+## 3. 模型训练、评估和预测
+    
+<a name="3.1"></a>  
+
+### 3.1 环境配置
+
+* 安装：请先参考 [Paddle 安装教程](../installation/install_paddle.md) 以及 [PaddleClas 安装教程](../installation/install_paddleclas.md) 配置 PaddleClas 运行环境。
+
+<a name="3.2"></a> 
+
+### 3.2 数据准备
+
+<a name="3.2.1"></a> 
+
+#### 3.2.1 数据集来源
+
+本案例中所使用的所有数据集来源于内部数据，如果您希望体验训练过程，可以使用开源数据如[ICDAR2019-LSVT 文本行识别数据](https://aistudio.baidu.com/aistudio/datasetdetail/8429)。
+
+<a name="3.2.2"></a>     
+
+#### 3.2.2 数据集获取
+
+在公开数据集的基础上经过后处理即可得到本案例需要的数据，具体处理方法如下：
+    
+本案例处理了 ICDAR2019-LSVT 文本行识别数据，将其中的 id 号为 0-1999 作为本案例的数据集合，经过旋转处理成 0 类 和 1 类，其中 0 类代表文本行为正，即 0 度，1 类代表文本行为反，即 180 度。
+
+- 训练集合，id号为 0-1799 作为训练集合，0 类和 1 类共 3600 张。
+
+- 验证集合，id号为 1800-1999 作为验证集合，0 类和 1 类共 400 张。
+
+处理后的数据集部分数据可视化如下：
+
+![](../../images/PULC/docs/textline_orientation_data_demo.png)
+
+
+此处提供了经过上述方法处理好的数据，可以直接下载得到。
+
+
+进入 PaddleClas 目录。
+
+```
+cd path_to_PaddleClas
+```
+
+进入 `dataset/` 目录，下载并解压文本行方向分类场景的数据。
+
+```shell
+cd dataset
+wget https://paddleclas.bj.bcebos.com/data/PULC/textline_orientation.tar
+tar -xf textline_orientation.tar
+cd ../
+```
+
+执行上述命令后，`dataset/` 下存在 `textline_orientation` 目录，该目录中具有以下数据：
+
+```
+
+├── 0
+│   ├── img_0.jpg
+│   ├── img_1.jpg
+...
+├── 1
+│   ├── img_0.jpg
+│   ├── img_1.jpg
+...
+├── train_list.txt
+└── val_list.txt
+```
+
+其中 `0/` 和 `1/` 分别存放 0 类和 1 类的数据。`train_list.txt` 和 `val_list.txt` 分别为训练集和验证集的标签文件。
+    
+**备注：** 
+
+* 关于 `train_list.txt`、`val_list.txt` 的格式说明，可以参考[PaddleClas分类数据集格式说明](../data_preparation/classification_dataset.md#1-数据集格式说明) 。
+
+
+<a name="3.3"></a> 
+
+### 3.3 模型训练 
+
+
+在 `ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml` 中提供了基于该场景的训练配置，可以通过如下脚本启动训练：
+
+```shell
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python3 -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    tools/train.py \
+        -c ./ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml 
+```
+
+
+**备注：** 
+
+* 由于此处提供的数据集并非训练所提供模型时使用的内部非开源数据集，此处不能直接复现所提供模型的指标，如果希望得到更高的精度，可以根据需要处理[ICDAR2019-LSVT 文本行识别数据](https://aistudio.baidu.com/aistudio/datasetdetail/8429)。
+
+<a name="3.4"></a>
+
+### 3.4 模型评估
+
+训练好模型之后，可以通过以下命令实现对模型指标的评估。
+
+```bash
+python3 tools/eval.py \
+    -c ./ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml \
+    -o Global.pretrained_model="output/PPLCNet_x1_0/best_model"
+```
+
+其中 `-o Global.pretrained_model="output/PPLCNet_x1_0/best_model"` 指定了当前最佳权重所在的路径，如果指定其他权重，只需替换对应的路径即可。
+
+<a name="3.5"></a>
+
+### 3.5 模型预测
+
+模型训练完成之后，可以加载训练得到的预训练模型，进行模型预测。在模型库的 `tools/infer.py` 中提供了完整的示例，只需执行下述命令即可完成模型预测：
+
+```bash
+python3 tools/infer.py \
+    -c ./ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml \
+    -o Global.pretrained_model=output/PPLCNet_x1_0/best_model
+```
+
+输出结果如下：
+
+```
+[{'class_ids': [0], 'scores': [1.0], 'file_name': 'deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png', 'label_names': ['0_degree']}]
+```
+
+**备注：** 
+
+* 这里`-o Global.pretrained_model="output/PPLCNet_x1_0/best_model"` 指定了当前最佳权重所在的路径，如果指定其他权重，只需替换对应的路径即可。
+    
+* 默认是对 `deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png` 进行预测，此处也可以通过增加字段 `-o Infer.infer_imgs=xxx` 对其他图片预测。
+    
+
+<a name="4"></a>
+
+## 4. 模型压缩
+
+<a name="4.1"></a>
+
+### 4.1 SKL-UGI 知识蒸馏
+    
+SKL-UGI 知识蒸馏是 PaddleClas 提出的一种简单有效的知识蒸馏方法，关于该方法的介绍，可以参考[SKL-UGI 知识蒸馏](@ruoyu)。
+
+<a name="4.1.1"></a> 
+
+#### 4.1.1 教师模型训练
+
+复用 `./ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml` 中的超参数，训练教师模型，训练脚本如下：
+
+```shell
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python3 -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    tools/train.py \
+        -c ./ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml \
+        -o Arch.name=ResNet101_vd
+```
+
+当前教师模型最好的权重保存在 `output/ResNet101_vd/best_model.pdparams`。
+
+<a name="4.1.2"></a> 
+
+####  4.1.2 蒸馏训练
+
+配置文件`ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_distillation.yaml`提供了`SKL-UGI知识蒸馏策略`的配置。该配置将`ResNet101_vd`当作教师模型，`PPLCNet_x1_0`当作学生模型。训练脚本如下：
+
+```shell
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python3 -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    tools/train.py \
+        -c ./ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_distillation.yaml \
+        -o Arch.models.0.Teacher.pretrained=output/ResNet101_vd/best_model
+```
+
+当前模型最好的权重保存在 `output/DistillationModel/best_model_student.pdparams`。
+
+    
+<a name="5"></a> 
+
+## 5. 超参搜索
+
+在 [3.3 节](#3.3)和 [4.1 节](#4.1)所使用的超参数是根据 PaddleClas 提供的 `SHAS 超参数搜索策略` 搜索得到的，如果希望在自己的数据集上得到更好的结果，可以参考[SHAS 超参数搜索策略](#TODO)来获得更好的训练超参数。
+
+**备注：** 此部分内容是可选内容，搜索过程需要较长的时间，您可以根据自己的硬件情况来选择执行。
+
+<a name="6"></a>
+
+## 6. 模型推理部署
+
+<a name="6.1"></a> 
+
+### 6.1 推理模型准备
+
+Paddle Inference 是飞桨的原生推理库， 作用于服务器端和云端，提供高性能的推理能力。相比于直接基于预训练模型进行预测，Paddle Inference可使用MKLDNN、CUDNN、TensorRT 进行预测加速，从而实现更优的推理性能。更多关于Paddle Inference推理引擎的介绍，可以参考[Paddle Inference官网教程](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/infer/inference/inference_cn.html)。
+    
+当使用 Paddle Inference 推理时，加载的模型类型为 inference 模型。本案例提供了两种获得 inference 模型的方法，如果希望得到和文档相同的结果，请选择[直接下载 inference 模型](#6.1.2)的方式。
+
+<a name="6.1.1"></a> 
+
+#### 6.1.1 基于训练得到的权重导出 inference 模型
+
+此处，我们提供了将权重和模型转换的脚本，执行该脚本可以得到对应的 inference 模型：
+
+```bash
+python3 tools/export_model.py \
+    -c ./ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml \
+    -o Global.pretrained_model=output/PPLCNet_x1_0/best_model \
+    -o Global.save_inference_dir=deploy/models/PPLCNet_x1_0_textline_orientation_infer
+```
+执行完该脚本后会在 `deploy/models/` 下生成 `PPLCNet_x1_0_textline_orientation_infer` 文件夹，`models` 文件夹下应有如下文件结构：
+
+```
+├── PPLCNet_x1_0_textline_orientation_infer
+│   ├── inference.pdiparams
+│   ├── inference.pdiparams.info
+│   └── inference.pdmodel
+```
+
+**备注：** 此处的最佳权重可以根据实际情况来选择，如果希望导出知识蒸馏后的权重，则最佳权重保存在`output/DistillationModel/best_model_student.pdparams`，在导出命令中更改`-o Global.pretrained_model=xx`中的字段为`output/DistillationModel/best_model_student`即可。
+
+<a name="6.1.2"></a> 
+
+#### 6.1.2 直接下载 inference 模型
+
+[6.1.1 小节](#6.1.1)提供了导出 inference 模型的方法，此处也提供了该场景可以下载的 inference 模型，可以直接下载体验。
+
+```
+cd deploy/models
+# 下载 inference 模型并解压
+wget https://paddleclas.bj.bcebos.com/models/PULC/textline_orientation_infer.tar && tar -xf textline_orientation_infer.tar
+```
+
+解压完毕后，`models` 文件夹下应有如下文件结构：
+
+```
+├── textline_orientation_infer
+│   ├── inference.pdiparams
+│   ├── inference.pdiparams.info
+│   └── inference.pdmodel
+```
+
+<a name="6.2"></a> 
+
+### 6.2 基于 Python 预测引擎推理
+
+
+<a name="6.2.1"></a>  
+
+#### 6.2.1 预测单张图像
+
+返回 `deploy` 目录：
+
+```
+cd ../
+```
+
+运行下面的命令，对图像 `./images/PULC/textline_orientation/textline_orientation_test_0_0.png` 进行文本行方向分类。
+
+```shell
+# 使用下面的命令使用 GPU 进行预测
+python3.7 python/predict_cls.py -c configs/PULC/textline_orientation/inference_textline_orientation.yaml
+# 使用下面的命令使用 CPU 进行预测
+python3.7 python/predict_cls.py -c configs/PULC/textline_orientation/inference_textline_orientation.yaml  -o Global.use_gpu=False
+```
+
+输出结果如下。
+
+```
+textline_orientation_test_0_0.png:	class id(s): [0], score(s): [1.00], label_name(s): ['0_degree']
+```
+
+<a name="6.2.2"></a>  
+
+#### 6.2.2 基于文件夹的批量预测
+
+如果希望预测文件夹内的图像，可以直接修改配置文件中的 `Global.infer_imgs` 字段，也可以通过下面的 `-o` 参数修改对应的配置。
+
+```shell
+# 使用下面的命令使用 GPU 进行预测，如果希望使用 CPU 预测，可以在命令后面添加 -o Global.use_gpu=False
+python3.7 python/predict_cls.py -c configs/PULC/textline_orientation/inference_textline_orientation.yaml -o Global.infer_imgs="./images/PULC/textline_orientation/"
+```
+
+终端中会输出该文件夹内所有图像的分类结果，如下所示。
+
+```
+textline_orientation_test_0_0.png:	class id(s): [0], score(s): [1.00], label_name(s): ['0_degree']
+textline_orientation_test_0_1.png:	class id(s): [0], score(s): [1.00], label_name(s): ['0_degree']
+textline_orientation_test_1_0.png:	class id(s): [1], score(s): [1.00], label_name(s): ['180_degree']
+textline_orientation_test_1_1.png:	class id(s): [1], score(s): [1.00], label_name(s): ['180_degree']
+```
+
+其中，`0_degree` 表示该文本行为 0 度，`180_degree` 表示该文本行为 180 度。
+
+<a name="6.3"></a> 
+
+### 6.3 基于 C++ 预测引擎推理
+
+PaddleClas 提供了基于 C++ 预测引擎推理的示例，您可以参考[服务器端 C++ 预测](../inference_deployment/cpp_deploy.md)来完成相应的推理部署。如果您使用的是 Windows 平台，可以参考[基于 Visual Studio 2019 Community CMake 编译指南](../inference_deployment/cpp_deploy_on_windows.md)完成相应的预测库编译和模型预测工作。
+
+<a name="6.4"></a> 
+
+### 6.4 服务化部署
+
+Paddle Serving 提供高性能、灵活易用的工业级在线推理服务。Paddle Serving 支持 RESTful、gRPC、bRPC 等多种协议，提供多种异构硬件和多种操作系统环境下推理解决方案。更多关于Paddle Serving 的介绍，可以参考[Paddle Serving 代码仓库](https://github.com/PaddlePaddle/Serving)。
+    
+PaddleClas 提供了基于 Paddle Serving 来完成模型服务化部署的示例，您可以参考[模型服务化部署](../inference_deployment/paddle_serving_deploy.md)来完成相应的部署工作。
+
+<a name="6.5"></a> 
+
+### 6.5 端侧部署
+
+Paddle Lite 是一个高性能、轻量级、灵活性强且易于扩展的深度学习推理框架，定位于支持包括移动端、嵌入式以及服务器端在内的多硬件平台。更多关于 Paddle Lite 的介绍，可以参考[Paddle Lite 代码仓库](https://github.com/PaddlePaddle/Paddle-Lite)。
+    
+PaddleClas 提供了基于 Paddle Lite 来完成模型端侧部署的示例，您可以参考[端侧部署](../inference_deployment/paddle_lite_deploy.md)来完成相应的部署工作。
+
+<a name="6.6"></a> 
+
+### 6.6 Paddle2ONNX 模型转换与预测
+    
+Paddle2ONNX 支持将 PaddlePaddle 模型格式转化到 ONNX 模型格式。通过 ONNX 可以完成将 Paddle 模型到多种推理引擎的部署，包括TensorRT/OpenVINO/MNN/TNN/NCNN，以及其它对 ONNX 开源格式进行支持的推理引擎或硬件。更多关于 Paddle2ONNX 的介绍，可以参考[Paddle2ONNX 代码仓库](https://github.com/PaddlePaddle/Paddle2ONNX)。
+
+PaddleClas 提供了基于 Paddle2ONNX 来完成 inference 模型转换 ONNX 模型并作推理预测的示例，您可以参考[Paddle2ONNX 模型转换与预测](@shuilong)来完成相应的部署工作。
--- a/docs/zh_CN/PULC/PULC_traffic_sign.md
+++ b/docs/zh_CN/PULC/PULC_traffic_sign.md
@@ -39,19 +39,19 @@

 该案例提供了用户使用 PaddleClas 的超轻量图像分类方案（PULC，Practical Ultra Lightweight Classification）快速构建轻量级、高精度、可落地的交通标志分类模型。该模型可以广泛应用于自动驾驶、道路监控等场景。

-下表列出了不同交通标志分类模型的相关指标，前两行展现了使用 SwinTranformer_tiny 和 MobileNetV3_large_x1_0 作为 backbone 训练得到的模型的相关指标，第三行至第六行依次展现了替换 backbone 为 PPLCNet_x1_0、使用 SSLD 预训练模型、使用 SSLD 预训练模型 + EDA 策略、使用 SSLD 预训练模型 + EDA 策略 + SKL-UGI 知识蒸馏策略训练得到的模型的相关指标。
+下表列出了不同交通标志分类模型的相关指标，前两行展现了使用 SwinTranformer_tiny 和 MobileNetV3_small_x0_35 作为 backbone 训练得到的模型的相关指标，第三行至第六行依次展现了替换 backbone 为 PPLCNet_x1_0、使用 SSLD 预训练模型、使用 SSLD 预训练模型 + EDA 策略、使用 SSLD 预训练模型 + EDA 策略 + SKL-UGI 知识蒸馏策略训练得到的模型的相关指标。


 | 模型 | Top-1 Acc（%） | 延时（ms） | 存储（M） | 策略 |
 |-------|-----------|----------|---------------|---------------|
 | SwinTranformer_tiny  | 98.11 | 89.45  | 111 | 使用ImageNet预训练模型 |
-| MobileNetV3_large_x1_0  | 97.79 | 4.81  | 23 | 使用ImageNet预训练模型 |
+| MobileNetV3_small_x0_35  | 93.88 | 3.01  | 3.9 | 使用ImageNet预训练模型 |
 | PPLCNet_x1_0  | 97.78 | 2.10  | 8.2 | 使用ImageNet预训练模型 |
 | PPLCNet_x1_0  | 97.84 | 2.10  | 8.2 | 使用SSLD预训练模型 |
 | PPLCNet_x1_0  | 98.14 | 2.10  | 8.2 | 使用SSLD预训练模型+EDA策略|
 | <b>PPLCNet_x1_0<b>  | <b>98.35<b> | <b>2.10<b>  | <b>8.2<b> | 使用SSLD预训练模型+EDA策略+SKL-UGI知识蒸馏策略|

-从表中可以看出，backbone 为 SwinTranformer_tiny 时精度较高，但是推理速度较慢。将 backbone 替换为轻量级模型 MobileNetV3_large_x1_0 后，速度可以大幅提升，但是精度下降明显。将 backbone 替换为 PPLCNet_x1_0 时，精度低0.01%，但是速度提升 1 倍左右。在此基础上，使用 SSLD 预训练模型后，在不改变推理速度的前提下，精度可以提升约 0.06%，进一步地，当融合EDA策略后，精度可以再提升 0.3%，最后，在使用 SKL-UGI 知识蒸馏后，精度可以继续提升 0.21%。此时，PPLCNet_x1_0 的精度超越了SwinTranformer_tiny，速度快 41 倍。关于 PULC 的训练方法和推理部署方法将在下面详细介绍。
+从表中可以看出，backbone 为 SwinTranformer_tiny 时精度较高，但是推理速度较慢。将 backbone 替换为轻量级模型 MobileNetV3_small_x0_35 后，速度可以大幅提升，但是精度下降明显。将 backbone 替换为 PPLCNet_x1_0 时，精度提升 3.9%，同时速度也提升 43% 左右。在此基础上，使用 SSLD 预训练模型后，在不改变推理速度的前提下，精度可以提升约 0.06%，进一步地，当融合EDA策略后，精度可以再提升 0.3%，最后，在使用 SKL-UGI 知识蒸馏后，精度可以继续提升 0.21%。此时，PPLCNet_x1_0 的精度超越了 SwinTranformer_tiny，速度快 41 倍。关于 PULC 的训练方法和推理部署方法将在下面详细介绍。

 **备注：**

@@ -62,8 +62,48 @@

 ## 2. 模型快速体验

-    （pip方式，待补充）
+<a name="2.1"></a>  

+### 2.1 安装 paddleclas
+
+使用如下命令快速安装 paddlepaddle, paddleclas
+
+```bash
+pip3 install paddlepaddle paddleclas
+```
+<a name="2.2"></a>
+
+### 2.2 预测
+
+* 使用命令行快速预测
+
+```bash
+paddleclas --model_name traffic_sign  --infer_imgs PaddleClas/deploy/images/PULC/traffic_sign/100999_83928.jpg
+```
+
+结果如下：
+```
+>>> result
+class_ids: [182, 179, 162, 128, 24], scores: [0.98623, 0.01255, 0.00022, 0.00021, 0.00012], label_names: ['pl110', 'pl100', 'pl120', 'p26', 'pm10'], filename: PaddleClas/deploy/images/PULC/traffic_sign/100999_83928.jpg
+```
+
+**备注**： 更换其他预测的数据时，只需要改变 `--infer_imgs=xx` 中的字段即可，支持传入整个文件夹。
+
+
+* 在 Python 代码中预测
+```python
+import paddleclas
+model = paddleclas.PaddleClas(model_name="traffic_sign")
+result = model.predict(input_data="PaddleClas/deploy/images/PULC/traffic_sign/100999_83928.jpg")
+print(next(result))
+```
+
+**备注**：`model.predict()` 为可迭代对象（`generator`），因此需要使用 `next()` 函数或 `for` 循环对其迭代调用。每次调用将以 `batch_size` 为单位进行一次预测，并返回预测结果, 默认 `batch_size` 为 1，如果需要更改 `batch_size`，实例化模型时，需要指定 `batch_size`，如 `model = paddleclas.PaddleClas(model_name="traffic_sign",  batch_size=2)`, 使用默认的代码返回结果示例如下：
+
+```
+result
+[{'class_ids': [182, 179, 162, 128, 24], 'scores': [0.98623, 0.01255, 0.00022, 0.00021, 0.00012], 'label_names': ['pl110', 'pl100', 'pl120', 'p26', 'pm10'], 'filename': 'PaddleClas/deploy/images/PULC/traffic_sign/100999_83928.jpg'}]
+```

 <a name="3"></a>


--- a/docs/zh_CN/PULC/PULC_train.md
+++ b/docs/zh_CN/PULC/PULC_train.md
+## 超轻量图像分类方案PULC
+### 0. PULC方案简介
+图像分类是计算机视觉的基础算法之一，是企业应用中最常见的算法，也是许多CV应用的重要组成部分。
+近年来，骨干网络模型发展迅速，Imagenet的精度纪录被不断刷新。然而，这些模型在实用场景的表现有时却不尽如人意。
+一方面，精度高的模型往往体积大，运算慢，常常难以满足实际部署需求；另一方面，选择了合适的模型之后，往往还需要经验丰富的工程师进行调参，
+费时费力。PaddleClas为了解决企业应用难题，让分类模型的训练和调参更加容易，总结推出了实用轻量图像分类解决方案PULC。
+PULC融合了骨干网络、数据增广、蒸馏等多种前沿算法，可以自动训练得到轻量且高精度的图像分类模型。
+方案在人、车、OCR等方向的多个场景中均验证有效，用超轻量模型就可实现与SwinTransformer模型接近的精度，预测速度提高50倍。
+<div align="center">
+<img src="https://user-images.githubusercontent.com/19523330/172054976-e12d2c9b-439f-469d-b520-56bb5c3e6215.png"/>
+</div>
+
+方案主要包括4部分，分别是：PP-LCNet轻量级骨干网络、SSLD预训练权重、数据增强策略集成和SKL-UGI知识蒸馏算法。此外，我们还采用了超参搜索的方法，高效优化训练中的超参数。
+下面，我们以有人/无人场景为例，对方案进行说明。
+
+**注**：针对一些特定场景，我们提供了基础的训练文档供参考，例如[有人/无人分类模型](PULC_person_exists.md)等，您可以在[这里]()找到这些文档。
+如果这些文档中的方法不能满足您的需求，或者您需要自定义训练任务，您可以参考本文档。
+
+### 1. 数据准备
+#### 1.1 数据集格式说明
+
+PaddleClas 使用 `txt` 格式文件指定训练集和测试集，以有人无人场景为例，其中 `train_list.txt` 和 `val_list.txt` 的格式形如：
+
+```shell
+# 每一行采用"空格"分隔图像路径与标注
+train/1.jpg 0
+train/10.jpg 1
+...
+```
+如果您想获取更多常用分类数据集的信息，可以参考文档[常见分类说明](../data_preparation/classification_dataset.md)。
+
+// todo@cuicheng v2.4.1 1.2有人无人场景数据获取代码。整理obj365数据提取的数据并说明。
+
+
+#### 1.2 标注文件生成
+如果您已经有实际场景中的数据，那么按照上节的格式进行标注即可。这里，我们提供了一个快速生成数据的脚本，您只需要将不同类别的数据分别放在文件夹中，运行脚本即可生成标注文件。 
+// todo 数据脚本。 
+
+### 2. 使用标准分类配置进行训练
+#### 2.1 骨干网络PP-LCNet
+PULC采用了轻量骨干网络PP-LCNet，相比同精度竞品速度快50%，您可以在[这里](../models/PP-LCNet.md)找到详细介绍。
+直接使用PP-LCNet训练的命令为：
+
+**todo**
+
+为了方便性能对比，我们也提供了大模型SwinTransformer和轻量模型MobileNet的配置文件，您可以使用命令训练：
+
+**todo**
+
+训练得到的模型精度对比如下表。从中可以看出，LCNet的速度比SwinTransformer快很多，但是精度也略低。
+下面我们通过一系列优化来提高PP-LCNet模型的精度。
+
+#### 2.2 SSLD预训练权重
+SSLD是百度自研的半监督蒸馏算法，在ImageNet数据集上，模型精度可以提升3-7个点，您可以在[这里](../algorithm_introduction/#2)找到详细介绍。
+我们发现，使用SSLD预训练权重，可以提升应用分类模型的精度。此外，使用SSLD预训练权重也有助于其他策略精度提升。
+此外，根据**todo**，在训练中使用略低一点的分辨率，可以有效提升模型精度。同时，我们也对学习率进行了优化。
+基于以上三点改进，我们训练得到模型精度为**todo**，提升**todo**。
+
+#### 2.3 EDA数据增广策略
+数据增广是视觉算法中常用的优化策略，可以对模型精度有明显提升。除了传统的RandomCrop，RandomFlip等方法之外，我们还应用了RandAugment和RandomErasing。
+您可以在[这里](../advanced_tutorials/DataAugmentation.md)找到详细介绍。
+由于这两种数据增强对图片的修改较大，使任务变难，在一些小数据集上可能会导致模型欠拟合，我们将这两种方法启用的概率设为10%。
+基于以上改进，我们训练得到模型精度为**todo**，提升**todo**。
+
+#### 2.4 SKL-UGI模型蒸馏
+模型蒸馏是一种可以有效提升小模型精度的方法，您可以在[这里](todo@ruoyu)找到详细介绍。
+我们选择ResNet101作为教师模型进行蒸馏。
+**todo @cuicheng，对lr_mult进行说明**
+基于以上改进，我们训练得到模型精度为**todo**，提升**todo**。
+#### 2.5 总结
+经过以上方法优化，PP-LCNet最终精度达到**todo**，达到了大模型的精度水平。我们将实验结果总结如下表：
+**todo**
+我们在其他9个场景中也使用了同样的优化策略，得到如下结果：
+**todo**
+
+从结果可以看出，PULC优化方法在多个应用场景中均可提升模型精度。虽然并非每种方法都有正向收益，但是使用PULC可以大大减少模型优化的工作量，快速得到精度较高的模型。
+
+### 3. 超参搜索
+在上述训练过程中，我们调节了学习率、数据增广方法开启概率、分阶段学习率倍数等参数。
+这些参数在不同场景中最优值可能并不相同。我们提供了一个快速超参搜索的脚本，将超参调优的过程自动化。
+这个脚本会遍历搜索值列表中的参数来替代默认配置中的参数，依次训练，最终选择精度最高的模型所对应的参数作为搜索结果。
+
+#### 3.1 基于默认配置搜索
+配置文件[search.yaml](todo)定义了有人/无人场景超参搜索的配置，使用命令**todo**，可以使用默认的超参数搜索配置进行训练，最终可得训练结果为：
+**todo**
+#### 3.2 自定义搜索配置
+您也可以根据训练结果或调参经验，修改超参搜索的配置。
+修改**todo**字段，可以修改学习率搜索值列表；
+
+修改**todo**字段，可以修改RandAugment开启概率的搜索值列表；
+
+修改**todo**字段，可以修改RandomErasing开启概率的搜索值列表；
+
+修改**todo**字段，可以修改lr_mult搜索值列表；
+
+修改**todo**字段，可以修改教师模型的搜索列表。
--- a/docs/zh_CN/PULC/PULC_vehicle_attr.md
+++ b/docs/zh_CN/PULC/PULC_vehicle_attr.md
@@ -39,20 +39,20 @@

 该案例提供了用户使用 PaddleClas 的超轻量图像分类方案（PULC，Practical Ultra Lightweight Classification）快速构建轻量级、高精度、可落地的车辆属性识别模型。该模型可以广泛应用于车辆识别、道路监控等场景。

-下表列出了不同车辆属性识别模型的相关指标，前两行展现了使用 Res2Net200_vd_26w_4s 和 MobileNetV3_large_x1_0 作为 backbone 训练得到的模型的相关指标，第三行至第六行依次展现了替换 backbone 为 PPLCNet_x1_0、使用 SSLD 预训练模型、使用 SSLD 预训练模型 + EDA 策略、使用 SSLD 预训练模型 + EDA 策略 + SKL-UGI 知识蒸馏策略训练得到的模型的相关指标。
+下表列出了不同车辆属性识别模型的相关指标，前两行展现了使用 Res2Net200_vd_26w_4s 和 MobileNetV3_small_x0_35 作为 backbone 训练得到的模型的相关指标，第三行至第六行依次展现了替换 backbone 为 PPLCNet_x1_0、使用 SSLD 预训练模型、使用 SSLD 预训练模型 + EDA 策略、使用 SSLD 预训练模型 + EDA 策略 + SKL-UGI 知识蒸馏策略训练得到的模型的相关指标。


 | 模型 | ma（%） | 延时（ms） | 存储（M） | 策略 |
 |-------|-----------|----------|---------------|---------------|
 | Res2Net200_vd_26w_4s  | 91.36 | 79.46  | 293 | 使用ImageNet预训练模型 |
 | ResNet50  | 89.98 | 12.83  | 92 | 使用ImageNet预训练模型 |
-| MobileNetV3_large_x1_0  | 89.77 | 5.09  | 23 | 使用ImageNet预训练模型 |
+| MobileNetV3_small_x0_35  | 87.41 | 2.91  | 2.8 | 使用ImageNet预训练模型 |
 | PPLCNet_x1_0  | 89.57 | 2.36  | 8.2 | 使用ImageNet预训练模型 |
 | PPLCNet_x1_0  | 90.07 | 2.36  | 8.2 | 使用SSLD预训练模型 |
 | PPLCNet_x1_0  | 90.59 | 2.36  | 8.2 | 使用SSLD预训练模型+EDA策略|
 | <b>PPLCNet_x1_0<b>  | <b>90.81<b> | <b>2.36<b>  | <b>8.2<b> | 使用SSLD预训练模型+EDA策略+SKL-UGI知识蒸馏策略|

-从表中可以看出，backbone 为 Res2Net200_vd_26w_4s 时精度较高，但是推理速度较慢。将 backbone 替换为轻量级模型 MobileNetV3_large_x1_0 后，速度可以大幅提升，但是精度下降明显。将 backbone 替换为 PPLCNet_x1_0 时，精度低0.2%，但是速度提升 1 倍左右。在此基础上，使用 SSLD 预训练模型后，在不改变推理速度的前提下，精度可以提升约 0.5%，进一步地，当融合EDA策略后，精度可以再提升 0.52%，最后，在使用 SKL-UGI 知识蒸馏后，精度可以继续提升 0.23%。此时，PPLCNet_x1_0 的精度与 Res2Net200_vd_26w_4s 仅相差0.55%，但是速度快32倍。关于 PULC 的训练方法和推理部署方法将在下面详细介绍。
+从表中可以看出，backbone 为 Res2Net200_vd_26w_4s 时精度较高，但是推理速度较慢。将 backbone 替换为轻量级模型 MobileNetV3_small_x0_35 后，速度可以大幅提升，但是精度下降明显。将 backbone 替换为 PPLCNet_x1_0 时，精度提升 2.16%，同时速度也提升 23% 左右。在此基础上，使用 SSLD 预训练模型后，在不改变推理速度的前提下，精度可以提升约 0.5%，进一步地，当融合EDA策略后，精度可以再提升 0.52%，最后，在使用 SKL-UGI 知识蒸馏后，精度可以继续提升 0.23%。此时，PPLCNet_x1_0 的精度与 Res2Net200_vd_26w_4s 仅相差0.55%，但是速度快32倍。关于 PULC 的训练方法和推理部署方法将在下面详细介绍。

 **备注：**

@@ -63,8 +63,48 @@

 ## 2. 模型快速体验

+<a name="2.1"></a>  
+
+### 2.1 安装 paddleclas
+
+使用如下命令快速安装 paddlepaddle, paddleclas
+
+```bash
+pip3 install paddlepaddle paddleclas
+```
+<a name="2.2"></a>
+
+### 2.2 预测
+
+* 使用命令行快速预测
+
+```bash
+paddleclas --model_name vehicle_attribute --infer_imgs PaddleClas/deploy/images/PULC/vehicle_attribute/0002_c002_00030670_0.jpg
+```
+
+结果如下：
+```
+>>> result
+attributes: Color: (yellow, prob: 0.9893476963043213), Type: (hatchback, prob: 0.9734097719192505), output: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], filename: PaddleClas/deploy/images/PULC/vehicle_attribute/0002_c002_00030670_0.jpg
+ppcls INFO: Predict complete!
+```
+
+**备注**： 更换其他预测的数据时，只需要改变 `--infer_imgs=xx` 中的字段即可，支持传入整个文件夹。
+
+
+* 在 Python 代码中预测
+```python
+import paddleclas
+model = paddleclas.PaddleClas(model_name="vehicle_attribute")
+result = model.predict(input_data="PaddleClas/deploy/images/PULC/vehicle_attribute/0002_c002_00030670_0.jpg")
+print(next(result))
+```
+
+**备注**：`model.predict()` 为可迭代对象（`generator`），因此需要使用 `next()` 函数或 `for` 循环对其迭代调用。每次调用将以 `batch_size` 为单位进行一次预测，并返回预测结果, 默认 `batch_size` 为 1，如果需要更改 `batch_size`，实例化模型时，需要指定 `batch_size`，如 `model = paddleclas.PaddleClas(model_name="vehicle_attribute",  batch_size=2)`, 使用默认的代码返回结果示例如下：
+
 ```
-（pip方式，待补充）
+result
+[{'attributes': 'Color: (yellow, prob: 0.9893476963043213), Type: (hatchback, prob: 0.9734097719192505)', 'output': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 'filename': 'PaddleClas/deploy/images/PULC/vehicle_attribute/0002_c002_00030670_0.jpg'}]
 ```

 <a name="3"></a>
@@ -94,7 +134,7 @@
 部分数据可视化如下所示。

 <div align="center">
-<img src="../../images/PULC/docs/vehicle_attr_data_demo.png"  width = "500" />
+<img src="../../images/PULC/docs/vehicle_attribute_data_demo.png"  width = "500" />
 </div>

 首先从[VeRi数据集官网](https://www.v7labs.com/open-datasets/veri-dataset)中申请并下载数据，放在PaddleClas的`dataset`目录下，数据集目录名为`VeRi`，使用下面的命令进入该文件夹。
@@ -172,17 +212,17 @@ VeRi
 ### 3.3 模型训练


-在 `ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0.yaml` 中提供了基于该场景的训练配置，可以通过如下脚本启动训练：
+在 `ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml` 中提供了基于该场景的训练配置，可以通过如下脚本启动训练：

 ```shell
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 python3 -m paddle.distributed.launch \
    --gpus="0,1,2,3" \
    tools/train.py \
-        -c ./ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0.yaml
+        -c ./ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml
 ```

-验证集的最佳指标在 `90.07%` 左右（数据集较小，一般有0.3%左右的波动）。
+验证集的最佳指标在 `90.59%` 左右（数据集较小，一般有0.3%左右的波动）。


 <a name="3.4"></a>
@@ -193,7 +233,7 @@ python3 -m paddle.distributed.launch \

 ```bash
 python3 tools/eval.py \
-    -c ./ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0.yaml \
+    -c ./ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml \
    -o Global.pretrained_model="output/PPLCNet_x1_0/best_model"
 ```

@@ -207,21 +247,21 @@ python3 tools/eval.py \

 ```bash
 python3 tools/infer.py \
-    -c ./ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0.yaml \
+    -c ./ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml \
    -o Global.pretrained_model=output/DistillationModel/best_model
 ```

 输出结果如下：

 ```
-[{'attr': 'Color: (yellow, prob: 0.9893478155136108), Type: (hatchback, prob: 0.9734100103378296)', 'pred': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 'file_name': './deploy/images/PULC/vehicle_attr/0002_c002_00030670_0.jpg'}]
+[{'attr': 'Color: (yellow, prob: 0.9893478155136108), Type: (hatchback, prob: 0.9734100103378296)', 'pred': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 'file_name': './deploy/images/PULC/vehicle_attribute/0002_c002_00030670_0.jpg'}]
 ```

 **备注：**

 * 这里`-o Global.pretrained_model="output/PPLCNet_x1_0/best_model"` 指定了当前最佳权重所在的路径，如果指定其他权重，只需替换对应的路径即可。

-* 默认是对 `./deploy/images/PULC/vehicle_attr/0002_c002_00030670_0.jpg` 进行预测，此处也可以通过增加字段 `-o Infer.infer_imgs=xxx` 对其他图片预测。
+* 默认是对 `./deploy/images/PULC/vehicle_attribute/0002_c002_00030670_0.jpg` 进行预测，此处也可以通过增加字段 `-o Infer.infer_imgs=xxx` 对其他图片预测。

 <a name="4"></a>

@@ -237,14 +277,14 @@ SKL-UGI 知识蒸馏是 PaddleClas 提出的一种简单有效的知识蒸馏方

 #### 4.1.1 教师模型训练

-复用 `ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0.yaml` 中的超参数，训练教师模型，训练脚本如下：
+复用 `ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml` 中的超参数，训练教师模型，训练脚本如下：

 ```shell
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 python3 -m paddle.distributed.launch \
    --gpus="0,1,2,3" \
    tools/train.py \
-        -c ./ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0.yaml \
+        -c ./ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml \
        -o Arch.name=ResNet101_vd
 ```

@@ -254,14 +294,14 @@ python3 -m paddle.distributed.launch \

 ####  4.1.2 蒸馏训练

-配置文件`ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0_distillation.yaml`提供了`SKL-UGI知识蒸馏策略`的配置。该配置将`ResNet101_vd`当作教师模型，`PPLCNet_x1_0`当作学生模型。训练脚本如下：
+配置文件`ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0_distillation.yaml`提供了`SKL-UGI知识蒸馏策略`的配置。该配置将`ResNet101_vd`当作教师模型，`PPLCNet_x1_0`当作学生模型。训练脚本如下：

 ```shell
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 python3 -m paddle.distributed.launch \
    --gpus="0,1,2,3" \
    tools/train.py \
-        -c ./ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0_distillation.yaml \
+        -c ./ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0_distillation.yaml \
        -o Arch.models.0.Teacher.pretrained=output/ResNet101_vd/best_model
 ```

@@ -296,14 +336,14 @@ Paddle Inference 是飞桨的原生推理库， 作用于服务器端和云端

 ```bash
 python3 tools/export_model.py \
-    -c ./ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0.yaml \
+    -c ./ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml \
    -o Global.pretrained_model=output/DistillationModel/best_model_student \
-    -o Global.save_inference_dir=deploy/models/PPLCNet_x1_0_vehicle_attr_infer
+    -o Global.save_inference_dir=deploy/models/PPLCNet_x1_0_vehicle_attribute_infer
 ```
-执行完该脚本后会在 `deploy/models/` 下生成 `PPLCNet_x1_0_vehicle_attr_infer` 文件夹，`models` 文件夹下应有如下文件结构：
+执行完该脚本后会在 `deploy/models/` 下生成 `PPLCNet_x1_0_vehicle_attribute_infer` 文件夹，`models` 文件夹下应有如下文件结构：

 ```
-├── PPLCNet_x1_0_vehicle_attr_infer
+├── PPLCNet_x1_0_vehicle_attribute_infer
 │   ├── inference.pdiparams
 │   ├── inference.pdiparams.info
 │   └── inference.pdmodel
@@ -320,13 +360,13 @@ python3 tools/export_model.py \
 ```
 cd deploy/models
 # 下载 inference 模型并解压
-wget https://paddleclas.bj.bcebos.com/models/PULC/vehicle_attr_infer.tar && tar -xf vehicle_attr_infer.tar
+wget https://paddleclas.bj.bcebos.com/models/PULC/vehicle_attribute_infer.tar && tar -xf vehicle_attribute_infer.tar
 ```

 解压完毕后，`models` 文件夹下应有如下文件结构：

 ```
-├── vehicle_attr_infer
+├── vehicle_attribute_infer
 │   ├── inference.pdiparams
 │   ├── inference.pdiparams.info
 │   └── inference.pdmodel
@@ -347,13 +387,13 @@ wget https://paddleclas.bj.bcebos.com/models/PULC/vehicle_attr_infer.tar && tar
 cd ../
 ```

-运行下面的命令，对图像 `./images/PULC/vehicle_attr/0002_c002_00030670_0.jpg` 进行车辆属性识别。
+运行下面的命令，对图像 `./images/PULC/vehicle_attribute/0002_c002_00030670_0.jpg` 进行车辆属性识别。

 ```shell
 # 使用下面的命令使用 GPU 进行预测
-python3.7 python/predict_cls.py -c configs/PULC/vehicle_attr/inference_vehicle_attr.yaml -o Global.use_gpu=True
+python3.7 python/predict_cls.py -c configs/PULC/vehicle_attribute/inference_vehicle_attribute.yaml -o Global.use_gpu=True
 # 使用下面的命令使用 CPU 进行预测
-python3.7 python/predict_cls.py -c configs/PULC/vehicle_attr/inference_vehicle_attr.yaml -o Global.use_gpu=False
+python3.7 python/predict_cls.py -c configs/PULC/vehicle_attribute/inference_vehicle_attribute.yaml -o Global.use_gpu=False
 ```

 输出结果如下。
@@ -371,7 +411,7 @@ predict output: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]

 ```shell
 # 使用下面的命令使用 GPU 进行预测，如果希望使用 CPU 预测，可以在命令后面添加 -o Global.use_gpu=False
-python3.7 python/predict_cls.py -c configs/PULC/vehicle_attr/inference_vehicle_attr.yaml -o Global.infer_imgs="./images/PULC/vehicle_attr/"
+python3.7 python/predict_cls.py -c configs/PULC/vehicle_attribute/inference_vehicle_attribute.yaml -o Global.infer_imgs="./images/PULC/vehicle_attribute/"
 ```

 终端中会输出该文件夹内所有图像的属性识别结果，如下所示。

--- a/docs/zh_CN/advanced_tutorials/knowledge_distillation.md
+++ b/docs/zh_CN/advanced_tutorials/knowledge_distillation.md
@@ -11,8 +11,9 @@
    - [1.2 PaddleClas支持的知识蒸馏算法](#1.2)
        - [1.2.1 SSLD](#1.2.1)
        - [1.2.2 DML](#1.2.2)
-        - [1.2.3 AFD](#1.2.3)
-        - [1.2.4 DKD](#1.2.4)
+        - [1.2.3 UDML](#1.2.3)
+        - [1.2.4 AFD](#1.2.4)
+        - [1.2.5 DKD](#1.2.5)
 - [2. 使用方法](#2)
    - [2.1 环境配置](#2.1)
    - [2.2 数据准备](#2.2)
@@ -196,9 +197,80 @@ Loss:

 <a name='1.2.3'></a>

-#### 1.2.3 AFD
+#### 1.2.3 UDML

-##### 1.2.3.1 AFD 算法介绍
+##### 1.2.3.1 UDML 算法介绍
+
+论文信息：
+
+UDML 是百度飞桨视觉团队提出的无需依赖教师模型的知识蒸馏算法，它基于DML进行改进，在蒸馏的过程中，除了考虑两个模型的输出信息，也考虑两个模型的中间层特征信息，从而进一步提升知识蒸馏的精度。更多关于UDML的说明与应用，请参考[PP-ShiTu论文](https://arxiv.org/abs/2111.00775)以及[PP-OCRv2论文](https://arxiv.org/abs/2109.03144)。
+
+
+
+在ImageNet1k公开数据集上，效果如下所示。
+
+| 策略 | 骨干网络 | 配置文件 | Top-1 acc | 下载链接 |
+| --- | --- | --- | --- | --- |
+| baseline | PPLCNet_x2_5 | [PPLCNet_x2_5.yaml](../../../ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_5.yaml) | 74.93% | - |
+| UDML | PPLCNet_x2_5 | [PPLCNet_x2_5_udml.yaml](../../../ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_udml.yaml) | 76.74%(**+1.81%**) | - |
+
+
+##### 1.2.3.2 UDML 配置
+
+
+```yaml
+Arch:
+  name: "DistillationModel"
+  class_num: &class_num 1000
+  # if not null, its lengths should be same as models
+  pretrained_list:
+  # if not null, its lengths should be same as models
+  freeze_params_list:
+  - False
+  - False
+  models:
+    - Teacher:
+        name: PPLCNet_x2_5
+        class_num: *class_num
+        pretrained: False
+        # return_patterns表示除了返回输出的logits，也会返回对应名称的中间层feature map
+        return_patterns: ["blocks3", "blocks4", "blocks5", "blocks6"]
+    - Student:
+        name: PPLCNet_x2_5
+        class_num: *class_num
+        pretrained: False
+        return_patterns: ["blocks3", "blocks4", "blocks5", "blocks6"]
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - DistillationGTCELoss:
+       weight: 1.0
+       key: logits
+       model_names: ["Student", "Teacher"]
+    - DistillationDMLLoss:
+        weight: 1.0
+        key: logits
+        model_name_pairs:
+        - ["Student", "Teacher"]
+    - DistillationDistanceLoss:  # 基于蒸馏结果的距离loss，这里默认使用l2 loss计算blocks5之间的损失函数
+        weight: 1.0
+        key: "blocks5"
+        model_name_pairs:
+        - ["Student", "Teacher"]
+  Eval:
+    - CELoss:
+        weight: 1.0
+```
+
+**注意：** 上述在网络中指定`return_patterns`，返回中间层特征的功能是基于TheseusLayer，更多关于TheseusLayer的使用说明，请参考：[TheseusLayer 使用说明](./theseus_layer.md)。
+
+
+<a name='1.2.4'></a>
+
+#### 1.2.4 AFD
+
+##### 1.2.4.1 AFD 算法介绍

 论文信息：

@@ -220,7 +292,7 @@ AFD提出在蒸馏的过程中，利用基于注意力的元网络学习特征

 注意：这里为了与论文的训练配置保持对齐，设置训练的迭代轮数为100epoch，因此baseline精度低于PaddleClas中开源出的模型精度（71.0%）

-##### 1.2.3.2 AFD 配置
+##### 1.2.4.2 AFD 配置

 AFD配置如下所示。在模型构建Arch字段中，需要同时定义学生模型与教师模型，固定教师模型的权重。这里需要对从教师模型获取的特征进行变换，进而与学生模型进行损失函数的计算。在损失函数Loss字段中，需要定义`DistillationKLDivLoss`（学生与教师之间的KL-Div loss）、`AFDLoss`（学生与教师之间的AFD loss）以及`DistillationGTCELoss`（学生与教师关于真值标签的CE loss），作为训练的损失函数。

@@ -305,11 +377,11 @@ Loss:

 **注意(：** 上述在网络中指定`return_patterns`，返回中间层特征的功能是基于TheseusLayer，更多关于TheseusLayer的使用说明，请参考：[TheseusLayer 使用说明](./theseus_layer.md)。

-<a name='1.2.4'></a>
+<a name='1.2.5'></a>

-#### 1.2.4 DKD
+#### 1.2.5 DKD

-##### 1.2.4.1 DKD 算法介绍
+##### 1.2.5.1 DKD 算法介绍

 论文信息：

@@ -330,7 +402,7 @@ DKD将蒸馏中常用的 KD Loss 进行了解耦成为Target Class Knowledge Dis
 | AFD | ResNet18 | [resnet34_distill_resnet18_dkd.yaml](../../../ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_dkd.yaml) | 72.59%(**+1.79%**) | - |


-##### 1.2.4.2 DKD 配置
+##### 1.2.5.2 DKD 配置

 DKD 配置如下所示。在模型构建Arch字段中，需要同时定义学生模型与教师模型，教师模型固定参数，且需要加载预训练模型。在损失函数Loss字段中，需要定义`DistillationDKDLoss`（学生与教师之间的DKD loss）以及`DistillationGTCELoss`（学生与教师关于真值标签的CE loss），作为训练的损失函数。


--- a/docs/zh_CN/image_recognition_pipeline/feature_extraction.md
+++ b/docs/zh_CN/image_recognition_pipeline/feature_extraction.md
--- a/docs/zh_CN/image_recognition_pipeline/mainbody_detection.md
+++ b/docs/zh_CN/image_recognition_pipeline/mainbody_detection.md
@@ -19,9 +19,13 @@
  - [3.3 配置文件改动和说明](#3.3)
  - [3.4 启动训练](#3.4)
  - [3.5 模型预测与调试](#3.5)
-  - [3.6 模型导出与预测部署](#3.6)
+- [4. 模型推理部署](#4)
+  - [4.1 推理模型准备](#4.1)
+  - [4.2 基于python预测引擎推理](#4.2)
+  - [4.3 其他推理方式](#4.3)

-<a name="1"></a> 
+
+<a name="1"></a>

 ## 1. 数据集

@@ -37,7 +41,7 @@

 在实际训练的过程中，将所有数据集混合在一起。由于是主体检测，这里将所有标注出的检测框对应的类别都修改为 `前景` 的类别，最终融合的数据集中只包含 1 个类别，即前景。

-<a name="2"></a> 
+<a name="2"></a>

 ## 2. 模型选择

@@ -55,7 +59,7 @@
  * 速度评测机器的 CPU 具体信息为：`Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz`，速度指标为开启 mkldnn，线程数设置为 10 测试得到。
  * 主体检测的预处理过程较为耗时，平均每张图在上述机器上的时间在 40~55 ms 左右，没有包含在上述的预测耗时统计中。

-<a name="2.1"></a> 
+<a name="2.1"></a>

 ### 2.1 轻量级主体检测模型

@@ -72,7 +76,7 @@ PicoDet 由 [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection) 

 在轻量级主体检测任务中，为了更好地兼顾检测速度与效果，我们使用 PPLCNet_x2_5 作为主体检测模型的骨干网络，同时将训练与预测的图像尺度修改为了 640x640，其余配置与 [picodet_lcnet_1_5x_416_coco.yml](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/configs/picodet/more_config/picodet_lcnet_1_5x_416_coco.yml) 完全一致。将数据集更换为自定义的主体检测数据集，进行训练，最终得到检测模型。

-<a name="2.2"></a> 
+<a name="2.2"></a>

 ### 2.2 服务端主体检测模型

@@ -93,13 +97,13 @@ PP-YOLO 由 [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection) 

 在服务端主体检测任务中，为了保证检测效果，我们使用 ResNet50vd-DCN 作为检测模型的骨干网络，使用配置文件 [ppyolov2_r50vd_dcn_365e_coco.yml](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml)，更换为自定义的主体检测数据集，进行训练，最终得到检测模型。

-<a name="3"></a> 
+<a name="3"></a>

 ## 3. 模型训练

 本节主要介绍怎样基于 PaddleDetection，基于自己的数据集，训练主体检测模型。

-<a name="3.1"></a> 
+<a name="3.1"></a>

 ### 3.1 环境准备

@@ -116,7 +120,7 @@ pip install -r requirements.txt

 更多安装教程，请参考: [安装文档](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/docs/tutorials/INSTALL_cn.md)

-<a name="3.2"></a> 
+<a name="3.2"></a>

 ### 3.2 数据准备

@@ -128,7 +132,7 @@ pip install -r requirements.txt
 [{u'id': 1, u'name': u'foreground', u'supercategory': u'foreground'}]
 ```

-<a name="3.3"></a> 
+<a name="3.3"></a>

 ### 3.3 配置文件改动和说明

@@ -154,7 +158,7 @@ ppyolov2_reader.yml：主要说明数据读取器配置，如 batch size，并

 此外，也可以根据实际情况，修改上述文件，比如，如果显存溢出，可以将 batch size 和学习率等比缩小等。

-<a name="3.4"></a> 
+<a name="3.4"></a>

 ### 3.4 启动训练

@@ -198,7 +202,7 @@ python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/ppy

 注意：如果遇到 "`Out of memory error`" 问题, 尝试在 `ppyolov2_reader.yml` 文件中调小 `batch_size`，同时等比例调小学习率。

-<a name="3.5"></a> 
+<a name="3.5"></a>

 ### 3.5 模型预测与调试

@@ -211,9 +215,11 @@ python tools/infer.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml --infer

 `--draw_threshold` 是个可选参数. 根据 [NMS](https://ieeexplore.ieee.org/document/1699659) 的计算，不同阈值会产生不同的结果 `keep_top_k` 表示设置输出目标的最大数量，默认值为 100，用户可以根据自己的实际情况进行设定。

-<a name="3.6"></a> 
+<a name="4"></a>
+## 4. 模型推理部署

-### 3.6 模型导出与预测部署。
+<a name="4.1"></a>
+### 4.1 推理模型准备

 执行导出模型脚本：

@@ -225,15 +231,21 @@ python tools/export_model.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml

 注意： `PaddleDetection` 导出的 inference 模型的文件格式为 `model.xxx`，这里如果希望与 PaddleClas 的 inference 模型文件格式保持一致，需要将其 `model.xxx` 文件修改为 `inference.xxx` 文件，用于后续主体检测的预测部署。

-更多模型导出教程，请参考： [EXPORT_MODEL](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/deploy/EXPORT_MODEL.md)
+更多模型导出教程，请参考： [EXPORT_MODEL](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/deploy/EXPORT_MODEL.md)

 最终，目录 `inference/ppyolov2_r50vd_dcn_365e_coco` 中包含 `inference.pdiparams`, `inference.pdiparams.info` 以及 `inference.pdmodel` 文件，其中 `inference.pdiparams` 为保存的 inference 模型权重文件，`inference.pdmodel` 为保存的 inference 模型结构文件。

+<a name="4.2"></a>
+### 4.2 基于python预测引擎推理

 导出模型之后，在主体检测与识别任务中，就可以将检测模型的路径更改为该 inference 模型路径，完成预测。

 以商品识别为例，其配置文件为 [inference_product.yaml](../../../deploy/configs/inference_product.yaml)，修改其中的 `Global.det_inference_model_dir` 字段为导出的主体检测 inference 模型目录，参考[图像识别快速开始教程](../quick_start/quick_start_recognition.md)，即可完成商品检测与识别过程。

+<a name="4.3"></a>
+### 4.3 其他推理方式
+其他推理方法，如C++推理部署、PaddleServing部署等请参考[检测模型推理部署](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/deploy/README.md)。
+

 ### FAQ


--- a/docs/zh_CN/image_recognition_pipeline/vector_search.md
+++ b/docs/zh_CN/image_recognition_pipeline/vector_search.md
 # 向量检索

+## 目录
+
+- [1. 向量检索应用场景介绍](#1)
+- [2. 向量检索算法介绍](#2)
+	- [2.1 HNSW](#2.1)
+	- [2.2 IVF](#2.2)
+	- [2.3 FLAT](#2.3)
+- [3. 检索库安装](#3)
+- [4. 使用及配置文档介绍](#4)
+	- [4.1 建库及配置文件参数](#4.1)
+	- [4.2 检索配置文件参数](#4.2)
+
+
+<a name="1"></a>
+## 1. 向量检索应用场景介绍
+
 向量检索技术在图像识别、图像检索中应用比较广泛。其主要目标是，对于给定的查询向量，在已经建立好的向量库中，与库中所有的待查询向量，进行特征向量的相似度或距离计算，得到相似度排序。在图像识别系统中，我们使用 [Faiss](https://github.com/facebookresearch/faiss) 对此部分进行支持，具体信息请详查 [Faiss 官网](https://github.com/facebookresearch/faiss)。`Faiss` 主要有以下优势

 - 适配性好：支持 Windos、Linux、MacOS 系统
@@ -20,17 +36,33 @@

 --------------------------

-## 目录
+<a name="2"></a>
+## 2. 向量检索算法介绍
+
+目前 `PaddleClas` 中检索模块，支持三种检索算法**HNSW32**、**IVF**、**FLAT**。每种检索算法，满足不同场景。其中 `HNSW32` 为默认方法，此方法的检索精度、检索速度可以取得一个较好的平衡，具体算法介绍可以查看[官方文档](https://github.com/facebookresearch/faiss/wiki)。
+
+<a name="2.1"></a>
+### 2.1 HNSW
+
+此方法为图索引方法，如下图所示，在建立索引的时候，分为不同的层，所以检索精度较高，速度较快，但是特征库只支持添加图像功能，不支持删除图像特征功能。基于图的向量检索算法在向量检索的评测中性能都是比较优异的。如果比较在乎检索算法的效率，而且可以容忍一定的空间成本，多数场景下比较推荐基于图的检索算法。而HNSW是一种典型的，应用广泛的图算法，很多分布式检索引擎都对HNSW算法进行了分布式改造，以应用于高并发，大数据量的线上查询。此方法为默认方法。
+<div align="center">
+<img src="../../images/algorithm_introduction/hnsw.png"  width = "400" />
+</div>
+
+<a name="2.2"></a>
+### 2.2 IVF

- [1. 检索库安装](#1)
- [2. 使用的检索算法](#2)
- [3. 使用及配置文档介绍](#3)
-  - [3.1 建库及配置文件参数](#3.1)
-  - [3.2 检索配置文件参数](#3.2)
+一种倒排索引检索方法。速度较快，但是精度略低。特征库支持增加、删除图像特征功能。IVF主要利用倒排的思想保存每个聚类中心下的向量，每次查询向量的时候找到最近的几个中心，分别搜索这几个中心下的向量。通过减小搜索范围，大大提升搜索效率。

-<a name="1"></a> 
+<a name="2.3"></a>
+### 2.3 FLAT

-## 1. 检索库安装
+暴力检索算法。精度最高，但是数据量大时，检索速度较慢。特征库支持增加、删除图像特征功能。
+
+
+<a name="3"></a>
+
+## 3. 检索库安装

 `Faiss` 具体安装方法如下：

@@ -40,27 +72,16 @@ pip install faiss-cpu==1.7.1post2

 若使用时，不能正常引用，则 `uninstall` 之后，重新 `install`，尤其是 `windows` 下。

-<a name="2"></a> 
-
-## 2. 使用的检索算法
-
-目前 `PaddleClas` 中检索模块，支持如下三种检索算法
-
- **HNSW32**: 一种图索引方法。检索精度较高，速度较快。但是特征库只支持添加图像功能，不支持删除图像特征功能。（默认方法）
- **IVF**：倒排索引检索方法。速度较快，但是精度略低。特征库支持增加、删除图像特征功能。
- **FLAT**： 暴力检索算法。精度最高，但是数据量大时，检索速度较慢。特征库支持增加、删除图像特征功能。
-
-每种检索算法，满足不同场景。其中 `HNSW32` 为默认方法，此方法的检索精度、检索速度可以取得一个较好的平衡，具体算法介绍可以查看[官方文档](https://github.com/facebookresearch/faiss/wiki)。

-<a name="3"></a> 
+<a name="4"></a>

-## 3. 使用及配置文档介绍
+## 4. 使用及配置文档介绍

-涉及检索模块配置文件位于：`deploy/configs/` 下，其中 `build_*.yaml` 是建立特征库的相关配置文件，`inference_*.yaml` 是检索或者分类的推理配置文件。
+涉及检索模块配置文件位于：`deploy/configs/` 下，其中 `inference_*.yaml` 是检索或者分类的推理配置文件,同时也是建立特征库的相关配置文件。

-<a name="3.1"></a> 
+<a name="4.1"></a>

-### 3.1 建库及配置文件参数
+### 4.1 建库及配置文件参数

 建库的具体操作如下：

@@ -68,14 +89,14 @@ pip install faiss-cpu==1.7.1post2
 # 进入 deploy 目录
 cd deploy
 # yaml 文件根据需要改成自己所需的具体 yaml 文件
-python python/build_gallery.py -c configs/build_***.yaml
+python python/build_gallery.py -c configs/inference_***.yaml
 ```

 其中 `yaml` 文件的建库的配置如下，在运行时，请根据实际情况进行修改。建库操作会将根据 `data_file` 的图像列表，将 `image_root` 下的图像进行特征提取，并在 `index_dir` 下进行存储，以待后续检索使用。

 其中 `data_file` 文件存储的是图像文件的路径和标签，每一行的格式为：`image_path  label`。中间间隔以 `yaml` 文件中 `delimiter` 参数作为间隔。

-关于特征提取的具体模型参数，可查看 `yaml` 文件。
+关于特征提取的具体模型参数，可查看 `yaml` 文件。注意下面的配置参数只列举了建立索引库相关部分。

 ```yaml
 # indexing engine config
@@ -88,6 +109,7 @@ IndexProcess:
  delimiter: "\t"
  dist_type: "IP"
  embedding_size: 512
+  batch_size: 32
 ```

 - **index_method**：使用的检索算法。目前支持三种，HNSW32、IVF、Flat
@@ -98,23 +120,29 @@ IndexProcess:
 - **delimiter**：**data_file** 中每一行的间隔符
 - **dist_type**: 特征匹配过程中使用的相似度计算方式。例如 `IP` 内积相似度计算方式，`L2` 欧式距离计算方法
 - **embedding_size**：特征维度
+- **batch_size**：建立特征库时，特征提取的`batch_size`

-<a name="3.2"></a> 
+<a name="4.2"></a>
+
+### 4.2 检索配置文件参数

-### 3.2 检索配置文件参数

 将检索的过程融合到 `PP-ShiTu` 的整体流程中，请参考 [README](../../../README_ch.md) 中 `PP-ShiTu 图像识别系统介绍` 部分。检索具体使用操作请参考[识别快速开始文档](../quick_start/quick_start_recognition.md)。

 其中，检索部分配置如下，整体检索配置文件，请参考 `deploy/configs/inference_*.yaml` 文件。

+注意：此部分参数只是列举了离线检索相关部分参数。
+
 ```yaml
 IndexProcess:
  index_dir: "./recognition_demo_data_v1.1/gallery_logo/index/"
  return_k: 5
  score_thres: 0.5
+  hamming_radius: 100
 ```

 与建库配置文件不同，新参数主要如下：

 - `return_k`: 检索结果返回 `k` 个结果
 - `score_thres`: 检索匹配的阈值
+- `hamming_radius`: 汉明距离半径。此参数只有在使用二值特征模型，`dist_type`设置为`hamming`时才能生效。具体二值特征模型使用方法请参考[哈希编码](./deep_hashing.md)
--- a/docs/zh_CN/inference_deployment/python_deploy.md
+++ b/docs/zh_CN/inference_deployment/python_deploy.md
@@ -6,10 +6,11 @@

 ## 目录

- [1. 图像分类推理](#1)
- [2. 主体检测模型推理](#2)
- [3. 特征提取模型推理](#3)
- [4. 主体检测、特征提取和向量检索串联](#4)
+- [1. 图像分类模型推理](#1)
+- [2. PP-ShiTu模型推理](#2)
+	- [2.1 主体检测模型推理](#2.1)
+	- [2.2 特征提取模型推理](#2.2)
+	- [2.3 PP-ShiTu PipeLine推理](#2.3)

 <a name="1"></a>
 ## 1. 图像分类推理
@@ -42,7 +43,12 @@ python python/predict_cls.py -c configs/inference_cls.yaml
 * 如果你希望提升评测模型速度，使用 GPU 评测时，建议开启 TensorRT 加速预测，使用 CPU 评测时，建议开启 MKL-DNN 加速预测。

 <a name="2"></a>
-## 2. 主体检测模型推理
+## 2. PP-ShiTu模型推理
+
+PP-ShiTu整个Pipeline包含三部分：主体检测、特征提取模型、特征检索。其中主体检测、特征提取模型可以单独推理使用。单独主体检测详见[2.1](#2.1)，特征提取模型单独推理详见[2.2](#2.2)，PP-ShiTu整体推理详见[2.3](#2.3)。
+
+<a name="2.1"></a>
+### 2.1 主体检测模型推理

 进入 PaddleClas 的 `deploy` 目录下：

@@ -70,8 +76,8 @@ python python/predict_det.py -c configs/inference_det.yaml
 * `Global.use_gpu`： 是否使用 GPU 预测，默认为 `True`。


-<a name="3"></a>
-## 3. 特征提取模型推理
+<a name="2.2"></a>
+### 2.2 特征提取模型推理

 下面以商品特征提取为例，介绍特征提取模型推理。首先进入 PaddleClas 的 `deploy` 目录下：

@@ -90,7 +96,7 @@ tar -xf ./models/product_ResNet50_vd_aliproduct_v1.0_infer.tar -C ./models/

 上述预测命令可以得到一个 512 维的特征向量，直接输出在在命令行中。

-<a name="4"></a>
-## 4. 主体检测、特征提取和向量检索串联
+<a name="2.3"></a>
+### 2.3 PP-ShiTu PipeLine推理

 主体检测、特征提取和向量检索的串联预测，可以参考图像识别[快速体验](../quick_start/quick_start_recognition.md)。
--- a/docs/zh_CN/inference_deployment/whl_deploy.md
+++ b/docs/zh_CN/inference_deployment/whl_deploy.md
@@ -18,7 +18,7 @@ PaddleClas 支持 Python Whl 包方式进行预测，目前 Whl 包方式仅支
   - [4.6 对 `NumPy.ndarray` 格式数据进行预测](#4.6)
   - [4.7 保存预测结果](#4.7)
   - [4.8 指定 label name](#4.8)
-   
+

 <a name="1"></a>
 ## 1. 安装 paddleclas
@@ -212,14 +212,14 @@ print(next(result))
 ```python
 from paddleclas import PaddleClas
 clas = PaddleClas(model_name='ResNet50', save_dir='./output_pre_label/')
-infer_imgs = 'docs/images/whl/' # it can be infer_imgs folder path which contains all of images you want to predict.
+infer_imgs = 'docs/images/' # it can be infer_imgs folder path which contains all of images you want to predict.
 result=clas.predict(infer_imgs)
 print(next(result))
 ```

 * CLI
 ```bash
-paddleclas --model_name='ResNet50' --infer_imgs='docs/images/whl/' --save_dir='./output_pre_label/'
+paddleclas --model_name='ResNet50' --infer_imgs='docs/images/' --save_dir='./output_pre_label/'
 ```

 <a name="4.8"></a>

--- a/docs/zh_CN/models/SwinTransformer.md
+++ b/docs/zh_CN/models/SwinTransformer.md
 # SwinTransformer
---
+
+-----
 ## 目录

-* [1. 概述](#1)
-* [2. 精度、FLOPS 和参数量](#2)
-* [3. 基于V100 GPU 的预测速度](#3)
+- [1. 模型介绍](#1)
+    - [1.1 模型简介](#1.1)
+    - [1.2 模型指标](#1.2)
+    - [1.3 Benchmark](#1.3)
+      - [1.3.1 基于 V100 GPU 的预测速度](#1.3.1)
+- [2. 模型快速体验](#2)
+- [3. 模型训练、评估和预测](#3)
+- [4. 模型推理部署](#4)
+  - [4.1 推理模型准备](#4.1)
+  - [4.2 基于 Python 预测引擎推理](#4.2)
+  - [4.3 基于 C++ 预测引擎推理](#4.3)
+  - [4.4 服务化部署](#4.4)
+  - [4.5 端侧部署](#4.5)
+  - [4.6 Paddle2ONNX 模型转换与预测](#4.6)
+

 <a name='1'></a>

-## 1. 概述
+## 1. 模型介绍
+
+### 1.1 模型简介
+
 Swin Transformer 是一种新的视觉 Transformer 网络，可以用作计算机视觉领域的通用骨干网路。SwinTransformer 由移动窗口（shifted windows）表示的层次 Transformer 结构组成。移动窗口将自注意计算限制在非重叠的局部窗口上，同时允许跨窗口连接，从而提高了网络性能。[论文地址](https://arxiv.org/abs/2103.14030)。

 <a name='2'></a>

-## 2. 精度、FLOPS 和参数量
+### 1.2 模型指标

-| Models           | Top1 | Top5 | Reference<br>top1 | Reference<br>top5 | FLOPS<br>(G) | Params<br>(M) |
+| Models           | Top1 | Top5 | Reference<br>top1 | Reference<br>top5 | FLOPs<br>(G) | Params<br>(M) |
 |:--:|:--:|:--:|:--:|:--:|:--:|:--:|
 | SwinTransformer_tiny_patch4_window7_224    | 0.8069 | 0.9534 | 0.812 | 0.955 | 4.5  | 28   |
 | SwinTransformer_small_patch4_window7_224   | 0.8275 | 0.9613 | 0.832 | 0.962 | 8.7  | 50   |
@@ -32,17 +48,87 @@ Swin Transformer 是一种新的视觉 Transformer 网络，可以用作计算

 <a name='1.3'></a>

-## 3. 基于 V100 GPU 的预测速度
+### 1.3 Benchmark
+
+#### 1.3.1 基于 V100 GPU 的预测速度

-| Models                                                  | Crop Size | Resize Short Size | FP32<br/>Batch Size=1<br/>(ms) | FP32<br/>Batch Size=4<br/>(ms) | FP32<br/>Batch Size=8<br/>(ms) |
-| ------------------------------------------------------- | --------- | ----------------- | ------------------------------ | ------------------------------ | ------------------------------ |
-| SwinTransformer_tiny_patch4_window7_224                 | 224       | 256               | 6.59                           | 9.68                           | 16.32                          |
-| SwinTransformer_small_patch4_window7_224                | 224       | 256               | 12.54                          | 17.07                          | 28.08                          |
-| SwinTransformer_base_patch4_window7_224                 | 224       | 256               | 13.37                          | 23.53                          | 39.11                          |
-| SwinTransformer_base_patch4_window12_384                | 384       | 384               | 19.52                          | 64.56                          | 123.30                         |
-| SwinTransformer_base_patch4_window7_224<sup>[1]</sup>   | 224       | 256               | 13.53                          | 23.46                          | 39.13                          |
-| SwinTransformer_base_patch4_window12_384<sup>[1]</sup>  | 384       | 384               | 19.65                          | 64.72                          | 123.42                         |
-| SwinTransformer_large_patch4_window7_224<sup>[1]</sup>  | 224       | 256               | 15.74                          | 38.57                          | 71.49                          |
-| SwinTransformer_large_patch4_window12_384<sup>[1]</sup> | 384       | 384               | 32.61                          | 116.59                         | 223.23                         |
+| Models  | Size |  Latency(ms)<br>bs=1 | Latency(ms)<br>bs=4 | Latency(ms)<br>bs=8 |
+|:--:|:--:|:--:|:--:|:--:|
+| SwinTransformer_tiny_patch4_window7_224                 | 224       | 6.59                           | 9.68                           | 16.32                          |
+| SwinTransformer_small_patch4_window7_224                | 224       | 12.54                          | 17.07                          | 28.08                          |
+| SwinTransformer_base_patch4_window7_224                 | 224       | 13.37                          | 23.53                          | 39.11                          |
+| SwinTransformer_base_patch4_window12_384                | 384       | 19.52                          | 64.56                          | 123.30                         |
+| SwinTransformer_base_patch4_window7_224<sup>[1]</sup>   | 224       | 13.53                          | 23.46                          | 39.13                          |
+| SwinTransformer_base_patch4_window12_384<sup>[1]</sup>  | 384       | 19.65                          | 64.72                          | 123.42                         |
+| SwinTransformer_large_patch4_window7_224<sup>[1]</sup>  | 224       | 15.74                          | 38.57                          | 71.49                          |
+| SwinTransformer_large_patch4_window12_384<sup>[1]</sup> | 384       | 32.61                          | 116.59                         | 223.23                         |

 [1]：基于 ImageNet22k 数据集预训练，然后在 ImageNet1k 数据集迁移学习得到。
+
+**备注：** 精度类型为 FP32，推理过程使用 TensorRT。
+
+
+<a name="2"></a>   
+    
+## 2. 模型快速体验
+
+安装 paddlepaddle 和 paddleclas 即可快速对图片进行预测，体验方法可以参考[ResNet50 模型快速体验](./ResNet.md#2-模型快速体验)。
+
+<a name="3"></a> 
+    
+## 3. 模型训练、评估和预测
+
+
+此部分内容包括训练环境配置、ImageNet 数据的准备、SwinTransformer 在 ImageNet 上的训练、评估、预测等内容。在 `ppcls/configs/ImageNet/SwinTransformer/` 中提供了 SwinTransformer 的训练配置，启动训练、评估和预测的具体方法可以参考[ResNet50 模型训练、评估和预测](./ResNet.md#3-模型训练评估和预测)。
+
+**备注：** 由于 SwinTransformer 系列模型默认使用的 GPU 数量为 8 个，所以在训练时，需要指定 8 个 GPU，如 `python3 -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c xxx.yaml`；如果使用 4 个 GPU 训练，默认学习率需要减小一半，精度可能有损。
+
+    
+<a name="4"></a>
+
+## 4. 模型推理部署
+
+<a name="4.1"></a> 
+
+### 4.1 推理模型准备
+
+Paddle Inference 是飞桨的原生推理库， 作用于服务器端和云端，提供高性能的推理能力。相比于直接基于预训练模型进行预测，Paddle Inference可使用 MKLDNN、CUDNN、TensorRT 进行预测加速，从而实现更优的推理性能。更多关于Paddle Inference推理引擎的介绍，可以参考[Paddle Inference官网教程](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/infer/inference/inference_cn.html)。
+    
+Inference 模型的获取可以参考 [ResNet50 推理模型准备](./ResNet.md#41-推理模型准备) 。
+
+<a name="4.2"></a> 
+
+### 4.2 基于 Python 预测引擎推理
+
+PaddleClas 提供了基于 python 预测引擎推理的示例。您可以参考[ResNet50 基于 Python 预测引擎推理](./ResNet.md#42-基于-python-预测引擎推理) 对 SwinTransformer 完成推理预测。
+
+<a name="4.3"></a> 
+
+### 4.3 基于 C++ 预测引擎推理
+
+PaddleClas 提供了基于 C++ 预测引擎推理的示例，您可以参考[服务器端 C++ 预测](../inference_deployment/cpp_deploy.md)来完成相应的推理部署。如果您使用的是 Windows 平台，可以参考[基于 Visual Studio 2019 Community CMake 编译指南](../inference_deployment/cpp_deploy_on_windows.md)完成相应的预测库编译和模型预测工作。
+
+<a name="4.4"></a> 
+
+### 4.4 服务化部署
+
+Paddle Serving 提供高性能、灵活易用的工业级在线推理服务。Paddle Serving 支持 RESTful、gRPC、bRPC 等多种协议，提供多种异构硬件和多种操作系统环境下推理解决方案。更多关于Paddle Serving 的介绍，可以参考[Paddle Serving 代码仓库](https://github.com/PaddlePaddle/Serving)。
+    
+PaddleClas 提供了基于 Paddle Serving 来完成模型服务化部署的示例，您可以参考[模型服务化部署](../inference_deployment/paddle_serving_deploy.md)来完成相应的部署工作。
+
+<a name="4.5"></a> 
+
+### 4.5 端侧部署
+
+Paddle Lite 是一个高性能、轻量级、灵活性强且易于扩展的深度学习推理框架，定位于支持包括移动端、嵌入式以及服务器端在内的多硬件平台。更多关于 Paddle Lite 的介绍，可以参考[Paddle Lite 代码仓库](https://github.com/PaddlePaddle/Paddle-Lite)。
+    
+PaddleClas 提供了基于 Paddle Lite 来完成模型端侧部署的示例，您可以参考[端侧部署](../inference_deployment/paddle_lite_deploy.md)来完成相应的部署工作。
+
+<a name="4.6"></a> 
+
+### 4.6 Paddle2ONNX 模型转换与预测
+    
+Paddle2ONNX 支持将 PaddlePaddle 模型格式转化到 ONNX 模型格式。通过 ONNX 可以完成将 Paddle 模型到多种推理引擎的部署，包括TensorRT/OpenVINO/MNN/TNN/NCNN，以及其它对 ONNX 开源格式进行支持的推理引擎或硬件。更多关于 Paddle2ONNX 的介绍，可以参考[Paddle2ONNX 代码仓库](https://github.com/PaddlePaddle/Paddle2ONNX)。
+
+PaddleClas 提供了基于 Paddle2ONNX 来完成 inference 模型转换 ONNX 模型并作推理预测的示例，您可以参考[Paddle2ONNX 模型转换与预测](../../../deploy/paddle2onnx/readme.md)来完成相应的部署工作。
+
--- a/paddleclas.py
+++ b/paddleclas.py
--- a/ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_udml.yaml
+++ b/ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_udml.yaml
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output_lcnet_x2_5_udml
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 100
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+AMP:
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  # O1: mixed fp16
+  level: O1
+
+# model architecture
+Arch:
+  name: "DistillationModel"
+  class_num: &class_num 1000
+  # if not null, its length should be the same as that of models
+  pretrained_list:
+  # if not null, its length should be the same as that of models
+  freeze_params_list:
+  - False
+  - False
+  models:
+    - Teacher:
+        name: PPLCNet_x2_5
+        class_num: *class_num
+        pretrained: False
+        return_patterns: ["blocks3", "blocks4", "blocks5", "blocks6"]
+    - Student:
+        name: PPLCNet_x2_5
+        class_num: *class_num
+        pretrained: False
+        return_patterns: ["blocks3", "blocks4", "blocks5", "blocks6"]
+ 
+# loss function config for training/eval process
+Loss:
+  Train:
+    - DistillationGTCELoss:
+       weight: 1.0
+       key: logits
+       model_names: ["Student", "Teacher"]
+    - DistillationDMLLoss:
+        weight: 1.0
+        key: logits
+        model_name_pairs:
+        - ["Student", "Teacher"]
+    - DistillationDistanceLoss:
+        weight: 1.0
+        key: "blocks5"
+        model_name_pairs:
+        - ["Student", "Teacher"]
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.4
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00004
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 256
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+    Train:
+    - DistillationTopkAcc:
+        model_key: "Student"
+        topk: [1, 5]
+    Eval:
+    - DistillationTopkAcc:
+        model_key: "Student"
+        topk: [1, 5]
+
--- a/ppcls/configs/PULC/language_classification/MobileNetV3_small_x0_35.yaml
+++ b/ppcls/configs/PULC/language_classification/MobileNetV3_small_x0_35.yaml
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 30
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  start_eval_epoch: 20
+
+# model architecture
+Arch:
+  name: MobileNetV3_small_x0_35
+  class_num: 10
+  pretrained: True
+ 
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 1.3
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/language_classification/
+      cls_label_path: ./dataset/language_classification/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 256
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/language_classification/
+      cls_label_path: ./dataset/language_classification/test_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 2
+    class_id_map_file: ppcls/utils/PULC/language_classification_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 2]
+  Eval:
+    - TopkAcc:
+        topk: [1, 2]
--- a/ppcls/configs/PULC/language_classification/PPLCNet_x1_0.yaml
+++ b/ppcls/configs/PULC/language_classification/PPLCNet_x1_0.yaml
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 30
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  
+# model architecture
+Arch:
+  name: PPLCNet_x1_0
+  class_num: 10
+  pretrained: True
+  use_ssld: True
+  stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]]
+  lr_mult_list : [0.0, 0.4, 0.4, 0.8, 0.8, 1.0]
+
+ 
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.8
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00003
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/language_classification/
+      cls_label_path: ./dataset/language_classification/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            size: [160, 80]
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            prob: 1.0
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: [160, 80]
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 1.0
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 256
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/language_classification/
+      cls_label_path: ./dataset/language_classification/test_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            size: [160, 80]
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: deploy/images/PULC/language_classification/word_35404.png
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+            size: [160, 80]
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 2
+    class_id_map_file: ppcls/utils/PULC/language_classification_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 2]
+  Eval:
+    - TopkAcc:
+        topk: [1, 2]
--- a/ppcls/configs/PULC/language_classification/PPLCNet_x1_0_distillation.yaml
+++ b/ppcls/configs/PULC/language_classification/PPLCNet_x1_0_distillation.yaml
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 30
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+  use_dali: False
+
+# model architecture
+Arch:
+  name: "DistillationModel"
+  class_num: &class_num 10
+  # if not null, its length should be the same as that of models
+  pretrained_list:
+  # if not null, its length should be the same as that of models
+  freeze_params_list:
+  - True
+  - False
+  use_sync_bn: True
+  models:
+    - Teacher:
+        name: ResNet101_vd
+        class_num: *class_num
+    - Student:
+        name: PPLCNet_x1_0
+        class_num: *class_num
+        pretrained: True
+        use_ssld: True
+        stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]]
+        lr_mult_list : [0.0, 0.4, 0.4, 0.8, 0.8, 1.0]
+        
+
+  infer_model_name: "Student"
+ 
+# loss function config for training/eval process
+Loss:
+  Train:
+    - DistillationDMLLoss:
+        weight: 1.0
+        model_name_pairs:
+        - ["Student", "Teacher"]
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.8
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00003
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/language_classification/
+      cls_label_path: ./dataset/language_classification/train_list_for_distill.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            size: [160, 80]
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            prob: 1.0
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: [160, 80]
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 1.0
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 256
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/language_classification/
+      cls_label_path: ./dataset/language_classification/test_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            size: [160, 80]
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: deploy/images/PULC/language_classification/word_35404.png
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        size: [160, 80]
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 2
+    class_id_map_file: ppcls/utils/PULC/language_classification_label_list.txt
+
+Metric:
+    Train:
+    - DistillationTopkAcc:
+        model_key: "Student"
+        topk: [1, 2]
+    Eval:
+    - TopkAcc:
+        topk: [1, 2]
\ No newline at end of file
--- a/ppcls/configs/PULC/language_classification/PPLCNet_x1_0_search.yaml
+++ b/ppcls/configs/PULC/language_classification/PPLCNet_x1_0_search.yaml
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 30
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  start_eval_epoch: 20
+  
+# model architecture
+Arch:
+  name: PPLCNet_x1_0
+  class_num: 10
+  pretrained: True
+  use_ssld: True
+  stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]]
+ 
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.4
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00003
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/language_classification/
+      cls_label_path: ./dataset/language_classification/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            size: [192, 48]
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            prob: 0.0
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: [192, 48]
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.0
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 256
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/language_classification/
+      cls_label_path: ./dataset/language_classification/test_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            size: [192, 48]
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 32
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: deploy/images/PULC/language_classification/word_35404.png
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        size: [192, 48]
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 2
+    class_id_map_file: ppcls/utils/PULC/language_classification_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 2]
+  Eval:
+    - TopkAcc:
+        topk: [1, 2]
--- a/ppcls/configs/PULC/language_classification/SwinTransformer_tiny_patch4_window7_224.yaml
+++ b/ppcls/configs/PULC/language_classification/SwinTransformer_tiny_patch4_window7_224.yaml
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 30
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# model architecture
+Arch:
+  name: SwinTransformer_tiny_patch4_window7_224
+  class_num: 10
+  pretrained: True
+ 
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.05
+  no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm 
+  one_dim_param_no_weight_decay: True
+  lr:
+    name: Cosine
+    learning_rate: 5e-4
+    eta_min: 1e-5
+    warmup_epoch: 5
+    warmup_start_lr: 1e-6
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/language_classification/
+      cls_label_path: ./dataset/language_classification/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+            interpolation: bicubic
+            backend: pil
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.25
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+      batch_transform_ops:
+        - OpSampler:
+            MixupOperator:
+              alpha: 0.8
+              prob: 0.5
+            CutmixOperator:
+              alpha: 1.0
+              prob: 0.5
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/language_classification/
+      cls_label_path: ./dataset/language_classification/test_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+            interpolation: bicubic
+            backend: pil
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: deploy/images/PULC/language_classification/word_35404.png
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+        interpolation: bicubic
+        backend: pil
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 2
+    class_id_map_file: ppcls/utils/PULC/language_classification_label_list.txt
+
+Metric:
+  Eval:
+    - TopkAcc:
+        topk: [1, 2]
--- a/ppcls/configs/PULC/language_classification/search.yaml
+++ b/ppcls/configs/PULC/language_classification/search.yaml
+base_config_file: ppcls/configs/PULC/language_classification/PPLCNet_x1_0_search.yaml
+distill_config_file: ppcls/configs/PULC/language_classification/PPLCNet_x1_0_distillation.yaml
+
+gpus: 0,1,2,3
+output_dir: output/search_language_classification
+search_times: 1
+search_dict:
+  - search_key: lrs
+    replace_config:
+      - Optimizer.lr.learning_rate
+    search_values: [0.2, 0.4, 0.8]
+  - search_key: resolutions
+    replace_config:
+      - DataLoader.Train.dataset.transform_ops.1.ResizeImage.size
+      - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.img_size
+      - DataLoader.Eval.dataset.transform_ops.1.ResizeImage.size
+    search_values: [[192, 48], [180, 60], [160, 80]]
+  - search_key: ra_probs
+    replace_config:
+      - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.prob
+    search_values: [0.0, 0.5, 1.0]
+  - search_key: re_probs
+    replace_config:
+      - DataLoader.Train.dataset.transform_ops.5.RandomErasing.EPSILON
+    search_values: [0.0, 0.5, 1.0]
+  - search_key: lr_mult_list
+    replace_config:
+      - Arch.lr_mult_list
+    search_values:
+      - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
+      - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0]
+      - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
+teacher:
+  rm_keys:
+    - Arch.lr_mult_list
+  search_values:
+    - ResNet101_vd
+    - ResNet50_vd
+final_replace:
+  Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list
\ No newline at end of file
--- a/ppcls/configs/PULC/safety_helmet/MobileNetV3_small_x0_35.yaml
+++ b/ppcls/configs/PULC/safety_helmet/MobileNetV3_small_x0_35.yaml
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 60
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+# model architecture
+Arch:
+  name: MobileNetV3_small_x0_35
+  pretrained: True
+  class_num: 2
+ 
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.4
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/safety_helmet/
+      cls_label_path: ./dataset/safety_helmet/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/safety_helmet/
+      cls_label_path: ./dataset/safety_helmet/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png
+  batch_size: 1
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: ThreshOutput
+    threshold: 0.5
+    label_0: wearing_helmet
+    label_1: unwearing_helmet
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1]
+  Eval:
+    - TprAtFpr:
+        max_fpr: 0.0001
+    - TopkAcc:
+        topk: [1]
--- a/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml
+++ b/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 40
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+# model architecture
+Arch:
+  name: PPLCNet_x1_0
+  pretrained: True
+  use_ssld: True
+  class_num: 2
+  use_sync_bn : True
+ 
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.025
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00003
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/safety_helmet/
+      cls_label_path: ./dataset/safety_helmet/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 176
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            prob : 0.5
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: bicubic
+            img_size : 176
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON : 0.1
+            r1 : 0.3
+            sh : 1.0/3.0
+            sl : 0.02
+            attempt : 10
+            use_log_aspect : True
+            mode : pixel
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/safety_helmet/
+      cls_label_path: ./dataset/safety_helmet/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png
+  batch_size: 1
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: ThreshOutput
+    threshold: 0.5
+    label_0: wearing_helmet
+    label_1: unwearing_helmet
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1]
+  Eval:
+    - TprAtFpr:
+        max_fpr: 0.0001
+    - TopkAcc:
+        topk: [1]
--- a/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_distillation.yaml
+++ b/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_distillation.yaml
--- a/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_search.yaml
+++ b/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_search.yaml
--- a/ppcls/configs/PULC/safety_helmet/Res2Net200_vd_26w_4s.yaml
+++ b/ppcls/configs/PULC/safety_helmet/Res2Net200_vd_26w_4s.yaml
--- a/ppcls/configs/PULC/safety_helmet/SwinTransformer_tiny_patch4_window7_224.yaml
+++ b/ppcls/configs/PULC/safety_helmet/SwinTransformer_tiny_patch4_window7_224.yaml
--- a/ppcls/configs/PULC/safety_helmet/search.yaml
+++ b/ppcls/configs/PULC/safety_helmet/search.yaml
--- a/ppcls/configs/PULC/text_image_orientation/MobileNetV3_small_x0_35.yaml
+++ b/ppcls/configs/PULC/text_image_orientation/MobileNetV3_small_x0_35.yaml
--- a/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0.yaml
+++ b/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0.yaml
--- a/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_distillation.yaml
+++ b/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_distillation.yaml
--- a/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_search.yaml
+++ b/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_search.yaml
--- a/ppcls/configs/PULC/text_image_orientation/SwinTransformer_tiny_patch4_window7_224.yaml
+++ b/ppcls/configs/PULC/text_image_orientation/SwinTransformer_tiny_patch4_window7_224.yaml
--- a/ppcls/configs/PULC/text_image_orientation/search.yaml
+++ b/ppcls/configs/PULC/text_image_orientation/search.yaml
--- a/ppcls/configs/PULC/text_direction/MobileNetV3_large_x1_0.yaml
+++ b/ppcls/configs/PULC/text_direction/MobileNetV3_large_x1_0.yaml
--- a/ppcls/configs/PULC/text_direction/PPLCNet_x1_0.yaml
+++ b/ppcls/configs/PULC/text_direction/PPLCNet_x1_0.yaml
--- a/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_224x224.yaml
+++ b/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_224x224.yaml
--- a/ppcls/configs/PULC/text_direction/PPLCNet_x1_0_distillation.yaml
+++ b/ppcls/configs/PULC/text_direction/PPLCNet_x1_0_distillation.yaml
--- a/ppcls/configs/PULC/text_direction/PPLCNet_x1_0_search.yaml
+++ b/ppcls/configs/PULC/text_direction/PPLCNet_x1_0_search.yaml
--- a/ppcls/configs/PULC/text_direction/SwinTransformer_tiny_patch4_window7_224.yaml
+++ b/ppcls/configs/PULC/text_direction/SwinTransformer_tiny_patch4_window7_224.yaml
--- a/ppcls/configs/PULC/text_direction/search.yaml
+++ b/ppcls/configs/PULC/text_direction/search.yaml
--- a/ppcls/configs/PULC/traffic_sign/MobileNetV3_large_x1_0.yaml
+++ b/ppcls/configs/PULC/traffic_sign/MobileNetV3_large_x1_0.yaml
--- a/ppcls/configs/PULC/vehicle_attr/MobileNetV3_large_x1_0.yaml
+++ b/ppcls/configs/PULC/vehicle_attr/MobileNetV3_large_x1_0.yaml
--- a/ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0.yaml
+++ b/ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0.yaml
--- a/ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0_distillation.yaml
+++ b/ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0_distillation.yaml
--- a/ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0_search.yaml
+++ b/ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0_search.yaml
--- a/ppcls/configs/PULC/vehicle_attr/Res2Net200_vd_26w_4s.yaml
+++ b/ppcls/configs/PULC/vehicle_attr/Res2Net200_vd_26w_4s.yaml
--- a/ppcls/configs/PULC/vehicle_attr/ResNet50.yaml
+++ b/ppcls/configs/PULC/vehicle_attr/ResNet50.yaml
--- a/ppcls/configs/PULC/vehicle_attr/search.yaml
+++ b/ppcls/configs/PULC/vehicle_attr/search.yaml
--- a/ppcls/utils/PULC/language_classification_label_list.txt
+++ b/ppcls/utils/PULC/language_classification_label_list.txt
--- a/ppcls/utils/PULC/text_image_orientation_label_list.txt
+++ b/ppcls/utils/PULC/text_image_orientation_label_list.txt
--- a/ppcls/utils/PULC/textline_orientation_label_list.txt
+++ b/ppcls/utils/PULC/textline_orientation_label_list.txt
--- a/ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt
+++ b/ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt
--- a/ppcls/utils/PULC_label_list/traffic_sign_label_list.txt
+++ b/ppcls/utils/PULC_label_list/traffic_sign_label_list.txt
--- a/ppcls/utils/cls_demo/person_label_list.txt
+++ b/ppcls/utils/cls_demo/person_label_list.txt