diff --git a/.github/ISSUE_TEMPLATE/---clas-issue-.md b/.github/ISSUE_TEMPLATE/---clas-issue-.md
new file mode 100644
index 0000000000000000000000000000000000000000..01bd38f9b243a6b1ab357219b50f0065db708529
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---clas-issue-.md
@@ -0,0 +1,18 @@
+---
+name: Issue report
+about: Report a PaddleClas issue
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+Welcome to PaddleClas, and thank you for reporting issues and contributing to the project!
+When opening an issue, please provide the following information so that we can locate and resolve your problem quickly and effectively:
+ 1. PaddleClas and PaddlePaddle versions: please provide the version numbers or branches you are using, e.g. PaddleClas release/2.2 and PaddlePaddle 2.1.0
+ 2. Versions of any other products involved: if you are using other products together with PaddleClas, such as PaddleServing or PaddleInference, please provide their version numbers
+ 3. Training environment information:
+  a. Operating system, e.g. Linux/Windows/MacOS
+  b. Python version, e.g. Python 3.6/3.7/3.8
+  c. CUDA/cuDNN versions, e.g. CUDA 10.2/cuDNN 7.6.5
+ 4. The complete code (any changes relative to the code in the repo), the detailed error message, and related logs
diff --git a/README_ch.md b/README_ch.md
index d08d01642c6cc6a14a6054a2adbd87ba108e6788..011e638c0ff2a065120915f62039a66b46937672 100644
--- a/README_ch.md
+++ b/README_ch.md
@@ -8,6 +8,7 @@
 
 **Recent updates**
 
+- 2021.08.11 Added 7 new [FAQ](docs/zh_CN/faq_series/faq_2021_s2.md) entries.
 - 2021.06.29 Added the Swin-Transformer series, with Top-1 accuracy on ImageNet-1k up to 87.2%; training, inference, and evaluation are supported, as well as whl package deployment. Pretrained models can be downloaded [here](docs/zh_CN/models/models_intro.md).
 - 2021.06.22-24 The PaddleClas R&D team gave a three-day live course with in-depth technical interpretation. Course replay: [https://aistudio.baidu.com/aistudio/course/introduce/24519](https://aistudio.baidu.com/aistudio/course/introduce/24519)
 - 2021.06.16 PaddleClas v2.2 upgrade, integrating metric learning, vector search, and other components. Added 4 image recognition applications: product recognition, cartoon character recognition, vehicle recognition, and logo recognition. Added 30 pretrained models in the LeViT, Twins, TNT, DLA, HarDNet, and RedNet series.
@@ -50,6 +51,10 @@ The Top-1 accuracy of the Res2Net200_vd pretrained model reaches 85.1%.
 - [Image recognition quick start](./docs/zh_CN/tutorials/quick_start_recognition.md)
 - [Introduction to the image recognition system](#introduction-to-the-image-recognition-system)
 - [Recognition demo results](#识别效果展示)
+- Image classification quick start
+    - [Beginner edition](./docs/zh_CN/tutorials/quick_start_new_user.md)
+    - [Advanced edition](./docs/zh_CN/tutorials/quick_start_professional.md)
+    - [Community edition](./docs/zh_CN/tutorials/quick_start_community.md)
 - Algorithm introduction
     - [Backbone networks and pretrained model zoo](./docs/zh_CN/ImageNet_models_cn.md)
     - [Mainbody detection](./docs/zh_CN/application/mainbody_detection.md)
@@ -74,11 +79,14 @@ The Top-1 accuracy of the Res2Net200_vd pretrained model reaches 85.1%.
     - [Knowledge distillation](./docs/zh_CN/advanced_tutorials/distillation/distillation.md)
     - [Model quantization](./docs/zh_CN/extension/paddle_quantization.md)
     - [Data augmentation](./docs/zh_CN/advanced_tutorials/image_augmentation/ImageAugment.md)
-- FAQ (updates paused)
+- FAQ
+    - [Image recognition task FAQ](docs/zh_CN/faq_series/faq_2021_s2.md)
     - [Image classification task FAQ](docs/zh_CN/faq.md)
 - [License](#许可证书)
 - [Contribution](#贡献代码)
 
+
+## Introduction to the Image Recognition System
 
 
 
diff --git a/deploy/configs/build_cartoon.yaml b/deploy/configs/build_cartoon.yaml
index 17a5ee55ce0269df12121b378b1d8a2eeac0343f..c73279801dfedbf9576ef8013226bb1cc4ba02cd 100644
--- a/deploy/configs/build_cartoon.yaml
+++ b/deploy/configs/build_cartoon.yaml
@@ -1,6 +1,6 @@
 Global:
   rec_inference_model_dir: "./models/cartoon_rec_ResNet50_iCartoon_v1.0_infer/"
-  batch_size: 1
+  batch_size: 32
   use_gpu: True
   enable_mkldnn: True
   cpu_num_threads: 10
diff --git a/deploy/configs/build_logo.yaml b/deploy/configs/build_logo.yaml
index 3f6d12d7daf17c787ae7ee08971574a493ef1ab7..5be17ed978d57ee5e084d14ee348b090d61fb5c7 100644
--- a/deploy/configs/build_logo.yaml
+++ b/deploy/configs/build_logo.yaml
@@ -1,6 +1,6 @@
 Global:
   rec_inference_model_dir: "./models/logo_rec_ResNet50_Logo3K_v1.0_infer/"
-  batch_size: 1
+  batch_size: 32
   use_gpu: True
   enable_mkldnn: True
   cpu_num_threads: 10
diff --git a/deploy/configs/build_product.yaml b/deploy/configs/build_product.yaml
index dd0ce2d89ce0940e3c0915bd1b71f22df7ec5f40..59e3b29baec066f6b7cf87c90fca69793d12482e 100644
--- a/deploy/configs/build_product.yaml
+++ b/deploy/configs/build_product.yaml
@@ -1,6 +1,6 @@
 Global:
   rec_inference_model_dir: "./models/product_ResNet50_vd_aliproduct_v1.0_infer"
-  batch_size: 1
+  batch_size: 32
   use_gpu: True
   enable_mkldnn: True
   cpu_num_threads: 10
diff --git a/deploy/configs/build_vehicle.yaml b/deploy/configs/build_vehicle.yaml
index 66b4bad6a531afa825f76cfe902733f638038ad6..be095f4e1eb78c81b1b0b083b10352e7f50ad25d 100644
--- a/deploy/configs/build_vehicle.yaml
+++ b/deploy/configs/build_vehicle.yaml
@@ -1,6 +1,6 @@
 Global:
   rec_inference_model_dir: "./models/vehicle_cls_ResNet50_CompCars_v1.0_infer/"
-  batch_size: 1
+  batch_size: 32
   use_gpu: True
   enable_mkldnn: True
   cpu_num_threads: 10
diff --git a/deploy/python/predict_cls.py b/deploy/python/predict_cls.py
index a9165a92efa62a9252834a988050eebfa8d89f69..dc6865404ecfbc517c7b952c52035a27cbc0137f 100644
--- a/deploy/python/predict_cls.py
+++ b/deploy/python/predict_cls.py
@@ -41,6 +41,29 @@ class ClsPredictor(Predictor):
         if "PostProcess" in config:
             self.postprocess = build_postprocess(config["PostProcess"])
 
+        # used by the whole_chain project to test each Paddle repo
+        self.benchmark = config["Global"].get("benchmark", False)
+        if self.benchmark:
+            import auto_log
+            import os
+            pid = os.getpid()
+            self.auto_logger = auto_log.AutoLogger(
+                model_name=config["Global"].get("model_name", "cls"),
+                model_precision='fp16'
+                if config["Global"]["use_fp16"] else 'fp32',
+                batch_size=config["Global"].get("batch_size", 1),
+                data_shape=[3, 224, 224],
+                save_path=config["Global"].get("save_log_path",
+                                               "./auto_log.log"),
+                inference_config=self.config,
+                pids=pid,
+                process_name=None,
+                gpu_ids=None,
+                time_keys=[
+                    'preprocess_time', 'inference_time', 'postprocess_time'
+                ],
+                warmup=2)
+
     def predict(self, images):
         input_names = self.paddle_predictor.get_input_names()
         input_tensor = self.paddle_predictor.get_input_handle(input_names[0])
@@ -49,16 +72,26 @@ class ClsPredictor(Predictor):
         output_tensor = self.paddle_predictor.get_output_handle(output_names[
             0])
 
+        if self.benchmark:
+            self.auto_logger.times.start()
         if not isinstance(images, (list, )):
             images = [images]
         for idx in range(len(images)):
             for ops in self.preprocess_ops:
                 images[idx] = ops(images[idx])
         image = np.array(images)
+        if self.benchmark:
+            self.auto_logger.times.stamp()
 
         input_tensor.copy_from_cpu(image)
         self.paddle_predictor.run()
         batch_output = output_tensor.copy_to_cpu()
+        if self.benchmark:
+            self.auto_logger.times.stamp()
+        if self.postprocess is not None:
+            batch_output = self.postprocess(batch_output)
+        if self.benchmark:
+            self.auto_logger.times.end(stamp=True)
         return batch_output
 
 
@@ -66,12 +99,40 @@ def main(config):
     cls_predictor = ClsPredictor(config)
     image_list = get_image_list(config["Global"]["infer_imgs"])
 
-    assert config["Global"]["batch_size"] == 1
-    for idx, image_file in enumerate(image_list):
-        img = cv2.imread(image_file)[:, :, ::-1]
-        output = cls_predictor.predict(img)
-        output = cls_predictor.postprocess(output, [image_file])
-        print(output)
+    batch_imgs = []
+    batch_names = []
+    cnt = 0
+    for idx, img_path in enumerate(image_list):
+        img = cv2.imread(img_path)
+        if img is None:
+            logger.warning(
+                "Failed to read the image file, skipping it. Path: {}".format(
+                    img_path))
+        else:
+            img = img[:, :, ::-1]
+            batch_imgs.append(img)
+            img_name = os.path.basename(img_path)
+            batch_names.append(img_name)
+            cnt += 1
+
+        if cnt % config["Global"]["batch_size"] == 0 or (idx + 1
+                                                         ) == len(image_list):
+            if len(batch_imgs) == 0:
+                continue
+
+            batch_results = cls_predictor.predict(batch_imgs)
+            for number, result_dict in enumerate(batch_results):
+                filename = batch_names[number]
+                clas_ids = result_dict["class_ids"]
+                scores_str = "[{}]".format(", ".join("{:.2f}".format(
+                    r) for r in result_dict["scores"]))
+                label_names = result_dict["label_names"]
+                print("{}:\tclass id(s): {}, score(s): {}, label_name(s): {}".
+                      format(filename, clas_ids, scores_str, label_names))
+            batch_imgs = []
+            batch_names = []
+    if cls_predictor.benchmark:
+        cls_predictor.auto_logger.report()
     return
 
 
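The loop above replaces the old `batch_size == 1` assert with simple batching. A minimal sketch of the same pattern in isolation; `predict` here is a stand-in for `ClsPredictor.predict`, and `batch_size` mirrors `Global.batch_size`:

```python
import cv2

def predict_in_batches(image_paths, predict, batch_size=32):
    # Sketch of the batching pattern used in predict_cls.py above.
    batch_imgs, batch_names, results = [], [], []
    for idx, path in enumerate(image_paths):
        img = cv2.imread(path)
        if img is None:
            continue  # unreadable files are skipped, as in the diff
        batch_imgs.append(img[:, :, ::-1])  # BGR -> RGB
        batch_names.append(path)
        # flush a full batch, or whatever remains after the last image
        if len(batch_imgs) == batch_size or idx + 1 == len(image_paths):
            if batch_imgs:
                results.extend(zip(batch_names, predict(batch_imgs)))
                batch_imgs, batch_names = [], []
    return results
```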
diff --git a/deploy/python/predict_rec.py b/deploy/python/predict_rec.py
index de293bf0097f9ea48e1ecd296ec2695e15c54ba4..d41c513f89fd83972e86bc5941a8dba1fd488856 100644
--- a/deploy/python/predict_rec.py
+++ b/deploy/python/predict_rec.py
@@ -54,12 +54,14 @@ class RecPredictor(Predictor):
         input_tensor.copy_from_cpu(image)
         self.paddle_predictor.run()
         batch_output = output_tensor.copy_to_cpu()
-        
+
         if feature_normalize:
             feas_norm = np.sqrt(
                 np.sum(np.square(batch_output), axis=1, keepdims=True))
             batch_output = np.divide(batch_output, feas_norm)
-            
+
+        if self.postprocess is not None:
+            batch_output = self.postprocess(batch_output)
         return batch_output
 
 
@@ -67,14 +69,33 @@ def main(config):
     rec_predictor = RecPredictor(config)
     image_list = get_image_list(config["Global"]["infer_imgs"])
 
-    assert config["Global"]["batch_size"] == 1
-    for idx, image_file in enumerate(image_list):
-        batch_input = []
-        img = cv2.imread(image_file)[:, :, ::-1]
-        output = rec_predictor.predict(img)
-        if rec_predictor.postprocess is not None:
-            output = rec_predictor.postprocess(output)
-        print(output)
+    batch_imgs = []
+    batch_names = []
+    cnt = 0
+    for idx, img_path in enumerate(image_list):
+        img = cv2.imread(img_path)
+        if img is None:
+            logger.warning(
+                "Failed to read the image file, skipping it. Path: {}".format(
+                    img_path))
+        else:
+            img = img[:, :, ::-1]
+            batch_imgs.append(img)
+            img_name = os.path.basename(img_path)
+            batch_names.append(img_name)
+            cnt += 1
+
+        if cnt % config["Global"]["batch_size"] == 0 or (idx + 1) == len(image_list):
+            if len(batch_imgs) == 0: 
+                continue
+                
+            batch_results = rec_predictor.predict(batch_imgs)
+            for number, result_dict in enumerate(batch_results):
+                filename = batch_names[number]
+                print("{}:\t{}".format(filename, result_dict))
+            batch_imgs = []
+            batch_names = []
+
     return
 
 
diff --git a/deploy/utils/predictor.py b/deploy/utils/predictor.py
index 7757aa1e12a79cbba99e6ab56bde286ab2d09369..11f153071a0da0f82f035fe7389ad8f9f3bd8e6b 100644
--- a/deploy/utils/predictor.py
+++ b/deploy/utils/predictor.py
@@ -28,7 +28,7 @@ class Predictor(object):
         if args.use_fp16 is True:
             assert args.use_tensorrt is True
         self.args = args
-        self.paddle_predictor = self.create_paddle_predictor(
+        self.paddle_predictor, self.config = self.create_paddle_predictor(
             args, inference_model_dir)
 
     def predict(self, image):
@@ -59,11 +59,12 @@ class Predictor(object):
             config.enable_tensorrt_engine(
                 precision_mode=Config.Precision.Half
                 if args.use_fp16 else Config.Precision.Float32,
-                max_batch_size=args.batch_size)
+                max_batch_size=args.batch_size,
+                min_subgraph_size=30)
 
         config.enable_memory_optim()
         # use zero copy
         config.switch_use_feed_fetch_ops(False)
         predictor = create_predictor(config)
 
-        return predictor
+        return predictor, config
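The predictor now returns the `Config` object alongside the predictor so that `auto_log` can report the inference configuration. A hedged sketch of how such a config is assembled; the model paths and sizes below are placeholders:

```python
from paddle.inference import Config, create_predictor

config = Config("inference.pdmodel", "inference.pdiparams")  # placeholder paths
config.enable_use_gpu(8000, 0)  # 8000 MB initial GPU memory pool on GPU 0
config.enable_tensorrt_engine(
    precision_mode=Config.Precision.Float32,
    max_batch_size=32,
    min_subgraph_size=30)  # only offload subgraphs with >= 30 ops to TensorRT
config.enable_memory_optim()
config.switch_use_feed_fetch_ops(False)  # zero-copy input/output
predictor = create_predictor(config)
```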
diff --git a/deploy/vector_search/README_en.md b/deploy/vector_search/README_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..20b2539486a3df289b710b3e7c243bf99ac361e8
--- /dev/null
+++ b/deploy/vector_search/README_en.md
@@ -0,0 +1,95 @@
+# Vector search
+
+## 1. Introduction
+
+Some vertical domain recognition tasks (e.g. vehicles, commodities) require a large number of recognized categories and often adopt a retrieval-based approach: matching predicted categories are obtained by a fast nearest-neighbor search between query vectors and the gallery vectors. The vector search module provides basic approximate nearest-neighbor search based on Baidu's self-developed Möbius algorithm, a graph-based approximate nearest-neighbor search algorithm for maximum inner product search (MIPS). The module provides a Python interface, supports numpy and tensor type vectors, and supports L2 and inner product distance calculation.
+
+Details of the Möbius algorithm can be found in the paper ([Möbius Transformation for Fast Inner Product Search on Graph](http://research.baidu.com/Public/uploads/5e189d36b5cf6.PDF), [code](https://github.com/sunbelbd/mobius)).
+
+## 2. Installation
+
+### 2.1 Use the provided library files directly
+
+This folder contains the compiled `index.so` (compiled under gcc8.2.0 for Linux) and `index.dll` (compiled under gcc10.3.0 for Windows), which can be used directly, skipping sections 2.2 and 2.3.
+
+If the library files are unusable due to a low gcc version or an incompatible environment, you need to compile the library files manually on your own platform.
+
+**Note:** Make sure that your C++ compiler supports the C++11 standard.
+
+### 2.2 Compile and generate library files on Linux
+
+Run the following command to install gcc and g++.
+
+```
+sudo apt-get update
+sudo apt-get upgrade -y
+sudo apt-get install build-essential gcc g++
+```
+
+Check the gcc version with the command `gcc -v`.
+
+Then you can run `make` directly. If you wish to regenerate `index.so`, first run `make clean` to clear the cache, and then run `make` to generate the updated library file.
+
+### 2.3 Compile and generate library files on Windows
+
+You need to install the gcc compiler first; we recommend [TDM-GCC](https://jmeubank.github.io/tdm-gcc/articles/2020-03/9.2.0-release), where you can choose the appropriate version on the official website. We recommend downloading [tdm64-gcc-10.3.0-2.exe](https://github.com/jmeubank/tdm-gcc/releases/download/v10.3.0-tdm64-2/tdm64-gcc-10.3.0-2.exe).
+
+After downloading, follow the default installation steps. There are 3 points to note here:
+
+1. The vector search module depends on OpenMP, so you need to check the `openmp` installation option at the `choose components` step; otherwise it will report the error `libgomp.spec: No such file or directory` ([reference link](https://github.com/dmlc/xgboost/issues/1027)).
+2. When asked whether to add it to the system environment variables, it is recommended to do so; otherwise you will need to add the environment variables manually later.
+3. The compile command is `make` on Linux and `mingw32-make` on Windows, so distinguish between them.
+
+After installation, you can open a command line terminal and check the gcc version with the command `gcc -v`.
+
+Run the command `mingw32-make` to generate the `index.dll` library file under the folder (deploy/vector_search). If you want to regenerate the `index.dll` file, you can first use `mingw32-make clean` to clear the cache, and then use `mingw32-make` to generate the updated library file.
+
+### 2.4 Compile and generate library files on MacOS
+
+Run the following command to install gcc and g++:
+
+```
+brew install gcc
+```
+
+#### Caution:
+
+1. If prompted with `Error: Running Homebrew as root is extremely dangerous and no longer supported... `, refer to this [link](https://jingyan.baidu.com/article/e52e3615057a2840c60c519c.html)
+2.  If prompted with `Error: Failure while executing; tar --extract --no-same-owner --file... `, refer to this [link](https://blog.csdn.net/Dawn510/article/details/117787358).
+
+After installation, the compiled executables are copied to /usr/local/bin; check the gcc versions in this folder:
+
+```
+ls /usr/local/bin/gcc*
+```
+
+Here the local gcc version is gcc-11, so the compile command is as follows (if the local gcc version is gcc-9, the corresponding command is `CXX=g++-9 make`):
+
+```
+CXX=g++-11 make
+```
+
+## 3. Quick use
+
+```python
+import numpy as np
+from interface import Graph_Index
+
+# Random sample generation
+index_vectors = np.random.rand(100000,128).astype(np.float32)
+query_vector = np.random.rand(128).astype(np.float32)
+index_docs = ["ID_"+str(i) for i in range(100000)]
+
+# Initialize index structure
+indexer = Graph_Index(dist_type="IP")  # supports "IP" and "L2"
+indexer.build(gallery_vectors=index_vectors, gallery_docs=index_docs, pq_size=100, index_path='test')
+
+# Query
+scores, docs = indexer.search(query=query_vector, return_k=10, search_budget=100)
+print(scores)
+print(docs)
+
+# Save and load
+indexer.dump(index_path="test")
+indexer.load(index_path="test")
+```
diff --git a/docs/en/extension/VisualDL_en.md b/docs/en/extension/VisualDL_en.md
index cdd99581a0e3d19752f66361e15f9b696e57024a..9ffd03e92f19a617d7412894073957490dd7b799 100644
--- a/docs/en/extension/VisualDL_en.md
+++ b/docs/en/extension/VisualDL_en.md
@@ -7,29 +7,30 @@ VisualDL, a visualization analysis tool of PaddlePaddle, provides a variety of c
 Now PaddleClas supports using VisualDL to visualize the changes of learning rate, loss, and accuracy during training.
 
 ### Set config and start training
-You only need to set the `vdl_dir` field in train config:
+You only need to set the field `Global.use_visualdl` to `True` in the training config:
 
 ```yaml
 # config.yaml
-vdl_dir: "./vdl.log"
+Global:
+...
+  use_visualdl: True
+...
 ```
 
-`vdl_dir`: Specify the directory where VisualDL stores logs.
-
-Then normal start training:
+PaddleClas will save the VisualDL logs to the `vdl/` subdirectory under the output directory specified by `Global.output_dir`. Then you just need to start training normally:
 
 ```shell
 python3 tools/train.py -c config.yaml
 ```
 
 ### Start VisualDL
-After starting the training program, you can start the VisualDL service in the new terminal session:
+After starting the training program, you can start the VisualDL service in a new terminal session:
 
 ```shell
- visualdl --logdir ./vdl.log
+ visualdl --logdir ./output/vdl/
 ```
 
-In the above command, `--logdir` specify the logs directory. VisualDL will traverse and iterate to find the subdirectories of the specified directory to visualize all the experimental results. You can also use the following parameters to set the IP and port number of the VisualDL service:
+In the above command, `--logdir` specifies the directory of the VisualDL logs produced in training. VisualDL will traverse and iterate through the subdirectories of the specified directory to visualize all the experiment results. You can also use the following parameters to set the IP and port number of the VisualDL service:
 
 * `--host`: IP, default is 127.0.0.1
 * `--port`: port, default is 8040
diff --git a/docs/en/models/Twins.md b/docs/en/models/Twins.md
index 69e7054486cfc9fb22415c4438a9c02e9eae3a4a..ccd83e44a47c99ed3c95481c30a682068cb17ff6 100644
--- a/docs/en/models/Twins.md
+++ b/docs/en/models/Twins.md
@@ -3,9 +3,9 @@
 ## Overview
 The Twins network includes Twins-PCPVT and Twins-SVT, which focus on the meticulous design of the spatial attention mechanism, resulting in a simple but effective solution. Since the architecture only involves matrix multiplication, and current deep learning frameworks are highly optimized for matrix multiplication, the architecture is very efficient and easy to implement. Moreover, it achieves excellent performance in a variety of downstream vision tasks such as image classification, object detection, and semantic segmentation. [Paper](https://arxiv.org/abs/2104.13840).
 
-## Accuracy, FLOPS and Parameters
+## Accuracy, FLOPs and Parameters
 
-| Models        | Top1 | Top5 | Reference<br>top1 | Reference<br>top5 | FLOPS<br>(G) | Params<br>(M) |
+| Models        | Top1 | Top5 | Reference<br>top1 | Reference<br>top5 | FLOPs<br>(G) | Params<br>(M) |
 |:--:|:--:|:--:|:--:|:--:|:--:|:--:|
 | pcpvt_small   | 0.8082 | 0.9552 | 0.812 | - | 3.7 | 24.1   |
 | pcpvt_base    | 0.8242 | 0.9619 | 0.827 | - | 6.4 | 43.8   |
diff --git a/docs/en/tutorials/config_description_en.md b/docs/en/tutorials/config_description_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..d510df753038f8cc2e766ee1f7a12c05ba5ce677
--- /dev/null
+++ b/docs/en/tutorials/config_description_en.md
@@ -0,0 +1,232 @@
+# Configuration Instruction
+
+------
+
+## Introduction
+
+The parameters in the PaddleClas configuration files (`ppcls/configs/*.yaml`) are described here so you can customize or modify the hyperparameter configuration more quickly.
+
+## Details
+
+### 1. Classification model
+
+Here the configuration of `ResNet50_vd` on `ImageNet-1k` is used as an example to explain each parameter in detail. [Config path](https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml).
+
+#### 1.1 Global configuration
+
+| Parameter name     | Specific meaning                                         | Default value    | Optional value    |
+| ------------------ | -------------------------------------------------------- | ---------------- | ----------------- |
+| checkpoints        | Checkpoint model path for resuming training              | null             | str               |
+| pretrained_model   | Pretrained model path                                    | null             | str               |
+| output_dir         | Path for saving models                                   | "./output/"      | str               |
+| save_interval      | Interval, in epochs, at which the model is saved         | 1                | int               |
+| eval_during_train  | Whether to evaluate during training                      | True             | bool              |
+| eval_interval      | Interval, in epochs, at which evaluation is run          | 1                | int               |
+| epochs             | Total number of training epochs                          |                  | int               |
+| print_batch_step   | Interval, in mini-batches, at which logs are printed     | 10               | int               |
+| use_visualdl       | Whether to visualize the training process with VisualDL  | False            | bool              |
+| image_shape        | Image size                                               | [3, 224, 224]    | list, shape: (3,) |
+| save_inference_dir | Save path of the inference model                         | "./inference"    | str               |
+| eval_mode          | Mode of evaluation                                       | "classification" | "retrieval"       |
+
+**Note**: An http address of a pretrained model can also be filled in `pretrained_model`.
+
+#### 1.2 Architecture
+
+| Parameter name | Specific meaning  | Default value | Optional value        |
+| -------------- | ----------------- | ------------- | --------------------- |
+| name           | Model arch name   | ResNet50      | PaddleClas model arch |
+| class_num      | Number of classes | 1000          | int                   |
+| pretrained     | Pretrained model  | False         | bool, str             |
+
+**Note**: Here `pretrained` can be set to True or False, or to a path of pretrained weights. In addition, `pretrained` is ignored when `Global.pretrained_model` is also set to the corresponding path.
+
+#### 1.3 Loss function
+
+| Parameter name | Specific meaning                            | Default value | Optional value         |
+| -------------- | ------------------------------------------- | ------------- | ---------------------- |
+| CELoss         | Cross-entropy loss function                 | ——            | ——                     |
+| CELoss.weight  | The weight of CELoss in the whole Loss      | 1.0           | float                  |
+| CELoss.epsilon | The epsilon value of label_smooth in CELoss | 0.1           | float, between 0 and 1 |
+
+#### 1.4 Optimizer
+
+| Parameter name    | Specific meaning           | Default value | Optional value                                       |
+| ----------------- | -------------------------- | ------------- | ---------------------------------------------------- |
+| name              | Optimizer method name      | "Momentum"    | other optimizers such as "RmsProp"                   |
+| momentum          | Momentum value             | 0.9           | float                                                |
+| lr.name           | Learning rate decay method | "Cosine"      | other decay methods such as "Linear" and "Piecewise" |
+| lr.learning_rate  | Initial learning rate      | 0.1           | float                                                |
+| lr.warmup_epoch   | Warmup epochs              | 0             | int, e.g. 5                                          |
+| regularizer.name  | Regularization method name | "L2"          | ["L1", "L2"]                                         |
+| regularizer.coeff | Regularization coefficient | 0.00007       | float                                                |
+
+**Note**: The additional parameters may differ depending on `lr.name`; for example, when `lr.name=Piecewise`, the following parameters need to be added:
+
+```
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+```
+
+Refer to [learning_rate.py](https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/optimizer/learning_rate.py) for how to add methods and parameters.
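+
+As a reading aid, a small Python sketch (not the actual implementation in learning_rate.py) of how such a piecewise schedule maps an epoch to a learning rate:
+
+```python
+def piecewise_lr(epoch, decay_epochs=(30, 60, 90),
+                 values=(0.1, 0.01, 0.001, 0.0001)):
+    # return the value of the first interval whose boundary the epoch has not passed
+    for boundary, value in zip(decay_epochs, values):
+        if epoch < boundary:
+            return value
+    return values[-1]
+```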
+
+#### 1.5 Data reading module (DataLoader)
+
+##### 1.5.1 dataset
+
+| Parameter name      | Specific meaning                                     | Default value                       | Optional value                   |
+| ------------------- | ---------------------------------------------------- | ----------------------------------- | -------------------------------- |
+| name                | The name of the dataset class used to read the data  | ImageNetDataset                     | VeriWild and other Dataset types |
+| image_root          | The root path where the dataset is stored            | ./dataset/ILSVRC2012/               | str                              |
+| cls_label_path      | Data label list                                      | ./dataset/ILSVRC2012/train_list.txt | str                              |
+| transform_ops       | Data preprocessing for single images                 | ——                                  | ——                               |
+| batch_transform_ops | Data preprocessing for batches of images             | ——                                  | ——                               |
+
+The meanings of the transform_ops parameters:
+
+| Function name  | Parameter name | Specific meaning          |
+| -------------- | -------------- | ------------------------- |
+| DecodeImage    | to_rgb         | Decode data to RGB        |
+|                | channel_first  | Arrange image data as CHW |
+| RandCropImage  | size           | Random crop               |
+| RandFlipImage  |                | Random flip               |
+| NormalizeImage | scale          | Normalize scale value     |
+|                | mean           | Normalize mean value      |
+|                | std            | Normalize std value       |
+|                | order          | Normalize order           |
+| CropImage      | size           | Crop size                 |
+| ResizeImage    | resize_short   | Resize by the short edge  |
+
+The meanings of the batch_transform_ops parameters:
+
+| Function name | Parameter name | Specific meaning                                                            |
+| ------------- | -------------- | --------------------------------------------------------------------------- |
+| MixupOperator | alpha          | Mixup parameter value; the larger the value, the stronger the augmentation  |
+
+##### 1.5.2 sampler
+
+| Parameter name | Specific meaning                                                  | Default value           | Optional value                                      |
+| -------------- | ----------------------------------------------------------------- | ----------------------- | --------------------------------------------------- |
+| name           | Sampler type                                                      | DistributedBatchSampler | DistributedRandomIdentitySampler and other samplers |
+| batch_size     | Batch size                                                        | 64                      | int                                                 |
+| drop_last      | Whether to drop the last batch that does not reach the batch size | False                   | bool                                                |
+| shuffle        | Whether to shuffle the data                                       | True                    | bool                                                |
+
+##### 1.5.3 loader
+
+| Parameter name    | Specific meaning             | Default value   | Optional value   |
+| ----------------- | ---------------------------- | --------------- | ---------------- |
+| num_workers       | Number of data read threads  | 4               | int              |
+| use_shared_memory | Whether to use shared memory | True            | bool             |
+
+#### 1.6 Evaluation metric
+
+| Parameter name | Specific meaning | Default value   | Optional value   |
+| -------------- | ---------------- | --------------- | ---------------- |
+| TopkAcc        | TopkAcc          | [1, 5]          | list, int        |
+
+#### 1.7 Inference
+
+| Parameter name                | Specific meaning                  | Default value                         | Optional value   |
+| ----------------------------- | --------------------------------- | ------------------------------------- | ---------------- |
+| infer_imgs                    | Image address to be inferred      | docs/images/whl/demo.jpg              | str              |
+| batch_size                    | batch size                        | 10                                    | int              |
+| PostProcess.name              | Post-process name                 | Topk                                  | str              |
+| PostProcess.topk              | topk value                        | 5                                     | int              |
+| PostProcess.class_id_map_file | mapping file of class id and name | ppcls/utils/imagenet1k_label_list.txt | str              |
+
+**Note**: For the interpretation of `transforms` in the Infer module, refer to the interpretation of `transform_ops` in the dataset section of the data reading module.
+
+### 2. Distillation model
+
+**Note**: Here the configuration for distilling `MobileNetV3_small_x1_0` with `MobileNetV3_large_x1_0` on `ImageNet-1k` is used as an example to explain the meaning of each parameter in detail. [Config path](https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml). Only parameters that differ from the classification model are introduced here.
+
+#### 2.1 Architecture
+
+| Parameter name     | Specific meaning                                          | Default value          | Optional value                     |
+| ------------------ | --------------------------------------------------------- | ---------------------- | ---------------------------------- |
+| name               | model arch name                                           | DistillationModel      | ——                                 |
+| class_num          | category number                                           | 1000                   | int                                |
+| freeze_params_list | Whether to freeze the parameters of each corresponding model | [True, False]          | list                               |
+| models             | model list                                                | [Teacher, Student]     | list                               |
+| Teacher.name       | teacher model name                                        | MobileNetV3_large_x1_0 | PaddleClas model                   |
+| Teacher.pretrained | teacher model pre-trained weights                         | True                   | Boolean or pre-trained weight path |
+| Teacher.use_ssld   | whether teacher model pretrained weights are ssld weights | True                   | Boolean                            |
+| infer_model_name   | type of the model being inferred                          | Student                | Teacher                            |
+
+**Note**:
+
+1. A list is represented in yaml as follows:
+
+```
+  freeze_params_list:
+  - True
+  - False
+```
+
+2. The Student's parameters are similar and will not be repeated.
+
+#### 2.2 Loss function
+
+| Parameter name                      | Specific meaning                                                            | Default value          | Optional value |
+| ----------------------------------- | ---------------------------------------------------------------------------- | ---------------------- | -------------- |
+| DistillationCELoss                  | Distillation cross-entropy loss between models                               | ——                     | ——             |
+| DistillationCELoss.weight           | Loss weight                                                                  | 1.0                    | float          |
+| DistillationCELoss.model_name_pairs | The model pairs between which the loss is computed                           | ["Student", "Teacher"] | ——             |
+| DistillationGTCELoss                | Distillation cross-entropy loss between a model and the ground-truth label   | ——                     | ——             |
+| DistillationGTCELoss.weight         | Loss weight                                                                  | 1.0                    | float          |
+| DistillationGTCELoss.model_names    | Names of the models that compute cross-entropy with the ground-truth label   | ["Student"]            | ——             |
+
+#### 2.3 Evaluation metric
+
+| Parameter name                | Specific meaning    | Default value                | Optional value   |
+| ----------------------------- | ------------------- | ---------------------------- | ---------------- |
+| DistillationTopkAcc           | DistillationTopkAcc | including model_key and topk | ——               |
+| DistillationTopkAcc.model_key | the evaluated model | "Student"                    | "Teacher"        |
+| DistillationTopkAcc.topk      | Topk value          | [1, 5]                       | list, int        |
+
+**Note**: `DistillationTopkAcc` has the same meaning as `TopkAcc`, except that it is only used in distillation tasks.
+
+### 3. Recognition model
+
+**Note**: The training configuration of `ResNet50` on `LogoDet-3k` is used here as an example to explain the meaning of each parameter in detail. [Config path](https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/configs/Logo/ResNet50_ReID.yaml). Only parameters that differ from the classification model are presented here.
+
+#### 3.1 Architecture
+
+| Parameter name         | Specific meaning                                               | Default value               | Optional value                                               |
+| ---------------------- | --------------------------------------------------------------- | --------------------------- | ------------------------------------------------------------ |
+| name                   | Model arch                                                     | "RecModel"                  | ["RecModel"]                                                 |
+| infer_output_key       | Inference output value                                         | "feature"                   | ["feature", "logits"]                                        |
+| infer_add_softmax      | Whether to add softmax to the inference output                 | False                       | [True, False]                                                |
+| Backbone.name          | Backbone name                                                  | ResNet50_last_stage_stride1 | other backbones provided by PaddleClas                       |
+| Backbone.pretrained    | Backbone pretrained model                                      | True                        | Boolean value or pretrained model path                       |
+| BackboneStopLayer.name | The name of the output layer in the Backbone                   | True                        | the `full_name` of the feature output layer in the Backbone  |
+| Neck.name              | The name of the Neck part                                      | VehicleNeck                 | a dictionary structure to be passed in, with the specific input parameters of the Neck network layer |
+| Neck.in_channels       | Input dimension of the Neck part                               | 2048                        | must match the output size of BackboneStopLayer.name         |
+| Neck.out_channels      | Output dimension of the Neck part, i.e. the feature dimension  | 512                         | int                                                          |
+| Head.name              | The name of the network Head part                              | CircleMargin                | ArcMargin, etc.                                              |
+| Head.embedding_size    | Feature dimension                                              | 512                         | consistent with Neck.out_channels                            |
+| Head.class_num         | Number of classes                                              | 3000                        | int                                                          |
+| Head.margin            | Margin value in CircleMargin                                   | 0.35                        | float                                                        |
+| Head.scale             | Scale value in CircleMargin                                    | 64                          | int                                                          |
+
+**Note**:
+
+1. In PaddleClas, the `Neck` part is the connection between the Backbone and the embedding layer, and the `Head` part is the connection between the embedding layer and the classification layer.
+
+2. `BackboneStopLayer.name` can be obtained by visualizing the model; for visualization, refer to [Netron](https://github.com/lutzroeder/netron) or [visualdl](https://github.com/PaddlePaddle/VisualDL).
+
+3. Calling tools/export_model.py converts the model weights to an inference model, where the `infer_add_softmax` parameter controls whether a Softmax activation function is appended. The default in the code is True (in classification tasks, the final output layer is followed by a Softmax activation). In recognition tasks, the feature layer does not need an activation function, so it should be set to False here.
+
+
+
+
+#### 3.2 Evaluation metric
+
+| Parameter name | Specific meaning            | Default value   | Optional value   |
+| -------------- | --------------------------- | --------------- | ---------------- |
+| Recallk        | Recall rate                 | [1, 5]          | list, int        |
+| mAP            | Average retrieval precision | None            | None             |
diff --git a/docs/en/tutorials/config_en.md b/docs/en/tutorials/config_en.md
deleted file mode 100644
index 05663c7ec57848256d4255f0e7c17f0e92276c14..0000000000000000000000000000000000000000
--- a/docs/en/tutorials/config_en.md
+++ /dev/null
@@ -1,94 +0,0 @@
-# Configuration
-
----
-
-## Introduction
-
-This document introduces the configuration(filed in `config/*.yaml`) of PaddleClas.
-
-* Note: Some parameters do not appear in the yaml file (because they are not used for this file). During training or validation, you can use the command `-o` to update or add the specified parameters. For the example `-o checkpoints=./ckp_path/ppcls`, it means that the parameter `checkpoints` will be updated or added using the value `./ckp_path/ppcls`.
-
-### Basic
-
-| name | detail | default value | optional value |
-|:---:|:---:|:---:|:---:|
-| mode | mode | "train" | ["train"," valid"] |
-| checkpoints | checkpoint model path for resuming training process | "" | Str |
-| last_epoch | last epoch for the training,used with checkpoints | -1 | int |
-| pretrained_model | pretrained model path | "" | Str |
-| load_static_weights | whether the pretrained model is saved in static mode | False | bool |
-| model_save_dir | model stored path | "" | Str |
-| classes_num | class number | 1000 | int |
-| total_images | total images | 1281167 | int |
-| save_interval | save interval | 1 | int |
-| validate | whether to validate when training | TRUE | bool |
-| valid_interval | valid interval | 1 | int |
-| epochs | epoch |  | int |
-| topk | K value | 5 | int |
-| image_shape | image size | [3,224,224] | list, shape: (3,) |
-| use_mix | whether to use mixup | False | ['True', 'False'] |
-| ls_epsilon | label_smoothing epsilon value| 0 | float |
-| use_distillation | whether to use SSLD distillation training | False | bool |
-
-
-## ARCHITECTURE
-
-| name | detail | default value | optional value |
-|:---:|:---:|:---:|:---:|
-| name | model name | "ResNet50_vd" | one of 23 architectures |
-| params | model parameters | {} | extra dictionary for the model structure, parameters such as `padding_type` in EfficientNet can be set here |
-
-
-### LEARNING_RATE
-
-| name | detail | default value |Optional value |
-|:---:|:---:|:---:|:---:|
-| function | decay type | "Linear" | ["Linear", "Cosine", "Piecewise", "CosineWarmup"] |
-| params.lr | initial learning rate | 0.1 | float |
-| params.decay_epochs | milestone in piecewisedecay |  | list |
-| params.gamma | gamma in piecewisedecay | 0.1 | float |
-| params.warmup_epoch | warmup epoch | 5 | int |
-| parmas.steps | decay steps in lineardecay | 100 | int |
-| params.end_lr | end lr in lineardecay | 0 | float |
-
-### OPTIMIZER
-
-| name | detail | default value | optional value |
-|:---:|:---:|:---:|:---:|
-| function | optimizer name | "Momentum" | ["Momentum", "RmsProp"] |
-| params.momentum | momentum value | 0.9 | float |
-| regularizer.function | regularizer method name | "L2" | ["L1", "L2"] |
-| regularizer.factor | regularizer factor | 0.0001 | float |
-
-### reader
-
-| name | detail |
-|:---:|:---:|
-| batch_size | batch size |
-| num_workers | worker number |
-| file_list | train list path |
-| data_dir | train  dataset path |
-| shuffle_seed | seed |
-
-processing
-
-| function name | attribute name | detail |
-|:---:|:---:|:---:|
-| DecodeImage | to_rgb | decode to RGB |
-|  | to_np | to numpy |
-|  | channel_first | Channel first |
-| RandCropImage | size | random crop |
-| RandFlipImage | | random flip |
-| NormalizeImage | scale | normalize image |
-|  | mean | mean |
-|  | std | std |
-|  | order | order |
-| ToCHWImage |  | to CHW |
-| CropImage | size | crop size |
-| ResizeImage | resize_short | resize according to short size |
-
-mix preprocessing
-
-| name| detail|
-|:---:|:---:|
-| MixupOperator.alpha | alpha value in mixup|
diff --git a/docs/en/tutorials/getting_started_en.md b/docs/en/tutorials/getting_started_en.md
index 19142b0145d3d4c43a152120f9645838f2a93b04..1903a04ef883c8f13239abf47899199d866f0dcd 100644
--- a/docs/en/tutorials/getting_started_en.md
+++ b/docs/en/tutorials/getting_started_en.md
@@ -23,7 +23,7 @@ Among them, `-c` is used to specify the path of the configuration file, `-o` is
 `-o use_gpu=True` means to use GPU for training. If you want to use the CPU for training, you need to set `use_gpu` to `False`.
 
 
-Of course, you can also directly modify the configuration file to update the configuration. For specific configuration parameters, please refer to [Configuration Document](config_en.md).
+Of course, you can also directly modify the configuration file to update the configuration. For specific configuration parameters, please refer to [Configuration Document](config_description_en.md).
 
 * The output log examples are as follows:
     * If mixup or cutmix is used in training, top-1 and top-k (default by 5) will not be printed in the log:
diff --git a/docs/en/whl_en.md b/docs/en/whl_en.md
index 05248cb188730a0ef0c8a4cf5d867732676e8ccb..e791ef08cb22139f8aab771be0ecce4883cd151c 100644
--- a/docs/en/whl_en.md
+++ b/docs/en/whl_en.md
@@ -5,7 +5,7 @@
 * installing from pypi
 
 ```bash
-pip3 install paddleclas==2.2.0
+pip3 install paddleclas==2.2.1
 ```
 
 * build own whl package and install
diff --git a/docs/images/faq/momentum.jpeg b/docs/images/faq/momentum.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..cb3cb0b0a92f3862de6f0be01d8e45b6c8616eb2
Binary files /dev/null and b/docs/images/faq/momentum.jpeg differ
diff --git a/docs/images/product/aliproduct.png b/docs/images/product/aliproduct.png
index df2ea725223abd370b7f15ea8ea8abc5e3f4c6e4..b57f96ceecc9dfb3e1bf348a1ade6a5ba44f01f4 100644
Binary files a/docs/images/product/aliproduct.png and b/docs/images/product/aliproduct.png differ
diff --git a/docs/images/wx_group.png b/docs/images/wx_group.png
index 79e233fd3bb148561252ac22f3bb88031f2a4db1..60259dd4e0a11035b02266b3f67b35748518c5c0 100644
Binary files a/docs/images/wx_group.png and b/docs/images/wx_group.png differ
diff --git a/docs/zh_CN/application/mainbody_detection.md b/docs/zh_CN/application/mainbody_detection.md
index e3cba4b52cbba7e1ce352fc2a18f9fd8e38e8a86..46c7ff4be9034b783040598a94ebf18bc0aa95e4 100644
--- a/docs/zh_CN/application/mainbody_detection.md
+++ b/docs/zh_CN/application/mainbody_detection.md
@@ -167,4 +167,22 @@ python tools/export_model.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml
 
 For more model export tutorials, please refer to: [EXPORT_MODEL](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/deploy/EXPORT_MODEL.md)
 
-After exporting the model, in the mainbody detection and recognition task, you can change the detection model path to this inference model path to complete prediction. For a quick image recognition experience, refer to the [image recognition quick start tutorial](../tutorials/quick_start_recognition.md).
+Finally, the directory `inference/ppyolov2_r50vd_dcn_365e_coco` contains the files `inference.pdiparams`, `inference.pdiparams.info`, and `inference.pdmodel`, where `inference.pdiparams` is the saved inference model weight file and `inference.pdmodel` is the saved inference model structure file.
+
+
+After exporting the model, in the mainbody detection and recognition task, you can change the detection model path to this inference model path to complete prediction.
+
+Taking product recognition as an example, its config file is [inference_product.yaml](../../../deploy/configs/inference_product.yaml). Change its `Global.det_inference_model_dir` field to the exported mainbody detection inference model directory, then refer to the [image recognition quick start tutorial](../tutorials/quick_start_recognition.md) to complete the product detection and recognition process.
+
+
+### FAQ
+
+#### Q: Can other mainbody detection model architectures be used?
+
+* A: Yes, but the current detection preprocessing only fits the preprocessing of YOLO-series models, so we recommend training with YOLO-series models first. If you want to use other series such as Faster R-CNN, you need to modify the preprocessing logic according to PaddleDetection's data preprocessing. If you have needs or questions here, feel free to open an issue or give feedback in the user group.
+
+#### Q: Can the prediction scale of mainbody detection be changed?
+
+* A: Yes, but note 2 points:
+  * The mainbody detection model provided in PaddleClas was trained at a resolution of 640x640, so prediction also uses 640x640 by default; predicting at other resolutions will reduce the accuracy.
+  * When exporting the model, it is recommended to also change the export resolution, keeping the export and prediction resolutions consistent.
diff --git a/docs/zh_CN/extension/VisualDL.md b/docs/zh_CN/extension/VisualDL.md
index d360c5b152957f53c2fdf5f334a2fe6d8762f85c..3c0020063ac7e0ed58e03100fcdd3b82e2f0bac5 100644
--- a/docs/zh_CN/extension/VisualDL.md
+++ b/docs/zh_CN/extension/VisualDL.md
@@ -7,15 +7,17 @@ VisualDL is PaddlePaddle's visualization and analysis tool, presenting trends of training parameters in rich charts
 Now PaddleClas supports using VisualDL during training to view the changes of the learning rate, loss, and accuracy.
 
 ### Set the config file and start training
-To use VisualDL in PaddleClas, you only need to add the following field to the training config file:
+To use VisualDL in PaddleClas, you only need to set the field `Global.use_visualdl` to `True` in the training config file:
 
 ```yaml
 # config.yaml
-vdl_dir: "./vdl.log"
+Global:
+...
+  use_visualdl: True
+...
 ```
-`vdl_dir` specifies the directory where VisualDL saves its log information.
 
-Then just start training normally:
+PaddleClas will save the VisualDL logs in the `vdl/` subdirectory under the directory specified by the `Global.output_dir` field; then just start training normally:
 
 ```shell
 python3 tools/train.py -c config.yaml
@@ -25,10 +27,10 @@ python3 tools/train.py -c config.yaml
 After starting the training program, you can start the VisualDL service in a new terminal session:
 
 ```shell
- visualdl --logdir ./vdl.log
+ visualdl --logdir ./output/vdl/
  ```
 
-In the above command, the `--logdir` parameter specifies the log directory. VisualDL will traverse and iterate through the subdirectories of the specified directory to visualize all the experiment results. You can also use the following parameters to set the IP and port of the VisualDL service:
+In the above command, the `--logdir` parameter specifies the directory where the VisualDL logs are saved. VisualDL will traverse and iterate through the subdirectories of the specified directory to visualize all the experiment results. You can also use the following parameters to set the IP and port of the VisualDL service:
 * `--host`: set the IP, default 127.0.0.1
 * `--port`: set the port, default 8040
 
diff --git a/docs/zh_CN/faq_series/faq_2021_s2.md b/docs/zh_CN/faq_series/faq_2021_s2.md
index 07d74345736c84a02656dcd1bc20920587610c0e..4e2e0ab5cd1bf1cb029f25bed431e703a6adc6b3 100644
--- a/docs/zh_CN/faq_series/faq_2021_s2.md
+++ b/docs/zh_CN/faq_series/faq_2021_s2.md
@@ -1,101 +1,269 @@
-# Image Recognition FAQ Summary - 2021 Season 2
+# PaddleClas FAQ Summary - 2021 Season 2
 
+## Foreword
 
-## Contents
-* [Issue 1](#第1期)(2021.07.08)
-
-
-
-## Issue 1
-
-### Q1.1: The current mainbody detection model produces false detections in some scenarios?
-
-**A**: The current mainbody detection model was trained on public datasets such as COCO, Object365, RPC, and LogoDet. If the data to be detected differs greatly from common categories, e.g. industrial quality inspection, you need to fine-tune the current detection model on your own data.
+* We have collected and organized the frequently asked questions from issues and user groups since open-sourcing, and provide brief answers, aiming to offer a reference for users and to help everyone avoid some detours.
 
-### Q1.2: Building the index after adding images reports an `assert text_num >= 2` error?
+* There are many experts in image classification, recognition, and retrieval, and models and papers are updated rapidly. The answers in this document mainly rely on our limited project experience and inevitably have omissions. We welcome additions and corrections, with many thanks.
 
-**A**: Please make sure that the separator between the image path and the image name in data_file.txt is a single tab, not spaces.
+## Contents
+* [Recent updates](#近期更新)(2021.08.11)
+* [Featured](#精选)
+* [1. Theory](#1.理论篇)
+    * [1.1 PaddleClas basics](#1.1PaddleClas基础知识)
+    * [1.2 Backbone networks and pretrained model zoo](#1.2骨干网络和预训练模型库)
+    * [1.3 Image classification](#1.3图像分类)
+    * [1.4 General detection module](#1.4通用检测模块)
+    * [1.5 Image recognition module](#1.5图像识别模块)
+    * [1.6 Retrieval module](#1.6检索模块)
+* [2. Practice](#2.实战篇)
+    * [2.1 Common issues in training and evaluation](#2.1训练与评估共性问题)
+    * [2.2 Image classification](#2.2图像分类)
+    * [2.3 General detection module](#2.3通用检测模块)
+    * [2.4 Image recognition module](#2.4图像识别模块)
+    * [2.5 Retrieval module](#2.5检索模块)
+    * [2.6 Model inference and deployment](#2.6模型预测部署)
+
+
+## Recent updates
+
+#### Q2.6.2: After exporting the inference model for deployment, the accuracy is abnormal. Why?
+**A**: This problem is usually caused by the model weights not being loaded correctly during export. First check the export log for content like the following:
+```
+UserWarning: Skip loading for ***. *** is not found in the provided dict.
+```
+If it does, the model weights were not loaded successfully. Further check whether the `Global.pretrained_model` field in the config file is set to the correct path of the weight file. The weight file suffix is usually `pdparams`; note that the suffix should not be included when setting this path.
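+
+A minimal way to check weight loading by hand (a hedged sketch; the path is a placeholder and the network is a stand-in):
+
+```python
+import paddle
+from paddle.vision.models import resnet50  # stand-in network for illustration
+
+model = resnet50()
+state_dict = paddle.load("output/best_model.pdparams")  # placeholder path
+model.set_state_dict(state_dict)  # mismatched keys log "Skip loading for ..."
+```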
+
+#### Q2.1.4: In data preprocessing, how do I avoid cropping the input data? Or how do I set the crop size?
+**A**: The data preprocessing operators supported by PaddleClas can be found in `ppcls/data/preprocess/__init__.py`. All supported operators can be configured in the config file; the configured operator name must match the operator class name, and the parameters must match the constructor parameters of the corresponding class. If you do not want to crop the image, remove `CropImage` and `RandCropImage` and replace them with `ResizeImage`, whose parameters select different resize modes: the `size` parameter scales the image directly to a fixed size, while the `resize_short` parameter scales it while keeping the aspect ratio. To set the crop size, use the `size` parameter of the `CropImage` operator or the `size` parameter of the `RandCropImage` operator.
+
+#### Q1.1.3: What does the momentum parameter in the Momentum optimizer mean?
+**A**: The Momentum optimizer introduces the concept of "momentum" on top of the SGD optimizer. In the SGD optimizer, at time `t+1`, the update of the parameter `w` can be expressed as:
+```latex
+w_{t+1} = w_t - lr \cdot grad
+```
+where `lr` is the learning rate and `grad` is the gradient of the parameter `w` at that moment. After introducing momentum, the update of the parameter `w` can be expressed as:
+```latex
+v_{t+1} = m \cdot v_t + lr \cdot grad
+w_{t+1} = w_t - v_{t+1}
+```
+where `m` is the momentum, the weighting coefficient of the accumulated momentum, usually set to `0.9`. When it is less than `1`, earlier gradients have less influence on the current step. For example, with momentum `m = 0.9`, at time `t` the gradient from `t-5` is weighted by `0.9 ^ 5 = 0.59049`, while the gradient from `t-2` is weighted by `0.9 ^ 2 = 0.81`. Gradient information from "long ago" thus has little bearing on the current step, while "recent" history matters more, which matches intuition.
+
+
+![](../../images/faq/momentum.jpeg)
+
+*This figure is from `https://blog.csdn.net/tsyccnh/article/details/76270707`*
+
+By introducing the concept of momentum, the influence of historical updates is taken into account when updating parameters, which speeds up convergence and alleviates the loss oscillation problem of the `SGD` optimizer.
+
+#### Q1.1.4: Does PaddleClas have an implementation of the paper `Fixing the train-test resolution discrepancy`?
+**A**: Not yet. If needed, you can try modifying the code yourself. In short, the idea proposed by the paper is to fine-tune the final FC layer of an already-trained model with larger-resolution input. Concretely, first train the network on the dataset at a lower resolution; after training, set `stop_gradient=True` on the weights of all layers except the final FC layer, and then fine-tune the network with larger-resolution input.
+
+#### Q1.6.2: In the PaddleClas image recognition Eval config files, what exactly are the `Query` and `Gallery` configurations used for?
+**A**: Both `Query` and `Gallery` are dataset configurations: `Gallery` configures the gallery (base library) data, and `Query` configures the validation set. During Eval, the model first runs a forward pass on the `Gallery` data to compute feature vectors, which are used to build the gallery; then the model computes feature vectors for the `Query` validation data and calculates metrics such as recall against the gallery.
+
+#### Q2.1.5: After installing PaddlePaddle, errors occur and no module under paddle can be imported (import paddle.xxx). Why?
+**A**: First, you can test whether Paddle is installed correctly with the following code:
+```python
+import paddle
+paddle.utils.install_check.run_check()
+```
+If it is installed correctly, you will usually see the following message:
+```
+PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now.
+```
+If the installation failed, a message describing the problem will be shown instead.
+Also, if you have installed both the CPU and GPU versions of Paddle, the two versions conflict: uninstall both and then reinstall the version you need.
+
+#### Q2.1.6: When training with PaddleClas, how can I save only the best model, without the intermediate models?
+**A**: During training, PaddleClas saves/updates the following three kinds of models:
+1. The latest model (`latest.pdopt`, `latest.pdparams`, `latest.pdstates`), which can be used to resume training if it is interrupted unexpectedly;
+2. The best model (`best_model.pdopt`, `best_model.pdparams`, `best_model.pdstates`);
+3. Checkpoints at the end of each epoch (`epoch_xxx.pdopt`, `epoch_xxx.pdparams`, `epoch_xxx.pdstates`). The `Global.save_interval` field in the training config sets the saving interval of these checkpoints; setting it larger than the total number of epochs disables saving intermediate checkpoints.
+
+
+## Featured
+
+
+## 1. Theory
+
+
+### 1.1 PaddleClas basics
+
+#### Q1.1.1 What is the difference between PaddleClas and PaddleDetection?
+**A**: PaddleClas is an image recognition repo that integrates mainbody detection, image classification, and image retrieval to solve most image recognition problems; users can easily use PaddleClas to solve small-sample, multi-class image recognition problems. PaddleDetection provides object detection, keypoint detection, multi-object tracking, and other capabilities, making it convenient to locate points and regions of interest in images; it is widely used in projects such as industrial quality inspection, remote sensing image detection, and unmanned inspection.
 
-### Q1.3: The recognition module reports an `Illegal instruction` error during prediction?
+#### Q1.1.2 Is PaddleClas 2.2 fully compatible with PaddleClas 2.1?
+**A**: Compared with PaddleClas 2.1, PaddleClas 2.2 adds the metric learning module, the mainbody detection module, and the vector search module. It also provides 4 application examples: product recognition, vehicle recognition, logo recognition, and cartoon character recognition. Users can quickly build image recognition systems based on PaddleClas 2.2. In the image classification module, the two versions are used similarly; refer to the [image classification example](../tutorials/getting_started.md) for quick iteration and evaluation. For the new metric learning module, refer to the [metric learning example](../tutorials/getting_started_retrieval.md). Also note that the new version does not yet support fp16 or DALI training, nor multi-label training; support will be added soon.
 
-**A**: The compiled library file may be incompatible with your environment, causing the error. If so, it is recommended to recompile the library file following the [vector search tutorial](../../../deploy/vector_search/README.md).
+#### Q1.1.3: What does the momentum parameter in the Momentum optimizer mean?
+**A**: The Momentum optimizer introduces the concept of "momentum" on top of the SGD optimizer. In the SGD optimizer, at time `t+1`, the update of the parameter `w` can be expressed as:
+```latex
+w_{t+1} = w_t - lr \cdot grad
+```
+where `lr` is the learning rate and `grad` is the gradient of the parameter `w` at that moment. After introducing momentum, the update of the parameter `w` can be expressed as:
+```latex
+v_{t+1} = m \cdot v_t + lr \cdot grad
+w_{t+1} = w_t - v_{t+1}
+```
+where `m` is the momentum, the weighting coefficient of the accumulated momentum, usually set to `0.9`. When it is less than `1`, earlier gradients have less influence on the current step. For example, with momentum `m = 0.9`, at time `t` the gradient from `t-5` is weighted by `0.9 ^ 5 = 0.59049`, while the gradient from `t-2` is weighted by `0.9 ^ 2 = 0.81`. Gradient information from "long ago" thus has little bearing on the current step, while "recent" history matters more, which matches intuition.
 
-### Q1.4 Does mainbody detection output only one detection box each time?
+
+![](../../images/faq/momentum.jpeg)
+
+*This figure is from `https://blog.csdn.net/tsyccnh/article/details/76270707`*
 
-**A**: The number of outputs from mainbody detection can be configured in the config file: `Global.threshold` controls the detection threshold (boxes below it are discarded) and `Global.max_det_results` controls the maximum number of returned results; together they determine the number of output boxes.
+By introducing the concept of momentum, the influence of historical updates is taken into account when updating parameters, which speeds up convergence and alleviates the loss oscillation problem of the `SGD` optimizer.
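+
+A minimal NumPy sketch of the two update rules above (the values of `lr` and `m` are assumed for illustration):
+
+```python
+import numpy as np
+
+lr, m = 0.1, 0.9         # assumed learning rate and momentum
+w = np.zeros(3)          # parameters
+v = np.zeros_like(w)     # accumulated "velocity"
+for step in range(5):
+    grad = np.random.randn(3)  # stand-in for a real gradient
+    # vanilla SGD would be: w = w - lr * grad
+    v = m * v + lr * grad      # momentum accumulates past gradients
+    w = w - v
+```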
 
-### Q1.5 How was the data for training the mainbody detection model selected? Would switching to a smaller model hurt accuracy?
+#### Q1.1.4: Does PaddleClas have an implementation of the paper `Fixing the train-test resolution discrepancy`?
+**A**: Not yet. If needed, you can try modifying the code yourself. In short, the idea proposed by the paper is to fine-tune the final FC layer of an already-trained model with larger-resolution input. Concretely, first train the network on the dataset at a lower resolution; after training, set `stop_gradient=True` on the weights of all layers except the final FC layer, and then fine-tune the network with larger-resolution input.
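+
+A hedged sketch of this procedure in Paddle; the network and the parameter naming are assumptions for illustration (here the final FC parameters are assumed to be named `fc.*`):
+
+```python
+import paddle
+
+def freeze_all_but_fc(model: paddle.nn.Layer):
+    # freeze every parameter except those of the final FC layer,
+    # then fine-tune with larger-resolution input
+    for name, param in model.named_parameters():
+        if not name.startswith("fc"):  # assumption: final FC params are "fc.*"
+            param.stop_gradient = True
+```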
 
-**A**: The training data is a subset randomly sampled from public datasets such as COCO, Object365, RPC, and LogoDet. A smaller model may lose some accuracy; we will also try smaller detection models later. For more information on the mainbody detection model, refer to [mainbody detection](../application/mainbody_detection.md).
+
+### 1.2 Backbone networks and pretrained model zoo
 
-### Q1.6 How do I fine-tune the recognition model from a pretrained model?
+
+### 1.3 Image classification
 
-**A**: Fine-tuning the recognition model is similar to fine-tuning a classification model; the recognition model can load the product pretrained model. For the training process, refer to [recognition model training](../tutorials/getting_started_retrieval.md); we will keep refining this documentation.
+#### Q1.3.1: Does PaddleClas provide data augmentation for adjusting image brightness, contrast, saturation, hue, etc.?
+**A**: PaddleClas provides a variety of data augmentation methods, which fall into 3 categories:
+1. Image transformation: AutoAugment, RandAugment;
+2. Image cropping: CutOut, RandErasing, HideAndSeek, GridMask;
+3. Image mixing: Mixup, Cutmix.
 
-### Q1.7 What is the difference between PaddleClas and PaddleDetection?
+Among them, RandAugment provides a random combination of multiple augmentation methods, which can satisfy augmentation needs in brightness, contrast, saturation, hue, and more.
 
-**A**: PaddleClas is an image recognition repo that integrates mainbody detection, image classification, and image retrieval to solve most image recognition problems; users can easily use PaddleClas to solve small-sample, multi-class image recognition problems. PaddleDetection provides object detection, keypoint detection, multi-object tracking, and other capabilities, making it convenient to locate points and regions of interest in images; it is widely used in projects such as industrial quality inspection, remote sensing image detection, and unmanned inspection.
+
+### 1.4 General detection module
 
-### Q1.8 Is PaddleClas 2.2 fully compatible with PaddleClas 2.1?
+#### Q1.4.1 Does mainbody detection output only one detection box each time?
+**A**: The number of outputs from mainbody detection can be configured in the config file: `Global.threshold` controls the detection threshold (boxes below it are discarded) and `Global.max_det_results` controls the maximum number of returned results; together they determine the number of output boxes.
 
-**A**: Compared with PaddleClas 2.1, PaddleClas 2.2 adds the metric learning module, the mainbody detection module, and the vector search module. It also provides 4 application examples: product recognition, vehicle recognition, logo recognition, and cartoon character recognition. Users can quickly build image recognition systems based on PaddleClas 2.2. In the image classification module, the two versions are used similarly; refer to the [image classification example](../tutorials/getting_started.md) for quick iteration and evaluation. For the new metric learning module, refer to the [metric learning example](../tutorials/getting_started_retrieval.md). Also note that the new version does not yet support fp16 or DALI training, nor multi-label training; support will be added soon.
+#### Q1.4.2 How was the data for training the mainbody detection model selected? Would switching to a smaller model hurt accuracy?
+**A**: The training data is a subset randomly sampled from public datasets such as COCO, Object365, RPC, and LogoDet. A smaller model may lose some accuracy; we will also try smaller detection models later. For more information on the mainbody detection model, refer to [mainbody detection](../application/mainbody_detection.md).
 
-### Q1.9 When training metric learning, why can't every epoch run through all the mini-batches?
+#### Q1.4.3: The current mainbody detection model produces false detections in some scenarios?
+**A**: The current mainbody detection model was trained on public datasets such as COCO, Object365, RPC, and LogoDet. If the data to be detected differs greatly from common categories, e.g. industrial quality inspection, you need to fine-tune the current detection model on your own data.
 
-**A**: When training metric learning, the sampler used is DistributedRandomIdentitySampler, which does not sample all the images, so the data sampled in each epoch is not the full dataset. It is therefore normal that the displayed number of mini-batches cannot be completed. We will improve the printed information later to reduce confusion.
+
+### 1.5 Image recognition module
 
-### Q1.10 Why do some images produce no recognition result?
+#### Q1.5.1 When using `circle loss`, do I still need to add `triplet loss`?
+**A**: `circle loss` unifies the two forms of pair-based learning and classification learning. If the classification form is used, `triplet loss` can be added.
 
-**A**: In the config file (e.g. inference_product.yaml), `IndexProcess.score_thres` controls the minimum cosine similarity between the recognized image and the gallery images. When the cosine similarity is below this value, no result is printed. You can tune this value based on your own data.
+#### Q1.5.2 If the images to recognize are not in the four open-sourced domains, which recognition model should be used?
+**A**: The product recognition model is recommended: first, products cover a wide range, so the image to be recognized is more likely to be a product; second, the product recognition model was trained with data from 50,000 classes, giving better generalization and more robust features.
 
-### Q1.11 Why is the detection result of some images just the original image?
+#### Q1.5.3 Why use a 512-dimensional vector at the end rather than 1024 or another dimension?
+**A**: Smaller vectors speed up computation; in practice, 128 dimensions or even fewer may be used. Generally, 512 dimensions is already large enough to fully represent the features.
 
-**A**: The mainbody detection model returns detection boxes, but in fact, to make the subsequent recognition more accurate, the original image is also returned along with the boxes. Afterwards, the original image and the detected crops are ranked by their similarity to the gallery images, and the label of the most similar gallery image is taken as the label of the query image.
+
+### 1.6 Retrieval Module
 
-### Q1.12 When using `circle loss`, do I still need to add `triplet loss`?
+#### Q1.6.1 Does the Möbius vector retrieval algorithm currently used by PaddleClas support something like faiss's index.add()? And does every newly built graph require a train step? Is that train for speeding up retrieval or for building the similarity graph?
+**A**: The retrieval algorithm provided by Mobius is a graph-based approximate nearest neighbor search algorithm that currently supports two distance metrics: inner product and L2 distance. The index.add() function provided by faiss is not supported yet; to add content to the retrieval gallery, a new index has to be rebuilt from scratch. Each time an index is built, the retrieval algorithm internally performs a train-like step; unlike the train interface provided by faiss, we call it build, and its main purpose is to speed up retrieval.
 
-**A**: `circle loss` unifies the pair-wise learning and classification forms of metric learning; if you use the classification form, you can additionally add `triplet loss`.
+#### Q1.6.2: In the PaddleClas image recognition configuration file used for Eval, what exactly are the `Query` and `Gallery` configurations for?
+**A**: Both `Query` and `Gallery` are dataset configurations: `Gallery` configures the gallery (base library) data, and `Query` configures the validation set. During Eval, the model first runs a forward pass over the `Gallery` data to compute the feature vectors that build the gallery; it then computes feature vectors for the `Query` validation data and calculates metrics such as recall against the gallery.
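+As a rough sketch of the evaluation logic described above (plain NumPy, not the actual PaddleClas evaluation code), recall@1 could be computed as:
+```python
+import numpy as np
+
+def recall_at_1(query_feats, gallery_feats, query_labels, gallery_labels):
+    # L2-normalize so that the dot product equals cosine similarity
+    q = query_feats / np.linalg.norm(query_feats, axis=1, keepdims=True)
+    g = gallery_feats / np.linalg.norm(gallery_feats, axis=1, keepdims=True)
+    top1 = np.argmax(q @ g.T, axis=1)  # nearest gallery item for each query
+    return float(np.mean(gallery_labels[top1] == query_labels))
+```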
 
-### Q1.13 When launching a module via hub serving, how do I add parameters for that module?
+
+## 2. Practice
 
-**A**: Please refer to [hub serving parameters](../../../deploy/hubserving/clas/params.py).
+
+### 2.1 Common Issues in Training and Evaluation
 
-### Q1.14 Why does model training produce nan?
+#### Q2.1.1 Where is the `train_log` file of PaddleClas?
+**A**: `train.log` is stored in the directory where the weights are saved.
 
+#### Q2.1.2 Why does model training produce nan?
 **A**:
-
 1. Make sure the pretrained model is loaded correctly; the simplest way is to add the option `-o Arch.pretrained=True`;
-
 2. When fine-tuning the model, do not use a learning rate that is too large; a setting such as 0.001 works well.
 
-
-### Q1.15 In SSLD, is the large model pretrained on 5 million images to distill the small model, and the small model then fine-tuned by distillation on the 1 million images?
-
+#### Q2.1.3 Can every frame of a video be predicted, frame by frame?
+**A**: Yes, although PaddleClas does not currently support video input directly. You can try modifying the PaddleClas code, or extract the video frames to images in advance and then predict on them with PaddleClas, as in the sketch below.
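+A minimal sketch of the frame-by-frame approach, combining OpenCV with the `PaddleClas` whl-package class (the model name and video path below are placeholders):
+```python
+import cv2
+from paddleclas import PaddleClas
+
+clas = PaddleClas(model_name="ResNet50")  # placeholder model name
+cap = cv2.VideoCapture("input.mp4")       # placeholder video path
+while True:
+    ok, frame = cap.read()
+    if not ok:
+        break
+    rgb = frame[:, :, ::-1]  # OpenCV decodes to BGR; predict() expects RGB
+    for result in clas.predict(rgb):
+        print(result)
+cap.release()
+```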
+
+#### Q2.1.4: In data preprocessing, how do I avoid cropping the input data? And how do I set the crop size?
+**A**: The data preprocessing operators supported by PaddleClas can be found in `ppcls/data/preprocess/__init__.py`. All supported operators can be configured in the configuration file; the configured operator name must match the operator class name, and the parameters must match the constructor arguments of the corresponding operator class. To skip cropping, remove `CropImage` and `RandCropImage` and use `ResizeImage` instead; its parameters select the resize behavior: the `size` parameter scales the image directly to a fixed size, while the `resize_short` parameter scales it while keeping the aspect ratio. To set the crop size, use the `size` parameter of the `CropImage` operator or of the `RandCropImage` operator.
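+For example, a crop-free pipeline could look like the sketch below, which builds the operators in Python via `create_operators` (imported in `ppcls/engine/engine.py`); the parameter values are illustrative:
+```python
+from ppcls.data import create_operators
+
+# each list entry maps an operator class name to its constructor arguments,
+# mirroring the transform_ops lists in the yaml configs
+transform_config = [
+    {"DecodeImage": {"to_rgb": True, "channel_first": False}},
+    # fixed 224x224 resize, no cropping; use {"resize_short": 256}
+    # instead to scale while keeping the aspect ratio
+    {"ResizeImage": {"size": 224}},
+]
+ops = create_operators(transform_config)
+```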
+
+#### Q2.1.5: After installing PaddlePaddle, using it fails and no module under paddle can be imported (import paddle.xxx). Why?
+**A**: First, test whether Paddle is installed correctly with the following code:
+```python
+import paddle
+paddle.utils.install_check.run_check()
+```
+When it is installed correctly, you will usually see:
+```
+PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now.
+```
+If the installation failed, a message describing the problem is shown instead.
+Also note that if both the CPU and the GPU version of Paddle are installed, the two versions conflict: uninstall both and then reinstall the version you need.
+
+#### Q2.1.6: When training with PaddleClas, how do I save only the best model, without the intermediate ones?
+**A**: During training, PaddleClas saves/updates three kinds of models:
+1. the latest model (`latest.pdopt`, `latest.pdparams`, `latest.pdstates`), which can be used to resume training after an unexpected interruption;
+2. the best model (`best_model.pdopt`, `best_model.pdparams`, `best_model.pdstates`);
+3. checkpoints at the end of each epoch (`epoch_xxx.pdopt`, `epoch_xxx.pdparams`, `epoch_xxx.pdstates`). The `Global.save_interval` field in the training configuration is the saving interval of these checkpoints; set it larger than the total number of epochs and the intermediate checkpoints will no longer be saved.
+
+
+### 2.2 Image Classification
+
+#### Q2.2.1 In SSLD, is the large model pretrained on 5 million images to distill the small model, and the small model then fine-tuned by distillation on the 1 million images?
 **A**: The steps are as follows:
+1. Distill a `ResNet50-vd` model from Facebook's open-source `ResNeXt101-32x16d-wsl` model;
+2. Use this `ResNet50-vd` to distill `MobileNetV3` on the 5-million-image dataset;
+3. Since the distribution of the 5-million-image dataset is not exactly the same as that of the 1-million-image dataset, the model is fine-tuned again on the 1-million-image data, which brings a slight accuracy improvement.
 
-1. Distill a `ResNet50-vd` model from Facebook's open-source `ResNeXt101-32x16d-wsl` model;
+#### Q2.2.2 When training SwinTransformer, the loss becomes nan
+**A**: When training SwinTransformer, please use `Paddle` version `2.1.1` or later, load the pretrained model we provide, and avoid an overly large learning rate.
 
-2. Use this `ResNet50-vd` to distill `MobileNetV3` on the 5-million-image dataset;
+
+### 2.3 General Detection Module
 
-3. Since the distribution of the 5-million-image dataset is not exactly the same as that of the 1-million-image dataset, the model is fine-tuned again on the 1-million-image data, which brings a slight accuracy improvement.
+#### Q2.3.1 Why is the detection result for some images just the original image?
+**A**: The mainbody detection model returns detection boxes, but in fact, to make the subsequent recognition more accurate, the original image is also returned along with the boxes. Afterwards, the original image and the detected crops are ranked by their similarity to the gallery images, and the label of the most similar gallery image is taken as the label of the query image.
 
+#### Q2.3.2: In a live-streaming scenario, we need an on-the-fly recognition view that finds the target object within a few seconds of latency and draws a box around it. Is this achievable?
+**A**: Real-time detection requires the detection speed to meet real-time constraints. PP-YOLO is a lightweight object detection model provided by the Paddle team that strikes a good balance between detection speed and accuracy; you can try PP-YOLO for detection. For how to use PP-YOLO, refer to [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/configs/ppyolo/README_cn.md).
 
-### Q1.16 If the images to be recognized are not in the four open-source domains, which recognition model should be used?
+#### Q2.3.3: For unknown labels, adding them to the gallery dataset enables subsequent recognition without training; but if the upstream detection model cannot locate objects with those unknown labels, do we still need to retrain the detection model?
+**A**: If the detection model performs poorly on your own dataset, you need to fine-tune it on your own detection dataset.
 
-**A**: We recommend the product recognition model: first, products cover a wide range, so a query image is more likely to be a product; second, the product recognition model was trained on data covering 50,000 classes, so it generalizes better and its features are more robust.
+
+### 2.4 Image Recognition Module
 
-### Q1.17 The final vectors have 512 dimensions; why not 1024 or some other dimensionality?
+#### Q2.4.1: The recognition module reports an `Illegal instruction` error during prediction?
+**A**: The compiled library file may be incompatible with your environment. If this error occurs, we recommend recompiling the library following the [vector retrieval tutorial](../../../deploy/vector_search/README.md).
 
-**A**: Smaller vectors speed up computation; in practice, 128 or even fewer dimensions may be used. Generally speaking, 512 dimensions are large enough to represent the features adequately.
+#### Q2.4.2: How do I fine-tune a recognition model from a pretrained model?
+**A**: Fine-tuning a recognition model is similar to fine-tuning a classification model; the recognition model can load the pretrained product model, and the training process can follow [recognition model training](../tutorials/getting_started_retrieval.md). We will keep refining this part of the documentation.
 
-### Q1.18 When training SwinTransformer, the loss becomes nan
+#### Q2.4.3: When training metric learning, why can't every epoch run through all the mini-batches?
+**A**: When training metric learning, the sampler used is DistributedRandomIdentitySampler, which does not sample all the images, so each epoch only covers part of the data; not finishing the displayed number of mini-batches is therefore expected behavior. We will improve the printed information later to reduce the confusion it causes.
+
+#### Q2.4.4: Why do some images get no recognition result?
+**A**: In the configuration file (e.g. inference_product.yaml), `IndexProcess.score_thres` controls the minimum cosine similarity between the query image and the gallery images. When the cosine similarity is below this value, no result is printed. You can adjust this value according to your own data.
 
-**A**: To train SwinTransformer, you need to use paddle-dev; for installation, see [PaddlePaddle installation](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html). paddlepaddle-2.1 will also support it soon.
+
+### 2.5 Retrieval Module
 
-### Q1.19 Does adding new gallery data require rebuilding the index?
+#### Q2.5.1: After adding images, building the index fails with an `assert text_num >= 2` error?
+**A**: Please make sure that the separator between the image path and the image label in data_file.txt is a single tab, not spaces. A quick check is sketched below.
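+A quick, hypothetical helper for checking the separator (assuming one `path<tab>label` pair per line in data_file.txt):
+```python
+def check_separators(path="data_file.txt"):
+    # report lines whose separator is not exactly one tab
+    with open(path, encoding="utf-8") as f:
+        for i, line in enumerate(f, 1):
+            if line.strip() and line.count("\t") != 1:
+                print(f"line {i}: expected exactly one tab, got: {line!r}")
+```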
 
+#### Q2.5.2: Does adding new gallery data require rebuilding the index?
 **A**: In this version the index must be rebuilt; a future version will support building the index only for the newly added images.
 
-### Q1.20 Where is the `train_log` file of PaddleClas?
+#### Q2.5.3: On Mac, recompiling index.so fails with: clang: error: unsupported option '-fopenmp'. How do I handle this?
+**A**: This problem has been fixed. You can recompile index.so following the [documentation](../../../develop/deploy/vector_search/README.md).
 
-**A**: `train.log` is stored in the directory where the weights are saved.
+
+### 2.6 Model Inference and Deployment
+
+#### Q2.6.1: When launching a module via hub serving, how do I add parameters for that module?
+**A**: Please refer to [hub serving parameters](../../../deploy/hubserving/clas/params.py).
+
+#### Q2.6.2: After exporting the inference model for deployment, the accuracy is abnormal. Why?
+**A**: This is usually caused by the model parameters not being loaded correctly at export time. First, check the export log for messages such as:
+```
+UserWarning: Skip loading for ***. *** is not found in the provided dict.
+```
+If such messages appear, the model weights were not loaded successfully. Check the `Global.pretrained_model` field in the configuration file and make sure the path of the weight file is configured correctly. The weight file usually has the suffix `pdparams`; note that the suffix must not be included when configuring this path.
diff --git a/docs/zh_CN/models/Twins.md b/docs/zh_CN/models/Twins.md
index 424f3985df00216c048e026632c43f9e720f4542..143dc6fe7e199e34e3d91a1f0153a70ba96ca932 100644
--- a/docs/zh_CN/models/Twins.md
+++ b/docs/zh_CN/models/Twins.md
@@ -3,9 +3,9 @@
 ## Overview
 The Twins networks include Twins-PCPVT and Twins-SVT, which focus on a careful redesign of the spatial attention mechanism, resulting in a simple yet more effective scheme. Since the architecture only involves matrix multiplications, which current deep learning frameworks optimize heavily, it is highly efficient and easy to implement. Moreover, it achieves excellent performance across a variety of downstream vision tasks such as image classification, object detection and semantic segmentation. [Paper link](https://arxiv.org/abs/2104.13840).
 
-## Accuracy, FLOPS and Parameters
+## Accuracy, FLOPs and Parameters
 
-| Models        | Top1 | Top5 | Reference<br>top1 | Reference<br>top5 | FLOPS<br>(G) | Params<br>(M) |
+| Models        | Top1 | Top5 | Reference<br>top1 | Reference<br>top5 | FLOPs<br>(G) | Params<br>(M) |
 |:--:|:--:|:--:|:--:|:--:|:--:|:--:|
 | pcpvt_small   | 0.8082 | 0.9552 | 0.812 | - | 3.7 | 24.1   |
 | pcpvt_base    | 0.8242 | 0.9619 | 0.827 | - | 6.4 | 43.8   |
diff --git a/docs/zh_CN/tutorials/getting_started.md b/docs/zh_CN/tutorials/getting_started.md
index cacd483ff859d3d933216106903ea7ff99e82562..3708d3e82cff9334bc872499e7931ad6994b7889 100644
--- a/docs/zh_CN/tutorials/getting_started.md
+++ b/docs/zh_CN/tutorials/getting_started.md
@@ -30,7 +30,7 @@ python3 tools/train.py \
 
 Here, `-c` specifies the path of the configuration file and `-o` specifies the parameters to modify or add; `-o Arch.pretrained=False` means training without a pretrained model, and `-o Global.device=gpu` means training on GPU. To train on CPU instead, set `Global.device` to `cpu`.
 
-For more detailed training configuration, you can also directly modify the configuration file of the model. For the specific configuration parameters, refer to the [configuration document](config.md).
+For more detailed training configuration, you can also directly modify the configuration file of the model. For the specific configuration parameters, refer to the [configuration document](config_description.md).
 
 Running the above command produces output logs such as the following:
 
@@ -244,7 +244,7 @@ python3 python/predict_cls.py \
     -c configs/inference_cls.yaml \
     -o Global.infer_imgs=../dataset/flowers102/jpg/image_00001.jpg \
     -o Global.inference_model_dir=../inference/ \
-    -o PostProcess.class_id_map_file=None
+    -o PostProcess.Topk.class_id_map_file=None
 
 
 Where:
diff --git a/docs/zh_CN/tutorials/quick_start_professional.md b/docs/zh_CN/tutorials/quick_start_professional.md
index 769e4d443841df8dc9c0ff4877a171bb9eddf1b1..2ab5ea5da9ffbc65c99944e420fce9117cfcb5d9 100644
--- a/docs/zh_CN/tutorials/quick_start_professional.md
+++ b/docs/zh_CN/tutorials/quick_start_professional.md
@@ -128,7 +128,7 @@ python3 -m paddle.distributed.launch \
 PaddleClas includes the self-developed SSLD knowledge distillation scheme; see the [knowledge distillation chapter](../advanced_tutorials/distillation/distillation.md) for details. This section trains the MobileNetV3_large_x1_0 model using knowledge distillation, with the ResNet50_vd model trained in section `2.1.2` as the teacher model. First, save the ResNet50_vd model obtained in section `2.1.2` to the specified directory with the following script.
 
 ```shell
-mkdir pretrained 
+mkdir pretrained
 cp -r output_CIFAR/ResNet50_vd/best_model.pdparams  ./pretrained/
 ```
 
@@ -256,5 +256,5 @@ PreProcess:
 python3 python/predict_cls.py \
     -c configs/inference_cls.yaml \
     -o Global.infer_imgs=../dataset/CIFAR100/test/0/0001.png \
-    -o PostProcess.class_id_map_file=None
+    -o PostProcess.Topk.class_id_map_file=None
 ```
diff --git a/docs/zh_CN/whl.md b/docs/zh_CN/whl.md
index 58051c52db51b49c18cdc60c5ef541339a5fddeb..2a138c7fa1e9e575e61440308de74dba36abcc4d 100644
--- a/docs/zh_CN/whl.md
+++ b/docs/zh_CN/whl.md
@@ -5,7 +5,7 @@
 * Install via pip
 
 ```bash
-pip3 install paddleclas==2.2.0
+pip3 install paddleclas==2.2.1
 ```
 
 * Build and install locally
diff --git a/docs/zh_CN_tmp/.gitkeep b/docs/zh_CN_tmp/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/docs/zh_CN_tmp/advanced_tutorials/.gitkeep b/docs/zh_CN_tmp/advanced_tutorials/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/docs/zh_CN_tmp/algorithm_introduction/.gitkeep b/docs/zh_CN_tmp/algorithm_introduction/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/docs/zh_CN_tmp/data_preparation/.gitkeep b/docs/zh_CN_tmp/data_preparation/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/docs/zh_CN_tmp/faq_series/.gitkeep b/docs/zh_CN_tmp/faq_series/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/docs/zh_CN_tmp/image_recognition_pipeline/.gitkeep b/docs/zh_CN_tmp/image_recognition_pipeline/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/docs/zh_CN_tmp/inference_deployment/.gitkeep b/docs/zh_CN_tmp/inference_deployment/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/docs/zh_CN_tmp/installation/.gitkeep b/docs/zh_CN_tmp/installation/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/docs/zh_CN_tmp/introduction/.gitkeep b/docs/zh_CN_tmp/introduction/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/docs/zh_CN_tmp/models_training/.gitkeep b/docs/zh_CN_tmp/models_training/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/docs/zh_CN_tmp/quick_start/.gitkeep b/docs/zh_CN_tmp/quick_start/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/paddleclas.py b/paddleclas.py
index d0bd6b14a1386c908a77c25dcb93a4ff60a5efe6..91cd030abcb5fb24cece4ac508c7f0eb46e779fc 100644
--- a/paddleclas.py
+++ b/paddleclas.py
@@ -18,6 +18,7 @@ __dir__ = os.path.dirname(__file__)
 sys.path.append(os.path.join(__dir__, ""))
 sys.path.append(os.path.join(__dir__, "deploy"))
 
+from typing import Union, Generator
 import argparse
 import shutil
 import textwrap
@@ -356,7 +357,7 @@ def download_with_progressbar(url, save_path):
 
 
 def check_model_file(model_name):
-    """Check the model files exist and download and untar when no exist. 
+    """Check the model files exist and download and untar when no exist.
     """
     storage_directory = partial(os.path.join, BASE_INFERENCE_MODEL_DIR,
                                 model_name)
@@ -410,11 +411,11 @@ class PaddleClas(object):
         """Init PaddleClas with config.
 
         Args:
-            model_name: The model name supported by PaddleClas, default by None. If specified, override config.
-            inference_model_dir: The directory that contained model file and params file to be used, default by None. If specified, override config.
-            use_gpu: Whether use GPU, default by None. If specified, override config.
-            batch_size: The batch size to pridict, default by None. If specified, override config.
-            topk: Return the top k prediction results with the highest score.
+            model_name (str, optional): The model name supported by PaddleClas. If specified, override config. Defaults to None.
+            inference_model_dir (str, optional): The directory that contains the model file and params file to be used. If specified, override config. Defaults to None.
+            use_gpu (bool, optional): Whether to use GPU. If specified, override config. Defaults to True.
+            batch_size (int, optional): The batch size for prediction. If specified, override config. Defaults to 1.
+            topk (int, optional): Return the top k prediction results with the highest score. Defaults to 5.
         """
         super().__init__()
         self._config = init_config(model_name, inference_model_dir, use_gpu,
@@ -459,20 +460,26 @@ class PaddleClas(object):
             raise InputModelError(err)
         return
 
-    def predict(self, input_data, print_pred=False):
+    def predict(self, input_data: Union[str, np.array],
+                print_pred: bool=False) -> Generator[list, None, None]:
         """Predict input_data.
 
         Args:
-            input_data (str | NumPy.array): The path of image, or the directory containing images, or the URL of image from Internet.
-            print_pred (bool, optional): Whether print the prediction result. Defaults to False.
+            input_data (Union[str, np.array]):
+                When the type is str, it is the path of an image, a directory containing images, or the URL of an image on the Internet.
+                When the type is np.array, it is the image data whose channel order is RGB.
+            print_pred (bool, optional): Whether to print the prediction result. Defaults to False.
 
         Raises:
             ImageTypeError: Illegal input_data.
 
         Yields:
-            list: The prediction result(s) of input_data by batch_size. For every one image, prediction result(s) is zipped as a dict, that includs topk "class_ids", "scores" and "label_names". The format is as follow:
-            [{"class_ids": [...], "scores": [...], "label_names": [...]}, ...]
+            Generator[list, None, None]:
+                The prediction result(s) of input_data by batch_size. For each image,
+                the prediction result(s) is zipped as a dict that includes topk "class_ids", "scores" and "label_names".
+                The format is as follows: [{"class_ids": [...], "scores": [...], "label_names": [...]}, ...]
         """
+
         if isinstance(input_data, np.ndarray):
             outputs = self.cls_predictor.predict(input_data)
             yield self.cls_predictor.postprocess(outputs)
@@ -502,6 +509,7 @@ class PaddleClas(object):
                         f"Image file failed to read and has been skipped. The path: {img_path}"
                     )
                     continue
+                img = img[:, :, ::-1]
                 img_list.append(img)
                 img_path_list.append(img_path)
                 cnt += 1
diff --git a/ppcls/arch/backbone/base/theseus_layer.py b/ppcls/arch/backbone/base/theseus_layer.py
index 8ef7913f89a55586698ffca985dc3eb61ed4ff24..f328aed6f0a5109b9bb5e7e5ccc37438ff734d51 100644
--- a/ppcls/arch/backbone/base/theseus_layer.py
+++ b/ppcls/arch/backbone/base/theseus_layer.py
@@ -12,15 +12,9 @@ class Identity(nn.Layer):
 
 
 class TheseusLayer(nn.Layer):
-    def __init__(self, *args, return_patterns=None, **kwargs):
+    def __init__(self, *args, **kwargs):
         super(TheseusLayer, self).__init__()
-        self.res_dict = None
-        if return_patterns is not None:
-            self._update_res(return_patterns)
-
-    def forward(self, *input, res_dict=None, **kwargs):
-        if res_dict is not None:
-            self.res_dict = res_dict
+        self.res_dict = {}
 
     # stop doesn't work when stop layer has a parallel branch.
     def stop_after(self, stop_layer_name: str):
@@ -38,33 +32,43 @@ class TheseusLayer(nn.Layer):
                     stop_layer_name)
         return after_stop
 
-    def _update_res(self, return_layers):
+    def update_res(self, return_patterns):
+        if not return_patterns or isinstance(self, WrapLayer):
+            return
+        for layer_i in self._sub_layers:
+            layer_name = self._sub_layers[layer_i].full_name()
+            if isinstance(self._sub_layers[layer_i], (nn.Sequential, nn.LayerList)):
+                self._sub_layers[layer_i] = wrap_theseus(self._sub_layers[layer_i])
+                self._sub_layers[layer_i].res_dict = self.res_dict
+                self._sub_layers[layer_i].update_res(return_patterns)
+            else:
+                for return_pattern in return_patterns:
+                    if re.match(return_pattern, layer_name):
+                        if not isinstance(self._sub_layers[layer_i], TheseusLayer):
+                            self._sub_layers[layer_i] = wrap_theseus(self._sub_layers[layer_i])
+                        self._sub_layers[layer_i].register_forward_post_hook(
+                            self._sub_layers[layer_i]._save_sub_res_hook)
+                        self._sub_layers[layer_i].res_dict = self.res_dict
+                if isinstance(self._sub_layers[layer_i], TheseusLayer):
+                    self._sub_layers[layer_i].res_dict = self.res_dict
+                    self._sub_layers[layer_i].update_res(return_patterns)
+
+    def _save_sub_res_hook(self, layer, input, output):
+        self.res_dict[layer.full_name()] = output
+
+    def replace_sub(self, layer_name_pattern, replace_function, recursive=True):
         for layer_i in self._sub_layers:
             layer_name = self._sub_layers[layer_i].full_name()
-            for return_pattern in return_layers:
-                if return_layers is not None and re.match(return_pattern,
-                                                          layer_name):
-                    self._sub_layers[layer_i].register_forward_post_hook(
-                        self._save_sub_res_hook)
-
-    def replace_sub(self, layer_name_pattern, replace_function,
-                    recursive=True):
-        for k in self._sub_layers.keys():
-            layer_name = self._sub_layers[k].full_name()
             if re.match(layer_name_pattern, layer_name):
-                self._sub_layers[k] = replace_function(self._sub_layers[k])
+                self._sub_layers[layer_i] = replace_function(self._sub_layers[layer_i])
             if recursive:
-                if isinstance(self._sub_layers[k], TheseusLayer):
-                    self._sub_layers[k].replace_sub(
+                if isinstance(self._sub_layers[layer_i], TheseusLayer):
+                    self._sub_layers[layer_i].replace_sub(
                         layer_name_pattern, replace_function, recursive)
-                elif isinstance(self._sub_layers[k],
-                                nn.Sequential) or isinstance(
-                                    self._sub_layers[k], nn.LayerList):
-                    for kk in self._sub_layers[k]._sub_layers.keys():
-                        self._sub_layers[k]._sub_layers[kk].replace_sub(
+                elif isinstance(self._sub_layers[layer_i], (nn.Sequential, nn.LayerList)):
+                    for layer_j in self._sub_layers[layer_i]._sub_layers:
+                        self._sub_layers[layer_i]._sub_layers[layer_j].replace_sub(
                             layer_name_pattern, replace_function, recursive)
-                else:
-                    pass
 
     '''
     example of replace function:
@@ -78,3 +82,40 @@ class TheseusLayer(nn.Layer):
         return new_conv
 
         '''
+
+
+class WrapLayer(TheseusLayer):
+    def __init__(self, sub_layer):
+        super(WrapLayer, self).__init__()
+        self.sub_layer = sub_layer
+        self.name = sub_layer.full_name()
+
+    def full_name(self):
+        return self.name
+
+    def forward(self, *inputs, **kwargs):
+        return self.sub_layer(*inputs, **kwargs)
+
+    def update_res(self, return_patterns):
+        if not return_patterns or not isinstance(self.sub_layer, (nn.Sequential, nn.LayerList)):
+            return
+        for layer_i in self.sub_layer._sub_layers:
+            if isinstance(self.sub_layer._sub_layers[layer_i], (nn.Sequential, nn.LayerList)):
+                self.sub_layer._sub_layers[layer_i] = wrap_theseus(self.sub_layer._sub_layers[layer_i])
+                self.sub_layer._sub_layers[layer_i].res_dict = self.res_dict
+                self.sub_layer._sub_layers[layer_i].update_res(return_patterns)
+
+            layer_name = self.sub_layer._sub_layers[layer_i].full_name()
+            for return_pattern in return_patterns:
+                if re.match(return_pattern, layer_name):
+                    self.sub_layer._sub_layers[layer_i].res_dict = self.res_dict
+                    self.sub_layer._sub_layers[layer_i].register_forward_post_hook(
+                        self.sub_layer._sub_layers[layer_i]._save_sub_res_hook)
+
+            if isinstance(self.sub_layer._sub_layers[layer_i], TheseusLayer):
+                self.sub_layer._sub_layers[layer_i].update_res(return_patterns)
+
+
+def wrap_theseus(sub_layer):
+    wrapped_layer = WrapLayer(sub_layer)
+    return wrapped_layer
diff --git a/ppcls/arch/backbone/legendary_models/vgg.py b/ppcls/arch/backbone/legendary_models/vgg.py
index 7868b51eafce4f0bd383ad66199e50f2a05c1832..a45637d2a6a89bad081ca4452497e540872dfafe 100644
--- a/ppcls/arch/backbone/legendary_models/vgg.py
+++ b/ppcls/arch/backbone/legendary_models/vgg.py
@@ -111,7 +111,7 @@ class VGGNet(TheseusLayer):
         model: nn.Layer. Specific VGG model depends on args.
     """
 
-    def __init__(self, config, stop_grad_layers=0, class_num=1000):
+    def __init__(self, config, stop_grad_layers=0, class_num=1000, return_patterns=None):
         super().__init__()
 
         self.stop_grad_layers = stop_grad_layers
@@ -138,7 +138,7 @@ class VGGNet(TheseusLayer):
         self.fc2 = Linear(4096, 4096)
         self.fc3 = Linear(4096, class_num)
 
-    def forward(self, inputs):
+    def forward(self, inputs, res_dict=None):
         x = self.conv_block_1(inputs)
         x = self.conv_block_2(x)
         x = self.conv_block_3(x)
@@ -152,6 +152,9 @@ class VGGNet(TheseusLayer):
         x = self.relu(x)
         x = self.drop(x)
         x = self.fc3(x)
+        if self.res_dict and res_dict is not None:
+            for res_key in list(self.res_dict):
+                res_dict[res_key] = self.res_dict.pop(res_key)
         return x
 
 
diff --git a/ppcls/arch/backbone/model_zoo/gvt.py b/ppcls/arch/backbone/model_zoo/gvt.py
index eb5643b6389c372450f01bd6c645264db61de35c..3553073dad8f2110ddaca59a451230c447812bc9 100644
--- a/ppcls/arch/backbone/model_zoo/gvt.py
+++ b/ppcls/arch/backbone/model_zoo/gvt.py
@@ -82,11 +82,11 @@ class GroupAttention(nn.Layer):
             B, total_groups, self.ws**2, 3, self.num_heads, C // self.num_heads
         ]).transpose([3, 0, 1, 4, 2, 5])
         q, k, v = qkv[0], qkv[1], qkv[2]
-        attn = (q @ k.transpose([0, 1, 2, 4, 3])) * self.scale
+        attn = paddle.matmul(q, k.transpose([0, 1, 2, 4, 3])) * self.scale
 
         attn = nn.Softmax(axis=-1)(attn)
         attn = self.attn_drop(attn)
-        attn = (attn @ v).transpose([0, 1, 3, 2, 4]).reshape(
+        attn = paddle.matmul(attn, v).transpose([0, 1, 3, 2, 4]).reshape(
             [B, h_group, w_group, self.ws, self.ws, C])
 
         x = attn.transpose([0, 1, 3, 2, 4, 5]).reshape([B, N, C])
@@ -147,11 +147,11 @@ class Attention(nn.Layer):
                     [2, 0, 3, 1, 4])
         k, v = kv[0], kv[1]
 
-        attn = (q @ k.transpose([0, 1, 3, 2])) * self.scale
+        attn = paddle.matmul(q, k.transpose([0, 1, 3, 2])) * self.scale
         attn = nn.Softmax(axis=-1)(attn)
         attn = self.attn_drop(attn)
 
-        x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, C])
+        x = paddle.matmul(attn, v).transpose([0, 2, 1, 3]).reshape([B, N, C])
         x = self.proj(x)
         x = self.proj_drop(x)
         return x
@@ -350,7 +350,6 @@ class PyramidVisionTransformer(nn.Layer):
             shape=[1, 1, embed_dims[-1]],
             default_initializer=zeros_,
             attr=paddle.ParamAttr(regularizer=L2Decay(0.0)))
-        self.add_parameter("cls_token", self.cls_token)
 
         # classification head
         self.head = nn.Linear(embed_dims[-1],
diff --git a/ppcls/arch/backbone/model_zoo/resnext101_wsl.py b/ppcls/arch/backbone/model_zoo/resnext101_wsl.py
index 2b3c88b55f391e48b79971bcd11c1aed5f5e62f6..e85e13388ab13586f8dac6e8bd42fe68bfc1ca52 100644
--- a/ppcls/arch/backbone/model_zoo/resnext101_wsl.py
+++ b/ppcls/arch/backbone/model_zoo/resnext101_wsl.py
@@ -12,7 +12,7 @@ MODEL_URLS = {
     "ResNeXt101_32x8d_wsl":
     "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x8d_wsl_pretrained.pdparams",
     "ResNeXt101_32x16d_wsl":
-    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x816_wsl_pretrained.pdparams",
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x16_wsl_pretrained.pdparams",
     "ResNeXt101_32x32d_wsl":
     "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x32d_wsl_pretrained.pdparams",
     "ResNeXt101_32x48d_wsl":
@@ -460,17 +460,17 @@ def ResNeXt101_32x8d_wsl(pretrained=False, use_ssld=False, **kwargs):
     return model
 
 
-def ResNeXt101_32x16d_wsl(**args):
+def ResNeXt101_32x16d_wsl(pretrained=False, use_ssld=False, **kwargs):
     model = ResNeXt101WSL(cardinality=32, width=16, **kwargs)
     _load_pretrained(
         pretrained,
         model,
-        MODEL_URLS["ResNeXt101_32x16d_ws"],
+        MODEL_URLS["ResNeXt101_32x16d_wsl"],
         use_ssld=use_ssld)
     return model
 
 
-def ResNeXt101_32x32d_wsl(**args):
+def ResNeXt101_32x32d_wsl(pretrained=False, use_ssld=False, **kwargs):
     model = ResNeXt101WSL(cardinality=32, width=32, **kwargs)
     _load_pretrained(
         pretrained,
@@ -480,7 +480,7 @@ def ResNeXt101_32x32d_wsl(**args):
     return model
 
 
-def ResNeXt101_32x48d_wsl(**args):
+def ResNeXt101_32x48d_wsl(pretrained=False, use_ssld=False, **kwargs):
     model = ResNeXt101WSL(cardinality=32, width=48, **kwargs)
     _load_pretrained(
         pretrained,
diff --git a/ppcls/arch/backbone/model_zoo/swin_transformer.py b/ppcls/arch/backbone/model_zoo/swin_transformer.py
index f4348fbaea601bdf8527dfbec9e66cba1dede579..8ce810c2f2779ab9f98afc38f720adf8e9b433c6 100644
--- a/ppcls/arch/backbone/model_zoo/swin_transformer.py
+++ b/ppcls/arch/backbone/model_zoo/swin_transformer.py
@@ -33,9 +33,9 @@ MODEL_URLS = {
     "SwinTransformer_base_patch4_window12_384":
     "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_base_patch4_window12_384_pretrained.pdparams",
     "SwinTransformer_large_patch4_window7_224":
-    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_large_patch4_window7_224_pretrained.pdparams",
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_large_patch4_window7_224_22kto1k_pretrained.pdparams",
     "SwinTransformer_large_patch4_window12_384":
-    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_large_patch4_window12_384_pretrained.pdparams",
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_large_patch4_window12_384_22kto1k_pretrained.pdparams",
 }
 
 __all__ = list(MODEL_URLS.keys())
diff --git a/ppcls/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml b/ppcls/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml
index ab50f8a2410034c7f53aa065f309e082f0a70c48..ecf64c0a1d63322de4f1fe6dca31ce504ef912a3 100644
--- a/ppcls/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml
+++ b/ppcls/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml
@@ -24,6 +24,7 @@ Loss:
   Train:
     - CELoss:
         weight: 1.0
+        epsilon: 0.1
   Eval:
     - CELoss:
         weight: 1.0
@@ -35,9 +36,10 @@ Optimizer:
   lr:
     name: Cosine
     learning_rate: 0.8
+    warmup_epoch: 5
   regularizer:
     name: 'L2'
-    coeff: 0.0004
+    coeff: 0.00004
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml b/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml
index d28748a915545cae954115c33a4e071900fe9d63..613aef80bde35f60c4832cb1763f069a90bdb938 100644
--- a/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml
+++ b/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml
@@ -24,6 +24,7 @@ Loss:
   Train:
     - CELoss:
         weight: 1.0
+        epsilon: 0.1
   Eval:
     - CELoss:
         weight: 1.0
@@ -35,9 +36,10 @@ Optimizer:
   lr:
     name: Cosine
     learning_rate: 0.8
+    warmup_epoch: 5
   regularizer:
     name: 'L2'
-    coeff: 0.0004
+    coeff: 0.00004
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml b/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml
index b6b03c781f8283577c0d6a78a950ed58ce893563..d1d40e0a700f24ff142c6318339e74c9a8f2aa98 100644
--- a/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml
+++ b/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml
@@ -24,6 +24,7 @@ Loss:
   Train:
     - CELoss:
         weight: 1.0
+        epsilon: 0.1
   Eval:
     - CELoss:
         weight: 1.0
@@ -35,9 +36,10 @@ Optimizer:
   lr:
     name: Cosine
     learning_rate: 0.8
+    warmup_epoch: 5
   regularizer:
     name: 'L2'
-    coeff: 0.0004
+    coeff: 0.00004
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml b/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml
index a39496d7b37fe0143c56cfce88ebe7dba5e48989..53d1d1dd27659da5be30f4e9b21ce8084c332f57 100644
--- a/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml
+++ b/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml
@@ -41,7 +41,7 @@ Optimizer:
     values: [0.1, 0.01, 0.001, 0.0001]
   regularizer:
     name: 'L2'
-    coeff: 0.0003
+    coeff: 0.00003
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml b/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml
index b4de5a9131c94acc6f671b75bcaa52d605fcc7ac..9fc0dd98910df880d95c5ef030ec7eae16deac5f 100644
--- a/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml
+++ b/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml
@@ -39,7 +39,7 @@ Optimizer:
     values: [0.1, 0.01, 0.001, 0.0001]
   regularizer:
     name: 'L2'
-    coeff: 0.0003
+    coeff: 0.00003
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml b/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml
index 1728d0a722b4cb782e9fe11ae3b6d82f4ac05684..ef70964095432d5dc97ce97835064599d85dca6a 100644
--- a/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml
+++ b/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml
@@ -39,7 +39,7 @@ Optimizer:
     values: [0.1, 0.01, 0.001, 0.0001]
   regularizer:
     name: 'L2'
-    coeff: 0.0003
+    coeff: 0.00003
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml b/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml
index ca80fb0b7213f558a0f959697231b6627ab42561..b9be283f39e37e32428c5b316db290fd57794a0d 100644
--- a/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml
+++ b/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml
@@ -39,7 +39,7 @@ Optimizer:
     values: [0.1, 0.01, 0.001, 0.0001]
   regularizer:
     name: 'L2'
-    coeff: 0.0003
+    coeff: 0.00003
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml b/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml
index 40fc2d1f4fb81a33ed6b66f29b6cf6d9bbdae6a3..fc0e61bb6006e48d6b5856b2db5c1ff286cdd70e 100644
--- a/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml
+++ b/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml
@@ -39,7 +39,7 @@ Optimizer:
     learning_rate: 0.045
   regularizer:
     name: 'L2'
-    coeff: 0.0004
+    coeff: 0.00004
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml b/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml
index a08cee8798daaea22ca1fe8b687287fc9e4d1365..0ebe45893a7fee0de1678b37121cf7ec8c2d9df0 100644
--- a/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml
+++ b/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml
@@ -37,7 +37,7 @@ Optimizer:
     learning_rate: 0.045
   regularizer:
     name: 'L2'
-    coeff: 0.0003
+    coeff: 0.00003
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml b/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml
index f541d24477b0f1cd3a05d4e88e2e7db6548cd9b5..c2023908f13b40ed8580703a6deb9ac6d9f59c82 100644
--- a/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml
+++ b/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml
@@ -37,7 +37,7 @@ Optimizer:
     learning_rate: 0.045
   regularizer:
     name: 'L2'
-    coeff: 0.0003
+    coeff: 0.00003
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml b/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml
index 1ade41b7e830a0a3deefc6b23001137d25b02219..e581d72f9d9da3ac966ddbfa793d014b721277fd 100644
--- a/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml
+++ b/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml
@@ -37,7 +37,7 @@ Optimizer:
     learning_rate: 0.045
   regularizer:
     name: 'L2'
-    coeff: 0.0004
+    coeff: 0.00004
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml b/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml
index 555f6a207dd77acf52a26e1d50688780850b6c82..2ac0d6992df480166b52b277116a8c98ab376fca 100644
--- a/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml
+++ b/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml
@@ -37,7 +37,7 @@ Optimizer:
     learning_rate: 0.045
   regularizer:
     name: 'L2'
-    coeff: 0.0004
+    coeff: 0.00004
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml b/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml
index 03fba8d32a28ceeea1800a743d21a97460f11429..ca4bf665039e8eaa5c6f54cf44dd71486b0df440 100644
--- a/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml
+++ b/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml
@@ -37,7 +37,7 @@ Optimizer:
     learning_rate: 0.045
   regularizer:
     name: 'L2'
-    coeff: 0.0004
+    coeff: 0.00004
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml b/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..792311a3a613bb2beb8d9fd206306bd331e633e4
--- /dev/null
+++ b/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml
@@ -0,0 +1,130 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+# model architecture
+Arch:
+  name: ResNeXt101_32x16d_wsl
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml b/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a943ebb47964e0dcc37d7900ce3316f657e06aa1
--- /dev/null
+++ b/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml
@@ -0,0 +1,130 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+# model architecture
+Arch:
+  name: ResNeXt101_32x32d_wsl
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml b/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7f8fa72b993ebbc2a3f791e7ffba54fa1d72e423
--- /dev/null
+++ b/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml
@@ -0,0 +1,130 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+# model architecture
+Arch:
+  name: ResNeXt101_32x48d_wsl
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml b/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3266c34676e416aa376ea326f3fc0d97ab297bec
--- /dev/null
+++ b/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml
@@ -0,0 +1,130 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+# model architecture
+Arch:
+  name: ResNeXt101_32x8d_wsl
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_swish.yaml b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_swish.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..79bb34edec97c1af7398cbd36afa3bc1ed6b5bc4
--- /dev/null
+++ b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_swish.yaml
@@ -0,0 +1,129 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 240
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+# model architecture
+Arch:
+  name: ShuffleNetV2_swish
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.5
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00004
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 256
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml
index ec2ecdbd46663fcafe7148d6fc9842459f92ff28..aa1cce189d4d9bc58166d42466f997750cbfc8c6 100644
--- a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml
+++ b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml
@@ -38,7 +38,7 @@ Optimizer:
     warmup_epoch: 5
   regularizer:
     name: 'L2'
-    coeff: 0.0003
+    coeff: 0.00003
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml
index cf9781a16c9fe9b3a2c529d4b5ce0617bc1d0057..3f0742e542eac487930b7713b5885a2fb75763c1 100644
--- a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml
+++ b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml
@@ -38,7 +38,7 @@ Optimizer:
     warmup_epoch: 5
   regularizer:
     name: 'L2'
-    coeff: 0.0003
+    coeff: 0.00003
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml
index dc7376af89ae166483a2c8033d8d7193296195ab..f14a24923998bc635e1e25b4a72475d4f893ea1b 100644
--- a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml
+++ b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml
@@ -38,7 +38,7 @@ Optimizer:
     warmup_epoch: 5
   regularizer:
     name: 'L2'
-    coeff: 0.0003
+    coeff: 0.00003
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml
index 8a7d41952b4dedec8f6376b791b8d8596e628f58..dd4820da45ab49302510ed834bce584613721e05 100644
--- a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml
+++ b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml
@@ -38,7 +38,7 @@ Optimizer:
     warmup_epoch: 5
   regularizer:
     name: 'L2'
-    coeff: 0.0004
+    coeff: 0.00004
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml
index a8cc24fa69b4ac5d76531be21264edcf70302009..9a05a59d60b6814299d338cf638be14f29ff5873 100644
--- a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml
+++ b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml
@@ -38,7 +38,7 @@ Optimizer:
     warmup_epoch: 5
   regularizer:
     name: 'L2'
-    coeff: 0.0004
+    coeff: 0.00004
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml
index d92132a8f5f6c852efa770bf9345cb3884129da8..c871ec7dcc7aa84158b9c20ab75057f499d8d0bb 100644
--- a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml
+++ b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml
@@ -38,7 +38,7 @@ Optimizer:
     warmup_epoch: 5
   regularizer:
     name: 'L2'
-    coeff: 0.0004
+    coeff: 0.00004
 
 
 # data loader for train and eval
diff --git a/ppcls/data/preprocess/ops/random_erasing.py b/ppcls/data/preprocess/ops/random_erasing.py
index d96ceda6e895520e0e4bd3036804276f864b769d..b395d5205bc7aab7aba7098f832e4c470638913d 100644
--- a/ppcls/data/preprocess/ops/random_erasing.py
+++ b/ppcls/data/preprocess/ops/random_erasing.py
@@ -42,9 +42,9 @@ class RandomErasing(object):
             h = int(round(math.sqrt(target_area * aspect_ratio)))
             w = int(round(math.sqrt(target_area / aspect_ratio)))
 
-            if w < img.shape[2] and h < img.shape[1]:
-                x1 = random.randint(0, img.shape[1] - h)
-                y1 = random.randint(0, img.shape[2] - w)
+            if w < img.shape[1] and h < img.shape[0]:
+                x1 = random.randint(0, img.shape[0] - h)
+                y1 = random.randint(0, img.shape[1] - w)
                 if img.shape[0] == 3:
                     img[x1:x1 + h, y1:y1 + w, 0] = self.mean[0]
                     img[x1:x1 + h, y1:y1 + w, 1] = self.mean[1]
diff --git a/ppcls/engine/engine.py b/ppcls/engine/engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad5c584f056b422fb113306a7cbebf2e56347a7b
--- /dev/null
+++ b/ppcls/engine/engine.py
@@ -0,0 +1,391 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import platform
+import paddle
+import paddle.distributed as dist
+from visualdl import LogWriter
+from paddle import nn
+
+from ppcls.utils.check import check_gpu
+from ppcls.utils.misc import AverageMeter
+from ppcls.utils import logger
+from ppcls.utils.logger import init_logger
+from ppcls.utils.config import print_config
+from ppcls.data import build_dataloader
+from ppcls.arch import build_model, RecModel, DistillationModel
+from ppcls.arch import apply_to_static
+from ppcls.loss import build_loss
+from ppcls.metric import build_metrics
+from ppcls.optimizer import build_optimizer
+from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
+from ppcls.utils.save_load import init_model
+from ppcls.utils import save_load
+
+from ppcls.data.utils.get_image_list import get_image_list
+from ppcls.data.postprocess import build_postprocess
+from ppcls.data import create_operators
+from ppcls.engine.train import train_epoch
+from ppcls.engine import evaluation
+from ppcls.arch.gears.identity_head import IdentityHead
+
+
+class Engine(object):
+    def __init__(self, config, mode="train"):
+        assert mode in ["train", "eval", "infer", "export"]
+        self.mode = mode
+        self.config = config
+        self.eval_mode = self.config["Global"].get("eval_mode",
+                                                   "classification")
+        # init logger
+        self.output_dir = self.config['Global']['output_dir']
+        log_file = os.path.join(self.output_dir, self.config["Arch"]["name"],
+                                f"{mode}.log")
+        init_logger(name='root', log_file=log_file)
+        print_config(config)
+
+        # init train_func and eval_func
+        assert self.eval_mode in ["classification", "retrieval"], logger.error(
+            "Invalid eval mode: {}".format(self.eval_mode))
+        self.train_epoch_func = train_epoch
+        self.eval_func = getattr(evaluation, self.eval_mode + "_eval")
+
+        self.use_dali = self.config['Global'].get("use_dali", False)
+
+        # for visualdl
+        self.vdl_writer = None
+        if self.config['Global']['use_visualdl'] and mode == "train":
+            vdl_writer_path = os.path.join(self.output_dir, "vdl")
+            if not os.path.exists(vdl_writer_path):
+                os.makedirs(vdl_writer_path)
+            self.vdl_writer = LogWriter(logdir=vdl_writer_path)
+
+        # set device
+        assert self.config["Global"]["device"] in ["cpu", "gpu", "xpu"]
+        self.device = paddle.set_device(self.config["Global"]["device"])
+        logger.info('train with paddle {} and device {}'.format(
+            paddle.__version__, self.device))
+
+        # AMP training
+        self.amp = "AMP" in self.config
+        if self.amp and self.config["AMP"] is not None:
+            self.scale_loss = self.config["AMP"].get("scale_loss", 1.0)
+            self.use_dynamic_loss_scaling = self.config["AMP"].get(
+                "use_dynamic_loss_scaling", False)
+        else:
+            self.scale_loss = 1.0
+            self.use_dynamic_loss_scaling = False
+        if self.amp:
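+            # performance-oriented framework flags commonly enabled with AMP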
+            AMP_RELATED_FLAGS_SETTING = {
+                'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
+                'FLAGS_max_inplace_grad_add': 8,
+            }
+            paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
+
+        # build dataloader
+        if self.mode == 'train':
+            self.train_dataloader = build_dataloader(
+                self.config["DataLoader"], "Train", self.device, self.use_dali)
+        if self.mode in ["train", "eval"]:
+            if self.eval_mode == "classification":
+                self.eval_dataloader = build_dataloader(
+                    self.config["DataLoader"], "Eval", self.device,
+                    self.use_dali)
+            elif self.eval_mode == "retrieval":
+                self.gallery_dataloader = build_dataloader(
+                    self.config["DataLoader"]["Eval"], "Gallery", self.device,
+                    self.use_dali)
+                self.query_dataloader = build_dataloader(
+                    self.config["DataLoader"]["Eval"], "Query", self.device,
+                    self.use_dali)
+
+        # build loss
+        if self.mode == "train":
+            loss_info = self.config["Loss"]["Train"]
+            self.train_loss_func = build_loss(loss_info)
+        if self.mode in ["train", "eval"]:
+            loss_config = self.config.get("Loss", None)
+            if loss_config is not None:
+                loss_config = loss_config.get("Eval")
+                if loss_config is not None:
+                    self.eval_loss_func = build_loss(loss_config)
+                else:
+                    self.eval_loss_func = None
+            else:
+                self.eval_loss_func = None
+
+        # build metric
+        self.train_metric_func = None
+        if self.mode == 'train':
+            metric_config = self.config.get("Metric")
+            if metric_config is not None:
+                metric_config = metric_config.get("Train")
+                if metric_config is not None:
+                    self.train_metric_func = build_metrics(metric_config)
+
+        if self.mode in ["train", "eval"]:
+            metric_config = self.config.get("Metric")
+            if self.eval_mode == "classification":
+                if metric_config is not None:
+                    metric_config = metric_config.get("Eval")
+                    if metric_config is not None:
+                        self.eval_metric_func = build_metrics(metric_config)
+            elif self.eval_mode == "retrieval":
+                if metric_config is None:
+                    metric_config = [{"name": "Recallk", "topk": (1, 5)}]
+                else:
+                    metric_config = metric_config["Eval"]
+                self.eval_metric_func = build_metrics(metric_config)
+        else:
+            self.eval_metric_func = None
+
+        # build model
+        self.model = build_model(self.config["Arch"])
+        # set @to_static for benchmark, skip this by default.
+        apply_to_static(self.config, self.model)
+        # load_pretrain
+        if self.config["Global"]["pretrained_model"] is not None:
+            if self.config["Global"]["pretrained_model"].startswith("http"):
+                load_dygraph_pretrain_from_url(
+                    self.model, self.config["Global"]["pretrained_model"])
+            else:
+                load_dygraph_pretrain(
+                    self.model, self.config["Global"]["pretrained_model"])
+
+        # for slim
+
+        # build optimizer
+        if self.mode == 'train':
+            self.optimizer, self.lr_sch = build_optimizer(
+                self.config["Optimizer"], self.config["Global"]["epochs"],
+                len(self.train_dataloader), self.model.parameters())
+
+        # for distributed
+        self.config["Global"][
+            "distributed"] = paddle.distributed.get_world_size() != 1
+        if self.config["Global"]["distributed"]:
+            dist.init_parallel_env()
+        if self.config["Global"]["distributed"]:
+            self.model = paddle.DataParallel(self.model)
+
+        # build postprocess for infer
+        if self.mode == 'infer':
+            self.preprocess_func = create_operators(self.config["Infer"][
+                "transforms"])
+            self.postprocess_func = build_postprocess(self.config["Infer"][
+                "PostProcess"])
+
+    def train(self):
+        assert self.mode == "train"
+        print_batch_step = self.config['Global']['print_batch_step']
+        save_interval = self.config["Global"]["save_interval"]
+        best_metric = {
+            "metric": 0.0,
+            "epoch": 0,
+        }
+        # key: metric name; val: AverageMeter tracking that metric
+        self.output_info = dict()
+        self.time_info = {
+            "batch_cost": AverageMeter(
+                "batch_cost", '.5f', postfix=" s,"),
+            "reader_cost": AverageMeter(
+                "reader_cost", ".5f", postfix=" s,"),
+        }
+        # global iter counter
+        self.global_step = 0
+
+        if self.config["Global"]["checkpoints"] is not None:
+            metric_info = init_model(self.config["Global"], self.model,
+                                     self.optimizer)
+            if metric_info is not None:
+                best_metric.update(metric_info)
+
+        # for amp training
+        if self.amp:
+            self.scaler = paddle.amp.GradScaler(
+                init_loss_scaling=self.scale_loss,
+                use_dynamic_loss_scaling=self.use_dynamic_loss_scaling)
+
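+        # drop the last batch on Windows, where the multi-process dataloader
+        # can mis-handle the final (possibly incomplete) batch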
+        self.max_iter = len(self.train_dataloader) - 1 if platform.system(
+        ) == "Windows" else len(self.train_dataloader)
+        for epoch_id in range(best_metric["epoch"] + 1,
+                              self.config["Global"]["epochs"] + 1):
+            acc = 0.0
+            # for one epoch train
+            self.train_epoch_func(self, epoch_id, print_batch_step)
+
+            if self.use_dali:
+                self.train_dataloader.reset()
+            metric_msg = ", ".join([
+                "{}: {:.5f}".format(key, self.output_info[key].avg)
+                for key in self.output_info
+            ])
+            logger.info("[Train][Epoch {}/{}][Avg]{}".format(
+                epoch_id, self.config["Global"]["epochs"], metric_msg))
+            self.output_info.clear()
+
+            # eval model and save model if possible
+            if self.config["Global"][
+                    "eval_during_train"] and epoch_id % self.config["Global"][
+                        "eval_interval"] == 0:
+                acc = self.eval(epoch_id)
+                if acc > best_metric["metric"]:
+                    best_metric["metric"] = acc
+                    best_metric["epoch"] = epoch_id
+                    save_load.save_model(
+                        self.model,
+                        self.optimizer,
+                        best_metric,
+                        self.output_dir,
+                        model_name=self.config["Arch"]["name"],
+                        prefix="best_model")
+                logger.info("[Eval][Epoch {}][best metric: {}]".format(
+                    epoch_id, best_metric["metric"]))
+                logger.scaler(
+                    name="eval_acc",
+                    value=acc,
+                    step=epoch_id,
+                    writer=self.vdl_writer)
+
+                self.model.train()
+
+            # save model
+            if epoch_id % save_interval == 0:
+                save_load.save_model(
+                    self.model,
+                    self.optimizer, {"metric": acc,
+                                     "epoch": epoch_id},
+                    self.output_dir,
+                    model_name=self.config["Arch"]["name"],
+                    prefix="epoch_{}".format(epoch_id))
+                # save the latest model
+                save_load.save_model(
+                    self.model,
+                    self.optimizer, {"metric": acc,
+                                     "epoch": epoch_id},
+                    self.output_dir,
+                    model_name=self.config["Arch"]["name"],
+                    prefix="latest")
+
+        if self.vdl_writer is not None:
+            self.vdl_writer.close()
+
+    @paddle.no_grad()
+    def eval(self, epoch_id=0):
+        assert self.mode in ["train", "eval"]
+        self.model.eval()
+        eval_result = self.eval_func(self, epoch_id)
+        self.model.train()
+        return eval_result
+
+    @paddle.no_grad()
+    def infer(self):
+        assert self.mode == "infer" and self.eval_mode == "classification"
+        total_trainer = paddle.distributed.get_world_size()
+        local_rank = paddle.distributed.get_rank()
+        image_list = get_image_list(self.config["Infer"]["infer_imgs"])
+        # shard the image list across ranks so each process infers a subset
+        image_list = image_list[local_rank::total_trainer]
+
+        batch_size = self.config["Infer"]["batch_size"]
+        self.model.eval()
+        batch_data = []
+        image_file_list = []
+        for idx, image_file in enumerate(image_list):
+            with open(image_file, 'rb') as f:
+                x = f.read()
+            for process in self.preprocess_func:
+                x = process(x)
+            batch_data.append(x)
+            image_file_list.append(image_file)
+            if len(batch_data) >= batch_size or idx == len(image_list) - 1:
+                batch_tensor = paddle.to_tensor(batch_data)
+                out = self.model(batch_tensor)
+                if isinstance(out, list):
+                    out = out[0]
+                result = self.postprocess_func(out, image_file_list)
+                print(result)
+                batch_data.clear()
+                image_file_list.clear()
+
+    def export(self):
+        assert self.mode == "export"
+        model = ExportModel(self.config["Arch"], self.model)
+        if self.config["Global"]["pretrained_model"] is not None:
+            load_dygraph_pretrain(model.base_model,
+                                  self.config["Global"]["pretrained_model"])
+
+        model.eval()
+
+        model = paddle.jit.to_static(
+            model,
+            input_spec=[
+                paddle.static.InputSpec(
+                    shape=[None] + self.config["Global"]["image_shape"],
+                    dtype='float32')
+            ])
+        paddle.jit.save(
+            model,
+            os.path.join(self.config["Global"]["save_inference_dir"],
+                         "inference"))
+
+
+class ExportModel(nn.Layer):
+    """
+    ExportModel: add softmax onto the model
+    """
+
+    def __init__(self, config, model):
+        super().__init__()
+        self.base_model = model
+
+        # we should choose a final model to export
+        if isinstance(self.base_model, DistillationModel):
+            self.infer_model_name = config["infer_model_name"]
+        else:
+            self.infer_model_name = None
+
+        self.infer_output_key = config.get("infer_output_key", None)
+        if self.infer_output_key == "features" and isinstance(self.base_model,
+                                                              RecModel):
+            self.base_model.head = IdentityHead()
+        if config.get("infer_add_softmax", True):
+            self.softmax = nn.Softmax(axis=-1)
+        else:
+            self.softmax = None
+
+    def eval(self):
+        self.training = False
+        for layer in self.sublayers():
+            layer.training = False
+            layer.eval()
+
+    def forward(self, x):
+        x = self.base_model(x)
+        if isinstance(x, list):
+            x = x[0]
+        if self.infer_model_name is not None:
+            x = x[self.infer_model_name]
+        if self.infer_output_key is not None:
+            x = x[self.infer_output_key]
+        if self.softmax is not None:
+            x = self.softmax(x)
+        return x
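+
+
+# Minimal usage sketch (assuming a config parsed with
+# ppcls.utils.config.get_config, as tools/train.py does):
+#
+#     from ppcls.utils import config as conf
+#     cfg = conf.get_config("ppcls/configs/ImageNet/ResNet/ResNet50.yaml")
+#     engine = Engine(cfg, mode="train")
+#     engine.train()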
diff --git a/ppcls/engine/evaluation/__init__.py b/ppcls/engine/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0cd778887bf6f0e7ce05c18b587e5b54bcf6b3f
--- /dev/null
+++ b/ppcls/engine/evaluation/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ppcls.engine.evaluation.classification import classification_eval
+from ppcls.engine.evaluation.retrieval import retrieval_eval
diff --git a/ppcls/engine/evaluation/classification.py b/ppcls/engine/evaluation/classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1ddc41863044e5b8bed01334d374134f3124387
--- /dev/null
+++ b/ppcls/engine/evaluation/classification.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import time
+import platform
+import paddle
+
+from ppcls.utils.misc import AverageMeter
+from ppcls.utils import logger
+
+
+def classification_eval(evaler, epoch_id=0):
+    output_info = dict()
+    time_info = {
+        "batch_cost": AverageMeter(
+            "batch_cost", '.5f', postfix=" s,"),
+        "reader_cost": AverageMeter(
+            "reader_cost", ".5f", postfix=" s,"),
+    }
+    print_batch_step = evaler.config["Global"]["print_batch_step"]
+
+    metric_key = None
+    tic = time.time()
+    eval_dataloader = evaler.eval_dataloader if evaler.use_dali else evaler.eval_dataloader(
+    )
+    max_iter = len(evaler.eval_dataloader) - 1 if platform.system(
+    ) == "Windows" else len(evaler.eval_dataloader)
+    for iter_id, batch in enumerate(eval_dataloader):
+        if iter_id >= max_iter:
+            break
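+        # reset timers after a few warmup iterations so the averaged costs
+        # reflect steady-state speed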
+        if iter_id == 5:
+            for key in time_info:
+                time_info[key].reset()
+        if evaler.use_dali:
+            batch = [
+                paddle.to_tensor(batch[0]['data']),
+                paddle.to_tensor(batch[0]['label'])
+            ]
+        time_info["reader_cost"].update(time.time() - tic)
+        batch_size = batch[0].shape[0]
+        batch[0] = paddle.to_tensor(batch[0]).astype("float32")
+        batch[1] = batch[1].reshape([-1, 1]).astype("int64")
+        # image input
+        out = evaler.model(batch[0])
+        # calc loss
+        if evaler.eval_loss_func is not None:
+            loss_dict = evaler.eval_loss_func(out, batch[1])
+            for key in loss_dict:
+                if key not in output_info:
+                    output_info[key] = AverageMeter(key, '7.5f')
+                output_info[key].update(loss_dict[key].numpy()[0], batch_size)
+        # calc metric
+        if evaler.eval_metric_func is not None:
+            metric_dict = evaler.eval_metric_func(out, batch[1])
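+            # with multiple GPUs, sum each metric across ranks and divide by
+            # the world size to report a global average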
+            if paddle.distributed.get_world_size() > 1:
+                for key in metric_dict:
+                    paddle.distributed.all_reduce(
+                        metric_dict[key], op=paddle.distributed.ReduceOp.SUM)
+                    metric_dict[key] = metric_dict[
+                        key] / paddle.distributed.get_world_size()
+            for key in metric_dict:
+                if metric_key is None:
+                    metric_key = key
+                if key not in output_info:
+                    output_info[key] = AverageMeter(key, '7.5f')
+
+                output_info[key].update(metric_dict[key].numpy()[0],
+                                        batch_size)
+
+        time_info["batch_cost"].update(time.time() - tic)
+
+        if iter_id % print_batch_step == 0:
+            time_msg = "s, ".join([
+                "{}: {:.5f}".format(key, time_info[key].avg)
+                for key in time_info
+            ])
+
+            ips_msg = "ips: {:.5f} images/sec".format(
+                batch_size / time_info["batch_cost"].avg)
+
+            metric_msg = ", ".join([
+                "{}: {:.5f}".format(key, output_info[key].val)
+                for key in output_info
+            ])
+            logger.info("[Eval][Epoch {}][Iter: {}/{}]{}, {}, {}".format(
+                epoch_id, iter_id,
+                len(evaler.eval_dataloader), metric_msg, time_msg, ips_msg))
+
+        tic = time.time()
+    if evaler.use_dali:
+        evaler.eval_dataloader.reset()
+    metric_msg = ", ".join([
+        "{}: {:.5f}".format(key, output_info[key].avg) for key in output_info
+    ])
+    logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg))
+
+    # do not try to track a best model when no metric is configured
+    if evaler.eval_metric_func is None:
+        return -1
+    # return 1st metric in the dict
+    return output_info[metric_key].avg
diff --git a/ppcls/engine/evaluation/retrieval.py b/ppcls/engine/evaluation/retrieval.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0da3ace25f36725711cdf6a95a966f1ff8107d9
--- /dev/null
+++ b/ppcls/engine/evaluation/retrieval.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import platform
+import paddle
+from ppcls.utils import logger
+
+
+def retrieval_eval(evaler, epoch_id=0):
+    evaler.model.eval()
+    # step1. extract gallery and query features
+    gallery_feas, gallery_img_id, gallery_unique_id = cal_feature(
+        evaler, name='gallery')
+    query_feas, query_img_id, query_query_id = cal_feature(
+        evaler, name='query')
+
+    # step2. do evaluation
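+    # split query features into blocks of sim_block_size so the
+    # query x gallery similarity matrix is computed piecewise instead of
+    # being materialized all at once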
+    sim_block_size = evaler.config["Global"].get("sim_block_size", 64)
+    sections = [sim_block_size] * (len(query_feas) // sim_block_size)
+    if len(query_feas) % sim_block_size:
+        sections.append(len(query_feas) % sim_block_size)
+    fea_blocks = paddle.split(query_feas, num_or_sections=sections)
+    if query_query_id is not None:
+        query_id_blocks = paddle.split(
+            query_query_id, num_or_sections=sections)
+    image_id_blocks = paddle.split(query_img_id, num_or_sections=sections)
+    metric_key = None
+
+    if evaler.eval_metric_func is None:
+        metric_dict = {metric_key: 0.}
+    else:
+        metric_dict = dict()
+        for block_idx, block_fea in enumerate(fea_blocks):
+            similarity_matrix = paddle.matmul(
+                block_fea, gallery_feas, transpose_y=True)
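+            # when unique ids exist, zero out gallery entries that share both
+            # the query's unique id and image id, so a query sample cannot
+            # match itself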
+            if query_query_id is not None:
+                query_id_block = query_id_blocks[block_idx]
+                query_id_mask = (query_id_block != gallery_unique_id.t())
+
+                image_id_block = image_id_blocks[block_idx]
+                image_id_mask = (image_id_block != gallery_img_id.t())
+
+                keep_mask = paddle.logical_or(query_id_mask, image_id_mask)
+                similarity_matrix = similarity_matrix * keep_mask.astype(
+                    "float32")
+            else:
+                keep_mask = None
+
+            metric_tmp = evaler.eval_metric_func(similarity_matrix,
+                                                 image_id_blocks[block_idx],
+                                                 gallery_img_id, keep_mask)
+
+            for key in metric_tmp:
+                if key not in metric_dict:
+                    metric_dict[key] = metric_tmp[key] * block_fea.shape[
+                        0] / len(query_feas)
+                else:
+                    metric_dict[key] += metric_tmp[key] * block_fea.shape[
+                        0] / len(query_feas)
+
+    metric_info_list = []
+    for key in metric_dict:
+        if metric_key is None:
+            metric_key = key
+        metric_info_list.append("{}: {:.5f}".format(key, metric_dict[key]))
+    metric_msg = ", ".join(metric_info_list)
+    logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg))
+
+    return metric_dict[metric_key]
+
+
+def cal_feature(evaler, name='gallery'):
+    all_feas = None
+    all_image_id = None
+    all_unique_id = None
+    has_unique_id = False
+
+    if name == 'gallery':
+        dataloader = evaler.gallery_dataloader
+    elif name == 'query':
+        dataloader = evaler.query_dataloader
+    else:
+        raise RuntimeError("Only support gallery or query dataset")
+
+    max_iter = len(dataloader) - 1 if platform.system() == "Windows" else len(
+        dataloader)
+    dataloader_tmp = dataloader if evaler.use_dali else dataloader()
+    for idx, batch in enumerate(dataloader_tmp):  # data loading dominates runtime here
+        if idx >= max_iter:
+            break
+        if idx % evaler.config["Global"]["print_batch_step"] == 0:
+            logger.info(
+                f"{name} feature calculation process: [{idx}/{len(dataloader)}]"
+            )
+        if evaler.use_dali:
+            batch = [
+                paddle.to_tensor(batch[0]['data']),
+                paddle.to_tensor(batch[0]['label'])
+            ]
+        batch = [paddle.to_tensor(x) for x in batch]
+        batch[1] = batch[1].reshape([-1, 1]).astype("int64")
+        if len(batch) == 3:
+            has_unique_id = True
+            batch[2] = batch[2].reshape([-1, 1]).astype("int64")
+        out = evaler.model(batch[0], batch[1])
+        batch_feas = out["features"]
+
+        # L2-normalize features so inner products equal cosine similarities
+        if evaler.config["Global"].get("feature_normalize", True):
+            feas_norm = paddle.sqrt(
+                paddle.sum(paddle.square(batch_feas), axis=1, keepdim=True))
+            batch_feas = paddle.divide(batch_feas, feas_norm)
+
+        if all_feas is None:
+            all_feas = batch_feas
+            if has_unique_id:
+                all_unique_id = batch[2]
+            all_image_id = batch[1]
+        else:
+            all_feas = paddle.concat([all_feas, batch_feas])
+            all_image_id = paddle.concat([all_image_id, batch[1]])
+            if has_unique_id:
+                all_unique_id = paddle.concat([all_unique_id, batch[2]])
+    if evaler.use_dali:
+        dataloader_tmp.reset()
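+    # in distributed evaluation, gather features and ids from every rank
+    # before metrics are computed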
+    if paddle.distributed.get_world_size() > 1:
+        feat_list = []
+        img_id_list = []
+        unique_id_list = []
+        paddle.distributed.all_gather(feat_list, all_feas)
+        paddle.distributed.all_gather(img_id_list, all_image_id)
+        all_feas = paddle.concat(feat_list, axis=0)
+        all_image_id = paddle.concat(img_id_list, axis=0)
+        if has_unique_id:
+            paddle.distributed.all_gather(unique_id_list, all_unique_id)
+            all_unique_id = paddle.concat(unique_id_list, axis=0)
+
+    logger.info("Build {} done, all feat shape: {}, begin to eval..".format(
+        name, all_feas.shape))
+    return all_feas, all_image_id, all_unique_id
diff --git a/ppcls/engine/slim/__init__.py b/ppcls/engine/slim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ppcls/engine/train/__init__.py b/ppcls/engine/train/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..800d3a41edfa4c3ba2ad8c9295d66c4acfe1ea5d
--- /dev/null
+++ b/ppcls/engine/train/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ppcls.engine.train.train import train_epoch
diff --git a/ppcls/engine/train/train.py b/ppcls/engine/train/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e36a063e48009fdb27a36ea7f93a94e81e6de48
--- /dev/null
+++ b/ppcls/engine/train/train.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function
+
+import time
+import paddle
+from ppcls.engine.train.utils import update_loss, update_metric, log_info
+
+
+def train_epoch(trainer, epoch_id, print_batch_step):
+    tic = time.time()
+
+    train_dataloader = trainer.train_dataloader if trainer.use_dali else trainer.train_dataloader(
+    )
+    for iter_id, batch in enumerate(train_dataloader):
+        if iter_id >= trainer.max_iter:
+            break
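+        # skip the first iterations when averaging time costs (reader/model
+        # warmup would otherwise skew the statistics)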
+        if iter_id == 5:
+            for key in trainer.time_info:
+                trainer.time_info[key].reset()
+        trainer.time_info["reader_cost"].update(time.time() - tic)
+        if trainer.use_dali:
+            batch = [
+                paddle.to_tensor(batch[0]['data']),
+                paddle.to_tensor(batch[0]['label'])
+            ]
+        batch_size = batch[0].shape[0]
+        batch[1] = batch[1].reshape([-1, 1]).astype("int64")
+
+        trainer.global_step += 1
+        # forward (under AMP autocast when enabled)
+        if trainer.amp:
+            with paddle.amp.auto_cast(custom_black_list={
+                    "flatten_contiguous_range", "greater_than"
+            }):
+                out = forward(trainer, batch)
+        else:
+            out = forward(trainer, batch)
+
+        # calc loss; pass every extra field when batch transforms
+        # (e.g. mixup/cutmix) produce more than one target
+        if trainer.config["DataLoader"]["Train"]["dataset"].get(
+                "batch_transform_ops", None):
+            loss_dict = trainer.train_loss_func(out, batch[1:])
+        else:
+            loss_dict = trainer.train_loss_func(out, batch[1])
+
+        # step opt and lr
+        if trainer.amp:
+            scaled = trainer.scaler.scale(loss_dict["loss"])
+            scaled.backward()
+            trainer.scaler.minimize(trainer.optimizer, scaled)
+        else:
+            loss_dict["loss"].backward()
+            trainer.optimizer.step()
+        trainer.optimizer.clear_grad()
+        trainer.lr_sch.step()
+
+        # below code just for logging
+        # update metric_for_logger
+        update_metric(trainer, out, batch, batch_size)
+        # update_loss_for_logger
+        update_loss(trainer, loss_dict, batch_size)
+        trainer.time_info["batch_cost"].update(time.time() - tic)
+        if iter_id % print_batch_step == 0:
+            log_info(trainer, batch_size, epoch_id, iter_id)
+        tic = time.time()
+
+
+def forward(trainer, batch):
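+    # classification models consume only the image; rec/retrieval models also
+    # take the label, which margin-based heads (e.g. ArcMargin) use in training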
+    if trainer.eval_mode == "classification":
+        return trainer.model(batch[0])
+    else:
+        return trainer.model(batch[0], batch[1])
diff --git a/ppcls/engine/train/utils.py b/ppcls/engine/train/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..92eb35d7549c72a7ef3f4907d867e50d237c3f75
--- /dev/null
+++ b/ppcls/engine/train/utils.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function
+
+import datetime
+from ppcls.utils import logger
+from ppcls.utils.misc import AverageMeter
+
+
+def update_metric(trainer, out, batch, batch_size):
+    # calc metric
+    if trainer.train_metric_func is not None:
+        metric_dict = trainer.train_metric_func(out, batch[-1])
+        for key in metric_dict:
+            if key not in trainer.output_info:
+                trainer.output_info[key] = AverageMeter(key, '7.5f')
+            trainer.output_info[key].update(metric_dict[key].numpy()[0],
+                                            batch_size)
+
+
+def update_loss(trainer, loss_dict, batch_size):
+    # update_output_info
+    for key in loss_dict:
+        if key not in trainer.output_info:
+            trainer.output_info[key] = AverageMeter(key, '7.5f')
+        trainer.output_info[key].update(loss_dict[key].numpy()[0], batch_size)
+
+
+def log_info(trainer, batch_size, epoch_id, iter_id):
+    lr_msg = "lr: {:.5f}".format(trainer.lr_sch.get_lr())
+    metric_msg = ", ".join([
+        "{}: {:.5f}".format(key, trainer.output_info[key].avg)
+        for key in trainer.output_info
+    ])
+    time_msg = "s, ".join([
+        "{}: {:.5f}".format(key, trainer.time_info[key].avg)
+        for key in trainer.time_info
+    ])
+
+    ips_msg = "ips: {:.5f} images/sec".format(
+        batch_size / trainer.time_info["batch_cost"].avg)
+    eta_sec = ((trainer.config["Global"]["epochs"] - epoch_id + 1
+                ) * len(trainer.train_dataloader) - iter_id
+               ) * trainer.time_info["batch_cost"].avg
+    eta_msg = "eta: {:s}".format(str(datetime.timedelta(seconds=int(eta_sec))))
+    logger.info("[Train][Epoch {}/{}][Iter: {}/{}]{}, {}, {}, {}, {}".format(
+        epoch_id, trainer.config["Global"]["epochs"], iter_id,
+        len(trainer.train_dataloader), lr_msg, metric_msg, time_msg, ips_msg,
+        eta_msg))
+
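+    # mirror the learning rate and per-metric averages to VisualDL when a
+    # writer is attached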
+    logger.scaler(
+        name="lr",
+        value=trainer.lr_sch.get_lr(),
+        step=trainer.global_step,
+        writer=trainer.vdl_writer)
+    for key in trainer.output_info:
+        logger.scaler(
+            name="train_{}".format(key),
+            value=trainer.output_info[key].avg,
+            step=trainer.global_step,
+            writer=trainer.vdl_writer)
diff --git a/ppcls/engine/trainer.py b/ppcls/engine/trainer.py
deleted file mode 100644
index 451531c1d1e6ca59e2addc1add752649e05f1e67..0000000000000000000000000000000000000000
--- a/ppcls/engine/trainer.py
+++ /dev/null
@@ -1,662 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import os
-import sys
-import numpy as np
-
-__dir__ = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))
-
-import time
-import platform
-import datetime
-import argparse
-import paddle
-import paddle.nn as nn
-import paddle.distributed as dist
-from visualdl import LogWriter
-
-from ppcls.utils.check import check_gpu
-from ppcls.utils.misc import AverageMeter
-from ppcls.utils import logger
-from ppcls.utils.logger import init_logger
-from ppcls.utils.config import print_config
-from ppcls.data import build_dataloader
-from ppcls.arch import build_model
-from ppcls.arch import apply_to_static
-from ppcls.loss import build_loss
-from ppcls.metric import build_metrics
-from ppcls.optimizer import build_optimizer
-from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
-from ppcls.utils.save_load import init_model
-from ppcls.utils import save_load
-
-from ppcls.data.utils.get_image_list import get_image_list
-from ppcls.data.postprocess import build_postprocess
-from ppcls.data import create_operators
-
-
-class Trainer(object):
-    def __init__(self, config, mode="train"):
-        self.mode = mode
-        self.config = config
-        self.output_dir = self.config['Global']['output_dir']
-
-        log_file = os.path.join(self.output_dir, self.config["Arch"]["name"],
-                                f"{mode}.log")
-        init_logger(name='root', log_file=log_file)
-        print_config(config)
-        # set device
-        assert self.config["Global"]["device"] in ["cpu", "gpu", "xpu"]
-        self.device = paddle.set_device(self.config["Global"]["device"])
-        # set dist
-        self.config["Global"][
-            "distributed"] = paddle.distributed.get_world_size() != 1
-        if self.config["Global"]["distributed"]:
-            dist.init_parallel_env()
-
-        if "Head" in self.config["Arch"]:
-            self.is_rec = True
-        else:
-            self.is_rec = False
-
-        self.model = build_model(self.config["Arch"])
-        # set @to_static for benchmark, skip this by default.
-        apply_to_static(self.config, self.model)
-
-        if self.config["Global"]["pretrained_model"] is not None:
-            if self.config["Global"]["pretrained_model"].startswith("http"):
-                load_dygraph_pretrain_from_url(
-                    self.model, self.config["Global"]["pretrained_model"])
-            else:
-                load_dygraph_pretrain(
-                    self.model, self.config["Global"]["pretrained_model"])
-
-        if self.config["Global"]["distributed"]:
-            self.model = paddle.DataParallel(self.model)
-
-        self.vdl_writer = None
-        if self.config['Global']['use_visualdl'] and mode == "train":
-            vdl_writer_path = os.path.join(self.output_dir, "vdl")
-            if not os.path.exists(vdl_writer_path):
-                os.makedirs(vdl_writer_path)
-            self.vdl_writer = LogWriter(logdir=vdl_writer_path)
-        logger.info('train with paddle {} and device {}'.format(
-            paddle.__version__, self.device))
-        # init members
-        self.train_dataloader = None
-        self.eval_dataloader = None
-        self.gallery_dataloader = None
-        self.query_dataloader = None
-        self.eval_mode = self.config["Global"].get("eval_mode",
-                                                   "classification")
-        self.amp = True if "AMP" in self.config else False
-        if self.amp and self.config["AMP"] is not None:
-            self.scale_loss = self.config["AMP"].get("scale_loss", 1.0)
-            self.use_dynamic_loss_scaling = self.config["AMP"].get(
-                "use_dynamic_loss_scaling", False)
-        else:
-            self.scale_loss = 1.0
-            self.use_dynamic_loss_scaling = False
-        if self.amp:
-            AMP_RELATED_FLAGS_SETTING = {
-                'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
-                'FLAGS_max_inplace_grad_add': 8,
-            }
-            paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
-        self.train_loss_func = None
-        self.eval_loss_func = None
-        self.train_metric_func = None
-        self.eval_metric_func = None
-        self.use_dali = self.config['Global'].get("use_dali", False)
-
-    def train(self):
-        # build train loss and metric info
-        if self.train_loss_func is None:
-            loss_info = self.config["Loss"]["Train"]
-            self.train_loss_func = build_loss(loss_info)
-        if self.train_metric_func is None:
-            metric_config = self.config.get("Metric")
-            if metric_config is not None:
-                metric_config = metric_config.get("Train")
-                if metric_config is not None:
-                    self.train_metric_func = build_metrics(metric_config)
-
-        if self.train_dataloader is None:
-            self.train_dataloader = build_dataloader(
-                self.config["DataLoader"], "Train", self.device, self.use_dali)
-
-        step_each_epoch = len(self.train_dataloader)
-
-        optimizer, lr_sch = build_optimizer(self.config["Optimizer"],
-                                            self.config["Global"]["epochs"],
-                                            step_each_epoch,
-                                            self.model.parameters())
-
-        print_batch_step = self.config['Global']['print_batch_step']
-        save_interval = self.config["Global"]["save_interval"]
-
-        best_metric = {
-            "metric": 0.0,
-            "epoch": 0,
-        }
-        # key:
-        # val: metrics list word
-        output_info = dict()
-        time_info = {
-            "batch_cost": AverageMeter(
-                "batch_cost", '.5f', postfix=" s,"),
-            "reader_cost": AverageMeter(
-                "reader_cost", ".5f", postfix=" s,"),
-        }
-        # global iter counter
-        global_step = 0
-
-        if self.config["Global"]["checkpoints"] is not None:
-            metric_info = init_model(self.config["Global"], self.model,
-                                     optimizer)
-            if metric_info is not None:
-                best_metric.update(metric_info)
-
-        # for amp training
-        if self.amp:
-            scaler = paddle.amp.GradScaler(
-                init_loss_scaling=self.scale_loss,
-                use_dynamic_loss_scaling=self.use_dynamic_loss_scaling)
-
-        tic = time.time()
-        max_iter = len(self.train_dataloader) - 1 if platform.system(
-        ) == "Windows" else len(self.train_dataloader)
-        for epoch_id in range(best_metric["epoch"] + 1,
-                              self.config["Global"]["epochs"] + 1):
-            acc = 0.0
-            train_dataloader = self.train_dataloader if self.use_dali else self.train_dataloader(
-            )
-            for iter_id, batch in enumerate(train_dataloader):
-                if iter_id >= max_iter:
-                    break
-                if iter_id == 5:
-                    for key in time_info:
-                        time_info[key].reset()
-                time_info["reader_cost"].update(time.time() - tic)
-                if self.use_dali:
-                    batch = [
-                        paddle.to_tensor(batch[0]['data']),
-                        paddle.to_tensor(batch[0]['label'])
-                    ]
-                batch_size = batch[0].shape[0]
-                batch[1] = batch[1].reshape([-1, 1]).astype("int64")
-
-                global_step += 1
-                # image input
-                if self.amp:
-                    with paddle.amp.auto_cast(custom_black_list={
-                            "flatten_contiguous_range", "greater_than"
-                    }):
-                        out = self.forward(batch)
-                        loss_dict = self.train_loss_func(out, batch[1])
-                else:
-                    out = self.forward(batch)
-
-                # calc loss
-                if self.config["DataLoader"]["Train"]["dataset"].get(
-                        "batch_transform_ops", None):
-                    loss_dict = self.train_loss_func(out, batch[1:])
-                else:
-                    loss_dict = self.train_loss_func(out, batch[1])
-
-                for key in loss_dict:
-                    if not key in output_info:
-                        output_info[key] = AverageMeter(key, '7.5f')
-                    output_info[key].update(loss_dict[key].numpy()[0],
-                                            batch_size)
-                # calc metric
-                if self.train_metric_func is not None:
-                    metric_dict = self.train_metric_func(out, batch[-1])
-                    for key in metric_dict:
-                        if not key in output_info:
-                            output_info[key] = AverageMeter(key, '7.5f')
-                        output_info[key].update(metric_dict[key].numpy()[0],
-                                                batch_size)
-
-                # step opt and lr
-                if self.amp:
-                    scaled = scaler.scale(loss_dict["loss"])
-                    scaled.backward()
-                    scaler.minimize(optimizer, scaled)
-                else:
-                    loss_dict["loss"].backward()
-                    optimizer.step()
-                optimizer.clear_grad()
-                lr_sch.step()
-
-                time_info["batch_cost"].update(time.time() - tic)
-
-                if iter_id % print_batch_step == 0:
-                    lr_msg = "lr: {:.5f}".format(lr_sch.get_lr())
-                    metric_msg = ", ".join([
-                        "{}: {:.5f}".format(key, output_info[key].avg)
-                        for key in output_info
-                    ])
-                    time_msg = "s, ".join([
-                        "{}: {:.5f}".format(key, time_info[key].avg)
-                        for key in time_info
-                    ])
-
-                    ips_msg = "ips: {:.5f} images/sec".format(
-                        batch_size / time_info["batch_cost"].avg)
-                    eta_sec = ((self.config["Global"]["epochs"] - epoch_id + 1
-                                ) * len(self.train_dataloader) - iter_id
-                               ) * time_info["batch_cost"].avg
-                    eta_msg = "eta: {:s}".format(
-                        str(datetime.timedelta(seconds=int(eta_sec))))
-                    logger.info(
-                        "[Train][Epoch {}/{}][Iter: {}/{}]{}, {}, {}, {}, {}".
-                        format(epoch_id, self.config["Global"][
-                            "epochs"], iter_id,
-                               len(self.train_dataloader), lr_msg, metric_msg,
-                               time_msg, ips_msg, eta_msg))
-
-                    logger.scaler(
-                        name="lr",
-                        value=lr_sch.get_lr(),
-                        step=global_step,
-                        writer=self.vdl_writer)
-                    for key in output_info:
-                        logger.scaler(
-                            name="train_{}".format(key),
-                            value=output_info[key].avg,
-                            step=global_step,
-                            writer=self.vdl_writer)
-                tic = time.time()
-            if self.use_dali:
-                self.train_dataloader.reset()
-            metric_msg = ", ".join([
-                "{}: {:.5f}".format(key, output_info[key].avg)
-                for key in output_info
-            ])
-            logger.info("[Train][Epoch {}/{}][Avg]{}".format(
-                epoch_id, self.config["Global"]["epochs"], metric_msg))
-            output_info.clear()
-
-            # eval model and save model if possible
-            if self.config["Global"][
-                    "eval_during_train"] and epoch_id % self.config["Global"][
-                        "eval_interval"] == 0:
-                acc = self.eval(epoch_id)
-                if acc > best_metric["metric"]:
-                    best_metric["metric"] = acc
-                    best_metric["epoch"] = epoch_id
-                    save_load.save_model(
-                        self.model,
-                        optimizer,
-                        best_metric,
-                        self.output_dir,
-                        model_name=self.config["Arch"]["name"],
-                        prefix="best_model")
-                logger.info("[Eval][Epoch {}][best metric: {}]".format(
-                    epoch_id, best_metric["metric"]))
-                logger.scaler(
-                    name="eval_acc",
-                    value=acc,
-                    step=epoch_id,
-                    writer=self.vdl_writer)
-
-                self.model.train()
-
-            # save model
-            if epoch_id % save_interval == 0:
-                save_load.save_model(
-                    self.model,
-                    optimizer, {"metric": acc,
-                                "epoch": epoch_id},
-                    self.output_dir,
-                    model_name=self.config["Arch"]["name"],
-                    prefix="epoch_{}".format(epoch_id))
-                # save the latest model
-                save_load.save_model(
-                    self.model,
-                    optimizer, {"metric": acc,
-                                "epoch": epoch_id},
-                    self.output_dir,
-                    model_name=self.config["Arch"]["name"],
-                    prefix="latest")
-
-        if self.vdl_writer is not None:
-            self.vdl_writer.close()
-
-    def build_avg_metrics(self, info_dict):
-        return {key: AverageMeter(key, '7.5f') for key in info_dict}
-
-    @paddle.no_grad()
-    def eval(self, epoch_id=0):
-        self.model.eval()
-        if self.eval_loss_func is None:
-            loss_config = self.config.get("Loss", None)
-            if loss_config is not None:
-                loss_config = loss_config.get("Eval")
-                if loss_config is not None:
-                    self.eval_loss_func = build_loss(loss_config)
-        if self.eval_mode == "classification":
-            if self.eval_dataloader is None:
-                self.eval_dataloader = build_dataloader(
-                    self.config["DataLoader"], "Eval", self.device,
-                    self.use_dali)
-
-            if self.eval_metric_func is None:
-                metric_config = self.config.get("Metric")
-                if metric_config is not None:
-                    metric_config = metric_config.get("Eval")
-                    if metric_config is not None:
-                        self.eval_metric_func = build_metrics(metric_config)
-
-            eval_result = self.eval_cls(epoch_id)
-
-        elif self.eval_mode == "retrieval":
-            if self.gallery_dataloader is None:
-                self.gallery_dataloader = build_dataloader(
-                    self.config["DataLoader"]["Eval"], "Gallery", self.device,
-                    self.use_dali)
-
-            if self.query_dataloader is None:
-                self.query_dataloader = build_dataloader(
-                    self.config["DataLoader"]["Eval"], "Query", self.device,
-                    self.use_dali)
-            # build metric info
-            if self.eval_metric_func is None:
-                metric_config = self.config.get("Metric", None)
-                if metric_config is None:
-                    metric_config = [{"name": "Recallk", "topk": (1, 5)}]
-                else:
-                    metric_config = metric_config["Eval"]
-                self.eval_metric_func = build_metrics(metric_config)
-            eval_result = self.eval_retrieval(epoch_id)
-        else:
-            logger.warning("Invalid eval mode: {}".format(self.eval_mode))
-            eval_result = None
-        self.model.train()
-        return eval_result
-
-    def forward(self, batch):
-        if not self.is_rec:
-            out = self.model(batch[0])
-        else:
-            out = self.model(batch[0], batch[1])
-        return out
-
-    @paddle.no_grad()
-    def eval_cls(self, epoch_id=0):
-        output_info = dict()
-        time_info = {
-            "batch_cost": AverageMeter(
-                "batch_cost", '.5f', postfix=" s,"),
-            "reader_cost": AverageMeter(
-                "reader_cost", ".5f", postfix=" s,"),
-        }
-        print_batch_step = self.config["Global"]["print_batch_step"]
-
-        metric_key = None
-        tic = time.time()
-        eval_dataloader = self.eval_dataloader if self.use_dali else self.eval_dataloader(
-        )
-        max_iter = len(self.eval_dataloader) - 1 if platform.system(
-        ) == "Windows" else len(self.eval_dataloader)
-        for iter_id, batch in enumerate(eval_dataloader):
-            if iter_id >= max_iter:
-                break
-            if iter_id == 5:
-                for key in time_info:
-                    time_info[key].reset()
-            if self.use_dali:
-                batch = [
-                    paddle.to_tensor(batch[0]['data']),
-                    paddle.to_tensor(batch[0]['label'])
-                ]
-            time_info["reader_cost"].update(time.time() - tic)
-            batch_size = batch[0].shape[0]
-            batch[0] = paddle.to_tensor(batch[0]).astype("float32")
-            batch[1] = batch[1].reshape([-1, 1]).astype("int64")
-            # image input
-            out = self.forward(batch)
-            # calc loss
-            if self.eval_loss_func is not None:
-                loss_dict = self.eval_loss_func(out, batch[-1])
-                for key in loss_dict:
-                    if not key in output_info:
-                        output_info[key] = AverageMeter(key, '7.5f')
-                    output_info[key].update(loss_dict[key].numpy()[0],
-                                            batch_size)
-            # calc metric
-            if self.eval_metric_func is not None:
-                metric_dict = self.eval_metric_func(out, batch[-1])
-                if paddle.distributed.get_world_size() > 1:
-                    for key in metric_dict:
-                        paddle.distributed.all_reduce(
-                            metric_dict[key],
-                            op=paddle.distributed.ReduceOp.SUM)
-                        metric_dict[key] = metric_dict[
-                            key] / paddle.distributed.get_world_size()
-                for key in metric_dict:
-                    if metric_key is None:
-                        metric_key = key
-                    if not key in output_info:
-                        output_info[key] = AverageMeter(key, '7.5f')
-
-                    output_info[key].update(metric_dict[key].numpy()[0],
-                                            batch_size)
-
-            time_info["batch_cost"].update(time.time() - tic)
-
-            if iter_id % print_batch_step == 0:
-                time_msg = "s, ".join([
-                    "{}: {:.5f}".format(key, time_info[key].avg)
-                    for key in time_info
-                ])
-
-                ips_msg = "ips: {:.5f} images/sec".format(
-                    batch_size / time_info["batch_cost"].avg)
-
-                metric_msg = ", ".join([
-                    "{}: {:.5f}".format(key, output_info[key].val)
-                    for key in output_info
-                ])
-                logger.info("[Eval][Epoch {}][Iter: {}/{}]{}, {}, {}".format(
-                    epoch_id, iter_id,
-                    len(self.eval_dataloader), metric_msg, time_msg, ips_msg))
-
-            tic = time.time()
-        if self.use_dali:
-            self.eval_dataloader.reset()
-        metric_msg = ", ".join([
-            "{}: {:.5f}".format(key, output_info[key].avg)
-            for key in output_info
-        ])
-        logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg))
-
-        # do not try to save best model
-        if self.eval_metric_func is None:
-            return -1
-        # return 1st metric in the dict
-        return output_info[metric_key].avg
-
-    def eval_retrieval(self, epoch_id=0):
-        self.model.eval()
-        # step1. build gallery
-        gallery_feas, gallery_img_id, gallery_unique_id = self._cal_feature(
-            name='gallery')
-        query_feas, query_img_id, query_query_id = self._cal_feature(
-            name='query')
-
-        # step2. do evaluation
-        sim_block_size = self.config["Global"].get("sim_block_size", 64)
-        sections = [sim_block_size] * (len(query_feas) // sim_block_size)
-        if len(query_feas) % sim_block_size:
-            sections.append(len(query_feas) % sim_block_size)
-        fea_blocks = paddle.split(query_feas, num_or_sections=sections)
-        if query_query_id is not None:
-            query_id_blocks = paddle.split(
-                query_query_id, num_or_sections=sections)
-        image_id_blocks = paddle.split(query_img_id, num_or_sections=sections)
-        metric_key = None
-
-        if self.eval_metric_func is None:
-            metric_dict = {metric_key: 0.}
-        else:
-            metric_dict = dict()
-            for block_idx, block_fea in enumerate(fea_blocks):
-                similarity_matrix = paddle.matmul(
-                    block_fea, gallery_feas, transpose_y=True)
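-                # when unique ids are available, build a mask that drops
-                # gallery entries identical to the query sample itself (same
-                # unique id and same image id), so a query cannot match itself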
-                if query_query_id is not None:
-                    query_id_block = query_id_blocks[block_idx]
-                    query_id_mask = (query_id_block != gallery_unique_id.t())
-
-                    image_id_block = image_id_blocks[block_idx]
-                    image_id_mask = (image_id_block != gallery_img_id.t())
-
-                    keep_mask = paddle.logical_or(query_id_mask, image_id_mask)
-                    similarity_matrix = similarity_matrix * keep_mask.astype(
-                        "float32")
-                else:
-                    keep_mask = None
-
-                metric_tmp = self.eval_metric_func(similarity_matrix,
-                                                   image_id_blocks[block_idx],
-                                                   gallery_img_id, keep_mask)
-
-                for key in metric_tmp:
-                    if key not in metric_dict:
-                        metric_dict[key] = metric_tmp[key] * block_fea.shape[
-                            0] / len(query_feas)
-                    else:
-                        metric_dict[key] += metric_tmp[key] * block_fea.shape[
-                            0] / len(query_feas)
-
-        metric_info_list = []
-        for key in metric_dict:
-            if metric_key is None:
-                metric_key = key
-            metric_info_list.append("{}: {:.5f}".format(key, metric_dict[key]))
-        metric_msg = ", ".join(metric_info_list)
-        logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg))
-
-        return metric_dict[metric_key]
-
-    def _cal_feature(self, name='gallery'):
-        all_feas = None
-        all_image_id = None
-        all_unique_id = None
-        if name == 'gallery':
-            dataloader = self.gallery_dataloader
-        elif name == 'query':
-            dataloader = self.query_dataloader
-        else:
-            raise RuntimeError("Only support gallery or query dataset")
-
-        has_unique_id = False
-        max_iter = len(dataloader) - 1 if platform.system(
-        ) == "Windows" else len(dataloader)
-        dataloader_tmp = dataloader if self.use_dali else dataloader()
-        for idx, batch in enumerate(
-                dataloader_tmp):  # data loading is very time-consuming
-            if idx >= max_iter:
-                break
-            if idx % self.config["Global"]["print_batch_step"] == 0:
-                logger.info(
-                    f"{name} feature calculation process: [{idx}/{len(dataloader)}]"
-                )
-            if self.use_dali:
-                batch = [
-                    paddle.to_tensor(batch[0]['data']),
-                    paddle.to_tensor(batch[0]['label'])
-                ]
-            batch = [paddle.to_tensor(x) for x in batch]
-            batch[1] = batch[1].reshape([-1, 1]).astype("int64")
-            if len(batch) == 3:
-                has_unique_id = True
-                batch[2] = batch[2].reshape([-1, 1]).astype("int64")
-            out = self.model(batch[0], batch[1])
-            batch_feas = out["features"]
-
-            # do norm
-            if self.config["Global"].get("feature_normalize", True):
-                feas_norm = paddle.sqrt(
-                    paddle.sum(paddle.square(batch_feas), axis=1,
-                               keepdim=True))
-                batch_feas = paddle.divide(batch_feas, feas_norm)
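-                # with L2-normalized features, the matmul in eval_retrieval()
-                # yields cosine similarities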
-
-            if all_feas is None:
-                all_feas = batch_feas
-                if has_unique_id:
-                    all_unique_id = batch[2]
-                all_image_id = batch[1]
-            else:
-                all_feas = paddle.concat([all_feas, batch_feas])
-                all_image_id = paddle.concat([all_image_id, batch[1]])
-                if has_unique_id:
-                    all_unique_id = paddle.concat([all_unique_id, batch[2]])
-        if self.use_dali:
-            dataloader_tmp.reset()
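-        # under distributed evaluation, gather features and ids from all
-        # ranks so each rank holds the complete gallery/query set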
-        if paddle.distributed.get_world_size() > 1:
-            feat_list = []
-            img_id_list = []
-            unique_id_list = []
-            paddle.distributed.all_gather(feat_list, all_feas)
-            paddle.distributed.all_gather(img_id_list, all_image_id)
-            all_feas = paddle.concat(feat_list, axis=0)
-            all_image_id = paddle.concat(img_id_list, axis=0)
-            if has_unique_id:
-                paddle.distributed.all_gather(unique_id_list, all_unique_id)
-                all_unique_id = paddle.concat(unique_id_list, axis=0)
-
-        logger.info("Build {} done, all feat shape: {}, begin to eval..".
-                    format(name, all_feas.shape))
-        return all_feas, all_image_id, all_unique_id
-
-    @paddle.no_grad()
-    def infer(self, ):
-        total_trainer = paddle.distributed.get_world_size()
-        local_rank = paddle.distributed.get_rank()
-        image_list = get_image_list(self.config["Infer"]["infer_imgs"])
-        # data split
-        image_list = image_list[local_rank::total_trainer]
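-        # round-robin sharding: rank k takes images k, k+N, k+2N, ...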
-
-        preprocess_func = create_operators(self.config["Infer"]["transforms"])
-        postprocess_func = build_postprocess(self.config["Infer"][
-            "PostProcess"])
-
-        batch_size = self.config["Infer"]["batch_size"]
-
-        self.model.eval()
-
-        batch_data = []
-        image_file_list = []
-        for idx, image_file in enumerate(image_list):
-            with open(image_file, 'rb') as f:
-                x = f.read()
-            for process in preprocess_func:
-                x = process(x)
-            batch_data.append(x)
-            image_file_list.append(image_file)
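-            # run the model once a full batch is collected, or on the last image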
-            if len(batch_data) >= batch_size or idx == len(image_list) - 1:
-                batch_tensor = paddle.to_tensor(batch_data)
-                out = self.model(batch_tensor)
-                if isinstance(out, list):
-                    out = out[0]
-                result = postprocess_func(out, image_file_list)
-                print(result)
-                batch_data.clear()
-                image_file_list.clear()
diff --git a/ppcls/static/program.py b/ppcls/static/program.py
index 71f630f7b7913988101510c1766e82da36a29932..e6022bbde4529b353db6102e5ac93f798a1cd196 100644
--- a/ppcls/static/program.py
+++ b/ppcls/static/program.py
@@ -38,7 +38,7 @@ from ppcls.optimizer import build_optimizer
 from ppcls.optimizer import build_lr_scheduler
 
 from ppcls.utils.misc import AverageMeter
-from ppcls.utils import logger
+from ppcls.utils import logger, profiler
 
 
 def create_feeds(image_shape, use_mix=None, dtype="float32"):
@@ -326,7 +326,8 @@ def run(dataloader,
         mode='train',
         config=None,
         vdl_writer=None,
-        lr_scheduler=None):
+        lr_scheduler=None,
+        profiler_options=None):
     """
     Feed data to the model and fetch the measures and loss
 
@@ -382,6 +383,8 @@ def run(dataloader,
 
         metric_dict['reader_time'].update(time.time() - tic)
 
+        # no-op unless profiler_options is provided; the profiler is switched
+        # on and off around the configured batch range
+        profiler.add_profiler_step(profiler_options)
+
         if use_dali:
             batch_size = batch[0]["data"].shape()[0]
             feed_dict = batch[0]
diff --git a/ppcls/static/train.py b/ppcls/static/train.py
index d894ce8ca2eb4853db30441c4304964751e91e71..a3aa0b591ce2db7d1066f1fada521e3a91cfd239 100644
--- a/ppcls/static/train.py
+++ b/ppcls/static/train.py
@@ -43,6 +43,13 @@ def parse_args():
         type=str,
         default='configs/ResNet/ResNet50.yaml',
         help='config file path')
+    parser.add_argument(
+        '-p',
+        '--profiler_options',
+        type=str,
+        default=None,
+        help='Profiler options, in the format \"key1=value1;key2=value2;key3=value3\".'
+    )
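+    # The string is parsed by ppcls.utils.profiler; typical keys include
+    # batch_range, state, tracer_option and profile_path, e.g.
+    # -p "batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile"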
     parser.add_argument(
         '-o',
         '--override',
@@ -166,7 +173,7 @@ def main(args):
         # 1. train with train dataset
         program.run(train_dataloader, exe, compiled_train_prog, train_feeds,
                     train_fetchs, epoch_id, 'train', config, vdl_writer,
-                    lr_scheduler)
+                    lr_scheduler, args.profiler_options)
         # 2. evaluate with eval dataset
         if global_config["eval_during_train"] and epoch_id % global_config[
                 "eval_interval"] == 0:
diff --git a/ppcls/utils/static/dali.py b/ppcls/utils/static/dali.py
deleted file mode 100644
index eacb3fc9a895fad1d9dd010b05056646a884b48b..0000000000000000000000000000000000000000
--- a/ppcls/utils/static/dali.py
+++ /dev/null
@@ -1,361 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-
-import os
-
-import numpy as np
-from nvidia.dali.pipeline import Pipeline
-import nvidia.dali.ops as ops
-import nvidia.dali.types as types
-from nvidia.dali.plugin.paddle import DALIGenericIterator
-
-import paddle
-from paddle import fluid
-
-
-class HybridTrainPipe(Pipeline):
-    def __init__(self,
-                 file_root,
-                 file_list,
-                 batch_size,
-                 resize_shorter,
-                 crop,
-                 min_area,
-                 lower,
-                 upper,
-                 interp,
-                 mean,
-                 std,
-                 device_id,
-                 shard_id=0,
-                 num_shards=1,
-                 random_shuffle=True,
-                 num_threads=4,
-                 seed=42,
-                 pad_output=False,
-                 output_dtype=types.FLOAT):
-        super(HybridTrainPipe, self).__init__(
-            batch_size, num_threads, device_id, seed=seed)
-        self.input = ops.FileReader(
-            file_root=file_root,
-            file_list=file_list,
-            shard_id=shard_id,
-            num_shards=num_shards,
-            random_shuffle=random_shuffle)
-        # set internal nvJPEG buffers size to handle full-sized ImageNet images
-        # without additional reallocations
-        device_memory_padding = 211025920
-        host_memory_padding = 140544512
-        self.decode = ops.ImageDecoderRandomCrop(
-            device='mixed',
-            output_type=types.RGB,
-            device_memory_padding=device_memory_padding,
-            host_memory_padding=host_memory_padding,
-            random_aspect_ratio=[lower, upper],
-            random_area=[min_area, 1.0],
-            num_attempts=100)
-        self.res = ops.Resize(
-            device='gpu', resize_x=crop, resize_y=crop, interp_type=interp)
-        self.cmnp = ops.CropMirrorNormalize(
-            device="gpu",
-            output_dtype=output_dtype,
-            output_layout=types.NCHW,
-            crop=(crop, crop),
-            image_type=types.RGB,
-            mean=mean,
-            std=std,
-            pad_output=pad_output)
-        self.coin = ops.CoinFlip(probability=0.5)
-        self.to_int64 = ops.Cast(dtype=types.INT64, device="gpu")
-
-    def define_graph(self):
-        rng = self.coin()
-        jpegs, labels = self.input(name="Reader")
-        images = self.decode(jpegs)
-        images = self.res(images)
-        output = self.cmnp(images.gpu(), mirror=rng)
-        return [output, self.to_int64(labels.gpu())]
-
-    def __len__(self):
-        return self.epoch_size("Reader")
-
-
-class HybridValPipe(Pipeline):
-    def __init__(self,
-                 file_root,
-                 file_list,
-                 batch_size,
-                 resize_shorter,
-                 crop,
-                 interp,
-                 mean,
-                 std,
-                 device_id,
-                 shard_id=0,
-                 num_shards=1,
-                 random_shuffle=False,
-                 num_threads=4,
-                 seed=42,
-                 pad_output=False,
-                 output_dtype=types.FLOAT):
-        super(HybridValPipe, self).__init__(
-            batch_size, num_threads, device_id, seed=seed)
-        self.input = ops.FileReader(
-            file_root=file_root,
-            file_list=file_list,
-            shard_id=shard_id,
-            num_shards=num_shards,
-            random_shuffle=random_shuffle)
-        self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
-        self.res = ops.Resize(
-            device="gpu", resize_shorter=resize_shorter, interp_type=interp)
-        self.cmnp = ops.CropMirrorNormalize(
-            device="gpu",
-            output_dtype=output_dtype,
-            output_layout=types.NCHW,
-            crop=(crop, crop),
-            image_type=types.RGB,
-            mean=mean,
-            std=std,
-            pad_output=pad_output)
-        self.to_int64 = ops.Cast(dtype=types.INT64, device="gpu")
-
-    def define_graph(self):
-        jpegs, labels = self.input(name="Reader")
-        images = self.decode(jpegs)
-        images = self.res(images)
-        output = self.cmnp(images)
-        return [output, self.to_int64(labels.gpu())]
-
-    def __len__(self):
-        return self.epoch_size("Reader")
-
-
-def build(config, mode='train'):
-    env = os.environ
-    assert config.get('use_gpu',
-                      True) == True, "gpu training is required for DALI"
-    assert not config.get(
-        'use_aa'), "auto augment is not supported by DALI reader"
-    assert float(env.get('FLAGS_fraction_of_gpu_memory_to_use', 0.92)) < 0.9, \
-        "Please leave enough GPU memory for DALI workspace, e.g., by setting" \
-        " `export FLAGS_fraction_of_gpu_memory_to_use=0.8`"
-
-    dataset_config = config[mode.upper()]
-
-    # fall back to the local device count unless the distributed launcher has
-    # set both PADDLE_TRAINERS_NUM and PADDLE_TRAINER_ID
-    gpu_num = paddle.fluid.core.get_cuda_device_count() if (
-        'PADDLE_TRAINERS_NUM' not in env or 'PADDLE_TRAINER_ID' not in env
-    ) else int(env.get('PADDLE_TRAINERS_NUM', 0))
-
-    batch_size = dataset_config.batch_size
-    assert batch_size % gpu_num == 0, \
-        "batch size must be multiple of number of devices"
-    batch_size = batch_size // gpu_num
-
-    file_root = dataset_config.data_dir
-    file_list = dataset_config.file_list
-
-    interp = 1  # settings.interpolation or 1  # default to linear
-    interp_map = {
-        0: types.INTERP_NN,  # cv2.INTER_NEAREST
-        1: types.INTERP_LINEAR,  # cv2.INTER_LINEAR
-        2: types.INTERP_CUBIC,  # cv2.INTER_CUBIC
-        4: types.INTERP_LANCZOS3,  # XXX use LANCZOS3 for cv2.INTER_LANCZOS4
-    }
-
-    output_dtype = (types.FLOAT16 if 'AMP' in config and
-                    config.AMP.get("use_pure_fp16", False)
-                    else types.FLOAT)
-
-    assert interp in interp_map, "interpolation method not supported by DALI"
-    interp = interp_map[interp]
-    pad_output = False
-    image_shape = config.get("image_shape", None)
-    if image_shape and image_shape[0] == 4:
-        pad_output = True
-
-    transforms = {
-        k: v
-        for d in dataset_config["transforms"] for k, v in d.items()
-    }
-
-    scale = transforms["NormalizeImage"].get("scale", 1.0 / 255)
-    if isinstance(scale, str):
-        scale = eval(scale)
-    mean = transforms["NormalizeImage"].get("mean", [0.485, 0.456, 0.406])
-    std = transforms["NormalizeImage"].get("std", [0.229, 0.224, 0.225])
-    mean = [v / scale for v in mean]
-    std = [v / scale for v in std]
-
-    if mode == "train":
-        resize_shorter = 256
-        crop = transforms["RandCropImage"]["size"]
-        scale = transforms["RandCropImage"].get("scale", [0.08, 1.])
-        ratio = transforms["RandCropImage"].get("ratio", [3.0 / 4, 4.0 / 3])
-        min_area = scale[0]
-        lower = ratio[0]
-        upper = ratio[1]
-
-        if 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env:
-            shard_id = int(env['PADDLE_TRAINER_ID'])
-            num_shards = int(env['PADDLE_TRAINERS_NUM'])
-            device_id = int(env['FLAGS_selected_gpus'])
-            pipe = HybridTrainPipe(
-                file_root,
-                file_list,
-                batch_size,
-                resize_shorter,
-                crop,
-                min_area,
-                lower,
-                upper,
-                interp,
-                mean,
-                std,
-                device_id,
-                shard_id,
-                num_shards,
-                seed=42 + shard_id,
-                pad_output=pad_output,
-                output_dtype=output_dtype)
-            pipe.build()
-            pipelines = [pipe]
-            sample_per_shard = len(pipe) // num_shards
-        else:
-            pipelines = []
-            places = fluid.framework.cuda_places()
-            num_shards = len(places)
-            for idx, p in enumerate(places):
-                place = fluid.core.Place()
-                place.set_place(p)
-                device_id = place.gpu_device_id()
-                pipe = HybridTrainPipe(
-                    file_root,
-                    file_list,
-                    batch_size,
-                    resize_shorter,
-                    crop,
-                    min_area,
-                    lower,
-                    upper,
-                    interp,
-                    mean,
-                    std,
-                    device_id,
-                    idx,
-                    num_shards,
-                    seed=42 + idx,
-                    pad_output=pad_output,
-                    output_dtype=output_dtype)
-                pipe.build()
-                pipelines.append(pipe)
-            sample_per_shard = len(pipelines[0])
-        return DALIGenericIterator(
-            pipelines, ['feed_image', 'feed_label'], size=sample_per_shard)
-    else:
-        resize_shorter = transforms["ResizeImage"].get("resize_short", 256)
-        crop = transforms["CropImage"]["size"]
-
-        p = fluid.framework.cuda_places()[0]
-        place = fluid.core.Place()
-        place.set_place(p)
-        device_id = place.gpu_device_id()
-        pipe = HybridValPipe(
-            file_root,
-            file_list,
-            batch_size,
-            resize_shorter,
-            crop,
-            interp,
-            mean,
-            std,
-            device_id=device_id,
-            pad_output=pad_output,
-            output_dtype=output_dtype)
-        pipe.build()
-        return DALIGenericIterator(
-            pipe, ['feed_image', 'feed_label'],
-            size=len(pipe),
-            dynamic_shape=True,
-            fill_last_batch=True,
-            last_batch_padded=True)
-
-
-def train(config):
-    return build(config, 'train')
-
-
-def val(config):
-    return build(config, 'valid')
-
-
-def _to_Tensor(lod_tensor, dtype):
-    data_tensor = fluid.layers.create_tensor(dtype=dtype)
-    data = np.array(lod_tensor).astype(dtype)
-    fluid.layers.assign(data, data_tensor)
-    return data_tensor
-
-
-def normalize(feeds, config):
-    image, label = feeds['image'], feeds['label']
-    img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
-    img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
-    image = fluid.layers.cast(image, 'float32')
-    costant = fluid.layers.fill_constant(
-        shape=[1], value=255.0, dtype='float32')
-    image = fluid.layers.elementwise_div(image, costant)
-
-    mean = fluid.layers.create_tensor(dtype="float32")
-    fluid.layers.assign(input=img_mean.astype("float32"), output=mean)
-    std = fluid.layers.create_tensor(dtype="float32")
-    fluid.layers.assign(input=img_std.astype("float32"), output=std)
-
-    image = fluid.layers.elementwise_sub(image, mean)
-    image = fluid.layers.elementwise_div(image, std)
-
-    image.stop_gradient = True
-    feeds['image'] = image
-
-    return feeds
-
-
-def mix(feeds, config, is_train=True):
-    env = os.environ
-    gpu_num = paddle.fluid.core.get_cuda_device_count() if (
-        'PADDLE_TRAINERS_NUM' not in env or 'PADDLE_TRAINER_ID' not in env
-    ) else int(env.get('PADDLE_TRAINERS_NUM', 0))
-
-    batch_size = config.TRAIN.batch_size // gpu_num
-
-    images = feeds['image']
-    label = feeds['label']
-    # TODO: hard code here, should be fixed!
-    alpha = 0.2
-    idx = _to_Tensor(np.random.permutation(batch_size), 'int32')
-    lam = np.random.beta(alpha, alpha)
-
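-    # mixup: blend each image with a partner drawn by random permutation,
-    # weighted by lam ~ Beta(alpha, alpha); both labels plus lam are fed to
-    # the mix loss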
-    images = lam * images + (1 - lam) * paddle.fluid.layers.gather(images, idx)
-
-    feed = {
-        'image': images,
-        'feed_y_a': label,
-        'feed_y_b': paddle.fluid.layers.gather(label, idx),
-        'feed_lam': _to_Tensor([lam] * batch_size, 'float32')
-    }
-
-    return feed if is_train else feeds
diff --git a/ppcls/utils/static/program.py b/ppcls/utils/static/program.py
deleted file mode 100644
index f50d7b5d00fab2081e06143939a624cea48b1f46..0000000000000000000000000000000000000000
--- a/ppcls/utils/static/program.py
+++ /dev/null
@@ -1,606 +0,0 @@
-# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-import numpy as np
-
-from collections import OrderedDict
-from ppcls.optimizer import OptimizerBuilder
-
-import paddle
-import paddle.nn.functional as F
-
-from ppcls.optimizer.learning_rate import LearningRateBuilder
-from ppcls.arch import backbone
-from ppcls.arch.loss import CELoss
-from ppcls.arch.loss import MixCELoss
-from ppcls.arch.loss import JSDivLoss
-from ppcls.arch.loss import GoogLeNetLoss
-from ppcls.utils.misc import AverageMeter
-from ppcls.utils import logger, profiler
-
-from paddle.distributed import fleet
-from paddle.distributed.fleet import DistributedStrategy
-
-
-def create_feeds(image_shape, use_mix=None, use_dali=None, dtype="float32"):
-    """
-    Create feeds as model input
-
-    Args:
-        image_shape(list[int]): model input shape, such as [3, 224, 224]
-        use_mix(bool): whether to use mix(include mixup, cutmix, fmix)
-
-    Returns:
-        feeds(dict): dict of model input variables
-    """
-    feeds = OrderedDict()
-    feeds['image'] = paddle.static.data(
-        name="feed_image", shape=[None] + image_shape, dtype=dtype)
-    if use_mix and not use_dali:
-        feeds['feed_y_a'] = paddle.static.data(
-            name="feed_y_a", shape=[None, 1], dtype="int64")
-        feeds['feed_y_b'] = paddle.static.data(
-            name="feed_y_b", shape=[None, 1], dtype="int64")
-        feeds['feed_lam'] = paddle.static.data(
-            name="feed_lam", shape=[None, 1], dtype=dtype)
-    else:
-        feeds['label'] = paddle.static.data(
-            name="feed_label", shape=[None, 1], dtype="int64")
-
-    return feeds
-
-
-def create_model(architecture, image, classes_num, config, is_train):
-    """
-    Create a model
-
-    Args:
-        architecture(dict): architecture information,
-            name(such as ResNet50) is needed
-        image(variable): model input variable
-        classes_num(int): num of classes
-        config(dict): model config
-
-    Returns:
-        out(variable): model output variable
-    """
-    name = architecture["name"]
-    params = architecture.get("params", {})
-
-    if "data_format" in config:
-        params["data_format"] = config["data_format"]
-        data_format = config["data_format"]
-    input_image_channel = config.get('image_shape', [3, 224, 224])[0]
-    if input_image_channel != 3:
-        logger.warning(
-            "Input image channel is changed to {}, maybe for better speed-up".
-            format(input_image_channel))
-        params["input_image_channel"] = input_image_channel
-    if "is_test" in params:
-        params['is_test'] = not is_train
-    model = backbone.__dict__[name](class_dim=classes_num, **params)
-
-    out = model(image)
-    return out
-
-
-def create_loss(out,
-                feeds,
-                architecture,
-                classes_num=1000,
-                epsilon=None,
-                use_mix=False,
-                use_distillation=False):
-    """
-    Create a loss for optimization, such as:
-        1. CrossEntropy loss
-        2. CrossEntropy loss with label smoothing
-        3. CrossEntropy loss with mix(mixup, cutmix, fmix)
-        4. CrossEntropy loss with label smoothing and (mixup, cutmix, fmix)
-        5. GoogLeNet loss
-
-    Args:
-        out(variable): model output variable
-        feeds(dict): dict of model input variables
-        architecture(dict): architecture information,
-            name(such as ResNet50) is needed
-        classes_num(int): num of classes
-        epsilon(float): parameter for label smoothing, 0.0 <= epsilon <= 1.0
-        use_mix(bool): whether to use mix(include mixup, cutmix, fmix)
-
-    Returns:
-        loss(variable): loss variable
-    """
-    if use_mix:
-        feed_y_a = paddle.reshape(feeds['feed_y_a'], [-1, 1])
-        feed_y_b = paddle.reshape(feeds['feed_y_b'], [-1, 1])
-        feed_lam = paddle.reshape(feeds['feed_lam'], [-1, 1])
-    else:
-        target = paddle.reshape(feeds['label'], [-1, 1])
-
-    if architecture["name"] == "GoogLeNet":
-        assert len(out) == 3, "GoogLeNet should have 3 outputs"
-        loss = GoogLeNetLoss(class_dim=classes_num, epsilon=epsilon)
-        return loss(out[0], out[1], out[2], target)
-
-    if use_distillation:
-        assert len(out) == 2, ("distillation output length must be 2, "
-                               "but got {}".format(len(out)))
-        loss = JSDivLoss(class_dim=classes_num, epsilon=epsilon)
-        return loss(out[1], out[0])
-
-    if use_mix:
-        loss = MixCELoss(class_dim=classes_num, epsilon=epsilon)
-        return loss(out, feed_y_a, feed_y_b, feed_lam)
-    else:
-        loss = CELoss(class_dim=classes_num, epsilon=epsilon)
-        return loss(out, target)
-
-
-def create_metric(out,
-                  feeds,
-                  architecture,
-                  topk=5,
-                  classes_num=1000,
-                  config=None,
-                  use_distillation=False):
-    """
-    Create measures of model accuracy, such as top1 and top5
-
-    Args:
-        out(variable): model output variable
-        feeds(dict): dict of model input variables(included label)
-        topk(int): usually top5
-        classes_num(int): num of classes
-        config(dict) : model config
-
-    Returns:
-        fetchs(dict): dict of measures
-    """
-    label = paddle.reshape(feeds['label'], [-1, 1])
-    if architecture["name"] == "GoogLeNet":
-        assert len(out) == 3, "GoogLeNet should have 3 outputs"
-        out = out[0]
-    else:
-        # just need student label to get metrics
-        if use_distillation:
-            out = out[1]
-    softmax_out = F.softmax(out)
-
-    fetchs = OrderedDict()
-    # set top1 to fetchs
-    top1 = paddle.metric.accuracy(softmax_out, label=label, k=1)
-    fetchs['top1'] = (top1, AverageMeter('top1', '.4f', need_avg=True))
-    # set topk to fetchs
-    k = min(topk, classes_num)
-    topk = paddle.metric.accuracy(softmax_out, label=label, k=k)
-    topk_name = 'top{}'.format(k)
-    fetchs[topk_name] = (topk, AverageMeter(topk_name, '.4f', need_avg=True))
-    return fetchs
-
-
-def create_fetchs(out,
-                  feeds,
-                  architecture,
-                  topk=5,
-                  classes_num=1000,
-                  epsilon=None,
-                  use_mix=False,
-                  config=None,
-                  use_distillation=False):
-    """
-    Create fetchs as model outputs(included loss and measures),
-    will call create_loss and create_metric(if use_mix).
-
-    Args:
-        out(variable): model output variable
-        feeds(dict): dict of model input variables.
-            If use mix_up, it will not include label.
-        architecture(dict): architecture information,
-            name(such as ResNet50) is needed
-        topk(int): usually top5
-        classes_num(int): num of classes
-        epsilon(float): parameter for label smoothing, 0.0 <= epsilon <= 1.0
-        use_mix(bool): whether to use mix(include mixup, cutmix, fmix)
-        config(dict): model config
-
-    Returns:
-        fetchs(dict): dict of model outputs(included loss and measures)
-    """
-    fetchs = OrderedDict()
-    loss = create_loss(out, feeds, architecture, classes_num, epsilon, use_mix,
-                       use_distillation)
-    fetchs['loss'] = (loss, AverageMeter('loss', '7.4f', need_avg=True))
-    if not use_mix:
-        metric = create_metric(out, feeds, architecture, topk, classes_num,
-                               config, use_distillation)
-        fetchs.update(metric)
-
-    return fetchs
-
-
-def create_optimizer(config):
-    """
-    Create an optimizer using config, usually including
-    learning rate and regularization.
-
-    Args:
-        config(dict):  such as
-        {
-            'LEARNING_RATE':
-                {'function': 'Cosine',
-                 'params': {'lr': 0.1}
-                },
-            'OPTIMIZER':
-                {'function': 'Momentum',
-                 'params':{'momentum': 0.9},
-                 'regularizer':
-                    {'function': 'L2', 'factor': 0.0001}
-                }
-        }
-
-    Returns:
-        an optimizer instance
-    """
-    # create learning_rate instance
-    lr_config = config['LEARNING_RATE']
-    lr_config['params'].update({
-        'epochs': config['epochs'],
-        'step_each_epoch':
-        config['total_images'] // config['TRAIN']['batch_size'],
-    })
-    lr = LearningRateBuilder(**lr_config)()
-
-    # create optimizer instance
-    opt_config = config['OPTIMIZER']
-    opt = OptimizerBuilder(**opt_config)
-    return opt(lr), lr
-
-
-def create_strategy(config):
-    """
-    Create build strategy and exec strategy.
-
-    Args:
-        config(dict): config
-
-    Returns:
-        build_strategy: build strategy
-        exec_strategy: exec strategy
-    """
-    build_strategy = paddle.static.BuildStrategy()
-    exec_strategy = paddle.static.ExecutionStrategy()
-
-    exec_strategy.num_threads = 1
-    exec_strategy.num_iteration_per_drop_scope = (
-        10000
-        if 'AMP' in config and config.AMP.get("use_pure_fp16", False) else 10)
-
-    fuse_op = 'AMP' in config
-
-    fuse_bn_act_ops = config.get('fuse_bn_act_ops', fuse_op)
-    fuse_elewise_add_act_ops = config.get('fuse_elewise_add_act_ops', fuse_op)
-    fuse_bn_add_act_ops = config.get('fuse_bn_add_act_ops', fuse_op)
-    enable_addto = config.get('enable_addto', fuse_op)
-
-    try:
-        build_strategy.fuse_bn_act_ops = fuse_bn_act_ops
-    except Exception as e:
-        logger.info(
-            "PaddlePaddle version 1.7.0 or higher is "
-            "required when you want to fuse batch_norm and activation_op.")
-
-    try:
-        build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
-    except Exception as e:
-        logger.info(
-            "PaddlePaddle version 1.7.0 or higher is "
-            "required when you want to fuse elewise_add_act and activation_op.")
-
-    try:
-        build_strategy.fuse_bn_add_act_ops = fuse_bn_add_act_ops
-    except Exception as e:
-        logger.info(
-            "PaddlePaddle 2.0-rc or higher is "
-            "required when you want to enable fuse_bn_add_act_ops strategy.")
-
-    try:
-        build_strategy.enable_addto = enable_addto
-    except Exception as e:
-        logger.info("PaddlePaddle 2.0-rc or higher is "
-                    "required when you want to enable addto strategy.")
-    return build_strategy, exec_strategy
-
-
-def dist_optimizer(config, optimizer):
-    """
-    Create a distributed optimizer based on a normal optimizer
-
-    Args:
-        config(dict): config
-        optimizer(Optimizer): a normal optimizer
-
-    Returns:
-        optimizer: a distributed optimizer
-    """
-    build_strategy, exec_strategy = create_strategy(config)
-
-    dist_strategy = DistributedStrategy()
-    dist_strategy.execution_strategy = exec_strategy
-    dist_strategy.build_strategy = build_strategy
-
-    dist_strategy.nccl_comm_num = 1
-    dist_strategy.fuse_all_reduce_ops = True
-    dist_strategy.fuse_grad_size_in_MB = 16
-    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
-
-    return optimizer
-
-
-def mixed_precision_optimizer(config, optimizer):
-    if 'AMP' in config:
-        amp_cfg = config.AMP if config.AMP else dict()
-        scale_loss = amp_cfg.get('scale_loss', 1.0)
-        use_dynamic_loss_scaling = amp_cfg.get('use_dynamic_loss_scaling',
-                                               False)
-        use_pure_fp16 = amp_cfg.get('use_pure_fp16', False)
-        optimizer = paddle.static.amp.decorate(
-            optimizer,
-            init_loss_scaling=scale_loss,
-            use_dynamic_loss_scaling=use_dynamic_loss_scaling,
-            use_pure_fp16=use_pure_fp16,
-            use_fp16_guard=True)
-
-    return optimizer
-
-
-def build(config, main_prog, startup_prog, is_train=True, is_distributed=True):
-    """
-    Build a program using a model and an optimizer
-        1. create feeds
-        2. create a dataloader
-        3. create a model
-        4. create fetchs
-        5. create an optimizer
-
-    Args:
-        config(dict): config
-        main_prog(): main program
-        startup_prog(): startup program
-        is_train(bool): train or valid
-        is_distributed(bool): whether to use distributed training method
-
-    Returns:
-        dataloader(): a bridge between the model and the data
-        fetchs(dict): dict of model outputs(included loss and measures)
-    """
-    with paddle.static.program_guard(main_prog, startup_prog):
-        with paddle.utils.unique_name.guard():
-            use_mix = config.get('use_mix') and is_train
-            use_dali = config.get('use_dali', False)
-            use_distillation = config.get('use_distillation')
-
-            feeds = create_feeds(
-                config.image_shape,
-                use_mix=use_mix,
-                use_dali=use_dali,
-                dtype="float32")
-            if use_dali and use_mix:
-                import dali
-                feeds = dali.mix(feeds, config, is_train)
-            out = create_model(config.ARCHITECTURE, feeds['image'],
-                               config.classes_num, config, is_train)
-            fetchs = create_fetchs(
-                out,
-                feeds,
-                config.ARCHITECTURE,
-                config.topk,
-                config.classes_num,
-                epsilon=config.get('ls_epsilon'),
-                use_mix=use_mix,
-                config=config,
-                use_distillation=use_distillation)
-            lr_scheduler = None
-            optimizer = None
-            if is_train:
-                optimizer, lr_scheduler = create_optimizer(config)
-                optimizer = mixed_precision_optimizer(config, optimizer)
-                if is_distributed:
-                    optimizer = dist_optimizer(config, optimizer)
-                optimizer.minimize(fetchs['loss'][0])
-    return fetchs, lr_scheduler, feeds, optimizer
-
-
-def compile(config, program, loss_name=None, share_prog=None):
-    """
-    Compile the program
-
-    Args:
-        config(dict): config
-        program(paddle.static.Program): the program to be compiled
-        loss_name(str): loss name
-        share_prog(): the shared program, used for evaluation during training
-
-    Returns:
-        compiled_program(): a compiled program
-    """
-    build_strategy, exec_strategy = create_strategy(config)
-
-    compiled_program = paddle.static.CompiledProgram(
-        program).with_data_parallel(
-            share_vars_from=share_prog,
-            loss_name=loss_name,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy)
-
-    return compiled_program
-
-
-total_step = 0
-
-
-def run(dataloader,
-        exe,
-        program,
-        feeds,
-        fetchs,
-        epoch=0,
-        mode='train',
-        config=None,
-        vdl_writer=None,
-        lr_scheduler=None,
-        profiler_options=None):
-    """
-    Feed data to the model and fetch the measures and loss
-
-    Args:
-        dataloader(paddle io dataloader): dataloader providing input batches
-        exe(paddle.static.Executor): executor that runs the program
-        program(paddle.static.Program): program (or compiled program) to run
-        fetchs(dict): dict of measures and the loss
-        epoch(int): epoch of training or validation
-        mode(str): 'train' or 'valid', used for logging only
-
-    Returns:
-    """
-    fetch_list = [f[0] for f in fetchs.values()]
-    metric_list = [
-        ("lr", AverageMeter(
-            'lr', 'f', postfix=",", need_avg=False)),
-        ("batch_time", AverageMeter(
-            'batch_cost', '.5f', postfix=" s,")),
-        ("reader_time", AverageMeter(
-            'reader_cost', '.5f', postfix=" s,")),
-    ]
-    topk_name = 'top{}'.format(config.topk)
-    metric_list.insert(0, ("loss", fetchs["loss"][1]))
-    use_mix = config.get("use_mix", False) and mode == "train"
-    if not use_mix:
-        metric_list.insert(0, (topk_name, fetchs[topk_name][1]))
-        metric_list.insert(0, ("top1", fetchs["top1"][1]))
-
-    metric_list = OrderedDict(metric_list)
-
-    for m in metric_list.values():
-        m.reset()
-
-    use_dali = config.get('use_dali', False)
-    dataloader = dataloader if use_dali else dataloader()
-    tic = time.time()
-
-    idx = 0
-    batch_size = None
-    while True:
-        # DALI may raise a RuntimeError for some particular images, such as ImageNet1k/n04418357_26036.JPEG
-        try:
-            batch = next(dataloader)
-        except StopIteration:
-            break
-        except RuntimeError:
-            logger.warning(
-                "Except RuntimeError when reading data from dataloader, try to read once again..."
-            )
-            continue
-        idx += 1
-        # ignore the warmup iters
-        if idx == 5:
-            metric_list["batch_time"].reset()
-            metric_list["reader_time"].reset()
-
-        metric_list['reader_time'].update(time.time() - tic)
-
-        profiler.add_profiler_step(profiler_options)
-
-        if use_dali:
-            batch_size = batch[0]["feed_image"].shape()[0]
-            feed_dict = batch[0]
-        else:
-            batch_size = batch[0].shape()[0]
-            feed_dict = {
-                key.name: batch[idx]
-                for idx, key in enumerate(feeds.values())
-            }
-        metrics = exe.run(program=program,
-                          feed=feed_dict,
-                          fetch_list=fetch_list)
-
-        for name, m in zip(fetchs.keys(), metrics):
-            metric_list[name].update(np.mean(m), batch_size)
-        metric_list["batch_time"].update(time.time() - tic)
-        if mode == "train":
-            metric_list['lr'].update(lr_scheduler.get_lr())
-
-        fetchs_str = ' '.join([
-            str(metric_list[key].mean)
-            if "time" in key else str(metric_list[key].value)
-            for key in metric_list
-        ])
-        ips_info = " ips: {:.5f} images/sec.".format(
-            batch_size / metric_list["batch_time"].avg)
-        fetchs_str += ips_info
-
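-        # schedulers flagged with update_specified only step every
-        # update_step_interval global steps after update_start_step;
-        # all others step once per iteration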
-        if lr_scheduler is not None:
-            if lr_scheduler.update_specified:
-                curr_global_counter = lr_scheduler.step_each_epoch * epoch + idx
-                update = max(
-                    0, curr_global_counter - lr_scheduler.
-                    update_start_step) % lr_scheduler.update_step_interval == 0
-                if update:
-                    lr_scheduler.step()
-            else:
-                lr_scheduler.step()
-
-        if vdl_writer:
-            global total_step
-            logger.scaler('loss', metrics[0][0], total_step, vdl_writer)
-            total_step += 1
-        if mode == 'valid':
-            if idx % config.get('print_interval', 10) == 0:
-                logger.info("{:s} step:{:<4d} {:s}".format(mode, idx,
-                                                           fetchs_str))
-        else:
-            epoch_str = "epoch:{:<3d}".format(epoch)
-            step_str = "{:s} step:{:<4d}".format(mode, idx)
-
-            if idx % config.get('print_interval', 10) == 0:
-                logger.info("{:s} {:s} {:s}".format(
-                    logger.coloring(epoch_str, "HEADER")
-                    if idx == 0 else epoch_str,
-                    logger.coloring(step_str, "PURPLE"),
-                    logger.coloring(fetchs_str, 'OKGREEN')))
-
-        tic = time.time()
-
-    end_str = ' '.join([str(m.mean) for m in metric_list.values()] +
-                       [metric_list["batch_time"].total])
-    ips_info = "ips: {:.5f} images/sec.".format(
-        batch_size * metric_list["batch_time"].count /
-        metric_list["batch_time"].sum)
-    if mode == 'valid':
-        logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info))
-    else:
-        end_epoch_str = "END epoch:{:<3d}".format(epoch)
-        logger.info("{:s} {:s} {:s} {:s}".format(end_epoch_str, mode, end_str,
-                                                 ips_info))
-    if use_dali:
-        dataloader.reset()
-
-    # return top1_acc in order to save the best model
-    if mode == 'valid':
-        return fetchs["top1"][1].avg
diff --git a/ppcls/utils/static/run_dali.sh b/ppcls/utils/static/run_dali.sh
deleted file mode 100644
index 1ac48b3caef459eaf86f8a528af732d1a14a3630..0000000000000000000000000000000000000000
--- a/ppcls/utils/static/run_dali.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/env bash
-
-export CUDA_VISIBLE_DEVICES="0,1,2,3"
-export FLAGS_fraction_of_gpu_memory_to_use=0.80
-
-python3.7 -m paddle.distributed.launch \
-    --gpus="0,1,2,3" \
-    tools/static/train.py \
-        -c ./configs/ResNet/ResNet50.yaml \
-        -o print_interval=10 \
-        -o use_dali=True
diff --git a/ppcls/utils/static/save_load.py b/ppcls/utils/static/save_load.py
deleted file mode 100644
index 7f20b29228a1fd56ef85f204bc511bb56389c527..0000000000000000000000000000000000000000
--- a/ppcls/utils/static/save_load.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import errno
-import os
-import re
-import shutil
-import tempfile
-
-import paddle
-
-from ppcls.utils import logger
-
-__all__ = ['init_model', 'save_model']
-
-
-def _mkdir_if_not_exist(path):
-    """
-    mkdir if not exists, ignore the exception when multiprocess mkdir together
-    """
-    if not os.path.exists(path):
-        try:
-            os.makedirs(path)
-        except OSError as e:
-            if e.errno == errno.EEXIST and os.path.isdir(path):
-                logger.warning(
-                    '{} already exists, probably created by another process'.
-                    format(path))
-            else:
-                raise OSError('Failed to mkdir {}'.format(path))
-
-
-def _load_state(path):
-    if os.path.exists(path + '.pdopt'):
-        # XXX another hack to ignore the optimizer state
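-        # copying only the .pdparams file into a temp dir hides the .pdopt
-        # file, so load_program_state skips the optimizer state entirely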
-        tmp = tempfile.mkdtemp()
-        dst = os.path.join(tmp, os.path.basename(os.path.normpath(path)))
-        shutil.copy(path + '.pdparams', dst + '.pdparams')
-        state = paddle.static.load_program_state(dst)
-        shutil.rmtree(tmp)
-    else:
-        state = paddle.static.load_program_state(path)
-    return state
-
-
-def load_params(exe, prog, path, ignore_params=None):
-    """
-    Load model from the given path.
-    Args:
-        exe (fluid.Executor): The fluid.Executor object.
-        prog (fluid.Program): load weight to which Program object.
-        path (string): URL string or local model path.
-        ignore_params (list): ignore variable to load when finetuning.
-            It can be specified by finetune_exclude_pretrained_params
-            and the usage can refer to the document
-            docs/advanced_tutorials/TRANSFER_LEARNING.md
-    """
-    if not (os.path.isdir(path) or os.path.exists(path + '.pdparams')):
-        raise ValueError("Model pretrain path {} does not "
-                         "exists.".format(path))
-
-    logger.info(
-        logger.coloring('Loading parameters from {}...'.format(path),
-                        'HEADER'))
-
-    ignore_set = set()
-    state = _load_state(path)
-
-    # ignore the parameter which mismatch the shape
-    # between the model and pretrain weight.
-    all_var_shape = {}
-    for block in prog.blocks:
-        for param in block.all_parameters():
-            all_var_shape[param.name] = param.shape
-    ignore_set.update([
-        name for name, shape in all_var_shape.items()
-        if name in state and shape != state[name].shape
-    ])
-
-    if ignore_params:
-        all_var_names = [var.name for var in prog.list_vars()]
-        ignore_list = filter(
-            lambda var: any([re.match(name, var) for name in ignore_params]),
-            all_var_names)
-        ignore_set.update(list(ignore_list))
-
-    if len(ignore_set) > 0:
-        for k in ignore_set:
-            if k in state:
-                logger.warning(
-                    'variable {} is already excluded automatically'.format(k))
-                del state[k]
-
-    paddle.static.set_program_state(prog, state)
-
-
-def init_model(config, program, exe):
-    """
-    load model from checkpoint or pretrained_model
-    """
-    checkpoints = config.get('checkpoints')
-    if checkpoints:
-        paddle.static.load(program, checkpoints, exe)
-        logger.info(
-            logger.coloring("Finish initing model from {}".format(checkpoints),
-                            "HEADER"))
-        return
-
-    pretrained_model = config.get('pretrained_model')
-    if pretrained_model:
-        if not isinstance(pretrained_model, list):
-            pretrained_model = [pretrained_model]
-        for pretrain in pretrained_model:
-            load_params(exe, program, pretrain)
-        logger.info(
-            logger.coloring("Finish initing model from {}".format(
-                pretrained_model), "HEADER"))
-
-
-def save_model(program, model_path, epoch_id, prefix='ppcls'):
-    """
-    save model to the target path
-    """
-    model_path = os.path.join(model_path, str(epoch_id))
-    _mkdir_if_not_exist(model_path)
-    model_prefix = os.path.join(model_path, prefix)
-    paddle.static.save(program, model_prefix)
-    logger.info(
-        logger.coloring("Already save model in {}".format(model_path),
-                        "HEADER"))
diff --git a/ppcls/utils/static/train.py b/ppcls/utils/static/train.py
deleted file mode 100644
index 40c7856307bffdaa30ea253902b90abf3fa87bc6..0000000000000000000000000000000000000000
--- a/ppcls/utils/static/train.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import os
-import sys
-__dir__ = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(__dir__)
-sys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))
-
-from sys import version_info
-
-import paddle
-from paddle.distributed import fleet
-
-from ppcls.data import Reader
-from ppcls.utils.config import get_config
-from ppcls.utils import logger
-from tools.static import program
-from save_load import init_model, save_model
-
-
-def parse_args():
-    parser = argparse.ArgumentParser("PaddleClas train script")
-    parser.add_argument(
-        '-c',
-        '--config',
-        type=str,
-        default='configs/ResNet/ResNet50.yaml',
-        help='config file path')
-    parser.add_argument(
-        '--vdl_dir',
-        type=str,
-        default=None,
-        help='VisualDL logging directory for image.')
-    parser.add_argument(
-        '-p',
-        '--profiler_options',
-        type=str,
-        default=None,
-        help='Profiler options, in the format \"key1=value1;key2=value2;key3=value3\".'
-    )
-    parser.add_argument(
-        '-o',
-        '--override',
-        action='append',
-        default=[],
-        help='config options to be overridden')
-    args = parser.parse_args()
-    return args
-
-
-def main(args):
-    config = get_config(args.config, overrides=args.override, show=True)
-    if config.get("is_distributed", True):
-        fleet.init(is_collective=True)
-    # assign the place
-    use_gpu = config.get("use_gpu", True)
-    # amp related config
-    if 'AMP' in config:
-        AMP_RELATED_FLAGS_SETTING = {
-            'FLAGS_cudnn_exhaustive_search': 1,
-            'FLAGS_conv_workspace_size_limit': 1500,
-            'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
-            'FLAGS_max_inplace_grad_add': 8,
-        }
-        os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1'
-        paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
-    use_xpu = config.get("use_xpu", False)
-    assert not (
-        use_gpu and use_xpu
-    ), "gpu and xpu cannot both be enabled at the same time in static mode!"
-
-    if use_gpu:
-        place = paddle.set_device('gpu')
-    elif use_xpu:
-        place = paddle.set_device('xpu')
-    else:
-        place = paddle.set_device('cpu')
-
-    # startup_prog is used to do some parameter init work,
-    # and train prog is used to hold the network
-    startup_prog = paddle.static.Program()
-    train_prog = paddle.static.Program()
-
-    best_top1_acc = 0.0  # best top1 acc record
-
-    train_fetchs, lr_scheduler, train_feeds, optimizer = program.build(
-        config,
-        train_prog,
-        startup_prog,
-        is_train=True,
-        is_distributed=config.get("is_distributed", True))
-
-    if config.validate:
-        valid_prog = paddle.static.Program()
-        valid_fetchs, _, valid_feeds, _ = program.build(
-            config,
-            valid_prog,
-            startup_prog,
-            is_train=False,
-            is_distributed=config.get("is_distributed", True))
-        # clone to prune some content which is irrelevant in valid_prog
-        valid_prog = valid_prog.clone(for_test=True)
-
-    # create the "Executor" with the statement of which place
-    exe = paddle.static.Executor(place)
-    # Parameter initialization
-    exe.run(startup_prog)
-    # load pretrained models or checkpoints
-    init_model(config, train_prog, exe)
-
-    if 'AMP' in config and config.AMP.get("use_pure_fp16", False):
-        optimizer.amp_init(
-            place,
-            scope=paddle.static.global_scope(),
-            test_program=valid_prog if config.validate else None)
-
-    if not config.get("is_distributed", True):
-        compiled_train_prog = program.compile(
-            config, train_prog, loss_name=train_fetchs["loss"][0].name)
-    else:
-        compiled_train_prog = train_prog
-
-    if not config.get('use_dali', False):
-        train_dataloader = Reader(config, 'train', places=place)()
-        if config.validate and paddle.distributed.get_rank() == 0:
-            valid_dataloader = Reader(config, 'valid', places=place)()
-            compiled_valid_prog = program.compile(config, valid_prog)
-    else:
-        assert use_gpu is True, "DALI only supports GPU, please set use_gpu to True!"
-        import dali
-        train_dataloader = dali.train(config)
-        if config.validate and paddle.distributed.get_rank() == 0:
-            valid_dataloader = dali.val(config)
-            compiled_valid_prog = program.compile(config, valid_prog)
-
-    vdl_writer = None
-    if args.vdl_dir:
-        if version_info.major == 2:
-            logger.info(
-                "visualdl is just supported for python3, so it is disabled in python2..."
-            )
-        else:
-            from visualdl import LogWriter
-            vdl_writer = LogWriter(args.vdl_dir)
-
-    for epoch_id in range(config.epochs):
-        # 1. train with train dataset
-        program.run(train_dataloader, exe, compiled_train_prog, train_feeds,
-                    train_fetchs, epoch_id, 'train', config, vdl_writer,
-                    lr_scheduler, args.profiler_options)
-        if paddle.distributed.get_rank() == 0:
-            # 2. validate with validate dataset
-            if config.validate and epoch_id % config.valid_interval == 0:
-                top1_acc = program.run(valid_dataloader, exe,
-                                       compiled_valid_prog, valid_feeds,
-                                       valid_fetchs, epoch_id, 'valid', config)
-                if top1_acc > best_top1_acc:
-                    best_top1_acc = top1_acc
-                    message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
-                        best_top1_acc, epoch_id)
-                    logger.info("{:s}".format(logger.coloring(message, "RED")))
-                    if epoch_id % config.save_interval == 0:
-
-                        model_path = os.path.join(config.model_save_dir,
-                                                  config.ARCHITECTURE["name"])
-                        save_model(train_prog, model_path, "best_model")
-
-            # 3. save the persistable model
-            if epoch_id % config.save_interval == 0:
-                model_path = os.path.join(config.model_save_dir,
-                                          config.ARCHITECTURE["name"])
-                save_model(train_prog, model_path, epoch_id)
-
-
-if __name__ == '__main__':
-    paddle.enable_static()
-    args = parse_args()
-    main(args)
diff --git a/tests/DarkNet53.txt b/tests/DarkNet53.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e5a9adb862eaae2b2abc3f9aaddee6b4c7a161ba
--- /dev/null
+++ b/tests/DarkNet53.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:DarkNet53
+python:python3.7
+gpu_list:0|0,1
+-o Global.device:gpu
+-o Global.auto_cast:null
+-o Global.epochs:lite_train_infer=2|whole_train_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.batch_size:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/ImageNet/DarkNet/DarkNet53.yaml
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params=========================== 
+eval:tools/eval.py -c ppcls/configs/ImageNet/DarkNet/DarkNet53.yaml
+null:null
+##
+===========================infer_params==========================
+-o Global.save_inference_dir:./inference
+-o Global.pretrained_model:
+norm_export:tools/export_model.py -c ppcls/configs/ImageNet/DarkNet/DarkNet53.yaml
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/whole_chain/DarkNet53_inference.tar
+infer_model:../inference/
+infer_export:null
+infer_quant:False
+inference:python/predict_cls.py -c configs/inference_cls.yaml
+-o Global.use_gpu:True|False
+-o Global.enable_mkldnn:True|False
+-o Global.cpu_num_threads:1|6
+-o Global.batch_size:1
+-o Global.use_tensorrt:True|False
+-o Global.use_fp16:True|False
+-o Global.inference_model_dir:../inference
+-o Global.infer_imgs:../dataset/ILSVRC2012/val
+-o Global.save_log_path:null
+-o Global.benchmark:True
+null:null
diff --git a/tests/HRNet_W18_C.txt b/tests/HRNet_W18_C.txt
new file mode 100644
index 0000000000000000000000000000000000000000..08c712accc70dc3ea70030b006c2f44c12488a3f
--- /dev/null
+++ b/tests/HRNet_W18_C.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:HRNet_W18_C
+python:python3.7
+gpu_list:0|0,1
+-o Global.device:gpu
+-o Global.auto_cast:null
+-o Global.epochs:lite_train_infer=2|whole_train_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.batch_size:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/ImageNet/HRNet/HRNet_W18_C.yaml
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params=========================== 
+eval:tools/eval.py -c ppcls/configs/ImageNet/HRNet/HRNet_W18_C.yaml
+null:null
+##
+===========================infer_params==========================
+-o Global.save_inference_dir:./inference
+-o Global.pretrained_model:
+norm_export:tools/export_model.py -c ppcls/configs/ImageNet/HRNet/HRNet_W18_C.yaml
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/whole_chain/HRNet_W18_C_inference.tar
+infer_model:../inference/
+infer_export:null
+infer_quant:False
+inference:python/predict_cls.py -c configs/inference_cls.yaml
+-o Global.use_gpu:True|False
+-o Global.enable_mkldnn:True|False
+-o Global.cpu_num_threads:1|6
+-o Global.batch_size:1
+-o Global.use_tensorrt:True|False
+-o Global.use_fp16:True|False
+-o Global.inference_model_dir:../inference
+-o Global.infer_imgs:../dataset/ILSVRC2012/val
+-o Global.save_log_path:null
+-o Global.benchmark:True
+null:null
diff --git a/tests/LeViT_128S.txt b/tests/LeViT_128S.txt
new file mode 100644
index 0000000000000000000000000000000000000000..337d8af7701eae6104a834dfe64995a97ac96439
--- /dev/null
+++ b/tests/LeViT_128S.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:LeViT_128S
+python:python3.7
+gpu_list:0|0,1
+-o Global.device:gpu
+-o Global.auto_cast:null
+-o Global.epochs:lite_train_infer=2|whole_train_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.batch_size:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/ImageNet/LeViT/LeViT_128S.yaml
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params=========================== 
+eval:tools/eval.py -c ppcls/configs/ImageNet/LeViT/LeViT_128S.yaml
+null:null
+##
+===========================infer_params==========================
+-o Global.save_inference_dir:./inference
+-o Global.pretrained_model:
+norm_export:tools/export_model.py -c ppcls/configs/ImageNet/LeViT/LeViT_128S.yaml
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/whole_chain/LeViT_128S_inference.tar
+infer_model:../inference/
+infer_export:null
+infer_quant:False
+inference:python/predict_cls.py -c configs/inference_cls.yaml
+-o Global.use_gpu:True|False
+-o Global.enable_mkldnn:True|False
+-o Global.cpu_num_threads:1|6
+-o Global.batch_size:1
+-o Global.use_tensorrt:True|False
+-o Global.use_fp16:True|False
+-o Global.inference_model_dir:../inference
+-o Global.infer_imgs:../dataset/ILSVRC2012/val
+-o Global.save_log_path:null
+-o Global.benchmark:True
+null:null
diff --git a/tests/MobileNetV1.txt b/tests/MobileNetV1.txt
new file mode 100644
index 0000000000000000000000000000000000000000..784d6f30832f26831a1f0d60adf6c43d71f3f181
--- /dev/null
+++ b/tests/MobileNetV1.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:MobileNetV1
+python:python3.7
+gpu_list:0|0,1
+-o Global.device:gpu
+-o Global.auto_cast:null
+-o Global.epochs:lite_train_infer=2|whole_train_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.batch_size:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params=========================== 
+eval:tools/eval.py -c ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml
+null:null
+##
+===========================infer_params==========================
+-o Global.save_inference_dir:./inference
+-o Global.pretrained_model:
+norm_export:tools/export_model.py -c ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/whole_chain/MobileNetV1_inference.tar
+infer_model:../inference/
+infer_export:null
+infer_quant:False
+inference:python/predict_cls.py -c configs/inference_cls.yaml
+-o Global.use_gpu:True|False
+-o Global.enable_mkldnn:True|False
+-o Global.cpu_num_threads:1|6
+-o Global.batch_size:1
+-o Global.use_tensorrt:True|False
+-o Global.use_fp16:True|False
+-o Global.inference_model_dir:../inference
+-o Global.infer_imgs:../dataset/ILSVRC2012/val
+-o Global.save_log_path:null
+-o Global.benchmark:True
+null:null
diff --git a/tests/MobileNetV2.txt b/tests/MobileNetV2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c622100fea92753ff4a7239e1b56a0ea4ee97352
--- /dev/null
+++ b/tests/MobileNetV2.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:MobileNetV2
+python:python3.7
+gpu_list:0|0,1
+-o Global.device:gpu
+-o Global.auto_cast:null
+-o Global.epochs:lite_train_infer=2|whole_train_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.batch_size:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params=========================== 
+eval:tools/eval.py -c ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml
+null:null
+##
+===========================infer_params==========================
+-o Global.save_inference_dir:./inference
+-o Global.pretrained_model:
+norm_export:tools/export_model.py -c ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/whole_chain/MobileNetV2_inference.tar
+infer_model:../inference/
+infer_export:null
+infer_quant:False
+inference:python/predict_cls.py -c configs/inference_cls.yaml
+-o Global.use_gpu:True|False
+-o Global.enable_mkldnn:True|False
+-o Global.cpu_num_threads:1|6
+-o Global.batch_size:1
+-o Global.use_tensorrt:True|False
+-o Global.use_fp16:True|False
+-o Global.inference_model_dir:../inference
+-o Global.infer_imgs:../dataset/ILSVRC2012/val
+-o Global.save_log_path:null
+-o Global.benchmark:True
+null:null
diff --git a/tests/MobileNetV3_large_x1_0.txt b/tests/MobileNetV3_large_x1_0.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2bc4ec43fce5706375648f13d6773d07b1524309
--- /dev/null
+++ b/tests/MobileNetV3_large_x1_0.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:MobileNetV3_large_x1_0
+python:python3.7
+gpu_list:0|0,1
+-o Global.device:gpu
+-o Global.auto_cast:null
+-o Global.epochs:lite_train_infer=2|whole_train_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.batch_size:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml
+pact_train:deploy/slim/slim.py -c ppcls/configs/slim/MobileNetV3_large_x1_0_quantization.yaml
+fpgm_train:deploy/slim/slim.py -c ppcls/configs/slim/MobileNetV3_large_x1_0_prune.yaml
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params=========================== 
+eval:tools/eval.py -c ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml
+null:null
+##
+===========================infer_params==========================
+-o Global.save_inference_dir:./inference
+-o Global.pretrained_model:
+norm_export:tools/export_model.py -c ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml
+quant_export:deploy/slim/slim.py -m export -c ppcls/configs/slim/MobileNetV3_large_x1_0_quantization.yaml
+fpgm_export:deploy/slim/slim.py -m export -c ppcls/configs/slim/MobileNetV3_large_x1_0_prune.yaml
+distill_export:null
+export1:null
+export2:null
+inference_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/whole_chain/MobileNetV3_large_x1_0_inference.tar
+infer_model:../inference/
+infer_export:null
+infer_quant:False
+inference:python/predict_cls.py -c configs/inference_cls.yaml
+-o Global.use_gpu:True|False
+-o Global.enable_mkldnn:True|False
+-o Global.cpu_num_threads:1|6
+-o Global.batch_size:1
+-o Global.use_tensorrt:True|False
+-o Global.use_fp16:True|False
+-o Global.inference_model_dir:../inference
+-o Global.infer_imgs:../dataset/ILSVRC2012/val
+-o Global.save_log_path:null
+-o Global.benchmark:True
+null:null
diff --git a/tests/ResNeXt101_vd_64x4d.txt b/tests/ResNeXt101_vd_64x4d.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5a4a088d7367f522a9837394922a49d918d07830
--- /dev/null
+++ b/tests/ResNeXt101_vd_64x4d.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:ResNeXt101_vd_64x4d
+python:python3.7
+gpu_list:0|0,1
+-o Global.device:gpu
+-o Global.auto_cast:null
+-o Global.epochs:lite_train_infer=2|whole_train_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.batch_size:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_64x4d.yaml
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params=========================== 
+eval:tools/eval.py -c ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_64x4d.yaml
+null:null
+##
+===========================infer_params==========================
+-o Global.save_inference_dir:./inference
+-o Global.pretrained_model:
+norm_export:tools/export_model.py -c ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_64x4d.yaml
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/whole_chain/ResNeXt101_64x4d_inference.tar
+infer_model:../inference/
+infer_export:null
+infer_quant:False
+inference:python/predict_cls.py -c configs/inference_cls.yaml
+-o Global.use_gpu:True|False
+-o Global.enable_mkldnn:True|False
+-o Global.cpu_num_threads:1|6
+-o Global.batch_size:1
+-o Global.use_tensorrt:True|False
+-o Global.use_fp16:True|False
+-o Global.inference_model_dir:../inference
+-o Global.infer_imgs:../dataset/ILSVRC2012/val
+-o Global.save_log_path:null
+-o Global.benchmark:True
+null:null
diff --git a/tests/ResNet50_vd.txt b/tests/ResNet50_vd.txt
new file mode 100644
index 0000000000000000000000000000000000000000..da02c8894b0c13981742bf698f95450a2b3e3082
--- /dev/null
+++ b/tests/ResNet50_vd.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:ResNet50_vd
+python:python3.7
+gpu_list:0|0,1
+-o Global.device:gpu
+-o Global.auto_cast:null
+-o Global.epochs:lite_train_infer=2|whole_train_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.batch_size:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml
+pact_train:deploy/slim/slim.py -c ppcls/configs/slim/ResNet50_vd_quantization.yaml
+fpgm_train:deploy/slim/slim.py -c ppcls/configs/slim/ResNet50_vd_prune.yaml
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params=========================== 
+eval:tools/eval.py -c ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml
+null:null
+##
+===========================infer_params==========================
+-o Global.save_inference_dir:./inference
+-o Global.pretrained_model:
+norm_export:tools/export_model.py -c ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml
+quant_export:deploy/slim/slim.py -m export -c ppcls/configs/slim/ResNet50_vd_quantization.yaml
+fpgm_export:deploy/slim/slim.py -m export -c ppcls/configs/slim/ResNet50_vd_prune.yaml
+distill_export:null
+export1:null
+export2:null
+inference_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/whole_chain/ResNet50_vd_inference.tar
+infer_model:../inference/
+infer_export:null
+infer_quant:False
+inference:python/predict_cls.py -c configs/inference_cls.yaml
+-o Global.use_gpu:True|False
+-o Global.enable_mkldnn:True|False
+-o Global.cpu_num_threads:1|6
+-o Global.batch_size:1
+-o Global.use_tensorrt:True|False
+-o Global.use_fp16:True|False
+-o Global.inference_model_dir:../inference
+-o Global.infer_imgs:../dataset/ILSVRC2012/val
+-o Global.save_log_path:null
+-o Global.benchmark:True
+null:null
diff --git a/tests/ShuffleNetV2_x1_0.txt b/tests/ShuffleNetV2_x1_0.txt
new file mode 100644
index 0000000000000000000000000000000000000000..08964a2f0fda5c2eb7c56690604980a1eac73d75
--- /dev/null
+++ b/tests/ShuffleNetV2_x1_0.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:ShuffleNetV2_x1_0
+python:python3.7
+gpu_list:0|0,1
+-o Global.device:gpu
+-o Global.auto_cast:null
+-o Global.epochs:lite_train_infer=2|whole_train_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.batch_size:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params=========================== 
+eval:tools/eval.py -c ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml
+null:null
+##
+===========================infer_params==========================
+-o Global.save_inference_dir:./inference
+-o Global.pretrained_model:
+norm_export:tools/export_model.py -c ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/whole_chain/ShuffleNetV2_x1_0_inference.tar
+infer_model:../inference/
+infer_export:null
+infer_quant:False
+inference:python/predict_cls.py -c configs/inference_cls.yaml
+-o Global.use_gpu:True|False
+-o Global.enable_mkldnn:True|False
+-o Global.cpu_num_threads:1|6
+-o Global.batch_size:1
+-o Global.use_tensorrt:True|False
+-o Global.use_fp16:True|False
+-o Global.inference_model_dir:../inference
+-o Global.infer_imgs:../dataset/ILSVRC2012/val
+-o Global.save_log_path:null
+-o Global.benchmark:True
+null:null
diff --git a/tests/SwinTransformer_tiny_patch4_window7_224.txt b/tests/SwinTransformer_tiny_patch4_window7_224.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a358d191a04e3e471da5a7f127da37e1f7162c0a
--- /dev/null
+++ b/tests/SwinTransformer_tiny_patch4_window7_224.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:SwinTransformer_tiny_patch4_window7_224
+python:python3.7
+gpu_list:0|0,1
+-o Global.device:gpu
+-o Global.auto_cast:null
+-o Global.epochs:lite_train_infer=2|whole_train_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.batch_size:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params=========================== 
+eval:tools/eval.py -c ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml
+null:null
+##
+===========================infer_params==========================
+-o Global.save_inference_dir:./inference
+-o Global.pretrained_model:
+norm_export:tools/export_model.py -c ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/whole_chain/SwinTransformer_tiny_patch4_window7_224_inference.tar
+infer_model:../inference/
+infer_export:null
+infer_quant:False
+inference:python/predict_cls.py -c configs/inference_cls.yaml
+-o Global.use_gpu:True|False
+-o Global.enable_mkldnn:True|False
+-o Global.cpu_num_threads:1|6
+-o Global.batch_size:1
+-o Global.use_tensorrt:True|False
+-o Global.use_fp16:True|False
+-o Global.inference_model_dir:../inference
+-o Global.infer_imgs:../dataset/ILSVRC2012/val
+-o Global.save_log_path:null
+-o Global.benchmark:True
+null:null
diff --git a/tests/prepare.sh b/tests/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..55e1f2c7f0898779cfba6b91f2a0f0a789931170
--- /dev/null
+++ b/tests/prepare.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+FILENAME=$1
+# MODE must be one of ['lite_train_infer', 'whole_infer', 'whole_train_infer', 'infer']
+MODE=$2
+
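+# Usage sketch (arguments inferred from this script, shown here for illustration):
+#   bash tests/prepare.sh tests/ResNet50_vd.txt lite_train_infer
+# downloads the matching dataset subset and links it to dataset/ILSVRC2012.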
+dataline=$(cat ${FILENAME})
+# parser params
+IFS=$'\n'
+lines=(${dataline})
+function func_parser_value(){
+    strs=$1
+    IFS=":"
+    array=(${strs})
+    if [ ${#array[*]} = 2 ]; then
+        echo ${array[1]}
+    else
+        IFS="|"
+        tmp="${array[1]}:${array[2]}"
+        echo ${tmp}
+    fi
+}
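+# Behavior sketch (example values assumed): a plain "key:value" line echoes the
+# value, while a line whose value itself contains a colon (e.g. a URL) is split
+# into three fields and the last two are rejoined around ":":
+#   func_parser_value "model_name:ResNet50_vd"        # -> ResNet50_vd
+#   func_parser_value "url:https://host/model.tar"    # -> https://host/model.tar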
+model_name=$(func_parser_value "${lines[1]}")
+inference_model_url=$(func_parser_value "${lines[35]}")
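+# Note: bash arrays are 0-indexed, so lines[1] is the model_name entry and
+# lines[35] is the inference_model_url entry of the per-model config file.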
+
+if [ ${MODE} = "lite_train_infer" ] || [ ${MODE} = "whole_infer" ];then
+    # download lite train data
+    cd dataset
+    rm -rf ILSVRC2012
+    wget -nc https://paddle-imagenet-models-name.bj.bcebos.com/data/whole_chain/whole_chain_little_train.tar
+    tar xf whole_chain_little_train.tar
+    ln -s whole_chain_little_train ILSVRC2012
+    cd ILSVRC2012 
+    mv train.txt train_list.txt
+    mv val.txt val_list.txt
+    cd ../../
+elif [ ${MODE} = "infer" ];then
+    # download data
+    cd dataset
+    rm -rf ILSVRC2012
+    wget -nc https://paddle-imagenet-models-name.bj.bcebos.com/data/whole_chain/whole_chain_infer.tar
+    tar xf whole_chain_infer.tar
+    ln -s whole_chain_infer ILSVRC2012
+    cd ILSVRC2012 
+    mv val.txt val_list.txt
+    cd ../../
+    # download inference model
+    eval "wget -nc $inference_model_url"
+    tar xf "${model_name}_inference.tar"
+
+elif [ ${MODE} = "whole_train_infer" ];then
+    cd dataset
+    rm -rf ILSVRC2012
+    wget -nc https://paddle-imagenet-models-name.bj.bcebos.com/data/whole_chain/whole_chain_CIFAR100.tar
+    tar xf whole_chain_CIFAR100.tar
+    ln -s whole_chain_CIFAR100 ILSVRC2012
+    cd ILSVRC2012 
+    mv train.txt train_list.txt
+    mv val.txt val_list.txt
+    cd ../../
+fi
diff --git a/tests/test.sh b/tests/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c717f77de3496018459655d46437eabccc142834
--- /dev/null
+++ b/tests/test.sh
@@ -0,0 +1,363 @@
+#!/bin/bash
+FILENAME=$1
+# MODE must be one of ['lite_train_infer', 'whole_infer', 'whole_train_infer', 'infer']
+MODE=$2
+
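+# Usage sketch (arguments inferred from this script; the GPU id is only read in
+# "infer" mode further below):
+#   bash tests/test.sh tests/MobileNetV3_large_x1_0.txt lite_train_infer
+#   bash tests/test.sh tests/MobileNetV3_large_x1_0.txt infer 0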
+dataline=$(cat ${FILENAME})
+
+# parser params
+IFS=$'\n'
+lines=(${dataline})
+
+function func_parser_key(){
+    strs=$1
+    IFS=":"
+    array=(${strs})
+    tmp=${array[0]}
+    echo ${tmp}
+}
+function func_parser_value(){
+    strs=$1
+    IFS=":"
+    array=(${strs})
+    tmp=${array[1]}
+    echo ${tmp}
+}
+function func_set_params(){
+    key=$1
+    value=$2
+    if [ ${key} = "null" ];then
+        echo " "
+    elif [[ ${value} = "null" ]] || [[ ${value} = " " ]] || [ ${#value} -le 0 ];then
+        echo " "
+    else 
+        echo "${key}=${value}"
+    fi
+}
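+# Example (values assumed): func_set_params "-o Global.epochs" "2" echoes
+# "-o Global.epochs=2"; a "null" key or an empty value echoes a bare space so
+# the option is silently dropped from the assembled command line.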
+function func_parser_params(){
+    strs=$1
+    IFS=":"
+    array=(${strs})
+    key=${array[0]}
+    tmp=${array[1]}
+    IFS="|"
+    res=""
+    for _params in ${tmp[*]}; do
+        IFS="="
+        array=(${_params})
+        mode=${array[0]}
+        value=${array[1]}
+        if [[ ${mode} = ${MODE} ]]; then
+            IFS="|"
+            echo $value
+            break
+        fi
+        IFS="|"
+    done
+    echo ${res}
+}
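+# Example (assumes MODE=lite_train_infer): for the config line
+# "-o Global.epochs:lite_train_infer=2|whole_train_infer=120" this picks the
+# "|"-separated entry whose prefix matches ${MODE} and echoes "2".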
+function status_check(){
+    last_status=$1   # the exit code
+    run_command=$2
+    run_log=$3
+    if [ $last_status -eq 0 ]; then
+        echo -e "\033[33m Run successfully with command - ${run_command}!  \033[0m" | tee -a ${run_log}
+    else
+        echo -e "\033[33m Run failed with command - ${run_command}!  \033[0m" | tee -a ${run_log}
+    fi
+}
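+# Typical call (variables as set in the loops below):
+#   status_check $? "${cmd}" "${status_log}"
+# which appends a colored pass/fail line for that command to results.log.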
+
+IFS=$'\n'
+# The training params
+model_name=$(func_parser_value "${lines[1]}")
+python=$(func_parser_value "${lines[2]}")
+gpu_list=$(func_parser_value "${lines[3]}")
+train_use_gpu_key=$(func_parser_key "${lines[4]}")
+train_use_gpu_value=$(func_parser_value "${lines[4]}")
+autocast_list=$(func_parser_value "${lines[5]}")
+autocast_key=$(func_parser_key "${lines[5]}")
+epoch_key=$(func_parser_key "${lines[6]}")
+epoch_num=$(func_parser_params "${lines[6]}")
+save_model_key=$(func_parser_key "${lines[7]}")
+train_batch_key=$(func_parser_key "${lines[8]}")
+train_batch_value=$(func_parser_params "${lines[8]}")
+pretrain_model_key=$(func_parser_key "${lines[9]}")
+pretrain_model_value=$(func_parser_value "${lines[9]}")
+train_model_name=$(func_parser_value "${lines[10]}")
+train_infer_img_dir=$(func_parser_value "${lines[11]}")
+train_param_key1=$(func_parser_key "${lines[12]}")
+train_param_value1=$(func_parser_value "${lines[12]}")
+
+trainer_list=$(func_parser_value "${lines[14]}")
+trainer_norm=$(func_parser_key "${lines[15]}")
+norm_trainer=$(func_parser_value "${lines[15]}")
+pact_key=$(func_parser_key "${lines[16]}")
+pact_trainer=$(func_parser_value "${lines[16]}")
+fpgm_key=$(func_parser_key "${lines[17]}")
+fpgm_trainer=$(func_parser_value "${lines[17]}")
+distill_key=$(func_parser_key "${lines[18]}")
+distill_trainer=$(func_parser_value "${lines[18]}")
+trainer_key1=$(func_parser_key "${lines[19]}")
+trainer_value1=$(func_parser_value "${lines[19]}")
+trainer_key2=$(func_parser_key "${lines[20]}")
+trainer_value2=$(func_parser_value "${lines[20]}")
+
+eval_py=$(func_parser_value "${lines[23]}")
+eval_key1=$(func_parser_key "${lines[24]}")
+eval_value1=$(func_parser_value "${lines[24]}")
+
+save_infer_key=$(func_parser_key "${lines[27]}")
+export_weight=$(func_parser_key "${lines[28]}")
+norm_export=$(func_parser_value "${lines[29]}")
+pact_export=$(func_parser_value "${lines[30]}")
+fpgm_export=$(func_parser_value "${lines[31]}")
+distill_export=$(func_parser_value "${lines[32]}")
+export_key1=$(func_parser_key "${lines[33]}")
+export_value1=$(func_parser_value "${lines[33]}")
+export_key2=$(func_parser_key "${lines[34]}")
+export_value2=$(func_parser_value "${lines[34]}")
+
+# parser inference model 
+infer_model_dir_list=$(func_parser_value "${lines[36]}")
+infer_export_list=$(func_parser_value "${lines[37]}")
+infer_is_quant=$(func_parser_value "${lines[38]}")
+# parser inference 
+inference_py=$(func_parser_value "${lines[39]}")
+use_gpu_key=$(func_parser_key "${lines[40]}")
+use_gpu_list=$(func_parser_value "${lines[40]}")
+use_mkldnn_key=$(func_parser_key "${lines[41]}")
+use_mkldnn_list=$(func_parser_value "${lines[41]}")
+cpu_threads_key=$(func_parser_key "${lines[42]}")
+cpu_threads_list=$(func_parser_value "${lines[42]}")
+batch_size_key=$(func_parser_key "${lines[43]}")
+batch_size_list=$(func_parser_value "${lines[43]}")
+use_trt_key=$(func_parser_key "${lines[44]}")
+use_trt_list=$(func_parser_value "${lines[44]}")
+precision_key=$(func_parser_key "${lines[45]}")
+precision_list=$(func_parser_value "${lines[45]}")
+infer_model_key=$(func_parser_key "${lines[46]}")
+image_dir_key=$(func_parser_key "${lines[47]}")
+infer_img_dir=$(func_parser_value "${lines[47]}")
+save_log_key=$(func_parser_key "${lines[48]}")
+benchmark_key=$(func_parser_key "${lines[49]}")
+benchmark_value=$(func_parser_value "${lines[49]}")
+infer_key1=$(func_parser_key "${lines[50]}")
+infer_value1=$(func_parser_value "${lines[50]}")
+
+LOG_PATH="./tests/output"
+mkdir -p ${LOG_PATH}
+status_log="${LOG_PATH}/results.log"
+
+
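+# func_inference sweeps the deployment matrix parsed above: on CPU it loops over
+# mkldnn on/off x thread counts x batch sizes, on GPU over TensorRT on/off x
+# precisions x batch sizes, writing one log file per combination.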
+function func_inference(){
+    IFS='|'
+    _python=$1
+    _script=$2
+    _model_dir=$3
+    _log_path=$4
+    _img_dir=$5
+    _flag_quant=$6
+    # inference 
+    for use_gpu in ${use_gpu_list[*]}; do
+        if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then
+            for use_mkldnn in ${use_mkldnn_list[*]}; do
+                if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then
+                    continue
+                fi
+                for threads in ${cpu_threads_list[*]}; do
+                    for batch_size in ${batch_size_list[*]}; do
+                        _save_log_path="${_log_path}/infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_batchsize_${batch_size}.log"
+                        set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}")
+                        set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}")
+                        set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}")
+                        set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}")
+                        set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}")
+                        set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}")
+                        command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} > ${_save_log_path} 2>&1 "
+                        eval $command
+                        last_status=${PIPESTATUS[0]}
+                        eval "cat ${_save_log_path}"
+                        status_check $last_status "${command}" "../${status_log}"
+                    done
+                done
+            done
+        elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then
+            for use_trt in ${use_trt_list[*]}; do
+                for precision in ${precision_list[*]}; do
+                    if [ ${precision} = "True" ] && [ ${use_trt} = "False" ]; then
+                        continue
+                    fi
+                    if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then
+                        continue
+                    fi
+                    for batch_size in ${batch_size_list[*]}; do
+                        _save_log_path="${_log_path}/infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log"
+                        set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}")
+                        set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}")
+                        set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}")
+                        set_tensorrt=$(func_set_params "${use_trt_key}" "${use_trt}")
+                        set_precision=$(func_set_params "${precision_key}" "${precision}")
+                        set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}")
+                        command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} > ${_save_log_path} 2>&1 "
+                        eval $command
+                        last_status=${PIPESTATUS[0]}
+                        eval "cat ${_save_log_path}"
+                        status_check $last_status "${command}" "../${status_log}"
+                        
+                    done
+                done
+            done
+        else
+            echo "Does not support hardware other than CPU and GPU Currently!"
+        fi
+    done
+}
+
+if [ ${MODE} = "infer" ]; then
+    GPUID=$3
+    if [ ${#GPUID} -le 0 ];then
+        env=" "
+    else
+        env="export CUDA_VISIBLE_DEVICES=${GPUID}"
+    fi
+    # set CUDA_VISIBLE_DEVICES
+    eval $env
+    export Count=0
+    IFS="|"
+    infer_run_exports=(${infer_export_list})
+    infer_quant_flag=(${infer_is_quant})
+    cd deploy
+    for infer_model in ${infer_model_dir_list[*]}; do
+        # run export
+        if [ ${infer_run_exports[Count]} != "null" ];then
+            set_export_weight=$(func_set_params "${export_weight}" "${infer_model}")
+            set_save_infer_key=$(func_set_params "${save_infer_key}" "${infer_model}")
+            export_cmd="${python} ${norm_export} ${set_export_weight} ${set_save_infer_key}"
+            eval $export_cmd
+            status_export=$?
+            status_check $status_export "${export_cmd}" "../${status_log}"
+        fi
+        #run inference
+        is_quant=${infer_quant_flag[Count]}
+        echo "is_quant: ${is_quant}"
+        func_inference "${python}" "${inference_py}" "${infer_model}" "../${LOG_PATH}" "${infer_img_dir}" ${is_quant}
+        Count=$(($Count + 1))
+    done
+    cd ..
+
+else
+    IFS="|"
+    export Count=0
+    USE_GPU_KEY=(${train_use_gpu_value})
+    for gpu in ${gpu_list[*]}; do
+        use_gpu=${USE_GPU_KEY[Count]}
+        Count=$(($Count + 1))
+        if [ ${gpu} = "-1" ];then
+            env=""
+        elif [ ${#gpu} -le 1 ];then
+            env="export CUDA_VISIBLE_DEVICES=${gpu}"
+            eval ${env}
+        elif [ ${#gpu} -le 15 ];then
+            IFS=","
+            array=(${gpu})
+            env="export CUDA_VISIBLE_DEVICES=${array[0]}"
+            IFS="|"
+        else
+            IFS=";"
+            array=(${gpu})
+            ips=${array[0]}
+            gpu=${array[1]}
+            IFS="|"
+            env=" "
+        fi
+        for autocast in ${autocast_list[*]}; do 
+            for trainer in ${trainer_list[*]}; do 
+                flag_quant=False
+                if [ ${trainer} = ${pact_key} ]; then
+                    run_train=${pact_trainer}
+                    run_export=${pact_export}
+                    flag_quant=True
+                elif [ ${trainer} = "${fpgm_key}" ]; then
+                    run_train=${fpgm_trainer}
+                    run_export=${fpgm_export}
+                elif [ ${trainer} = "${distill_key}" ]; then
+                    run_train=${distill_trainer}
+                    run_export=${distill_export}
+                elif [ ${trainer} = ${trainer_key1} ]; then
+                    run_train=${trainer_value1}
+                    run_export=${export_value1}
+                elif [[ ${trainer} = ${trainer_key2} ]]; then
+                    run_train=${trainer_value2}
+                    run_export=${export_value2}
+                else
+                    run_train=${norm_trainer}
+                    run_export=${norm_export}
+                fi
+
+                if [ ${run_train} = "null" ]; then
+                    continue
+                fi
+                
+                set_autocast=$(func_set_params "${autocast_key}" "${autocast}")
+                set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}")
+                set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}")
+                set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}")
+                set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}")
+                set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${use_gpu}")
+                save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}"
+                
+                # load pretrain from norm training if current trainer is pact or fpgm trainer
+                if [ ${trainer} = ${pact_key} ] || [ ${trainer} = ${fpgm_key} ]; then
+                    set_pretrain="${load_norm_train_model}"
+                fi
+
+                set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
+                if [ ${#gpu} -le 2 ];then  # train with cpu or single gpu
+                    cmd="${python} ${run_train} ${set_use_gpu}  ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} "
+                elif [ ${#gpu} -le 15 ];then  # train with multi-gpu
+                    cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1}"
+                else     # train with multi-machine
+                    cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}"
+                fi
+                # run train
+                eval "unset CUDA_VISIBLE_DEVICES"
+                eval $cmd
+                status_check $? "${cmd}" "${status_log}"
+
+                set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${model_name}/${train_model_name}")
+                # save norm trained models to set pretrain for pact training and fpgm training 
+                if [ ${trainer} = ${trainer_norm} ]; then
+                    load_norm_train_model=${set_eval_pretrain}
+                fi
+                # run eval 
+                if [ ${eval_py} != "null" ]; then
+                    set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}")
+                    eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1}" 
+                    eval $eval_cmd
+                    status_check $? "${eval_cmd}" "${status_log}"
+                fi
+                # run export model
+                if [ ${run_export} != "null" ]; then 
+                    # run export model
+                    save_infer_path="${save_log}"
+                    set_export_weight=$(func_set_params "${export_weight}" "${save_log}/${model_name}/${train_model_name}")
+                    set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_path}")
+                    export_cmd="${python} ${run_export} ${set_export_weight} ${set_save_infer_key}"
+                    eval $export_cmd
+                    status_check $? "${export_cmd}" "${status_log}"
+
+                    #run inference
+                    eval $env
+                    save_infer_path="${save_log}"
+                    cd deploy
+                    func_inference "${python}" "${inference_py}" "../${save_infer_path}" "../${LOG_PATH}" "${infer_img_dir}" "${flag_quant}"
+                    cd ..
+                fi
+                eval "unset CUDA_VISIBLE_DEVICES"
+            done  # done with:    for trainer in ${trainer_list[*]}; do 
+        done      # done with:    for autocast in ${autocast_list[*]}; do 
+    done          # done with:    for gpu in ${gpu_list[*]}; do
+fi  # end if [ ${MODE} = "infer" ]; then
diff --git a/tools/eval.py b/tools/eval.py
index b03030c5a4fa605cd28d5d93d303c6f886462260..e086da1b56cae26ed64b044ddf1aef9e40f43a96 100644
--- a/tools/eval.py
+++ b/tools/eval.py
@@ -21,11 +21,11 @@ __dir__ = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
 
 from ppcls.utils import config
-from ppcls.engine.trainer import Trainer
+from ppcls.engine.engine import Engine
 
 if __name__ == "__main__":
     args = config.parse_args()
     config = config.get_config(
         args.config, overrides=args.override, show=False)
-    trainer = Trainer(config, mode="eval")
-    trainer.eval()
+    engine = Engine(config, mode="eval")
+    engine.eval()
diff --git a/tools/export_model.py b/tools/export_model.py
index e019e25746f04f97c0d4b668d87aaab006e5b375..01aba06c1f715f764352c6fd38a23c470e66e289 100644
--- a/tools/export_model.py
+++ b/tools/export_model.py
@@ -24,82 +24,11 @@ import paddle
 import paddle.nn as nn
 
 from ppcls.utils import config
-from ppcls.utils.logger import init_logger
-from ppcls.utils.config import print_config
-from ppcls.arch import build_model, RecModel, DistillationModel
-from ppcls.utils.save_load import load_dygraph_pretrain
-from ppcls.arch.gears.identity_head import IdentityHead
-
-
-class ExportModel(nn.Layer):
-    """
-    ExportModel: add softmax onto the model
-    """
-
-    def __init__(self, config):
-        super().__init__()
-        self.base_model = build_model(config)
-
-        # we should choose a final model to export
-        if isinstance(self.base_model, DistillationModel):
-            self.infer_model_name = config["infer_model_name"]
-        else:
-            self.infer_model_name = None
-
-        self.infer_output_key = config.get("infer_output_key", None)
-        if self.infer_output_key == "features" and isinstance(self.base_model,
-                                                              RecModel):
-            self.base_model.head = IdentityHead()
-        if config.get("infer_add_softmax", True):
-            self.softmax = nn.Softmax(axis=-1)
-        else:
-            self.softmax = None
-
-    def eval(self):
-        self.training = False
-        for layer in self.sublayers():
-            layer.training = False
-            layer.eval()
-
-    def forward(self, x):
-        x = self.base_model(x)
-        if isinstance(x, list):
-            x = x[0]
-        if self.infer_model_name is not None:
-            x = x[self.infer_model_name]
-        if self.infer_output_key is not None:
-            x = x[self.infer_output_key]
-        if self.softmax is not None:
-            x = self.softmax(x)
-        return x
-
+from ppcls.engine.engine import Engine
 
 if __name__ == "__main__":
     args = config.parse_args()
     config = config.get_config(
         args.config, overrides=args.override, show=False)
-    log_file = os.path.join(config['Global']['output_dir'],
-                            config["Arch"]["name"], "export.log")
-    init_logger(name='root', log_file=log_file)
-    print_config(config)
-
-    # set device
-    assert config["Global"]["device"] in ["cpu", "gpu", "xpu"]
-    device = paddle.set_device(config["Global"]["device"])
-    model = ExportModel(config["Arch"])
-    if config["Global"]["pretrained_model"] is not None:
-        load_dygraph_pretrain(model.base_model,
-                              config["Global"]["pretrained_model"])
-
-    model.eval()
-
-    model = paddle.jit.to_static(
-        model,
-        input_spec=[
-            paddle.static.InputSpec(
-                shape=[None] + config["Global"]["image_shape"],
-                dtype='float32')
-        ])
-    paddle.jit.save(model,
-                    os.path.join(config["Global"]["save_inference_dir"],
-                                 "inference"))
+    engine = Engine(config, mode="export")
+    engine.export()
diff --git a/tools/infer.py b/tools/infer.py
index da23a3d88c2f00e56c2868cbffd6d8e2b05202f9..4f6bf9272667f184942cde82549a39387f1ce862 100644
--- a/tools/infer.py
+++ b/tools/infer.py
@@ -21,12 +21,11 @@ __dir__ = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
 
 from ppcls.utils import config
-from ppcls.engine.trainer import Trainer
+from ppcls.engine.engine import Engine
 
 if __name__ == "__main__":
     args = config.parse_args()
     config = config.get_config(
         args.config, overrides=args.override, show=False)
-    trainer = Trainer(config, mode="infer")
-
-    trainer.infer()
+    engine = Engine(config, mode="infer")
+    engine.infer()
diff --git a/tools/train.py b/tools/train.py
index 169678c5a81e651839d62246a99b9f66f27fcdef..1d835903638aacb459f982a7c5f8710241f01be4 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -21,11 +21,11 @@ __dir__ = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
 
 from ppcls.utils import config
-from ppcls.engine.trainer import Trainer
+from ppcls.engine.engine import Engine
 
 if __name__ == "__main__":
     args = config.parse_args()
     config = config.get_config(
         args.config, overrides=args.override, show=False)
-    trainer = Trainer(config, mode="train")
-    trainer.train()
+    engine = Engine(config, mode="train")
+    engine.train()