Commit 5d1ab55a authored by: D dongshuilong

Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleClas into slim

---
name: Issue report
about: PaddleClas issue report
title: ''
labels: ''
assignees: ''
---

Welcome to PaddleClas, and thank you very much for your contribution!

When opening an issue, please provide the following information so that we can locate and resolve the problem quickly and effectively:

1. PaddleClas and PaddlePaddle versions: please provide the version numbers or branch information you are using, e.g. PaddleClas release/2.2 and PaddlePaddle 2.1.0
2. Versions of other products involved: if you are using other products such as PaddleServing or PaddleInference together with PaddleClas, please provide their version numbers
3. Training environment information:
   a. Operating system, e.g. Linux/Windows/macOS
   b. Python version, e.g. Python 3.6/3.7/3.8
   c. CUDA/cuDNN version, e.g. CUDA 10.2/cuDNN 7.6.5
4. Complete code (the changes made relative to the repo), detailed error messages, and related logs
...@@ -8,6 +8,7 @@

**Recent updates**

- 2021.08.11 Updated 7 [FAQ](docs/zh_CN/faq_series/faq_2021_s2.md) entries.
- 2021.06.29 Added the Swin Transformer series models, with Top-1 accuracy on the ImageNet-1k dataset reaching up to 87.2%; training, inference, evaluation, and whl-package deployment are supported, and the pretrained models can be downloaded [here](docs/zh_CN/models/models_intro.md).
- 2021.06.22,23,24 The official PaddleClas R&D team gave a three-day live course with in-depth technical explanations. Course replay: [https://aistudio.baidu.com/aistudio/course/introduce/24519](https://aistudio.baidu.com/aistudio/course/introduce/24519)
- 2021.06.16 PaddleClas v2.2 was released, integrating metric learning, vector search, and other components. Four image recognition applications were added: product recognition, cartoon character recognition, vehicle recognition, and logo recognition. Thirty pretrained models from the LeViT, Twins, TNT, DLA, HarDNet, and RedNet series were added.
...@@ -50,6 +51,10 @@ The Top-1 accuracy of the Res2Net200_vd pretrained model reaches 85.1%.
- [Quick start of image recognition](./docs/zh_CN/tutorials/quick_start_recognition.md)
- [Introduction to the image recognition system](#图像识别系统介绍)
- [Demonstration of recognition results](#识别效果展示)
- Quick start of image classification
  - [New-user edition](./docs/zh_CN/tutorials/quick_start_new_user.md)
  - [Professional edition](./docs/zh_CN/tutorials/quick_start_professional.md)
  - [Community edition](./docs/zh_CN/tutorials/quick_start_community.md)
- Algorithm introduction
  - [Backbone networks and pretrained model zoo](./docs/zh_CN/ImageNet_models_cn.md)
  - [Mainbody detection](./docs/zh_CN/application/mainbody_detection.md)
...@@ -74,11 +79,14 @@ The Top-1 accuracy of the Res2Net200_vd pretrained model reaches 85.1%.
  - [Knowledge distillation](./docs/zh_CN/advanced_tutorials/distillation/distillation.md)
  - [Model quantization](./docs/zh_CN/extension/paddle_quantization.md)
  - [Data augmentation](./docs/zh_CN/advanced_tutorials/image_augmentation/ImageAugment.md)
- FAQ
  - [Image recognition task FAQ](docs/zh_CN/faq_series/faq_2021_s2.md)
  - [Image classification task FAQ](docs/zh_CN/faq.md)
- [License](#许可证书)
- [Contribution](#贡献代码)

<a name="图像识别系统介绍"></a>
## Introduction to the image recognition system

<div align="center">
<img src="./docs/images/structure.png" width = "400" />
Global:
  rec_inference_model_dir: "./models/cartoon_rec_ResNet50_iCartoon_v1.0_infer/"
  batch_size: 32
  use_gpu: True
  enable_mkldnn: True
  cpu_num_threads: 10
......
Global:
  rec_inference_model_dir: "./models/logo_rec_ResNet50_Logo3K_v1.0_infer/"
  batch_size: 32
  use_gpu: True
  enable_mkldnn: True
  cpu_num_threads: 10
......
Global:
  rec_inference_model_dir: "./models/product_ResNet50_vd_aliproduct_v1.0_infer"
  batch_size: 32
  use_gpu: True
  enable_mkldnn: True
  cpu_num_threads: 10
......
Global:
  rec_inference_model_dir: "./models/vehicle_cls_ResNet50_CompCars_v1.0_infer/"
  batch_size: 32
  use_gpu: True
  enable_mkldnn: True
  cpu_num_threads: 10
......
...@@ -41,6 +41,29 @@ class ClsPredictor(Predictor):
        if "PostProcess" in config:
            self.postprocess = build_postprocess(config["PostProcess"])

        # for whole_chain project to test each repo of paddle
        self.benchmark = config["Global"].get("benchmark", False)
        if self.benchmark:
            import auto_log
            import os
            pid = os.getpid()
            self.auto_logger = auto_log.AutoLogger(
                model_name=config["Global"].get("model_name", "cls"),
                model_precision='fp16'
                if config["Global"]["use_fp16"] else 'fp32',
                batch_size=config["Global"].get("batch_size", 1),
                data_shape=[3, 224, 224],
                save_path=config["Global"].get("save_log_path",
                                               "./auto_log.log"),
                inference_config=self.config,
                pids=pid,
                process_name=None,
                gpu_ids=None,
                time_keys=[
                    'preprocess_time', 'inference_time', 'postprocess_time'
                ],
                warmup=2)
    def predict(self, images):
        input_names = self.paddle_predictor.get_input_names()
        input_tensor = self.paddle_predictor.get_input_handle(input_names[0])
...@@ -49,16 +72,26 @@ class ClsPredictor(Predictor):
        output_tensor = self.paddle_predictor.get_output_handle(output_names[
            0])

        if self.benchmark:
            self.auto_logger.times.start()
        if not isinstance(images, (list, )):
            images = [images]
        for idx in range(len(images)):
            for ops in self.preprocess_ops:
                images[idx] = ops(images[idx])
        image = np.array(images)
        if self.benchmark:
            self.auto_logger.times.stamp()

        input_tensor.copy_from_cpu(image)
        self.paddle_predictor.run()
        batch_output = output_tensor.copy_to_cpu()
        if self.benchmark:
            self.auto_logger.times.stamp()
        if self.postprocess is not None:
            batch_output = self.postprocess(batch_output)
        if self.benchmark:
            self.auto_logger.times.end(stamp=True)
        return batch_output
...@@ -66,12 +99,40 @@ def main(config):
    cls_predictor = ClsPredictor(config)
    image_list = get_image_list(config["Global"]["infer_imgs"])

    batch_imgs = []
    batch_names = []
    cnt = 0
    for idx, img_path in enumerate(image_list):
        img = cv2.imread(img_path)
        if img is None:
            logger.warning(
                "Image file failed to read and has been skipped. The path: {}".
                format(img_path))
        else:
            img = img[:, :, ::-1]
            batch_imgs.append(img)
            img_name = os.path.basename(img_path)
            batch_names.append(img_name)
            cnt += 1

        if cnt % config["Global"]["batch_size"] == 0 or (idx + 1) == len(image_list):
            if len(batch_imgs) == 0:
                continue

            batch_results = cls_predictor.predict(batch_imgs)
            for number, result_dict in enumerate(batch_results):
                filename = batch_names[number]
                clas_ids = result_dict["class_ids"]
                scores_str = "[{}]".format(", ".join(
                    "{:.2f}".format(r) for r in result_dict["scores"]))
                label_names = result_dict["label_names"]
                print("{}:\tclass id(s): {}, score(s): {}, label_name(s): {}".
                      format(filename, clas_ids, scores_str, label_names))
            batch_imgs = []
            batch_names = []

    if cls_predictor.benchmark:
        cls_predictor.auto_logger.report()
    return
......
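Since `Global.batch_size` now drives batching in both predictor scripts, the batch size can be raised from the command line; a minimal invocation sketch (the image directory path is a placeholder of your own):

```shell
python3 python/predict_cls.py \
    -c configs/inference_cls.yaml \
    -o Global.infer_imgs=./images/ \
    -o Global.batch_size=4
```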
...@@ -54,12 +54,14 @@ class RecPredictor(Predictor):
        input_tensor.copy_from_cpu(image)
        self.paddle_predictor.run()
        batch_output = output_tensor.copy_to_cpu()

        if feature_normalize:
            feas_norm = np.sqrt(
                np.sum(np.square(batch_output), axis=1, keepdims=True))
            batch_output = np.divide(batch_output, feas_norm)

        if self.postprocess is not None:
            batch_output = self.postprocess(batch_output)
        return batch_output
...@@ -67,14 +69,33 @@ def main(config):
    rec_predictor = RecPredictor(config)
    image_list = get_image_list(config["Global"]["infer_imgs"])

    batch_imgs = []
    batch_names = []
    cnt = 0
    for idx, img_path in enumerate(image_list):
        img = cv2.imread(img_path)
        if img is None:
            logger.warning(
                "Image file failed to read and has been skipped. The path: {}".
                format(img_path))
        else:
            img = img[:, :, ::-1]
            batch_imgs.append(img)
            img_name = os.path.basename(img_path)
            batch_names.append(img_name)
            cnt += 1

        if cnt % config["Global"]["batch_size"] == 0 or (idx + 1) == len(image_list):
            if len(batch_imgs) == 0:
                continue

            batch_results = rec_predictor.predict(batch_imgs)
            for number, result_dict in enumerate(batch_results):
                filename = batch_names[number]
                print("{}:\t{}".format(filename, result_dict))
            batch_imgs = []
            batch_names = []

    return
......
...@@ -28,7 +28,7 @@ class Predictor(object):
        if args.use_fp16 is True:
            assert args.use_tensorrt is True
        self.args = args
        self.paddle_predictor, self.config = self.create_paddle_predictor(
            args, inference_model_dir)

    def predict(self, image):
...@@ -59,11 +59,12 @@ class Predictor(object):
            config.enable_tensorrt_engine(
                precision_mode=Config.Precision.Half
                if args.use_fp16 else Config.Precision.Float32,
                max_batch_size=args.batch_size,
                min_subgraph_size=30)

        config.enable_memory_optim()
        # use zero copy
        config.switch_use_feed_fetch_ops(False)
        predictor = create_predictor(config)

        return predictor, config
# Vector search
## 1. Introduction
Some vertical-domain recognition tasks (e.g., vehicles, commodities) involve a large number of categories and often take a retrieval-based approach: the predicted category is obtained by a fast nearest-neighbor search between the query vector and the gallery vectors. The vector search module provides this basic approximate nearest-neighbor search capability based on Baidu's self-developed Möbius algorithm, a graph-based approximate nearest-neighbor search algorithm for maximum inner product search (MIPS). The module provides a Python interface, supports NumPy and tensor-type vectors, and supports both L2 and inner-product distance calculation.

Details of the Möbius algorithm can be found in the paper [Möbius Transformation for Fast Inner Product Search on Graph](http://research.baidu.com/Public/uploads/5e189d36b5cf6.PDF) ([code](https://github.com/sunbelbd/mobius)).
## 2. Installation
### 2.1 Use the provided library files directly
This folder contains the compiled `index.so` (compiled with gcc 8.2.0 on Linux) and `index.dll` (compiled with gcc 10.3.0 on Windows), which can be used directly, skipping sections 2.2 and 2.3.

If the provided library files cannot be used because of a low gcc version or an incompatible environment, you need to compile the library files manually on your platform.

**Note:** Make sure that your C++ compiler supports the C++11 standard.
### 2.2 Compile and generate library files on Linux
Run the following command to install gcc and g++.
```
sudo apt-get update
sudo apt-get upgrade -y
sudo apt-get install build-essential gcc g++
```
Check the gcc version with the command `gcc -v`.

Then `make` can be run directly. If you wish to regenerate `index.so`, first run `make clean` to clear the cache, and then run `make` to build the updated library file, as sketched below.
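For reference, the full Linux rebuild flow described above, run from this folder (`deploy/vector_search`), amounts to:

```
gcc -v       # confirm the compiler version
make clean   # clear the build cache
make         # regenerate index.so
```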
### 2.3 Compile and generate library files on Windows
You need to install the gcc compiler tool first; we recommend [TDM-GCC](https://jmeubank.github.io/tdm-gcc/articles/2020-03/9.2.0-release), and you can choose the appropriate version on its official website. We recommend downloading [tdm64-gcc-10.3.0-2.exe](https://github.com/jmeubank/tdm-gcc/releases/download/v10.3.0-tdm64-2/tdm64-gcc-10.3.0-2.exe).
After downloading, follow the default installation steps. There are three points to note:

1. The vector search module depends on OpenMP, so you need to check the `openmp` option in the `choose components` step; otherwise the build will fail with `libgomp.spec: No such file or directory` ([reference link](https://github.com/dmlc/xgboost/issues/1027)).
2. When asked whether to add the compiler to the system environment variables, it is recommended to do so; otherwise you will need to add them manually later.
3. The compile command is `make` on Linux but `mingw32-make` on Windows, so be sure to distinguish between them.
After installation, you can open a command line terminal and check the gcc version with the command `gcc -v`.
Run the command `mingw32-make` in this folder (`deploy/vector_search`) to generate the `index.dll` library file. If you want to regenerate `index.dll`, first run `mingw32-make clean` to clear the cache, and then run `mingw32-make` to build the updated library file, as sketched below.
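The Windows counterpart of the rebuild flow, run from the same folder, is:

```
gcc -v               # confirm TDM-GCC is on PATH
mingw32-make clean   # clear the build cache
mingw32-make         # regenerate index.dll
```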
### 2.4 Compile and generate library files on MacOS
Run the following command to install gcc and g++:
```
brew install gcc
```
#### Caution:
1. If prompted with `Error: Running Homebrew as root is extremely dangerous and no longer supported... `, refer to this [link](https://jingyan.baidu.com/article/e52e3615057a2840c60c519c.html)
2. If prompted with `Error: Failure while executing; tar --extract --no-same-owner --file... `, refer to this [link](https://blog.csdn.net/Dawn510/article/details/117787358).
After installation, the compiled executables are copied under `/usr/local/bin`; check the gcc versions in this folder:
```
ls /usr/local/bin/gcc*
```
If the local gcc version is gcc-11, the compile command is as follows (if the local gcc version is gcc-9, the corresponding command is `CXX=g++-9 make`):
```
CXX=g++-11 make
```
## 3. Quick use
```
import numpy as np
from interface import Graph_Index
# Random sample generation
index_vectors = np.random.rand(100000,128).astype(np.float32)
query_vector = np.random.rand(128).astype(np.float32)
index_docs = ["ID_"+str(i) for i in range(100000)]
# Initialize index structure
indexer = Graph_Index(dist_type="IP")  # supports "IP" and "L2"
indexer.build(gallery_vectors=index_vectors, gallery_docs=index_docs, pq_size=100, index_path='test')
# Query
scores, docs = indexer.search(query=query_vector, return_k=10, search_budget=100)
print(scores)
print(docs)
# Save and load
indexer.dump(index_path="test")
indexer.load(index_path="test")
```
...@@ -7,29 +7,30 @@ VisualDL, a visualization analysis tool of PaddlePaddle, provides a variety of c
Now PaddleClas supports using VisualDL to visualize the changes of learning rate, loss, and accuracy during training.

### Set config and start training

You only need to set the field `Global.use_visualdl` to `True` in the train config:

```yaml
# config.yaml
Global:
  ...
  use_visualdl: True
  ...
```

PaddleClas will save the VisualDL logs to the subdirectory `vdl/` under the output directory specified by `Global.output_dir`. Then you just need to start training normally:

```shell
python3 tools/train.py -c config.yaml
```

### Start VisualDL

After starting the training program, you can start the VisualDL service in a new terminal session:

```shell
visualdl --logdir ./output/vdl/
```

In the above command, `--logdir` specifies the directory of the VisualDL logs produced in training. VisualDL will traverse the subdirectories of the specified directory to visualize all the experimental results. You can also use the following parameters to set the IP and port number of the VisualDL service:

* `--host`: IP address, default is 127.0.0.1
* `--port`: port, default is 8040
......
...@@ -3,9 +3,9 @@

## Overview

The Twins networks include Twins-PCPVT and Twins-SVT, which focus on a careful design of the spatial attention mechanism, resulting in a simple but more effective solution. Since the architecture only involves matrix multiplication, for which current deep learning frameworks are highly optimized, the architecture is very efficient and easy to implement. Moreover, it achieves excellent performance on a variety of downstream vision tasks such as image classification, object detection, and semantic segmentation. [Paper](https://arxiv.org/abs/2104.13840).

## Accuracy, FLOPs and Parameters

| Models | Top1 | Top5 | Reference<br>top1 | Reference<br>top5 | FLOPs<br>(G) | Params<br>(M) |
|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
| pcpvt_small | 0.8082 | 0.9552 | 0.812 | - | 3.7 | 24.1 |
| pcpvt_base | 0.8242 | 0.9619 | 0.827 | - | 6.4 | 43.8 |
......
# Configuration Instructions

------

## Introduction

This document describes the parameters in the PaddleClas configuration files (`ppcls/configs/*.yaml`) so that you can customize or modify the hyperparameter configuration more quickly.
## Details
### 1. Classification model
Here the configuration of `ResNet50_vd` on `ImageNet-1k` is used as an example to explain each parameter in detail. [Config path](https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml).

#### 1.1 Global configuration
| Parameter name | Specific meaning | Default value | Optional value |
| ------------------ | ------------------------------------------------------- | ---------------- | ----------------- |
| checkpoints | Checkpoint model path for resuming training | null | str |
| pretrained_model | Pretrained model path | null | str |
| output_dir | Path for saving the model | "./output/" | str |
| save_interval | Interval (in epochs) at which the model is saved | 1 | int |
| eval_during_train | Whether to evaluate during training | True | bool |
| eval_interval | Interval (in epochs) at which evaluation is run | 1 | int |
| epochs | Total number of training epochs | | int |
| print_batch_step | Interval (in mini-batches) at which logs are printed | 10 | int |
| use_visualdl | Whether to visualize the training process with VisualDL | False | bool |
| image_shape | Image size | [3, 224, 224] | list, shape: (3,) |
| save_inference_dir | Path for saving the inference model | "./inference" | str |
| eval_mode | Evaluation mode | "classification" | "retrieval" |

**Note**: An HTTP address of pretrained weights can also be filled in `pretrained_model`.
#### 1.2 Architecture
| Parameter name | Specific meaning | Default value | Optional value |
| -------------- | ----------------- | ------------ | --------------------- |
| name | Model architecture name | ResNet50 | PaddleClas model architectures |
| class_num | Number of classes | 1000 | int |
| pretrained | Pretrained model | False | bool, str |

**Note**: Here `pretrained` can be set to True/False or to a weights path. In addition, `pretrained` is ignored when `Global.pretrained_model` is set to a corresponding path.
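As a concrete illustration, a minimal `Arch` section matching the table above might look as follows (values are illustrative):

```
Arch:
  name: ResNet50
  class_num: 1000
  pretrained: False  # True, False, or a local/HTTP path to weights
```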
#### 1.3 Loss function
| Parameter name | Specific meaning | Default value | Optional value |
| -------------- | ------------------------------------------- | ------------ | ---------------------- |
| CELoss | Cross-entropy loss function | —— | —— |
| CELoss.weight | Weight of CELoss in the total loss | 1.0 | float |
| CELoss.epsilon | Epsilon value of label smoothing in CELoss | 0.1 | float, between 0 and 1 |
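For example, the corresponding `Loss` section in the config (a minimal sketch following the `Train`/`Eval` layout used by PaddleClas configs) might read:

```
Loss:
  Train:
    - CELoss:
        weight: 1.0
        epsilon: 0.1   # enables label smoothing
```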
#### 1.4 Optimizer
| Parameter name | Specific meaning | Default value | Optional value |
| ----------------- | -------------------------------- | ------------ | -------------------------------------------------- |
| name | Optimizer method name | "Momentum" | Other optimizers such as "RmsProp" |
| momentum | Momentum value | 0.9 | float |
| lr.name | Learning rate decay method | "Cosine" | Other decay methods such as "Linear" and "Piecewise" |
| lr.learning_rate | Initial learning rate | 0.1 | float |
| lr.warmup_epoch | Warmup epochs | 0 | int, e.g. 5 |
| regularizer.name | Regularization method name | "L2" | ["L1", "L2"] |
| regularizer.coeff | Regularization factor | 0.00007 | float |

**Note**: The additional parameters differ depending on `lr.name`; for example, when `lr.name=Piecewise`, the following parameters need to be added:
```
lr:
name: Piecewise
learning_rate: 0.1
decay_epochs: [30, 60, 90]
values: [0.1, 0.01, 0.001, 0.0001]
```
Refer to [learning_rate.py](https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/optimizer/learning_rate.py) for the supported decay methods and their parameters.
#### 1.5 Data reading module (DataLoader)
##### 1.5.1 dataset
| Parameter name | Specific meaning | Default value | Optional value |
| ------------------- | ------------------------------------ | ----------------------------------- | ------------------------------ |
| name | Name of the class that reads the data | ImageNetDataset | VeriWild and other Dataset types |
| image_root | The path where the dataset is stored | ./dataset/ILSVRC2012/ | str |
| cls_label_path | data label list | ./dataset/ILSVRC2012/train_list.txt | str |
| transform_ops | data preprocessing for single images | —— | —— |
| batch_transform_ops | Data preprocessing for batch images | —— | —— |
The meanings of the parameters in `transform_ops`:
| Function name | Parameter name | Specific meaning |
| -------------- | -------------- | --------------------- |
| DecodeImage | to_rgb | data to RGB |
| | channel_first | image data by CHW |
| RandCropImage | size | Random crop |
| RandFlipImage | | Random flip |
| NormalizeImage | scale | Normalize scale value |
| | mean | Normalize mean value |
| | std | Normalize std value |
| | order | Normalize order |
| CropImage | size | crop size |
| ResizeImage | resize_short | resize by short edge |
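Putting the table together, a typical `transform_ops` list for training might look like this (values are illustrative):

```
transform_ops:
  - DecodeImage:
      to_rgb: True
      channel_first: False
  - RandCropImage:
      size: 224
  - RandFlipImage:
      flip_code: 1
  - NormalizeImage:
      scale: 1.0/255.0
      mean: [0.485, 0.456, 0.406]
      std: [0.229, 0.224, 0.225]
      order: ''
```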
The meanings of the parameters in `batch_transform_ops`:
| Function name | Parameter name | Specific meaning |
| ------------- | -------------- | --------------------------------------- |
| MixupOperator | alpha | Mixup alpha value; the larger the value, the stronger the augmentation |
##### 1.5.2 sampler
| Parameter name | Specific meaning | Default value | Optional value |
| -------------- | ------------------------------------------------------------ | ----------------------- | -------------------------------------------------- |
| name | sampler type | DistributedBatchSampler | DistributedRandomIdentitySampler and other Sampler |
| batch_size | batch size | 64 | int |
| drop_last | Whether to drop the last incomplete batch that does not reach batch_size | False | bool |
| shuffle | whether to shuffle the data | True | bool |
##### 1.5.3 loader
| Parameter name | Specific meaning | Default value | Optional value |
| ----------------- | ---------------------------- | --------------- | ---------------- |
| num_workers | Number of data read threads | 4 | int |
| use_shared_memory | Whether to use shared memory | True | bool |
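A matching `sampler`/`loader` pair under `DataLoader.Train` might be (values are illustrative):

```
sampler:
  name: DistributedBatchSampler
  batch_size: 64
  drop_last: False
  shuffle: True
loader:
  num_workers: 4
  use_shared_memory: True
```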
#### 1.6 Evaluation metric
| Parameter name | Specific meaning | Default value | Optional value |
| -------------- | ---------------- | --------------- | ---------------- |
| TopkAcc | TopkAcc | [1, 5] | list, int |
#### 1.7 Inference
| Parameter name | Specific meaning | Default value | Optional value |
| ----------------------------- | --------------------------------- | ------------------------------------- | ---------------- |
| infer_imgs | Image address to be inferred | docs/images/whl/demo.jpg | str |
| batch_size | batch size | 10 | int |
| PostProcess.name | Post-process name | Topk | str |
| PostProcess.topk | topk value | 5 | int |
| PostProcess.class_id_map_file | mapping file of class id and name | ppcls/utils/imagenet1k_label_list.txt | str |
**Note**: For the interpretation of `transforms` in the Infer module, refer to the interpretation of `transform_ops` in the dataset part of the data reading module.
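A minimal `Infer` section consistent with the table above (the paths and values are the defaults quoted there; the `transforms` list is omitted here):

```
Infer:
  infer_imgs: docs/images/whl/demo.jpg
  batch_size: 10
  PostProcess:
    name: Topk
    topk: 5
    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
```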
### 2. Distillation model

**Note**: Here the training configuration for distilling `MobileNetV3_small_x1_0` from `MobileNetV3_large_x1_0` on `ImageNet-1k` is used as an example to explain each parameter in detail. [Config path](https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml). Only parameters that differ from the classification model are introduced here.
#### 2.1 Architecture
| Parameter name | Specific meaning | Default value | Optional value |
| ------------------ | --------------------------------------------------------- | ---------------------- | ---------------------------------- |
| name | model arch name | DistillationModel | —— |
| class_num | category number | 1000 | int |
| freeze_params_list | freeze_params_list | [True, False] | list |
| models | model list | [Teacher, Student] | list |
| Teacher.name | teacher model name | MobileNetV3_large_x1_0 | PaddleClas model |
| Teacher.pretrained | teacher model pre-trained weights | True | Boolean or pre-trained weight path |
| Teacher.use_ssld | whether teacher model pretrained weights are ssld weights | True | Boolean |
| infer_model_name | type of the model being inferred | Student | Teacher |
**Note**
1. list is represented in yaml as follows:
```
freeze_params_list:
- True
- False
```
2. The Student's parameters are similar and are not repeated here.
#### 2.2 Loss function
| Parameter name | Specific meaning | Default value | Optional value |
| ----------------------------------- | ------------------------------------------------------------ | --------------- | ---------------- |
| DistillationCELoss | Distillation cross-entropy loss between models | —— | —— |
| DistillationCELoss.weight | Loss weight | 1.0 | float |
| DistillationCELoss.model_name_pairs | Pairs of model names between which the loss is computed | ["Student", "Teacher"] | —— |
| DistillationGTCELoss | Distillation cross-entropy loss between a model and the ground-truth label | —— | —— |
| DistillationGTCELoss.weight | Loss weight | 1.0 | float |
| DistillationGTCELoss.model_names | Names of the models whose outputs are compared with the ground-truth label | ["Student"] | —— |
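In YAML form, the distillation loss configuration sketched by the table is roughly:

```
Loss:
  Train:
    - DistillationCELoss:
        weight: 1.0
        model_name_pairs:
          - ["Student", "Teacher"]
    - DistillationGTCELoss:
        weight: 1.0
        model_names: ["Student"]
```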
#### 2.3 Evaluation metric
| Parameter name | Specific meaning | Default value | Optional value |
| ----------------------------- | ------------------- | ---------------------------- | ---------------- |
| DistillationTopkAcc | DistillationTopkAcc | including model_key and topk | —— |
| DistillationTopkAcc.model_key | the evaluated model | "Student" | "Teacher" |
| DistillationTopkAcc.topk | Topk value | [1, 5] | list, int |
**Note**: `DistillationTopkAcc` has the same meaning as `TopkAcc`, except that it is only used in distillation tasks.
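For example (a minimal sketch following the same config layout):

```
Metric:
  Train:
    - DistillationTopkAcc:
        model_key: "Student"
        topk: [1, 5]
```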
### 3. Recognition model
**Note**: The training configuration of `ResNet50` on `LogoDet-3k` is used here as an example to explain each parameter in detail. [Config path](https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/configs/Logo/ResNet50_ReID.yaml). Only parameters that differ from the classification model are presented here.

#### 3.1 Architecture
| Parameter name | Specific meaning | Default value | Optional value |
| ---------------------- | ------------------------------------------------------------ | --------------------------- | ------------------------------------------------------------ |
| name | Model architecture | "RecModel" | ["RecModel"] |
| infer_output_key | Output value used in inference | "feature" | ["feature", "logits"] |
| infer_add_softmax | Whether to add softmax to the inference output | False | [True, False] |
| Backbone.name | Backbone name | ResNet50_last_stage_stride1 | Other backbones provided by PaddleClas |
| Backbone.pretrained | Backbone pretrained model | True | Boolean value or pretrained model path |
| BackboneStopLayer.name | Name of the output layer in the Backbone | True | The `full_name` of the feature output layer in the Backbone |
| Neck.name | Name of the Neck part | VehicleNeck | Dictionary structure to be passed in, with the specific input parameters of the Neck network layer |
| Neck.in_channels | Input dimension of the Neck part | 2048 | Consistent with the output of the layer given by BackboneStopLayer.name |
| Neck.out_channels | Output dimension of the Neck part, i.e. the feature dimension | 512 | int |
| Head.name | Name of the network Head part | CircleMargin | ArcMargin, etc. |
| Head.embedding_size | Feature dimension | 512 | Consistent with Neck.out_channels |
| Head.class_num | Number of classes | 3000 | int |
| Head.margin | Margin value in CircleMargin | 0.35 | float |
| Head.scale | Scale value in CircleMargin | 64 | int |
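A condensed `Arch` sketch for the recognition model described above; the `BackboneStopLayer` name below is a hypothetical placeholder and must be replaced by the real `full_name` of the layer in your backbone:

```
Arch:
  name: RecModel
  infer_output_key: feature
  infer_add_softmax: False
  Backbone:
    name: ResNet50_last_stage_stride1
    pretrained: True
  BackboneStopLayer:
    name: flatten_0   # hypothetical; use the real full_name
  Neck:
    name: VehicleNeck
    in_channels: 2048
    out_channels: 512
  Head:
    name: CircleMargin
    embedding_size: 512
    class_num: 3000
    margin: 0.35
    scale: 64
```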
**Note**

1. In PaddleClas, the `Neck` part is the connection between the Backbone and the embedding layer, and the `Head` part is the connection between the embedding layer and the classification layer.
2. `BackboneStopLayer.name` can be obtained by visualizing the model; for visualization, refer to [Netron](https://github.com/lutzroeder/netron) or [visualdl](https://github.com/PaddlePaddle/VisualDL).
3. Calling `tools/export_model.py` converts the model weights to an inference model, where the `infer_add_softmax` parameter controls whether a Softmax activation function is appended. The default in the code is True (in classification tasks, the last output layer is followed by a Softmax activation function). In recognition tasks, the feature layer does not need an activation function, so it should be set to False here; an export sketch is shown after this list.
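A hedged sketch of the export call mentioned in note 3 (the config and weight paths are placeholders to be replaced with your own):

```
python3 tools/export_model.py \
    -c ppcls/configs/Logo/ResNet50_ReID.yaml \
    -o Global.pretrained_model=./output/RecModel/best_model \
    -o Global.save_inference_dir=./inference
```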
#### 3.2 Evaluation metric
| Parameter name | Specific meaning | Default value | Optional value |
| -------------- | --------------------------- | --------------- | ---------------- |
| Recallk | Recall rate | [1, 5] | list, int |
| mAP | Average retrieval precision | None | None |
# Configuration
---
## Introduction
This document introduces the configuration (defined in `config/*.yaml`) of PaddleClas.

* Note: Some parameters do not appear in the yaml file (because they are not used for this file). During training or validation, you can use the command-line option `-o` to update or add specified parameters, as shown below. For example, `-o checkpoints=./ckp_path/ppcls` means that the parameter `checkpoints` will be updated or added with the value `./ckp_path/ppcls`.
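For instance, the override from the note looks like this on the command line (the config path is a placeholder):

```
python3 tools/train.py \
    -c configs/ResNet/ResNet50_vd.yaml \
    -o checkpoints=./ckp_path/ppcls
```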
### Basic
| name | detail | default value | optional value |
|:---:|:---:|:---:|:---:|
| mode | running mode | "train" | ["train", "valid"] |
| checkpoints | checkpoint model path for resuming training process | "" | Str |
| last_epoch | last epoch of training, used with checkpoints | -1 | int |
| pretrained_model | pretrained model path | "" | Str |
| load_static_weights | whether the pretrained model is saved in static mode | False | bool |
| model_save_dir | model stored path | "" | Str |
| classes_num | class number | 1000 | int |
| total_images | total images | 1281167 | int |
| save_interval | save interval | 1 | int |
| validate | whether to validate when training | True | bool |
| valid_interval | valid interval | 1 | int |
| epochs | epoch | | int |
| topk | K value | 5 | int |
| image_shape | image size | [3,224,224] | list, shape: (3,) |
| use_mix | whether to use mixup | False | ['True', 'False'] |
| ls_epsilon | label_smoothing epsilon value| 0 | float |
| use_distillation | whether to use SSLD distillation training | False | bool |
### ARCHITECTURE
| name | detail | default value | optional value |
|:---:|:---:|:---:|:---:|
| name | model name | "ResNet50_vd" | one of 23 architectures |
| params | model parameters | {} | extra dictionary for the model structure, parameters such as `padding_type` in EfficientNet can be set here |
### LEARNING_RATE
| name | detail | default value |Optional value |
|:---:|:---:|:---:|:---:|
| function | decay type | "Linear" | ["Linear", "Cosine", <br> "Piecewise", "CosineWarmup"] |
| params.lr | initial learning rate | 0.1 | float |
| params.decay_epochs | milestones in PiecewiseDecay | | list |
| params.gamma | gamma in PiecewiseDecay | 0.1 | float |
| params.warmup_epoch | warmup epochs | 5 | int |
| params.steps | decay steps in LinearDecay | 100 | int |
| params.end_lr | end learning rate in LinearDecay | 0 | float |
### OPTIMIZER
| name | detail | default value | optional value |
|:---:|:---:|:---:|:---:|
| function | optimizer name | "Momentum" | ["Momentum", "RmsProp"] |
| params.momentum | momentum value | 0.9 | float |
| regularizer.function | regularizer method name | "L2" | ["L1", "L2"] |
| regularizer.factor | regularizer factor | 0.0001 | float |
### reader
| name | detail |
|:---:|:---:|
| batch_size | batch size |
| num_workers | worker number |
| file_list | train list path |
| data_dir | train dataset path |
| shuffle_seed | seed |
preprocessing
| function name | attribute name | detail |
|:---:|:---:|:---:|
| DecodeImage | to_rgb | decode to RGB |
| | to_np | to numpy |
| | channel_first | Channel first |
| RandCropImage | size | random crop |
| RandFlipImage | | random flip |
| NormalizeImage | scale | normalize image |
| | mean | mean |
| | std | std |
| | order | order |
| ToCHWImage | | to CHW |
| CropImage | size | crop size |
| ResizeImage | resize_short | resize according to short size |
mix preprocessing
| name| detail|
|:---:|:---:|
| MixupOperator.alpha | alpha value in mixup|
...@@ -23,7 +23,7 @@ Among them, `-c` is used to specify the path of the configuration file, `-o` is

`-o use_gpu=True` means to use GPU for training. If you want to use the CPU for training, you need to set `use_gpu` to `False`.

Of course, you can also directly modify the configuration file to update the configuration. For specific configuration parameters, please refer to [Configuration Document](config_description_en.md).

* The output log examples are as follows:
* If mixup or cutmix is used in training, top-1 and top-k (default by 5) will not be printed in the log:
......
...@@ -5,7 +5,7 @@

* installing from pypi

```bash
pip3 install paddleclas==2.2.1
```
......
docs/images/wx_group.png (image updated: 209.1 KB → 199.6 KB)
...@@ -167,4 +167,22 @@ python tools/export_model.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml

For more model export tutorials, please refer to [EXPORT_MODEL](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/deploy/EXPORT_MODEL.md).

Finally, the directory `inference/ppyolov2_r50vd_dcn_365e_coco` contains `inference.pdiparams`, `inference.pdiparams.info`, and `inference.pdmodel`, where `inference.pdiparams` is the saved weights file of the inference model and `inference.pdmodel` is the saved structure file of the inference model.

After exporting the model, in mainbody detection and recognition tasks, you can change the detection model path to this inference model path to run prediction.

Taking product recognition as an example, its config file is [inference_product.yaml](../../../deploy/configs/inference_product.yaml); change its `Global.det_inference_model_dir` field to the exported mainbody detection inference model directory, and refer to the [quick start tutorial of image recognition](../tutorials/quick_start_recognition.md) to complete the product detection and recognition process.

### FAQ

#### Q: Can other mainbody detection model architectures be used?

* A: Yes, but the current detection preprocessing only fits the preprocessing of YOLO-series models, so it is recommended to train with YOLO-series models first. If you want to use other series such as Faster R-CNN, you need to modify the preprocessing logic to follow PaddleDetection's data preprocessing. If you have needs or questions about this, feel free to open an issue or give feedback in the group.

#### Q: Can the prediction resolution of mainbody detection be modified?

* A: Yes, but two things need attention:
    * The mainbody detection model provided in PaddleClas is trained at a resolution of 640x640, so prediction also uses 640x640 by default; predicting at other resolutions will reduce the accuracy somewhat.
    * When exporting the model, it is recommended to also change the export resolution so that the export and prediction resolutions stay consistent.
...@@ -7,15 +7,17 @@ VisualDL, the visualization and analysis tool of PaddlePaddle, presents training parameters with rich charts

Now PaddleClas supports using VisualDL during training to view the changes of learning rate, loss, and accuracy.

### Set the config file and start training

To use VisualDL in PaddleClas, you only need to set the field `Global.use_visualdl` to `True` in the training config file:

```yaml
# config.yaml
Global:
  ...
  use_visualdl: True
  ...
```

PaddleClas will save the VisualDL logs in the `vdl/` subdirectory under the directory specified by the `Global.output_dir` field; then just start training normally:

```shell
python3 tools/train.py -c config.yaml
...@@ -25,10 +27,10 @@ python3 tools/train.py -c config.yaml

After starting the training program, you can start the VisualDL service in a new terminal session:

```shell
visualdl --logdir ./output/vdl/
```

In the above command, the `--logdir` parameter specifies the directory where the VisualDL logs are saved. VisualDL will traverse the subdirectories of the specified directory to visualize all experimental results. You can also use the following parameters to set the IP and port of the VisualDL service:

* `--host`: set the IP, default is 127.0.0.1
* `--port`: set the port, default is 8040
......
...@@ -3,9 +3,9 @@

## Overview

The Twins networks include Twins-PCPVT and Twins-SVT, which focus on a careful design of the spatial attention mechanism, resulting in a simple yet more effective solution. Since the architecture only involves matrix multiplication, for which current deep learning frameworks are highly optimized, it is very efficient and easy to implement. Moreover, it achieves excellent performance on a variety of downstream vision tasks such as image classification, object detection, and semantic segmentation. [Paper](https://arxiv.org/abs/2104.13840)

## Accuracy, FLOPs and Parameters

| Models | Top1 | Top5 | Reference<br>top1 | Reference<br>top5 | FLOPs<br>(G) | Params<br>(M) |
|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
| pcpvt_small | 0.8082 | 0.9552 | 0.812 | - | 3.7 | 24.1 |
| pcpvt_base | 0.8242 | 0.9619 | 0.827 | - | 6.4 | 43.8 |
......
...@@ -30,7 +30,7 @@ python3 tools/train.py \

Here, `-c` specifies the path of the config file and `-o` specifies the parameters to be modified or added; `-o Arch.pretrained=False` means not using a pretrained model, and `-o Global.device=gpu` means training with GPU. If you want to train with CPU, set `Global.device` to `cpu`.

For more detailed training configuration, you can also directly modify the config file of the model. For specific configuration parameters, refer to the [configuration document](config_description.md).

Run the above command, and the output log looks like this:

...@@ -244,7 +244,7 @@ python3 python/predict_cls.py \
    -c configs/inference_cls.yaml \
    -o Global.infer_imgs=../dataset/flowers102/jpg/image_00001.jpg \
    -o Global.inference_model_dir=../inference/ \
    -o PostProcess.Topk.class_id_map_file=None

Where:
......
...@@ -128,7 +128,7 @@ python3 -m paddle.distributed.launch \

PaddleClas includes the self-developed SSLD knowledge distillation scheme; for details, refer to the [knowledge distillation chapter](../advanced_tutorials/distillation/distillation.md). This section trains the MobileNetV3_large_x1_0 model with knowledge distillation, using the ResNet50_vd model trained in section 2.1.2 as the teacher model. First, save the ResNet50_vd model trained in section 2.1.2 to a specified directory, as follows.

```shell
mkdir pretrained
cp -r output_CIFAR/ResNet50_vd/best_model.pdparams ./pretrained/
```
...@@ -256,5 +256,5 @@ PreProcess:
python3 python/predict_cls.py \
    -c configs/inference_cls.yaml \
    -o Global.infer_imgs=../dataset/CIFAR100/test/0/0001.png \
    -o PostProcess.Topk.class_id_map_file=None
```
...@@ -5,7 +5,7 @@

* pip installation

```bash
pip3 install paddleclas==2.2.1
```

* build and install locally
......
...@@ -18,6 +18,7 @@ __dir__ = os.path.dirname(__file__)
sys.path.append(os.path.join(__dir__, ""))
sys.path.append(os.path.join(__dir__, "deploy"))

from typing import Union, Generator
import argparse
import shutil
import textwrap
...@@ -356,7 +357,7 @@ def download_with_progressbar(url, save_path):
def check_model_file(model_name):
    """Check the model files exist and download and untar when no exist.
    """
    storage_directory = partial(os.path.join, BASE_INFERENCE_MODEL_DIR,
                                model_name)
...@@ -410,11 +411,11 @@ class PaddleClas(object):
        """Init PaddleClas with config.

        Args:
            model_name (str, optional): The model name supported by PaddleClas. If specified, override config. Defaults to None.
            inference_model_dir (str, optional): The directory that contains the model file and params file to be used. If specified, override config. Defaults to None.
            use_gpu (bool, optional): Whether to use GPU. If specified, override config. Defaults to True.
            batch_size (int, optional): The batch size to predict. If specified, override config. Defaults to 1.
            topk (int, optional): Return the top k prediction results with the highest score. Defaults to 5.
        """
        super().__init__()
        self._config = init_config(model_name, inference_model_dir, use_gpu,
...@@ -459,20 +460,26 @@ class PaddleClas(object):
            raise InputModelError(err)
        return

    def predict(self, input_data: Union[str, np.array],
                print_pred: bool=False) -> Generator[list, None, None]:
        """Predict input_data.

        Args:
            input_data (Union[str, np.array]):
                When the type is str, it is the path of an image, a directory containing images, or the URL of an image from the Internet.
                When the type is np.array, it is the image data whose channel order is RGB.
            print_pred (bool, optional): Whether to print the prediction result. Defaults to False.

        Raises:
            ImageTypeError: Illegal input_data.

        Yields:
            Generator[list, None, None]:
                The prediction result(s) of input_data by batch_size. For every image,
                the prediction result(s) is zipped as a dict that includes topk "class_ids", "scores" and "label_names".
                The format is as follows: [{"class_ids": [...], "scores": [...], "label_names": [...]}, ...]
        """
        if isinstance(input_data, np.ndarray):
            outputs = self.cls_predictor.predict(input_data)
            yield self.cls_predictor.postprocess(outputs)
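A short usage sketch of the API above; the model name and image path are illustrative:

```python
import paddleclas

# instantiate with a model name supported by PaddleClas
clas = paddleclas.PaddleClas(model_name="ResNet50")
# predict() is a generator yielding one result list per batch
result = next(clas.predict("docs/images/whl/demo.jpg", print_pred=True))
print(result)  # [{"class_ids": [...], "scores": [...], "label_names": [...]}, ...]
```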
...@@ -502,6 +509,7 @@ class PaddleClas(object):
                    f"Image file failed to read and has been skipped. The path: {img_path}"
                )
                continue
            img = img[:, :, ::-1]
            img_list.append(img)
            img_path_list.append(img_path)
            cnt += 1
......
...@@ -12,15 +12,9 @@ class Identity(nn.Layer):

class TheseusLayer(nn.Layer):
    def __init__(self, *args, **kwargs):
        super(TheseusLayer, self).__init__()
        self.res_dict = {}

    # stop doesn't work when stop layer has a parallel branch.
    def stop_after(self, stop_layer_name: str):
...@@ -38,33 +32,43 @@ class TheseusLayer(nn.Layer):
                    stop_layer_name)
        return after_stop

    def update_res(self, return_patterns):
        if not return_patterns or isinstance(self, WrapLayer):
            return
        for layer_i in self._sub_layers:
            layer_name = self._sub_layers[layer_i].full_name()
            if isinstance(self._sub_layers[layer_i],
                          (nn.Sequential, nn.LayerList)):
                self._sub_layers[layer_i] = wrap_theseus(self._sub_layers[
                    layer_i])
                self._sub_layers[layer_i].res_dict = self.res_dict
                self._sub_layers[layer_i].update_res(return_patterns)
            else:
                for return_pattern in return_patterns:
                    if re.match(return_pattern, layer_name):
                        if not isinstance(self._sub_layers[layer_i],
                                          TheseusLayer):
                            self._sub_layers[layer_i] = wrap_theseus(
                                self._sub_layers[layer_i])
                        self._sub_layers[layer_i].register_forward_post_hook(
                            self._sub_layers[layer_i]._save_sub_res_hook)
                        self._sub_layers[layer_i].res_dict = self.res_dict
                if isinstance(self._sub_layers[layer_i], TheseusLayer):
                    self._sub_layers[layer_i].res_dict = self.res_dict
                    self._sub_layers[layer_i].update_res(return_patterns)

    def _save_sub_res_hook(self, layer, input, output):
        self.res_dict[layer.full_name()] = output

    def replace_sub(self, layer_name_pattern, replace_function,
                    recursive=True):
        for layer_i in self._sub_layers:
            layer_name = self._sub_layers[layer_i].full_name()
            if re.match(layer_name_pattern, layer_name):
                self._sub_layers[layer_i] = replace_function(self._sub_layers[
                    layer_i])
            if recursive:
                if isinstance(self._sub_layers[layer_i], TheseusLayer):
                    self._sub_layers[layer_i].replace_sub(
                        layer_name_pattern, replace_function, recursive)
                elif isinstance(self._sub_layers[layer_i],
                                (nn.Sequential, nn.LayerList)):
                    for layer_j in self._sub_layers[layer_i]._sub_layers:
                        self._sub_layers[layer_i]._sub_layers[
                            layer_j].replace_sub(layer_name_pattern,
                                                 replace_function, recursive)
                else:
                    pass

    '''
    example of replace function:
...@@ -78,3 +82,40 @@ class TheseusLayer(nn.Layer):
            return new_conv
    '''


class WrapLayer(TheseusLayer):
    def __init__(self, sub_layer):
        super(WrapLayer, self).__init__()
        self.sub_layer = sub_layer
        self.name = sub_layer.full_name()

    def full_name(self):
        return self.name

    def forward(self, *inputs, **kwargs):
        return self.sub_layer(*inputs, **kwargs)

    def update_res(self, return_patterns):
        if not return_patterns or not isinstance(
                self.sub_layer, (nn.Sequential, nn.LayerList)):
            return
        for layer_i in self.sub_layer._sub_layers:
            if isinstance(self.sub_layer._sub_layers[layer_i],
                          (nn.Sequential, nn.LayerList)):
                self.sub_layer._sub_layers[layer_i] = wrap_theseus(
                    self.sub_layer._sub_layers[layer_i])
                self.sub_layer._sub_layers[layer_i].res_dict = self.res_dict
                self.sub_layer._sub_layers[layer_i].update_res(return_patterns)
            layer_name = self.sub_layer._sub_layers[layer_i].full_name()
            for return_pattern in return_patterns:
                if re.match(return_pattern, layer_name):
                    self.sub_layer._sub_layers[layer_i].res_dict = self.res_dict
                    # the hook must come from the matched sub-layer itself
                    self.sub_layer._sub_layers[layer_i].register_forward_post_hook(
                        self.sub_layer._sub_layers[layer_i]._save_sub_res_hook)
            if isinstance(self.sub_layer._sub_layers[layer_i], TheseusLayer):
                self.sub_layer._sub_layers[layer_i].update_res(return_patterns)


def wrap_theseus(sub_layer):
    wrapped_layer = WrapLayer(sub_layer)
    return wrapped_layer
...@@ -111,7 +111,7 @@ class VGGNet(TheseusLayer):
        model: nn.Layer. Specific VGG model depends on args.
    """

    def __init__(self,
                 config,
                 stop_grad_layers=0,
                 class_num=1000,
                 return_patterns=None):
        super().__init__()

        self.stop_grad_layers = stop_grad_layers
...@@ -138,7 +138,7 @@ class VGGNet(TheseusLayer):
        self.fc2 = Linear(4096, 4096)
        self.fc3 = Linear(4096, class_num)

    def forward(self, inputs, res_dict=None):
        x = self.conv_block_1(inputs)
        x = self.conv_block_2(x)
        x = self.conv_block_3(x)
...@@ -152,6 +152,9 @@ class VGGNet(TheseusLayer):
        x = self.relu(x)
        x = self.drop(x)
        x = self.fc3(x)
        if self.res_dict and res_dict is not None:
            for res_key in list(self.res_dict):
                res_dict[res_key] = self.res_dict.pop(res_key)
        return x
......
...@@ -82,11 +82,11 @@ class GroupAttention(nn.Layer): ...@@ -82,11 +82,11 @@ class GroupAttention(nn.Layer):
B, total_groups, self.ws**2, 3, self.num_heads, C // self.num_heads B, total_groups, self.ws**2, 3, self.num_heads, C // self.num_heads
]).transpose([3, 0, 1, 4, 2, 5]) ]).transpose([3, 0, 1, 4, 2, 5])
q, k, v = qkv[0], qkv[1], qkv[2] q, k, v = qkv[0], qkv[1], qkv[2]
attn = (q @ k.transpose([0, 1, 2, 4, 3])) * self.scale attn = paddle.matmul(q, k.transpose([0, 1, 2, 4, 3])) * self.scale
attn = nn.Softmax(axis=-1)(attn) attn = nn.Softmax(axis=-1)(attn)
attn = self.attn_drop(attn) attn = self.attn_drop(attn)
attn = (attn @ v).transpose([0, 1, 3, 2, 4]).reshape( attn = paddle.matmul(attn, v).transpose([0, 1, 3, 2, 4]).reshape(
[B, h_group, w_group, self.ws, self.ws, C]) [B, h_group, w_group, self.ws, self.ws, C])
x = attn.transpose([0, 1, 3, 2, 4, 5]).reshape([B, N, C]) x = attn.transpose([0, 1, 3, 2, 4, 5]).reshape([B, N, C])
...@@ -147,11 +147,11 @@ class Attention(nn.Layer): ...@@ -147,11 +147,11 @@ class Attention(nn.Layer):
[2, 0, 3, 1, 4]) [2, 0, 3, 1, 4])
k, v = kv[0], kv[1] k, v = kv[0], kv[1]
attn = (q @ k.transpose([0, 1, 3, 2])) * self.scale attn = paddle.matmul(q, k.transpose([0, 1, 3, 2])) * self.scale
attn = nn.Softmax(axis=-1)(attn) attn = nn.Softmax(axis=-1)(attn)
attn = self.attn_drop(attn) attn = self.attn_drop(attn)
x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, C]) x = paddle.matmul(attn, v).transpose([0, 2, 1, 3]).reshape([B, N, C])
x = self.proj(x) x = self.proj(x)
x = self.proj_drop(x) x = self.proj_drop(x)
return x return x
...@@ -350,7 +350,6 @@ class PyramidVisionTransformer(nn.Layer): ...@@ -350,7 +350,6 @@ class PyramidVisionTransformer(nn.Layer):
shape=[1, 1, embed_dims[-1]], shape=[1, 1, embed_dims[-1]],
default_initializer=zeros_, default_initializer=zeros_,
attr=paddle.ParamAttr(regularizer=L2Decay(0.0))) attr=paddle.ParamAttr(regularizer=L2Decay(0.0)))
self.add_parameter("cls_token", self.cls_token)
# classification head # classification head
self.head = nn.Linear(embed_dims[-1], self.head = nn.Linear(embed_dims[-1],
......
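The attention changes above replace Python's @ operator with explicit paddle.matmul calls; the results are identical for these dense tensors, and the explicit call is presumably friendlier to static-graph export. A quick equivalence check (shapes are arbitrary examples, not from this diff):

import paddle

q = paddle.rand([2, 8, 16, 32])
k = paddle.rand([2, 8, 16, 32])
a1 = q @ k.transpose([0, 1, 3, 2])
a2 = paddle.matmul(q, k.transpose([0, 1, 3, 2]))
print(bool(paddle.allclose(a1, a2)))  # True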
...@@ -12,7 +12,7 @@ MODEL_URLS = { ...@@ -12,7 +12,7 @@ MODEL_URLS = {
"ResNeXt101_32x8d_wsl": "ResNeXt101_32x8d_wsl":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x8d_wsl_pretrained.pdparams", "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x8d_wsl_pretrained.pdparams",
"ResNeXt101_32x16d_wsl": "ResNeXt101_32x16d_wsl":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x816_wsl_pretrained.pdparams", "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x16_wsl_pretrained.pdparams",
"ResNeXt101_32x32d_wsl": "ResNeXt101_32x32d_wsl":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x32d_wsl_pretrained.pdparams", "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x32d_wsl_pretrained.pdparams",
"ResNeXt101_32x48d_wsl": "ResNeXt101_32x48d_wsl":
...@@ -460,17 +460,17 @@ def ResNeXt101_32x8d_wsl(pretrained=False, use_ssld=False, **kwargs): ...@@ -460,17 +460,17 @@ def ResNeXt101_32x8d_wsl(pretrained=False, use_ssld=False, **kwargs):
return model return model
def ResNeXt101_32x16d_wsl(**args): def ResNeXt101_32x16d_wsl(pretrained=False, use_ssld=False, **kwargs):
model = ResNeXt101WSL(cardinality=32, width=16, **kwargs) model = ResNeXt101WSL(cardinality=32, width=16, **kwargs)
_load_pretrained( _load_pretrained(
pretrained, pretrained,
model, model,
MODEL_URLS["ResNeXt101_32x16d_ws"], MODEL_URLS["ResNeXt101_32x16d_wsl"],
use_ssld=use_ssld) use_ssld=use_ssld)
return model return model
def ResNeXt101_32x32d_wsl(**args): def ResNeXt101_32x32d_wsl(pretrained=False, use_ssld=False, **kwargs):
model = ResNeXt101WSL(cardinality=32, width=32, **kwargs) model = ResNeXt101WSL(cardinality=32, width=32, **kwargs)
_load_pretrained( _load_pretrained(
pretrained, pretrained,
...@@ -480,7 +480,7 @@ def ResNeXt101_32x32d_wsl(**args): ...@@ -480,7 +480,7 @@ def ResNeXt101_32x32d_wsl(**args):
return model return model
def ResNeXt101_32x48d_wsl(**args): def ResNeXt101_32x48d_wsl(pretrained=False, use_ssld=False, **kwargs):
model = ResNeXt101WSL(cardinality=32, width=48, **kwargs) model = ResNeXt101WSL(cardinality=32, width=48, **kwargs)
_load_pretrained( _load_pretrained(
pretrained, pretrained,
......
...@@ -33,9 +33,9 @@ MODEL_URLS = { ...@@ -33,9 +33,9 @@ MODEL_URLS = {
"SwinTransformer_base_patch4_window12_384": "SwinTransformer_base_patch4_window12_384":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_base_patch4_window12_384_pretrained.pdparams", "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_base_patch4_window12_384_pretrained.pdparams",
"SwinTransformer_large_patch4_window7_224": "SwinTransformer_large_patch4_window7_224":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_large_patch4_window7_224_pretrained.pdparams", "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_large_patch4_window7_224_22kto1k_pretrained.pdparams",
"SwinTransformer_large_patch4_window12_384": "SwinTransformer_large_patch4_window12_384":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_large_patch4_window12_384_pretrained.pdparams", "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_large_patch4_window12_384_22kto1k_pretrained.pdparams",
} }
__all__ = list(MODEL_URLS.keys()) __all__ = list(MODEL_URLS.keys())
......
...@@ -24,6 +24,7 @@ Loss: ...@@ -24,6 +24,7 @@ Loss:
Train: Train:
- CELoss: - CELoss:
weight: 1.0 weight: 1.0
epsilon: 0.1
Eval: Eval:
- CELoss: - CELoss:
weight: 1.0 weight: 1.0
...@@ -35,9 +36,10 @@ Optimizer: ...@@ -35,9 +36,10 @@ Optimizer:
lr: lr:
name: Cosine name: Cosine
learning_rate: 0.8 learning_rate: 0.8
warmup_epoch: 5
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0004 coeff: 0.00004
# data loader for train and eval # data loader for train and eval
......
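This config diff (repeated for several models below) makes three training tweaks: `epsilon: 0.1` under CELoss turns on label smoothing, `warmup_epoch: 5` adds a warm-up phase before the cosine schedule, and the L2 coefficient drops from 0.0004 to 0.00004, a much weaker weight decay. For reference, label smoothing softens the one-hot target before cross-entropy; a minimal sketch of the idea (illustrative only — the real implementation is ppcls' CELoss):

import paddle
import paddle.nn.functional as F

def smoothed_ce(logits, label, class_num, epsilon=0.1):
    one_hot = F.one_hot(label, class_num)
    # blend the one-hot target with a uniform distribution
    soft_target = one_hot * (1.0 - epsilon) + epsilon / class_num
    log_prob = F.log_softmax(logits, axis=-1)
    return -(soft_target * log_prob).sum(axis=-1).mean()

logits = paddle.rand([4, 1000])
label = paddle.randint(0, 1000, [4])
print(float(smoothed_ce(logits, label, 1000)))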
...@@ -24,6 +24,7 @@ Loss: ...@@ -24,6 +24,7 @@ Loss:
Train: Train:
- CELoss: - CELoss:
weight: 1.0 weight: 1.0
epsilon: 0.1
Eval: Eval:
- CELoss: - CELoss:
weight: 1.0 weight: 1.0
...@@ -35,9 +36,10 @@ Optimizer: ...@@ -35,9 +36,10 @@ Optimizer:
lr: lr:
name: Cosine name: Cosine
learning_rate: 0.8 learning_rate: 0.8
warmup_epoch: 5
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0004 coeff: 0.00004
# data loader for train and eval # data loader for train and eval
......
...@@ -24,6 +24,7 @@ Loss: ...@@ -24,6 +24,7 @@ Loss:
Train: Train:
- CELoss: - CELoss:
weight: 1.0 weight: 1.0
epsilon: 0.1
Eval: Eval:
- CELoss: - CELoss:
weight: 1.0 weight: 1.0
...@@ -35,9 +36,10 @@ Optimizer: ...@@ -35,9 +36,10 @@ Optimizer:
lr: lr:
name: Cosine name: Cosine
learning_rate: 0.8 learning_rate: 0.8
warmup_epoch: 5
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0004 coeff: 0.00004
# data loader for train and eval # data loader for train and eval
......
...@@ -41,7 +41,7 @@ Optimizer: ...@@ -41,7 +41,7 @@ Optimizer:
values: [0.1, 0.01, 0.001, 0.0001] values: [0.1, 0.01, 0.001, 0.0001]
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0003 coeff: 0.00003
# data loader for train and eval # data loader for train and eval
......
...@@ -39,7 +39,7 @@ Optimizer: ...@@ -39,7 +39,7 @@ Optimizer:
values: [0.1, 0.01, 0.001, 0.0001] values: [0.1, 0.01, 0.001, 0.0001]
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0003 coeff: 0.00003
# data loader for train and eval # data loader for train and eval
......
...@@ -39,7 +39,7 @@ Optimizer: ...@@ -39,7 +39,7 @@ Optimizer:
values: [0.1, 0.01, 0.001, 0.0001] values: [0.1, 0.01, 0.001, 0.0001]
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0003 coeff: 0.00003
# data loader for train and eval # data loader for train and eval
......
...@@ -39,7 +39,7 @@ Optimizer: ...@@ -39,7 +39,7 @@ Optimizer:
values: [0.1, 0.01, 0.001, 0.0001] values: [0.1, 0.01, 0.001, 0.0001]
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0003 coeff: 0.00003
# data loader for train and eval # data loader for train and eval
......
...@@ -39,7 +39,7 @@ Optimizer: ...@@ -39,7 +39,7 @@ Optimizer:
learning_rate: 0.045 learning_rate: 0.045
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0004 coeff: 0.00004
# data loader for train and eval # data loader for train and eval
......
...@@ -37,7 +37,7 @@ Optimizer: ...@@ -37,7 +37,7 @@ Optimizer:
learning_rate: 0.045 learning_rate: 0.045
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0003 coeff: 0.00003
# data loader for train and eval # data loader for train and eval
......
...@@ -37,7 +37,7 @@ Optimizer: ...@@ -37,7 +37,7 @@ Optimizer:
learning_rate: 0.045 learning_rate: 0.045
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0003 coeff: 0.00003
# data loader for train and eval # data loader for train and eval
......
...@@ -37,7 +37,7 @@ Optimizer: ...@@ -37,7 +37,7 @@ Optimizer:
learning_rate: 0.045 learning_rate: 0.045
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0004 coeff: 0.00004
# data loader for train and eval # data loader for train and eval
......
...@@ -37,7 +37,7 @@ Optimizer: ...@@ -37,7 +37,7 @@ Optimizer:
learning_rate: 0.045 learning_rate: 0.045
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0004 coeff: 0.00004
# data loader for train and eval # data loader for train and eval
......
...@@ -37,7 +37,7 @@ Optimizer: ...@@ -37,7 +37,7 @@ Optimizer:
learning_rate: 0.045 learning_rate: 0.045
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0004 coeff: 0.00004
# data loader for train and eval # data loader for train and eval
......
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: ./output/
device: gpu
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 120
print_batch_step: 10
use_visualdl: False
# used for static mode and model export
image_shape: [3, 224, 224]
save_inference_dir: ./inference
# model architecture
Arch:
name: ResNeXt101_32x16d_wsl
class_num: 1000
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
Eval:
- CELoss:
weight: 1.0
Optimizer:
name: Momentum
momentum: 0.9
lr:
name: Piecewise
learning_rate: 0.1
decay_epochs: [30, 60, 90]
values: [0.1, 0.01, 0.001, 0.0001]
regularizer:
name: 'L2'
coeff: 0.0001
# data loader for train and eval
DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: True
loader:
num_workers: 4
use_shared_memory: True
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Infer:
infer_imgs: docs/images/whl/demo.jpg
batch_size: 10
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
PostProcess:
name: Topk
topk: 5
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
Metric:
Train:
- TopkAcc:
topk: [1, 5]
Eval:
- TopkAcc:
topk: [1, 5]
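The full config above can also be consumed programmatically; get_config with `-o`-style override strings appears in the train scripts later in this commit. A sketch — the yaml path is an assumed location:

from ppcls.utils.config import get_config

config = get_config(
    "ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml",  # assumed path
    overrides=["Global.epochs=120", "Optimizer.lr.learning_rate=0.1"],
    show=False)
print(config["Arch"]["name"])  # ResNeXt101_32x16d_wsl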
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: ./output/
device: gpu
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 120
print_batch_step: 10
use_visualdl: False
# used for static mode and model export
image_shape: [3, 224, 224]
save_inference_dir: ./inference
# model architecture
Arch:
name: ResNeXt101_32x32d_wsl
class_num: 1000
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
Eval:
- CELoss:
weight: 1.0
Optimizer:
name: Momentum
momentum: 0.9
lr:
name: Piecewise
learning_rate: 0.1
decay_epochs: [30, 60, 90]
values: [0.1, 0.01, 0.001, 0.0001]
regularizer:
name: 'L2'
coeff: 0.0001
# data loader for train and eval
DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: True
loader:
num_workers: 4
use_shared_memory: True
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Infer:
infer_imgs: docs/images/whl/demo.jpg
batch_size: 10
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
PostProcess:
name: Topk
topk: 5
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
Metric:
Train:
- TopkAcc:
topk: [1, 5]
Eval:
- TopkAcc:
topk: [1, 5]
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: ./output/
device: gpu
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 120
print_batch_step: 10
use_visualdl: False
# used for static mode and model export
image_shape: [3, 224, 224]
save_inference_dir: ./inference
# model architecture
Arch:
name: ResNeXt101_32x48d_wsl
class_num: 1000
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
Eval:
- CELoss:
weight: 1.0
Optimizer:
name: Momentum
momentum: 0.9
lr:
name: Piecewise
learning_rate: 0.1
decay_epochs: [30, 60, 90]
values: [0.1, 0.01, 0.001, 0.0001]
regularizer:
name: 'L2'
coeff: 0.0001
# data loader for train and eval
DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: True
loader:
num_workers: 4
use_shared_memory: True
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Infer:
infer_imgs: docs/images/whl/demo.jpg
batch_size: 10
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
PostProcess:
name: Topk
topk: 5
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
Metric:
Train:
- TopkAcc:
topk: [1, 5]
Eval:
- TopkAcc:
topk: [1, 5]
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: ./output/
device: gpu
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 120
print_batch_step: 10
use_visualdl: False
# used for static mode and model export
image_shape: [3, 224, 224]
save_inference_dir: ./inference
# model architecture
Arch:
name: ResNeXt101_32x8d_wsl
class_num: 1000
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
Eval:
- CELoss:
weight: 1.0
Optimizer:
name: Momentum
momentum: 0.9
lr:
name: Piecewise
learning_rate: 0.1
decay_epochs: [30, 60, 90]
values: [0.1, 0.01, 0.001, 0.0001]
regularizer:
name: 'L2'
coeff: 0.0001
# data loader for train and eval
DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: True
loader:
num_workers: 4
use_shared_memory: True
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Infer:
infer_imgs: docs/images/whl/demo.jpg
batch_size: 10
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
PostProcess:
name: Topk
topk: 5
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
Metric:
Train:
- TopkAcc:
topk: [1, 5]
Eval:
- TopkAcc:
topk: [1, 5]
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: ./output/
device: gpu
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 240
print_batch_step: 10
use_visualdl: False
# used for static mode and model export
image_shape: [3, 224, 224]
save_inference_dir: ./inference
# model architecture
Arch:
name: ShuffleNetV2_swish
class_num: 1000
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
Eval:
- CELoss:
weight: 1.0
Optimizer:
name: Momentum
momentum: 0.9
lr:
name: Cosine
learning_rate: 0.5
warmup_epoch: 5
regularizer:
name: 'L2'
coeff: 0.00004
# data loader for train and eval
DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 256
drop_last: False
shuffle: True
loader:
num_workers: 4
use_shared_memory: True
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Infer:
infer_imgs: docs/images/whl/demo.jpg
batch_size: 10
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
PostProcess:
name: Topk
topk: 5
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
Metric:
Train:
- TopkAcc:
topk: [1, 5]
Eval:
- TopkAcc:
topk: [1, 5]
...@@ -38,7 +38,7 @@ Optimizer: ...@@ -38,7 +38,7 @@ Optimizer:
warmup_epoch: 5 warmup_epoch: 5
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0003 coeff: 0.00003
# data loader for train and eval # data loader for train and eval
......
...@@ -38,7 +38,7 @@ Optimizer: ...@@ -38,7 +38,7 @@ Optimizer:
warmup_epoch: 5 warmup_epoch: 5
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0003 coeff: 0.00003
# data loader for train and eval # data loader for train and eval
......
...@@ -38,7 +38,7 @@ Optimizer: ...@@ -38,7 +38,7 @@ Optimizer:
warmup_epoch: 5 warmup_epoch: 5
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0003 coeff: 0.00003
# data loader for train and eval # data loader for train and eval
......
...@@ -38,7 +38,7 @@ Optimizer: ...@@ -38,7 +38,7 @@ Optimizer:
warmup_epoch: 5 warmup_epoch: 5
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0004 coeff: 0.00004
# data loader for train and eval # data loader for train and eval
......
...@@ -38,7 +38,7 @@ Optimizer: ...@@ -38,7 +38,7 @@ Optimizer:
warmup_epoch: 5 warmup_epoch: 5
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0004 coeff: 0.00004
# data loader for train and eval # data loader for train and eval
......
...@@ -38,7 +38,7 @@ Optimizer: ...@@ -38,7 +38,7 @@ Optimizer:
warmup_epoch: 5 warmup_epoch: 5
regularizer: regularizer:
name: 'L2' name: 'L2'
coeff: 0.0004 coeff: 0.00004
# data loader for train and eval # data loader for train and eval
......
...@@ -42,9 +42,9 @@ class RandomErasing(object): ...@@ -42,9 +42,9 @@ class RandomErasing(object):
h = int(round(math.sqrt(target_area * aspect_ratio))) h = int(round(math.sqrt(target_area * aspect_ratio)))
w = int(round(math.sqrt(target_area / aspect_ratio))) w = int(round(math.sqrt(target_area / aspect_ratio)))
if w < img.shape[2] and h < img.shape[1]: if w < img.shape[1] and h < img.shape[0]:
x1 = random.randint(0, img.shape[1] - h) x1 = random.randint(0, img.shape[0] - h)
y1 = random.randint(0, img.shape[2] - w) y1 = random.randint(0, img.shape[1] - w)
if img.shape[0] == 3: if img.shape[0] == 3:
img[x1:x1 + h, y1:y1 + w, 0] = self.mean[0] img[x1:x1 + h, y1:y1 + w, 0] = self.mean[0]
img[x1:x1 + h, y1:y1 + w, 1] = self.mean[1] img[x1:x1 + h, y1:y1 + w, 1] = self.mean[1]
......
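The RandomErasing fix above makes the bounds and offsets consistent with an HWC numpy image: shape[0] is height and shape[1] is width. (The untouched `img.shape[0] == 3` channel test a few lines later still looks CHW-flavored and may deserve the same scrutiny.) A minimal sketch of the corrected coordinate logic, assuming HWC input:

import random
import numpy as np

img = np.zeros((224, 224, 3), dtype=np.float32)  # H, W, C
h, w = 50, 80
if w < img.shape[1] and h < img.shape[0]:         # width vs W, height vs H
    x1 = random.randint(0, img.shape[0] - h)      # row offset
    y1 = random.randint(0, img.shape[1] - w)      # column offset
    img[x1:x1 + h, y1:y1 + w, :] = 0.5            # erase the patch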
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ppcls.engine.evaluation.classification import classification_eval
from ppcls.engine.evaluation.retrieval import retrieval_eval
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import platform
import paddle
from ppcls.utils.misc import AverageMeter
from ppcls.utils import logger
def classification_eval(evaler, epoch_id=0):
output_info = dict()
time_info = {
"batch_cost": AverageMeter(
"batch_cost", '.5f', postfix=" s,"),
"reader_cost": AverageMeter(
"reader_cost", ".5f", postfix=" s,"),
}
print_batch_step = evaler.config["Global"]["print_batch_step"]
metric_key = None
tic = time.time()
eval_dataloader = evaler.eval_dataloader if evaler.use_dali else evaler.eval_dataloader(
)
max_iter = len(evaler.eval_dataloader) - 1 if platform.system(
) == "Windows" else len(evaler.eval_dataloader)
for iter_id, batch in enumerate(eval_dataloader):
if iter_id >= max_iter:
break
        if iter_id == 5:  # reset timers after warm-up iterations so averages stay clean
for key in time_info:
time_info[key].reset()
if evaler.use_dali:
batch = [
paddle.to_tensor(batch[0]['data']),
paddle.to_tensor(batch[0]['label'])
]
time_info["reader_cost"].update(time.time() - tic)
batch_size = batch[0].shape[0]
batch[0] = paddle.to_tensor(batch[0]).astype("float32")
batch[1] = batch[1].reshape([-1, 1]).astype("int64")
# image input
out = evaler.model(batch[0])
# calc loss
if evaler.eval_loss_func is not None:
loss_dict = evaler.eval_loss_func(out, batch[1])
for key in loss_dict:
if key not in output_info:
output_info[key] = AverageMeter(key, '7.5f')
output_info[key].update(loss_dict[key].numpy()[0], batch_size)
# calc metric
if evaler.eval_metric_func is not None:
metric_dict = evaler.eval_metric_func(out, batch[1])
if paddle.distributed.get_world_size() > 1:
for key in metric_dict:
paddle.distributed.all_reduce(
metric_dict[key], op=paddle.distributed.ReduceOp.SUM)
metric_dict[key] = metric_dict[
key] / paddle.distributed.get_world_size()
for key in metric_dict:
if metric_key is None:
metric_key = key
if key not in output_info:
output_info[key] = AverageMeter(key, '7.5f')
output_info[key].update(metric_dict[key].numpy()[0],
batch_size)
time_info["batch_cost"].update(time.time() - tic)
if iter_id % print_batch_step == 0:
time_msg = "s, ".join([
"{}: {:.5f}".format(key, time_info[key].avg)
for key in time_info
])
ips_msg = "ips: {:.5f} images/sec".format(
batch_size / time_info["batch_cost"].avg)
metric_msg = ", ".join([
"{}: {:.5f}".format(key, output_info[key].val)
for key in output_info
])
logger.info("[Eval][Epoch {}][Iter: {}/{}]{}, {}, {}".format(
epoch_id, iter_id,
len(evaler.eval_dataloader), metric_msg, time_msg, ips_msg))
tic = time.time()
if evaler.use_dali:
evaler.eval_dataloader.reset()
metric_msg = ", ".join([
"{}: {:.5f}".format(key, output_info[key].avg) for key in output_info
])
logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg))
    # no eval metric available, so a best model cannot be selected
if evaler.eval_metric_func is None:
return -1
# return 1st metric in the dict
return output_info[metric_key].avg
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import platform
import paddle
from ppcls.utils import logger
def retrieval_eval(evaler, epoch_id=0):
evaler.model.eval()
# step1. build gallery
gallery_feas, gallery_img_id, gallery_unique_id = cal_feature(
evaler, name='gallery')
query_feas, query_img_id, query_query_id = cal_feature(
evaler, name='query')
# step2. do evaluation
sim_block_size = evaler.config["Global"].get("sim_block_size", 64)
sections = [sim_block_size] * (len(query_feas) // sim_block_size)
if len(query_feas) % sim_block_size:
sections.append(len(query_feas) % sim_block_size)
fea_blocks = paddle.split(query_feas, num_or_sections=sections)
if query_query_id is not None:
query_id_blocks = paddle.split(
query_query_id, num_or_sections=sections)
image_id_blocks = paddle.split(query_img_id, num_or_sections=sections)
metric_key = None
    if evaler.eval_metric_func is None:
metric_dict = {metric_key: 0.}
else:
metric_dict = dict()
for block_idx, block_fea in enumerate(fea_blocks):
similarity_matrix = paddle.matmul(
block_fea, gallery_feas, transpose_y=True)
if query_query_id is not None:
query_id_block = query_id_blocks[block_idx]
query_id_mask = (query_id_block != gallery_unique_id.t())
image_id_block = image_id_blocks[block_idx]
image_id_mask = (image_id_block != gallery_img_id.t())
keep_mask = paddle.logical_or(query_id_mask, image_id_mask)
similarity_matrix = similarity_matrix * keep_mask.astype(
"float32")
else:
keep_mask = None
metric_tmp = evaler.eval_metric_func(similarity_matrix,
image_id_blocks[block_idx],
gallery_img_id, keep_mask)
for key in metric_tmp:
if key not in metric_dict:
metric_dict[key] = metric_tmp[key] * block_fea.shape[
0] / len(query_feas)
else:
metric_dict[key] += metric_tmp[key] * block_fea.shape[
0] / len(query_feas)
metric_info_list = []
for key in metric_dict:
if metric_key is None:
metric_key = key
metric_info_list.append("{}: {:.5f}".format(key, metric_dict[key]))
metric_msg = ", ".join(metric_info_list)
logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg))
return metric_dict[metric_key]
def cal_feature(evaler, name='gallery'):
all_feas = None
all_image_id = None
all_unique_id = None
has_unique_id = False
if name == 'gallery':
dataloader = evaler.gallery_dataloader
elif name == 'query':
dataloader = evaler.query_dataloader
else:
raise RuntimeError("Only support gallery or query dataset")
max_iter = len(dataloader) - 1 if platform.system() == "Windows" else len(
dataloader)
dataloader_tmp = dataloader if evaler.use_dali else dataloader()
for idx, batch in enumerate(dataloader_tmp): # load is very time-consuming
if idx >= max_iter:
break
if idx % evaler.config["Global"]["print_batch_step"] == 0:
logger.info(
f"{name} feature calculation process: [{idx}/{len(dataloader)}]"
)
if evaler.use_dali:
batch = [
paddle.to_tensor(batch[0]['data']),
paddle.to_tensor(batch[0]['label'])
]
batch = [paddle.to_tensor(x) for x in batch]
batch[1] = batch[1].reshape([-1, 1]).astype("int64")
if len(batch) == 3:
has_unique_id = True
batch[2] = batch[2].reshape([-1, 1]).astype("int64")
out = evaler.model(batch[0], batch[1])
batch_feas = out["features"]
# do norm
if evaler.config["Global"].get("feature_normalize", True):
feas_norm = paddle.sqrt(
paddle.sum(paddle.square(batch_feas), axis=1, keepdim=True))
batch_feas = paddle.divide(batch_feas, feas_norm)
if all_feas is None:
all_feas = batch_feas
if has_unique_id:
all_unique_id = batch[2]
all_image_id = batch[1]
else:
all_feas = paddle.concat([all_feas, batch_feas])
all_image_id = paddle.concat([all_image_id, batch[1]])
if has_unique_id:
all_unique_id = paddle.concat([all_unique_id, batch[2]])
if evaler.use_dali:
dataloader_tmp.reset()
if paddle.distributed.get_world_size() > 1:
feat_list = []
img_id_list = []
unique_id_list = []
paddle.distributed.all_gather(feat_list, all_feas)
paddle.distributed.all_gather(img_id_list, all_image_id)
all_feas = paddle.concat(feat_list, axis=0)
all_image_id = paddle.concat(img_id_list, axis=0)
if has_unique_id:
paddle.distributed.all_gather(unique_id_list, all_unique_id)
all_unique_id = paddle.concat(unique_id_list, axis=0)
logger.info("Build {} done, all feat shape: {}, begin to eval..".format(
name, all_feas.shape))
return all_feas, all_image_id, all_unique_id
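The manual normalization inside cal_feature (square, sum, sqrt, divide) is a plain L2 normalize along axis 1; a quick equivalence check against the built-in helper (shapes illustrative, not from this commit):

import paddle
import paddle.nn.functional as F

feas = paddle.rand([8, 512])
norm = paddle.sqrt(paddle.sum(paddle.square(feas), axis=1, keepdim=True))
manual = paddle.divide(feas, norm)
builtin = F.normalize(feas, p=2, axis=1)
print(bool(paddle.allclose(manual, builtin)))  # True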
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ppcls.engine.train.train import train_epoch
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function
import time
import paddle
from ppcls.engine.train.utils import update_loss, update_metric, log_info
def train_epoch(trainer, epoch_id, print_batch_step):
tic = time.time()
train_dataloader = trainer.train_dataloader if trainer.use_dali else trainer.train_dataloader(
)
for iter_id, batch in enumerate(train_dataloader):
if iter_id >= trainer.max_iter:
break
        if iter_id == 5:  # reset timers after warm-up iterations so averages stay clean
for key in trainer.time_info:
trainer.time_info[key].reset()
trainer.time_info["reader_cost"].update(time.time() - tic)
if trainer.use_dali:
batch = [
paddle.to_tensor(batch[0]['data']),
paddle.to_tensor(batch[0]['label'])
]
batch_size = batch[0].shape[0]
batch[1] = batch[1].reshape([-1, 1]).astype("int64")
trainer.global_step += 1
# image input
if trainer.amp:
with paddle.amp.auto_cast(custom_black_list={
"flatten_contiguous_range", "greater_than"
}):
out = forward(trainer, batch)
loss_dict = trainer.train_loss_func(out, batch[1])
else:
out = forward(trainer, batch)
# calc loss
if trainer.config["DataLoader"]["Train"]["dataset"].get(
"batch_transform_ops", None):
loss_dict = trainer.train_loss_func(out, batch[1:])
else:
loss_dict = trainer.train_loss_func(out, batch[1])
# step opt and lr
if trainer.amp:
scaled = trainer.scaler.scale(loss_dict["loss"])
scaled.backward()
trainer.scaler.minimize(trainer.optimizer, scaled)
else:
loss_dict["loss"].backward()
trainer.optimizer.step()
trainer.optimizer.clear_grad()
trainer.lr_sch.step()
# below code just for logging
# update metric_for_logger
update_metric(trainer, out, batch, batch_size)
# update_loss_for_logger
update_loss(trainer, loss_dict, batch_size)
trainer.time_info["batch_cost"].update(time.time() - tic)
if iter_id % print_batch_step == 0:
log_info(trainer, batch_size, epoch_id, iter_id)
tic = time.time()
def forward(trainer, batch):
if trainer.eval_mode == "classification":
return trainer.model(batch[0])
else:
return trainer.model(batch[0], batch[1])
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function
import datetime
from ppcls.utils import logger
from ppcls.utils.misc import AverageMeter
def update_metric(trainer, out, batch, batch_size):
# calc metric
if trainer.train_metric_func is not None:
metric_dict = trainer.train_metric_func(out, batch[-1])
for key in metric_dict:
if key not in trainer.output_info:
trainer.output_info[key] = AverageMeter(key, '7.5f')
trainer.output_info[key].update(metric_dict[key].numpy()[0],
batch_size)
def update_loss(trainer, loss_dict, batch_size):
# update_output_info
for key in loss_dict:
if key not in trainer.output_info:
trainer.output_info[key] = AverageMeter(key, '7.5f')
trainer.output_info[key].update(loss_dict[key].numpy()[0], batch_size)
def log_info(trainer, batch_size, epoch_id, iter_id):
lr_msg = "lr: {:.5f}".format(trainer.lr_sch.get_lr())
metric_msg = ", ".join([
"{}: {:.5f}".format(key, trainer.output_info[key].avg)
for key in trainer.output_info
])
time_msg = "s, ".join([
"{}: {:.5f}".format(key, trainer.time_info[key].avg)
for key in trainer.time_info
])
ips_msg = "ips: {:.5f} images/sec".format(
batch_size / trainer.time_info["batch_cost"].avg)
eta_sec = ((trainer.config["Global"]["epochs"] - epoch_id + 1
) * len(trainer.train_dataloader) - iter_id
) * trainer.time_info["batch_cost"].avg
eta_msg = "eta: {:s}".format(str(datetime.timedelta(seconds=int(eta_sec))))
logger.info("[Train][Epoch {}/{}][Iter: {}/{}]{}, {}, {}, {}, {}".format(
epoch_id, trainer.config["Global"]["epochs"], iter_id,
len(trainer.train_dataloader), lr_msg, metric_msg, time_msg, ips_msg,
eta_msg))
logger.scaler(
name="lr",
value=trainer.lr_sch.get_lr(),
step=trainer.global_step,
writer=trainer.vdl_writer)
for key in trainer.output_info:
logger.scaler(
name="train_{}".format(key),
value=trainer.output_info[key].avg,
step=trainer.global_step,
writer=trainer.vdl_writer)
...@@ -38,7 +38,7 @@ from ppcls.optimizer import build_optimizer ...@@ -38,7 +38,7 @@ from ppcls.optimizer import build_optimizer
from ppcls.optimizer import build_lr_scheduler from ppcls.optimizer import build_lr_scheduler
from ppcls.utils.misc import AverageMeter from ppcls.utils.misc import AverageMeter
from ppcls.utils import logger from ppcls.utils import logger, profiler
def create_feeds(image_shape, use_mix=None, dtype="float32"): def create_feeds(image_shape, use_mix=None, dtype="float32"):
...@@ -326,7 +326,8 @@ def run(dataloader, ...@@ -326,7 +326,8 @@ def run(dataloader,
mode='train', mode='train',
config=None, config=None,
vdl_writer=None, vdl_writer=None,
lr_scheduler=None): lr_scheduler=None,
profiler_options=None):
""" """
Feed data to the model and fetch the measures and loss Feed data to the model and fetch the measures and loss
...@@ -382,6 +383,8 @@ def run(dataloader, ...@@ -382,6 +383,8 @@ def run(dataloader,
metric_dict['reader_time'].update(time.time() - tic) metric_dict['reader_time'].update(time.time() - tic)
profiler.add_profiler_step(profiler_options)
if use_dali: if use_dali:
batch_size = batch[0]["data"].shape()[0] batch_size = batch[0]["data"].shape()[0]
feed_dict = batch[0] feed_dict = batch[0]
......
...@@ -43,6 +43,13 @@ def parse_args(): ...@@ -43,6 +43,13 @@ def parse_args():
type=str, type=str,
default='configs/ResNet/ResNet50.yaml', default='configs/ResNet/ResNet50.yaml',
help='config file path') help='config file path')
parser.add_argument(
'-p',
'--profiler_options',
type=str,
default=None,
help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
)
parser.add_argument( parser.add_argument(
'-o', '-o',
'--override', '--override',
...@@ -166,7 +173,7 @@ def main(args): ...@@ -166,7 +173,7 @@ def main(args):
# 1. train with train dataset # 1. train with train dataset
program.run(train_dataloader, exe, compiled_train_prog, train_feeds, program.run(train_dataloader, exe, compiled_train_prog, train_feeds,
train_fetchs, epoch_id, 'train', config, vdl_writer, train_fetchs, epoch_id, 'train', config, vdl_writer,
lr_scheduler) lr_scheduler, args.profiler_options)
# 2. evaluate with eval dataset # 2. evaluate with eval dataset
if global_config["eval_during_train"] and epoch_id % global_config[ if global_config["eval_during_train"] and epoch_id % global_config[
"eval_interval"] == 0: "eval_interval"] == 0:
......
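For context on the new flag wired through above: profiler.add_profiler_step(profiler_options) is called once per iteration and only activates inside the configured batch window. The option keys in the sketch below follow Paddle's usual profiler-helper convention and are assumptions, not taken from this diff:

from ppcls.utils import profiler

profiler_options = "batch_range=[10,20];profile_path=./model.profile"  # assumed keys
for iter_id in range(30):
    profiler.add_profiler_step(profiler_options)  # no-op outside the batch range
    # ... forward / backward / optimizer step ...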
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import os
import numpy as np
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
from nvidia.dali.plugin.paddle import DALIGenericIterator
import paddle
from paddle import fluid
class HybridTrainPipe(Pipeline):
def __init__(self,
file_root,
file_list,
batch_size,
resize_shorter,
crop,
min_area,
lower,
upper,
interp,
mean,
std,
device_id,
shard_id=0,
num_shards=1,
random_shuffle=True,
num_threads=4,
seed=42,
pad_output=False,
output_dtype=types.FLOAT):
super(HybridTrainPipe, self).__init__(
batch_size, num_threads, device_id, seed=seed)
self.input = ops.FileReader(
file_root=file_root,
file_list=file_list,
shard_id=shard_id,
num_shards=num_shards,
random_shuffle=random_shuffle)
# set internal nvJPEG buffers size to handle full-sized ImageNet images
# without additional reallocations
device_memory_padding = 211025920
host_memory_padding = 140544512
self.decode = ops.ImageDecoderRandomCrop(
device='mixed',
output_type=types.RGB,
device_memory_padding=device_memory_padding,
host_memory_padding=host_memory_padding,
random_aspect_ratio=[lower, upper],
random_area=[min_area, 1.0],
num_attempts=100)
self.res = ops.Resize(
device='gpu', resize_x=crop, resize_y=crop, interp_type=interp)
self.cmnp = ops.CropMirrorNormalize(
device="gpu",
output_dtype=output_dtype,
output_layout=types.NCHW,
crop=(crop, crop),
image_type=types.RGB,
mean=mean,
std=std,
pad_output=pad_output)
self.coin = ops.CoinFlip(probability=0.5)
self.to_int64 = ops.Cast(dtype=types.INT64, device="gpu")
def define_graph(self):
rng = self.coin()
jpegs, labels = self.input(name="Reader")
images = self.decode(jpegs)
images = self.res(images)
output = self.cmnp(images.gpu(), mirror=rng)
return [output, self.to_int64(labels.gpu())]
def __len__(self):
return self.epoch_size("Reader")
class HybridValPipe(Pipeline):
def __init__(self,
file_root,
file_list,
batch_size,
resize_shorter,
crop,
interp,
mean,
std,
device_id,
shard_id=0,
num_shards=1,
random_shuffle=False,
num_threads=4,
seed=42,
pad_output=False,
output_dtype=types.FLOAT):
super(HybridValPipe, self).__init__(
batch_size, num_threads, device_id, seed=seed)
self.input = ops.FileReader(
file_root=file_root,
file_list=file_list,
shard_id=shard_id,
num_shards=num_shards,
random_shuffle=random_shuffle)
self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
self.res = ops.Resize(
device="gpu", resize_shorter=resize_shorter, interp_type=interp)
self.cmnp = ops.CropMirrorNormalize(
device="gpu",
output_dtype=output_dtype,
output_layout=types.NCHW,
crop=(crop, crop),
image_type=types.RGB,
mean=mean,
std=std,
pad_output=pad_output)
self.to_int64 = ops.Cast(dtype=types.INT64, device="gpu")
def define_graph(self):
jpegs, labels = self.input(name="Reader")
images = self.decode(jpegs)
images = self.res(images)
output = self.cmnp(images)
return [output, self.to_int64(labels.gpu())]
def __len__(self):
return self.epoch_size("Reader")
def build(config, mode='train'):
env = os.environ
assert config.get('use_gpu',
True) == True, "gpu training is required for DALI"
assert not config.get(
'use_aa'), "auto augment is not supported by DALI reader"
assert float(env.get('FLAGS_fraction_of_gpu_memory_to_use', 0.92)) < 0.9, \
"Please leave enough GPU memory for DALI workspace, e.g., by setting" \
" `export FLAGS_fraction_of_gpu_memory_to_use=0.8`"
dataset_config = config[mode.upper()]
    # use the local CUDA device count unless launched in distributed mode
    gpu_num = paddle.fluid.core.get_cuda_device_count() if (
        'PADDLE_TRAINERS_NUM' not in env and
        'PADDLE_TRAINER_ID' not in env
    ) else int(env.get('PADDLE_TRAINERS_NUM', 0))
batch_size = dataset_config.batch_size
assert batch_size % gpu_num == 0, \
"batch size must be multiple of number of devices"
batch_size = batch_size // gpu_num
file_root = dataset_config.data_dir
file_list = dataset_config.file_list
interp = 1 # settings.interpolation or 1 # default to linear
interp_map = {
0: types.INTERP_NN, # cv2.INTER_NEAREST
1: types.INTERP_LINEAR, # cv2.INTER_LINEAR
2: types.INTERP_CUBIC, # cv2.INTER_CUBIC
4: types.INTERP_LANCZOS3, # XXX use LANCZOS3 for cv2.INTER_LANCZOS4
}
output_dtype = (types.FLOAT16 if 'AMP' in config and
config.AMP.get("use_pure_fp16", False)
else types.FLOAT)
assert interp in interp_map, "interpolation method not supported by DALI"
interp = interp_map[interp]
pad_output = False
image_shape = config.get("image_shape", None)
if image_shape and image_shape[0] == 4:
pad_output = True
transforms = {
k: v
for d in dataset_config["transforms"] for k, v in d.items()
}
scale = transforms["NormalizeImage"].get("scale", 1.0 / 255)
if isinstance(scale, str):
scale = eval(scale)
mean = transforms["NormalizeImage"].get("mean", [0.485, 0.456, 0.406])
std = transforms["NormalizeImage"].get("std", [0.229, 0.224, 0.225])
mean = [v / scale for v in mean]
std = [v / scale for v in std]
if mode == "train":
resize_shorter = 256
crop = transforms["RandCropImage"]["size"]
scale = transforms["RandCropImage"].get("scale", [0.08, 1.])
ratio = transforms["RandCropImage"].get("ratio", [3.0 / 4, 4.0 / 3])
min_area = scale[0]
lower = ratio[0]
upper = ratio[1]
if 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env:
shard_id = int(env['PADDLE_TRAINER_ID'])
num_shards = int(env['PADDLE_TRAINERS_NUM'])
device_id = int(env['FLAGS_selected_gpus'])
pipe = HybridTrainPipe(
file_root,
file_list,
batch_size,
resize_shorter,
crop,
min_area,
lower,
upper,
interp,
mean,
std,
device_id,
shard_id,
num_shards,
seed=42 + shard_id,
pad_output=pad_output,
output_dtype=output_dtype)
pipe.build()
pipelines = [pipe]
sample_per_shard = len(pipe) // num_shards
else:
pipelines = []
places = fluid.framework.cuda_places()
num_shards = len(places)
for idx, p in enumerate(places):
place = fluid.core.Place()
place.set_place(p)
device_id = place.gpu_device_id()
pipe = HybridTrainPipe(
file_root,
file_list,
batch_size,
resize_shorter,
crop,
min_area,
lower,
upper,
interp,
mean,
std,
device_id,
idx,
num_shards,
seed=42 + idx,
pad_output=pad_output,
output_dtype=output_dtype)
pipe.build()
pipelines.append(pipe)
sample_per_shard = len(pipelines[0])
return DALIGenericIterator(
pipelines, ['feed_image', 'feed_label'], size=sample_per_shard)
else:
resize_shorter = transforms["ResizeImage"].get("resize_short", 256)
crop = transforms["CropImage"]["size"]
p = fluid.framework.cuda_places()[0]
place = fluid.core.Place()
place.set_place(p)
device_id = place.gpu_device_id()
pipe = HybridValPipe(
file_root,
file_list,
batch_size,
resize_shorter,
crop,
interp,
mean,
std,
device_id=device_id,
pad_output=pad_output,
output_dtype=output_dtype)
pipe.build()
return DALIGenericIterator(
pipe, ['feed_image', 'feed_label'],
size=len(pipe),
dynamic_shape=True,
fill_last_batch=True,
last_batch_padded=True)
def train(config):
return build(config, 'train')
def val(config):
return build(config, 'valid')
def _to_Tensor(lod_tensor, dtype):
data_tensor = fluid.layers.create_tensor(dtype=dtype)
data = np.array(lod_tensor).astype(dtype)
fluid.layers.assign(data, data_tensor)
return data_tensor
def normalize(feeds, config):
image, label = feeds['image'], feeds['label']
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
image = fluid.layers.cast(image, 'float32')
    constant = fluid.layers.fill_constant(
        shape=[1], value=255.0, dtype='float32')
    image = fluid.layers.elementwise_div(image, constant)
mean = fluid.layers.create_tensor(dtype="float32")
fluid.layers.assign(input=img_mean.astype("float32"), output=mean)
std = fluid.layers.create_tensor(dtype="float32")
fluid.layers.assign(input=img_std.astype("float32"), output=std)
image = fluid.layers.elementwise_sub(image, mean)
image = fluid.layers.elementwise_div(image, std)
image.stop_gradient = True
feeds['image'] = image
return feeds
def mix(feeds, config, is_train=True):
env = os.environ
    # use the local CUDA device count unless launched in distributed mode
    gpu_num = paddle.fluid.core.get_cuda_device_count() if (
        'PADDLE_TRAINERS_NUM' not in env and
        'PADDLE_TRAINER_ID' not in env
    ) else int(env.get('PADDLE_TRAINERS_NUM', 0))
batch_size = config.TRAIN.batch_size // gpu_num
images = feeds['image']
label = feeds['label']
# TODO: hard code here, should be fixed!
alpha = 0.2
idx = _to_Tensor(np.random.permutation(batch_size), 'int32')
lam = np.random.beta(alpha, alpha)
images = lam * images + (1 - lam) * paddle.fluid.layers.gather(images, idx)
feed = {
'image': images,
'feed_y_a': label,
'feed_y_b': paddle.fluid.layers.gather(label, idx),
'feed_lam': _to_Tensor([lam] * batch_size, 'float32')
}
return feed if is_train else feeds
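The feed produced by mix() (feed_y_a, feed_y_b, feed_lam) pairs with the standard mixup objective: a lam-weighted cross entropy against both original labels. An illustrative pairing, not code from this commit:

import paddle.nn.functional as F

def mixup_loss(logits, y_a, y_b, lam):
    # lam-weighted cross entropy against both sets of labels
    return lam * F.cross_entropy(logits, y_a) + \
        (1.0 - lam) * F.cross_entropy(logits, y_b)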
#!/usr/bin/env bash
export CUDA_VISIBLE_DEVICES="0,1,2,3"
export FLAGS_fraction_of_gpu_memory_to_use=0.80
python3.7 -m paddle.distributed.launch \
--gpus="0,1,2,3" \
tools/static/train.py \
-c ./configs/ResNet/ResNet50.yaml \
-o print_interval=10 \
-o use_dali=True
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import errno
import os
import re
import shutil
import tempfile
import paddle
from ppcls.utils import logger
__all__ = ['init_model', 'save_model']
def _mkdir_if_not_exist(path):
"""
mkdir if not exists, ignore the exception when multiprocess mkdir together
"""
if not os.path.exists(path):
try:
os.makedirs(path)
except OSError as e:
if e.errno == errno.EEXIST and os.path.isdir(path):
logger.warning(
'be happy if some process has already created {}'.format(
path))
else:
raise OSError('Failed to mkdir {}'.format(path))
def _load_state(path):
if os.path.exists(path + '.pdopt'):
# XXX another hack to ignore the optimizer state
tmp = tempfile.mkdtemp()
dst = os.path.join(tmp, os.path.basename(os.path.normpath(path)))
shutil.copy(path + '.pdparams', dst + '.pdparams')
state = paddle.static.load_program_state(dst)
shutil.rmtree(tmp)
else:
state = paddle.static.load_program_state(path)
return state
def load_params(exe, prog, path, ignore_params=None):
"""
Load model from the given path.
Args:
exe (fluid.Executor): The fluid.Executor object.
prog (fluid.Program): load weight to which Program object.
        path (string): URL string or local model path.
ignore_params (list): ignore variable to load when finetuning.
It can be specified by finetune_exclude_pretrained_params
and the usage can refer to the document
docs/advanced_tutorials/TRANSFER_LEARNING.md
"""
if not (os.path.isdir(path) or os.path.exists(path + '.pdparams')):
raise ValueError("Model pretrain path {} does not "
"exists.".format(path))
logger.info(
logger.coloring('Loading parameters from {}...'.format(path),
'HEADER'))
ignore_set = set()
state = _load_state(path)
# ignore the parameter which mismatch the shape
# between the model and pretrain weight.
all_var_shape = {}
for block in prog.blocks:
for param in block.all_parameters():
all_var_shape[param.name] = param.shape
ignore_set.update([
name for name, shape in all_var_shape.items()
if name in state and shape != state[name].shape
])
if ignore_params:
all_var_names = [var.name for var in prog.list_vars()]
ignore_list = filter(
lambda var: any([re.match(name, var) for name in ignore_params]),
all_var_names)
ignore_set.update(list(ignore_list))
if len(ignore_set) > 0:
for k in ignore_set:
if k in state:
logger.warning(
'variable {} is already excluded automatically'.format(k))
del state[k]
paddle.static.set_program_state(prog, state)
def init_model(config, program, exe):
"""
load model from checkpoint or pretrained_model
"""
checkpoints = config.get('checkpoints')
if checkpoints:
paddle.static.load(program, checkpoints, exe)
logger.info(
logger.coloring("Finish initing model from {}".format(checkpoints),
"HEADER"))
return
pretrained_model = config.get('pretrained_model')
if pretrained_model:
if not isinstance(pretrained_model, list):
pretrained_model = [pretrained_model]
for pretrain in pretrained_model:
load_params(exe, program, pretrain)
logger.info(
logger.coloring("Finish initing model from {}".format(
pretrained_model), "HEADER"))
def save_model(program, model_path, epoch_id, prefix='ppcls'):
"""
save model to the target path
"""
model_path = os.path.join(model_path, str(epoch_id))
_mkdir_if_not_exist(model_path)
model_prefix = os.path.join(model_path, prefix)
paddle.static.save(program, model_prefix)
logger.info(
logger.coloring("Already save model in {}".format(model_path),
"HEADER"))
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import sys
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))
from sys import version_info
import paddle
from paddle.distributed import fleet
from ppcls.data import Reader
from ppcls.utils.config import get_config
from ppcls.utils import logger
from tools.static import program
from save_load import init_model, save_model
def parse_args():
parser = argparse.ArgumentParser("PaddleClas train script")
parser.add_argument(
'-c',
'--config',
type=str,
default='configs/ResNet/ResNet50.yaml',
help='config file path')
parser.add_argument(
'--vdl_dir',
type=str,
default=None,
help='VisualDL logging directory for image.')
parser.add_argument(
'-p',
'--profiler_options',
type=str,
default=None,
help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
)
parser.add_argument(
'-o',
'--override',
action='append',
default=[],
help='config options to be overridden')
args = parser.parse_args()
return args
def main(args):
config = get_config(args.config, overrides=args.override, show=True)
if config.get("is_distributed", True):
fleet.init(is_collective=True)
# assign the place
use_gpu = config.get("use_gpu", True)
# amp related config
if 'AMP' in config:
AMP_RELATED_FLAGS_SETTING = {
'FLAGS_cudnn_exhaustive_search': 1,
'FLAGS_conv_workspace_size_limit': 1500,
'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
'FLAGS_max_inplace_grad_add': 8,
}
os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1'
paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
use_xpu = config.get("use_xpu", False)
    assert (
        use_gpu and use_xpu
    ) is not True, "gpu and xpu cannot both be true at the same time in static mode!"
if use_gpu:
place = paddle.set_device('gpu')
elif use_xpu:
place = paddle.set_device('xpu')
else:
place = paddle.set_device('cpu')
# startup_prog is used to do some parameter init work,
# and train prog is used to hold the network
startup_prog = paddle.static.Program()
train_prog = paddle.static.Program()
best_top1_acc = 0.0 # best top1 acc record
train_fetchs, lr_scheduler, train_feeds, optimizer = program.build(
config,
train_prog,
startup_prog,
is_train=True,
is_distributed=config.get("is_distributed", True))
if config.validate:
valid_prog = paddle.static.Program()
valid_fetchs, _, valid_feeds, _ = program.build(
config,
valid_prog,
startup_prog,
is_train=False,
is_distributed=config.get("is_distributed", True))
# clone to prune some content which is irrelevant in valid_prog
valid_prog = valid_prog.clone(for_test=True)
# create the "Executor" with the statement of which place
exe = paddle.static.Executor(place)
# Parameter initialization
exe.run(startup_prog)
# load pretrained models or checkpoints
init_model(config, train_prog, exe)
if 'AMP' in config and config.AMP.get("use_pure_fp16", False):
optimizer.amp_init(
place,
scope=paddle.static.global_scope(),
test_program=valid_prog if config.validate else None)
if not config.get("is_distributed", True):
compiled_train_prog = program.compile(
config, train_prog, loss_name=train_fetchs["loss"][0].name)
else:
compiled_train_prog = train_prog
if not config.get('use_dali', False):
train_dataloader = Reader(config, 'train', places=place)()
if config.validate and paddle.distributed.get_rank() == 0:
valid_dataloader = Reader(config, 'valid', places=place)()
compiled_valid_prog = program.compile(config, valid_prog)
else:
        assert use_gpu is True, "DALI only supports gpu, please set use_gpu to True!"
import dali
train_dataloader = dali.train(config)
if config.validate and paddle.distributed.get_rank() == 0:
valid_dataloader = dali.val(config)
compiled_valid_prog = program.compile(config, valid_prog)
vdl_writer = None
if args.vdl_dir:
if version_info.major == 2:
logger.info(
"visualdl is just supported for python3, so it is disabled in python2..."
)
else:
from visualdl import LogWriter
vdl_writer = LogWriter(args.vdl_dir)
for epoch_id in range(config.epochs):
# 1. train with train dataset
program.run(train_dataloader, exe, compiled_train_prog, train_feeds,
train_fetchs, epoch_id, 'train', config, vdl_writer,
lr_scheduler, args.profiler_options)
if paddle.distributed.get_rank() == 0:
            # 2. validate with the validation dataset
if config.validate and epoch_id % config.valid_interval == 0:
top1_acc = program.run(valid_dataloader, exe,
compiled_valid_prog, valid_feeds,
valid_fetchs, epoch_id, 'valid', config)
if top1_acc > best_top1_acc:
best_top1_acc = top1_acc
message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
best_top1_acc, epoch_id)
logger.info("{:s}".format(logger.coloring(message, "RED")))
if epoch_id % config.save_interval == 0:
model_path = os.path.join(config.model_save_dir,
config.ARCHITECTURE["name"])
save_model(train_prog, model_path, "best_model")
# 3. save the persistable model
if epoch_id % config.save_interval == 0:
model_path = os.path.join(config.model_save_dir,
config.ARCHITECTURE["name"])
save_model(train_prog, model_path, epoch_id)
if __name__ == '__main__':
paddle.enable_static()
args = parse_args()
main(args)
...@@ -21,11 +21,11 @@ __dir__ = os.path.dirname(os.path.abspath(__file__)) ...@@ -21,11 +21,11 @@ __dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
from ppcls.utils import config from ppcls.utils import config
from ppcls.engine.trainer import Trainer from ppcls.engine.engine import Engine
if __name__ == "__main__": if __name__ == "__main__":
args = config.parse_args() args = config.parse_args()
config = config.get_config( config = config.get_config(
args.config, overrides=args.override, show=False) args.config, overrides=args.override, show=False)
trainer = Trainer(config, mode="eval") engine = Engine(config, mode="eval")
trainer.eval() engine.eval()
...@@ -21,12 +21,11 @@ __dir__ = os.path.dirname(os.path.abspath(__file__)) ...@@ -21,12 +21,11 @@ __dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
from ppcls.utils import config from ppcls.utils import config
from ppcls.engine.trainer import Trainer from ppcls.engine.engine import Engine
if __name__ == "__main__": if __name__ == "__main__":
args = config.parse_args() args = config.parse_args()
config = config.get_config( config = config.get_config(
args.config, overrides=args.override, show=False) args.config, overrides=args.override, show=False)
trainer = Trainer(config, mode="infer") engine = Engine(config, mode="infer")
engine.infer()
trainer.infer()
...@@ -21,11 +21,11 @@ __dir__ = os.path.dirname(os.path.abspath(__file__)) ...@@ -21,11 +21,11 @@ __dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
from ppcls.utils import config from ppcls.utils import config
from ppcls.engine.trainer import Trainer from ppcls.engine.engine import Engine
if __name__ == "__main__": if __name__ == "__main__":
args = config.parse_args() args = config.parse_args()
config = config.get_config( config = config.get_config(
args.config, overrides=args.override, show=False) args.config, overrides=args.override, show=False)
trainer = Trainer(config, mode="train") engine = Engine(config, mode="train")
trainer.train() engine.train()