Commit d9056bcd authored by: D dongshuilong

fix benchmark

---
name: Issue report
about: PaddleClas issue report
title: ''
labels: ''
assignees: ''
---
Welcome to PaddleClas, and thank you for contributing by reporting issues!
When opening an issue, please provide the following information so that we can locate and resolve your problem quickly and effectively:
1. PaddleClas and PaddlePaddle versions: please provide the version numbers or branches you use, e.g. PaddleClas release/2.2 and PaddlePaddle 2.1.0
2. Version numbers of other products involved: if you use other products together with PaddleClas, such as PaddleServing or PaddleInference, please provide their version numbers
3. Training environment information:
a. Operating system, e.g. Linux/Windows/MacOS
b. Python version, e.g. Python3.6/7/8
c. CUDA/cuDNN version, e.g. CUDA10.2/cuDNN 7.6.5
4. Complete code (the parts changed relative to the repo), detailed error messages and related logs
......@@ -8,6 +8,7 @@
**Recent updates**
+- 2021.07.08, 07.27: added 26 [FAQ](docs/zh_CN/faq_series/faq_2021_s2.md) entries
- 2021.06.29: added the Swin-Transformer series models, reaching up to 87.2% Top-1 accuracy on the ImageNet-1k dataset; training, prediction and evaluation as well as whl-package deployment are supported, and the pretrained models can be downloaded [here](docs/zh_CN/models/models_intro.md).
- 2021.06.22/23/24: the official PaddleClas R&D team gave a three-day live course with in-depth technical explanations. Course replay: [https://aistudio.baidu.com/aistudio/course/introduce/24519](https://aistudio.baidu.com/aistudio/course/introduce/24519)
- 2021.06.16: PaddleClas v2.2 released, integrating components such as metric learning and vector search. Added 4 image recognition applications: product recognition, cartoon character recognition, vehicle recognition and logo recognition. Added 30 pretrained models of the LeViT, Twins, TNT, DLA, HarDNet and RedNet series.
......@@ -74,7 +75,8 @@ The Res2Net200_vd pretrained model reaches up to 85.1% Top-1 accuracy.
- [Knowledge distillation](./docs/zh_CN/advanced_tutorials/distillation/distillation.md)
- [Model quantization](./docs/zh_CN/extension/paddle_quantization.md)
- [Data augmentation](./docs/zh_CN/advanced_tutorials/image_augmentation/ImageAugment.md)
-- FAQ (updates paused)
+- FAQ
+  - [Image recognition task FAQ](docs/zh_CN/faq_series/faq_2021_s2.md)
  - [Image classification task FAQ](docs/zh_CN/faq.md)
- [License](#许可证书)
- [Contributing](#贡献代码)
......
Global:
rec_inference_model_dir: "./models/cartoon_rec_ResNet50_iCartoon_v1.0_infer/"
batch_size: 1
batch_size: 32
use_gpu: True
enable_mkldnn: False
cpu_num_threads: 100
enable_mkldnn: True
cpu_num_threads: 10
enable_benchmark: True
use_fp16: False
ir_optim: True
......
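These benchmark settings do not have to be edited per file; assuming the deploy scripts accept the same `-o`/`--override` flag as the training tools (an assumption, not shown in this diff), they can be overridden at launch:

```shell
# Hypothetical invocation: -o overrides Global.* keys at run time,
# matching the CPU benchmark settings introduced above.
python3.7 python/predict_rec.py \
    -c configs/inference_cartoon.yaml \
    -o Global.enable_mkldnn=True -o Global.cpu_num_threads=10 -o Global.batch_size=32
```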
Global:
rec_inference_model_dir: "./models/logo_rec_ResNet50_Logo3K_v1.0_infer/"
batch_size: 1
batch_size: 32
use_gpu: True
enable_mkldnn: False
cpu_num_threads: 100
enable_mkldnn: True
cpu_num_threads: 10
enable_benchmark: True
use_fp16: False
ir_optim: True
......
Global:
rec_inference_model_dir: "./models/product_ResNet50_vd_aliproduct_v1.0_infer"
batch_size: 1
batch_size: 32
use_gpu: True
enable_mkldnn: False
cpu_num_threads: 100
enable_mkldnn: True
cpu_num_threads: 10
enable_benchmark: True
use_fp16: False
ir_optim: True
......
Global:
rec_inference_model_dir: "./models/vehicle_cls_ResNet50_CompCars_v1.0_infer/"
batch_size: 1
batch_size: 32
use_gpu: True
enable_mkldnn: False
cpu_num_threads: 100
enable_mkldnn: True
cpu_num_threads: 10
enable_benchmark: True
use_fp16: False
ir_optim: True
......
......@@ -12,8 +12,8 @@ Global:
  - foreground
  use_gpu: True
-  enable_mkldnn: False
-  cpu_num_threads: 100
+  enable_mkldnn: True
+  cpu_num_threads: 10
  enable_benchmark: True
  use_fp16: False
  ir_optim: True
......
......@@ -3,8 +3,8 @@ Global:
  inference_model_dir: "./models"
  batch_size: 1
  use_gpu: True
-  enable_mkldnn: False
-  cpu_num_threads: 100
+  enable_mkldnn: True
+  cpu_num_threads: 10
  enable_benchmark: True
  use_fp16: False
  ir_optim: True
......@@ -22,6 +22,7 @@ PreProcess:
        mean: [0.485, 0.456, 0.406]
        std: [0.229, 0.224, 0.225]
        order: ''
+        channel_num: 3
    - ToCHWImage:
PostProcess:
  main_indicator: Topk
......@@ -29,4 +30,4 @@ PostProcess:
    topk: 5
    class_id_map_file: "../ppcls/utils/imagenet1k_label_list.txt"
  SavePreLabel:
-    save_dir: ./pre_label/
\ No newline at end of file
+    save_dir: ./pre_label/
Global:
  infer_imgs: "./images/ILSVRC2012_val_00000010.jpeg"
  inference_model_dir: "./models"
  batch_size: 1
  use_gpu: True
  enable_mkldnn: True
  cpu_num_threads: 10
  enable_benchmark: True
  use_fp16: False
  ir_optim: True
  use_tensorrt: False
  gpu_mem: 8000
  enable_profile: False
PreProcess:
  transform_ops:
    - ResizeImage:
        resize_short: 256
    - CropImage:
        size: 224
    - NormalizeImage:
        scale: 0.00392157
        mean: [0.485, 0.456, 0.406]
        std: [0.229, 0.224, 0.225]
        order: ''
        channel_num: 4
    - ToCHWImage:
PostProcess:
  main_indicator: Topk
  Topk:
    topk: 5
    class_id_map_file: "../ppcls/utils/imagenet1k_label_list.txt"
  SavePreLabel:
    save_dir: ./pre_label/
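The `channel_num: 4` entry makes `NormalizeImage` emit a 4-channel tensor (the extra channel is padding so the input matches 4-channel inference kernels); a rough numpy sketch of the idea, not the library's exact code:

```python
import numpy as np

def pad_to_four_channels(img_chw: np.ndarray) -> np.ndarray:
    # Append an all-zero fourth channel to a normalized 3-channel CHW image.
    c, h, w = img_chw.shape
    assert c == 3, "expects an RGB image in CHW order"
    pad = np.zeros((1, h, w), dtype=img_chw.dtype)
    return np.concatenate([img_chw, pad], axis=0)  # shape: (4, h, w)
```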
......@@ -10,8 +10,8 @@ Global:
  # inference engine config
  use_gpu: True
-  enable_mkldnn: False
-  cpu_num_threads: 100
+  enable_mkldnn: True
+  cpu_num_threads: 10
  enable_benchmark: True
  use_fp16: False
  ir_optim: True
......
......@@ -13,8 +13,8 @@ Global:
  # inference engine config
  use_gpu: True
-  enable_mkldnn: False
-  cpu_num_threads: 100
+  enable_mkldnn: True
+  cpu_num_threads: 10
  enable_benchmark: True
  use_fp16: False
  ir_optim: True
......
......@@ -13,8 +13,8 @@ Global:
  # inference engine config
  use_gpu: True
-  enable_mkldnn: False
-  cpu_num_threads: 100
+  enable_mkldnn: True
+  cpu_num_threads: 10
  enable_benchmark: True
  use_fp16: False
  ir_optim: True
......
......@@ -10,8 +10,8 @@ Global:
  # inference engine config
  use_gpu: False
-  enable_mkldnn: False
-  cpu_num_threads: 100
+  enable_mkldnn: True
+  cpu_num_threads: 10
  enable_benchmark: True
  use_fp16: False
  ir_optim: True
......
......@@ -13,8 +13,8 @@ Global:
  # inference engine config
  use_gpu: True
-  enable_mkldnn: False
-  cpu_num_threads: 100
+  enable_mkldnn: True
+  cpu_num_threads: 10
  enable_benchmark: True
  use_fp16: False
  ir_optim: True
......
......@@ -33,8 +33,10 @@ def get_default_confg():
"enable_benchmark": False
},
'PostProcess': {
'name': 'Topk',
'topk': 5,
'class_id_map_file': './utils/imagenet1k_label_list.txt'
'main_indicator': 'Topk',
'Topk': {
'topk': 5,
'class_id_map_file': './utils/imagenet1k_label_list.txt'
}
}
}
\ No newline at end of file
}
......@@ -15,7 +15,7 @@ hubserving/clas/
### 1. Prepare the environment
```shell
# Install PaddleHub version 2.x
-pip3 install paddlehub==2.0.0b1 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip3 install paddlehub==2.1.0 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple
```
### 2. Download the inference model
......@@ -128,8 +128,12 @@ python hubserving/test_hubserving.py server_url image_path
`http://[ip_address]:[port]/predict/[module_name]`
- **image_path**: path of the test image(s); can be a single image path or an image directory path.
- **batch_size**: [**optional**] predict in batches of size `batch_size`, default `1`.
+- **resize_short**: [**optional**] during preprocessing, resize by the short side, default `256`.
+- **crop_size**: [**optional**] during preprocessing, center-crop size, default `224`.
+- **normalize**: [**optional**] during preprocessing, whether to `normalize`, default `True`.
+- **to_chw**: [**optional**] during preprocessing, whether to transpose to `CHW` order, default `True`.

-**Note**: if you use Transformer-series models such as `DeiT_***_384` and `ViT_***_384`, pay attention to the model input size. You need to specify `--resize_short=384 --resize=384`.
+**Note**: if you use Transformer-series models such as `DeiT_***_384` and `ViT_***_384`, pay attention to the model input size; you need to specify `--resize_short=384 --crop_size=384`.

Example:
......
......@@ -15,7 +15,7 @@ hubserving/clas/
### 1. Prepare the environment
```shell
# Install PaddleHub version 2.x
-pip3 install paddlehub==2.0.0b1 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip3 install paddlehub==2.1.0 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple
```
### 2. Download inference model
......@@ -126,9 +126,13 @@ Two required parameters need to be passed to the script:
`http://[ip_address]:[port]/predict/[module_name]`
- **image_path**: Test image path; can be a single image path or an image directory path
- **batch_size**: [**Optional**] batch size. Defaults to `1`.
+- **resize_short**: [**Optional**] In preprocessing, resize according to the short side. Defaults to `256`
+- **crop_size**: [**Optional**] In preprocessing, center crop size. Defaults to `224`
+- **normalize**: [**Optional**] In preprocessing, whether to do `normalize`. Defaults to `True`
+- **to_chw**: [**Optional**] In preprocessing, whether to transpose to `CHW`. Defaults to `True`

**Notice**:
-If you want to use `Transformer series models`, such as `DeiT_***_384`, `ViT_***_384`, etc., please pay attention to the input size of the model, and you need to set `--resize_short=384`, `--resize=384`.
+If you want to use `Transformer series models`, such as `DeiT_***_384`, `ViT_***_384`, etc., please pay attention to the input size of the model, and you need to set `--resize_short=384`, `--crop_size=384`.
**Eg.**
```shell
......
......@@ -32,30 +32,59 @@ from utils import config
from utils.encode_decode import np_to_b64
from python.preprocess import create_operators
-preprocess_config = [{
-    'ResizeImage': {
-        'resize_short': 256
-    }
-}, {
-    'CropImage': {
-        'size': 224
-    }
-}, {
-    'NormalizeImage': {
-        'scale': 0.00392157,
-        'mean': [0.485, 0.456, 0.406],
-        'std': [0.229, 0.224, 0.225],
-        'order': ''
-    }
-}, {
-    'ToCHWImage': None
-}]
+def get_args():
+    def str2bool(v):
+        return v.lower() in ("true", "t", "1")
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--server_url", type=str)
+    parser.add_argument("--image_file", type=str)
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--resize_short", type=int, default=256)
+    parser.add_argument("--crop_size", type=int, default=224)
+    parser.add_argument("--normalize", type=str2bool, default=True)
+    parser.add_argument("--to_chw", type=str2bool, default=True)
+    return parser.parse_args()
+
+
+class PreprocessConfig(object):
+    def __init__(self,
+                 resize_short=256,
+                 crop_size=224,
+                 normalize=True,
+                 to_chw=True):
+        self.config = [{
+            'ResizeImage': {
+                'resize_short': resize_short
+            }
+        }, {
+            'CropImage': {
+                'size': crop_size
+            }
+        }]
+        if normalize:
+            self.config.append({
+                'NormalizeImage': {
+                    'scale': 0.00392157,
+                    'mean': [0.485, 0.456, 0.406],
+                    'std': [0.229, 0.224, 0.225],
+                    'order': ''
+                }
+            })
+        if to_chw:
+            self.config.append({'ToCHWImage': None})
+
+    def __call__(self):
+        return self.config
def main(args):
    image_path_list = get_image_list(args.image_file)
    headers = {"Content-type": "application/json"}

-    preprocess_ops = create_operators(preprocess_config)
+    preprocess_ops = create_operators(
+        PreprocessConfig(args.resize_short, args.crop_size, args.normalize,
+                         args.to_chw)())

    cnt = 0
    predict_time = 0
......@@ -113,14 +142,10 @@ def main(args):
            for number, result_list in enumerate(preds):
                all_score += result_list["scores"][0]
-                result_str = ""
-                for i in range(len(result_list["class_ids"])):
-                    result_str += "{}: {:.2f}\t".format(
-                        result_list["class_ids"][i],
-                        result_list["scores"][i])
+                pred_str = ", ".join(
+                    [f"{k}: {result_list[k]}" for k in result_list])
                logger.info(
-                    f"File:{img_name_list[number]}, The result(s): {result_str}"
+                    f"File:{img_name_list[number]}, The result(s): {pred_str}"
                )
finally:
......@@ -136,10 +161,5 @@ def main(args):
if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--server_url", type=str)
-    parser.add_argument("--image_file", type=str)
-    parser.add_argument("--batch_size", type=int, default=1)
-    args = parser.parse_args()
+    args = get_args()
    main(args)
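With the new arguments, the client can now match its preprocessing to the served model; a minimal sketch (server address, module name and image path are placeholders):

```shell
# Placeholder URL/paths; resize_short/crop_size follow the 384-input note above.
python hubserving/test_hubserving.py \
    --server_url http://127.0.0.1:8866/predict/clas_system \
    --image_file ./images/ \
    --batch_size 8 --resize_short 384 --crop_size 384
```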
......@@ -42,7 +42,7 @@ class ClsPredictor(Predictor):
        self.postprocess = build_postprocess(config["PostProcess"])

        # for whole_chain project to test each repo of paddle
-        self.benchmark = config.get("benchmark", False)
+        self.benchmark = config["Global"].get("benchmark", False)
        if self.benchmark:
            import auto_log
            import os
......@@ -88,6 +88,10 @@ class ClsPredictor(Predictor):
        batch_output = output_tensor.copy_to_cpu()
        if self.benchmark:
            self.auto_log.times.stamp()
+        if self.postprocess is not None:
+            batch_output = self.postprocess(batch_output)
+        if self.benchmark:
+            self.auto_log.times.end(stamp=True)
        return batch_output
......@@ -95,14 +99,38 @@ def main(config):
    cls_predictor = ClsPredictor(config)
    image_list = get_image_list(config["Global"]["infer_imgs"])

-    assert config["Global"]["batch_size"] == 1
-    for idx, image_file in enumerate(image_list):
-        img = cv2.imread(image_file)[:, :, ::-1]
-        output = cls_predictor.predict(img)
-        output = cls_predictor.postprocess(output, [image_file])
-        if cls_predictor.benchmark:
-            cls_predictor.auto_log.times.end(stamp=True)
-        print(output)
+    batch_imgs = []
+    batch_names = []
+    cnt = 0
+    for idx, img_path in enumerate(image_list):
+        img = cv2.imread(img_path)
+        if img is None:
+            logger.warning(
+                "Image file failed to read and has been skipped. The path: {}".
+                format(img_path))
+        else:
+            img = img[:, :, ::-1]
+            batch_imgs.append(img)
+            img_name = os.path.basename(img_path)
+            batch_names.append(img_name)
+            cnt += 1
+
+        if cnt % config["Global"]["batch_size"] == 0 or (idx + 1
+                                                         ) == len(image_list):
+            if len(batch_imgs) == 0:
+                continue
+            batch_results = cls_predictor.predict(batch_imgs)
+            for number, result_dict in enumerate(batch_results):
+                filename = batch_names[number]
+                clas_ids = result_dict["class_ids"]
+                scores_str = "[{}]".format(", ".join("{:.2f}".format(
+                    r) for r in result_dict["scores"]))
+                label_names = result_dict["label_names"]
+                print("{}:\tclass id(s): {}, score(s): {}, label_name(s): {}".
+                      format(filename, clas_ids, scores_str, label_names))
+            batch_imgs = []
+            batch_names = []
+
+    cls_predictor.auto_log.report()
    return
......
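Since `predict_cls.py` now batches images itself, a whole directory can be predicted with a batch size larger than 1; a minimal sketch with illustrative paths:

```shell
# Illustrative paths; the old assert on Global.batch_size == 1 is gone.
python3.7 python/predict_cls.py \
    -c configs/inference_cls.yaml \
    -o Global.infer_imgs=./images/ -o Global.batch_size=16
```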
......@@ -54,12 +54,14 @@ class RecPredictor(Predictor):
        input_tensor.copy_from_cpu(image)
        self.paddle_predictor.run()
        batch_output = output_tensor.copy_to_cpu()

        if feature_normalize:
            feas_norm = np.sqrt(
                np.sum(np.square(batch_output), axis=1, keepdims=True))
            batch_output = np.divide(batch_output, feas_norm)

+        if self.postprocess is not None:
+            batch_output = self.postprocess(batch_output)
        return batch_output
......@@ -67,14 +69,33 @@ def main(config):
    rec_predictor = RecPredictor(config)
    image_list = get_image_list(config["Global"]["infer_imgs"])

-    assert config["Global"]["batch_size"] == 1
-    for idx, image_file in enumerate(image_list):
-        batch_input = []
-        img = cv2.imread(image_file)[:, :, ::-1]
-        output = rec_predictor.predict(img)
-        if rec_predictor.postprocess is not None:
-            output = rec_predictor.postprocess(output)
-        print(output)
+    batch_imgs = []
+    batch_names = []
+    cnt = 0
+    for idx, img_path in enumerate(image_list):
+        img = cv2.imread(img_path)
+        if img is None:
+            logger.warning(
+                "Image file failed to read and has been skipped. The path: {}".
+                format(img_path))
+        else:
+            img = img[:, :, ::-1]
+            batch_imgs.append(img)
+            img_name = os.path.basename(img_path)
+            batch_names.append(img_name)
+            cnt += 1
+
+        if cnt % config["Global"]["batch_size"] == 0 or (idx + 1) == len(image_list):
+            if len(batch_imgs) == 0:
+                continue
+            batch_results = rec_predictor.predict(batch_imgs)
+            for number, result_dict in enumerate(batch_results):
+                filename = batch_names[number]
+                print("{}:\t{}".format(filename, result_dict))
+            batch_imgs = []
+            batch_names = []
    return
......
......@@ -35,7 +35,6 @@ sudo apt-get install build-essential gcc g++
Enter this folder and simply run `make`. If you want to regenerate the `index.so` file, first run `make clean` to clear the existing build cache, then run `make` to build the updated library file.

### 2.3 Compile and generate the library file on Windows
On Windows you first need to install the gcc compiler tool. We recommend [TDM-GCC](https://jmeubank.github.io/tdm-gcc/articles/2020-03/9.2.0-release); go to the official website and choose a suitable version. We recommend downloading [tdm64-gcc-10.3.0-2.exe](https://github.com/jmeubank/tdm-gcc/releases/download/v10.3.0-tdm64-2/tdm64-gcc-10.3.0-2.exe)
......@@ -50,6 +49,25 @@ On Windows you first need to install the gcc compiler tool. We recommend [TDM-GCC](https://jmeu
In this folder (deploy/vector_search), run `mingw32-make` to generate the `index.dll` library file. If you want to regenerate `index.dll`, first run `mingw32-make clean` to clear the cache, then run `mingw32-make` to build the updated library file.

### 2.4 Compile and generate the library file on MacOS
Run the following command to install gcc and g++:

```shell
brew install gcc
```

#### Note:
1. If you see `Error: Running Homebrew as root is extremely dangerous and no longer supported...`, refer to this [link](https://jingyan.baidu.com/article/e52e3615057a2840c60c519c.html)
2. If you see `Error: Failure while executing; tar --extract --no-same-owner --file...`, refer to this [link](https://blog.csdn.net/Dawn510/article/details/117787358)

After installation, the compiled executables are copied to /usr/local/bin; check the gcc versions in this folder:
```
ls /usr/local/bin/gcc*
```
Here the local gcc version is gcc-11, so the compile command is as follows (if your local gcc is gcc-9, change the command to `CXX=g++-9 make` accordingly):
```
CXX=g++-11 make
```
## 3. Quick start
......
# Vector search
## 1. Introduction
Some vertical-domain recognition tasks (e.g., vehicles, commodities) involve a large number of recognized categories and often use a retrieval-based approach: matching predicted categories are obtained by a fast nearest-neighbor search between query vectors and gallery vectors. The vector search module provides the basic approximate nearest neighbor search algorithm based on Baidu's self-developed Möbius algorithm, a graph-based approximate nearest neighbor search algorithm for maximum inner product search (MIPS). This module provides a Python interface, supports numpy and tensor type vectors, and supports L2 and Inner Product distances.
Details of the Möbius algorithm can be found in the paper ([Möbius Transformation for Fast Inner Product Search on Graph](http://research.baidu.com/Public/uploads/5e189d36b5cf6.PDF), [Code](https://github.com/sunbelbd/mobius)).
## 2. Installation
### 2.1 Use the provided library files directly
This folder contains the compiled `index.so` (compiled under gcc8.2.0 for Linux) and `index.dll` (compiled under gcc10.3.0 for Windows), which can be used directly, skipping sections 2.2 and 2.3.
If the library files cannot be used due to a low gcc version or an incompatible environment, you need to compile them manually on your own platform.
**Note:** Make sure that the C++ compiler supports the C++11 standard.
### 2.2 Compile and generate library files on Linux
Run the following command to install gcc and g++.
```
sudo apt-get update
sudo apt-get upgrade -y
sudo apt-get install build-essential gcc g++
```
Check the gcc version by the command `gcc -v`.
Run `make` directly in this folder. If you wish to regenerate `index.so`, first use `make clean` to clear the cache, then use `make` to generate the updated library file.
### 2.3 Compile and generate library files on Windows
You need to install the gcc compiler tool first. We recommend [TDM-GCC](https://jmeubank.github.io/tdm-gcc/articles/2020-03/9.2.0-release); you can choose the right version on the official website. We recommend downloading [tdm64-gcc-10.3.0-2.exe](https://github.com/jmeubank/tdm-gcc/releases/download/v10.3.0-tdm64-2/tdm64-gcc-10.3.0-2.exe).
After downloading, follow the default installation steps. There are 3 points to note here:
1. The vector search module depends on openmp, so you need to check the `openmp` installation option in the `choose components` step, otherwise you will get the error `libgomp.spec: No such file or directory`, [reference link](https://github.com/dmlc/xgboost/issues/1027)
2. When asked whether to add the tool to the system environment variables, it is recommended to check this option; otherwise you will need to add the environment variables manually later.
3. The compile command is `make` on Linux and `mingw32-make` on Windows, so keep the two apart.
After installation, you can open a command line terminal and check the gcc version with the command `gcc -v`.
Run the command `mingw32-make` to generate the `index.dll` library file under the folder (deploy/vector_search). If you want to regenerate the `index.dll` file, you can first use `mingw32-make clean` to clear the cache, and then use `mingw32-make` to generate the updated library file.
### 2.4 Compile and generate library files on MacOS
Run the following command to install gcc and g++:
```
brew install gcc
```
#### Caution:
1. If prompted with `Error: Running Homebrew as root is extremely dangerous and no longer supported...`, refer to this [link](https://jingyan.baidu.com/article/e52e3615057a2840c60c519c.html)
2. If prompted with `Error: Failure while executing; tar --extract --no-same-owner --file...`, refer to this [link](https://blog.csdn.net/Dawn510/article/details/117787358).
After installation, the compiled executables are copied to /usr/local/bin. Check the gcc versions in this folder:
```
ls /usr/local/bin/gcc*
```
Here the local gcc version is gcc-11, and the compile command is as follows (if the local gcc version is gcc-9, the corresponding command is `CXX=g++-9 make`):
```
CXX=g++-11 make
```
## 3. Quick use
```
import numpy as np
from interface import Graph_Index
# Random sample generation
index_vectors = np.random.rand(100000,128).astype(np.float32)
query_vector = np.random.rand(128).astype(np.float32)
index_docs = ["ID_"+str(i) for i in range(100000)]
# Initialize index structure
indexer = Graph_Index(dist_type="IP") #support "IP" and "L2"
indexer.build(gallery_vectors=index_vectors, gallery_docs=index_docs, pq_size=100, index_path='test')
# Query
scores, docs = indexer.search(query=query_vector, return_k=10, search_budget=100)
print(scores)
print(docs)
# Save and load
indexer.dump(index_path="test")
indexer.load(index_path="test")
```
......@@ -3,9 +3,9 @@
## Overview
The Twins network includes Twins-PCPVT and Twins-SVT, which focus on the meticulous design of the spatial attention mechanism, resulting in a simple but more effective solution. Since the architecture only involves matrix multiplication, and current deep learning frameworks are highly optimized for matrix multiplication, the architecture is very efficient and easy to implement. Moreover, this architecture achieves excellent performance in a variety of downstream vision tasks such as image classification, object detection, and semantic segmentation. [Paper](https://arxiv.org/abs/2104.13840).
-## Accuracy, FLOPS and Parameters
+## Accuracy, FLOPs and Parameters

-| Models | Top1 | Top5 | Reference<br>top1 | Reference<br>top5 | FLOPS<br>(G) | Params<br>(M) |
+| Models | Top1 | Top5 | Reference<br>top1 | Reference<br>top5 | FLOPs<br>(G) | Params<br>(M) |
|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
| pcpvt_small | 0.8082 | 0.9552 | 0.812 | - | 3.7 | 24.1 |
| pcpvt_base | 0.8242 | 0.9619 | 0.827 | - | 6.4 | 43.8 |
......
# Configuration Instruction
------
## Introduction

The parameters in the PaddleClas configuration files (`ppcls/configs/*.yaml`) are described here so that you can customize or modify the hyperparameter configuration more quickly.

## Details

### 1. Classification model

Here the configuration of `ResNet50_vd` on `ImageNet-1k` is used as an example to explain each parameter in detail. [Configure Path](https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml).

#### 1.1 Global Configuration
| Parameter name | Specific meaning | Default value | Optional value |
| ------------------ | ------------------------------------------------------- | ---------------- | ----------------- |
| checkpoints | Breakpoint model path for resuming training | null | str |
| pretrained_model | Pre-trained model path | null | str |
| output_dir | Save model path | "./output/" | str |
| save_interval | How many epochs to save the model at each interval | 1 | int |
| eval_during_train | Whether to evaluate at training | True | bool |
| eval_interval | How many epochs to evaluate at each interval | 1 | int |
| epochs | Total number of epochs in training | | int |
| print_batch_step | How many mini-batches to print out at each interval | 10 | int |
| use_visualdl | Whether to visualize the training process with visualdl | False | bool |
| image_shape | Image size | [3,224,224] | list, shape: (3,) |
| save_inference_dir | Inference model save path | "./inference" | str |
| eval_mode | Evaluation mode | "classification" | "retrieval" |
**Note**: An http address of a pre-trained model can also be filled in `pretrained_model`.
#### 1.2 Architecture
| Parameter name | Specific meaning | Default value | Optional value |
| -------------- | ----------------- | ------------ | --------------------- |
| name | Model Arch name | ResNet50 | PaddleClas model arch |
| class_num | Category number | 1000 | int |
| pretrained | Pre-trained model | False | bool, str |
**Note**: Here `pretrained` can be set to True/False or to a weights path. In addition, `pretrained` is disabled when `Global.pretrained_model` is set to the corresponding path.
#### 1.3 Loss function
| Parameter name | Specific meaning | Default value | Optional value |
| -------------- | ------------------------------------------- | ------------ | ---------------------- |
| CELoss | cross-entropy loss function | —— | —— |
| CELoss.weight | The weight of CELoss in the whole Loss | 1.0 | float |
| CELoss.epsilon | The epsilon value of label_smooth in CELoss | 0.1 | float, between 0 and 1 |
#### 1.4 Optimizer
| Parameter name | Specific meaning | Default value | Optional value |
| ----------------- | -------------------------------- | ------------ | -------------------------------------------------- |
| name | optimizer method name | "Momentum" | Other optimizers such as "RmsProp" |
| momentum | momentum value | 0.9 | float |
| lr.name | learning rate decay method | "Cosine" | Other decay methods: "Linear", "Piecewise" |
| lr.learning_rate | initial value of learning rate | 0.1 | float |
| lr.warmup_epoch | warmup rounds | 0 | int, e.g. 5 |
| regularizer.name | regularization method name | "L2" | ["L1", "L2"] |
| regularizer.coeff | regularization factor | 0.00007 | float |
**Note**: The extra parameters differ with `lr.name`; for example, when `lr.name=Piecewise`, the following parameters need to be added:
```
  lr:
    name: Piecewise
    learning_rate: 0.1
    decay_epochs: [30, 60, 90]
    values: [0.1, 0.01, 0.001, 0.0001]
```
Refer to [learning_rate.py](https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/optimizer/learning_rate.py) for how to add new methods and parameters.
#### 1.5 Data reading module (DataLoader)
##### 1.5.1 dataset
| Parameter name | Specific meaning | Default value | Optional value |
| ------------------- | ------------------------------------ | ----------------------------------- | ------------------------------ |
| name | The name of the class that reads the data | ImageNetDataset | VeriWild and other Dataset types |
| image_root | The path where the dataset is stored | ./dataset/ILSVRC2012/ | str |
| cls_label_path | data label list | ./dataset/ILSVRC2012/train_list.txt | str |
| transform_ops | data preprocessing for single images | —— | —— |
| batch_transform_ops | Data preprocessing for batch images | —— | —— |
The parameter meaning of transform_ops:
| Function name | Parameter name | Specific meaning |
| -------------- | -------------- | --------------------- |
| DecodeImage | to_rgb | decode to RGB |
| | channel_first | store image data in CHW order |
| RandCropImage | size | Random crop |
| RandFlipImage | | Random flip |
| NormalizeImage | scale | Normalize scale value |
| | mean | Normalize mean value |
| | std | Normalize std value |
| | order | Normalize order |
| CropImage | size | crop size |
| ResizeImage | resize_short | resize by short edge |
The parameter meaning of batch_transform_ops:
| Function name | Parameter name | Specific meaning |
| ------------- | -------------- | --------------------------------------- |
| MixupOperator | alpha | Mixup parameter; the larger the value, the stronger the augmentation |
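For example, mixup is enabled by adding `MixupOperator` under the training dataset's `batch_transform_ops`; a minimal sketch (the `alpha` value is only illustrative):

```yaml
      batch_transform_ops:
        - MixupOperator:
            alpha: 0.2  # illustrative; larger alpha mixes more aggressively
```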
##### 1.5.2 sampler
| Parameter name | Specific meaning | Default value | Optional value |
| -------------- | ------------------------------------------------------------ | ----------------------- | -------------------------------------------------- |
| name | sampler type | DistributedBatchSampler | DistributedRandomIdentitySampler and other Sampler |
| batch_size | batch size | 64 | int |
| drop_last | Whether to drop the last incomplete batch that does not reach batch-size | False | bool |
| shuffle | whether to shuffle the data | True | bool |
##### 1.5.3 loader
| Parameter name | Specific meaning | Default value | Optional value |
| ----------------- | ---------------------------- | --------------- | ---------------- |
| num_workers | Number of data read threads | 4 | int |
| use_shared_memory | Whether to use shared memory | True | bool |
#### 1.6 Evaluation metric
| Parameter name | Specific meaning | Default value | Optional value |
| -------------- | ---------------- | --------------- | ---------------- |
| TopkAcc | TopkAcc | [1, 5] | list, int |
#### 1.7 Inference
| Parameter name | Specific meaning | Default value | Optional value |
| ----------------------------- | --------------------------------- | ------------------------------------- | ---------------- |
| infer_imgs | Image address to be inferred | docs/images/whl/demo.jpg | str |
| batch_size | batch size | 10 | int |
| PostProcess.name | Post-process name | Topk | str |
| PostProcess.topk | topk value | 5 | int |
| PostProcess.class_id_map_file | mapping file of class id and name | ppcls/utils/imagenet1k_label_list.txt | str |
**Note**: The interpretation of `transforms` in the Infer module follows the interpretation of `transform_ops` in the dataset of the data reading module; a sample `Infer` section follows.
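Put together, an `Infer` section built from the defaults in the table above looks roughly like this (a sketch of typical values, not a prescription):

```yaml
Infer:
  infer_imgs: docs/images/whl/demo.jpg
  batch_size: 10
  transforms:
    - DecodeImage:
        to_rgb: True
        channel_first: False
    - ResizeImage:
        resize_short: 256
    - CropImage:
        size: 224
    - NormalizeImage:
        scale: 1.0/255.0
        mean: [0.485, 0.456, 0.406]
        std: [0.229, 0.224, 0.225]
        order: ''
    - ToCHWImage:
  PostProcess:
    name: Topk
    topk: 5
    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
```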
### 2. Distillation model

**Note**: Here the training configuration for distilling `MobileNetV3_small_x1_0` from `MobileNetV3_large_x1_0` on `ImageNet-1k` is used as an example to explain the meaning of each parameter in detail. [Configure path](https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml). Only parameters that differ from the classification model are introduced here.
#### 2.1 Architecture
| Parameter name | Specific meaning | Default value | Optional value |
| ------------------ | --------------------------------------------------------- | ---------------------- | ---------------------------------- |
| name | model arch name | DistillationModel | —— |
| class_num | category number | 1000 | int |
| freeze_params_list | freeze_params_list | [True, False] | list |
| models | model list | [Teacher, Student] | list |
| Teacher.name | teacher model name | MobileNetV3_large_x1_0 | PaddleClas model |
| Teacher.pretrained | teacher model pre-trained weights | True | Boolean or pre-trained weight path |
| Teacher.use_ssld | whether teacher model pretrained weights are ssld weights | True | Boolean |
| infer_model_name | type of the model being inferred | Student | Teacher |
**Note**
1. list is represented in yaml as follows:
```
freeze_params_list:
- True
- False
```
2. The Student's parameters are similar and are not repeated here.
#### 2.2 Loss function
| Parameter name | Specific meaning | Default value | Optional value |
| ----------------------------------- | ------------------------------------------------------------ | --------------- | ---------------- |
| DistillationCELoss | Distillation's cross-entropy loss function | —— | —— |
| DistillationCELoss.weight | Loss weight | 1.0 | float |
| DistillationCELoss.model_name_pairs | Model name pairs between which the loss is computed | ["Student", "Teacher"] | —— |
| DistillationGTCELoss | Cross-entropy loss between model output and the ground-truth label | —— | —— |
| DistillationGTCELoss.weight | Loss weight | 1.0 | float |
| DistillationGTCELoss.model_names | Names of the models whose outputs are compared with the real label | ["Student"] | —— |
#### 2.3 Evaluation metric
| Parameter name | Specific meaning | Default value | Optional value |
| ----------------------------- | ------------------- | ---------------------------- | ---------------- |
| DistillationTopkAcc | DistillationTopkAcc | including model_key and topk | —— |
| DistillationTopkAcc.model_key | the evaluated model | "Student" | "Teacher" |
| DistillationTopkAcc.topk | Topk value | [1, 5] | list, int |
**Note**: `DistillationTopkAcc` has the same meaning as `TopkAcc`, except that it is only used in distillation tasks. A sample metric section follows.
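In a distillation config this typically reads as follows; a minimal sketch using the defaults above:

```yaml
Metric:
  Train:
    - DistillationTopkAcc:
        model_key: "Student"
        topk: [1, 5]
  Eval:
    - DistillationTopkAcc:
        model_key: "Student"
        topk: [1, 5]
```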
### 3. Recognition model
**Note**: The training configuration of `ResNet50` on `LogoDet-3k` is used here as an example to explain the meaning of each parameter in detail. [Configure path](https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/configs/Logo/ResNet50_ReID.yaml). Only parameters that differ from the classification model are presented here.
#### 3.1 Architecture
| Parameter name | Specific meaning | Default value | Optional value |
| ---------------------- | ------------------------------------------------------------ | --------------------------- | ------------------------------------------------------------ |
| name | Model arch | "RecModel" | ["RecModel"] |
| infer_output_key | Inference output value | "feature" | ["feature", "logits"] |
| infer_add_softmax | Whether to add softmax to the inference output | False | [True, False] |
| Backbone.name | Backbone name | ResNet50_last_stage_stride1 | Other backbones provided by PaddleClas |
| Backbone.pretrained | Backbone pre-trained model | True | Boolean value or pre-trained model path |
| BackboneStopLayer.name | The name of the output layer in Backbone | True | The `full_name` of the feature output layer in Backbone |
| Neck.name | The name of the Neck part | VehicleNeck | A dictionary structure, the input parameters of the Neck network layer |
| Neck.in_channels | Input dimension of the Neck part | 2048 | Must match the output size of BackboneStopLayer.name |
| Neck.out_channels | Output dimension of the Neck part, i.e. the feature dimension | 512 | int |
| Head.name | Name of the network Head part | CircleMargin | ArcMargin etc. |
| Head.embedding_size | Feature dimension size | 512 | Must be consistent with Neck.out_channels |
| Head.class_num | Number of classes | 3000 | int |
| Head.margin | Margin value in CircleMargin | 0.35 | float |
| Head.scale | Scale value in CircleMargin | 64 | int |
**Note**
1. In PaddleClas, the `Neck` part is the connection between the Backbone and the embedding layer, and the `Head` part is the connection between the embedding layer and the classification layer.
2. `BackboneStopLayer.name` can be obtained by visualizing the model; see [Netron](https://github.com/lutzroeder/netron) or [visualdl](https://github.com/PaddlePaddle/VisualDL).
3. Calling tools/export_model.py converts the model weights into an inference model, where the infer_add_softmax parameter controls whether a Softmax activation function is appended; the default in the code is True (in classification tasks, the last output layer is followed by a Softmax activation function). In recognition tasks, no activation function is needed on the feature layer, so it should be set to False here. A sample `Arch` section follows.
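Assembled from the table above, a recognition `Arch` section looks roughly like this; the `BackboneStopLayer.name` value is a placeholder that must be replaced with the `full_name` of your backbone's feature layer:

```yaml
Arch:
  name: RecModel
  infer_output_key: feature
  infer_add_softmax: False
  Backbone:
    name: ResNet50_last_stage_stride1
    pretrained: True
  BackboneStopLayer:
    name: "adaptive_avg_pool2d_0"  # placeholder, model-specific
  Neck:
    name: VehicleNeck
    in_channels: 2048
    out_channels: 512
  Head:
    name: CircleMargin
    embedding_size: 512
    class_num: 3000
    margin: 0.35
    scale: 64
```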
#### 3.2 Evaluation metric
| Parameter name | Specific meaning | Default value | Optional value |
| -------------- | --------------------------- | --------------- | ---------------- |
| Recallk | Recall rate | [1, 5] | list, int |
| mAP | Average retrieval precision | None | None |
# Configuration
---
## Introduction
This document introduces the configuration (files in `config/*.yaml`) of PaddleClas.
* Note: Some parameters do not appear in the yaml file (because they are not used for this file). During training or validation, you can use the command `-o` to update or add the specified parameters. For the example `-o checkpoints=./ckp_path/ppcls`, it means that the parameter `checkpoints` will be updated or added using the value `./ckp_path/ppcls`.
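For instance, the following launch (paths are illustrative) resumes training from a checkpoint without touching the YAML:

```shell
# Illustrative paths; -o updates or adds any config key at launch time.
python tools/train.py -c configs/ResNet/ResNet50_vd.yaml \
    -o checkpoints=./ckp_path/ppcls
```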
### Basic
| name | detail | default value | optional value |
|:---:|:---:|:---:|:---:|
| mode | mode | "train" | ["train", "valid"] |
| checkpoints | checkpoint model path for resuming training process | "" | Str |
| last_epoch | last epoch for the training,used with checkpoints | -1 | int |
| pretrained_model | pretrained model path | "" | Str |
| load_static_weights | whether the pretrained model is saved in static mode | False | bool |
| model_save_dir | model stored path | "" | Str |
| classes_num | class number | 1000 | int |
| total_images | total images | 1281167 | int |
| save_interval | save interval | 1 | int |
| validate | whether to validate when training | TRUE | bool |
| valid_interval | valid interval | 1 | int |
| epochs | epoch | | int |
| topk | K value | 5 | int |
| image_shape | image size | [3,224,224] | list, shape: (3,) |
| use_mix | whether to use mixup | False | ['True', 'False'] |
| ls_epsilon | label_smoothing epsilon value| 0 | float |
| use_distillation | whether to use SSLD distillation training | False | bool |
## ARCHITECTURE
| name | detail | default value | optional value |
|:---:|:---:|:---:|:---:|
| name | model name | "ResNet50_vd" | one of 23 architectures |
| params | model parameters | {} | extra dictionary for the model structure, parameters such as `padding_type` in EfficientNet can be set here |
### LEARNING_RATE
| name | detail | default value |Optional value |
|:---:|:---:|:---:|:---:|
| function | decay type | "Linear" | ["Linear", "Cosine", <br> "Piecewise", "CosineWarmup"] |
| params.lr | initial learning rate | 0.1 | float |
| params.decay_epochs | milestone in piecewisedecay | | list |
| params.gamma | gamma in piecewisedecay | 0.1 | float |
| params.warmup_epoch | warmup epoch | 5 | int |
| params.steps | decay steps in lineardecay | 100 | int |
| params.end_lr | end lr in lineardecay | 0 | float |
### OPTIMIZER
| name | detail | default value | optional value |
|:---:|:---:|:---:|:---:|
| function | optimizer name | "Momentum" | ["Momentum", "RmsProp"] |
| params.momentum | momentum value | 0.9 | float |
| regularizer.function | regularizer method name | "L2" | ["L1", "L2"] |
| regularizer.factor | regularizer factor | 0.0001 | float |
### reader
| name | detail |
|:---:|:---:|
| batch_size | batch size |
| num_workers | worker number |
| file_list | train list path |
| data_dir | train dataset path |
| shuffle_seed | seed |
processing
| function name | attribute name | detail |
|:---:|:---:|:---:|
| DecodeImage | to_rgb | decode to RGB |
| | to_np | to numpy |
| | channel_first | Channel first |
| RandCropImage | size | random crop |
| RandFlipImage | | random flip |
| NormalizeImage | scale | normalize image |
| | mean | mean |
| | std | std |
| | order | order |
| ToCHWImage | | to CHW |
| CropImage | size | crop size |
| ResizeImage | resize_short | resize according to short size |
mix preprocessing
| name| detail|
|:---:|:---:|
| MixupOperator.alpha | alpha value in mixup|
......@@ -5,7 +5,7 @@
* installing from pypi
```bash
-pip3 install paddleclas==2.2.0
+pip3 install paddleclas==2.2.1
```
* build your own whl package and install
......
docs/images/wx_group.png (binary image replaced: 648.9 KB → 57.8 KB)
......@@ -3,7 +3,7 @@
## Contents

* [Issue 1](#第1期) (2021.07.08)
* [Issue 2](#第2期) (2021.07.27)

<a name="第1期"></a>
## Issue 1
......@@ -99,3 +99,26 @@
### Q1.20 Where is PaddleClas's `train_log` file?
**A**: `train.log` is saved in the same directory as the model weights.
<a name="第2期"></a>
## Issue 2

### Q2.1 Does the Möbius vector search algorithm currently used by PaddleClas support something like faiss's index.add()? And does every newly built graph require a train step? Is that train for retrieval speedup or for building a similar graph?
**A**: The retrieval algorithm provided by Mobius is a graph-based approximate nearest neighbor search algorithm. It currently supports two distance metrics: inner product and L2 distance. The index.add function provided by faiss is not supported for now; if you need to add content to the retrieval library, you have to rebuild a new index from scratch. Every time an index is built, the retrieval algorithm internally performs an operation similar to train; unlike the train interface provided by faiss, we name it build, and its main purpose is to speed up retrieval. A short sketch follows.
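Concretely, extending a gallery therefore means rebuilding; a minimal sketch based on the interface shown in deploy/vector_search/README.md (shapes and counts are illustrative):

```python
import numpy as np
from interface import Graph_Index  # deploy/vector_search

# Mobius has no incremental index.add(); rebuild from the full, extended set.
gallery_vectors = np.random.rand(100500, 128).astype(np.float32)  # old + new entries
gallery_docs = ["ID_" + str(i) for i in range(100500)]

indexer = Graph_Index(dist_type="IP")  # "IP" or "L2"
# build doubles as the train-like step and mainly speeds up later searches
indexer.build(gallery_vectors=gallery_vectors, gallery_docs=gallery_docs,
              pq_size=100, index_path="test")
```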
### Q2.2 Can prediction be performed frame by frame on a video?
**A**: Yes, but PaddleClas does not support video input at the moment. You can try modifying the PaddleClas code, or extract the video into frame images in advance and then run PaddleClas prediction on them.
### Q2.3 In a live-streaming scenario, we need real-time recognition on the live frames, locating and boxing the target object within a few seconds of latency. Is this achievable?
**A**: To get real-time detection, the detection speed must meet real-time requirements. PP-YOLO is a lightweight object detection model provided by the Paddle team that strikes a good balance between detection speed and accuracy; you can try PP-YOLO for detection. For its usage, refer to: https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/configs/ppyolo/README_cn.md
### Q2.4 For unknown labels, adding them to the gallery dataset enables subsequent recognition without training; but if the upstream detection model cannot locate and detect objects of an unknown label, does the detection model still need to be trained?
**A**: If the detection model performs poorly on your own dataset, it needs to be finetuned on your own detection dataset.
### Q2.5 When recompiling index.so on Mac, the error `clang: error: unsupported option '-fopenmp'` is raised. How should this be handled?
**A**: This issue has been resolved. For compiling index.so on Mac, refer to: https://github.com/PaddlePaddle/PaddleClas/blob/develop/deploy/vector_search/README.md
### Q2.6 Does PaddleClas provide data augmentation for brightness, contrast, saturation, hue, and so on?
**A**: PaddleClas provides a variety of data augmentation methods in 3 categories: 1. image transformation: AutoAugment, RandAugment; 2. image cropping: CutOut, RandErasing, HideAndSeek, GridMask; 3. image mixing: Mixup, Cutmix. Among them, RandAugment provides random combinations of multiple augmentation methods and can cover augmentation needs for brightness, contrast, saturation, hue, and more. See the sketch below.
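For example, RandAugment can be dropped into the training dataset's `transform_ops`; a minimal sketch (parameter values are illustrative):

```yaml
      transform_ops:
        - DecodeImage:
            to_rgb: True
            channel_first: False
        - RandCropImage:
            size: 224
        - RandAugment:
            num_layers: 2   # illustrative
            magnitude: 5    # illustrative
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.485, 0.456, 0.406]
            std: [0.229, 0.224, 0.225]
            order: ''
```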
......@@ -3,9 +3,9 @@
## Overview
The Twins network includes Twins-PCPVT and Twins-SVT, which focus on a careful design of the spatial attention mechanism, resulting in a simple but more effective solution. Since the architecture only involves matrix multiplications, and current deep learning frameworks are highly optimized for matrix multiplication, the architecture is very efficient and easy to implement. Moreover, it achieves excellent performance in a variety of downstream vision tasks such as image classification, object detection, and semantic segmentation. [Paper](https://arxiv.org/abs/2104.13840)
-## Accuracy, FLOPS and Parameters
+## Accuracy, FLOPs and Parameters

-| Models | Top1 | Top5 | Reference<br>top1 | Reference<br>top5 | FLOPS<br>(G) | Params<br>(M) |
+| Models | Top1 | Top5 | Reference<br>top1 | Reference<br>top5 | FLOPs<br>(G) | Params<br>(M) |
|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
| pcpvt_small | 0.8082 | 0.9552 | 0.812 | - | 3.7 | 24.1 |
| pcpvt_base | 0.8242 | 0.9619 | 0.827 | - | 6.4 | 43.8 |
......
......@@ -5,7 +5,7 @@
* installing from pypi
```bash
-pip3 install paddleclas==2.2.0
+pip3 install paddleclas==2.2.1
```
* 本地构建并安装
......
......@@ -18,6 +18,7 @@ __dir__ = os.path.dirname(__file__)
sys.path.append(os.path.join(__dir__, ""))
sys.path.append(os.path.join(__dir__, "deploy"))
from typing import Union, Generator
import argparse
import shutil
import textwrap
......@@ -279,8 +280,13 @@ def args_cfg():
"--save_dir",
type=str,
help="The directory to save prediction results as pre-label.")
parser.add_argument("--resize_short", type=int, default=256, help="")
parser.add_argument("--crop_size", type=int, default=224, help="")
parser.add_argument(
"--resize_short",
type=int,
default=256,
help="Resize according to short size.")
parser.add_argument(
"--crop_size", type=int, default=224, help="Centor crop size.")
args = parser.parse_args()
return vars(args)
......@@ -351,7 +357,7 @@ def download_with_progressbar(url, save_path):
def check_model_file(model_name):
"""Check the model files exist and download and untar when no exist.
"""Check the model files exist and download and untar when no exist.
"""
storage_directory = partial(os.path.join, BASE_INFERENCE_MODEL_DIR,
model_name)
......@@ -405,11 +411,11 @@ class PaddleClas(object):
"""Init PaddleClas with config.
Args:
model_name: The model name supported by PaddleClas, default by None. If specified, override config.
inference_model_dir: The directory that contained model file and params file to be used, default by None. If specified, override config.
use_gpu: Wheather use GPU, default by None. If specified, override config.
batch_size: The batch size to pridict, default by None. If specified, override config.
topk: Return the top k prediction results with the highest score.
model_name (str, optional): The model name supported by PaddleClas. If specified, override config. Defaults to None.
inference_model_dir (str, optional): The directory that contained model file and params file to be used. If specified, override config. Defaults to None.
use_gpu (bool, optional): Whether use GPU. If specified, override config. Defaults to True.
batch_size (int, optional): The batch size to pridict. If specified, override config. Defaults to 1.
topk (int, optional): Return the top k prediction results with the highest score. Defaults to 5.
"""
super().__init__()
self._config = init_config(model_name, inference_model_dir, use_gpu,
......@@ -454,20 +460,26 @@ class PaddleClas(object):
            raise InputModelError(err)
        return

-    def predict(self, input_data, print_pred=False):
+    def predict(self, input_data: Union[str, np.array],
+                print_pred: bool=False) -> Generator[list, None, None]:
        """Predict input_data.

        Args:
-            input_data (str | NumPy.array): The path of image, or the directory containing images, or the URL of image from Internet.
-            print_pred (bool, optional): Wheather print the prediction result. Defaults to False.
+            input_data (Union[str, np.array]):
+                When the type is str, it is the path of an image, a directory containing images, or the URL of an image on the Internet.
+                When the type is np.array, it is the image data whose channel order is RGB.
+            print_pred (bool, optional): Whether to print the prediction result. Defaults to False.

        Raises:
            ImageTypeError: Illegal input_data.

        Yields:
-            list: The prediction result(s) of input_data by batch_size. For every one image, prediction result(s) is zipped as a dict, that includs topk "class_ids", "scores" and "label_names". The format is as follow:
-            [{"class_ids": [...], "scores": [...], "label_names": [...]}, ...]
+            Generator[list, None, None]:
+                The prediction result(s) of input_data by batch_size. For each image,
+                the prediction result(s) are zipped into a dict that includes the topk "class_ids", "scores" and "label_names".
+                The format is as follows: [{"class_ids": [...], "scores": [...], "label_names": [...]}, ...]
        """
        if isinstance(input_data, np.ndarray):
            outputs = self.cls_predictor.predict(input_data)
            yield self.cls_predictor.postprocess(outputs)
......@@ -497,6 +509,7 @@ class PaddleClas(object):
f"Image file failed to read and has been skipped. The path: {img_path}"
)
continue
img = img[:, :, ::-1]
img_list.append(img)
img_path_list.append(img_path)
cnt += 1
......@@ -506,12 +519,12 @@ class PaddleClas(object):
                    preds = self.cls_predictor.postprocess(outputs,
                                                           img_path_list)
                    if print_pred and preds:
-                        for nu, pred in enumerate(preds):
+                        for pred in preds:
+                            filename = pred.pop("file_name")
                            pred_str = ", ".join(
                                [f"{k}: {pred[k]}" for k in pred])
                            print(
-                                f"filename: {img_path_list[nu]}, top-{topk}, {pred_str}"
-                            )
+                                f"filename: {filename}, top-{topk}, {pred_str}")

                    img_list = []
                    img_path_list = []
......
......@@ -104,7 +104,8 @@ class ConvBNLayer(TheseusLayer):
                 groups=1,
                 is_vd_mode=False,
                 act=None,
-                 lr_mult=1.0):
+                 lr_mult=1.0,
+                 data_format="NCHW"):
        super().__init__()
        self.is_vd_mode = is_vd_mode
        self.act = act
......@@ -118,11 +119,13 @@ class ConvBNLayer(TheseusLayer):
            padding=(filter_size - 1) // 2,
            groups=groups,
            weight_attr=ParamAttr(learning_rate=lr_mult),
-            bias_attr=False)
+            bias_attr=False,
+            data_format=data_format)
        self.bn = BatchNorm(
            num_filters,
            param_attr=ParamAttr(learning_rate=lr_mult),
-            bias_attr=ParamAttr(learning_rate=lr_mult))
+            bias_attr=ParamAttr(learning_rate=lr_mult),
+            data_layout=data_format)
        self.relu = nn.ReLU()

    def forward(self, x):
......@@ -136,14 +139,14 @@ class ConvBNLayer(TheseusLayer):
class BottleneckBlock(TheseusLayer):
-    def __init__(
-            self,
-            num_channels,
-            num_filters,
-            stride,
-            shortcut=True,
-            if_first=False,
-            lr_mult=1.0, ):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 stride,
+                 shortcut=True,
+                 if_first=False,
+                 lr_mult=1.0,
+                 data_format="NCHW"):
        super().__init__()

        self.conv0 = ConvBNLayer(
......@@ -151,20 +154,23 @@ class BottleneckBlock(TheseusLayer):
            num_filters=num_filters,
            filter_size=1,
            act="relu",
-            lr_mult=lr_mult)
+            lr_mult=lr_mult,
+            data_format=data_format)
        self.conv1 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act="relu",
-            lr_mult=lr_mult)
+            lr_mult=lr_mult,
+            data_format=data_format)
        self.conv2 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None,
-            lr_mult=lr_mult)
+            lr_mult=lr_mult,
+            data_format=data_format)

        if not shortcut:
            self.short = ConvBNLayer(
......@@ -173,7 +179,8 @@ class BottleneckBlock(TheseusLayer):
                filter_size=1,
                stride=stride if if_first else 1,
                is_vd_mode=False if if_first else True,
-                lr_mult=lr_mult)
+                lr_mult=lr_mult,
+                data_format=data_format)
        self.relu = nn.ReLU()
        self.shortcut = shortcut
......@@ -199,7 +206,8 @@ class BasicBlock(TheseusLayer):
                 stride,
                 shortcut=True,
                 if_first=False,
-                 lr_mult=1.0):
+                 lr_mult=1.0,
+                 data_format="NCHW"):
        super().__init__()

        self.stride = stride
......@@ -209,13 +217,15 @@ class BasicBlock(TheseusLayer):
            filter_size=3,
            stride=stride,
            act="relu",
-            lr_mult=lr_mult)
+            lr_mult=lr_mult,
+            data_format=data_format)
        self.conv1 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters,
            filter_size=3,
            act=None,
-            lr_mult=lr_mult)
+            lr_mult=lr_mult,
+            data_format=data_format)
        if not shortcut:
            self.short = ConvBNLayer(
                num_channels=num_channels,
......@@ -223,7 +233,8 @@ class BasicBlock(TheseusLayer):
                filter_size=1,
                stride=stride if if_first else 1,
                is_vd_mode=False if if_first else True,
-                lr_mult=lr_mult)
+                lr_mult=lr_mult,
+                data_format=data_format)
        self.shortcut = shortcut
        self.relu = nn.ReLU()
......@@ -256,7 +267,9 @@ class ResNet(TheseusLayer):
                 config,
                 version="vb",
                 class_num=1000,
-                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0]):
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
+                 data_format="NCHW",
+                 input_image_channel=3):
        super().__init__()

        self.cfg = config
......@@ -279,22 +292,25 @@ class ResNet(TheseusLayer):
        self.stem_cfg = {
            #num_channels, num_filters, filter_size, stride
-            "vb": [[3, 64, 7, 2]],
-            "vd": [[3, 32, 3, 2], [32, 32, 3, 1], [32, 64, 3, 1]]
+            "vb": [[input_image_channel, 64, 7, 2]],
+            "vd":
+            [[input_image_channel, 32, 3, 2], [32, 32, 3, 1], [32, 64, 3, 1]]
        }

-        self.stem = nn.Sequential(*[
+        self.stem = nn.Sequential(* [
            ConvBNLayer(
                num_channels=in_c,
                num_filters=out_c,
                filter_size=k,
                stride=s,
                act="relu",
-                lr_mult=self.lr_mult_list[0])
+                lr_mult=self.lr_mult_list[0],
+                data_format=data_format)
            for in_c, out_c, k, s in self.stem_cfg[version]
        ])

-        self.max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
+        self.max_pool = MaxPool2D(
+            kernel_size=3, stride=2, padding=1, data_format=data_format)
        block_list = []
        for block_idx in range(len(self.block_depth)):
            shortcut = False
......@@ -306,11 +322,12 @@ class ResNet(TheseusLayer):
                        stride=2 if i == 0 and block_idx != 0 else 1,
                        shortcut=shortcut,
                        if_first=block_idx == i == 0 if version == "vd" else True,
-                        lr_mult=self.lr_mult_list[block_idx + 1]))
+                        lr_mult=self.lr_mult_list[block_idx + 1],
+                        data_format=data_format))
                shortcut = True
        self.blocks = nn.Sequential(*block_list)

-        self.avg_pool = AdaptiveAvgPool2D(1)
+        self.avg_pool = AdaptiveAvgPool2D(1, data_format=data_format)
        self.flatten = nn.Flatten()
        self.avg_pool_channels = self.num_channels[-1] * 2
        stdv = 1.0 / math.sqrt(self.avg_pool_channels * 1.0)
......@@ -319,13 +336,19 @@ class ResNet(TheseusLayer):
            self.class_num,
            weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)))

+        self.data_format = data_format
+
    def forward(self, x):
-        x = self.stem(x)
-        x = self.max_pool(x)
-        x = self.blocks(x)
-        x = self.avg_pool(x)
-        x = self.flatten(x)
-        x = self.fc(x)
+        with paddle.static.amp.fp16_guard():
+            if self.data_format == "NHWC":
+                x = paddle.transpose(x, [0, 2, 3, 1])
+                x.stop_gradient = True
+            x = self.stem(x)
+            x = self.max_pool(x)
+            x = self.blocks(x)
+            x = self.avg_pool(x)
+            x = self.flatten(x)
+            x = self.fc(x)
        return x
......
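With these changes an NHWC ResNet can be constructed directly; a minimal sketch (the `ResNet50` helper and its import path follow the usual ppcls layout and are assumptions here):

```python
import paddle
from ppcls.arch.backbone.legendary_models.resnet import ResNet50  # assumed path

# data_format / input_image_channel are the new constructor arguments
model = ResNet50(class_num=1000, data_format="NHWC", input_image_channel=4)
x = paddle.randn([8, 4, 224, 224])  # NCHW input; forward() transposes to NHWC itself
y = model(x)
```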
......@@ -56,10 +56,10 @@ class GroupAttention(nn.Layer):
                 ws=1):
        super().__init__()
        if ws == 1:
-            raise Exception(f"ws {ws} should not be 1")
+            raise Exception("ws {ws} should not be 1")
        if dim % num_heads != 0:
            raise Exception(
-                f"dim {dim} should be divided by num_heads {num_heads}.")
+                "dim {dim} should be divided by num_heads {num_heads}.")

        self.dim = dim
        self.num_heads = num_heads
......@@ -78,15 +78,15 @@ class GroupAttention(nn.Layer):
        total_groups = h_group * w_group
        x = x.reshape([B, h_group, self.ws, w_group, self.ws, C]).transpose(
            [0, 1, 3, 2, 4, 5])
-        qkv = self.qkv(x).reshape(
-            [B, total_groups, -1, 3, self.num_heads,
-             C // self.num_heads]).transpose([3, 0, 1, 4, 2, 5])
+        qkv = self.qkv(x).reshape([
+            B, total_groups, self.ws**2, 3, self.num_heads, C // self.num_heads
+        ]).transpose([3, 0, 1, 4, 2, 5])
        q, k, v = qkv[0], qkv[1], qkv[2]
-        attn = (q @k.transpose([0, 1, 2, 4, 3])) * self.scale
+        attn = paddle.matmul(q, k.transpose([0, 1, 2, 4, 3])) * self.scale
        attn = nn.Softmax(axis=-1)(attn)
        attn = self.attn_drop(attn)
-        attn = (attn @v).transpose([0, 1, 3, 2, 4]).reshape(
+        attn = paddle.matmul(attn, v).transpose([0, 1, 3, 2, 4]).reshape(
            [B, h_group, w_group, self.ws, self.ws, C])
        x = attn.transpose([0, 1, 3, 2, 4, 5]).reshape([B, N, C])
......@@ -135,22 +135,23 @@ class Attention(nn.Layer):
        if self.sr_ratio > 1:
            x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W])
-            x_ = self.sr(x_).reshape([B, C, -1]).transpose([0, 2, 1])
+            tmp_n = H * W // self.sr_ratio**2
+            x_ = self.sr(x_).reshape([B, C, tmp_n]).transpose([0, 2, 1])
            x_ = self.norm(x_)
            kv = self.kv(x_).reshape(
-                [B, -1, 2, self.num_heads, C // self.num_heads]).transpose(
+                [B, tmp_n, 2, self.num_heads, C // self.num_heads]).transpose(
                    [2, 0, 3, 1, 4])
        else:
            kv = self.kv(x).reshape(
-                [B, -1, 2, self.num_heads, C // self.num_heads]).transpose(
+                [B, N, 2, self.num_heads, C // self.num_heads]).transpose(
                    [2, 0, 3, 1, 4])
        k, v = kv[0], kv[1]

-        attn = (q @k.transpose([0, 1, 3, 2])) * self.scale
+        attn = paddle.matmul(q, k.transpose([0, 1, 3, 2])) * self.scale
        attn = nn.Softmax(axis=-1)(attn)
        attn = self.attn_drop(attn)
-        x = (attn @v).transpose([0, 2, 1, 3]).reshape([B, N, C])
+        x = paddle.matmul(attn, v).transpose([0, 2, 1, 3]).reshape([B, N, C])
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
......@@ -280,7 +281,7 @@ class PyramidVisionTransformer(nn.Layer):
                 img_size=224,
                 patch_size=16,
                 in_chans=3,
-                 num_classes=1000,
+                 class_num=1000,
                 embed_dims=[64, 128, 256, 512],
                 num_heads=[1, 2, 4, 8],
                 mlp_ratios=[4, 4, 4, 4],
......@@ -294,7 +295,7 @@ class PyramidVisionTransformer(nn.Layer):
                 sr_ratios=[8, 4, 2, 1],
                 block_cls=Block):
        super().__init__()
-        self.num_classes = num_classes
+        self.class_num = class_num
        self.depths = depths

        # patch_embed
......@@ -317,7 +318,6 @@ class PyramidVisionTransformer(nn.Layer):
                self.create_parameter(
                    shape=[1, patch_num, embed_dims[i]],
                    default_initializer=zeros_))
-            self.add_parameter(f"pos_embeds_{i}", self.pos_embeds[i])
            self.pos_drops.append(nn.Dropout(p=drop_rate))

        dpr = [
......@@ -354,7 +354,7 @@ class PyramidVisionTransformer(nn.Layer):
        # classification head
        self.head = nn.Linear(embed_dims[-1],
-                              num_classes) if num_classes > 0 else Identity()
+                              class_num) if class_num > 0 else Identity()

        # init weights
        for pos_emb in self.pos_embeds:
......@@ -433,7 +433,7 @@ class CPVTV2(PyramidVisionTransformer):
                 img_size=224,
                 patch_size=4,
                 in_chans=3,
-                 num_classes=1000,
+                 class_num=1000,
                 embed_dims=[64, 128, 256, 512],
                 num_heads=[1, 2, 4, 8],
                 mlp_ratios=[4, 4, 4, 4],
......@@ -446,10 +446,10 @@ class CPVTV2(PyramidVisionTransformer):
                 depths=[3, 4, 6, 3],
                 sr_ratios=[8, 4, 2, 1],
                 block_cls=Block):
-        super().__init__(img_size, patch_size, in_chans, num_classes,
-                         embed_dims, num_heads, mlp_ratios, qkv_bias, qk_scale,
-                         drop_rate, attn_drop_rate, drop_path_rate, norm_layer,
-                         depths, sr_ratios, block_cls)
+        super().__init__(img_size, patch_size, in_chans, class_num, embed_dims,
+                         num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate,
+                         attn_drop_rate, drop_path_rate, norm_layer, depths,
+                         sr_ratios, block_cls)
        del self.pos_embeds
        del self.cls_token
        self.pos_block = nn.LayerList(
......@@ -488,7 +488,7 @@ class CPVTV2(PyramidVisionTransformer):
                x = self.pos_block[i](x, H, W)  # PEG here

            if i < len(self.depths) - 1:
-                x = x.reshape([B, H, W, -1]).transpose([0, 3, 1, 2])
+                x = x.reshape([B, H, W, x.shape[-1]]).transpose([0, 3, 1, 2])

        x = self.norm(x)
        return x.mean(axis=1)  # GAP here
......@@ -499,7 +499,7 @@ class PCPVT(CPVTV2):
                 img_size=224,
                 patch_size=4,
                 in_chans=3,
-                 num_classes=1000,
+                 class_num=1000,
                 embed_dims=[64, 128, 256],
                 num_heads=[1, 2, 4],
                 mlp_ratios=[4, 4, 4],
......@@ -512,10 +512,10 @@ class PCPVT(CPVTV2):
                 depths=[4, 4, 4],
                 sr_ratios=[4, 2, 1],
                 block_cls=SBlock):
-        super().__init__(img_size, patch_size, in_chans, num_classes,
-                         embed_dims, num_heads, mlp_ratios, qkv_bias, qk_scale,
-                         drop_rate, attn_drop_rate, drop_path_rate, norm_layer,
-                         depths, sr_ratios, block_cls)
+        super().__init__(img_size, patch_size, in_chans, class_num, embed_dims,
+                         num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate,
+                         attn_drop_rate, drop_path_rate, norm_layer, depths,
+                         sr_ratios, block_cls)


class ALTGVT(PCPVT):
......
......@@ -38,7 +38,7 @@ class CosMargin(paddle.nn.Layer):
        input_norm = paddle.sqrt(
            paddle.sum(paddle.square(input), axis=1, keepdim=True))
-        input = paddle.divide(input, x_norm)
+        input = paddle.divide(input, input_norm)

        weight = self.fc.weight
        weight_norm = paddle.sqrt(
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
Eval:
- CELoss:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
Eval:
- CELoss:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for traing/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: ./output/
device: gpu
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 120
print_batch_step: 10
use_visualdl: False
# used for static mode and model export
image_channel: &image_channel 4
image_shape: [*image_channel, 224, 224]
save_inference_dir: ./inference
# training model under @to_static
to_static: False
# mixed precision training
AMP:
scale_loss: 128.0
use_dynamic_loss_scaling: True
use_pure_fp16: &use_pure_fp16 True
# model architecture
Arch:
name: ResNet50
class_num: 1000
input_image_channel: *image_channel
data_format: "NHWC"
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
Eval:
- CELoss:
weight: 1.0
Optimizer:
name: Momentum
momentum: 0.9
multi_precision: *use_pure_fp16
lr:
name: Piecewise
learning_rate: 0.1
decay_epochs: [30, 60, 90]
values: [0.1, 0.01, 0.001, 0.0001]
regularizer:
name: 'L2'
coeff: 0.0001
# data loader for train and eval
DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
output_fp16: *use_pure_fp16
channel_num: *image_channel
sampler:
name: DistributedBatchSampler
batch_size: 32
drop_last: False
shuffle: True
loader:
num_workers: 4
use_shared_memory: True
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
output_fp16: *use_pure_fp16
channel_num: *image_channel
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Infer:
infer_imgs: docs/images/whl/demo.jpg
batch_size: 10
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
output_fp16: *use_pure_fp16
channel_num: *image_channel
- ToCHWImage:
PostProcess:
name: Topk
topk: 5
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
Metric:
Train:
- TopkAcc:
topk: [1, 5]
Eval:
- TopkAcc:
topk: [1, 5]
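This config threads the channel count through YAML anchors: `&image_channel` defines the value once and every `*image_channel` alias reuses it, so switching between 3- and 4-channel input is a one-line change. A small sketch of the mechanism, assuming PyYAML:

```python
import yaml

snippet = """
Global:
  image_channel: &image_channel 4
  image_shape: [*image_channel, 224, 224]
Arch:
  input_image_channel: *image_channel
"""
cfg = yaml.safe_load(snippet)
# the anchor value propagates to every alias
assert cfg["Global"]["image_shape"][0] == 4
assert cfg["Arch"]["input_image_channel"] == 4
```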
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: ./output/
device: gpu
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 120
print_batch_step: 10
use_visualdl: False
image_channel: &image_channel 4
# used for static mode and model export
image_shape: [*image_channel, 224, 224]
save_inference_dir: ./inference
# training model under @to_static
to_static: False
use_dali: True
# mixed precision training
AMP:
scale_loss: 128.0
use_dynamic_loss_scaling: True
use_pure_fp16: &use_pure_fp16 False
# model architecture
Arch:
name: ResNet50
class_num: 1000
input_image_channel: *image_channel
data_format: "NHWC"
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
Eval:
- CELoss:
weight: 1.0
Optimizer:
name: Momentum
momentum: 0.9
lr:
name: Piecewise
learning_rate: 0.1
decay_epochs: [30, 60, 90]
values: [0.1, 0.01, 0.001, 0.0001]
regularizer:
name: 'L2'
coeff: 0.0001
# data loader for train and eval
DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
output_fp16: *use_pure_fp16
channel_num: *image_channel
sampler:
name: DistributedBatchSampler
batch_size: 256
drop_last: False
shuffle: True
loader:
num_workers: 4
use_shared_memory: True
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
output_fp16: *use_pure_fp16
channel_num: *image_channel
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Infer:
infer_imgs: docs/images/whl/demo.jpg
batch_size: 10
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
output_fp16: *use_pure_fp16
channel_num: *image_channel
- ToCHWImage:
PostProcess:
name: Topk
topk: 5
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
Metric:
Train:
- TopkAcc:
topk: [1, 5]
Eval:
- TopkAcc:
topk: [1, 5]
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: ./output/
device: gpu
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 200
print_batch_step: 10
use_visualdl: False
# used for static mode and model export
image_channel: &image_channel 4
image_shape: [*image_channel, 224, 224]
save_inference_dir: ./inference
# model architecture
Arch:
name: SE_ResNeXt101_32x4d
class_num: 1000
input_image_channel: *image_channel
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
epsilon: 0.1
Eval:
- CELoss:
weight: 1.0
# mixed precision training
AMP:
scale_loss: 128.0
use_dynamic_loss_scaling: True
use_pure_fp16: &use_pure_fp16 True
Optimizer:
name: Momentum
momentum: 0.9
lr:
name: Cosine
learning_rate: 0.1
regularizer:
name: 'L2'
coeff: 0.00007
# data loader for train and eval
DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
output_fp16: *use_pure_fp16
channel_num: *image_channel
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: True
loader:
num_workers: 4
use_shared_memory: True
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
output_fp16: *use_pure_fp16
channel_num: *image_channel
sampler:
name: BatchSampler
batch_size: 64
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Infer:
infer_imgs: docs/images/whl/demo.jpg
batch_size: 10
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
output_fp16: *use_pure_fp16
channel_num: *image_channel
- ToCHWImage:
PostProcess:
name: Topk
topk: 5
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
Metric:
Train:
- TopkAcc:
topk: [1, 5]
Eval:
- TopkAcc:
topk: [1, 5]
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: ./output/
device: gpu
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 120
print_batch_step: 10
use_visualdl: False
# used for static mode and model export
image_shape: [3, 224, 224]
save_inference_dir: ./inference
# training model under @to_static
to_static: False
# model architecture
Arch:
name: alt_gvt_base
class_num: 1000
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
Eval:
- CELoss:
weight: 1.0
Optimizer:
name: Momentum
momentum: 0.9
lr:
name: Piecewise
learning_rate: 0.1
decay_epochs: [30, 60, 90]
values: [0.1, 0.01, 0.001, 0.0001]
regularizer:
name: 'L2'
coeff: 0.0001
# data loader for train and eval
DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: True
loader:
num_workers: 4
use_shared_memory: True
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Infer:
infer_imgs: docs/images/whl/demo.jpg
batch_size: 10
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
PostProcess:
name: Topk
topk: 5
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
Metric:
Train:
- TopkAcc:
topk: [1, 5]
Eval:
- TopkAcc:
topk: [1, 5]
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: ./output/
device: gpu
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 120
print_batch_step: 10
use_visualdl: False
# used for static mode and model export
image_shape: [3, 224, 224]
save_inference_dir: ./inference
# training model under @to_static
to_static: False
# model architecture
Arch:
name: alt_gvt_large
class_num: 1000
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
Eval:
- CELoss:
weight: 1.0
Optimizer:
name: Momentum
momentum: 0.9
lr:
name: Piecewise
learning_rate: 0.1
decay_epochs: [30, 60, 90]
values: [0.1, 0.01, 0.001, 0.0001]
regularizer:
name: 'L2'
coeff: 0.0001
# data loader for train and eval
DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: True
loader:
num_workers: 4
use_shared_memory: True
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Infer:
infer_imgs: docs/images/whl/demo.jpg
batch_size: 10
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
PostProcess:
name: Topk
topk: 5
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
Metric:
Train:
- TopkAcc:
topk: [1, 5]
Eval:
- TopkAcc:
topk: [1, 5]
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: ./output/
device: gpu
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 120
print_batch_step: 10
use_visualdl: False
# used for static mode and model export
image_shape: [3, 224, 224]
save_inference_dir: ./inference
# training model under @to_static
to_static: False
# model architecture
Arch:
name: alt_gvt_small
class_num: 1000
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
Eval:
- CELoss:
weight: 1.0
Optimizer:
name: Momentum
momentum: 0.9
lr:
name: Piecewise
learning_rate: 0.1
decay_epochs: [30, 60, 90]
values: [0.1, 0.01, 0.001, 0.0001]
regularizer:
name: 'L2'
coeff: 0.0001
# data loader for train and eval
DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: True
loader:
num_workers: 4
use_shared_memory: True
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Infer:
infer_imgs: docs/images/whl/demo.jpg
batch_size: 10
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
PostProcess:
name: Topk
topk: 5
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
Metric:
Train:
- TopkAcc:
topk: [1, 5]
Eval:
- TopkAcc:
topk: [1, 5]
......@@ -7,7 +7,7 @@ Global:
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 10
epochs: 120
print_batch_step: 10
use_visualdl: False
# used for static mode and model export
......@@ -18,7 +18,7 @@ Global:
# model architecture
Arch:
name: ResNet50
name: pcpvt_base
class_num: 1000
# loss function config for training/eval process
......@@ -49,8 +49,8 @@ DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/chain_dataset/
cls_label_path: ./dataset/chain_dataset/train.txt
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
......@@ -77,8 +77,8 @@ DataLoader:
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/chain_dataset/
cls_label_path: ./dataset/chain_dataset/val.txt
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
......
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: ./output/
device: gpu
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 120
print_batch_step: 10
use_visualdl: False
# used for static mode and model export
image_shape: [3, 224, 224]
save_inference_dir: ./inference
# training model under @to_static
to_static: False
# model architecture
Arch:
name: pcpvt_large
class_num: 1000
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
Eval:
- CELoss:
weight: 1.0
Optimizer:
name: Momentum
momentum: 0.9
lr:
name: Piecewise
learning_rate: 0.1
decay_epochs: [30, 60, 90]
values: [0.1, 0.01, 0.001, 0.0001]
regularizer:
name: 'L2'
coeff: 0.0001
# data loader for train and eval
DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: True
loader:
num_workers: 4
use_shared_memory: True
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Infer:
infer_imgs: docs/images/whl/demo.jpg
batch_size: 10
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
PostProcess:
name: Topk
topk: 5
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
Metric:
Train:
- TopkAcc:
topk: [1, 5]
Eval:
- TopkAcc:
topk: [1, 5]
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: ./output/
device: gpu
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 120
print_batch_step: 10
use_visualdl: False
# used for static mode and model export
image_shape: [3, 224, 224]
save_inference_dir: ./inference
# training model under @to_static
to_static: False
# model architecture
Arch:
name: pcpvt_small
class_num: 1000
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
Eval:
- CELoss:
weight: 1.0
Optimizer:
name: Momentum
momentum: 0.9
lr:
name: Piecewise
learning_rate: 0.1
decay_epochs: [30, 60, 90]
values: [0.1, 0.01, 0.001, 0.0001]
regularizer:
name: 'L2'
coeff: 0.0001
# data loader for train and eval
DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: True
loader:
num_workers: 4
use_shared_memory: True
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Infer:
infer_imgs: docs/images/whl/demo.jpg
batch_size: 10
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
PostProcess:
name: Topk
topk: 5
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
Metric:
Train:
- TopkAcc:
topk: [1, 5]
Eval:
- TopkAcc:
topk: [1, 5]
......@@ -16,13 +16,13 @@ Global:
# model architecture
Arch:
name: Xception41_deeplab
name: Xception65
class_num: 1000
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
......@@ -22,7 +22,7 @@ Arch:
# loss function config for training/eval process
Loss:
Train:
- CELoss:
- MixCELoss:
weight: 1.0
epsilon: 0.1
Eval:
......
# global configs
Global:
checkpoints: null
# please download pretrained model via this link:
# https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/product_ResNet50_vd_Aliproduct_v1.0_pretrained.pdparams
pretrained_model: product_ResNet50_vd_Aliproduct_v1.0_pretrained
pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/product_ResNet50_vd_Aliproduct_v1.0_pretrained.pdparams"
output_dir: ./output/
device: gpu
save_interval: 10
......
# global configs
Global:
checkpoints: null
# please download pretrained model via this link:
# https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/product_ResNet50_vd_Aliproduct_v1.0_pretrained.pdparams
pretrained_model: product_ResNet50_vd_Aliproduct_v1.0_pretrained
pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/product_ResNet50_vd_Aliproduct_v1.0_pretrained.pdparams"
output_dir: ./output/
device: gpu
save_interval: 10
......
......@@ -53,10 +53,14 @@ def create_operators(params):
return ops
def build_dataloader(config, mode, device, seed=None):
def build_dataloader(config, mode, device, use_dali=False, seed=None):
assert mode in ['Train', 'Eval', 'Test', 'Gallery', 'Query'
], "Mode should be Train, Eval, Test, Gallery, Query"
# build dataset
if use_dali:
from ppcls.data.dataloader.dali import dali_dataloader
return dali_dataloader(config, mode, paddle.device.get_device(), seed)
config_dataset = config[mode]['dataset']
config_dataset = copy.deepcopy(config_dataset)
dataset_name = config_dataset.pop('name')
......
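The reworked `build_dataloader` dispatches to the DALI reader before any regular dataset is constructed. A hedged usage sketch (the config dict is abbreviated and illustrative; the device string must carry an index, since `dali_dataloader` parses it with `device.split(':')[1]`):

```python
# illustrative call shape only; the real config comes from the YAML files above
train_loader = build_dataloader(
    config["DataLoader"], "Train", device="gpu:0", use_dali=True)
```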
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -14,16 +14,17 @@
from __future__ import division
import copy
import os
import numpy as np
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
from nvidia.dali.plugin.paddle import DALIGenericIterator
import paddle
from paddle import fluid
from nvidia.dali import fn
from nvidia.dali.pipeline import Pipeline
from nvidia.dali.plugin.base_iterator import LastBatchPolicy
from nvidia.dali.plugin.paddle import DALIGenericIterator
class HybridTrainPipe(Pipeline):
......@@ -46,10 +47,11 @@ class HybridTrainPipe(Pipeline):
num_threads=4,
seed=42,
pad_output=False,
output_dtype=types.FLOAT):
output_dtype=types.FLOAT,
dataset='Train'):
super(HybridTrainPipe, self).__init__(
batch_size, num_threads, device_id, seed=seed)
self.input = ops.FileReader(
self.input = ops.readers.File(
file_root=file_root,
file_list=file_list,
shard_id=shard_id,
......@@ -59,9 +61,9 @@ class HybridTrainPipe(Pipeline):
# without additional reallocations
device_memory_padding = 211025920
host_memory_padding = 140544512
self.decode = ops.ImageDecoderRandomCrop(
self.decode = ops.decoders.ImageRandomCrop(
device='mixed',
output_type=types.RGB,
output_type=types.DALIImageType.RGB,
device_memory_padding=device_memory_padding,
host_memory_padding=host_memory_padding,
random_aspect_ratio=[lower, upper],
......@@ -71,15 +73,14 @@ class HybridTrainPipe(Pipeline):
device='gpu', resize_x=crop, resize_y=crop, interp_type=interp)
self.cmnp = ops.CropMirrorNormalize(
device="gpu",
output_dtype=output_dtype,
output_layout=types.NCHW,
dtype=output_dtype,
output_layout='CHW',
crop=(crop, crop),
image_type=types.RGB,
mean=mean,
std=std,
pad_output=pad_output)
self.coin = ops.CoinFlip(probability=0.5)
self.to_int64 = ops.Cast(dtype=types.INT64, device="gpu")
self.coin = ops.random.CoinFlip(probability=0.5)
self.to_int64 = ops.Cast(dtype=types.DALIDataType.INT64, device="gpu")
def define_graph(self):
rng = self.coin()
......@@ -113,25 +114,24 @@ class HybridValPipe(Pipeline):
output_dtype=types.FLOAT):
super(HybridValPipe, self).__init__(
batch_size, num_threads, device_id, seed=seed)
self.input = ops.FileReader(
self.input = ops.readers.File(
file_root=file_root,
file_list=file_list,
shard_id=shard_id,
num_shards=num_shards,
random_shuffle=random_shuffle)
self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
self.decode = ops.decoders.Image(device="mixed")
self.res = ops.Resize(
device="gpu", resize_shorter=resize_shorter, interp_type=interp)
self.cmnp = ops.CropMirrorNormalize(
device="gpu",
output_dtype=output_dtype,
output_layout=types.NCHW,
dtype=output_dtype,
output_layout='CHW',
crop=(crop, crop),
image_type=types.RGB,
mean=mean,
std=std,
pad_output=pad_output)
self.to_int64 = ops.Cast(dtype=types.INT64, device="gpu")
self.to_int64 = ops.Cast(dtype=types.DALIDataType.INT64, device="gpu")
def define_graph(self):
jpegs, labels = self.input(name="Reader")
......@@ -144,64 +144,84 @@ class HybridValPipe(Pipeline):
return self.epoch_size("Reader")
def build(config, mode='train'):
env = os.environ
assert config.get('use_gpu',
True) == True, "gpu training is required for DALI"
assert not config.get(
'use_aa'), "auto augment is not supported by DALI reader"
assert float(env.get('FLAGS_fraction_of_gpu_memory_to_use', 0.92)) < 0.9, \
"Please leave enough GPU memory for DALI workspace, e.g., by setting" \
" `export FLAGS_fraction_of_gpu_memory_to_use=0.8`"
def dali_dataloader(config, mode, device, seed=None):
assert "gpu" in device, "gpu training is required for DALI"
device_id = int(device.split(':')[1])
config_dataloader = config[mode]
seed = 42 if seed is None else seed
ops = [
list(x.keys())[0]
for x in config_dataloader["dataset"]["transform_ops"]
]
support_ops_train = [
"DecodeImage", "NormalizeImage", "RandFlipImage", "RandCropImage"
]
support_ops_eval = [
"DecodeImage", "ResizeImage", "CropImage", "NormalizeImage"
]
if mode.lower() == 'train':
assert set(ops) == set(
support_ops_train
), "The supported trasform_ops for train_dataset in dali is : {}".format(
",".join(support_ops_train))
else:
assert set(ops) == set(
support_ops_eval
), "The supported trasform_ops for eval_dataset in dali is : {}".format(
",".join(support_ops_eval))
normalize_ops = [
op for op in config_dataloader["dataset"]["transform_ops"]
if "NormalizeImage" in op
][0]["NormalizeImage"]
channel_num = normalize_ops.get("channel_num", 3)
output_dtype = types.FLOAT16 if normalize_ops.get("output_fp16",
False) else types.FLOAT
dataset_config = config[mode.upper()]
env = os.environ
# assert float(env.get('FLAGS_fraction_of_gpu_memory_to_use', 0.92)) < 0.9, \
# "Please leave enough GPU memory for DALI workspace, e.g., by setting" \
# " `export FLAGS_fraction_of_gpu_memory_to_use=0.8`"
gpu_num = paddle.fluid.core.get_cuda_device_count() if (
'PADDLE_TRAINERS_NUM') and (
'PADDLE_TRAINER_ID'
) not in env else int(env.get('PADDLE_TRAINERS_NUM', 0))
gpu_num = paddle.distributed.get_world_size()
batch_size = dataset_config.batch_size
assert batch_size % gpu_num == 0, \
"batch size must be multiple of number of devices"
batch_size = batch_size // gpu_num
batch_size = config_dataloader["sampler"]["batch_size"]
file_root = dataset_config.data_dir
file_list = dataset_config.file_list
file_root = config_dataloader["dataset"]["image_root"]
file_list = config_dataloader["dataset"]["cls_label_path"]
interp = 1 # settings.interpolation or 1 # default to linear
interp_map = {
0: types.INTERP_NN, # cv2.INTER_NEAREST
1: types.INTERP_LINEAR, # cv2.INTER_LINEAR
2: types.INTERP_CUBIC, # cv2.INTER_CUBIC
4: types.INTERP_LANCZOS3, # XXX use LANCZOS3 for cv2.INTER_LANCZOS4
0: types.DALIInterpType.INTERP_NN, # cv2.INTER_NEAREST
1: types.DALIInterpType.INTERP_LINEAR, # cv2.INTER_LINEAR
2: types.DALIInterpType.INTERP_CUBIC, # cv2.INTER_CUBIC
3: types.DALIInterpType.INTERP_LANCZOS3, # XXX use LANCZOS3 for cv2.INTER_LANCZOS4
}
output_dtype = (types.FLOAT16 if 'AMP' in config and
config.AMP.get("use_pure_fp16", False)
else types.FLOAT)
assert interp in interp_map, "interpolation method not supported by DALI"
interp = interp_map[interp]
pad_output = False
image_shape = config.get("image_shape", None)
if image_shape and image_shape[0] == 4:
pad_output = True
pad_output = channel_num == 4
transforms = {
k: v
for d in dataset_config["transforms"] for k, v in d.items()
for d in config_dataloader["dataset"]["transform_ops"]
for k, v in d.items()
}
scale = transforms["NormalizeImage"].get("scale", 1.0 / 255)
if isinstance(scale, str):
scale = eval(scale)
scale = eval(scale) if isinstance(scale, str) else scale
mean = transforms["NormalizeImage"].get("mean", [0.485, 0.456, 0.406])
std = transforms["NormalizeImage"].get("std", [0.229, 0.224, 0.225])
mean = [v / scale for v in mean]
std = [v / scale for v in std]
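DALI's `CropMirrorNormalize` consumes raw 0-255 pixel values, while the YAML configs state `mean`/`std` in the 0-1 convention, so both are divided by `scale` here. A quick arithmetic check, assuming the usual `scale: 1.0/255.0`:

```python
scale = 1.0 / 255
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
print([v / scale for v in mean])   # [123.675, 116.28, 103.53]
print([v / scale for v in std])    # [58.395, 57.12, 57.375]
```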
if mode == "train":
sampler_name = config_dataloader["sampler"].get("name",
"DistributedBatchSampler")
assert sampler_name in ["DistributedBatchSampler", "BatchSampler"]
if mode.lower() == "train":
resize_shorter = 256
crop = transforms["RandCropImage"]["size"]
scale = transforms["RandCropImage"].get("scale", [0.08, 1.])
......@@ -229,133 +249,71 @@ def build(config, mode='train'):
device_id,
shard_id,
num_shards,
seed=42 + shard_id,
seed=seed + shard_id,
pad_output=pad_output,
output_dtype=output_dtype)
pipe.build()
pipelines = [pipe]
sample_per_shard = len(pipe) // num_shards
# sample_per_shard = len(pipe) // num_shards
else:
pipelines = []
places = fluid.framework.cuda_places()
num_shards = len(places)
for idx, p in enumerate(places):
place = fluid.core.Place()
place.set_place(p)
device_id = place.gpu_device_id()
pipe = HybridTrainPipe(
file_root,
file_list,
batch_size,
resize_shorter,
crop,
min_area,
lower,
upper,
interp,
mean,
std,
device_id,
idx,
num_shards,
seed=42 + idx,
pipe = HybridTrainPipe(
file_root,
file_list,
batch_size,
resize_shorter,
crop,
min_area,
lower,
upper,
interp,
mean,
std,
device_id=device_id,
shard_id=0,
num_shards=1,
seed=seed,
pad_output=pad_output,
output_dtype=output_dtype)
pipe.build()
pipelines.append(pipe)
sample_per_shard = len(pipelines[0])
pipe.build()
pipelines = [pipe]
# sample_per_shard = len(pipelines[0])
return DALIGenericIterator(
pipelines, ['feed_image', 'feed_label'], size=sample_per_shard)
pipelines, ['data', 'label'], reader_name='Reader')
else:
resize_shorter = transforms["ResizeImage"].get("resize_short", 256)
crop = transforms["CropImage"]["size"]
if 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env and sampler_name == "DistributedBatchSampler":
shard_id = int(env['PADDLE_TRAINER_ID'])
num_shards = int(env['PADDLE_TRAINERS_NUM'])
device_id = int(env['FLAGS_selected_gpus'])
p = fluid.framework.cuda_places()[0]
place = fluid.core.Place()
place.set_place(p)
device_id = place.gpu_device_id()
pipe = HybridValPipe(
file_root,
file_list,
batch_size,
resize_shorter,
crop,
interp,
mean,
std,
device_id=device_id,
pad_output=pad_output,
output_dtype=output_dtype)
pipe = HybridValPipe(
file_root,
file_list,
batch_size,
resize_shorter,
crop,
interp,
mean,
std,
device_id=device_id,
shard_id=shard_id,
num_shards=num_shards,
pad_output=pad_output,
output_dtype=output_dtype)
else:
pipe = HybridValPipe(
file_root,
file_list,
batch_size,
resize_shorter,
crop,
interp,
mean,
std,
device_id=device_id,
pad_output=pad_output,
output_dtype=output_dtype)
pipe.build()
return DALIGenericIterator(
pipe, ['feed_image', 'feed_label'],
size=len(pipe),
dynamic_shape=True,
fill_last_batch=True,
last_batch_padded=True)
def train(config):
return build(config, 'train')
def val(config):
return build(config, 'valid')
def _to_Tensor(lod_tensor, dtype):
data_tensor = fluid.layers.create_tensor(dtype=dtype)
data = np.array(lod_tensor).astype(dtype)
fluid.layers.assign(data, data_tensor)
return data_tensor
def normalize(feeds, config):
image, label = feeds['image'], feeds['label']
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
image = fluid.layers.cast(image, 'float32')
costant = fluid.layers.fill_constant(
shape=[1], value=255.0, dtype='float32')
image = fluid.layers.elementwise_div(image, costant)
mean = fluid.layers.create_tensor(dtype="float32")
fluid.layers.assign(input=img_mean.astype("float32"), output=mean)
std = fluid.layers.create_tensor(dtype="float32")
fluid.layers.assign(input=img_std.astype("float32"), output=std)
image = fluid.layers.elementwise_sub(image, mean)
image = fluid.layers.elementwise_div(image, std)
image.stop_gradient = True
feeds['image'] = image
return feeds
def mix(feeds, config, is_train=True):
env = os.environ
gpu_num = paddle.fluid.core.get_cuda_device_count() if (
'PADDLE_TRAINERS_NUM') and (
'PADDLE_TRAINER_ID'
) not in env else int(env.get('PADDLE_TRAINERS_NUM', 0))
batch_size = config.TRAIN.batch_size // gpu_num
images = feeds['image']
label = feeds['label']
# TODO: hard code here, should be fixed!
alpha = 0.2
idx = _to_Tensor(np.random.permutation(batch_size), 'int32')
lam = np.random.beta(alpha, alpha)
images = lam * images + (1 - lam) * paddle.fluid.layers.gather(images, idx)
feed = {
'image': images,
'feed_y_a': label,
'feed_y_b': paddle.fluid.layers.gather(label, idx),
'feed_lam': _to_Tensor([lam] * batch_size, 'float32')
}
return feed if is_train else feeds
[pipe], ['data', 'label'], reader_name="Reader")
......@@ -197,14 +197,26 @@ class NormalizeImage(object):
""" normalize image such as substract mean, divide std
"""
def __init__(self, scale=None, mean=None, std=None, order='chw'):
def __init__(self,
scale=None,
mean=None,
std=None,
order='chw',
output_fp16=False,
channel_num=3):
if isinstance(scale, str):
scale = eval(scale)
assert channel_num in [
3, 4
], "channel number of input image should be set to 3 or 4."
self.channel_num = channel_num
self.output_dtype = 'float16' if output_fp16 else 'float32'
self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
self.order = order
mean = mean if mean is not None else [0.485, 0.456, 0.406]
std = std if std is not None else [0.229, 0.224, 0.225]
shape = (3, 1, 1) if order == 'chw' else (1, 1, 3)
shape = (3, 1, 1) if self.order == 'chw' else (1, 1, 3)
self.mean = np.array(mean).reshape(shape).astype('float32')
self.std = np.array(std).reshape(shape).astype('float32')
......@@ -215,7 +227,20 @@ class NormalizeImage(object):
assert isinstance(img,
np.ndarray), "invalid input 'img' in NormalizeImage"
return (img.astype('float32') * self.scale - self.mean) / self.std
img = (img.astype('float32') * self.scale - self.mean) / self.std
if self.channel_num == 4:
img_h = img.shape[1] if self.order == 'chw' else img.shape[0]
img_w = img.shape[2] if self.order == 'chw' else img.shape[1]
pad_zeros = np.zeros(
(1, img_h, img_w)) if self.order == 'chw' else np.zeros(
(img_h, img_w, 1))
img = (np.concatenate(
(img, pad_zeros), axis=0)
if self.order == 'chw' else np.concatenate(
(img, pad_zeros), axis=2))
return img.astype(self.output_dtype)
class ToCHWImage(object):
......
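The new `channel_num == 4` branch above zero-pads a normalized 3-channel image to 4 channels so it matches the 4-channel input shape used by the pure-fp16 configs. A minimal NumPy sketch of the CHW case:

```python
import numpy as np

img = np.random.rand(3, 224, 224).astype('float32')   # normalized CHW image
pad_zeros = np.zeros((1, img.shape[1], img.shape[2]), dtype='float32')
img4 = np.concatenate((img, pad_zeros), axis=0)
print(img4.shape)                                      # (4, 224, 224)
```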
......@@ -17,6 +17,7 @@ from __future__ import print_function
import os
import sys
import numpy as np
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))
......@@ -40,7 +41,7 @@ from ppcls.arch import apply_to_static
from ppcls.loss import build_loss
from ppcls.metric import build_metrics
from ppcls.optimizer import build_optimizer
from ppcls.utils.save_load import load_dygraph_pretrain
from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
from ppcls.utils.save_load import init_model
from ppcls.utils import save_load
......@@ -78,8 +79,12 @@ class Trainer(object):
apply_to_static(self.config, self.model)
if self.config["Global"]["pretrained_model"] is not None:
load_dygraph_pretrain(self.model,
self.config["Global"]["pretrained_model"])
if self.config["Global"]["pretrained_model"].startswith("http"):
load_dygraph_pretrain_from_url(
self.model, self.config["Global"]["pretrained_model"])
else:
load_dygraph_pretrain(
self.model, self.config["Global"]["pretrained_model"])
if self.config["Global"]["distributed"]:
self.model = paddle.DataParallel(self.model)
......@@ -99,10 +104,25 @@ class Trainer(object):
self.query_dataloader = None
self.eval_mode = self.config["Global"].get("eval_mode",
"classification")
self.amp = True if "AMP" in self.config else False
if self.amp and self.config["AMP"] is not None:
self.scale_loss = self.config["AMP"].get("scale_loss", 1.0)
self.use_dynamic_loss_scaling = self.config["AMP"].get(
"use_dynamic_loss_scaling", False)
else:
self.scale_loss = 1.0
self.use_dynamic_loss_scaling = False
if self.amp:
AMP_RELATED_FLAGS_SETTING = {
'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
'FLAGS_max_inplace_grad_add': 8,
}
paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
self.train_loss_func = None
self.eval_loss_func = None
self.train_metric_func = None
self.eval_metric_func = None
self.use_dali = self.config['Global'].get("use_dali", False)
def train(self):
# build train loss and metric info
......@@ -117,8 +137,8 @@ class Trainer(object):
self.train_metric_func = build_metrics(metric_config)
if self.train_dataloader is None:
self.train_dataloader = build_dataloader(self.config["DataLoader"],
"Train", self.device)
self.train_dataloader = build_dataloader(
self.config["DataLoader"], "Train", self.device, self.use_dali)
step_each_epoch = len(self.train_dataloader)
......@@ -134,7 +154,7 @@ class Trainer(object):
"metric": 0.0,
"epoch": 0,
}
# key:
# key:
# val: metrics list word
output_info = dict()
time_info = {
......@@ -152,31 +172,52 @@ class Trainer(object):
if metric_info is not None:
best_metric.update(metric_info)
# for amp training
if self.amp:
scaler = paddle.amp.GradScaler(
init_loss_scaling=self.scale_loss,
use_dynamic_loss_scaling=self.use_dynamic_loss_scaling)
tic = time.time()
max_iter = len(self.train_dataloader) - 1 if platform.system(
) == "Windows" else len(self.train_dataloader)
for epoch_id in range(best_metric["epoch"] + 1,
self.config["Global"]["epochs"] + 1):
acc = 0.0
for iter_id, batch in enumerate(self.train_dataloader()):
train_dataloader = self.train_dataloader if self.use_dali else self.train_dataloader(
)
for iter_id, batch in enumerate(train_dataloader):
if iter_id >= max_iter:
break
if iter_id == 5:
for key in time_info:
time_info[key].reset()
time_info["reader_cost"].update(time.time() - tic)
if self.use_dali:
batch = [
paddle.to_tensor(batch[0]['data']),
paddle.to_tensor(batch[0]['label'])
]
batch_size = batch[0].shape[0]
batch[1] = batch[1].reshape([-1, 1]).astype("int64")
global_step += 1
# image input
if not self.is_rec:
out = self.model(batch[0])
if self.amp:
with paddle.amp.auto_cast(custom_black_list={
"flatten_contiguous_range", "greater_than"
}):
out = self.forward(batch)
loss_dict = self.train_loss_func(out, batch[1])
else:
out = self.model(batch[0], batch[1])
out = self.forward(batch)
# calc loss
loss_dict = self.train_loss_func(out, batch[1])
if self.config["DataLoader"]["Train"]["dataset"].get(
"batch_transform_ops", None):
loss_dict = self.train_loss_func(out, batch[1:])
else:
loss_dict = self.train_loss_func(out, batch[1])
for key in loss_dict:
if not key in output_info:
......@@ -193,8 +234,13 @@ class Trainer(object):
batch_size)
# step opt and lr
loss_dict["loss"].backward()
optimizer.step()
if self.amp:
scaled = scaler.scale(loss_dict["loss"])
scaled.backward()
scaler.minimize(optimizer, scaled)
else:
loss_dict["loss"].backward()
optimizer.step()
optimizer.clear_grad()
lr_sch.step()
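The AMP branch above follows Paddle's dynamic-graph recipe: run the forward under `auto_cast`, scale the loss before `backward`, then let the scaler unscale the gradients and step the optimizer. A self-contained sketch with placeholder model and data:

```python
import paddle

model = paddle.nn.Linear(10, 2)
opt = paddle.optimizer.Momentum(learning_rate=0.1,
                                parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=128.0,
                               use_dynamic_loss_scaling=True)

data = paddle.rand([4, 10])
label = paddle.randint(0, 2, [4, 1])
with paddle.amp.auto_cast():
    loss = paddle.nn.functional.cross_entropy(model(data), label)
scaled = scaler.scale(loss)    # scale loss to avoid fp16 gradient underflow
scaled.backward()
scaler.minimize(opt, scaled)   # unscale gradients, then optimizer step
opt.clear_grad()
```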
......@@ -237,7 +283,8 @@ class Trainer(object):
step=global_step,
writer=self.vdl_writer)
tic = time.time()
if self.use_dali:
self.train_dataloader.reset()
metric_msg = ", ".join([
"{}: {:.5f}".format(key, output_info[key].avg)
for key in output_info
......@@ -307,7 +354,8 @@ class Trainer(object):
if self.eval_mode == "classification":
if self.eval_dataloader is None:
self.eval_dataloader = build_dataloader(
self.config["DataLoader"], "Eval", self.device)
self.config["DataLoader"], "Eval", self.device,
self.use_dali)
if self.eval_metric_func is None:
metric_config = self.config.get("Metric")
......@@ -321,11 +369,13 @@ class Trainer(object):
elif self.eval_mode == "retrieval":
if self.gallery_dataloader is None:
self.gallery_dataloader = build_dataloader(
self.config["DataLoader"]["Eval"], "Gallery", self.device)
self.config["DataLoader"]["Eval"], "Gallery", self.device,
self.use_dali)
if self.query_dataloader is None:
self.query_dataloader = build_dataloader(
self.config["DataLoader"]["Eval"], "Query", self.device)
self.config["DataLoader"]["Eval"], "Query", self.device,
self.use_dali)
# build metric info
if self.eval_metric_func is None:
metric_config = self.config.get("Metric", None)
......@@ -341,6 +391,13 @@ class Trainer(object):
self.model.train()
return eval_result
def forward(self, batch):
if not self.is_rec:
out = self.model(batch[0])
else:
out = self.model(batch[0], batch[1])
return out
@paddle.no_grad()
def eval_cls(self, epoch_id=0):
output_info = dict()
......@@ -354,24 +411,27 @@ class Trainer(object):
metric_key = None
tic = time.time()
eval_dataloader = self.eval_dataloader if self.use_dali else self.eval_dataloader(
)
max_iter = len(self.eval_dataloader) - 1 if platform.system(
) == "Windows" else len(self.eval_dataloader)
for iter_id, batch in enumerate(self.eval_dataloader()):
for iter_id, batch in enumerate(eval_dataloader):
if iter_id >= max_iter:
break
if iter_id == 5:
for key in time_info:
time_info[key].reset()
if self.use_dali:
batch = [
paddle.to_tensor(batch[0]['data']),
paddle.to_tensor(batch[0]['label'])
]
time_info["reader_cost"].update(time.time() - tic)
batch_size = batch[0].shape[0]
batch[0] = paddle.to_tensor(batch[0]).astype("float32")
batch[1] = batch[1].reshape([-1, 1]).astype("int64")
# image input
if self.is_rec:
out = self.model(batch[0], batch[1])
else:
out = self.model(batch[0])
out = self.forward(batch)
# calc loss
if self.eval_loss_func is not None:
loss_dict = self.eval_loss_func(out, batch[-1])
......@@ -419,7 +479,8 @@ class Trainer(object):
len(self.eval_dataloader), metric_msg, time_msg, ips_msg))
tic = time.time()
if self.use_dali:
self.eval_dataloader.reset()
metric_msg = ", ".join([
"{}: {:.5f}".format(key, output_info[key].avg)
for key in output_info
......@@ -434,7 +495,6 @@ class Trainer(object):
def eval_retrieval(self, epoch_id=0):
self.model.eval()
cum_similarity_matrix = None
# step1. build gallery
gallery_feas, gallery_img_id, gallery_unique_id = self._cal_feature(
name='gallery')
......@@ -509,14 +569,20 @@ class Trainer(object):
has_unique_id = False
max_iter = len(dataloader) - 1 if platform.system(
) == "Windows" else len(dataloader)
for idx, batch in enumerate(dataloader(
)): # load is very time-consuming
dataloader_tmp = dataloader if self.use_dali else dataloader()
for idx, batch in enumerate(
dataloader_tmp): # load is very time-consuming
if idx >= max_iter:
break
if idx % self.config["Global"]["print_batch_step"] == 0:
logger.info(
f"{name} feature calculation process: [{idx}/{len(dataloader)}]"
)
if self.use_dali:
batch = [
paddle.to_tensor(batch[0]['data']),
paddle.to_tensor(batch[0]['label'])
]
batch = [paddle.to_tensor(x) for x in batch]
batch[1] = batch[1].reshape([-1, 1]).astype("int64")
if len(batch) == 3:
......@@ -542,7 +608,8 @@ class Trainer(object):
all_image_id = paddle.concat([all_image_id, batch[1]])
if has_unique_id:
all_unique_id = paddle.concat([all_unique_id, batch[2]])
if self.use_dali:
dataloader_tmp.reset()
if paddle.distributed.get_world_size() > 1:
feat_list = []
img_id_list = []
......
......@@ -4,7 +4,7 @@ import paddle
import paddle.nn as nn
from ppcls.utils import logger
from .celoss import CELoss
from .celoss import CELoss, MixCELoss
from .googlenetloss import GoogLeNetLoss
from .centerloss import CenterLoss
from .emlloss import EmlLoss
......@@ -30,7 +30,6 @@ class CombinedLoss(nn.Layer):
assert isinstance(config_list, list), (
'operator config should be a list')
for config in config_list:
print(config)
assert isinstance(config,
dict) and len(config) == 1, "yaml format error"
name = list(config)[0]
......
......@@ -18,6 +18,10 @@ import paddle.nn.functional as F
class CELoss(nn.Layer):
"""
Cross entropy loss
"""
def __init__(self, epsilon=None):
super().__init__()
if epsilon is not None and (epsilon <= 0 or epsilon >= 1):
......@@ -50,3 +54,21 @@ class CELoss(nn.Layer):
loss = F.cross_entropy(x, label=label, soft_label=soft_label)
loss = loss.mean()
return {"CELoss": loss}
class MixCELoss(CELoss):
"""
Cross entropy loss with mix (mixup, cutmix, fmix)
"""
def __init__(self, epsilon=None):
super().__init__()
self.epsilon = epsilon
def __call__(self, input, batch):
target0, target1, lam = batch
loss0 = super().forward(input, target0)["CELoss"]
loss1 = super().forward(input, target1)["CELoss"]
loss = lam * loss0 + (1.0 - lam) * loss1
loss = paddle.mean(loss)
return {"MixCELoss": loss}
......@@ -41,7 +41,7 @@ def build_lr_scheduler(lr_config, epochs, step_each_epoch):
return lr
def build_optimizer(config, epochs, step_each_epoch, parameters):
def build_optimizer(config, epochs, step_each_epoch, parameters=None):
config = copy.deepcopy(config)
# step1 build lr
lr = build_lr_scheduler(config.pop('lr'), epochs, step_each_epoch)
......
......@@ -33,12 +33,14 @@ class Momentum(object):
learning_rate,
momentum,
weight_decay=None,
grad_clip=None):
grad_clip=None,
multi_precision=False):
super(Momentum, self).__init__()
self.learning_rate = learning_rate
self.momentum = momentum
self.weight_decay = weight_decay
self.grad_clip = grad_clip
self.multi_precision = multi_precision
def __call__(self, parameters):
opt = optim.Momentum(
......@@ -46,6 +48,7 @@ class Momentum(object):
momentum=self.momentum,
weight_decay=self.weight_decay,
grad_clip=self.grad_clip,
multi_precision=self.multi_precision,
parameters=parameters)
return opt
......@@ -60,7 +63,8 @@ class Adam(object):
weight_decay=None,
grad_clip=None,
name=None,
lazy_mode=False):
lazy_mode=False,
multi_precision=False):
self.learning_rate = learning_rate
self.beta1 = beta1
self.beta2 = beta2
......@@ -71,6 +75,7 @@ class Adam(object):
self.grad_clip = grad_clip
self.name = name
self.lazy_mode = lazy_mode
self.multi_precision = multi_precision
def __call__(self, parameters):
opt = optim.Adam(
......@@ -82,6 +87,7 @@ class Adam(object):
grad_clip=self.grad_clip,
name=self.name,
lazy_mode=self.lazy_mode,
multi_precision=self.multi_precision,
parameters=parameters)
return opt
......@@ -104,7 +110,8 @@ class RMSProp(object):
rho=0.95,
epsilon=1e-6,
weight_decay=None,
grad_clip=None):
grad_clip=None,
multi_precision=False):
super(RMSProp, self).__init__()
self.learning_rate = learning_rate
self.momentum = momentum
......@@ -122,4 +129,4 @@ class RMSProp(object):
weight_decay=self.weight_decay,
grad_clip=self.grad_clip,
parameters=parameters)
return opt
\ No newline at end of file
return opt
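`multi_precision` is now threaded through all three optimizer wrappers; when enabled, Paddle keeps FP32 master copies of the parameters so pure-fp16 training stays numerically stable. A minimal sketch against the underlying Paddle API:

```python
import paddle

model = paddle.nn.Linear(10, 10)
opt = paddle.optimizer.Momentum(
    learning_rate=0.1,
    momentum=0.9,
    multi_precision=True,              # keep fp32 master weights
    parameters=model.parameters())
```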
#!/usr/bin/env bash
export CUDA_VISIBLE_DEVICES="0,1,2,3"
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export FLAGS_fraction_of_gpu_memory_to_use=0.80
python3.7 -m paddle.distributed.launch \
--gpus="0,1,2,3" \
tools/static/train.py \
-c ./configs/ResNet/ResNet50.yaml \
-o print_interval=10 \
-o use_dali=True
--gpus="0,1,2,3,4,5,6,7" \
ppcls/static/train.py \
-c ./ppcls/configs/ImageNet/ResNet/ResNet50_fp16.yaml \
-o Global.use_dali=True
......@@ -74,9 +74,7 @@ def load_params(exe, prog, path, ignore_params=None):
raise ValueError("Model pretrain path {} does not "
"exists.".format(path))
logger.info(
logger.coloring('Loading parameters from {}...'.format(path),
'HEADER'))
logger.info("Loading parameters from {}...".format(path))
ignore_set = set()
state = _load_state(path)
......@@ -116,9 +114,7 @@ def init_model(config, program, exe):
checkpoints = config.get('checkpoints')
if checkpoints:
paddle.static.load(program, checkpoints, exe)
logger.info(
logger.coloring("Finish initing model from {}".format(checkpoints),
"HEADER"))
logger.info("Finish initing model from {}".format(checkpoints))
return
pretrained_model = config.get('pretrained_model')
......@@ -127,19 +123,17 @@ def init_model(config, program, exe):
pretrained_model = [pretrained_model]
for pretrain in pretrained_model:
load_params(exe, program, pretrain)
logger.info(
logger.coloring("Finish initing model from {}".format(
pretrained_model), "HEADER"))
logger.info("Finish initing model from {}".format(pretrained_model))
def save_model(program, model_path, epoch_id, prefix='ppcls'):
"""
save model to the target path
"""
if paddle.distributed.get_rank() != 0:
return
model_path = os.path.join(model_path, str(epoch_id))
_mkdir_if_not_exist(model_path)
model_prefix = os.path.join(model_path, prefix)
paddle.static.save(program, model_prefix)
logger.info(
logger.coloring("Already save model in {}".format(model_path),
"HEADER"))
logger.info("Already save model in {}".format(model_path))
......@@ -23,16 +23,16 @@ __dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))
from sys import version_info
import paddle
from paddle.distributed import fleet
from visualdl import LogWriter
from ppcls.data import Reader
from ppcls.utils.config import get_config
from ppcls.data import build_dataloader
from ppcls.utils.config import get_config, print_config
from ppcls.utils import logger
from tools.static import program
from save_load import init_model, save_model
from ppcls.utils.logger import init_logger
from ppcls.static.save_load import init_model, save_model
from ppcls.static import program
def parse_args():
......@@ -43,11 +43,6 @@ def parse_args():
type=str,
default='configs/ResNet/ResNet50.yaml',
help='config file path')
parser.add_argument(
'--vdl_dir',
type=str,
default=None,
help='VisualDL logging directory for image.')
parser.add_argument(
'-p',
'--profiler_options',
......@@ -66,32 +61,64 @@ def parse_args():
def main(args):
config = get_config(args.config, overrides=args.override, show=True)
if config.get("is_distributed", True):
"""
all the config of training paradigm should be in config["Global"]
"""
config = get_config(args.config, overrides=args.override, show=False)
global_config = config["Global"]
mode = "train"
log_file = os.path.join(global_config['output_dir'],
config["Arch"]["name"], f"{mode}.log")
init_logger(name='root', log_file=log_file)
print_config(config)
if global_config.get("is_distributed", True):
fleet.init(is_collective=True)
# assign the place
use_gpu = config.get("use_gpu", True)
# assign the device
use_gpu = global_config.get("use_gpu", True)
# amp related config
if 'AMP' in config:
AMP_RELATED_FLAGS_SETTING = {
'FLAGS_cudnn_exhaustive_search': 1,
'FLAGS_conv_workspace_size_limit': 1500,
'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
'FLAGS_max_inplace_grad_add': 8,
'FLAGS_cudnn_exhaustive_search': "1",
'FLAGS_conv_workspace_size_limit': "1500",
'FLAGS_cudnn_batchnorm_spatial_persistent': "1",
'FLAGS_max_inplace_grad_add': "8",
}
os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1'
paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
use_xpu = config.get("use_xpu", False)
for k in AMP_RELATED_FLAGS_SETTING:
os.environ[k] = AMP_RELATED_FLAGS_SETTING[k]
use_xpu = global_config.get("use_xpu", False)
assert (
use_gpu and use_xpu
) is not True, "gpu and xpu can not be true in the same time in static mode!"
if use_gpu:
place = paddle.set_device('gpu')
device = paddle.set_device('gpu')
elif use_xpu:
place = paddle.set_device('xpu')
device = paddle.set_device('xpu')
else:
place = paddle.set_device('cpu')
device = paddle.set_device('cpu')
# visualDL
vdl_writer = None
if global_config["use_visualdl"]:
vdl_dir = os.path.join(global_config["output_dir"], "vdl")
vdl_writer = LogWriter(vdl_dir)
# build dataloader
eval_dataloader = None
use_dali = global_config.get('use_dali', False)
train_dataloader = build_dataloader(
config["DataLoader"], "Train", device=device, use_dali=use_dali)
if global_config["eval_during_train"]:
eval_dataloader = build_dataloader(
config["DataLoader"], "Eval", device=device, use_dali=use_dali)
step_each_epoch = len(train_dataloader)
# startup_prog is used to do some parameter init work,
# and train prog is used to hold the network
......@@ -104,89 +131,71 @@ def main(args):
config,
train_prog,
startup_prog,
step_each_epoch=step_each_epoch,
is_train=True,
is_distributed=config.get("is_distributed", True))
is_distributed=global_config.get("is_distributed", True))
if config.validate:
valid_prog = paddle.static.Program()
valid_fetchs, _, valid_feeds, _ = program.build(
if global_config["eval_during_train"]:
eval_prog = paddle.static.Program()
eval_fetchs, _, eval_feeds, _ = program.build(
config,
valid_prog,
eval_prog,
startup_prog,
is_train=False,
is_distributed=config.get("is_distributed", True))
# clone to prune some content which is irrelevant in valid_prog
valid_prog = valid_prog.clone(for_test=True)
is_distributed=global_config.get("is_distributed", True))
# clone to prune some content which is irrelevant in eval_prog
eval_prog = eval_prog.clone(for_test=True)
# create the "Executor" with the statement of which place
exe = paddle.static.Executor(place)
# create the "Executor" with the statement of which device
exe = paddle.static.Executor(device)
# Parameter initialization
exe.run(startup_prog)
# load pretrained models or checkpoints
init_model(config, train_prog, exe)
init_model(global_config, train_prog, exe)
if 'AMP' in config and config.AMP.get("use_pure_fp16", False):
optimizer.amp_init(
place,
device,
scope=paddle.static.global_scope(),
test_program=valid_prog if config.validate else None)
test_program=eval_prog
if global_config["eval_during_train"] else None)
if not config.get("is_distributed", True):
if not global_config.get("is_distributed", True):
compiled_train_prog = program.compile(
config, train_prog, loss_name=train_fetchs["loss"][0].name)
else:
compiled_train_prog = train_prog
if eval_dataloader is not None:
compiled_eval_prog = program.compile(config, eval_prog)
for epoch_id in range(global_config["epochs"]):
# 1. train with train dataset
program.run(train_dataloader, exe, compiled_train_prog, train_feeds,
train_fetchs, epoch_id, 'train', config, vdl_writer,
lr_scheduler, args.profiler_options)
if paddle.distributed.get_rank() == 0:
# 2. evaluate with eval dataset
if global_config["eval_during_train"] and epoch_id % global_config[
"eval_interval"] == 0:
top1_acc = program.run(eval_dataloader, exe, compiled_eval_prog,
eval_feeds, eval_fetchs, epoch_id, "eval",
config)
if top1_acc > best_top1_acc:
best_top1_acc = top1_acc
message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
best_top1_acc, epoch_id)
logger.info(message)
if epoch_id % global_config["save_interval"] == 0:
model_path = os.path.join(global_config["output_dir"],
config["Arch"]["name"])
save_model(train_prog, model_path, "best_model")
# 3. save the persistable model
if epoch_id % global_config["save_interval"] == 0:
model_path = os.path.join(global_config["output_dir"],
config["Arch"]["name"])
save_model(train_prog, model_path, epoch_id)
if __name__ == '__main__':
......
......@@ -54,7 +54,7 @@ def load_dygraph_pretrain(model, path=None):
return
def load_dygraph_pretrain_from_url(model, pretrained_url, use_ssld=False):
if use_ssld:
pretrained_url = pretrained_url.replace("_pretrained",
"_ssld_pretrained")
......
#!/bin/bash
FILENAME=$1
# MODE must be one of ['lite_train_infer', 'whole_infer', 'whole_train_infer', 'infer']
MODE=$2
dataline=$(cat ${FILENAME})
# parser params
IFS=$'\n'
lines=(${dataline})
function func_parser_key(){
strs=$1
IFS=":"
array=(${strs})
tmp=${array[0]}
echo ${tmp}
}
function func_parser_value(){
strs=$1
IFS=":"
array=(${strs})
tmp=${array[1]}
echo ${tmp}
}
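# e.g. for a hypothetical config line "Global.epoch_num:2",
# func_parser_key echoes "Global.epoch_num" and func_parser_value echoes "2"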
function status_check(){
last_status=$1 # the exit code
run_command=$2
run_log=$3
if [ $last_status -eq 0 ]; then
echo -e "\033[33m Run successfully with command - ${run_command}! \033[0m" | tee -a ${run_log}
else
echo -e "\033[33m Run failed with command - ${run_command}! \033[0m" | tee -a ${run_log}
fi
}
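# typical usage: status_check $? "${cmd}" "${status_log}"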
IFS=$'\n'
# The training params
model_name_list=$(func_parser_value "${lines[1]}")
model_name_pact_list=$(func_parser_value "${lines[2]}")
model_name_fpgm_list=$(func_parser_value "${lines[3]}")
model_name_kl_list=$(func_parser_value "${lines[4]}")
python=$(func_parser_value "${lines[5]}")
gpu_list=$(func_parser_value "${lines[6]}")
epoch_key=$(func_parser_key "${lines[7]}")
epoch_value=$(func_parser_value "${lines[7]}")
save_model_key=$(func_parser_key "${lines[8]}")
save_model_value=$(func_parser_value "${lines[8]}")
pretrain_model_key=$(func_parser_key "${lines[9]}")
save_infer_key=$(func_parser_key "${lines[10]}")
#scripts
train_py=$(func_parser_value "${lines[20]}")
eval_py=$(func_parser_value "${lines[21]}")
norm_export=$(func_parser_value "${lines[22]}")
inference_py=$(func_parser_value "${lines[23]}")
#The inference params
use_gpu_key=$(func_parser_key "${lines[33]}")
use_gpu_list=$(func_parser_value "${lines[33]}")
use_mkldnn_key=$(func_parser_key "${lines[34]}")
use_mkldnn_list=$(func_parser_value "${lines[34]}")
cpu_threads_key=$(func_parser_key "${lines[35]}")
cpu_threads_list=$(func_parser_value "${lines[35]}")
batch_size_key=$(func_parser_key "${lines[36]}")
batch_size_list=$(func_parser_value "${lines[36]}")
use_trt_key=$(func_parser_key "${lines[37]}")
use_trt_list=$(func_parser_value "${lines[37]}")
precision_key=$(func_parser_key "${lines[38]}")
precision_list=$(func_parser_value "${lines[38]}")
infer_model_key=$(func_parser_key "${lines[39]}")
infer_model=$(func_parser_value "${lines[39]}")
image_dir_key=$(func_parser_key "${lines[40]}")
infer_img_dir=$(func_parser_value "${lines[40]}")
save_log_key=$(func_parser_key "${lines[32]}")
LOG_PATH="./test/output"
mkdir -p ${LOG_PATH}
status_log="${LOG_PATH}/results.log"
function func_inference(){
IFS='|'
_python=$1
_script=$2
_model_dir=$3
_log_path=$4
_img_dir=$5
_model_name=$6
# inference
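# CPU path sweeps MKL-DNN on/off x thread counts x batch sizes;
# GPU path sweeps TensorRT on/off x precisions x batch sizes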
for use_gpu in ${use_gpu_list[*]}; do
if [ ${use_gpu} = "False" ]; then
for use_mkldnn in ${use_mkldnn_list[*]}; do
for threads in ${cpu_threads_list[*]}; do
for batch_size in ${batch_size_list[*]}; do
_save_log_path="${_log_path}/${_model_name}_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_batchsize_${batch_size}.log"
command="${_python} ${_script} -o ${use_gpu_key}=${use_gpu} -o ${use_mkldnn_key}=${use_mkldnn} -o ${cpu_threads_key}=${threads} -o ${infer_model_key}=${_model_dir} -o ${batch_size_key}=${batch_size} -o ${image_dir_key}=${_img_dir} -o ${save_log_key}=${_save_log_path} -o benchmark=True -o Global.model_name=${_model_name}"
eval $command
status_check $? "${command}" "../${status_log}"
done
done
done
else
for use_trt in ${use_trt_list[*]}; do
for precision in ${precision_list[*]}; do
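# non-fp32 precisions (fp16/int8) require TensorRT, so skip those combinations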
if [ ${use_trt} = "False" ] && [ ${precision} != "fp32" ]; then
continue
fi
for batch_size in ${batch_size_list[*]}; do
_save_log_path="${_log_path}/${_model_name}_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log"
command="${_python} ${_script} -o ${use_gpu_key}=${use_gpu} -o ${use_trt_key}=${use_trt} -o ${precision_key}=${precision} -o ${infer_model_key}=${_model_dir} -o ${batch_size_key}=${batch_size} -o ${image_dir_key}=${_img_dir} -o ${save_log_key}=${_save_log_path} -o benchmark=True -o Global.model_name=${_model_name}"
eval $command
status_check $? "${command}" "../${status_log}"
done
done
done
fi
done
}
if [ ${MODE} != "infer" ]; then
IFS="|"
for gpu in ${gpu_list[*]}; do
use_gpu=True
if [ ${gpu} = "-1" ];then
use_gpu=False
env=""
elif [ ${#gpu} -le 1 ];then
env="export CUDA_VISIBLE_DEVICES=${gpu}"
eval ${env}
elif [ ${#gpu} -le 15 ];then
IFS=","
array=(${gpu})
env="export CUDA_VISIBLE_DEVICES=${array[0]}"
IFS="|"
else
IFS=";"
array=(${gpu})
ips=${array[0]}
gpu=${array[1]}
IFS="|"
env=" "
fi
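# gpu_list formats: "-1" = CPU, "0" = single GPU, "0,1" = multi-GPU on one
# machine, "ip1,ip2;0,1" = multi-machine ("ips;gpus")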
for model_name in ${model_name_list[*]}; do
# not set epoch when whole_train_infer
if [ ${MODE} != "whole_train_infer" ]; then
set_epoch="-o ${epoch_key}=${epoch_value}"
else
set_epoch=" "
fi
save_log="${LOG_PATH}/${model_name}_gpus_${gpu}"
# train with cpu
if [ ${gpu} = "-1" ];then
cmd="${python} ${train_py} -o Arch.name=${model_name} -o Global.device=cpu -o ${save_model_key}=${save_log} ${set_epoch}"
elif [ ${#gpu} -le 2 ];then # train with single gpu
cmd="${python} ${train_py} -o Arch.name=${model_name} -o ${save_model_key}=${save_log} ${set_epoch}"
elif [ ${#gpu} -le 15 ];then # train with multi-gpu
cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${train_py} -o Arch.name=${model_name} -o ${save_model_key}=${save_log} ${set_epoch}"
else # train with multi-machine
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${train_py} -o Arch.name=${model_name} -c ${save_model_key}=${save_log} ${set_epoch}"
fi
# run train
eval $cmd
status_check $? "${cmd}" "${status_log}"
# run eval
eval_cmd="${python} ${eval_py} -o Arch.name=${model_name} -o ${pretrain_model_key}=${save_log}/${model_name}/latest"
eval $eval_cmd
status_check $? "${eval_cmd}" "${status_log}"
# run export model
save_infer_path="${save_log}/inference"
export_cmd="${python} ${norm_export} -o Arch.name=${model_name} -o ${pretrain_model_key}=${save_log}/${model_name}/latest -o ${save_infer_key}=${save_infer_path}"
eval $export_cmd
status_check $? "${export_cmd}" "${status_log}"
#run inference
eval $env
save_infer_path="${save_log}/inference"
cd deploy
func_inference "${python}" "${inference_py}" "../${save_infer_path}" "../${LOG_PATH}" "../${infer_img_dir}" "${model_name}"
eval "unset CUDA_VISIBLE_DEVICES"
cd ..
done
done
else
GPUID=$3
if [ ${#GPUID} -le 0 ];then
env=" "
else
env="export CUDA_VISIBLE_DEVICES=${GPUID}"
fi
echo $env
# export inference model
mkdir -p inference_models
for model_name in ${model_name_list[*]}; do
export_cmd="${python} ${norm_export} -o Arch.name=${model_name} -o ${pretrain_model_key}=pretrained_models/${model_name}_pretrained -o ${save_infer_key}=./inference_models/${model_name}"
eval $export_cmd
done
#run inference
cd deploy
for model_name in ${model_name_list[*]}; do
func_inference "${python}" "${inference_py}" "../inference_models/${model_name}" "../${LOG_PATH}" "../${infer_img_dir}" "${model_name}"
done
cd ..
fi