Commit 9f39da88 authored by WuHaobo

Init PaddleClas

Parent a7337f4a
*.pyc
*.sw*
*log*
/dataset
checkpoints/
pretrained/
*.ipynb*
build/
- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
  sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
  hooks:
  - id: yapf
    files: \.py$
- repo: https://github.com/pre-commit/pre-commit-hooks
  sha: a11d9314b22d8f8c7556443875b731ef05965464
  hooks:
  - id: check-merge-conflict
  - id: check-symlinks
  - id: detect-private-key
    files: (?!.*paddle)^.*$
  - id: end-of-file-fixer
    files: \.(md|yml)$
  - id: trailing-whitespace
    files: \.(md|yml)$
- repo: https://github.com/Lucas-C/pre-commit-hooks
  sha: v1.0.1
  hooks:
  - id: forbid-crlf
    files: \.(md|yml)$
  - id: remove-crlf
    files: \.(md|yml)$
  - id: forbid-tabs
    files: \.(md|yml)$
  - id: remove-tabs
    files: \.(md|yml)$
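Everything from here down is a concatenation of per-architecture training configs: each `mode: 'train'` line begins a new YAML file, and the `architecture` field names the model it configures (the original file paths are not recorded in this dump). A minimal sketch of how one of these configs might be parsed, assuming PyYAML is available; `load_config` is an illustrative helper, not PaddleClas's actual loader:

import yaml  # PyYAML, assumed available

def load_config(path):
    # Illustrative helper, not PaddleClas's API: read one per-architecture
    # YAML file and sanity-check fields that every config below defines.
    with open(path) as f:
        cfg = yaml.safe_load(f)
    assert cfg["mode"] in ("train", "valid")
    assert cfg["classes_num"] > 0 and cfg["epochs"] > 0
    return cfg

# Hypothetical path -- the dump above does not record the real file names:
# cfg = load_config("configs/AlexNet.yaml")
# print(cfg["architecture"], cfg["LEARNING_RATE"]["function"])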
mode: 'train'
architecture: "AlexNet"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.01
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.0001
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
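The AlexNet config above uses a 'Piecewise' schedule: base lr 0.01, multiplied by gamma 0.1 at epochs 30, 60, and 90. A minimal sketch of that decay rule in plain Python (not the Paddle implementation):

def piecewise_lr(epoch, base_lr=0.01, decay_epochs=(30, 60, 90), gamma=0.1):
    # lr is multiplied by gamma once for each decay epoch already passed.
    passed = sum(1 for e in decay_epochs if epoch >= e)
    return base_lr * gamma ** passed

assert piecewise_lr(0) == 0.01                  # epochs 0-29: base lr
assert abs(piecewise_lr(30) - 0.001) < 1e-12    # decayed once at epoch 30
assert abs(piecewise_lr(95) - 1e-5) < 1e-9      # decayed three times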
mode: 'train'
architecture: 'DPN107'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
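The DPN configs enable `use_mix` with a MixupOperator (alpha 0.2) and label smoothing (`ls_epsilon: 0.1`). A sketch of the standard mixup recipe those fields point at, using numpy only; the function name is illustrative:

import numpy as np

def mixup_batch(images, labels, alpha=0.2, rng=np.random.default_rng(0)):
    # Blend each sample with a randomly chosen partner using a
    # Beta(alpha, alpha)-distributed coefficient (Zhang et al., 2017).
    lam = rng.beta(alpha, alpha)
    idx = rng.permutation(len(images))
    mixed = lam * images + (1.0 - lam) * images[idx]
    # The loss is then lam * CE(labels) + (1 - lam) * CE(labels[idx]).
    return mixed, labels, labels[idx], lam

x = np.ones((4, 3, 224, 224), dtype=np.float32)
y = np.arange(4)
mixed, ya, yb, lam = mixup_batch(x, y)
assert mixed.shape == x.shape and 0.0 <= lam <= 1.0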
mode: 'train'
architecture: 'DPN131'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'DPN68'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'DPN92'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'DPN98'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "DarkNet53"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 256, 256]
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.0001
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 256
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
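Every TRAIN section in these configs shares the same decode, random-crop, random-flip, normalize, to-CHW pipeline (DarkNet53 above just crops at 256 instead of 224). A numpy sketch of the NormalizeImage and ToCHWImage steps; the function names mirror the config keys but the code is illustrative:

import numpy as np

MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)

def normalize_image(img_hwc_uint8):
    img = img_hwc_uint8.astype(np.float32) / 255.0  # scale: 1./255.
    return (img - MEAN) / STD                       # per-channel mean/std

def to_chw(img_hwc):
    return img_hwc.transpose(2, 0, 1)               # HWC -> CHW

img = np.random.default_rng(0).integers(0, 256, (224, 224, 3), dtype=np.uint8)
out = to_chw(normalize_image(img))
assert out.shape == (3, 224, 224)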
mode: 'train'
architecture: 'DenseNet121'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
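The DenseNet configs set `use_mix: False` and `ls_epsilon: -1`, presumably meaning label smoothing is disabled, while the DPN and Res2Net configs turn it on with epsilon 0.1. A sketch of what that epsilon does to the target distribution (illustrative, not the Paddle loss op):

import numpy as np

def smooth_label(label, classes_num=1000, ls_epsilon=0.1):
    # Mix the one-hot target with a uniform distribution over all classes;
    # a negative ls_epsilon (as in the DenseNet configs) disables smoothing.
    if ls_epsilon < 0:
        ls_epsilon = 0.0
    target = np.full(classes_num, ls_epsilon / classes_num, dtype=np.float32)
    target[label] += 1.0 - ls_epsilon
    return target

t = smooth_label(3, classes_num=1000, ls_epsilon=0.1)
assert abs(t.sum() - 1.0) < 1e-5 and t[3] > t[0]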
mode: 'train'
architecture: 'DenseNet161'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'DenseNet169'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'DenseNet201'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'DenseNet264'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'HRNet_W18_C'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'HRNet_W30_C'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'HRNet_W32_C'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'HRNet_W40_C'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'HRNet_W44_C'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'HRNet_W48_C'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'HRNet_W64_C'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "GoogLeNet"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.01
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.0001
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
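GoogLeNet switches the schedule to 'Cosine' (base lr 0.01 over 120 epochs). A sketch of the usual cosine-annealing rule that function name refers to, in plain Python:

import math

def cosine_lr(epoch, base_lr=0.01, epochs=120):
    # lr follows half a cosine period from base_lr down to 0.
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * epoch / epochs))

assert cosine_lr(0) == 0.01
assert abs(cosine_lr(60) - 0.005) < 1e-12   # halfway through training
assert cosine_lr(120) < 1e-12               # annealed to (nearly) zero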
mode: 'train'
architecture: 'InceptionV4'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 299, 299]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.045
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00010
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 299
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 16
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 320
- CropImage:
size: 299
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
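All of these configs report `topk: 5` alongside top-1 accuracy (InceptionV4 above just evaluates at 299x299 instead of 224x224). A numpy sketch of the top-k metric that field configures; illustrative, not the Paddle op:

import numpy as np

def topk_accuracy(logits, labels, k=5):
    # A sample counts as correct if its true label is among the k
    # highest-scoring classes.
    topk = np.argsort(-logits, axis=1)[:, :k]
    hits = (topk == labels[:, None]).any(axis=1)
    return hits.mean()

logits = np.eye(10) + 0.01 * np.random.default_rng(0).random((10, 10))
labels = np.arange(10)
assert topk_accuracy(logits, labels, k=5) == 1.0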
mode: 'train'
architecture: "MobileNetV1"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00003
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV1_x0_25"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00003
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV1_x0_5"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00003
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV1_x0_75"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00003
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV2"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 240
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.045
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00004
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV2_x0_25"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 240
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.045
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00003
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
ratio: [1.0, 1.0]
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV2_x0_5"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 240
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.045
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00003
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
ratio: [1.0, 1.0]
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV2_x0_75"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 240
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.045
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00004
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV2_x1_5"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 240
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.045
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00004
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV2_x2_0"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 240
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.045
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00004
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV3_large_x0_35"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
ls_epsilon: 0.1
validate: True
valid_interval: 1
epochs: 360
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 2.6
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00002
TRAIN:
batch_size: 4096
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
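The MobileNetV3 configs pair very large batches (up to 4096) with correspondingly large base lrs (2.6, roughly the linear-scaling rule) and a 'CosineWarmup' schedule: 5 warmup epochs, then cosine decay. A sketch of that schedule, assuming per-epoch linear warmup (the real implementation may warm up per step):

import math

def cosine_warmup_lr(epoch, base_lr=2.6, warmup_epoch=5, epochs=360):
    if epoch < warmup_epoch:
        # Linear ramp from base_lr / warmup_epoch up to base_lr.
        return base_lr * (epoch + 1) / warmup_epoch
    # Cosine annealing over the remaining epochs.
    t = (epoch - warmup_epoch) / (epochs - warmup_epoch)
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * t))

assert cosine_warmup_lr(0) == 2.6 / 5
assert cosine_warmup_lr(4) == 2.6      # warmup complete
assert cosine_warmup_lr(359) < 1e-3    # nearly annealed away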
mode: 'train'
architecture: "MobileNetV3_large_x0_5"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
ls_epsilon: 0.1
validate: True
valid_interval: 1
epochs: 360
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 1.3
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00002
TRAIN:
batch_size: 2048
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV3_large_x0_75"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
ls_epsilon: 0.1
validate: True
valid_interval: 1
epochs: 360
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 1.3
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00002
TRAIN:
batch_size: 2048
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV3_large_x1_0"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
ls_epsilon: 0.1
validate: True
valid_interval: 1
epochs: 360
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 2.6
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00002
TRAIN:
batch_size: 4096
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- ImageNetPolicy:
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 32
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
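MobileNetV3_large_x1_0 additionally inserts an `ImageNetPolicy` transform between the flip and normalize steps, i.e. the AutoAugment policy learned for ImageNet. A structural sketch of how such a policy is applied; the sub-policies below are placeholders, not the real learned policy:

import random

# Placeholder sub-policies as (op name, probability, magnitude) tuples;
# these are NOT the learned AutoAugment policy, just stand-ins.
SUB_POLICIES = [
    [("posterize", 0.4, 8), ("rotate", 0.6, 9)],
    [("solarize", 0.6, 5), ("autocontrast", 0.6, 5)],
]

def image_net_policy(img, apply_op, rng=random.Random(0)):
    # Pick one sub-policy at random; each op fires with its own probability.
    # `apply_op(img, name, magnitude)` is assumed to do the pixel work
    # (e.g. via PIL.ImageOps) and is injected rather than defined here.
    for name, prob, magnitude in rng.choice(SUB_POLICIES):
        if rng.random() < prob:
            img = apply_op(img, name, magnitude)
    return img

# Runs with a no-op operator just to show the control flow:
assert image_net_policy("img", lambda im, name, mag: im) == "img"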
mode: 'train'
architecture: "MobileNetV3_large_x1_25"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
ls_epsilon: 0.1
validate: True
valid_interval: 1
epochs: 360
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 0.65
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00004
TRAIN:
batch_size: 1024
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV3_small_x0_35"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 360
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 2.6
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00001
TRAIN:
batch_size: 4096
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV3_small_x0_5"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
ls_epsilon: 0.1
validate: True
valid_interval: 1
epochs: 360
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 2.6
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00001
TRAIN:
batch_size: 4096
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV3_small_x0_75"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
ls_epsilon: 0.1
validate: True
valid_interval: 1
epochs: 360
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 2.6
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00002
TRAIN:
batch_size: 4096
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV3_small_x1_0"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
ls_epsilon: 0.1
validate: True
valid_interval: 1
epochs: 360
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 2.6
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00002
TRAIN:
batch_size: 4096
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "MobileNetV3_small_x1_25"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
ls_epsilon: 0.1
validate: True
valid_interval: 1
epochs: 360
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 1.3
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00002
TRAIN:
batch_size: 2048
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'Res2Net101_vd_26w_4s'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'Res2Net200_vd_26w_4s'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'Res2Net50_14w_8s'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'Res2Net50_26w_4s'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'Res2Net50_vd_26w_4s'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
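Every OPTIMIZER block in these configs is the same: SGD with momentum 0.9 plus an L2 regularizer, where `factor` is the weight-decay coefficient (0.0001, 0.00010, and 0.000100 are the same value written three ways). A numpy sketch of the update rule those fields configure:

import numpy as np

def momentum_step(w, grad, vel, lr, momentum=0.9, factor=1e-4):
    # L2 regularization adds factor * w to the gradient; the result is
    # folded into a velocity buffer, which drives the weight update.
    grad = grad + factor * w
    vel = momentum * vel + grad
    return w - lr * vel, vel

w = np.ones(3)
vel = np.zeros(3)
w, vel = momentum_step(w, np.full(3, 0.5), vel, lr=0.1)
assert np.allclose(w, 1.0 - 0.1 * (0.5 + 1e-4))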
mode: 'train'
architecture: 'ResNeXt101_32x4d'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNeXt101_64x4d'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000150
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNeXt101_vd_32x4d'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNeXt101_vd_64x4d'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNeXt152_32x4d'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNeXt152_64x4d'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000180
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNeXt152_vd_32x4d'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNeXt152_vd_64x4d'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNeXt50_32x4d'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "ResNeXt50_64x4d"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.0001
TRAIN:
batch_size: 32
num_workers: 8
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "ResNeXt50_vd_32x4d"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 0.1
    warmup_epoch: 5  # assumed, matching the other 'CosineWarmup' configs; 'decay_epochs'/'gamma' are 'Piecewise'-only keys
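    # Hedged note: 'CosineWarmup' is assumed to ramp lr linearly from 0 to
    # 0.1 over warmup_epoch epochs, then cosine-anneal it to 0 over the rest.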
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.0001
TRAIN:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNeXt50_vd_64x4d'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNet101'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNet101_vd'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNet152'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNet152_vd'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNet18'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNet18_vd'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000070
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNet200_vd'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNet34'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNet34_vd'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000070
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNet50'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNet50_vc'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'ResNet50_vd'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000070
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "ResNet_ACNet"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Piecewise'
params:
lr: 0.1
decay_epochs: [30, 60, 90]
gamma: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.0001
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'SENet154_vd'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'SE_ResNeXt101_32x4d'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000015
TRAIN:
batch_size: 400
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'SE_ResNeXt50_32x4d'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: False
ls_epsilon: -1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000120
TRAIN:
batch_size: 400
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'SE_ResNeXt50_vd_32x4d'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'SE_ResNet18_vd'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000070
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'SE_ResNet34_vd'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000070
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: 'SE_ResNet50_vd'
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 200
topk: 5
image_shape: [3, 224, 224]
use_mix: True
ls_epsilon: 0.1
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.000100
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mix:
- MixupOperator:
alpha: 0.2
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "ShuffleNetV2"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 240
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 0.5
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00004
TRAIN:
batch_size: 1024
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "ShuffleNetV2_swish"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 240
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 0.5
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00004
TRAIN:
batch_size: 1024
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "ShuffleNetV2_x0_25"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 240
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 0.5
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00003
TRAIN:
batch_size: 1024
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
scale: [0.64, 1.0]
ratio: [0.8, 1.2]
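        # Hedged note: scale/ratio are assumed to bound the random resized
        # crop: sampled area in [0.64, 1.0] of the image and aspect ratio in
        # [0.8, 1.2], a gentler crop than the common [0.08, 1.0] default.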
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "ShuffleNetV2_x0_33"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 240
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 0.5
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00003
TRAIN:
batch_size: 1024
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
scale: [0.64, 1.0]
ratio: [0.8, 1.2]
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "ShuffleNetV2_x0_5"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 240
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 0.5
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00003
TRAIN:
batch_size: 1024
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
scale: [0.64, 1.0]
ratio: [0.8, 1.2]
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "ShuffleNetV2_x1_5"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 240
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 0.25
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00004
TRAIN:
batch_size: 512
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
ratio: [1.0, 1.0]
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "ShuffleNetV2_x2_0"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 240
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'CosineWarmup'
params:
lr: 0.25
warmup_epoch: 5
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.00004
TRAIN:
batch_size: 512
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "SqueezeNet1_0"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.02
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.0001
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "SqueezeNet1_1"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 120
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.02
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.0001
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "VGG11"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 90
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.1
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.0004
TRAIN:
batch_size: 512
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "VGG13"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 90
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.01
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.0003
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "VGG16"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 90
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.01
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.0004
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
VALID:
batch_size: 64
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'train'
architecture: "VGG19"
pretrained_model: ""
model_save_dir: "./checkpoints/"
classes_num: 1000
total_images: 1281167
save_interval: 1
validate: True
valid_interval: 1
epochs: 150
topk: 5
image_shape: [3, 224, 224]
LEARNING_RATE:
function: 'Cosine'
params:
lr: 0.01
OPTIMIZER:
function: 'Momentum'
params:
momentum: 0.9
regularizer:
function: 'L2'
factor: 0.0004
TRAIN:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
mode: 'valid'
architecture: ""
pretrained_model: ""
classes_num: 1000
total_images: 1281167
topk: 5
image_shape: [3, 224, 224]
VALID:
batch_size: 16
num_workers: 4
file_list: "../dataset/ILSVRC2012/val_list.txt"
data_dir: "../dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
to_np: False
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
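A minimal sketch of consuming one of the YAML files above (hedged: load_config and the path are illustrative, not the project's actual loader, which lives elsewhere in the repo):

import yaml

def load_config(path):
    """Parse one of the training configs above into a plain dict."""
    with open(path) as f:
        return yaml.safe_load(f)

# Hypothetical path; the repo's real config layout may differ.
cfg = load_config("configs/ResNet50_vd.yaml")
assert cfg["LEARNING_RATE"]["function"] in ("Cosine", "CosineWarmup", "Piecewise")
print(cfg["TRAIN"]["batch_size"])  # 256 in the ResNet50_vd config above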
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import optimizer
from .modeling import *
from .optimizer import *
from .data import *
from .utils import *
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .reader import Reader
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .autoaugment import ImageNetPolicy as RawImageNetPolicy
from .randaugment import RandAugment as RawRandAugment
from .cutout import Cutout
from .hide_and_seek import HideAndSeek
from .random_erasing import RandomErasing
from .grid import GridMask
from .operators import DecodeImage
from .operators import ResizeImage
from .operators import CropImage
from .operators import RandCropImage
from .operators import RandFlipImage
from .operators import NormalizeImage
from .operators import ToCHWImage
from .batch_operators import MixupOperator
from .batch_operators import CutmixOperator
from .batch_operators import FmixOperator
import six
import numpy as np
from PIL import Image
def transform(data, ops=None):
    """ Apply each operator in ops to data, in order """
    for op in ops or []:
        data = op(data)
    return data
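# Usage sketch (hedged): ops mirrors a TRAIN 'transforms' section from the
# configs above; the reader is assumed to build each operator with the YAML
# keys as keyword arguments, e.g.:
#
#     ops = [
#         DecodeImage(to_rgb=True, to_np=False, channel_first=False),
#         RandCropImage(size=224),
#         RandFlipImage(flip_code=1),
#         ToCHWImage(),
#     ]
#     img = transform(raw_jpeg_bytes, ops)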
class ImageNetPolicy(RawImageNetPolicy):
""" ImageNetPolicy wrapper to auto fit different img types """
def __init__(self, *args, **kwargs):
if six.PY2:
super(ImageNetPolicy, self).__init__(*args, **kwargs)
else:
super().__init__(*args, **kwargs)
def __call__(self, img):
if not isinstance(img, Image.Image):
img = np.ascontiguousarray(img)
img = Image.fromarray(img)
if six.PY2:
img = super(ImageNetPolicy, self).__call__(img)
else:
img = super().__call__(img)
if isinstance(img, Image.Image):
img = np.asarray(img)
return img
class RandAugment(RawRandAugment):
""" RandAugment wrapper to auto fit different img types """
def __init__(self, *args, **kwargs):
if six.PY2:
super(RandAugment, self).__init__(*args, **kwargs)
else:
super().__init__(*args, **kwargs)
def __call__(self, img):
if not isinstance(img, Image.Image):
img = np.ascontiguousarray(img)
img = Image.fromarray(img)
if six.PY2:
img = super(RandAugment, self).__call__(img)
else:
img = super().__call__(img)
if isinstance(img, Image.Image):
img = np.asarray(img)
return img
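# Usage sketch (hedged): both wrappers accept either a PIL.Image or an
# ndarray and always hand back an ndarray, so they can sit anywhere in the
# ndarray-based operator pipeline above:
#
#     aug = RandAugment()          # ctor args are forwarded to RawRandAugment
#     out = aug(decoded_ndarray)   # ndarray in -> ndarray out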
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This code is based on https://github.com/DeepVoltaire/AutoAugment/blob/master/autoaugment.py
from PIL import Image, ImageEnhance, ImageOps
import numpy as np
import random
class ImageNetPolicy(object):
""" Randomly choose one of the best 24 Sub-policies on ImageNet.
Example:
>>> policy = ImageNetPolicy()
>>> transformed = policy(image)
Example as a PyTorch Transform:
>>> transform=transforms.Compose([
>>> transforms.Resize(256),
>>> ImageNetPolicy(),
>>> transforms.ToTensor()])
"""
def __init__(self, fillcolor=(128, 128, 128)):
self.policies = [
SubPolicy(0.4, "posterize", 8, 0.6, "rotate", 9, fillcolor),
SubPolicy(0.6, "solarize", 5, 0.6, "autocontrast", 5, fillcolor),
SubPolicy(0.8, "equalize", 8, 0.6, "equalize", 3, fillcolor),
SubPolicy(0.6, "posterize", 7, 0.6, "posterize", 6, fillcolor),
SubPolicy(0.4, "equalize", 7, 0.2, "solarize", 4, fillcolor),
SubPolicy(0.4, "equalize", 4, 0.8, "rotate", 8, fillcolor),
SubPolicy(0.6, "solarize", 3, 0.6, "equalize", 7, fillcolor),
SubPolicy(0.8, "posterize", 5, 1.0, "equalize", 2, fillcolor),
SubPolicy(0.2, "rotate", 3, 0.6, "solarize", 8, fillcolor),
SubPolicy(0.6, "equalize", 8, 0.4, "posterize", 6, fillcolor),
SubPolicy(0.8, "rotate", 8, 0.4, "color", 0, fillcolor),
SubPolicy(0.4, "rotate", 9, 0.6, "equalize", 2, fillcolor),
SubPolicy(0.0, "equalize", 7, 0.8, "equalize", 8, fillcolor),
SubPolicy(0.6, "invert", 4, 1.0, "equalize", 8, fillcolor),
SubPolicy(0.6, "color", 4, 1.0, "contrast", 8, fillcolor),
SubPolicy(0.8, "rotate", 8, 1.0, "color", 2, fillcolor),
SubPolicy(0.8, "color", 8, 0.8, "solarize", 7, fillcolor),
SubPolicy(0.4, "sharpness", 7, 0.6, "invert", 8, fillcolor),
SubPolicy(0.6, "shearX", 5, 1.0, "equalize", 9, fillcolor),
SubPolicy(0.4, "color", 0, 0.6, "equalize", 3, fillcolor),
SubPolicy(0.4, "equalize", 7, 0.2, "solarize", 4, fillcolor),
SubPolicy(0.6, "solarize", 5, 0.6, "autocontrast", 5, fillcolor),
SubPolicy(0.6, "invert", 4, 1.0, "equalize", 8, fillcolor),
SubPolicy(0.6, "color", 4, 1.0, "contrast", 8, fillcolor),
SubPolicy(0.8, "equalize", 8, 0.6, "equalize", 3, fillcolor)
]
def __call__(self, img, policy_idx=None):
if policy_idx is None or not isinstance(policy_idx, int):
policy_idx = random.randint(0, len(self.policies) - 1)
else:
policy_idx = policy_idx % len(self.policies)
return self.policies[policy_idx](img)
def __repr__(self):
return "AutoAugment ImageNet Policy"
class CIFAR10Policy(object):
""" Randomly choose one of the best 25 Sub-policies on CIFAR10.
Example:
>>> policy = CIFAR10Policy()
>>> transformed = policy(image)
Example as a PyTorch Transform:
>>> transform=transforms.Compose([
>>> transforms.Resize(256),
>>> CIFAR10Policy(),
>>> transforms.ToTensor()])
"""
def __init__(self, fillcolor=(128, 128, 128)):
self.policies = [
SubPolicy(0.1, "invert", 7, 0.2, "contrast", 6, fillcolor),
SubPolicy(0.7, "rotate", 2, 0.3, "translateX", 9, fillcolor),
SubPolicy(0.8, "sharpness", 1, 0.9, "sharpness", 3, fillcolor),
SubPolicy(0.5, "shearY", 8, 0.7, "translateY", 9, fillcolor),
SubPolicy(0.5, "autocontrast", 8, 0.9, "equalize", 2, fillcolor),
SubPolicy(0.2, "shearY", 7, 0.3, "posterize", 7, fillcolor),
SubPolicy(0.4, "color", 3, 0.6, "brightness", 7, fillcolor),
SubPolicy(0.3, "sharpness", 9, 0.7, "brightness", 9, fillcolor),
SubPolicy(0.6, "equalize", 5, 0.5, "equalize", 1, fillcolor),
SubPolicy(0.6, "contrast", 7, 0.6, "sharpness", 5, fillcolor),
SubPolicy(0.7, "color", 7, 0.5, "translateX", 8, fillcolor),
SubPolicy(0.3, "equalize", 7, 0.4, "autocontrast", 8, fillcolor),
SubPolicy(0.4, "translateY", 3, 0.2, "sharpness", 6, fillcolor),
SubPolicy(0.9, "brightness", 6, 0.2, "color", 8, fillcolor),
SubPolicy(0.5, "solarize", 2, 0.0, "invert", 3, fillcolor),
SubPolicy(0.2, "equalize", 0, 0.6, "autocontrast", 0, fillcolor),
SubPolicy(0.2, "equalize", 8, 0.8, "equalize", 4, fillcolor),
SubPolicy(0.9, "color", 9, 0.6, "equalize", 6, fillcolor),
SubPolicy(0.8, "autocontrast", 4, 0.2, "solarize", 8, fillcolor),
SubPolicy(0.1, "brightness", 3, 0.7, "color", 0, fillcolor),
SubPolicy(0.4, "solarize", 5, 0.9, "autocontrast", 3, fillcolor),
SubPolicy(0.9, "translateY", 9, 0.7, "translateY", 9, fillcolor),
SubPolicy(0.9, "autocontrast", 2, 0.8, "solarize", 3, fillcolor),
SubPolicy(0.8, "equalize", 8, 0.1, "invert", 3, fillcolor),
SubPolicy(0.7, "translateY", 9, 0.9, "autocontrast", 1, fillcolor)
]
def __call__(self, img, policy_idx=None):
if policy_idx is None or not isinstance(policy_idx, int):
policy_idx = random.randint(0, len(self.policies) - 1)
else:
policy_idx = policy_idx % len(self.policies)
return self.policies[policy_idx](img)
def __repr__(self):
return "AutoAugment CIFAR10 Policy"
class SVHNPolicy(object):
""" Randomly choose one of the best 25 Sub-policies on SVHN.
Example:
>>> policy = SVHNPolicy()
>>> transformed = policy(image)
Example as a PyTorch Transform:
>>> transform=transforms.Compose([
>>> transforms.Resize(256),
>>> SVHNPolicy(),
>>> transforms.ToTensor()])
"""
def __init__(self, fillcolor=(128, 128, 128)):
self.policies = [
SubPolicy(0.9, "shearX", 4, 0.2, "invert", 3, fillcolor),
SubPolicy(0.9, "shearY", 8, 0.7, "invert", 5, fillcolor),
SubPolicy(0.6, "equalize", 5, 0.6, "solarize", 6, fillcolor),
SubPolicy(0.9, "invert", 3, 0.6, "equalize", 3, fillcolor),
SubPolicy(0.6, "equalize", 1, 0.9, "rotate", 3, fillcolor),
SubPolicy(0.9, "shearX", 4, 0.8, "autocontrast", 3, fillcolor),
SubPolicy(0.9, "shearY", 8, 0.4, "invert", 5, fillcolor),
SubPolicy(0.9, "shearY", 5, 0.2, "solarize", 6, fillcolor),
SubPolicy(0.9, "invert", 6, 0.8, "autocontrast", 1, fillcolor),
SubPolicy(0.6, "equalize", 3, 0.9, "rotate", 3, fillcolor),
SubPolicy(0.9, "shearX", 4, 0.3, "solarize", 3, fillcolor),
SubPolicy(0.8, "shearY", 8, 0.7, "invert", 4, fillcolor),
SubPolicy(0.9, "equalize", 5, 0.6, "translateY", 6, fillcolor),
SubPolicy(0.9, "invert", 4, 0.6, "equalize", 7, fillcolor),
SubPolicy(0.3, "contrast", 3, 0.8, "rotate", 4, fillcolor),
SubPolicy(0.8, "invert", 5, 0.0, "translateY", 2, fillcolor),
SubPolicy(0.7, "shearY", 6, 0.4, "solarize", 8, fillcolor),
SubPolicy(0.6, "invert", 4, 0.8, "rotate", 4, fillcolor),
            SubPolicy(0.3, "shearY", 7, 0.9, "translateX", 3, fillcolor),
            SubPolicy(0.1, "shearX", 6, 0.6, "invert", 5, fillcolor),
            SubPolicy(0.7, "solarize", 2, 0.6, "translateY", 7, fillcolor),
            SubPolicy(0.8, "shearY", 4, 0.8, "invert", 8, fillcolor),
            SubPolicy(0.7, "shearX", 9, 0.8, "translateY", 3, fillcolor),
            SubPolicy(0.8, "shearY", 5, 0.7, "autocontrast", 3, fillcolor),
            SubPolicy(0.7, "shearX", 2, 0.1, "invert", 5, fillcolor)
]
def __call__(self, img, policy_idx=None):
if policy_idx is None or not isinstance(policy_idx, int):
policy_idx = random.randint(0, len(self.policies) - 1)
else:
policy_idx = policy_idx % len(self.policies)
return self.policies[policy_idx](img)
def __repr__(self):
return "AutoAugment SVHN Policy"
class SubPolicy(object):
def __init__(self,
p1,
operation1,
magnitude_idx1,
p2,
operation2,
magnitude_idx2,
fillcolor=(128, 128, 128)):
ranges = {
"shearX": np.linspace(0, 0.3, 10),
"shearY": np.linspace(0, 0.3, 10),
"translateX": np.linspace(0, 150 / 331, 10),
"translateY": np.linspace(0, 150 / 331, 10),
"rotate": np.linspace(0, 30, 10),
"color": np.linspace(0.0, 0.9, 10),
"posterize": np.round(np.linspace(8, 4, 10), 0).astype(np.int),
"solarize": np.linspace(256, 0, 10),
"contrast": np.linspace(0.0, 0.9, 10),
"sharpness": np.linspace(0.0, 0.9, 10),
"brightness": np.linspace(0.0, 0.9, 10),
"autocontrast": [0] * 10,
"equalize": [0] * 10,
"invert": [0] * 10
}
# from https://stackoverflow.com/questions/5252170/specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand
def rotate_with_fill(img, magnitude):
rot = img.convert("RGBA").rotate(magnitude)
return Image.composite(rot,
Image.new("RGBA", rot.size, (128, ) * 4),
rot).convert(img.mode)
func = {
"shearX": lambda img, magnitude: img.transform(
img.size, Image.AFFINE, (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0),
Image.BICUBIC, fillcolor=fillcolor),
"shearY": lambda img, magnitude: img.transform(
img.size, Image.AFFINE, (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0),
Image.BICUBIC, fillcolor=fillcolor),
"translateX": lambda img, magnitude: img.transform(
img.size, Image.AFFINE, (1, 0, magnitude * img.size[0] * random.choice([-1, 1]), 0, 1, 0),
fillcolor=fillcolor),
"translateY": lambda img, magnitude: img.transform(
img.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude * img.size[1] * random.choice([-1, 1])),
fillcolor=fillcolor),
"rotate": lambda img, magnitude: rotate_with_fill(img, magnitude),
# "rotate": lambda img, magnitude: img.rotate(magnitude * random.choice([-1, 1])),
"color": lambda img, magnitude: ImageEnhance.Color(img).enhance(1 + magnitude * random.choice([-1, 1])),
"posterize": lambda img, magnitude: ImageOps.posterize(img, magnitude),
"solarize": lambda img, magnitude: ImageOps.solarize(img, magnitude),
"contrast": lambda img, magnitude: ImageEnhance.Contrast(img).enhance(
1 + magnitude * random.choice([-1, 1])),
"sharpness": lambda img, magnitude: ImageEnhance.Sharpness(img).enhance(
1 + magnitude * random.choice([-1, 1])),
"brightness": lambda img, magnitude: ImageEnhance.Brightness(img).enhance(
1 + magnitude * random.choice([-1, 1])),
"autocontrast": lambda img, magnitude: ImageOps.autocontrast(img),
"equalize": lambda img, magnitude: ImageOps.equalize(img),
"invert": lambda img, magnitude: ImageOps.invert(img)
}
self.p1 = p1
self.operation1 = func[operation1]
self.magnitude1 = ranges[operation1][magnitude_idx1]
self.p2 = p2
self.operation2 = func[operation2]
self.magnitude2 = ranges[operation2][magnitude_idx2]
def __call__(self, img):
if random.random() < self.p1:
img = self.operation1(img, self.magnitude1)
if random.random() < self.p2:
img = self.operation2(img, self.magnitude2)
return img
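# Worked example (a sketch): the first ImageNet sub-policy above,
# SubPolicy(0.4, "posterize", 8, 0.6, "rotate", 9), posterizes with
# probability 0.4 at magnitude index 8, then rotates with probability 0.6 at
# magnitude index 9, both magnitudes looked up in the `ranges` table.
if __name__ == "__main__":
    policy = ImageNetPolicy()
    img = Image.new("RGB", (224, 224), (128, 128, 128))
    out = policy(img)                # random sub-policy
    out = policy(img, policy_idx=0)  # deterministic: first sub-policy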
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
from .fmix import sample_mask
class BatchOperator(object):
""" BatchOperator """
def __init__(self, *args, **kwargs):
pass
def _unpack(self, batch):
""" _unpack """
assert isinstance(batch, list), \
'batch should be a list filled with tuples (img, label)'
bs = len(batch)
assert bs > 0, 'size of the batch data should > 0'
imgs, labels = list(zip(*batch))
return np.array(imgs), np.array(labels), bs
def __call__(self, batch):
return batch
class MixupOperator(BatchOperator):
""" Mixup operator """
def __init__(self, alpha=0.2):
        assert alpha > 0., \
            'parameter alpha[%f] should be > 0.0' % (alpha)
self._alpha = alpha
def __call__(self, batch):
imgs, labels, bs = self._unpack(batch)
idx = np.random.permutation(bs)
lam = np.random.beta(self._alpha, self._alpha)
imgs = lam * imgs + (1 - lam) * imgs[idx]
return list(zip(imgs, labels, labels[idx], [lam] * bs))
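# Sketch on a toy batch: each mixed sample carries both labels plus the mixing
# coefficient lam drawn from Beta(alpha, alpha).
if __name__ == "__main__":
    toy_batch = [(np.full((3, 4, 4), float(i)), i) for i in range(4)]
    img0, label_a, label_b, lam = MixupOperator(alpha=0.2)(toy_batch)[0]
    print(label_a, label_b, lam)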
class CutmixOperator(BatchOperator):
""" Cutmix operator """
def __init__(self, alpha=0.2):
        assert alpha > 0., \
            'parameter alpha[%f] should be > 0.0' % (alpha)
self._alpha = alpha
def _rand_bbox(self, size, lam):
""" _rand_bbox """
w = size[2]
h = size[3]
cut_rat = np.sqrt(1. - lam)
        cut_w = int(w * cut_rat)
        cut_h = int(h * cut_rat)
# uniform
cx = np.random.randint(w)
cy = np.random.randint(h)
bbx1 = np.clip(cx - cut_w // 2, 0, w)
bby1 = np.clip(cy - cut_h // 2, 0, h)
bbx2 = np.clip(cx + cut_w // 2, 0, w)
bby2 = np.clip(cy + cut_h // 2, 0, h)
return bbx1, bby1, bbx2, bby2
def __call__(self, batch):
imgs, labels, bs = self._unpack(batch)
idx = np.random.permutation(bs)
lam = np.random.beta(self._alpha, self._alpha)
bbx1, bby1, bbx2, bby2 = self._rand_bbox(imgs.shape, lam)
imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2]
lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) /
(imgs.shape[-2] * imgs.shape[-1]))
return list(zip(imgs, labels, labels[idx], [lam] * bs))
class FmixOperator(BatchOperator):
""" Fmix operator """
def __init__(self, alpha=1, decay_power=3, max_soft=0., reformulate=False):
self._alpha = alpha
self._decay_power = decay_power
self._max_soft = max_soft
self._reformulate = reformulate
def __call__(self, batch):
imgs, labels, bs = self._unpack(batch)
idx = np.random.permutation(bs)
size = (imgs.shape[2], imgs.shape[3])
lam, mask = sample_mask(self._alpha, self._decay_power, \
size, self._max_soft, self._reformulate)
imgs = mask * imgs + (1 - mask) * imgs[idx]
return list(zip(imgs, labels, labels[idx], [lam] * bs))
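# All three batch operators share the (img, label_a, label_b, lam) output
# convention, so a mix-aware loss can treat them uniformly, e.g.
# loss = lam * ce(pred, label_a) + (1 - lam) * ce(pred, label_b).
# A minimal sketch on a random toy batch:
if __name__ == "__main__":
    toy_batch = [(np.random.rand(3, 32, 32).astype("float32"), i)
                 for i in range(8)]
    for op in (MixupOperator(0.2), CutmixOperator(0.2), FmixOperator()):
        _, label_a, label_b, lam = op(toy_batch)[0]
        print(type(op).__name__, lam)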
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import random
class Cutout(object):
def __init__(self, n_holes=1, length=112):
self.n_holes = n_holes
self.length = length
def __call__(self, img):
""" cutout_image """
        h, w = img.shape[:2]
for n in range(self.n_holes):
y = np.random.randint(h)
x = np.random.randint(w)
y1 = np.clip(y - self.length // 2, 0, h)
y2 = np.clip(y + self.length // 2, 0, h)
x1 = np.clip(x - self.length // 2, 0, w)
x2 = np.clip(x + self.length // 2, 0, w)
img[y1:y2, x1:x2] = 0
return img
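# Sketch: Cutout zeroes `n_holes` square patches of side `length` (clipped at
# the image borders) on an HWC array, in place.
if __name__ == "__main__":
    img = np.ones((224, 224, 3), dtype="float32")
    out = Cutout(n_holes=1, length=112)(img)
    print(int((out == 0).sum()))  # number of zeroed values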
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import random
import numpy as np
from scipy.stats import beta
def fftfreqnd(h, w=None, z=None):
""" Get bin values for discrete fourier transform of size (h, w, z)
:param h: Required, first dimension size
:param w: Optional, second dimension size
:param z: Optional, third dimension size
"""
fz = fx = 0
fy = np.fft.fftfreq(h)
if w is not None:
fy = np.expand_dims(fy, -1)
if w % 2 == 1:
fx = np.fft.fftfreq(w)[:w // 2 + 2]
else:
fx = np.fft.fftfreq(w)[:w // 2 + 1]
    if z is not None:
        fy = np.expand_dims(fy, -1)
        # odd and even z use the same expression in the reference code, so the
        # parity branch is collapsed here
        fz = np.fft.fftfreq(z)[:, None]
return np.sqrt(fx * fx + fy * fy + fz * fz)
def get_spectrum(freqs, decay_power, ch, h, w=0, z=0):
""" Samples a fourier image with given size and frequencies decayed by decay power
:param freqs: Bin values for the discrete fourier transform
:param decay_power: Decay power for frequency decay prop 1/f**d
:param ch: Number of channels for the resulting mask
:param h: Required, first dimension size
:param w: Optional, second dimension size
:param z: Optional, third dimension size
"""
scale = np.ones(1) / (np.maximum(freqs, np.array([1. / max(w, h, z)]))
**decay_power)
param_size = [ch] + list(freqs.shape) + [2]
param = np.random.randn(*param_size)
scale = np.expand_dims(scale, -1)[None, :]
return scale * param
def make_low_freq_image(decay, shape, ch=1):
""" Sample a low frequency image from fourier space
:param decay_power: Decay power for frequency decay prop 1/f**d
:param shape: Shape of desired mask, list up to 3 dims
:param ch: Number of channels for desired mask
"""
freqs = fftfreqnd(*shape)
spectrum = get_spectrum(freqs, decay, ch,
*shape) #.reshape((1, *shape[:-1], -1))
spectrum = spectrum[:, 0] + 1j * spectrum[:, 1]
mask = np.real(np.fft.irfftn(spectrum, shape))
if len(shape) == 1:
mask = mask[:1, :shape[0]]
if len(shape) == 2:
mask = mask[:1, :shape[0], :shape[1]]
if len(shape) == 3:
mask = mask[:1, :shape[0], :shape[1], :shape[2]]
    mask = mask - mask.min()
    mask = mask / mask.max()
return mask
def sample_lam(alpha, reformulate=False):
""" Sample a lambda from symmetric beta distribution with given alpha
:param alpha: Alpha value for beta distribution
:param reformulate: If True, uses the reformulation of [1].
"""
if reformulate:
lam = beta.rvs(alpha + 1, alpha)
else:
lam = beta.rvs(alpha, alpha)
return lam
def binarise_mask(mask, lam, in_shape, max_soft=0.0):
""" Binarises a given low frequency image such that it has mean lambda.
:param mask: Low frequency image, usually the result of `make_low_freq_image`
:param lam: Mean value of final mask
:param in_shape: Shape of inputs
:param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask.
:return:
"""
idx = mask.reshape(-1).argsort()[::-1]
mask = mask.reshape(-1)
num = math.ceil(lam * mask.size) if random.random() > 0.5 else math.floor(
lam * mask.size)
eff_soft = max_soft
if max_soft > lam or max_soft > (1 - lam):
eff_soft = min(lam, 1 - lam)
soft = int(mask.size * eff_soft)
num_low = int(num - soft)
num_high = int(num + soft)
mask[idx[:num_high]] = 1
mask[idx[num_low:]] = 0
mask[idx[num_low:num_high]] = np.linspace(1, 0, (num_high - num_low))
mask = mask.reshape((1, 1, in_shape[0], in_shape[1]))
return mask
def sample_mask(alpha, decay_power, shape, max_soft=0.0, reformulate=False):
""" Samples a mean lambda from beta distribution parametrised by alpha, creates a low frequency image and binarises
it based on this lambda
:param alpha: Alpha value for beta distribution from which to sample mean of mask
:param decay_power: Decay power for frequency decay prop 1/f**d
:param shape: Shape of desired mask, list up to 3 dims
:param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask.
:param reformulate: If True, uses the reformulation of [1].
"""
if isinstance(shape, int):
shape = (shape, )
# Choose lambda
lam = sample_lam(alpha, reformulate)
# Make mask, get mean / std
mask = make_low_freq_image(decay_power, shape)
mask = binarise_mask(mask, lam, shape, max_soft)
return float(lam), mask
def sample_and_apply(x,
alpha,
decay_power,
shape,
max_soft=0.0,
reformulate=False):
"""
:param x: Image batch on which to apply fmix of shape [b, c, shape*]
:param alpha: Alpha value for beta distribution from which to sample mean of mask
:param decay_power: Decay power for frequency decay prop 1/f**d
:param shape: Shape of desired mask, list up to 3 dims
:param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask.
:param reformulate: If True, uses the reformulation of [1].
:return: mixed input, permutation indices, lambda value of mix,
"""
lam, mask = sample_mask(alpha, decay_power, shape, max_soft, reformulate)
index = np.random.permutation(x.shape[0])
x1, x2 = x * mask, x[index] * (1 - mask)
return x1 + x2, index, lam
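# Sketch: sample an FMix mask for a 224x224 batch and apply it; `lam` is the
# fraction of pixels each mixed image keeps from its original.
if __name__ == "__main__":
    x = np.random.rand(8, 3, 224, 224)
    mixed, index, lam = sample_and_apply(x, alpha=1.0, decay_power=3.0,
                                         shape=(224, 224))
    print(mixed.shape, lam)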
class FMixBase:
""" FMix augmentation
Args:
decay_power (float): Decay power for frequency decay prop 1/f**d
alpha (float): Alpha value for beta distribution from which to sample mean of mask
size ([int] | [int, int] | [int, int, int]): Shape of desired mask, list up to 3 dims
max_soft (float): Softening value between 0 and 0.5 which smooths hard edges in the mask.
reformulate (bool): If True, uses the reformulation of [1].
"""
def __init__(self,
decay_power=3,
alpha=1,
size=(32, 32),
max_soft=0.0,
reformulate=False):
super().__init__()
self.decay_power = decay_power
self.reformulate = reformulate
self.size = size
self.alpha = alpha
self.max_soft = max_soft
self.index = None
self.lam = None
def __call__(self, x):
raise NotImplementedError
def loss(self, *args, **kwargs):
raise NotImplementedError
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from PIL import Image
# current epoch, expected to be advanced by the training loop
CURR_EPOCH = 0
# epoch at which the masking probability reaches its upper limit
NUM_EPOCHS = 240
class GridMask(object):
def __init__(self, d1, d2, rotate=1, ratio=0.5, mode=0, prob=1.):
self.d1 = d1
self.d2 = d2
self.rotate = rotate
self.ratio = ratio
self.mode = mode
self.st_prob = prob
self.prob = prob
self.last_prob = -1
def set_prob(self):
global CURR_EPOCH
global NUM_EPOCHS
self.prob = self.st_prob * min(1, 1.0 * CURR_EPOCH / NUM_EPOCHS)
def __call__(self, img):
self.set_prob()
if abs(self.last_prob - self.prob) > 1e-10:
global CURR_EPOCH
global NUM_EPOCHS
print(
"self.prob is updated, self.prob={}, CURR_EPOCH: {}, NUM_EPOCHS: {}".
format(self.prob, CURR_EPOCH, NUM_EPOCHS))
self.last_prob = self.prob
# print("CURR_EPOCH: {}, NUM_EPOCHS: {}, self.prob is set as: {}".format(CURR_EPOCH, NUM_EPOCHS, self.prob) )
if np.random.rand() > self.prob:
return img
_, h, w = img.shape
hh = int(1.5 * h)
ww = int(1.5 * w)
d = np.random.randint(self.d1, self.d2)
#d = self.d
self.l = int(d * self.ratio + 0.5)
mask = np.ones((hh, ww), np.float32)
st_h = np.random.randint(d)
st_w = np.random.randint(d)
for i in range(-1, hh // d + 1):
s = d * i + st_h
t = s + self.l
s = max(min(s, hh), 0)
t = max(min(t, hh), 0)
mask[s:t, :] *= 0
for i in range(-1, ww // d + 1):
s = d * i + st_w
t = s + self.l
s = max(min(s, ww), 0)
t = max(min(t, ww), 0)
mask[:, s:t] *= 0
r = np.random.randint(self.rotate)
mask = Image.fromarray(np.uint8(mask))
mask = mask.rotate(r)
mask = np.asarray(mask)
        mask = mask[(hh - h) // 2:(hh - h) // 2 + h,
                    (ww - w) // 2:(ww - w) // 2 + w]
if self.mode == 1:
mask = 1 - mask
mask = np.expand_dims(mask, axis=0)
img = (img * mask).astype(img.dtype)
return img
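# Sketch: GridMask on a CHW image. The probability ramps linearly with the
# module-level CURR_EPOCH, so this demo pins it at NUM_EPOCHS to reach the
# configured upper limit (the parameter values below are arbitrary).
if __name__ == "__main__":
    CURR_EPOCH = NUM_EPOCHS
    gm = GridMask(d1=96, d2=224, rotate=1, ratio=0.6, mode=1, prob=0.8)
    out = gm(np.ones((3, 224, 224), dtype="float32"))
    print(out.shape)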
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import random
class HideAndSeek(object):
def __init__(self):
# possible grid size, 0 means no hiding
self.grid_sizes = [0, 16, 32, 44, 56]
# hiding probability
self.hide_prob = 0.5
def __call__(self, img):
# randomly choose one grid size
grid_size = np.random.choice(self.grid_sizes)
_, h, w = img.shape
# hide the patches
if grid_size == 0:
return img
        # img is CHW: axis 1 is height, axis 2 is width, so the row index (y)
        # must select axis 1 and the column index (x) axis 2
        for y in range(0, h, grid_size):
            for x in range(0, w, grid_size):
                x_end = min(w, x + grid_size)
                y_end = min(h, y + grid_size)
                if random.random() <= self.hide_prob:
                    img[:, y:y_end, x:x_end] = 0
return img
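# Sketch: HideAndSeek draws one grid size per call (0 leaves the image
# untouched) and zeroes each grid cell independently with probability 0.5.
if __name__ == "__main__":
    out = HideAndSeek()(np.ones((3, 224, 224), dtype="float32"))
    print(out.shape)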
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import six
import math
import random
import functools
import cv2
import numpy as np
class OperatorParamError(ValueError):
""" OperatorParamError
"""
pass
class DecodeImage(object):
""" decode image """
def __init__(self, to_rgb=True, to_np=False, channel_first=False):
self.to_rgb = to_rgb
self.to_np = to_np #to numpy
self.channel_first = channel_first #only enabled when to_np is True
def __call__(self, img):
if six.PY2:
assert type(img) is str and len(
img) > 0, "invalid input 'img' in DecodeImage"
else:
assert type(img) is bytes and len(
img) > 0, "invalid input 'img' in DecodeImage"
data = np.frombuffer(img, dtype='uint8')
img = cv2.imdecode(data, 1)
if self.to_rgb:
assert img.shape[2] == 3, 'invalid shape of image[%s]' % (
img.shape)
img = img[:, :, ::-1]
if self.channel_first:
img = img.transpose((2, 0, 1))
return img
class ResizeImage(object):
""" resize image """
def __init__(self, size=None, resize_short=None):
if resize_short is not None and resize_short > 0:
self.resize_short = resize_short
self.w = None
self.h = None
elif size is not None:
self.resize_short = None
self.w = size if type(size) is int else size[0]
self.h = size if type(size) is int else size[1]
else:
raise OperatorParamError("invalid params for ReisizeImage for '\
'both 'size' and 'resize_short' are None")
def __call__(self, img):
img_h, img_w = img.shape[:2]
if self.resize_short is not None:
percent = float(self.resize_short) / min(img_w, img_h)
w = int(round(img_w * percent))
h = int(round(img_h * percent))
else:
w = self.w
h = self.h
return cv2.resize(img, (w, h))
class CropImage(object):
""" crop image """
def __init__(self, size):
if type(size) is int:
self.size = (size, size)
else:
self.size = size # (h, w)
def __call__(self, img):
w, h = self.size
img_h, img_w = img.shape[:2]
w_start = (img_w - w) // 2
h_start = (img_h - h) // 2
w_end = w_start + w
h_end = h_start + h
return img[h_start:h_end, w_start:w_end, :]
class RandCropImage(object):
""" random crop image """
def __init__(self, size, scale=None, ratio=None):
if type(size) is int:
self.size = (size, size) # (h, w)
else:
self.size = size
self.scale = [0.08, 1.0] if scale is None else scale
self.ratio = [3. / 4., 4. / 3.] if ratio is None else ratio
def __call__(self, img):
size = self.size
scale = self.scale
ratio = self.ratio
aspect_ratio = math.sqrt(random.uniform(*ratio))
w = 1. * aspect_ratio
h = 1. / aspect_ratio
img_h, img_w = img.shape[:2]
bound = min((float(img_w) / img_h) / (w**2),
(float(img_h) / img_w) / (h**2))
scale_max = min(scale[1], bound)
scale_min = min(scale[0], bound)
target_area = img_w * img_h * random.uniform(\
scale_min, scale_max)
target_size = math.sqrt(target_area)
w = int(target_size * w)
h = int(target_size * h)
i = random.randint(0, img_w - w)
j = random.randint(0, img_h - h)
img = img[j:j + h, i:i + w, :]
return cv2.resize(img, size)
class RandFlipImage(object):
""" random flip image
flip_code:
1: Flipped Horizontally
0: Flipped Vertically
-1: Flipped Horizontally & Vertically
"""
def __init__(self, flip_code=1):
assert flip_code in [-1, 0, 1
], "flip_code should be a value in [-1, 0, 1]"
self.flip_code = flip_code
def __call__(self, img):
if random.randint(0, 1) == 1:
return cv2.flip(img, self.flip_code)
else:
return img
class NormalizeImage(object):
""" normalize image such as substract mean, divide std
"""
def __init__(self, scale=None, mean=None, std=None, order='chw'):
        # configs may pass `scale` as a string expression such as "1./255."
        if isinstance(scale, str): scale = eval(scale)
self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
mean = mean if mean is not None else [0.485, 0.456, 0.406]
std = std if std is not None else [0.229, 0.224, 0.225]
shape = (3, 1, 1) if order == 'chw' else (1, 1, 3)
self.mean = np.array(mean).reshape(shape).astype('float32')
self.std = np.array(std).reshape(shape).astype('float32')
def __call__(self, img):
from PIL import Image
if isinstance(img, Image.Image):
img = np.array(img)
assert isinstance(img,
np.ndarray), "invalid input 'img' in NormalizeImage"
return (img.astype('float32') * self.scale - self.mean) / self.std
class ToCHWImage(object):
""" convert hwc image to chw image
"""
def __init__(self):
pass
def __call__(self, img):
from PIL import Image
if isinstance(img, Image.Image):
img = np.array(img)
return img.transpose((2, 0, 1))
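# Worked example of NormalizeImage with the ImageNet defaults: a pixel value
# of 255 maps channel-wise to (255/255 - mean) / std, i.e. roughly
# [2.249, 2.429, 2.640].
if __name__ == "__main__":
    norm = NormalizeImage(scale=1.0 / 255.0, order='')
    px = np.full((1, 1, 3), 255, dtype="uint8")
    print(norm(px)[0, 0])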
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#This code is based on https://github.com/
from PIL import Image, ImageEnhance, ImageOps
import numpy as np
import random
class RandAugment(object):
def __init__(self, num_layers, magnitude, fillcolor=(128, 128, 128)):
self.num_layers = num_layers
self.magnitude = magnitude
self.max_level = 10
abso_level = self.magnitude / self.max_level
self.level_map = {
"shearX": 0.3 * abso_level,
"shearY": 0.3 * abso_level,
"translateX": 150.0 / 331 * abso_level,
"translateY": 150.0 / 331 * abso_level,
"rotate": 30 * abso_level,
"color": 0.9 * abso_level,
"posterize": int(4.0 * abso_level),
"solarize": 256.0 * abso_level,
"contrast": 0.9 * abso_level,
"sharpness": 0.9 * abso_level,
"brightness": 0.9 * abso_level,
"autocontrast": 0,
"equalize": 0,
"invert": 0
}
# from https://stackoverflow.com/questions/5252170/specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand
def rotate_with_fill(img, magnitude):
rot = img.convert("RGBA").rotate(magnitude)
return Image.composite(rot,
Image.new("RGBA", rot.size, (128, ) * 4),
rot).convert(img.mode)
self.func = {
"shearX": lambda img, magnitude: img.transform(
img.size, Image.AFFINE, (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0),
Image.BICUBIC, fillcolor=fillcolor),
"shearY": lambda img, magnitude: img.transform(
img.size, Image.AFFINE, (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0),
Image.BICUBIC, fillcolor=fillcolor),
"translateX": lambda img, magnitude: img.transform(
img.size, Image.AFFINE, (1, 0, magnitude * img.size[0] * random.choice([-1, 1]), 0, 1, 0),
fillcolor=fillcolor),
"translateY": lambda img, magnitude: img.transform(
img.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude * img.size[1] * random.choice([-1, 1])),
fillcolor=fillcolor),
"rotate": lambda img, magnitude: rotate_with_fill(img, magnitude),
# "rotate": lambda img, magnitude: img.rotate(magnitude * random.choice([-1, 1])),
"color": lambda img, magnitude: ImageEnhance.Color(img).enhance(1 + magnitude * random.choice([-1, 1])),
"posterize": lambda img, magnitude: ImageOps.posterize(img, magnitude),
"solarize": lambda img, magnitude: ImageOps.solarize(img, magnitude),
"contrast": lambda img, magnitude: ImageEnhance.Contrast(img).enhance(
1 + magnitude * random.choice([-1, 1])),
"sharpness": lambda img, magnitude: ImageEnhance.Sharpness(img).enhance(
1 + magnitude * random.choice([-1, 1])),
"brightness": lambda img, magnitude: ImageEnhance.Brightness(img).enhance(
1 + magnitude * random.choice([-1, 1])),
"autocontrast": lambda img, magnitude: ImageOps.autocontrast(img),
"equalize": lambda img, magnitude: ImageOps.equalize(img),
"invert": lambda img, magnitude: ImageOps.invert(img)
}
def __call__(self, img):
        # np.random.choice needs a sequence, not a dict view
        available_op_names = list(self.level_map.keys())
        for _ in range(self.num_layers):
            op_name = np.random.choice(available_op_names)
img = self.func[op_name](img, self.level_map[op_name])
return img
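# Sketch: with magnitude=5 out of max_level=10, every transform runs at half
# its maximum strength, e.g. "rotate" is applied at up to 30 * 0.5 = 15
# degrees.
if __name__ == "__main__":
    aug = RandAugment(num_layers=2, magnitude=5)
    out = aug(Image.new("RGB", (224, 224), (128, 128, 128)))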
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import random
import numpy as np
class RandomErasing(object):
def __init__(self, EPSILON=0.5, sl=0.02, sh=0.4, r1=0.3,
mean=[0., 0., 0.]):
self.EPSILON = EPSILON
self.mean = mean
self.sl = sl
self.sh = sh
self.r1 = r1
def __call__(self, img):
if random.uniform(0, 1) > self.EPSILON:
return img
for attempt in range(100):
area = img.shape[1] * img.shape[2]
target_area = random.uniform(self.sl, self.sh) * area
aspect_ratio = random.uniform(self.r1, 1 / self.r1)
h = int(round(math.sqrt(target_area * aspect_ratio)))
w = int(round(math.sqrt(target_area / aspect_ratio)))
if w < img.shape[2] and h < img.shape[1]:
x1 = random.randint(0, img.shape[1] - h)
y1 = random.randint(0, img.shape[2] - w)
if img.shape[0] == 3:
img[0, x1:x1 + h, y1:y1 + w] = self.mean[0]
img[1, x1:x1 + h, y1:y1 + w] = self.mean[1]
img[2, x1:x1 + h, y1:y1 + w] = self.mean[2]
else:
                    img[0, x1:x1 + h, y1:y1 + w] = self.mean[0]
return img
return img
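# Sketch: EPSILON is the probability of applying the erase at all; setting it
# to 1.0 forces an erase so the effect is visible.
if __name__ == "__main__":
    out = RandomErasing(EPSILON=1.0)(np.ones((3, 224, 224), dtype="float32"))
    print(int((out == 0).sum()))  # erased values (mean defaults to 0)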
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import imghdr
import os
import signal
import numpy as np
import paddle
import imaug
from imaug import transform
from ppcls.utils import logger
trainers_num = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
trainer_id = int(os.environ.get("PADDLE_TRAINER_ID", 0))
class ModeException(Exception):
"""
ModeException
"""
def __init__(self, message='', mode=''):
message += "\nOnly the following 3 modes are supported: " \
"train, valid, test. Given mode is {}".format(mode)
super(ModeException, self).__init__(message)
class SampleNumException(Exception):
"""
SampleNumException
"""
def __init__(self, message='', sample_num=0, batch_size=1):
message += "\nError: The number of the whole data ({}) " \
"is smaller than the batch_size ({}), and drop_last " \
"is turnning on, so nothing will feed in program, " \
"Terminated now. Please reset batch_size to a smaller " \
"number or feed more data!".format(sample_num, batch_size)
super(SampleNumException, self).__init__(message)
class ShuffleSeedException(Exception):
"""
ShuffleSeedException
"""
def __init__(self, message=''):
message += "\nIf trainers_num > 1, the shuffle_seed must be set, " \
"because the order of batch data generated by reader " \
"must be the same in the respective processes."
super(ShuffleSeedException, self).__init__(message)
def check_params(params):
"""
    check params to avoid unexpected errors
Args:
params(dict):
"""
if 'shuffle_seed' not in params:
params['shuffle_seed'] = None
if trainers_num > 1 and params['shuffle_seed'] is None:
raise ShuffleSeedException()
data_dir = params.get('data_dir', '')
assert os.path.isdir(data_dir), \
"{} doesn't exist, please check datadir path".format(data_dir)
if params['mode'] != 'test':
file_list = params.get('file_list', '')
assert os.path.isfile(file_list), \
"{} doesn't exist, please check file list path".format(file_list)
def create_file_list(params):
"""
if mode is test, create the file list
Args:
params(dict):
"""
data_dir = params.get('data_dir', '')
params['file_list'] = ".tmp.txt"
imgtype_list = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff'}
with open(params['file_list'], "w") as fout:
tmp_file_list = os.listdir(data_dir)
for file_name in tmp_file_list:
file_path = os.path.join(data_dir, file_name)
if imghdr.what(file_path) not in imgtype_list:
continue
fout.write(file_name + " 0" + "\n")
def shuffle_lines(full_lines, seed=None):
"""
random shuffle lines
Args:
full_lines(list):
seed(int): random seed
"""
if seed is not None:
np.random.RandomState(seed).shuffle(full_lines)
else:
np.random.shuffle(full_lines)
return full_lines
def get_file_list(params):
"""
read label list from file and shuffle the list
Args:
params(dict):
"""
if params['mode'] == 'test':
create_file_list(params)
with open(params['file_list']) as flist:
full_lines = [line.strip() for line in flist]
full_lines = shuffle_lines(full_lines, params["shuffle_seed"])
# use only partial data for each trainer in distributed training
full_lines = full_lines[trainer_id::trainers_num]
return full_lines
def create_operators(params):
"""
create operators based on the config
Args:
params(list): a dict list, used to create some operators
"""
assert isinstance(params, list), ('operator config should be a list')
ops = []
for operator in params:
assert isinstance(operator,
dict) and len(operator) == 1, "yaml format error"
op_name = list(operator)[0]
param = {} if operator[op_name] is None else operator[op_name]
op = getattr(imaug, op_name)(**param)
ops.append(op)
return ops
def partial_reader(params, full_lines, part_id=0, part_num=1):
"""
create a reader with partial data
Args:
params(dict):
full_lines: label list
part_id(int): part index of the current partial data
part_num(int): part num of the dataset
"""
assert part_id < part_num, ("part_num: {} should be larger " \
"than part_id: {}".format(part_num, part_id))
full_lines = full_lines[part_id::part_num]
batch_size = int(params['batch_size']) // trainers_num
if params['mode'] != "test" and len(full_lines) < batch_size:
raise SampleNumException('', len(full_lines), batch_size)
def reader():
ops = create_operators(params['transforms'])
for line in full_lines:
img_path, label = line.split()
img_path = os.path.join(params['data_dir'], img_path)
            img = open(img_path, 'rb').read()
img = transform(img, ops)
yield (img, int(label))
return reader
def mp_reader(params):
"""
multiprocess reader
Args:
params(dict):
"""
check_params(params)
full_lines = get_file_list(params)
part_num = 1 if 'num_workers' not in params else params['num_workers']
readers = []
for part_id in range(part_num):
readers.append(partial_reader(params, full_lines, part_id, part_num))
return paddle.reader.multiprocess_reader(readers, use_pipe=False)
def term_mp(sig_num, frame):
""" kill all child processes
"""
pid = os.getpid()
pgid = os.getpgid(os.getpid())
logger.info("main proc {} exit, kill process group "
"{}".format(pid, pgid))
os.killpg(pgid, signal.SIGKILL)
class Reader:
"""
    Create a reader for training/validation/test
Args:
config(dict): arguments
mode(str): train or val or test
seed(int): random seed used to generate same sequence in each trainer
Returns:
the specific reader
"""
def __init__(self, config, mode='train', seed=None):
try:
self.params = config[mode.upper()]
except KeyError:
raise ModeException(mode=mode)
use_mix = config.get('use_mix')
self.params['mode'] = mode
if seed is not None:
self.params['shuffle_seed'] = seed
self.batch_ops = []
if use_mix and mode == "train":
self.batch_ops = create_operators(self.params['mix'])
def __call__(self):
reader = mp_reader(self.params)
batch_size = int(self.params['batch_size']) // trainers_num
def wrapper():
batch = []
for idx, sample in enumerate(reader()):
img, label = sample
batch.append((img, label))
if (idx + 1) % batch_size == 0:
batch = transform(batch, self.batch_ops)
yield batch
batch = []
return wrapper
signal.signal(signal.SIGINT, term_mp)
signal.signal(signal.SIGTERM, term_mp)
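# Hedged sketch of driving Reader from a config dict (the dataset paths are
# hypothetical, mirroring the YAML configs). Constructing the Reader is cheap;
# calling it builds the multiprocess reader, which requires file_list and
# data_dir to actually exist, so the call is left commented out.
if __name__ == "__main__":
    config = {
        'TRAIN': {
            'batch_size': 32,
            'num_workers': 4,
            'file_list': './dataset/ILSVRC2012/train_list.txt',
            'data_dir': './dataset/ILSVRC2012/',
            'shuffle_seed': 0,
            'transforms': [{'DecodeImage': {'to_rgb': True}},
                           {'RandCropImage': {'size': 224}},
                           {'NormalizeImage': None},
                           {'ToCHWImage': None}],
        },
    }
    train_reader = Reader(config, mode='train')
    # for batch in train_reader()(): ...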
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from . import architectures
from . import loss
from .architectures import *
from .loss import *
from .utils import similar_architectures
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from .alexnet import AlexNet
from .mobilenet_v1 import MobileNetV1_x0_25, MobileNetV1_x0_5, MobileNetV1_x1_0, MobileNetV1_x0_75, MobileNetV1
from .mobilenet_v2 import MobileNetV2_x0_25, MobileNetV2_x0_5, MobileNetV2_x0_75, MobileNetV2_x1_0, MobileNetV2_x1_5, MobileNetV2_x2_0, MobileNetV2
from .mobilenet_v3 import MobileNetV3_small_x0_35, MobileNetV3_small_x0_5, MobileNetV3_small_x0_75, MobileNetV3_small_x1_0, MobileNetV3_small_x1_25, MobileNetV3_large_x0_35, MobileNetV3_large_x0_5, MobileNetV3_large_x0_75, MobileNetV3_large_x1_0, MobileNetV3_large_x1_25
from .googlenet import GoogLeNet
from .vgg import VGG11, VGG13, VGG16, VGG19
from .resnet import ResNet18, ResNet34, ResNet50, ResNet101, ResNet152
from .resnet_vc import ResNet50_vc, ResNet101_vc, ResNet152_vc
from .resnet_vd import ResNet18_vd, ResNet34_vd, ResNet50_vd, ResNet101_vd, ResNet152_vd, ResNet200_vd
from .resnext import ResNeXt50_64x4d, ResNeXt101_64x4d, ResNeXt152_64x4d, ResNeXt50_32x4d, ResNeXt101_32x4d, ResNeXt152_32x4d
from .resnext_vd import ResNeXt50_vd_64x4d, ResNeXt101_vd_64x4d, ResNeXt152_vd_64x4d, ResNeXt50_vd_32x4d, ResNeXt101_vd_32x4d, ResNeXt152_vd_32x4d
from .inception_v4 import InceptionV4
from .se_resnet_vd import SE_ResNet18_vd, SE_ResNet34_vd, SE_ResNet50_vd, SE_ResNet101_vd, SE_ResNet152_vd, SE_ResNet200_vd
from .se_resnext import SE_ResNeXt50_32x4d, SE_ResNeXt101_32x4d, SE_ResNeXt152_32x4d
from .se_resnext_vd import SE_ResNeXt50_vd_32x4d, SE_ResNeXt101_vd_32x4d, SENet154_vd
from .dpn import DPN68, DPN92, DPN98, DPN107, DPN131
from .shufflenet_v2_swish import ShuffleNetV2_swish, ShuffleNetV2_x0_5_swish, ShuffleNetV2_x1_0_swish, ShuffleNetV2_x1_5_swish, ShuffleNetV2_x2_0_swish
from .shufflenet_v2 import ShuffleNetV2_x0_25, ShuffleNetV2_x0_33, ShuffleNetV2_x0_5, ShuffleNetV2_x1_0, ShuffleNetV2_x1_5, ShuffleNetV2_x2_0, ShuffleNetV2
from .xception import Xception41, Xception65, Xception71
from .xception_deeplab import Xception41_deeplab, Xception65_deeplab, Xception71_deeplab
from .densenet import DenseNet121, DenseNet161, DenseNet169, DenseNet201, DenseNet264
from .squeezenet import SqueezeNet1_0, SqueezeNet1_1
from .darknet import DarkNet53
from .resnext101_wsl import ResNeXt101_32x8d_wsl, ResNeXt101_32x16d_wsl, ResNeXt101_32x32d_wsl, ResNeXt101_32x48d_wsl, Fix_ResNeXt101_32x48d_wsl
from .efficientnet import EfficientNet, EfficientNetB0, EfficientNetB1, EfficientNetB2, EfficientNetB3, EfficientNetB4, EfficientNetB5, EfficientNetB6, EfficientNetB7
from .res2net import Res2Net50_48w_2s, Res2Net50_26w_4s, Res2Net50_14w_8s, Res2Net50_26w_6s, Res2Net50_26w_8s, Res2Net101_26w_4s, Res2Net152_26w_4s
from .res2net_vd import Res2Net50_vd_48w_2s, Res2Net50_vd_26w_4s, Res2Net50_vd_14w_8s, Res2Net50_vd_26w_6s, Res2Net50_vd_26w_8s, Res2Net101_vd_26w_4s, Res2Net152_vd_26w_4s, Res2Net200_vd_26w_4s
from .hrnet import HRNet_W18_C, HRNet_W30_C, HRNet_W32_C, HRNet_W40_C, HRNet_W44_C, HRNet_W48_C, HRNet_W60_C, HRNet_W64_C, SE_HRNet_W18_C, SE_HRNet_W30_C, SE_HRNet_W32_C, SE_HRNet_W40_C, SE_HRNet_W44_C, SE_HRNet_W48_C, SE_HRNet_W60_C, SE_HRNet_W64_C
from .darts_gs import DARTS_GS_6M, DARTS_GS_4M
from .resnet_acnet import ResNet18_ACNet, ResNet34_ACNet, ResNet50_ACNet, ResNet101_ACNet, ResNet152_ACNet
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
__all__ = ['AlexNet']
class AlexNet():
def __init__(self):
pass
def net(self, input, class_dim=1000):
stdv = 1.0 / math.sqrt(input.shape[1] * 11 * 11)
layer_name = [
"conv1", "conv2", "conv3", "conv4", "conv5", "fc6", "fc7", "fc8"
]
conv1 = fluid.layers.conv2d(
input=input,
num_filters=64,
filter_size=11,
stride=4,
padding=2,
groups=1,
act='relu',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[0] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[0] + "_weights"))
pool1 = fluid.layers.pool2d(
input=conv1,
pool_size=3,
pool_stride=2,
pool_padding=0,
pool_type='max')
stdv = 1.0 / math.sqrt(pool1.shape[1] * 5 * 5)
conv2 = fluid.layers.conv2d(
input=pool1,
num_filters=192,
filter_size=5,
stride=1,
padding=2,
groups=1,
act='relu',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[1] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[1] + "_weights"))
pool2 = fluid.layers.pool2d(
input=conv2,
pool_size=3,
pool_stride=2,
pool_padding=0,
pool_type='max')
stdv = 1.0 / math.sqrt(pool2.shape[1] * 3 * 3)
conv3 = fluid.layers.conv2d(
input=pool2,
num_filters=384,
filter_size=3,
stride=1,
padding=1,
groups=1,
act='relu',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[2] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[2] + "_weights"))
stdv = 1.0 / math.sqrt(conv3.shape[1] * 3 * 3)
conv4 = fluid.layers.conv2d(
input=conv3,
num_filters=256,
filter_size=3,
stride=1,
padding=1,
groups=1,
act='relu',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[3] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[3] + "_weights"))
stdv = 1.0 / math.sqrt(conv4.shape[1] * 3 * 3)
conv5 = fluid.layers.conv2d(
input=conv4,
num_filters=256,
filter_size=3,
stride=1,
padding=1,
groups=1,
act='relu',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[4] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[4] + "_weights"))
pool5 = fluid.layers.pool2d(
input=conv5,
pool_size=3,
pool_stride=2,
pool_padding=0,
pool_type='max')
drop6 = fluid.layers.dropout(x=pool5, dropout_prob=0.5)
stdv = 1.0 / math.sqrt(drop6.shape[1] * drop6.shape[2] *
drop6.shape[3] * 1.0)
fc6 = fluid.layers.fc(
input=drop6,
size=4096,
act='relu',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[5] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[5] + "_weights"))
drop7 = fluid.layers.dropout(x=fc6, dropout_prob=0.5)
stdv = 1.0 / math.sqrt(drop7.shape[1] * 1.0)
fc7 = fluid.layers.fc(
input=drop7,
size=4096,
act='relu',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[6] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[6] + "_weights"))
stdv = 1.0 / math.sqrt(fc7.shape[1] * 1.0)
out = fluid.layers.fc(
input=fc7,
size=class_dim,
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[7] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[7] + "_weights"))
return out
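# Hedged sketch (assuming the Paddle 1.x static-graph API, where fluid.data is
# available): build the AlexNet graph on a placeholder and obtain 1000-way
# logits.
if __name__ == "__main__":
    image = fluid.data(name="image", shape=[None, 3, 224, 224], dtype="float32")
    logits = AlexNet().net(image, class_dim=1000)
    print(logits.shape)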
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
import math
__all__ = ["DarkNet53"]
class DarkNet53():
def __init__(self):
pass
def net(self, input, class_dim=1000):
DarkNet_cfg = {53: ([1, 2, 8, 8, 4], self.basicblock)}
stages, block_func = DarkNet_cfg[53]
stages = stages[0:5]
conv1 = self.conv_bn_layer(
input,
ch_out=32,
filter_size=3,
stride=1,
padding=1,
name="yolo_input")
conv = self.downsample(
conv1, ch_out=conv1.shape[1] * 2, name="yolo_input.downsample")
for i, stage in enumerate(stages):
conv = self.layer_warp(
block_func,
conv,
32 * (2**i),
stage,
name="stage.{}".format(i))
            if i < len(stages) - 1:  # do not downsample in the last stage
conv = self.downsample(
conv,
ch_out=conv.shape[1] * 2,
name="stage.{}.downsample".format(i))
pool = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name='fc_weights'),
bias_attr=ParamAttr(name='fc_offset'))
return out
def conv_bn_layer(self,
input,
ch_out,
filter_size,
stride,
padding,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=ch_out,
filter_size=filter_size,
stride=stride,
padding=padding,
act=None,
param_attr=ParamAttr(name=name + ".conv.weights"),
bias_attr=False)
bn_name = name + ".bn"
out = fluid.layers.batch_norm(
input=conv,
act='relu',
param_attr=ParamAttr(name=bn_name + '.scale'),
bias_attr=ParamAttr(name=bn_name + '.offset'),
moving_mean_name=bn_name + '.mean',
moving_variance_name=bn_name + '.var')
return out
def downsample(self,
input,
ch_out,
filter_size=3,
stride=2,
padding=1,
name=None):
return self.conv_bn_layer(
input,
ch_out=ch_out,
filter_size=filter_size,
stride=stride,
padding=padding,
name=name)
def basicblock(self, input, ch_out, name=None):
conv1 = self.conv_bn_layer(input, ch_out, 1, 1, 0, name=name + ".0")
conv2 = self.conv_bn_layer(
conv1, ch_out * 2, 3, 1, 1, name=name + ".1")
out = fluid.layers.elementwise_add(x=input, y=conv2, act=None)
return out
def layer_warp(self, block_func, input, ch_out, count, name=None):
res_out = block_func(input, ch_out, name='{}.0'.format(name))
for j in range(1, count):
res_out = block_func(res_out, ch_out, name='{}.{}'.format(name, j))
return res_out
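# Hedged sketch (assuming the Paddle 1.x static-graph API, as with AlexNet):
# build the DarkNet53 backbone plus classification head on a placeholder.
if __name__ == "__main__":
    image = fluid.data(name="image", shape=[None, 3, 256, 256], dtype="float32")
    logits = DarkNet53().net(image, class_dim=1000)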
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
#
# Based on:
# --------------------------------------------------------
# DARTS
# Copyright (c) 2018, Hanxiao Liu.
# Licensed under the Apache License, Version 2.0;
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import numpy as np
import time
import functools
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.initializer import Xavier
from paddle.fluid.initializer import Normal
from paddle.fluid.initializer import Constant
from collections import namedtuple
Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat')
arch_dict = {
'DARTS_GS_6M': Genotype(
normal=[('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('sep_conv_5x5', 1),
('sep_conv_5x5', 0), ('sep_conv_3x3', 2), ('sep_conv_3x3', 1),
('skip_connect', 4), ('sep_conv_3x3', 3)],
normal_concat=range(2, 6),
reduce=[('sep_conv_5x5', 0), ('max_pool_3x3', 1), ('dil_conv_5x5', 2),
('sep_conv_5x5', 0), ('sep_conv_3x3', 1), ('dil_conv_5x5', 3),
('dil_conv_3x3', 1), ('sep_conv_3x3', 2)],
reduce_concat=range(2, 6)),
'DARTS_GS_4M': Genotype(
normal=[('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0),
('sep_conv_3x3', 1), ('sep_conv_3x3', 1), ('skip_connect', 0),
('skip_connect', 0), ('dil_conv_3x3', 1)],
normal_concat=range(2, 6),
reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1), ('max_pool_3x3', 0),
('avg_pool_3x3', 1), ('skip_connect', 3), ('skip_connect', 2),
('sep_conv_3x3', 0), ('sep_conv_5x5', 2)],
reduce_concat=range(2, 6)),
}
__all__ = list(arch_dict.keys())
OPS = {
'none' : lambda input, C, stride, name, affine: Zero(input, stride, name),
'avg_pool_3x3' : lambda input, C, stride, name, affine: fluid.layers.pool2d(input, 3, 'avg', pool_stride=stride, pool_padding=1, name=name),
'max_pool_3x3' : lambda input, C, stride, name, affine: fluid.layers.pool2d(input, 3, 'max', pool_stride=stride, pool_padding=1, name=name),
    'skip_connect' : lambda input, C, stride, name, affine: Identity(input, name) if stride == 1 else FactorizedReduce(input, C, name=name, affine=affine),
    'sep_conv_3x3' : lambda input, C, stride, name, affine: SepConv(input, C, C, 3, stride, 1, name=name, affine=affine),
    'sep_conv_5x5' : lambda input, C, stride, name, affine: SepConv(input, C, C, 5, stride, 2, name=name, affine=affine),
    'sep_conv_7x7' : lambda input, C, stride, name, affine: SepConv(input, C, C, 7, stride, 3, name=name, affine=affine),
    'dil_conv_3x3' : lambda input, C, stride, name, affine: DilConv(input, C, C, 3, stride, 2, 2, name=name, affine=affine),
    'dil_conv_5x5' : lambda input, C, stride, name, affine: DilConv(input, C, C, 5, stride, 4, 2, name=name, affine=affine),
    # pass `stride` through: SevenConv takes it as a required positional arg
    'conv_7x1_1x7' : lambda input, C, stride, name, affine: SevenConv(input, C, stride, name=name, affine=affine)
}
def ReLUConvBN(input,
C_out,
kernel_size,
stride,
padding,
name='',
affine=True):
relu_a = fluid.layers.relu(input)
conv2d_a = fluid.layers.conv2d(
relu_a, C_out, kernel_size, stride, padding, bias_attr=False)
if affine:
reluconvbn_out = fluid.layers.batch_norm(
conv2d_a,
param_attr=ParamAttr(
initializer=Constant(1.), name=name + 'op.2.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.), name=name + 'op.2.bias'),
moving_mean_name=name + 'op.2.running_mean',
moving_variance_name=name + 'op.2.running_var')
else:
reluconvbn_out = fluid.layers.batch_norm(
conv2d_a,
param_attr=ParamAttr(
initializer=Constant(1.),
learning_rate=0.,
name=name + 'op.2.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.),
learning_rate=0.,
name=name + 'op.2.bias'),
moving_mean_name=name + 'op.2.running_mean',
moving_variance_name=name + 'op.2.running_var')
return reluconvbn_out
def DilConv(input,
C_in,
C_out,
kernel_size,
stride,
padding,
dilation,
name='',
affine=True):
relu_a = fluid.layers.relu(input)
conv2d_a = fluid.layers.conv2d(
relu_a,
C_in,
kernel_size,
stride,
padding,
dilation,
groups=C_in,
bias_attr=False,
use_cudnn=False)
conv2d_b = fluid.layers.conv2d(conv2d_a, C_out, 1, bias_attr=False)
if affine:
dilconv_out = fluid.layers.batch_norm(
conv2d_b,
param_attr=ParamAttr(
initializer=Constant(1.), name=name + 'op.3.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.), name=name + 'op.3.bias'),
moving_mean_name=name + 'op.3.running_mean',
moving_variance_name=name + 'op.3.running_var')
else:
dilconv_out = fluid.layers.batch_norm(
conv2d_b,
param_attr=ParamAttr(
initializer=Constant(1.),
learning_rate=0.,
name=name + 'op.3.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.),
learning_rate=0.,
name=name + 'op.3.bias'),
moving_mean_name=name + 'op.3.running_mean',
moving_variance_name=name + 'op.3.running_var')
return dilconv_out
def SepConv(input,
C_in,
C_out,
kernel_size,
stride,
padding,
name='',
affine=True):
relu_a = fluid.layers.relu(input)
conv2d_a = fluid.layers.conv2d(
relu_a,
C_in,
kernel_size,
stride,
padding,
groups=C_in,
bias_attr=False,
use_cudnn=False)
conv2d_b = fluid.layers.conv2d(conv2d_a, C_in, 1, bias_attr=False)
if affine:
bn_a = fluid.layers.batch_norm(
conv2d_b,
param_attr=ParamAttr(
initializer=Constant(1.), name=name + 'op.3.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.), name=name + 'op.3.bias'),
moving_mean_name=name + 'op.3.running_mean',
moving_variance_name=name + 'op.3.running_var')
else:
bn_a = fluid.layers.batch_norm(
conv2d_b,
param_attr=ParamAttr(
initializer=Constant(1.),
learning_rate=0.,
name=name + 'op.3.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.),
learning_rate=0.,
name=name + 'op.3.bias'),
moving_mean_name=name + 'op.3.running_mean',
moving_variance_name=name + 'op.3.running_var')
relu_b = fluid.layers.relu(bn_a)
conv2d_d = fluid.layers.conv2d(
relu_b,
C_in,
kernel_size,
1,
padding,
groups=C_in,
bias_attr=False,
use_cudnn=False)
conv2d_e = fluid.layers.conv2d(conv2d_d, C_out, 1, bias_attr=False)
if affine:
sepconv_out = fluid.layers.batch_norm(
conv2d_e,
param_attr=ParamAttr(
initializer=Constant(1.), name=name + 'op.7.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.), name=name + 'op.7.bias'),
moving_mean_name=name + 'op.7.running_mean',
moving_variance_name=name + 'op.7.running_var')
else:
sepconv_out = fluid.layers.batch_norm(
conv2d_e,
param_attr=ParamAttr(
initializer=Constant(1.),
learning_rate=0.,
name=name + 'op.7.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.),
learning_rate=0.,
name=name + 'op.7.bias'),
moving_mean_name=name + 'op.7.running_mean',
moving_variance_name=name + 'op.7.running_var')
return sepconv_out
def SevenConv(input, C_out, stride, name='', affine=True):
relu_a = fluid.layers.relu(input)
conv2d_a = fluid.layers.conv2d(
relu_a,
C_out, (1, 7), (1, stride), (0, 3),
param_attr=ParamAttr(
initializer=Xavier(
uniform=False, fan_in=0),
name=name + 'op.1.weight'),
bias_attr=False)
conv2d_b = fluid.layers.conv2d(
conv2d_a,
C_out, (7, 1), (stride, 1), (3, 0),
param_attr=ParamAttr(
initializer=Xavier(
uniform=False, fan_in=0),
name=name + 'op.2.weight'),
bias_attr=False)
if affine:
out = fluid.layers.batch_norm(
conv2d_b,
param_attr=ParamAttr(
initializer=Constant(1.), name=name + 'op.3.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.), name=name + 'op.3.bias'),
moving_mean_name=name + 'op.3.running_mean',
moving_variance_name=name + 'op.3.running_var')
else:
out = fluid.layers.batch_norm(
conv2d_b,
param_attr=ParamAttr(
initializer=Constant(1.),
learning_rate=0.,
name=name + 'op.3.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.),
learning_rate=0.,
name=name + 'op.3.bias'),
moving_mean_name=name + 'op.3.running_mean',
            moving_variance_name=name + 'op.3.running_var')
    return out
def Identity(input, name=''):
return input
def Zero(input, stride, name=''):
    # the 'none' op: multiply the input by a fixed mask that zeroes the
    # positions a stride-`stride` subsampling would keep (with stride 1 the
    # whole feature map is zeroed); spatial size is unchanged
    ones = np.ones(input.shape[-2:])
    ones[::stride, ::stride] = 0
    ones = fluid.layers.assign(ones)
    return input * ones
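# FactorizedReduce halves the spatial size while keeping C_out channels:
# two stride-2 1x1 convs -- the second applied to a one-pixel-shifted slice of
# the input -- each emit C_out // 2 channels and are concatenated, so both
# even and odd grid positions contribute to the output.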
def FactorizedReduce(input, C_out, name='', affine=True):
relu_a = fluid.layers.relu(input)
conv2d_a = fluid.layers.conv2d(
relu_a,
C_out // 2,
1,
2,
param_attr=ParamAttr(
initializer=Xavier(
uniform=False, fan_in=0),
name=name + 'conv_1.weight'),
bias_attr=False)
h_end = relu_a.shape[2]
w_end = relu_a.shape[3]
slice_a = fluid.layers.slice(relu_a, [2, 3], [1, 1], [h_end, w_end])
conv2d_b = fluid.layers.conv2d(
slice_a,
C_out // 2,
1,
2,
param_attr=ParamAttr(
initializer=Xavier(
uniform=False, fan_in=0),
name=name + 'conv_2.weight'),
bias_attr=False)
out = fluid.layers.concat([conv2d_a, conv2d_b], axis=1)
if affine:
out = fluid.layers.batch_norm(
out,
param_attr=ParamAttr(
initializer=Constant(1.), name=name + 'bn.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.), name=name + 'bn.bias'),
moving_mean_name=name + 'bn.running_mean',
moving_variance_name=name + 'bn.running_var')
else:
out = fluid.layers.batch_norm(
out,
param_attr=ParamAttr(
initializer=Constant(1.),
learning_rate=0.,
name=name + 'bn.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.),
learning_rate=0.,
name=name + 'bn.bias'),
moving_mean_name=name + 'bn.running_mean',
moving_variance_name=name + 'bn.running_var')
return out
class Cell():
def __init__(self, genotype, C_prev_prev, C_prev, C, reduction,
reduction_prev):
if reduction_prev:
self.preprocess0 = functools.partial(FactorizedReduce, C_out=C)
else:
self.preprocess0 = functools.partial(
ReLUConvBN, C_out=C, kernel_size=1, stride=1, padding=0)
self.preprocess1 = functools.partial(
ReLUConvBN, C_out=C, kernel_size=1, stride=1, padding=0)
if reduction:
op_names, indices = zip(*genotype.reduce)
concat = genotype.reduce_concat
else:
op_names, indices = zip(*genotype.normal)
concat = genotype.normal_concat
print(op_names, indices, concat, reduction)
self._compile(C, op_names, indices, concat, reduction)
def _compile(self, C, op_names, indices, concat, reduction):
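        # a cell has `_steps` intermediate nodes, each fed by exactly two ops,
        # so the genotype lists 2 * _steps (op_name, input_index) pairs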
assert len(op_names) == len(indices)
self._steps = len(op_names) // 2
self._concat = concat
self.multiplier = len(concat)
self._ops = []
for name, index in zip(op_names, indices):
stride = 2 if reduction and index < 2 else 1
op = functools.partial(OPS[name], C=C, stride=stride, affine=True)
self._ops += [op]
self._indices = indices
def forward(self, s0, s1, drop_prob, is_train, name):
self.training = is_train
preprocess0_name = name + 'preprocess0.'
preprocess1_name = name + 'preprocess1.'
s0 = self.preprocess0(s0, name=preprocess0_name)
s1 = self.preprocess1(s1, name=preprocess1_name)
out = [s0, s1]
for i in range(self._steps):
h1 = out[self._indices[2 * i]]
h2 = out[self._indices[2 * i + 1]]
op1 = self._ops[2 * i]
op2 = self._ops[2 * i + 1]
h3 = op1(h1, name=name + '_ops.' + str(2 * i) + '.')
h4 = op2(h2, name=name + '_ops.' + str(2 * i + 1) + '.')
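            # drop path regularization: during training, randomly zero an
            # op's output ('upscale_in_train' rescales survivors so the
            # expected value is unchanged); outputs returned unchanged by
            # Identity are exempt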
if self.training and drop_prob > 0.:
if h3 != h1:
h3 = fluid.layers.dropout(
h3,
drop_prob,
dropout_implementation='upscale_in_train')
if h4 != h2:
h4 = fluid.layers.dropout(
h4,
drop_prob,
dropout_implementation='upscale_in_train')
s = h3 + h4
out += [s]
return fluid.layers.concat([out[i] for i in self._concat], axis=1)
def AuxiliaryHeadImageNet(input, num_classes, aux_name='auxiliary_head'):
relu_a = fluid.layers.relu(input)
pool_a = fluid.layers.pool2d(relu_a, 5, 'avg', 2)
conv2d_a = fluid.layers.conv2d(
pool_a, 128, 1, name=aux_name + '.features.2', bias_attr=False)
bn_a_name = aux_name + '.features.3'
bn_a = fluid.layers.batch_norm(
conv2d_a,
act='relu',
name=bn_a_name,
param_attr=ParamAttr(
initializer=Constant(1.), name=bn_a_name + '.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.), name=bn_a_name + '.bias'),
moving_mean_name=bn_a_name + '.running_mean',
moving_variance_name=bn_a_name + '.running_var')
conv2d_b = fluid.layers.conv2d(
bn_a, 768, 2, name=aux_name + '.features.5', bias_attr=False)
bn_b_name = aux_name + '.features.6'
bn_b = fluid.layers.batch_norm(
conv2d_b,
act='relu',
name=bn_b_name,
param_attr=ParamAttr(
initializer=Constant(1.), name=bn_b_name + '.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.), name=bn_b_name + '.bias'),
moving_mean_name=bn_b_name + '.running_mean',
moving_variance_name=bn_b_name + '.running_var')
pool_b = fluid.layers.adaptive_pool2d(bn_b, (1, 1), "avg")
fc_name = aux_name + '.classifier'
fc = fluid.layers.fc(pool_b,
num_classes,
name=fc_name,
param_attr=ParamAttr(
initializer=Normal(scale=1e-3),
name=fc_name + '.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.), name=fc_name + '.bias'))
return fc
def StemConv0(input, C_out):
conv_a = fluid.layers.conv2d(
input, C_out // 2, 3, stride=2, padding=1, bias_attr=False)
bn_a = fluid.layers.batch_norm(
conv_a,
act='relu',
param_attr=ParamAttr(
initializer=Constant(1.), name='stem0.1.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.), name='stem0.1.bias'),
moving_mean_name='stem0.1.running_mean',
moving_variance_name='stem0.1.running_var')
conv_b = fluid.layers.conv2d(
bn_a, C_out, 3, stride=2, padding=1, bias_attr=False)
bn_b = fluid.layers.batch_norm(
conv_b,
param_attr=ParamAttr(
initializer=Constant(1.), name='stem0.3.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.), name='stem0.3.bias'),
moving_mean_name='stem0.3.running_mean',
moving_variance_name='stem0.3.running_var')
return bn_b
def StemConv1(input, C_out):
relu_a = fluid.layers.relu(input)
conv_a = fluid.layers.conv2d(
relu_a, C_out, 3, stride=2, padding=1, bias_attr=False)
bn_a = fluid.layers.batch_norm(
conv_a,
param_attr=ParamAttr(
initializer=Constant(1.), name='stem1.1.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.), name='stem1.1.bias'),
moving_mean_name='stem1.1.running_mean',
moving_variance_name='stem1.1.running_var')
return bn_a
class NetworkImageNet(object):
def __init__(self, arch='DARTS_6M'):
self.class_num = 1000
self.init_channel = 48
self._layers = 14
self._auxiliary = False
self.drop_path_prob = 0
genotype = arch_dict[arch]
C = self.init_channel
layers = self._layers
C_prev_prev, C_prev, C_curr = C, C, C
self.cells = []
reduction_prev = True
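        # reduction cells, which halve resolution and double the channel
        # count, are placed at one third and two thirds of the network depth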
for i in range(layers):
if i in [layers // 3, 2 * layers // 3]:
C_curr *= 2
reduction = True
else:
reduction = False
cell = Cell(genotype, C_prev_prev, C_prev, C_curr, reduction,
reduction_prev)
reduction_prev = reduction
self.cells += [cell]
C_prev_prev, C_prev = C_prev, cell.multiplier * C_curr
if i == 2 * layers // 3:
C_to_auxiliary = C_prev
def net(self, input, class_dim=1000, is_train=True):
self.logits_aux = None
num_channel = self.init_channel
s0 = StemConv0(input, num_channel)
s1 = StemConv1(s0, num_channel)
for i, cell in enumerate(self.cells):
name = 'cells.' + str(i) + '.'
s0, s1 = s1, cell.forward(s0, s1, self.drop_path_prob, is_train,
name)
if i == int(2 * self._layers // 3):
if self._auxiliary and is_train:
self.logits_aux = AuxiliaryHeadImageNet(s1, self.class_num)
out = fluid.layers.adaptive_pool2d(s1, (1, 1), "avg")
self.logits = fluid.layers.fc(out,
size=self.class_num,
param_attr=ParamAttr(
initializer=Normal(scale=1e-4),
name='classifier.weight'),
bias_attr=ParamAttr(
initializer=Constant(0.),
name='classifier.bias'))
return self.logits
def DARTS_GS_6M():
return NetworkImageNet(arch='DARTS_GS_6M')
def DARTS_GS_4M():
return NetworkImageNet(arch='DARTS_GS_4M')
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"DenseNet", "DenseNet121", "DenseNet161", "DenseNet169", "DenseNet201",
"DenseNet264"
]
class DenseNet():
def __init__(self, layers=121):
self.layers = layers
def net(self, input, bn_size=4, dropout=0, class_dim=1000):
layers = self.layers
supported_layers = [121, 161, 169, 201, 264]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
densenet_spec = {
121: (64, 32, [6, 12, 24, 16]),
161: (96, 48, [6, 12, 36, 24]),
169: (64, 32, [6, 12, 32, 32]),
201: (64, 32, [6, 12, 48, 32]),
264: (64, 32, [6, 12, 64, 48])
}
num_init_features, growth_rate, block_config = densenet_spec[layers]
conv = fluid.layers.conv2d(
input=input,
num_filters=num_init_features,
filter_size=7,
stride=2,
padding=3,
act=None,
param_attr=ParamAttr(name="conv1_weights"),
bias_attr=False)
conv = fluid.layers.batch_norm(
input=conv,
act='relu',
param_attr=ParamAttr(name='conv1_bn_scale'),
bias_attr=ParamAttr(name='conv1_bn_offset'),
moving_mean_name='conv1_bn_mean',
moving_variance_name='conv1_bn_variance')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
num_features = num_init_features
for i, num_layers in enumerate(block_config):
conv = self.make_dense_block(
conv,
num_layers,
bn_size,
growth_rate,
dropout,
name='conv' + str(i + 2))
num_features = num_features + num_layers * growth_rate
if i != len(block_config) - 1:
conv = self.make_transition(
conv, num_features // 2, name='conv' + str(i + 2) + '_blk')
num_features = num_features // 2
conv = fluid.layers.batch_norm(
input=conv,
act='relu',
param_attr=ParamAttr(name='conv5_blk_bn_scale'),
bias_attr=ParamAttr(name='conv5_blk_bn_offset'),
moving_mean_name='conv5_blk_bn_mean',
moving_variance_name='conv5_blk_bn_variance')
conv = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(conv.shape[1] * 1.0)
out = fluid.layers.fc(
input=conv,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name="fc_weights"),
bias_attr=ParamAttr(name='fc_offset'))
return out
def make_transition(self, input, num_output_features, name=None):
bn_ac = fluid.layers.batch_norm(
input,
act='relu',
param_attr=ParamAttr(name=name + '_bn_scale'),
bias_attr=ParamAttr(name + '_bn_offset'),
moving_mean_name=name + '_bn_mean',
moving_variance_name=name + '_bn_variance')
bn_ac_conv = fluid.layers.conv2d(
input=bn_ac,
num_filters=num_output_features,
filter_size=1,
stride=1,
act=None,
bias_attr=False,
param_attr=ParamAttr(name=name + "_weights"))
pool = fluid.layers.pool2d(
input=bn_ac_conv, pool_size=2, pool_stride=2, pool_type='avg')
return pool
def make_dense_block(self,
input,
num_layers,
bn_size,
growth_rate,
dropout,
name=None):
conv = input
for layer in range(num_layers):
conv = self.make_dense_layer(
conv,
growth_rate,
bn_size,
dropout,
name=name + '_' + str(layer + 1))
return conv
def make_dense_layer(self, input, growth_rate, bn_size, dropout,
name=None):
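        # BN-ReLU-conv bottleneck: 1x1 conv to bn_size * growth_rate channels,
        # then 3x3 conv down to growth_rate; the result is concatenated onto
        # the input so every dense layer adds `growth_rate` feature maps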
bn_ac = fluid.layers.batch_norm(
input,
act='relu',
param_attr=ParamAttr(name=name + '_x1_bn_scale'),
bias_attr=ParamAttr(name + '_x1_bn_offset'),
moving_mean_name=name + '_x1_bn_mean',
moving_variance_name=name + '_x1_bn_variance')
bn_ac_conv = fluid.layers.conv2d(
input=bn_ac,
num_filters=bn_size * growth_rate,
filter_size=1,
stride=1,
act=None,
bias_attr=False,
param_attr=ParamAttr(name=name + "_x1_weights"))
bn_ac = fluid.layers.batch_norm(
bn_ac_conv,
act='relu',
param_attr=ParamAttr(name=name + '_x2_bn_scale'),
bias_attr=ParamAttr(name + '_x2_bn_offset'),
moving_mean_name=name + '_x2_bn_mean',
moving_variance_name=name + '_x2_bn_variance')
bn_ac_conv = fluid.layers.conv2d(
input=bn_ac,
num_filters=growth_rate,
filter_size=3,
stride=1,
padding=1,
act=None,
bias_attr=False,
param_attr=ParamAttr(name=name + "_x2_weights"))
if dropout:
bn_ac_conv = fluid.layers.dropout(
x=bn_ac_conv, dropout_prob=dropout)
bn_ac_conv = fluid.layers.concat([input, bn_ac_conv], axis=1)
return bn_ac_conv
def DenseNet121():
model = DenseNet(layers=121)
return model
def DenseNet161():
model = DenseNet(layers=161)
return model
def DenseNet169():
model = DenseNet(layers=169)
return model
def DenseNet201():
model = DenseNet(layers=201)
return model
def DenseNet264():
model = DenseNet(layers=264)
return model
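# Usage sketch (a minimal example, assuming a Paddle 1.x static-graph setup
# and the ImageNet input layout used throughout this repo):
#
#   image = fluid.layers.data(name='image', shape=[3, 224, 224], dtype='float32')
#   model = DenseNet121()
#   logits = model.net(image, class_dim=1000)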
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import time
import sys
import math
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = ["DPN", "DPN68", "DPN92", "DPN98", "DPN107", "DPN131"]
class DPN(object):
def __init__(self, layers=68):
self.layers = layers
def net(self, input, class_dim=1000):
# get network args
args = self.get_net_args(self.layers)
bws = args['bw']
inc_sec = args['inc_sec']
rs = args['r']
k_r = args['k_r']
k_sec = args['k_sec']
G = args['G']
init_num_filter = args['init_num_filter']
init_filter_size = args['init_filter_size']
init_padding = args['init_padding']
## define Dual Path Network
# conv1
conv1_x_1 = fluid.layers.conv2d(
input=input,
num_filters=init_num_filter,
filter_size=init_filter_size,
stride=2,
padding=init_padding,
groups=1,
act=None,
bias_attr=False,
name="conv1",
param_attr=ParamAttr(name="conv1_weights"), )
conv1_x_1 = fluid.layers.batch_norm(
input=conv1_x_1,
act='relu',
is_test=False,
name="conv1_bn",
param_attr=ParamAttr(name='conv1_bn_scale'),
bias_attr=ParamAttr('conv1_bn_offset'),
moving_mean_name='conv1_bn_mean',
moving_variance_name='conv1_bn_variance', )
convX_x_x = fluid.layers.pool2d(
input=conv1_x_1,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max',
name="pool1")
#conv2 - conv5
match_list, num = [], 0
for gc in range(4):
bw = bws[gc]
inc = inc_sec[gc]
R = (k_r * bw) // rs[gc]
if gc == 0:
_type1 = 'proj'
_type2 = 'normal'
match = 1
else:
_type1 = 'down'
_type2 = 'normal'
match = match + k_sec[gc - 1]
match_list.append(match)
convX_x_x = self.dual_path_factory(
convX_x_x, R, R, bw, inc, G, _type1, name="dpn" + str(match))
for i_ly in range(2, k_sec[gc] + 1):
num += 1
if num in match_list:
num += 1
convX_x_x = self.dual_path_factory(
convX_x_x, R, R, bw, inc, G, _type2, name="dpn" + str(num))
conv5_x_x = fluid.layers.concat(convX_x_x, axis=1)
conv5_x_x = fluid.layers.batch_norm(
input=conv5_x_x,
act='relu',
is_test=False,
name="final_concat_bn",
param_attr=ParamAttr(name='final_concat_bn_scale'),
bias_attr=ParamAttr('final_concat_bn_offset'),
moving_mean_name='final_concat_bn_mean',
moving_variance_name='final_concat_bn_variance', )
pool5 = fluid.layers.pool2d(
input=conv5_x_x,
pool_size=7,
pool_stride=1,
pool_padding=0,
pool_type='avg', )
stdv = 0.01
fc6 = fluid.layers.fc(
input=pool5,
size=class_dim,
param_attr=ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name='fc_weights'),
bias_attr=ParamAttr(name='fc_offset'))
return fc6
def get_net_args(self, layers):
if layers == 68:
k_r = 128
G = 32
k_sec = [3, 4, 12, 3]
inc_sec = [16, 32, 32, 64]
bw = [64, 128, 256, 512]
r = [64, 64, 64, 64]
init_num_filter = 10
init_filter_size = 3
init_padding = 1
elif layers == 92:
k_r = 96
G = 32
k_sec = [3, 4, 20, 3]
inc_sec = [16, 32, 24, 128]
bw = [256, 512, 1024, 2048]
r = [256, 256, 256, 256]
init_num_filter = 64
init_filter_size = 7
init_padding = 3
elif layers == 98:
k_r = 160
G = 40
k_sec = [3, 6, 20, 3]
inc_sec = [16, 32, 32, 128]
bw = [256, 512, 1024, 2048]
r = [256, 256, 256, 256]
init_num_filter = 96
init_filter_size = 7
init_padding = 3
elif layers == 107:
k_r = 200
G = 50
k_sec = [4, 8, 20, 3]
inc_sec = [20, 64, 64, 128]
bw = [256, 512, 1024, 2048]
r = [256, 256, 256, 256]
init_num_filter = 128
init_filter_size = 7
init_padding = 3
elif layers == 131:
k_r = 160
G = 40
k_sec = [4, 8, 28, 3]
inc_sec = [16, 32, 32, 128]
bw = [256, 512, 1024, 2048]
r = [256, 256, 256, 256]
init_num_filter = 128
init_filter_size = 7
init_padding = 3
else:
raise NotImplementedError
net_arg = {
'k_r': k_r,
'G': G,
'k_sec': k_sec,
'inc_sec': inc_sec,
'bw': bw,
'r': r
}
net_arg['init_num_filter'] = init_num_filter
net_arg['init_filter_size'] = init_filter_size
net_arg['init_padding'] = init_padding
return net_arg
def dual_path_factory(self,
data,
num_1x1_a,
num_3x3_b,
num_1x1_c,
inc,
G,
_type='normal',
name=None):
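        # a dual-path block maintains a ResNet-style residual stream (summed,
        # width num_1x1_c) alongside a DenseNet-style dense stream that grows
        # by `inc` channels per block; both streams are returned as a list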
kw = 3
kh = 3
pw = (kw - 1) // 2
ph = (kh - 1) // 2
        # the block type selects the 3x3 stride and whether a projection
        # shortcut is needed to match dimensions
        if _type == 'proj':
            key_stride = 1
            has_proj = True
        if _type == 'down':
            key_stride = 2
            has_proj = True
        if _type == 'normal':
            key_stride = 1
            has_proj = False
# PROJ
        if isinstance(data, list):
data_in = fluid.layers.concat([data[0], data[1]], axis=1)
else:
data_in = data
if has_proj:
c1x1_w = self.bn_ac_conv(
data=data_in,
num_filter=(num_1x1_c + 2 * inc),
kernel=(1, 1),
pad=(0, 0),
stride=(key_stride, key_stride),
name=name + "_match")
data_o1, data_o2 = fluid.layers.split(
c1x1_w,
num_or_sections=[num_1x1_c, 2 * inc],
dim=1,
name=name + "_match_conv_Slice")
else:
data_o1 = data[0]
data_o2 = data[1]
# MAIN
c1x1_a = self.bn_ac_conv(
data=data_in,
num_filter=num_1x1_a,
kernel=(1, 1),
pad=(0, 0),
name=name + "_conv1")
c3x3_b = self.bn_ac_conv(
data=c1x1_a,
num_filter=num_3x3_b,
kernel=(kw, kh),
pad=(pw, ph),
stride=(key_stride, key_stride),
num_group=G,
name=name + "_conv2")
c1x1_c = self.bn_ac_conv(
data=c3x3_b,
num_filter=(num_1x1_c + inc),
kernel=(1, 1),
pad=(0, 0),
name=name + "_conv3")
c1x1_c1, c1x1_c2 = fluid.layers.split(
c1x1_c,
num_or_sections=[num_1x1_c, inc],
dim=1,
name=name + "_conv3_Slice")
# OUTPUTS
summ = fluid.layers.elementwise_add(
x=data_o1, y=c1x1_c1, name=name + "_elewise")
dense = fluid.layers.concat(
[data_o2, c1x1_c2], axis=1, name=name + "_concat")
return [summ, dense]
def bn_ac_conv(self,
data,
num_filter,
kernel,
pad,
stride=(1, 1),
num_group=1,
name=None):
bn_ac = fluid.layers.batch_norm(
input=data,
act='relu',
is_test=False,
name=name + '.output.1',
param_attr=ParamAttr(name=name + '_bn_scale'),
bias_attr=ParamAttr(name + '_bn_offset'),
moving_mean_name=name + '_bn_mean',
moving_variance_name=name + '_bn_variance', )
bn_ac_conv = fluid.layers.conv2d(
input=bn_ac,
num_filters=num_filter,
filter_size=kernel,
stride=stride,
padding=pad,
groups=num_group,
act=None,
bias_attr=False,
param_attr=ParamAttr(name=name + "_weights"))
return bn_ac_conv
def DPN68():
model = DPN(layers=68)
return model
def DPN92():
model = DPN(layers=92)
return model
def DPN98():
model = DPN(layers=98)
return model
def DPN107():
model = DPN(layers=107)
return model
def DPN131():
model = DPN(layers=131)
return model
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import re
import math
import copy
import paddle.fluid as fluid
from .layers import conv2d, init_batch_norm_layer, init_fc_layer
__all__ = [
'EfficientNet', 'EfficientNetB0', 'EfficientNetB1', 'EfficientNetB2',
'EfficientNetB3', 'EfficientNetB4', 'EfficientNetB5', 'EfficientNetB6',
'EfficientNetB7'
]
GlobalParams = collections.namedtuple('GlobalParams', [
'batch_norm_momentum',
'batch_norm_epsilon',
'dropout_rate',
'num_classes',
'width_coefficient',
'depth_coefficient',
'depth_divisor',
'min_depth',
'drop_connect_rate',
])
BlockArgs = collections.namedtuple('BlockArgs', [
'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
'expand_ratio', 'id_skip', 'stride', 'se_ratio'
])
GlobalParams.__new__.__defaults__ = (None, ) * len(GlobalParams._fields)
BlockArgs.__new__.__defaults__ = (None, ) * len(BlockArgs._fields)
def efficientnet_params(model_name):
""" Map EfficientNet model name to parameter coefficients. """
params_dict = {
# Coefficients: width,depth,resolution,dropout
'efficientnet-b0': (1.0, 1.0, 224, 0.2),
'efficientnet-b1': (1.0, 1.1, 240, 0.2),
'efficientnet-b2': (1.1, 1.2, 260, 0.3),
'efficientnet-b3': (1.2, 1.4, 300, 0.3),
'efficientnet-b4': (1.4, 1.8, 380, 0.4),
'efficientnet-b5': (1.6, 2.2, 456, 0.4),
'efficientnet-b6': (1.8, 2.6, 528, 0.5),
'efficientnet-b7': (2.0, 3.1, 600, 0.5),
}
return params_dict[model_name]
def efficientnet(width_coefficient=None,
depth_coefficient=None,
dropout_rate=0.2,
drop_connect_rate=0.2):
""" Get block arguments according to parameter and coefficients. """
blocks_args = [
'r1_k3_s11_e1_i32_o16_se0.25',
'r2_k3_s22_e6_i16_o24_se0.25',
'r2_k5_s22_e6_i24_o40_se0.25',
'r3_k3_s22_e6_i40_o80_se0.25',
'r3_k5_s11_e6_i80_o112_se0.25',
'r4_k5_s22_e6_i112_o192_se0.25',
'r1_k3_s11_e6_i192_o320_se0.25',
]
blocks_args = BlockDecoder.decode(blocks_args)
global_params = GlobalParams(
batch_norm_momentum=0.99,
batch_norm_epsilon=1e-3,
dropout_rate=dropout_rate,
drop_connect_rate=drop_connect_rate,
num_classes=1000,
width_coefficient=width_coefficient,
depth_coefficient=depth_coefficient,
depth_divisor=8,
min_depth=None)
return blocks_args, global_params
def get_model_params(model_name, override_params):
""" Get the block args and global params for a given model """
if model_name.startswith('efficientnet'):
w, d, _, p = efficientnet_params(model_name)
blocks_args, global_params = efficientnet(
width_coefficient=w, depth_coefficient=d, dropout_rate=p)
else:
raise NotImplementedError('model name is not pre-defined: %s' %
model_name)
if override_params:
global_params = global_params._replace(**override_params)
return blocks_args, global_params
def round_filters(filters, global_params):
""" Calculate and round number of filters based on depth multiplier. """
multiplier = global_params.width_coefficient
if not multiplier:
return filters
divisor = global_params.depth_divisor
min_depth = global_params.min_depth
filters *= multiplier
min_depth = min_depth or divisor
new_filters = max(min_depth,
int(filters + divisor / 2) // divisor * divisor)
if new_filters < 0.9 * filters: # prevent rounding by more than 10%
new_filters += divisor
return int(new_filters)
def round_repeats(repeats, global_params):
""" Round number of filters based on depth multiplier. """
multiplier = global_params.depth_coefficient
if not multiplier:
return repeats
return int(math.ceil(multiplier * repeats))
class EfficientNet():
def __init__(self,
name='b0',
padding_type='SAME',
override_params=None,
is_test=False,
use_se=True):
valid_names = ['b' + str(i) for i in range(8)]
        assert name in valid_names, 'EfficientNet name should be in b0~b7'
model_name = 'efficientnet-' + name
self._blocks_args, self._global_params = get_model_params(
model_name, override_params)
self._bn_mom = self._global_params.batch_norm_momentum
self._bn_eps = self._global_params.batch_norm_epsilon
self.is_test = is_test
self.padding_type = padding_type
self.use_se = use_se
def net(self, input, class_dim=1000, is_test=False):
conv = self.extract_features(input, is_test=is_test)
out_channels = round_filters(1280, self._global_params)
conv = self.conv_bn_layer(
conv,
num_filters=out_channels,
filter_size=1,
bn_act='swish',
bn_mom=self._bn_mom,
bn_eps=self._bn_eps,
padding_type=self.padding_type,
name='',
conv_name='_conv_head',
bn_name='_bn1')
pool = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True, use_cudnn=False)
if self._global_params.dropout_rate:
pool = fluid.layers.dropout(
pool,
self._global_params.dropout_rate,
dropout_implementation='upscale_in_train')
param_attr, bias_attr = init_fc_layer(class_dim, '_fc')
out = fluid.layers.fc(pool,
class_dim,
name='_fc',
param_attr=param_attr,
bias_attr=bias_attr)
return out
def _drop_connect(self, inputs, prob, is_test):
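        # stochastic depth ("drop connect"): with probability `prob`, zero an
        # entire sample's residual branch; survivors are rescaled by
        # 1 / keep_prob so the expected activation matches inference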
if is_test:
return inputs
keep_prob = 1.0 - prob
random_tensor = keep_prob + fluid.layers.uniform_random_batch_size_like(
inputs, [-1, 1, 1, 1], min=0., max=1.)
binary_tensor = fluid.layers.floor(random_tensor)
output = inputs / keep_prob * binary_tensor
return output
def _expand_conv_norm(self, inputs, block_args, is_test, name=None):
# Expansion phase
oup = block_args.input_filters * block_args.expand_ratio # number of output channels
if block_args.expand_ratio != 1:
conv = self.conv_bn_layer(
inputs,
num_filters=oup,
filter_size=1,
bn_act=None,
bn_mom=self._bn_mom,
bn_eps=self._bn_eps,
padding_type=self.padding_type,
name=name,
conv_name=name + '_expand_conv',
bn_name='_bn0')
return conv
def _depthwise_conv_norm(self, inputs, block_args, is_test, name=None):
k = block_args.kernel_size
s = block_args.stride
if isinstance(s, list) or isinstance(s, tuple):
s = s[0]
oup = block_args.input_filters * block_args.expand_ratio # number of output channels
conv = self.conv_bn_layer(
inputs,
num_filters=oup,
filter_size=k,
stride=s,
num_groups=oup,
bn_act=None,
padding_type=self.padding_type,
bn_mom=self._bn_mom,
bn_eps=self._bn_eps,
name=name,
use_cudnn=False,
conv_name=name + '_depthwise_conv',
bn_name='_bn1')
return conv
def _project_conv_norm(self, inputs, block_args, is_test, name=None):
final_oup = block_args.output_filters
conv = self.conv_bn_layer(
inputs,
num_filters=final_oup,
filter_size=1,
bn_act=None,
padding_type=self.padding_type,
bn_mom=self._bn_mom,
bn_eps=self._bn_eps,
name=name,
conv_name=name + '_project_conv',
bn_name='_bn2')
return conv
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride=1,
num_groups=1,
padding_type="SAME",
conv_act=None,
bn_act='swish',
use_cudnn=True,
use_bn=True,
bn_mom=0.9,
bn_eps=1e-05,
use_bias=False,
name=None,
conv_name=None,
bn_name=None):
conv = conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
groups=num_groups,
act=conv_act,
padding_type=padding_type,
use_cudnn=use_cudnn,
name=conv_name,
use_bias=use_bias)
        if not use_bn:
return conv
else:
bn_name = name + bn_name
param_attr, bias_attr = init_batch_norm_layer(bn_name)
return fluid.layers.batch_norm(
input=conv,
act=bn_act,
momentum=bn_mom,
epsilon=bn_eps,
name=bn_name,
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance',
param_attr=param_attr,
bias_attr=bias_attr)
def _conv_stem_norm(self, inputs, is_test):
out_channels = round_filters(32, self._global_params)
bn = self.conv_bn_layer(
inputs,
num_filters=out_channels,
filter_size=3,
stride=2,
bn_act=None,
bn_mom=self._bn_mom,
padding_type=self.padding_type,
bn_eps=self._bn_eps,
name='',
conv_name='_conv_stem',
bn_name='_bn0')
return bn
def mb_conv_block(self,
inputs,
block_args,
is_test=False,
drop_connect_rate=None,
name=None):
# Expansion and Depthwise Convolution
oup = block_args.input_filters * block_args.expand_ratio # number of output channels
has_se = self.use_se and (block_args.se_ratio is not None) and (
0 < block_args.se_ratio <= 1)
id_skip = block_args.id_skip # skip connection and drop connect
conv = inputs
if block_args.expand_ratio != 1:
conv = fluid.layers.swish(
self._expand_conv_norm(conv, block_args, is_test, name))
conv = fluid.layers.swish(
self._depthwise_conv_norm(conv, block_args, is_test, name))
# Squeeze and Excitation
if has_se:
num_squeezed_channels = max(
1, int(block_args.input_filters * block_args.se_ratio))
conv = self.se_block(conv, num_squeezed_channels, oup, name)
conv = self._project_conv_norm(conv, block_args, is_test, name)
# Skip connection and drop connect
input_filters, output_filters = block_args.input_filters, block_args.output_filters
if id_skip and block_args.stride == 1 and input_filters == output_filters:
if drop_connect_rate:
conv = self._drop_connect(conv, drop_connect_rate,
self.is_test)
conv = fluid.layers.elementwise_add(conv, inputs)
return conv
def se_block(self, inputs, num_squeezed_channels, oup, name):
x_squeezed = fluid.layers.pool2d(
input=inputs,
pool_type='avg',
global_pooling=True,
use_cudnn=False)
x_squeezed = conv2d(
x_squeezed,
num_filters=num_squeezed_channels,
filter_size=1,
use_bias=True,
padding_type=self.padding_type,
act='swish',
name=name + '_se_reduce')
x_squeezed = conv2d(
x_squeezed,
num_filters=oup,
filter_size=1,
use_bias=True,
padding_type=self.padding_type,
name=name + '_se_expand')
se_out = inputs * fluid.layers.sigmoid(x_squeezed)
return se_out
def extract_features(self, inputs, is_test):
""" Returns output of the final convolution layer """
conv = fluid.layers.swish(
self._conv_stem_norm(
inputs, is_test=is_test))
block_args_copy = copy.deepcopy(self._blocks_args)
idx = 0
block_size = 0
for block_arg in block_args_copy:
block_arg = block_arg._replace(
input_filters=round_filters(block_arg.input_filters,
self._global_params),
output_filters=round_filters(block_arg.output_filters,
self._global_params),
num_repeat=round_repeats(block_arg.num_repeat,
self._global_params))
block_size += 1
for _ in range(block_arg.num_repeat - 1):
block_size += 1
for block_args in self._blocks_args:
# Update block input and output filters based on depth multiplier.
block_args = block_args._replace(
input_filters=round_filters(block_args.input_filters,
self._global_params),
output_filters=round_filters(block_args.output_filters,
self._global_params),
num_repeat=round_repeats(block_args.num_repeat,
self._global_params))
# The first block needs to take care of stride and filter size increase.
drop_connect_rate = self._global_params.drop_connect_rate
if drop_connect_rate:
drop_connect_rate *= float(idx) / block_size
conv = self.mb_conv_block(conv, block_args, is_test,
drop_connect_rate,
'_blocks.' + str(idx) + '.')
idx += 1
if block_args.num_repeat > 1:
block_args = block_args._replace(
input_filters=block_args.output_filters, stride=1)
for _ in range(block_args.num_repeat - 1):
drop_connect_rate = self._global_params.drop_connect_rate
if drop_connect_rate:
drop_connect_rate *= float(idx) / block_size
conv = self.mb_conv_block(conv, block_args, is_test,
drop_connect_rate,
'_blocks.' + str(idx) + '.')
idx += 1
return conv
def shortcut(self, input, data_residual):
return fluid.layers.elementwise_add(input, data_residual)
class BlockDecoder(object):
""" Block Decoder for readability, straight from the official TensorFlow repository """
@staticmethod
def _decode_block_string(block_string):
""" Gets a block through a string notation of arguments. """
assert isinstance(block_string, str)
ops = block_string.split('_')
options = {}
for op in ops:
splits = re.split(r'(\d.*)', op)
if len(splits) >= 2:
key, value = splits[:2]
options[key] = value
# Check stride
assert (
('s' in options and len(options['s']) == 1) or
(len(options['s']) == 2 and options['s'][0] == options['s'][1]))
return BlockArgs(
kernel_size=int(options['k']),
num_repeat=int(options['r']),
input_filters=int(options['i']),
output_filters=int(options['o']),
expand_ratio=int(options['e']),
id_skip=('noskip' not in block_string),
se_ratio=float(options['se']) if 'se' in options else None,
stride=[int(options['s'][0])])
@staticmethod
def _encode_block_string(block):
"""Encodes a block to a string."""
        # `stride` is stored as a one-element list by _decode_block_string,
        # so the same digit is emitted twice
        args = [
            'r%d' % block.num_repeat, 'k%d' % block.kernel_size, 's%d%d' %
            (block.stride[0], block.stride[0]), 'e%s' % block.expand_ratio,
            'i%d' % block.input_filters, 'o%d' % block.output_filters
        ]
        if block.se_ratio is not None and 0 < block.se_ratio <= 1:
            args.append('se%s' % block.se_ratio)
        if block.id_skip is False:
            args.append('noskip')
return '_'.join(args)
@staticmethod
def decode(string_list):
"""
Decodes a list of string notations to specify blocks inside the network.
:param string_list: a list of strings, each string is a notation of block
:return: a list of BlockArgs namedtuples of block args
"""
assert isinstance(string_list, list)
blocks_args = []
for block_string in string_list:
blocks_args.append(BlockDecoder._decode_block_string(block_string))
return blocks_args
@staticmethod
def encode(blocks_args):
"""
Encodes a list of BlockArgs to a list of strings.
:param blocks_args: a list of BlockArgs namedtuples of block args
:return: a list of strings, each string is a notation of block
"""
block_strings = []
for block in blocks_args:
block_strings.append(BlockDecoder._encode_block_string(block))
return block_strings
def EfficientNetB0(is_test=False,
padding_type='SAME',
override_params=None,
use_se=True):
model = EfficientNet(
name='b0',
is_test=is_test,
padding_type=padding_type,
override_params=override_params,
use_se=use_se)
return model
def EfficientNetB1(is_test=False,
padding_type='SAME',
override_params=None,
use_se=True):
model = EfficientNet(
name='b1',
is_test=is_test,
padding_type=padding_type,
override_params=override_params,
use_se=use_se)
return model
def EfficientNetB2(is_test=False,
padding_type='SAME',
override_params=None,
use_se=True):
model = EfficientNet(
name='b2',
is_test=is_test,
padding_type=padding_type,
override_params=override_params,
use_se=use_se)
return model
def EfficientNetB3(is_test=False,
padding_type='SAME',
override_params=None,
use_se=True):
model = EfficientNet(
name='b3',
is_test=is_test,
padding_type=padding_type,
override_params=override_params,
use_se=use_se)
return model
def EfficientNetB4(is_test=False,
padding_type='SAME',
override_params=None,
use_se=True):
model = EfficientNet(
name='b4',
is_test=is_test,
padding_type=padding_type,
override_params=override_params,
use_se=use_se)
return model
def EfficientNetB5(is_test=False,
padding_type='SAME',
override_params=None,
use_se=True):
model = EfficientNet(
name='b5',
is_test=is_test,
padding_type=padding_type,
override_params=override_params,
use_se=use_se)
return model
def EfficientNetB6(is_test=False,
padding_type='SAME',
override_params=None,
use_se=True):
model = EfficientNet(
name='b6',
is_test=is_test,
padding_type=padding_type,
override_params=override_params,
use_se=use_se)
return model
def EfficientNetB7(is_test=False,
padding_type='SAME',
override_params=None,
use_se=True):
model = EfficientNet(
name='b7',
is_test=is_test,
padding_type=padding_type,
override_params=override_params,
use_se=use_se)
return model
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = ['GoogLeNet']
class GoogLeNet():
def __init__(self):
pass
def conv_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
channels = input.shape[1]
stdv = (3.0 / (filter_size**2 * channels))**0.5
param_attr = ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=name + "_weights")
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=act,
param_attr=param_attr,
bias_attr=False,
name=name)
return conv
def xavier(self, channels, filter_size, name):
stdv = (3.0 / (filter_size**2 * channels))**0.5
param_attr = ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=name + "_weights")
return param_attr
def inception(self,
input,
channels,
filter1,
filter3R,
filter3,
filter5R,
filter5,
proj,
name=None):
conv1 = self.conv_layer(
input=input,
num_filters=filter1,
filter_size=1,
stride=1,
act=None,
name="inception_" + name + "_1x1")
conv3r = self.conv_layer(
input=input,
num_filters=filter3R,
filter_size=1,
stride=1,
act=None,
name="inception_" + name + "_3x3_reduce")
conv3 = self.conv_layer(
input=conv3r,
num_filters=filter3,
filter_size=3,
stride=1,
act=None,
name="inception_" + name + "_3x3")
conv5r = self.conv_layer(
input=input,
num_filters=filter5R,
filter_size=1,
stride=1,
act=None,
name="inception_" + name + "_5x5_reduce")
conv5 = self.conv_layer(
input=conv5r,
num_filters=filter5,
filter_size=5,
stride=1,
act=None,
name="inception_" + name + "_5x5")
pool = fluid.layers.pool2d(
input=input,
pool_size=3,
pool_stride=1,
pool_padding=1,
pool_type='max')
convprj = fluid.layers.conv2d(
input=pool,
filter_size=1,
num_filters=proj,
stride=1,
padding=0,
name="inception_" + name + "_3x3_proj",
param_attr=ParamAttr(
name="inception_" + name + "_3x3_proj_weights"),
bias_attr=False)
cat = fluid.layers.concat(input=[conv1, conv3, conv5, convprj], axis=1)
cat = fluid.layers.relu(cat)
return cat
def net(self, input, class_dim=1000):
conv = self.conv_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act=None,
name="conv1")
pool = fluid.layers.pool2d(
input=conv, pool_size=3, pool_type='max', pool_stride=2)
conv = self.conv_layer(
input=pool,
num_filters=64,
filter_size=1,
stride=1,
act=None,
name="conv2_1x1")
conv = self.conv_layer(
input=conv,
num_filters=192,
filter_size=3,
stride=1,
act=None,
name="conv2_3x3")
pool = fluid.layers.pool2d(
input=conv, pool_size=3, pool_type='max', pool_stride=2)
ince3a = self.inception(pool, 192, 64, 96, 128, 16, 32, 32, "ince3a")
ince3b = self.inception(ince3a, 256, 128, 128, 192, 32, 96, 64,
"ince3b")
pool3 = fluid.layers.pool2d(
input=ince3b, pool_size=3, pool_type='max', pool_stride=2)
ince4a = self.inception(pool3, 480, 192, 96, 208, 16, 48, 64, "ince4a")
ince4b = self.inception(ince4a, 512, 160, 112, 224, 24, 64, 64,
"ince4b")
ince4c = self.inception(ince4b, 512, 128, 128, 256, 24, 64, 64,
"ince4c")
ince4d = self.inception(ince4c, 512, 112, 144, 288, 32, 64, 64,
"ince4d")
ince4e = self.inception(ince4d, 528, 256, 160, 320, 32, 128, 128,
"ince4e")
pool4 = fluid.layers.pool2d(
input=ince4e, pool_size=3, pool_type='max', pool_stride=2)
ince5a = self.inception(pool4, 832, 256, 160, 320, 32, 128, 128,
"ince5a")
ince5b = self.inception(ince5a, 832, 384, 192, 384, 48, 128, 128,
"ince5b")
pool5 = fluid.layers.pool2d(
input=ince5b, pool_size=7, pool_type='avg', pool_stride=7)
dropout = fluid.layers.dropout(x=pool5, dropout_prob=0.4)
out = fluid.layers.fc(input=dropout,
size=class_dim,
act='softmax',
param_attr=self.xavier(1024, 1, "out"),
name="out",
bias_attr=ParamAttr(name="out_offset"))
pool_o1 = fluid.layers.pool2d(
input=ince4a, pool_size=5, pool_type='avg', pool_stride=3)
conv_o1 = self.conv_layer(
input=pool_o1,
num_filters=128,
filter_size=1,
stride=1,
act=None,
name="conv_o1")
fc_o1 = fluid.layers.fc(input=conv_o1,
size=1024,
act='relu',
param_attr=self.xavier(2048, 1, "fc_o1"),
name="fc_o1",
bias_attr=ParamAttr(name="fc_o1_offset"))
dropout_o1 = fluid.layers.dropout(x=fc_o1, dropout_prob=0.7)
out1 = fluid.layers.fc(input=dropout_o1,
size=class_dim,
act='softmax',
param_attr=self.xavier(1024, 1, "out1"),
name="out1",
bias_attr=ParamAttr(name="out1_offset"))
pool_o2 = fluid.layers.pool2d(
input=ince4d, pool_size=5, pool_type='avg', pool_stride=3)
conv_o2 = self.conv_layer(
input=pool_o2,
num_filters=128,
filter_size=1,
stride=1,
act=None,
name="conv_o2")
fc_o2 = fluid.layers.fc(input=conv_o2,
size=1024,
act='relu',
param_attr=self.xavier(2048, 1, "fc_o2"),
name="fc_o2",
bias_attr=ParamAttr(name="fc_o2_offset"))
dropout_o2 = fluid.layers.dropout(x=fc_o2, dropout_prob=0.7)
out2 = fluid.layers.fc(input=dropout_o2,
size=class_dim,
act='softmax',
param_attr=self.xavier(1024, 1, "out2"),
name="out2",
bias_attr=ParamAttr(name="out2_offset"))
# last fc layer is "out"
return [out, out1, out2]
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"HRNet", "HRNet_W18_C", "HRNet_W30_C", "HRNet_W32_C", "HRNet_W40_C",
"HRNet_W44_C", "HRNet_W48_C", "HRNet_W60_C", "HRNet_W64_C",
"SE_HRNet_W18_C", "SE_HRNet_W30_C", "SE_HRNet_W32_C", "SE_HRNet_W40_C",
"SE_HRNet_W44_C", "SE_HRNet_W48_C", "SE_HRNet_W60_C", "SE_HRNet_W64_C"
]
class HRNet():
def __init__(self, width=18, has_se=False):
self.width = width
self.has_se = has_se
self.channels = {
18: [[18, 36], [18, 36, 72], [18, 36, 72, 144]],
30: [[30, 60], [30, 60, 120], [30, 60, 120, 240]],
32: [[32, 64], [32, 64, 128], [32, 64, 128, 256]],
40: [[40, 80], [40, 80, 160], [40, 80, 160, 320]],
44: [[44, 88], [44, 88, 176], [44, 88, 176, 352]],
48: [[48, 96], [48, 96, 192], [48, 96, 192, 384]],
60: [[60, 120], [60, 120, 240], [60, 120, 240, 480]],
64: [[64, 128], [64, 128, 256], [64, 128, 256, 512]]
}
def net(self, input, class_dim=1000):
width = self.width
channels_2, channels_3, channels_4 = self.channels[width]
num_modules_2, num_modules_3, num_modules_4 = 1, 4, 3
x = self.conv_bn_layer(
input=input,
filter_size=3,
num_filters=64,
stride=2,
if_act=True,
name='layer1_1')
x = self.conv_bn_layer(
input=x,
filter_size=3,
num_filters=64,
stride=2,
if_act=True,
name='layer1_2')
la1 = self.layer1(x, name='layer2')
tr1 = self.transition_layer([la1], [256], channels_2, name='tr1')
st2 = self.stage(tr1, num_modules_2, channels_2, name='st2')
tr2 = self.transition_layer(st2, channels_2, channels_3, name='tr2')
st3 = self.stage(tr2, num_modules_3, channels_3, name='st3')
tr3 = self.transition_layer(st3, channels_3, channels_4, name='tr3')
st4 = self.stage(tr3, num_modules_4, channels_4, name='st4')
#classification
last_cls = self.last_cls_out(x=st4, name='cls_head')
y = last_cls[0]
last_num_filters = [256, 512, 1024]
for i in range(3):
y = fluid.layers.elementwise_add(
last_cls[i + 1],
self.conv_bn_layer(
input=y,
filter_size=3,
num_filters=last_num_filters[i],
stride=2,
name='cls_head_add' + str(i + 1)))
y = self.conv_bn_layer(
input=y,
filter_size=1,
num_filters=2048,
stride=1,
name='cls_head_last_conv')
pool = fluid.layers.pool2d(
input=y, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=ParamAttr(
name='fc_weights',
initializer=fluid.initializer.Uniform(-stdv, stdv)),
bias_attr=ParamAttr(name='fc_offset'))
return out
def layer1(self, input, name=None):
conv = input
for i in range(4):
conv = self.bottleneck_block(
conv,
num_filters=64,
downsample=True if i == 0 else False,
name=name + '_' + str(i + 1))
return conv
def transition_layer(self, x, in_channels, out_channels, name=None):
num_in = len(in_channels)
num_out = len(out_channels)
out = []
for i in range(num_out):
if i < num_in:
if in_channels[i] != out_channels[i]:
residual = self.conv_bn_layer(
x[i],
filter_size=3,
num_filters=out_channels[i],
name=name + '_layer_' + str(i + 1))
out.append(residual)
else:
out.append(x[i])
else:
residual = self.conv_bn_layer(
x[-1],
filter_size=3,
num_filters=out_channels[i],
stride=2,
name=name + '_layer_' + str(i + 1))
out.append(residual)
return out
def branches(self, x, block_num, channels, name=None):
out = []
for i in range(len(channels)):
residual = x[i]
for j in range(block_num):
residual = self.basic_block(
residual,
channels[i],
name=name + '_branch_layer_' + str(i + 1) + '_' +
str(j + 1))
out.append(residual)
return out
def fuse_layers(self, x, channels, multi_scale_output=True, name=None):
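        # multi-resolution fusion: for output branch i, lower-resolution
        # inputs (j > i) get a 1x1 conv and nearest-neighbour upsampling,
        # higher-resolution inputs (j < i) are downsampled with repeated
        # stride-2 3x3 convs, and all branches are summed before the ReLU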
out = []
for i in range(len(channels) if multi_scale_output else 1):
residual = x[i]
for j in range(len(channels)):
if j > i:
y = self.conv_bn_layer(
x[j],
filter_size=1,
num_filters=channels[i],
if_act=False,
name=name + '_layer_' + str(i + 1) + '_' + str(j + 1))
y = fluid.layers.resize_nearest(input=y, scale=2**(j - i))
residual = fluid.layers.elementwise_add(
x=residual, y=y, act=None)
elif j < i:
y = x[j]
for k in range(i - j):
if k == i - j - 1:
y = self.conv_bn_layer(
y,
filter_size=3,
num_filters=channels[i],
stride=2,
if_act=False,
name=name + '_layer_' + str(i + 1) + '_' +
str(j + 1) + '_' + str(k + 1))
else:
y = self.conv_bn_layer(
y,
filter_size=3,
num_filters=channels[j],
stride=2,
name=name + '_layer_' + str(i + 1) + '_' +
str(j + 1) + '_' + str(k + 1))
residual = fluid.layers.elementwise_add(
x=residual, y=y, act=None)
residual = fluid.layers.relu(residual)
out.append(residual)
return out
def high_resolution_module(self,
x,
channels,
multi_scale_output=True,
name=None):
residual = self.branches(x, 4, channels, name=name)
out = self.fuse_layers(
residual,
channels,
multi_scale_output=multi_scale_output,
name=name)
return out
def stage(self,
x,
num_modules,
channels,
multi_scale_output=True,
name=None):
out = x
for i in range(num_modules):
            if i == num_modules - 1 and not multi_scale_output:
out = self.high_resolution_module(
out,
channels,
multi_scale_output=False,
name=name + '_' + str(i + 1))
else:
out = self.high_resolution_module(
out, channels, name=name + '_' + str(i + 1))
return out
def last_cls_out(self, x, name=None):
out = []
num_filters_list = [32, 64, 128, 256]
for i in range(len(x)):
out.append(
self.bottleneck_block(
input=x[i],
num_filters=num_filters_list[i],
name=name + 'conv_' + str(i + 1),
downsample=True))
return out
def basic_block(self,
input,
num_filters,
stride=1,
downsample=False,
name=None):
residual = input
conv = self.conv_bn_layer(
input=input,
filter_size=3,
num_filters=num_filters,
stride=stride,
name=name + '_conv1')
conv = self.conv_bn_layer(
input=conv,
filter_size=3,
num_filters=num_filters,
if_act=False,
name=name + '_conv2')
if downsample:
residual = self.conv_bn_layer(
input=input,
filter_size=1,
num_filters=num_filters,
if_act=False,
name=name + '_downsample')
if self.has_se:
conv = self.squeeze_excitation(
input=conv,
num_channels=num_filters,
reduction_ratio=16,
name=name + '_fc')
return fluid.layers.elementwise_add(x=residual, y=conv, act='relu')
def bottleneck_block(self,
input,
num_filters,
stride=1,
downsample=False,
name=None):
residual = input
conv = self.conv_bn_layer(
input=input,
filter_size=1,
num_filters=num_filters,
name=name + '_conv1')
conv = self.conv_bn_layer(
input=conv,
filter_size=3,
num_filters=num_filters,
stride=stride,
name=name + '_conv2')
conv = self.conv_bn_layer(
input=conv,
filter_size=1,
num_filters=num_filters * 4,
if_act=False,
name=name + '_conv3')
if downsample:
residual = self.conv_bn_layer(
input=input,
filter_size=1,
num_filters=num_filters * 4,
if_act=False,
name=name + '_downsample')
if self.has_se:
conv = self.squeeze_excitation(
input=conv,
num_channels=num_filters * 4,
reduction_ratio=16,
name=name + '_fc')
return fluid.layers.elementwise_add(x=residual, y=conv, act='relu')
def squeeze_excitation(self,
input,
num_channels,
reduction_ratio,
name=None):
pool = fluid.layers.pool2d(
input=input, pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
squeeze = fluid.layers.fc(
input=pool,
            # integer division: the fc size must be an int
            size=num_channels // reduction_ratio,
act='relu',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=name + '_sqz_weights'),
bias_attr=ParamAttr(name=name + '_sqz_offset'))
stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
excitation = fluid.layers.fc(
input=squeeze,
size=num_channels,
act='sigmoid',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=name + '_exc_weights'),
bias_attr=ParamAttr(name=name + '_exc_offset'))
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride=1,
padding=1,
num_groups=1,
if_act=True,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=num_groups,
act=None,
param_attr=ParamAttr(
initializer=MSRA(), name=name + '_weights'),
bias_attr=False)
bn_name = name + '_bn'
bn = fluid.layers.batch_norm(
input=conv,
param_attr=ParamAttr(
name=bn_name + "_scale",
initializer=fluid.initializer.Constant(1.0)),
bias_attr=ParamAttr(
name=bn_name + "_offset",
initializer=fluid.initializer.Constant(0.0)),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
if if_act:
bn = fluid.layers.relu(bn)
return bn
def HRNet_W18_C():
model = HRNet(width=18)
return model
def HRNet_W30_C():
model = HRNet(width=30)
return model
def HRNet_W32_C():
model = HRNet(width=32)
return model
def HRNet_W40_C():
model = HRNet(width=40)
return model
def HRNet_W44_C():
model = HRNet(width=44)
return model
def HRNet_W48_C():
model = HRNet(width=48)
return model
def HRNet_W60_C():
model = HRNet(width=60)
return model
def HRNet_W64_C():
model = HRNet(width=64)
return model
def SE_HRNet_W18_C():
model = HRNet(width=18, has_se=True)
return model
def SE_HRNet_W30_C():
model = HRNet(width=30, has_se=True)
return model
def SE_HRNet_W32_C():
model = HRNet(width=32, has_se=True)
return model
def SE_HRNet_W40_C():
model = HRNet(width=40, has_se=True)
return model
def SE_HRNet_W44_C():
model = HRNet(width=44, has_se=True)
return model
def SE_HRNet_W48_C():
model = HRNet(width=48, has_se=True)
return model
def SE_HRNet_W60_C():
model = HRNet(width=60, has_se=True)
return model
def SE_HRNet_W64_C():
model = HRNet(width=64, has_se=True)
return model
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = ['InceptionV4']
class InceptionV4():
def __init__(self):
pass
def net(self, input, class_dim=1000):
x = self.inception_stem(input)
for i in range(4):
x = self.inceptionA(x, name=str(i + 1))
x = self.reductionA(x)
for i in range(7):
x = self.inceptionB(x, name=str(i + 1))
x = self.reductionB(x)
for i in range(3):
x = self.inceptionC(x, name=str(i + 1))
pool = fluid.layers.pool2d(
input=x, pool_type='avg', global_pooling=True)
drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(
input=drop,
size=class_dim,
param_attr=ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name="final_fc_weights"),
bias_attr=ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name="final_fc_offset"))
return out
def conv_bn_layer(self,
data,
num_filters,
filter_size,
stride=1,
padding=0,
groups=1,
act='relu',
name=None):
conv = fluid.layers.conv2d(
input=data,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False,
name=name)
bn_name = name + "_bn"
return fluid.layers.batch_norm(
input=conv,
act=act,
name=bn_name,
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def inception_stem(self, data, name=None):
conv = self.conv_bn_layer(
data, 32, 3, stride=2, act='relu', name="conv1_3x3_s2")
conv = self.conv_bn_layer(conv, 32, 3, act='relu', name="conv2_3x3_s1")
conv = self.conv_bn_layer(
conv, 64, 3, padding=1, act='relu', name="conv3_3x3_s1")
pool1 = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_type='max')
conv2 = self.conv_bn_layer(
conv, 96, 3, stride=2, act='relu', name="inception_stem1_3x3_s2")
concat = fluid.layers.concat([pool1, conv2], axis=1)
conv1 = self.conv_bn_layer(
concat, 64, 1, act='relu', name="inception_stem2_3x3_reduce")
conv1 = self.conv_bn_layer(
conv1, 96, 3, act='relu', name="inception_stem2_3x3")
conv2 = self.conv_bn_layer(
concat, 64, 1, act='relu', name="inception_stem2_1x7_reduce")
conv2 = self.conv_bn_layer(
conv2,
64, (7, 1),
padding=(3, 0),
act='relu',
name="inception_stem2_1x7")
conv2 = self.conv_bn_layer(
conv2,
64, (1, 7),
padding=(0, 3),
act='relu',
name="inception_stem2_7x1")
conv2 = self.conv_bn_layer(
conv2, 96, 3, act='relu', name="inception_stem2_3x3_2")
concat = fluid.layers.concat([conv1, conv2], axis=1)
conv1 = self.conv_bn_layer(
concat,
192,
3,
stride=2,
act='relu',
name="inception_stem3_3x3_s2")
pool1 = fluid.layers.pool2d(
input=concat, pool_size=3, pool_stride=2, pool_type='max')
concat = fluid.layers.concat([conv1, pool1], axis=1)
return concat
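    # The stem above performs three branch-and-concat merges: a strided conv
    # alongside a max-pool, a 3x3 path alongside a factorized 7x7 (1x7 + 7x1)
    # path, and a final strided conv alongside a max-pool. For the canonical
    # 299x299 input this reduces the feature map to 35x35 while keeping both
    # pooled and learned downsampling paths, as in the Inception-v4 paper.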
def inceptionA(self, data, name=None):
pool1 = fluid.layers.pool2d(
input=data, pool_size=3, pool_padding=1, pool_type='avg')
conv1 = self.conv_bn_layer(
pool1, 96, 1, act='relu', name="inception_a" + name + "_1x1")
conv2 = self.conv_bn_layer(
data, 96, 1, act='relu', name="inception_a" + name + "_1x1_2")
conv3 = self.conv_bn_layer(
data, 64, 1, act='relu', name="inception_a" + name + "_3x3_reduce")
conv3 = self.conv_bn_layer(
conv3,
96,
3,
padding=1,
act='relu',
name="inception_a" + name + "_3x3")
conv4 = self.conv_bn_layer(
data,
64,
1,
act='relu',
name="inception_a" + name + "_3x3_2_reduce")
conv4 = self.conv_bn_layer(
conv4,
96,
3,
padding=1,
act='relu',
name="inception_a" + name + "_3x3_2")
conv4 = self.conv_bn_layer(
conv4,
96,
3,
padding=1,
act='relu',
name="inception_a" + name + "_3x3_3")
concat = fluid.layers.concat([conv1, conv2, conv3, conv4], axis=1)
return concat
def reductionA(self, data, name=None):
pool1 = fluid.layers.pool2d(
input=data, pool_size=3, pool_stride=2, pool_type='max')
conv2 = self.conv_bn_layer(
data, 384, 3, stride=2, act='relu', name="reduction_a_3x3")
conv3 = self.conv_bn_layer(
data, 192, 1, act='relu', name="reduction_a_3x3_2_reduce")
conv3 = self.conv_bn_layer(
conv3, 224, 3, padding=1, act='relu', name="reduction_a_3x3_2")
conv3 = self.conv_bn_layer(
conv3, 256, 3, stride=2, act='relu', name="reduction_a_3x3_3")
concat = fluid.layers.concat([pool1, conv2, conv3], axis=1)
return concat
def inceptionB(self, data, name=None):
pool1 = fluid.layers.pool2d(
input=data, pool_size=3, pool_padding=1, pool_type='avg')
conv1 = self.conv_bn_layer(
pool1, 128, 1, act='relu', name="inception_b" + name + "_1x1")
conv2 = self.conv_bn_layer(
data, 384, 1, act='relu', name="inception_b" + name + "_1x1_2")
conv3 = self.conv_bn_layer(
data,
192,
1,
act='relu',
name="inception_b" + name + "_1x7_reduce")
conv3 = self.conv_bn_layer(
conv3,
224, (1, 7),
padding=(0, 3),
act='relu',
name="inception_b" + name + "_1x7")
conv3 = self.conv_bn_layer(
conv3,
256, (7, 1),
padding=(3, 0),
act='relu',
name="inception_b" + name + "_7x1")
conv4 = self.conv_bn_layer(
data,
192,
1,
act='relu',
name="inception_b" + name + "_7x1_2_reduce")
conv4 = self.conv_bn_layer(
conv4,
192, (1, 7),
padding=(0, 3),
act='relu',
name="inception_b" + name + "_1x7_2")
conv4 = self.conv_bn_layer(
conv4,
224, (7, 1),
padding=(3, 0),
act='relu',
name="inception_b" + name + "_7x1_2")
conv4 = self.conv_bn_layer(
conv4,
224, (1, 7),
padding=(0, 3),
act='relu',
name="inception_b" + name + "_1x7_3")
conv4 = self.conv_bn_layer(
conv4,
256, (7, 1),
padding=(3, 0),
act='relu',
name="inception_b" + name + "_7x1_3")
concat = fluid.layers.concat([conv1, conv2, conv3, conv4], axis=1)
return concat
def reductionB(self, data, name=None):
pool1 = fluid.layers.pool2d(
input=data, pool_size=3, pool_stride=2, pool_type='max')
conv2 = self.conv_bn_layer(
data, 192, 1, act='relu', name="reduction_b_3x3_reduce")
conv2 = self.conv_bn_layer(
conv2, 192, 3, stride=2, act='relu', name="reduction_b_3x3")
conv3 = self.conv_bn_layer(
data, 256, 1, act='relu', name="reduction_b_1x7_reduce")
conv3 = self.conv_bn_layer(
conv3,
256, (1, 7),
padding=(0, 3),
act='relu',
name="reduction_b_1x7")
conv3 = self.conv_bn_layer(
conv3,
320, (7, 1),
padding=(3, 0),
act='relu',
name="reduction_b_7x1")
conv3 = self.conv_bn_layer(
conv3, 320, 3, stride=2, act='relu', name="reduction_b_3x3_2")
concat = fluid.layers.concat([pool1, conv2, conv3], axis=1)
return concat
def inceptionC(self, data, name=None):
pool1 = fluid.layers.pool2d(
input=data, pool_size=3, pool_padding=1, pool_type='avg')
conv1 = self.conv_bn_layer(
pool1, 256, 1, act='relu', name="inception_c" + name + "_1x1")
conv2 = self.conv_bn_layer(
data, 256, 1, act='relu', name="inception_c" + name + "_1x1_2")
conv3 = self.conv_bn_layer(
data, 384, 1, act='relu', name="inception_c" + name + "_1x1_3")
conv3_1 = self.conv_bn_layer(
conv3,
256, (1, 3),
padding=(0, 1),
act='relu',
name="inception_c" + name + "_1x3")
conv3_2 = self.conv_bn_layer(
conv3,
256, (3, 1),
padding=(1, 0),
act='relu',
name="inception_c" + name + "_3x1")
conv4 = self.conv_bn_layer(
data, 384, 1, act='relu', name="inception_c" + name + "_1x1_4")
conv4 = self.conv_bn_layer(
conv4,
448, (1, 3),
padding=(0, 1),
act='relu',
name="inception_c" + name + "_1x3_2")
conv4 = self.conv_bn_layer(
conv4,
512, (3, 1),
padding=(1, 0),
act='relu',
name="inception_c" + name + "_3x1_2")
conv4_1 = self.conv_bn_layer(
conv4,
256, (1, 3),
padding=(0, 1),
act='relu',
name="inception_c" + name + "_1x3_3")
conv4_2 = self.conv_bn_layer(
conv4,
256, (3, 1),
padding=(1, 0),
act='relu',
name="inception_c" + name + "_3x1_3")
concat = fluid.layers.concat(
[conv1, conv2, conv3_1, conv3_2, conv4_1, conv4_2], axis=1)
return concat
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import warnings
import paddle.fluid as fluid
def initial_type(name,
input,
op_type,
fan_out,
init="google",
use_bias=False,
filter_size=0,
stddev=0.02):
if init == "kaiming":
if op_type == 'conv':
fan_in = input.shape[1] * filter_size * filter_size
elif op_type == 'deconv':
fan_in = fan_out * filter_size * filter_size
else:
if len(input.shape) > 2:
fan_in = input.shape[1] * input.shape[2] * input.shape[3]
else:
fan_in = input.shape[1]
bound = 1 / math.sqrt(fan_in)
param_attr = fluid.ParamAttr(
name=name + "_weights",
initializer=fluid.initializer.Uniform(
low=-bound, high=bound))
        if use_bias:
bias_attr = fluid.ParamAttr(
name=name + '_offset',
initializer=fluid.initializer.Uniform(
low=-bound, high=bound))
else:
bias_attr = False
elif init == 'google':
n = filter_size * filter_size * fan_out
param_attr = fluid.ParamAttr(
name=name + "_weights",
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=math.sqrt(2.0 / n)))
        if use_bias:
bias_attr = fluid.ParamAttr(
name=name + "_offset",
initializer=fluid.initializer.Constant(0.0))
else:
bias_attr = False
else:
param_attr = fluid.ParamAttr(
name=name + "_weights",
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=stddev))
        if use_bias:
bias_attr = fluid.ParamAttr(
name=name + "_offset",
initializer=fluid.initializer.Constant(0.0))
else:
bias_attr = False
return param_attr, bias_attr
def cal_padding(img_size, stride, filter_size, dilation=1):
"""Calculate padding size."""
if img_size % stride == 0:
out_size = max(filter_size - stride, 0)
else:
out_size = max(filter_size - (img_size % stride), 0)
return out_size // 2, out_size - out_size // 2
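# Worked example: cal_padding(img_size=224, stride=2, filter_size=3) hits the
# img_size % stride == 0 branch, so out_size = max(3 - 2, 0) = 1 and the
# result is (0, 1) -- TensorFlow-style SAME padding split across both sides.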
def init_batch_norm_layer(name="batch_norm"):
param_attr = fluid.ParamAttr(
name=name + '_scale', initializer=fluid.initializer.Constant(1.0))
bias_attr = fluid.ParamAttr(
name=name + '_offset',
initializer=fluid.initializer.Constant(value=0.0))
return param_attr, bias_attr
def init_fc_layer(fout, name='fc'):
n = fout # fan-out
init_range = 1.0 / math.sqrt(n)
param_attr = fluid.ParamAttr(
name=name + '_weights',
initializer=fluid.initializer.UniformInitializer(
low=-init_range, high=init_range))
bias_attr = fluid.ParamAttr(
name=name + '_offset',
initializer=fluid.initializer.Constant(value=0.0))
return param_attr, bias_attr
def norm_layer(input, norm_type='batch_norm', name=None):
if norm_type == 'batch_norm':
param_attr = fluid.ParamAttr(
name=name + '_weights',
initializer=fluid.initializer.Constant(1.0))
bias_attr = fluid.ParamAttr(
name=name + '_offset',
initializer=fluid.initializer.Constant(value=0.0))
return fluid.layers.batch_norm(
input,
param_attr=param_attr,
bias_attr=bias_attr,
moving_mean_name=name + '_mean',
moving_variance_name=name + '_variance')
elif norm_type == 'instance_norm':
helper = fluid.layer_helper.LayerHelper("instance_norm", **locals())
dtype = helper.input_dtype()
epsilon = 1e-5
mean = fluid.layers.reduce_mean(input, dim=[2, 3], keep_dim=True)
var = fluid.layers.reduce_mean(
fluid.layers.square(input - mean), dim=[2, 3], keep_dim=True)
if name is not None:
scale_name = name + "_scale"
offset_name = name + "_offset"
scale_param = fluid.ParamAttr(
name=scale_name,
initializer=fluid.initializer.Constant(1.0),
trainable=True)
offset_param = fluid.ParamAttr(
name=offset_name,
initializer=fluid.initializer.Constant(0.0),
trainable=True)
scale = helper.create_parameter(
attr=scale_param, shape=input.shape[1:2], dtype=dtype)
offset = helper.create_parameter(
attr=offset_param, shape=input.shape[1:2], dtype=dtype)
tmp = fluid.layers.elementwise_mul(x=(input - mean), y=scale, axis=1)
tmp = tmp / fluid.layers.sqrt(var + epsilon)
tmp = fluid.layers.elementwise_add(tmp, offset, axis=1)
return tmp
else:
        raise NotImplementedError("norm type: [%s] is not supported" % norm_type)
def conv2d(input,
num_filters=64,
filter_size=7,
stride=1,
stddev=0.02,
padding=0,
groups=None,
name="conv2d",
norm=None,
act=None,
relufactor=0.0,
use_bias=False,
padding_type=None,
initial="normal",
use_cudnn=True):
    if padding != 0 and padding_type is not None:
        warnings.warn(
            'padding value and padding_type are both set; the final padding '
            'height and width are determined by padding_type')
param_attr, bias_attr = initial_type(
name=name,
input=input,
op_type='conv',
fan_out=num_filters,
init=initial,
use_bias=use_bias,
filter_size=filter_size,
stddev=stddev)
def get_padding(filter_size, stride=1, dilation=1):
padding = ((stride - 1) + dilation * (filter_size - 1)) // 2
return padding
need_crop = False
if padding_type == "SAME":
        top_padding, bottom_padding = cal_padding(input.shape[2], stride,
                                                  filter_size)
        # width padding comes from the width dimension (shape[3]); the
        # original used shape[2], which is only correct for square inputs
        left_padding, right_padding = cal_padding(input.shape[3], stride,
                                                  filter_size)
height_padding = bottom_padding
width_padding = right_padding
if top_padding != bottom_padding or left_padding != right_padding:
height_padding = top_padding + stride
width_padding = left_padding + stride
need_crop = True
padding = [height_padding, width_padding]
elif padding_type == "VALID":
height_padding = 0
width_padding = 0
padding = [height_padding, width_padding]
elif padding_type == "DYNAMIC":
padding = get_padding(filter_size, stride)
    else:
        # keep the numeric padding passed by the caller
        pass
conv = fluid.layers.conv2d(
input,
num_filters,
filter_size,
groups=groups,
name=name,
stride=stride,
padding=padding,
use_cudnn=use_cudnn,
param_attr=param_attr,
bias_attr=bias_attr)
if need_crop:
conv = conv[:, :, 1:, 1:]
if norm is not None:
conv = norm_layer(input=conv, norm_type=norm, name=name + "_norm")
if act == 'relu':
conv = fluid.layers.relu(conv, name=name + '_relu')
elif act == 'leaky_relu':
conv = fluid.layers.leaky_relu(
conv, alpha=relufactor, name=name + '_leaky_relu')
elif act == 'tanh':
conv = fluid.layers.tanh(conv, name=name + '_tanh')
elif act == 'sigmoid':
conv = fluid.layers.sigmoid(conv, name=name + '_sigmoid')
elif act == 'swish':
conv = fluid.layers.swish(conv, name=name + '_swish')
    elif act is None:
        pass
    else:
        raise NotImplementedError("activation: [%s] is not supported" % act)
return conv
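# A hedged usage sketch of the conv2d wrapper above (argument values are
# illustrative, not taken from this repo):
#
#   y = conv2d(x, num_filters=64, filter_size=3, stride=2,
#              padding_type="SAME", norm="batch_norm", act="relu",
#              name="stem_conv")
#
# With padding_type="SAME" the padding is derived from the input shape; when
# it would be asymmetric, one extra row/column is padded and then cropped off
# again (need_crop), mimicking TensorFlow SAME semantics.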
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
__all__ = [
'MobileNetV1', 'MobileNetV1_x0_25', 'MobileNetV1_x0_5', 'MobileNetV1_x1_0',
'MobileNetV1_x0_75'
]
class MobileNetV1():
def __init__(self, scale=1.0):
self.scale = scale
def net(self, input, class_dim=1000):
scale = self.scale
# conv1: 112x112
input = self.conv_bn_layer(
input,
filter_size=3,
channels=3,
num_filters=int(32 * scale),
stride=2,
padding=1,
name="conv1")
# 56x56
input = self.depthwise_separable(
input,
num_filters1=32,
num_filters2=64,
num_groups=32,
stride=1,
scale=scale,
name="conv2_1")
input = self.depthwise_separable(
input,
num_filters1=64,
num_filters2=128,
num_groups=64,
stride=2,
scale=scale,
name="conv2_2")
# 28x28
input = self.depthwise_separable(
input,
num_filters1=128,
num_filters2=128,
num_groups=128,
stride=1,
scale=scale,
name="conv3_1")
input = self.depthwise_separable(
input,
num_filters1=128,
num_filters2=256,
num_groups=128,
stride=2,
scale=scale,
name="conv3_2")
# 14x14
input = self.depthwise_separable(
input,
num_filters1=256,
num_filters2=256,
num_groups=256,
stride=1,
scale=scale,
name="conv4_1")
input = self.depthwise_separable(
input,
num_filters1=256,
num_filters2=512,
num_groups=256,
stride=2,
scale=scale,
name="conv4_2")
# 14x14
for i in range(5):
input = self.depthwise_separable(
input,
num_filters1=512,
num_filters2=512,
num_groups=512,
stride=1,
scale=scale,
name="conv5" + "_" + str(i + 1))
# 7x7
input = self.depthwise_separable(
input,
num_filters1=512,
num_filters2=1024,
num_groups=512,
stride=2,
scale=scale,
name="conv5_6")
input = self.depthwise_separable(
input,
num_filters1=1024,
num_filters2=1024,
num_groups=1024,
stride=1,
scale=scale,
name="conv6")
input = fluid.layers.pool2d(
input=input, pool_type='avg', global_pooling=True)
output = fluid.layers.fc(input=input,
size=class_dim,
param_attr=ParamAttr(
initializer=MSRA(), name="fc7_weights"),
bias_attr=ParamAttr(name="fc7_offset"))
return output
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride,
padding,
channels=None,
num_groups=1,
act='relu',
use_cudnn=True,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(
initializer=MSRA(), name=name + "_weights"),
bias_attr=False)
bn_name = name + "_bn"
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def depthwise_separable(self,
input,
num_filters1,
num_filters2,
num_groups,
stride,
scale,
name=None):
depthwise_conv = self.conv_bn_layer(
input=input,
filter_size=3,
num_filters=int(num_filters1 * scale),
stride=stride,
padding=1,
num_groups=int(num_groups * scale),
use_cudnn=False,
name=name + "_dw")
pointwise_conv = self.conv_bn_layer(
input=depthwise_conv,
filter_size=1,
num_filters=int(num_filters2 * scale),
stride=1,
padding=0,
name=name + "_sep")
return pointwise_conv
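    # This factorization is the core MobileNet-v1 idea: a KxK standard conv
    # costs K*K*Cin*Cout multiply-adds per position, while a KxK depthwise
    # conv (K*K*Cin) followed by a 1x1 pointwise conv (Cin*Cout) costs a
    # fraction 1/Cout + 1/(K*K) of that -- roughly 8-9x cheaper for K=3.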
def MobileNetV1_x0_25():
model = MobileNetV1(scale=0.25)
return model
def MobileNetV1_x0_5():
model = MobileNetV1(scale=0.5)
return model
def MobileNetV1_x1_0():
model = MobileNetV1(scale=1.0)
return model
def MobileNetV1_x0_75():
model = MobileNetV1(scale=0.75)
return model
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
__all__ = [
    'MobileNetV2_x0_25', 'MobileNetV2_x0_5',
'MobileNetV2_x0_75', 'MobileNetV2_x1_0', 'MobileNetV2_x1_5',
'MobileNetV2_x2_0', 'MobileNetV2'
]
class MobileNetV2():
def __init__(self, scale=1.0):
self.scale = scale
def net(self, input, class_dim=1000):
scale = self.scale
bottleneck_params_list = [
(1, 16, 1, 1),
(6, 24, 2, 2),
(6, 32, 3, 2),
(6, 64, 4, 2),
(6, 96, 3, 1),
(6, 160, 3, 2),
(6, 320, 1, 1),
]
#conv1
input = self.conv_bn_layer(
input,
num_filters=int(32 * scale),
filter_size=3,
stride=2,
padding=1,
if_act=True,
name='conv1_1')
# bottleneck sequences
i = 1
in_c = int(32 * scale)
for layer_setting in bottleneck_params_list:
t, c, n, s = layer_setting
i += 1
input = self.invresi_blocks(
input=input,
in_c=in_c,
t=t,
c=int(c * scale),
n=n,
s=s,
name='conv' + str(i))
in_c = int(c * scale)
#last_conv
input = self.conv_bn_layer(
input=input,
num_filters=int(1280 * scale) if scale > 1.0 else 1280,
filter_size=1,
stride=1,
padding=0,
if_act=True,
name='conv9')
input = fluid.layers.pool2d(
input=input, pool_type='avg', global_pooling=True)
output = fluid.layers.fc(input=input,
size=class_dim,
param_attr=ParamAttr(name='fc10_weights'),
bias_attr=ParamAttr(name='fc10_offset'))
return output
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride,
padding,
channels=None,
num_groups=1,
if_act=True,
name=None,
use_cudnn=True):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(name=name + '_weights'),
bias_attr=False)
bn_name = name + '_bn'
bn = fluid.layers.batch_norm(
input=conv,
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
if if_act:
return fluid.layers.relu6(bn)
else:
return bn
def shortcut(self, input, data_residual):
return fluid.layers.elementwise_add(input, data_residual)
def inverted_residual_unit(self,
input,
num_in_filter,
num_filters,
ifshortcut,
stride,
filter_size,
padding,
expansion_factor,
name=None):
num_expfilter = int(round(num_in_filter * expansion_factor))
channel_expand = self.conv_bn_layer(
input=input,
num_filters=num_expfilter,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name=name + '_expand')
bottleneck_conv = self.conv_bn_layer(
input=channel_expand,
num_filters=num_expfilter,
filter_size=filter_size,
stride=stride,
padding=padding,
num_groups=num_expfilter,
if_act=True,
name=name + '_dwise',
use_cudnn=False)
linear_out = self.conv_bn_layer(
input=bottleneck_conv,
num_filters=num_filters,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=False,
name=name + '_linear')
if ifshortcut:
out = self.shortcut(input=input, data_residual=linear_out)
return out
else:
return linear_out
def invresi_blocks(self, input, in_c, t, c, n, s, name=None):
first_block = self.inverted_residual_unit(
input=input,
num_in_filter=in_c,
num_filters=c,
ifshortcut=False,
stride=s,
filter_size=3,
padding=1,
expansion_factor=t,
name=name + '_1')
last_residual_block = first_block
last_c = c
for i in range(1, n):
last_residual_block = self.inverted_residual_unit(
input=last_residual_block,
num_in_filter=last_c,
num_filters=c,
ifshortcut=True,
stride=1,
filter_size=3,
padding=1,
expansion_factor=t,
name=name + '_' + str(i + 1))
return last_residual_block
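    # Each stage above is a chain of inverted residuals: a 1x1 conv expands
    # channels by the factor t, a depthwise 3x3 filters them, and a linear
    # (activation-free) 1x1 projects back down. The identity shortcut is only
    # used for the stride-1 repeats, where input and output shapes match.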
def MobileNetV2_x0_25():
model = MobileNetV2(scale=0.25)
return model
def MobileNetV2_x0_5():
model = MobileNetV2(scale=0.5)
return model
def MobileNetV2_x0_75():
model = MobileNetV2(scale=0.75)
return model
def MobileNetV2_x1_0():
model = MobileNetV2(scale=1.0)
return model
def MobileNetV2_x1_5():
model = MobileNetV2(scale=1.5)
return model
def MobileNetV2_x2_0():
model = MobileNetV2(scale=2.0)
return model
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
__all__ = [
'MobileNetV3', 'MobileNetV3_small_x0_35', 'MobileNetV3_small_x0_5',
'MobileNetV3_small_x0_75', 'MobileNetV3_small_x1_0',
'MobileNetV3_small_x1_25', 'MobileNetV3_large_x0_35',
'MobileNetV3_large_x0_5', 'MobileNetV3_large_x0_75',
'MobileNetV3_large_x1_0', 'MobileNetV3_large_x1_25'
]
class MobileNetV3():
def __init__(self, scale=1.0, model_name='small'):
self.scale = scale
self.inplanes = 16
if model_name == "large":
self.cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, False, 'relu', 1],
[3, 64, 24, False, 'relu', 2],
[3, 72, 24, False, 'relu', 1],
[5, 72, 40, True, 'relu', 2],
[5, 120, 40, True, 'relu', 1],
[5, 120, 40, True, 'relu', 1],
[3, 240, 80, False, 'hard_swish', 2],
[3, 200, 80, False, 'hard_swish', 1],
[3, 184, 80, False, 'hard_swish', 1],
[3, 184, 80, False, 'hard_swish', 1],
[3, 480, 112, True, 'hard_swish', 1],
[3, 672, 112, True, 'hard_swish', 1],
[5, 672, 160, True, 'hard_swish', 2],
[5, 960, 160, True, 'hard_swish', 1],
[5, 960, 160, True, 'hard_swish', 1],
]
self.cls_ch_squeeze = 960
self.cls_ch_expand = 1280
elif model_name == "small":
self.cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, True, 'relu', 2],
[3, 72, 24, False, 'relu', 2],
[3, 88, 24, False, 'relu', 1],
[5, 96, 40, True, 'hard_swish', 2],
[5, 240, 40, True, 'hard_swish', 1],
[5, 240, 40, True, 'hard_swish', 1],
[5, 120, 48, True, 'hard_swish', 1],
[5, 144, 48, True, 'hard_swish', 1],
[5, 288, 96, True, 'hard_swish', 2],
[5, 576, 96, True, 'hard_swish', 1],
[5, 576, 96, True, 'hard_swish', 1],
]
self.cls_ch_squeeze = 576
self.cls_ch_expand = 1280
else:
raise NotImplementedError("mode[" + model_name +
"_model] is not implemented!")
def net(self, input, class_dim=1000):
scale = self.scale
inplanes = self.inplanes
cfg = self.cfg
cls_ch_squeeze = self.cls_ch_squeeze
cls_ch_expand = self.cls_ch_expand
#conv1
conv = self.conv_bn_layer(
input,
filter_size=3,
num_filters=self.make_divisible(inplanes * scale),
stride=2,
padding=1,
num_groups=1,
if_act=True,
act='hard_swish',
name='conv1')
i = 0
inplanes = self.make_divisible(inplanes * scale)
for layer_cfg in cfg:
conv = self.residual_unit(
input=conv,
num_in_filter=inplanes,
num_mid_filter=self.make_divisible(scale * layer_cfg[1]),
num_out_filter=self.make_divisible(scale * layer_cfg[2]),
act=layer_cfg[4],
stride=layer_cfg[5],
filter_size=layer_cfg[0],
use_se=layer_cfg[3],
name='conv' + str(i + 2))
inplanes = self.make_divisible(scale * layer_cfg[2])
i += 1
conv = self.conv_bn_layer(
input=conv,
filter_size=1,
num_filters=self.make_divisible(scale * cls_ch_squeeze),
stride=1,
padding=0,
num_groups=1,
if_act=True,
act='hard_swish',
name='conv_last')
conv = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True, use_cudnn=False)
conv = fluid.layers.conv2d(
input=conv,
num_filters=cls_ch_expand,
filter_size=1,
stride=1,
padding=0,
act=None,
param_attr=ParamAttr(name='last_1x1_conv_weights'),
bias_attr=False)
conv = fluid.layers.hard_swish(conv)
drop = fluid.layers.dropout(x=conv, dropout_prob=0.2)
out = fluid.layers.fc(input=drop,
size=class_dim,
param_attr=ParamAttr(name='fc_weights'),
bias_attr=ParamAttr(name='fc_offset'))
return out
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride,
padding,
num_groups=1,
if_act=True,
act=None,
name=None,
use_cudnn=True,
res_last_bn_init=False):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(name=name + '_weights'),
bias_attr=False)
bn_name = name + '_bn'
bn = fluid.layers.batch_norm(
input=conv,
param_attr=ParamAttr(
name=bn_name + "_scale",
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0)),
bias_attr=ParamAttr(
name=bn_name + "_offset",
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0)),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
if if_act:
if act == 'relu':
bn = fluid.layers.relu(bn)
elif act == 'hard_swish':
bn = fluid.layers.hard_swish(bn)
return bn
def make_divisible(self, v, divisor=8, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
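    # Example: with scale=0.35 the stem width becomes
    # make_divisible(16 * 0.35) = make_divisible(5.6) -> 8, i.e. the scaled
    # channel count is rounded to the nearest multiple of 8 (never below
    # min_value, and never more than 10% below the unrounded value).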
def se_block(self, input, num_out_filter, ratio=4, name=None):
num_mid_filter = num_out_filter // ratio
pool = fluid.layers.pool2d(
input=input, pool_type='avg', global_pooling=True, use_cudnn=False)
conv1 = fluid.layers.conv2d(
input=pool,
filter_size=1,
num_filters=num_mid_filter,
act='relu',
param_attr=ParamAttr(name=name + '_1_weights'),
bias_attr=ParamAttr(name=name + '_1_offset'))
conv2 = fluid.layers.conv2d(
input=conv1,
filter_size=1,
num_filters=num_out_filter,
act='hard_sigmoid',
param_attr=ParamAttr(name=name + '_2_weights'),
bias_attr=ParamAttr(name=name + '_2_offset'))
scale = fluid.layers.elementwise_mul(x=input, y=conv2, axis=0)
return scale
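    # se_block is squeeze-and-excitation: global average pooling squeezes
    # each channel to a scalar, a 1x1 ReLU conv reduces the channel count by
    # `ratio`, a second 1x1 conv with hard_sigmoid produces per-channel gates
    # in [0, 1], and elementwise_mul rescales the input feature map with them.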
def residual_unit(self,
input,
num_in_filter,
num_mid_filter,
num_out_filter,
stride,
filter_size,
act=None,
use_se=False,
name=None):
conv0 = self.conv_bn_layer(
input=input,
filter_size=1,
num_filters=num_mid_filter,
stride=1,
padding=0,
if_act=True,
act=act,
name=name + '_expand')
conv1 = self.conv_bn_layer(
input=conv0,
filter_size=filter_size,
num_filters=num_mid_filter,
stride=stride,
padding=int((filter_size - 1) // 2),
if_act=True,
act=act,
num_groups=num_mid_filter,
use_cudnn=False,
name=name + '_depthwise')
if use_se:
conv1 = self.se_block(
input=conv1, num_out_filter=num_mid_filter, name=name + '_se')
conv2 = self.conv_bn_layer(
input=conv1,
filter_size=1,
num_filters=num_out_filter,
stride=1,
padding=0,
if_act=False,
name=name + '_linear',
res_last_bn_init=True)
if num_in_filter != num_out_filter or stride != 1:
return conv2
else:
return fluid.layers.elementwise_add(x=input, y=conv2, act=None)
def MobileNetV3_small_x0_35():
model = MobileNetV3(model_name='small', scale=0.35)
return model
def MobileNetV3_small_x0_5():
model = MobileNetV3(model_name='small', scale=0.5)
return model
def MobileNetV3_small_x0_75():
model = MobileNetV3(model_name='small', scale=0.75)
return model
def MobileNetV3_small_x1_0():
model = MobileNetV3(model_name='small', scale=1.0)
return model
def MobileNetV3_small_x1_25():
model = MobileNetV3(model_name='small', scale=1.25)
return model
def MobileNetV3_large_x0_35():
model = MobileNetV3(model_name='large', scale=0.35)
return model
def MobileNetV3_large_x0_5():
model = MobileNetV3(model_name='large', scale=0.5)
return model
def MobileNetV3_large_x0_75():
model = MobileNetV3(model_name='large', scale=0.75)
return model
def MobileNetV3_large_x1_0():
model = MobileNetV3(model_name='large', scale=1.0)
return model
def MobileNetV3_large_x1_25():
model = MobileNetV3(model_name='large', scale=1.25)
return model
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import contextlib
bn_regularizer = fluid.regularizer.L2DecayRegularizer(regularization_coeff=0.0)
name_scope = ""
@contextlib.contextmanager
def scope(name):
global name_scope
bk = name_scope
name_scope = name_scope + name + '/'
yield
name_scope = bk
def max_pool(input, kernel, stride, padding):
data = fluid.layers.pool2d(
input,
pool_size=kernel,
pool_type='max',
pool_stride=stride,
pool_padding=padding)
return data
def group_norm(input, G, eps=1e-5, param_attr=None, bias_attr=None):
N, C, H, W = input.shape
    if C % G != 0:
        # the group count must divide the channel count; search near G for
        # the closest group size that does
        for d in range(10):
            for t in [d, -d]:
                if G + t <= 0: continue
                if C % (G + t) == 0:
                    G = G + t
                    break
            if C % G == 0:
                break
assert C % G == 0
x = fluid.layers.group_norm(
input,
groups=G,
param_attr=param_attr,
bias_attr=bias_attr,
name=name_scope + 'group_norm')
return x
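# Example of the fallback above: calling group_norm(x, G=32) on a tensor with
# C=30 channels searches G +/- d for d in 0..9 and settles on G=30, the
# nearest group count that divides the channels, before delegating to
# fluid.layers.group_norm.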
def bn(*args, **kargs):
with scope('BatchNorm'):
return fluid.layers.batch_norm(
*args,
epsilon=1e-3,
momentum=0.99,
param_attr=fluid.ParamAttr(
name=name_scope + 'gamma', regularizer=bn_regularizer),
bias_attr=fluid.ParamAttr(
name=name_scope + 'beta', regularizer=bn_regularizer),
moving_mean_name=name_scope + 'moving_mean',
moving_variance_name=name_scope + 'moving_variance',
**kargs)
def bn_relu(data):
return fluid.layers.relu(bn(data))
def relu(data):
return fluid.layers.relu(data)
def conv(*args, **kargs):
kargs['param_attr'] = name_scope + 'weights'
if 'bias_attr' in kargs and kargs['bias_attr']:
kargs['bias_attr'] = fluid.ParamAttr(
name=name_scope + 'biases',
regularizer=None,
initializer=fluid.initializer.ConstantInitializer(value=0.0))
else:
kargs['bias_attr'] = False
return fluid.layers.conv2d(*args, **kargs)
def deconv(*args, **kargs):
kargs['param_attr'] = name_scope + 'weights'
if 'bias_attr' in kargs and kargs['bias_attr']:
kargs['bias_attr'] = name_scope + 'biases'
else:
kargs['bias_attr'] = False
return fluid.layers.conv2d_transpose(*args, **kargs)
def seperate_conv(input, channel, stride, filter, dilation=1, act=None):
param_attr = fluid.ParamAttr(
name=name_scope + 'weights',
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0),
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=0.33))
with scope('depthwise'):
input = conv(
input,
input.shape[1],
filter,
stride,
groups=input.shape[1],
padding=(filter // 2) * dilation,
dilation=dilation,
use_cudnn=False,
param_attr=param_attr)
input = bn(input)
if act: input = act(input)
param_attr = fluid.ParamAttr(
name=name_scope + 'weights',
regularizer=None,
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=0.06))
with scope('pointwise'):
input = conv(
input, channel, 1, 1, groups=1, padding=0, param_attr=param_attr)
input = bn(input)
if act: input = act(input)
return input
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import math
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"Res2Net", "Res2Net50_48w_2s", "Res2Net50_26w_4s", "Res2Net50_14w_8s",
"Res2Net50_26w_6s", "Res2Net50_26w_8s", "Res2Net101_26w_4s",
"Res2Net152_26w_4s"
]
class Res2Net():
def __init__(self, layers=50, scales=4, width=26):
self.layers = layers
self.scales = scales
self.width = width
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
basic_width = self.width * self.scales
num_filters1 = [basic_width * t for t in [1, 2, 4, 8]]
num_filters2 = [256 * t for t in [1, 2, 4, 8]]
if layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
name="conv1")
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
if layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.bottleneck_block(
input=conv,
num_filters1=num_filters1[block],
num_filters2=num_filters2[block],
stride=2 if i == 0 and block != 0 else 1,
name=conv_name)
pool = fluid.layers.pool2d(
input=conv,
pool_size=7,
pool_stride=1,
pool_type='avg',
global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name='fc_weights'),
bias_attr=fluid.param_attr.ParamAttr(name='fc_offset'))
return out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def shortcut(self, input, ch_out, stride, name):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return input
def bottleneck_block(self, input, num_filters1, num_filters2, stride,
name):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters1,
filter_size=1,
stride=1,
act='relu',
name=name + '_branch2a')
xs = fluid.layers.split(conv0, self.scales, 1)
ys = []
for s in range(self.scales - 1):
if s == 0 or stride == 2:
ys.append(
self.conv_bn_layer(
input=xs[s],
num_filters=num_filters1 // self.scales,
stride=stride,
filter_size=3,
act='relu',
name=name + '_branch2b_' + str(s + 1)))
else:
ys.append(
self.conv_bn_layer(
input=xs[s] + ys[-1],
num_filters=num_filters1 // self.scales,
stride=stride,
filter_size=3,
act='relu',
name=name + '_branch2b_' + str(s + 1)))
if stride == 1:
ys.append(xs[-1])
else:
ys.append(
fluid.layers.pool2d(
input=xs[-1],
pool_size=3,
pool_stride=stride,
pool_padding=1,
pool_type='avg'))
conv1 = fluid.layers.concat(ys, axis=1)
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters2,
filter_size=1,
act=None,
name=name + "_branch2c")
short = self.shortcut(
input, num_filters2, stride, name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
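    # This is the Res2Net multi-scale trick: the 1x1 output is split into
    # `scales` groups, and each 3x3 branch after the first convolves its own
    # split plus the previous branch's output, so successive branches see
    # progressively larger receptive fields before re-concatenation. The last
    # split passes through untouched (or average-pooled when downsampling).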
def Res2Net50_48w_2s():
model = Res2Net(layers=50, scales=2, width=48)
return model
def Res2Net50_26w_4s():
model = Res2Net(layers=50, scales=4, width=26)
return model
def Res2Net50_14w_8s():
model = Res2Net(layers=50, scales=8, width=14)
return model
def Res2Net50_26w_6s():
model = Res2Net(layers=50, scales=6, width=26)
return model
def Res2Net50_26w_8s():
model = Res2Net(layers=50, scales=8, width=26)
return model
def Res2Net101_26w_4s():
model = Res2Net(layers=101, scales=4, width=26)
return model
def Res2Net152_26w_4s():
model = Res2Net(layers=152, scales=4, width=26)
return model
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"Res2Net_vd", "Res2Net50_vd_48w_2s", "Res2Net50_vd_26w_4s",
"Res2Net50_vd_14w_8s", "Res2Net50_vd_26w_6s", "Res2Net50_vd_26w_8s",
"Res2Net101_vd_26w_4s", "Res2Net152_vd_26w_4s", "Res2Net200_vd_26w_4s"
]
class Res2Net_vd():
def __init__(self, layers=50, scales=4, width=26):
self.layers = layers
self.scales = scales
self.width = width
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152, 200]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
basic_width = self.width * self.scales
num_filters1 = [basic_width * t for t in [1, 2, 4, 8]]
num_filters2 = [256 * t for t in [1, 2, 4, 8]]
if layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
elif layers == 200:
depth = [3, 12, 48, 3]
conv = self.conv_bn_layer(
input=input,
num_filters=32,
filter_size=3,
stride=2,
act='relu',
name='conv1_1')
conv = self.conv_bn_layer(
input=conv,
num_filters=32,
filter_size=3,
stride=1,
act='relu',
name='conv1_2')
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv1_3')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
if layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.bottleneck_block(
input=conv,
num_filters1=num_filters1[block],
num_filters2=num_filters2[block],
stride=2 if i == 0 and block != 0 else 1,
if_first=block == i == 0,
name=conv_name)
pool = fluid.layers.pool2d(
input=conv,
pool_size=7,
pool_stride=1,
pool_type='avg',
global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name='fc_weights'),
bias_attr=fluid.param_attr.ParamAttr(name='fc_offset'))
return out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def conv_bn_layer_new(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
pool = fluid.layers.pool2d(
input=input,
pool_size=2,
pool_stride=2,
pool_padding=0,
pool_type='avg',
ceil_mode=True)
conv = fluid.layers.conv2d(
input=pool,
num_filters=num_filters,
filter_size=filter_size,
stride=1,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
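    # conv_bn_layer_new implements the ResNet-vd downsampling shortcut: a
    # stride-2 average pool followed by a stride-1 1x1 conv. A strided 1x1
    # conv would read only a quarter of the input positions; pooling first
    # lets the shortcut see all of them.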
def shortcut(self, input, ch_out, stride, name, if_first=False):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
if if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return self.conv_bn_layer_new(
input, ch_out, 1, stride, name=name)
elif if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return input
def bottleneck_block(self, input, num_filters1, num_filters2, stride, name,
if_first):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters1,
filter_size=1,
stride=1,
act='relu',
name=name + '_branch2a')
xs = fluid.layers.split(conv0, self.scales, 1)
ys = []
for s in range(self.scales - 1):
if s == 0 or stride == 2:
ys.append(
self.conv_bn_layer(
input=xs[s],
num_filters=num_filters1 // self.scales,
stride=stride,
filter_size=3,
act='relu',
name=name + '_branch2b_' + str(s + 1)))
else:
ys.append(
self.conv_bn_layer(
input=xs[s] + ys[-1],
num_filters=num_filters1 // self.scales,
stride=stride,
filter_size=3,
act='relu',
name=name + '_branch2b_' + str(s + 1)))
if stride == 1:
ys.append(xs[-1])
else:
ys.append(
fluid.layers.pool2d(
input=xs[-1],
pool_size=3,
pool_stride=stride,
pool_padding=1,
pool_type='avg'))
conv1 = fluid.layers.concat(ys, axis=1)
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters2,
filter_size=1,
act=None,
name=name + "_branch2c")
short = self.shortcut(
input,
num_filters2,
stride,
if_first=if_first,
name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def Res2Net50_vd_48w_2s():
model = Res2Net_vd(layers=50, scales=2, width=48)
return model
def Res2Net50_vd_26w_4s():
model = Res2Net_vd(layers=50, scales=4, width=26)
return model
def Res2Net50_vd_14w_8s():
model = Res2Net_vd(layers=50, scales=8, width=14)
return model
def Res2Net50_vd_26w_6s():
model = Res2Net_vd(layers=50, scales=6, width=26)
return model
def Res2Net50_vd_26w_8s():
model = Res2Net_vd(layers=50, scales=8, width=26)
return model
def Res2Net101_vd_26w_4s():
model = Res2Net_vd(layers=101, scales=4, width=26)
return model
def Res2Net152_vd_26w_4s():
model = Res2Net_vd(layers=152, scales=4, width=26)
return model
def Res2Net200_vd_26w_4s():
model = Res2Net_vd(layers=200, scales=4, width=26)
return model
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"ResNet", "ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"
]
class ResNet():
def __init__(self, layers=50):
self.layers = layers
def net(self, input, class_dim=1000, data_format="NCHW"):
layers = self.layers
supported_layers = [18, 34, 50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 18:
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters = [64, 128, 256, 512]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
name="conv1",
data_format=data_format)
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max',
data_format=data_format)
if layers >= 50:
for block in range(len(depth)):
for i in range(depth[block]):
if layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
name=conv_name,
data_format=data_format)
else:
for block in range(len(depth)):
for i in range(depth[block]):
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.basic_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
is_first=block == i == 0,
name=conv_name,
data_format=data_format)
pool = fluid.layers.pool2d(
input=conv,
pool_type='avg',
global_pooling=True,
data_format=data_format)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
name="fc_0.w_0",
initializer=fluid.initializer.Uniform(-stdv, stdv)),
bias_attr=ParamAttr(name="fc_0.b_0"))
return out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None,
data_format='NCHW'):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False,
name=name + '.conv2d.output.1',
data_format=data_format)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
name=bn_name + '.output.1',
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance',
data_layout=data_format)
def shortcut(self, input, ch_out, stride, is_first, name, data_format):
if data_format == 'NCHW':
ch_in = input.shape[1]
else:
ch_in = input.shape[-1]
        if ch_in != ch_out or stride != 1 or is_first:
return self.conv_bn_layer(
input, ch_out, 1, stride, name=name, data_format=data_format)
else:
return input
def bottleneck_block(self, input, num_filters, stride, name, data_format):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + "_branch2a",
data_format=data_format)
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
name=name + "_branch2b",
data_format=data_format)
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 4,
filter_size=1,
act=None,
name=name + "_branch2c",
data_format=data_format)
short = self.shortcut(
input,
num_filters * 4,
stride,
is_first=False,
name=name + "_branch1",
data_format=data_format)
return fluid.layers.elementwise_add(
x=short, y=conv2, act='relu', name=name + ".add.output.5")
def basic_block(self, input, num_filters, stride, is_first, name,
data_format):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=3,
act='relu',
stride=stride,
name=name + "_branch2a",
data_format=data_format)
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
act=None,
name=name + "_branch2b",
data_format=data_format)
short = self.shortcut(
input,
num_filters,
stride,
is_first,
name=name + "_branch1",
data_format=data_format)
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
def ResNet18():
model = ResNet(layers=18)
return model
def ResNet34():
model = ResNet(layers=34)
return model
def ResNet50():
model = ResNet(layers=50)
return model
def ResNet101():
model = ResNet(layers=101)
return model
def ResNet152():
model = ResNet(layers=152)
return model
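# A minimal usage sketch for the static-graph models above (variable names
# are illustrative):
#
#   import paddle.fluid as fluid
#   image = fluid.data(name='image', shape=[None, 3, 224, 224], dtype='float32')
#   model = ResNet50()
#   logits = model.net(image, class_dim=1000)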
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"ResNet_ACNet", "ResNet18_ACNet", "ResNet34_ACNet", "ResNet50_ACNet",
"ResNet101_ACNet", "ResNet152_ACNet"
]
class ResNetACNet(object):
""" ACNet """
def __init__(self, layers=50, deploy=False):
"""init"""
self.layers = layers
self.deploy = deploy
def net(self, input, class_dim=1000):
"""model"""
layers = self.layers
supported_layers = [18, 34, 50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 18:
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters = [64, 128, 256, 512]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
name="conv1")
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
if layers >= 50:
for block in range(len(depth)):
for i in range(depth[block]):
if layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
name=conv_name)
else:
for block in range(len(depth)):
for i in range(depth[block]):
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.basic_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
is_first=block == i == 0,
name=conv_name)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
return out
def conv_bn_layer(self, **kwargs):
"""
conv_bn_layer
"""
if kwargs['filter_size'] == 1:
return self.conv_bn_layer_ori(**kwargs)
else:
return self.conv_bn_layer_ac(**kwargs)
# conv bn+relu
def conv_bn_layer_ori(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
"""
standard convbn
used for 1x1 convbn in acnet
"""
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False,
name=name + '.conv2d.output.1')
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
name=bn_name + '.output.1',
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance', )
# conv bn+relu
def conv_bn_layer_ac(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
""" ACNet conv bn """
padding = (filter_size - 1) // 2
square_conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
act=act if self.deploy else None,
param_attr=ParamAttr(name=name + "_acsquare_weights"),
bias_attr=ParamAttr(name=name + "_acsquare_bias")
if self.deploy else False,
name=name + '.acsquare.conv2d.output.1')
if self.deploy:
return square_conv
else:
ver_conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=(filter_size, 1),
stride=stride,
padding=(padding, 0),
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_acver_weights"),
bias_attr=False,
name=name + '.acver.conv2d.output.1')
hor_conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=(1, filter_size),
stride=stride,
padding=(0, padding),
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_achor_weights"),
bias_attr=False,
name=name + '.achor.conv2d.output.1')
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
square_bn = fluid.layers.batch_norm(
input=square_conv,
act=None,
name=bn_name + '.acsquare.output.1',
param_attr=ParamAttr(name=bn_name + '_acsquare_scale'),
bias_attr=ParamAttr(bn_name + '_acsquare_offset'),
moving_mean_name=bn_name + '_acsquare_mean',
moving_variance_name=bn_name + '_acsquare_variance', )
ver_bn = fluid.layers.batch_norm(
input=ver_conv,
act=None,
name=bn_name + '.acver.output.1',
param_attr=ParamAttr(name=bn_name + '_acver_scale'),
bias_attr=ParamAttr(bn_name + '_acver_offset'),
moving_mean_name=bn_name + '_acver_mean',
moving_variance_name=bn_name + '_acver_variance', )
hor_bn = fluid.layers.batch_norm(
input=hor_conv,
act=None,
name=bn_name + '.achor.output.1',
param_attr=ParamAttr(name=bn_name + '_achor_scale'),
bias_attr=ParamAttr(bn_name + '_achor_offset'),
moving_mean_name=bn_name + '_achor_mean',
moving_variance_name=bn_name + '_achor_variance', )
return fluid.layers.elementwise_add(
x=square_bn, y=ver_bn + hor_bn, act=act)
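    # Training-time ACNet: the square KxK conv and the asymmetric Kx1 / 1xK
    # convs are batch-normalized separately and then summed, reinforcing the
    # kernel skeleton. With self.deploy=True the three branches are assumed
    # to have been fused offline into the square conv's weights and bias, so
    # only square_conv is built.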
def shortcut(self, input, ch_out, stride, is_first, name):
""" shortcut """
ch_in = input.shape[1]
        if ch_in != ch_out or stride != 1 or is_first:
return self.conv_bn_layer(
input=input,
num_filters=ch_out,
filter_size=1,
stride=stride,
name=name)
else:
return input
def bottleneck_block(self, input, num_filters, stride, name):
"""" bottleneck_block """
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 4,
filter_size=1,
act=None,
name=name + "_branch2c")
short = self.shortcut(
input,
num_filters * 4,
stride,
is_first=False,
name=name + "_branch1")
return fluid.layers.elementwise_add(
x=short, y=conv2, act='relu', name=name + ".add.output.5")
def basic_block(self, input, num_filters, stride, is_first, name):
""" basic_block """
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=3,
act='relu',
stride=stride,
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
act=None,
name=name + "_branch2b")
short = self.shortcut(
input, num_filters, stride, is_first, name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
def ResNet18_ACNet(deploy=False):
"""ResNet18 + ACNet"""
    model = ResNetACNet(layers=18, deploy=deploy)
return model
def ResNet34_ACNet(deploy=False):
"""ResNet34 + ACNet"""
model = ResNetACNet(layers=34, deploy=deploy)
return model
def ResNet50_ACNet(deploy=False):
"""ResNet50 + ACNet"""
model = ResNetACNet(layers=50, deploy=deploy)
return model
def ResNet101_ACNet(deploy=False):
"""ResNet101 + ACNet"""
model = ResNetACNet(layers=101, deploy=deploy)
return model
def ResNet152_ACNet(deploy=False):
"""ResNet152 + ACNet"""
model = ResNetACNet(layers=152, deploy=deploy)
return model
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = ["ResNet", "ResNet50_vc", "ResNet101_vc", "ResNet152_vc"]
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
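# ResNet-vc: a bottleneck ResNet whose 7x7 stem convolution is replaced by
# three stacked 3x3 convolutions (conv1_1 / conv1_2 / conv1_3); the residual
# stages themselves are unchanged.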
class ResNet():
def __init__(self, layers=50):
self.params = train_parameters
self.layers = layers
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters = [64, 128, 256, 512]
conv = self.conv_bn_layer(
input=input,
num_filters=32,
filter_size=3,
stride=2,
act='relu',
name='conv1_1')
conv = self.conv_bn_layer(
input=conv,
num_filters=32,
filter_size=3,
stride=1,
act='relu',
name='conv1_2')
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv1_3')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
if layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
name=conv_name)
pool = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
name="fc_0.w_0",
initializer=fluid.initializer.Uniform(-stdv,
stdv)),
bias_attr=ParamAttr(name="fc_0.b_0"))
return out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False,
name=name + '.conv2d.output.1')
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
name=bn_name + '.output.1',
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance', )
def shortcut(self, input, ch_out, stride, name):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return input
def bottleneck_block(self, input, num_filters, stride, name):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 4,
filter_size=1,
act=None,
name=name + "_branch2c")
short = self.shortcut(
input, num_filters * 4, stride, name=name + "_branch1")
return fluid.layers.elementwise_add(
x=short, y=conv2, act='relu', name=name + ".add.output.5")
def ResNet50_vc():
model = ResNet(layers=50)
return model
def ResNet101_vc():
model = ResNet(layers=101)
return model
def ResNet152_vc():
model = ResNet(layers=152)
return model
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"ResNet", "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd",
"ResNet152_vd", "ResNet200_vd"
]
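# ResNet-vd: optionally uses the same deep 3x3 stem as the vc variant
# (is_3x3=True) and additionally downsamples projection shortcuts with a 2x2
# average pool followed by a stride-1 1x1 convolution (conv_bn_layer_new).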
class ResNet():
def __init__(self, layers=50, is_3x3=False):
self.layers = layers
self.is_3x3 = is_3x3
def net(self, input, class_dim=1000):
is_3x3 = self.is_3x3
layers = self.layers
supported_layers = [18, 34, 50, 101, 152, 200]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 18:
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
elif layers == 200:
depth = [3, 12, 48, 3]
num_filters = [64, 128, 256, 512]
        if not is_3x3:
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
else:
conv = self.conv_bn_layer(
input=input,
num_filters=32,
filter_size=3,
stride=2,
act='relu',
name='conv1_1')
conv = self.conv_bn_layer(
input=conv,
num_filters=32,
filter_size=3,
stride=1,
act='relu',
name='conv1_2')
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv1_3')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
if layers >= 50:
for block in range(len(depth)):
for i in range(depth[block]):
if layers in [101, 152, 200] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
if_first=block == i == 0,
name=conv_name)
else:
for block in range(len(depth)):
for i in range(depth[block]):
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.basic_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
if_first=block == i == 0,
name=conv_name)
pool = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
name="fc_0.w_0",
initializer=fluid.initializer.Uniform(-stdv, stdv)),
bias_attr=ParamAttr(name="fc_0.b_0"))
return out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
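    # Shortcut-downsampling variant used by the vd blocks: a 2x2 stride-2
    # average pool followed by a stride-1 convolution, instead of a strided
    # 1x1 convolution that would discard three quarters of the activations.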
def conv_bn_layer_new(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
pool = fluid.layers.pool2d(
input=input,
pool_size=2,
pool_stride=2,
pool_padding=0,
pool_type='avg',
ceil_mode=True)
conv = fluid.layers.conv2d(
input=pool,
num_filters=num_filters,
filter_size=filter_size,
stride=1,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def shortcut(self, input, ch_out, stride, name, if_first=False):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
if if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return self.conv_bn_layer_new(
input, ch_out, 1, stride, name=name)
elif if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return input
def bottleneck_block(self, input, num_filters, stride, name, if_first):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 4,
filter_size=1,
act=None,
name=name + "_branch2c")
short = self.shortcut(
input,
num_filters * 4,
stride,
if_first=if_first,
name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def basic_block(self, input, num_filters, stride, name, if_first):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=3,
act='relu',
stride=stride,
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
act=None,
name=name + "_branch2b")
short = self.shortcut(
input,
num_filters,
stride,
if_first=if_first,
name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
def ResNet18_vd():
model = ResNet(layers=18, is_3x3=True)
return model
def ResNet34_vd():
model = ResNet(layers=34, is_3x3=True)
return model
def ResNet50_vd():
model = ResNet(layers=50, is_3x3=True)
return model
def ResNet101_vd():
model = ResNet(layers=101, is_3x3=True)
return model
def ResNet152_vd():
model = ResNet(layers=152, is_3x3=True)
return model
def ResNet200_vd():
model = ResNet(layers=200, is_3x3=True)
return model
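if __name__ == "__main__":
    # Minimal usage sketch, assuming a static-graph Paddle 1.x (fluid)
    # environment: the classes above only declare the forward graph, so an
    # input placeholder is created and .net() is called on it.
    image = fluid.layers.data(
        name="image", shape=[3, 224, 224], dtype="float32")
    logits = ResNet50_vd().net(image, class_dim=1000)  # shape (-1, 1000)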
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"ResNeXt", "ResNeXt50_64x4d", "ResNeXt101_64x4d", "ResNeXt152_64x4d",
"ResNeXt50_32x4d", "ResNeXt101_32x4d", "ResNeXt152_32x4d"
]
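# ResNeXt: a bottleneck ResNet whose 3x3 convolution is grouped
# (groups=cardinality); the 64x4d variants use the wider num_filters1
# channel plan, the 32x4d variants use num_filters2.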
class ResNeXt():
def __init__(self, layers=50, cardinality=64):
self.layers = layers
self.cardinality = cardinality
def net(self, input, class_dim=1000):
layers = self.layers
cardinality = self.cardinality
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters1 = [256, 512, 1024, 2048]
num_filters2 = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
name="res_conv1") #debug
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
if layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters1[block]
if cardinality == 64 else num_filters2[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
name=conv_name)
pool = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name='fc_weights'),
bias_attr=fluid.param_attr.ParamAttr(name='fc_offset'))
return out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False,
name=name + '.conv2d.output.1')
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
name=bn_name + '.output.1',
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance', )
def shortcut(self, input, ch_out, stride, name):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return input
def bottleneck_block(self, input, num_filters, stride, cardinality, name):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu',
name=name + "_branch2b")
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters if cardinality == 64 else num_filters * 2,
filter_size=1,
act=None,
name=name + "_branch2c")
short = self.shortcut(
input,
num_filters if cardinality == 64 else num_filters * 2,
stride,
name=name + "_branch1")
return fluid.layers.elementwise_add(
x=short, y=conv2, act='relu', name=name + ".add.output.5")
def ResNeXt50_64x4d():
model = ResNeXt(layers=50, cardinality=64)
return model
def ResNeXt50_32x4d():
model = ResNeXt(layers=50, cardinality=32)
return model
def ResNeXt101_64x4d():
model = ResNeXt(layers=101, cardinality=64)
return model
def ResNeXt101_32x4d():
model = ResNeXt(layers=101, cardinality=32)
return model
def ResNeXt152_64x4d():
model = ResNeXt(layers=152, cardinality=64)
return model
def ResNeXt152_32x4d():
model = ResNeXt(layers=152, cardinality=32)
return model
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"ResNeXt101_32x8d_wsl", "ResNeXt101_32x16d_wsl", "ResNeXt101_32x32d_wsl",
"ResNeXt101_32x48d_wsl", "Fix_ResNeXt101_32x48d_wsl"
]
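# ResNeXt101-wsl: weakly-supervised-learning ResNeXt-101 variants; parameter
# names ("conv1.weight", "bn1.running_mean", "layerX.Y.downsample.1", ...)
# follow the PyTorch checkpoint layout so released weights map one-to-one.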
class ResNeXt101_wsl():
def __init__(self, layers=101, cardinality=32, width=48):
self.layers = layers
self.cardinality = cardinality
self.width = width
def net(self, input, class_dim=1000):
layers = self.layers
cardinality = self.cardinality
width = self.width
depth = [3, 4, 23, 3]
base_width = cardinality * width
num_filters = [base_width * i for i in [1, 2, 4, 8]]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
name="conv1") #debug
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
conv_name = 'layer' + str(block + 1) + "." + str(i)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
name=conv_name)
pool = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name='fc.weight'),
bias_attr=fluid.param_attr.ParamAttr(name='fc.bias'))
return out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
if "downsample" in name:
conv_name = name + '.0'
else:
conv_name = name
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=conv_name + ".weight"),
bias_attr=False)
if "downsample" in name:
bn_name = name[:9] + 'downsample' + '.1'
else:
if "conv1" == name:
bn_name = 'bn' + name[-1]
else:
bn_name = (name[:10] if name[7:9].isdigit() else name[:9]
) + 'bn' + name[-1]
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '.weight'),
bias_attr=ParamAttr(bn_name + '.bias'),
moving_mean_name=bn_name + '.running_mean',
moving_variance_name=bn_name + '.running_var', )
def shortcut(self, input, ch_out, stride, name):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return input
def bottleneck_block(self, input, num_filters, stride, cardinality, name):
width = self.width
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + ".conv1")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu',
name=name + ".conv2")
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters // (width // 8),
filter_size=1,
act=None,
name=name + ".conv3")
short = self.shortcut(
input,
num_filters // (width // 8),
stride,
name=name + ".downsample")
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def ResNeXt101_32x8d_wsl():
model = ResNeXt101_wsl(cardinality=32, width=8)
return model
def ResNeXt101_32x16d_wsl():
model = ResNeXt101_wsl(cardinality=32, width=16)
return model
def ResNeXt101_32x32d_wsl():
model = ResNeXt101_wsl(cardinality=32, width=32)
return model
def ResNeXt101_32x48d_wsl():
model = ResNeXt101_wsl(cardinality=32, width=48)
return model
def Fix_ResNeXt101_32x48d_wsl():
model = ResNeXt101_wsl(cardinality=32, width=48)
return model
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
import math
__all__ = [
"ResNeXt", "ResNeXt50_vd_64x4d", "ResNeXt101_vd_64x4d",
"ResNeXt152_vd_64x4d", "ResNeXt50_vd_32x4d", "ResNeXt101_vd_32x4d",
"ResNeXt152_vd_32x4d"
]
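# ResNeXt-vd: grouped ResNeXt bottlenecks combined with the vd modifications
# above (optional deep 3x3 stem and average-pool downsampling in the
# projection shortcut).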
class ResNeXt():
def __init__(self, layers=50, is_3x3=False, cardinality=64):
self.layers = layers
self.is_3x3 = is_3x3
self.cardinality = cardinality
def net(self, input, class_dim=1000):
is_3x3 = self.is_3x3
layers = self.layers
cardinality = self.cardinality
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters1 = [256, 512, 1024, 2048]
num_filters2 = [128, 256, 512, 1024]
        if not is_3x3:
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
else:
conv = self.conv_bn_layer(
input=input,
num_filters=32,
filter_size=3,
stride=2,
act='relu',
name='conv1_1')
conv = self.conv_bn_layer(
input=conv,
num_filters=32,
filter_size=3,
stride=1,
act='relu',
name='conv1_2')
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv1_3')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
                if layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters1[block]
if cardinality == 64 else num_filters2[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
if_first=block == 0,
name=conv_name)
pool = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name='fc_weights'),
bias_attr=fluid.param_attr.ParamAttr(name='fc_offset'))
return out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def conv_bn_layer_new(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
pool = fluid.layers.pool2d(
input=input,
pool_size=2,
pool_stride=2,
pool_padding=0,
pool_type='avg',
ceil_mode=True)
conv = fluid.layers.conv2d(
input=pool,
num_filters=num_filters,
filter_size=filter_size,
stride=1,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def shortcut(self, input, ch_out, stride, name, if_first=False):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
if if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return self.conv_bn_layer_new(
input, ch_out, 1, stride, name=name)
else:
return input
def bottleneck_block(self, input, num_filters, stride, cardinality, name,
if_first):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
groups=cardinality,
name=name + "_branch2b")
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters if cardinality == 64 else num_filters * 2,
filter_size=1,
act=None,
name=name + "_branch2c")
short = self.shortcut(
input,
num_filters if cardinality == 64 else num_filters * 2,
stride,
if_first=if_first,
name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def ResNeXt50_vd_64x4d():
model = ResNeXt(layers=50, is_3x3=True)
return model
def ResNeXt50_vd_32x4d():
model = ResNeXt(layers=50, cardinality=32, is_3x3=True)
return model
def ResNeXt101_vd_64x4d():
model = ResNeXt(layers=101, is_3x3=True)
return model
def ResNeXt101_vd_32x4d():
model = ResNeXt(layers=101, cardinality=32, is_3x3=True)
return model
def ResNeXt152_vd_64x4d():
model = ResNeXt(layers=152, is_3x3=True)
return model
def ResNeXt152_vd_32x4d():
model = ResNeXt(layers=152, cardinality=32, is_3x3=True)
return model
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"SE_ResNet_vd", "SE_ResNet18_vd", "SE_ResNet34_vd", "SE_ResNet50_vd",
"SE_ResNet101_vd", "SE_ResNet152_vd", "SE_ResNet200_vd"
]
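# SE-ResNet-vd: ResNet-vd blocks extended with squeeze-and-excitation, which
# rescales each channel of the residual branch before the elementwise add.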
class SE_ResNet_vd():
def __init__(self, layers=50, is_3x3=False):
self.layers = layers
self.is_3x3 = is_3x3
def net(self, input, class_dim=1000):
is_3x3 = self.is_3x3
layers = self.layers
supported_layers = [18, 34, 50, 101, 152, 200]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 18:
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
elif layers == 200:
depth = [3, 12, 48, 3]
num_filters = [64, 128, 256, 512]
reduction_ratio = 16
        if not is_3x3:
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
else:
conv = self.conv_bn_layer(
input=input,
num_filters=32,
filter_size=3,
stride=2,
act='relu',
name='conv1_1')
conv = self.conv_bn_layer(
input=conv,
num_filters=32,
filter_size=3,
stride=1,
act='relu',
name='conv1_2')
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv1_3')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
if layers >= 50:
for block in range(len(depth)):
for i in range(depth[block]):
if layers in [101, 152, 200] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
if_first=block == i == 0,
reduction_ratio=reduction_ratio,
name=conv_name)
else:
for block in range(len(depth)):
for i in range(depth[block]):
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.basic_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
if_first=block == i == 0,
reduction_ratio=reduction_ratio,
name=conv_name)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name='fc6_weights'),
bias_attr=ParamAttr(name='fc6_offset'))
return out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def conv_bn_layer_new(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
pool = fluid.layers.pool2d(
input=input,
pool_size=2,
pool_stride=2,
pool_padding=0,
pool_type='avg',
ceil_mode=True)
conv = fluid.layers.conv2d(
input=pool,
num_filters=num_filters,
filter_size=filter_size,
stride=1,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def shortcut(self, input, ch_out, stride, name, if_first=False):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
if if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return self.conv_bn_layer_new(
input, ch_out, 1, stride, name=name)
elif if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return input
def bottleneck_block(self, input, num_filters, stride, name, if_first,
reduction_ratio):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 4,
filter_size=1,
act=None,
name=name + "_branch2c")
scale = self.squeeze_excitation(
input=conv2,
num_channels=num_filters * 4,
reduction_ratio=reduction_ratio,
name='fc_' + name)
short = self.shortcut(
input,
num_filters * 4,
stride,
if_first=if_first,
name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def basic_block(self, input, num_filters, stride, name, if_first,
reduction_ratio):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=3,
act='relu',
stride=stride,
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
act=None,
name=name + "_branch2b")
scale = self.squeeze_excitation(
input=conv1,
num_channels=num_filters,
reduction_ratio=reduction_ratio,
name='fc_' + name)
short = self.shortcut(
input,
num_filters,
stride,
if_first=if_first,
name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
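    # Squeeze-and-excitation: global average pool ("squeeze"), an FC layer
    # shrinking the channels by reduction_ratio with ReLU, an FC layer
    # restoring the channel count with sigmoid ("excitation"), and a
    # channel-wise rescaling of the input feature map.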
def squeeze_excitation(self,
input,
num_channels,
reduction_ratio,
name=None):
pool = fluid.layers.pool2d(
input=input, pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
squeeze = fluid.layers.fc(
input=pool,
size=num_channels // reduction_ratio,
act='relu',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=name + '_sqz_weights'),
bias_attr=ParamAttr(name=name + '_sqz_offset'))
stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
excitation = fluid.layers.fc(
input=squeeze,
size=num_channels,
act='sigmoid',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=name + '_exc_weights'),
bias_attr=ParamAttr(name=name + '_exc_offset'))
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
def SE_ResNet18_vd():
model = SE_ResNet_vd(layers=18, is_3x3=True)
return model
def SE_ResNet34_vd():
model = SE_ResNet_vd(layers=34, is_3x3=True)
return model
def SE_ResNet50_vd():
model = SE_ResNet_vd(layers=50, is_3x3=True)
return model
def SE_ResNet101_vd():
model = SE_ResNet_vd(layers=101, is_3x3=True)
return model
def SE_ResNet152_vd():
model = SE_ResNet_vd(layers=152, is_3x3=True)
return model
def SE_ResNet200_vd():
model = SE_ResNet_vd(layers=200, is_3x3=True)
return model
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"SE_ResNeXt", "SE_ResNeXt50_32x4d", "SE_ResNeXt101_32x4d",
"SE_ResNeXt152_32x4d"
]
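# SE-ResNeXt: ResNeXt bottlenecks with squeeze-and-excitation; the 50- and
# 101-layer configurations use a 7x7 stem with cardinality 32, while the
# 152-layer configuration uses a deep 3x3 stem with cardinality 64.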
class SE_ResNeXt():
def __init__(self, layers=50):
self.layers = layers
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
name='conv1', )
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max',
use_cudnn=False)
elif layers == 101:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 23, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
name="conv1", )
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max',
use_cudnn=False)
elif layers == 152:
cardinality = 64
reduction_ratio = 16
depth = [3, 8, 36, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=3,
stride=2,
act='relu',
name='conv1')
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv2')
conv = self.conv_bn_layer(
input=conv,
num_filters=128,
filter_size=3,
stride=1,
act='relu',
name='conv3')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
pool_type='max', use_cudnn=False)
        n = 1 if layers in [50, 101] else 3
for block in range(len(depth)):
n += 1
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
reduction_ratio=reduction_ratio,
name=str(n) + '_' + str(i + 1))
pool = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True, use_cudnn=False)
drop = fluid.layers.dropout(x=pool, dropout_prob=0.5)
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(
input=drop,
size=class_dim,
param_attr=ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name='fc6_weights'),
bias_attr=ParamAttr(name='fc6_offset'))
return out
def shortcut(self, input, ch_out, stride, name):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
filter_size = 1
return self.conv_bn_layer(
input,
ch_out,
filter_size,
stride,
name='conv' + name + '_prj')
else:
return input
def bottleneck_block(self,
input,
num_filters,
stride,
cardinality,
reduction_ratio,
name=None):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name='conv' + name + '_x1')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu',
name='conv' + name + '_x2')
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 2,
filter_size=1,
act=None,
name='conv' + name + '_x3')
scale = self.squeeze_excitation(
input=conv2,
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio,
name='fc' + name)
short = self.shortcut(input, num_filters * 2, stride, name=name)
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=False,
param_attr=ParamAttr(name=name + '_weights'), )
bn_name = name + "_bn"
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def squeeze_excitation(self,
input,
num_channels,
reduction_ratio,
name=None):
pool = fluid.layers.pool2d(
input=input, pool_type='avg', global_pooling=True, use_cudnn=False)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
squeeze = fluid.layers.fc(
input=pool,
size=num_channels // reduction_ratio,
act='relu',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=name + '_sqz_weights'),
bias_attr=ParamAttr(name=name + '_sqz_offset'))
stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
excitation = fluid.layers.fc(
input=squeeze,
size=num_channels,
act='sigmoid',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=name + '_exc_weights'),
bias_attr=ParamAttr(name=name + '_exc_offset'))
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
def SE_ResNeXt50_32x4d():
model = SE_ResNeXt(layers=50)
return model
def SE_ResNeXt101_32x4d():
model = SE_ResNeXt(layers=101)
return model
def SE_ResNeXt152_32x4d():
model = SE_ResNeXt(layers=152)
return model
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
    "SE_ResNeXt_vd", "SE_ResNeXt50_vd_32x4d", "SE_ResNeXt101_vd_32x4d",
    "SENet154_vd"
]
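# SE-ResNeXt-vd: the SE-ResNeXt design with the vd stem and shortcut
# modifications; SENet154_vd is the 152-layer, cardinality-64 configuration.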
class SE_ResNeXt_vd():
def __init__(self, layers=50):
self.layers = layers
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=3,
stride=2,
act='relu',
name='conv1_1')
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv1_2')
conv = self.conv_bn_layer(
input=conv,
num_filters=128,
filter_size=3,
stride=1,
act='relu',
name='conv1_3')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 101:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 23, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=3,
stride=2,
act='relu',
name='conv1_1')
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv1_2')
conv = self.conv_bn_layer(
input=conv,
num_filters=128,
filter_size=3,
stride=1,
act='relu',
name='conv1_3')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 152:
cardinality = 64
reduction_ratio = 16
depth = [3, 8, 36, 3]
num_filters = [256, 512, 1024, 2048]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=3,
stride=2,
act='relu',
name='conv1_1')
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv1_2')
conv = self.conv_bn_layer(
input=conv,
num_filters=128,
filter_size=3,
stride=1,
act='relu',
name='conv1_3')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
pool_type='max')
        n = 1 if layers in [50, 101] else 3
for block in range(len(depth)):
n += 1
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
reduction_ratio=reduction_ratio,
if_first=block == 0,
name=str(n) + '_' + str(i + 1))
pool = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True)
if layers == 152:
pool = fluid.layers.dropout(x=pool, dropout_prob=0.2)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name='fc6_weights'),
bias_attr=ParamAttr(name='fc6_offset'))
return out
def shortcut(self, input, ch_out, stride, name, if_first=False):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
filter_size = 1
if if_first:
return self.conv_bn_layer(
input,
ch_out,
filter_size,
stride,
name='conv' + name + '_prj')
else:
return self.conv_bn_layer_new(
input,
ch_out,
filter_size,
stride,
name='conv' + name + '_prj')
else:
return input
def bottleneck_block(self,
input,
num_filters,
stride,
cardinality,
reduction_ratio,
if_first,
name=None):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name='conv' + name + '_x1')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu',
name='conv' + name + '_x2')
if cardinality == 64:
num_filters = num_filters // 2
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 2,
filter_size=1,
act=None,
name='conv' + name + '_x3')
scale = self.squeeze_excitation(
input=conv2,
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio,
name='fc' + name)
short = self.shortcut(
input, num_filters * 2, stride, if_first=if_first, name=name)
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=False,
param_attr=ParamAttr(name=name + '_weights'), )
bn_name = name + "_bn"
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def conv_bn_layer_new(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
pool = fluid.layers.pool2d(
input=input,
pool_size=2,
pool_stride=2,
pool_padding=0,
pool_type='avg',
ceil_mode=True)
conv = fluid.layers.conv2d(
input=pool,
num_filters=num_filters,
filter_size=filter_size,
stride=1,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
bn_name = name + "_bn"
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def squeeze_excitation(self,
input,
num_channels,
reduction_ratio,
name=None):
pool = fluid.layers.pool2d(
input=input, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
squeeze = fluid.layers.fc(
input=pool,
size=num_channels // reduction_ratio,
act='relu',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=name + '_sqz_weights'),
bias_attr=ParamAttr(name=name + '_sqz_offset'))
stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
excitation = fluid.layers.fc(
input=squeeze,
size=num_channels,
act='sigmoid',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=name + '_exc_weights'),
bias_attr=ParamAttr(name=name + '_exc_offset'))
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
def SE_ResNeXt50_vd_32x4d():
model = SE_ResNeXt_vd(layers=50)
return model
def SE_ResNeXt101_vd_32x4d():
model = SE_ResNeXt_vd(layers=101)
return model
def SENet154_vd():
model = SE_ResNeXt_vd(layers=152)
return model
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
__all__ = [
'ShuffleNetV2_x0_25', 'ShuffleNetV2_x0_33', 'ShuffleNetV2_x0_5',
'ShuffleNetV2_x1_0', 'ShuffleNetV2_x1_5', 'ShuffleNetV2_x2_0',
'ShuffleNetV2'
]
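# ShuffleNetV2: stages of units in which stride-1 blocks split the channels
# in half and transform one branch, stride-2 blocks transform both branches,
# and every block ends with a concat plus channel shuffle to mix features.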
class ShuffleNetV2():
def __init__(self, scale=1.0):
self.scale = scale
def net(self, input, class_dim=1000):
scale = self.scale
stage_repeats = [4, 8, 4]
if scale == 0.25:
stage_out_channels = [-1, 24, 24, 48, 96, 512]
elif scale == 0.33:
stage_out_channels = [-1, 24, 32, 64, 128, 512]
elif scale == 0.5:
stage_out_channels = [-1, 24, 48, 96, 192, 1024]
elif scale == 1.0:
stage_out_channels = [-1, 24, 116, 232, 464, 1024]
elif scale == 1.5:
stage_out_channels = [-1, 24, 176, 352, 704, 1024]
elif scale == 2.0:
stage_out_channels = [-1, 24, 224, 488, 976, 2048]
else:
raise NotImplementedError("This scale size:[" + str(scale) +
"] is not implemented!")
#conv1
input_channel = stage_out_channels[1]
conv1 = self.conv_bn_layer(
input=input,
filter_size=3,
num_filters=input_channel,
padding=1,
stride=2,
name='stage1_conv')
pool1 = fluid.layers.pool2d(
input=conv1,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
conv = pool1
# bottleneck sequences
for idxstage in range(len(stage_repeats)):
numrepeat = stage_repeats[idxstage]
output_channel = stage_out_channels[idxstage + 2]
for i in range(numrepeat):
if i == 0:
conv = self.inverted_residual_unit(
input=conv,
num_filters=output_channel,
stride=2,
benchmodel=2,
name=str(idxstage + 2) + '_' + str(i + 1))
else:
conv = self.inverted_residual_unit(
input=conv,
num_filters=output_channel,
stride=1,
benchmodel=1,
name=str(idxstage + 2) + '_' + str(i + 1))
conv_last = self.conv_bn_layer(
input=conv,
filter_size=1,
num_filters=stage_out_channels[-1],
padding=0,
stride=1,
name='conv5')
pool_last = fluid.layers.pool2d(
input=conv_last,
pool_size=7,
pool_stride=1,
pool_padding=0,
pool_type='avg')
output = fluid.layers.fc(input=pool_last,
size=class_dim,
param_attr=ParamAttr(
initializer=MSRA(), name='fc6_weights'),
bias_attr=ParamAttr(name='fc6_offset'))
return output
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride,
padding,
num_groups=1,
use_cudnn=True,
if_act=True,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(
initializer=MSRA(), name=name + '_weights'),
bias_attr=False)
bn_name = name + '_bn'
if if_act:
return fluid.layers.batch_norm(
input=conv,
act='relu',
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
else:
return fluid.layers.batch_norm(
input=conv,
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
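    # Channel shuffle: reshape (N, C, H, W) -> (N, groups, C // groups, H, W),
    # swap the group and channel axes, then flatten back so features from the
    # two branches are interleaved.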
def channel_shuffle(self, x, groups):
        batchsize, num_channels, height, width = x.shape
channels_per_group = num_channels // groups
# reshape
x = fluid.layers.reshape(
x=x, shape=[batchsize, groups, channels_per_group, height, width])
x = fluid.layers.transpose(x=x, perm=[0, 2, 1, 3, 4])
# flatten
x = fluid.layers.reshape(
x=x, shape=[batchsize, num_channels, height, width])
return x
def inverted_residual_unit(self,
input,
num_filters,
stride,
benchmodel,
name=None):
        assert stride in [1, 2], \
            "supported strides are {} but the input stride is {}".format([1, 2], stride)
oup_inc = num_filters // 2
inp = input.shape[1]
if benchmodel == 1:
x1, x2 = fluid.layers.split(
input,
num_or_sections=[input.shape[1] // 2, input.shape[1] // 2],
dim=1)
conv_pw = self.conv_bn_layer(
input=x2,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv1')
conv_dw = self.conv_bn_layer(
input=conv_pw,
num_filters=oup_inc,
filter_size=3,
stride=stride,
padding=1,
num_groups=oup_inc,
if_act=False,
use_cudnn=False,
name='stage_' + name + '_conv2')
conv_linear = self.conv_bn_layer(
input=conv_dw,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv3')
out = fluid.layers.concat([x1, conv_linear], axis=1)
else:
#branch1
conv_dw_1 = self.conv_bn_layer(
input=input,
num_filters=inp,
filter_size=3,
stride=stride,
padding=1,
num_groups=inp,
if_act=False,
use_cudnn=False,
name='stage_' + name + '_conv4')
conv_linear_1 = self.conv_bn_layer(
input=conv_dw_1,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv5')
#branch2
conv_pw_2 = self.conv_bn_layer(
input=input,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv1')
conv_dw_2 = self.conv_bn_layer(
input=conv_pw_2,
num_filters=oup_inc,
filter_size=3,
stride=stride,
padding=1,
num_groups=oup_inc,
if_act=False,
use_cudnn=False,
name='stage_' + name + '_conv2')
conv_linear_2 = self.conv_bn_layer(
input=conv_dw_2,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv3')
out = fluid.layers.concat([conv_linear_1, conv_linear_2], axis=1)
return self.channel_shuffle(out, 2)
def ShuffleNetV2_x0_25():
model = ShuffleNetV2(scale=0.25)
return model
def ShuffleNetV2_x0_33():
model = ShuffleNetV2(scale=0.33)
return model
def ShuffleNetV2_x0_5():
model = ShuffleNetV2(scale=0.5)
return model
def ShuffleNetV2_x1_0():
model = ShuffleNetV2(scale=1.0)
return model
def ShuffleNetV2_x1_5():
model = ShuffleNetV2(scale=1.5)
return model
def ShuffleNetV2_x2_0():
model = ShuffleNetV2(scale=2.0)
return model
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
__all__ = [
'ShuffleNetV2_x0_5_swish', 'ShuffleNetV2_x1_0_swish',
'ShuffleNetV2_x1_5_swish', 'ShuffleNetV2_x2_0_swish', 'ShuffleNetV2_swish'
]
class ShuffleNetV2_swish():
def __init__(self, scale=1.0):
self.scale = scale
def net(self, input, class_dim=1000):
scale = self.scale
stage_repeats = [4, 8, 4]
if scale == 0.5:
stage_out_channels = [-1, 24, 48, 96, 192, 1024]
elif scale == 1.0:
stage_out_channels = [-1, 24, 116, 232, 464, 1024]
elif scale == 1.5:
stage_out_channels = [-1, 24, 176, 352, 704, 1024]
elif scale == 2.0:
stage_out_channels = [-1, 24, 224, 488, 976, 2048]
else:
raise ValueError("""{} groups is not supported for
1x1 Grouped Convolutions""".format(num_groups))
#conv1
input_channel = stage_out_channels[1]
conv1 = self.conv_bn_layer(
input=input,
filter_size=3,
num_filters=input_channel,
padding=1,
stride=2,
name='stage1_conv')
pool1 = fluid.layers.pool2d(
input=conv1,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
conv = pool1
# bottleneck sequences
for idxstage in range(len(stage_repeats)):
numrepeat = stage_repeats[idxstage]
output_channel = stage_out_channels[idxstage + 2]
for i in range(numrepeat):
if i == 0:
conv = self.inverted_residual_unit(
input=conv,
num_filters=output_channel,
stride=2,
benchmodel=2,
name=str(idxstage + 2) + '_' + str(i + 1))
else:
conv = self.inverted_residual_unit(
input=conv,
num_filters=output_channel,
stride=1,
benchmodel=1,
name=str(idxstage + 2) + '_' + str(i + 1))
conv_last = self.conv_bn_layer(
input=conv,
filter_size=1,
num_filters=stage_out_channels[-1],
padding=0,
stride=1,
name='conv5')
pool_last = fluid.layers.pool2d(
input=conv_last,
pool_size=7,
pool_stride=1,
pool_padding=0,
pool_type='avg')
output = fluid.layers.fc(input=pool_last,
size=class_dim,
param_attr=ParamAttr(
initializer=MSRA(), name='fc6_weights'),
bias_attr=ParamAttr(name='fc6_offset'))
return output
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride,
padding,
num_groups=1,
use_cudnn=True,
if_act=True,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(
initializer=MSRA(), name=name + '_weights'),
bias_attr=False)
bn_name = name + '_bn'
if if_act:
return fluid.layers.batch_norm(
input=conv,
act='swish',
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
else:
return fluid.layers.batch_norm(
input=conv,
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def channel_shuffle(self, x, groups):
        batchsize, num_channels, height, width = x.shape
channels_per_group = num_channels // groups
# reshape
x = fluid.layers.reshape(
x=x, shape=[batchsize, groups, channels_per_group, height, width])
x = fluid.layers.transpose(x=x, perm=[0, 2, 1, 3, 4])
# flatten
x = fluid.layers.reshape(
x=x, shape=[batchsize, num_channels, height, width])
return x
def inverted_residual_unit(self,
input,
num_filters,
stride,
benchmodel,
name=None):
assert stride in [1, 2], \
"supported stride are {} but your stride is {}".format([1,2], stride)
oup_inc = num_filters // 2
inp = input.shape[1]
if benchmodel == 1:
x1, x2 = fluid.layers.split(
input,
num_or_sections=[input.shape[1] // 2, input.shape[1] // 2],
dim=1)
conv_pw = self.conv_bn_layer(
input=x2,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv1')
conv_dw = self.conv_bn_layer(
input=conv_pw,
num_filters=oup_inc,
filter_size=3,
stride=stride,
padding=1,
num_groups=oup_inc,
if_act=False,
use_cudnn=False,
name='stage_' + name + '_conv2')
conv_linear = self.conv_bn_layer(
input=conv_dw,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv3')
out = fluid.layers.concat([x1, conv_linear], axis=1)
else:
#branch1
conv_dw_1 = self.conv_bn_layer(
input=input,
num_filters=inp,
filter_size=3,
stride=stride,
padding=1,
num_groups=inp,
if_act=False,
use_cudnn=False,
name='stage_' + name + '_conv4')
conv_linear_1 = self.conv_bn_layer(
input=conv_dw_1,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv5')
#branch2
conv_pw_2 = self.conv_bn_layer(
input=input,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv1')
conv_dw_2 = self.conv_bn_layer(
input=conv_pw_2,
num_filters=oup_inc,
filter_size=3,
stride=stride,
padding=1,
num_groups=oup_inc,
if_act=False,
use_cudnn=False,
name='stage_' + name + '_conv2')
conv_linear_2 = self.conv_bn_layer(
input=conv_dw_2,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv3')
out = fluid.layers.concat([conv_linear_1, conv_linear_2], axis=1)
return self.channel_shuffle(out, 2)
def ShuffleNetV2_x0_5_swish():
model = ShuffleNetV2_swish(scale=0.5)
return model
def ShuffleNetV2_x1_0_swish():
model = ShuffleNetV2_swish(scale=1.0)
return model
def ShuffleNetV2_x1_5_swish():
model = ShuffleNetV2_swish(scale=1.5)
return model
def ShuffleNetV2_x2_0_swish():
model = ShuffleNetV2_swish(scale=2.0)
return model
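# Illustrative sketch (not part of the original file): channel_shuffle above
# implements the reshape -> transpose -> reshape trick from the ShuffleNetV2
# paper. The same permutation in NumPy, with hypothetical sizes, shows how
# channels from the two groups end up interleaved:
if __name__ == '__main__':
    import numpy as np
    x = np.arange(6).reshape(1, 6, 1, 1)           # NCHW tensor with 6 channels
    groups = 2
    y = x.reshape(1, groups, 6 // groups, 1, 1)    # split channels into groups
    y = y.transpose(0, 2, 1, 3, 4)                 # swap the group/channel axes
    y = y.reshape(1, 6, 1, 1)                      # flatten back to NCHW
    print(y.flatten())                             # [0 3 1 4 2 5]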
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = ["SqueezeNet", "SqueezeNet1_0", "SqueezeNet1_1"]
class SqueezeNet():
def __init__(self, version='1.0'):
self.version = version
def net(self, input, class_dim=1000):
version = self.version
assert version in ['1.0', '1.1'], \
"supported version are {} but input version is {}".format(['1.0', '1.1'], version)
if version == '1.0':
conv = fluid.layers.conv2d(
input,
num_filters=96,
filter_size=7,
stride=2,
act='relu',
param_attr=fluid.param_attr.ParamAttr(name="conv1_weights"),
bias_attr=ParamAttr(name='conv1_offset'))
conv = fluid.layers.pool2d(
conv, pool_size=3, pool_stride=2, pool_type='max')
conv = self.make_fire(conv, 16, 64, 64, name='fire2')
conv = self.make_fire(conv, 16, 64, 64, name='fire3')
conv = self.make_fire(conv, 32, 128, 128, name='fire4')
conv = fluid.layers.pool2d(
conv, pool_size=3, pool_stride=2, pool_type='max')
conv = self.make_fire(conv, 32, 128, 128, name='fire5')
conv = self.make_fire(conv, 48, 192, 192, name='fire6')
conv = self.make_fire(conv, 48, 192, 192, name='fire7')
conv = self.make_fire(conv, 64, 256, 256, name='fire8')
conv = fluid.layers.pool2d(
conv, pool_size=3, pool_stride=2, pool_type='max')
conv = self.make_fire(conv, 64, 256, 256, name='fire9')
else:
conv = fluid.layers.conv2d(
input,
num_filters=64,
filter_size=3,
stride=2,
padding=1,
act='relu',
param_attr=fluid.param_attr.ParamAttr(name="conv1_weights"),
bias_attr=ParamAttr(name='conv1_offset'))
conv = fluid.layers.pool2d(
conv, pool_size=3, pool_stride=2, pool_type='max')
conv = self.make_fire(conv, 16, 64, 64, name='fire2')
conv = self.make_fire(conv, 16, 64, 64, name='fire3')
conv = fluid.layers.pool2d(
conv, pool_size=3, pool_stride=2, pool_type='max')
conv = self.make_fire(conv, 32, 128, 128, name='fire4')
conv = self.make_fire(conv, 32, 128, 128, name='fire5')
conv = fluid.layers.pool2d(
conv, pool_size=3, pool_stride=2, pool_type='max')
conv = self.make_fire(conv, 48, 192, 192, name='fire6')
conv = self.make_fire(conv, 48, 192, 192, name='fire7')
conv = self.make_fire(conv, 64, 256, 256, name='fire8')
conv = self.make_fire(conv, 64, 256, 256, name='fire9')
conv = fluid.layers.dropout(conv, dropout_prob=0.5)
conv = fluid.layers.conv2d(
conv,
num_filters=class_dim,
filter_size=1,
act='relu',
param_attr=fluid.param_attr.ParamAttr(name="conv10_weights"),
bias_attr=ParamAttr(name='conv10_offset'))
conv = fluid.layers.pool2d(conv, pool_type='avg', global_pooling=True)
out = fluid.layers.flatten(conv)
return out
def make_fire_conv(self,
input,
num_filters,
filter_size,
padding=0,
name=None):
conv = fluid.layers.conv2d(
input,
num_filters=num_filters,
filter_size=filter_size,
padding=padding,
act='relu',
param_attr=fluid.param_attr.ParamAttr(name=name + "_weights"),
bias_attr=ParamAttr(name=name + '_offset'))
return conv
def make_fire(self,
input,
squeeze_channels,
expand1x1_channels,
expand3x3_channels,
name=None):
conv = self.make_fire_conv(
input, squeeze_channels, 1, name=name + '_squeeze1x1')
conv_path1 = self.make_fire_conv(
conv, expand1x1_channels, 1, name=name + '_expand1x1')
conv_path2 = self.make_fire_conv(
conv, expand3x3_channels, 3, 1, name=name + '_expand3x3')
out = fluid.layers.concat([conv_path1, conv_path2], axis=1)
return out
def SqueezeNet1_0():
model = SqueezeNet(version='1.0')
return model
def SqueezeNet1_1():
model = SqueezeNet(version='1.1')
return model
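# Illustrative sketch (not part of the original file): the fire module above
# replaces a dense 3x3 convolution with a 1x1 squeeze layer feeding parallel
# 1x1/3x3 expand layers. Counting weights for the hypothetical fire2 sizes
# (96 input channels, squeeze 16, expand 64 + 64; biases ignored) shows the
# parameter savings:
if __name__ == '__main__':
    in_c, squeeze, e1, e3 = 96, 16, 64, 64
    fire_params = in_c * squeeze + squeeze * e1 + 3 * 3 * squeeze * e3
    dense_params = 3 * 3 * in_c * (e1 + e3)   # plain 3x3 conv baseline
    print(fire_params, dense_params)          # 11776 vs 110592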
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
__all__ = ["VGGNet", "VGG11", "VGG13", "VGG16", "VGG19"]
class VGGNet():
def __init__(self, layers=16):
self.layers = layers
def net(self, input, class_dim=1000):
layers = self.layers
vgg_spec = {
11: ([1, 1, 2, 2, 2]),
13: ([2, 2, 2, 2, 2]),
16: ([2, 2, 3, 3, 3]),
19: ([2, 2, 4, 4, 4])
}
assert layers in vgg_spec.keys(), \
"supported layers are {} but input layer is {}".format(vgg_spec.keys(), layers)
nums = vgg_spec[layers]
conv1 = self.conv_block(input, 64, nums[0], name="conv1_")
conv2 = self.conv_block(conv1, 128, nums[1], name="conv2_")
conv3 = self.conv_block(conv2, 256, nums[2], name="conv3_")
conv4 = self.conv_block(conv3, 512, nums[3], name="conv4_")
conv5 = self.conv_block(conv4, 512, nums[4], name="conv5_")
fc_dim = 4096
fc_name = ["fc6", "fc7", "fc8"]
fc1 = fluid.layers.fc(
input=conv5,
size=fc_dim,
act='relu',
param_attr=fluid.param_attr.ParamAttr(
name=fc_name[0] + "_weights"),
bias_attr=fluid.param_attr.ParamAttr(name=fc_name[0] + "_offset"))
fc1 = fluid.layers.dropout(x=fc1, dropout_prob=0.5)
fc2 = fluid.layers.fc(
input=fc1,
size=fc_dim,
act='relu',
param_attr=fluid.param_attr.ParamAttr(
name=fc_name[1] + "_weights"),
bias_attr=fluid.param_attr.ParamAttr(name=fc_name[1] + "_offset"))
fc2 = fluid.layers.dropout(x=fc2, dropout_prob=0.5)
out = fluid.layers.fc(
input=fc2,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
name=fc_name[2] + "_weights"),
bias_attr=fluid.param_attr.ParamAttr(name=fc_name[2] + "_offset"))
return out
def conv_block(self, input, num_filter, groups, name=None):
conv = input
for i in range(groups):
conv = fluid.layers.conv2d(
input=conv,
num_filters=num_filter,
filter_size=3,
stride=1,
padding=1,
act='relu',
param_attr=fluid.param_attr.ParamAttr(
name=name + str(i + 1) + "_weights"),
bias_attr=False)
return fluid.layers.pool2d(
input=conv, pool_size=2, pool_type='max', pool_stride=2)
def VGG11():
model = VGGNet(layers=11)
return model
def VGG13():
model = VGGNet(layers=13)
return model
def VGG16():
model = VGGNet(layers=16)
return model
def VGG19():
model = VGGNet(layers=19)
return model
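# Illustrative sketch (not part of the original file): the counts in vgg_spec
# are 3x3 convolutions per block; adding the three fully connected layers
# recovers the usual VGG naming:
if __name__ == '__main__':
    spec = {11: [1, 1, 2, 2, 2], 13: [2, 2, 2, 2, 2],
            16: [2, 2, 3, 3, 3], 19: [2, 2, 4, 4, 4]}
    for depth, nums in sorted(spec.items()):
        assert sum(nums) + 3 == depth
        print('VGG{}: {} conv + 3 fc'.format(depth, sum(nums)))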
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import sys
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = ['Xception', 'Xception41', 'Xception65', 'Xception71']
class Xception(object):
"""Xception"""
def __init__(self, entry_flow_block_num=3, middle_flow_block_num=8):
self.entry_flow_block_num = entry_flow_block_num
self.middle_flow_block_num = middle_flow_block_num
return
def net(self, input, class_dim=1000):
conv = self.entry_flow(input, self.entry_flow_block_num)
conv = self.middle_flow(conv, self.middle_flow_block_num)
conv = self.exit_flow(conv, class_dim)
return conv
def entry_flow(self, input, block_num=3):
'''xception entry_flow'''
name = "entry_flow"
conv = self.conv_bn_layer(
input=input,
num_filters=32,
filter_size=3,
stride=2,
act='relu',
name=name + "_conv1")
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name=name + "_conv2")
if block_num == 3:
relu_first = [False, True, True]
num_filters = [128, 256, 728]
stride = [2, 2, 2]
elif block_num == 5:
relu_first = [False, True, True, True, True]
num_filters = [128, 256, 256, 728, 728]
stride = [2, 1, 2, 1, 2]
        else:
            raise ValueError(
                "supported entry_flow_block_num are [3, 5] "
                "but the input is {}".format(block_num))
for block in range(block_num):
curr_name = "{}_{}".format(name, block)
conv = self.entry_flow_bottleneck_block(
conv,
num_filters=num_filters[block],
name=curr_name,
stride=stride[block],
relu_first=relu_first[block])
return conv
def entry_flow_bottleneck_block(self,
input,
num_filters,
name,
stride=2,
relu_first=False):
'''entry_flow_bottleneck_block'''
short = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=1,
stride=stride,
padding=0,
act=None,
param_attr=ParamAttr(name + "_branch1_weights"),
bias_attr=False)
conv0 = input
if relu_first:
conv0 = fluid.layers.relu(conv0)
conv1 = self.separable_conv(
conv0, num_filters, stride=1, name=name + "_branch2a_weights")
conv2 = fluid.layers.relu(conv1)
conv2 = self.separable_conv(
conv2, num_filters, stride=1, name=name + "_branch2b_weights")
pool = fluid.layers.pool2d(
input=conv2,
pool_size=3,
pool_stride=stride,
pool_padding=1,
pool_type='max')
return fluid.layers.elementwise_add(x=short, y=pool)
def middle_flow(self, input, block_num=8):
'''xception middle_flow'''
num_filters = 728
conv = input
for block in range(block_num):
name = "middle_flow_{}".format(block)
conv = self.middle_flow_bottleneck_block(conv, num_filters, name)
return conv
def middle_flow_bottleneck_block(self, input, num_filters, name):
'''middle_flow_bottleneck_block'''
conv0 = fluid.layers.relu(input)
conv0 = self.separable_conv(
conv0,
num_filters=num_filters,
stride=1,
name=name + "_branch2a_weights")
conv1 = fluid.layers.relu(conv0)
conv1 = self.separable_conv(
conv1,
num_filters=num_filters,
stride=1,
name=name + "_branch2b_weights")
conv2 = fluid.layers.relu(conv1)
conv2 = self.separable_conv(
conv2,
num_filters=num_filters,
stride=1,
name=name + "_branch2c_weights")
return fluid.layers.elementwise_add(x=input, y=conv2)
def exit_flow(self, input, class_dim):
'''xception exit flow'''
name = "exit_flow"
num_filters1 = 728
num_filters2 = 1024
conv0 = self.exit_flow_bottleneck_block(
input, num_filters1, num_filters2, name=name + "_1")
conv1 = self.separable_conv(
conv0, num_filters=1536, stride=1, name=name + "_2")
conv1 = fluid.layers.relu(conv1)
conv2 = self.separable_conv(
conv1, num_filters=2048, stride=1, name=name + "_3")
conv2 = fluid.layers.relu(conv2)
pool = fluid.layers.pool2d(
input=conv2, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
name='fc_weights',
initializer=fluid.initializer.Uniform(-stdv, stdv)),
bias_attr=fluid.param_attr.ParamAttr(name='fc_offset'))
return out
def exit_flow_bottleneck_block(self, input, num_filters1, num_filters2,
name):
'''entry_flow_bottleneck_block'''
short = fluid.layers.conv2d(
input=input,
num_filters=num_filters2,
filter_size=1,
stride=2,
padding=0,
act=None,
param_attr=ParamAttr(name + "_branch1_weights"),
bias_attr=False)
conv0 = fluid.layers.relu(input)
conv1 = self.separable_conv(
conv0, num_filters1, stride=1, name=name + "_branch2a_weights")
conv2 = fluid.layers.relu(conv1)
conv2 = self.separable_conv(
conv2, num_filters2, stride=1, name=name + "_branch2b_weights")
pool = fluid.layers.pool2d(
input=conv2,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
return fluid.layers.elementwise_add(x=short, y=pool)
def separable_conv(self, input, num_filters, stride=1, name=None):
"""separable_conv"""
pointwise_conv = self.conv_bn_layer(
input=input,
filter_size=1,
num_filters=num_filters,
stride=1,
name=name + "_sep")
depthwise_conv = self.conv_bn_layer(
input=pointwise_conv,
filter_size=3,
num_filters=num_filters,
stride=stride,
groups=num_filters,
use_cudnn=False,
name=name + "_dw")
return depthwise_conv
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
use_cudnn=True,
name=None):
"""conv_bn_layer"""
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False,
use_cudnn=use_cudnn)
bn_name = "bn_" + name
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def Xception41():
model = Xception(entry_flow_block_num=3, middle_flow_block_num=8)
return model
def Xception65():
model = Xception(entry_flow_block_num=3, middle_flow_block_num=16)
return model
def Xception71():
model = Xception(entry_flow_block_num=5, middle_flow_block_num=16)
return model
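# Illustrative sketch (not part of the original file): separable_conv above
# factors a dense 3x3 convolution into a 1x1 pointwise step plus a 3x3
# depthwise step. Weight counts for the hypothetical 728-channel middle-flow
# width (biases and BN omitted):
if __name__ == '__main__':
    c = 728
    separable = c * c + 3 * 3 * c      # pointwise + depthwise weights
    dense = 3 * 3 * c * c              # plain 3x3 conv baseline
    print(separable, dense)            # 536536 vs 4769856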
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import contextlib
import paddle
import math
import paddle.fluid as fluid
from .model_libs import scope, name_scope
from .model_libs import bn, bn_relu, relu
from .model_libs import conv
from .model_libs import seperate_conv
__all__ = ['Xception41_deeplab', 'Xception65_deeplab', 'Xception71_deeplab']
def check_data(data, number):
if type(data) == int:
return [data] * number
assert len(data) == number
return data
def check_stride(s, os):
    return s <= os
def check_points(count, points):
    if points is None:
        return False
    if isinstance(points, list):
        return count in points
    return count == points
class Xception():
def __init__(self, backbone="xception_65"):
self.bottleneck_params = self.gen_bottleneck_params(backbone)
self.backbone = backbone
def gen_bottleneck_params(self, backbone='xception_65'):
if backbone == 'xception_65':
bottleneck_params = {
"entry_flow": (3, [2, 2, 2], [128, 256, 728]),
"middle_flow": (16, 1, 728),
"exit_flow":
(2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
}
elif backbone == 'xception_41':
bottleneck_params = {
"entry_flow": (3, [2, 2, 2], [128, 256, 728]),
"middle_flow": (8, 1, 728),
"exit_flow":
(2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
}
elif backbone == 'xception_71':
bottleneck_params = {
"entry_flow": (5, [2, 1, 2, 1, 2], [128, 256, 256, 728, 728]),
"middle_flow": (16, 1, 728),
"exit_flow":
(2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
}
else:
raise Exception(
"xception backbont only support xception_41/xception_65/xception_71"
)
return bottleneck_params
def net(self,
input,
output_stride=32,
class_dim=1000,
end_points=None,
decode_points=None):
self.stride = 2
self.block_point = 0
self.output_stride = output_stride
self.decode_points = decode_points
self.short_cuts = dict()
with scope(self.backbone):
# Entry flow
data = self.entry_flow(input)
if check_points(self.block_point, end_points):
return data, self.short_cuts
# Middle flow
data = self.middle_flow(data)
if check_points(self.block_point, end_points):
return data, self.short_cuts
# Exit flow
data = self.exit_flow(data)
if check_points(self.block_point, end_points):
return data, self.short_cuts
data = fluid.layers.reduce_mean(data, [2, 3], keep_dim=True)
data = fluid.layers.dropout(data, 0.5)
stdv = 1.0 / math.sqrt(data.shape[1] * 1.0)
with scope("logit"):
out = fluid.layers.fc(
input=data,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
name='fc_weights',
initializer=fluid.initializer.Uniform(-stdv, stdv)),
bias_attr=fluid.param_attr.ParamAttr(name='fc_bias'))
return out
def entry_flow(self, data):
param_attr = fluid.ParamAttr(
name=name_scope + 'weights',
regularizer=None,
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=0.09))
with scope("entry_flow"):
with scope("conv1"):
data = bn_relu(
conv(
data,
32,
3,
stride=2,
padding=1,
param_attr=param_attr))
with scope("conv2"):
data = bn_relu(
conv(
data,
64,
3,
stride=1,
padding=1,
param_attr=param_attr))
# get entry flow params
block_num = self.bottleneck_params["entry_flow"][0]
strides = self.bottleneck_params["entry_flow"][1]
chns = self.bottleneck_params["entry_flow"][2]
strides = check_data(strides, block_num)
chns = check_data(chns, block_num)
# params to control your flow
s = self.stride
block_point = self.block_point
output_stride = self.output_stride
with scope("entry_flow"):
for i in range(block_num):
block_point = block_point + 1
with scope("block" + str(i + 1)):
stride = strides[i] if check_stride(s * strides[i],
output_stride) else 1
data, short_cuts = self.xception_block(data, chns[i],
[1, 1, stride])
s = s * stride
if check_points(block_point, self.decode_points):
self.short_cuts[block_point] = short_cuts[1]
self.stride = s
self.block_point = block_point
return data
def middle_flow(self, data):
block_num = self.bottleneck_params["middle_flow"][0]
strides = self.bottleneck_params["middle_flow"][1]
chns = self.bottleneck_params["middle_flow"][2]
strides = check_data(strides, block_num)
chns = check_data(chns, block_num)
# params to control your flow
s = self.stride
block_point = self.block_point
output_stride = self.output_stride
with scope("middle_flow"):
for i in range(block_num):
block_point = block_point + 1
with scope("block" + str(i + 1)):
stride = strides[i] if check_stride(s * strides[i],
output_stride) else 1
data, short_cuts = self.xception_block(
                        data, chns[i], [1, 1, stride], skip_conv=False)
s = s * stride
if check_points(block_point, self.decode_points):
self.short_cuts[block_point] = short_cuts[1]
self.stride = s
self.block_point = block_point
return data
def exit_flow(self, data):
block_num = self.bottleneck_params["exit_flow"][0]
strides = self.bottleneck_params["exit_flow"][1]
chns = self.bottleneck_params["exit_flow"][2]
strides = check_data(strides, block_num)
chns = check_data(chns, block_num)
assert (block_num == 2)
# params to control your flow
s = self.stride
block_point = self.block_point
output_stride = self.output_stride
with scope("exit_flow"):
with scope('block1'):
block_point += 1
stride = strides[0] if check_stride(s * strides[0],
output_stride) else 1
data, short_cuts = self.xception_block(data, chns[0],
[1, 1, stride])
s = s * stride
if check_points(block_point, self.decode_points):
self.short_cuts[block_point] = short_cuts[1]
with scope('block2'):
block_point += 1
stride = strides[1] if check_stride(s * strides[1],
output_stride) else 1
data, short_cuts = self.xception_block(
data,
chns[1], [1, 1, stride],
dilation=2,
has_skip=False,
activation_fn_in_separable_conv=True)
s = s * stride
if check_points(block_point, self.decode_points):
self.short_cuts[block_point] = short_cuts[1]
self.stride = s
self.block_point = block_point
return data
def xception_block(self,
input,
channels,
strides=1,
filters=3,
dilation=1,
skip_conv=True,
has_skip=True,
activation_fn_in_separable_conv=False):
repeat_number = 3
channels = check_data(channels, repeat_number)
filters = check_data(filters, repeat_number)
strides = check_data(strides, repeat_number)
data = input
results = []
for i in range(repeat_number):
with scope('separable_conv' + str(i + 1)):
if not activation_fn_in_separable_conv:
data = relu(data)
data = seperate_conv(
data,
channels[i],
strides[i],
filters[i],
dilation=dilation)
else:
data = seperate_conv(
data,
channels[i],
strides[i],
filters[i],
dilation=dilation,
act=relu)
results.append(data)
if not has_skip:
return data, results
if skip_conv:
param_attr = fluid.ParamAttr(
name=name_scope + 'weights',
regularizer=None,
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=0.09))
with scope('shortcut'):
skip = bn(
conv(
input,
channels[-1],
1,
strides[-1],
groups=1,
padding=0,
param_attr=param_attr))
else:
skip = input
return data + skip, results
def Xception41_deeplab():
model = Xception("xception_41")
return model
def Xception65_deeplab():
model = Xception("xception_65")
return model
def Xception71_deeplab():
model = Xception("xception_71")
return model
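# Illustrative sketch (not part of the original file): check_stride above
# caps the accumulated stride at output_stride; once the running product
# would exceed it, a block falls back to stride 1. A pure-Python trace over
# the three entry-flow blocks plus exit-flow block1 of xception_65, for a
# hypothetical output_stride of 16:
if __name__ == '__main__':
    s, output_stride = 2, 16               # stride 2 already spent by conv1
    for requested in [2, 2, 2, 2]:
        applied = requested if check_stride(s * requested, output_stride) else 1
        s *= applied
        print('requested', requested, '-> applied', applied, 'cumulative', s)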
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import paddle
import paddle.fluid as fluid
__all__ = ['CELoss', 'MixCELoss', 'GoogLeNetLoss']
class Loss(object):
"""
Loss
"""
def __init__(self, class_dim=1000, epsilon=None):
assert class_dim > 1, "class_dim=%d is not larger than 1" % (class_dim)
self._class_dim = class_dim
        if epsilon is not None and 0.0 <= epsilon <= 1.0:
self._epsilon = epsilon
self._label_smoothing = True
else:
self._epsilon = None
self._label_smoothing = False
def _labelsmoothing(self, target):
one_hot_target = fluid.layers.one_hot(
input=target, depth=self._class_dim)
soft_target = fluid.layers.label_smooth(
label=one_hot_target, epsilon=self._epsilon, dtype="float32")
return soft_target
def _crossentropy(self, input, target):
if self._label_smoothing:
target = self._labelsmoothing(target)
softmax_out = fluid.layers.softmax(input, use_cudnn=False)
cost = fluid.layers.cross_entropy(
input=softmax_out, label=target, soft_label=self._label_smoothing)
avg_cost = fluid.layers.mean(cost)
return avg_cost
def __call__(self, input, target):
pass
class CELoss(Loss):
"""
Cross entropy loss
"""
def __init__(self, class_dim=1000, epsilon=None):
super(CELoss, self).__init__(class_dim, epsilon)
def __call__(self, input, target):
cost = self._crossentropy(input, target)
return cost
class MixCELoss(Loss):
"""
    Cross entropy loss with mix(mixup, cutmix, fmix)
"""
def __init__(self, class_dim=1000, epsilon=None):
super(MixCELoss, self).__init__(class_dim, epsilon)
def __call__(self, input, target0, target1, lam):
cost0 = self._crossentropy(input, target0)
cost1 = self._crossentropy(input, target1)
cost = lam * cost0 + (1.0 - lam) * cost1
avg_cost = fluid.layers.mean(cost)
return avg_cost
class GoogLeNetLoss(Loss):
"""
Cross entropy loss used after googlenet
"""
def __init__(self, class_dim=1000, epsilon=None):
super(GoogLeNetLoss, self).__init__(class_dim, epsilon)
def __call__(self, input0, input1, input2, target):
cost0 = self._crossentropy(input0, target)
cost1 = self._crossentropy(input1, target)
cost2 = self._crossentropy(input2, target)
cost = cost0 + 0.3 * cost1 + 0.3 * cost2
avg_cost = fluid.layers.mean(cost)
return avg_cost
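# Illustrative sketch (not part of the original file): _labelsmoothing above
# delegates to fluid's label_smooth, which blends the one-hot target with a
# uniform distribution. The same formula in NumPy, for a hypothetical
# 4-class problem with epsilon=0.1:
if __name__ == '__main__':
    import numpy as np
    epsilon, classes = 0.1, 4
    one_hot = np.eye(classes)[2]          # ground-truth label is class 2
    soft = one_hot * (1.0 - epsilon) + epsilon / classes
    print(soft)                           # [0.025 0.025 0.925 0.025]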
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import types
from . import architectures
from difflib import SequenceMatcher
def get_architectures():
"""
    get all model architectures
"""
names = []
for k, v in architectures.__dict__.items():
        if isinstance(v, types.FunctionType) or isinstance(v, type):
names.append(k)
return names
def similar_architectures(name='', thresh=0.1, topk=10):
"""
    infer similar architectures
"""
scores = []
names = get_architectures()
for idx, n in enumerate(names):
if n[:2] == '__': continue
score = SequenceMatcher(None, n.lower(), name.lower()).quick_ratio()
if score > thresh: scores.append((idx, score))
scores.sort(key=lambda x: x[1], reverse=True)
similar_names = [names[s[0]] for s in scores[:min(topk, len(scores))]]
return similar_names
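# Illustrative sketch (not part of the original file): SequenceMatcher's
# quick_ratio drives the suggestions above; ranking a hypothetical typo
# against a few hand-picked names shows the effect:
if __name__ == '__main__':
    candidates = ['ResNet50', 'ResNet50_vd', 'ResNeXt50_32x4d', 'VGG16']
    typo = 'ResNet5'
    ranked = sorted(
        candidates,
        key=lambda n: SequenceMatcher(None, n.lower(), typo.lower()).quick_ratio(),
        reverse=True)
    print(ranked)    # ResNet50 ranks first, VGG16 last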
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import optimizer
from . import learning_rate
from .optimizer import OptimizerBuilder
from .learning_rate import LearningRateBuilder
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import math
import paddle.fluid as fluid
import paddle.fluid.layers.ops as ops
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
__all__ = ['LearningRateBuilder']
class Linear(object):
"""
Linear learning rate decay
Args:
lr(float): initial learning rate
steps(int): total decay steps
end_lr(float): end learning rate, default: 0.0.
"""
def __init__(self, lr, steps, end_lr=0.0, **kwargs):
super(Linear, self).__init__()
self.lr = lr
self.steps = steps
self.end_lr = end_lr
def __call__(self):
learning_rate = fluid.layers.polynomial_decay(
self.lr, self.steps, self.end_lr, power=1)
return learning_rate
class Cosine(object):
"""
Cosine learning rate decay
lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1)
Args:
lr(float): initial learning rate
step_each_epoch(int): steps each epoch
epochs(int): total training epochs
"""
def __init__(self, lr, step_each_epoch, epochs, **kwargs):
super(Cosine, self).__init__()
self.lr = lr
self.step_each_epoch = step_each_epoch
self.epochs = epochs
def __call__(self):
learning_rate = fluid.layers.cosine_decay(
learning_rate=self.lr,
step_each_epoch=self.step_each_epoch,
epochs=self.epochs)
return learning_rate
class Piecewise(object):
"""
Piecewise learning rate decay
Args:
lr(float): initial learning rate
step_each_epoch(int): steps each epoch
decay_epochs(list): piecewise decay epochs
gamma(float): decay factor
"""
def __init__(self, lr, step_each_epoch, decay_epochs, gamma=0.1, **kwargs):
super(Piecewise, self).__init__()
self.bd = [step_each_epoch * e for e in decay_epochs]
self.lr = [lr * (gamma**i) for i in range(len(self.bd) + 1)]
def __call__(self):
learning_rate = fluid.layers.piecewise_decay(self.bd, self.lr)
return learning_rate
class CosineWarmup(object):
"""
Cosine learning rate decay with warmup
[0, warmup_epoch): linear warmup
[warmup_epoch, epochs): cosine decay
Args:
lr(float): initial learning rate
step_each_epoch(int): steps each epoch
epochs(int): total training epochs
warmup_epoch(int): epoch num of warmup
"""
def __init__(self, lr, step_each_epoch, epochs, warmup_epoch=5, **kwargs):
super(CosineWarmup, self).__init__()
self.lr = lr
self.step_each_epoch = step_each_epoch
self.epochs = epochs
self.warmup_epoch = fluid.layers.fill_constant(
shape=[1],
value=float(warmup_epoch),
dtype='float32',
force_cpu=True)
def __call__(self):
global_step = _decay_step_counter()
learning_rate = fluid.layers.tensor.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
persistable=True,
name="learning_rate")
epoch = ops.floor(global_step / self.step_each_epoch)
with fluid.layers.control_flow.Switch() as switch:
with switch.case(epoch < self.warmup_epoch):
decayed_lr = self.lr * \
(global_step / (self.step_each_epoch * self.warmup_epoch))
fluid.layers.tensor.assign(
input=decayed_lr, output=learning_rate)
with switch.default():
current_step = global_step - self.warmup_epoch * self.step_each_epoch
total_step = (
self.epochs - self.warmup_epoch) * self.step_each_epoch
decayed_lr = self.lr * \
(ops.cos(current_step * math.pi / total_step) + 1) / 2
fluid.layers.tensor.assign(
input=decayed_lr, output=learning_rate)
return learning_rate
class LearningRateBuilder():
"""
Build learning rate variable
https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/layers_cn.html
Args:
function(str): class name of learning rate
params(dict): parameters used for init the class
"""
def __init__(self,
function='Linear',
params={'lr': 0.1,
'steps': 100,
'end_lr': 0.0}):
self.function = function
self.params = params
def __call__(self):
mod = sys.modules[__name__]
lr = getattr(mod, self.function)(**self.params)()
return lr
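# Illustrative sketch (not part of the original file): the CosineWarmup
# schedule above, evaluated in plain Python for hypothetical settings
# (lr=0.1, 100 steps per epoch, 10 epochs, 2 warmup epochs):
if __name__ == '__main__':
    lr0, step_each_epoch, epochs, warmup_epoch = 0.1, 100, 10, 2
    for step in [0, 100, 200, 600, 999]:
        if step < warmup_epoch * step_each_epoch:
            lr = lr0 * step / float(step_each_epoch * warmup_epoch)
        else:
            cur = step - warmup_epoch * step_each_epoch
            total = (epochs - warmup_epoch) * step_each_epoch
            lr = lr0 * (math.cos(cur * math.pi / total) + 1) / 2
        print(step, round(lr, 5))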
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid.optimizer as pfopt
import paddle.fluid.regularizer as pfreg
__all__ = ['OptimizerBuilder']
class OptimizerBuilder(object):
"""
    Build optimizer with the fluid api in fluid.optimizer,
    such as fluid.optimizer.Momentum()
https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/optimizer_cn.html
https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/regularizer_cn.html
Args:
        function(str): optimizer class name in fluid.optimizer
params(dict): parameters used for init the class
regularizer (dict): parameters used for create regularization
"""
def __init__(self,
function='Momentum',
params={'momentum': 0.9},
regularizer=None):
self.function = function
self.params = params
# create regularizer
if regularizer is not None:
reg_func = regularizer['function'] + 'Decay'
reg_factor = regularizer['factor']
reg = getattr(pfreg, reg_func)(reg_factor)
self.params['regularization'] = reg
def __call__(self, learning_rate):
opt = getattr(pfopt, self.function)
return opt(learning_rate=learning_rate, **self.params)
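# Illustrative sketch (not part of the original file): wiring the OPTIMIZER
# section of the YAML configs above into the builder; the values below are
# the hypothetical ones from those configs. Applying the optimizer needs a
# fluid program context, so only construction is shown:
if __name__ == '__main__':
    builder = OptimizerBuilder(
        function='Momentum',
        params={'momentum': 0.9},
        regularizer={'function': 'L2', 'factor': 0.0001})
    print(builder.function, builder.params)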
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import dl
import os
import shutil
url = "https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar"
class DownloadDecompressTestCase(unittest.TestCase):
def setUp(self):
print("Test Download and Decompress Function...")
def test_decompress(self):
if os.path.exists('./ResNet50_vd_pretrained'):
shutil.rmtree('./ResNet50_vd_pretrained')
if os.path.exists("./ResNet50_vd_pretrained.tar"):
shutil.rmtree("./ResNet50_vd_pretrained.tar")
dl.decompress(dl.download(url, "./"))
self.assertTrue(os.path.exists("./ResNet50_vd_pretrained"))
shutil.rmtree('./ResNet50_vd_pretrained')
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ppcls.data.imaug import DecodeImage
from ppcls.data.imaug import ResizeImage
from ppcls.data.imaug import RandCropImage
from ppcls.data.imaug import RandFlipImage
from ppcls.data.imaug import NormalizeImage
from ppcls.data.imaug import ToCHWImage
from ppcls.data.imaug import ImageNetPolicy
from ppcls.data.imaug import RandAugment
from ppcls.data.imaug import Cutout
from ppcls.data.imaug import HideAndSeek
from ppcls.data.imaug import RandomErasing
from ppcls.data.imaug import GridMask
from ppcls.data.imaug import MixupOperator
from ppcls.data.imaug import CutmixOperator
from ppcls.data.imaug import FmixOperator
from ppcls.data.imaug import transform
import numpy as np
fname = './test/demo.jpeg'
size = 224
img_mean = [0.485, 0.456, 0.406]
img_std = [0.229, 0.224, 0.225]
img_scale = 1.0 / 255.0
decode_op = DecodeImage()
randcrop_op = RandCropImage(size=(size, size))
randflip_op = RandFlipImage(flip_code=1)
normalize_op = NormalizeImage(
scale=img_scale, mean=img_mean, std=img_std, order='')
tochw_op = ToCHWImage()
data = open(fname, 'rb').read()
def print_function_name(func):
""" print function name"""
def wrapper(*args, **kwargs):
""" wrapper """
print("========Test Fuction: [%s]:" % (func.__name__))
func(*args, **kwargs)
print("========Test Fuction: [%s] done!\n" % (func.__name__))
return wrapper
@print_function_name
def test_decode():
""" test decode operator """
img = decode_op(data)
print('img shape is %s' % (str(img.shape)))
@print_function_name
def test_randcrop():
""" test randcrop operator """
img = decode_op(data)
img = randcrop_op(img)
assert img.shape == (size, size, 3), \
'image shape[%s] should be equal to [%s]' % (img.shape, (size, size, 3))
@print_function_name
def test_randflip():
""" test randflip operator """
import cv2
img = transform(data, [decode_op, randcrop_op])
    for i in range(10):
flip_img = randflip_op(img)
if np.array_equal(cv2.flip(img, 1), flip_img):
break
    assert np.array_equal(cv2.flip(img, 1), flip_img), \
        'you should check the randflip operator'
@print_function_name
def test_normalize():
""" test normalize operator """
img = transform(data, [decode_op, randcrop_op])
norm_img = normalize_op(img)
assert norm_img.dtype == np.float32, 'img.dtype should be float32 after normalizing'
assert norm_img.shape == (size, size, 3), \
'image shape[%s] should be equal to [%s]' % (norm_img.shape, (size, size, 3))
print('max value of the img after normalizing is : %f' %
(np.max(norm_img.flatten())))
print('min value of the img after normalizing is : %f' %
(np.min(norm_img.flatten())))
@print_function_name
def test_tochw():
""" test tochw operator """
img = transform(data, [decode_op, randcrop_op, randflip_op, normalize_op])
tochw_img = tochw_op(img)
assert tochw_img.dtype == np.float32, 'img.dtype should be float32 after tochw'
assert tochw_img.shape == (3, size, size), \
'image shape[%s] should be equal to [%s]' % (tochw_img.shape, (3, size, size))
@print_function_name
def test_autoaugment():
""" test autoaugment operator """
from PIL import Image
autoaugment_op = ImageNetPolicy()
img = transform(data, [decode_op, randcrop_op])
aa_img = autoaugment_op(img)
assert aa_img.dtype == np.uint8, 'img.dtype should be uint8 after autoaugment'
assert aa_img.shape == (size, size, 3), \
'image shape[%s] should be equal to [%s]' % (aa_img.shape, (size, size, 3))
@print_function_name
def test_randaugment():
""" test randaugment operator """
from PIL import Image
randaugment_op = RandAugment(3, 1)
img = transform(data, [decode_op, randcrop_op])
ra_img = randaugment_op(img)
assert ra_img.dtype == np.uint8, 'img.dtype should be uint8 after randaugment'
assert ra_img.shape == (size, size, 3), \
'image shape[%s] should be equal to [%s]' % (ra_img.shape, (size, size, 3))
@print_function_name
def test_cutout():
""" test cutout operator """
cutout_op = Cutout()
img = transform(data, [decode_op, randcrop_op])
cutout_img = cutout_op(img)
assert cutout_img.dtype == np.uint8, 'img.dtype should be uint8 after cutout'
assert cutout_img.shape == (size, size, 3), \
'image shape[%s] should be equal to [%s]' % (cutout_img.shape, (size, size, 3))
@print_function_name
def test_hideandseek():
""" test hide and seek operator """
img = transform(
data, [decode_op, randcrop_op, randflip_op, normalize_op, tochw_op])
hide_and_seek_op = HideAndSeek()
hs_img = hide_and_seek_op(img)
assert hs_img.dtype == np.float32, 'img.dtype should be float32 after hide and seek'
assert hs_img.shape == (3, size, size), \
'image shape[%s] should be equal to [%s]' % (hs_img.shape, (3, size, size))
@print_function_name
def test_randerasing():
""" test randerasing operator """
img = transform(
data, [decode_op, randcrop_op, randflip_op, normalize_op, tochw_op])
randomerasing_op = RandomErasing()
re_img = randomerasing_op(img)
assert re_img.dtype == np.float32, 'img.dtype should be float32 after randomerasing'
assert re_img.shape == (3, size, size), \
'image shape[%s] should be equal to [%s]' % (re_img.shape, (3, size, size))
@print_function_name
def test_gridmask():
""" test gridmask operator """
img = transform(
data, [decode_op, randcrop_op, randflip_op, normalize_op, tochw_op])
gridmask_op = GridMask(
d1=96, d2=224, rotate=360, ratio=0.6, mode=1, prob=0.8)
gm_img = gridmask_op(img)
assert gm_img.dtype == np.float32, 'img.dtype should be float32 after gridmask'
assert gm_img.shape == (3, size, size), \
        'image shape[%s] should be equal to [%s]' % (gm_img.shape, (3, size, size))
def generate_batch(batch_size=32):
""" generate_batch """
import random
ops = [decode_op, randcrop_op, randflip_op, normalize_op, tochw_op]
batch = [(transform(data, ops), random.randint(0, 1000))
             for i in range(batch_size)]
return batch
def test_batch_operator(operator, batch_size):
""" test batch operator """
batch = generate_batch(batch_size)
assert len(batch) == batch_size, \
'num of samples not equal to batch_size: %d != %d' % (len(batch), batch_size)
assert len(batch[0]) == 2, \
'length of sample not equal to 2: %d != 2' % (len(batch[0]))
import time
tic = time.time()
new_batch = operator(batch)
cost = time.time() - tic
print("operator cost: %.4fms" % (cost * 1000))
assert len(batch) == len(new_batch), \
'num of samples not equal: %d != %d' % (len(batch), len(new_batch))
assert len(new_batch[0]) == 4, \
'length of sample not equal to 4: %d != 4' % (len(new_batch[0]))
@print_function_name
def test_mixup():
""" test mixup operator """
batch_size = 32
mixup_op = MixupOperator(alpha=0.2)
test_batch_operator(mixup_op, batch_size)
@print_function_name
def test_cutmix():
""" test cutmix operator """
batch_size = 32
cutmix_op = CutmixOperator(alpha=0.2)
test_batch_operator(cutmix_op, batch_size)
@print_function_name
def test_fmix():
""" test fmix operator """
batch_size = 32
fmix_op = FmixOperator()
test_batch_operator(fmix_op, batch_size)
if __name__ == '__main__':
test_decode()
test_randcrop()
test_randflip()
test_normalize()
test_tochw()
test_autoaugment()
test_randaugment()
test_cutout()
test_hideandseek()
test_randerasing()
test_gridmask()
test_mixup()
test_cutmix()
test_fmix()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ppcls.data.imaug import DecodeImage
from ppcls.data.imaug import RandCropImage
from ppcls.data.imaug import RandFlipImage
from ppcls.data.imaug import NormalizeImage
from ppcls.data.imaug import ToCHWImage
from ppcls.data.imaug import ImageNetPolicy
from ppcls.data.imaug import RandAugment
from ppcls.data.imaug import Cutout
from ppcls.data.imaug import HideAndSeek
from ppcls.data.imaug import RandomErasing
from ppcls.data.imaug import GridMask
from ppcls.data.imaug import MixupOperator
from ppcls.data.imaug import CutmixOperator
from ppcls.data.imaug import FmixOperator
from ppcls.data.imaug import transform
import numpy as np
fname = './test/demo.jpeg'
size = 224
img_mean = [0.485, 0.456, 0.406]
img_std = [0.229, 0.224, 0.225]
img_scale = 1.0 / 255.0
# normal_ops_1
decode_op = DecodeImage()
randcrop_op = RandCropImage(size=(size, size))
# trans_ops
autoaugment_op = ImageNetPolicy()
randaugment_op = RandAugment(3, 1)
cutout_op = Cutout()
# normal_ops_2
randflip_op = RandFlipImage(flip_code=1)
normalize_op = NormalizeImage(
scale=img_scale, mean=img_mean, std=img_std, order='')
tochw_op = ToCHWImage()
# mask_ops
hide_and_seek_op = HideAndSeek()
randomerasing_op = RandomErasing()
gridmask_op = GridMask(d1=96, d2=224, rotate=360, ratio=0.6, mode=1, prob=0.8)
# batch_ops
mixup_op = MixupOperator(alpha=0.2)
cutmix_op = CutmixOperator(alpha=0.2)
fmix_op = FmixOperator()
def fakereader():
""" fake reader """
import random
    data = open(fname, 'rb').read()
def wrapper():
while True:
yield (data, random.randint(0, 1000))
return wrapper
def superreader(batch_size=32):
""" super reader """
normal_ops_1 = [decode_op, randcrop_op]
normal_ops_2 = [randflip_op, normalize_op, tochw_op]
trans_ops = [autoaugment_op, randaugment_op, cutout_op]
trans_ops_p = [0.2, 0.3, 0.5]
mask_ops = [hide_and_seek_op, randomerasing_op, gridmask_op]
mask_ops_p = [0.1, 0.6, 0.3]
batch_ops = [mixup_op, cutmix_op, fmix_op]
batch_ops_p = [0.3, 0.3, 0.4]
reader = fakereader()
def wrapper():
batch = []
for idx, sample in enumerate(reader()):
img, label = sample
ops = normal_ops_1 + [np.random.choice(trans_ops, p=trans_ops_p)] +\
normal_ops_2 + [np.random.choice(mask_ops, p=mask_ops_p)]
img = transform(img, ops)
batch.append((img, label))
if (idx + 1) % batch_size == 0:
batch = transform(
batch, [np.random.choice(
batch_ops, p=batch_ops_p)])
yield batch
batch = []
return wrapper
if __name__ == '__main__':
reader = superreader(32)
for batch in reader():
print(len(batch), len(batch[0]), batch[0][0].shape, batch[0][1:])
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import environment
from . import model_zoo
from . import misc
from . import logger
from .save_load import init_model, save_model
from .config import get_config
from .misc import AverageMeter
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import paddle.fluid as fluid
from ppcls.modeling import similar_architectures
from ppcls.utils import logger
def check_version():
"""
Log error and exit when the installed version of paddlepaddle is
not satisfied.
"""
err = "PaddlePaddle version 1.7 or higher is required, " \
"or a suitable develop version is satisfied as well. \n" \
"Please make sure the version is good with your code." \
try:
fluid.require_version('1.7.0')
except Exception as e:
logger.error(err)
sys.exit(1)
def check_gpu():
"""
Log error and exit when using paddlepaddle cpu version.
"""
err = "You are using paddlepaddle cpu version! Please try to " \
"install paddlepaddle-gpu to run model on GPU."
try:
assert fluid.is_compiled_with_cuda()
except AssertionError:
logger.error(err)
sys.exit(1)
def check_architecture(architecture):
"""
check architecture and recommend similar architectures
"""
assert isinstance(architecture, str), \
("the type of architecture({}) should be str". format(architecture))
similar_names = similar_architectures(architecture)
model_list = ', '.join(similar_names)
err = "{} is not exist! Maybe you want: [{}]" \
"".format(architecture, model_list)
try:
assert architecture in similar_names
except AssertionError:
logger.error(err)
sys.exit(1)
def check_mix(architecture, use_mix=False):
"""
check mix parameter
"""
err = "Cannot use mix processing in GoogLeNet, " \
"please set use_mix = False."
try:
if architecture == "GoogLeNet": assert use_mix == False
except AssertionError:
logger.error(err)
sys.exit(1)
def check_classes_num(classes_num):
"""
check classes_num
"""
err = "classes_num({}) should be a positive integer" \
"and larger than 1".format(classes_num)
try:
assert isinstance(classes_num, int)
assert classes_num > 1
except AssertionError:
logger.error(err)
sys.exit(1)
def check_data_dir(path):
"""
    check data_dir
"""
err = "Data path is not exist, please given a right path" \
"".format(path)
try:
assert os.isdir(path)
except AssertionError:
logger.error(err)
sys.exit(1)
def check_function_params(config, key):
"""
check specify config
"""
k_config = config.get(key)
assert k_config is not None, \
('{} is required in config'.format(key))
assert k_config.get('function'), \
('function is required {} config'.format(key))
params = k_config.get('params')
assert params is not None, \
('params is required in {} config'.format(key))
assert isinstance(params, dict), \
('the params in {} config should be a dict'.format(key))
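# Illustrative self-check (not part of the original file): a section shaped
# like the LEARNING_RATE entries in the YAML configs above satisfies
# check_function_params, while one missing 'params' raises AssertionError:
if __name__ == '__main__':
    sample = {'LEARNING_RATE': {'function': 'Piecewise',
                                'params': {'lr': 0.1,
                                           'decay_epochs': [30, 60, 90]}}}
    check_function_params(sample, 'LEARNING_RATE')
    print('LEARNING_RATE section ok')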
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import yaml
from ppcls.utils import check
from ppcls.utils import logger
__all__ = ['get_config']
CONFIG_SECS = ['TRAIN', 'VALID', 'OPTIMIZER', 'LEARNING_RATE']
class AttrDict(dict):
def __getattr__(self, key):
return self[key]
def __setattr__(self, key, value):
if key in self.__dict__:
self.__dict__[key] = value
else:
self[key] = value
def create_attr_dict(yaml_config):
from ast import literal_eval
for key, value in yaml_config.items():
if type(value) is dict:
yaml_config[key] = value = AttrDict(value)
if isinstance(value, str):
try:
value = literal_eval(value)
except BaseException:
pass
if isinstance(value, AttrDict):
create_attr_dict(yaml_config[key])
else:
yaml_config[key] = value
return
def parse_config(cfg_file):
"""Load a config file into AttrDict"""
with open(cfg_file, 'r') as fopen:
yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.FullLoader))
create_attr_dict(yaml_config)
return yaml_config
def print_dict(d, delimiter=0):
"""
    Recursively visualize a dict, indenting
    according to the nesting of keys.
"""
for k, v in d.items():
if k in CONFIG_SECS:
logger.info("-" * 60)
if isinstance(v, dict):
logger.info("{}{} : ".format(delimiter * " ", k))
print_dict(v, delimiter + 4)
elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):
logger.info("{}{} : ".format(delimiter * " ", k))
for value in v:
print_dict(value, delimiter + 4)
else:
logger.info("{}{} : {}".format(delimiter * " ", k, v))
if k in CONFIG_SECS:
logger.info("-" * 60)
def print_config(config):
"""
visualize configs
Arguments:
config: configs
"""
copyright = "PaddleCLS is powered by PaddlePaddle"
ad = "https://github.com/PaddlePaddle/PaddleCLS"
logger.info("\n" * 2)
logger.info(copyright)
logger.info(ad)
print_dict(config)
logger.info("-" * 60)
def check_config(config):
"""
Check config
"""
check.check_version()
mode = config.get('mode', 'train')
check.check_gpu()
architecture = config.get('architecture')
check.check_architecture(architecture)
use_mix = config.get('use_mix')
check.check_mix(architecture, use_mix)
classes_num = config.get('classes_num')
check.check_classes_num(classes_num)
if mode.lower() == 'train':
check.check_function_params(config, 'LEARNING_RATE')
check.check_function_params(config, 'OPTIMIZER')
def override(dl, ks, v):
"""
Recursively replace dict of list
Args:
dl(dict or list): dict or list to be replaced
ks(list): list of keys
v(str): value to be replaced
"""
def str2num(v):
try:
return eval(v)
except Exception:
return v
assert isinstance(dl, (list, dict)), ("{} should be a list or a dict")
assert len(ks) > 0, ('lenght of keys should larger than 0')
if isinstance(dl, list):
k = str2num(ks[0])
if len(ks) == 1:
assert k < len(dl), ('index({}) out of range({})'.format(k, dl))
dl[k] = str2num(v)
else:
override(dl[k], ks[1:], v)
else:
if len(ks) == 1:
            assert ks[0] in dl, ('{} does not exist in {}'.format(ks[0], dl))
dl[ks[0]] = str2num(v)
else:
override(dl[ks[0]], ks[1:], v)
def override_config(config, options=[]):
"""
Recursively override the config
Args:
config(dict): dict to be replaced
options(list): list of pairs(key0.key1.idx.key2=value)
such as: [
'topk=2',
'VALID.transforms.1.ResizeImage.resize_short=300'
]
Returns:
config(dict): replaced config
"""
for opt in options:
assert isinstance(opt, str), \
("option({}) should be a str".format(opt))
assert "=" in opt, ("option({}) should contain " \
"a = to distinguish between key and value".format(opt))
pair = opt.split('=')
assert len(pair) == 2, ("there can be only a = in the option")
key, value = pair
keys = key.split('.')
override(config, keys, value)
return config
def get_config(fname, overrides=[], show=True):
"""
Read config from file
"""
assert os.path.exists(fname), \
        ('config file({}) does not exist'.format(fname))
config = parse_config(fname)
if show: print_config(config)
if len(overrides) > 0:
override_config(config, overrides)
print_config(config)
check_config(config)
return config
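# Illustrative sketch (not part of the original file): the override syntax
# accepted by get_config, applied to a hypothetical in-memory config. Dotted
# keys walk nested dicts and integer components index into lists:
if __name__ == '__main__':
    cfg = {'topk': 5,
           'VALID': {'transforms': [{'ResizeImage': {'resize_short': 256}}]}}
    override_config(cfg, [
        'topk=2', 'VALID.transforms.0.ResizeImage.resize_short=300'
    ])
    print(cfg)   # topk -> 2, resize_short -> 300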
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import paddle
import paddle.fluid as fluid
import paddle.fluid.framework as pff
trainers_num = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
trainer_id = int(os.environ.get("PADDLE_TRAINER_ID", 0))
def place():
gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
return fluid.CUDAPlace(gpu_id)
def places():
"""
Returns available running places, the numbers are usually
indicated by 'export CUDA_VISIBLE_DEVICES= '
Args:
"""
if trainers_num <= 1:
return pff.cuda_places()
else:
return place()
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
__all__ = ['AverageMeter']
class AverageMeter(object):
"""
Computes and stores the average and current value
"""
def __init__(self, name='', fmt=':f', avg=False):
self.name = name
self.fmt = fmt
self.avg_flag = avg
self.reset()
def reset(self):
""" reset """
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
""" update """
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def __str__(self):
fmtstr = '[{name}: {val' + self.fmt + '}]'
if self.avg_flag:
fmtstr += '[{name}(avg): {avg' + self.fmt + '}]'
return fmtstr.format(**self.__dict__)
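# A short usage sketch:
#
#   meter = AverageMeter('loss', ':.4f', avg=True)
#   meter.update(0.9, n=256)
#   meter.update(0.7, n=256)
#   str(meter)  # -> '[loss: 0.7000][loss(avg): 0.8000]'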
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import shutil
import requests
import tqdm
import tarfile
import zipfile
# NOTE: similar_architectures (used by ModelNameError below) is assumed to
# live in ppcls.utils.check alongside check_architecture.
from ppcls.utils.check import check_architecture, similar_architectures
from ppcls.utils import logger
__all__ = ['get']
DOWNLOAD_RETRY_LIMIT = 3
class UrlError(Exception):
""" UrlError
"""
def __init__(self, url='', code=''):
message = "Downloading from {} failed with code {}!".format(url, code)
super(UrlError, self).__init__(message)
class ModelNameError(Exception):
""" ModelNameError
"""
def __init__(self, message='', architecture=''):
similar_names = similar_architectures(architecture)
model_list = ', '.join(similar_names)
message += '\n{} does not exist.\nMaybe you want: [{}]'.format(
architecture, model_list)
super(ModelNameError, self).__init__(message)
class RetryError(Exception):
""" RetryError
"""
def __init__(self, url='', times=''):
message = "Download from {} failed. Retry({}) limit reached".format(
url, times)
super(RetryError, self).__init__(message)
def _get_url(architecture):
prefix = "https://paddle-imagenet-models-name.bj.bcebos.com/"
fname = architecture + "_pretrained.tar"
return prefix + fname
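# For example, _get_url('ResNet50_vd') returns
# https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar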
def _move_and_merge_tree(src, dst):
"""
Move the src directory to dst; if dst already exists,
merge src into dst
"""
if not os.path.exists(dst):
shutil.move(src, dst)
elif os.path.isfile(src):
shutil.move(src, dst)
else:
for fp in os.listdir(src):
src_fp = os.path.join(src, fp)
dst_fp = os.path.join(dst, fp)
if os.path.isdir(src_fp):
if os.path.isdir(dst_fp):
_move_and_merge_tree(src_fp, dst_fp)
else:
shutil.move(src_fp, dst_fp)
elif os.path.isfile(src_fp) and \
not os.path.isfile(dst_fp):
shutil.move(src_fp, dst_fp)
def _download(url, path):
"""
Download from url and save to path.
Args:
url (str): download url
path (str): directory to save the downloaded file to
"""
if not os.path.exists(path):
os.makedirs(path)
fname = os.path.split(url)[-1]
fullname = os.path.join(path, fname)
retry_cnt = 0
while not os.path.exists(fullname):
if retry_cnt < DOWNLOAD_RETRY_LIMIT:
retry_cnt += 1
else:
raise RetryError(url, DOWNLOAD_RETRY_LIMIT)
logger.info("Downloading {} from {}".format(fname, url))
req = requests.get(url, stream=True)
if req.status_code != 200:
raise UrlError(url, req.status_code)
# To guard against interrupted downloads, write to
# tmp_fullname first and move tmp_fullname to fullname
# once the download has finished.
tmp_fullname = fullname + "_tmp"
total_size = req.headers.get('content-length')
with open(tmp_fullname, 'wb') as f:
if total_size:
for chunk in tqdm.tqdm(
req.iter_content(chunk_size=1024),
total=(int(total_size) + 1023) // 1024,
unit='KB'):
f.write(chunk)
else:
for chunk in req.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
shutil.move(tmp_fullname, fullname)
return fullname
def _decompress(fname):
"""
Decompress zip and tar files
"""
logger.info("Decompressing {}...".format(fname))
# To guard against interrupted decompression, extract into the
# fpath_tmp directory first; if extraction succeeds, move the
# extracted files to fpath, delete fpath_tmp, and remove the
# downloaded archive.
fpath = os.path.split(fname)[0]
fpath_tmp = os.path.join(fpath, 'tmp')
if os.path.isdir(fpath_tmp):
shutil.rmtree(fpath_tmp)
os.makedirs(fpath_tmp)
if fname.find('tar') >= 0:
with tarfile.open(fname) as tf:
tf.extractall(path=fpath_tmp)
elif fname.find('zip') >= 0:
with zipfile.ZipFile(fname) as zf:
zf.extractall(path=fpath_tmp)
else:
raise TypeError("Unsupported compressed file type {}".format(fname))
for f in os.listdir(fpath_tmp):
src_dir = os.path.join(fpath_tmp, f)
dst_dir = os.path.join(fpath, f)
_move_and_merge_tree(src_dir, dst_dir)
shutil.rmtree(fpath_tmp)
os.remove(fname)
def get(architecture, path, decompress=True):
check_architecture(architecture)
url = _get_url(architecture)
fname = _download(url, path)
if decompress: _decompress(fname)
logger.info("download {} finished ".format(fname))
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import re
import tempfile
import shutil
import paddle
import paddle.fluid as fluid
from ppcls.utils import logger
__all__ = ['init_model', 'save_model']
def _mkdir_if_not_exist(path):
"""
mkdir if the path does not exist
"""
if not os.path.exists(path):
os.makedirs(path)
def _load_state(path):
print("path: ", path)
if os.path.exists(path + '.pdopt'):
# XXX another hack to ignore the optimizer state
tmp = tempfile.mkdtemp()
dst = os.path.join(tmp, os.path.basename(os.path.normpath(path)))
shutil.copy(path + '.pdparams', dst + '.pdparams')
state = fluid.io.load_program_state(dst)
shutil.rmtree(tmp)
else:
print("path: ", path)
state = fluid.io.load_program_state(path)
return state
def load_params(exe, prog, path, ignore_params=[]):
"""
Load model from the given path.
Args:
exe (fluid.Executor): The fluid.Executor object.
prog (fluid.Program): load weight to which Program object.
path (string): URL string or local model path.
ignore_params (list): variables to ignore when finetuning.
It can be specified by finetune_exclude_pretrained_params;
see docs/advanced_tutorials/TRANSFER_LEARNING.md for usage.
"""
if not (os.path.isdir(path) or os.path.exists(path + '.pdparams')):
raise ValueError("Model pretrain path {} does not "
"exist.".format(path))
logger.info('Loading parameters from {}...'.format(path))
ignore_set = set()
state = _load_state(path)
# ignore the parameter which mismatch the shape
# between the model and pretrain weight.
all_var_shape = {}
for block in prog.blocks:
for param in block.all_parameters():
all_var_shape[param.name] = param.shape
ignore_set.update([
name for name, shape in all_var_shape.items()
if name in state and shape != state[name].shape
])
if ignore_params:
all_var_names = [var.name for var in prog.list_vars()]
ignore_list = filter(
lambda var: any([re.match(name, var) for name in ignore_params]),
all_var_names)
ignore_set.update(list(ignore_list))
if len(ignore_set) > 0:
for k in ignore_set:
if k in state:
logger.warning('variable {} not used'.format(k))
del state[k]
fluid.io.set_program_state(prog, state)
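# A finetuning sketch (the parameter names below are illustrative): variables
# matching an ignore pattern, or whose pretrained shape differs from the
# model's (e.g. the fc weights when classes_num changes), are dropped from
# the loaded state:
#
#   load_params(exe, prog, './pretrained/ResNet50_vd_pretrained',
#               ignore_params=['fc_0.w_0', 'fc_0.b_0'])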
def init_model(config, program, exe):
"""
load model from checkpoint or pretrained_model
"""
checkpoints = config.get('checkpoints')
if checkpoints and os.path.exists(checkpoints):
fluid.load(program, checkpoints, exe)
logger.info("Finish initing model from {}".format(checkpoints))
return
pretrained_model = config.get('pretrained_model')
if pretrained_model and os.path.exists(pretrained_model):
load_params(exe, program, pretrained_model)
logger.info("Finish initing model from {}".format(pretrained_model))
def save_model(program, model_path, epoch_id, prefix='ppcls'):
"""
save model to the target path
"""
model_path = os.path.join(model_path, str(epoch_id))
_mkdir_if_not_exist(model_path)
model_prefix = os.path.join(model_path, prefix)
fluid.save(program, model_prefix)
logger.info("Already save model in {}".format(model_path))
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import argparse
sys.path.append("../")
from ppcls import model_zoo
def parse_args():
def str2bool(v):
return v.lower() in ("true", "t", "1")
parser = argparse.ArgumentParser()
parser.add_argument('-a', '--architecture', type=str, default='ResNet50')
parser.add_argument('-p', '--path', type=str, default='./pretrained/')
parser.add_argument('-d', '--decompress', type=str2bool, default=True)
args = parser.parse_args()
return args
def main():
args = parse_args()
model_zoo.get(args.architecture, args.path, args.decompress)
if __name__ == '__main__':
main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import argparse
import paddle
import paddle.fluid as fluid
import program
from ppcls.data import Reader
import ppcls.utils.environment as env
from ppcls.utils.config import get_config
from ppcls.utils.save_load import init_model, save_model
from ppcls.utils import logger
from paddle.fluid.incubate.fleet.collective import fleet
from paddle.fluid.incubate.fleet.base import role_maker
def parse_args():
parser = argparse.ArgumentParser("PaddleClas eval script")
parser.add_argument(
'-c',
'--config',
type=str,
default='configs/eval.yaml',
help='config file path')
parser.add_argument(
'-o',
'--override',
action='append',
default=[],
help='config options to be overridden')
args = parser.parse_args()
return args
def main(args):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
config = get_config(args.config, overrides=args.override, show=True)
place = env.place()
startup_prog = fluid.Program()
valid_prog = fluid.Program()
valid_dataloader, valid_fetchs = program.build(
config, valid_prog, startup_prog, is_train=False)
valid_prog = valid_prog.clone(for_test=True)
exe = fluid.Executor(place)
exe.run(startup_prog)
init_model(config, valid_prog, exe)
valid_reader = Reader(config, 'valid')()
valid_dataloader.set_sample_list_generator(valid_reader, place)
compiled_valid_prog = program.compile(config, valid_prog)
program.run(valid_dataloader, exe, compiled_valid_prog, valid_fetchs, 0,
'valid')
if __name__ == '__main__':
args = parse_args()
main(args)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
from ppcls.modeling import architectures
import paddle.fluid as fluid
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", type=str)
parser.add_argument("-p", "--pretrained_model", type=str)
parser.add_argument("-o", "--output_path", type=str)
return parser.parse_args()
def create_input():
image = fluid.data(
name='image', shape=[None, 3, 224, 224], dtype='float32')
return image
def create_model(args, model, input, class_dim=1000):
if args.model == "GoogLeNet":
out, _, _ = model.net(input=input, class_dim=class_dim)
else:
out = model.net(input=input, class_dim=class_dim)
out = fluid.layers.softmax(out)
return out
def main():
args = parse_args()
model = architectures.__dict__[args.model]()
place = fluid.CPUPlace()
exe = fluid.Executor(place)
startup_prog = fluid.Program()
infer_prog = fluid.Program()
with fluid.program_guard(infer_prog, startup_prog):
with fluid.unique_name.guard():
image = create_input()
out = create_model(args, model, image)
infer_prog = infer_prog.clone(for_test=True)
fluid.load(
program=infer_prog, model_path=args.pretrained_model, executor=exe)
fluid.io.save_inference_model(
dirname=args.output_path,
feeded_var_names=[image.name],
main_program=infer_prog,
target_vars=out,
executor=exe,
model_filename='model',
params_filename='params')
if __name__ == "__main__":
main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import utils
import argparse
import numpy as np
from paddle.fluid.core import PaddleTensor
from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import create_paddle_predictor
def parse_args():
def str2bool(v):
return v.lower() in ("true", "t", "1")
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--image_file", type=str)
parser.add_argument("-m", "--model_file", type=str)
parser.add_argument("-p", "--params_file", type=str)
parser.add_argument("-b", "--max_batch_size", type=int, default=1)
parser.add_argument("--use_gpu", type=str2bool, default=True)
parser.add_argument("--ir_optim", type=str2bool, default=True)
parser.add_argument("--use_tensorrt", type=str2bool, default=False)
return parser.parse_args()
def create_predictor(args):
config = AnalysisConfig(args.model_file, args.params_file)
if args.use_gpu:
config.enable_use_gpu(1000, 0)
else:
config.disable_gpu()
config.switch_ir_optim(args.ir_optim) # default true
if args.use_tensorrt:
config.enable_tensorrt_engine(
precision_mode=AnalysisConfig.Precision.Float32,
max_batch_size=args.max_batch_size)
predictor = create_paddle_predictor(config)
return predictor
def create_operators():
size = 224
img_mean = [0.485, 0.456, 0.406]
img_std = [0.229, 0.224, 0.225]
img_scale = 1.0 / 255.0
decode_op = utils.DecodeImage()
resize_op = utils.ResizeImage(resize_short=256)
crop_op = utils.CropImage(size=(size, size))
normalize_op = utils.NormalizeImage(
scale=img_scale, mean=img_mean, std=img_std)
totensor_op = utils.ToTensor()
return [decode_op, resize_op, crop_op, normalize_op, totensor_op]
def preprocess(fname, ops):
data = open(fname, 'rb').read()
for op in ops:
data = op(data)
return data
def postprocess(outputs, topk=5):
output = outputs[0]
prob = output.as_ndarray().flatten()
index = prob.argsort(axis=0)[-topk:][::-1].astype('int32')
return zip(index, prob[index])
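# The top-k selection in plain numpy (illustrative values):
#
#   prob = np.array([0.05, 0.7, 0.15, 0.1])
#   index = prob.argsort(axis=0)[-2:][::-1]   # -> array([1, 2])
#   list(zip(index, prob[index]))             # -> [(1, 0.7), (2, 0.15)]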
def main():
args = parse_args()
operators = create_operators()
predictor = create_predictor(args)
data = preprocess(args.image_file, operators)
inputs = [PaddleTensor(data.copy())]
outputs = predictor.run(inputs)
probs = postprocess(outputs)
for idx, prob in probs:
print("class id: {:d}, probability: {:.4f}".format(idx, prob))
if __name__ == "__main__":
main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import utils
import argparse
import numpy as np
import paddle.fluid as fluid
from ppcls.modeling import architectures
def parse_args():
def str2bool(v):
return v.lower() in ("true", "t", "1")
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--image_file", type=str)
parser.add_argument("-m", "--model", type=str)
parser.add_argument("-p", "--pretrained_model", type=str)
parser.add_argument("--use_gpu", type=str2bool, default=True)
return parser.parse_args()
def create_predictor(args):
def create_input():
image = fluid.data(
name='image', shape=[None, 3, 224, 224], dtype='float32')
return image
def create_model(args, model, input, class_dim=1000):
if args.model == "GoogLeNet":
out, _, _ = model.net(input=input, class_dim=class_dim)
else:
out = model.net(input=input, class_dim=class_dim)
out = fluid.layers.softmax(out)
return out
model = architectures.__dict__[args.model]()
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
startup_prog = fluid.Program()
infer_prog = fluid.Program()
with fluid.program_guard(infer_prog, startup_prog):
with fluid.unique_name.guard():
image = create_input()
out = create_model(args, model, image)
infer_prog = infer_prog.clone(for_test=True)
fluid.load(
program=infer_prog, model_path=args.pretrained_model, executor=exe)
return exe, infer_prog, [image.name], [out.name]
def create_operators():
size = 224
img_mean = [0.485, 0.456, 0.406]
img_std = [0.229, 0.224, 0.225]
img_scale = 1.0 / 255.0
decode_op = utils.DecodeImage()
resize_op = utils.ResizeImage(resize_short=256)
crop_op = utils.CropImage(size=(size, size))
normalize_op = utils.NormalizeImage(
scale=img_scale, mean=img_mean, std=img_std)
totensor_op = utils.ToTensor()
return [decode_op, resize_op, crop_op, normalize_op, totensor_op]
def preprocess(fname, ops):
data = open(fname, 'rb').read()
for op in ops:
data = op(data)
return data
def postprocess(outputs, topk=5):
output = outputs[0]
prob = np.array(output).flatten()
index = prob.argsort(axis=0)[-topk:][::-1].astype('int32')
return zip(index, prob[index])
def main():
args = parse_args()
operators = create_operators()
exe, program, feed_names, fetch_names = create_predictor(args)
data = preprocess(args.image_file, operators)
outputs = exe.run(program,
feed={feed_names[0]: data},
fetch_list=fetch_names,
return_numpy=False)
probs = postprocess(outputs)
for idx, prob in probs:
print("class id: {:d}, probability: {:.4f}".format(idx, prob))
if __name__ == "__main__":
main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import utils
import argparse
import numpy as np
import paddle.fluid as fluid
def parse_args():
def str2bool(v):
return v.lower() in ("true", "t", "1")
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--image_file", type=str)
parser.add_argument("-d", "--model_dir", type=str)
parser.add_argument("-m", "--model_file", type=str)
parser.add_argument("-p", "--params_file", type=str)
parser.add_argument("--use_gpu", type=str2bool, default=True)
return parser.parse_args()
def create_predictor(args):
if args.use_gpu:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
[program, feed_names, fetch_lists] = fluid.io.load_inference_model(
args.model_dir,
exe,
model_filename=args.model_file,
params_filename=args.params_file)
compiled_program = fluid.compiler.CompiledProgram(program)
return exe, compiled_program, feed_names, fetch_lists
def create_operators():
size = 224
img_mean = [0.485, 0.456, 0.406]
img_std = [0.229, 0.224, 0.225]
img_scale = 1.0 / 255.0
decode_op = utils.DecodeImage()
resize_op = utils.ResizeImage(resize_short=256)
crop_op = utils.CropImage(size=(size, size))
normalize_op = utils.NormalizeImage(
scale=img_scale, mean=img_mean, std=img_std)
totensor_op = utils.ToTensor()
return [decode_op, resize_op, crop_op, normalize_op, totensor_op]
def preprocess(fname, ops):
data = open(fname, 'rb').read()
for op in ops:
data = op(data)
return data
def postprocess(outputs, topk=5):
output = outputs[0]
prob = np.array(output).flatten()
index = prob.argsort(axis=0)[-topk:][::-1].astype('int32')
return zip(index, prob[index])
def main():
args = parse_args()
operators = create_operators()
exe, program, feed_names, fetch_lists = create_predictor(args)
data = preprocess(args.image_file, operators)
outputs = exe.run(program,
feed={feed_names[0]: data},
fetch_list=fetch_lists,
return_numpy=False)
probs = postprocess(outputs)
for idx, prob in probs:
print("class id: {:d}, probability: {:.4f}".format(idx, prob))
if __name__ == "__main__":
main()
#!/usr/bin/env bash
python ./cpp_infer.py \
-i=./test.jpeg \
-m=./resnet50-vd/model \
-p=./resnet50-vd/params \
--use_gpu=1
python ./cpp_infer.py \
-i=./test.jpeg \
-m=./resnet50-vd/model \
-p=./resnet50-vd/params \
--use_gpu=0
python py_infer.py \
-i=./test.jpeg \
-d ./resnet50-vd/ \
-m=model -p=params \
--use_gpu=0
python py_infer.py \
-i=./test.jpeg \
-d ./resnet50-vd/ \
-m=model -p=params \
--use_gpu=1
python infer.py \
-i=./test.jpeg \
-m ResNet50_vd \
-p ./resnet50-vd-persistable/ \
--use_gpu=0
python infer.py \
-i=./test.jpeg \
-m ResNet50_vd \
-p ./resnet50-vd-persistable/ \
--use_gpu=1
python export_model.py \
-m ResNet50_vd \
-p ./resnet50-vd-persistable/ \
-o ./test/
python py_infer.py \
-i=./test.jpeg \
-d ./test/ \
-m=model \
-p=params \
--use_gpu=0
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import cv2
import numpy as np
class DecodeImage(object):
def __init__(self, to_rgb=True):
self.to_rgb = to_rgb
def __call__(self, img):
data = np.frombuffer(img, dtype='uint8')
img = cv2.imdecode(data, 1)
if self.to_rgb:
assert img.shape[2] == 3, 'invalid shape of image[%s]' % (
img.shape)
img = img[:, :, ::-1]
return img
class ResizeImage(object):
def __init__(self, resize_short=None):
self.resize_short = resize_short
def __call__(self, img):
img_h, img_w = img.shape[:2]
percent = float(self.resize_short) / min(img_w, img_h)
w = int(round(img_w * percent))
h = int(round(img_h * percent))
return cv2.resize(img, (w, h))
class CropImage(object):
def __init__(self, size):
if type(size) is int:
self.size = (size, size)
else:
self.size = size
def __call__(self, img):
w, h = self.size
img_h, img_w = img.shape[:2]
w_start = (img_w - w) // 2
h_start = (img_h - h) // 2
w_end = w_start + w
h_end = h_start + h
return img[h_start:h_end, w_start:w_end, :]
class NormalizeImage(object):
def __init__(self, scale=None, mean=None, std=None):
self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
mean = mean if mean is not None else [0.485, 0.456, 0.406]
std = std if std is not None else [0.229, 0.224, 0.225]
shape = (1, 1, 3)
self.mean = np.array(mean).reshape(shape).astype('float32')
self.std = np.array(std).reshape(shape).astype('float32')
def __call__(self, img):
return (img.astype('float32') * self.scale - self.mean) / self.std
class ToTensor(object):
def __init__(self):
pass
def __call__(self, img):
img = img.transpose((2, 0, 1))
img = np.expand_dims(img, axis=0)
return img
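# Chaining these operators reproduces the standard ImageNet eval transform
# (the file name is illustrative):
#
#   ops = [DecodeImage(), ResizeImage(resize_short=256), CropImage(size=224),
#          NormalizeImage(scale=1.0 / 255.0,
#                         mean=[0.485, 0.456, 0.406],
#                         std=[0.229, 0.224, 0.225]),
#          ToTensor()]
#   data = open('test.jpeg', 'rb').read()
#   for op in ops:
#       data = op(data)
#   # data.shape == (1, 3, 224, 224)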
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import time
from collections import OrderedDict
import paddle
import paddle.fluid as fluid
from ppcls.optimizer import LearningRateBuilder
from ppcls.optimizer import OptimizerBuilder
from ppcls.modeling import architectures
from ppcls.modeling.loss import CELoss
from ppcls.modeling.loss import MixCELoss
from ppcls.modeling.loss import GoogLeNetLoss
from ppcls.utils.misc import AverageMeter
from ppcls.utils import logger
from paddle.fluid.incubate.fleet.collective import fleet
from paddle.fluid.incubate.fleet.collective import DistributedStrategy
def create_feeds(image_shape, mix=None):
"""
Create feeds as model input
Args:
image_shape(list[int]): model input shape, such as [3, 224, 224]
mix(bool): whether to use mix (including mixup, cutmix, fmix)
Returns:
feeds(dict): dict of model input variables
"""
feeds = OrderedDict()
feeds['image'] = fluid.data(
name="feed_image", shape=[None] + image_shape, dtype="float32")
if mix:
feeds['feed_y_a'] = fluid.data(
name="feed_y_a", shape=[None, 1], dtype="int64")
feeds['feed_y_b'] = fluid.data(
name="feed_y_b", shape=[None, 1], dtype="int64")
feeds['feed_lam'] = fluid.data(
name="feed_lam", shape=[None, 1], dtype="float32")
else:
feeds['label'] = fluid.data(
name="feed_label", shape=[None, 1], dtype="int64")
return feeds
def create_dataloader(feeds):
"""
Create a dataloader with model input variables
Args:
feeds(dict): dict of model input variables
Returns:
dataloader(fluid dataloader):
"""
trainer_num = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
capacity = 64 if trainer_num <= 1 else 8
dataloader = fluid.io.DataLoader.from_generator(
feed_list=feeds,
capacity=capacity,
use_double_buffer=True,
iterable=True)
return dataloader
def create_model(name, image, classes_num):
"""
Create a model
Args:
name(str): model name, such as ResNet50
image(variable): model input variable
classes_num(int): num of classes
Returns:
out(variable): model output variable
"""
model = architectures.__dict__[name]()
out = model.net(input=image, class_dim=classes_num)
return out
def create_loss(out,
feeds,
architecture,
classes_num=1000,
epsilon=None,
mix=False):
"""
Create a loss for optimization, such as:
1. CrossEntropy loss
2. CrossEntropy loss with label smoothing
3. CrossEntropy loss with mix (mixup, cutmix, fmix)
4. CrossEntropy loss with label smoothing and mix (mixup, cutmix, fmix)
5. GoogLeNet loss
Args:
out(variable): model output variable
feeds(dict): dict of model input variables
architecture(str): model name, such as ResNet50
classes_num(int): num of classes
epsilon(float): parameter for label smoothing, 0.0 <= epsilon <= 1.0
mix(bool): whether to use mix (including mixup, cutmix, fmix)
Returns:
loss(variable): loss variable
"""
if architecture == "GoogLeNet":
assert len(out) == 3, "GoogLeNet should have 3 outputs"
loss = GoogLeNetLoss(class_dim=classes_num, epsilon=epsilon)
target = feeds['label']
return loss(out[0], out[1], out[2], target)
if mix:
loss = MixCELoss(class_dim=classes_num, epsilon=epsilon)
feed_y_a = feeds['feed_y_a']
feed_y_b = feeds['feed_y_b']
feed_lam = feeds['feed_lam']
return loss(out, feed_y_a, feed_y_b, feed_lam)
else:
loss = CELoss(class_dim=classes_num, epsilon=epsilon)
target = feeds['label']
return loss(out, target)
def create_metric(out, feeds, topk=5, classes_num=1000):
"""
Create measures of model accuracy, such as top1 and top5
Args:
out(variable): model output variable
feeds(dict): dict of model input variables (including the label)
topk(int): usually top5
classes_num(int): num of classes
Returns:
fetchs(dict): dict of measures
"""
fetchs = OrderedDict()
label = feeds['label']
softmax_out = fluid.layers.softmax(out, use_cudnn=False)
top1 = fluid.layers.accuracy(softmax_out, label=label, k=1)
fetchs['top1'] = (top1, AverageMeter('top1', ':2.4f', True))
k = min(topk, classes_num)
topk = fluid.layers.accuracy(softmax_out, label=label, k=k)
topk_name = 'top{}'.format(k)
fetchs[topk_name] = (topk, AverageMeter(topk_name, ':2.4f', True))
return fetchs
def create_fetchs(out,
feeds,
architecture,
topk=5,
classes_num=1000,
epsilon=None,
mix=False):
"""
Create fetchs as model outputs (including the loss and measures);
calls create_loss and, unless mix is used, create_metric.
Args:
out(variable): model output variable
feeds(dict): dict of model input variables (including the label)
architecture(str): model name, such as ResNet50
topk(int): usually top5
classes_num(int): num of classes
epsilon(float): parameter for label smoothing, 0.0 <= epsilon <= 1.0
mix(bool): whether to use mix (including mixup, cutmix, fmix)
Returns:
fetchs(dict): dict of model outputs (including the loss and measures)
"""
fetchs = OrderedDict()
loss = create_loss(out, feeds, architecture, classes_num, epsilon, mix)
fetchs['loss'] = (loss, AverageMeter('loss', ':2.4f', True))
if not mix:
metric = create_metric(out, feeds, topk, classes_num)
fetchs.update(metric)
return fetchs
def create_optimizer(config):
"""
Create an optimizer using config, usually including
learning rate and regularization.
Args:
config(dict): such as
{
'LEARNING_RATE':
{'function': 'Cosine',
'params': {'lr': 0.1}
},
'OPTIMIZER':
{'function': 'Momentum',
'params':{'momentum': 0.9},
'regularizer':
{'function': 'L2', 'factor': 0.0001}
}
}
Returns:
an optimizer instance
"""
# create learning_rate instance
lr_config = config['LEARNING_RATE']
lr_config['params'].update({
'epochs': config['epochs'],
'step_each_epoch':
config['total_images'] // config['TRAIN']['batch_size'],
})
lr = LearningRateBuilder(**lr_config)()
# create optimizer instance
opt_config = config['OPTIMIZER']
opt = OptimizerBuilder(**opt_config)
return opt(lr)
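# For example, with total_images=1281167 and TRAIN.batch_size=256,
# step_each_epoch works out to 1281167 // 256 = 5004, so a 'Cosine'
# schedule with lr=0.1 decays the learning rate over epochs * 5004 steps.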
def dist_optimizer(config, optimizer):
"""
Create a distributed optimizer based on a normal optimizer
Args:
config(dict):
optimizer(): a normal optimizer
Returns:
optimizer: a distributed optimizer
"""
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = 3
exec_strategy.num_iteration_per_drop_scope = 10
dist_strategy = DistributedStrategy()
dist_strategy.nccl_comm_num = 1
dist_strategy.fuse_all_reduce_ops = True
dist_strategy.exec_strategy = exec_strategy
optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
return optimizer
def build(config, main_prog, startup_prog, is_train=True):
"""
Build a program using a model and an optimizer
1. create feeds
2. create a dataloader
3. create a model
4. create fetchs
5. create an optimizer
Args:
config(dict): config
main_prog(): main program
startup_prog(): startup program
is_train(bool): train or valid
Returns:
dataloader(): a bridge between the model and the data
fetchs(dict): dict of model outputs (including the loss and measures)
"""
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
use_mix = config.get('use_mix') and is_train
feeds = create_feeds(config.image_shape, mix=use_mix)
dataloader = create_dataloader(feeds.values())
out = create_model(config.architecture, feeds['image'],
config.classes_num)
fetchs = create_fetchs(
out,
feeds,
config.architecture,
config.topk,
config.classes_num,
epsilon=config.get('ls_epsilon'),
mix=use_mix)
if is_train:
optimizer = create_optimizer(config)
lr = optimizer._global_learning_rate()
fetchs['lr'] = (lr, AverageMeter('lr', ':f', False))
optimizer = dist_optimizer(config, optimizer)
optimizer.minimize(fetchs['loss'][0])
return dataloader, fetchs
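# A usage sketch (mirrors tools/train.py below):
#
#   startup_prog = fluid.Program()
#   train_prog = fluid.Program()
#   train_dataloader, train_fetchs = build(config, train_prog, startup_prog,
#                                          is_train=True)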
def compile(config, program, loss_name=None):
"""
Compile the program
Args:
config(dict): config
program(): the program to be wrapped and compiled
loss_name(str): loss name
Returns:
compiled_program(): a compiled program
"""
build_strategy = fluid.compiler.BuildStrategy()
#build_strategy.fuse_bn_act_ops = config.get("fuse_bn_act_ops")
#build_strategy.fuse_elewise_add_act_ops = config.get("fuse_elewise_add_act_ops")
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = 1
exec_strategy.num_iteration_per_drop_scope = 10
compiled_program = fluid.CompiledProgram(program).with_data_parallel(
loss_name=loss_name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
return compiled_program
def run(dataloader, exe, program, fetchs, epoch=0, mode='train'):
"""
Feed data to the model and fetch the measures and loss
Args:
dataloader(fluid dataloader):
exe(fluid.Executor): the executor
program(): the (compiled) program to run
fetchs(dict): dict of measures and the loss
epoch(int): epoch of training or validation
mode(str): 'train' or 'valid', used only for logging
Returns:
"""
fetch_list = [f[0] for f in fetchs.values()]
metric_list = [f[1] for f in fetchs.values()]
batch_time = AverageMeter('cost', ':6.3f')
tic = time.time()
for idx, batch in enumerate(dataloader()):
metrics = exe.run(program=program, feed=batch, fetch_list=fetch_list)
batch_time.update(time.time() - tic)
tic = time.time()
for i, m in enumerate(metrics):
metric_list[i].update(m[0], len(batch[0]))
fetchs_str = ''.join([str(m) for m in metric_list] + [str(batch_time)])
logger.info("[epoch:%3d][%s][step:%4d]%s" %
(epoch, mode, idx, fetchs_str))
#!/usr/bin/env bash
export PYTHONPATH=$(dirname "$PWD"):$PWD:$PYTHONPATH
#python download.py -a ResNet181 -p ./pretrained/ -d 1
#python download.py -a ResNet18 -p ./pretrained/ -d 1
#python download.py -a ResNet34 -p ./pretrained/ -d 0
#python -m paddle.distributed.launch --selected_gpus="0,1,2,3" --log_dir=mylog tools/train.py
#python -m paddle.distributed.launch --selected_gpus="0,1,2,3" --log_dir=mylog ./eval.py
python -m paddle.distributed.launch \
--selected_gpus="0,1,2,3" \
--log_dir=mylog \
tools/train.py \
-c configs/ResNet/ResNet50_vd.yaml \
-o use_mix=0 \
-o TRAIN.batch_size=128 \
-o TRAIN.transforms.3.NormalizeImage.mean.2=0.4
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import sys
import paddle
import paddle.fluid as fluid
import program
from ppcls.data import Reader
import ppcls.utils.environment as env
from ppcls.utils.config import get_config
from ppcls.utils.save_load import init_model, save_model
from ppcls.utils import logger
from paddle.fluid.incubate.fleet.collective import fleet
from paddle.fluid.incubate.fleet.base import role_maker
def parse_args():
parser = argparse.ArgumentParser("PaddleClas train script")
parser.add_argument(
'-c',
'--config',
type=str,
default='configs/ResNet/ResNet18_vd.yaml',
help='config file path')
parser.add_argument(
'-o',
'--override',
action='append',
default=[],
help='config options to be overridden')
args = parser.parse_args()
return args
def main(args):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
config = get_config(args.config, overrides=args.override, show=True)
place = env.place()
startup_prog = fluid.Program()
train_prog = fluid.Program()
train_dataloader, train_fetchs = program.build(
config, train_prog, startup_prog, is_train=True)
if config.validate:
valid_prog = fluid.Program()
valid_dataloader, valid_fetchs = program.build(
config, valid_prog, startup_prog, is_train=False)
valid_prog = valid_prog.clone(for_test=True)
exe = fluid.Executor(place)
exe.run(startup_prog)
init_model(config, train_prog, exe)
train_reader = Reader(config, 'train')()
train_dataloader.set_sample_list_generator(train_reader, place)
if config.validate:
valid_reader = Reader(config, 'valid')()
valid_dataloader.set_sample_list_generator(valid_reader, place)
compiled_valid_prog = program.compile(config, valid_prog)
compiled_train_prog = fleet.main_program
for epoch_id in range(config.epochs):
program.run(train_dataloader, exe, compiled_train_prog, train_fetchs,
epoch_id, 'train')
if config.validate and epoch_id % config.valid_interval == 0:
program.run(valid_dataloader, exe, compiled_valid_prog,
valid_fetchs, epoch_id, 'valid')
if epoch_id % config.save_interval == 0:
model_path = os.path.join(config.model_save_dir,
config.architecture)
save_model(train_prog, model_path, epoch_id)
if __name__ == '__main__':
args = parse_args()
main(args)