Add the DeiT models (#573)

* Add the DeiT models * fix a training bug * update the model init config * update the License

Add the DeiT models (#573)
* Add the DeiT models * fix a training bug * update the model init config * update the License
1b89b8a5 · jm_12138 · GitHub · 5c6979ce · 1b89b8a5 · 1b89b8a5
12 changed file
--- a/configs/DeiT/DeiT_base_distilled_patch16_224.yaml
+++ b/configs/DeiT/DeiT_base_distilled_patch16_224.yaml
+mode: 'train'
+ARCHITECTURE:
+    name: 'DeiT_base_distilled_patch16_224'
+pretrained_model: ""
+model_save_dir: "./output/"
+classes_num: 1000
+total_images: 1281167
+save_interval: 1
+validate: True
+valid_interval: 1
+epochs: 120
+topk: 5
+image_shape: [3, 224, 224]
+use_mix: False
+ls_epsilon: -1
+LEARNING_RATE:
+    function: 'Cosine'          
+    params:                   
+        lr: 0.01               
+OPTIMIZER:
+    function: 'Momentum'
+    params:
+        momentum: 0.9
+    regularizer:
+        function: 'L2'
+        factor: 0.000100
+TRAIN:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/train_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1./255.
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
+VALID:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/val_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - ResizeImage:
+            size: 248
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
--- a/configs/DeiT/DeiT_base_distilled_patch16_384.yaml
+++ b/configs/DeiT/DeiT_base_distilled_patch16_384.yaml
+mode: 'train'
+ARCHITECTURE:
+    name: 'DeiT_base_distilled_patch16_384'
+pretrained_model: ""
+model_save_dir: "./output/"
+classes_num: 1000
+total_images: 1281167
+save_interval: 1
+validate: True
+valid_interval: 1
+epochs: 120
+topk: 5
+image_shape: [3, 384, 384]
+use_mix: False
+ls_epsilon: -1
+LEARNING_RATE:
+    function: 'Cosine'          
+    params:                   
+        lr: 0.01               
+OPTIMIZER:
+    function: 'Momentum'
+    params:
+        momentum: 0.9
+    regularizer:
+        function: 'L2'
+        factor: 0.000100
+TRAIN:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/train_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - RandCropImage:
+            size: 384
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1./255.
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
+VALID:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/val_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - ResizeImage:
+            size: 426
+        - CropImage:
+            size: 384
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
--- a/configs/DeiT/DeiT_base_patch16_224.yaml
+++ b/configs/DeiT/DeiT_base_patch16_224.yaml
+mode: 'train'
+ARCHITECTURE:
+    name: 'DeiT_base_patch16_224'
+pretrained_model: ""
+model_save_dir: "./output/"
+classes_num: 1000
+total_images: 1281167
+save_interval: 1
+validate: True
+valid_interval: 1
+epochs: 120
+topk: 5
+image_shape: [3, 224, 224]
+use_mix: False
+ls_epsilon: -1
+LEARNING_RATE:
+    function: 'Cosine'          
+    params:                   
+        lr: 0.01               
+OPTIMIZER:
+    function: 'Momentum'
+    params:
+        momentum: 0.9
+    regularizer:
+        function: 'L2'
+        factor: 0.000100
+TRAIN:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/train_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1./255.
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
+VALID:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/val_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - ResizeImage:
+            size: 248
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
--- a/configs/DeiT/DeiT_base_patch16_384.yaml
+++ b/configs/DeiT/DeiT_base_patch16_384.yaml
+mode: 'train'
+ARCHITECTURE:
+    name: 'DeiT_base_patch16_384'
+pretrained_model: ""
+model_save_dir: "./output/"
+classes_num: 1000
+total_images: 1281167
+save_interval: 1
+validate: True
+valid_interval: 1
+epochs: 120
+topk: 5
+image_shape: [3, 384, 384]
+use_mix: False
+ls_epsilon: -1
+LEARNING_RATE:
+    function: 'Cosine'          
+    params:                   
+        lr: 0.01               
+OPTIMIZER:
+    function: 'Momentum'
+    params:
+        momentum: 0.9
+    regularizer:
+        function: 'L2'
+        factor: 0.000100
+TRAIN:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/train_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - RandCropImage:
+            size: 384
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1./255.
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
+VALID:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/val_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - ResizeImage:
+            size: 426
+        - CropImage:
+            size: 384
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
--- a/configs/DeiT/DeiT_small_distilled_patch16_224.yaml
+++ b/configs/DeiT/DeiT_small_distilled_patch16_224.yaml
+mode: 'train'
+ARCHITECTURE:
+    name: 'DeiT_small_distilled_patch16_224'
+pretrained_model: ""
+model_save_dir: "./output/"
+classes_num: 1000
+total_images: 1281167
+save_interval: 1
+validate: True
+valid_interval: 1
+epochs: 120
+topk: 5
+image_shape: [3, 224, 224]
+use_mix: False
+ls_epsilon: -1
+LEARNING_RATE:
+    function: 'Cosine'          
+    params:                   
+        lr: 0.01               
+OPTIMIZER:
+    function: 'Momentum'
+    params:
+        momentum: 0.9
+    regularizer:
+        function: 'L2'
+        factor: 0.000100
+TRAIN:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/train_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1./255.
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
+VALID:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/val_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - ResizeImage:
+            size: 248
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
--- a/configs/DeiT/DeiT_small_patch16_224.yaml
+++ b/configs/DeiT/DeiT_small_patch16_224.yaml
+mode: 'train'
+ARCHITECTURE:
+    name: 'DeiT_small_patch16_224'
+pretrained_model: ""
+model_save_dir: "./output/"
+classes_num: 1000
+total_images: 1281167
+save_interval: 1
+validate: True
+valid_interval: 1
+epochs: 120
+topk: 5
+image_shape: [3, 224, 224]
+use_mix: False
+ls_epsilon: -1
+LEARNING_RATE:
+    function: 'Cosine'          
+    params:                   
+        lr: 0.01               
+OPTIMIZER:
+    function: 'Momentum'
+    params:
+        momentum: 0.9
+    regularizer:
+        function: 'L2'
+        factor: 0.000100
+TRAIN:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/train_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1./255.
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
+VALID:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/val_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - ResizeImage:
+            size: 248
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
--- a/configs/DeiT/DeiT_tiny_distilled_patch16_224.yaml
+++ b/configs/DeiT/DeiT_tiny_distilled_patch16_224.yaml
+mode: 'train'
+ARCHITECTURE:
+    name: 'DeiT_tiny_distilled_patch16_224'
+pretrained_model: ""
+model_save_dir: "./output/"
+classes_num: 1000
+total_images: 1281167
+save_interval: 1
+validate: True
+valid_interval: 1
+epochs: 120
+topk: 5
+image_shape: [3, 224, 224]
+use_mix: False
+ls_epsilon: -1
+LEARNING_RATE:
+    function: 'Cosine'          
+    params:                   
+        lr: 0.01               
+OPTIMIZER:
+    function: 'Momentum'
+    params:
+        momentum: 0.9
+    regularizer:
+        function: 'L2'
+        factor: 0.000100
+TRAIN:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/train_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1./255.
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
+VALID:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/val_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - ResizeImage:
+            size: 248
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
--- a/configs/DeiT/DeiT_tiny_patch16_224.yaml
+++ b/configs/DeiT/DeiT_tiny_patch16_224.yaml
+mode: 'train'
+ARCHITECTURE:
+    name: 'DeiT_tiny_patch16_224'
+pretrained_model: ""
+model_save_dir: "./output/"
+classes_num: 1000
+total_images: 1281167
+save_interval: 1
+validate: True
+valid_interval: 1
+epochs: 120
+topk: 5
+image_shape: [3, 224, 224]
+use_mix: False
+ls_epsilon: -1
+LEARNING_RATE:
+    function: 'Cosine'          
+    params:                   
+        lr: 0.01               
+OPTIMIZER:
+    function: 'Momentum'
+    params:
+        momentum: 0.9
+    regularizer:
+        function: 'L2'
+        factor: 0.000100
+TRAIN:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/train_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1./255.
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
+VALID:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/val_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - ResizeImage:
+            size: 248
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
--- a/configs/DeiT/ViT_small_patch16_224.yaml
+++ b/configs/DeiT/ViT_small_patch16_224.yaml
+mode: 'train'
+ARCHITECTURE:
+    name: 'ViT_small_patch16_224'
+pretrained_model: ""
+model_save_dir: "./output/"
+classes_num: 1000
+total_images: 1281167
+save_interval: 1
+validate: True
+valid_interval: 1
+epochs: 120
+topk: 5
+image_shape: [3, 224, 224]
+use_mix: False
+ls_epsilon: -1
+LEARNING_RATE:
+    function: 'Cosine'          
+    params:                   
+        lr: 0.01               
+OPTIMIZER:
+    function: 'Momentum'
+    params:
+        momentum: 0.9
+    regularizer:
+        function: 'L2'
+        factor: 0.000100
+TRAIN:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/train_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1./255.
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
+VALID:
+    batch_size: 64
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/val_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            to_np: False
+            channel_first: False
+        - ResizeImage:
+            size: 248
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
--- a/ppcls/modeling/architectures/__init__.py
+++ b/ppcls/modeling/architectures/__init__.py
@@ -44,4 +44,5 @@ from .vgg import VGG11, VGG13, VGG16, VGG19
 from .darknet import DarkNet53
 from .regnet import RegNetX_200MF, RegNetX_4GF, RegNetX_32GF, RegNetY_200MF, RegNetY_4GF, RegNetY_32GF
 from .vision_transformer import ViT_small_patch16_224, ViT_base_patch16_224, ViT_base_patch16_384, ViT_base_patch32_384, ViT_large_patch16_224, ViT_large_patch16_384, ViT_large_patch32_384, ViT_huge_patch16_224, ViT_huge_patch32_384
+from .distilled_vision_transformer import DeiT_tiny_patch16_224, DeiT_small_patch16_224, DeiT_base_patch16_224, DeiT_tiny_distilled_patch16_224, DeiT_small_distilled_patch16_224, DeiT_base_distilled_patch16_224, DeiT_base_patch16_384, DeiT_base_distilled_patch16_384
 from .distillation_models import ResNet50_vd_distill_MobileNetV3_large_x1_0
--- a/ppcls/modeling/architectures/distilled_vision_transformer.py
+++ b/ppcls/modeling/architectures/distilled_vision_transformer.py
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+import paddle.nn as nn
+from .vision_transformer import VisionTransformer, Identity, trunc_normal_, zeros_
+__all__ = [
+    'DeiT_tiny_patch16_224', 'DeiT_small_patch16_224', 'DeiT_base_patch16_224',
+    'DeiT_tiny_distilled_patch16_224', 'DeiT_small_distilled_patch16_224',
+    'DeiT_base_distilled_patch16_224', 'DeiT_base_patch16_384',
+    'DeiT_base_distilled_patch16_384'
+]
+class DistilledVisionTransformer(VisionTransformer):
+    def __init__(self, img_size=224, patch_size=16, class_dim=1000, embed_dim=768, depth=12,
+                 num_heads=12, mlp_ratio=4, qkv_bias=False, norm_layer='nn.LayerNorm', epsilon=1e-5,
+                 **kwargs):
+        super().__init__(img_size=img_size, patch_size=patch_size, class_dim=class_dim, embed_dim=embed_dim, depth=depth,
+                         num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, norm_layer=norm_layer, epsilon=epsilon,
+                         **kwargs)
+        self.pos_embed = self.create_parameter(
+            shape=(1, self.patch_embed.num_patches + 2, self.embed_dim), default_initializer=zeros_)
+        self.add_parameter("pos_embed", self.pos_embed)
+        self.dist_token = self.create_parameter(
+            shape=(1, 1, self.embed_dim), default_initializer=zeros_)
+        self.add_parameter("cls_token", self.cls_token)
+        self.head_dist = nn.Linear(
+            self.embed_dim, self.class_dim) if self.class_dim > 0 else Identity()
+        trunc_normal_(self.dist_token)
+        trunc_normal_(self.pos_embed)
+        self.head_dist.apply(self._init_weights)
+    def forward_features(self, x):
+        B = x.shape[0]
+        x = self.patch_embed(x)
+        cls_tokens = self.cls_token.expand((B, -1, -1))
+        dist_token = self.dist_token.expand((B, -1, -1))
+        x = paddle.concat((cls_tokens, dist_token, x), axis=1)
+        x = x + self.pos_embed
+        x = self.pos_drop(x)
+        for blk in self.blocks:
+            x = blk(x)
+        x = self.norm(x)
+        return x[:, 0], x[:, 1]
+    def forward(self, x):
+        x, x_dist = self.forward_features(x)
+        x = self.head(x)
+        x_dist = self.head_dist(x_dist)
+        return (x + x_dist) / 2
+def DeiT_tiny_patch16_224(**kwargs):
+    model = VisionTransformer(
+        patch_size=16, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4, qkv_bias=True,
+        epsilon=1e-6, **kwargs)
+    return model
+def DeiT_small_patch16_224(**kwargs):
+    model = VisionTransformer(
+        patch_size=16, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4, qkv_bias=True,
+        epsilon=1e-6, **kwargs)
+    return model
+def DeiT_base_patch16_224(**kwargs):
+    model = VisionTransformer(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True,
+        epsilon=1e-6, **kwargs)
+    return model
+def DeiT_tiny_distilled_patch16_224(**kwargs):
+    model = DistilledVisionTransformer(
+        patch_size=16, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4, qkv_bias=True,
+        epsilon=1e-6, **kwargs)
+    return model
+def DeiT_small_distilled_patch16_224(**kwargs):
+    model = DistilledVisionTransformer(
+        patch_size=16, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4, qkv_bias=True,
+        epsilon=1e-6, **kwargs)
+    return model
+def DeiT_base_distilled_patch16_224(**kwargs):
+    model = DistilledVisionTransformer(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True,
+        epsilon=1e-6, **kwargs)
+    return model
+def DeiT_base_patch16_384(**kwargs):
+    model = VisionTransformer(
+        img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True,
+        epsilon=1e-6, **kwargs)
+    return model
+def DeiT_base_distilled_patch16_384(**kwargs):
+    model = DistilledVisionTransformer(
+        img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True,
+        epsilon=1e-6, **kwargs)
+    return model
--- a/ppcls/modeling/architectures/vision_transformer.py
+++ b/ppcls/modeling/architectures/vision_transformer.py
-""" Vision Transformer (ViT) in Paddle
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
-A Paddle implement of Vision Transformers as described in
+#
-'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale' - https://arxiv.org/abs/2010.11929
+# Licensed under the Apache License, Version 2.0 (the "License");
-The official jax code is released and available at https://github.com/google-research/vision_transformer
+# you may not use this file except in compliance with the License.
-"""
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import paddle
 import paddle.nn as nn
 from paddle.nn.initializer import TruncatedNormal, Constant