Merge pull request #1544 from cuicheng01/update_vit

update vit

Merge pull request #1544 from cuicheng01/update_vit
update vit
897760f0 · cuicheng01 · GitHub · b3922c96 · f5450c3d · 897760f0
6 changed files
--- a/ppcls/arch/backbone/__init__.py
+++ b/ppcls/arch/backbone/__init__.py
@@ -48,7 +48,7 @@ from ppcls.arch.backbone.model_zoo.resnext101_wsl import ResNeXt101_32x8d_wsl, R
 from ppcls.arch.backbone.model_zoo.squeezenet import SqueezeNet1_0, SqueezeNet1_1
 from ppcls.arch.backbone.model_zoo.darknet import DarkNet53
 from ppcls.arch.backbone.model_zoo.regnet import RegNetX_200MF, RegNetX_4GF, RegNetX_32GF, RegNetY_200MF, RegNetY_4GF, RegNetY_32GF
-from ppcls.arch.backbone.model_zoo.vision_transformer import ViT_small_patch16_224, ViT_base_patch16_224, ViT_base_patch16_384, ViT_base_patch32_384, ViT_large_patch16_224, ViT_large_patch16_384, ViT_large_patch32_384, ViT_huge_patch16_224, ViT_huge_patch32_384
+from ppcls.arch.backbone.model_zoo.vision_transformer import ViT_small_patch16_224, ViT_base_patch16_224, ViT_base_patch16_384, ViT_base_patch32_384, ViT_large_patch16_224, ViT_large_patch16_384, ViT_large_patch32_384
 from ppcls.arch.backbone.model_zoo.distilled_vision_transformer import DeiT_tiny_patch16_224, DeiT_small_patch16_224, DeiT_base_patch16_224, DeiT_tiny_distilled_patch16_224, DeiT_small_distilled_patch16_224, DeiT_base_distilled_patch16_224, DeiT_base_patch16_384, DeiT_base_distilled_patch16_384
 from ppcls.arch.backbone.model_zoo.swin_transformer import SwinTransformer_tiny_patch4_window7_224, SwinTransformer_small_patch4_window7_224, SwinTransformer_base_patch4_window7_224, SwinTransformer_base_patch4_window12_384, SwinTransformer_large_patch4_window7_224, SwinTransformer_large_patch4_window12_384
 from ppcls.arch.backbone.model_zoo.mixnet import MixNet_S, MixNet_M, MixNet_L

--- a/ppcls/arch/backbone/model_zoo/vision_transformer.py
+++ b/ppcls/arch/backbone/model_zoo/vision_transformer.py
@@ -38,10 +38,6 @@ MODEL_URLS = {
    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_large_patch16_384_pretrained.pdparams",
    "ViT_large_patch32_384":
    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_large_patch32_384_pretrained.pdparams",
-    "ViT_huge_patch16_224":
-    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_huge_patch16_224_pretrained.pdparams",
-    "ViT_huge_patch32_384":
-    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_huge_patch32_384_pretrained.pdparams"
 }

 __all__ = list(MODEL_URLS.keys())
@@ -460,36 +456,3 @@ def ViT_large_patch32_384(pretrained=False, use_ssld=False, **kwargs):
        MODEL_URLS["ViT_large_patch32_384"],
        use_ssld=use_ssld)
    return model
-
-
-def ViT_huge_patch16_224(pretrained=False, use_ssld=False, **kwargs):
-    model = VisionTransformer(
-        patch_size=16,
-        embed_dim=1280,
-        depth=32,
-        num_heads=16,
-        mlp_ratio=4,
-        **kwargs)
-    _load_pretrained(
-        pretrained,
-        model,
-        MODEL_URLS["ViT_huge_patch16_224"],
-        use_ssld=use_ssld)
-    return model
-
-
-def ViT_huge_patch32_384(pretrained=False, use_ssld=False, **kwargs):
-    model = VisionTransformer(
-        img_size=384,
-        patch_size=32,
-        embed_dim=1280,
-        depth=32,
-        num_heads=16,
-        mlp_ratio=4,
-        **kwargs)
-    _load_pretrained(
-        pretrained,
-        model,
-        MODEL_URLS["ViT_huge_patch32_384"],
-        use_ssld=use_ssld)
-    return model
--- a/ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch16_224.yaml
+++ b/ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch16_224.yaml
-# global configs
-Global:
-  checkpoints: null
-  pretrained_model: null
-  output_dir: ./output/
-  device: gpu
-  save_interval: 1
-  eval_during_train: True
-  eval_interval: 1
-  epochs: 120
-  print_batch_step: 10
-  use_visualdl: False
-  # used for static mode and model export
-  image_shape: [3, 224, 224]
-  save_inference_dir: ./inference
-
-# model architecture
-Arch:
-  name:  ViT_huge_patch16_224
-  class_num: 1000
- 
-# loss function config for traing/eval process
-Loss:
-  Train:
-    - CELoss:
-        weight: 1.0
-  Eval:
-    - CELoss:
-        weight: 1.0
-
-
-Optimizer:
-  name: Momentum
-  momentum: 0.9
-  lr:
-    name: Piecewise
-    learning_rate: 0.1
-    decay_epochs: [30, 60, 90]
-    values: [0.1, 0.01, 0.001, 0.0001]
-  regularizer:
-    name: 'L2'
-    coeff: 0.0001
-
-
-# data loader for train and eval
-DataLoader:
-  Train:
-    dataset:
-      name: ImageNetDataset
-      image_root: ./dataset/ILSVRC2012/
-      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
-      transform_ops:
-        - DecodeImage:
-            to_rgb: True
-            channel_first: False
-        - RandCropImage:
-            size: 224
-        - RandFlipImage:
-            flip_code: 1
-        - NormalizeImage:
-            scale: 1.0/255.0
-            mean: [0.5, 0.5, 0.5]
-            std: [0.5, 0.5, 0.5]
-            order: ''
-
-    sampler:
-      name: DistributedBatchSampler
-      batch_size: 64
-      drop_last: False
-      shuffle: True
-    loader:
-      num_workers: 4
-      use_shared_memory: True
-
-  Eval:
-    dataset: 
-      name: ImageNetDataset
-      image_root: ./dataset/ILSVRC2012/
-      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
-      transform_ops:
-        - DecodeImage:
-            to_rgb: True
-            channel_first: False
-        - ResizeImage:
-            resize_short: 256
-        - CropImage:
-            size: 224
-        - NormalizeImage:
-            scale: 1.0/255.0
-            mean: [0.5, 0.5, 0.5]
-            std: [0.5, 0.5, 0.5]
-            order: ''
-    sampler:
-      name: DistributedBatchSampler
-      batch_size: 64
-      drop_last: False
-      shuffle: False
-    loader:
-      num_workers: 4
-      use_shared_memory: True
-
-Infer:
-  infer_imgs: docs/images/whl/demo.jpg
-  batch_size: 10
-  transforms:
-    - DecodeImage:
-        to_rgb: True
-        channel_first: False
-    - ResizeImage:
-        resize_short: 256
-    - CropImage:
-        size: 224
-    - NormalizeImage:
-        scale: 1.0/255.0
-        mean: [0.5, 0.5, 0.5]
-        std: [0.5, 0.5, 0.5]
-        order: ''
-    - ToCHWImage:
-  PostProcess:
-    name: Topk
-    topk: 5
-    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
-
-Metric:
-  Train:
-    - TopkAcc:
-        topk: [1, 5]
-  Eval:
-    - TopkAcc:
-        topk: [1, 5]
--- a/ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch32_384.yaml
+++ b/ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch32_384.yaml
-# global configs
-Global:
-  checkpoints: null
-  pretrained_model: null
-  output_dir: ./output/
-  device: gpu
-  save_interval: 1
-  eval_during_train: True
-  eval_interval: 1
-  epochs: 120
-  print_batch_step: 10
-  use_visualdl: False
-  # used for static mode and model export
-  image_shape: [3, 384, 384]
-  save_inference_dir: ./inference
-
-# model architecture
-Arch:
-  name: ViT_huge_patch32_384
-  class_num: 1000
- 
-# loss function config for traing/eval process
-Loss:
-  Train:
-    - CELoss:
-        weight: 1.0
-  Eval:
-    - CELoss:
-        weight: 1.0
-
-
-Optimizer:
-  name: Momentum
-  momentum: 0.9
-  lr:
-    name: Piecewise
-    learning_rate: 0.1
-    decay_epochs: [30, 60, 90]
-    values: [0.1, 0.01, 0.001, 0.0001]
-  regularizer:
-    name: 'L2'
-    coeff: 0.0001
-
-
-# data loader for train and eval
-DataLoader:
-  Train:
-    dataset:
-      name: ImageNetDataset
-      image_root: ./dataset/ILSVRC2012/
-      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
-      transform_ops:
-        - DecodeImage:
-            to_rgb: True
-            channel_first: False
-        - RandCropImage:
-            size: 384
-        - RandFlipImage:
-            flip_code: 1
-        - NormalizeImage:
-            scale: 1.0/255.0
-            mean: [0.5, 0.5, 0.5]
-            std: [0.5, 0.5, 0.5]
-            order: ''
-
-    sampler:
-      name: DistributedBatchSampler
-      batch_size: 64
-      drop_last: False
-      shuffle: True
-    loader:
-      num_workers: 4
-      use_shared_memory: True
-
-  Eval:
-    dataset: 
-      name: ImageNetDataset
-      image_root: ./dataset/ILSVRC2012/
-      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
-      transform_ops:
-        - DecodeImage:
-            to_rgb: True
-            channel_first: False
-        - ResizeImage:
-            resize_short: 384
-        - CropImage:
-            size: 384
-        - NormalizeImage:
-            scale: 1.0/255.0
-            mean: [0.5, 0.5, 0.5]
-            std: [0.5, 0.5, 0.5]
-            order: ''
-    sampler:
-      name: DistributedBatchSampler
-      batch_size: 64
-      drop_last: False
-      shuffle: False
-    loader:
-      num_workers: 4
-      use_shared_memory: True
-
-Infer:
-  infer_imgs: docs/images/whl/demo.jpg
-  batch_size: 10
-  transforms:
-    - DecodeImage:
-        to_rgb: True
-        channel_first: False
-    - ResizeImage:
-        resize_short: 384
-    - CropImage:
-        size: 384
-    - NormalizeImage:
-        scale: 1.0/255.0
-        mean: [0.5, 0.5, 0.5]
-        std: [0.5, 0.5, 0.5]
-        order: ''
-    - ToCHWImage:
-  PostProcess:
-    name: Topk
-    topk: 5
-    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
-
-Metric:
-  Train:
-    - TopkAcc:
-        topk: [1, 5]
-  Eval:
-    - TopkAcc:
-        topk: [1, 5]
--- a/test_tipc/config/VisionTransformer/ViT_huge_patch16_224_train_infer_python.txt
+++ b/test_tipc/config/VisionTransformer/ViT_huge_patch16_224_train_infer_python.txt
-===========================train_params===========================
-model_name:ViT_huge_patch16_224
-python:python3.7
-gpu_list:0|0,1
-o Global.device:gpu
-o Global.auto_cast:null
-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
-o Global.output_dir:./output/
-o DataLoader.Train.sampler.batch_size:8
-o Global.pretrained_model:null
-train_model_name:latest
-train_infer_img_dir:./dataset/ILSVRC2012/val
-null:null
-##
-trainer:norm_train
-norm_train:tools/train.py -c ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch16_224.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
-pact_train:null
-fpgm_train:null
-distill_train:null
-null:null
-null:null
-##
-===========================eval_params=========================== 
-eval:tools/eval.py -c ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch16_224.yaml
-null:null
-##
-===========================infer_params==========================
-o Global.save_inference_dir:./inference
-o Global.pretrained_model:
-norm_export:tools/export_model.py -c ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch16_224.yaml
-quant_export:null
-fpgm_export:null
-distill_export:null
-kl_quant:null
-export2:null
-pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_huge_patch16_224_pretrained.pdparams
-infer_model:../inference/
-infer_export:True
-infer_quant:Fasle
-inference:python/predict_cls.py -c configs/inference_cls.yaml
-o Global.use_gpu:True|False
-o Global.enable_mkldnn:True|False
-o Global.cpu_num_threads:1|6
-o Global.batch_size:1|16
-o Global.use_tensorrt:True|False
-o Global.use_fp16:True|False
-o Global.inference_model_dir:../inference
-o Global.infer_imgs:../dataset/ILSVRC2012/val
-o Global.save_log_path:null
-o Global.benchmark:True
-null:null
-null:null
--- a/test_tipc/config/VisionTransformer/ViT_huge_patch32_384_train_infer_python.txt
+++ b/test_tipc/config/VisionTransformer/ViT_huge_patch32_384_train_infer_python.txt
-===========================train_params===========================
-model_name:ViT_huge_patch32_384
-python:python3.7
-gpu_list:0|0,1
-o Global.device:gpu
-o Global.auto_cast:null
-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
-o Global.output_dir:./output/
-o DataLoader.Train.sampler.batch_size:8
-o Global.pretrained_model:null
-train_model_name:latest
-train_infer_img_dir:./dataset/ILSVRC2012/val
-null:null
-##
-trainer:norm_train
-norm_train:tools/train.py -c ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch32_384.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
-pact_train:null
-fpgm_train:null
-distill_train:null
-null:null
-null:null
-##
-===========================eval_params=========================== 
-eval:tools/eval.py -c ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch32_384.yaml
-null:null
-##
-===========================infer_params==========================
-o Global.save_inference_dir:./inference
-o Global.pretrained_model:
-norm_export:tools/export_model.py -c ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch32_384.yaml
-quant_export:null
-fpgm_export:null
-distill_export:null
-kl_quant:null
-export2:null
-pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_huge_patch32_384_pretrained.pdparams
-infer_model:../inference/
-infer_export:True
-infer_quant:Fasle
-inference:python/predict_cls.py -c configs/inference_cls.yaml -o PreProcess.transform_ops.0.ResizeImage.resize_short=384 -o PreProcess.transform_ops.1.CropImage.size=384
-o Global.use_gpu:True|False
-o Global.enable_mkldnn:True|False
-o Global.cpu_num_threads:1|6
-o Global.batch_size:1|16
-o Global.use_tensorrt:True|False
-o Global.use_fp16:True|False
-o Global.inference_model_dir:../inference
-o Global.infer_imgs:../dataset/ILSVRC2012/val
-o Global.save_log_path:null
-o Global.benchmark:True
-null:null
-null:null