add MobileViTv3

dc4fdba0 · Yang Nie · Tingquan Gao · df31d808 · dc4fdba0 · dc4fdba0
6 changed file
--- a/ppcls/arch/backbone/__init__.py
+++ b/ppcls/arch/backbone/__init__.py
@@ -78,6 +78,7 @@ from .model_zoo.cae import cae_base_patch16_224, cae_large_patch16_224
 from .model_zoo.cvt import CvT_13_224, CvT_13_384, CvT_21_224, CvT_21_384, CvT_W24_384
 from .model_zoo.micronet import MicroNet_M0, MicroNet_M1, MicroNet_M2, MicroNet_M3
 from .model_zoo.mobilenext import MobileNeXt_x0_35, MobileNeXt_x0_5, MobileNeXt_x0_75, MobileNeXt_x1_0, MobileNeXt_x1_4
+from .model_zoo.mobilevit_v3 import MobileViTv3_XXS, MobileViTv3_XS, MobileViTv3_S, MobileViTv3_x0_5, MobileViTv3_x0_75, MobileViTv3_x1_0

 from .variant_models.resnet_variant import ResNet50_last_stage_stride1
 from .variant_models.resnet_variant import ResNet50_adaptive_max_pool2d

--- a/ppcls/arch/backbone/model_zoo/mobilevit_v3.py
+++ b/ppcls/arch/backbone/model_zoo/mobilevit_v3.py
--- a/ppcls/configs/ImageNet/MobileViTv3/MobileViTv3_S.yaml
+++ b/ppcls/configs/ImageNet/MobileViTv3/MobileViTv3_S.yaml
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 300
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 256, 256]
+  save_inference_dir: ./inference
+  use_dali: False
+
+# mixed precision training
+AMP:
+  scale_loss: 65536
+  use_dynamic_loss_scaling: True
+  # O1: mixed fp16
+  level: O1
+
+# model ema
+EMA:
+  decay: 0.9995
+
+# model architecture
+Arch:
+  name: MobileViTv3_S
+  class_num: 1000
+  dropout: 0.1
+
+# loss function config for traing/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.01
+  lr:
+    # for 8 cards
+    name: Cosine
+    learning_rate: 0.002
+    eta_min: 0.0002
+    warmup_epoch: 1  # 3000 iterations
+    warmup_start_lr: 0.0002
+    # by_epoch: True
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: MultiScaleDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 256
+            interpolation: bilinear
+            use_log_aspect: True
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.0, 0.0, 0.0]
+            std: [1.0, 1.0, 1.0]
+            order: ''
+    # support to specify width and height respectively:
+    # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)]
+    sampler:
+      name: MultiScaleSampler
+      scales: [256, 160, 192, 224, 288, 320]
+      # first_bs: batch size for the first image resolution in the scales list
+      # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
+      first_bs: 48
+      divided_factor: 32
+      is_training: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: False
+            channel_first: False
+        - ResizeImage:
+            resize_short: 288
+            interpolation: bilinear
+        - CropImage:
+            size: 256
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.0, 0.0, 0.0]
+            std: [1.0, 1.0, 1.0]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 48
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 288
+        interpolation: bilinear
+    - CropImage:
+        size: 256
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.0, 0.0, 0.0]
+        std: [1.0, 1.0, 1.0]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
--- a/ppcls/configs/ImageNet/MobileViTv3/MobileViTv3_XS.yaml
+++ b/ppcls/configs/ImageNet/MobileViTv3/MobileViTv3_XS.yaml
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 300
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 256, 256]
+  save_inference_dir: ./inference
+  use_dali: False
+
+# mixed precision training
+AMP:
+  scale_loss: 65536
+  use_dynamic_loss_scaling: True
+  # O1: mixed fp16
+  level: O1
+
+# model ema
+EMA:
+  decay: 0.9995
+
+# model architecture
+Arch:
+  name: MobileViTv3_XS
+  class_num: 1000
+  dropout: 0.1
+
+# loss function config for traing/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.01
+  lr:
+    # for 8 cards
+    name: Cosine
+    learning_rate: 0.002
+    eta_min: 0.0002
+    warmup_epoch: 1  # 3000 iterations
+    warmup_start_lr: 0.0002
+    # by_epoch: True
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: MultiScaleDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 256
+            interpolation: bilinear
+            use_log_aspect: True
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.0, 0.0, 0.0]
+            std: [1.0, 1.0, 1.0]
+            order: ''
+    # support to specify width and height respectively:
+    # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)]
+    sampler:
+      name: MultiScaleSampler
+      scales: [256, 160, 192, 224, 288, 320]
+      # first_bs: batch size for the first image resolution in the scales list
+      # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
+      first_bs: 48
+      divided_factor: 32
+      is_training: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: False
+            channel_first: False
+        - ResizeImage:
+            resize_short: 288
+            interpolation: bilinear
+        - CropImage:
+            size: 256
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.0, 0.0, 0.0]
+            std: [1.0, 1.0, 1.0]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 48
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 288
+        interpolation: bilinear
+    - CropImage:
+        size: 256
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.0, 0.0, 0.0]
+        std: [1.0, 1.0, 1.0]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
--- a/ppcls/configs/ImageNet/MobileViTv3/MobileViTv3_XXS.yaml
+++ b/ppcls/configs/ImageNet/MobileViTv3/MobileViTv3_XXS.yaml
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 300
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 256, 256]
+  save_inference_dir: ./inference
+  use_dali: False
+
+# mixed precision training
+AMP:
+  scale_loss: 65536
+  use_dynamic_loss_scaling: True
+  # O1: mixed fp16
+  level: O1
+
+# model ema
+EMA:
+  decay: 0.9995
+
+# model architecture
+Arch:
+  name: MobileViTv3_XXS
+  class_num: 1000
+  dropout: 0.05
+
+# loss function config for traing/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.01
+  lr:
+    # for 8 cards
+    name: Cosine
+    learning_rate: 0.002
+    eta_min: 0.0002
+    warmup_epoch: 1  # 3000 iterations
+    warmup_start_lr: 0.0002
+    # by_epoch: True
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: MultiScaleDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 256
+            interpolation: bilinear
+            use_log_aspect: True
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.0, 0.0, 0.0]
+            std: [1.0, 1.0, 1.0]
+            order: ''
+    # support to specify width and height respectively:
+    # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)]
+    sampler:
+      name: MultiScaleSampler
+      scales: [256, 160, 192, 224, 288, 320]
+      # first_bs: batch size for the first image resolution in the scales list
+      # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
+      first_bs: 48
+      divided_factor: 32
+      is_training: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: False
+            channel_first: False
+        - ResizeImage:
+            resize_short: 288
+            interpolation: bilinear
+        - CropImage:
+            size: 256
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.0, 0.0, 0.0]
+            std: [1.0, 1.0, 1.0]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 48
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 288
+        interpolation: bilinear
+    - CropImage:
+        size: 256
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.0, 0.0, 0.0]
+        std: [1.0, 1.0, 1.0]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
--- a/ppcls/configs/ImageNet/MobileViTv3/MobileViTv3_x0_5.yaml
+++ b/ppcls/configs/ImageNet/MobileViTv3/MobileViTv3_x0_5.yaml
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 300
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 256, 256]
+  save_inference_dir: ./inference
+  use_dali: False
+
+# mixed precision training
+AMP:
+  scale_loss: 65536
+  use_dynamic_loss_scaling: True
+  # O1: mixed fp16
+  level: O1
+
+# model ema
+EMA:
+  decay: 0.9995
+
+# model architecture
+Arch:
+  name: MobileViTv3_x0_5
+  class_num: 1000
+  classifier_dropout: 0.
+
+# loss function config for traing/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.01
+  lr:
+    # for 8 cards
+    name: Cosine
+    learning_rate: 0.002
+    eta_min: 0.0002
+    warmup_epoch: 1  # 3000 iterations
+    warmup_start_lr: 0.0002
+    # by_epoch: True
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: MultiScaleDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 256
+            interpolation: bilinear
+            use_log_aspect: True
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.0, 0.0, 0.0]
+            std: [1.0, 1.0, 1.0]
+            order: ''
+    # support to specify width and height respectively:
+    # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)]
+    sampler:
+      name: MultiScaleSampler
+      scales: [256, 160, 192, 224, 288, 320]
+      # first_bs: batch size for the first image resolution in the scales list
+      # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
+      first_bs: 48
+      divided_factor: 32
+      is_training: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+  Eval:
+    dataset: 
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: False
+            channel_first: False
+        - ResizeImage:
+            resize_short: 288
+            interpolation: bilinear
+        - CropImage:
+            size: 256
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.0, 0.0, 0.0]
+            std: [1.0, 1.0, 1.0]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 48
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 288
+        interpolation: bilinear
+    - CropImage:
+        size: 256
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.0, 0.0, 0.0]
+        std: [1.0, 1.0, 1.0]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]