Modify some default hyperparameters to suit fine-tuning on downstream tasks (#2921)

1. Disable EMA, because most downstream datasets are relatively small; 2. use the mean and std of ImageNet (IMN).

Modify some default hyperparameters to suit fine-tuning on downstream tasks (#2921)
1. Disable EMA, because most downstream datasets are relatively small; 2. use the mean and std of ImageNet (IMN).
eddba911 · Tingquan Gao · GitHub · 1f8a830b · eddba911 · eddba911
3 changed files
--- a/docs/zh_CN/models/Foundation_models/FoundationViT.md
+++ b/docs/zh_CN/models/Foundation_models/FoundationViT.md
@@ -49,6 +49,20 @@ output = model(inputs)  # the output of model embeding
 |  EVA  |       EVA_vit_giant_patch14       |  1010M  |      1408      | ImageNet-21k, CC12M,   CC2M, Object365,COCO, ADE | [下载地址](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/EVA_vit_giant_patch14.pdparams)               |
 |  CAE  |      CAE_vit_base_patch16_224      |   85M   |      768      |                   ImageNet-1k                   | [下载地址](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/CAE_vit_base_patch16_224.pdparams)           |
+**备注：** PaddleClas 提供的 CLIP 系列模型在 ImageNet1k 数据集 fine-tune 的配置（[CLIP_vit_base_patch16_224](ppcls/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml)，[CLIP_vit_large_patch14_224](ppcls/configs/CLIP/CLIP_vit_large_patch16_224_finetune.yaml)）中：
+* 默认未使用 `EMA`，如需使用，请自行修改配置文件增加字段：
+    ```yaml
+    EMA:
+      decay: 0.9999
+    ```
+* 数据预处理中，`NormalizeImage` 默认使用 ImageNet1k 数据集的 `mean` 和 `std` 参数（`mean` 为 `[0.485, 0.456, 0.406]`，`std` 为 `[0.229, 0.224, 0.225]`），如需使用 LAION 数据集相应参数，请自行修改相应字段：
+    ```yaml
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.48145466, 0.4578275, 0.40821073]
+        std: [0.26862954, 0.26130258, 0.27577711]
+    ```
 ## 4. 参考文献
 1. [MoCo v3: An Empirical Study of Training Self-Supervised Vision Transformers](https://arxiv.org/pdf/2104.02057.pdf)

--- a/ppcls/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml
+++ b/ppcls/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml
@@ -21,10 +21,6 @@ AMP:
  # O1: mixed fp16
  level: O1
-# model ema
-EMA:
-  decay: 0.9999
 # model architecture
 Arch:
  name: CLIP_vit_base_patch16_224
@@ -81,8 +77,8 @@ DataLoader:
            img_size: 224
        - NormalizeImage:
            scale: 1.0/255.0
-            mean: [0.48145466, 0.4578275, 0.40821073]
+            mean: [0.485, 0.456, 0.406]
-            std: [0.26862954, 0.26130258, 0.27577711] 
+            std: [0.229, 0.224, 0.225]
            order: ''
        - RandomErasing:
            EPSILON: 0.25
@@ -118,8 +114,8 @@ DataLoader:
            size: 224
        - NormalizeImage:
            scale: 1.0/255.0
-            mean: [0.48145466, 0.4578275, 0.40821073]
+            mean: [0.485, 0.456, 0.406]
-            std: [0.26862954, 0.26130258, 0.27577711] 
+            std: [0.229, 0.224, 0.225]
            order: ''
    sampler:
      name: DistributedBatchSampler
@@ -143,8 +139,8 @@ Infer:
        size: 224
    - NormalizeImage:
        scale: 1.0/255.0
-        mean: [0.5, 0.5, 0.5]
+        mean: [0.485, 0.456, 0.406]
-        std: [0.5, 0.5, 0.5]
+        std: [0.229, 0.224, 0.225]
        order: ''
    - ToCHWImage:
  PostProcess:

--- a/ppcls/configs/CLIP/CLIP_vit_large_patch16_224_finetune.yaml
+++ b/ppcls/configs/CLIP/CLIP_vit_large_patch16_224_finetune.yaml
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 10
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 50
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+# mixed precision training
+AMP:
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  # O1: mixed fp16
+  level: O1
+# model architecture
+Arch:
+  name: CLIP_vit_large_patch14_224
+  class_num: 1000
+  return_embed: False
+  pretrained: True
+  head_init_scale: 0.001
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+Optimizer:
+  name: AdamWDL
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.05
+  layerwise_decay: 0.6
+  filter_bias_and_bn: True
+  lr:
+    name: Cosine
+    learning_rate: 0.0003
+    eta_min: 1e-6
+    warmup_epoch: 10
+    warmup_start_lr: 1e-6
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+            interpolation: bicubic
+            backend: pil
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.25
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: True
+      shuffle: True
+    loader:
+      num_workers: 16
+      use_shared_memory: True
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 224
+            interpolation: bicubic
+            backend: pil
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]