diff --git a/docs/zh_CN/models/Foundation_models/FoundationViT.md b/docs/zh_CN/models/Foundation_models/FoundationViT.md
index e34bf2ba3553637e60aa1a90b5ef658062e1fcf4..6035af5e33e27ea733c9dfb0470bf7b367ca3c0c 100644
--- a/docs/zh_CN/models/Foundation_models/FoundationViT.md
+++ b/docs/zh_CN/models/Foundation_models/FoundationViT.md
@@ -49,6 +49,20 @@ output = model(inputs) # the output of model embedding
 | EVA | EVA_vit_giant_patch14 | 1010M | 1408 | ImageNet-21k, CC12M, CC3M, Object365, COCO, ADE | [Download link](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/EVA_vit_giant_patch14.pdparams) |
 | CAE | CAE_vit_base_patch16_224 | 85M | 768 | ImageNet-1k | [Download link](https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/CAE_vit_base_patch16_224.pdparams) |
 
+**Note:** In the ImageNet-1k fine-tuning configurations that PaddleClas provides for the CLIP series models ([CLIP_vit_base_patch16_224](ppcls/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml), [CLIP_vit_large_patch14_224](ppcls/configs/CLIP/CLIP_vit_large_patch14_224_finetune.yaml)):
+* `EMA` is disabled by default. To enable it, add the following fields to the config file:
+  ```yaml
+  EMA:
+    decay: 0.9999
+  ```
+* In data preprocessing, `NormalizeImage` uses the `mean` and `std` of the ImageNet-1k dataset by default (`mean` is `[0.485, 0.456, 0.406]`, `std` is `[0.229, 0.224, 0.225]`). To use the corresponding LAION dataset statistics instead, modify the fields as follows:
+  ```yaml
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.48145466, 0.4578275, 0.40821073]
+        std: [0.26862954, 0.26130258, 0.27577711]
+  ```
+
 ## 4. References
 
 1. [MoCo v3: An Empirical Study of Training Self-Supervised Vision Transformers](https://arxiv.org/pdf/2104.02057.pdf)
diff --git a/ppcls/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml b/ppcls/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml
index 919489a6b301fb5fe46e36477cb01ff5e6942ffe..c7e6e0de82d9e1c8e9ffd269add9bbfffc99f057 100644
--- a/ppcls/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml
+++ b/ppcls/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml
@@ -21,10 +21,6 @@ AMP:
   # O1: mixed fp16
   level: O1
 
-# model ema
-EMA:
-  decay: 0.9999
-
 # model architecture
 Arch:
   name: CLIP_vit_base_patch16_224
@@ -81,8 +77,8 @@ DataLoader:
             img_size: 224
         - NormalizeImage:
             scale: 1.0/255.0
-            mean: [0.48145466, 0.4578275, 0.40821073]
-            std: [0.26862954, 0.26130258, 0.27577711]
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
             order: ''
         - RandomErasing:
             EPSILON: 0.25
@@ -118,8 +114,8 @@ DataLoader:
             size: 224
         - NormalizeImage:
             scale: 1.0/255.0
-            mean: [0.48145466, 0.4578275, 0.40821073]
-            std: [0.26862954, 0.26130258, 0.27577711]
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
             order: ''
     sampler:
       name: DistributedBatchSampler
@@ -143,8 +139,8 @@ Infer:
         size: 224
     - NormalizeImage:
         scale: 1.0/255.0
-        mean: [0.5, 0.5, 0.5]
-        std: [0.5, 0.5, 0.5]
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
         order: ''
     - ToCHWImage:
   PostProcess:
diff --git a/ppcls/configs/CLIP/CLIP_vit_large_patch14_224_finetune.yaml b/ppcls/configs/CLIP/CLIP_vit_large_patch14_224_finetune.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1380587bed9a58e130c3b6c2bd38244b1a76dca9
--- /dev/null
+++ b/ppcls/configs/CLIP/CLIP_vit_large_patch14_224_finetune.yaml
@@ -0,0 +1,157 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 10
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 50
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+# mixed precision training
+AMP:
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  # O1: mixed fp16
+  level: O1
+
+# model architecture
+Arch:
+  name: CLIP_vit_large_patch14_224
+  class_num: 1000
+  return_embed: False
+  pretrained: True
+  head_init_scale: 0.001
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: AdamWDL
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.05
+  layerwise_decay: 0.6
+  filter_bias_and_bn: True
+  lr:
+    name: Cosine
+    learning_rate: 0.0003
+    eta_min: 1e-6
+    warmup_epoch: 10
+    warmup_start_lr: 1e-6
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+            interpolation: bicubic
+            backend: pil
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.25
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: True
+      shuffle: True
+    loader:
+      num_workers: 16
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 224
+            interpolation: bicubic
+            backend: pil
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
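
For reviewers comparing the two normalization settings touched by this diff, below is a minimal NumPy sketch of what a `NormalizeImage` step with `scale: 1.0/255.0`, `mean`, and `std` amounts to, i.e. `(pixel * scale - mean) / std` per channel on an HWC image. This is an illustration only, not the PaddleClas operator; the helper name `normalize_image` and the constant names are assumptions made for this example.

```python
import numpy as np

# ImageNet-1k statistics (the defaults kept by these fine-tune configs)
IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)

# CLIP / LAION statistics (the alternative described in the doc note)
CLIP_MEAN = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
CLIP_STD = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)


def normalize_image(img_hwc_uint8, mean, std, scale=1.0 / 255.0):
    """Scale a uint8 HWC image into [0, 1], then normalize each channel."""
    img = img_hwc_uint8.astype(np.float32) * scale
    return (img - mean) / std


if __name__ == "__main__":
    # A dummy 224x224 RGB image stands in for a decoded training sample.
    dummy = np.random.randint(0, 256, size=(224, 224, 3), dtype=np.uint8)
    out_imagenet = normalize_image(dummy, IMAGENET_MEAN, IMAGENET_STD)
    out_laion = normalize_image(dummy, CLIP_MEAN, CLIP_STD)
    print(out_imagenet.shape)                      # (224, 224, 3)
    print(np.abs(out_imagenet - out_laion).max())  # the two settings differ only slightly
```

Swapping between the two settings only changes these per-channel constants; the rest of the preprocessing pipeline in the configs stays the same.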