From 1be468bf0a50f600a37653577a13f23b440b97cc Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Tue, 17 Jan 2023 13:23:15 +0800 Subject: [PATCH] add a model PLSC-ViT (#5697) (#5706) * add a model and app for VisionTransformer Co-authored-by: liuTINA0907 <65896652+liuTINA0907@users.noreply.github.com> Co-authored-by: qizhaoaoe <2285142981@qq.com> Co-authored-by: liuTINA0907 <65896652+liuTINA0907@users.noreply.github.com> --- modelcenter/PLSC-ViT/APP/__init__.py | 0 modelcenter/PLSC-ViT/APP/app.py | 47 +++++ modelcenter/PLSC-ViT/APP/app.yaml | 11 + modelcenter/PLSC-ViT/APP/download.py | 229 +++++++++++++++++++++ modelcenter/PLSC-ViT/APP/predictor.py | 88 ++++++++ modelcenter/PLSC-ViT/APP/requirements.txt | 11 + modelcenter/PLSC-ViT/benchmark_cn.md | 22 ++ modelcenter/PLSC-ViT/benchmark_en.md | 22 ++ modelcenter/PLSC-ViT/download_cn.md | 8 + modelcenter/PLSC-ViT/download_en.md | 8 + modelcenter/PLSC-ViT/info.yaml | 30 +++ modelcenter/PLSC-ViT/introduction_cn.ipynb | 210 +++++++++++++++++++ modelcenter/PLSC-ViT/introduction_en.ipynb | 199 ++++++++++++++++++ 13 files changed, 885 insertions(+) create mode 100644 modelcenter/PLSC-ViT/APP/__init__.py create mode 100644 modelcenter/PLSC-ViT/APP/app.py create mode 100644 modelcenter/PLSC-ViT/APP/app.yaml create mode 100644 modelcenter/PLSC-ViT/APP/download.py create mode 100644 modelcenter/PLSC-ViT/APP/predictor.py create mode 100644 modelcenter/PLSC-ViT/APP/requirements.txt create mode 100644 modelcenter/PLSC-ViT/benchmark_cn.md create mode 100644 modelcenter/PLSC-ViT/benchmark_en.md create mode 100644 modelcenter/PLSC-ViT/download_cn.md create mode 100644 modelcenter/PLSC-ViT/download_en.md create mode 100644 modelcenter/PLSC-ViT/info.yaml create mode 100644 modelcenter/PLSC-ViT/introduction_cn.ipynb create mode 100644 modelcenter/PLSC-ViT/introduction_en.ipynb diff --git a/modelcenter/PLSC-ViT/APP/__init__.py b/modelcenter/PLSC-ViT/APP/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
# Remote locations of the exported ViT-B/16 (224x224) inference model and of
# the ImageNet-2012 label file. The ``paddlecv://`` scheme is resolved (and the
# files are cached locally) by the helpers in ``download.py``.
model_path = "paddlecv://models/vit/v2.4/imagenet2012-ViT-B_16-224_infer.pdmodel"
params_path = "paddlecv://models/vit/v2.4/imagenet2012-ViT-B_16-224_infer.pdiparams"
label_path = "paddlecv://dataset/imagenet2012_labels.txt"

# Created lazily on the first request so that importing this module does not
# trigger the model download.
predictor = None


def model_inference(image):
    """Classify ``image`` and return it together with the top predictions.

    Args:
        image: Image delivered by the ``gr.Image`` input component, or
            ``None`` when the input is empty (e.g. "Submit" after "Clear").

    Returns:
        tuple: ``(image, json_out)`` where ``json_out`` is a dict with the
        keys ``"scores"`` and ``"labels"`` (plain Python lists). Both items
        are ``None`` when no image was supplied.
    """
    global predictor
    # Nothing to classify: clear the outputs instead of failing inside the
    # predictor's image preprocessing.
    if image is None:
        return None, None
    if predictor is None:
        predictor = Predictor(
            model_path=model_path,
            params_path=params_path,
            label_path=label_path)
    scores, labels = predictor.predict(image)
    json_out = {"scores": scores.tolist(), "labels": labels.tolist()}
    return image, json_out


def clear_all():
    """Reset the input image, the output image and the JSON panel."""
    return None, None, None


with gr.Blocks() as demo:
    gr.Markdown("Classification based on ViT")

    with gr.Column(scale=1, min_width=100):

        img_in = gr.Image(
            value="https://plsc.bj.bcebos.com/dataset/test_images/cat.jpg",
            label="Input")

        with gr.Row():
            btn1 = gr.Button("Clear")
            btn2 = gr.Button("Submit")

        img_out = gr.Image(label="Output")
        json_out = gr.JSON(label="jsonOutput")

    btn2.click(fn=model_inference, inputs=img_in, outputs=[img_out, json_out])
    btn1.click(fn=clear_all, inputs=None, outputs=[img_in, img_out, json_out])
    # NOTE: the former ``gr.Button.style(1)`` call was dropped. It invoked an
    # instance method on the class with ``1`` as ``self`` and styled nothing.

demo.launch()
__all__ = [
    'get_model_path',
    'get_config_path',
    'get_dict_path',
    'get_data_path',
]

# Local cache roots, one per kind of downloadable artifact.
WEIGHTS_HOME = osp.expanduser("~/.cache/paddlecv/models/plsc")
CONFIGS_HOME = osp.expanduser("~/.cache/paddlecv/configs/plsc")
DICTS_HOME = osp.expanduser("~/.cache/paddlecv/dicts/plsc/")
DATA_HOME = osp.expanduser("~/.cache/paddlecv/dataset/plsc")

# Maximum number of download attempts before `_download` gives up.
DOWNLOAD_RETRY_LIMIT = 3

# HTTP prefix that the `paddlecv://` shorthand scheme expands to.
PMP_DOWNLOAD_URL_PREFIX = 'https://plsc.bj.bcebos.com/'


def is_url(path):
    """Return whether ``path`` is a remote location rather than a local path.

    Args:
        path (str): Candidate string.

    Returns:
        bool: True if ``path`` starts with ``http://``, ``https://`` or the
        ``paddlecv://`` shorthand scheme.
    """
    return path.startswith(('http://', 'https://', 'paddlecv://'))


def parse_url(url):
    """Expand the ``paddlecv://`` shorthand into its HTTP(S) prefix.

    Strings that do not contain the shorthand are returned unchanged.
    """
    return url.replace("paddlecv://", PMP_DOWNLOAD_URL_PREFIX)


def get_model_path(path):
    """Return a local path for a model file, downloading it if necessary.

    Non-URL inputs are returned unchanged. URLs are cached under
    WEIGHTS_HOME, keeping the last three path components of the URL.
    """
    if not is_url(path):
        return path
    url = parse_url(path)
    path, _ = get_path(url, WEIGHTS_HOME, path_depth=3)
    return path


def get_data_path(path):
    """Return a local path for a data file, downloading it if necessary.

    Non-URL inputs are returned unchanged. URLs are cached under DATA_HOME
    by file name.
    """
    if not is_url(path):
        return path
    url = parse_url(path)
    path, _ = get_path(url, DATA_HOME, path_depth=1)
    return path


def get_config_path(path):
    """Return a local path for a config file, downloading it if necessary.

    Non-URL inputs are returned unchanged. URLs are cached under
    CONFIGS_HOME by file name.
    """
    if not is_url(path):
        return path
    url = parse_url(path)
    path, _ = get_path(url, CONFIGS_HOME)
    return path


def get_dict_path(path):
    """Return a local path for a dict file, downloading it if necessary.

    Non-URL inputs are returned unchanged. URLs are cached under DICTS_HOME
    by file name.
    """
    if not is_url(path):
        return path
    url = parse_url(path)
    path, _ = get_path(url, DICTS_HOME)
    return path


def map_path(url, root_dir, path_depth=1):
    """Map ``url`` to its cache location below ``root_dir``.

    Args:
        url (str): Download URL.
        root_dir (str): Cache root directory.
        path_depth (int): Number of trailing URL path components to keep
            below ``root_dir`` (1 keeps only the file name).

    Returns:
        tuple: ``(path, dirname)`` - the local file path and the directory
        that contains it.
    """
    assert path_depth > 0, "path_depth should be a positive integer"
    # Strip `path_depth` trailing components; what remains is the part of the
    # URL that is NOT mirrored into the cache.
    dirname = url
    for _ in range(path_depth):
        dirname = osp.dirname(dirname)
    fpath = osp.relpath(url, dirname)
    path = osp.join(root_dir, fpath)
    dirname = osp.dirname(path)
    return path, dirname


def get_path(url, root_dir, md5sum=None, check_exist=True, path_depth=1):
    """Return the cached copy of ``url`` under ``root_dir``, downloading it
    when it is missing or fails the md5 check.

    Args:
        url (str): Download URL.
        root_dir (str): Cache root directory (e.g. WEIGHTS_HOME).
        md5sum (str): Expected md5 hex digest of the file, or None.
        check_exist (bool): Reuse an already cached file when True.
        path_depth (int): See `map_path`.

    Returns:
        tuple: ``(fullpath, cached)`` where ``cached`` is True if an existing
        local copy was reused and False if the file was downloaded.
    """
    fullpath, dirname = map_path(url, root_dir, path_depth)

    if osp.exists(fullpath) and check_exist:
        if not osp.isfile(fullpath) or \
                _check_exist_file_md5(fullpath, md5sum, url):
            return fullpath, True
        # The cached file failed verification: drop it and download again.
        os.remove(fullpath)

    _download(url, dirname, md5sum)
    return fullpath, False


def _download(url, path, md5sum=None):
    """Download ``url`` into directory ``path`` and return the file path.

    Args:
        url (str): Download URL.
        path (str): Target directory; created if it does not exist.
        md5sum (str): Expected md5 hex digest, or None.

    Returns:
        str: Full path of the downloaded file.

    Raises:
        RuntimeError: If the server does not answer with status 200, or the
            file still fails verification after DOWNLOAD_RETRY_LIMIT
            attempts.
    """
    os.makedirs(path, exist_ok=True)

    fname = osp.split(url)[-1]
    fullname = osp.join(path, fname)
    retry_cnt = 0

    while not (osp.exists(fullname) and
               _check_exist_file_md5(fullname, md5sum, url)):
        if retry_cnt < DOWNLOAD_RETRY_LIMIT:
            retry_cnt += 1
        else:
            raise RuntimeError("Download from {} failed. "
                               "Retry limit reached".format(url))

        # NOTE: windows path join may incur \, which is invalid in url
        if sys.platform == "win32":
            url = url.replace('\\', '/')

        # To survive an interrupted transfer, write to a temporary name first
        # and move it into place only after the download has finished.
        tmp_fullname = fullname + "_tmp"
        # The context manager releases the streamed connection even when an
        # error is raised part-way through.
        with requests.get(url, stream=True) as req:
            if req.status_code != 200:
                raise RuntimeError("Downloading from {} failed with code "
                                   "{}!".format(url, req.status_code))

            total_size = req.headers.get('content-length')
            with open(tmp_fullname, 'wb') as f:
                if total_size:
                    # Known size: show a progress bar counted in KB chunks.
                    for chunk in tqdm.tqdm(
                            req.iter_content(chunk_size=1024),
                            total=(int(total_size) + 1023) // 1024,
                            unit='KB'):
                        f.write(chunk)
                else:
                    for chunk in req.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
        shutil.move(tmp_fullname, fullname)
    return fullname


def _check_exist_file_md5(filename, md5sum, url):
    """Verify ``filename`` against ``md5sum`` or the server-side digest.

    When ``md5sum`` is None and the file name ends with ``pdparams``, the
    digest is fetched from the response headers of ``url``; otherwise
    ``md5sum`` is checked directly (a None ``md5sum`` always passes).
    """
    if md5sum is None and filename.endswith('pdparams'):
        return _md5check_from_url(filename, url)
    return _md5check(filename, md5sum)


def _md5check_from_url(filename, url):
    """Check ``filename`` against the ``content-md5`` header served for ``url``.

    The header value is treated as a base64-encoded md5 digest. A missing
    header counts as a successful check.
    """
    with requests.get(url, stream=True) as req:
        content_md5 = req.headers.get('content-md5')
    if not content_md5:
        return True
    expected = binascii.hexlify(
        base64.b64decode(content_md5.strip('"'))).decode()
    return _md5check(filename, expected)


def _md5check(fullname, md5sum=None):
    """Return whether the md5 hex digest of ``fullname`` equals ``md5sum``.

    A None ``md5sum`` disables the check and returns True without reading
    the file.
    """
    if md5sum is None:
        return True

    md5 = hashlib.md5()
    with open(fullname, 'rb') as f:
        for chunk in iter(lambda: f.read(4096), b""):
            md5.update(chunk)
    return md5.hexdigest() == md5sum
class Predictor(object):
    """Image classifier backed by an exported Paddle inference model.

    The instance downloads (or reuses cached copies of) the model, parameter
    and label files, creates a ``paddle.inference`` predictor, and exposes
    `predict`, which returns the top scores and class labels for one image.
    """

    def __init__(self,
                 model_type="paddle",
                 model_path=None,
                 params_path=None,
                 label_path=None):
        '''
        Args:
            model_type (str): Inference backend; only "paddle" is supported.
            model_path (str): Local path or URL of the ``.pdmodel`` file.
            params_path (str): Local path or URL of the ``.pdiparams`` file.
            label_path (str): Local path or URL of the label text file.
        '''
        assert model_type in ["paddle"]
        assert model_path is not None and os.path.splitext(model_path)[
            1] == '.pdmodel'
        assert params_path is not None and os.path.splitext(params_path)[
            1] == '.pdiparams'
        assert label_path is not None, "label_path is required"

        import paddle.inference as paddle_infer
        infer_model = get_model_path(model_path)
        infer_params = get_model_path(params_path)
        config = paddle_infer.Config(infer_model, infer_params)
        self.predictor = paddle_infer.create_predictor(config)
        self.input_names = self.predictor.get_input_names()
        self.output_names = self.predictor.get_output_names()
        self.labels = self.parse_labes(get_data_path(label_path))
        self.model_type = model_type

    def predict(self, img):
        """Run preprocessing, inference and postprocessing on one image.

        Args:
            img: Input image as accepted by `preprocess`.

        Returns:
            The value produced by `postprocess`: a tuple ``(scores, labels)``
            holding the top predictions.
        """
        # `preprocess` / `postprocess` are consulted through the instance so
        # that a subclass may disable either step by setting it to None.
        if self.preprocess is not None:
            inputs = self.preprocess(img)
        else:
            inputs = img
        for input_name in self.input_names:
            input_tensor = self.predictor.get_input_handle(input_name)
            input_tensor.copy_from_cpu(inputs[input_name])
        self.predictor.run()
        outputs = []
        for output_name in self.output_names:
            output_tensor = self.predictor.get_output_handle(output_name)
            outputs.append(output_tensor.copy_to_cpu())
        if self.postprocess is not None:
            output_data = self.postprocess(outputs)
        else:
            output_data = outputs

        return output_data

    def preprocess(self, img):
        """Resize and normalize ``img`` and wrap it as the model input dict.

        The image is resized to 224x224, scaled to [0, 1], normalized with
        mean 0.5 / std 0.5, given a leading batch axis, and its last axis is
        moved to position 1.

        Args:
            img: Image array; presumably HxWxC uint8 as delivered by the
                Gradio image component - TODO confirm.

        Returns:
            dict: ``{'x': array}`` with a float32 array.
        """
        img = cv2.resize(img, (224, 224))
        scale = 1.0 / 255.0
        mean = 0.5
        std = 0.5
        img = (img.astype('float32') * scale - mean) / std
        img = img[np.newaxis, :, :, :]
        img = img.transpose((0, 3, 1, 2))
        return {'x': img}

    @staticmethod
    def parse_labes(label_path):
        """Read class names from a label file.

        Each non-trivial line is expected to look like
        ``<index> <id> <name>[, <synonym>...]``; the first name is kept.

        Args:
            label_path (str): Path of the label text file.

        Returns:
            list: One class name per parsed line, in file order.
        """
        labels = []
        with open(label_path, 'r', encoding='utf-8') as f:
            for line in f:
                if len(line) < 2:
                    continue
                first_entry = line.strip().split(',')[0]
                # Split off only the two leading fields so that multi-word
                # names such as "great white shark" are kept intact.
                labels.append(first_entry.split(' ', 2)[2])
        return labels

    @staticmethod
    def softmax(x, epsilon=1e-6):
        """Return a numerically stable softmax over all elements of ``x``.

        Args:
            x (numpy.ndarray): Logits.
            epsilon (float): Small constant added to the numerator and the
                denominator.

        Returns:
            numpy.ndarray: Array of the same shape as ``x``.
        """
        # Shifting by the maximum keeps `exp` from overflowing on large
        # logits; the shift cancels out in the ratio.
        exp_x = np.exp(x - np.max(x))
        return (exp_x + epsilon) / (np.sum(exp_x) + epsilon)

    def postprocess(self, logits, topk=5):
        """Convert raw model outputs into the ``topk`` best predictions.

        Args:
            logits: Model outputs; squeezed to a 1-D score vector.
            topk (int): Number of predictions to return.

        Returns:
            tuple: ``(scores, labels)`` - two arrays of length ``topk``
            sorted by descending score.
        """
        pred = np.array(logits).squeeze()
        pred = self.softmax(pred)
        class_idx = pred.argsort()[::-1][:topk]
        return pred[class_idx], np.array(self.labels)[class_idx]
Benchmark + +## 1.1 Environment + +- We train the ViT on 1 node with 8 A100 GPUs or 4 nodes with 32 A100 GPUs. + +## 1.2 DataSet +- We train the ViT on ImageNet. + +## 1.3 Benchmark + + +| Model | Phase | Dataset | gpu | img/sec | Top1 Acc | Official | +| --- | --- | --- | --- | --- | --- | --- | +| ViT-B_16_224 |pretrain |ImageNet2012 |A100*N1C8 | 3583| 0.75196 | 0.7479 | +| ViT-B_16_384 |finetune | ImageNet2012 | A100*N1C8 | 719 | 0.77972 | 0.7791 | +| ViT-L_16_224 | pretrain | ImageNet21K | A100*N4C32 | 5256 | - | - | +|ViT-L_16_384 |finetune | ImageNet2012 | A100*N4C32 | 934 | 0.85030 | 0.8505 | + +# 2. Reference + +https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/vit/README.md diff --git a/modelcenter/PLSC-ViT/download_cn.md b/modelcenter/PLSC-ViT/download_cn.md new file mode 100644 index 00000000..c7353cd5 --- /dev/null +++ b/modelcenter/PLSC-ViT/download_cn.md @@ -0,0 +1,8 @@ +# 模型列表 + +|模型名称|模型简介|模型配置|预训练checkpoint下载地址| +| --- | --- | --- | --- | +| ViT-B_16_224 |输入size为224,layers=12|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml) |[download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet2012-ViT-B_16-224.pdparams) | +| ViT-B_16_384 |输入size为384,layers=12|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_base_patch16_384_ft_in1k_1n8c_dp_fp16o2.yaml)| [download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet2012-ViT-B_16-224.pdparams) | +| ViT-L_16_224 |输入size为224,layers=24|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_large_patch16_224_in21k_4n32c_dp_fp16o2.yaml)| [download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet21k-ViT-L_16-224.pdparams) | +| ViT-L_16_384 |输入size为384,layers=24|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_large_patch16_384_in1k_ft_4n32c_dp_fp16o2.yaml) 
| [download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet21k-ViT-L_16-224.pdparams) | diff --git a/modelcenter/PLSC-ViT/download_en.md b/modelcenter/PLSC-ViT/download_en.md new file mode 100644 index 00000000..1bdbc40e --- /dev/null +++ b/modelcenter/PLSC-ViT/download_en.md @@ -0,0 +1,8 @@ +# Model List + +|Model Name|Introduction|Config|Pretrained checkpoint Download| +| --- | --- | --- | --- | +| ViT-B_16_224 |input_size=224,layers=12|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml) |[download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet2012-ViT-B_16-224.pdparams) | +| ViT-B_16_384 |input_size=384,layers=12|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_base_patch16_384_ft_in1k_1n8c_dp_fp16o2.yaml)| [download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet2012-ViT-B_16-224.pdparams) | +| ViT-L_16_224 |input_size=224,layers=24|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_large_patch16_224_in21k_4n32c_dp_fp16o2.yaml)| [download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet21k-ViT-L_16-224.pdparams) | +| ViT-L_16_384 |input_size=384,layers=24|[config](https://github.com/PaddlePaddle/PLSC/blob/release/2.4/task/classification/vit/configs/ViT_large_patch16_384_in1k_ft_4n32c_dp_fp16o2.yaml) | [download](https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet21k-ViT-L_16-224.pdparams) | diff --git a/modelcenter/PLSC-ViT/info.yaml b/modelcenter/PLSC-ViT/info.yaml new file mode 100644 index 00000000..2f88d6b9 --- /dev/null +++ b/modelcenter/PLSC-ViT/info.yaml @@ -0,0 +1,30 @@ +--- +Model_Info: + name: "PLSC-ViT" + description: "PaddlePaddle 重新实现 Google 官方 Repo 中的 Vision Transformer 算法 《An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale》" + description_en: "PaddlePaddle reimplementation of Google's repository for the Vision 
Transformer model that was released with the paper An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale." + update_time: + icon: "https://plsc.bj.bcebos.com/assets/modelcenter-icon.png" + from_repo: "PLSC" +Task: +- tag: 计算机视觉 + tag_en: Computer Vision + sub_tag: 图像分类 + sub_tag_en: Image Classification +Example: +- tag: + tag_en: + sub_tag: + sub_tag_en: + title: + title_en: + url: + url_en: +Datasets: ImageNet 1K, ImageNet 21K +Publisher: Baidu +License: Apache 2.0 +Paper: +- title: "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" + url: https://arxiv.org/abs/2010.11929 +IfTraining: 1 +IfOnlineDemo: 1 diff --git a/modelcenter/PLSC-ViT/introduction_cn.ipynb b/modelcenter/PLSC-ViT/introduction_cn.ipynb new file mode 100644 index 00000000..58b99fba --- /dev/null +++ b/modelcenter/PLSC-ViT/introduction_cn.ipynb @@ -0,0 +1,210 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ae69ce68", + "metadata": {}, + "source": [ + "## 1. PLSC-ViT模型简介\n" + ] + }, + { + "cell_type": "markdown", + "id": "35485bc6", + "metadata": {}, + "source": [ + "PLSC-ViT实现了基于Transformer的视觉分类模型。ViT对图像进行切分成patch,之后基于patch拉平的sequence进行线性embedding,并且添加了position embeddings和classfication token,然后将patch序列输入到标准的transformer编码器,最终经过一个MLP进行分类。模型结构如下,\n", + "\n", + "![Figure 1 from paper](https://github.com/google-research/vision_transformer/raw/main/vit_figure.png)\n" + ] + }, + { + "cell_type": "markdown", + "id": "97e174e6", + "metadata": {}, + "source": [ + "## 2. 
模型效果 " + ] + }, + { + "cell_type": "markdown", + "id": "78137a72", + "metadata": {}, + "source": [ + "| Model | Phase | Dataset | gpu | img/sec | Top1 Acc | Official |\n", + "| --- | --- | --- | --- | --- | --- | --- |\n", + "| ViT-B_16_224 |pretrain |ImageNet2012 |A100*N1C8 | 3583| 0.75196 | 0.7479 |\n", + "| ViT-B_16_384 |finetune | ImageNet2012 | A100*N1C8 | 719 | 0.77972 | 0.7791 |\n", + "| ViT-L_16_224 | pretrain | ImageNet21K | A100*N4C32 | 5256 | - | - | |\n", + "|ViT-L_16_384 |finetune | ImageNet2012 | A100*N4C32 | 934 | 0.85030 | 0.8505 |" + ] + }, + { + "cell_type": "markdown", + "id": "ace3c48d", + "metadata": {}, + "source": [ + "## 3. 模型如何使用" + ] + }, + { + "cell_type": "markdown", + "id": "a97a5f56", + "metadata": {}, + "source": [ + "### 3.1 安装PLSC" + ] + }, + { + "cell_type": "markdown", + "id": "492fa769-2fe0-4220-b6d9-bbc32f8cca10", + "metadata": {}, + "source": [ + "```\n", + "git clone https://github.com/PaddlePaddle/PLSC.git\n", + "cd /path/to/PLSC/\n", + "# [optional] pip install -r requirements.txt\n", + "python setup.py develop\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "6b22824d", + "metadata": {}, + "source": [ + "### 3.2 模型训练" + ] + }, + { + "cell_type": "markdown", + "id": "d68ca5fb", + "metadata": {}, + "source": [ + "1. 进入任务目录\n", + "\n", + "```\n", + "cd task/classification/vit\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "9048df01", + "metadata": {}, + "source": [ + "2. 准备数据\n", + "\n", + "将数据整理成以下格式:\n", + "```text\n", + "dataset/\n", + "└── ILSVRC2012\n", + " ├── train\n", + " ├── val\n", + " ├── train_list.txt\n", + " └── val_list.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "bea743ea", + "metadata": {}, + "source": [ + "3. 
执行训练命令\n", + "\n", + "```shell\n", + "export PADDLE_NNODES=1\n", + "export PADDLE_MASTER=\"127.0.0.1:12538\"\n", + "export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + "python -m paddle.distributed.launch \\\n", + " --nnodes=$PADDLE_NNODES \\\n", + " --master=$PADDLE_MASTER \\\n", + " --devices=$CUDA_VISIBLE_DEVICES \\\n", + " plsc-train \\\n", + " -c ./configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml\n", + "```\n", + "\n", + "更多模型的训练教程可参考文档:[ViT训练文档](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/vit/README.md)" + ] + }, + { + "cell_type": "markdown", + "id": "186a0c17", + "metadata": {}, + "source": [ + "### 3.3 模型推理" + ] + }, + { + "cell_type": "markdown", + "id": "e97c527c", + "metadata": {}, + "source": [ + "1. 下载预训练模型\n", + "\n", + "```shell\n", + "mkdir -p pretrained/vit/ViT_base_patch16_224/\n", + "wget -O ./pretrained/vit/ViT_base_patch16_224/imagenet2012-ViT-B_16-224.pdparams https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet2012-ViT-B_16-224.pdparams\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "a07c6549", + "metadata": {}, + "source": [ + "2. 导出推理模型\n", + "\n", + "```shell\n", + "plsc-export -c ./configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml -o Global.pretrained_model=./pretrained/vit/ViT_base_patch16_224/imagenet2012-ViT-B_16-224 -o Model.data_format=NCHW -o FP16.level=O0\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "id": "d375934d", + "metadata": {}, + "source": [ + "## 4. 
相关论文及引用信息\n" + ] + }, + { + "cell_type": "markdown", + "id": "29f05b07-d323-45e4-b00d-0728eafb5af7", + "metadata": {}, + "source": [ + "```text\n", + "@article{dosovitskiy2020,\n", + " title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},\n", + " author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil},\n", + " journal={arXiv preprint arXiv:2010.11929},\n", + " year={2020}\n", + "}\n", + "```" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/modelcenter/PLSC-ViT/introduction_en.ipynb b/modelcenter/PLSC-ViT/introduction_en.ipynb new file mode 100644 index 00000000..c41199ed --- /dev/null +++ b/modelcenter/PLSC-ViT/introduction_en.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ae69ce68", + "metadata": {}, + "source": [ + "## 1. PLSC-ViT Introduction\n" + ] + }, + { + "cell_type": "markdown", + "id": "35485bc6", + "metadata": {}, + "source": [ + "PLSC-ViT reimplements Google's repository for the ViT model. The overview of the model is as follows. The input image is split into fixed-size patches, then linear projection and position embeddings are applied. The resulting sequence is fed into a standard Transformer encoder. In order to perform classification, an extra learnable \"classification token\" is added to the sequence, following the standard approach. 
\n", + "\n", + "![Figure 1 from paper](https://github.com/google-research/vision_transformer/raw/main/vit_figure.png)\n" + ] + }, + { + "cell_type": "markdown", + "id": "97e174e6", + "metadata": {}, + "source": [ + "## 2. Model Effects and Application Scenarios" + ] + }, + { + "cell_type": "markdown", + "id": "67ae978f", + "metadata": {}, + "source": [ + "| Model | Phase | Dataset | gpu | img/sec | Top1 Acc | Official |\n", + "| --- | --- | --- | --- | --- | --- | --- |\n", + "| ViT-B_16_224 |pretrain |ImageNet2012 |A100*N1C8 | 3583| 0.75196 | 0.7479 |\n", + "| ViT-B_16_384 |finetune | ImageNet2012 | A100*N1C8 | 719 | 0.77972 | 0.7791 |\n", + "| ViT-L_16_224 | pretrain | ImageNet21K | A100*N4C32 | 5256 | - | - | |\n", + "|ViT-L_16_384 |finetune | ImageNet2012 | A100*N4C32 | 934 | 0.85030 | 0.8505 |" + ] + }, + { + "cell_type": "markdown", + "id": "ace3c48d", + "metadata": {}, + "source": [ + "## 3. How to use the Model" + ] + }, + { + "cell_type": "markdown", + "id": "186a0c17", + "metadata": {}, + "source": [ + "### 3.1 Install PLSC\n", + "\n", + "```shell\n", + "git clone https://github.com/PaddlePaddle/PLSC.git\n", + "cd /path/to/PLSC/\n", + "# [optional] pip install -r requirements.txt\n", + "python setup.py develop\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "6b22824d", + "metadata": {}, + "source": [ + "### 3.2 Model Training" + ] + }, + { + "cell_type": "markdown", + "id": "a562bf23", + "metadata": {}, + "source": [ + "1. Enter into the task directory\n", + "\n", + "```shell\n", + "cd task/classification/vit\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "de109245", + "metadata": {}, + "source": [ + "2. 
Prepare the data\n", + "\n", + "Organize the data into the following format:\n", + "\n", + "```text\n", + "dataset/\n", + "└── ILSVRC2012\n", + " ├── train\n", + " ├── val\n", + " ├── train_list.txt\n", + " └── val_list.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "ec78efdf", + "metadata": {}, + "source": [ + "3. Run the command\n", + "\n", + "```shell\n", + "export PADDLE_NNODES=1\n", + "export PADDLE_MASTER=\"127.0.0.1:12538\"\n", + "export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + "python -m paddle.distributed.launch \\\n", + " --nnodes=$PADDLE_NNODES \\\n", + " --master=$PADDLE_MASTER \\\n", + " --devices=$CUDA_VISIBLE_DEVICES \\\n", + " plsc-train \\\n", + " -c ./configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml\n", + "```\n", + "\n", + "More courses about model training can be learned here [ViT readme](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/vit/README.md)" + ] + }, + { + "cell_type": "markdown", + "id": "05ba38c3", + "metadata": {}, + "source": [ + "### 3.3 Model Inference" + ] + }, + { + "cell_type": "markdown", + "id": "7a3ce1ab", + "metadata": {}, + "source": [ + "1. Download pretrained model\n", + "\n", + "```shell\n", + "mkdir -p pretrained/vit/ViT_base_patch16_224/\n", + "wget -O ./pretrained/vit/ViT_base_patch16_224/imagenet2012-ViT-B_16-224.pdparams https://plsc.bj.bcebos.com/models/vit/v2.4/imagenet2012-ViT-B_16-224.pdparams\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "cff5ac83", + "metadata": {}, + "source": [ + "2. Export model for inference\n", + "\n", + "```shell\n", + "plsc-export -c ./configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml -o Global.pretrained_model=./pretrained/vit/ViT_base_patch16_224/imagenet2012-ViT-B_16-224 -o Model.data_format=NCHW -o FP16.level=O0\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "d375934d", + "metadata": {}, + "source": [ + "## 4. 
Related papers and citations\n", + "\n", + "```text\n", + "@article{dosovitskiy2020,\n", + " title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},\n", + " author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil},\n", + " journal={arXiv preprint arXiv:2010.11929},\n", + " year={2020}\n", + "}\n", + "```" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- GitLab