From c7e73758b55a5689dcb701c97748cd15b551505e Mon Sep 17 00:00:00 2001
From: qizhaoaoe <10208099+qizhaoaoe@users.noreply.github.com>
Date: Thu, 2 Mar 2023 16:01:10 +0800
Subject: [PATCH] feat: add swin-transformer (#5727)

* feat: add swin-transformer

* fix repo

* refine docs

* fix: update resize method and image, add inference codes in docs.
---
 .../PLSC-SwinTransformer/APP/__init__.py      |   0
 modelcenter/PLSC-SwinTransformer/APP/app.py   |  95 ++++++
 modelcenter/PLSC-SwinTransformer/APP/app.yaml |  11 +
 .../PLSC-SwinTransformer/APP/download.py      | 229 ++++++++++++++
 .../PLSC-SwinTransformer/APP/requirements.txt |  12 +
 .../PLSC-SwinTransformer/benchmark_cn.md      |  22 ++
 .../PLSC-SwinTransformer/benchmark_en.md      |  22 ++
 .../PLSC-SwinTransformer/download_cn.md       |   6 +
 .../PLSC-SwinTransformer/download_en.md       |   6 +
 modelcenter/PLSC-SwinTransformer/info.yaml    |  30 ++
 .../introduction_cn.ipynb                     | 276 +++++++++++++++++
 .../introduction_en.ipynb                     | 284 ++++++++++++++++++
 12 files changed, 993 insertions(+)
 create mode 100644 modelcenter/PLSC-SwinTransformer/APP/__init__.py
 create mode 100644 modelcenter/PLSC-SwinTransformer/APP/app.py
 create mode 100644 modelcenter/PLSC-SwinTransformer/APP/app.yaml
 create mode 100644 modelcenter/PLSC-SwinTransformer/APP/download.py
 create mode 100644 modelcenter/PLSC-SwinTransformer/APP/requirements.txt
 create mode 100644 modelcenter/PLSC-SwinTransformer/benchmark_cn.md
 create mode 100644 modelcenter/PLSC-SwinTransformer/benchmark_en.md
 create mode 100644 modelcenter/PLSC-SwinTransformer/download_cn.md
 create mode 100644 modelcenter/PLSC-SwinTransformer/download_en.md
 create mode 100644 modelcenter/PLSC-SwinTransformer/info.yaml
 create mode 100644 modelcenter/PLSC-SwinTransformer/introduction_cn.ipynb
 create mode 100644 modelcenter/PLSC-SwinTransformer/introduction_en.ipynb

diff --git a/modelcenter/PLSC-SwinTransformer/APP/__init__.py b/modelcenter/PLSC-SwinTransformer/APP/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelcenter/PLSC-SwinTransformer/APP/app.py b/modelcenter/PLSC-SwinTransformer/APP/app.py
new file mode 100644
index 00000000..de70268a
--- /dev/null
+++ b/modelcenter/PLSC-SwinTransformer/APP/app.py
@@ -0,0 +1,95 @@
+import numpy as np
+import gradio as gr
+
+from download import get_model_path, get_data_path
+
+from plsc.data.preprocess import Resize
+from plsc.engine.inference import Predictor
+
+predictor = None
+
+
+def model_inference(image):
+    global predictor
+
+    if predictor is None:
+
+        model_path = "paddlecv://models/swin/v2.5/swin_base_patch4_window7_224_infer.pdmodel"
+        params_path = "paddlecv://models/swin/v2.5/swin_base_patch4_window7_224_infer.pdiparams"
+        label_path = "paddlecv://dataset/imagenet2012_labels.txt"
+        infer_model = get_model_path(model_path)
+        infer_params = get_model_path(params_path)
+
+        def parse_labels(label_path):
+            labels = []
+            with open(label_path, 'r') as f:
+                for line in f:
+                    if len(line) < 2:
+                        continue
+                    label = line.strip().split(',')[1]
+                    labels.append(label)
+            return np.array(labels)
+
+        labels = parse_labels(get_data_path(label_path))
+
+        def preprocess(img):
+            resize = Resize(size=224, interpolation="bicubic", backend="pil")
+            img = np.array(resize(img))
+            scale = 1.0 / 255.0
+            mean = np.array([0.485, 0.456, 0.406])
+            std = np.array([0.229, 0.224, 0.225])
+            img = (img * scale - mean) / std
+            img = img[np.newaxis, :, :, :]
+            img = img.transpose((0, 3, 1, 2))
+            return {'x': img.astype('float32')}
+
+        def postprocess(logits):
+            def softmax(x, epsilon=1e-6):
+                exp_x = np.exp(x)
+                sfm = (exp_x + epsilon) / (np.sum(exp_x) + epsilon)
+                return sfm
+
+            pred = np.array(logits).squeeze()
+            pred = softmax(pred)
+            class_idx = pred.argsort()[::-1]
+            class_idx_top5 = class_idx[:5]
+            return class_idx_top5, pred[class_idx_top5], labels[class_idx_top5]
+
+        predictor = Predictor(
+            model_file=infer_model,
+            params_file=infer_params,
+            preprocess_fn=preprocess,
+            postprocess_fn=postprocess)
+
+    class_ids, scores, classes = predictor.predict(image)
+    json_out = {
+        "class_ids": class_ids.tolist(),
+        "scores": scores.tolist(),
+        "labels": classes.tolist()
+    }
+    return image, json_out
+
+
+def clear_all():
+    return None, None, None
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("Classification based on SwinTransformer")
+
+    with gr.Column(scale=1, min_width=100):
+        img_in = gr.Image(
+            value="https://plsc.bj.bcebos.com/dataset/test_images/zebra.png",
+            label="Input").style(height=200)
+
+        with gr.Row():
+            btn1 = gr.Button("Clear")
+            btn2 = gr.Button("Submit")
+
+        img_out = gr.Image(label="Output").style(height=200)
+        json_out = gr.JSON(label="jsonOutput")
+
+    btn2.click(fn=model_inference, inputs=img_in, outputs=[img_out, json_out])
+    btn1.click(fn=clear_all, inputs=None, outputs=[img_in, img_out, json_out])
+
+demo.launch()
diff --git a/modelcenter/PLSC-SwinTransformer/APP/app.yaml b/modelcenter/PLSC-SwinTransformer/APP/app.yaml
new file mode 100644
index 00000000..6e4b3057
--- /dev/null
+++ b/modelcenter/PLSC-SwinTransformer/APP/app.yaml
@@ -0,0 +1,11 @@
+【PLSC-SwinTransformer-App-YAML】
+
+APP_Info:
+  title: PLSC-SwinTransformer-App
+  colorFrom: blue
+  colorTo: yellow
+  sdk: gradio
+  sdk_version: 3.9.1
+  app_file: app.py
+  license: apache-2.0
+  device: cpu
\ No newline at end of file
diff --git a/modelcenter/PLSC-SwinTransformer/APP/download.py b/modelcenter/PLSC-SwinTransformer/APP/download.py
new file mode 100644
index 00000000..648d568e
--- /dev/null
+++ b/modelcenter/PLSC-SwinTransformer/APP/download.py
@@ -0,0 +1,229 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import os.path as osp
+import sys
+import yaml
+import time
+import shutil
+import requests
+import tqdm
+import hashlib
+import base64
+import binascii
+import tarfile
+import zipfile
+
+__all__ = [
+    'get_model_path',
+    'get_config_path',
+    'get_dict_path',
+    'get_data_path',
+]
+
+WEIGHTS_HOME = osp.expanduser("~/.cache/paddlecv/models/plsc")
+CONFIGS_HOME = osp.expanduser("~/.cache/paddlecv/configs/plsc")
+DICTS_HOME = osp.expanduser("~/.cache/paddlecv/dicts/plsc/")
+DATA_HOME = osp.expanduser("~/.cache/paddlecv/dataset/plsc")
+# dict of {dataset_name: (download_info, sub_dirs)}
+# download info: [(url, md5sum)]
+
+DOWNLOAD_RETRY_LIMIT = 3
+
+PMP_DOWNLOAD_URL_PREFIX = 'https://plsc.bj.bcebos.com/'
+
+
+def is_url(path):
+    """
+    Whether path is a URL.
+    Args:
+        path (str): a URL string or a local path.
+    """
+    return path.startswith('http://') \
+        or path.startswith('https://') \
+        or path.startswith('paddlecv://')
+
+
+def parse_url(url):
+    url = url.replace("paddlecv://", PMP_DOWNLOAD_URL_PREFIX)
+    return url
+
+
+def get_model_path(path):
+    """Get model path from WEIGHTS_HOME; if it does not exist,
+    download it from url.
+    """
+    if not is_url(path):
+        return path
+    url = parse_url(path)
+    path, _ = get_path(url, WEIGHTS_HOME, path_depth=3)
+    return path
+
+
+def get_data_path(path):
+    """Get data path from DATA_HOME; if it does not exist,
+    download it from url.
+    """
+    if not is_url(path):
+        return path
+    url = parse_url(path)
+    path, _ = get_path(url, DATA_HOME, path_depth=1)
+    return path
+
+
+def get_config_path(path):
+    """Get config path from CONFIGS_HOME; if it does not exist,
+    download it from url.
+    """
+    if not is_url(path):
+        return path
+    url = parse_url(path)
+    path, _ = get_path(url, CONFIGS_HOME)
+    return path
+
+
+def get_dict_path(path):
+    """Get dict path from DICTS_HOME; if it does not exist,
+    download it from url.
+    """
+    if not is_url(path):
+        return path
+    url = parse_url(path)
+    path, _ = get_path(url, DICTS_HOME)
+    return path
+
+
+def map_path(url, root_dir, path_depth=1):
+    # parse path after download to decompress under root_dir
+    assert path_depth > 0, "path_depth should be a positive integer"
+    dirname = url
+    for _ in range(path_depth):
+        dirname = osp.dirname(dirname)
+    fpath = osp.relpath(url, dirname)
+    path = osp.join(root_dir, fpath)
+    dirname = osp.dirname(path)
+    return path, dirname
+
+
+def get_path(url, root_dir, md5sum=None, check_exist=True, path_depth=1):
+    """Download from the given url to root_dir.
+    If the file or directory specified by url already exists under
+    root_dir, return the path directly; otherwise download it from
+    url and return the path.
+    url (str): download url
+    root_dir (str): root dir for downloading, it should be
+        WEIGHTS_HOME
+    md5sum (str): md5 sum of the download package
+    """
+    # parse path after download to decompress under root_dir
+    fullpath, dirname = map_path(url, root_dir, path_depth)
+
+    if osp.exists(fullpath) and check_exist:
+        if not osp.isfile(fullpath) or \
+                _check_exist_file_md5(fullpath, md5sum, url):
+            return fullpath, True
+        else:
+            os.remove(fullpath)
+
+    fullname = _download(url, dirname, md5sum)
+    return fullpath, False
+
+
+def _download(url, path, md5sum=None):
+    """
+    Download from url, save to path.
+    url (str): download url
+    path (str): download to given path
+    """
+    if not osp.exists(path):
+        os.makedirs(path)
+
+    fname = osp.split(url)[-1]
+    fullname = osp.join(path, fname)
+    retry_cnt = 0
+
+    while not (osp.exists(fullname) and _check_exist_file_md5(fullname, md5sum,
+                                                              url)):
+        if retry_cnt < DOWNLOAD_RETRY_LIMIT:
+            retry_cnt += 1
+        else:
+            raise RuntimeError("Download from {} failed. "
" + "Retry limit reached".format(url)) + + # NOTE: windows path join may incur \, which is invalid in url + if sys.platform == "win32": + url = url.replace('\\', '/') + + req = requests.get(url, stream=True) + if req.status_code != 200: + raise RuntimeError("Downloading from {} failed with code " + "{}!".format(url, req.status_code)) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + for chunk in tqdm.tqdm( + req.iter_content(chunk_size=1024), + total=(int(total_size) + 1023) // 1024, + unit='KB'): + f.write(chunk) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + return fullname + + +def _check_exist_file_md5(filename, md5sum, url): + # if md5sum is None, and file to check is model file, + # read md5um from url and check, else check md5sum directly + return _md5check_from_url(filename, url) if md5sum is None \ + and filename.endswith('pdparams') \ + else _md5check(filename, md5sum) + + +def _md5check_from_url(filename, url): + # For model in bcebos URLs, MD5 value is contained + # in request header as 'content_md5' + req = requests.get(url, stream=True) + content_md5 = req.headers.get('content-md5') + req.close() + if not content_md5 or _md5check( + filename, + binascii.hexlify(base64.b64decode(content_md5.strip('"'))).decode( + )): + return True + else: + return False + + +def _md5check(fullname, md5sum=None): + if md5sum is None: + return True + + md5 = hashlib.md5() + with open(fullname, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + return False + return True diff --git a/modelcenter/PLSC-SwinTransformer/APP/requirements.txt b/modelcenter/PLSC-SwinTransformer/APP/requirements.txt new file mode 100644 index 00000000..4ea1cb70 --- /dev/null +++ b/modelcenter/PLSC-SwinTransformer/APP/requirements.txt @@ -0,0 +1,12 @@ +plsc==2.4 +gradio +opencv-python +paddlepaddle +PyYAML +shapely +scipy +Cython +numpy +setuptools +pillow +tqdm \ No newline at end of file diff --git a/modelcenter/PLSC-SwinTransformer/benchmark_cn.md b/modelcenter/PLSC-SwinTransformer/benchmark_cn.md new file mode 100644 index 00000000..5c462729 --- /dev/null +++ b/modelcenter/PLSC-SwinTransformer/benchmark_cn.md @@ -0,0 +1,22 @@ +# 1. Benchmark + +## 1.1 软硬件环境 + +* 单机 8卡 A100(40G) +* CUDA 11.2 +* CUDNN 8.1 + +## 1.2 数据集 +- 测试使用的数据集为ImageNet. + +## 1.3 指标 + + +| Model |DType | Phase | Dataset | gpu | img/sec | Top1 Acc | Official | +| --- | --- | --- | --- | --- | --- | --- | --- | +| Swin-B |FP16 O1|pretrain |ImageNet2012 |A100*N1C8 | 2155| 0.83362 | 0.835 | +| Swin-B |FP16 O2|pretrain | ImageNet2012 | A100*N1C8 | 3006 | 0.83223 | 0.835 | + +# 2. 相关使用说明 + +https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/README.md diff --git a/modelcenter/PLSC-SwinTransformer/benchmark_en.md b/modelcenter/PLSC-SwinTransformer/benchmark_en.md new file mode 100644 index 00000000..bdc0e3d1 --- /dev/null +++ b/modelcenter/PLSC-SwinTransformer/benchmark_en.md @@ -0,0 +1,22 @@ +# 1. Benchmark + +## 1.1 Environment + +* 8 A100(40G) on single Node +* CUDA 11.2 +* CUDNN 8.1 + +## 1.2 DataSet +- We train the Swin Transformer on ImageNet. 
+
+## 1.3 Benchmark
+
+
+| Model | DType | Phase | Dataset | GPU | img/sec | Top1 Acc | Official |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| Swin-B | FP16 O1 | pretrain | ImageNet2012 | A100*N1C8 | 2155 | 0.83362 | 0.835 |
+| Swin-B | FP16 O2 | pretrain | ImageNet2012 | A100*N1C8 | 3006 | 0.83223 | 0.835 |
+
+# 2. Reference
+
+https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/README.md
diff --git a/modelcenter/PLSC-SwinTransformer/download_cn.md b/modelcenter/PLSC-SwinTransformer/download_cn.md
new file mode 100644
index 00000000..56566ddf
--- /dev/null
+++ b/modelcenter/PLSC-SwinTransformer/download_cn.md
@@ -0,0 +1,6 @@
+# 模型列表
+
+|模型名称|模型简介|模型配置|预训练checkpoint下载地址|
+| --- | --- | --- | --- |
+| Swin-B |输入size为224,patch=4, FP16-O1|[config](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o1.yaml) |[download](https://plsc.bj.bcebos.com/models/swin/v2.5/swin_base_patch4_window7_224_fp16o1.pdparams) |
+| Swin-B |输入size为224,patch=4, FP16-O2|[config](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o2.yaml)| [download](https://plsc.bj.bcebos.com/models/swin/v2.5/swin_base_patch4_window7_224_fp16o2.pdparams) |
diff --git a/modelcenter/PLSC-SwinTransformer/download_en.md b/modelcenter/PLSC-SwinTransformer/download_en.md
new file mode 100644
index 00000000..047f01be
--- /dev/null
+++ b/modelcenter/PLSC-SwinTransformer/download_en.md
@@ -0,0 +1,6 @@
+# Model List
+
+|Model Name| Introduction |Config|Pretrained checkpoint Download|
+| --- |---------------------------------| --- | --- |
+| Swin-B | input size=224, patch=4, FP16-O1 |[config](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o1.yaml) |[download](https://plsc.bj.bcebos.com/models/swin/v2.5/swin_base_patch4_window7_224_fp16o1.pdparams) |
+| Swin-B | input size=224, patch=4, FP16-O2 |[config](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o2.yaml)| [download](https://plsc.bj.bcebos.com/models/swin/v2.5/swin_base_patch4_window7_224_fp16o2.pdparams) |
diff --git a/modelcenter/PLSC-SwinTransformer/info.yaml b/modelcenter/PLSC-SwinTransformer/info.yaml
new file mode 100644
index 00000000..0c12a011
--- /dev/null
+++ b/modelcenter/PLSC-SwinTransformer/info.yaml
@@ -0,0 +1,30 @@
+---
+Model_Info:
+  name: "PLSC-SwinTransformer"
+  description: "PaddlePaddle 重新实现 Microsoft 官方 Repo 中的 Swin Transformer 算法 《Swin Transformer: Hierarchical Vision Transformer using Shifted Windows》"
+  description_en: "PaddlePaddle reimplementation of Microsoft's repository for the Swin Transformer model that was released with the paper Swin Transformer: Hierarchical Vision Transformer using Shifted Windows."
+  update_time:
+  icon: "https://plsc.bj.bcebos.com/assets/modelcenter-icon.png"
+  from_repo: "PLSC"
+Task:
+- tag: 计算机视觉
+  tag_en: Computer Vision
+  sub_tag: 图像分类
+  sub_tag_en: Image Classification
+Example:
+- tag:
+  tag_en:
+  sub_tag:
+  sub_tag_en:
+  title:
+  title_en:
+  url:
+  url_en:
+Datasets: ImageNet 1K
+Publisher: Baidu
+License: Apache 2.0
+Paper:
+- title: "Swin Transformer: Hierarchical Vision Transformer using Shifted Windows"
+  url: https://arxiv.org/pdf/2103.14030.pdf
+IfTraining: 1
+IfOnlineDemo: 1
\ No newline at end of file
diff --git a/modelcenter/PLSC-SwinTransformer/introduction_cn.ipynb b/modelcenter/PLSC-SwinTransformer/introduction_cn.ipynb
new file mode 100644
index 00000000..42de5a59
--- /dev/null
+++ b/modelcenter/PLSC-SwinTransformer/introduction_cn.ipynb
@@ -0,0 +1,276 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ae69ce68",
+   "metadata": {},
+   "source": [
+    "## 1. PLSC-SwinTransformer模型简介\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "35485bc6",
+   "metadata": {},
+   "source": [
+    "PLSC-SwinTransformer实现了基于[Swin Transformer](https://github.com/microsoft/Swin-Transformer)的视觉分类模型。Swin Transformer是一个层级结构的Vision Transformer(ViT),Swin代表的是移位窗口(Shifted Windows)。与ViT不同,Swin基于非重叠的局部窗口计算自注意力,并且跨窗口进行连接以保证窗口间信息共享,因此Swin Transformer相比于基于全局自注意力的ViT更高效。Swin Transformer可以作为CV领域的一个通用的backbone。模型结构如下:\n",
+    "\n",
+    "![Figure 1 from paper](https://github.com/microsoft/Swin-Transformer/blob/main/figures/teaser.png?raw=true)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "97e174e6",
+   "metadata": {},
+   "source": [
+    "## 2. 模型效果"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "78137a72",
+   "metadata": {},
+   "source": [
+    "| Model | DType | Phase | Dataset | GPU | img/sec | Top1 Acc | Official |\n",
+    "| --- | --- | --- | --- | --- | --- | --- | --- |\n",
+    "| Swin-B | FP16 O1 | pretrain | ImageNet2012 | A100*N1C8 | 2155 | 0.83362 | 0.835 |\n",
+    "| Swin-B | FP16 O2 | pretrain | ImageNet2012 | A100*N1C8 | 3006 | 0.83223 | 0.835 |\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ace3c48d",
+   "metadata": {},
+   "source": [
+    "## 3. 模型如何使用"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a97a5f56",
+   "metadata": {},
+   "source": [
+    "### 3.1 安装PLSC"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "492fa769-2fe0-4220-b6d9-bbc32f8cca10",
+   "metadata": {},
+   "source": [
+    "```\n",
+    "git clone https://github.com/PaddlePaddle/PLSC.git\n",
+    "cd /path/to/PLSC/\n",
+    "# [optional] pip install -r requirements.txt\n",
+    "python setup.py develop\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6b22824d",
+   "metadata": {},
+   "source": [
+    "### 3.2 模型训练"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d68ca5fb",
+   "metadata": {},
+   "source": [
+    "1. 进入任务目录\n",
+    "\n",
+    "```\n",
+    "cd task/classification/swin\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9048df01",
+   "metadata": {},
+   "source": [
+    "2. 准备数据\n",
+    "\n",
+    "将数据整理成以下格式:\n",
+    "```text\n",
+    "dataset/\n",
+    "└── ILSVRC2012\n",
+    "    ├── train\n",
+    "    ├── val\n",
+    "    ├── train_list.txt\n",
+    "    └── val_list.txt\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bea743ea",
+   "metadata": {},
+   "source": [
+    "3. 执行训练命令\n",
+    "\n",
+    "```shell\n",
+    "export PADDLE_NNODES=1\n",
+    "export PADDLE_MASTER=\"127.0.0.1:12538\"\n",
+    "export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n",
+    "\n",
+    "python -m paddle.distributed.launch \\\n",
+    "    --nnodes=$PADDLE_NNODES \\\n",
+    "    --master=$PADDLE_MASTER \\\n",
+    "    --devices=$CUDA_VISIBLE_DEVICES \\\n",
+    "    plsc-train \\\n",
+    "    -c ./configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o1.yaml\n",
+    "```\n",
+    "\n",
+    "更多模型的训练教程可参考文档:[Swin训练文档](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/README.md)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "186a0c17",
+   "metadata": {},
+   "source": [
+    "### 3.3 模型推理"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e97c527c",
+   "metadata": {},
+   "source": [
+    "1. 下载预训练模型和图片\n",
+    "\n",
+    "```shell\n",
+    "# download pretrained model\n",
+    "mkdir -p pretrained/swin/Swin_base/\n",
+    "wget -O ./pretrained/swin/Swin_base/swin_base_patch4_window7_224_fp16o1.pdparams \\\n",
+    "    https://plsc.bj.bcebos.com/models/swin/v2.5/swin_base_patch4_window7_224_fp16o1.pdparams\n",
+    "\n",
+    "# download image\n",
+    "mkdir -p images/\n",
+    "wget -O ./images/zebra.png https://plsc.bj.bcebos.com/dataset/test_images/zebra.png\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a07c6549",
+   "metadata": {},
+   "source": [
+    "2. 导出推理模型\n",
+    "\n",
+    "```shell\n",
+    "plsc-export -c ./configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o1.yaml -o Global.pretrained_model=./pretrained/swin/Swin_base/swin_base_patch4_window7_224_fp16o1 -o Model.data_format=NCHW -o FP16.level=O0\n",
+    "```\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3ded8e73-3dba-49ce-bfb3-fcf7f3f0fc1d",
+   "metadata": {},
+   "source": [
+    "3. 图片预测"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9533d4df-acb3-474f-b591-f210639a0a02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "from plsc.data.dataset import default_loader\n",
+    "from plsc.data.preprocess import Resize\n",
+    "from plsc.engine.inference import Predictor\n",
+    "\n",
+    "\n",
+    "def preprocess(img):\n",
+    "    resize = Resize(size=224,\n",
+    "                    interpolation=\"bicubic\",\n",
+    "                    backend=\"pil\")\n",
+    "    img = np.array(resize(img))\n",
+    "    scale = 1.0 / 255.0\n",
+    "    mean = np.array([0.485, 0.456, 0.406])\n",
+    "    std = np.array([0.229, 0.224, 0.225])\n",
+    "    img = (img * scale - mean) / std\n",
+    "    img = img[np.newaxis, :, :, :]\n",
+    "    img = img.transpose((0, 3, 1, 2))\n",
+    "    return {'x': img.astype('float32')}\n",
+    "\n",
+    "\n",
+    "def postprocess(logits):\n",
+    "\n",
+    "    def softmax(x, epsilon=1e-6):\n",
+    "        exp_x = np.exp(x)\n",
+    "        sfm = (exp_x + epsilon) / (np.sum(exp_x) + epsilon)\n",
+    "        return sfm\n",
+    "\n",
+    "    pred = np.array(logits).squeeze()\n",
+    "    pred = softmax(pred)\n",
+    "    pred_class_idx = pred.argsort()[::-1][0]\n",
+    "    return pred_class_idx, pred[pred_class_idx]\n",
+    "\n",
+    "\n",
+    "infer_model = \"./output/swin_base_patch4_window7_224/swin_base_patch4_window7_224.pdmodel\"\n",
+    "infer_params = \"./output/swin_base_patch4_window7_224/swin_base_patch4_window7_224.pdiparams\"\n",
+    "\n",
+    "predictor = Predictor(\n",
+    "    model_file=infer_model,\n",
+    "    params_file=infer_params,\n",
+    "    preprocess_fn=preprocess,\n",
+    "    postprocess_fn=postprocess)\n",
+    "\n",
+    "image = default_loader(\"./images/zebra.png\")\n",
+    "pred_class_idx, pred_score = predictor.predict(image)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d375934d",
+   "metadata": {},
+   "source": [
+    "## 4. 相关论文及引用信息\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "29f05b07-d323-45e4-b00d-0728eafb5af7",
+   "metadata": {},
+   "source": [
+    "```text\n",
+    "@inproceedings{liu2021Swin,\n",
+    "  title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows},\n",
+    "  author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},\n",
+    "  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},\n",
+    "  year={2021}\n",
+    "}\n",
+    "```"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/modelcenter/PLSC-SwinTransformer/introduction_en.ipynb b/modelcenter/PLSC-SwinTransformer/introduction_en.ipynb
new file mode 100644
index 00000000..ef266f4a
--- /dev/null
+++ b/modelcenter/PLSC-SwinTransformer/introduction_en.ipynb
@@ -0,0 +1,284 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ae69ce68",
+   "metadata": {},
+   "source": [
+    "## 1. PLSC-SwinTransformer Introduction\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "35485bc6",
+   "metadata": {},
+   "source": [
+    "PLSC-SwinTransformer is a PaddlePaddle reimplementation of [Microsoft's repository for the Swin Transformer](https://github.com/microsoft/Swin-Transformer) model that was released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/pdf/2103.14030.pdf).\n",
+    "\n",
+    "Swin Transformer (the name Swin stands for Shifted windows) capably serves as a general-purpose backbone for computer vision. It is basically a hierarchical Transformer whose representation is computed with shifted windows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping local windows while also allowing for cross-window connection.\n",
+    "\n",
+    "![Figure 1 from paper](https://github.com/microsoft/Swin-Transformer/blob/main/figures/teaser.png?raw=true)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "97e174e6",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## 2. Model Effects"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "78137a72",
+   "metadata": {},
+   "source": [
+    "| Model | DType | Phase | Dataset | GPU | img/sec | Top1 Acc | Official |\n",
+    "| --- | --- | --- | --- | --- | --- | --- | --- |\n",
+    "| Swin-B | FP16 O1 | pretrain | ImageNet2012 | A100*N1C8 | 2155 | 0.83362 | 0.835 |\n",
+    "| Swin-B | FP16 O2 | pretrain | ImageNet2012 | A100*N1C8 | 3006 | 0.83223 | 0.835 |\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ace3c48d",
+   "metadata": {},
+   "source": [
+    "## 3. How to use the Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a97a5f56",
+   "metadata": {},
+   "source": [
+    "### 3.1 Install PLSC"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "492fa769-2fe0-4220-b6d9-bbc32f8cca10",
+   "metadata": {},
+   "source": [
+    "```\n",
+    "git clone https://github.com/PaddlePaddle/PLSC.git\n",
+    "cd /path/to/PLSC/\n",
+    "# [optional] pip install -r requirements.txt\n",
+    "python setup.py develop\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6b22824d",
+   "metadata": {},
+   "source": [
+    "### 3.2 Model Training"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d68ca5fb",
+   "metadata": {},
+   "source": [
+    "1. Enter the task directory\n",
+    "\n",
+    "```\n",
+    "cd task/classification/swin\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9048df01",
+   "metadata": {},
+   "source": [
+    "2. Prepare the data\n",
+    "\n",
+    "Organize the data into the following format:\n",
+    "\n",
+    "\n",
+    "```text\n",
+    "dataset/\n",
+    "└── ILSVRC2012\n",
+    "    ├── train\n",
+    "    ├── val\n",
+    "    ├── train_list.txt\n",
+    "    └── val_list.txt\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bea743ea",
+   "metadata": {},
+   "source": [
+    "3. Run the training command\n",
+    "\n",
+    "```shell\n",
+    "export PADDLE_NNODES=1\n",
+    "export PADDLE_MASTER=\"127.0.0.1:12538\"\n",
+    "export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n",
+    "\n",
+    "python -m paddle.distributed.launch \\\n",
+    "    --nnodes=$PADDLE_NNODES \\\n",
+    "    --master=$PADDLE_MASTER \\\n",
+    "    --devices=$CUDA_VISIBLE_DEVICES \\\n",
+    "    plsc-train \\\n",
+    "    -c ./configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o1.yaml\n",
+    "```\n",
+    "\n",
+    "More tutorials on model training can be found here: [Swin](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/README.md)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "186a0c17",
+   "metadata": {},
+   "source": [
+    "### 3.3 Model Inference"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e97c527c",
+   "metadata": {},
+   "source": [
+    "1. Download the pretrained model and a test image\n",
+    "\n",
+    "```shell\n",
+    "# download pretrained model\n",
+    "mkdir -p pretrained/swin/Swin_base/\n",
+    "wget -O ./pretrained/swin/Swin_base/swin_base_patch4_window7_224_fp16o1.pdparams \\\n",
+    "    https://plsc.bj.bcebos.com/models/swin/v2.5/swin_base_patch4_window7_224_fp16o1.pdparams\n",
+    "\n",
+    "# download image\n",
+    "mkdir -p images/\n",
+    "wget -O ./images/zebra.png https://plsc.bj.bcebos.com/dataset/test_images/zebra.png\n",
+    "```\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a07c6549",
+   "metadata": {},
+   "source": [
+    "2. Export the model for inference\n",
+    "\n",
+    "```shell\n",
+    "plsc-export -c ./configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o1.yaml -o Global.pretrained_model=./pretrained/swin/Swin_base/swin_base_patch4_window7_224_fp16o1 -o Model.data_format=NCHW -o FP16.level=O0\n",
+    "```\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e92efe35-ea6d-4aee-9a4d-a2c79f40f473",
+   "metadata": {},
+   "source": [
+    "3. Image inference"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22f4a080-ad97-4e00-a9fa-697601f579ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "from plsc.data.dataset import default_loader\n",
+    "from plsc.data.preprocess import Resize\n",
+    "from plsc.engine.inference import Predictor\n",
+    "\n",
+    "\n",
+    "def preprocess(img):\n",
+    "    resize = Resize(size=224,\n",
+    "                    interpolation=\"bicubic\",\n",
+    "                    backend=\"pil\")\n",
+    "    img = np.array(resize(img))\n",
+    "    scale = 1.0 / 255.0\n",
+    "    mean = np.array([0.485, 0.456, 0.406])\n",
+    "    std = np.array([0.229, 0.224, 0.225])\n",
+    "    img = (img * scale - mean) / std\n",
+    "    img = img[np.newaxis, :, :, :]\n",
+    "    img = img.transpose((0, 3, 1, 2))\n",
+    "    return {'x': img.astype('float32')}\n",
+    "\n",
+    "\n",
+    "def postprocess(logits):\n",
+    "\n",
+    "    def softmax(x, epsilon=1e-6):\n",
+    "        exp_x = np.exp(x)\n",
+    "        sfm = (exp_x + epsilon) / (np.sum(exp_x) + epsilon)\n",
+    "        return sfm\n",
+    "\n",
+    "    pred = np.array(logits).squeeze()\n",
+    "    pred = softmax(pred)\n",
+    "    pred_class_idx = pred.argsort()[::-1][0]\n",
+    "    return pred_class_idx, pred[pred_class_idx]\n",
+    "\n",
+    "\n",
+    "infer_model = \"./output/swin_base_patch4_window7_224/swin_base_patch4_window7_224.pdmodel\"\n",
+    "infer_params = \"./output/swin_base_patch4_window7_224/swin_base_patch4_window7_224.pdiparams\"\n",
+    "\n",
+    "predictor = Predictor(\n",
+    "    model_file=infer_model,\n",
+    "    params_file=infer_params,\n",
+    "    preprocess_fn=preprocess,\n",
+    "    postprocess_fn=postprocess)\n",
+    "\n",
+    "image = default_loader(\"./images/zebra.png\")\n",
+    "pred_class_idx, pred_score = predictor.predict(image)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d375934d",
+   "metadata": {},
+   "source": [
+    "## 4. Related papers and citations"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "29f05b07-d323-45e4-b00d-0728eafb5af7",
+   "metadata": {},
+   "source": [
+    "```text\n",
+    "@inproceedings{liu2021Swin,\n",
+    "  title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows},\n",
+    "  author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},\n",
+    "  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},\n",
+    "  year={2021}\n",
+    "}\n",
+    "```"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--
GitLab
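A quick way to sanity-check the inference path this patch adds is to call `model_inference` from `APP/app.py` directly, without launching the Gradio UI. The sketch below is illustrative rather than part of the patch: it assumes it is run from `modelcenter/PLSC-SwinTransformer/APP/` with the packages in `requirements.txt` installed, network access to `plsc.bj.bcebos.com`, and that the `paddlecv://dataset/test_images/zebra.png` shorthand resolves to the demo zebra image via the URL-prefix mapping in `download.py`.

```python
# Hypothetical smoke test for APP/app.py; not part of the patch itself.
# Assumes: run from modelcenter/PLSC-SwinTransformer/APP/ with
# requirements.txt installed and network access to plsc.bj.bcebos.com.
from plsc.data.dataset import default_loader

from app import model_inference
from download import get_data_path

# parse_url() in download.py rewrites the paddlecv:// prefix to
# https://plsc.bj.bcebos.com/ and caches the file under ~/.cache/paddlecv/.
img_path = get_data_path("paddlecv://dataset/test_images/zebra.png")
image = default_loader(img_path)

# The first call lazily downloads the exported model and builds the Predictor;
# model_inference returns the input image plus a JSON-serializable dict with
# "class_ids", "scores", and "labels" keys.
_, result = model_inference(image)
print(result["labels"][0], result["scores"][0])
```

If the weights and label file download correctly, the top-1 label printed for the demo image should correspond to an ImageNet zebra class.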