未验证 提交 c7e73758 编写于 作者: Q qizhaoaoe 提交者: GitHub

feat: add swin-transformer (#5727)

* feat: add swin-transformer

* fix repo

* refine docs

* fix: update resize method and image, add inference codes in docs.
上级 cc2f20c4
import numpy as np
import gradio as gr
from download import get_model_path, get_data_path
from plsc.data.preprocess import Resize
from plsc.engine.inference import Predictor
predictor = None
def model_inference(image):
    """Classify ``image`` with the exported Swin-B model and report the top 5.

    The predictor is created on the first call and stored in the module-level
    ``predictor`` variable; later calls reuse it.

    Args:
        image: Input image as handed over by the Gradio ``Image`` component.

    Returns:
        A tuple ``(image, json_out)``: the unchanged input image and a dict
        with the keys ``"class_ids"``, ``"scores"`` and ``"labels"`` holding
        the five highest-scoring classes, best first.
    """
    global predictor

    if predictor is None:
        # Resolve (and, if necessary, download) the inference artifacts.
        infer_model = get_model_path(
            "paddlecv://models/swin/v2.5/swin_base_patch4_window7_224_infer.pdmodel"
        )
        infer_params = get_model_path(
            "paddlecv://models/swin/v2.5/swin_base_patch4_window7_224_infer.pdiparams"
        )

        def load_label_names(label_file):
            """Read the second comma-separated field of every non-trivial line."""
            with open(label_file, 'r') as handle:
                names = [
                    row.strip().split(',')[1] for row in handle
                    if len(row) >= 2
                ]
            return np.array(names)

        label_names = load_label_names(
            get_data_path("paddlecv://dataset/imagenet2012_labels.txt"))

        def preprocess(img):
            """Resize, normalize and batch one image for the predictor."""
            resizer = Resize(size=224, interpolation="bicubic", backend="pil")
            pixels = np.array(resizer(img))
            channel_mean = np.array([0.485, 0.456, 0.406])
            channel_std = np.array([0.229, 0.224, 0.225])
            normalized = (pixels * (1.0 / 255.0) - channel_mean) / channel_std
            # Add a leading axis, then move the last axis to position 1
            # (presumably HWC -> NCHW; confirm against the Resize output).
            batched = normalized[np.newaxis, :, :, :].transpose((0, 3, 1, 2))
            return {'x': batched.astype('float32')}

        def postprocess(logits):
            """Turn raw logits into (top-5 ids, top-5 scores, top-5 labels)."""

            def softmax(x, epsilon=1e-6):
                exp_x = np.exp(x)
                return (exp_x + epsilon) / (np.sum(exp_x) + epsilon)

            probs = softmax(np.array(logits).squeeze())
            # argsort is ascending, so reverse it and keep the first five.
            top5 = probs.argsort()[::-1][:5]
            return top5, probs[top5], label_names[top5]

        predictor = Predictor(
            model_file=infer_model,
            params_file=infer_params,
            preprocess_fn=preprocess,
            postprocess_fn=postprocess)

    class_ids, scores, classes = predictor.predict(image)
    json_out = {
        "class_ids": class_ids.tolist(),
        "scores": scores.tolist(),
        "labels": classes.tolist(),
    }
    return image, json_out
def clear_all():
    """Return three ``None`` values, one per UI component to be reset."""
    return (None,) * 3
# Gradio UI: an input image, Clear/Submit buttons, and the image + JSON outputs.
# NOTE(review): the nesting below was reconstructed from a listing without
# indentation -- confirm the layout against the original app.py.
with gr.Blocks() as demo:
    gr.Markdown("Classification based on SwinTransformer")

    with gr.Column(scale=1, min_width=100):
        img_in = gr.Image(
            label="Input",
            value="https://plsc.bj.bcebos.com/dataset/test_images/zebra.png",
        ).style(height=200)

        with gr.Row():
            btn1 = gr.Button("Clear")
            btn2 = gr.Button("Submit")

        img_out = gr.Image(label="Output").style(height=200)
        json_out = gr.JSON(label="jsonOutput")

    # Submit runs the classifier; Clear resets the input and both outputs.
    btn2.click(fn=model_inference, inputs=img_in, outputs=[img_out, json_out])
    btn1.click(fn=clear_all, inputs=None, outputs=[img_in, img_out, json_out])
    # NOTE(review): this calls ``style`` on the class with ``1`` as ``self``;
    # kept as-is, but it looks unintended -- confirm whether it can be removed.
    gr.Button.style(1)

demo.launch()
【PLSC-SwinTransformer-App-YAML】
APP_Info:
title: PLSC-SwinTransformer-App
colorFrom: blue
colorTo: yellow
sdk: gradio
sdk_version: 3.9.1
app_file: app.py
license: apache-2.0
device: cpu
\ No newline at end of file
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import os.path as osp
import sys
import yaml
import time
import shutil
import requests
import tqdm
import hashlib
import base64
import binascii
import tarfile
import zipfile
# Public helpers exported by this module.
__all__ = [
'get_model_path',
'get_config_path',
'get_dict_path',
'get_data_path',
]
# Local cache directories (under the user's home) that downloads are stored in,
# one per artifact kind.
WEIGHTS_HOME = osp.expanduser("~/.cache/paddlecv/models/plsc")
CONFIGS_HOME = osp.expanduser("~/.cache/paddlecv/configs/plsc")
DICTS_HOME = osp.expanduser("~/.cache/paddlecv/dicts/plsc/")
DATA_HOME = osp.expanduser("~/.cache/paddlecv/dataset/plsc")
# NOTE(review): the two comment lines below describe a dataset table that is
# not defined in the visible part of this file -- possibly a leftover.
# dict of {dataset_name: (download_info, sub_dirs)}
# download info: [(url, md5sum)]
# Maximum number of download attempts made by _download before it gives up.
DOWNLOAD_RETRY_LIMIT = 3
# HTTP prefix that parse_url substitutes for the "paddlecv://" scheme.
PMP_DOWNLOAD_URL_PREFIX = 'https://plsc.bj.bcebos.com/'
def is_url(path):
    """
    Tell whether ``path`` is a URL rather than a local path.

    Args:
        path (string): Candidate string; it counts as a URL when it starts
            with ``http://``, ``https://`` or ``paddlecv://``.
    """
    # str.startswith accepts a tuple and returns True if any prefix matches.
    return path.startswith(('http://', 'https://', 'paddlecv://'))
def parse_url(url):
    """Replace the ``paddlecv://`` scheme in ``url`` with the HTTP download prefix.

    URLs that do not contain ``paddlecv://`` are returned unchanged.
    """
    return url.replace("paddlecv://", PMP_DOWNLOAD_URL_PREFIX)
def get_model_path(path):
    """Return a local path for a model file, fetching it if ``path`` is a URL.

    A non-URL ``path`` is returned untouched. A URL is resolved below
    WEIGHTS_HOME, keeping its last three path components (``path_depth=3``).
    """
    if is_url(path):
        local_path, _ = get_path(parse_url(path), WEIGHTS_HOME, path_depth=3)
        return local_path
    return path
def get_data_path(path):
    """Return a local path for a data file, fetching it if ``path`` is a URL.

    A non-URL ``path`` is returned untouched. A URL is resolved below
    DATA_HOME, keeping only its file name (``path_depth=1``).
    """
    if is_url(path):
        local_path, _ = get_path(parse_url(path), DATA_HOME, path_depth=1)
        return local_path
    return path
def get_config_path(path):
    """Return a local path for a config file, fetching it if ``path`` is a URL.

    A non-URL ``path`` is returned untouched. A URL is resolved below
    CONFIGS_HOME using the default path depth of ``get_path``.
    """
    if is_url(path):
        local_path, _ = get_path(parse_url(path), CONFIGS_HOME)
        return local_path
    return path
def get_dict_path(path):
    """Return a local path for a dict file, fetching it if ``path`` is a URL.

    A non-URL ``path`` is returned untouched. A URL is resolved below
    DICTS_HOME using the default path depth of ``get_path``.
    """
    if is_url(path):
        local_path, _ = get_path(parse_url(path), DICTS_HOME)
        return local_path
    return path
def map_path(url, root_dir, path_depth=1):
    """Map ``url`` to the local file path it is stored at below ``root_dir``.

    The last ``path_depth`` components of ``url`` are kept and appended to
    ``root_dir``.

    Returns:
        A tuple ``(path, dirname)``: the local file path and its directory.
    """
    assert path_depth > 0, "path_depth should be a positive integer"

    # Walk ``path_depth`` levels up the URL to find the part that is dropped.
    anchor = url
    for _ in range(path_depth):
        anchor = osp.dirname(anchor)

    local_path = osp.join(root_dir, osp.relpath(url, anchor))
    return local_path, osp.dirname(local_path)
def get_path(url, root_dir, md5sum=None, check_exist=True, path_depth=1):
    """Return the local path for ``url`` below ``root_dir``, downloading if needed.

    Args:
        url (str): download url
        root_dir (str): root directory the file is stored under
        md5sum (str): expected md5 sum of the downloaded file
        check_exist (bool): whether an already present local copy may be reused
        path_depth (int): number of trailing url components kept below root_dir

    Returns:
        A tuple ``(fullpath, reused)``; ``reused`` is True when an existing
        local copy was returned and False when a download was performed.
    """
    fullpath, dirname = map_path(url, root_dir, path_depth)

    if check_exist and osp.exists(fullpath):
        # Anything that is not a regular file is accepted as-is; a regular
        # file must additionally pass the md5 check.
        reusable = (not osp.isfile(fullpath)) or \
            _check_exist_file_md5(fullpath, md5sum, url)
        if reusable:
            return fullpath, True
        # The cached file failed verification: drop it and fetch it again.
        os.remove(fullpath)

    _download(url, dirname, md5sum)
    return fullpath, False
def _download(url, path, md5sum=None):
    """
    Download from url, save to path.

    The file is fetched again until it exists locally and passes the md5
    check, at most DOWNLOAD_RETRY_LIMIT times.

    Args:
        url (str): download url
        path (str): directory the file is downloaded into
        md5sum (str): expected md5 sum, or None

    Returns:
        str: full name of the downloaded file (``path`` joined with the last
        component of ``url``).

    Raises:
        RuntimeError: if the server answers with a status other than 200 or
            the retry limit is reached.
    """
    # exist_ok avoids a race between checking for the directory and creating it.
    os.makedirs(path, exist_ok=True)

    fname = osp.split(url)[-1]
    fullname = osp.join(path, fname)
    retry_cnt = 0

    while not (osp.exists(fullname) and
               _check_exist_file_md5(fullname, md5sum, url)):
        if retry_cnt < DOWNLOAD_RETRY_LIMIT:
            retry_cnt += 1
        else:
            raise RuntimeError("Download from {} failed. "
                               "Retry limit reached".format(url))

        # NOTE: windows path join may incur \, which is invalid in url
        if sys.platform == "win32":
            url = url.replace('\\', '/')

        req = requests.get(url, stream=True)
        try:
            if req.status_code != 200:
                raise RuntimeError("Downloading from {} failed with code "
                                   "{}!".format(url, req.status_code))

            # To guard against interrupted downloads, write to tmp_fullname
            # first and move it to fullname once the download has finished.
            tmp_fullname = fullname + "_tmp"
            total_size = req.headers.get('content-length')
            with open(tmp_fullname, 'wb') as f:
                if total_size:
                    # Size is known: show a progress bar counted in 1 KB chunks.
                    for chunk in tqdm.tqdm(
                            req.iter_content(chunk_size=1024),
                            total=(int(total_size) + 1023) // 1024,
                            unit='KB'):
                        f.write(chunk)
                else:
                    for chunk in req.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
        finally:
            # A streamed response keeps its connection open until it is closed,
            # so release it on success and on every error path.
            req.close()
        shutil.move(tmp_fullname, fullname)

    return fullname
def _check_exist_file_md5(filename, md5sum, url):
    """Verify the md5 of ``filename`` and return True when it is acceptable.

    When no ``md5sum`` is given and the file name ends with ``pdparams``, the
    expected value is looked up from ``url``; in every other case the file is
    checked against ``md5sum`` directly.
    """
    if md5sum is None and filename.endswith('pdparams'):
        return _md5check_from_url(filename, url)
    return _md5check(filename, md5sum)
def _md5check_from_url(filename, url):
    """Check ``filename`` against the md5 announced in the headers of ``url``.

    The ``content-md5`` response header is read as a base64 value (optionally
    wrapped in double quotes) and converted to a hex digest. Returns True when
    the header is absent or the digest matches, False otherwise.
    """
    response = requests.get(url, stream=True)
    header_md5 = response.headers.get('content-md5')
    response.close()

    # Without a header there is nothing to compare against: accept the file.
    if not header_md5:
        return True

    raw_digest = base64.b64decode(header_md5.strip('"'))
    expected_hex = binascii.hexlify(raw_digest).decode()
    return _md5check(filename, expected_hex)
def _md5check(fullname, md5sum=None):
if md5sum is None:
return True
md5 = hashlib.md5()
with open(fullname, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b""):
md5.update(chunk)
calc_md5sum = md5.hexdigest()
if calc_md5sum != md5sum:
return False
return True
plsc==2.4
gradio
opencv-python
paddlepaddle
PyYAML
shapely
scipy
Cython
numpy
setuptools
pillow
tqdm
\ No newline at end of file
# 1. Benchmark
## 1.1 软硬件环境
* 单机 8卡 A100(40G)
* CUDA 11.2
* CUDNN 8.1
## 1.2 数据集
- 测试使用的数据集为ImageNet.
## 1.3 指标
| Model |DType | Phase | Dataset | gpu | img/sec | Top1 Acc | Official |
| --- | --- | --- | --- | --- | --- | --- | --- |
| Swin-B |FP16 O1|pretrain |ImageNet2012 |A100*N1C8 | 2155| 0.83362 | 0.835 |
| Swin-B |FP16 O2|pretrain | ImageNet2012 | A100*N1C8 | 3006 | 0.83223 | 0.835 |
# 2. 相关使用说明
https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/README.md
# 1. Benchmark
## 1.1 Environment
* 8 A100(40G) on single Node
* CUDA 11.2
* CUDNN 8.1
## 1.2 DataSet
- We train the Swin Transformer on ImageNet.
## 1.3 Benchmark
| Model |DType | Phase | Dataset | gpu | img/sec | Top1 Acc | Official |
| --- | --- | --- | --- | --- | --- | --- | --- |
| Swin-B |FP16 O1|pretrain |ImageNet2012 |A100*N1C8 | 2155| 0.83362 | 0.835 |
| Swin-B |FP16 O2|pretrain | ImageNet2012 | A100*N1C8 | 3006 | 0.83223 | 0.835 |
# 2. Reference
https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/README.md
# 模型列表
|模型名称|模型简介|模型配置|预训练checkpoint下载地址|
| --- | --- | --- | --- |
| Swin-B |输入size为224,patch=4, FP16-O1|[config](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o1.yaml) |[download](https://plsc.bj.bcebos.com/models/swin/v2.5/swin_base_patch4_window7_224_fp16o1.pdparams) |
| Swin-B |输入size为224,patch=4, FP16-O2|[config](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o2.yaml)| [download](https://plsc.bj.bcebos.com/models/swin/v2.5/swin_base_patch4_window7_224_fp16o2.pdparams) |
# Model List
|Model Name| Introduction |Config|Pretrained checkpoint Download|
| --- |---------------------------------| --- | --- |
| Swin-B | input size=224, patch=4, FP16-O1 |[config](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o1.yaml) |[download](https://plsc.bj.bcebos.com/models/swin/v2.5/swin_base_patch4_window7_224_fp16o1.pdparams) |
| Swin-B | input size=224, patch=4, FP16-O2 |[config](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o2.yaml)| [download](https://plsc.bj.bcebos.com/models/swin/v2.5/swin_base_patch4_window7_224_fp16o2.pdparams) |
---
Model_Info:
name: "PLSC-SwinTransformer"
description: "PaddlePaddle 重新实现 Microsoft 官方 Repo 中的 Swin Transformer 算法 《Swin Transformer: Hierarchical Vision Transformer using Shifted Windows》"
description_en: "PaddlePaddle reimplementation of Microsoft's repository for the Swin Transformer model that was released with the paper Swin Transformer: Hierarchical Vision Transformer using Shifted Windows."
update_time:
icon: "https://plsc.bj.bcebos.com/assets/modelcenter-icon.png"
from_repo: "PLSC"
Task:
- tag: 计算机视觉
tag_en: Computer Vision
sub_tag: 图像分类
sub_tag_en: Image Classification
Example:
- tag:
tag_en:
sub_tag:
sub_tag_en:
title:
title_en:
url:
url_en:
Datasets: ImageNet 1K
Publisher: Baidu
License: Apache 2.0
Paper:
- title: "Swin Transformer: Hierarchical Vision Transformer using Shifted Windows"
url: https://arxiv.org/pdf/2103.14030.pdf
IfTraining: 1
IfOnlineDemo: 1
\ No newline at end of file
{
"cells": [
{
"cell_type": "markdown",
"id": "ae69ce68",
"metadata": {},
"source": [
"## 1. PLSC-SwinTransformer模型简介\n"
]
},
{
"cell_type": "markdown",
"id": "35485bc6",
"metadata": {},
"source": [
"PLSC-SwinTransformer实现了基于[Swin Transformer](https://github.com/microsoft/Swin-Transformer)的视觉分类模型。Swin Transformer是一个层级结构的Vision Transformer(ViT),Swin代表的是移位窗口(Shifted window)。与ViT不同,Swin基于非重叠的局部窗口计算自注意力,并且跨窗口进行连接保证窗口间信息共享,因此Swin Transformer相比于基于全局的ViT更高效。Swin Transformer可以作为CV领域的一个通用的backbone。模型结构如下,\n",
"\n",
"![Figure 1 from paper](https://github.com/microsoft/Swin-Transformer/blob/main/figures/teaser.png?raw=true)\n"
]
},
{
"cell_type": "markdown",
"id": "97e174e6",
"metadata": {},
"source": [
"## 2. 模型效果 "
]
},
{
"cell_type": "markdown",
"id": "78137a72",
"metadata": {},
"source": [
"| Model |DType | Phase | Dataset | gpu | img/sec | Top1 Acc | Official |\n",
"| --- | --- | --- | --- | --- | --- | --- | --- |\n",
"| Swin-B |FP16 O1|pretrain |ImageNet2012 |A100*N1C8 | 2155| 0.83362 | 0.835 |\n",
"| Swin-B |FP16 O2|pretrain | ImageNet2012 | A100*N1C8 | 3006 | 0.83223\t | 0.835 |\n"
]
},
{
"cell_type": "markdown",
"id": "ace3c48d",
"metadata": {},
"source": [
"## 3. 模型如何使用"
]
},
{
"cell_type": "markdown",
"id": "a97a5f56",
"metadata": {},
"source": [
"### 3.1 安装PLSC"
]
},
{
"cell_type": "markdown",
"id": "492fa769-2fe0-4220-b6d9-bbc32f8cca10",
"metadata": {},
"source": [
"```\n",
"git clone https://github.com/PaddlePaddle/PLSC.git\n",
"cd /path/to/PLSC/\n",
"# [optional] pip install -r requirements.txt\n",
"python setup.py develop\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "6b22824d",
"metadata": {},
"source": [
"### 3.2 模型训练"
]
},
{
"cell_type": "markdown",
"id": "d68ca5fb",
"metadata": {},
"source": [
"1. 进入任务目录\n",
"\n",
"```\n",
"cd task/classification/swin\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "9048df01",
"metadata": {},
"source": [
"2. 准备数据\n",
"\n",
"将数据整理成以下格式:\n",
"```text\n",
"dataset/\n",
"└── ILSVRC2012\n",
" ├── train\n",
" ├── val\n",
" ├── train_list.txt\n",
" └── val_list.txt\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "bea743ea",
"metadata": {},
"source": [
"3. 执行训练命令\n",
"\n",
"```shell\n",
"export PADDLE_NNODES=1\n",
"export PADDLE_MASTER=\"127.0.0.1:12538\"\n",
"export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n",
"\n",
"python -m paddle.distributed.launch \\\n",
" --nnodes=$PADDLE_NNODES \\\n",
" --master=$PADDLE_MASTER \\\n",
" --devices=$CUDA_VISIBLE_DEVICES \\\n",
" plsc-train \\\n",
" -c ./configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o1.yaml\n",
"```\n",
"\n",
"更多模型的训练教程可参考文档:[Swin训练文档](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/README.md)"
]
},
{
"cell_type": "markdown",
"id": "186a0c17",
"metadata": {},
"source": [
"### 3.3 模型推理"
]
},
{
"cell_type": "markdown",
"id": "e97c527c",
"metadata": {},
"source": [
"1. 下载预训练模型和图片\n",
"\n",
"```shell\n",
"# download pretrained model\n",
"mkdir -p pretrained/swin/Swin_base/\n",
"wget -O ./pretrained/swin/Swin_base/swin_base_patch4_window7_224_fp16o1.pdparams \\\n",
"https://plsc.bj.bcebos.com/models/swin/v2.5/swin_base_patch4_window7_224_fp16o1.pdparams\n",
"\n",
"# download image\n",
"mkdir -p images/\n",
"wget -O ./images/zebra.png https://plsc.bj.bcebos.com/dataset/test_images/zebra.png \n",
"```"
]
},
{
"cell_type": "markdown",
"id": "a07c6549",
"metadata": {},
"source": [
"2. 导出推理模型\n",
"\n",
"```shell\n",
"plsc-export -c ./configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o1.yaml -o Global.pretrained_model=./pretrained/swin/Swin_base/swin_base_patch4_window7_224_fp16o1 -o Model.data_format=NCHW -o FP16.level=O0\n",
"```\n"
]
},
{
"cell_type": "markdown",
"id": "3ded8e73-3dba-49ce-bfb3-fcf7f3f0fc1d",
"metadata": {},
"source": [
"3. 图片预测"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9533d4df-acb3-474f-b591-f210639a0a02",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"from plsc.data.dataset import default_loader\n",
"from plsc.data.preprocess import Resize\n",
"from plsc.engine.inference import Predictor\n",
"\n",
"\n",
"def preprocess(img):\n",
" resize = Resize(size=224, \n",
" interpolation=\"bicubic\", \n",
" backend=\"pil\")\n",
" img = np.array(resize(img))\n",
" scale = 1.0 / 255.0\n",
" mean = np.array([0.485, 0.456, 0.406])\n",
" std = np.array([0.229, 0.224, 0.225])\n",
" img = (img * scale - mean) / std\n",
" img = img[np.newaxis, :, :, :]\n",
" img = img.transpose((0, 3, 1, 2))\n",
" return {'x': img.astype('float32')}\n",
"\n",
"\n",
"def postprocess(logits):\n",
" \n",
" def softmax(x, epsilon=1e-6):\n",
" exp_x = np.exp(x)\n",
" sfm = (exp_x + epsilon) / (np.sum(exp_x) + epsilon)\n",
" return sfm\n",
"\n",
" pred = np.array(logits).squeeze()\n",
" pred = softmax(pred)\n",
" pred_class_idx = pred.argsort()[::-1][0]\n",
" return pred_class_idx, pred[pred_class_idx]\n",
"\n",
"\n",
"infer_model = \"./output/swin_base_patch4_window7_224/swin_base_patch4_window7_224.pdmodel\"\n",
"infer_params = \"./output/swin_base_patch4_window7_224/swin_base_patch4_window7_224.pdiparams\"\n",
"\n",
"predictor = Predictor(\n",
" model_file=infer_model,\n",
" params_file=infer_params,\n",
" preprocess_fn=preprocess,\n",
" postprocess_fn=postprocess)\n",
"\n",
"image = default_loader(\"./images/zebra.png\")\n",
"pred_class_idx, pred_score = predictor.predict(image)"
]
},
{
"cell_type": "markdown",
"id": "d375934d",
"metadata": {},
"source": [
"## 4. 相关论文及引用信息\n"
]
},
{
"cell_type": "markdown",
"id": "29f05b07-d323-45e4-b00d-0728eafb5af7",
"metadata": {},
"source": [
"```text\n",
"@inproceedings{liu2021Swin,\n",
" title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows},\n",
" author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},\n",
" booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},\n",
" year={2021}\n",
"}\n",
"```"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
{
"cells": [
{
"cell_type": "markdown",
"id": "ae69ce68",
"metadata": {},
"source": [
"## 1. PLSC-SwinTransformer Introduction\n"
]
},
{
"cell_type": "markdown",
"id": "35485bc6",
"metadata": {},
"source": [
"PLSC-SwinTransformer is a reimplementation of [Microsoft's repository for the Swin-Transformer](https://github.com/microsoft/Swin-Transformer) model that was released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/pdf/2103.14030.pdf).\n",
"\n",
"Swin Transformer (the name Swin stands for Shifted window) capably serves as a general-purpose backbone for computer vision. It is basically a hierarchical Transformer whose representation is computed with shifted windows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping local windows while also allowing for cross-window connection.\n",
"\n",
"![Figure 1 from paper](https://github.com/microsoft/Swin-Transformer/blob/main/figures/teaser.png?raw=true)\n"
]
},
{
"cell_type": "markdown",
"id": "97e174e6",
"metadata": {
"tags": []
},
"source": [
"## 2. Model Effects"
]
},
{
"cell_type": "markdown",
"id": "78137a72",
"metadata": {},
"source": [
"| Model |DType | Phase | Dataset | gpu | img/sec | Top1 Acc | Official |\n",
"| --- | --- | --- | --- | --- | --- | --- | --- |\n",
"| Swin-B |FP16 O1|pretrain |ImageNet2012 |A100*N1C8 | 2155| 0.83362 | 0.835 |\n",
"| Swin-B |FP16 O2|pretrain | ImageNet2012 | A100*N1C8 | 3006 | 0.83223\t | 0.835 |\n"
]
},
{
"cell_type": "markdown",
"id": "ace3c48d",
"metadata": {},
"source": [
"## 3. How to use the Model"
]
},
{
"cell_type": "markdown",
"id": "a97a5f56",
"metadata": {},
"source": [
"### 3.1 Install PLSC"
]
},
{
"cell_type": "markdown",
"id": "492fa769-2fe0-4220-b6d9-bbc32f8cca10",
"metadata": {},
"source": [
"```\n",
"git clone https://github.com/PaddlePaddle/PLSC.git\n",
"cd /path/to/PLSC/\n",
"# [optional] pip install -r requirements.txt\n",
"python setup.py develop\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "6b22824d",
"metadata": {},
"source": [
"### 3.2 Model Training"
]
},
{
"cell_type": "markdown",
"id": "d68ca5fb",
"metadata": {},
"source": [
"1. Enter into the task directory\n",
"\n",
"```\n",
"cd task/classification/swin\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "9048df01",
"metadata": {},
"source": [
"2. Prepare the data\n",
"\n",
"Organize the data into the following format:\n",
"\n",
"\n",
"```text\n",
"dataset/\n",
"└── ILSVRC2012\n",
" ├── train\n",
" ├── val\n",
" ├── train_list.txt\n",
" └── val_list.txt\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "bea743ea",
"metadata": {},
"source": [
"3. Run the command\n",
"\n",
"```shell\n",
"export PADDLE_NNODES=1\n",
"export PADDLE_MASTER=\"127.0.0.1:12538\"\n",
"export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n",
"\n",
"python -m paddle.distributed.launch \\\n",
" --nnodes=$PADDLE_NNODES \\\n",
" --master=$PADDLE_MASTER \\\n",
" --devices=$CUDA_VISIBLE_DEVICES \\\n",
" plsc-train \\\n",
" -c ./configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o1.yaml\n",
"```\n",
"\n",
"For more training tutorials, see the [Swin training guide](https://github.com/PaddlePaddle/PLSC/blob/master/task/classification/swin/README.md)."
]
},
{
"cell_type": "markdown",
"id": "186a0c17",
"metadata": {},
"source": [
"### 3.3 Model Inference"
]
},
{
"cell_type": "markdown",
"id": "e97c527c",
"metadata": {},
"source": [
"1. Download pretrained model and image\n",
"\n",
"\n",
"```shell\n",
"# download pretrained model\n",
"mkdir -p pretrained/swin/Swin_base/\n",
"wget -O ./pretrained/swin/Swin_base/swin_base_patch4_window7_224_fp16o1.pdparams \\\n",
"https://plsc.bj.bcebos.com/models/swin/v2.5/swin_base_patch4_window7_224_fp16o1.pdparams\n",
"\n",
"# download image\n",
"mkdir -p images/\n",
"wget -O ./images/zebra.png https://plsc.bj.bcebos.com/dataset/test_images/zebra.png\n",
"```\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "a07c6549",
"metadata": {},
"source": [
"2. Export model for inference\n",
"\n",
"```shell\n",
"plsc-export -c ./configs/swin_base_patch4_window7_224_in1k_1n8c_dp_fp16o1.yaml -o Global.pretrained_model=./pretrained/swin/Swin_base/swin_base_patch4_window7_224_fp16o1 -o Model.data_format=NCHW -o FP16.level=O0\n",
"```\n"
]
},
{
"cell_type": "markdown",
"id": "e92efe35-ea6d-4aee-9a4d-a2c79f40f473",
"metadata": {},
"source": [
"3. Image inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "22f4a080-ad97-4e00-a9fa-697601f579ef",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"from plsc.data.dataset import default_loader\n",
"from plsc.data.preprocess import Resize\n",
"from plsc.engine.inference import Predictor\n",
"\n",
"\n",
"def preprocess(img):\n",
" resize = Resize(size=224, \n",
" interpolation=\"bicubic\", \n",
" backend=\"pil\")\n",
" img = np.array(resize(img))\n",
" scale = 1.0 / 255.0\n",
" mean = np.array([0.485, 0.456, 0.406])\n",
" std = np.array([0.229, 0.224, 0.225])\n",
" img = (img * scale - mean) / std\n",
" img = img[np.newaxis, :, :, :]\n",
" img = img.transpose((0, 3, 1, 2))\n",
" return {'x': img.astype('float32')}\n",
"\n",
"\n",
"def postprocess(logits):\n",
" \n",
" def softmax(x, epsilon=1e-6):\n",
" exp_x = np.exp(x)\n",
" sfm = (exp_x + epsilon) / (np.sum(exp_x) + epsilon)\n",
" return sfm\n",
"\n",
" pred = np.array(logits).squeeze()\n",
" pred = softmax(pred)\n",
" pred_class_idx = pred.argsort()[::-1][0]\n",
" return pred_class_idx, pred[pred_class_idx]\n",
"\n",
"\n",
"infer_model = \"./output/swin_base_patch4_window7_224/swin_base_patch4_window7_224.pdmodel\"\n",
"infer_params = \"./output/swin_base_patch4_window7_224/swin_base_patch4_window7_224.pdiparams\"\n",
"\n",
"predictor = Predictor(\n",
" model_file=infer_model,\n",
" params_file=infer_params,\n",
" preprocess_fn=preprocess,\n",
" postprocess_fn=postprocess)\n",
"\n",
"image = default_loader(\"./images/zebra.png\")\n",
"pred_class_idx, pred_score = predictor.predict(image)"
]
},
{
"cell_type": "markdown",
"id": "d375934d",
"metadata": {},
"source": [
"## 4. Related papers and citations"
]
},
{
"cell_type": "markdown",
"id": "29f05b07-d323-45e4-b00d-0728eafb5af7",
"metadata": {},
"source": [
"```text\n",
"@inproceedings{liu2021Swin,\n",
" title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows},\n",
" author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},\n",
" booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},\n",
" year={2021}\n",
"}\n",
"```"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册