Commit b4942bdd authored by chenyuntc

init

class Config:
voc_data_dir = '/mnt/3/VOC/VOCdevkit/VOC2007/'
min_size = 600
max_size = 1000
opt = Config()
import torch as t
from .voc_dataset import VOCBboxDataset
from skimage import transform as sktsf
from torchvision import transforms as tvtsf
from . import util
def preprocess(img, min_size=600, max_size=1000):
    """Preprocess an image for feature extraction.
    The length of the shorter edge is scaled to :obj:`min_size`.
    After the scaling, if the length of the longer edge is longer than
    :obj:`max_size`, the image is scaled to fit the longer edge
    to :obj:`max_size`.
    After resizing, the image is normalized with the ImageNet mean and
    standard deviation (the torchvision convention).
Args:
img (~numpy.ndarray): An image. This is in CHW and RGB format.
The range of its value is :math:`[0, 255]`.
Returns:
~numpy.ndarray:
A preprocessed image.
"""
C, H, W = img.shape
scale1 = min_size / min(H, W)
scale2 = max_size / max(H, W)
scale = min(scale1, scale2)
    # In short: the shorter edge is scaled toward min_size, and the longer
    # edge must not exceed max_size.
    # img = resize(img, (int(H * scale), int(W * scale)))
    img = img / 255.
    img = sktsf.resize(img, (C, H * scale, W * scale), mode='reflect')
normalize = tvtsf.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
img = normalize(t.from_numpy(img))
return img.numpy()
# NOTE: why did the original normalization only subtract the mean,
# without dividing by the standard deviation??
# mean = np.array([122.7717, 115.9465, 102.9801])
# img = (img - self.mean).astype(np.float32, copy=False)
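# A quick sanity check of the scaling rule above (a hedged sketch, not part
# of the original file; assumes `import numpy as np`): the shorter edge is
# scaled toward min_size unless that would push the longer edge past max_size.
#
#   >>> x = np.zeros((3, 500, 750), dtype=np.float32)
#   >>> preprocess(x).shape   # scale = min(600/500, 1000/750) = 1.2
#   (3, 600, 900)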
class Transform(object):
def __init__(self, min_size=600,max_size=1000):
self.min_size = min_size
self.max_size = max_size
def __call__(self, in_data):
img, bbox, label = in_data
_, H, W = img.shape
img = preprocess(img, self.min_size, self.max_size)
_, o_H, o_W = img.shape
scale = o_H / H
bbox = util.resize_bbox(bbox, (H, W), (o_H, o_W))
# horizontally flip
img, params = util.random_flip(
img, x_random=True, return_param=True)
bbox = util.flip_bbox(
bbox, (o_H, o_W), x_flip=params['x_flip'])
return img, bbox, label, scale
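# Hedged usage sketch with synthetic data (not part of the original file):
# the boxes are rescaled by the same factor as the image and possibly
# flipped together with it.
#
#   tsf = Transform(min_size=600, max_size=1000)
#   img = np.zeros((3, 300, 400), dtype=np.float32)
#   bbox = np.array([[10., 20., 100., 200.]], dtype=np.float32)
#   label = np.array([0], dtype=np.int32)
#   img, bbox, label, scale = tsf((img, bbox, label))
#   # scale == 2.0, since min(600 / 300, 1000 / 400) = 2.0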
class Dataset():
def __init__(self, opt):
self.opt = opt
self.db = VOCBboxDataset(opt.voc_data_dir)
self.tsf = Transform(opt.min_size,opt.max_size)
def __getitem__(self, idx):
img, bbox, label, difficult = self.db.get_example(idx)
img, bbox, label, scale = self.tsf((img, bbox, label))
return img, bbox, label, scale
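# Hedged usage sketch (requires the VOC data at opt.voc_data_dir, so it is
# left as a comment):
#
#   dataset = Dataset(opt)
#   img, bbox, label, scale = dataset[0]
#   # img is CHW and normalized; bbox is (R, 4) in (ymin, xmin, ymax, xmax)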
import numpy as np
from PIL import Image
import random
def read_image(path, dtype=np.float32, color=True):
"""Read an image from a file.
This function reads an image from given file. The image is CHW format and
the range of its value is :math:`[0, 255]`. If :obj:`color = True`, the
order of the channels is RGB.
Args:
path (str): A path of image file.
dtype: The type of array. The default value is :obj:`~numpy.float32`.
color (bool): This option determines the number of channels.
If :obj:`True`, the number of channels is three. In this case,
the order of the channels is RGB. This is the default behaviour.
If :obj:`False`, this function returns a grayscale image.
Returns:
~numpy.ndarray: An image.
"""
f = Image.open(path)
try:
if color:
img = f.convert('RGB')
else:
            # 'L' is PIL's 8-bit grayscale mode ('P' would return palette indices)
            img = f.convert('L')
img = np.asarray(img, dtype=dtype)
finally:
if hasattr(f, 'close'):
f.close()
if img.ndim == 2:
# reshape (H, W) -> (1, H, W)
return img[np.newaxis]
else:
# transpose (H, W, C) -> (C, H, W)
return img.transpose((2, 0, 1))
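# Hedged example (the path is hypothetical): a color JPEG comes back as a
# float32 CHW array in RGB order.
#
#   img = read_image('/path/to/example.jpg')
#   # img.shape == (3, H, W), img.dtype == np.float32, values in [0, 255]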
def resize_bbox(bbox, in_size, out_size):
"""Resize bounding boxes according to image resize.
The bounding boxes are expected to be packed into a two dimensional
tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
bounding boxes in the image. The second axis represents attributes of
the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
where the four attributes are coordinates of the top left and the
bottom right vertices.
Args:
bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`.
:math:`R` is the number of bounding boxes.
in_size (tuple): A tuple of length 2. The height and the width
of the image before resized.
out_size (tuple): A tuple of length 2. The height and the width
of the image after resized.
Returns:
~numpy.ndarray:
Bounding boxes rescaled according to the given image shapes.
"""
bbox = bbox.copy()
y_scale = float(out_size[0]) / in_size[0]
x_scale = float(out_size[1]) / in_size[1]
bbox[:, 0] = y_scale * bbox[:, 0]
bbox[:, 2] = y_scale * bbox[:, 2]
bbox[:, 1] = x_scale * bbox[:, 1]
bbox[:, 3] = x_scale * bbox[:, 3]
return bbox
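# A hedged worked example (not in the original file): resizing a 100x200
# image to 200x100 scales y by 2 and x by 0.5.
def _example_resize_bbox():
    bb = np.array([[10., 20., 30., 40.]])
    out = resize_bbox(bb, (100, 200), (200, 100))
    assert np.allclose(out, [[20., 10., 60., 20.]])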
def flip_bbox(bbox, size, y_flip=False, x_flip=False):
"""Flip bounding boxes accordingly.
The bounding boxes are expected to be packed into a two dimensional
tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
bounding boxes in the image. The second axis represents attributes of
the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
where the four attributes are coordinates of the top left and the
bottom right vertices.
Args:
bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`.
:math:`R` is the number of bounding boxes.
size (tuple): A tuple of length 2. The height and the width
of the image before resized.
y_flip (bool): Flip bounding box according to a vertical flip of
an image.
x_flip (bool): Flip bounding box according to a horizontal flip of
an image.
Returns:
~numpy.ndarray:
Bounding boxes flipped according to the given flips.
"""
H, W = size
bbox = bbox.copy()
if y_flip:
y_max = H - bbox[:, 0]
y_min = H - bbox[:, 2]
bbox[:, 0] = y_min
bbox[:, 2] = y_max
if x_flip:
x_max = W - bbox[:, 1]
x_min = W - bbox[:, 3]
bbox[:, 1] = x_min
bbox[:, 3] = x_max
return bbox
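# A hedged worked example: with W = 100 and x_flip=True, a box spanning
# x in [10, 30] maps to [70, 90] (x_min' = W - x_max, x_max' = W - x_min).
def _example_flip_bbox():
    bb = np.array([[0., 10., 50., 30.]])
    out = flip_bbox(bb, (100, 100), x_flip=True)
    assert np.allclose(out, [[0., 70., 50., 90.]])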
def crop_bbox(
bbox, y_slice=None, x_slice=None,
allow_outside_center=True, return_param=False):
"""Translate bounding boxes to fit within the cropped area of an image.
This method is mainly used together with image cropping.
This method translates the coordinates of bounding boxes like
:func:`~chainercv.transforms.translate_bbox`. In addition,
this function truncates the bounding boxes to fit within the cropped area.
If a bounding box does not overlap with the cropped area,
this bounding box will be removed.
The bounding boxes are expected to be packed into a two dimensional
tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
bounding boxes in the image. The second axis represents attributes of
the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
where the four attributes are coordinates of the top left and the
bottom right vertices.
Args:
bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is
:math:`(R, 4)`. :math:`R` is the number of bounding boxes.
y_slice (slice): The slice of y axis.
x_slice (slice): The slice of x axis.
allow_outside_center (bool): If this argument is :obj:`False`,
bounding boxes whose centers are outside of the cropped area
are removed. The default value is :obj:`True`.
return_param (bool): If :obj:`True`, this function returns
indices of kept bounding boxes.
Returns:
~numpy.ndarray or (~numpy.ndarray, dict):
If :obj:`return_param = False`, returns an array :obj:`bbox`.
If :obj:`return_param = True`,
returns a tuple whose elements are :obj:`bbox, param`.
:obj:`param` is a dictionary of intermediate parameters whose
contents are listed below with key, value-type and the description
of the value.
* **index** (*numpy.ndarray*): An array holding indices of used \
bounding boxes.
"""
t, b = _slice_to_bounds(y_slice)
l, r = _slice_to_bounds(x_slice)
crop_bb = np.array((t, l, b, r))
if allow_outside_center:
mask = np.ones(bbox.shape[0], dtype=bool)
else:
center = (bbox[:, :2] + bbox[:, 2:]) / 2
mask = np.logical_and(crop_bb[:2] <= center, center < crop_bb[2:]) \
.all(axis=1)
bbox = bbox.copy()
bbox[:, :2] = np.maximum(bbox[:, :2], crop_bb[:2])
bbox[:, 2:] = np.minimum(bbox[:, 2:], crop_bb[2:])
bbox[:, :2] -= crop_bb[:2]
bbox[:, 2:] -= crop_bb[:2]
mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:]).all(axis=1))
bbox = bbox[mask]
if return_param:
return bbox, {'index': np.flatnonzero(mask)}
else:
return bbox
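# A hedged worked example: cropping to y in [0, 50), x in [0, 60) truncates
# the first box to the crop window and drops the second, which no longer
# overlaps it.
def _example_crop_bbox():
    bb = np.array([[10., 10., 80., 80.],
                   [60., 70., 90., 95.]])
    out, param = crop_bbox(bb, y_slice=slice(0, 50), x_slice=slice(0, 60),
                           return_param=True)
    assert np.allclose(out, [[10., 10., 50., 60.]])
    assert param['index'].tolist() == [0]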
def _slice_to_bounds(slice_):
if slice_ is None:
return 0, np.inf
if slice_.start is None:
l = 0
else:
l = slice_.start
if slice_.stop is None:
u = np.inf
else:
u = slice_.stop
return l, u
def translate_bbox(bbox, y_offset=0, x_offset=0):
"""Translate bounding boxes.
This method is mainly used together with image transforms, such as padding
and cropping, which translates the left top point of the image from
coordinate :math:`(0, 0)` to coordinate
:math:`(y, x) = (y_{offset}, x_{offset})`.
The bounding boxes are expected to be packed into a two dimensional
tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
bounding boxes in the image. The second axis represents attributes of
the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
where the four attributes are coordinates of the top left and the
bottom right vertices.
Args:
bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is
:math:`(R, 4)`. :math:`R` is the number of bounding boxes.
y_offset (int or float): The offset along y axis.
x_offset (int or float): The offset along x axis.
Returns:
~numpy.ndarray:
Bounding boxes translated according to the given offsets.
"""
out_bbox = bbox.copy()
out_bbox[:, :2] += (y_offset, x_offset)
out_bbox[:, 2:] += (y_offset, x_offset)
return out_bbox
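# A hedged worked example: padding an image with 5 pixels on top and 8 on
# the left shifts every box by (y_offset, x_offset) = (5, 8).
def _example_translate_bbox():
    bb = np.array([[0., 0., 10., 10.]])
    assert np.allclose(translate_bbox(bb, y_offset=5, x_offset=8),
                       [[5., 8., 15., 18.]])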
def random_flip(img, y_random=False, x_random=False,
return_param=False, copy=False):
"""Randomly flip an image in vertical or horizontal direction.
Args:
img (~numpy.ndarray): An array that gets flipped. This is in
CHW format.
y_random (bool): Randomly flip in vertical direction.
x_random (bool): Randomly flip in horizontal direction.
return_param (bool): Returns information of flip.
copy (bool): If False, a view of :obj:`img` will be returned.
Returns:
~numpy.ndarray or (~numpy.ndarray, dict):
If :obj:`return_param = False`,
returns an array :obj:`out_img` that is the result of flipping.
If :obj:`return_param = True`,
returns a tuple whose elements are :obj:`out_img, param`.
:obj:`param` is a dictionary of intermediate parameters whose
contents are listed below with key, value-type and the description
of the value.
* **y_flip** (*bool*): Whether the image was flipped in the\
vertical direction or not.
* **x_flip** (*bool*): Whether the image was flipped in the\
horizontal direction or not.
"""
y_flip, x_flip = False, False
if y_random:
y_flip = random.choice([True, False])
if x_random:
x_flip = random.choice([True, False])
if y_flip:
img = img[:, ::-1, :]
if x_flip:
img = img[:, :, ::-1]
if copy:
img = img.copy()
if return_param:
return img, {'y_flip': y_flip, 'x_flip': x_flip}
else:
return img
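# Hedged usage sketch: seed `random` for a reproducible flip decision; the
# result is a view of the input unless copy=True.
def _example_random_flip():
    random.seed(0)
    img = np.arange(24).reshape(2, 3, 4)
    out, param = random_flip(img, x_random=True, return_param=True)
    if param['x_flip']:
        assert (out == img[:, :, ::-1]).all()
    else:
        assert (out == img).all()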
# Modified work:
# Original works by:
# --------------------------------------------------------
# Faster R-CNN implementation In ChainerCV
# Copyright (c) 2017 Preferred Networks, Inc.
# Licensed under The MIT License [see LICENSE for details]
# https://github.com/chainer/chainercv
# --------------------------------------------------------
# Faster R-CNN implementation by Chainer
# Copyright (c) 2016 Shunta Saito
# Licensed under The MIT License [see LICENSE for details]
# https://github.com/mitmul/chainer-faster-rcnn
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# https://github.com/rbgirshick/py-faster-rcnn
# ----------------------------------------------------
import numpy as np
import os
import warnings
import xml.etree.ElementTree as ET
from .util import read_image
class VOCBboxDataset():
"""Bounding box dataset for PASCAL `VOC`_.
.. _`VOC`: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/
The index corresponds to each image.
When queried by an index, if :obj:`return_difficult == False`,
this dataset returns a corresponding
:obj:`img, bbox, label`, a tuple of an image, bounding boxes and labels.
This is the default behaviour.
If :obj:`return_difficult == True`, this dataset returns corresponding
:obj:`img, bbox, label, difficult`. :obj:`difficult` is a boolean array
that indicates whether bounding boxes are labeled as difficult or not.
The bounding boxes are packed into a two dimensional tensor of shape
:math:`(R, 4)`, where :math:`R` is the number of bounding boxes in
the image. The second axis represents attributes of the bounding box.
They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, where the
four attributes are coordinates of the top left and the bottom right
vertices.
The labels are packed into a one dimensional tensor of shape :math:`(R,)`.
:math:`R` is the number of bounding boxes in the image.
    The class name of the label :math:`l` is the :math:`l`-th element of
:obj:`chainercv.datasets.voc_bbox_label_names`.
The array :obj:`difficult` is a one dimensional boolean array of shape
:math:`(R,)`. :math:`R` is the number of bounding boxes in the image.
If :obj:`use_difficult` is :obj:`False`, this array is
a boolean array with all :obj:`False`.
The type of the image, the bounding boxes and the labels are as follows.
* :obj:`img.dtype == numpy.float32`
* :obj:`bbox.dtype == numpy.float32`
* :obj:`label.dtype == numpy.int32`
    * :obj:`difficult.dtype == numpy.bool_`
Args:
data_dir (string): Path to the root of the training data.
i.e. "/data/image/voc/VOCdevkit/VOC2007/"
split ({'train', 'val', 'trainval', 'test'}): Select a split of the
dataset. :obj:`test` split is only available for
2007 dataset.
year ({'2007', '2012'}): Use a dataset prepared for a challenge
held in :obj:`year`.
use_difficult (bool): If :obj:`True`, use images that are labeled as
difficult in the original annotation.
return_difficult (bool): If :obj:`True`, this dataset returns
a boolean array
that indicates whether bounding boxes are labeled as difficult
or not. The default value is :obj:`False`.
"""
    def __init__(self, data_dir, split='train', year='2012',
                 use_difficult=False, return_difficult=False,
                 transforms=None):
        if split not in ['train', 'trainval', 'val']:
            if not (split == 'test' and year == '2007'):
                warnings.warn(
                    'please pick split from \'train\', \'trainval\', \'val\' '
                    'for 2012 dataset. For 2007 dataset, you can pick \'test\' '
                    'in addition to the above mentioned splits.'
                )
id_list_file = os.path.join(
data_dir, 'ImageSets/Main/{0}.txt'.format(split))
self.ids = [id_.strip() for id_ in open(id_list_file)]
self.data_dir = data_dir
self.use_difficult = use_difficult
self.return_difficult = return_difficult
def __len__(self):
return len(self.ids)
def get_example(self, i):
"""Returns the i-th example.
Returns a color image and bounding boxes. The image is in CHW format.
The returned image is RGB.
Args:
i (int): The index of the example.
Returns:
tuple of an image and bounding boxes
"""
id_ = self.ids[i]
anno = ET.parse(
os.path.join(self.data_dir, 'Annotations', id_ + '.xml'))
bbox = list()
label = list()
difficult = list()
for obj in anno.findall('object'):
            # when not using the difficult split and the object is
            # difficult, skip it.
            if not self.use_difficult and int(obj.find('difficult').text) == 1:
continue
difficult.append(int(obj.find('difficult').text))
bndbox_anno = obj.find('bndbox')
# subtract 1 to make pixel indexes 0-based
bbox.append([
int(bndbox_anno.find(tag).text) - 1
for tag in ('ymin', 'xmin', 'ymax', 'xmax')])
name = obj.find('name').text.lower().strip()
label.append(VOC_BBOX_LABEL_NAMES.index(name))
bbox = np.stack(bbox).astype(np.float32)
label = np.stack(label).astype(np.int32)
        # When `use_difficult == False`, all elements in `difficult` are False.
        difficult = np.array(difficult, dtype=bool)
        # Load the image.
        img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg')
img = read_image(img_file, color=True)
        # if self.return_difficult:
        #     return img, bbox, label, difficult
        return img, bbox, label, difficult
def get_voc(root, year, split):
base_path = os.path.join(root, 'VOCdevkit/VOC{}'.format(year))
split_file = os.path.join(base_path, 'ImageSets/Main/{}.txt'.format(split))
if os.path.exists(split_file):
return base_path
else:
raise FileNotFoundError("VOC Data Not Downloaded")
VOC_BBOX_LABEL_NAMES = (
'aeroplane',
'bicycle',
'bird',
'boat',
'bottle',
'bus',
'car',
'cat',
'chair',
'cow',
'diningtable',
'dog',
'horse',
'motorbike',
'person',
'pottedplant',
'sheep',
'sofa',
'train',
'tvmonitor')
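# Example: a label id is a position in this tuple, e.g.
# VOC_BBOX_LABEL_NAMES.index('dog') == 11 and
# VOC_BBOX_LABEL_NAMES[14] == 'person'.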
# Modified work:
# --------------------------------------------------------
# Copyright (c) 2017 Preferred Networks, Inc.
# --------------------------------------------------------
#
# Original works by:
# --------------------------------------------------------
# Faster R-CNN implementation by Chainer
# Copyright (c) 2016 Shunta Saito
# Licensed under The MIT License [see LICENSE for details]
# https://github.com/mitmul/chainer-faster-rcnn
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# https://github.com/rbgirshick/py-faster-rcnn
# --------------------------------------------------------
from __future__ import division
import numpy as np
import chainer
from chainer import cuda
import chainer.functions as F
from chainercv.links.model.faster_rcnn.utils.loc2bbox import loc2bbox
from chainercv.utils import non_maximum_suppression
from chainercv.transforms.image.resize import resize
from torch import nn
class FasterRCNN(nn.Module):
"""Base class for Faster R-CNN.
This is a base class for Faster R-CNN links supporting object detection
API [#]_. The following three stages constitute Faster R-CNN.
1. **Feature extraction**: Images are taken and their \
feature maps are calculated.
2. **Region Proposal Networks**: Given the feature maps calculated in \
the previous stage, produce set of RoIs around objects.
3. **Localization and Classification Heads**: Using feature maps that \
belong to the proposed RoIs, classify the categories of the objects \
in the RoIs and improve localizations.
Each stage is carried out by one of the callable
:class:`chainer.Chain` objects :obj:`feature`, :obj:`rpn` and :obj:`head`.
There are two functions :meth:`predict` and :meth:`__call__` to conduct
object detection.
:meth:`predict` takes images and returns bounding boxes that are converted
to image coordinates. This will be useful for a scenario when
Faster R-CNN is treated as a black box function, for instance.
    :meth:`__call__` is provided for a scenario when intermediate outputs
    are needed, for instance, for training and debugging.
    Links that support the object detection API have a method :meth:`predict` with
the same interface. Please refer to :meth:`predict` for
further details.
.. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
Faster R-CNN: Towards Real-Time Object Detection with \
Region Proposal Networks. NIPS 2015.
Args:
extractor (callable Chain): A callable that takes a BCHW image
array and returns feature maps.
rpn (callable Chain): A callable that has the same interface as
:class:`~chainercv.links.model.faster_rcnn.RegionProposalNetwork`.
Please refer to the documentation found there.
        head (callable Chain): A callable that takes
            a BCHW array, RoIs and batch indices for RoIs. This returns class
            dependent localization parameters and class scores.
        mean (numpy.ndarray): A value to be subtracted from an image
            in :meth:`prepare`.
        min_size (int): A preprocessing parameter for :meth:`prepare`. Please
            refer to a docstring found for :meth:`prepare`.
        max_size (int): A preprocessing parameter for :meth:`prepare`.
        loc_normalize_mean (tuple of four floats): Mean values of
            localization estimates.
        loc_normalize_std (tuple of four floats): Standard deviation
            of localization estimates.
"""
    def __init__(
            self, extractor, rpn, head,
            mean=None, min_size=600, max_size=1000,
            loc_normalize_mean=(0., 0., 0., 0.),
            loc_normalize_std=(0.1, 0.1, 0.2, 0.2),
    ):
        super(FasterRCNN, self).__init__()
        self.extractor = extractor
        self.rpn = rpn
        self.head = head
        # mean, min_size and max_size are consumed by :meth:`prepare`.
        self.mean = mean
        self.min_size = min_size
        self.max_size = max_size
        self.loc_normalize_mean = loc_normalize_mean
        self.loc_normalize_std = loc_normalize_std
        self.use_preset('visualize')
@property
def n_class(self):
# Total number of classes including the background.
return self.head.n_class
def __call__(self, x, scale=1.):
"""Forward Faster R-CNN.
Scaling paramter :obj:`scale` is used by RPN to determine the
threshold to select small objects, which are going to be
rejected irrespective of their confidence scores.
Here are notations used.
        * :math:`N` is the batch size
* :math:`R'` is the total number of RoIs produced across batches. \
Given :math:`R_i` proposed RoIs from the :math:`i` th image, \
:math:`R' = \\sum _{i=1} ^ N R_i`.
* :math:`L` is the number of classes excluding the background.
Classes are ordered by the background, the first class, ..., and
the :math:`L` th class.
Args:
x (~chainer.Variable): 4D image variable.
scale (float): Amount of scaling applied to the raw image
during preprocessing.
Returns:
Variable, Variable, array, array:
Returns tuple of four values listed below.
* **roi_cls_locs**: Offsets and scalings for the proposed RoIs. \
Its shape is :math:`(R', (L + 1) \\times 4)`.
* **roi_scores**: Class predictions for the proposed RoIs. \
Its shape is :math:`(R', L + 1)`.
* **rois**: RoIs proposed by RPN. Its shape is \
:math:`(R', 4)`.
* **roi_indices**: Batch indices of RoIs. Its shape is \
:math:`(R',)`.
"""
img_size = x.shape[2:]
h = self.extractor(x)
rpn_locs, rpn_scores, rois, roi_indices, anchor =\
self.rpn(h, img_size, scale)
roi_cls_locs, roi_scores = self.head(
h, rois, roi_indices)
return roi_cls_locs, roi_scores, rois, roi_indices
def use_preset(self, preset):
"""Use the given preset during prediction.
This method changes values of :obj:`self.nms_thresh` and
:obj:`self.score_thresh`. These values are a threshold value
used for non maximum suppression and a threshold value
to discard low confidence proposals in :meth:`predict`,
respectively.
If the attributes need to be changed to something
other than the values provided in the presets, please modify
them by directly accessing the public attributes.
Args:
            preset ({'visualize', 'evaluate'}): A string to determine the
preset to use.
"""
if preset == 'visualize':
self.nms_thresh = 0.3
self.score_thresh = 0.7
elif preset == 'evaluate':
self.nms_thresh = 0.3
self.score_thresh = 0.05
else:
raise ValueError('preset must be visualize or evaluate')
def prepare(self, img):
"""Preprocess an image for feature extraction.
The length of the shorter edge is scaled to :obj:`self.min_size`.
After the scaling, if the length of the longer edge is longer than
:obj:`self.max_size`, the image is scaled to fit the longer edge
to :obj:`self.max_size`.
After resizing the image, the image is subtracted by a mean image value
:obj:`self.mean`.
Args:
img (~numpy.ndarray): An image. This is in CHW and RGB format.
The range of its value is :math:`[0, 255]`.
Returns:
~numpy.ndarray:
A preprocessed image.
"""
        _, H, W = img.shape
        scale = self.min_size / min(H, W)
if scale * max(H, W) > self.max_size:
scale = self.max_size / max(H, W)
img = resize(img, (int(H * scale), int(W * scale)))
img = (img - self.mean).astype(np.float32, copy=False)
return img
def _suppress(self, raw_cls_bbox, raw_prob):
bbox = list()
label = list()
score = list()
# skip cls_id = 0 because it is the background class
for l in range(1, self.n_class):
cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :]
prob_l = raw_prob[:, l]
mask = prob_l > self.score_thresh
cls_bbox_l = cls_bbox_l[mask]
prob_l = prob_l[mask]
keep = non_maximum_suppression(
cls_bbox_l, self.nms_thresh, prob_l)
bbox.append(cls_bbox_l[keep])
# The labels are in [0, self.n_class - 2].
label.append((l - 1) * np.ones((len(keep),)))
score.append(prob_l[keep])
bbox = np.concatenate(bbox, axis=0).astype(np.float32)
label = np.concatenate(label, axis=0).astype(np.int32)
score = np.concatenate(score, axis=0).astype(np.float32)
return bbox, label, score
def predict(self, imgs):
"""Detect objects from images.
This method predicts objects for each image.
Args:
imgs (iterable of numpy.ndarray): Arrays holding images.
All images are in CHW and RGB format
and the range of their value is :math:`[0, 255]`.
Returns:
tuple of lists:
This method returns a tuple of three lists,
:obj:`(bboxes, labels, scores)`.
* **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
            where :math:`R` is the number of bounding boxes in an image. \
            Each bounding box is organized by \
:math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
in the second axis.
* **labels** : A list of integer arrays of shape :math:`(R,)`. \
Each value indicates the class of the bounding box. \
Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
number of the foreground classes.
* **scores** : A list of float arrays of shape :math:`(R,)`. \
Each value indicates how confident the prediction is.
"""
prepared_imgs = list()
sizes = list()
for img in imgs:
size = img.shape[1:]
img = self.prepare(img.astype(np.float32))
prepared_imgs.append(img)
sizes.append(size)
bboxes = list()
labels = list()
scores = list()
for img, size in zip(prepared_imgs, sizes):
with chainer.using_config('train', False), \
chainer.function.no_backprop_mode():
img_var = chainer.Variable(self.xp.asarray(img[None]))
scale = img_var.shape[3] / size[1]
roi_cls_locs, roi_scores, rois, _ = self.__call__(
img_var, scale=scale)
# We are assuming that batch size is 1.
roi_cls_loc = roi_cls_locs.array
roi_score = roi_scores.array
roi = rois / scale
# Convert predictions to bounding boxes in image coordinates.
# Bounding boxes are scaled to the scale of the input images.
mean = self.xp.tile(self.xp.asarray(self.loc_normalize_mean),
self.n_class)
std = self.xp.tile(self.xp.asarray(self.loc_normalize_std),
self.n_class)
roi_cls_loc = (roi_cls_loc * std + mean).astype(np.float32)
roi_cls_loc = roi_cls_loc.reshape((-1, self.n_class, 4))
roi = self.xp.broadcast_to(roi[:, None], roi_cls_loc.shape)
cls_bbox = loc2bbox(roi.reshape((-1, 4)),
roi_cls_loc.reshape((-1, 4)))
cls_bbox = cls_bbox.reshape((-1, self.n_class * 4))
# clip bounding box
cls_bbox[:, 0::2] = self.xp.clip(cls_bbox[:, 0::2], 0, size[0])
cls_bbox[:, 1::2] = self.xp.clip(cls_bbox[:, 1::2], 0, size[1])
prob = F.softmax(roi_score).array
raw_cls_bbox = cuda.to_cpu(cls_bbox)
raw_prob = cuda.to_cpu(prob)
bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
bboxes.append(bbox)
labels.append(label)
scores.append(score)
return bboxes, labels, scores
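# `_suppress` above delegates to chainercv's non_maximum_suppression. As an
# illustration only (a hedged pure-numpy sketch, not the library code),
# greedy IoU-based NMS over (ymin, xmin, ymax, xmax) boxes looks like this:
def _nms_sketch(bbox, thresh, score):
    # Visit boxes in descending score order; keep a box, then drop every
    # remaining box whose IoU with it exceeds `thresh`.
    order = score.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # Intersection of box i with every remaining box.
        tl = np.maximum(bbox[i, :2], bbox[order[1:], :2])
        br = np.minimum(bbox[i, 2:], bbox[order[1:], 2:])
        inter = np.prod(np.maximum(br - tl, 0.), axis=1)
        area_i = np.prod(bbox[i, 2:] - bbox[i, :2])
        areas = np.prod(bbox[order[1:], 2:] - bbox[order[1:], :2], axis=1)
        iou = inter / (area_i + areas - inter)
        order = order[1:][iou <= thresh]
    return np.array(keep, dtype=np.int32)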
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L
from chainercv.links.model.faster_rcnn.faster_rcnn import FasterRCNN
from chainercv.links.model.faster_rcnn.region_proposal_network import \
RegionProposalNetwork
from chainercv.links.model.vgg.vgg16 import VGG16
from chainercv.utils import download_model
from torch import nn
import torch as t
from torchvision.models import vgg16
class FasterRCNNVGG16(FasterRCNN):
"""Faster R-CNN based on VGG-16.
When you specify the path of a pre-trained chainer model serialized as
a :obj:`.npz` file in the constructor, this chain model automatically
initializes all the parameters with it.
When a string in prespecified set is provided, a pretrained model is
loaded from weights distributed on the Internet.
The list of pretrained models supported are as follows:
* :obj:`voc07`: Loads weights trained with the trainval split of \
PASCAL VOC2007 Detection Dataset.
    * :obj:`imagenet`: Loads weights trained with ImageNet Classification \
task for the feature extractor and the head modules. \
Weights that do not have a corresponding layer in VGG-16 \
will be randomly initialized.
For descriptions on the interface of this model, please refer to
:class:`~chainercv.links.model.faster_rcnn.FasterRCNN`.
:class:`~chainercv.links.model.faster_rcnn.FasterRCNNVGG16`
supports finer control on random initializations of weights by arguments
:obj:`vgg_initialW`, :obj:`rpn_initialW`, :obj:`loc_initialW` and
:obj:`score_initialW`.
It accepts a callable that takes an array and edits its values.
If :obj:`None` is passed as an initializer, the default initializer is
used.
Args:
n_fg_class (int): The number of classes excluding the background.
pretrained_model (str): The destination of the pre-trained
chainer model serialized as a :obj:`.npz` file.
If this is one of the strings described
above, it automatically loads weights stored under a directory
:obj:`$CHAINER_DATASET_ROOT/pfnet/chainercv/models/`,
where :obj:`$CHAINER_DATASET_ROOT` is set as
:obj:`$HOME/.chainer/dataset` unless you specify another value
by modifying the environment variable.
        min_size (int): A preprocessing parameter for :meth:`prepare`.
        max_size (int): A preprocessing parameter for :meth:`prepare`.
ratios (list of floats): This is ratios of width to height of
the anchors.
anchor_scales (list of numbers): This is areas of anchors.
Those areas will be the product of the square of an element in
:obj:`anchor_scales` and the original area of the reference
window.
vgg_initialW (callable): Initializer for the layers corresponding to
the VGG-16 layers.
rpn_initialW (callable): Initializer for Region Proposal Network
layers.
loc_initialW (callable): Initializer for the localization head.
score_initialW (callable): Initializer for the score head.
        proposal_creator_params (dict): Key-value parameters for
:class:`~chainercv.links.model.faster_rcnn.ProposalCreator`.
"""
_models = {
'voc07': {
'n_fg_class': 20,
'url': 'https://github.com/yuyu2172/share-weights/releases/'
'download/0.0.4/'
'faster_rcnn_vgg16_voc07_trained_2017_08_06.npz'
},
'voc0712': {
'n_fg_class': 20,
'url': 'https://github.com/yuyu2172/share-weights/releases/'
'download/0.0.4/faster_rcnn_vgg16_voc0712_trained_2017_07_21.npz'
},
}
feat_stride = 16
def __init__(self,
n_fg_class=None,
pretrained_model=None,
min_size=600, max_size=1000,
ratios=[0.5, 1, 2], anchor_scales=[8, 16, 32],
vgg_initialW=None, rpn_initialW=None,
loc_initialW=None, score_initialW=None,
proposal_creator_params=dict()
):
if n_fg_class is None:
if pretrained_model not in self._models:
raise ValueError(
'The n_fg_class needs to be supplied as an argument')
n_fg_class = self._models[pretrained_model]['n_fg_class']
        # `classifier` is not wired in yet; VGG16RoIHead builds its own fc layers.
        extractor, classifier = decom_vgg16()
rpn = RegionProposalNetwork(
512, 512,
ratios=ratios,
anchor_scales=anchor_scales,
feat_stride=self.feat_stride,
initialW=rpn_initialW,
proposal_creator_params=proposal_creator_params,
)
head = VGG16RoIHead(
n_fg_class + 1,
roi_size=7, spatial_scale=1. / self.feat_stride,
vgg_initialW=vgg_initialW,
loc_initialW=loc_initialW,
score_initialW=score_initialW
)
super(FasterRCNNVGG16, self).__init__(
extractor,
rpn,
head,
mean=np.array([122.7717, 115.9465, 102.9801],
dtype=np.float32)[:, None, None],
min_size=min_size,
max_size=max_size
)
def decom_vgg16(pretrained=True):
    # The 30th layer of `features` is the ReLU after conv5_3.
    model = vgg16(pretrained)
    features = list(model.features)[:30]
    classifier = model.classifier
    # Drop the final 1000-way classification layer; keep fc6/fc7.
    del classifier._modules['6']
    # nn.Sequential takes modules as positional arguments, hence the unpacking.
    return nn.Sequential(*features), classifier
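# Hedged sanity check (downloads torchvision's VGG-16 weights, so it is left
# as a comment): the truncated `features` stack downsamples by
# feat_stride = 16, e.g. a 1x3x600x800 input gives a 1x512x37x50 map.
#
#   extractor, classifier = decom_vgg16()
#   feat = extractor(t.zeros(1, 3, 600, 800))
#   # feat.shape == (1, 512, 37, 50)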
class VGG16RoIHead(nn.Module):
"""Faster R-CNN Head for VGG-16 based implementation.
This class is used as a head for Faster R-CNN.
This outputs class-wise localizations and classification based on feature
maps in the given RoIs.
Args:
n_class (int): The number of classes possibly including the background.
roi_size (int): Height and width of the feature maps after RoI-pooling.
        spatial_scale (float): Scale by which RoI coordinates are resized
            onto the feature map (typically 1 / feat_stride).
vgg_initialW (callable): Initializer for the layers corresponding to
the VGG-16 layers.
loc_initialW (callable): Initializer for the localization head.
score_initialW (callable): Initializer for the score head.
"""
def __init__(self, n_class, roi_size, spatial_scale,
vgg_initialW=None, loc_initialW=None, score_initialW=None):
# n_class includes the background
super(VGG16RoIHead, self).__init__()
        # NOTE: the initialization here was changed by me to use the default
        # initializers; torch's nn.Linear takes no chainer-style initialW.
        self.fc6 = nn.Linear(25088, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.cls_loc = nn.Linear(4096, n_class * 4)
        self.score = nn.Linear(4096, n_class)
self.n_class = n_class
self.roi_size = roi_size
self.spatial_scale = spatial_scale
def __call__(self, x, rois, roi_indices):
"""Forward the chain.
We assume that there are :math:`N` batches.
Args:
x (Variable): 4D image variable.
rois (Tensor): A bounding box array containing coordinates of
proposal boxes. This is a concatenation of bounding box
arrays from multiple images in the batch.
Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed
RoIs from the :math:`i` th image,
:math:`R' = \\sum _{i=1} ^ N R_i`.
roi_indices (Tensor): An array containing indices of images to
which bounding boxes correspond to. Its shape is :math:`(R',)`.
"""
        roi_indices = roi_indices.float()
        # t.cat takes a sequence of tensors as its first argument.
        indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)
        ### TODO: implement roi_pooling
pool = _roi_pooling_2d_yx(
x, indices_and_rois, self.roi_size, self.roi_size,
self.spatial_scale)
fc6 = F.relu(self.fc6(pool))
fc7 = F.relu(self.fc7(fc6))
roi_cls_locs = self.cls_loc(fc7)
roi_scores = self.score(fc7)
return roi_cls_locs, roi_scores
def _roi_pooling_2d_yx(x, indices_and_rois, outh, outw, spatial_scale):
    # (index, y_min, x_min, y_max, x_max) -> (index, x_min, y_min, x_max, y_max)
    xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
pool = F.roi_pooling_2d(
x, xy_indices_and_rois, outh, outw, spatial_scale)
return pool
from collections import namedtuple
from string import Template
import chainer.functions as F
import cupy
import torch
import cupy as cp
import torch as t
from cupy.cuda import function
from torch.autograd import Function
from roi_cupy import kernel_backward, kernel_forward
Stream = namedtuple('Stream', ['ptr'])
@cupy.util.memoize(for_each_device=True)
def load_kernel(kernel_name, code, **kwargs):
code = Template(code).substitute(**kwargs)
kernel_code = cupy.cuda.compile_with_cache(code)
return kernel_code.get_function(kernel_name)
CUDA_NUM_THREADS = 1024
def GET_BLOCKS(N, K=CUDA_NUM_THREADS):
return (N + K - 1) // K
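# Example: with K = 1024 threads per block, GET_BLOCKS(2500) == 3; the
# ceiling division guarantees every element gets a thread.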
class ROI(Function):
"""
    NOTE: CUDA-only; there is no CPU fallback.
"""
def __init__(self,outh,outw,spatial_scale):
self.forward_fn = load_kernel('roi_forward',kernel_forward)
self.backward_fn = load_kernel('roi_backward',kernel_backward)
self.outh,self.outw,self.spatial_scale = outh,outw,spatial_scale
    def forward(self, x, rois):
        self.in_size = B, C, H, W = x.size()
        N = rois.size(0)
        output = t.zeros(N, C, self.outh, self.outw).cuda()
        self.argmax_data = t.zeros(N, C, self.outh, self.outw).int().cuda()
        self.rois = rois
        args = [x.data_ptr(), rois.data_ptr(),
                output.data_ptr(),
                self.argmax_data.data_ptr(),
                self.spatial_scale, C, H, W,
                self.outh, self.outw,
                output.numel()]
        stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        self.forward_fn(args=args,
                        block=(CUDA_NUM_THREADS, 1, 1),
                        # one thread per output element
                        grid=(GET_BLOCKS(output.numel()), 1, 1),
                        stream=stream)
        return output
    def backward(self, grad_output):
        # Recover the input shape and RoI count saved in forward.
        B, C, H, W = self.in_size
        N = self.rois.size(0)
        grad_input = t.zeros(self.in_size).cuda()
        stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        args = [grad_output.data_ptr(),
                self.argmax_data.data_ptr(),
                self.rois.data_ptr(),
                grad_input.data_ptr(),
                N, self.spatial_scale, C, H, W,
                self.outh, self.outw,
                grad_input.numel()]
        self.backward_fn(args=args,
                         block=(CUDA_NUM_THREADS, 1, 1),
                         grid=(GET_BLOCKS(grad_input.numel()), 1, 1),
                         stream=stream)
        return grad_input, None
class ROIPooling2D(t.nn.Module):
def __init__(self, outh,outw,spatial_scale):
super(ROIPooling2D, self).__init__()
self.ROI = ROI(outh,outw,spatial_scale)
def forward(self,x,rois):
return self.ROI(x,rois)
def test_roi_module():
## fake data###
B,N,C,H,W,PH,PW = 2,8,4,32,32,7,7
bottom_data = t.randn(B,C,H,W).cuda()
bottom_rois = t.randn(N,5)
bottom_rois[:int(N/2),0]=0
bottom_rois[int(N/2):,0]=1
bottom_rois[:,1:] = (t.rand(N,4)*100).float()
bottom_rois = bottom_rois.cuda()
spatial_scale = 1./16
outh,outw = PH,PW
# pytorch version
module = ROIPooling2D(outh,outw,spatial_scale)
x = t.autograd.Variable(bottom_data,requires_grad=True)
rois = t.autograd.Variable(bottom_rois)
output = module(x,rois)
output.sum().backward()
grad_x = x.grad.cpu().data.numpy()
def t2c(variable):
npa = variable.data.cpu().numpy()
return cp.array(npa)
    def test_eq(variable, array, info):
        cc = cp.asnumpy(array.data)
        neq = (cc != variable.data.cpu().numpy())
        assert neq.sum() == 0, 'test failed: %s' % info
    # chainer version
    import chainer.functions as F
    from chainer import Variable
    x_cn = Variable(t2c(x))
    o_cn = F.roi_pooling_2d(x_cn, t2c(rois), outh, outw, spatial_scale)
test_eq(output,o_cn,'forward')
F.sum(o_cn).backward()
test_eq(x.grad, x_cn.grad,'backward')
print('test pass')
from collections import namedtuple
from string import Template
import chainer.functions as F
import cupy
import cupy as cp
import torch
import torch as t
from pynvrtc.compiler import Program
Stream = namedtuple('Stream', ['ptr'])
def Dtype(tensor):
    if isinstance(tensor, torch.cuda.FloatTensor):
        return 'float'
    elif isinstance(tensor, torch.cuda.DoubleTensor):
        return 'double'
@cupy.util.memoize(for_each_device=True)
def load_kernel(kernel_name, code, **kwargs):
code = Template(code).substitute(**kwargs)
kernel_code = cupy.cuda.compile_with_cache(code)
return kernel_code.get_function(kernel_name)
CUDA_NUM_THREADS = 1024
def GET_BLOCKS(N, K=CUDA_NUM_THREADS):
return (N + K - 1) // K
forward_kernel = '''
extern "C"
__global__ void roi_forward(const float* const bottom_data,const float* const bottom_rois,
float* top_data, int* argmax_data,
const double spatial_scale,const int channels,const int height,
const int width, const int pooled_height,
const int pooled_width,const int NN
){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
//printf("%d,%d,%d,%d ", blockIdx.x, blockDim.x,threadIdx.x,i);
//printf("%d-" ,NN);
    // valid linear indices are 0 .. NN-1
    if (idx >= NN)
        return;
const int pw = idx % pooled_width;
const int ph = (idx / pooled_width) % pooled_height;
const int c = (idx / pooled_width / pooled_height) % channels;
int num = idx / pooled_width / pooled_height / channels;
const int roi_batch_ind = bottom_rois[num * 5 + 0];
const int roi_start_w = round(bottom_rois[num * 5 + 1] * spatial_scale);
const int roi_start_h = round(bottom_rois[num * 5 + 2] * spatial_scale);
const int roi_end_w = round(bottom_rois[num * 5 + 3] * spatial_scale);
const int roi_end_h = round(bottom_rois[num * 5 + 4] * spatial_scale);
//printf("-%f-",spatial_scale);
//printf("%f,%f,%d,%d,%d ",bottom_rois[num * 5 + 3],bottom_rois[num * 5 + 2] * spatial_scale,round(bottom_rois[num * 5 + 3] * spatial_scale),num,num*5+3);
//printf("-%d,%d,%d,%d- ",roi_start_w,roi_start_h,roi_end_w,roi_end_h);
// Force malformed ROIs to be 1x1
const int roi_width = max(roi_end_w - roi_start_w + 1, 1);
const int roi_height = max(roi_end_h - roi_start_h + 1, 1);
const float bin_size_h = static_cast<float>(roi_height)
/ static_cast<float>(pooled_height);
const float bin_size_w = static_cast<float>(roi_width)
/ static_cast<float>(pooled_width);
int hstart = static_cast<int>(floor(static_cast<float>(ph)
* bin_size_h));
int wstart = static_cast<int>(floor(static_cast<float>(pw)
* bin_size_w));
int hend = static_cast<int>(ceil(static_cast<float>(ph + 1)
* bin_size_h));
int wend = static_cast<int>(ceil(static_cast<float>(pw + 1)
* bin_size_w));
// Add roi offsets and clip to input boundaries
hstart = min(max(hstart + roi_start_h, 0), height);
hend = min(max(hend + roi_start_h, 0), height);
wstart = min(max(wstart + roi_start_w, 0), width);
wend = min(max(wend + roi_start_w, 0), width);
bool is_empty = (hend <= hstart) || (wend <= wstart);
// Define an empty pooling region to be zero
float maxval = is_empty ? 0 : -1E+37;
// If nothing is pooled, argmax=-1 causes nothing to be backprop'd
int maxidx = -1;
const int data_offset = (roi_batch_ind * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int bottom_index = h * width + w;
if (bottom_data[data_offset + bottom_index] > maxval) {
maxval = bottom_data[data_offset + bottom_index];
maxidx = bottom_index;
}
}
}
top_data[idx]=maxval;
argmax_data[idx]=maxidx;
//printf("%d,%d,%d,%d ",pw,ph,num,c);
//printf("%d,%d,%f,%f ",wstart-wend,roi_width,bin_size_h,roi_start_h);
//printf("%d,%d,%d,%d ",roi_start_w,roi_start_h,roi_end_w,roi_end_h);
// }
}'''
backward_kernel = '''
extern "C"
__global__ void roi_backward(const float* const top_diff, const int* const argmax_data,
    const int num_rois, const double spatial_scale,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width,
    const float* const bottom_rois, float* bottom_diff,
    const int NN)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= NN)
        return;
    int w = idx % width;
    int h = (idx / width) % height;
    int c = (idx / (width * height)) % channels;
    int num = idx / (width * height * channels);
float gradient = 0;
// Accumulate gradient over all ROIs that pooled this element
for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
// Skip if ROI's batch index doesn't match num
if (num != static_cast<int>(bottom_rois[roi_n * 5])) {
continue;
}
int roi_start_w = round(bottom_rois[roi_n * 5 + 1]
* spatial_scale);
int roi_start_h = round(bottom_rois[roi_n * 5 + 2]
* spatial_scale);
int roi_end_w = round(bottom_rois[roi_n * 5 + 3]
* spatial_scale);
int roi_end_h = round(bottom_rois[roi_n * 5 + 4]
* spatial_scale);
// Skip if ROI doesn't include (h, w)
const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
h >= roi_start_h && h <= roi_end_h);
if (!in_roi) {
continue;
}
int offset = (roi_n * channels + c) * pooled_height
* pooled_width;
// Compute feasible set of pooled units that could have pooled
// this bottom unit
// Force malformed ROIs to be 1x1
int roi_width = max(roi_end_w - roi_start_w + 1, 1);
int roi_height = max(roi_end_h - roi_start_h + 1, 1);
float bin_size_h = static_cast<float>(roi_height)
/ static_cast<float>(pooled_height);
float bin_size_w = static_cast<float>(roi_width)
/ static_cast<float>(pooled_width);
int phstart = floor(static_cast<float>(h - roi_start_h)
/ bin_size_h);
int phend = ceil(static_cast<float>(h - roi_start_h + 1)
/ bin_size_h);
int pwstart = floor(static_cast<float>(w - roi_start_w)
/ bin_size_w);
int pwend = ceil(static_cast<float>(w - roi_start_w + 1)
/ bin_size_w);
phstart = min(max(phstart, 0), pooled_height);
phend = min(max(phend, 0), pooled_height);
pwstart = min(max(pwstart, 0), pooled_width);
pwend = min(max(pwend, 0), pooled_width);
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
int index_ = ph * pooled_width + pw + offset;
if (argmax_data[index_] == (h * width + w)) {
gradient += top_diff[index_];
}
}
}
}
bottom_diff[idx] = gradient;
}'''
cupy.cuda.runtime.free(0)
f_b = load_kernel('roi_backward',backward_kernel)
f=load_kernel('roi_forward',forward_kernel)
B,N,C,H,W,PH,PW = 2,8,4,32,32,7,7
bottom_data = t.randn(B,C,H,W).cuda()
bottom_rois = t.randn(N,5)
bottom_rois[:int(N/2),0]=0
bottom_rois[int(N/2):,0]=1
bottom_rois[:,1:] = (t.rand(N,4)*100).float()
bottom_rois = bottom_rois.cuda()
top_data = t.zeros(N,C,PH,PW).cuda()
argmax_data = t.zeros(N,C,PH,PW).cuda().int()
spatial_scale = 1./16
channels,height,width,pooled_height,pooled_width =\
C,H,W,PH,PW
bottom_diff = bottom_data.new(bottom_data.size()).fill_(0)
top_diff = top_data.new(top_data.size()).fill_(0)
# NOTE: a Python float is really a C double, so spatial_scale is passed
# through unchanged.
f(args=[bottom_data.data_ptr(), bottom_rois.data_ptr(),
        top_data.data_ptr(), argmax_data.data_ptr(),
        spatial_scale, C, H, W, PH, PW, top_data.numel()],
  block=(CUDA_NUM_THREADS, 1, 1),
  grid=(GET_BLOCKS(top_data.numel()), 1, 1),
  stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
x=cp.array(bottom_data.cpu().numpy())
rois=cp.array(bottom_rois.cpu().numpy())
outh=PH
outw =PW
# cp_result = F.roi_pooling_2d(x, rois, outh, outw, spatial_scale)
cproi = F.ROIPooling2D(outh, outw, spatial_scale)
cp_result2=cproi.forward_gpu((x,rois))
aa = cp.asnumpy(cp_result2[0])
bb = top_data.cpu().numpy()
neq = (aa!=bb).sum()
assert neq==0,'output failed'
bb=argmax_data.cpu().numpy()
aa= cp.asnumpy(cproi.argmax_data)
neq = (aa!=bb).sum()
assert neq==0,'argmax failed'
import os
# `_dataset_root` is module-level state of chainer's dataset module, set via
# `set_dataset_root`; this excerpt assumes it is defined alongside.
def get_dataset_directory(dataset_name, create_directory=True):
"""Gets the path to the directory of given dataset.
The generated path is just a concatenation of the global root directory
(see :func:`set_dataset_root` for how to change it) and the dataset name.
The dataset name can contain slashes, which are treated as path separators.
Args:
dataset_name (str): Name of the dataset.
create_directory (bool): If True (default), this function also creates
the directory at the first time. If the directory already exists,
then this option is ignored.
Returns:
str: Path to the dataset directory.
"""
path = os.path.join(_dataset_root, dataset_name)
if create_directory:
try:
os.makedirs(path)
except OSError:
if not os.path.isdir(path):
raise
return path
import numpy as np
from PIL import Image
def read_image(path, dtype=np.float32, color=True):
"""Read an image from a file.
This function reads an image from given file. The image is CHW format and
the range of its value is :math:`[0, 255]`. If :obj:`color = True`, the
order of the channels is RGB.
Args:
path (str): A path of image file.
dtype: The type of array. The default value is :obj:`~numpy.float32`.
color (bool): This option determines the number of channels.
If :obj:`True`, the number of channels is three. In this case,
the order of the channels is RGB. This is the default behaviour.
If :obj:`False`, this function returns a grayscale image.
Returns:
~numpy.ndarray: An image.
"""
f = Image.open(path)
try:
if color:
img = f.convert('RGB')
else:
img = f.convert('P')
img = np.asarray(img, dtype=dtype)
finally:
if hasattr(f, 'close'):
f.close()
if img.ndim == 2:
# reshape (H, W) -> (1, H, W)
return img[np.newaxis]
else:
# transpose (H, W, C) -> (C, H, W)
return img.transpose((2, 0, 1))
import numpy as np
def vis_image(img, ax=None):
"""Visualize a color image.
Args:
img (~numpy.ndarray): An array of shape :math:`(3, height, width)`.
This is in RGB format and the range of its value is
:math:`[0, 255]`.
ax (matplotlib.axes.Axis): The visualization is displayed on this
axis. If this is :obj:`None` (default), a new axis is created.
Returns:
        ~matplotlib.axes.Axes:
Returns the Axes object with the plot for further tweaking.
"""
from matplotlib import pyplot as plot
if ax is None:
fig = plot.figure()
ax = fig.add_subplot(1, 1, 1)
# CHW -> HWC
img = img.transpose((1, 2, 0))
ax.imshow(img.astype(np.uint8))
return ax
def vis_bbox(img, bbox, label=None, score=None, label_names=None, ax=None):
"""Visualize bounding boxes inside image.
Example:
>>> from chainercv.datasets import VOCDetectionDataset
>>> from chainercv.datasets import voc_bbox_label_names
>>> from chainercv.visualizations import vis_bbox
>>> import matplotlib.pyplot as plot
>>> dataset = VOCDetectionDataset()
>>> img, bbox, label = dataset[60]
>>> vis_bbox(img, bbox, label,
... label_names=voc_bbox_label_names)
>>> plot.show()
Args:
img (~numpy.ndarray): An array of shape :math:`(3, height, width)`.
This is in RGB format and the range of its value is
:math:`[0, 255]`.
bbox (~numpy.ndarray): An array of shape :math:`(R, 4)`, where
:math:`R` is the number of bounding boxes in the image.
Each element is organized
by :math:`(y_{min}, x_{min}, y_{max}, x_{max})` in the second axis.
label (~numpy.ndarray): An integer array of shape :math:`(R,)`.
The values correspond to id for label names stored in
:obj:`label_names`. This is optional.
score (~numpy.ndarray): A float array of shape :math:`(R,)`.
Each value indicates how confident the prediction is.
This is optional.
label_names (iterable of strings): Name of labels ordered according
to label ids. If this is :obj:`None`, labels will be skipped.
ax (matplotlib.axes.Axis): The visualization is displayed on this
axis. If this is :obj:`None` (default), a new axis is created.
Returns:
        ~matplotlib.axes.Axes:
Returns the Axes object with the plot for further tweaking.
"""
from matplotlib import pyplot as plot
if label is not None and not len(bbox) == len(label):
raise ValueError('The length of label must be same as that of bbox')
if score is not None and not len(bbox) == len(score):
raise ValueError('The length of score must be same as that of bbox')
# Returns newly instantiated matplotlib.axes.Axes object if ax is None
ax = vis_image(img, ax=ax)
# If there is no bounding box to display, visualize the image and exit.
if len(bbox) == 0:
return ax
for i, bb in enumerate(bbox):
xy = (bb[1], bb[0])
height = bb[2] - bb[0]
width = bb[3] - bb[1]
ax.add_patch(plot.Rectangle(
xy, width, height, fill=False, edgecolor='red', linewidth=3))
caption = list()
if label is not None and label_names is not None:
lb = label[i]
if not (0 <= lb < len(label_names)):
raise ValueError('No corresponding name is given')
caption.append(label_names[lb])
if score is not None:
sc = score[i]
caption.append('{:.2f}'.format(sc))
if len(caption) > 0:
ax.text(bb[1], bb[0],
': '.join(caption),
style='italic',
bbox={'facecolor': 'white', 'alpha': 0.7, 'pad': 10})
return ax