Commit b4942bdd authored by chenyuntc

init

class Config:
voc_data_dir = '/mnt/3/VOC/VOCdevkit/VOC2007/'
min_size = 600
max_size = 1000
opt = Config()
import torch as t
from .voc_dataset import VOCBboxDataset
from skimage import transform as sktsf
from torchvision import transforms as tvtsf
from . import util
def preprocess(img, min_size=600, max_size=1000):
    """Preprocess an image for feature extraction.
    The length of the shorter edge is scaled to :obj:`min_size`.
    After the scaling, if the length of the longer edge is longer than
    :obj:`max_size`, the image is scaled to fit the longer edge
    to :obj:`max_size`.
    After resizing, the image is normalized with the ImageNet mean and
    standard deviation (the torchvision convention).
Args:
img (~numpy.ndarray): An image. This is in CHW and RGB format.
The range of its value is :math:`[0, 255]`.
Returns:
~numpy.ndarray:
A preprocessed image.
"""
C, H, W = img.shape
scale1 = min_size / min(H, W)
scale2 = max_size / max(H, W)
scale = min(scale1, scale2)
    # In short: the shorter edge is scaled toward min_size, and the longer
    # edge must not exceed max_size.
    # img = resize(img, (int(H * scale), int(W * scale)))
    img = img / 255.
    img = sktsf.resize(img, (C, H * scale, W * scale), mode='reflect')
normalize = tvtsf.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
img = normalize(t.from_numpy(img))
return img.numpy()
# NOTE: why did the original normalization only subtract the mean,
# without dividing by the standard deviation??
# mean = np.array([122.7717, 115.9465, 102.9801])
# img = (img - self.mean).astype(np.float32, copy=False)
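# A quick sanity check of the scaling rule above (a hedged sketch, not part
# of the original file; assumes `import numpy as np`): the shorter edge is
# scaled toward min_size unless that would push the longer edge past max_size.
#
#   >>> x = np.zeros((3, 500, 750), dtype=np.float32)
#   >>> preprocess(x).shape   # scale = min(600/500, 1000/750) = 1.2
#   (3, 600, 900)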
class Transform(object):
def __init__(self, min_size=600,max_size=1000):
self.min_size = min_size
self.max_size = max_size
def __call__(self, in_data):
img, bbox, label = in_data
_, H, W = img.shape
img = preprocess(img, self.min_size, self.max_size)
_, o_H, o_W = img.shape
scale = o_H / H
bbox = util.resize_bbox(bbox, (H, W), (o_H, o_W))
# horizontally flip
img, params = util.random_flip(
img, x_random=True, return_param=True)
bbox = util.flip_bbox(
bbox, (o_H, o_W), x_flip=params['x_flip'])
return img, bbox, label, scale
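# Hedged usage sketch with synthetic data (not part of the original file):
# the boxes are rescaled by the same factor as the image and possibly
# flipped together with it.
#
#   tsf = Transform(min_size=600, max_size=1000)
#   img = np.zeros((3, 300, 400), dtype=np.float32)
#   bbox = np.array([[10., 20., 100., 200.]], dtype=np.float32)
#   label = np.array([0], dtype=np.int32)
#   img, bbox, label, scale = tsf((img, bbox, label))
#   # scale == 2.0, since min(600 / 300, 1000 / 400) = 2.0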
class Dataset():
def __init__(self, opt):
self.opt = opt
self.db = VOCBboxDataset(opt.voc_data_dir)
self.tsf = Transform(opt.min_size,opt.max_size)
def __getitem__(self, idx):
img, bbox, label, difficult = self.db.get_example(idx)
img, bbox, label, scale = self.tsf((img, bbox, label))
return img, bbox, label, scale
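# Hedged usage sketch (requires the VOC data at opt.voc_data_dir, so it is
# left as a comment):
#
#   dataset = Dataset(opt)
#   img, bbox, label, scale = dataset[0]
#   # img is CHW and normalized; bbox is (R, 4) in (ymin, xmin, ymax, xmax)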
import numpy as np
from PIL import Image
import random
def read_image(path, dtype=np.float32, color=True):
"""Read an image from a file.
This function reads an image from given file. The image is CHW format and
the range of its value is :math:`[0, 255]`. If :obj:`color = True`, the
order of the channels is RGB.
Args:
path (str): A path of image file.
dtype: The type of array. The default value is :obj:`~numpy.float32`.
color (bool): This option determines the number of channels.
If :obj:`True`, the number of channels is three. In this case,
the order of the channels is RGB. This is the default behaviour.
If :obj:`False`, this function returns a grayscale image.
Returns:
~numpy.ndarray: An image.
"""
f = Image.open(path)
try:
if color:
img = f.convert('RGB')
else:
            # 'L' is PIL's 8-bit grayscale mode ('P' would return palette indices)
            img = f.convert('L')
img = np.asarray(img, dtype=dtype)
finally:
if hasattr(f, 'close'):
f.close()
if img.ndim == 2:
# reshape (H, W) -> (1, H, W)
return img[np.newaxis]
else:
# transpose (H, W, C) -> (C, H, W)
return img.transpose((2, 0, 1))
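# Hedged example (the path is hypothetical): a color JPEG comes back as a
# float32 CHW array in RGB order.
#
#   img = read_image('/path/to/example.jpg')
#   # img.shape == (3, H, W), img.dtype == np.float32, values in [0, 255]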
def resize_bbox(bbox, in_size, out_size):
"""Resize bounding boxes according to image resize.
The bounding boxes are expected to be packed into a two dimensional
tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
bounding boxes in the image. The second axis represents attributes of
the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
where the four attributes are coordinates of the top left and the
bottom right vertices.
Args:
bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`.
:math:`R` is the number of bounding boxes.
in_size (tuple): A tuple of length 2. The height and the width
of the image before resized.
out_size (tuple): A tuple of length 2. The height and the width
of the image after resized.
Returns:
~numpy.ndarray:
Bounding boxes rescaled according to the given image shapes.
"""
bbox = bbox.copy()
y_scale = float(out_size[0]) / in_size[0]
x_scale = float(out_size[1]) / in_size[1]
bbox[:, 0] = y_scale * bbox[:, 0]
bbox[:, 2] = y_scale * bbox[:, 2]
bbox[:, 1] = x_scale * bbox[:, 1]
bbox[:, 3] = x_scale * bbox[:, 3]
return bbox
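# A hedged worked example (not in the original file): resizing a 100x200
# image to 200x100 scales y by 2 and x by 0.5.
def _example_resize_bbox():
    bb = np.array([[10., 20., 30., 40.]])
    out = resize_bbox(bb, (100, 200), (200, 100))
    assert np.allclose(out, [[20., 10., 60., 20.]])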
def flip_bbox(bbox, size, y_flip=False, x_flip=False):
"""Flip bounding boxes accordingly.
The bounding boxes are expected to be packed into a two dimensional
tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
bounding boxes in the image. The second axis represents attributes of
the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
where the four attributes are coordinates of the top left and the
bottom right vertices.
Args:
bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`.
:math:`R` is the number of bounding boxes.
size (tuple): A tuple of length 2. The height and the width
of the image before resized.
y_flip (bool): Flip bounding box according to a vertical flip of
an image.
x_flip (bool): Flip bounding box according to a horizontal flip of
an image.
Returns:
~numpy.ndarray:
Bounding boxes flipped according to the given flips.
"""
H, W = size
bbox = bbox.copy()
if y_flip:
y_max = H - bbox[:, 0]
y_min = H - bbox[:, 2]
bbox[:, 0] = y_min
bbox[:, 2] = y_max
if x_flip:
x_max = W - bbox[:, 1]
x_min = W - bbox[:, 3]
bbox[:, 1] = x_min
bbox[:, 3] = x_max
return bbox
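# A hedged worked example: with W = 100 and x_flip=True, a box spanning
# x in [10, 30] maps to [70, 90] (x_min' = W - x_max, x_max' = W - x_min).
def _example_flip_bbox():
    bb = np.array([[0., 10., 50., 30.]])
    out = flip_bbox(bb, (100, 100), x_flip=True)
    assert np.allclose(out, [[0., 70., 50., 90.]])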
def crop_bbox(
bbox, y_slice=None, x_slice=None,
allow_outside_center=True, return_param=False):
"""Translate bounding boxes to fit within the cropped area of an image.
This method is mainly used together with image cropping.
This method translates the coordinates of bounding boxes like
:func:`~chainercv.transforms.translate_bbox`. In addition,
this function truncates the bounding boxes to fit within the cropped area.
If a bounding box does not overlap with the cropped area,
this bounding box will be removed.
The bounding boxes are expected to be packed into a two dimensional
tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
bounding boxes in the image. The second axis represents attributes of
the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
where the four attributes are coordinates of the top left and the
bottom right vertices.
Args:
bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is
:math:`(R, 4)`. :math:`R` is the number of bounding boxes.
y_slice (slice): The slice of y axis.
x_slice (slice): The slice of x axis.
allow_outside_center (bool): If this argument is :obj:`False`,
bounding boxes whose centers are outside of the cropped area
are removed. The default value is :obj:`True`.
return_param (bool): If :obj:`True`, this function returns
indices of kept bounding boxes.
Returns:
~numpy.ndarray or (~numpy.ndarray, dict):
If :obj:`return_param = False`, returns an array :obj:`bbox`.
If :obj:`return_param = True`,
returns a tuple whose elements are :obj:`bbox, param`.
:obj:`param` is a dictionary of intermediate parameters whose
contents are listed below with key, value-type and the description
of the value.
* **index** (*numpy.ndarray*): An array holding indices of used \
bounding boxes.
"""
t, b = _slice_to_bounds(y_slice)
l, r = _slice_to_bounds(x_slice)
crop_bb = np.array((t, l, b, r))
if allow_outside_center:
mask = np.ones(bbox.shape[0], dtype=bool)
else:
center = (bbox[:, :2] + bbox[:, 2:]) / 2
mask = np.logical_and(crop_bb[:2] <= center, center < crop_bb[2:]) \
.all(axis=1)
bbox = bbox.copy()
bbox[:, :2] = np.maximum(bbox[:, :2], crop_bb[:2])
bbox[:, 2:] = np.minimum(bbox[:, 2:], crop_bb[2:])
bbox[:, :2] -= crop_bb[:2]
bbox[:, 2:] -= crop_bb[:2]
mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:]).all(axis=1))
bbox = bbox[mask]
if return_param:
return bbox, {'index': np.flatnonzero(mask)}
else:
return bbox
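# A hedged worked example: cropping to y in [0, 50), x in [0, 60) truncates
# the first box to the crop window and drops the second, which no longer
# overlaps it.
def _example_crop_bbox():
    bb = np.array([[10., 10., 80., 80.],
                   [60., 70., 90., 95.]])
    out, param = crop_bbox(bb, y_slice=slice(0, 50), x_slice=slice(0, 60),
                           return_param=True)
    assert np.allclose(out, [[10., 10., 50., 60.]])
    assert param['index'].tolist() == [0]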
def _slice_to_bounds(slice_):
if slice_ is None:
return 0, np.inf
if slice_.start is None:
l = 0
else:
l = slice_.start
if slice_.stop is None:
u = np.inf
else:
u = slice_.stop
return l, u
def translate_bbox(bbox, y_offset=0, x_offset=0):
"""Translate bounding boxes.
This method is mainly used together with image transforms, such as padding
and cropping, which translates the left top point of the image from
coordinate :math:`(0, 0)` to coordinate
:math:`(y, x) = (y_{offset}, x_{offset})`.
The bounding boxes are expected to be packed into a two dimensional
tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
bounding boxes in the image. The second axis represents attributes of
the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
where the four attributes are coordinates of the top left and the
bottom right vertices.
Args:
bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is
:math:`(R, 4)`. :math:`R` is the number of bounding boxes.
y_offset (int or float): The offset along y axis.
x_offset (int or float): The offset along x axis.
Returns:
~numpy.ndarray:
Bounding boxes translated according to the given offsets.
"""
out_bbox = bbox.copy()
out_bbox[:, :2] += (y_offset, x_offset)
out_bbox[:, 2:] += (y_offset, x_offset)
return out_bbox
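# A hedged worked example: padding an image with 5 pixels on top and 8 on
# the left shifts every box by (y_offset, x_offset) = (5, 8).
def _example_translate_bbox():
    bb = np.array([[0., 0., 10., 10.]])
    assert np.allclose(translate_bbox(bb, y_offset=5, x_offset=8),
                       [[5., 8., 15., 18.]])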
def random_flip(img, y_random=False, x_random=False,
return_param=False, copy=False):
"""Randomly flip an image in vertical or horizontal direction.
Args:
img (~numpy.ndarray): An array that gets flipped. This is in
CHW format.
y_random (bool): Randomly flip in vertical direction.
x_random (bool): Randomly flip in horizontal direction.
return_param (bool): Returns information of flip.
copy (bool): If False, a view of :obj:`img` will be returned.
Returns:
~numpy.ndarray or (~numpy.ndarray, dict):
If :obj:`return_param = False`,
returns an array :obj:`out_img` that is the result of flipping.
If :obj:`return_param = True`,
returns a tuple whose elements are :obj:`out_img, param`.
:obj:`param` is a dictionary of intermediate parameters whose
contents are listed below with key, value-type and the description
of the value.
* **y_flip** (*bool*): Whether the image was flipped in the\
vertical direction or not.
* **x_flip** (*bool*): Whether the image was flipped in the\
horizontal direction or not.
"""
y_flip, x_flip = False, False
if y_random:
y_flip = random.choice([True, False])
if x_random:
x_flip = random.choice([True, False])
if y_flip:
img = img[:, ::-1, :]
if x_flip:
img = img[:, :, ::-1]
if copy:
img = img.copy()
if return_param:
return img, {'y_flip': y_flip, 'x_flip': x_flip}
else:
return img
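# Hedged usage sketch: seed `random` for a reproducible flip decision; the
# result is a view of the input unless copy=True.
def _example_random_flip():
    random.seed(0)
    img = np.arange(24).reshape(2, 3, 4)
    out, param = random_flip(img, x_random=True, return_param=True)
    if param['x_flip']:
        assert (out == img[:, :, ::-1]).all()
    else:
        assert (out == img).all()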
# Modified work:
# Original works by:
# --------------------------------------------------------
# Faster R-CNN implementation In ChainerCV
# Copyright (c) 2017 Preferred Networks, Inc.
# Licensed under The MIT License [see LICENSE for details]
# https://github.com/chainer/chainercv
# --------------------------------------------------------
# Faster R-CNN implementation by Chainer
# Copyright (c) 2016 Shunta Saito
# Licensed under The MIT License [see LICENSE for details]
# https://github.com/mitmul/chainer-faster-rcnn
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# https://github.com/rbgirshick/py-faster-rcnn
# ----------------------------------------------------
import numpy as np
import os
import warnings
import xml.etree.ElementTree as ET
from .util import read_image
class VOCBboxDataset():
"""Bounding box dataset for PASCAL `VOC`_.
.. _`VOC`: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/
The index corresponds to each image.
When queried by an index, if :obj:`return_difficult == False`,
this dataset returns a corresponding
:obj:`img, bbox, label`, a tuple of an image, bounding boxes and labels.
This is the default behaviour.
If :obj:`return_difficult == True`, this dataset returns corresponding
:obj:`img, bbox, label, difficult`. :obj:`difficult` is a boolean array
that indicates whether bounding boxes are labeled as difficult or not.
The bounding boxes are packed into a two dimensional tensor of shape
:math:`(R, 4)`, where :math:`R` is the number of bounding boxes in
the image. The second axis represents attributes of the bounding box.
They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, where the
four attributes are coordinates of the top left and the bottom right
vertices.
The labels are packed into a one dimensional tensor of shape :math:`(R,)`.
:math:`R` is the number of bounding boxes in the image.
    The class name of the label :math:`l` is the :math:`l`-th element of
:obj:`chainercv.datasets.voc_bbox_label_names`.
The array :obj:`difficult` is a one dimensional boolean array of shape
:math:`(R,)`. :math:`R` is the number of bounding boxes in the image.
If :obj:`use_difficult` is :obj:`False`, this array is
a boolean array with all :obj:`False`.
The type of the image, the bounding boxes and the labels are as follows.
* :obj:`img.dtype == numpy.float32`
* :obj:`bbox.dtype == numpy.float32`
* :obj:`label.dtype == numpy.int32`
    * :obj:`difficult.dtype == numpy.bool_`
Args:
data_dir (string): Path to the root of the training data.
i.e. "/data/image/voc/VOCdevkit/VOC2007/"
split ({'train', 'val', 'trainval', 'test'}): Select a split of the
dataset. :obj:`test` split is only available for
2007 dataset.
year ({'2007', '2012'}): Use a dataset prepared for a challenge
held in :obj:`year`.
use_difficult (bool): If :obj:`True`, use images that are labeled as
difficult in the original annotation.
return_difficult (bool): If :obj:`True`, this dataset returns
a boolean array
that indicates whether bounding boxes are labeled as difficult
or not. The default value is :obj:`False`.
"""
    def __init__(self, data_dir, split='train', year='2012',
                 use_difficult=False, return_difficult=False,
                 transforms=None):
        if split not in ['train', 'trainval', 'val']:
            if not (split == 'test' and year == '2007'):
                warnings.warn(
                    'please pick split from \'train\', \'trainval\', \'val\' '
                    'for 2012 dataset. For 2007 dataset, you can pick \'test\' '
                    'in addition to the above mentioned splits.'
                )
id_list_file = os.path.join(
data_dir, 'ImageSets/Main/{0}.txt'.format(split))
self.ids = [id_.strip() for id_ in open(id_list_file)]
self.data_dir = data_dir
self.use_difficult = use_difficult
self.return_difficult = return_difficult
def __len__(self):
return len(self.ids)
def get_example(self, i):
"""Returns the i-th example.
Returns a color image and bounding boxes. The image is in CHW format.
The returned image is RGB.
Args:
i (int): The index of the example.
Returns:
tuple of an image and bounding boxes
"""
id_ = self.ids[i]
anno = ET.parse(
os.path.join(self.data_dir, 'Annotations', id_ + '.xml'))
bbox = list()
label = list()
difficult = list()
for obj in anno.findall('object'):
            # when not using the difficult split and the object is
            # difficult, skip it.
            if not self.use_difficult and int(obj.find('difficult').text) == 1:
continue
difficult.append(int(obj.find('difficult').text))
bndbox_anno = obj.find('bndbox')
# subtract 1 to make pixel indexes 0-based
bbox.append([
int(bndbox_anno.find(tag).text) - 1
for tag in ('ymin', 'xmin', 'ymax', 'xmax')])
name = obj.find('name').text.lower().strip()
label.append(VOC_BBOX_LABEL_NAMES.index(name))
bbox = np.stack(bbox).astype(np.float32)
label = np.stack(label).astype(np.int32)
        # When `use_difficult == False`, all elements in `difficult` are False.
        difficult = np.array(difficult, dtype=bool)
        # Load the image.
        img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg')
img = read_image(img_file, color=True)
        # if self.return_difficult:
        #     return img, bbox, label, difficult
        return img, bbox, label, difficult
def get_voc(root, year, split):
base_path = os.path.join(root, 'VOCdevkit/VOC{}'.format(year))
split_file = os.path.join(base_path, 'ImageSets/Main/{}.txt'.format(split))
if os.path.exists(split_file):
return base_path
else:
raise FileNotFoundError("VOC Data Not Downloaded")
VOC_BBOX_LABEL_NAMES = (
'aeroplane',
'bicycle',
'bird',
'boat',
'bottle',
'bus',
'car',
'cat',
'chair',
'cow',
'diningtable',
'dog',
'horse',
'motorbike',
'person',
'pottedplant',
'sheep',
'sofa',
'train',
'tvmonitor')
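# Example: a label id is a position in this tuple, e.g.
# VOC_BBOX_LABEL_NAMES.index('dog') == 11 and
# VOC_BBOX_LABEL_NAMES[14] == 'person'.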
# Modified work:
# --------------------------------------------------------
# Copyright (c) 2017 Preferred Networks, Inc.
# --------------------------------------------------------
#
# Original works by:
# --------------------------------------------------------
# Faster R-CNN implementation by Chainer
# Copyright (c) 2016 Shunta Saito
# Licensed under The MIT License [see LICENSE for details]
# https://github.com/mitmul/chainer-faster-rcnn
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# https://github.com/rbgirshick/py-faster-rcnn
# --------------------------------------------------------
from __future__ import division
import numpy as np
import chainer
from chainer import cuda
import chainer.functions as F
from chainercv.links.model.faster_rcnn.utils.loc2bbox import loc2bbox
from chainercv.utils import non_maximum_suppression
from chainercv.transforms.image.resize import resize
from torch import nn
class FasterRCNN(nn.Module):
"""Base class for Faster R-CNN.
This is a base class for Faster R-CNN links supporting object detection
API [#]_. The following three stages constitute Faster R-CNN.
1. **Feature extraction**: Images are taken and their \
feature maps are calculated.
2. **Region Proposal Networks**: Given the feature maps calculated in \
the previous stage, produce set of RoIs around objects.
3. **Localization and Classification Heads**: Using feature maps that \
belong to the proposed RoIs, classify the categories of the objects \
in the RoIs and improve localizations.
Each stage is carried out by one of the callable
:class:`chainer.Chain` objects :obj:`feature`, :obj:`rpn` and :obj:`head`.
There are two functions :meth:`predict` and :meth:`__call__` to conduct
object detection.
:meth:`predict` takes images and returns bounding boxes that are converted
to image coordinates. This will be useful for a scenario when
Faster R-CNN is treated as a black box function, for instance.
    :meth:`__call__` is provided for a scenario when intermediate outputs
    are needed, for instance, for training and debugging.
    Links that support the object detection API have a method :meth:`predict` with
the same interface. Please refer to :meth:`predict` for
further details.
.. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
Faster R-CNN: Towards Real-Time Object Detection with \
Region Proposal Networks. NIPS 2015.
Args:
extractor (callable Chain): A callable that takes a BCHW image
array and returns feature maps.
rpn (callable Chain): A callable that has the same interface as
:class:`~chainercv.links.model.faster_rcnn.RegionProposalNetwork`.
Please refer to the documentation found there.
        head (callable Chain): A callable that takes
            a BCHW array, RoIs and batch indices for RoIs. This returns class
            dependent localization parameters and class scores.
        mean (numpy.ndarray): A value to be subtracted from an image
            in :meth:`prepare`.
        min_size (int): A preprocessing parameter for :meth:`prepare`. Please
            refer to a docstring found for :meth:`prepare`.
        max_size (int): A preprocessing parameter for :meth:`prepare`.
        loc_normalize_mean (tuple of four floats): Mean values of
            localization estimates.
        loc_normalize_std (tuple of four floats): Standard deviation
            of localization estimates.
"""
    def __init__(
            self, extractor, rpn, head,
            mean=None, min_size=600, max_size=1000,
            loc_normalize_mean=(0., 0., 0., 0.),
            loc_normalize_std=(0.1, 0.1, 0.2, 0.2),
    ):
        super(FasterRCNN, self).__init__()
        self.extractor = extractor
        self.rpn = rpn
        self.head = head
        # mean, min_size and max_size are consumed by :meth:`prepare`.
        self.mean = mean
        self.min_size = min_size
        self.max_size = max_size
        self.loc_normalize_mean = loc_normalize_mean
        self.loc_normalize_std = loc_normalize_std
        self.use_preset('visualize')
@property
def n_class(self):
# Total number of classes including the background.
return self.head.n_class
def __call__(self, x, scale=1.):
"""Forward Faster R-CNN.
Scaling paramter :obj:`scale` is used by RPN to determine the
threshold to select small objects, which are going to be
rejected irrespective of their confidence scores.
Here are notations used.
        * :math:`N` is the batch size
* :math:`R'` is the total number of RoIs produced across batches. \
Given :math:`R_i` proposed RoIs from the :math:`i` th image, \
:math:`R' = \\sum _{i=1} ^ N R_i`.
* :math:`L` is the number of classes excluding the background.
Classes are ordered by the background, the first class, ..., and
the :math:`L` th class.
Args:
x (~chainer.Variable): 4D image variable.
scale (float): Amount of scaling applied to the raw image
during preprocessing.
Returns:
Variable, Variable, array, array:
Returns tuple of four values listed below.
* **roi_cls_locs**: Offsets and scalings for the proposed RoIs. \
Its shape is :math:`(R', (L + 1) \\times 4)`.
* **roi_scores**: Class predictions for the proposed RoIs. \
Its shape is :math:`(R', L + 1)`.
* **rois**: RoIs proposed by RPN. Its shape is \
:math:`(R', 4)`.
* **roi_indices**: Batch indices of RoIs. Its shape is \
:math:`(R',)`.
"""
img_size = x.shape[2:]
h = self.extractor(x)
rpn_locs, rpn_scores, rois, roi_indices, anchor =\
self.rpn(h, img_size, scale)
roi_cls_locs, roi_scores = self.head(
h, rois, roi_indices)
return roi_cls_locs, roi_scores, rois, roi_indices
def use_preset(self, preset):
"""Use the given preset during prediction.
This method changes values of :obj:`self.nms_thresh` and
:obj:`self.score_thresh`. These values are a threshold value
used for non maximum suppression and a threshold value
to discard low confidence proposals in :meth:`predict`,
respectively.
If the attributes need to be changed to something
other than the values provided in the presets, please modify
them by directly accessing the public attributes.
Args:
            preset ({'visualize', 'evaluate'}): A string to determine the
preset to use.
"""
if preset == 'visualize':
self.nms_thresh = 0.3
self.score_thresh = 0.7
elif preset == 'evaluate':
self.nms_thresh = 0.3
self.score_thresh = 0.05
else:
raise ValueError('preset must be visualize or evaluate')
def prepare(self, img):
"""Preprocess an image for feature extraction.
The length of the shorter edge is scaled to :obj:`self.min_size`.
After the scaling, if the length of the longer edge is longer than
:obj:`self.max_size`, the image is scaled to fit the longer edge
to :obj:`self.max_size`.
After resizing the image, the image is subtracted by a mean image value
:obj:`self.mean`.
Args:
img (~numpy.ndarray): An image. This is in CHW and RGB format.
The range of its value is :math:`[0, 255]`.
Returns:
~numpy.ndarray:
A preprocessed image.
"""
        _, H, W = img.shape
        scale = self.min_size / min(H, W)
if scale * max(H, W) > self.max_size:
scale = self.max_size / max(H, W)
img = resize(img, (int(H * scale), int(W * scale)))
img = (img - self.mean).astype(np.float32, copy=False)
return img
def _suppress(self, raw_cls_bbox, raw_prob):
bbox = list()
label = list()
score = list()
# skip cls_id = 0 because it is the background class
for l in range(1, self.n_class):
cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :]
prob_l = raw_prob[:, l]
mask = prob_l > self.score_thresh
cls_bbox_l = cls_bbox_l[mask]
prob_l = prob_l[mask]
keep = non_maximum_suppression(
cls_bbox_l, self.nms_thresh, prob_l)
bbox.append(cls_bbox_l[keep])
# The labels are in [0, self.n_class - 2].
label.append((l - 1) * np.ones((len(keep),)))
score.append(prob_l[keep])
bbox = np.concatenate(bbox, axis=0).astype(np.float32)
label = np.concatenate(label, axis=0).astype(np.int32)
score = np.concatenate(score, axis=0).astype(np.float32)
return bbox, label, score
def predict(self, imgs):
"""Detect objects from images.
This method predicts objects for each image.
Args:
imgs (iterable of numpy.ndarray): Arrays holding images.
All images are in CHW and RGB format
and the range of their value is :math:`[0, 255]`.
Returns:
tuple of lists:
This method returns a tuple of three lists,
:obj:`(bboxes, labels, scores)`.
* **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
            where :math:`R` is the number of bounding boxes in an image. \
            Each bounding box is organized by \
:math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
in the second axis.
* **labels** : A list of integer arrays of shape :math:`(R,)`. \
Each value indicates the class of the bounding box. \
Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
number of the foreground classes.
* **scores** : A list of float arrays of shape :math:`(R,)`. \
Each value indicates how confident the prediction is.
"""
prepared_imgs = list()
sizes = list()
for img in imgs:
size = img.shape[1:]
img = self.prepare(img.astype(np.float32))
prepared_imgs.append(img)
sizes.append(size)
bboxes = list()
labels = list()
scores = list()
for img, size in zip(prepared_imgs, sizes):
with chainer.using_config('train', False), \
chainer.function.no_backprop_mode():
img_var = chainer.Variable(self.xp.asarray(img[None]))
scale = img_var.shape[3] / size[1]
roi_cls_locs, roi_scores, rois, _ = self.__call__(
img_var, scale=scale)
# We are assuming that batch size is 1.
roi_cls_loc = roi_cls_locs.array
roi_score = roi_scores.array
roi = rois / scale
# Convert predictions to bounding boxes in image coordinates.
# Bounding boxes are scaled to the scale of the input images.
mean = self.xp.tile(self.xp.asarray(self.loc_normalize_mean),
self.n_class)
std = self.xp.tile(self.xp.asarray(self.loc_normalize_std),
self.n_class)
roi_cls_loc = (roi_cls_loc * std + mean).astype(np.float32)
roi_cls_loc = roi_cls_loc.reshape((-1, self.n_class, 4))
roi = self.xp.broadcast_to(roi[:, None], roi_cls_loc.shape)
cls_bbox = loc2bbox(roi.reshape((-1, 4)),
roi_cls_loc.reshape((-1, 4)))
cls_bbox = cls_bbox.reshape((-1, self.n_class * 4))
# clip bounding box
cls_bbox[:, 0::2] = self.xp.clip(cls_bbox[:, 0::2], 0, size[0])
cls_bbox[:, 1::2] = self.xp.clip(cls_bbox[:, 1::2], 0, size[1])
prob = F.softmax(roi_score).array
raw_cls_bbox = cuda.to_cpu(cls_bbox)
raw_prob = cuda.to_cpu(prob)
bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
bboxes.append(bbox)
labels.append(label)
scores.append(score)
return bboxes, labels, scores
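# `_suppress` above delegates to chainercv's non_maximum_suppression. As an
# illustration only (a hedged pure-numpy sketch, not the library code),
# greedy IoU-based NMS over (ymin, xmin, ymax, xmax) boxes looks like this:
def _nms_sketch(bbox, thresh, score):
    # Visit boxes in descending score order; keep a box, then drop every
    # remaining box whose IoU with it exceeds `thresh`.
    order = score.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # Intersection of box i with every remaining box.
        tl = np.maximum(bbox[i, :2], bbox[order[1:], :2])
        br = np.minimum(bbox[i, 2:], bbox[order[1:], 2:])
        inter = np.prod(np.maximum(br - tl, 0.), axis=1)
        area_i = np.prod(bbox[i, 2:] - bbox[i, :2])
        areas = np.prod(bbox[order[1:], 2:] - bbox[order[1:], :2], axis=1)
        iou = inter / (area_i + areas - inter)
        order = order[1:][iou <= thresh]
    return np.array(keep, dtype=np.int32)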
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L
from chainercv.links.model.faster_rcnn.faster_rcnn import FasterRCNN
from chainercv.links.model.faster_rcnn.region_proposal_network import \
RegionProposalNetwork
from chainercv.links.model.vgg.vgg16 import VGG16
from chainercv.utils import download_model
from torch import nn
import torch as t
from torchvision.models import vgg16
class FasterRCNNVGG16(FasterRCNN):
"""Faster R-CNN based on VGG-16.
When you specify the path of a pre-trained chainer model serialized as
a :obj:`.npz` file in the constructor, this chain model automatically
initializes all the parameters with it.
When a string in prespecified set is provided, a pretrained model is
loaded from weights distributed on the Internet.
The list of pretrained models supported are as follows:
* :obj:`voc07`: Loads weights trained with the trainval split of \
PASCAL VOC2007 Detection Dataset.
    * :obj:`imagenet`: Loads weights trained with ImageNet Classification \
task for the feature extractor and the head modules. \
Weights that do not have a corresponding layer in VGG-16 \
will be randomly initialized.
For descriptions on the interface of this model, please refer to
:class:`~chainercv.links.model.faster_rcnn.FasterRCNN`.
:class:`~chainercv.links.model.faster_rcnn.FasterRCNNVGG16`
supports finer control on random initializations of weights by arguments
:obj:`vgg_initialW`, :obj:`rpn_initialW`, :obj:`loc_initialW` and
:obj:`score_initialW`.
It accepts a callable that takes an array and edits its values.
If :obj:`None` is passed as an initializer, the default initializer is
used.
Args:
n_fg_class (int): The number of classes excluding the background.
pretrained_model (str): The destination of the pre-trained
chainer model serialized as a :obj:`.npz` file.
If this is one of the strings described
above, it automatically loads weights stored under a directory
:obj:`$CHAINER_DATASET_ROOT/pfnet/chainercv/models/`,
where :obj:`$CHAINER_DATASET_ROOT` is set as
:obj:`$HOME/.chainer/dataset` unless you specify another value
by modifying the environment variable.
        min_size (int): A preprocessing parameter for :meth:`prepare`.
        max_size (int): A preprocessing parameter for :meth:`prepare`.
ratios (list of floats): This is ratios of width to height of
the anchors.
anchor_scales (list of numbers): This is areas of anchors.
Those areas will be the product of the square of an element in
:obj:`anchor_scales` and the original area of the reference
window.
vgg_initialW (callable): Initializer for the layers corresponding to
the VGG-16 layers.
rpn_initialW (callable): Initializer for Region Proposal Network
layers.
loc_initialW (callable): Initializer for the localization head.
score_initialW (callable): Initializer for the score head.
        proposal_creator_params (dict): Key-value parameters for
:class:`~chainercv.links.model.faster_rcnn.ProposalCreator`.
"""
_models = {
'voc07': {
'n_fg_class': 20,
'url': 'https://github.com/yuyu2172/share-weights/releases/'
'download/0.0.4/'
'faster_rcnn_vgg16_voc07_trained_2017_08_06.npz'
},
'voc0712': {
'n_fg_class': 20,
'url': 'https://github.com/yuyu2172/share-weights/releases/'
'download/0.0.4/faster_rcnn_vgg16_voc0712_trained_2017_07_21.npz'
},
}
feat_stride = 16
def __init__(self,
n_fg_class=None,
pretrained_model=None,
min_size=600, max_size=1000,
ratios=[0.5, 1, 2], anchor_scales=[8, 16, 32],
vgg_initialW=None, rpn_initialW=None,
loc_initialW=None, score_initialW=None,
proposal_creator_params=dict()
):
if n_fg_class is None:
if pretrained_model not in self._models:
raise ValueError(
'The n_fg_class needs to be supplied as an argument')
n_fg_class = self._models[pretrained_model]['n_fg_class']
        # `classifier` is not wired in yet; VGG16RoIHead builds its own fc layers.
        extractor, classifier = decom_vgg16()
rpn = RegionProposalNetwork(
512, 512,
ratios=ratios,
anchor_scales=anchor_scales,
feat_stride=self.feat_stride,
initialW=rpn_initialW,
proposal_creator_params=proposal_creator_params,
)
head = VGG16RoIHead(
n_fg_class + 1,
roi_size=7, spatial_scale=1. / self.feat_stride,
vgg_initialW=vgg_initialW,
loc_initialW=loc_initialW,
score_initialW=score_initialW
)
super(FasterRCNNVGG16, self).__init__(
extractor,
rpn,
head,
mean=np.array([122.7717, 115.9465, 102.9801],
dtype=np.float32)[:, None, None],
min_size=min_size,
max_size=max_size
)
def decom_vgg16(pretrained=True):
    # The 30th layer of `features` is the ReLU after conv5_3.
    model = vgg16(pretrained)
    features = list(model.features)[:30]
    classifier = model.classifier
    # Drop the final 1000-way classification layer; keep fc6/fc7.
    del classifier._modules['6']
    # nn.Sequential takes modules as positional arguments, hence the unpacking.
    return nn.Sequential(*features), classifier
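# Hedged sanity check (downloads torchvision's VGG-16 weights, so it is left
# as a comment): the truncated `features` stack downsamples by
# feat_stride = 16, e.g. a 1x3x600x800 input gives a 1x512x37x50 map.
#
#   extractor, classifier = decom_vgg16()
#   feat = extractor(t.zeros(1, 3, 600, 800))
#   # feat.shape == (1, 512, 37, 50)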
class VGG16RoIHead(nn.Module):
"""Faster R-CNN Head for VGG-16 based implementation.
This class is used as a head for Faster R-CNN.
This outputs class-wise localizations and classification based on feature
maps in the given RoIs.
Args:
n_class (int): The number of classes possibly including the background.
roi_size (int): Height and width of the feature maps after RoI-pooling.
        spatial_scale (float): Scale by which RoI coordinates are resized
            onto the feature map (typically 1 / feat_stride).
vgg_initialW (callable): Initializer for the layers corresponding to
the VGG-16 layers.
loc_initialW (callable): Initializer for the localization head.
score_initialW (callable): Initializer for the score head.
"""
def __init__(self, n_class, roi_size, spatial_scale,
vgg_initialW=None, loc_initialW=None, score_initialW=None):
# n_class includes the background
super(VGG16RoIHead, self).__init__()
        # NOTE: the initialization here was changed by me to use the default
        # initializers; torch's nn.Linear takes no chainer-style initialW.
        self.fc6 = nn.Linear(25088, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.cls_loc = nn.Linear(4096, n_class * 4)
        self.score = nn.Linear(4096, n_class)
self.n_class = n_class
self.roi_size = roi_size
self.spatial_scale = spatial_scale
def __call__(self, x, rois, roi_indices):
"""Forward the chain.
We assume that there are :math:`N` batches.
Args:
x (Variable): 4D image variable.
rois (Tensor): A bounding box array containing coordinates of
proposal boxes. This is a concatenation of bounding box
arrays from multiple images in the batch.
Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed
RoIs from the :math:`i` th image,
:math:`R' = \\sum _{i=1} ^ N R_i`.
roi_indices (Tensor): An array containing indices of images to
which bounding boxes correspond to. Its shape is :math:`(R',)`.
"""
        roi_indices = roi_indices.float()
        # t.cat takes a sequence of tensors as its first argument.
        indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)
        ### TODO: implement roi_pooling
pool = _roi_pooling_2d_yx(
x, indices_and_rois, self.roi_size, self.roi_size,
self.spatial_scale)
fc6 = F.relu(self.fc6(pool))
fc7 = F.relu(self.fc7(fc6))
roi_cls_locs = self.cls_loc(fc7)
roi_scores = self.score(fc7)
return roi_cls_locs, roi_scores
def _roi_pooling_2d_yx(x, indices_and_rois, outh, outw, spatial_scale):
    # (index, y_min, x_min, y_max, x_max) -> (index, x_min, y_min, x_max, y_max)
    xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
pool = F.roi_pooling_2d(
x, xy_indices_and_rois, outh, outw, spatial_scale)
return pool
from collections import namedtuple
from string import Template
import chainer.functions as F
import cupy
import torch
import cupy as cp
import torch as t
from cupy.cuda import function
from torch.autograd import Function
from roi_cupy import kernel_backward, kernel_forward
Stream = namedtuple('Stream', ['ptr'])
@cupy.util.memoize(for_each_device=True)
def load_kernel(kernel_name, code, **kwargs):
code = Template(code).substitute(**kwargs)
kernel_code = cupy.cuda.compile_with_cache(code)
return kernel_code.get_function(kernel_name)
CUDA_NUM_THREADS = 1024
def GET_BLOCKS(N, K=CUDA_NUM_THREADS):
return (N + K - 1) // K
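# Example: with K = 1024 threads per block, GET_BLOCKS(2500) == 3; the
# ceiling division guarantees every element gets a thread.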
class ROI(Function):
"""
    NOTE: CUDA-only; there is no CPU fallback.
"""
def __init__(self,outh,outw,spatial_scale):
self.forward_fn = load_kernel('roi_forward',kernel_forward)
self.backward_fn = load_kernel('roi_backward',kernel_backward)
self.outh,self.outw,self.spatial_scale = outh,outw,spatial_scale
    def forward(self, x, rois):
        self.in_size = B, C, H, W = x.size()
        N = rois.size(0)
        output = t.zeros(N, C, self.outh, self.outw).cuda()
        self.argmax_data = t.zeros(N, C, self.outh, self.outw).int().cuda()
        self.rois = rois
        args = [x.data_ptr(), rois.data_ptr(),
                output.data_ptr(),
                self.argmax_data.data_ptr(),
                self.spatial_scale, C, H, W,
                self.outh, self.outw,
                output.numel()]
        stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        self.forward_fn(args=args,
                        block=(CUDA_NUM_THREADS, 1, 1),
                        # one thread per output element
                        grid=(GET_BLOCKS(output.numel()), 1, 1),
                        stream=stream)
        return output
    def backward(self, grad_output):
        # Recover the input shape and RoI count saved in forward.
        B, C, H, W = self.in_size
        N = self.rois.size(0)
        grad_input = t.zeros(self.in_size).cuda()
        stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        args = [grad_output.data_ptr(),
                self.argmax_data.data_ptr(),
                self.rois.data_ptr(),
                grad_input.data_ptr(),
                N, self.spatial_scale, C, H, W,
                self.outh, self.outw,
                grad_input.numel()]
        self.backward_fn(args=args,
                         block=(CUDA_NUM_THREADS, 1, 1),
                         grid=(GET_BLOCKS(grad_input.numel()), 1, 1),
                         stream=stream)
        return grad_input, None
class ROIPooling2D(t.nn.Module):
def __init__(self, outh,outw,spatial_scale):
super(ROIPooling2D, self).__init__()
self.ROI = ROI(outh,outw,spatial_scale)
def forward(self,x,rois):
return self.ROI(x,rois)
def test_roi_module():
## fake data###
B,N,C,H,W,PH,PW = 2,8,4,32,32,7,7
bottom_data = t.randn(B,C,H,W).cuda()
bottom_rois = t.randn(N,5)
bottom_rois[:int(N/2),0]=0
bottom_rois[int(N/2):,0]=1
bottom_rois[:,1:] = (t.rand(N,4)*100).float()
bottom_rois = bottom_rois.cuda()
spatial_scale = 1./16
outh,outw = PH,PW
# pytorch version
module = ROIPooling2D(outh,outw,spatial_scale)
x = t.autograd.Variable(bottom_data,requires_grad=True)
rois = t.autograd.Variable(bottom_rois)
output = module(x,rois)
output.sum().backward()
grad_x = x.grad.cpu().data.numpy()
def t2c(variable):
npa = variable.data.cpu().numpy()
return cp.array(npa)
    def test_eq(variable, array, info):
        cc = cp.asnumpy(array.data)
        neq = (cc != variable.data.cpu().numpy())
        assert neq.sum() == 0, 'test failed: %s' % info
    # chainer version
    import chainer.functions as F
    from chainer import Variable
    x_cn = Variable(t2c(x))
    o_cn = F.roi_pooling_2d(x_cn, t2c(rois), outh, outw, spatial_scale)
test_eq(output,o_cn,'forward')
F.sum(o_cn).backward()
test_eq(x.grad, x_cn.grad,'backward')
print('test pass')
from collections import namedtuple
from string import Template
import chainer.functions as F
import cupy
import cupy as cp
import torch
import torch as t
from pynvrtc.compiler import Program
Stream = namedtuple('Stream', ['ptr'])
def Dtype(tensor):
    if isinstance(tensor, torch.cuda.FloatTensor):
        return 'float'
    elif isinstance(tensor, torch.cuda.DoubleTensor):
        return 'double'
@cupy.util.memoize(for_each_device=True)
def load_kernel(kernel_name, code, **kwargs):
code = Template(code).substitute(**kwargs)
kernel_code = cupy.cuda.compile_with_cache(code)
return kernel_code.get_function(kernel_name)
CUDA_NUM_THREADS = 1024
def GET_BLOCKS(N, K=CUDA_NUM_THREADS):
return (N + K - 1) // K
forward_kernel = '''
extern "C"
__global__ void roi_forward(const float* const bottom_data,const float* const bottom_rois,
float* top_data, int* argmax_data,
const double spatial_scale,const int channels,const int height,
const int width, const int pooled_height,
const int pooled_width,const int NN
){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
//printf("%d,%d,%d,%d ", blockIdx.x, blockDim.x,threadIdx.x,i);
//printf("%d-" ,NN);
    // valid linear indices are 0 .. NN-1
    if (idx >= NN)
        return;
const int pw = idx % pooled_width;
const int ph = (idx / pooled_width) % pooled_height;
const int c = (idx / pooled_width / pooled_height) % channels;
int num = idx / pooled_width / pooled_height / channels;
const int roi_batch_ind = bottom_rois[num * 5 + 0];
const int roi_start_w = round(bottom_rois[num * 5 + 1] * spatial_scale);
const int roi_start_h = round(bottom_rois[num * 5 + 2] * spatial_scale);
const int roi_end_w = round(bottom_rois[num * 5 + 3] * spatial_scale);
const int roi_end_h = round(bottom_rois[num * 5 + 4] * spatial_scale);
//printf("-%f-",spatial_scale);
//printf("%f,%f,%d,%d,%d ",bottom_rois[num * 5 + 3],bottom_rois[num * 5 + 2] * spatial_scale,round(bottom_rois[num * 5 + 3] * spatial_scale),num,num*5+3);
//printf("-%d,%d,%d,%d- ",roi_start_w,roi_start_h,roi_end_w,roi_end_h);
// Force malformed ROIs to be 1x1
const int roi_width = max(roi_end_w - roi_start_w + 1, 1);
const int roi_height = max(roi_end_h - roi_start_h + 1, 1);
const float bin_size_h = static_cast<float>(roi_height)
/ static_cast<float>(pooled_height);
const float bin_size_w = static_cast<float>(roi_width)
/ static_cast<float>(pooled_width);
int hstart = static_cast<int>(floor(static_cast<float>(ph)
* bin_size_h));
int wstart = static_cast<int>(floor(static_cast<float>(pw)
* bin_size_w));
int hend = static_cast<int>(ceil(static_cast<float>(ph + 1)
* bin_size_h));
int wend = static_cast<int>(ceil(static_cast<float>(pw + 1)
* bin_size_w));
// Add roi offsets and clip to input boundaries
hstart = min(max(hstart + roi_start_h, 0), height);
hend = min(max(hend + roi_start_h, 0), height);
wstart = min(max(wstart + roi_start_w, 0), width);
wend = min(max(wend + roi_start_w, 0), width);
bool is_empty = (hend <= hstart) || (wend <= wstart);
// Define an empty pooling region to be zero
float maxval = is_empty ? 0 : -1E+37;
// If nothing is pooled, argmax=-1 causes nothing to be backprop'd
int maxidx = -1;
const int data_offset = (roi_batch_ind * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int bottom_index = h * width + w;
if (bottom_data[data_offset + bottom_index] > maxval) {
maxval = bottom_data[data_offset + bottom_index];
maxidx = bottom_index;
}
}
}
top_data[idx]=maxval;
argmax_data[idx]=maxidx;
//printf("%d,%d,%d,%d ",pw,ph,num,c);
//printf("%d,%d,%f,%f ",wstart-wend,roi_width,bin_size_h,roi_start_h);
//printf("%d,%d,%d,%d ",roi_start_w,roi_start_h,roi_end_w,roi_end_h);
// }
}'''
backward_kernel = '''
extern "C"
__global__ void roi_backward(const float* const top_diff, const int* const argmax_data,
    const int num_rois, const double spatial_scale,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width,
    const float* const bottom_rois, float* bottom_diff,
    const int NN)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= NN)
        return;
    int w = idx % width;
    int h = (idx / width) % height;
    int c = (idx / (width * height)) % channels;
    int num = idx / (width * height * channels);
float gradient = 0;
// Accumulate gradient over all ROIs that pooled this element
for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
// Skip if ROI's batch index doesn't match num
if (num != static_cast<int>(bottom_rois[roi_n * 5])) {
continue;
}
int roi_start_w = round(bottom_rois[roi_n * 5 + 1]
* spatial_scale);
int roi_start_h = round(bottom_rois[roi_n * 5 + 2]
* spatial_scale);
int roi_end_w = round(bottom_rois[roi_n * 5 + 3]
* spatial_scale);
int roi_end_h = round(bottom_rois[roi_n * 5 + 4]
* spatial_scale);
// Skip if ROI doesn't include (h, w)
const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
h >= roi_start_h && h <= roi_end_h);
if (!in_roi) {
continue;
}
int offset = (roi_n * channels + c) * pooled_height
* pooled_width;
// Compute feasible set of pooled units that could have pooled
// this bottom unit
// Force malformed ROIs to be 1x1
int roi_width = max(roi_end_w - roi_start_w + 1, 1);
int roi_height = max(roi_end_h - roi_start_h + 1, 1);
float bin_size_h = static_cast<float>(roi_height)
/ static_cast<float>(pooled_height);
float bin_size_w = static_cast<float>(roi_width)
/ static_cast<float>(pooled_width);
int phstart = floor(static_cast<float>(h - roi_start_h)
/ bin_size_h);
int phend = ceil(static_cast<float>(h - roi_start_h + 1)
/ bin_size_h);
int pwstart = floor(static_cast<float>(w - roi_start_w)
/ bin_size_w);
int pwend = ceil(static_cast<float>(w - roi_start_w + 1)
/ bin_size_w);
phstart = min(max(phstart, 0), pooled_height);
phend = min(max(phend, 0), pooled_height);
pwstart = min(max(pwstart, 0), pooled_width);
pwend = min(max(pwend, 0), pooled_width);
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
int index_ = ph * pooled_width + pw + offset;
if (argmax_data[index_] == (h * width + w)) {
gradient += top_diff[index_];
}
}
}
}
bottom_diff[idx] = gradient;
}'''
cupy.cuda.runtime.free(0)
f_b = load_kernel('roi_backward',backward_kernel)
f=load_kernel('roi_forward',forward_kernel)
B,N,C,H,W,PH,PW = 2,8,4,32,32,7,7
bottom_data = t.randn(B,C,H,W).cuda()
bottom_rois = t.randn(N,5)
bottom_rois[:int(N/2),0]=0
bottom_rois[int(N/2):,0]=1
bottom_rois[:,1:] = (t.rand(N,4)*100).float()
bottom_rois = bottom_rois.cuda()
top_data = t.zeros(N,C,PH,PW).cuda()
argmax_data = t.zeros(N,C,PH,PW).cuda().int()
spatial_scale = 1./16
channels,height,width,pooled_height,pooled_width =\
C,H,W,PH,PW
bottom_diff = bottom_data.new(bottom_data.size()).fill_(0)
top_diff = top_data.new(top_data.size()).fill_(0)
# NOTE: a Python float is really a C double, so spatial_scale is passed
# through unchanged.
f(args=[bottom_data.data_ptr(), bottom_rois.data_ptr(),
        top_data.data_ptr(), argmax_data.data_ptr(),
        spatial_scale, C, H, W, PH, PW, top_data.numel()],
  block=(CUDA_NUM_THREADS, 1, 1),
  grid=(GET_BLOCKS(top_data.numel()), 1, 1),
  stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
x=cp.array(bottom_data.cpu().numpy())
rois=cp.array(bottom_rois.cpu().numpy())
outh=PH
outw =PW
# cp_result = F.roi_pooling_2d(x, rois, outh, outw, spatial_scale)
cproi = F.ROIPooling2D(outh, outw, spatial_scale)
cp_result2=cproi.forward_gpu((x,rois))
aa = cp.asnumpy(cp_result2[0])
bb = top_data.cpu().numpy()
neq = (aa!=bb).sum()
assert neq==0,'output failed'
bb=argmax_data.cpu().numpy()
aa= cp.asnumpy(cproi.argmax_data)
neq = (aa!=bb).sum()
assert neq==0,'argmax failed'
import os
# `_dataset_root` is module-level state of chainer's dataset module, set via
# `set_dataset_root`; this excerpt assumes it is defined alongside.
def get_dataset_directory(dataset_name, create_directory=True):
"""Gets the path to the directory of given dataset.
The generated path is just a concatenation of the global root directory
(see :func:`set_dataset_root` for how to change it) and the dataset name.
The dataset name can contain slashes, which are treated as path separators.
Args:
dataset_name (str): Name of the dataset.
create_directory (bool): If True (default), this function also creates
the directory at the first time. If the directory already exists,
then this option is ignored.
Returns:
str: Path to the dataset directory.
"""
path = os.path.join(_dataset_root, dataset_name)
if create_directory:
try:
os.makedirs(path)
except OSError:
if not os.path.isdir(path):
raise
return path
import numpy as np
from PIL import Image
def read_image(path, dtype=np.float32, color=True):
"""Read an image from a file.
This function reads an image from given file. The image is CHW format and
the range of its value is :math:`[0, 255]`. If :obj:`color = True`, the
order of the channels is RGB.
Args:
path (str): A path of image file.
dtype: The type of array. The default value is :obj:`~numpy.float32`.
color (bool): This option determines the number of channels.
If :obj:`True`, the number of channels is three. In this case,
the order of the channels is RGB. This is the default behaviour.
If :obj:`False`, this function returns a grayscale image.
Returns:
~numpy.ndarray: An image.
"""
f = Image.open(path)
try:
if color:
img = f.convert('RGB')
else:
img = f.convert('P')
img = np.asarray(img, dtype=dtype)
finally:
if hasattr(f, 'close'):
f.close()
if img.ndim == 2:
# reshape (H, W) -> (1, H, W)
return img[np.newaxis]
else:
# transpose (H, W, C) -> (C, H, W)
return img.transpose((2, 0, 1))
import numpy as np
def vis_image(img, ax=None):
"""Visualize a color image.
Args:
img (~numpy.ndarray): An array of shape :math:`(3, height, width)`.
This is in RGB format and the range of its value is
:math:`[0, 255]`.
ax (matplotlib.axes.Axis): The visualization is displayed on this
axis. If this is :obj:`None` (default), a new axis is created.
Returns:
        ~matplotlib.axes.Axes:
Returns the Axes object with the plot for further tweaking.
"""
from matplotlib import pyplot as plot
if ax is None:
fig = plot.figure()
ax = fig.add_subplot(1, 1, 1)
# CHW -> HWC
img = img.transpose((1, 2, 0))
ax.imshow(img.astype(np.uint8))
return ax
def vis_bbox(img, bbox, label=None, score=None, label_names=None, ax=None):
"""Visualize bounding boxes inside image.
Example:
>>> from chainercv.datasets import VOCDetectionDataset
>>> from chainercv.datasets import voc_bbox_label_names
>>> from chainercv.visualizations import vis_bbox
>>> import matplotlib.pyplot as plot
>>> dataset = VOCDetectionDataset()
>>> img, bbox, label = dataset[60]
>>> vis_bbox(img, bbox, label,
... label_names=voc_bbox_label_names)
>>> plot.show()
Args:
img (~numpy.ndarray): An array of shape :math:`(3, height, width)`.
This is in RGB format and the range of its value is
:math:`[0, 255]`.
bbox (~numpy.ndarray): An array of shape :math:`(R, 4)`, where
:math:`R` is the number of bounding boxes in the image.
Each element is organized
by :math:`(y_{min}, x_{min}, y_{max}, x_{max})` in the second axis.
label (~numpy.ndarray): An integer array of shape :math:`(R,)`.
The values correspond to id for label names stored in
:obj:`label_names`. This is optional.
score (~numpy.ndarray): A float array of shape :math:`(R,)`.
Each value indicates how confident the prediction is.
This is optional.
label_names (iterable of strings): Name of labels ordered according
to label ids. If this is :obj:`None`, labels will be skipped.
ax (matplotlib.axes.Axis): The visualization is displayed on this
axis. If this is :obj:`None` (default), a new axis is created.
Returns:
        ~matplotlib.axes.Axes:
Returns the Axes object with the plot for further tweaking.
"""
from matplotlib import pyplot as plot
if label is not None and not len(bbox) == len(label):
raise ValueError('The length of label must be same as that of bbox')
if score is not None and not len(bbox) == len(score):
raise ValueError('The length of score must be same as that of bbox')
# Returns newly instantiated matplotlib.axes.Axes object if ax is None
ax = vis_image(img, ax=ax)
# If there is no bounding box to display, visualize the image and exit.
if len(bbox) == 0:
return ax
for i, bb in enumerate(bbox):
xy = (bb[1], bb[0])
height = bb[2] - bb[0]
width = bb[3] - bb[1]
ax.add_patch(plot.Rectangle(
xy, width, height, fill=False, edgecolor='red', linewidth=3))
caption = list()
if label is not None and label_names is not None:
lb = label[i]
if not (0 <= lb < len(label_names)):
raise ValueError('No corresponding name is given')
caption.append(label_names[lb])
if score is not None:
sc = score[i]
caption.append('{:.2f}'.format(sc))
if len(caption) > 0:
ax.text(bb[1], bb[0],
': '.join(caption),
style='italic',
bbox={'facecolor': 'white', 'alpha': 0.7, 'pad': 10})
return ax