diff --git a/ppdet/py_op/__init__.py b/ppdet/py_op/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3ceb1cef677e14329a66c492bd652a48632d4119 --- /dev/null +++ b/ppdet/py_op/__init__.py @@ -0,0 +1,4 @@ +from .bbox import * +from .mask import * +from .target import * +from .post_process import * diff --git a/ppdet/py_op/bbox.py b/ppdet/py_op/bbox.py new file mode 100755 index 0000000000000000000000000000000000000000..83b68a78222cf553e84ba52e52586d7b3aefd944 --- /dev/null +++ b/ppdet/py_op/bbox.py @@ -0,0 +1,261 @@ +import numpy as np +from numba import jit + + +@jit +def bbox2delta(bboxes1, bboxes2, weights): + ex_w = bboxes1[:, 2] - bboxes1[:, 0] + 1 + ex_h = bboxes1[:, 3] - bboxes1[:, 1] + 1 + ex_ctr_x = bboxes1[:, 0] + 0.5 * ex_w + ex_ctr_y = bboxes1[:, 1] + 0.5 * ex_h + + gt_w = bboxes2[:, 2] - bboxes2[:, 0] + 1 + gt_h = bboxes2[:, 3] - bboxes2[:, 1] + 1 + gt_ctr_x = bboxes2[:, 0] + 0.5 * gt_w + gt_ctr_y = bboxes2[:, 1] + 0.5 * gt_h + + dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0] + dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1] + dw = (np.log(gt_w / ex_w)) / weights[2] + dh = (np.log(gt_h / ex_h)) / weights[3] + + deltas = np.vstack([dx, dy, dw, dh]).transpose() + return deltas + + +@jit +def delta2bbox(deltas, boxes, weights, bbox_clip=4.13): + if boxes.shape[0] == 0: + return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) + boxes = boxes.astype(deltas.dtype, copy=False) + + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = weights + dx = deltas[:, 0::4] * wx + dy = deltas[:, 1::4] * wy + dw = deltas[:, 2::4] * ww + dh = deltas[:, 3::4] * wh + + # Prevent sending too large values into np.exp() + dw = np.minimum(dw, bbox_clip) + dh = np.minimum(dh, bbox_clip) + + pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] + pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] + pred_w = np.exp(dw) * widths[:, np.newaxis] + pred_h = np.exp(dh) * heights[:, np.newaxis] + + pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) + # x1 + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + # y1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + # x2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 + # y2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 + + return pred_boxes + + +@jit +def expand_bbox(bboxes, scale): + w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5 + h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5 + x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5 + y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + bboxes_exp = np.zeros(bboxes.shape) + bboxes_exp[:, 0] = x_c - w_half + bboxes_exp[:, 2] = x_c + w_half + bboxes_exp[:, 1] = y_c - h_half + bboxes_exp[:, 3] = y_c + h_half + + return bboxes_exp + + +@jit +def clip_bbox(boxes, im_shape): + assert boxes.shape[1] % 4 == 0, \ + 'boxes.shape[1] is {:d}, but must be divisible by 4.'.format( + boxes.shape[1] + ) + # x1 >= 0 + boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) + # y1 >= 0 + boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) + # x2 < im_shape[1] + boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) + # y2 < im_shape[0] + boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) + return boxes + + +@jit +def 
bbox_overlaps(bboxes1, bboxes2): + w1 = np.maximum(bboxes1[:, 2] - bboxes1[:, 0] + 1, 0) + h1 = np.maximum(bboxes1[:, 3] - bboxes1[:, 1] + 1, 0) + w2 = np.maximum(bboxes2[:, 2] - bboxes2[:, 0] + 1, 0) + h2 = np.maximum(bboxes2[:, 3] - bboxes2[:, 1] + 1, 0) + area1 = w1 * h1 + area2 = w2 * h2 + + overlaps = np.zeros((bboxes1.shape[0], bboxes2.shape[0])) + for ind1 in range(bboxes1.shape[0]): + for ind2 in range(bboxes2.shape[0]): + inter_x1 = np.maximum(bboxes1[ind1, 0], bboxes2[ind2, 0]) + inter_y1 = np.maximum(bboxes1[ind1, 1], bboxes2[ind2, 1]) + inter_x2 = np.minimum(bboxes1[ind1, 2], bboxes2[ind2, 2]) + inter_y2 = np.minimum(bboxes1[ind1, 3], bboxes2[ind2, 3]) + inter_w = np.maximum(inter_x2 - inter_x1 + 1, 0) + inter_h = np.maximum(inter_y2 - inter_y1 + 1, 0) + inter_area = inter_w * inter_h + iou = inter_area * 1.0 / (area1[ind1] + area2[ind2] - inter_area) + overlaps[ind1, ind2] = iou + return overlaps + + +@jit +def nms(dets, thresh): + if dets.shape[0] == 0: + return [] + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + ndets = dets.shape[0] + suppressed = np.zeros((ndets), dtype=np.int) + + for _i in range(ndets): + i = order[_i] + if suppressed[i] == 1: + continue + ix1 = x1[i] + iy1 = y1[i] + ix2 = x2[i] + iy2 = y2[i] + iarea = areas[i] + for _j in range(_i + 1, ndets): + j = order[_j] + if suppressed[j] == 1: + continue + xx1 = max(ix1, x1[j]) + yy1 = max(iy1, y1[j]) + xx2 = min(ix2, x2[j]) + yy2 = min(iy2, y2[j]) + w = max(0.0, xx2 - xx1 + 1) + h = max(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (iarea + areas[j] - inter) + if ovr >= thresh: + suppressed[j] = 1 + + return np.where(suppressed == 0)[0] + + +def nms_with_decode(bboxes, + bbox_probs, + bbox_deltas, + im_info, + keep_top_k=100, + score_thresh=0.05, + nms_thresh=0.5, + class_nums=81, + bbox_reg_weights=[0.1, 0.1, 0.2, 0.2]): + bboxes_num = [0, bboxes.shape[0]] + bboxes_v = np.array(bboxes) + bbox_probs_v = np.array(bbox_probs) + bbox_deltas_v = np.array(bbox_deltas) + variance_v = np.array(bbox_reg_weights) + im_results = [[] for _ in range(len(bboxes_num) - 1)] + new_bboxes_num = [0] + for i in range(len(bboxes_num) - 1): + start = bboxes_num[i] + end = bboxes_num[i + 1] + if start == end: + continue + + bbox_deltas_n = bbox_deltas_v[start:end, :] # box delta + rois_n = bboxes_v[start:end, :] # box + rois_n = rois_n / im_info[i][2] # scale + rois_n = delta2bbox(bbox_deltas_n, rois_n, variance_v) + rois_n = clip_bbox(rois_n, im_info[i][:2] / im_info[i][2]) + cls_boxes = [[] for _ in range(class_nums)] + scores_n = bbox_probs_v[start:end, :] + for j in range(1, class_nums): + inds = np.where(scores_n[:, j] > score_thresh)[0] + scores_j = scores_n[inds, j] + rois_j = rois_n[inds, j * 4:(j + 1) * 4] + dets_j = np.hstack((scores_j[:, np.newaxis], rois_j)).astype( + np.float32, copy=False) + keep = nms(dets_j, nms_thresh) + nms_dets = dets_j[keep, :] + #add labels + label = np.array([j for _ in range(len(keep))]) + nms_dets = np.hstack((label[:, np.newaxis], nms_dets)).astype( + np.float32, copy=False) + cls_boxes[j] = nms_dets + + # Limit to max_per_image detections **over all classes** + image_scores = np.hstack( + [cls_boxes[j][:, 1] for j in range(1, class_nums)]) + if len(image_scores) > keep_top_k: + image_thresh = np.sort(image_scores)[-keep_top_k] + for j in range(1, class_nums): + keep = np.where(cls_boxes[j][:, 1] >= image_thresh)[0] + cls_boxes[j] = cls_boxes[j][keep, :] + 
im_results_n = np.vstack([cls_boxes[j] for j in range(1, class_nums)]) + im_results[i] = im_results_n + new_bboxes_num.append(len(im_results_n) + new_bboxes_num[-1]) + labels = im_results_n[:, 0] + scores = im_results_n[:, 1] + boxes = im_results_n[:, 2:] + im_results = np.vstack([im_results[k] for k in range(len(bboxes_num) - 1)]) + new_bboxes_num = np.array(new_bboxes_num) + return new_bboxes_num, im_results + + +@jit +def compute_bbox_targets(bboxes1, bboxes2, labels, bbox_reg_weights): + assert bboxes1.shape[0] == bboxes2.shape[0] + assert bboxes1.shape[1] == 4 + assert bboxes2.shape[1] == 4 + + bbox_reg_weights = np.asarray(bbox_reg_weights) + targets = bbox2delta( + bboxes1=bboxes1, bboxes2=bboxes2, weights=bbox_reg_weights) + + return np.hstack([labels[:, np.newaxis], targets]).astype( + np.float32, copy=False) + + +@jit +def expand_bbox_targets(bbox_targets_input, + class_nums=81, + is_cls_agnostic=False): + class_labels = bbox_targets_input[:, 0] + fg_inds = np.where(class_labels > 0)[0] + # class-agnostic regression only needs two 4-wide slots: background and foreground + if is_cls_agnostic: + class_nums = 2 + bbox_targets = np.zeros((class_labels.shape[0], 4 * class_nums)) + bbox_inside_weights = np.zeros(bbox_targets.shape) + for ind in fg_inds: + class_label = int(class_labels[ind]) if not is_cls_agnostic else 1 + start_ind = class_label * 4 + end_ind = class_label * 4 + 4 + bbox_targets[ind, start_ind:end_ind] = bbox_targets_input[ind, 1:] + bbox_inside_weights[ind, start_ind:end_ind] = (1.0, 1.0, 1.0, 1.0) + return bbox_targets, bbox_inside_weights diff --git a/ppdet/py_op/mask.py b/ppdet/py_op/mask.py new file mode 100755 index 0000000000000000000000000000000000000000..07ff76a9a4e34d67b826d5254ce6fb11160083f8 --- /dev/null +++ b/ppdet/py_op/mask.py @@ -0,0 +1,202 @@ +import six +import math +import numpy as np +from numba import jit + + +@jit +def decode(cnts, m): + v = 0 + mask = [] + for j in range(m): + for k in range(cnts[j]): + mask.append(v) + v = 1 - v + return mask + + +#@jit +def poly2mask(xy, k, h, w): + scale = 5.
+ x = [int(scale * p + 0.5) for p in xy[::2]] + x = x + [x[0]] + y = [int(scale * p + 0.5) for p in xy[1::2]] + y = y + [y[0]] + m = sum([ + int(max(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1]))) + int(1) + for j in range(k) + ]) + u, v = [], [] + for j in range(k): + xs = x[j] + xe = x[j + 1] + ys = y[j] + ye = y[j + 1] + dx = abs(xe - xs) + dy = abs(ys - ye) + flip = (dx >= dy and xs > xe) or (dx < dy and ys > ye) + if flip: + xs, xe = xe, xs + ys, ye = ye, ys + + if dx >= dy: + if (dx == 0): + assert ye - ys == 0 + + s = 0 if dx == 0 else float(ye - ys) / dx + else: + if (dy == 0): + assert xe - xs == 0 + s = 0 if dy == 0 else float(xe - xs) / dy + + if dx >= dy: + ts = [dx - d if flip else d for d in range(dx + 1)] + u.extend([xs + t for t in ts]) + v.extend([int(ys + s * t + .5) for t in ts]) + else: + ts = [dy - d if flip else d for d in range(dy + 1)] + v.extend([t + ys for t in ts]) + u.extend([int(xs + s * t + .5) for t in ts]) + + k = len(u) + x = np.zeros((k), np.int) + y = np.zeros((k), np.int) + m = 0 + for j in six.moves.xrange(1, k): + if u[j] != u[j - 1]: + xd = float(u[j] if (u[j] < u[j - 1]) else (u[j] - 1)) + xd = (xd + .5) / scale - .5 + if (math.floor(xd) != xd or xd < 0 or xd > (w - 1)): + continue + yd = float(v[j] if v[j] < v[j - 1] else v[j - 1]) + yd = (yd + .5) / scale - .5 + yd = math.ceil(0 if yd < 0 else (h if yd > h else yd)) + x[m] = int(xd) + y[m] = int(yd) + m += 1 + k = m + a = [int(x[i] * h + y[i]) for i in range(k)] + a.append(h * w) + a.sort() + b = [0] + a[:len(a) - 1] + a = [c - d for (c, d) in zip(a, b)] + + k += 1 + b = [0 for i in range(k)] + b[0] = a[0] + m, j = 1, 1 + while (j < k): + if a[j] > 0: + b[m] = a[j] + m += 1 + j += 1 + else: + j += 1 + if (j < k): + b[m - 1] += a[j] + j += 1 + mask = decode(b, m) + mask = np.array(mask, dtype=np.int).reshape((w, h)) + mask = mask.transpose((1, 0)) + return mask + + +def polys_to_boxes(polys): + """Convert a list of polygons into an array of tight bounding boxes.""" + boxes_from_polys = np.zeros((len(polys), 4), dtype=np.float32) + for j in range(len(polys)): + x_min, y_min = 10000000, 10000000 + x_max, y_max = 0, 0 + for i in range(len(polys[j])): + poly = polys[j][i] + x0 = min(min(p[::2]) for p in poly) + x_min = min(x0, x_min) + y0 = min(min(p[1::2]) for p in poly) + y_min = min(y0, y_min) + x1 = max(max(p[::2]) for p in poly) + x_max = max(x_max, x1) + y1 = max(max(p[1::2]) for p in poly) + y_max = max(y1, y_max) + boxes_from_polys[j, :] = [x_min, y_min, x_max, y_max] + return boxes_from_polys + + +@jit +def bbox_overlaps_mask(boxes, query_boxes): + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + box_area = (query_boxes[k, 2] - query_boxes[k, 0] + 1) *\ + (query_boxes[k, 3] - query_boxes[k, 1] + 1) + for n in range(N): + iw = min(boxes[n, 2], query_boxes[k, 2]) -\ + max(boxes[n, 0], query_boxes[k, 0]) + 1 + if iw > 0: + ih = min(boxes[n, 3], query_boxes[k, 3]) -\ + max(boxes[n, 1], query_boxes[k, 1]) + 1 + if ih > 0: + ua = float( + (boxes[n, 2] - boxes[n, 0] + 1) *\ + (boxes[n, 3] - boxes[n, 1] + 1) +\ + box_area - iw * ih) + overlaps[n, k] = iw * ih / ua + return overlaps + + +@jit +def polys_to_mask_wrt_box(polygons, box, M): + """Convert from the COCO polygon segmentation format to a binary mask + encoded as a 2D array of data type numpy.float32. The polygon segmentation + is understood to be enclosed in the given box and rasterized to an M x M + mask. The resulting mask is therefore of shape (M, M). 
+ """ + w = box[2] - box[0] + h = box[3] - box[1] + w = np.maximum(w, 1) + h = np.maximum(h, 1) + + polygons_norm = [] + i = 0 + for poly in polygons: + p = np.array(poly, dtype=np.float32) + p = p.reshape(-1) + p[0::2] = (p[0::2] - box[0]) * M / w + p[1::2] = (p[1::2] - box[1]) * M / h + polygons_norm.append(p) + + mask = [] + for polygons in polygons_norm: + assert polygons.shape[0] % 2 == 0, polygons.shape + k = polygons.shape[0] // 2 + + one_msk = poly2mask(polygons, k, M, M) + mask.append(one_msk) + + mask = np.array(mask) + # Flatten in case polygons was a list + mask = np.sum(mask, axis=0) + mask = np.array(mask > 0, dtype=np.float32) + return mask + + +@jit +def expand_mask_targets(masks, mask_class_labels, resolution, num_classes): + """Expand masks from shape (#masks, resolution ** 2) + to (#masks, #classes * resolution ** 2) to encode class + specific mask targets. + """ + assert masks.shape[0] == mask_class_labels.shape[0] + # Target values of -1 are "don't care" / ignore labels + mask_targets = -np.ones( + (masks.shape[0], num_classes * resolution**2), dtype=np.int32) + for i in range(masks.shape[0]): + cls = int(mask_class_labels[i]) + start = resolution**2 * cls + end = start + resolution**2 + # Ignore background instance + # (only happens when there is no fg samples in an image) + if cls > 0: + mask_targets[i, start:end] = masks[i, :] + + return mask_targets diff --git a/ppdet/py_op/post_process.py b/ppdet/py_op/post_process.py new file mode 100755 index 0000000000000000000000000000000000000000..bcbb027caac6d460c395942b0154a487f50e3d12 --- /dev/null +++ b/ppdet/py_op/post_process.py @@ -0,0 +1,185 @@ +import six +import os +import numpy as np +from numba import jit +from .bbox import delta2bbox, clip_bbox, expand_bbox, nms + + +def bbox_post_process(bboxes, + bbox_probs, + bbox_deltas, + im_info, + keep_top_k=100, + score_thresh=0.05, + nms_thresh=0.5, + class_nums=81, + bbox_reg_weights=[0.1, 0.1, 0.2, 0.2]): + bbox_nums = [0, bboxes.shape[0]] + bboxes_v = np.array(bboxes) + bbox_probs_v = np.array(bbox_probs) + bbox_deltas_v = np.array(bbox_deltas) + variance_v = np.array(bbox_reg_weights) + new_bboxes = [[] for _ in range(len(bbox_nums) - 1)] + new_bbox_nums = [0] + for i in range(len(bbox_nums) - 1): + start = bbox_nums[i] + end = bbox_nums[i + 1] + if start == end: + continue + + bbox_deltas_n = bbox_deltas_v[start:end, :] # box delta + rois_n = bboxes_v[start:end, :] # box + rois_n = rois_n / im_info[i][2] # scale + rois_n = delta2bbox(bbox_deltas_n, rois_n, variance_v) + rois_n = clip_bbox(rois_n, im_info[i][:2] / im_info[i][2]) + cls_boxes = [[] for _ in range(class_nums)] + scores_n = bbox_probs_v[start:end, :] + for j in range(1, class_nums): + inds = np.where(scores_n[:, j] > score_thresh)[0] + scores_j = scores_n[inds, j] + rois_j = rois_n[inds, j * 4:(j + 1) * 4] + dets_j = np.hstack((scores_j[:, np.newaxis], rois_j)).astype( + np.float32, copy=False) + keep = nms(dets_j, nms_thresh) + nms_dets = dets_j[keep, :] + #add labels + label = np.array([j for _ in range(len(keep))]) + nms_dets = np.hstack((label[:, np.newaxis], nms_dets)).astype( + np.float32, copy=False) + cls_boxes[j] = nms_dets + + # Limit to max_per_image detections **over all classes** + image_scores = np.hstack( + [cls_boxes[j][:, 1] for j in range(1, class_nums)]) + if len(image_scores) > keep_top_k: + image_thresh = np.sort(image_scores)[-keep_top_k] + for j in range(1, class_nums): + keep = np.where(cls_boxes[j][:, 1] >= image_thresh)[0] + cls_boxes[j] = cls_boxes[j][keep, :] + new_bboxes_n = 
np.vstack([cls_boxes[j] for j in range(1, class_nums)]) + new_bboxes[i] = new_bboxes_n + new_bbox_nums.append(len(new_bboxes_n) + new_bbox_nums[-1]) + labels = new_bboxes_n[:, 0] + scores = new_bboxes_n[:, 1] + boxes = new_bboxes_n[:, 2:] + new_bboxes = np.vstack([new_bboxes[k] for k in range(len(bbox_nums) - 1)]) + new_bbox_nums = np.array(new_bbox_nums) + return new_bbox_nums, new_bboxes + + +@jit +def mask_post_process(bbox_nums, bboxes, masks, im_info, binarize_thresh=0.5): + # binarize_thresh: probability threshold used to turn the resized soft mask into a 0/1 mask + import cv2 + import pycocotools.mask as mask_util + bboxes = np.array(bboxes) + masks_v = np.array(masks) + M = masks_v.shape[-1] # mask head resolution (masks are N x num_classes x M x M) + scale = (M + 2.0) / M + boxes = bboxes[:, 2:] + labels = bboxes[:, 0] + segms_results = [[] for _ in range(len(bbox_nums) - 1)] + for i in range(len(bbox_nums) - 1): + bboxes_n = bboxes[bbox_nums[i]:bbox_nums[i + 1]] + cls_segms = [] + masks_n = masks_v[bbox_nums[i]:bbox_nums[i + 1]] + boxes_n = boxes[bbox_nums[i]:bbox_nums[i + 1]] + labels_n = labels[bbox_nums[i]:bbox_nums[i + 1]] + im_h = int(round(im_info[i][0] / im_info[i][2])) + im_w = int(round(im_info[i][1] / im_info[i][2])) + boxes_n = expand_bbox(boxes_n, scale) + boxes_n = boxes_n.astype(np.int32) + padded_mask = np.zeros((M + 2, M + 2), dtype=np.float32) + for j in range(len(bboxes_n)): + class_id = int(labels_n[j]) + padded_mask[1:-1, 1:-1] = masks_n[j, class_id, :, :] + + ref_box = boxes_n[j, :] + w = ref_box[2] - ref_box[0] + 1 + h = ref_box[3] - ref_box[1] + 1 + w = np.maximum(w, 1) + h = np.maximum(h, 1) + + mask = cv2.resize(padded_mask, (w, h)) + mask = np.array(mask > binarize_thresh, dtype=np.uint8) + im_mask = np.zeros((im_h, im_w), dtype=np.uint8) + + x_0 = max(ref_box[0], 0) + x_1 = min(ref_box[2] + 1, im_w) + y_0 = max(ref_box[1], 0) + y_1 = min(ref_box[3] + 1, im_h) + im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[ + 1]), (x_0 - ref_box[0]):(x_1 - ref_box[0])] + rle = mask_util.encode( + np.array( + im_mask[:, :, np.newaxis], order='F'))[0] + cls_segms.append(rle) + segms_results[i] = np.array(cls_segms)[:, np.newaxis] + segms_results = np.vstack([segms_results[k] for k in range(len(bbox_nums) - 1)]) + bboxes = np.hstack([segms_results, bboxes]) + return bboxes[:, :3] + + +@jit +def get_det_res(bbox_nums, + bbox, + image_id, + image_shape, + num_id_to_cat_id_map, + batch_size=1): + det_res = [] + bbox_v = np.array(bbox) + if bbox_v.shape == ( + 1, + 1, ): + return det_res + assert (len(bbox_nums) == batch_size + 1), \ + "Error bbox_nums Tensor offset dimension. bbox_nums({}) vs. 
batch_size({})"\ + .format(len(bbox_nums), batch_size) + k = 0 + for i in range(batch_size): + dt_num_this_img = bbox_nums[i + 1] - bbox_nums[i] + image_id = int(image_id[i][0]) + image_width = int(image_shape[i][1]) #int(data[i][-1][1]) + image_height = int(image_shape[i][2]) #int(data[i][-1][2]) + for j in range(dt_num_this_img): + dt = bbox_v[k] + k = k + 1 + num_id, score, xmin, ymin, xmax, ymax = dt.tolist() + category_id = num_id_to_cat_id_map[num_id] + w = xmax - xmin + 1 + h = ymax - ymin + 1 + bbox = [xmin, ymin, w, h] + dt_res = { + 'image_id': image_id, + 'category_id': category_id, + 'bbox': bbox, + 'score': score + } + det_res.append(dt_res) + return det_res + + +@jit +def get_seg_res(mask_nums, mask, image_id, num_id_to_cat_id_map, batch_size=1): + seg_res = [] + mask_v = np.array(mask) + k = 0 + for i in range(batch_size): + image_id = int(image_id[i][0]) + dt_num_this_img = mask_nums[i + 1] - mask_nums[i] + for j in range(dt_num_this_img): + dt = mask_v[k] + k = k + 1 + sg, num_id, score = dt.tolist() + cat_id = num_id_to_cat_id_map[num_id] + if six.PY3: + if 'counts' in sg: + sg['counts'] = sg['counts'].decode("utf8") + sg_res = { + 'image_id': image_id, + 'category_id': cat_id, + 'segmentation': sg, + 'score': score + } + seg_res.append(sg_res) + return seg_res diff --git a/ppdet/py_op/post_processing.py b/ppdet/py_op/post_processing.py new file mode 100755 index 0000000000000000000000000000000000000000..841b500aee568898063ccb6613e300f327c7d89e --- /dev/null +++ b/ppdet/py_op/post_processing.py @@ -0,0 +1,187 @@ +import six +import os +import numpy as np +from numba import jit +from .bbox import nms + + +@jit +def box_decoder(deltas, boxes, weights, bbox_clip=4.13): + if boxes.shape[0] == 0: + return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) + boxes = boxes.astype(deltas.dtype, copy=False) + + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = weights + dx = deltas[:, 0::4] * wx + dy = deltas[:, 1::4] * wy + dw = deltas[:, 2::4] * ww + dh = deltas[:, 3::4] * wh + + # Prevent sending too large values into np.exp() + dw = np.minimum(dw, bbox_clip) + dh = np.minimum(dh, bbox_clip) + + pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] + pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] + pred_w = np.exp(dw) * widths[:, np.newaxis] + pred_h = np.exp(dh) * heights[:, np.newaxis] + + pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) + # x1 + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + # y1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + # x2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 + # y2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 + + return pred_boxes + + +@jit +def clip_tiled_boxes(boxes, im_shape): + """Clip boxes to image boundaries. 
im_shape is [height, width] and boxes + has shape (N, 4 * num_tiled_boxes).""" + assert boxes.shape[1] % 4 == 0, \ + 'boxes.shape[1] is {:d}, but must be divisible by 4.'.format( + boxes.shape[1] + ) + # x1 >= 0 + boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) + # y1 >= 0 + boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) + # x2 < im_shape[1] + boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) + # y2 < im_shape[0] + boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) + return boxes + + +#@jit +def get_nmsed_box(rpn_rois, + confs, + locs, + class_nums, + im_info, + bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], + score_thresh=0.05, + nms_thresh=0.5, + detections_per_im=100): + box_nums = [0, rpn_rois.shape[0]] + variance_v = np.array(bbox_reg_weights) + rpn_rois_v = np.array(rpn_rois) + confs_v = np.array(confs) + locs_v = np.array(locs) + + im_results = [[] for _ in range(len(box_nums) - 1)] + new_box_nums = [0] + for i in range(len(box_nums) - 1): + start = box_nums[i] + end = box_nums[i + 1] + if start == end: + continue + + locs_n = locs_v[start:end, :] # box delta + rois_n = rpn_rois_v[start:end, :] # box + rois_n = rois_n / im_info[i][2] # scale + rois_n = box_decoder(locs_n, rois_n, variance_v) + rois_n = clip_tiled_boxes(rois_n, im_info[i][:2] / im_info[i][2]) + cls_boxes = [[] for _ in range(class_nums)] + scores_n = confs_v[start:end, :] + for j in range(1, class_nums): + inds = np.where(scores_n[:, j] > score_thresh)[0] + scores_j = scores_n[inds, j] + rois_j = rois_n[inds, j * 4:(j + 1) * 4] + dets_j = np.hstack((scores_j[:, np.newaxis], rois_j)).astype( + np.float32, copy=False) + keep = nms(dets_j, nms_thresh) + nms_dets = dets_j[keep, :] + #add labels + label = np.array([j for _ in range(len(keep))]) + nms_dets = np.hstack((label[:, np.newaxis], nms_dets)).astype( + np.float32, copy=False) + cls_boxes[j] = nms_dets + + # Limit to max_per_image detections **over all classes** + image_scores = np.hstack( + [cls_boxes[j][:, 1] for j in range(1, class_nums)]) + if len(image_scores) > detections_per_im: + image_thresh = np.sort(image_scores)[-detections_per_im] + for j in range(1, class_nums): + keep = np.where(cls_boxes[j][:, 1] >= image_thresh)[0] + cls_boxes[j] = cls_boxes[j][keep, :] + im_results_n = np.vstack([cls_boxes[j] for j in range(1, class_nums)]) + im_results[i] = im_results_n + new_box_nums.append(len(im_results_n) + new_box_nums[-1]) + labels = im_results_n[:, 0] + scores = im_results_n[:, 1] + boxes = im_results_n[:, 2:] + im_results = np.vstack([im_results[k] for k in range(len(box_nums) - 1)]) + return new_box_nums, im_results + + +@jit +def get_dt_res(batch_size, box_nums, nmsed_out, data, num_id_to_cat_id_map): + dts_res = [] + nmsed_out_v = np.array(nmsed_out) + if nmsed_out_v.shape == ( + 1, + 1, ): + return dts_res + assert (len(box_nums) == batch_size + 1), \ + "Error Tensor offset dimension. Box Nums({}) vs. 
batch_size({})"\ + .format(len(box_nums), batch_size) + k = 0 + for i in range(batch_size): + dt_num_this_img = box_nums[i + 1] - box_nums[i] + image_id = int(data[i][-1]) + image_width = int(data[i][1][1]) + image_height = int(data[i][1][2]) + for j in range(dt_num_this_img): + dt = nmsed_out_v[k] + k = k + 1 + num_id, score, xmin, ymin, xmax, ymax = dt.tolist() + category_id = num_id_to_cat_id_map[num_id] + w = xmax - xmin + 1 + h = ymax - ymin + 1 + bbox = [xmin, ymin, w, h] + dt_res = { + 'image_id': image_id, + 'category_id': category_id, + 'bbox': bbox, + 'score': score + } + dts_res.append(dt_res) + return dts_res + + +@jit +def get_segms_res(batch_size, box_nums, segms_out, data, num_id_to_cat_id_map): + segms_res = [] + segms_out_v = np.array(segms_out) + k = 0 + for i in range(batch_size): + dt_num_this_img = box_nums[i + 1] - box_nums[i] + image_id = int(data[i][-1]) + for j in range(dt_num_this_img): + dt = segms_out_v[k] + k = k + 1 + segm, num_id, score = dt.tolist() + cat_id = num_id_to_cat_id_map[num_id] + if six.PY3: + if 'counts' in segm: + segm['counts'] = segm['counts'].decode("utf8") + segm_res = { + 'image_id': image_id, + 'category_id': cat_id, + 'segmentation': segm, + 'score': score + } + segms_res.append(segm_res) + return segms_res diff --git a/ppdet/py_op/target.py b/ppdet/py_op/target.py new file mode 100755 index 0000000000000000000000000000000000000000..04329681a77da3f24d7a4991dbaba6bc29bf5a37 --- /dev/null +++ b/ppdet/py_op/target.py @@ -0,0 +1,398 @@ +import six +import math +import numpy as np +from numba import jit +from .bbox import * +from .mask import * + + +@jit +def generate_rpn_anchor_target(anchor_box, + gt_boxes, + is_crowd, + im_info, + rpn_straddle_thresh, + rpn_batch_size_per_im, + rpn_positive_overlap, + rpn_negative_overlap, + rpn_fg_fraction, + use_random=True): + anchor_num = anchor_box.shape[0] + batch_size = gt_boxes.shape[0] + + for i in range(batch_size): + im_height = im_info[i][0] + im_width = im_info[i][1] + im_scale = im_info[i][2] + if rpn_straddle_thresh >= 0: + # Only keep anchors inside the image by a margin of straddle_thresh + inds_inside = np.where( + (anchor_box[:, 0] >= -rpn_straddle_thresh + ) & (anchor_box[:, 1] >= -rpn_straddle_thresh) & ( + anchor_box[:, 2] < im_width + rpn_straddle_thresh) & ( + anchor_box[:, 3] < im_height + rpn_straddle_thresh))[0] + # keep only inside anchors + inside_anchors = anchor_box[inds_inside, :] + else: + inds_inside = np.arange(anchor_box.shape[0]) + inside_anchors = anchor_box + gt_boxes_slice = gt_boxes[i] * im_scale + is_crowd_slice = is_crowd[i] + + not_crowd_inds = np.where(is_crowd_slice == 0)[0] + gt_boxes_slice = gt_boxes_slice[not_crowd_inds] + iou = bbox_overlaps(inside_anchors, gt_boxes_slice) + + loc_inds, score_inds, labels, gt_inds, bbox_inside_weight = _sample_anchor( + iou, rpn_batch_size_per_im, rpn_positive_overlap, + rpn_negative_overlap, rpn_fg_fraction, use_random) + # unmap to all anchor + loc_inds = inds_inside[loc_inds] + score_inds = inds_inside[score_inds] + sampled_anchor = anchor_box[loc_inds] + sampled_gt = gt_boxes_slice[gt_inds] + box_deltas = bbox2delta(sampled_anchor, sampled_gt, [1., 1., 1., 1.]) + + if i == 0: + loc_indexes = loc_inds + score_indexes = score_inds + tgt_labels = labels + tgt_bboxes = box_deltas + bbox_inside_weights = bbox_inside_weight + else: + loc_indexes = np.concatenate( + [loc_indexes, loc_inds + i * anchor_num]) + score_indexes = np.concatenate( + [score_indexes, score_inds + i * anchor_num]) + tgt_labels = 
np.concatenate([tgt_labels, labels]) + tgt_bboxes = np.vstack([tgt_bboxes, box_deltas]) + bbox_inside_weights = np.vstack([bbox_inside_weights, \ + bbox_inside_weight]) + tgt_labels = tgt_labels.astype('float32') + tgt_bboxes = tgt_bboxes.astype('float32') + return loc_indexes, score_indexes, tgt_labels, tgt_bboxes, bbox_inside_weights + + +@jit +def _sample_anchor(anchor_by_gt_overlap, + rpn_batch_size_per_im, + rpn_positive_overlap, + rpn_negative_overlap, + rpn_fg_fraction, + use_random=True): + + anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1) + anchor_to_gt_max = anchor_by_gt_overlap[np.arange( + anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax] + + gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0) + gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange( + anchor_by_gt_overlap.shape[1])] + anchors_with_max_overlap = np.where( + anchor_by_gt_overlap == gt_to_anchor_max)[0] + + labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1 + labels[anchors_with_max_overlap] = 1 + labels[anchor_to_gt_max >= rpn_positive_overlap] = 1 + + num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im) + fg_inds = np.where(labels == 1)[0] + if len(fg_inds) > num_fg and use_random: + disable_inds = np.random.choice( + fg_inds, size=(len(fg_inds) - num_fg), replace=False) + else: + disable_inds = fg_inds[num_fg:] + + labels[disable_inds] = -1 + fg_inds = np.where(labels == 1)[0] + + num_bg = rpn_batch_size_per_im - np.sum(labels == 1) + bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0] + if len(bg_inds) > num_bg and use_random: + enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)] + else: + enable_inds = bg_inds[:num_bg] + + fg_fake_inds = np.array([], np.int32) + fg_value = np.array([fg_inds[0]], np.int32) + fake_num = 0 + for bg_id in enable_inds: + if bg_id in fg_inds: + fake_num += 1 + fg_fake_inds = np.hstack([fg_fake_inds, fg_value]) + labels[enable_inds] = 0 + + fg_inds = np.where(labels == 1)[0] + bg_inds = np.where(labels == 0)[0] + + loc_index = np.hstack([fg_fake_inds, fg_inds]) + score_index = np.hstack([fg_inds, bg_inds]) + labels = labels[score_index] + + gt_inds = anchor_to_gt_argmax[loc_index] + + bbox_inside_weight = np.zeros((len(loc_index), 4), dtype=np.float32) + bbox_inside_weight[fake_num:, :] = 1 + return loc_index, score_index, labels, gt_inds, bbox_inside_weight + + +@jit +def generate_proposal_target(rpn_rois, + rpn_rois_nums, + gt_classes, + is_crowd, + gt_boxes, + im_info, + batch_size_per_im, + fg_fraction, + fg_thresh, + bg_thresh_hi, + bg_thresh_lo, + bbox_reg_weights, + class_nums=81, + use_random=True, + is_cls_agnostic=False, + is_cascade_rcnn=False): + + rois = [] + labels_int32 = [] + bbox_targets = [] + bbox_inside_weights = [] + bbox_outside_weights = [] + rois_nums = [] + batch_size = gt_boxes.shape[0] + # TODO: modify here + # rpn_rois = rpn_rois.reshape(batch_size, -1, 4) + st_num = 0 + print("debug: ", rpn_rois_nums) + for im_i in range(len(rpn_rois_nums)): + rpn_rois_num = rpn_rois_nums[im_i] + frcn_blobs = _sample_rois( + rpn_rois[st_num:rpn_rois_num], gt_classes[im_i], is_crowd[im_i], + gt_boxes[im_i], im_info[im_i], batch_size_per_im, fg_fraction, + fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, + use_random, is_cls_agnostic, is_cascade_rcnn) + st_num = rpn_rois_num + + rois.append(frcn_blobs['rois']) + labels_int32.append(frcn_blobs['labels_int32']) + bbox_targets.append(frcn_blobs['bbox_targets']) + 
bbox_inside_weights.append(frcn_blobs['bbox_inside_weights']) + bbox_outside_weights.append(frcn_blobs['bbox_outside_weights']) + rois_nums.append(frcn_blobs['rois'].shape[0]) + + rois = np.concatenate(rois, axis=0).astype(np.float32) + bbox_labels = np.concatenate( + labels_int32, axis=0).astype(np.int32).reshape(-1, 1) + bbox_gts = np.concatenate(bbox_targets, axis=0).astype(np.float32) + bbox_inside_weights = np.concatenate( + bbox_inside_weights, axis=0).astype(np.float32) + bbox_outside_weights = np.concatenate( + bbox_outside_weights, axis=0).astype(np.float32) + rois_nums = np.asarray(rois_nums, np.int32) + + return rois, bbox_labels, bbox_gts, bbox_inside_weights, bbox_outside_weights, rois_nums + + +@jit +def _sample_rois(rpn_rois, + gt_classes, + is_crowd, + gt_boxes, + im_info, + batch_size_per_im, + fg_fraction, + fg_thresh, + bg_thresh_hi, + bg_thresh_lo, + bbox_reg_weights, + class_nums, + use_random=True, + is_cls_agnostic=False, + is_cascade_rcnn=False): + rois_per_image = int(batch_size_per_im) + fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) + + # Roidb + im_scale = im_info[2] + inv_im_scale = 1. / im_scale + rpn_rois = rpn_rois * inv_im_scale + if is_cascade_rcnn: + rpn_rois = rpn_rois[gt_boxes.shape[0]:, :] + boxes = np.vstack([gt_boxes, rpn_rois]) + gt_overlaps = np.zeros((boxes.shape[0], class_nums)) + box_to_gt_ind_map = np.zeros((boxes.shape[0]), dtype=np.int32) + if len(gt_boxes) > 0: + proposal_to_gt_overlaps = bbox_overlaps(boxes, gt_boxes) + overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1) + overlaps_max = proposal_to_gt_overlaps.max(axis=1) + # Boxes which with non-zero overlap with gt boxes + overlapped_boxes_ind = np.where(overlaps_max > 0)[0].astype('int32') + overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[ + overlapped_boxes_ind]].astype('int32') + gt_overlaps[overlapped_boxes_ind, + overlapped_boxes_gt_classes] = overlaps_max[ + overlapped_boxes_ind] + box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[ + overlapped_boxes_ind] + + crowd_ind = np.where(is_crowd)[0] + gt_overlaps[crowd_ind] = -1 + + max_overlaps = gt_overlaps.max(axis=1) + max_classes = gt_overlaps.argmax(axis=1) + + # Cascade RCNN Decode Filter + if is_cascade_rcnn: + ws = boxes[:, 2] - boxes[:, 0] + 1 + hs = boxes[:, 3] - boxes[:, 1] + 1 + keep = np.where((ws > 0) & (hs > 0))[0] + boxes = boxes[keep] + fg_inds = np.where(max_overlaps >= fg_thresh)[0] + bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >= + bg_thresh_lo))[0] + fg_rois_per_this_image = fg_inds.shape[0] + bg_rois_per_this_image = bg_inds.shape[0] + else: + # Foreground + fg_inds = np.where(max_overlaps >= fg_thresh)[0] + fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0]) + # Sample foreground if there are too many + if (fg_inds.shape[0] > fg_rois_per_this_image) and use_random: + fg_inds = np.random.choice( + fg_inds, size=fg_rois_per_this_image, replace=False) + fg_inds = fg_inds[:fg_rois_per_this_image] + # Background + bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >= + bg_thresh_lo))[0] + bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image + bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, + bg_inds.shape[0]) + # Sample background if there are too many + if (bg_inds.shape[0] > bg_rois_per_this_image) and use_random: + bg_inds = np.random.choice( + bg_inds, size=bg_rois_per_this_image, replace=False) + bg_inds = bg_inds[:bg_rois_per_this_image] + + keep_inds = np.append(fg_inds, bg_inds) + sampled_labels = 
max_classes[keep_inds] + sampled_labels[fg_rois_per_this_image:] = 0 + sampled_boxes = boxes[keep_inds] + sampled_gts = gt_boxes[box_to_gt_ind_map[keep_inds]] + sampled_gts[fg_rois_per_this_image:, :] = gt_boxes[0] + bbox_label_targets = compute_bbox_targets(sampled_boxes, sampled_gts, + sampled_labels, bbox_reg_weights) + bbox_targets, bbox_inside_weights = expand_bbox_targets( + bbox_label_targets, class_nums, is_cls_agnostic) + bbox_outside_weights = np.array( + bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype) + + # Scale rois + sampled_rois = sampled_boxes * im_scale + + # Faster RCNN blobs + frcn_blobs = dict( + rois=sampled_rois, + labels_int32=sampled_labels, + bbox_targets=bbox_targets, + bbox_inside_weights=bbox_inside_weights, + bbox_outside_weights=bbox_outside_weights) + return frcn_blobs + + +@jit +def generate_mask_target(im_info, gt_classes, is_crowd, gt_segms, rois, + rois_nums, labels_int32, num_classes, resolution): + mask_rois = [] + rois_has_mask_int32 = [] + mask_int32 = [] + st_num = 0 + for i in range(len(rois_nums)): + rois_num = rois_nums[i] + mask_blob = _sample_mask( + rois[st_num:rois_num], labels_int32[st_num:rois_num], gt_segms[i], + im_info[i], gt_classes[i], is_crowd[i], num_classes, resolution) + + st_num = rois_num + mask_rois.append(mask_blob['mask_rois']) + rois_has_mask_int32.append(mask_blob['roi_has_mask_int32']) + mask_int32.append(mask_blob['mask_int32']) + mask_rois = np.concatenate(mask_rois, axis=0).astype(np.float32) + rois_has_mask_int32 = np.concatenate( + rois_has_mask_int32, axis=0).astype(np.int32) + mask_int32 = np.concatenate(mask_int32, axis=0).astype(np.int32) + + return mask_rois, rois_has_mask_int32, mask_int32 + + +@jit +def _sample_mask( + rois, + label_int32, + gt_polys, + im_info, + gt_classes, + is_crowd, + num_classes, + resolution, ): + + # remove padding + new_gt_polys = [] + for i in range(gt_polys.shape[0]): + gt_segs = [] + for j in range(gt_polys[i].shape[0]): + new_poly = [] + polys = gt_polys[i][j] + for ii in range(polys.shape[0]): + x, y = polys[ii] + if (x == -1 and y == -1): + continue + elif (x >= 0 and y >= 0): + new_poly.append([x, y]) # array, one poly + if len(new_poly) > 0: + gt_segs.append(new_poly) + new_gt_polys.append(gt_segs) + + im_scale = im_info[2] + sample_boxes = rois / im_scale + + polys_gt_inds = np.where((gt_classes > 0) & (is_crowd == 0))[0] + + polys_gt = [new_gt_polys[i] for i in polys_gt_inds] + boxes_from_polys = polys_to_boxes(polys_gt) + fg_inds = np.where(label_int32 > 0)[0] + roi_has_mask = fg_inds.copy() + + if fg_inds.shape[0] > 0: + mask_class_labels = label_int32[fg_inds] + masks = np.zeros((fg_inds.shape[0], resolution**2), dtype=np.int32) + rois_fg = sample_boxes[fg_inds] + + overlaps_bbfg_bbpolys = bbox_overlaps_mask(rois_fg, boxes_from_polys) + fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1) + + for i in range(rois_fg.shape[0]): + fg_polys_ind = fg_polys_inds[i] + poly_gt = polys_gt[fg_polys_ind] + roi_fg = rois_fg[i] + + mask = polys_to_mask_wrt_box(poly_gt, roi_fg, resolution) + mask = np.array(mask > 0, dtype=np.int32) + masks[i, :] = np.reshape(mask, resolution**2) + else: + bg_inds = np.where(label_int32 == 0)[0] + rois_fg = sample_boxes[bg_inds[0]].reshape((1, -1)) + masks = -np.ones((1, resolution**2), dtype=np.int32) + mask_class_labels = np.zeros((1, )) + roi_has_mask = np.append(roi_has_mask, 0) + + masks = expand_mask_targets(masks, mask_class_labels, resolution, + num_classes) + + rois_fg *= im_scale + mask_blob = dict() + mask_blob['mask_rois'] = 
rois_fg + mask_blob['roi_has_mask_int32'] = roi_has_mask + mask_blob['mask_int32'] = masks + + return mask_blob
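
Usage sketch for the box codec added in ppdet/py_op/bbox.py (not part of the patch). It assumes this patch is installed so that ppdet.py_op.bbox is importable, that numba is available (these modules import it at load time) and is of the same era as this code (bare @jit falling back to object mode); the sample boxes below are made up.

import numpy as np
from ppdet.py_op.bbox import bbox2delta, delta2bbox, bbox_overlaps

# Made-up anchors and ground-truth boxes in [x1, y1, x2, y2] form.
anchors = np.array([[10., 10., 50., 60.], [20., 30., 80., 90.]])
gts = np.array([[12., 14., 48., 58.], [25., 28., 78., 95.]])
weights = [1.0, 1.0, 1.0, 1.0]

# Encoding against an anchor and decoding again is an exact round trip: both
# functions use the "+1" width/height convention, which is why delta2bbox
# subtracts 1 when reconstructing x2/y2.
deltas = bbox2delta(anchors, gts, weights)
decoded = delta2bbox(deltas, anchors, weights)
assert np.allclose(decoded, gts)

# Pairwise IoU between the two box sets; the diagonal entries dominate here.
print(bbox_overlaps(anchors, gts))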