test=develop clean code

98f4e41c · FDInSky · 4a6db5ef · 4a6db5ef · 98f4e41c · 4a6db5ef
10 changed files
--- a/dygraph/rcnn/.run_ce.sh
+++ b/dygraph/rcnn/.run_ce.sh
-#!/bin/bash
-
-export MKL_NUM_THREADS=1
-export OMP_NUM_THREADS=1
-
-
-cudaid=${face_detection:=0} # use 0-th card as default
-export CUDA_VISIBLE_DEVICES=$cudaid
-
-FLAGS_benchmark=true  python train.py --model_save_dir=output/ --data_dir=dataset/coco/ --max_iter=500 --enable_ce --pretrained_model=./imagenet_resnet50_fusebn --learning_rate=0.00125 | python _ce.py
-
-
-cudaid=${face_detection_m:=0,1,2,3} # use 0,1,2,3 card as default
-export CUDA_VISIBLE_DEVICES=$cudaid
-
-FLAGS_benchmark=true  python train.py --model_save_dir=output/ --data_dir=dataset/coco/ --max_iter=500 --enable_ce --pretrained_model=./imagenet_resnet50_fusebn --learning_rate=0.005 | python _ce.py
-
--- a/dygraph/rcnn/README.md
+++ b/dygraph/rcnn/README.md
@@ -29,7 +29,7 @@ RCNN系列目前包含两个代表模型：Faster RCNN，Mask RCNN

 [Mask RCNN](https://arxiv.org/abs/1703.06870) 扩展自Faster RCNN，是经典的实例分割模型。

-Mask RCNN同样为两阶段框架，第一阶段扫描图像生成候选框；第二阶段根据候选框得到分类结果，边界框，同时在原有Faster RCNN模型基础上添加分割分支，得到掩码结果，实现了掩码和类别预测关系的解藕。
+Mask RCNN同样为两阶段框架，第一阶段扫描图像生成候选框；第二阶段根据候选框得到分类结果，边界框，同时在原有Faster RCNN模型基础上添加分割分支，得到掩码结果，实现了掩码和类别预测关系的解耦。


 ## 数据准备
@@ -62,7 +62,7 @@ data/coco/

 ## 模型训练

-**下载预训练模型：** 本示例提供Resnet-50预训练模型，该模性转换自Caffe，并对批标准化层(Batch Normalization Layer)进行参数融合。采用如下命令下载预训练模型：
+**下载预训练模型：** 本示例提供Resnet-50预训练模型，该模型转换自Caffe，并对批标准化层(Batch Normalization Layer)进行参数融合。采用如下命令下载预训练模型：

    sh ./pretrained/download.sh


--- a/dygraph/rcnn/_ce.py
+++ b/dygraph/rcnn/_ce.py
-# this file is only used for continuous evaluation test!
-
-import os
-import sys
-sys.path.append(os.environ['ceroot'])
-from kpi import CostKpi
-from kpi import DurationKpi
-
-each_pass_duration_card1_kpi = DurationKpi(
-    'each_pass_duration_card1', 0.08, 0, actived=True)
-train_loss_card1_kpi = CostKpi('train_loss_card1', 0.08, 0)
-each_pass_duration_card4_kpi = DurationKpi(
-    'each_pass_duration_card4', 0.08, 0, actived=True)
-train_loss_card4_kpi = CostKpi('train_loss_card4', 0.08, 0)
-
-tracking_kpis = [
-    each_pass_duration_card1_kpi,
-    train_loss_card1_kpi,
-    each_pass_duration_card4_kpi,
-    train_loss_card4_kpi,
-]
-
-
-def parse_log(log):
-    '''
-    This method should be implemented by model developers.
-
-    The suggestion:
-
-    each line in the log should be key, value, for example:
-
-    "
-    train_cost\t1.0
-    test_cost\t1.0
-    train_cost\t1.0
-    train_cost\t1.0
-    train_acc\t1.2
-    "
-    '''
-    for line in log.split('\n'):
-        fs = line.strip().split('\t')
-        print(fs)
-        if len(fs) == 3 and fs[0] == 'kpis':
-            kpi_name = fs[1]
-            kpi_value = float(fs[2])
-            yield kpi_name, kpi_value
-
-
-def log_to_ce(log):
-    kpi_tracker = {}
-    for kpi in tracking_kpis:
-        kpi_tracker[kpi.name] = kpi
-
-    for (kpi_name, kpi_value) in parse_log(log):
-        print(kpi_name, kpi_value)
-        kpi_tracker[kpi_name].add_record(kpi_value)
-        kpi_tracker[kpi_name].persist()
-
-
-if __name__ == '__main__':
-    log = sys.stdin.read()
-    log_to_ce(log)
--- a/dygraph/rcnn/eval_dyg.py
+++ b/dygraph/rcnn/eval_dyg.py
@@ -99,77 +99,73 @@ def eval():
        train_reader = fluid.contrib.reader.distributed_batch_reader(
            train_reader)

-    def eval_loop():
-        eval_start = time.time()
-        dts_res = []
-        segms_res = []
-        for iter_id, data in enumerate(test_reader()):
-            start = time.time()
-
-            image_data = np.array([x[0] for x in data]).astype('float32')
-            image_info_data = np.array([x[1] for x in data]).astype('float32')
-            image_id_data = np.array([x[2] for x in data]).astype('int32')
-
-            if cfg.enable_ce:
-                print("image_data: ", np.abs(image_data).mean(),
-                      image_data.shape)
-                print("im_info_dta: ", np.abs(image_info_data).mean(),
-                      image_info_data.shape, image_info_data)
-                print("img_id: ", image_id_data, image_id_data.shape)
-
-            # forward
-            outputs = model(image_data, image_info_data, image_id_data)
-
-            pred_boxes_v = outputs[1].numpy()
-            if cfg.MASK_ON:
-                masks_v = outputs[2].numpy()
-
-            new_lod = list(outputs[0].numpy())
-            #new_lod = [[0, pred_boxes_v.shape[0]]] #pred_boxes_v.lod()
-            nmsed_out = pred_boxes_v
-
-            dts_res += get_dt_res(total_batch_size, new_lod, nmsed_out, data,
-                                  num_id_to_cat_id_map)
-
-            if cfg.MASK_ON and np.array(masks_v).shape != (1, 1):
-                segms_out = segm_results(nmsed_out, masks_v, image_info_data)
-                segms_res += get_segms_res(total_batch_size, new_lod, segms_out,
-                                           data, num_id_to_cat_id_map)
-
-            end = time.time()
-            print('batch id: {}, time: {}'.format(iter_id, end - start))
-        eval_end = time.time()
-        total_time = eval_end - eval_start
-        print('average time of eval is: {}'.format(total_time / (iter_id + 1)))
-        assert len(dts_res) > 0, "The number of valid bbox detected is zero.\n \
-            Please use reasonable model and check input data."
-
+    eval_start = time.time()
+    dts_res = []
+    segms_res = []
+    for iter_id, data in enumerate(test_reader()):
+        start = time.time()
+
+        image_data = np.array([x[0] for x in data]).astype('float32')
+        image_info_data = np.array([x[1] for x in data]).astype('float32')
+        image_id_data = np.array([x[2] for x in data]).astype('int32')
+
+        if cfg.enable_ce:
+            print("image_data: ", np.abs(image_data).mean(), image_data.shape)
+            print("im_info_dta: ", np.abs(image_info_data).mean(),
+                  image_info_data.shape, image_info_data)
+            print("img_id: ", image_id_data, image_id_data.shape)
+
+        # forward
+        outputs = model(image_data, image_info_data, image_id_data)
+
+        pred_boxes_v = outputs[1].numpy()
        if cfg.MASK_ON:
-            assert len(
-                segms_res) > 0, "The number of valid mask detected is zero.\n \
-                Please use reasonable model and check input data."
+            masks_v = outputs[2].numpy()
+
+        new_lod = list(outputs[0].numpy())
+        #new_lod = [[0, pred_boxes_v.shape[0]]] #pred_boxes_v.lod()
+        nmsed_out = pred_boxes_v
+
+        dts_res += get_dt_res(total_batch_size, new_lod, nmsed_out, data,
+                              num_id_to_cat_id_map)
+
+        if cfg.MASK_ON and np.array(masks_v).shape != (1, 1):
+            segms_out = segm_results(nmsed_out, masks_v, image_info_data)
+            segms_res += get_segms_res(total_batch_size, new_lod, segms_out,
+                                       data, num_id_to_cat_id_map)
+
+        end = time.time()
+        print('batch id: {}, time: {}'.format(iter_id, end - start))
+    eval_end = time.time()
+    total_time = eval_end - eval_start
+    print('average time of eval is: {}'.format(total_time / (iter_id + 1)))
+    assert len(dts_res) > 0, "The number of valid bbox detected is zero.\n \
+        Please use reasonable model and check input data."
+
+    if cfg.MASK_ON:
+        assert len(
+            segms_res) > 0, "The number of valid mask detected is zero.\n \
+            Please use reasonable model and check input data."

-        with io.open("detection_bbox_result.json", 'w') as outfile:
+    with io.open("detection_bbox_result.json", 'w') as outfile:
+        encode_func = unicode if six.PY2 else str
+        outfile.write(encode_func(json.dumps(dts_res)))
+    print("start evaluate bbox using coco api")
+    cocoDt = cocoGt.loadRes("detection_bbox_result.json")
+    cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
+    cocoEval.evaluate()
+    cocoEval.accumulate()
+    cocoEval.summarize()
+
+    if cfg.MASK_ON:
+        with io.open("detection_segms_result.json", 'w') as outfile:
            encode_func = unicode if six.PY2 else str
-            outfile.write(encode_func(json.dumps(dts_res)))
-        print("start evaluate bbox using coco api")
-        cocoDt = cocoGt.loadRes("detection_bbox_result.json")
-        cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
+            outfile.write(encode_func(json.dumps(segms_res)))
+        print("start evaluate mask using coco api")
+        cocoDt = cocoGt.loadRes("detection_segms_result.json")
+        cocoEval = COCOeval(cocoGt, cocoDt, 'segm')
        cocoEval.evaluate()
        cocoEval.accumulate()
-        cocoEval.summarize()
-
-        if cfg.MASK_ON:
-            with io.open("detection_segms_result.json", 'w') as outfile:
-                encode_func = unicode if six.PY2 else str
-                outfile.write(encode_func(json.dumps(segms_res)))
-            print("start evaluate mask using coco api")
-            cocoDt = cocoGt.loadRes("detection_segms_result.json")
-            cocoEval = COCOeval(cocoGt, cocoDt, 'segm')
-            cocoEval.evaluate()
-            cocoEval.accumulate()
-
-    eval_loop()


 if __name__ == '__main__':

--- a/dygraph/rcnn/eval_helper.py
+++ b/dygraph/rcnn/eval_helper.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#  Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
 #
 #Licensed under the Apache License, Version 2.0 (the "License");
 #you may not use this file except in compliance with the License.

--- a/dygraph/rcnn/infer.py
+++ b/dygraph/rcnn/infer.py
-import os
-import time
-import numpy as np
-from eval_helper import *
-import paddle
-import paddle.fluid as fluid
-import reader
-from utility import print_arguments, parse_args, check_gpu
-import models.model_builder as model_builder
-import models.resnet as resnet
-from config import cfg
-from data_utils import DatasetPath
-
-
-def infer():
-
-    try:
-        from pycocotools.coco import COCO
-        from pycocotools.cocoeval import COCOeval, Params
-
-        data_path = DatasetPath('val')
-        test_list = data_path.get_file_list()
-        coco_api = COCO(test_list)
-        cid = coco_api.getCatIds()
-        cat_id_to_num_id_map = {
-            v: i + 1
-            for i, v in enumerate(coco_api.getCatIds())
-        }
-        category_ids = coco_api.getCatIds()
-        labels_map = {
-            cat_id_to_num_id_map[item['id']]: item['name']
-            for item in coco_api.loadCats(category_ids)
-        }
-        labels_map[0] = 'background'
-    except:
-        print("The COCO dataset or COCO API is not exist, use the default "
-              "mapping of class index and real category name on COCO17.")
-        assert cfg.dataset == 'coco2017'
-        labels_map = coco17_labels()
-
-    image_shape = [3, cfg.TEST.max_size, cfg.TEST.max_size]
-    class_nums = cfg.class_num
-
-    model = model_builder.RCNN(
-        add_conv_body_func=resnet.add_ResNet50_conv4_body,
-        add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
-        use_pyreader=False,
-        mode='infer')
-    model.build_model(image_shape)
-    pred_boxes = model.eval_bbox_out()
-    if cfg.MASK_ON:
-        masks = model.eval_mask_out()
-    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    # yapf: disable
-    if not os.path.exists(cfg.pretrained_model):
-        raise ValueError("Model path [%s] does not exist." % (cfg.pretrained_model))
-
-    def if_exist(var):
-        return os.path.exists(os.path.join(cfg.pretrained_model, var.name))
-    fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist)
-    # yapf: enable
-    infer_reader = reader.infer(cfg.image_path)
-    feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())
-
-    dts_res = []
-    segms_res = []
-    if cfg.MASK_ON:
-        fetch_list = [pred_boxes, masks]
-    else:
-        fetch_list = [pred_boxes]
-    data = next(infer_reader())
-    im_info = [data[0][1]]
-    result = exe.run(fetch_list=[v.name for v in fetch_list],
-                     feed=feeder.feed(data),
-                     return_numpy=False)
-    pred_boxes_v = result[0]
-    if cfg.MASK_ON:
-        masks_v = result[1]
-    new_lod = pred_boxes_v.lod()
-    nmsed_out = pred_boxes_v
-    image = None
-    if cfg.MASK_ON:
-        segms_out = segm_results(nmsed_out, masks_v, im_info)
-        image = draw_mask_on_image(cfg.image_path, segms_out,
-                                   cfg.draw_threshold)
-
-    draw_bounding_box_on_image(cfg.image_path, nmsed_out, cfg.draw_threshold,
-                               labels_map, image)
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    print_arguments(args)
-    check_gpu(args.use_gpu)
-    infer()
--- a/dygraph/rcnn/learning_rate.py
+++ b/dygraph/rcnn/learning_rate.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import paddle.fluid as fluid
-import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler
-from paddle.fluid.layers import control_flow
-
-
-def exponential_with_warmup_decay(learning_rate, boundaries, values,
-                                  warmup_iter, warmup_factor):
-    global_step = lr_scheduler._decay_step_counter()
-
-    lr = fluid.layers.create_global_var(
-        shape=[1],
-        value=0.0,
-        dtype='float32',
-        persistable=True,
-        name="learning_rate")
-
-    warmup_iter_var = fluid.layers.fill_constant(
-        shape=[1], dtype='float32', value=float(warmup_iter), force_cpu=True)
-
-    with control_flow.Switch() as switch:
-        with switch.case(global_step < warmup_iter_var):
-            alpha = global_step / warmup_iter_var
-            factor = warmup_factor * (1 - alpha) + alpha
-            decayed_lr = learning_rate * factor
-            fluid.layers.assign(decayed_lr, lr)
-
-        for i in range(len(boundaries)):
-            boundary_val = fluid.layers.fill_constant(
-                shape=[1],
-                dtype='float32',
-                value=float(boundaries[i]),
-                force_cpu=True)
-            value_var = fluid.layers.fill_constant(
-                shape=[1], dtype='float32', value=float(values[i]))
-            with switch.case(global_step < boundary_val):
-                fluid.layers.assign(value_var, lr)
-
-        last_value_var = fluid.layers.fill_constant(
-            shape=[1], dtype='float32', value=float(values[len(values) - 1]))
-        with switch.default():
-            fluid.layers.assign(last_value_var, lr)
-
-    return lr
--- a/dygraph/rcnn/models/dyg/cyops/1
+++ b/dygraph/rcnn/models/dyg/cyops/1
-cimport cython
-import numpy as np
-cimport numpy as np 
-from .bbox import * 
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def rpn_target_assign(
-    anchor_box,
-    gt_boxes,
-    is_crowd,
-    im_info,
-    rpn_straddle_thresh,
-    rpn_batch_size_per_im,
-    rpn_positive_overlap,
-    rpn_negative_overlap,
-    rpn_fg_fraction,
-    use_random=False):
-
-    anchor_num = anchor_box.shape[0]
-    batch_size = gt_boxes.shape[0]
-
-    for i in range(batch_size):
-        im_height = im_info[i][0]
-        im_width = im_info[i][1]
-        im_scale = im_info[i][2]
-        if rpn_straddle_thresh >= 0:
-            # Only keep anchors inside the image by a margin of straddle_thresh
-            inds_inside = np.where(
-                (anchor_box[:, 0] >= -rpn_straddle_thresh) &
-                (anchor_box[:, 1] >= -rpn_straddle_thresh) & (
-                    anchor_box[:, 2] < im_width + rpn_straddle_thresh) & (
-                        anchor_box[:, 3] < im_height + rpn_straddle_thresh))[0]
-            # keep only inside anchors
-            inside_anchors = anchor_box[inds_inside, :]
-        else:
-            inds_inside = np.arange(anchor_box.shape[0])
-            inside_anchors = anchor_box
-        gt_boxes_slice = gt_boxes[i] * im_scale
-        is_crowd_slice = is_crowd[i]
-
-        not_crowd_inds = np.where(is_crowd_slice == 0)[0]
-        gt_boxes_slice = gt_boxes_slice[not_crowd_inds]
-        iou = bbox_overlaps(inside_anchors, gt_boxes_slice)
-
-        loc_inds, score_inds, labels, gt_inds, bbox_inside_weight = \
-            _sample_anchor(iou, rpn_batch_size_per_im,
-                rpn_positive_overlap,
-                rpn_negative_overlap,
-                rpn_fg_fraction,
-                use_random)
-        # unmap to all anchor 
-        loc_inds = inds_inside[loc_inds]
-        score_inds = inds_inside[score_inds]
-
-	sampled_anchor = anchor_box[loc_inds]
-        sampled_gt = gt_boxes_slice[gt_inds]
-        box_deltas = box_to_delta(
-                sampled_anchor, sampled_gt, 
-                [1., 1., 1., 1.]
-        )
-
-        if i == 0:
-            loc_indexes = loc_inds
-            score_indexes = score_inds
-            tgt_labels = labels
-            tgt_bboxes = box_deltas
-            bbox_inside_weights = bbox_inside_weight
-        else:
-            loc_indexes = np.concatenate(
-                [loc_indexes, loc_inds + i * anchor_num])
-            score_indexes = np.concatenate(
-                [score_indexes, score_inds + i * anchor_num])
-            tgt_labels = np.concatenate([tgt_labels, labels])
-            tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
-            bbox_inside_weights = np.vstack([bbox_inside_weights, \
-                                             bbox_inside_weight])
-
-    return loc_indexes, score_indexes, tgt_labels, tgt_bboxes, bbox_inside_weights 
-
-#@jit 
-def _sample_anchor(
-    anchor_by_gt_overlap,
-    rpn_batch_size_per_im,
-    rpn_positive_overlap,
-    rpn_negative_overlap,
-    rpn_fg_fraction,
-    use_random=False):
-
-    anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
-    anchor_to_gt_max = anchor_by_gt_overlap[np.arange(
-        anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax]
-
-    gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
-    gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange(
-        anchor_by_gt_overlap.shape[1])]
-    anchors_with_max_overlap = np.where(
-        anchor_by_gt_overlap == gt_to_anchor_max)[0]
-
-    labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1
-    labels[anchors_with_max_overlap] = 1
-    labels[anchor_to_gt_max >= rpn_positive_overlap] = 1
-   
-    num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im)
-    fg_inds = np.where(labels == 1)[0]
-    if len(fg_inds) > num_fg and use_random:
-        disable_inds = np.random.choice(
-            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
-    else:
-        disable_inds = fg_inds[num_fg:]
-
-    labels[disable_inds] = -1
-    fg_inds = np.where(labels == 1)[0]
-
-    num_bg = rpn_batch_size_per_im - np.sum(labels == 1)
-    bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
-    if len(bg_inds) > num_bg and use_random:
-        enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
-    else:
-        enable_inds = bg_inds[:num_bg]
-
-    fg_fake_inds = np.array([], np.int32)
-    fg_value = np.array([fg_inds[0]], np.int32)
-    fake_num = 0
-    for bg_id in enable_inds:
-        if bg_id in fg_inds:
-            fake_num += 1
-            fg_fake_inds = np.hstack([fg_fake_inds, fg_value])
-    labels[enable_inds] = 0
-
-    #bbox_inside_weight[fake_num:, :] = 1
-    
-    fg_inds = np.where(labels == 1)[0]
-    bg_inds = np.where(labels == 0)[0]
-    
-    loc_index = np.hstack([fg_fake_inds, fg_inds])
-    score_index = np.hstack([fg_inds, bg_inds])
-    labels = labels[score_index]
-    assert not np.any(labels == -1), "Wrong labels with -1"
-
-    gt_inds = anchor_to_gt_argmax[loc_index]
-   
-    bbox_inside_weight = np.zeros((len(loc_index), 4), dtype=np.float32)
-    bbox_inside_weight[fake_num:, :] = 1
-    return loc_index, score_index, labels, gt_inds, bbox_inside_weight
-
-#@jit 
-def generate_proposal_labels(
-        rpn_rois, rpn_rois_lod, gt_classes, is_crowd, gt_boxes, im_info, batch_size_per_im,
-        fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights,
-        class_nums, use_random=False, is_cls_agnostic=False, is_cascade_rcnn=False):
-    rois = []
-    labels_int32 = []
-    bbox_targets = []
-    bbox_inside_weights = []
-    bbox_outside_weights = []
-    lod = []
-    batch_size = gt_boxes.shape[0]
-    # TODO: modify here
-    # rpn_rois = rpn_rois.reshape(batch_size, -1, 4)
-    st_num = 0
-
-    for im_i, rpn_rois_num in enumerate(rpn_rois_lod):
-        frcn_blobs = _sample_rois(
-            rpn_rois[st_num:rpn_rois_num], 
-            gt_classes[im_i], is_crowd[im_i], gt_boxes[im_i], im_info[im_i],
-            batch_size_per_im, fg_fraction, fg_thresh,
-            bg_thresh_hi, bg_thresh_lo, bbox_reg_weights,
-            class_nums, use_random, is_cls_agnostic, is_cascade_rcnn)
-        st_num = rpn_rois_num
-        rois.append(frcn_blobs['rois'])
-        labels_int32.append(frcn_blobs['labels_int32'])
-        bbox_targets.append(frcn_blobs['bbox_targets'])
-        bbox_inside_weights.append(frcn_blobs['bbox_inside_weights'])
-        bbox_outside_weights.append(frcn_blobs['bbox_outside_weights'])
-        lod.append(frcn_blobs['rois'].shape[0])
-    
-                
-    o_rois = np.concatenate(rois, axis=0).astype(np.float32) 
-    o_labels =  np.concatenate(labels_int32, axis=0).astype(np.int32).reshape(-1, 1) 
-    o_bbox_targets = np.concatenate(bbox_targets, axis=0).astype(np.float32)
-    o_bbox_inside_weights = np.concatenate(bbox_inside_weights, axis=0).astype(np.float32)
-    o_bbox_outside_weights = np.concatenate(bbox_outside_weights, axis=0).astype(np.float32)
-    o_lod = np.asarray(lod, np.int32)
-    
-    return o_rois, o_labels, o_bbox_targets, o_bbox_inside_weights, o_bbox_outside_weights, o_lod 
-
-#@jit 
-def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
-                 batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi,
-                 bg_thresh_lo, bbox_reg_weights, class_nums, use_random, is_cls_agnostic,
-                 is_cascade_rcnn):
-    rois_per_image = int(batch_size_per_im)
-    #print("debug rois_per_image: ", rois_per_image)
-    fg_rois_per_im = int(np.round(fg_fraction * rois_per_image))
-
-    # Roidb
-    im_scale = im_info[2]
-    inv_im_scale = 1. / im_scale
-    rpn_rois = rpn_rois * inv_im_scale
-    if is_cascade_rcnn:
-        rpn_rois = rpn_rois[gt_boxes.shape[0]:, :]
-    boxes = np.vstack([gt_boxes, rpn_rois])
-    gt_overlaps = np.zeros((boxes.shape[0], class_nums))
-    box_to_gt_ind_map = np.zeros((boxes.shape[0]), dtype=np.int32)
-    if len(gt_boxes) > 0:
-        proposal_to_gt_overlaps = bbox_overlaps(boxes, gt_boxes)
-
-        overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1)
-        overlaps_max = proposal_to_gt_overlaps.max(axis=1)
-        # Boxes which with non-zero overlap with gt boxes
-        overlapped_boxes_ind = np.where(overlaps_max > 0)[0]
-        overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[
-            overlapped_boxes_ind]]
-        gt_overlaps[overlapped_boxes_ind,
-                    overlapped_boxes_gt_classes] = overlaps_max[
-                        overlapped_boxes_ind]
-        box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[
-            overlapped_boxes_ind]
-
-    crowd_ind = np.where(is_crowd)[0]
-    gt_overlaps[crowd_ind] = -1
-
-    max_overlaps = gt_overlaps.max(axis=1)
-    max_classes = gt_overlaps.argmax(axis=1)
-
-    # Cascade RCNN Decode Filter
-    if is_cascade_rcnn:
-        ws = boxes[:, 2] - boxes[:, 0] + 1
-        hs = boxes[:, 3] - boxes[:, 1] + 1
-        keep = np.where((ws > 0) & (hs > 0))[0]
-        boxes = boxes[keep]
-        fg_inds = np.where(max_overlaps >= fg_thresh)[0]
-        bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >=
-                                                            bg_thresh_lo))[0]
-        fg_rois_per_this_image = fg_inds.shape[0]
-        bg_rois_per_this_image = bg_inds.shape[0]
-    else:
-        # Foreground
-        fg_inds = np.where(max_overlaps >= fg_thresh)[0]
-        fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0])
-        # Sample foreground if there are too many
-        if (fg_inds.shape[0] > fg_rois_per_this_image) and use_random:
-            fg_inds = np.random.choice(
-                fg_inds, size=fg_rois_per_this_image, replace=False)
-        fg_inds = fg_inds[:fg_rois_per_this_image]
-        # Background
-        bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >=
-                                                            bg_thresh_lo))[0]
-        bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
-        bg_rois_per_this_image = np.minimum(bg_rois_per_this_image,
-                                            bg_inds.shape[0])
-        # Sample background if there are too many
-        if (bg_inds.shape[0] > bg_rois_per_this_image) and use_random:
-            bg_inds = np.random.choice(
-                bg_inds, size=bg_rois_per_this_image, replace=False)
-        bg_inds = bg_inds[:bg_rois_per_this_image]
-    keep_inds = np.append(fg_inds, bg_inds)
-    sampled_labels = max_classes[keep_inds]
-    sampled_labels[fg_rois_per_this_image:] = 0
-    sampled_boxes = boxes[keep_inds]
-    sampled_gts = gt_boxes[box_to_gt_ind_map[keep_inds]]
-    sampled_gts[fg_rois_per_this_image:, :] = gt_boxes[0]
-    bbox_label_targets = compute_targets(sampled_boxes, sampled_gts,
-                                          sampled_labels, bbox_reg_weights)
-    bbox_targets, bbox_inside_weights = expand_bbox_targets(
-        bbox_label_targets, class_nums, is_cls_agnostic)
-    bbox_outside_weights = np.array(
-        bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)
-    # Scale rois
-    sampled_rois = sampled_boxes * im_scale
-
-    # Faster RCNN blobs
-    frcn_blobs = dict(
-        rois=sampled_rois,
-        labels_int32=sampled_labels,
-        bbox_targets=bbox_targets,
-        bbox_inside_weights=bbox_inside_weights,
-        bbox_outside_weights=bbox_outside_weights)
-    #for k,v in frcn_blobs.items():
-    #    print(k, v.shape)
-    return frcn_blobs
-
-
--- a/dygraph/rcnn/models/dyg/pyops/roi_extractor.py
+++ b/dygraph/rcnn/models/dyg/pyops/roi_extractor.py
-import sys
-import math
-import numpy as np
-from paddle.fluid.dygraph.base import to_variable
-
-
-def roi_pool(input_x, rois, pooled_height, pooled_width, spatial_scale):
-    input_x = input_x.numpy()
-    rois = rois.numpy()
-    batch_size, channels, height, width = input_x.shape
-    print("debug roi pool")
-    print("debug input feat: ", input_x.shape)
-    rois_num = rois.shape[1]
-    #out_data = np.zeros((rois_num, channels, pooled_height, pooled_width))
-    #argmax_data = np.zeros((rois_num, channels, pooled_height, pooled_width))
-    outs_list = []
-    for bi in range(batch_size):
-        out_data = np.zeros((rois_num, channels, pooled_height, pooled_width))
-        argmax_data = np.zeros(
-            (rois_num, channels, pooled_height, pooled_width))
-        for i in range(rois_num):
-            roi = rois[bi][i]
-            # roi_batch_id = int(roi[0])
-            roi_start_w = int(np.round(roi[0] * spatial_scale))
-            roi_start_h = int(np.round(roi[1] * spatial_scale))
-            roi_end_w = int(np.round(roi[2] * spatial_scale))
-            roi_end_h = int(np.round(roi[3] * spatial_scale))
-
-            roi_height = int(max(roi_end_h - roi_start_h + 1, 1))
-            roi_width = int(max(roi_end_w - roi_start_w + 1, 1))
-
-            x_i = input_x[bi]  #input_x[roi_batch_id]
-
-            bin_size_h = float(roi_height) / float(pooled_height)
-            bin_size_w = float(roi_width) / float(pooled_width)
-
-            for c in range(channels):
-                for ph in range(pooled_height):
-                    for pw in range(pooled_width):
-                        hstart = int(math.floor(ph * bin_size_h))
-                        wstart = int(math.floor(pw * bin_size_w))
-                        hend = int(math.ceil((ph + 1) * bin_size_h))
-                        wend = int(math.ceil((pw + 1) * bin_size_w))
-
-                        hstart = min(max(hstart + roi_start_h, 0), height)
-                        hend = min(max(hend + roi_start_h, 0), height)
-                        wstart = min(max(wstart + roi_start_w, 0), width)
-                        wend = min(max(wend + roi_start_w, 0), width)
-
-                        is_empty = (hend <= hstart) or (wend <= wstart)
-                        if is_empty:
-                            out_data[i, c, ph, pw] = 0
-                        else:
-                            out_data[i, c, ph, pw] = -sys.float_info.max
-
-                        argmax_data[i, c, ph, pw] = -1
-
-                        for h in range(hstart, hend):
-                            for w in range(wstart, wend):
-                                if x_i[c, h, w] > out_data[i, c, ph, pw]:
-                                    out_data[i, c, ph, pw] = x_i[c, h, w]
-                                    argmax_data[i, c, ph, pw] = h * width + w
-
-        outs = out_data.astype('float32')
-        argmaxes = argmax_data.astype('int64')
-        outs_list.append(outs)
-    outs = np.asarray(outs_list, dtype=np.float32)
-    outs = to_variable(np.asarray(outs_list, dtype=np.float32))
-    return outs
--- a/dygraph/rcnn/train.py
+++ b/dygraph/rcnn/train.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import os
-
-
-def set_paddle_flags(flags):
-    for key, value in flags.items():
-        if os.environ.get(key, None) is None:
-            os.environ[key] = str(value)
-
-
-set_paddle_flags({
-    'FLAGS_conv_workspace_size_limit': 500,
-    'FLAGS_eager_delete_tensor_gb': 0,  # enable gc
-    'FLAGS_memory_fraction_of_eager_deletion': 1,
-    'FLAGS_fraction_of_gpu_memory_to_use': 0.98
-})
-
-import sys
-import numpy as np
-import time
-import shutil
-from utility import parse_args, print_arguments, SmoothedValue, TrainingStats, now_time, check_gpu
-import collections
-
-import paddle
-import paddle.fluid as fluid
-import reader
-import models.model_builder as model_builder
-import models.resnet as resnet
-from learning_rate import exponential_with_warmup_decay
-from config import cfg
-import dist_utils
-
-num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
-
-
-def get_device_num():
-    # NOTE(zcd): for multi-processe training, each process use one GPU card.
-    if num_trainers > 1:
-        return 1
-    return fluid.core.get_cuda_device_count()
-
-
-def train():
-    learning_rate = cfg.learning_rate
-    image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]
-
-    if cfg.enable_ce:
-        fluid.default_startup_program().random_seed = 1000
-        fluid.default_main_program().random_seed = 1000
-        import random
-        random.seed(0)
-        np.random.seed(0)
-
-    devices_num = get_device_num()
-    total_batch_size = devices_num * cfg.TRAIN.im_per_batch
-
-    use_random = True
-    if cfg.enable_ce:
-        use_random = False
-    model = model_builder.RCNN(
-        add_conv_body_func=resnet.add_ResNet50_conv4_body,
-        add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
-        use_pyreader=cfg.use_pyreader,
-        use_random=use_random)
-    model.build_model(image_shape)
-    losses, keys = model.loss()
-    loss = losses[0]
-    fetch_list = losses
-
-    boundaries = cfg.lr_steps
-    gamma = cfg.lr_gamma
-    step_num = len(cfg.lr_steps)
-    values = [learning_rate * (gamma**i) for i in range(step_num + 1)]
-
-    lr = exponential_with_warmup_decay(
-        learning_rate=learning_rate,
-        boundaries=boundaries,
-        values=values,
-        warmup_iter=cfg.warm_up_iter,
-        warmup_factor=cfg.warm_up_factor)
-    optimizer = fluid.optimizer.Momentum(
-        learning_rate=lr,
-        regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
-        momentum=cfg.momentum)
-    optimizer.minimize(loss)
-    fetch_list = fetch_list + [lr]
-
-    for var in fetch_list:
-        var.persistable = True
-
-    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
-    place = fluid.CUDAPlace(gpu_id) if cfg.use_gpu else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
-
-    if cfg.pretrained_model:
-
-        def if_exist(var):
-            return os.path.exists(os.path.join(cfg.pretrained_model, var.name))
-
-        fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist)
-
-    if cfg.parallel:
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.memory_optimize = False
-        build_strategy.enable_inplace = True
-        exec_strategy = fluid.ExecutionStrategy()
-        exec_strategy.num_iteration_per_drop_scope = 10
-
-        if num_trainers > 1 and cfg.use_gpu:
-            dist_utils.prepare_for_multi_process(exe, build_strategy,
-                                                 fluid.default_main_program())
-            # NOTE: the process is fast when num_threads is 1 
-            # for multi-process training.
-            exec_strategy.num_threads = 1
-
-        train_exe = fluid.ParallelExecutor(
-            use_cuda=bool(cfg.use_gpu),
-            loss_name=loss.name,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy)
-    else:
-        train_exe = exe
-
-    shuffle = True
-    if cfg.enable_ce:
-        shuffle = False
-    # NOTE: do not shuffle dataset when using multi-process training 
-    shuffle_seed = None
-    if num_trainers > 1:
-        shuffle_seed = 1
-    if cfg.use_pyreader:
-        train_reader = reader.train(
-            batch_size=cfg.TRAIN.im_per_batch,
-            total_batch_size=total_batch_size,
-            padding_total=cfg.TRAIN.padding_minibatch,
-            shuffle=shuffle,
-            shuffle_seed=shuffle_seed)
-        if num_trainers > 1:
-            assert shuffle_seed is not None, \
-                "If num_trainers > 1, the shuffle_seed must be set, because " \
-                "the order of batch data generated by reader " \
-                "must be the same in the respective processes."
-            # NOTE: the order of batch data generated by batch_reader
-            # must be the same in the respective processes.
-            if num_trainers > 1:
-                train_reader = fluid.contrib.reader.distributed_batch_reader(
-                    train_reader)
-        py_reader = model.py_reader
-        py_reader.decorate_paddle_reader(train_reader)
-    else:
-        if num_trainers > 1: shuffle = False
-        train_reader = reader.train(
-            batch_size=total_batch_size, shuffle=shuffle)
-        feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())
-
-    def save_model(postfix):
-        model_path = os.path.join(cfg.model_save_dir, postfix)
-        if os.path.isdir(model_path):
-            shutil.rmtree(model_path)
-        fluid.io.save_persistables(exe, model_path)
-
-    def train_loop_pyreader():
-        py_reader.start()
-        train_stats = TrainingStats(cfg.log_window, keys)
-        try:
-            start_time = time.time()
-            prev_start_time = start_time
-            for iter_id in range(cfg.max_iter):
-                prev_start_time = start_time
-                start_time = time.time()
-                outs = train_exe.run(fetch_list=[v.name for v in fetch_list])
-                stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
-                train_stats.update(stats)
-                logs = train_stats.log()
-                strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
-                    now_time(), iter_id,
-                    np.mean(outs[-1]), logs, start_time - prev_start_time)
-                print(strs)
-                sys.stdout.flush()
-                if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
-                    save_model("model_iter{}".format(iter_id))
-            end_time = time.time()
-            total_time = end_time - start_time
-            last_loss = np.array(outs[0]).mean()
-            if cfg.enable_ce:
-                gpu_num = devices_num
-                epoch_idx = iter_id + 1
-                loss = last_loss
-                print("kpis\teach_pass_duration_card%s\t%s" %
-                      (gpu_num, total_time / epoch_idx))
-                print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss))
-        except (StopIteration, fluid.core.EOFException):
-            py_reader.reset()
-
-    def train_loop():
-        start_time = time.time()
-        prev_start_time = start_time
-        start = start_time
-        train_stats = TrainingStats(cfg.log_window, keys)
-        for iter_id, data in enumerate(train_reader()):
-            prev_start_time = start_time
-            start_time = time.time()
-            outs = train_exe.run(fetch_list=[v.name for v in fetch_list],
-                                 feed=feeder.feed(data))
-            stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
-            train_stats.update(stats)
-            logs = train_stats.log()
-            strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
-                now_time(), iter_id,
-                np.mean(outs[-1]), logs, start_time - prev_start_time)
-            print(strs)
-            sys.stdout.flush()
-            if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
-                save_model("model_iter{}".format(iter_id))
-            if (iter_id + 1) == cfg.max_iter:
-                break
-        end_time = time.time()
-        total_time = end_time - start_time
-        last_loss = np.array(outs[0]).mean()
-        # only for ce
-        if cfg.enable_ce:
-            gpu_num = devices_num
-            epoch_idx = iter_id + 1
-            loss = last_loss
-            print("kpis\teach_pass_duration_card%s\t%s" %
-                  (gpu_num, total_time / epoch_idx))
-            print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss))
-
-    if cfg.use_pyreader:
-        train_loop_pyreader()
-    else:
-        train_loop()
-    save_model('model_final')
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    print_arguments(args)
-    check_gpu(args.use_gpu)
-    train()