add gtscore

5305956e · dengkaipeng · 2c3a3b36 · 5305956e · 5305956e · 5305956e
5 changed files
--- a/fluid/PaddleCV/yolov3/box_utils.py
+++ b/fluid/PaddleCV/yolov3/box_utils.py
@@ -140,7 +140,7 @@ def rescale_box_in_input_image(boxes, im_shape, input_size):
    boxes[boxes<0] = 0
    return boxes

-def box_crop(boxes, labels, crop, img_shape):
+def box_crop(boxes, labels, scores, crop, img_shape):
    x, y, w, h = map(float, crop)
    im_w, im_h = map(float, img_shape)

@@ -160,10 +160,11 @@ def box_crop(boxes, labels, crop, img_shape):
    mask = np.logical_and(mask, (boxes[:, :2] < boxes[:, 2:]).all(axis=1))
    boxes = boxes * np.expand_dims(mask.astype('float32'), axis=1)
    labels = labels * mask.astype('float32')
+    scores = scores * mask.astype('float32')
    boxes[:, 0], boxes[:, 2] = (boxes[:, 0] + boxes[:, 2]) / 2 / w, (boxes[:, 2] - boxes[:, 0]) / w
    boxes[:, 1], boxes[:, 3] = (boxes[:, 1] + boxes[:, 3]) / 2 / h, (boxes[:, 3] - boxes[:, 1]) / h

-    return boxes, labels, mask.sum()
+    return boxes, labels, scores, mask.sum()

 def get_yolo_detection(preds, anchors, class_num, img_width, img_height):
    """Get yolo box, confidence score, class label from Darknet53 output"""

--- a/fluid/PaddleCV/yolov3/image_utils.py
+++ b/fluid/PaddleCV/yolov3/image_utils.py
@@ -51,7 +51,7 @@ def random_distort(img):
    return img


-def random_crop(img, boxes, labels, scales=[0.3, 1.0], max_ratio=2.0, constraints=None, max_trial=50):
+def random_crop(img, boxes, labels, scores, scales=[0.3, 1.0], max_ratio=2.0, constraints=None, max_trial=50):
    if len(boxes) == 0:
        return img, boxes

@@ -65,7 +65,7 @@ def random_crop(img, boxes, labels, scales=[0.3, 1.0], max_ratio=2.0, constraint
                (0.0, 1.0)]

    img = Image.fromarray(img)
-    w, h = map(float, img.size)
+    w, h = img.size
    crops = [(0, 0, w, h)]
    for min_iou, max_iou in constraints:
        for _ in range(max_trial):
@@ -79,8 +79,8 @@ def random_crop(img, boxes, labels, scales=[0.3, 1.0], max_ratio=2.0, constraint
            crop_box = np.array([[
                (crop_x + crop_w / 2.0) / w,
                (crop_y + crop_h / 2.0) / h,
-                crop_w / w,
-                crop_h /h
+                crop_w / float(w),
+                crop_h /float(h)
                ]])

            iou = box_utils.box_iou_xywh(crop_box, boxes)
@@ -90,14 +90,14 @@ def random_crop(img, boxes, labels, scales=[0.3, 1.0], max_ratio=2.0, constraint

    while crops:
        crop = crops.pop(np.random.randint(0, len(crops)))
-        crop_boxes, crop_labels, box_num = box_utils.box_crop(boxes, labels, crop, (w, h))
+        crop_boxes, crop_labels, crop_scores, box_num = box_utils.box_crop(boxes, labels, scores, crop, (w, h))
        if box_num < 1:
            continue
        img = img.crop((crop[0], crop[1], crop[0] + crop[2], crop[1] + crop[3])).resize(img.size, Image.LANCZOS)
        img = np.asarray(img)
-        return img, crop_boxes, crop_labels
+        return img, crop_boxes, crop_labels, crop_scores
    img = np.asarray(img)
-    return img, boxes, labels
+    return img, boxes, labels, scores

 def random_flip(img, gtboxes, thresh=0.5):
    if random.random() > thresh:
@@ -151,13 +151,15 @@ def random_expand(img, gtboxes, max_ratio=4., fill=None, keep_ratio=True, thresh

    return out_img.astype('uint8'), gtboxes

-def image_mixup(img1, gtboxes1, gtlabels1, img2, gtboxes2, gtlabels2):
+def image_mixup(img1, gtboxes1, gtlabels1, gtscores1, img2, gtboxes2, gtlabels2, gtscores2):
    factor = np.random.beta(1.5, 1.5)
    factor = max(0.0, min(1.0, factor))
    if factor >= 1.0:
        return img1, gtboxes1, gtlabels1
    if factor <= 0.0:
        return img2, gtboxes2, gtlabels2
+    gtscores1 = gtscores1 * factor
+    gtscores2 = gtscores2 * (1.0 - factor)

    h = max(img1.shape[0], img2.shape[0])
    w = max(img1.shape[1], img2.shape[1])
@@ -166,10 +168,12 @@ def image_mixup(img1, gtboxes1, gtlabels1, img2, gtboxes2, gtlabels2):
    img[:img2.shape[0], :img2.shape[1], :] += img2.astype('float32') * (1.0 - factor)
    gtboxes = np.zeros_like(gtboxes1)
    gtlabels = np.zeros_like(gtlabels1)
+    gtscores = np.zeros_like(gtscores1)

    gt_valid_mask1 = np.logical_and(gtboxes1[:, 2] > 0, gtboxes1[:, 3] > 0)
    gtboxes1 = gtboxes1[gt_valid_mask1]
    gtlabels1 = gtlabels1[gt_valid_mask1]
+    gtscores1 = gtscores1[gt_valid_mask1]
    gtboxes1[:, 0] = gtboxes1[:, 0] * img1.shape[1] / w
    gtboxes1[:, 1] = gtboxes1[:, 1] * img1.shape[0] / h
    gtboxes1[:, 2] = gtboxes1[:, 2] * img1.shape[1] / w
@@ -178,23 +182,28 @@ def image_mixup(img1, gtboxes1, gtlabels1, img2, gtboxes2, gtlabels2):
    gt_valid_mask2 = np.logical_and(gtboxes2[:, 2] > 0, gtboxes2[:, 3] > 0)
    gtboxes2 = gtboxes2[gt_valid_mask2]
    gtlabels2 = gtlabels2[gt_valid_mask2]
+    gtscores2 = gtscores2[gt_valid_mask2]
    gtboxes2[:, 0] = gtboxes2[:, 0] * img2.shape[1] / w
    gtboxes2[:, 1] = gtboxes2[:, 1] * img2.shape[0] / h
    gtboxes2[:, 2] = gtboxes2[:, 2] * img2.shape[1] / w
    gtboxes2[:, 3] = gtboxes2[:, 3] * img2.shape[0] / h
+
    gtboxes_all = np.concatenate((gtboxes1, gtboxes2), axis=0)
    gtlabels_all = np.concatenate((gtlabels1, gtlabels2), axis=0)
+    gtscores_all = np.concatenate((gtscores1, gtscores2), axis=0)
    gt_num = min(len(gtboxes), len(gtboxes_all))
    gtboxes[:gt_num] = gtboxes_all[:gt_num]
    gtlabels[:gt_num] = gtlabels_all[:gt_num]
-    return img.astype('uint8'), gtboxes, gtlabels
+    gtscores[:gt_num] = gtscores_all[:gt_num]
+    return img.astype('uint8'), gtboxes, gtlabels, gtscores

-def image_augment(img, gtboxes, gtlabels, size, means=None):
+def image_augment(img, gtboxes, gtlabels, gtscores,  size, means=None):
    img = random_distort(img)
    img, gtboxes = random_expand(img, gtboxes, fill=means)
-    img, gtboxes, gtlabels = random_crop(img, gtboxes, gtlabels)
+    img, gtboxes, gtlabels, gtscores = random_crop(img, gtboxes, gtlabels, gtscores)
    img = random_interp(img, size)
    img, gtboxes = random_flip(img, gtboxes)

-    return img.astype('float32'), gtboxes.astype('float32'), gtlabels.astype('int32')
+    return img.astype('float32'), gtboxes.astype('float32'), \
+            gtlabels.astype('int32'), gtscores.astype('float32')

--- a/fluid/PaddleCV/yolov3/models.py
+++ b/fluid/PaddleCV/yolov3/models.py
@@ -204,6 +204,7 @@ class YOLOv3(object):
                            x=out,
                            gtbox=self.gtbox,
                            gtlabel=self.gtlabel,
+                            gtscore=self.gtscore,
                            anchors=anchors,
                            anchor_mask=anchor_mask,
                            class_num=class_num,
@@ -232,11 +233,11 @@ class YOLOv3(object):
        if self.use_pyreader and self.is_train:
            self.py_reader = fluid.layers.py_reader(
                capacity=64,
-                shapes = [[-1] + self.image_shape, [-1, cfg.max_box_num, 4], [-1, cfg.max_box_num]],
-                lod_levels=[0, 0, 0],
-                dtypes=['float32'] * 2 + ['int32'],
+                shapes = [[-1] + self.image_shape, [-1, cfg.max_box_num, 4], [-1, cfg.max_box_num], [-1, cfg.max_box_num]],
+                lod_levels=[0, 0, 0, 0],
+                dtypes=['float32'] * 2 + ['int32'] + ['float32'],
                use_double_buffer=True)
-            self.image, self.gtbox, self.gtlabel = fluid.layers.read_file(self.py_reader)
+            self.image, self.gtbox, self.gtlabel, self.gtscore = fluid.layers.read_file(self.py_reader)
        else:
            self.image = fluid.layers.data(
                    name='image', shape=self.image_shape, dtype='float32'
@@ -247,6 +248,9 @@ class YOLOv3(object):
            self.gtlabel = fluid.layers.data(
                    name='gtlabel', shape=[cfg.max_box_num], dtype='int32'
                    )
+            self.gtscore = fluid.layers.data(
+                    name='gtscore', shape=[cfg.max_box_num], dtype='float32'
+                    )
            self.im_shape = fluid.layers.data(
                    name="im_shape", shape=[2], dtype='int32')
            self.im_id = fluid.layers.data(
@@ -255,7 +259,7 @@ class YOLOv3(object):
    def feeds(self):
        if not self.is_train:
            return [self.image, self.im_id, self.im_shape]
-        return [self.image, self.gtbox, self.gtlabel]
+        return [self.image, self.gtbox, self.gtlabel, self.gtscore]

    def get_hyperparams(self):
        return self.hyperparams

--- a/fluid/PaddleCV/yolov3/reader.py
+++ b/fluid/PaddleCV/yolov3/reader.py
@@ -41,7 +41,7 @@ class DataSetReader(object):
        # cfg.data_dir = "dataset/coco"
        # cfg.train_file_list = 'annotations/instances_val2017.json'
        # cfg.train_data_dir = 'val2017'
-        cfg.dataset = "coco2017"
+        # cfg.dataset = "coco2017"
        if 'coco2014' in cfg.dataset:
            cfg.train_file_list = 'annotations/instances_train2014.json'
            cfg.train_data_dir = 'train2014'
@@ -170,16 +170,20 @@ class DataSetReader(object):
            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
            gt_boxes = img['gt_boxes'].copy()
            gt_labels = img['gt_labels'].copy()
+            gt_scores = np.ones_like(gt_labels)

            if mixup_img:
                mixup_im = cv2.imread(mixup_img['image'])
                mixup_im = cv2.cvtColor(mixup_im, cv2.COLOR_BGR2RGB)
                mixup_gt_boxes = mixup_img['gt_boxes'].copy()
                mixup_gt_labels = mixup_img['gt_labels'].copy()
-                im, gt_boxes, gt_labels = image_utils.image_mixup(im, gt_boxes, gt_labels, \
-                        mixup_im, mixup_gt_boxes, mixup_gt_labels)
+                mixup_gt_scores = np.ones_like(mixup_gt_labels)
+                im, gt_boxes, gt_labels, gt_scores = image_utils.image_mixup(im, gt_boxes, \
+                        gt_labels, gt_scores, mixup_im, mixup_gt_boxes, mixup_gt_labels, \
+                        mixup_gt_scores)
+
+            im, gt_boxes, gt_labels, gt_scores = image_utils.image_augment(im, gt_boxes, gt_labels, gt_scores, size, mean)

-            im, gt_boxes, gt_labels = image_utils.image_augment(im, gt_boxes, gt_labels, size, mean)
            # h, w, _ = im.shape
            # im_scale_x = size / float(w)
            # im_scale_y = size / float(h)
@@ -190,7 +194,7 @@ class DataSetReader(object):
            out_img = (im / 255.0 - mean) / std
            out_img = out_img.transpose((2, 0, 1)).astype('float32')

-            return (out_img, gt_boxes, gt_labels)
+            return (out_img, gt_boxes, gt_labels, gt_scores)

        def get_img_size(size, random_sizes=[]):
            if len(random_sizes):
@@ -222,9 +226,9 @@ class DataSetReader(object):
                    total_read_cnt += 1
                    if read_cnt % len(imgs) == 0 and shuffle:
                        np.random.shuffle(imgs)
-                    im, gt_boxes, gt_labels = img_reader_with_augment(img, img_size, cfg.pixel_means, cfg.pixel_stds, mixup_img)
-                    batch_out.append((im, gt_boxes, gt_labels))
-                    # img_ids.append(img['id'])
+                    im, gt_boxes, gt_labels, gt_scores = img_reader_with_augment(img, img_size, cfg.pixel_means, cfg.pixel_stds, mixup_img)
+                    batch_out.append((im, gt_boxes, gt_labels, gt_scores))
+                    # img_ids.append((img['id'], mixup_img['id'] if mixup_img else -1))

                    if len(batch_out) == batch_size:
                        # print("img_ids: ", img_ids)

--- a/fluid/PaddleCV/yolov3/utility.py
+++ b/fluid/PaddleCV/yolov3/utility.py
@@ -112,7 +112,7 @@ def parse_args():
    # TRAIN TEST INFER
    add_arg('input_size',       int,    608,    "Image input size of YOLOv3.")
    add_arg('random_shape',     bool,   False,     "Resize to random shape for train reader")
-    add_arg('no_mixup_iter',    int,    4000,      "Disable mixup in last N iter.")
+    add_arg('no_mixup_iter',    int,    40000,      "Disable mixup in last N iter.")
    add_arg('valid_thresh',     float,  0.01,    "Valid confidence score for NMS.")
    add_arg('nms_thresh',       float,  0.45,    "NMS threshold.")
    add_arg('nms_topk',         int,    400,    "The number of boxes to perform NMS.")