From 057efd1709db21744d672e3b1db74da561bd77ae Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Mon, 12 Feb 2018 16:21:44 +0800 Subject: [PATCH] Implement multibox loss wrapper for SSD in Python API. (#8385) * Implement multibox loss wrapper in Python API. * Add some wrappers for SSD detection. * Fix conflicts. * Add unit testing for SSD loss wrapper. * Update doc in Python API. * Refine unit testing. * Add more unit testing and update some interface arguments. --- .../fluid/operators/mine_hard_examples_op.cc | 2 + python/paddle/v2/fluid/layers/__init__.py | 3 - python/paddle/v2/fluid/layers/detection.py | 374 +++++++++++++++++- .../paddle/v2/fluid/tests/test_detection.py | 81 +++- 4 files changed, 424 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/operators/mine_hard_examples_op.cc b/paddle/fluid/operators/mine_hard_examples_op.cc index 73a6c0b6793..540cf867418 100644 --- a/paddle/fluid/operators/mine_hard_examples_op.cc +++ b/paddle/fluid/operators/mine_hard_examples_op.cc @@ -237,6 +237,8 @@ class MineHardExamplesOp : public framework::OperatorWithKernel { } ctx->SetOutputDim("UpdatedMatchIndices", idx_dims); + // The first dimension of NegIndices will be set correctly in Compute.
+ ctx->SetOutputDim("NegIndices", {-1, 1}); } protected: diff --git a/python/paddle/v2/fluid/layers/__init__.py b/python/paddle/v2/fluid/layers/__init__.py index cfbbf710b6a..f4fb2ca2798 100644 --- a/python/paddle/v2/fluid/layers/__init__.py +++ b/python/paddle/v2/fluid/layers/__init__.py @@ -16,8 +16,6 @@ import ops from ops import * import nn from nn import * -import detection -from detection import * import io from io import * import tensor @@ -33,7 +31,6 @@ from detection import * __all__ = [] __all__ += math_op_patch.__all__ -__all__ += detection.__all__ __all__ += nn.__all__ __all__ += io.__all__ __all__ += tensor.__all__ diff --git a/python/paddle/v2/fluid/layers/detection.py b/python/paddle/v2/fluid/layers/detection.py index 0f3256d7652..659ebd5f765 100644 --- a/python/paddle/v2/fluid/layers/detection.py +++ b/python/paddle/v2/fluid/layers/detection.py @@ -1,10 +1,10 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -15,17 +15,31 @@ All layers just related to the detection neural network. 
""" +from layer_function_generator import generate_layer_fn from ..layer_helper import LayerHelper -from ..framework import Variable -from tensor import concat -from ops import reshape +import nn +import ops +import tensor import math __all__ = [ - 'detection_output', 'prior_box', + 'bipartite_match', + 'target_assign', + 'detection_output', + 'ssd_loss', ] +__auto__ = [ + 'iou_similarity', + 'box_coder', +] + +__all__ += __auto__ + +for _OP in set(__auto__): + globals()[_OP] = generate_layer_fn(_OP) + def detection_output(scores, loc, @@ -95,18 +109,13 @@ def detection_output(scores, """ helper = LayerHelper("detection_output", **locals()) - decoded_box = helper.create_tmp_variable(dtype=loc.dtype) - helper.append_op( - type="box_coder", - inputs={ - 'PriorBox': prior_box, - 'PriorBoxVar': prior_box_var, - 'TargetBox': loc - }, - outputs={'OutputBox': decoded_box}, - attrs={'code_type': 'decode_center_size'}) - nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) + decoded_box = box_coder( + prior_box=prior_box, + prior_box_var=prior_box_var, + target_box=loc, + code_type='decode_center_size') + nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) helper.append_op( type="multiclass_nms", inputs={'Scores': scores, @@ -246,7 +255,7 @@ def prior_box(inputs, new_shape = [ -1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)]) ] - out = reshape(x=input, shape=new_shape) + out = ops.reshape(x=input, shape=new_shape) return out assert isinstance(inputs, list), 'inputs should be a list.' 
@@ -322,7 +331,332 @@ def prior_box(inputs, reshaped_boxes.append(_reshape_with_axis_(box_results[i], axis=3)) reshaped_vars.append(_reshape_with_axis_(var_results[i], axis=3)) - box = concat(reshaped_boxes) - var = concat(reshaped_vars) + box = tensor.concat(reshaped_boxes) + var = tensor.concat(reshaped_vars) return box, var + + +def bipartite_match(dist_matrix, name=None): + """ + **Bipartite matching operator** + + This operator is a greedy bipartite matching algorithm, which is used to + obtain the matching with the maximum distance based on the input + distance matrix. For input 2D matrix, the bipartite matching algorithm can + find the matched column for each row, also can find the matched row for + each column. And this operator only calculates matched indices from column + to row. For each instance, the number of matched indices is the number of + columns of the input distance matrix. + + There are two outputs to save matched indices and distance. + A simple description, this algorithm matches the best (maximum distance) + row entity to the column entity and the matched indices are not duplicated + in each row of ColToRowMatchIndices. If the column entity is not matched + any row entity, set -1 in ColToRowMatchIndices. + + Please note that the input DistMat can be LoDTensor (with LoD) or Tensor. + If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size. + If Tensor, the height of ColToRowMatchIndices is 1. + + Args: + dist_matrix(Variable): This input is a 2-D LoDTensor with shape + [K, M]. It is pair-wise distance matrix between the entities + represented by each row and each column. For example, assumed one + entity is A with shape [K], another entity is B with shape [M]. The + dist_matrix[i][j] is the distance between A[i] and B[j]. The bigger + the distance is, the better matching the pairs are. Please note, + This tensor can contain LoD information to represent a batch of + inputs.
One instance of this batch can contain different numbers of + entities. + Returns: + match_indices(Variable): A 2-D Tensor with shape [N, M] in int type. + N is the batch size. If match_indices[i][j] is -1, it + means B[j] does not match any entity in i-th instance. + Otherwise, it means B[j] is matched to row + match_indices[i][j] in i-th instance. The row number of + i-th instance is saved in match_indices[i][j]. + match_distance(Variable): A 2-D Tensor with shape [N, M] in float type. + N is batch size. If match_indices[i][j] is -1, + match_distance[i][j] is also -1.0. Otherwise, assumed + match_indices[i][j] = d, and the row offsets of each instance + are called LoD. Then match_distance[i][j] = dist_matrix[d+LoD[i]][j]. + """ + helper = LayerHelper('bipartite_match', **locals()) + match_indices = helper.create_tmp_variable(dtype='int32') + match_distance = helper.create_tmp_variable(dtype=dist_matrix.dtype) + helper.append_op( + type='bipartite_match', + inputs={'DistMat': dist_matrix}, + outputs={ + 'ColToRowMatchIndices': match_indices, + 'ColToRowMatchDist': match_distance + }) + return match_indices, match_distance + + +def target_assign(input, + matched_indices, + negative_indices=None, + mismatch_value=None, + name=None): + """ + **Target assigner operator** + + Given the target bounding boxes or labels, this operator assigns + classification and regression targets to each prediction as well as + weights to prediction. The weights are used to specify which prediction would + not contribute to training loss. + + For each instance, the output `out` and `out_weight` are assigned based on + `match_indices` and `negative_indices`. + Assumed that the row offset for each instance in `input` is called lod, + this operator assigns classification/regression targets by performing the + following steps: + + 1.
Assigning all outputs based on `match_indices`: + + If id = match_indices[i][j] > 0, + + out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K] + out_weight[i][j] = 1. + + Otherwise, + + out[i][j][0 : K] = {mismatch_value, mismatch_value, ...} + out_weight[i][j] = 0. + + 2. Assigning out_weight based on `neg_indices` if `neg_indices` is provided: + + Assumed that the row offset for each instance in `neg_indices` is called neg_lod, + for i-th instance and each `id` of neg_indices in this instance: + + out[i][id][0 : K] = {mismatch_value, mismatch_value, ...} + out_weight[i][id] = 1.0 + + Args: + input (Variable): This input is a 3D LoDTensor with shape [M, P, K]. + matched_indices (Variable): The input matched indices + is a 2D Tensor with shape [N, P]. If matched_indices[i][j] is -1, + the j-th entity of column is not matched to any entity of row in + i-th instance. + negative_indices (Variable): The input negative example indices are + an optional input with shape [Neg, 1] and int32 type, where Neg is + the total number of negative example indices. + mismatch_value (float32): Fill this value to the mismatched location. + + Returns: + out (Variable): The output is a 3D Tensor with shape [N, P, K], + N and P are the same as they are in `matched_indices`, K is the + same as it is in `input`. + out_weight (Variable): The weight for output with the shape of [N, P, 1].
+ """ + helper = LayerHelper('target_assign', **locals()) + out = helper.create_tmp_variable(dtype=input.dtype) + out_weight = helper.create_tmp_variable(dtype='float32') + helper.append_op( + type='target_assign', + inputs={ + 'X': input, + 'MatchIndices': matched_indices, + 'NegIndices': negative_indices + }, + outputs={'Out': out, + 'OutWeight': out_weight}, + attrs={'mismatch_value': mismatch_value}) + return out, out_weight + + +def ssd_loss(location, + confidence, + gt_box, + gt_label, + prior_box, + prior_box_var=None, + background_label=0, + overlap_threshold=0.5, + neg_pos_ratio=3.0, + neg_overlap=0.5, + loc_loss_weight=1.0, + conf_loss_weight=1.0, + match_type='per_prediction', + mining_type='max_negative', + sample_size=None): + """ + **Multi-box loss layer for object detection algorithm of SSD** + + This layer is to compute detection loss for SSD given the location offset + predictions, confidence predictions, prior boxes and ground-truth bounding + boxes and labels, and the type of hard example mining. The returned loss + is a weighted sum of the localization loss (or regression loss) and + confidence loss (or classification loss) by performing the following steps: + + 1. Find matched bounding box by bipartite matching algorithm. + 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. + 1.2 Compute matched bounding box by bipartite matching algorithm. + 2. Compute confidence for mining hard examples + 2.1. Get the target label based on matched indices. + 2.2. Compute confidence loss. + 3. Apply hard example mining to get the negative example indices and update + the matched indices. + 4. Assign classification and regression targets + 4.1. Encode bbox according to the prior boxes. + 4.2. Assign regression targets. + 4.3. Assign classification targets. + 5. Compute the overall objective loss. + 5.1 Compute confidence loss. + 5.2 Compute localization loss. + 5.3 Compute the overall weighted loss.
+ + Args: + location (Variable): The location predictions are a 3D Tensor with + shape [N, Np, 4], N is the batch size, Np is total number of + predictions for each instance. 4 is the number of coordinate values, + the layout is [xmin, ymin, xmax, ymax]. + confidence (Variable): The confidence predictions are a 3D Tensor + with shape [N, Np, C], N and Np are the same as they are in + `location`, C is the class number. + gt_box (Variable): The ground-truth bounding boxes (bboxes) are a 2D + LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth + bboxes of mini-batch input. + gt_label (Variable): The ground-truth labels are a 2D LoDTensor + with shape [Ng, 1]. + prior_box (Variable): The prior boxes are a 2D Tensor with shape [Np, 4]. + prior_box_var (Variable): The variance of prior boxes are a 2D Tensor + with shape [Np, 4]. + background_label (int): The index of background label, 0 by default. + overlap_threshold (float): If match_type is 'per_prediction', use + `overlap_threshold` to determine the extra matching bboxes when + finding matched boxes. 0.5 by default. + neg_pos_ratio (float): The ratio of the negative boxes to the positive + boxes, used only when mining_type is max_negative, 3.0 by default. + neg_overlap (float): The negative overlap upper bound for the unmatched + predictions. Used only when mining_type is max_negative, + 0.5 by default. + sample_size (int): The max sample size of negative box, used only when + mining_type is hard_example. + loc_loss_weight (float): Weight for localization loss, 1.0 by default. + conf_loss_weight (float): Weight for confidence loss, 1.0 by default. + match_type (str): The type of matching method during training, should + be 'bipartite' or 'per_prediction'. + mining_type (str): The hard example mining type, should be 'hard_example' + or 'max_negative', now only support `max_negative`.
+ + Returns: + Variable: The weighted sum of the localization loss and confidence loss, + with shape [N * Np, 1], N and Np are the same as they are + in `location`. + + Raises: + ValueError: If mining_type is 'hard_example', now only support + mining type of `max_negative`. + + Examples: + .. code-block:: python + + pb = layers.data( + name='prior_box', + shape=[10, 4], + append_batch_size=False, + dtype='float32') + pbv = layers.data( + name='prior_box_var', + shape=[10, 4], + append_batch_size=False, + dtype='float32') + loc = layers.data(name='target_box', shape=[10, 4], dtype='float32') + scores = layers.data(name='scores', shape=[10, 21], dtype='float32') + gt_box = layers.data( + name='gt_box', shape=[4], lod_level=1, dtype='float32') + gt_label = layers.data( + name='gt_label', shape=[1], lod_level=1, dtype='int32') + loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv) + """ + + helper = LayerHelper('ssd_loss', **locals()) + if mining_type != 'max_negative': + raise ValueError("Only support mining_type == max_negative now.") + + num, num_prior, num_class = confidence.shape + + def __reshape_to_2d(var): + return ops.reshape(x=var, shape=[-1, var.shape[-1]]) + + # 1. Find matched bounding box by prior box. + # 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. + iou = iou_similarity(x=gt_box, y=prior_box) + # 1.2 Compute matched bounding box by bipartite matching algorithm. + matched_indices, matched_dist = bipartite_match(iou) + + # 2. Compute confidence for mining hard examples + # 2.1. Get the target label based on matched indices + gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, )) + target_label, _ = target_assign( + gt_label, matched_indices, mismatch_value=background_label) + # 2.2. Compute confidence loss. + # Reshape confidence to 2D tensor.
+ confidence = __reshape_to_2d(confidence) + target_label = tensor.cast(x=target_label, dtype='int64') + target_label = __reshape_to_2d(target_label) + conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) + + # 3. Mining hard examples + conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior)) + neg_indices = helper.create_tmp_variable(dtype='int32') + dtype = matched_indices.dtype + updated_matched_indices = helper.create_tmp_variable(dtype=dtype) + helper.append_op( + type='mine_hard_examples', + inputs={ + 'ClsLoss': conf_loss, + 'LocLoss': None, + 'MatchIndices': matched_indices, + 'MatchDist': matched_dist, + }, + outputs={ + 'NegIndices': neg_indices, + 'UpdatedMatchIndices': updated_matched_indices + }, + attrs={ + 'neg_pos_ratio': neg_pos_ratio, + 'neg_dist_threshold': neg_overlap, + 'mining_type': mining_type, + 'sample_size': sample_size, + }) + + # 4. Assign classification and regression targets + # 4.1. Encoded bbox according to the prior boxes. + encoded_bbox = box_coder( + prior_box=prior_box, + prior_box_var=prior_box_var, + target_box=gt_box, + code_type='encode_center_size') + # 4.2. Assign regression targets + target_bbox, target_loc_weight = target_assign( + encoded_bbox, updated_matched_indices, mismatch_value=background_label) + # 4.3. Assign classification targets + target_label, target_conf_weight = target_assign( + gt_label, + updated_matched_indices, + negative_indices=neg_indices, + mismatch_value=background_label) + + # 5. Compute loss. + # 5.1 Compute confidence loss. + target_label = __reshape_to_2d(target_label) + target_label = tensor.cast(x=target_label, dtype='int64') + conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) + target_conf_weight = __reshape_to_2d(target_conf_weight) + conf_loss = conf_loss * target_conf_weight + + # 5.2 Compute regression loss.
+ location = __reshape_to_2d(location) + target_bbox = __reshape_to_2d(target_bbox) + + loc_loss = nn.smooth_l1(location, target_bbox) + target_loc_weight = __reshape_to_2d(target_loc_weight) + loc_loss = loc_loss * target_loc_weight + + # 5.3 Compute overall weighted loss. + loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss + return loss diff --git a/python/paddle/v2/fluid/tests/test_detection.py b/python/paddle/v2/fluid/tests/test_detection.py index fecc2a6226f..b731fc9b02e 100644 --- a/python/paddle/v2/fluid/tests/test_detection.py +++ b/python/paddle/v2/fluid/tests/test_detection.py @@ -13,16 +13,12 @@ # limitations under the License. from __future__ import print_function -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.layers.detection as detection from paddle.v2.fluid.framework import Program, program_guard import unittest -import numpy as np -class TestBook(unittest.TestCase): +class TestDetection(unittest.TestCase): def test_detection_output(self): program = Program() with program_guard(program): @@ -49,6 +45,66 @@ class TestBook(unittest.TestCase): out = layers.detection_output( scores=scores, loc=loc, prior_box=pb, prior_box_var=pbv) self.assertIsNotNone(out) + self.assertEqual(out.shape[-1], 6) + print(str(program)) + + def test_detection_api(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[4], dtype='float32') + y = layers.data(name='y', shape=[4], dtype='float32') + z = layers.data(name='z', shape=[4], dtype='float32', lod_level=1) + iou = layers.iou_similarity(x=x, y=y) + bcoder = layers.box_coder( + prior_box=x, + prior_box_var=y, + target_box=z, + code_type='encode_center_size') + self.assertIsNotNone(iou) + self.assertIsNotNone(bcoder) + + matched_indices, matched_dist = layers.bipartite_match(iou) + self.assertIsNotNone(matched_indices) + self.assertIsNotNone(matched_dist) + + gt = layers.data( + 
name='gt', shape=[1, 1], dtype='int32', lod_level=1) + trg, trg_weight = layers.target_assign( + gt, matched_indices, mismatch_value=0) + self.assertIsNotNone(trg) + self.assertIsNotNone(trg_weight) + + gt2 = layers.data( + name='gt2', shape=[10, 4], dtype='float32', lod_level=1) + trg, trg_weight = layers.target_assign( + gt2, matched_indices, mismatch_value=0) + self.assertIsNotNone(trg) + self.assertIsNotNone(trg_weight) + + print(str(program)) + + def test_ssd_loss(self): + program = Program() + with program_guard(program): + pb = layers.data( + name='prior_box', + shape=[10, 4], + append_batch_size=False, + dtype='float32') + pbv = layers.data( + name='prior_box_var', + shape=[10, 4], + append_batch_size=False, + dtype='float32') + loc = layers.data(name='target_box', shape=[10, 4], dtype='float32') + scores = layers.data(name='scores', shape=[10, 21], dtype='float32') + gt_box = layers.data( + name='gt_box', shape=[4], lod_level=1, dtype='float32') + gt_label = layers.data( + name='gt_label', shape=[1], lod_level=1, dtype='int32') + loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv) + self.assertIsNotNone(loss) + self.assertEqual(loss.shape[-1], 1) print(str(program)) @@ -62,40 +118,39 @@ class TestPriorBox(unittest.TestCase): assert box.shape[1] == 4 def prior_box_output(self, data_shape): - images = fluid.layers.data( - name='pixel', shape=data_shape, dtype='float32') - conv1 = fluid.layers.conv2d( + images = layers.data(name='pixel', shape=data_shape, dtype='float32') + conv1 = layers.conv2d( input=images, num_filters=3, filter_size=3, stride=2, use_cudnn=False) - conv2 = fluid.layers.conv2d( + conv2 = layers.conv2d( input=conv1, num_filters=3, filter_size=3, stride=2, use_cudnn=False) - conv3 = fluid.layers.conv2d( + conv3 = layers.conv2d( input=conv2, num_filters=3, filter_size=3, stride=2, use_cudnn=False) - conv4 = fluid.layers.conv2d( + conv4 = layers.conv2d( input=conv3, num_filters=3, filter_size=3, stride=2, use_cudnn=False) - conv5 = 
fluid.layers.conv2d( + conv5 = layers.conv2d( input=conv4, num_filters=3, filter_size=3, stride=2, use_cudnn=False) - box, var = detection.prior_box( + box, var = layers.prior_box( inputs=[conv1, conv2, conv3, conv4, conv5, conv5], image=images, min_ratio=20, -- GitLab