Updated the input pipeline and data augmentation to process the new keypoint
depth and weight fields.

PiperOrigin-RevId: 353975078

Updated the input pipeline and data augmentation to process the new keypoint
depth and weight fields. PiperOrigin-RevId: 353975078
c9d2886a · Yu-hui Chen · TF Object Detection Team · d04c9e9b · c9d2886a · c9d2886a
5 changed files
--- a/research/object_detection/core/model.py
+++ b/research/object_detection/core/model.py
@@ -315,7 +315,9 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
      is_annotated_list=None,
      groundtruth_labeled_classes=None,
      groundtruth_verified_neg_classes=None,
-      groundtruth_not_exhaustive_classes=None):
+      groundtruth_not_exhaustive_classes=None,
+      groundtruth_keypoint_depths_list=None,
+      groundtruth_keypoint_depth_weights_list=None):
    """Provide groundtruth tensors.

    Args:
@@ -379,6 +381,11 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
      groundtruth_not_exhaustive_classes: A list of 1-D tf.float32 tensors of
        shape [num_classes], containing a K-hot representation of classes
        which don't have all of their instances marked exhaustively.
+      groundtruth_keypoint_depths_list: a list of 2-D tf.float32 tensors
+        of shape [num_boxes, num_keypoints] containing keypoint relative depths.
+      groundtruth_keypoint_depth_weights_list: a list of 2-D tf.float32 tensors
+        of shape [num_boxes, num_keypoints] containing the weights of the
+        relative depths.
    """
    self._groundtruth_lists[fields.BoxListFields.boxes] = groundtruth_boxes_list
    self._groundtruth_lists[
@@ -399,6 +406,14 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
      self._groundtruth_lists[
          fields.BoxListFields.keypoint_visibilities] = (
              groundtruth_keypoint_visibilities_list)
+    if groundtruth_keypoint_depths_list:
+      self._groundtruth_lists[
+          fields.BoxListFields.keypoint_depths] = (
+              groundtruth_keypoint_depths_list)
+    if groundtruth_keypoint_depth_weights_list:
+      self._groundtruth_lists[
+          fields.BoxListFields.keypoint_depth_weights] = (
+              groundtruth_keypoint_depth_weights_list)
    if groundtruth_dp_num_points_list:
      self._groundtruth_lists[
          fields.BoxListFields.densepose_num_points] = (

--- a/research/object_detection/core/preprocessor.py
+++ b/research/object_detection/core/preprocessor.py
@@ -571,6 +571,8 @@ def random_horizontal_flip(image,
                           keypoint_visibilities=None,
                           densepose_part_ids=None,
                           densepose_surface_coords=None,
+                           keypoint_depths=None,
+                           keypoint_depth_weights=None,
                           keypoint_flip_permutation=None,
                           probability=0.5,
                           seed=None,
@@ -602,6 +604,12 @@ def random_horizontal_flip(image,
                              (y, x) are the normalized image coordinates for a
                              sampled point, and (v, u) is the surface
                              coordinate for the part.
+    keypoint_depths: (optional) rank 2 float32 tensor with shape [num_instances,
+                     num_keypoints] representing the relative depth of the
+                     keypoints.
+    keypoint_depth_weights: (optional) rank 2 float32 tensor with shape
+                            [num_instances, num_keypoints] representing the
+                            weights of the relative depth of the keypoints.
    keypoint_flip_permutation: rank 1 int32 tensor containing the keypoint flip
                               permutation.
    probability: the probability of performing this augmentation.
@@ -631,6 +639,10 @@ def random_horizontal_flip(image,
                        [num_instances, num_points].
    densepose_surface_coords: rank 3 float32 tensor with shape
                              [num_instances, num_points, 4].
+    keypoint_depths: rank 2 float32 tensor with shape [num_instances,
+                     num_keypoints].
+    keypoint_depth_weights: rank 2 float32 tensor with shape [num_instances,
+                            num_keypoints].

  Raises:
    ValueError: if keypoints are provided but keypoint_flip_permutation is not.
@@ -708,6 +720,21 @@ def random_horizontal_flip(image,
          lambda: (densepose_part_ids, densepose_surface_coords))
      result.extend(densepose_tensors)

+    # flip keypoint depths and weights.
+    if (keypoint_depths is not None and
+        keypoint_flip_permutation is not None):
+      kpt_flip_perm = keypoint_flip_permutation
+      keypoint_depths = tf.cond(
+          do_a_flip_random,
+          lambda: tf.gather(keypoint_depths, kpt_flip_perm, axis=1),
+          lambda: keypoint_depths)
+      keypoint_depth_weights = tf.cond(
+          do_a_flip_random,
+          lambda: tf.gather(keypoint_depth_weights, kpt_flip_perm, axis=1),
+          lambda: keypoint_depth_weights)
+      result.append(keypoint_depths)
+      result.append(keypoint_depth_weights)
+
    return tuple(result)


@@ -4293,7 +4320,8 @@ def get_default_func_arg_map(include_label_weights=True,
                             include_instance_masks=False,
                             include_keypoints=False,
                             include_keypoint_visibilities=False,
-                             include_dense_pose=False):
+                             include_dense_pose=False,
+                             include_keypoint_depths=False):
  """Returns the default mapping from a preprocessor function to its args.

  Args:
@@ -4311,6 +4339,8 @@ def get_default_func_arg_map(include_label_weights=True,
      the keypoint visibilities, too.
    include_dense_pose: If True, preprocessing functions will modify the
      DensePose labels, too.
+    include_keypoint_depths: If True, preprocessing functions will modify the
+      keypoint depth labels, too.

  Returns:
    A map from preprocessing functions to the arguments they receive.
@@ -4353,6 +4383,13 @@ def get_default_func_arg_map(include_label_weights=True,
        fields.InputDataFields.groundtruth_dp_part_ids)
    groundtruth_dp_surface_coords = (
        fields.InputDataFields.groundtruth_dp_surface_coords)
+  groundtruth_keypoint_depths = None
+  groundtruth_keypoint_depth_weights = None
+  if include_keypoint_depths:
+    groundtruth_keypoint_depths = (
+        fields.InputDataFields.groundtruth_keypoint_depths)
+    groundtruth_keypoint_depth_weights = (
+        fields.InputDataFields.groundtruth_keypoint_depth_weights)

  prep_func_arg_map = {
      normalize_image: (fields.InputDataFields.image,),
@@ -4364,6 +4401,8 @@ def get_default_func_arg_map(include_label_weights=True,
          groundtruth_keypoint_visibilities,
          groundtruth_dp_part_ids,
          groundtruth_dp_surface_coords,
+          groundtruth_keypoint_depths,
+          groundtruth_keypoint_depth_weights,
      ),
      random_vertical_flip: (
          fields.InputDataFields.image,

--- a/research/object_detection/core/preprocessor_test.py
+++ b/research/object_detection/core/preprocessor_test.py
@@ -105,6 +105,17 @@ class PreprocessorTest(test_case.TestCase, parameterized.TestCase):
        ])
    return keypoints, keypoint_visibilities

+  def createTestKeypointDepths(self):
+    keypoint_depths = tf.constant([
+        [1.0, 0.9, 0.8],
+        [0.7, 0.6, 0.5]
+    ], dtype=tf.float32)
+    keypoint_depth_weights = tf.constant([
+        [0.5, 0.6, 0.7],
+        [0.8, 0.9, 1.0]
+    ], dtype=tf.float32)
+    return keypoint_depths, keypoint_depth_weights
+
  def createTestKeypointsInsideCrop(self):
    keypoints = np.array([
        [[0.4, 0.4], [0.5, 0.5], [0.6, 0.6]],
@@ -713,6 +724,59 @@ class PreprocessorTest(test_case.TestCase, parameterized.TestCase):
                                test_keypoints=True)


+  def testRunRandomHorizontalFlipWithKeypointDepth(self):
+
+    def graph_fn():
+      preprocess_options = [(preprocessor.random_horizontal_flip, {})]
+      image_height = 3
+      image_width = 3
+      images = tf.random_uniform([1, image_height, image_width, 3])
+      boxes = self.createTestBoxes()
+      masks = self.createTestMasks()
+      keypoints, keypoint_visibilities = self.createTestKeypoints()
+      keypoint_depths, keypoint_depth_weights = self.createTestKeypointDepths()
+      keypoint_flip_permutation = self.createKeypointFlipPermutation()
+      tensor_dict = {
+          fields.InputDataFields.image:
+              images,
+          fields.InputDataFields.groundtruth_boxes:
+              boxes,
+          fields.InputDataFields.groundtruth_instance_masks:
+              masks,
+          fields.InputDataFields.groundtruth_keypoints:
+              keypoints,
+          fields.InputDataFields.groundtruth_keypoint_visibilities:
+              keypoint_visibilities,
+          fields.InputDataFields.groundtruth_keypoint_depths:
+              keypoint_depths,
+          fields.InputDataFields.groundtruth_keypoint_depth_weights:
+              keypoint_depth_weights,
+      }
+      preprocess_options = [(preprocessor.random_horizontal_flip, {
+          'keypoint_flip_permutation': keypoint_flip_permutation,
+          'probability': 1.0
+      })]
+      preprocessor_arg_map = preprocessor.get_default_func_arg_map(
+          include_instance_masks=True,
+          include_keypoints=True,
+          include_keypoint_visibilities=True,
+          include_dense_pose=False,
+          include_keypoint_depths=True)
+      tensor_dict = preprocessor.preprocess(
+          tensor_dict, preprocess_options, func_arg_map=preprocessor_arg_map)
+      keypoint_depths = tensor_dict[
+          fields.InputDataFields.groundtruth_keypoint_depths]
+      keypoint_depth_weights = tensor_dict[
+          fields.InputDataFields.groundtruth_keypoint_depth_weights]
+      output_tensors = [keypoint_depths, keypoint_depth_weights]
+      return output_tensors
+
+    output_tensors = self.execute_cpu(graph_fn, [])
+    expected_keypoint_depths = [[1.0, 0.8, 0.9], [0.7, 0.5, 0.6]]
+    expected_keypoint_depth_weights = [[0.5, 0.7, 0.6], [0.8, 1.0, 0.9]]
+    self.assertAllClose(expected_keypoint_depths, output_tensors[0])
+    self.assertAllClose(expected_keypoint_depth_weights, output_tensors[1])
+
  def testRandomVerticalFlip(self):

    def graph_fn():

--- a/research/object_detection/inputs.py
+++ b/research/object_detection/inputs.py
@@ -307,6 +307,14 @@ def transform_input_data(tensor_dict,
      out_tensor_dict[flds_gt_kpt_vis] = tf.ones_like(
          out_tensor_dict[flds_gt_kpt][:, :, 0],
          dtype=tf.bool)
+    flds_gt_kpt_depth = fields.InputDataFields.groundtruth_keypoint_depths
+    flds_gt_kpt_depth_weight = (
+        fields.InputDataFields.groundtruth_keypoint_depth_weights)
+    if flds_gt_kpt_depth in out_tensor_dict:
+      out_tensor_dict[flds_gt_kpt_depth] = out_tensor_dict[flds_gt_kpt_depth]
+      out_tensor_dict[flds_gt_kpt_depth_weight] = out_tensor_dict[
+          flds_gt_kpt_depth_weight]
+
    out_tensor_dict[flds_gt_kpt_weights] = (
        keypoint_ops.keypoint_weights_from_visibilities(
            out_tensor_dict[flds_gt_kpt_vis],
@@ -506,6 +514,15 @@ def pad_input_data_to_static_shapes(tensor_dict,
    padding_shapes[input_fields.
                   groundtruth_keypoint_visibilities] = padding_shape

+  if fields.InputDataFields.groundtruth_keypoint_depths in tensor_dict:
+    tensor_shape = tensor_dict[fields.InputDataFields.
+                               groundtruth_keypoint_depths].shape
+    padding_shape = [max_num_boxes, shape_utils.get_dim_as_int(tensor_shape[1])]
+    padding_shapes[fields.InputDataFields.
+                   groundtruth_keypoint_depths] = padding_shape
+    padding_shapes[fields.InputDataFields.
+                   groundtruth_keypoint_depth_weights] = padding_shape
+
  if input_fields.groundtruth_keypoint_weights in tensor_dict:
    tensor_shape = (
        tensor_dict[input_fields.groundtruth_keypoint_weights].shape)
@@ -587,6 +604,8 @@ def augment_input_data(tensor_dict, data_augmentation_options):
                       in tensor_dict)
  include_keypoint_visibilities = (
      fields.InputDataFields.groundtruth_keypoint_visibilities in tensor_dict)
+  include_keypoint_depths = (
+      fields.InputDataFields.groundtruth_keypoint_depths in tensor_dict)
  include_label_weights = (fields.InputDataFields.groundtruth_weights
                           in tensor_dict)
  include_label_confidences = (fields.InputDataFields.groundtruth_confidences
@@ -606,7 +625,8 @@ def augment_input_data(tensor_dict, data_augmentation_options):
          include_instance_masks=include_instance_masks,
          include_keypoints=include_keypoints,
          include_keypoint_visibilities=include_keypoint_visibilities,
-          include_dense_pose=include_dense_pose))
+          include_dense_pose=include_dense_pose,
+          include_keypoint_depths=include_keypoint_depths))
  tensor_dict[fields.InputDataFields.image] = tf.squeeze(
      tensor_dict[fields.InputDataFields.image], axis=0)
  return tensor_dict
@@ -628,6 +648,8 @@ def _get_labels_dict(input_dict):
      fields.InputDataFields.groundtruth_confidences,
      fields.InputDataFields.groundtruth_labeled_classes,
      fields.InputDataFields.groundtruth_keypoints,
+      fields.InputDataFields.groundtruth_keypoint_depths,
+      fields.InputDataFields.groundtruth_keypoint_depth_weights,
      fields.InputDataFields.groundtruth_instance_masks,
      fields.InputDataFields.groundtruth_area,
      fields.InputDataFields.groundtruth_is_crowd,

--- a/research/object_detection/inputs_test.py
+++ b/research/object_detection/inputs_test.py
@@ -1420,6 +1420,49 @@ class DataTransformationFnTest(test_case.TestCase, parameterized.TestCase):
        [[[0., 0., 0., 0.,], [0., 0., 0., 0.,]],
         [[0.1, 0.1, 0.3, 0.4,], [0.6, 0.4, 0.6, 0.7,]]])

+  def test_groundtruth_keypoint_depths(self):
+    def graph_fn():
+      tensor_dict = {
+          fields.InputDataFields.image:
+              tf.constant(np.random.rand(100, 50, 3).astype(np.float32)),
+          fields.InputDataFields.groundtruth_boxes:
+              tf.constant(np.array([[.5, .5, 1, 1], [.0, .0, .5, .5]],
+                                   np.float32)),
+          fields.InputDataFields.groundtruth_classes:
+              tf.constant(np.array([1, 2], np.int32)),
+          fields.InputDataFields.groundtruth_keypoints:
+              tf.constant([[[0.1, 0.2], [0.3, 0.4]],
+                           [[0.5, 0.6], [0.7, 0.8]]]),
+          fields.InputDataFields.groundtruth_keypoint_visibilities:
+              tf.constant([[True, False], [True, True]]),
+          fields.InputDataFields.groundtruth_keypoint_depths:
+              tf.constant([[1.0, 0.9], [0.8, 0.7]]),
+          fields.InputDataFields.groundtruth_keypoint_depth_weights:
+              tf.constant([[0.7, 0.8], [0.9, 1.0]]),
+      }
+
+      num_classes = 3
+      keypoint_type_weight = [1.0, 2.0]
+      input_transformation_fn = functools.partial(
+          inputs.transform_input_data,
+          model_preprocess_fn=_fake_resize50_preprocess_fn,
+          image_resizer_fn=_fake_image_resizer_fn,
+          num_classes=num_classes,
+          keypoint_type_weight=keypoint_type_weight)
+      transformed_inputs = input_transformation_fn(tensor_dict=tensor_dict)
+      return (transformed_inputs[
+          fields.InputDataFields.groundtruth_keypoint_depths],
+              transformed_inputs[
+                  fields.InputDataFields.groundtruth_keypoint_depth_weights])
+
+    keypoint_depths, keypoint_depth_weights = self.execute_cpu(graph_fn, [])
+    self.assertAllClose(
+        keypoint_depths,
+        [[1.0, 0.9], [0.8, 0.7]])
+    self.assertAllClose(
+        keypoint_depth_weights,
+        [[0.7, 0.8], [0.9, 1.0]])
+

 class PadInputDataToStaticShapesFnTest(test_case.TestCase):