...
 
Commits (18)
    https://gitcode.net/weixin_42428077/models/-/commit/4a07d460298b96ff2d3734c12cf11949c8b4b718 | No public description | 2023-07-26T09:49:03-07:00 | Chaochao Yan <allenyan@google.com> | PiperOrigin-RevId: 551234957
    https://gitcode.net/weixin_42428077/models/-/commit/f54ccf5aa6026efc68184f8c2d2ab4933cd56cc1 | No public description | 2023-07-26T09:53:01-07:00 | Chaochao Yan <allenyan@google.com> | PiperOrigin-RevId: 551236095
    https://gitcode.net/weixin_42428077/models/-/commit/deb52a6f2f44cc11e9524fd2995954742b426d2c | No public description | 2023-07-27T21:55:45-07:00 | A. Unique TensorFlower <gardener@tensorflow.org> | PiperOrigin-RevId: 551733884
    https://gitcode.net/weixin_42428077/models/-/commit/311a39a94d8f9554a26984f2df626f9d9c0cbc15 | No public description | 2023-07-28T14:33:10-07:00 | A. Unique TensorFlower <gardener@tensorflow.org> | PiperOrigin-RevId: 551951832
    https://gitcode.net/weixin_42428077/models/-/commit/5be8f01f158bfad4620a2668390f704b4a1f025d | No public description | 2023-07-31T12:12:17-07:00 | A. Unique TensorFlower <gardener@tensorflow.org> | PiperOrigin-RevId: 552554960
    https://gitcode.net/weixin_42428077/models/-/commit/e22c7d3ef24d8a9def31ec99a31ad9b9a52dde48 | No public description | 2023-07-31T13:30:29-07:00 | Chaochao Yan <allenyan@google.com> | PiperOrigin-RevId: 552576382
    https://gitcode.net/weixin_42428077/models/-/commit/19b7f479a99072a2dfa9a330142a76f2707d712c | The coordinates of the detected boxes should be divided by 256.0 | 2023-07-31T13:42:07-07:00 | A. Unique TensorFlower <gardener@tensorflow.org> | PiperOrigin-RevId: 552579574
    https://gitcode.net/weixin_42428077/models/-/commit/f6bcd8f940901af8c192a81abdeb3649411fc000 | Add SWAP pooling to YT8M open-source code base. | 2023-07-31T15:14:38-07:00 | A. Unique TensorFlower <gardener@tensorflow.org> | PiperOrigin-RevId: 552603688
    https://gitcode.net/weixin_42428077/models/-/commit/beb316b421ed25d01a8c82ad6a40b9b0762d2b6f | No public description | 2023-08-01T21:32:58-07:00 | Fan Yang <fyangf@google.com> | PiperOrigin-RevId: 553017713
    https://gitcode.net/weixin_42428077/models/-/commit/3dc108e49ccf925e2dd5bdcbda52c7cea742214e | No public description | 2023-08-02T04:30:46-07:00 | Hongkun Yu <hongkuny@google.com> | PiperOrigin-RevId: 553104176
    https://gitcode.net/weixin_42428077/models/-/commit/129f900e5e8c2effc8bb0be4b17f4d3e278c000c | Removing argument `num_of_examples` of function `show_batch` is never used | 2023-08-02T14:42:19-07:00 | A. Unique TensorFlower <gardener@tensorflow.org> | PiperOrigin-RevId: 553260714
    https://gitcode.net/weixin_42428077/models/-/commit/1a1e207049e6a8d11d6af76c60ac2767165618b8 | No public description | 2023-08-02T16:47:11-07:00 | Chaochao Yan <allenyan@google.com> | PiperOrigin-RevId: 553295125
    https://gitcode.net/weixin_42428077/models/-/commit/febe4d8e973995005cf9de7e2c0b63d076bae4d1 | No public description | 2023-08-03T18:14:30-07:00 | Fan Yang <fyangf@google.com> | PiperOrigin-RevId: 553646554
    https://gitcode.net/weixin_42428077/models/-/commit/a782bd708a86574fb28aa9a2713b20780c9b5882 | No public description | 2023-08-03T21:59:15-07:00 | Chaochao Yan <allenyan@google.com> | PiperOrigin-RevId: 553684763
    https://gitcode.net/weixin_42428077/models/-/commit/7d241eee30c80ff0678a5f2484861df22e0f076f | No public description | 2023-08-03T23:17:45-07:00 | Chaochao Yan <allenyan@google.com> | PiperOrigin-RevId: 553700524
    https://gitcode.net/weixin_42428077/models/-/commit/c015ef733f405855b1112e985b19732f918be328 | No public description | 2023-08-03T23:59:09-07:00 | Chaochao Yan <allenyan@google.com> | PiperOrigin-RevId: 553708377
    https://gitcode.net/weixin_42428077/models/-/commit/2b67cb594ef806503e75cae54584d1cebdd1f651 | No public description | 2023-08-08T12:57:35-07:00 | A. Unique TensorFlower <gardener@tensorflow.org> | PiperOrigin-RevId: 554913917
    https://gitcode.net/weixin_42428077/models/-/commit/15049a1b22542e357a49c1be510955e39da69c0b | No public description | 2023-08-08T12:58:58-07:00 | Fan Yang <fyangf@google.com> | PiperOrigin-RevId: 554914277
......@@ -745,7 +745,7 @@
},
"outputs": [],
"source": [
"def show_batch(raw_records, num_of_examples):\n",
"def show_batch(raw_records):\n",
" plt.figure(figsize=(20, 20))\n",
" use_normalized_coordinates=True\n",
" min_score_thresh = 0.30\n",
......@@ -802,7 +802,7 @@
"\n",
"train_tfrecords = tf.io.gfile.glob(exp_config.task.train_data.input_path)\n",
"raw_records = tf.data.TFRecordDataset(train_tfrecords).shuffle(buffer_size=buffer_size).take(num_of_examples)\n",
"show_batch(raw_records, num_of_examples)"
"show_batch(raw_records)"
]
},
{
......@@ -962,7 +962,7 @@
"\n",
"test_tfrecords = tf.io.gfile.glob('./lvis_tfrecords/val*')\n",
"test_ds = tf.data.TFRecordDataset(test_tfrecords).take(num_of_examples)\n",
"show_batch(test_ds, num_of_examples)"
"show_batch(test_ds)"
]
},
{
......@@ -1095,7 +1095,7 @@
" detection_masks = tf.convert_to_tensor(result['detection_masks'][0])\n",
" detection_boxes = tf.convert_to_tensor(result['detection_boxes'][0])\n",
" detection_masks_reframed = reframe_box_masks_to_image_masks(\n",
" detection_masks, detection_boxes/255.0,\n",
" detection_masks, detection_boxes/256.0,\n",
" image_np.shape[0], image_np.shape[1])\n",
" detection_masks_reframed = tf.cast(\n",
" detection_masks_reframed \u003e min_score_thresh,\n",
......
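
The 255.0 -> 256.0 change above tracks commit 19b7f479 ("The coordinates of the detected boxes should be divided by 256.0"): `reframe_box_masks_to_image_masks` expects box coordinates normalized to [0, 1], and the exported detector emits boxes in the pixel space of its input image, so the correct divisor is the input resolution itself (256, not 255). A minimal sketch of that normalization, with assumed values:

import tensorflow as tf

input_size = 256.0  # assumed detector input resolution
boxes_px = tf.constant([[64.0, 32.0, 192.0, 224.0]])  # [N, 4] pixel coords
boxes_norm = boxes_px / input_size  # [[0.25, 0.125, 0.75, 0.875]], in [0, 1]
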
......@@ -46,6 +46,7 @@ def run_experiment(
model_dir: str,
run_post_eval: bool = False,
trainer: base_trainer.MultiTaskBaseTrainer = None,
eval_summary_manager: Optional[orbit.utils.SummaryManagerInterface] = None,
best_ckpt_exporter_creator: Optional[Any] = train_utils
.maybe_create_best_ckpt_exporter
) -> Union[base_model.MultiTaskBaseModel, Tuple[base_model.MultiTaskBaseModel,
......@@ -64,6 +65,10 @@ def run_experiment(
are returned.
trainer: (optional) A multi-task trainer to use. If none is provided, a
default one will be created based on `params`.
eval_summary_manager: Instance of the eval summary manager. If set,
`eval_summary_dir` is ignored; otherwise an eval summary manager for
TensorBoard summaries is created internally from `eval_summary_dir` by
default.
best_ckpt_exporter_creator: A functor for creating best checkpoint exporter.
Returns:
......@@ -117,6 +122,7 @@ def run_experiment(
checkpoint_manager=checkpoint_manager,
summary_dir=os.path.join(model_dir, 'train'),
eval_summary_dir=os.path.join(model_dir, 'validation'),
eval_summary_manager=eval_summary_manager,
summary_interval=params.trainer.summary_interval)
logging.info('Starts to execute mode: %s', mode)
......@@ -162,6 +168,7 @@ def run_experiment_with_multitask_eval(
run_post_eval: bool = False,
save_summary: bool = True,
trainer: Optional[core_lib.Trainer] = None,
eval_summary_manager: Optional[orbit.utils.SummaryManagerInterface] = None,
best_ckpt_exporter_creator: Optional[Any] = train_utils
.maybe_create_best_ckpt_exporter,
) -> Tuple[Any, Any]:
......@@ -181,6 +188,10 @@ def run_experiment_with_multitask_eval(
trainer: the core_lib.Trainer instance. It should be created within the
strategy.scope(). If not provided, an instance will be created by default
if `mode` contains 'train'.
eval_summary_manager: Instance of the eval summary manager. If set,
`eval_summary_dir` is ignored; otherwise an eval summary manager for
TensorBoard summaries is created internally from `eval_summary_dir` by
default.
best_ckpt_exporter_creator: A functor for creating best checkpoint exporter.
Returns:
......@@ -253,6 +264,7 @@ def run_experiment_with_multitask_eval(
summary_dir=os.path.join(model_dir, 'train') if save_summary else None,
eval_summary_dir=os.path.join(model_dir, 'validation') if
(save_summary) else None,
eval_summary_manager=eval_summary_manager,
summary_interval=params.trainer.summary_interval if
(save_summary) else None)
......
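
The new `eval_summary_manager` argument lets callers replace the manager that Orbit would otherwise build from `eval_summary_dir`. A hedged sketch of constructing one with Orbit's stock implementation (the directory name here is hypothetical):

import os

import orbit
import tensorflow as tf

model_dir = '/tmp/yt8m_experiment'  # hypothetical path
eval_summary_manager = orbit.utils.SummaryManager(
    os.path.join(model_dir, 'validation_custom'), tf.summary.scalar)
# Then pass eval_summary_manager=eval_summary_manager to run_experiment().
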
......@@ -14,8 +14,7 @@ gin-config
tf_slim>=1.1.0
Cython
matplotlib
# Loader becomes a required positional argument in 6.0 in yaml.load
pyyaml>=5.1,<6.0
pyyaml
# CV related dependencies
opencv-python-headless==4.5.2.52
Pillow
......
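
The pin on `pyyaml<6.0` existed because PyYAML 6 made the `Loader` argument to `yaml.load` mandatory; unpinning implies call sites either pass a loader explicitly or use `safe_load`. A minimal sketch of the two forms that remain valid under 6.x:

import yaml

doc = 'learning_rate: 0.1'
cfg = yaml.safe_load(doc)                     # preferred shorthand
cfg = yaml.load(doc, Loader=yaml.SafeLoader)  # explicit Loader, required by 6.x
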
......@@ -75,6 +75,7 @@ class FunnelEncoderConfig(hyperparams.Config):
norm_first: bool = False
share_rezero: bool = False
append_dense_inputs: bool = False
transformer_cls: str = "TransformerEncoderBlock"
@dataclasses.dataclass
......@@ -559,6 +560,7 @@ def build_encoder(config: EncoderConfig,
norm_first=encoder_cfg.norm_first,
share_rezero=encoder_cfg.share_rezero,
append_dense_inputs=encoder_cfg.append_dense_inputs,
transformer_cls=encoder_cfg.transformer_cls,
)
if encoder_type == "kernel":
......
......@@ -113,6 +113,8 @@ class Pix2Seq(hyperparams.Config):
drop_units: float = 0.1
drop_att: float = 0.0
norm_first: bool = True
top_k: int = 0
top_p: float = 0.4
@dataclasses.dataclass
......
......@@ -236,6 +236,8 @@ class Pix2Seq(tf.keras.Model):
drop_path=0.1,
drop_units=0.1,
drop_att=0.0,
top_k=0,
top_p=0.4,
**kwargs
):
super().__init__(**kwargs)
......@@ -271,6 +273,8 @@ class Pix2Seq(tf.keras.Model):
drop_units=self._drop_units,
drop_att=self._drop_att,
)
self._top_k = top_k
self._top_p = top_p
@property
def backbone(self) -> tf.keras.Model:
......@@ -292,6 +296,8 @@ class Pix2Seq(tf.keras.Model):
"drop_path": self._drop_path,
"drop_units": self._drop_units,
"drop_att": self._drop_att,
"top_k": self._top_k,
"top_p": self._top_p,
}
@classmethod
......@@ -350,11 +356,15 @@ class Pix2Seq(tf.keras.Model):
training,
)
else:
tokens, logits = self._transformer.infer({
"inputs": features,
"tokens": targets,
"pos_emb": pos_emb,
})
tokens, logits = self._transformer.infer(
{
"inputs": features,
"tokens": targets,
"pos_emb": pos_emb,
},
top_k=self._top_k,
top_p=self._top_p,
)
return [tokens, logits]
......
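
The new `top_k`/`top_p` fields flow from the Pix2Seq config into `self._transformer.infer`, which samples the next token from a truncated distribution; the sampling itself lives inside the transformer and is not shown in this diff. A minimal sketch of the standard top-k plus nucleus (top-p) logit filtering, under the usual conventions rather than the repo's exact implementation:

import tensorflow as tf

def filter_logits(logits, top_k=0, top_p=1.0):
  """Masks logits outside the top-k / top-p set. logits: [batch, vocab]."""
  if top_k > 0:
    kth_largest = tf.math.top_k(logits, k=top_k).values[:, -1:]
    logits = tf.where(logits < kth_largest,
                      tf.fill(tf.shape(logits), logits.dtype.min), logits)
  if 0.0 < top_p < 1.0:
    sorted_logits = tf.sort(logits, axis=-1, direction='DESCENDING')
    cum_probs = tf.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1)
    keep = cum_probs <= top_p
    # Always keep at least the single most likely token.
    keep = tf.concat([tf.ones_like(keep[:, :1]), keep[:, :-1]], axis=-1)
    # Smallest logit still inside the nucleus, per row.
    min_keep = tf.reduce_min(
        tf.where(keep, sorted_logits, sorted_logits[:, :1]),
        axis=-1, keepdims=True)
    logits = tf.where(logits < min_keep,
                      tf.fill(tf.shape(logits), logits.dtype.min), logits)
  return logits

# next_token = tf.random.categorical(filter_logits(logits, top_p=0.4), 1)
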
......@@ -51,8 +51,9 @@ class DataConfig(cfg.DataConfig):
temporal_stride: Not used. To be deprecated.
max_frames: Maximum number of frames in an input example. It is used to crop
the input in the temporal dimension.
sample_random_frames: If sample random frames.
num_sample_frames: Number of frames to sample for each input example.
sample_random_frames: Whether to sample random frames or a random sequence.
num_sample_frames: Number of frames to sample for each input example. No
frame sampling if None.
num_classes: Number of classes to classify. Assuming it is a classification
task.
num_devices: Not used. To be deprecated.
......
......@@ -358,14 +358,23 @@ class Parser(parser.Parser):
if not self._include_video_id and "id" in decoded_tensors:
del decoded_tensors["id"]
# Valid `num_frames` comes from _concat_features().
outputs = self._process_label(video_matrix, num_frames, decoded_tensors)
if self._num_sample_frames is not None:
if self._num_sample_frames is None:
# Padding to max_frames.
outputs["video_matrix"] = resize_axis(
outputs["video_matrix"], 1, self._max_frames
)
else:
outputs["video_matrix"] = utils.sample_video_frames(
outputs["video_matrix"],
tf.reshape(outputs["num_frames"], [-1, 1]),
random_frames=self._sample_random_frames,
num_sample_frames=self._num_sample_frames,
)
outputs["num_frames"] = (
tf.ones_like(outputs["num_frames"]) * self._num_sample_frames
)
return outputs
def _parse_eval_data(self, decoded_tensors):
......@@ -379,13 +388,21 @@ class Parser(parser.Parser):
del decoded_tensors["id"]
outputs = self._process_label(video_matrix, num_frames, decoded_tensors)
if self._num_sample_frames is not None:
if self._num_sample_frames is None:
# Padding to max_frames.
outputs["video_matrix"] = resize_axis(
outputs["video_matrix"], 1, self._max_frames
)
else:
outputs["video_matrix"] = utils.sample_video_frames(
outputs["video_matrix"],
tf.reshape(outputs["num_frames"], [-1, 1]),
random_frames=self._sample_random_frames,
num_sample_frames=self._num_sample_frames,
)
outputs["num_frames"] = (
tf.ones_like(outputs["num_frames"]) * self._num_sample_frames
)
return outputs
def _process_label(self, video_matrix, num_frames, contexts):
......@@ -488,7 +505,9 @@ class PostBatchProcessor():
def __init__(self, input_params: exp_cfg.DataConfig):
self.segment_labels = input_params.segment_labels
self.num_classes = input_params.num_classes
self.num_sample_frames = input_params.num_sample_frames
self.num_batched_frames = (
input_params.num_sample_frames or input_params.max_frames
)
self.num_features = sum(input_params.feature_sizes)
def post_fn(self, batched_tensors: Dict[str,
......@@ -500,12 +519,13 @@ class PostBatchProcessor():
num_frames = batched_tensors["num_frames"]
if self.segment_labels:
# [batch x num_segment x num_sample_frames x num_features]
# -> [batch * num_segment x num_sample_frames x num_features]
# [batch x num_segment x num_batched_frames x num_features]
# -> [batch * num_segment x num_batched_frames x num_features]
if video_ids is not None:
video_ids = tf.reshape(video_ids, [-1])
video_matrix = tf.reshape(video_matrix,
[-1, self.num_sample_frames, self.num_features])
video_matrix = tf.reshape(
video_matrix, [-1, self.num_batched_frames, self.num_features]
)
labels = tf.reshape(labels, [-1, self.num_classes])
num_frames = tf.reshape(num_frames, [-1, 1])
batched_tensors["label_weights"] = tf.reshape(
......
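
Both parsers above now branch the same way: with `num_sample_frames=None` the video matrix is padded (or cropped) to a static `max_frames` and the true `num_frames` is kept for masking downstream; otherwise a fixed number of frames is sampled and `num_frames` is overwritten with that constant. A hedged sketch of the two branches (a hypothetical helper, not the repo's `utils.sample_video_frames`):

import tensorflow as tf

def pad_or_sample(video_matrix, num_frames, max_frames, num_sample_frames=None):
  """video_matrix: [1, frames, features]; num_frames: scalar valid count."""
  if num_sample_frames is None:
    # Pad (or crop) the temporal axis to a static max_frames.
    pad = tf.maximum(max_frames - tf.shape(video_matrix)[1], 0)
    video_matrix = tf.pad(video_matrix, [[0, 0], [0, pad], [0, 0]])
    return video_matrix[:, :max_frames, :]
  # Sample num_sample_frames indices from the valid range (with replacement).
  idx = tf.random.uniform(
      [num_sample_frames], 0, tf.cast(num_frames, tf.int32), dtype=tf.int32)
  return tf.gather(video_matrix, idx, axis=1)
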
......@@ -60,13 +60,15 @@ class Yt8mInputTest(parameterized.TestCase, tf.test.TestCase):
postprocess_fn=postprocess_fn,
transform_and_batch_fn=batch_fn)
@parameterized.parameters((True,), (False,))
def test_read_video_level_input(self, include_video_id):
@parameterized.parameters((True, 20), (False, 20), (False, None))
def test_read_video_level_input(self, include_video_id, num_sample_frames):
params = yt8m_configs.yt8m(is_training=False)
params.global_batch_size = 4
params.segment_labels = False
params.input_path = self.data_path
params.include_video_id = include_video_id
params.max_frames = 122
params.num_sample_frames = num_sample_frames
reader = self.create_input_reader(params)
dataset = reader.read()
......@@ -82,27 +84,40 @@ class Yt8mInputTest(parameterized.TestCase, tf.test.TestCase):
self.assertCountEqual(['video_matrix', 'labels', 'num_frames'],
example.keys())
batch_size = params.global_batch_size
expected_num_frames = num_sample_frames or params.max_frames
self.assertEqual(
example['video_matrix'].shape.as_list(),
[batch_size, params.num_sample_frames, sum(params.feature_sizes)],
[batch_size, expected_num_frames, sum(params.feature_sizes)],
)
self.assertEqual(
example['labels'].shape.as_list(), [batch_size, params.num_classes]
)
self.assertEqual(example['labels'].shape.as_list(),
[batch_size, params.num_classes])
# Check non-empty labels.
self.assertGreater(np.nonzero(example['labels'][0].numpy())[0].shape[0], 0)
self.assertEqual(example['num_frames'].shape.as_list(), [batch_size, 1])
if num_sample_frames:
self.assertAllEqual(
example['num_frames'].numpy(),
[[num_sample_frames]] * batch_size,
)
else:
self.assertAllEqual(
example['num_frames'].numpy(),
[[120], [121], [122], [122]],
)
if include_video_id:
self.assertEqual(example['video_ids'].shape.as_list(), [batch_size, 1])
@parameterized.parameters((True,), (False,))
def test_read_segment_level_input(self, include_video_id=False):
@parameterized.parameters((True, 20), (False, 20), (False, None))
def test_read_segment_level_input(self, include_video_id, num_sample_frames):
params = yt8m_configs.yt8m(is_training=False)
params.global_batch_size = 2
params.segment_labels = True
params.segment_size = 24
params.input_path = self.data_path
params.include_video_id = include_video_id
params.num_sample_frames = num_sample_frames
reader = self.create_input_reader(params)
dataset = reader.read()
......@@ -120,21 +135,35 @@ class Yt8mInputTest(parameterized.TestCase, tf.test.TestCase):
['video_matrix', 'labels', 'num_frames', 'label_weights'],
example.keys())
batch_size = params.global_batch_size * self.num_segment
expected_num_frames = num_sample_frames or params.max_frames
self.assertEqual(
example['video_matrix'].shape.as_list(),
[batch_size, params.num_sample_frames, sum(params.feature_sizes)],
[batch_size, expected_num_frames, sum(params.feature_sizes)],
)
self.assertEqual(example['labels'].shape.as_list(),
[batch_size, params.num_classes])
self.assertGreater(np.nonzero(example['labels'][0].numpy())[0].shape[0], 0)
self.assertEqual(example['num_frames'].shape.as_list(), [batch_size, 1])
self.assertEqual(example['label_weights'].shape.as_list(),
[batch_size, params.num_classes])
if num_sample_frames:
self.assertAllEqual(
example['num_frames'].numpy(),
[[num_sample_frames]] * batch_size,
)
else:
self.assertAllEqual(
example['num_frames'].numpy(),
[[params.segment_size]] * batch_size,
)
if include_video_id:
self.assertEqual(example['video_ids'].shape.as_list(), [batch_size])
@parameterized.parameters((True,), (False,))
def test_read_video_level_float_input(self, include_video_id):
@parameterized.parameters((True, 4), (False, 4), (False, None))
def test_read_video_level_float_input(
self, include_video_id, num_sample_frames
):
data_dir = os.path.join(self.get_temp_dir(), 'data2')
tf.io.gfile.makedirs(data_dir)
data_path = os.path.join(data_dir, 'data2.tfrecord')
......@@ -150,6 +179,7 @@ class Yt8mInputTest(parameterized.TestCase, tf.test.TestCase):
params.input_path = data_path
params.num_frames = 2
params.max_frames = 2
params.num_sample_frames = num_sample_frames
params.feature_names = ('VIDEO_EMBEDDING/context_feature/floats',
'FEATURE/feature/floats')
params.feature_sources = ('context', 'feature')
......@@ -191,9 +221,10 @@ class Yt8mInputTest(parameterized.TestCase, tf.test.TestCase):
# Check tensor shape.
batch_size = params.global_batch_size
expected_num_frames = params.num_sample_frames or params.max_frames
self.assertEqual(
example['video_matrix'].shape.as_list(),
[batch_size, params.num_sample_frames, sum(params.feature_sizes)],
[batch_size, expected_num_frames, sum(params.feature_sizes)],
)
self.assertEqual(example['labels'].shape.as_list(),
[batch_size, params.num_classes])
......@@ -201,5 +232,6 @@ class Yt8mInputTest(parameterized.TestCase, tf.test.TestCase):
if include_video_id:
self.assertEqual(example['video_ids'].shape.as_list(), [batch_size, 1])
if __name__ == '__main__':
tf.test.main()
......@@ -15,14 +15,15 @@
"""Dbof model definitions."""
import functools
from typing import Optional
from typing import Any, Optional
import tensorflow as tf
from official.modeling import hyperparams
from official.modeling import tf_utils
from official.projects.yt8m.configs import yt8m as yt8m_cfg
from official.projects.yt8m.modeling import yt8m_model_utils as utils
from official.projects.yt8m.modeling import nn_layers
from official.projects.yt8m.modeling import yt8m_model_utils
from official.vision.configs import common
from official.vision.modeling.backbones import factory
......@@ -30,7 +31,7 @@ from official.vision.modeling.backbones import factory
layers = tf.keras.layers
class Dbof(tf.keras.Model):
class Dbof(layers.Layer):
"""A YT8M model class builder.
Creates a Deep Bag of Frames model.
......@@ -61,10 +62,11 @@ class Dbof(tf.keras.Model):
l2_regularizer: An optional kernel weight regularizer.
**kwargs: keyword arguments to be passed.
"""
self._self_setattr_tracking = False
super().__init__(**kwargs)
self._input_specs = input_specs
self._params = params
self._norm_activation = norm_activation
self._l2_regularizer = l2_regularizer
self._act_fn = tf_utils.get_activation(self._norm_activation.activation)
self._norm = functools.partial(
layers.BatchNormalization,
......@@ -72,80 +74,90 @@ class Dbof(tf.keras.Model):
epsilon=self._norm_activation.norm_epsilon,
synchronized=self._norm_activation.use_sync_bn,
)
# [batch_size x num_frames x num_features]
feature_size = input_specs.shape[-1]
# shape 'excluding' batch_size
model_input = tf.keras.Input(shape=self._input_specs.shape[1:])
# normalize input features
input_data = tf.nn.l2_normalize(model_input, -1)
tf.summary.histogram("input_hist", input_data)
# configure model
if params.add_batch_norm:
input_data = self._norm(name="input_bn")(input_data)
# activation = reshaped input * cluster weights
if params.cluster_size > 0:
activation = layers.Dense(
params.cluster_size,
kernel_regularizer=l2_regularizer,
# Configure model batch norm layer.
if self._params.add_batch_norm:
self._input_bn = self._norm(name="input_bn")
self._cluster_bn = self._norm(name="cluster_bn")
self._hidden_bn = self._norm(name="hidden_bn")
else:
self._hidden_biases = self.add_weight(
name="hidden_biases",
shape=[self._params.hidden_size],
initializer=tf.random_normal_initializer(stddev=0.01),
)
self._cluster_biases = self.add_weight(
name="cluster_biases",
shape=[self._params.cluster_size],
initializer=tf.random_normal_initializer(
stddev=1.0 / tf.math.sqrt(feature_size)
),
)
if self._params.use_context_gate_cluster_layer:
self._context_gate = nn_layers.ContextGate(
normalizer_fn=self._norm,
pooling_method=None,
hidden_layer_size=self._params.context_gate_cluster_bottleneck_size,
kernel_regularizer=self._l2_regularizer,
name="context_gate_cluster",
)
self._hidden_dense = layers.Dense(
self._params.hidden_size,
kernel_regularizer=self._l2_regularizer,
kernel_initializer=tf.random_normal_initializer(
stddev=1.0 / tf.sqrt(tf.cast(self._params.cluster_size, tf.float32))
),
name="hidden_dense",
)
if self._params.cluster_size > 0:
self._cluster_dense = layers.Dense(
self._params.cluster_size,
kernel_regularizer=self._l2_regularizer,
kernel_initializer=tf.random_normal_initializer(
stddev=1 / tf.sqrt(tf.cast(feature_size, tf.float32))
stddev=1.0 / tf.sqrt(tf.cast(feature_size, tf.float32))
),
)(input_data)
else:
activation = input_data
name="cluster_dense",
)
if params.add_batch_norm:
activation = self._norm(name="cluster_bn")(activation)
else:
cluster_biases = tf.Variable(
tf.random_normal_initializer(stddev=1 / tf.math.sqrt(feature_size))(
shape=[params.cluster_size]),
name="cluster_biases")
tf.summary.histogram("cluster_biases", cluster_biases)
activation += cluster_biases
def call(
self, inputs: tf.Tensor, num_frames: Any = None,
) -> tf.Tensor:
# L2 normalize input features
activation = tf.nn.l2_normalize(inputs, -1)
activation = self._act_fn(activation)
tf.summary.histogram("cluster_output", activation)
if self._params.add_batch_norm:
activation = self._input_bn(activation)
if params.use_context_gate_cluster_layer:
pooling_method = None
norm_args = dict(name="context_gate_bn")
activation = utils.context_gate(
activation,
normalizer_fn=self._norm,
normalizer_params=norm_args,
pooling_method=pooling_method,
hidden_layer_size=params.context_gate_cluster_bottleneck_size,
kernel_regularizer=l2_regularizer)
if self._params.cluster_size > 0:
activation = self._cluster_dense(activation)
if self._params.add_batch_norm:
activation = self._cluster_bn(activation)
if not self._params.add_batch_norm:
activation += self._cluster_biases
activation = utils.frame_pooling(activation, params.pooling_method)
activation = self._act_fn(activation)
# activation = activation * hidden1_weights
activation = layers.Dense(
params.hidden_size,
kernel_regularizer=l2_regularizer,
kernel_initializer=tf.random_normal_initializer(
stddev=1 / tf.sqrt(tf.cast(params.cluster_size, tf.float32))))(
activation)
if self._params.use_context_gate_cluster_layer:
activation = self._context_gate(activation)
if params.add_batch_norm:
activation = self._norm(name="hidden1_bn")(activation)
activation = yt8m_model_utils.frame_pooling(
activation,
method=self._params.pooling_method,
num_frames=num_frames,
)
activation = self._hidden_dense(activation)
if self._params.add_batch_norm:
activation = self._hidden_bn(activation)
else:
hidden1_biases = tf.Variable(
tf.random_normal_initializer(stddev=0.01)(shape=[params.hidden_size]),
name="hidden1_biases")
tf.summary.histogram("hidden1_biases", hidden1_biases)
activation += hidden1_biases
activation += self._hidden_biases
activation = self._act_fn(activation)
tf.summary.histogram("hidden1_output", activation)
super().__init__(inputs=model_input, outputs=activation, **kwargs)
return activation
@factory.register_backbone_builder("dbof")
......@@ -161,10 +173,14 @@ def build_dbof(
backbone_cfg = backbone_config.get()
assert backbone_type == "dbof", f"Inconsistent backbone type {backbone_type}"
return Dbof(
dbof = Dbof(
input_specs=input_specs,
params=backbone_cfg,
norm_activation=norm_activation_config,
l2_regularizer=l2_regularizer,
**kwargs,
)
# Warmup calls to build model variables.
dbof(tf.keras.Input(input_specs.shape[1:]))
return dbof
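
The explicit warmup call is needed because a Keras layer only creates its variables on first call; invoking the backbone once with a symbolic input materializes the weights so that, for example, checkpoints can be restored before training starts. The pattern in miniature:

import tensorflow as tf

layer = tf.keras.layers.Dense(8)
assert not layer.variables             # no weights created yet
layer(tf.keras.Input(shape=[16]))      # warmup call builds the variables
assert len(layer.variables) == 2       # kernel + bias now exist
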
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for dbof."""
from absl.testing import parameterized
import tensorflow as tf
from official.projects.yt8m.configs import yt8m as yt8m_cfg
from official.projects.yt8m.modeling.backbones import dbof
class DbofTest(parameterized.TestCase, tf.test.TestCase):
"""Class for testing nn_layers."""
@parameterized.product(
pooling_method=["average", "max", "swap"],
use_context_gate_cluster_layer=[True, False],
context_gate_cluster_bottleneck_size=[0, 8],
)
def test_dbof_backbone(
self,
pooling_method,
use_context_gate_cluster_layer,
context_gate_cluster_bottleneck_size,
):
"""Test for creation of a context gate layer."""
model_cfg = yt8m_cfg.DbofModel(
cluster_size=30,
hidden_size=20,
pooling_method=pooling_method,
use_context_gate_cluster_layer=use_context_gate_cluster_layer,
context_gate_cluster_bottleneck_size=context_gate_cluster_bottleneck_size,
)
backbone = dbof.Dbof(
input_specs=tf.keras.layers.InputSpec(shape=[None, None, 32]),
params=model_cfg,
)
inputs = tf.ones([2, 24, 32], dtype=tf.float32)
outputs = backbone(inputs, num_frames=tf.constant([24, 16]))
self.assertAllEqual(outputs.shape.as_list(), [2, 20])
if __name__ == "__main__":
tf.test.main()
......@@ -22,12 +22,11 @@ import tensorflow as tf
layers = tf.keras.layers
class LogisticModel(tf.keras.Model):
class LogisticModel(layers.Layer):
"""Logistic prediction head model with L2 regularization."""
def __init__(
self,
input_specs: layers.InputSpec = layers.InputSpec(shape=[None, 128]),
vocab_size: int = 3862,
return_logits: bool = False,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
......@@ -36,22 +35,32 @@ class LogisticModel(tf.keras.Model):
"""Creates a logistic model.
Args:
input_specs: 'batch' x 'num_features' matrix of input features.
vocab_size: The number of classes in the dataset.
return_logits: if True also return logits.
l2_regularizer: An optional L2 weight regularizer.
**kwargs: extra key word args.
"""
super().__init__(**kwargs)
self._return_logits = return_logits
self._dense = layers.Dense(vocab_size, kernel_regularizer=l2_regularizer)
def call(
self,
inputs: tf.Tensor,
):
"""Logistic model forward call.
Args:
inputs: 'batch' x 'num_features' matrix of input features.
Returns:
A dictionary with a tensor containing the probability predictions of the
model in the 'predictions' key. The dimensions of the tensor are
batch_size x num_classes.
"""
inputs = tf.keras.Input(shape=input_specs.shape[1:])
logits = layers.Dense(vocab_size, kernel_regularizer=l2_regularizer)(inputs)
logits = self._dense(inputs)
outputs = {"predictions": tf.nn.sigmoid(logits)}
if return_logits:
if self._return_logits:
outputs.update({"logits": logits})
super().__init__(inputs=inputs, outputs=outputs, **kwargs)
return outputs
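
This rewrite follows the pattern applied throughout the diff: the old heads were functional `tf.keras.Model` subclasses that built a graph inside `__init__` via `super().__init__(inputs=..., outputs=...)`; the new versions are plain layers that create sublayers in `__init__` and compute in `call()`. The shape of the pattern, in miniature:

import tensorflow as tf

class Head(tf.keras.layers.Layer):

  def __init__(self, vocab_size=3862, **kwargs):
    super().__init__(**kwargs)
    self._dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs):
    return {'predictions': tf.nn.sigmoid(self._dense(inputs))}

head = Head(vocab_size=10)
outputs = head(tf.random.normal([2, 128]))['predictions']  # shape [2, 10]
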
......@@ -19,18 +19,17 @@ from typing import Any, Optional
import tensorflow as tf
from official.projects.yt8m.modeling import yt8m_model_utils as utils
from official.projects.yt8m.modeling import nn_layers
layers = tf.keras.layers
class MoeModel(tf.keras.Model):
class MoeModel(layers.Layer):
"""A softmax over a mixture of logistic models (with L2 regularization)."""
def __init__(
self,
input_specs: layers.InputSpec = layers.InputSpec(shape=[None, 128]),
vocab_size: int = 3862,
num_mixtures: int = 2,
use_input_context_gate: bool = False,
......@@ -45,8 +44,8 @@ class MoeModel(tf.keras.Model):
The model consists of a per-class softmax distribution over a
configurable number of logistic classifiers. One of the classifiers
in the mixture is not trained, and always predicts 0.
Args:
input_specs: 'batch_size' x 'num_features' matrix of input features.
vocab_size: The number of classes in the dataset.
num_mixtures: The number of mixtures (excluding a dummy 'expert' that
always predicts the non-existence of an entity).
......@@ -59,61 +58,95 @@ class MoeModel(tf.keras.Model):
be padded to 128, and the second to last will be padded to 8.
l2_regularizer: An optional L2 weight regularizer.
**kwargs: extra key word args.
"""
inputs = tf.keras.Input(shape=input_specs.shape[1:])
model_input = inputs
super().__init__(**kwargs)
self._vocab_size = vocab_size
self._num_mixtures = num_mixtures
self._use_input_context_gate = use_input_context_gate
self._use_output_context_gate = use_output_context_gate
self._vocab_as_last_dim = vocab_as_last_dim
self._normalizer_params = normalizer_params
self._l2_regularizer = l2_regularizer
if use_input_context_gate:
model_input = utils.context_gate(
model_input,
self._input_context_gate = nn_layers.ContextGate(
normalizer_fn=layers.BatchNormalization,
normalizer_params=normalizer_params,
name="input_context_gate",
)
if use_output_context_gate:
self._output_context_gate = nn_layers.ContextGate(
normalizer_fn=layers.BatchNormalization,
normalizer_params=normalizer_params,
name="output_context_gate",
)
gate_activations = layers.Dense(
self._gate_dense = layers.Dense(
vocab_size * (num_mixtures + 1),
activation=None,
bias_initializer=None,
kernel_regularizer=l2_regularizer)(
model_input)
expert_activations = layers.Dense(
kernel_regularizer=l2_regularizer,
name="gate",
)
self._expert_dense = layers.Dense(
vocab_size * num_mixtures,
activation=None,
kernel_regularizer=l2_regularizer)(
model_input)
kernel_regularizer=l2_regularizer,
name="expert",
)
def call(self, inputs: tf.Tensor) -> dict[str, tf.Tensor]:
"""MoE forward call.
Args:
inputs: 'batch_size' x 'num_features' matrix of input features.
Returns:
A dictionary with a tensor containing the probability predictions
of the model in the 'predictions' key. The dimensions of the tensor
are batch_size x num_classes.
"""
if vocab_as_last_dim:
if self._use_input_context_gate:
inputs = self._input_context_gate(inputs)
gate_activations = self._gate_dense(inputs)
expert_activations = self._expert_dense(inputs)
if self._vocab_as_last_dim:
# Batch x (num_mixtures + 1) x #Labels
gate_activations = tf.reshape(
gate_activations, [-1, num_mixtures + 1, vocab_size])
gate_activations, [-1, self._num_mixtures + 1, self._vocab_size]
)
# Batch x num_mixtures x #Labels
expert_activations = tf.reshape(
expert_activations, [-1, num_mixtures, vocab_size])
expert_activations,
[-1, self._num_mixtures, self._vocab_size],
)
else:
# (Batch * #Labels) x (num_mixtures + 1)
gate_activations = tf.reshape(gate_activations, [-1, num_mixtures + 1])
gate_activations = tf.reshape(
gate_activations,
[-1, self._num_mixtures + 1],
)
# (Batch * #Labels) x num_mixtures
expert_activations = tf.reshape(expert_activations, [-1, num_mixtures])
expert_activations = tf.reshape(
expert_activations,
[-1, self._num_mixtures],
)
gating_distribution = tf.nn.softmax(gate_activations, axis=1)
expert_distribution = tf.nn.sigmoid(expert_activations)
final_probabilities = tf.reduce_sum(
gating_distribution[:, :num_mixtures] * expert_distribution, axis=1)
gating_distribution[:, : self._num_mixtures] * expert_distribution,
axis=1,
)
if not vocab_as_last_dim:
final_probabilities = tf.reshape(final_probabilities, [-1, vocab_size])
if use_output_context_gate:
final_probabilities = utils.context_gate(
if not self._vocab_as_last_dim:
final_probabilities = tf.reshape(
final_probabilities,
normalizer_fn=layers.BatchNormalization,
normalizer_params=normalizer_params,
[-1, self._vocab_size],
)
outputs = {"predictions": final_probabilities}
super().__init__(inputs=inputs, outputs=outputs, **kwargs)
return {"predictions": final_probabilities}
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains a collection of util functions for model construction."""
from typing import Any, Dict, Optional, Union
import tensorflow as tf
from official.projects.yt8m.modeling import yt8m_model_utils
class ContextGate(tf.keras.layers.Layer):
"""Context Gating. More details: https://arxiv.org/pdf/1706.06905.pdf."""
def __init__(
self,
normalizer_fn=None,
normalizer_params: Optional[Dict[str, Any]] = None,
kernel_initializer: Union[
str, tf.keras.regularizers.Regularizer
] = "glorot_uniform",
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_initializer: Union[str, tf.keras.regularizers.Regularizer] = "zeros",
hidden_layer_size: int = 0,
pooling_method: Optional[str] = None,
additive_residual: bool = False,
name: Optional[str] = None,
):
"""Initialization of context gate.
Args:
normalizer_fn: Normalization function to use instead of `biases` (e.g.
tf.contrib.layers.batch_norm). If None, bias is added.
normalizer_params: Normalization function parameters.
kernel_initializer: Weight initializer to use instead of Xavier (e.g.
tf.contrib.layers.variance_scaling_initializer).
kernel_regularizer: Weight regularizer to use instead of None (e.g.,
tf.contrib.layers.l2_regularizer(l2_penalty)).
bias_initializer: Biases initializer to use (default tf.zeros_initializer)
hidden_layer_size: Dimensionality of the context gating hidden layer size,
if any. If None, will apply a fully-connected context gating layer with
shape [input_size x input_size]. If set to an int N, will factorize the
context gating layer into [input_size x N] x [N x input_size] as in the
squeeze-and-excitation block from https://arxiv.org/pdf/1709.01507.pdf.
pooling_method: Whether to perform global pooling of the local features
before applying the context gating layer. This is relevant only if the
input_features tensor has rank > 2, e.g., it's a sequence of frame
features, [batch_size, num_frames, feature_dim], or spatial convolution
features, [batch_size*num_frames, h, w, feature_dim]. If the inputs are
a set of local features and pooling_method is not None, will pool
features across all but the batch_size dimension using the specified
pooling method, and pass the aggregated features as context to the
gating layer. For a list of pooling methods, see the frame_pooling()
function.
additive_residual: If true, will use ReLu6-activated (additive) residual
connections instead of Sigmoid-activated (multiplicative) connections
when combining the input_features with the context gating branch.
name: Optional `str` name of the module.
"""
super().__init__(name=name)
self._normalizer_fn = normalizer_fn
self._normalizer_params = normalizer_params or {}
self._kernel_initializer = kernel_initializer
self._kernel_regularizer = kernel_regularizer
self._bias_initializer = bias_initializer
self._hidden_layer_size = hidden_layer_size
self._pooling_method = pooling_method
self._additive_residual = additive_residual
if hidden_layer_size >= 2:
self._gates_bottleneck = tf.keras.layers.Dense(
hidden_layer_size,
activation="relu6",
kernel_initializer=kernel_initializer,
bias_initializer=bias_initializer,
kernel_regularizer=kernel_regularizer,
name="bottleneck",
)
if self._normalizer_fn:
self._gates_bottleneck_norm = self._normalizer_fn(
**self._normalizer_params,
name="bottleneck_norm",
)
def build(self, input_shape):
super().build(input_shape)
feature_size = input_shape[-1]
activation_fn = tf.nn.relu6 if self._additive_residual else tf.nn.sigmoid
self._gates = tf.keras.layers.Dense(
feature_size,
activation=activation_fn,
kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
name="gates_dense",
)
if self._normalizer_fn:
self._gates_norm = self._normalizer_fn(
**self._normalizer_params,
name="gates_norm",
)
def call(self, inputs: tf.Tensor):
num_dimensions = len(inputs.shape.as_list())
feature_size = inputs.shape.as_list()[-1]
if self._pooling_method:
assert num_dimensions > 2
# Collapse the inner axes of the original features shape into a 3D tensor
original_shape = tf.shape(inputs)
# The last dimension will change after concatenating the context
new_shape = tf.concat(
[original_shape[:-1], tf.constant([2 * feature_size])], 0
)
batch_size = original_shape[0]
reshaped_features = tf.reshape(inputs, [batch_size, -1, feature_size])
num_features = tf.shape(reshaped_features)[1]
# Pool the feature channels across the inner axes to get global context
context_features = yt8m_model_utils.frame_pooling(
reshaped_features, self._pooling_method
)
context_features = tf.expand_dims(context_features, 1)
# Replicate the global context features and concat to the local features.
context_features = tf.tile(context_features, [1, num_features, 1])
context_features = tf.concat([reshaped_features, context_features], 2)
context_features = tf.reshape(context_features, shape=new_shape)
else:
# num_dimensions should be 2
context_features = tf.identity(inputs)
if self._hidden_layer_size >= 2:
gates_bottleneck = self._gates_bottleneck(context_features)
if self._normalizer_fn:
gates_bottleneck = self._gates_bottleneck_norm(gates_bottleneck)
else:
gates_bottleneck = tf.identity(context_features)
gates = self._gates(gates_bottleneck)
if self._normalizer_fn:
gates = self._gates_norm(gates)
if self._additive_residual:
inputs += tf.cast(gates, inputs.dtype)
else:
inputs *= tf.cast(gates, inputs.dtype)
return inputs
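
At its core (no pooling, no bottleneck, multiplicative residual), the layer computes the context-gating transform from https://arxiv.org/pdf/1706.06905.pdf, re-weighting the input by a learned sigmoid gate. A stripped-down sketch of that path:

import tensorflow as tf

x = tf.random.normal([2, 32])                       # [batch, feature_dim]
gate = tf.keras.layers.Dense(32, activation='sigmoid')
y = x * gate(x)                                     # y = x * sigmoid(Wx + b)
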
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for nn_layers."""
from absl.testing import parameterized
import tensorflow as tf
from official.projects.yt8m.modeling import nn_layers
class NNLayersTest(parameterized.TestCase, tf.test.TestCase):
"""Class for testing nn_layers."""
@parameterized.product(
hidden_layer_size=(0, 8, 16),
additive_residual=(True, False),
pooling_method=["average", "max", "swap", "none", None],
)
def test_context_gate(
self, hidden_layer_size, additive_residual, pooling_method
):
"""Test for creation of a context gate layer."""
context_gate = nn_layers.ContextGate(
normalizer_fn=tf.keras.layers.BatchNormalization,
hidden_layer_size=hidden_layer_size,
additive_residual=additive_residual,
pooling_method=pooling_method,
)
if pooling_method is None:
inputs = tf.ones([2, 32], dtype=tf.float32)
elif pooling_method == "none":
inputs = tf.ones([2, 1, 32], dtype=tf.float32)
else:
inputs = tf.ones([2, 24, 32], dtype=tf.float32)
outputs = context_gate(inputs)
self.assertShapeEqual(inputs, outputs)
context_vars_len = 12 if hidden_layer_size else 6
context_trainable_vars_len = 8 if hidden_layer_size else 4
self.assertLen(context_gate.variables, context_vars_len)
self.assertLen(context_gate.trainable_variables, context_trainable_vars_len)
if __name__ == "__main__":
tf.test.main()
......@@ -118,9 +118,6 @@ class VideoClassificationModel(tf.keras.Model):
else None
)
self.head = aggregation_head(
input_specs=layers.InputSpec(
shape=[None, self._params.backbone.get().hidden_size]
),
vocab_size=self._num_classes,
l2_regularizer=l2_regularizer,
**head_cfg.as_dict(),
......@@ -134,10 +131,17 @@ class VideoClassificationModel(tf.keras.Model):
return cls(**config)
def call(
self, inputs: tf.Tensor, training: Any = None, mask: Any = None
self,
inputs: tf.Tensor,
num_frames: Any = None,
training: Any = None,
) -> dict[str, tf.Tensor]:
features = self.backbone(inputs)
outputs = self.head(features)
features = self.backbone(
inputs,
num_frames=num_frames,
training=training,
)
outputs = self.head(features, training=training)
return outputs
@property
......
......@@ -26,26 +26,42 @@ class YT8MNetworkTest(parameterized.TestCase, tf.test.TestCase):
"""Class for testing yt8m network."""
# test_yt8m_network_creation arbitrary params
@parameterized.parameters((32, 1152), (24, 1152)) # 1152 = 1024 + 128
def test_yt8m_network_creation(self, num_frames, feature_dims):
@parameterized.product(
num_sample_frames=(None, 16, 32),
pooling_method=('average', 'max', 'swap'),
)
def test_yt8m_network_creation(
self, num_sample_frames, pooling_method
):
"""Test for creation of a YT8M Model.
Args:
num_frames: number of frames.
feature_dims: indicates total dimension size of the features.
num_sample_frames: number of frames to sample per example; if None, the
padded frames are used without sampling.
pooling_method: the frame pooling method, as a string.
"""
num_frames = 24
feature_dims = 52
num_classes = 45
input_specs = tf.keras.layers.InputSpec(shape=[None, None, feature_dims])
num_classes = 3862
params = yt8m_cfg.YT8MTask().model
params.backbone.dbof.pooling_method = pooling_method
model = yt8m_model.VideoClassificationModel(
params=yt8m_cfg.YT8MTask().model,
params=params,
num_classes=num_classes,
input_specs=input_specs,
)
# batch = 2 -> arbitrary value for test.
inputs = np.random.rand(2, num_frames, feature_dims)
predictions = model(inputs)['predictions']
if num_sample_frames:
inputs = np.random.rand(2, num_sample_frames, feature_dims)
num_frames = tf.constant([num_sample_frames, num_sample_frames])
else:
# Add padding frames.
inputs = np.random.rand(2, num_frames + 4, feature_dims)
num_frames = tf.constant([num_frames, num_frames + 1])
predictions = model(inputs, num_frames=num_frames)['predictions']
self.assertAllEqual([2, num_classes], predictions.numpy().shape)
def test_serialize_deserialize(self):
......
......@@ -14,18 +14,81 @@
"""Contains a collection of util functions for model construction."""
from typing import Any, Dict, Optional, Union
from typing import Optional
import tensorflow as tf
def frame_pooling(frames, method):
def _large_compatible_negative(tensor_type):
"""Large negative number as Tensor.
This function is necessary because the standard value for epsilon
in this module (-1e9) cannot be represented using `tf.float16`.
Args:
tensor_type: A dtype to determine the type.
Returns:
A large negative number.
"""
if tensor_type == tf.float16:
return tf.float16.min
return -1e9
def weighted_average_pooling(features, weights, axis):
"""Weighted average pooling.
Args:
features: a tensor of at least rank 1.
weights: a weight tensor whose shape is broadcast compatible with features.
It doesn't have to be normalized.
axis: the dimensions to reduce.
Returns:
The reduced tensor.
"""
return tf.math.divide_no_nan(
tf.reduce_sum(weights * features, axis), # numerator.
tf.reduce_sum(weights, axis), # denominator.
)
def frame_swap(
frames: tf.Tensor, frame_mask: Optional[tf.Tensor] = None
) -> tf.Tensor:
"""Self-weighted average pooling over all frames of a video.
It does the following operation independently for each feature:
x_pooled = (sum_i x_i * |x_i|) / (sum_i |x_i|).
Basically the weight for the feature in each frame is determined by the
magnitude of the feature itself.
Paper: https://research.google/pubs/pub48351/
Args:
frames: A tensor with shape [batch_size, max_frames, feature_size].
frame_mask: A tensor with shape [batch_size, max_frames, 1].
Returns:
A tensor with shape [batch_size, feature_size].
"""
weights = tf.abs(frames)
if frame_mask is not None:
weights *= tf.cast(frame_mask, weights.dtype)
# We set axis to 1 to reduce the dimension corresponding to max_frames.
return weighted_average_pooling(frames, weights, axis=1)
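
A worked instance of the SWAP formula: because each feature is weighted by its own magnitude, all-zero (padding-like) frames contribute nothing to the pool.

import tensorflow as tf

frames = tf.constant([[[0.0, 0.0, 0.0], [0.0, 1.0, -1.0]]])  # [1, 2, 3]
weights = tf.abs(frames)
pooled = tf.math.divide_no_nan(
    tf.reduce_sum(weights * frames, axis=1), tf.reduce_sum(weights, axis=1))
# pooled == [[0.0, 1.0, -1.0]]: the zero frame is ignored entirely.
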
def frame_pooling(frames, method="average", num_frames=None):
"""Pools over the frames of a video.
Args:
frames: tensor of shape [batch_size, num_frames, feature_size].
method: string indicating pooling method, one of: "average", "max",
"swap", or "none".
num_frames: optional tensor of shape [batch_size] with the valid number of
frames for each video.
Returns:
tensor of shape [batch_size, feature_size] for average, max, or
......@@ -35,119 +98,38 @@ def frame_pooling(frames, method):
ValueError: if method is other than "average", "max", "swap", or
"none".
"""
frame_mask = None
if num_frames is not None:
max_frames = frames.shape.as_list()[1]
# Generate binary mask from number of frames.
frame_mask = tf.sequence_mask(num_frames, max_frames, frames.dtype)
frame_mask = tf.expand_dims(frame_mask, axis=2)
if method == "average":
reduced = tf.reduce_mean(frames, 1)
if num_frames is None:
reduced = tf.reduce_mean(frames, 1)
else:
num_frames = tf.reshape(tf.cast(num_frames, frames.dtype), [-1, 1])
reduced = tf.reduce_sum(frames * frame_mask, 1) / num_frames
elif method == "max":
if num_frames is not None:
frame_mask = tf.cast(frame_mask, tf.bool)
frames = tf.where(
frame_mask,
frames,
tf.ones_like(frames, dtype=frames.dtype)
* _large_compatible_negative(frames.dtype),
)
reduced = tf.reduce_max(frames, 1)
elif method == "swap":
# Note we assume the frames are in the shape of
# [batch_size, num_frames, feature_size]. Otherwise this function might
# fail.
reduced = frame_swap(frames, frame_mask)
elif method == "none":
feature_size = frames.shape_as_list()[2]
feature_size = frames.shape.as_list()[2]
reduced = tf.reshape(frames, [-1, feature_size])
else:
raise ValueError("Unrecognized pooling method: %s" % method)
return reduced
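
A worked instance of the masked branches above: with `num_frames` supplied, padding frames past each video's valid length drop out of the average (and are replaced by a large negative value for max pooling).

import tensorflow as tf

frames = tf.constant([[[2.0], [4.0], [9.0]]])   # [batch=1, max_frames=3, 1]
mask = tf.expand_dims(tf.sequence_mask([2], 3, frames.dtype), 2)
mean = tf.reduce_sum(frames * mask, 1) / 2.0    # [[3.0]], not the unmasked [[5.0]]
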
def context_gate(
input_features,
normalizer_fn=None,
normalizer_params: Optional[Dict[str, Any]] = None,
kernel_initializer: Union[
str, tf.keras.regularizers.Regularizer] = "glorot_uniform",
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_initializer: Union[str, tf.keras.regularizers.Regularizer] = "zeros",
hidden_layer_size: int = 0,
pooling_method: Optional[str] = None,
additive_residual: bool = False):
"""Context Gating.
More details: https://arxiv.org/pdf/1706.06905.pdf.
Args:
input_features: a tensor of at least rank 2.
normalizer_fn: Normalization function to use instead of `biases` (e.g.
tf.contrib.layers.batch_norm). If None, bias is added.
normalizer_params: Normalization function parameters.
kernel_initializer: Weight initializer to use instead of Xavier (e.g.
tf.contrib.layers.variance_scaling_initializer).
kernel_regularizer: Weight regularizer to use instead of None (e.g.,
tf.contrib.layers.l2_regularizer(l2_penalty)).
bias_initializer: Biases initializer to use (default tf.zeros_initializer)
hidden_layer_size: Dimensionality of the context gating hidden layer size,
if any. If None, will apply a fully-connected context gating layer with
shape [input_size x input_size]. If set to an int N, will factorize the
context gating layer into [input_size x N] x [N x input_size] as in the
squeeze-and-excitation block from https://arxiv.org/pdf/1709.01507.pdf.
pooling_method: Whether to perform global pooling of the local features
before applying the context gating layer. This is relevant only if the
input_features tensor has rank > 2, e.g., it's a sequence of frame
features, [batch_size, num_frames, feature_dim], or spatial convolution
features, [batch_size*num_frames, h, w, feature_dim]. If the inputs are a
set of local features and pooling_method is not None, will pool features
across all but the batch_size dimension using the specified pooling
method, and pass the aggregated features as context to the gating layer.
For a list of pooling methods, see the frame_pooling() function.
additive_residual: If true, will use ReLu6-activated (additive) residual
connections instead of Sigmoid-activated (multiplicative) connections when
combining the input_features with the context gating branch.
Returns:
A tensor with the same shape as input_features.
"""
if normalizer_params is None:
normalizer_params = {}
with tf.name_scope("ContextGating"):
num_dimensions = len(input_features.shape.as_list())
feature_size = input_features.shape.as_list()[-1]
if pooling_method:
assert num_dimensions > 2
# Collapse the inner axes of the original features shape into a 3D tensor
original_shape = tf.shape(input_features)
# The last dimension will change after concatenating the context
new_shape = tf.concat(
[original_shape[:-1],
tf.constant([2 * feature_size])], 0)
batch_size = original_shape[0]
reshaped_features = tf.reshape(input_features,
[batch_size, -1, feature_size])
num_features = tf.shape(reshaped_features)[1]
# Pool the feature channels across the inner axes to get global context
context_features = frame_pooling(reshaped_features, pooling_method)
context_features = tf.expand_dims(context_features, 1)
# Replicate the global context features and concat to the local features.
context_features = tf.tile(context_features, [1, num_features, 1])
context_features = tf.concat([reshaped_features, context_features], 2)
context_features = tf.reshape(context_features, shape=new_shape)
else:
context_features = input_features
if hidden_layer_size >= 2:
gates_bottleneck = tf.keras.layers.Dense(
hidden_layer_size,
activation="relu6",
kernel_initializer=kernel_initializer,
bias_initializer=bias_initializer,
kernel_regularizer=kernel_regularizer,
)(context_features)
if normalizer_fn:
gates_bottleneck = normalizer_fn(**normalizer_params)(gates_bottleneck)
else:
gates_bottleneck = context_features
activation_fn = (tf.nn.relu6 if additive_residual else tf.nn.sigmoid)
gates = tf.keras.layers.Dense(
feature_size,
activation=activation_fn,
kernel_initializer=kernel_initializer,
bias_initializer=bias_initializer,
kernel_regularizer=kernel_regularizer,
)(gates_bottleneck)
if normalizer_fn:
gates = normalizer_fn(**normalizer_params)(gates)
if additive_residual:
input_features += tf.cast(gates, input_features.dtype)
else:
input_features *= tf.cast(gates, input_features.dtype)
return input_features
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for YT8M modeling utilities."""
from absl.testing import parameterized
import tensorflow as tf
from official.projects.yt8m.modeling import yt8m_model_utils
class Yt8MModelUtilsTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.product(
frame_pooling=("average", "max", "swap", "none"),
use_frame_mask=(True, False),
)
def test_frame_pooling(self, frame_pooling, use_frame_mask):
frame = tf.constant([
[[0.0, 0.0, 0.0], [0.0, 1.0, -1.0]],
[[0.0, 0.0, 0.0], [0.0, 2.0, -2.0]],
])
num_frames = tf.constant([2, 2]) if use_frame_mask else None
pooled_frame = yt8m_model_utils.frame_pooling(
frame, method=frame_pooling, num_frames=num_frames
)
if frame_pooling == "swap":
self.assertAllClose([[0.0, 1.0, -1.0], [0.0, 2.0, -2.0]], pooled_frame)
elif frame_pooling == "average":
self.assertAllClose([[0.0, 0.5, -0.5], [0.0, 1.0, -1.0]], pooled_frame)
elif frame_pooling == "max":
self.assertAllClose([[0.0, 1.0, 0.0], [0.0, 2.0, 0.0]], pooled_frame)
elif frame_pooling == "none":
self.assertAllClose(
[
[0.0, 0.0, 0.0],
[0.0, 1.0, -1.0],
[0.0, 0.0, 0.0],
[0.0, 2.0, -2.0],
],
pooled_frame,
)
if __name__ == "__main__":
tf.test.main()
......@@ -19,13 +19,13 @@ from absl import logging
import tensorflow as tf
from official.core import base_task
from official.core import input_reader
from official.core import task_factory
from official.modeling import tf_utils
from official.projects.yt8m.configs import yt8m as yt8m_cfg
from official.projects.yt8m.dataloaders import yt8m_input
from official.projects.yt8m.eval_utils import eval_util
from official.projects.yt8m.modeling import yt8m_model
from official.vision.dataloaders.google import input_reader
@task_factory.register_task_cls(yt8m_cfg.YT8MTask)
......@@ -48,7 +48,14 @@ class YT8MTask(base_task.Task):
params=model_config,
input_specs=input_specs,
num_classes=train_cfg.num_classes,
l2_weight_decay=l2_weight_decay)
l2_weight_decay=l2_weight_decay,
)
# Warmup calls to build model variables.
_ = model(
inputs=tf.keras.Input(common_input_shape, dtype=tf.float32),
num_frames=tf.keras.Input([], dtype=tf.float32),
)
non_trainable_batch_norm_variables = []
non_trainable_extra_variables = []
......@@ -114,17 +121,16 @@ class YT8MTask(base_task.Task):
decoder_fn=decoder_fn,
parser_fn=parser_fn,
postprocess_fn=postprocess_fn,
transform_and_batch_fn=batch_fn)
transform_and_batch_fn=batch_fn,
)
dataset = reader.read(input_context=input_context)
return dataset
def build_losses(self,
labels,
model_outputs,
label_weights=None,
aux_losses=None):
def build_losses(
self, labels, model_outputs, label_weights=None, aux_losses=None
):
"""Sigmoid Cross Entropy.
Args:
......@@ -143,7 +149,8 @@ class YT8MTask(base_task.Task):
tf.expand_dims(model_outputs, axis=-1),
from_logits=losses_config.from_logits,
label_smoothing=losses_config.label_smoothing,
axis=-1)
axis=-1,
)
if label_weights is None:
model_loss = tf_utils.safe_mean(model_loss)
else:
......@@ -151,7 +158,8 @@ class YT8MTask(base_task.Task):
# Manually compute the weighted mean loss.
total_loss = tf.reduce_sum(model_loss)
total_weight = tf.cast(
tf.reduce_sum(label_weights), dtype=total_loss.dtype)
tf.reduce_sum(label_weights), dtype=total_loss.dtype
)
model_loss = tf.math.divide_no_nan(total_loss, total_weight)
total_loss = model_loss
......@@ -188,7 +196,8 @@ class YT8MTask(base_task.Task):
top_k = self.task_config.evaluation.average_precision.top_k
top_n = self.task_config.evaluation.average_precision.top_n
self.avg_prec_metric = eval_util.EvaluationMetrics(
num_classes, top_k=top_k, top_n=top_n)
num_classes, top_k=top_k, top_n=top_n
)
return metrics
......@@ -233,17 +242,26 @@ class YT8MTask(base_task.Task):
logs[m.name] = m.result()
return logs
def _preprocess_model_inputs(self,
inputs: dict[str, tf.Tensor],
training: bool = True):
def _preprocess_model_inputs(
self,
inputs: dict[str, tf.Tensor],
require_num_frames: bool = True,
training: bool = True,
):
"""Preprocesses input tensors before model on device."""
del training
return inputs['video_matrix']
extra_inputs = {
'num_frames': (
tf.reshape(inputs['num_frames'], [-1])
if require_num_frames
else None
),
'training': training,
}
return inputs['video_matrix'], extra_inputs
def _preprocess_labels(self,
inputs: dict[str, tf.Tensor],
training: bool = True):
def _preprocess_labels(
self, inputs: dict[str, tf.Tensor], training: bool = True
):
"""Preprocesses labels."""
del training # training is unused in _preprocess_labels in YT8M.
labels = inputs['labels']
......@@ -251,11 +269,9 @@ class YT8MTask(base_task.Task):
return labels, label_weights
def _postprocess_outputs(self,
outputs,
labels,
label_weights,
training: bool = True):
def _postprocess_outputs(
self, outputs, labels, label_weights, training: bool = True
):
"""Postprocess model outputs (inputs / labels / label_weights)."""
if not training and self.task_config.validation_data.segment_labels:
# workaround to ignore the unrated labels.
......@@ -279,25 +295,34 @@ class YT8MTask(base_task.Task):
Returns:
a dictionary of logs.
"""
model_inputs = self._preprocess_model_inputs(inputs, training=True)
# Will require `num_frames` if `num_sample_frames` is None since
# video_matrix is padded to max_frames in this case.
require_num_frames = self.task_config.train_data.num_sample_frames is None
inputs_tensor, extra_inputs = self._preprocess_model_inputs(
inputs,
require_num_frames=require_num_frames,
training=True,
)
labels, label_weights = self._preprocess_labels(inputs, training=True)
num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
with tf.GradientTape() as tape:
outputs = model(model_inputs, training=True)['predictions']
outputs = model(inputs_tensor, **extra_inputs)['predictions']
# Casting the output layer to float32 is necessary when mixed_precision is
# mixed_float16 or mixed_bfloat16, to ensure the output is cast to float32.
outputs = tf.nest.map_structure(lambda x: tf.cast(x, tf.float32), outputs)
# Post-process model / label outputs.
outputs, labels, label_weights = self._postprocess_outputs(
outputs, labels, label_weights, training=True)
outputs, labels, label_weights, training=True
)
# Computes per-replica loss
all_losses = self.build_losses(
model_outputs=outputs,
labels=labels,
label_weights=label_weights,
aux_losses=model.losses)
aux_losses=model.losses,
)
loss = all_losses['total_loss']
# Scales loss as the default gradients allreduce performs sum inside the
......@@ -318,8 +343,9 @@ class YT8MTask(base_task.Task):
# Apply gradient clipping.
if self.task_config.gradient_clip_norm > 0:
grads, _ = tf.clip_by_global_norm(grads,
self.task_config.gradient_clip_norm)
grads, _ = tf.clip_by_global_norm(
grads, self.task_config.gradient_clip_norm
)
optimizer.apply_gradients(list(zip(grads, tvars)))
logs = {self.loss: loss}
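`tf.clip_by_global_norm` above rescales all gradients jointly so that their combined norm never exceeds the configured threshold. A standalone sketch with illustrative numbers:
import tensorflow as tf
grads = [tf.constant([3.0, 4.0]), tf.constant([0.0, 12.0])]
# Global norm = sqrt(3**2 + 4**2 + 12**2) = 13.0.
clipped, global_norm = tf.clip_by_global_norm(grads, clip_norm=6.5)
# Every gradient is scaled by 6.5 / 13.0 = 0.5, preserving direction.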
......@@ -330,7 +356,9 @@ class YT8MTask(base_task.Task):
outputs=outputs,
model_losses=all_losses,
label_weights=label_weights,
training=True))
training=True,
)
)
return logs
def validation_step(self, inputs, model, metrics=None):
......@@ -346,19 +374,26 @@ class YT8MTask(base_task.Task):
Returns:
a dictionary of logs.
"""
model_inputs = self._preprocess_model_inputs(inputs, training=False)
labels, label_weights = self._preprocess_labels(inputs, training=False)
outputs = self.inference_step(model_inputs, model)['predictions']
# `num_frames` is required when `num_sample_frames` is None, since
# `video_matrix` is padded to `max_frames` in that case.
require_num_frames = (
self.task_config.validation_data.num_sample_frames is None
)
outputs = self.inference_step(
model, inputs, require_num_frames=require_num_frames
)['predictions']
outputs = tf.nest.map_structure(lambda x: tf.cast(x, tf.float32), outputs)
labels, label_weights = self._preprocess_labels(inputs, training=False)
outputs, labels, label_weights = self._postprocess_outputs(
outputs, labels, label_weights, training=False)
outputs, labels, label_weights, training=False
)
all_losses = self.build_losses(
labels=labels,
model_outputs=outputs,
label_weights=label_weights,
aux_losses=model.losses)
aux_losses=model.losses,
)
logs = {self.loss: all_losses['total_loss']}
logs.update(
......@@ -368,13 +403,18 @@ class YT8MTask(base_task.Task):
outputs=outputs,
model_losses=all_losses,
label_weights=inputs.get('label_weights', None),
training=False))
training=False,
)
)
return logs
def inference_step(self, inputs, model):
def inference_step(self, model, inputs, require_num_frames=True):
"""Performs the forward step."""
return model(inputs, training=False)
model_inputs, extra_inputs = self._preprocess_model_inputs(
inputs, require_num_frames=require_num_frames, training=False
)
return model(model_inputs, **extra_inputs)
def aggregate_logs(self, state=None, step_logs=None):
if self.task_config.evaluation.average_precision is not None:
......@@ -382,13 +422,15 @@ class YT8MTask(base_task.Task):
state = self.avg_prec_metric
self.avg_prec_metric.accumulate(
labels=step_logs[self.avg_prec_metric.name][0],
predictions=step_logs[self.avg_prec_metric.name][1])
predictions=step_logs[self.avg_prec_metric.name][1],
)
return state
def reduce_aggregated_logs(self, aggregated_logs, global_step=None):
if self.task_config.evaluation.average_precision is not None:
avg_prec_metrics = self.avg_prec_metric.get(
self.task_config.evaluation.average_precision.return_per_class_ap)
self.task_config.evaluation.average_precision.return_per_class_ap
)
self.avg_prec_metric.clear()
return avg_prec_metrics
return None
......@@ -45,25 +45,44 @@ class TrainTest(parameterized.TestCase, tf.test.TestCase):
testcase_name='segment_with_avg_precison',
use_segment_level_labels=True,
use_average_precision_metric=True,
num_sample_frames=24,
),
dict(
testcase_name='video_with_avg_precison',
use_segment_level_labels=False,
use_average_precision_metric=True,
num_sample_frames=24,
),
dict(
testcase_name='segment',
use_segment_level_labels=True,
use_average_precision_metric=False,
num_sample_frames=24,
),
dict(
testcase_name='video',
use_segment_level_labels=False,
use_average_precision_metric=False,
num_sample_frames=24,
),
dict(
testcase_name='segment_without_sampling_frames',
use_segment_level_labels=True,
use_average_precision_metric=False,
num_sample_frames=None,
),
dict(
testcase_name='video_without_sampling_frames',
use_segment_level_labels=False,
use_average_precision_metric=False,
num_sample_frames=None,
),
)
def test_train_and_eval(
self, use_segment_level_labels, use_average_precision_metric
self,
use_segment_level_labels,
use_average_precision_metric,
num_sample_frames,
):
saved_flag_values = flagsaver.save_flag_values()
train_lib.tfm_flags.define_flags()
......@@ -103,11 +122,13 @@ class TrainTest(parameterized.TestCase, tf.test.TestCase):
'train_data': {
'input_path': self._data_path,
'global_batch_size': 4,
'num_sample_frames': num_sample_frames,
},
'validation_data': {
'input_path': self._data_path,
'segment_labels': use_segment_level_labels,
'global_batch_size': 4,
'num_sample_frames': num_sample_frames,
},
'evaluation': {
'average_precision': average_precision,
......
......@@ -152,6 +152,9 @@ class DetectionGenerator(hyperparams.Config):
return_decoded: Optional[bool] = None
# Only works when nms_version='v2'.
use_class_agnostic_nms: Optional[bool] = False
# Weights or scales used when encoding and decoding box coordinates. For
# Faster RCNN, the open-source implementation recommends using
# [10.0, 10.0, 5.0, 5.0].
box_coder_weights: list[float] | None = None
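As a sketch, the new field could be enabled via a config override like the following (hypothetical experiment wiring; only the field itself comes from this diff):
generator_config = DetectionGenerator(
    box_coder_weights=[10.0, 10.0, 5.0, 5.0],  # Faster RCNN-style scaling
)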
@dataclasses.dataclass
......
......@@ -64,6 +64,7 @@ class DataConfig(cfg.DataConfig):
mixup_and_cutmix: Optional[common.MixupAndCutmix] = None
image_field_key: str = 'image/encoded'
label_field_key: str = 'clip/label/index'
input_image_format: str = 'jpeg'
def kinetics400(is_training):
......
......@@ -23,8 +23,12 @@ from official.core import config_definitions as cfg
from official.core import input_reader
InputReader = input_reader.InputReader
def build_weighted_sampling_combine_fn(
weights: Mapping[Any, Any]) -> Callable[[tf.data.Dataset], tf.data.Dataset]:
weights: Mapping[Any, Any], stop_on_empty_dataset=True
) -> Callable[[tf.data.Dataset], tf.data.Dataset]:
"""Builds a combine_fn using weighted sampling."""
def combine_fn(datasets: Mapping[Any, tf.data.Dataset]) -> tf.data.Dataset:
......@@ -35,7 +39,7 @@ def build_weighted_sampling_combine_fn(
ds.append(dataset)
ws.append(weights[k])
return tf.data.Dataset.sample_from_datasets(
ds, ws, stop_on_empty_dataset=True)
ds, ws, stop_on_empty_dataset=stop_on_empty_dataset)
return combine_fn
......@@ -44,6 +48,14 @@ def create_combine_fn(
params: cfg.DataConfig
) -> Union[None, Callable[[tf.data.Dataset], tf.data.Dataset]]:
"""Creates and returns a combine_fn for dataset mixing."""
if (
hasattr(params, 'stop_on_empty_dataset')
and params.stop_on_empty_dataset is not None
):
stop_on_empty_dataset = params.stop_on_empty_dataset
else:
stop_on_empty_dataset = True
if params.is_training and params.weights:
# Combine multiple datasets using weighted sampling.
if (not isinstance(params.input_path, cfg.base_config.Config) or
......@@ -63,7 +75,7 @@ def create_combine_fn(
raise ValueError(
'input_path key \'%s\' does not have a corresponding weight.' % k)
return build_weighted_sampling_combine_fn(weights)
return build_weighted_sampling_combine_fn(weights, stop_on_empty_dataset)
return None
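A minimal sketch of how the returned combine_fn behaves with the new flag (dataset names and weights are illustrative):
import tensorflow as tf
ds_a = tf.data.Dataset.range(100)
ds_b = tf.data.Dataset.range(1000, 1003)  # exhausts after 3 elements
combine_fn = build_weighted_sampling_combine_fn(
    {'a': 0.9, 'b': 0.1}, stop_on_empty_dataset=False
)
mixed = combine_fn({'a': ds_a, 'b': ds_b})
# With stop_on_empty_dataset=False, iteration keeps drawing from ds_a
# after ds_b runs out; with True (the old behavior), the stream ends.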
......
......@@ -45,6 +45,7 @@ class Parser(parser.Parser):
anchor_size,
match_threshold=0.5,
unmatched_threshold=0.5,
box_coder_weights=None,
aug_type=None,
aug_rand_hflip=False,
aug_scale_min=1.0,
......@@ -55,7 +56,8 @@ class Parser(parser.Parser):
max_num_instances=100,
dtype='bfloat16',
resize_first: Optional[bool] = None,
mode=None):
mode=None,
pad=True):
"""Initializes parameters for parsing annotations in the dataset.
Args:
......@@ -78,6 +80,10 @@ class Parser(parser.Parser):
unmatched_threshold: `float` number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
box_coder_weights: Optional `list` of 4 positive floats to scale y, x, h,
and w when encoding box coordinates. If set to None, does not perform
scaling. For Faster RCNN, the open-source implementation recommends
using [10.0, 10.0, 5.0, 5.0].
aug_type: An optional Augmentation object to choose from AutoAugment and
RandAugment.
aug_rand_hflip: `bool`, if True, augment training with random horizontal
......@@ -99,6 +105,15 @@ class Parser(parser.Parser):
augmentations; computationally more efficient.
mode: a ModeKeys. Specifies if this is training, evaluation, prediction or
prediction with ground-truths in the outputs.
pad: A bool indicating whether to pad the input image to make its
size a multiple of 2**max_level. The padded size will be the smallest
rectangle such that each dimension is the smallest multiple of
2**max_level that is no smaller than the desired output size. For example,
if the desired output size = (320, 320) and max_level = 7, the padded
size = (384, 384). This is necessary when using FPN, as it assumes each
lower feature map is 2x the size of its higher neighbor. Without padding,
this relationship may be violated: the backbone may produce consecutive
5x5 and 2x2 feature maps, which does not work with FPN.
"""
self._mode = mode
self._max_num_instances = max_num_instances
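The padded size described in the `pad` docstring can be computed directly; a small sketch of the rounding rule (mirroring the `preprocess_ops.compute_padded_size` calls later in this diff, under the stated assumptions):
import math
def padded_size(desired_size, max_level):
  # Round each dimension up to the nearest multiple of 2**max_level.
  stride = 2 ** max_level
  return [int(math.ceil(d / stride)) * stride for d in desired_size]
padded_size([320, 320], 7)  # -> [384, 384], matching the docstring example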
......@@ -113,6 +128,7 @@ class Parser(parser.Parser):
self._anchor_size = anchor_size
self._match_threshold = match_threshold
self._unmatched_threshold = unmatched_threshold
self._box_coder_weights = box_coder_weights
# Data augmentation.
self._aug_rand_hflip = aug_rand_hflip
......@@ -150,6 +166,10 @@ class Parser(parser.Parser):
# Input pipeline optimization.
self._resize_first = resize_first
# Whether to pad the image so its size is a multiple of 2**max_level.
# This is needed when using the FPN decoder.
self._pad = pad
def _resize_and_crop_image_and_boxes(self, image, boxes, pad=True):
"""Resizes and crops image and boxes, optionally with padding."""
# Resizes and crops image.
......@@ -162,7 +182,8 @@ class Parser(parser.Parser):
self._output_size,
padded_size=padded_size,
aug_scale_min=self._aug_scale_min,
aug_scale_max=self._aug_scale_max)
aug_scale_max=self._aug_scale_max,
)
# Resizes and crops boxes.
image_scale = image_info[2, :]
......@@ -171,7 +192,7 @@ class Parser(parser.Parser):
image_info[1, :], offset)
return image, boxes, image_info
def _parse_train_data(self, data, anchor_labeler=None):
def _parse_train_data(self, data, anchor_labeler=None, input_anchor=None):
"""Parses data for training and evaluation."""
classes = data['groundtruth_classes']
boxes = data['groundtruth_boxes']
......@@ -209,7 +230,8 @@ class Parser(parser.Parser):
resize_first = self._resize_first and less_output_pixels
if resize_first:
image, boxes, image_info = self._resize_and_crop_image_and_boxes(
image, boxes, pad=False)
image, boxes, pad=False
)
image = tf.cast(image, dtype=tf.uint8)
# Apply autoaug or randaug.
......@@ -227,14 +249,23 @@ class Parser(parser.Parser):
# Converts boxes from normalized coordinates to pixel coordinates.
boxes = box_ops.denormalize_boxes(boxes, image_shape)
if not resize_first:
image, boxes, image_info = self._resize_and_crop_image_and_boxes(
image, boxes, pad=True)
if self._pad:
padded_size = preprocess_ops.compute_padded_size(
self._output_size, 2**self._max_level
)
else:
padded_size = preprocess_ops.compute_padded_size(self._output_size,
2**self._max_level)
image = tf.image.pad_to_bounding_box(
image, 0, 0, padded_size[0], padded_size[1])
padded_size = self._output_size
if not resize_first:
image, boxes, image_info = (
self._resize_and_crop_image_and_boxes(image, boxes, pad=self._pad)
)
image = tf.image.pad_to_bounding_box(
image, 0, 0, padded_size[0], padded_size[1]
)
image = tf.ensure_shape(image, padded_size + [3])
image_height, image_width, _ = image.get_shape().as_list()
# Filters out ground-truth boxes that are all zeros.
......@@ -245,16 +276,21 @@ class Parser(parser.Parser):
attributes[k] = tf.gather(v, indices)
# Assigns anchors.
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size)
if input_anchor is None:
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size,
)
anchor_boxes = input_anchor(image_size=(image_height, image_width))
if anchor_labeler is None:
anchor_labeler = anchor.AnchorLabeler(
self._match_threshold, self._unmatched_threshold
match_threshold=self._match_threshold,
unmatched_threshold=self._unmatched_threshold,
box_coder_weights=self._box_coder_weights,
)
(cls_targets, box_targets, att_targets, cls_weights,
box_weights) = anchor_labeler.label_anchors(
......@@ -276,7 +312,7 @@ class Parser(parser.Parser):
labels['attribute_targets'] = att_targets
return image, labels
def _parse_eval_data(self, data, anchor_labeler=None):
def _parse_eval_data(self, data, anchor_labeler=None, input_anchor=None):
"""Parses data for training and evaluation."""
classes = data['groundtruth_classes']
......@@ -296,13 +332,21 @@ class Parser(parser.Parser):
boxes = box_ops.denormalize_boxes(boxes, image_shape)
# Resizes and crops image.
if self._pad:
padded_size = preprocess_ops.compute_padded_size(
self._output_size, 2**self._max_level
)
else:
padded_size = self._output_size
image, image_info = preprocess_ops.resize_and_crop_image(
image,
self._output_size,
padded_size=preprocess_ops.compute_padded_size(self._output_size,
2**self._max_level),
padded_size=padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0)
aug_scale_max=1.0,
)
image = tf.ensure_shape(image, padded_size + [3])
image_height, image_width, _ = image.get_shape().as_list()
# Resizes and crops boxes.
......@@ -318,16 +362,21 @@ class Parser(parser.Parser):
attributes[k] = tf.gather(v, indices)
# Assigns anchors.
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size)
if input_anchor is None:
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size,
)
anchor_boxes = input_anchor(image_size=(image_height, image_width))
if anchor_labeler is None:
anchor_labeler = anchor.AnchorLabeler(
self._match_threshold, self._unmatched_threshold
match_threshold=self._match_threshold,
unmatched_threshold=self._unmatched_threshold,
box_coder_weights=self._box_coder_weights,
)
(cls_targets, box_targets, att_targets, cls_weights,
box_weights) = anchor_labeler.label_anchors(
......
......@@ -45,7 +45,8 @@ def process_image(image: tf.Tensor,
min_area_ratio: float = 0.49,
max_area_ratio: float = 1.0,
augmenter: Optional[augment.ImageAugment] = None,
seed: Optional[int] = None) -> tf.Tensor:
seed: Optional[int] = None,
input_image_format: Optional[str] = 'jpeg') -> tf.Tensor:
"""Processes a serialized image tensor.
Args:
......@@ -78,6 +79,8 @@ def process_image(image: tf.Tensor,
max_area_ratio: The maximum area range for cropping.
augmenter: Image augmenter to distort each image.
seed: A deterministic seed to use when sampling.
input_image_format: The format of the input image, which can be 'jpeg',
'png', or 'none' for unknown or mixed datasets.
Returns:
Processed frames. Tensor of shape
......@@ -93,6 +96,10 @@ def process_image(image: tf.Tensor,
raise ValueError('Random stride range should be >= 0, got {}'.format(
random_stride_range))
if input_image_format not in ('jpeg', 'png', 'none'):
raise ValueError('Unknown input image format: {}'.format(
input_image_format))
if isinstance(crop_size, int):
crop_size = (crop_size, crop_size)
crop_height, crop_width = crop_size
......@@ -120,7 +127,7 @@ def process_image(image: tf.Tensor,
# Decode the raw image string to tf.uint8.
if image.dtype == tf.string:
image = preprocess_ops_3d.decode_jpeg(image, num_channels)
image = preprocess_ops_3d.decode_image(image, num_channels)
if is_training:
# Standard image data augmentation: random resized crop and random flip.
......@@ -295,6 +302,7 @@ class Parser(parser.Parser):
self._max_aspect_ratio = input_params.aug_max_aspect_ratio
self._min_area_ratio = input_params.aug_min_area_ratio
self._max_area_ratio = input_params.aug_max_area_ratio
self._input_image_format = input_params.input_image_format
if self._output_audio:
self._audio_feature = input_params.audio_feature
self._audio_shape = input_params.audio_feature_shape
......@@ -343,7 +351,8 @@ class Parser(parser.Parser):
min_area_ratio=self._min_area_ratio,
max_area_ratio=self._max_area_ratio,
augmenter=self._augmenter,
zero_centering_image=self._zero_centering_image)
zero_centering_image=self._zero_centering_image,
input_image_format=self._input_image_format)
image = tf.cast(image, dtype=self._dtype)
features = {'image': image}
......@@ -378,7 +387,8 @@ class Parser(parser.Parser):
crop_size=self._crop_size,
num_channels=self._num_channels,
num_crops=self._num_crops,
zero_centering_image=self._zero_centering_image)
zero_centering_image=self._zero_centering_image,
input_image_format=self._input_image_format)
image = tf.cast(image, dtype=self._dtype)
features = {'image': image}
......
......@@ -93,18 +93,18 @@ class FPN(tf.keras.Model):
'kernel_regularizer': kernel_regularizer,
'bias_regularizer': bias_regularizer,
}
if use_separable_conv:
conv2d = tf.keras.layers.SeparableConv2D
else:
conv2d = tf.keras.layers.Conv2D
conv2d = (
tf.keras.layers.SeparableConv2D
if use_separable_conv
else tf.keras.layers.Conv2D
)
norm = tf.keras.layers.BatchNormalization
activation_fn = tf_utils.get_activation(activation, use_keras_layer=True)
# Build input feature pyramid.
if tf.keras.backend.image_data_format() == 'channels_last':
bn_axis = -1
else:
bn_axis = 1
bn_axis = (
-1 if tf.keras.backend.image_data_format() == 'channels_last' else 1
)
# Get input feature pyramid from backbone.
logging.info('FPN input_specs: %s', input_specs)
......@@ -191,7 +191,7 @@ class FPN(tf.keras.Model):
for level in range(min_level, max_level + 1)
}
super(FPN, self).__init__(inputs=inputs, outputs=feats, **kwargs)
super().__init__(inputs=inputs, outputs=feats, **kwargs)
def _build_input_pyramid(self, input_specs: Mapping[str, tf.TensorShape],
min_level: int):
......
......@@ -311,10 +311,13 @@ def build_retinanet(
_ = head(decoder_features)
# Add `input_image_size` into `tflite_post_processing_config`.
tflite_post_processing_config = generator_config.tflite_post_processing.as_dict(
tflite_post_processing_config = (
generator_config.tflite_post_processing.as_dict()
)
tflite_post_processing_config['input_image_size'] = (
input_specs.shape[1],
input_specs.shape[2],
)
tflite_post_processing_config['input_image_size'] = (input_specs.shape[1],
input_specs.shape[2])
detection_generator_obj = detection_generator.MultilevelDetectionGenerator(
apply_nms=generator_config.apply_nms,
pre_nms_top_k=generator_config.pre_nms_top_k,
......@@ -327,6 +330,7 @@ def build_retinanet(
tflite_post_processing_config=tflite_post_processing_config,
return_decoded=generator_config.return_decoded,
use_class_agnostic_nms=generator_config.use_class_agnostic_nms,
box_coder_weights=generator_config.box_coder_weights,
)
model = retinanet_model.RetinaNetModel(
......
......@@ -1128,6 +1128,7 @@ class MultilevelDetectionGenerator(tf.keras.layers.Layer):
nms_v3_refinements: Optional[int] = None,
return_decoded: Optional[bool] = None,
use_class_agnostic_nms: Optional[bool] = None,
box_coder_weights: list[float] | None = None,
**kwargs,
):
"""Initializes a multi-level detection generator.
......@@ -1162,6 +1163,10 @@ class MultilevelDetectionGenerator(tf.keras.layers.Layer):
regardless of whether `apply_nms` is True or not.
use_class_agnostic_nms: A `bool` of whether non max suppression is
operated on all the boxes using max scores across all classes.
box_coder_weights: An optional `list` of 4 positive floats to scale y, x,
h, and w when encoding box coordinates. If set to None, does not perform
scaling. For Faster RCNN, the open-source implementation recommends
using [10.0, 10.0, 5.0, 5.0].
**kwargs: Additional keyword arguments passed to Layer.
Raises:
......@@ -1186,6 +1191,7 @@ class MultilevelDetectionGenerator(tf.keras.layers.Layer):
'soft_nms_sigma': soft_nms_sigma,
'return_decoded': return_decoded,
'use_class_agnostic_nms': use_class_agnostic_nms,
'box_coder_weights': box_coder_weights,
}
# Don't store values that were not defined.
if pre_nms_top_k_sharding_block is not None:
......@@ -1257,7 +1263,11 @@ class MultilevelDetectionGenerator(tf.keras.layers.Layer):
raw_boxes_i,
[batch_size, num_locations * num_anchors_per_locations, 4],
)
boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i)
boxes_i = box_ops.decode_boxes(
raw_boxes_i,
anchor_boxes_i,
weights=self._config_dict['box_coder_weights'],
)
# Box clipping.
if image_shape is not None:
......
......@@ -348,6 +348,7 @@ class MultilevelDetectionGeneratorTest(
'tflite_post_processing_config': tflite_post_processing_config,
'return_decoded': False,
'use_class_agnostic_nms': False,
'box_coder_weights': None,
}
generator = detection_generator.MultilevelDetectionGenerator(**kwargs)
......
......@@ -15,6 +15,7 @@
"""Anchor box and labeler definition."""
import collections
import math
from typing import Dict, Optional, Tuple
# Import libraries
......@@ -78,9 +79,10 @@ class Anchor(object):
boxes_all = []
for level in range(self.min_level, self.max_level + 1):
boxes_l = []
feat_size = math.ceil(self.image_size[0] / 2**level)
stride = tf.cast(self.image_size[0] / feat_size, tf.float32)
for scale in range(self.num_scales):
for aspect_ratio in self.aspect_ratios:
stride = 2**level
intermediate_scale = 2 ** (scale / float(self.num_scales))
base_anchor_size = self.anchor_size * stride * intermediate_scale
aspect_x = aspect_ratio**0.5
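The rewritten stride above is derived from the actual feature-map size rather than fixed at 2**level, which matters when the image size is not divisible by 2**level. A quick worked example with illustrative numbers:
import math
image_size, level = 300, 5
feat_size = math.ceil(image_size / 2**level)  # ceil(300 / 32) = 10
stride = image_size / feat_size               # 30.0, vs. the old fixed 32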
......@@ -135,7 +137,12 @@ class Anchor(object):
class AnchorLabeler(object):
"""Labeler for dense object detector."""
def __init__(self, match_threshold=0.5, unmatched_threshold=0.5):
def __init__(
self,
match_threshold=0.5,
unmatched_threshold=0.5,
box_coder_weights=None,
):
"""Constructs anchor labeler to assign labels to anchors.
Args:
......@@ -145,6 +152,10 @@ class AnchorLabeler(object):
unmatched_threshold: a float number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
box_coder_weights: Optional `list` of 4 positive floats to scale y, x, h,
and w when encoding box coordinates. If set to None, does not perform
scaling. For Faster RCNN, the open-source implementation recommends
using [10.0, 10.0, 5.0, 5.0].
"""
self.similarity_calc = iou_similarity.IouSimilarity()
self.target_gather = target_gather.TargetGather()
......@@ -153,7 +164,9 @@ class AnchorLabeler(object):
indicators=[-1, -2, 1],
force_match_for_each_col=True,
)
self.box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder()
self.box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
scale_factors=box_coder_weights,
)
def label_anchors(
self,
......
......@@ -370,6 +370,12 @@ def encode_boxes(boxes, anchors, weights=None):
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
# Avoid inf in log below.
anchor_h += EPSILON
anchor_w += EPSILON
box_h += EPSILON
box_w += EPSILON
encoded_dy = (box_yc - anchor_yc) / anchor_h
encoded_dx = (box_xc - anchor_xc) / anchor_w
encoded_dh = tf.math.log(box_h / anchor_h)
......
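A worked toy example of the Faster RCNN box coder with the [10.0, 10.0, 5.0, 5.0] weights referenced throughout this change (values are illustrative, not repo code):
import math
# One ground-truth box and one anchor, as (ymin, xmin, ymax, xmax).
box = (10.0, 10.0, 50.0, 90.0)   # h=40, w=80, center=(30, 50)
anchor = (0.0, 0.0, 40.0, 80.0)  # h=40, w=80, center=(20, 40)
dy = (30.0 - 20.0) / 40.0        # 0.25
dx = (50.0 - 40.0) / 80.0        # 0.125
dh = math.log(40.0 / 40.0)       # 0.0
dw = math.log(80.0 / 80.0)       # 0.0
weights = (10.0, 10.0, 5.0, 5.0)
encoded = [t * w for t, w in zip((dy, dx, dh, dw), weights)]
# -> [2.5, 1.25, 0.0, 0.0]; decoding divides by the same weights.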
......@@ -29,6 +29,7 @@ MEAN_NORM = (0.485, 0.456, 0.406)
STDDEV_NORM = (0.229, 0.224, 0.225)
MEAN_RGB = tuple(255 * i for i in MEAN_NORM)
STDDEV_RGB = tuple(255 * i for i in STDDEV_NORM)
MEDIAN_RGB = (128.0, 128.0, 128.0)
# Alias for convenience. PLEASE use `box_ops.horizontal_flip_boxes` directly.
horizontal_flip_boxes = box_ops.horizontal_flip_boxes
......
......@@ -198,12 +198,36 @@ def decode_jpeg(image_string: tf.Tensor, channels: int = 0) -> tf.Tensor:
dtype=tf.uint8)
def crop_image(frames: tf.Tensor,
target_height: int,
target_width: int,
random: bool = False,
num_crops: int = 1,
seed: Optional[int] = None) -> tf.Tensor:
def decode_image(image_string: tf.Tensor, channels: int = 0) -> tf.Tensor:
"""Decodes PNG or JPEG raw bytes string into a RGB uint8 Tensor.
Args:
image_string: A `tf.Tensor` of type strings with the raw PNG or JPEG bytes
where the first dimension is timesteps.
channels: Number of channels of the decoded image. Allowed values are 0, 1,
and 3. If 0, the number of channels is determined at runtime and no
static shape is set.
Returns:
A Tensor of shape [T, H, W, C] of type uint8 with the decoded images.
"""
return tf.map_fn(
lambda x: tf.image.decode_image( # pylint: disable=g-long-lambda
x, channels=channels, expand_animations=False),
image_string,
back_prop=False,
dtype=tf.uint8,
)
def crop_image(
frames: tf.Tensor,
target_height: int,
target_width: int,
random: bool = False,
num_crops: int = 1,
seed: Optional[int] = None,
) -> tf.Tensor:
"""Crops the image sequence of images.
If requested size is bigger than image size, image is padded with 0. If not
......
......@@ -96,6 +96,33 @@ class ParserUtilsTest(tf.test.TestCase):
self.assertEqual(decoded_image.shape.as_list()[3], 3)
self.assertAllEqual(decoded_image.shape, (2, 263, 320, 3))
def test_decode_image(self):
# Create a random RGB JPEG image.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
with io.BytesIO() as buffer:
random_image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
raw_image = tf.constant([raw_image_bytes, raw_image_bytes])
decoded_image = preprocess_ops_3d.decode_image(raw_image, 3)
self.assertEqual(decoded_image.shape.as_list()[3], 3)
self.assertAllEqual(decoded_image.shape, (2, 263, 320, 3))
# Create a random RGB PNG image.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
with io.BytesIO() as buffer:
random_image.save(buffer, format='PNG')
raw_image_bytes = buffer.getvalue()
raw_image = tf.constant([raw_image_bytes, raw_image_bytes])
decoded_image = preprocess_ops_3d.decode_image(raw_image, 3)
self.assertEqual(decoded_image.shape.as_list()[3], 3)
self.assertAllEqual(decoded_image.shape, (2, 263, 320, 3))
def test_crop_image(self):
cropped_image_1 = preprocess_ops_3d.crop_image(self._frames, 50, 70)
cropped_image_2 = preprocess_ops_3d.crop_image(self._frames, 200, 200)
......
......@@ -127,6 +127,9 @@ class RetinaNetTask(base_task.Task):
dtype=params.dtype,
match_threshold=params.parser.match_threshold,
unmatched_threshold=params.parser.unmatched_threshold,
box_coder_weights=(
self.task_config.model.detection_generator.box_coder_weights
),
aug_type=params.parser.aug_type,
aug_rand_hflip=params.parser.aug_rand_hflip,
aug_scale_min=params.parser.aug_scale_min,
......