...
 
Commits (18)
    https://gitcode.net/weixin_42428077/models/-/commit/4a07d460298b96ff2d3734c12cf11949c8b4b718 | No public description | 2023-07-26T09:49:03-07:00 | Chaochao Yan <allenyan@google.com> | PiperOrigin-RevId: 551234957
    https://gitcode.net/weixin_42428077/models/-/commit/f54ccf5aa6026efc68184f8c2d2ab4933cd56cc1 | No public description | 2023-07-26T09:53:01-07:00 | Chaochao Yan <allenyan@google.com> | PiperOrigin-RevId: 551236095
    https://gitcode.net/weixin_42428077/models/-/commit/deb52a6f2f44cc11e9524fd2995954742b426d2c | No public description | 2023-07-27T21:55:45-07:00 | A. Unique TensorFlower <gardener@tensorflow.org> | PiperOrigin-RevId: 551733884
    https://gitcode.net/weixin_42428077/models/-/commit/311a39a94d8f9554a26984f2df626f9d9c0cbc15 | No public description | 2023-07-28T14:33:10-07:00 | A. Unique TensorFlower <gardener@tensorflow.org> | PiperOrigin-RevId: 551951832
    https://gitcode.net/weixin_42428077/models/-/commit/5be8f01f158bfad4620a2668390f704b4a1f025d | No public description | 2023-07-31T12:12:17-07:00 | A. Unique TensorFlower <gardener@tensorflow.org> | PiperOrigin-RevId: 552554960
    https://gitcode.net/weixin_42428077/models/-/commit/e22c7d3ef24d8a9def31ec99a31ad9b9a52dde48 | No public description | 2023-07-31T13:30:29-07:00 | Chaochao Yan <allenyan@google.com> | PiperOrigin-RevId: 552576382
    https://gitcode.net/weixin_42428077/models/-/commit/19b7f479a99072a2dfa9a330142a76f2707d712c | The coordinates of the detected boxes should be divided by 256.0 | 2023-07-31T13:42:07-07:00 | A. Unique TensorFlower <gardener@tensorflow.org> | PiperOrigin-RevId: 552579574
    https://gitcode.net/weixin_42428077/models/-/commit/f6bcd8f940901af8c192a81abdeb3649411fc000 | Add SWAP pooling to YT8M open-source code base. | 2023-07-31T15:14:38-07:00 | A. Unique TensorFlower <gardener@tensorflow.org> | PiperOrigin-RevId: 552603688
    https://gitcode.net/weixin_42428077/models/-/commit/beb316b421ed25d01a8c82ad6a40b9b0762d2b6f | No public description | 2023-08-01T21:32:58-07:00 | Fan Yang <fyangf@google.com> | PiperOrigin-RevId: 553017713
    https://gitcode.net/weixin_42428077/models/-/commit/3dc108e49ccf925e2dd5bdcbda52c7cea742214e | No public description | 2023-08-02T04:30:46-07:00 | Hongkun Yu <hongkuny@google.com> | PiperOrigin-RevId: 553104176
    https://gitcode.net/weixin_42428077/models/-/commit/129f900e5e8c2effc8bb0be4b17f4d3e278c000c | Removing argument `num_of_examples` of function `show_batch` is never used | 2023-08-02T14:42:19-07:00 | A. Unique TensorFlower <gardener@tensorflow.org> | PiperOrigin-RevId: 553260714
    https://gitcode.net/weixin_42428077/models/-/commit/1a1e207049e6a8d11d6af76c60ac2767165618b8 | No public description | 2023-08-02T16:47:11-07:00 | Chaochao Yan <allenyan@google.com> | PiperOrigin-RevId: 553295125
    https://gitcode.net/weixin_42428077/models/-/commit/febe4d8e973995005cf9de7e2c0b63d076bae4d1 | No public description | 2023-08-03T18:14:30-07:00 | Fan Yang <fyangf@google.com> | PiperOrigin-RevId: 553646554
    https://gitcode.net/weixin_42428077/models/-/commit/a782bd708a86574fb28aa9a2713b20780c9b5882 | No public description | 2023-08-03T21:59:15-07:00 | Chaochao Yan <allenyan@google.com> | PiperOrigin-RevId: 553684763
    https://gitcode.net/weixin_42428077/models/-/commit/7d241eee30c80ff0678a5f2484861df22e0f076f | No public description | 2023-08-03T23:17:45-07:00 | Chaochao Yan <allenyan@google.com> | PiperOrigin-RevId: 553700524
    https://gitcode.net/weixin_42428077/models/-/commit/c015ef733f405855b1112e985b19732f918be328 | No public description | 2023-08-03T23:59:09-07:00 | Chaochao Yan <allenyan@google.com> | PiperOrigin-RevId: 553708377
    https://gitcode.net/weixin_42428077/models/-/commit/2b67cb594ef806503e75cae54584d1cebdd1f651 | No public description | 2023-08-08T12:57:35-07:00 | A. Unique TensorFlower <gardener@tensorflow.org> | PiperOrigin-RevId: 554913917
    https://gitcode.net/weixin_42428077/models/-/commit/15049a1b22542e357a49c1be510955e39da69c0b | No public description | 2023-08-08T12:58:58-07:00 | Fan Yang <fyangf@google.com> | PiperOrigin-RevId: 554914277
......@@ -745,7 +745,7 @@
},
"outputs": [],
"source": [
"def show_batch(raw_records, num_of_examples):\n",
"def show_batch(raw_records):\n",
" plt.figure(figsize=(20, 20))\n",
" use_normalized_coordinates=True\n",
" min_score_thresh = 0.30\n",
......@@ -802,7 +802,7 @@
"\n",
"train_tfrecords = tf.io.gfile.glob(exp_config.task.train_data.input_path)\n",
"raw_records = tf.data.TFRecordDataset(train_tfrecords).shuffle(buffer_size=buffer_size).take(num_of_examples)\n",
"show_batch(raw_records, num_of_examples)"
"show_batch(raw_records)"
]
},
{
......@@ -962,7 +962,7 @@
"\n",
"test_tfrecords = tf.io.gfile.glob('./lvis_tfrecords/val*')\n",
"test_ds = tf.data.TFRecordDataset(test_tfrecords).take(num_of_examples)\n",
"show_batch(test_ds, num_of_examples)"
"show_batch(test_ds)"
]
},
{
......@@ -1095,7 +1095,7 @@
" detection_masks = tf.convert_to_tensor(result['detection_masks'][0])\n",
" detection_boxes = tf.convert_to_tensor(result['detection_boxes'][0])\n",
" detection_masks_reframed = reframe_box_masks_to_image_masks(\n",
" detection_masks, detection_boxes/255.0,\n",
" detection_masks, detection_boxes/256.0,\n",
" image_np.shape[0], image_np.shape[1])\n",
" detection_masks_reframed = tf.cast(\n",
" detection_masks_reframed \u003e min_score_thresh,\n",
......
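
The 255.0 -> 256.0 change above tracks commit 19b7f479 ("The coordinates of the detected boxes should be divided by 256.0"): `reframe_box_masks_to_image_masks` expects box coordinates normalized to [0, 1], and the exported detector emits boxes in the pixel space of its input image, so the correct divisor is the input resolution itself (256, not 255). A minimal sketch of that normalization, with assumed values:

import tensorflow as tf

input_size = 256.0  # assumed detector input resolution
boxes_px = tf.constant([[64.0, 32.0, 192.0, 224.0]])  # [N, 4] pixel coords
boxes_norm = boxes_px / input_size  # [[0.25, 0.125, 0.75, 0.875]], in [0, 1]
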
......@@ -46,6 +46,7 @@ def run_experiment(
model_dir: str,
run_post_eval: bool = False,
trainer: base_trainer.MultiTaskBaseTrainer = None,
eval_summary_manager: Optional[orbit.utils.SummaryManagerInterface] = None,
best_ckpt_exporter_creator: Optional[Any] = train_utils
.maybe_create_best_ckpt_exporter
) -> Union[base_model.MultiTaskBaseModel, Tuple[base_model.MultiTaskBaseModel,
......@@ -64,6 +65,10 @@ def run_experiment(
are returned.
trainer: (optional) A multi-task trainer to use. If none is provided, a
default one will be created based on `params`.
eval_summary_manager: Instance of the eval summary manager. If set,
`eval_summary_dir` is ignored; otherwise an eval summary manager for
TensorBoard summaries is created internally from `eval_summary_dir` by
default.
best_ckpt_exporter_creator: A functor for creating best checkpoint exporter.
Returns:
......@@ -117,6 +122,7 @@ def run_experiment(
checkpoint_manager=checkpoint_manager,
summary_dir=os.path.join(model_dir, 'train'),
eval_summary_dir=os.path.join(model_dir, 'validation'),
eval_summary_manager=eval_summary_manager,
summary_interval=params.trainer.summary_interval)
logging.info('Starts to execute mode: %s', mode)
......@@ -162,6 +168,7 @@ def run_experiment_with_multitask_eval(
run_post_eval: bool = False,
save_summary: bool = True,
trainer: Optional[core_lib.Trainer] = None,
eval_summary_manager: Optional[orbit.utils.SummaryManagerInterface] = None,
best_ckpt_exporter_creator: Optional[Any] = train_utils
.maybe_create_best_ckpt_exporter,
) -> Tuple[Any, Any]:
......@@ -181,6 +188,10 @@ def run_experiment_with_multitask_eval(
trainer: the core_lib.Trainer instance. It should be created within the
strategy.scope(). If not provided, an instance will be created by default
if `mode` contains 'train'.
eval_summary_manager: Instance of the eval summary manager. If set,
`eval_summary_dir` is ignored; otherwise an eval summary manager for
TensorBoard summaries is created internally from `eval_summary_dir` by
default.
best_ckpt_exporter_creator: A functor for creating best checkpoint exporter.
Returns:
......@@ -253,6 +264,7 @@ def run_experiment_with_multitask_eval(
summary_dir=os.path.join(model_dir, 'train') if save_summary else None,
eval_summary_dir=os.path.join(model_dir, 'validation') if
(save_summary) else None,
eval_summary_manager=eval_summary_manager,
summary_interval=params.trainer.summary_interval if
(save_summary) else None)
......
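
The new `eval_summary_manager` argument lets callers replace the manager that Orbit would otherwise build from `eval_summary_dir`. A hedged sketch of constructing one with Orbit's stock implementation (the directory name here is hypothetical):

import os

import orbit
import tensorflow as tf

model_dir = '/tmp/yt8m_experiment'  # hypothetical path
eval_summary_manager = orbit.utils.SummaryManager(
    os.path.join(model_dir, 'validation_custom'), tf.summary.scalar)
# Then pass eval_summary_manager=eval_summary_manager to run_experiment().
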
......@@ -14,8 +14,7 @@ gin-config
tf_slim>=1.1.0
Cython
matplotlib
# Loader becomes a required positional argument in 6.0 in yaml.load
pyyaml>=5.1,<6.0
pyyaml
# CV related dependencies
opencv-python-headless==4.5.2.52
Pillow
......
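
The pin on `pyyaml<6.0` existed because PyYAML 6 made the `Loader` argument to `yaml.load` mandatory; unpinning implies call sites either pass a loader explicitly or use `safe_load`. A minimal sketch of the two forms that remain valid under 6.x:

import yaml

doc = 'learning_rate: 0.1'
cfg = yaml.safe_load(doc)                     # preferred shorthand
cfg = yaml.load(doc, Loader=yaml.SafeLoader)  # explicit Loader, required by 6.x
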
......@@ -75,6 +75,7 @@ class FunnelEncoderConfig(hyperparams.Config):
norm_first: bool = False
share_rezero: bool = False
append_dense_inputs: bool = False
transformer_cls: str = "TransformerEncoderBlock"
@dataclasses.dataclass
......@@ -559,6 +560,7 @@ def build_encoder(config: EncoderConfig,
norm_first=encoder_cfg.norm_first,
share_rezero=encoder_cfg.share_rezero,
append_dense_inputs=encoder_cfg.append_dense_inputs,
transformer_cls=encoder_cfg.transformer_cls,
)
if encoder_type == "kernel":
......
......@@ -113,6 +113,8 @@ class Pix2Seq(hyperparams.Config):
drop_units: float = 0.1
drop_att: float = 0.0
norm_first: bool = True
top_k: int = 0
top_p: float = 0.4
@dataclasses.dataclass
......
......@@ -236,6 +236,8 @@ class Pix2Seq(tf.keras.Model):
drop_path=0.1,
drop_units=0.1,
drop_att=0.0,
top_k=0,
top_p=0.4,
**kwargs
):
super().__init__(**kwargs)
......@@ -271,6 +273,8 @@ class Pix2Seq(tf.keras.Model):
drop_units=self._drop_units,
drop_att=self._drop_att,
)
self._top_k = top_k
self._top_p = top_p
@property
def backbone(self) -> tf.keras.Model:
......@@ -292,6 +296,8 @@ class Pix2Seq(tf.keras.Model):
"drop_path": self._drop_path,
"drop_units": self._drop_units,
"drop_att": self._drop_att,
"top_k": self._top_k,
"top_p": self._top_p,
}
@classmethod
......@@ -350,11 +356,15 @@ class Pix2Seq(tf.keras.Model):
training,
)
else:
tokens, logits = self._transformer.infer({
"inputs": features,
"tokens": targets,
"pos_emb": pos_emb,
})
tokens, logits = self._transformer.infer(
{
"inputs": features,
"tokens": targets,
"pos_emb": pos_emb,
},
top_k=self._top_k,
top_p=self._top_p,
)
return [tokens, logits]
......
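
The new `top_k`/`top_p` fields flow from the Pix2Seq config into `self._transformer.infer`, which samples the next token from a truncated distribution; the sampling itself lives inside the transformer and is not shown in this diff. A minimal sketch of the standard top-k plus nucleus (top-p) logit filtering, under the usual conventions rather than the repo's exact implementation:

import tensorflow as tf

def filter_logits(logits, top_k=0, top_p=1.0):
  """Masks logits outside the top-k / top-p set. logits: [batch, vocab]."""
  if top_k > 0:
    kth_largest = tf.math.top_k(logits, k=top_k).values[:, -1:]
    logits = tf.where(logits < kth_largest,
                      tf.fill(tf.shape(logits), logits.dtype.min), logits)
  if 0.0 < top_p < 1.0:
    sorted_logits = tf.sort(logits, axis=-1, direction='DESCENDING')
    cum_probs = tf.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1)
    keep = cum_probs <= top_p
    # Always keep at least the single most likely token.
    keep = tf.concat([tf.ones_like(keep[:, :1]), keep[:, :-1]], axis=-1)
    # Smallest logit still inside the nucleus, per row.
    min_keep = tf.reduce_min(
        tf.where(keep, sorted_logits, sorted_logits[:, :1]),
        axis=-1, keepdims=True)
    logits = tf.where(logits < min_keep,
                      tf.fill(tf.shape(logits), logits.dtype.min), logits)
  return logits

# next_token = tf.random.categorical(filter_logits(logits, top_p=0.4), 1)
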
......@@ -51,8 +51,9 @@ class DataConfig(cfg.DataConfig):
temporal_stride: Not used. To be deprecated.
max_frames: Maximum number of frames in an input example. It is used to crop
the input in the temporal dimension.
sample_random_frames: If sample random frames.
num_sample_frames: Number of frames to sample for each input example.
sample_random_frames: Whether to sample random frames or a random sequence.
num_sample_frames: Number of frames to sample for each input example. No
frame sampling if None.
num_classes: Number of classes to classify. Assuming it is a classification
task.
num_devices: Not used. To be deprecated.
......
......@@ -358,14 +358,23 @@ class Parser(parser.Parser):
if not self._include_video_id and "id" in decoded_tensors:
del decoded_tensors["id"]
# Valid `num_frames` comes from _concat_features().
outputs = self._process_label(video_matrix, num_frames, decoded_tensors)
if self._num_sample_frames is not None:
if self._num_sample_frames is None:
# Padding to max_frames.
outputs["video_matrix"] = resize_axis(
outputs["video_matrix"], 1, self._max_frames
)
else:
outputs["video_matrix"] = utils.sample_video_frames(
outputs["video_matrix"],
tf.reshape(outputs["num_frames"], [-1, 1]),
random_frames=self._sample_random_frames,
num_sample_frames=self._num_sample_frames,
)
outputs["num_frames"] = (
tf.ones_like(outputs["num_frames"]) * self._num_sample_frames
)
return outputs
def _parse_eval_data(self, decoded_tensors):
......@@ -379,13 +388,21 @@ class Parser(parser.Parser):
del decoded_tensors["id"]
outputs = self._process_label(video_matrix, num_frames, decoded_tensors)
if self._num_sample_frames is not None:
if self._num_sample_frames is None:
# Padding to max_frames.
outputs["video_matrix"] = resize_axis(
outputs["video_matrix"], 1, self._max_frames
)
else:
outputs["video_matrix"] = utils.sample_video_frames(
outputs["video_matrix"],
tf.reshape(outputs["num_frames"], [-1, 1]),
random_frames=self._sample_random_frames,
num_sample_frames=self._num_sample_frames,
)
outputs["num_frames"] = (
tf.ones_like(outputs["num_frames"]) * self._num_sample_frames
)
return outputs
def _process_label(self, video_matrix, num_frames, contexts):
......@@ -488,7 +505,9 @@ class PostBatchProcessor():
def __init__(self, input_params: exp_cfg.DataConfig):
self.segment_labels = input_params.segment_labels
self.num_classes = input_params.num_classes
self.num_sample_frames = input_params.num_sample_frames
self.num_batched_frames = (
input_params.num_sample_frames or input_params.max_frames
)
self.num_features = sum(input_params.feature_sizes)
def post_fn(self, batched_tensors: Dict[str,
......@@ -500,12 +519,13 @@ class PostBatchProcessor():
num_frames = batched_tensors["num_frames"]
if self.segment_labels:
# [batch x num_segment x num_sample_frames x num_features]
# -> [batch * num_segment x num_sample_frames x num_features]
# [batch x num_segment x num_batched_frames x num_features]
# -> [batch * num_segment x num_batched_frames x num_features]
if video_ids is not None:
video_ids = tf.reshape(video_ids, [-1])
video_matrix = tf.reshape(video_matrix,
[-1, self.num_sample_frames, self.num_features])
video_matrix = tf.reshape(
video_matrix, [-1, self.num_batched_frames, self.num_features]
)
labels = tf.reshape(labels, [-1, self.num_classes])
num_frames = tf.reshape(num_frames, [-1, 1])
batched_tensors["label_weights"] = tf.reshape(
......
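
Both parsers above now branch the same way: with `num_sample_frames=None` the video matrix is padded (or cropped) to a static `max_frames` and the true `num_frames` is kept for masking downstream; otherwise a fixed number of frames is sampled and `num_frames` is overwritten with that constant. A hedged sketch of the two branches (a hypothetical helper, not the repo's `utils.sample_video_frames`):

import tensorflow as tf

def pad_or_sample(video_matrix, num_frames, max_frames, num_sample_frames=None):
  """video_matrix: [1, frames, features]; num_frames: scalar valid count."""
  if num_sample_frames is None:
    # Pad (or crop) the temporal axis to a static max_frames.
    pad = tf.maximum(max_frames - tf.shape(video_matrix)[1], 0)
    video_matrix = tf.pad(video_matrix, [[0, 0], [0, pad], [0, 0]])
    return video_matrix[:, :max_frames, :]
  # Sample num_sample_frames indices from the valid range (with replacement).
  idx = tf.random.uniform(
      [num_sample_frames], 0, tf.cast(num_frames, tf.int32), dtype=tf.int32)
  return tf.gather(video_matrix, idx, axis=1)
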
......@@ -60,13 +60,15 @@ class Yt8mInputTest(parameterized.TestCase, tf.test.TestCase):
postprocess_fn=postprocess_fn,
transform_and_batch_fn=batch_fn)
@parameterized.parameters((True,), (False,))
def test_read_video_level_input(self, include_video_id):
@parameterized.parameters((True, 20), (False, 20), (False, None))
def test_read_video_level_input(self, include_video_id, num_sample_frames):
params = yt8m_configs.yt8m(is_training=False)
params.global_batch_size = 4
params.segment_labels = False
params.input_path = self.data_path
params.include_video_id = include_video_id
params.max_frames = 122
params.num_sample_frames = num_sample_frames
reader = self.create_input_reader(params)
dataset = reader.read()
......@@ -82,27 +84,40 @@ class Yt8mInputTest(parameterized.TestCase, tf.test.TestCase):
self.assertCountEqual(['video_matrix', 'labels', 'num_frames'],
example.keys())
batch_size = params.global_batch_size
expected_num_frames = num_sample_frames or params.max_frames
self.assertEqual(
example['video_matrix'].shape.as_list(),
[batch_size, params.num_sample_frames, sum(params.feature_sizes)],
[batch_size, expected_num_frames, sum(params.feature_sizes)],
)
self.assertEqual(
example['labels'].shape.as_list(), [batch_size, params.num_classes]
)
self.assertEqual(example['labels'].shape.as_list(),
[batch_size, params.num_classes])
# Check non-empty labels.
self.assertGreater(np.nonzero(example['labels'][0].numpy())[0].shape[0], 0)
self.assertEqual(example['num_frames'].shape.as_list(), [batch_size, 1])
if num_sample_frames:
self.assertAllEqual(
example['num_frames'].numpy(),
[[num_sample_frames]] * batch_size,
)
else:
self.assertAllEqual(
example['num_frames'].numpy(),
[[120], [121], [122], [122]],
)
if include_video_id:
self.assertEqual(example['video_ids'].shape.as_list(), [batch_size, 1])
@parameterized.parameters((True,), (False,))
def test_read_segment_level_input(self, include_video_id=False):
@parameterized.parameters((True, 20), (False, 20), (False, None))
def test_read_segment_level_input(self, include_video_id, num_sample_frames):
params = yt8m_configs.yt8m(is_training=False)
params.global_batch_size = 2
params.segment_labels = True
params.segment_size = 24
params.input_path = self.data_path
params.include_video_id = include_video_id
params.num_sample_frames = num_sample_frames
reader = self.create_input_reader(params)
dataset = reader.read()
......@@ -120,21 +135,35 @@ class Yt8mInputTest(parameterized.TestCase, tf.test.TestCase):
['video_matrix', 'labels', 'num_frames', 'label_weights'],
example.keys())
batch_size = params.global_batch_size * self.num_segment
expected_num_frames = num_sample_frames or params.max_frames
self.assertEqual(
example['video_matrix'].shape.as_list(),
[batch_size, params.num_sample_frames, sum(params.feature_sizes)],
[batch_size, expected_num_frames, sum(params.feature_sizes)],
)
self.assertEqual(example['labels'].shape.as_list(),
[batch_size, params.num_classes])
self.assertGreater(np.nonzero(example['labels'][0].numpy())[0].shape[0], 0)
self.assertEqual(example['num_frames'].shape.as_list(), [batch_size, 1])
self.assertEqual(example['label_weights'].shape.as_list(),
[batch_size, params.num_classes])
if num_sample_frames:
self.assertAllEqual(
example['num_frames'].numpy(),
[[num_sample_frames]] * batch_size,
)
else:
self.assertAllEqual(
example['num_frames'].numpy(),
[[params.segment_size]] * batch_size,
)
if include_video_id:
self.assertEqual(example['video_ids'].shape.as_list(), [batch_size])
@parameterized.parameters((True,), (False,))
def test_read_video_level_float_input(self, include_video_id):
@parameterized.parameters((True, 4), (False, 4), (False, None))
def test_read_video_level_float_input(
self, include_video_id, num_sample_frames
):
data_dir = os.path.join(self.get_temp_dir(), 'data2')
tf.io.gfile.makedirs(data_dir)
data_path = os.path.join(data_dir, 'data2.tfrecord')
......@@ -150,6 +179,7 @@ class Yt8mInputTest(parameterized.TestCase, tf.test.TestCase):
params.input_path = data_path
params.num_frames = 2
params.max_frames = 2
params.num_sample_frames = num_sample_frames
params.feature_names = ('VIDEO_EMBEDDING/context_feature/floats',
'FEATURE/feature/floats')
params.feature_sources = ('context', 'feature')
......@@ -191,9 +221,10 @@ class Yt8mInputTest(parameterized.TestCase, tf.test.TestCase):
# Check tensor shape.
batch_size = params.global_batch_size
expected_num_frames = params.num_sample_frames or params.max_frames
self.assertEqual(
example['video_matrix'].shape.as_list(),
[batch_size, params.num_sample_frames, sum(params.feature_sizes)],
[batch_size, expected_num_frames, sum(params.feature_sizes)],
)
self.assertEqual(example['labels'].shape.as_list(),
[batch_size, params.num_classes])
......@@ -201,5 +232,6 @@ class Yt8mInputTest(parameterized.TestCase, tf.test.TestCase):
if include_video_id:
self.assertEqual(example['video_ids'].shape.as_list(), [batch_size, 1])
if __name__ == '__main__':
tf.test.main()
......@@ -15,14 +15,15 @@
"""Dbof model definitions."""
import functools
from typing import Optional
from typing import Any, Optional
import tensorflow as tf
from official.modeling import hyperparams
from official.modeling import tf_utils
from official.projects.yt8m.configs import yt8m as yt8m_cfg
from official.projects.yt8m.modeling import yt8m_model_utils as utils
from official.projects.yt8m.modeling import nn_layers
from official.projects.yt8m.modeling import yt8m_model_utils
from official.vision.configs import common
from official.vision.modeling.backbones import factory
......@@ -30,7 +31,7 @@ from official.vision.modeling.backbones import factory
layers = tf.keras.layers
class Dbof(tf.keras.Model):
class Dbof(layers.Layer):
"""A YT8M model class builder.
Creates a Deep Bag of Frames model.
......@@ -61,10 +62,11 @@ class Dbof(tf.keras.Model):
l2_regularizer: An optional kernel weight regularizer.
**kwargs: keyword arguments to be passed.
"""
self._self_setattr_tracking = False
super().__init__(**kwargs)
self._input_specs = input_specs
self._params = params
self._norm_activation = norm_activation
self._l2_regularizer = l2_regularizer
self._act_fn = tf_utils.get_activation(self._norm_activation.activation)
self._norm = functools.partial(
layers.BatchNormalization,
......@@ -72,80 +74,90 @@ class Dbof(tf.keras.Model):
epsilon=self._norm_activation.norm_epsilon,
synchronized=self._norm_activation.use_sync_bn,
)
# [batch_size x num_frames x num_features]
feature_size = input_specs.shape[-1]
# shape 'excluding' batch_size
model_input = tf.keras.Input(shape=self._input_specs.shape[1:])
# normalize input features
input_data = tf.nn.l2_normalize(model_input, -1)
tf.summary.histogram("input_hist", input_data)
# configure model
if params.add_batch_norm:
input_data = self._norm(name="input_bn")(input_data)
# activation = reshaped input * cluster weights
if params.cluster_size > 0:
activation = layers.Dense(
params.cluster_size,
kernel_regularizer=l2_regularizer,
# Configure model batch norm layer.
if self._params.add_batch_norm:
self._input_bn = self._norm(name="input_bn")
self._cluster_bn = self._norm(name="cluster_bn")
self._hidden_bn = self._norm(name="hidden_bn")
else:
self._hidden_biases = self.add_weight(
name="hidden_biases",
shape=[self._params.hidden_size],
initializer=tf.random_normal_initializer(stddev=0.01),
)
self._cluster_biases = self.add_weight(
name="cluster_biases",
shape=[self._params.cluster_size],
initializer=tf.random_normal_initializer(
stddev=1.0 / tf.math.sqrt(feature_size)
),
)
if self._params.use_context_gate_cluster_layer:
self._context_gate = nn_layers.ContextGate(
normalizer_fn=self._norm,
pooling_method=None,
hidden_layer_size=self._params.context_gate_cluster_bottleneck_size,
kernel_regularizer=self._l2_regularizer,
name="context_gate_cluster",
)
self._hidden_dense = layers.Dense(
self._params.hidden_size,
kernel_regularizer=self._l2_regularizer,
kernel_initializer=tf.random_normal_initializer(
stddev=1.0 / tf.sqrt(tf.cast(self._params.cluster_size, tf.float32))
),
name="hidden_dense",
)
if self._params.cluster_size > 0:
self._cluster_dense = layers.Dense(
self._params.cluster_size,
kernel_regularizer=self._l2_regularizer,
kernel_initializer=tf.random_normal_initializer(
stddev=1 / tf.sqrt(tf.cast(feature_size, tf.float32))
stddev=1.0 / tf.sqrt(tf.cast(feature_size, tf.float32))
),
)(input_data)
else:
activation = input_data
name="cluster_dense",
)
if params.add_batch_norm:
activation = self._norm(name="cluster_bn")(activation)
else:
cluster_biases = tf.Variable(
tf.random_normal_initializer(stddev=1 / tf.math.sqrt(feature_size))(
shape=[params.cluster_size]),
name="cluster_biases")
tf.summary.histogram("cluster_biases", cluster_biases)
activation += cluster_biases
def call(
self, inputs: tf.Tensor, num_frames: Any = None,
) -> tf.Tensor:
# L2 normalize input features
activation = tf.nn.l2_normalize(inputs, -1)
activation = self._act_fn(activation)
tf.summary.histogram("cluster_output", activation)
if self._params.add_batch_norm:
activation = self._input_bn(activation)
if params.use_context_gate_cluster_layer:
pooling_method = None
norm_args = dict(name="context_gate_bn")
activation = utils.context_gate(
activation,
normalizer_fn=self._norm,
normalizer_params=norm_args,
pooling_method=pooling_method,
hidden_layer_size=params.context_gate_cluster_bottleneck_size,
kernel_regularizer=l2_regularizer)
if self._params.cluster_size > 0:
activation = self._cluster_dense(activation)
if self._params.add_batch_norm:
activation = self._cluster_bn(activation)
if not self._params.add_batch_norm:
activation += self._cluster_biases
activation = utils.frame_pooling(activation, params.pooling_method)
activation = self._act_fn(activation)
# activation = activation * hidden1_weights
activation = layers.Dense(
params.hidden_size,
kernel_regularizer=l2_regularizer,
kernel_initializer=tf.random_normal_initializer(
stddev=1 / tf.sqrt(tf.cast(params.cluster_size, tf.float32))))(
activation)
if self._params.use_context_gate_cluster_layer:
activation = self._context_gate(activation)
if params.add_batch_norm:
activation = self._norm(name="hidden1_bn")(activation)
activation = yt8m_model_utils.frame_pooling(
activation,
method=self._params.pooling_method,
num_frames=num_frames,
)
activation = self._hidden_dense(activation)
if self._params.add_batch_norm:
activation = self._hidden_bn(activation)
else:
hidden1_biases = tf.Variable(
tf.random_normal_initializer(stddev=0.01)(shape=[params.hidden_size]),
name="hidden1_biases")
tf.summary.histogram("hidden1_biases", hidden1_biases)
activation += hidden1_biases
activation += self._hidden_biases
activation = self._act_fn(activation)
tf.summary.histogram("hidden1_output", activation)
super().__init__(inputs=model_input, outputs=activation, **kwargs)
return activation
@factory.register_backbone_builder("dbof")
......@@ -161,10 +173,14 @@ def build_dbof(
backbone_cfg = backbone_config.get()
assert backbone_type == "dbof", f"Inconsistent backbone type {backbone_type}"
return Dbof(
dbof = Dbof(
input_specs=input_specs,
params=backbone_cfg,
norm_activation=norm_activation_config,
l2_regularizer=l2_regularizer,
**kwargs,
)
# Warmup calls to build model variables.
dbof(tf.keras.Input(input_specs.shape[1:]))
return dbof
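
The explicit warmup call is needed because a Keras layer only creates its variables on first call; invoking the backbone once with a symbolic input materializes the weights so that, for example, checkpoints can be restored before training starts. The pattern in miniature:

import tensorflow as tf

layer = tf.keras.layers.Dense(8)
assert not layer.variables             # no weights created yet
layer(tf.keras.Input(shape=[16]))      # warmup call builds the variables
assert len(layer.variables) == 2       # kernel + bias now exist
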
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for dbof."""
from absl.testing import parameterized
import tensorflow as tf
from official.projects.yt8m.configs import yt8m as yt8m_cfg
from official.projects.yt8m.modeling.backbones import dbof
class DbofTest(parameterized.TestCase, tf.test.TestCase):
"""Class for testing nn_layers."""
@parameterized.product(
pooling_method=["average", "max", "swap"],
use_context_gate_cluster_layer=[True, False],
context_gate_cluster_bottleneck_size=[0, 8],
)
def test_dbof_backbone(
self,
pooling_method,
use_context_gate_cluster_layer,
context_gate_cluster_bottleneck_size,
):
"""Test for creation of a context gate layer."""
model_cfg = yt8m_cfg.DbofModel(
cluster_size=30,
hidden_size=20,
pooling_method=pooling_method,
use_context_gate_cluster_layer=use_context_gate_cluster_layer,
context_gate_cluster_bottleneck_size=context_gate_cluster_bottleneck_size,
)
backbone = dbof.Dbof(
input_specs=tf.keras.layers.InputSpec(shape=[None, None, 32]),
params=model_cfg,
)
inputs = tf.ones([2, 24, 32], dtype=tf.float32)
outputs = backbone(inputs, num_frames=tf.constant([24, 16]))
self.assertAllEqual(outputs.shape.as_list(), [2, 20])
if __name__ == "__main__":
tf.test.main()
......@@ -22,12 +22,11 @@ import tensorflow as tf
layers = tf.keras.layers
class LogisticModel(tf.keras.Model):
class LogisticModel(layers.Layer):
"""Logistic prediction head model with L2 regularization."""
def __init__(
self,
input_specs: layers.InputSpec = layers.InputSpec(shape=[None, 128]),
vocab_size: int = 3862,
return_logits: bool = False,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
......@@ -36,22 +35,32 @@ class LogisticModel(tf.keras.Model):
"""Creates a logistic model.
Args:
input_specs: 'batch' x 'num_features' matrix of input features.
vocab_size: The number of classes in the dataset.
return_logits: if True also return logits.
l2_regularizer: An optional L2 weight regularizer.
**kwargs: extra key word args.
"""
super().__init__(**kwargs)
self._return_logits = return_logits
self._dense = layers.Dense(vocab_size, kernel_regularizer=l2_regularizer)
def call(
self,
inputs: tf.Tensor,
):
"""Logistic model forward call.
Args:
inputs: 'batch' x 'num_features' matrix of input features.
Returns:
A dictionary with a tensor containing the probability predictions of the
model in the 'predictions' key. The dimensions of the tensor are
batch_size x num_classes.
"""
inputs = tf.keras.Input(shape=input_specs.shape[1:])
logits = layers.Dense(vocab_size, kernel_regularizer=l2_regularizer)(inputs)
logits = self._dense(inputs)
outputs = {"predictions": tf.nn.sigmoid(logits)}
if return_logits:
if self._return_logits:
outputs.update({"logits": logits})
super().__init__(inputs=inputs, outputs=outputs, **kwargs)
return outputs
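
This rewrite follows the pattern applied throughout the diff: the old heads were functional `tf.keras.Model` subclasses that built a graph inside `__init__` via `super().__init__(inputs=..., outputs=...)`; the new versions are plain layers that create sublayers in `__init__` and compute in `call()`. The shape of the pattern, in miniature:

import tensorflow as tf

class Head(tf.keras.layers.Layer):

  def __init__(self, vocab_size=3862, **kwargs):
    super().__init__(**kwargs)
    self._dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs):
    return {'predictions': tf.nn.sigmoid(self._dense(inputs))}

head = Head(vocab_size=10)
outputs = head(tf.random.normal([2, 128]))['predictions']  # shape [2, 10]
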
......@@ -19,18 +19,17 @@ from typing import Any, Optional
import tensorflow as tf
from official.projects.yt8m.modeling import yt8m_model_utils as utils
from official.projects.yt8m.modeling import nn_layers
layers = tf.keras.layers
class MoeModel(tf.keras.Model):
class MoeModel(layers.Layer):
"""A softmax over a mixture of logistic models (with L2 regularization)."""
def __init__(
self,
input_specs: layers.InputSpec = layers.InputSpec(shape=[None, 128]),
vocab_size: int = 3862,
num_mixtures: int = 2,
use_input_context_gate: bool = False,
......@@ -45,8 +44,8 @@ class MoeModel(tf.keras.Model):
The model consists of a per-class softmax distribution over a
configurable number of logistic classifiers. One of the classifiers
in the mixture is not trained, and always predicts 0.
Args:
input_specs: 'batch_size' x 'num_features' matrix of input features.
vocab_size: The number of classes in the dataset.
num_mixtures: The number of mixtures (excluding a dummy 'expert' that
always predicts the non-existence of an entity).
......@@ -59,61 +58,95 @@ class MoeModel(tf.keras.Model):
be padded to 128, and the second to last will be padded to 8.
l2_regularizer: An optional L2 weight regularizer.
**kwargs: extra key word args.
"""
inputs = tf.keras.Input(shape=input_specs.shape[1:])
model_input = inputs
super().__init__(**kwargs)
self._vocab_size = vocab_size
self._num_mixtures = num_mixtures
self._use_input_context_gate = use_input_context_gate
self._use_output_context_gate = use_output_context_gate
self._vocab_as_last_dim = vocab_as_last_dim
self._normalizer_params = normalizer_params
self._l2_regularizer = l2_regularizer
if use_input_context_gate:
model_input = utils.context_gate(
model_input,
self._input_context_gate = nn_layers.ContextGate(
normalizer_fn=layers.BatchNormalization,
normalizer_params=normalizer_params,
name="input_context_gate",
)
if use_output_context_gate:
self._output_context_gate = nn_layers.ContextGate(
normalizer_fn=layers.BatchNormalization,
normalizer_params=normalizer_params,
name="output_context_gate",
)
gate_activations = layers.Dense(
self._gate_dense = layers.Dense(
vocab_size * (num_mixtures + 1),
activation=None,
bias_initializer=None,
kernel_regularizer=l2_regularizer)(
model_input)
expert_activations = layers.Dense(
kernel_regularizer=l2_regularizer,
name="gate",
)
self._expert_dense = layers.Dense(
vocab_size * num_mixtures,
activation=None,
kernel_regularizer=l2_regularizer)(
model_input)
kernel_regularizer=l2_regularizer,
name="expert",
)
def call(self, inputs: tf.Tensor) -> dict[str, tf.Tensor]:
"""MoE forward call.
Args:
inputs: 'batch_size' x 'num_features' matrix of input features.
Returns:
A dictionary with a tensor containing the probability predictions
of the model in the 'predictions' key. The dimensions of the tensor
are batch_size x num_classes.
"""
if vocab_as_last_dim:
if self._use_input_context_gate:
inputs = self._input_context_gate(inputs)
gate_activations = self._gate_dense(inputs)
expert_activations = self._expert_dense(inputs)
if self._vocab_as_last_dim:
# Batch x (num_mixtures + 1) x #Labels
gate_activations = tf.reshape(
gate_activations, [-1, num_mixtures + 1, vocab_size])
gate_activations, [-1, self._num_mixtures + 1, self._vocab_size]
)
# Batch x num_mixtures x #Labels
expert_activations = tf.reshape(
expert_activations, [-1, num_mixtures, vocab_size])
expert_activations,
[-1, self._num_mixtures, self._vocab_size],
)
else:
# (Batch * #Labels) x (num_mixtures + 1)
gate_activations = tf.reshape(gate_activations, [-1, num_mixtures + 1])
gate_activations = tf.reshape(
gate_activations,
[-1, self._num_mixtures + 1],
)
# (Batch * #Labels) x num_mixtures
expert_activations = tf.reshape(expert_activations, [-1, num_mixtures])
expert_activations = tf.reshape(
expert_activations,
[-1, self._num_mixtures],
)
gating_distribution = tf.nn.softmax(gate_activations, axis=1)
expert_distribution = tf.nn.sigmoid(expert_activations)
final_probabilities = tf.reduce_sum(
gating_distribution[:, :num_mixtures] * expert_distribution, axis=1)
gating_distribution[:, : self._num_mixtures] * expert_distribution,
axis=1,
)
if not vocab_as_last_dim:
final_probabilities = tf.reshape(final_probabilities, [-1, vocab_size])
if use_output_context_gate:
final_probabilities = utils.context_gate(
if not self._vocab_as_last_dim:
final_probabilities = tf.reshape(
final_probabilities,
normalizer_fn=layers.BatchNormalization,
normalizer_params=normalizer_params,
[-1, self._vocab_size],
)
outputs = {"predictions": final_probabilities}
super().__init__(inputs=inputs, outputs=outputs, **kwargs)
return {"predictions": final_probabilities}
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains a collection of util functions for model construction."""
from typing import Any, Dict, Optional, Union
import tensorflow as tf
from official.projects.yt8m.modeling import yt8m_model_utils
class ContextGate(tf.keras.layers.Layer):
"""Context Gating. More details: https://arxiv.org/pdf/1706.06905.pdf."""
def __init__(
self,
normalizer_fn=None,
normalizer_params: Optional[Dict[str, Any]] = None,
kernel_initializer: Union[
str, tf.keras.regularizers.Regularizer
] = "glorot_uniform",
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_initializer: Union[str, tf.keras.regularizers.Regularizer] = "zeros",
hidden_layer_size: int = 0,
pooling_method: Optional[str] = None,
additive_residual: bool = False,
name: Optional[str] = None,
):
"""Initialization of context gate.
Args:
normalizer_fn: Normalization function to use instead of `biases` (e.g.
tf.contrib.layers.batch_norm). If None, bias is added.
normalizer_params: Normalization function parameters.
kernel_initializer: Weight initializer to use instead of Xavier (e.g.
tf.contrib.layers.variance_scaling_initializer).
kernel_regularizer: Weight regularizer to use instead of None (e.g.,
tf.contrib.layers.l2_regularizer(l2_penalty)).
bias_initializer: Biases initializer to use (default tf.zeros_initializer)
hidden_layer_size: Dimensionality of the context gating hidden layer size,
if any. If None, will apply a fully-connected context gating layer with
shape [input_size x input_size]. If set to an int N, will factorize the
context gating layer into [input_size x N] x [N x input_size] as in the
squeeze-and-excitation block from https://arxiv.org/pdf/1709.01507.pdf.
pooling_method: Whether to perform global pooling of the local features
before applying the context gating layer. This is relevant only if the
input_features tensor has rank > 2, e.g., it's a sequence of frame
features, [batch_size, num_frames, feature_dim], or spatial convolution
features, [batch_size*num_frames, h, w, feature_dim]. If the inputs are
a set of local features and pooling_method is not None, will pool
features across all but the batch_size dimension using the specified
pooling method, and pass the aggregated features as context to the
gating layer. For a list of pooling methods, see the frame_pooling()
function.
additive_residual: If true, will use ReLu6-activated (additive) residual
connections instead of Sigmoid-activated (multiplicative) connections
when combining the input_features with the context gating branch.
name: Optional `str` name of the module.
"""
super().__init__(name=name)
self._normalizer_fn = normalizer_fn
self._normalizer_params = normalizer_params or {}
self._kernel_initializer = kernel_initializer
self._kernel_regularizer = kernel_regularizer
self._bias_initializer = bias_initializer
self._hidden_layer_size = hidden_layer_size
self._pooling_method = pooling_method
self._additive_residual = additive_residual
if hidden_layer_size >= 2:
self._gates_bottleneck = tf.keras.layers.Dense(
hidden_layer_size,
activation="relu6",
kernel_initializer=kernel_initializer,
bias_initializer=bias_initializer,
kernel_regularizer=kernel_regularizer,
name="bottleneck",
)
if self._normalizer_fn:
self._gates_bottleneck_norm = self._normalizer_fn(
**self._normalizer_params,
name="bottleneck_norm",
)
def build(self, input_shape):
super().build(input_shape)
feature_size = input_shape[-1]
activation_fn = tf.nn.relu6 if self._additive_residual else tf.nn.sigmoid
self._gates = tf.keras.layers.Dense(
feature_size,
activation=activation_fn,
kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
name="gates_dense",
)
if self._normalizer_fn:
self._gates_norm = self._normalizer_fn(
**self._normalizer_params,
name="gates_norm",
)
def call(self, inputs: tf.Tensor):
num_dimensions = len(inputs.shape.as_list())
feature_size = inputs.shape.as_list()[-1]
if self._pooling_method:
assert num_dimensions > 2
# Collapse the inner axes of the original features shape into a 3D tensor
original_shape = tf.shape(inputs)
# The last dimension will change after concatenating the context
new_shape = tf.concat(
[original_shape[:-1], tf.constant([2 * feature_size])], 0
)
batch_size = original_shape[0]
reshaped_features = tf.reshape(inputs, [batch_size, -1, feature_size])
num_features = tf.shape(reshaped_features)[1]
# Pool the feature channels across the inner axes to get global context
context_features = yt8m_model_utils.frame_pooling(
reshaped_features, self._pooling_method
)
context_features = tf.expand_dims(context_features, 1)
# Replicate the global context features and concat to the local features.
context_features = tf.tile(context_features, [1, num_features, 1])
context_features = tf.concat([reshaped_features, context_features], 2)
context_features = tf.reshape(context_features, shape=new_shape)
else:
# num_dimensions should be 2
context_features = tf.identity(inputs)
if self._hidden_layer_size >= 2:
gates_bottleneck = self._gates_bottleneck(context_features)
if self._normalizer_fn:
gates_bottleneck = self._gates_bottleneck_norm(gates_bottleneck)
else:
gates_bottleneck = tf.identity(context_features)
gates = self._gates(gates_bottleneck)
if self._normalizer_fn:
gates = self._gates_norm(gates)
if self._additive_residual:
inputs += tf.cast(gates, inputs.dtype)
else:
inputs *= tf.cast(gates, inputs.dtype)
return inputs
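
At its core (no pooling, no bottleneck, multiplicative residual), the layer computes the context-gating transform from https://arxiv.org/pdf/1706.06905.pdf, re-weighting the input by a learned sigmoid gate. A stripped-down sketch of that path:

import tensorflow as tf

x = tf.random.normal([2, 32])                       # [batch, feature_dim]
gate = tf.keras.layers.Dense(32, activation='sigmoid')
y = x * gate(x)                                     # y = x * sigmoid(Wx + b)
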
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for nn_layers."""
from absl.testing import parameterized
import tensorflow as tf
from official.projects.yt8m.modeling import nn_layers
class NNLayersTest(parameterized.TestCase, tf.test.TestCase):
"""Class for testing nn_layers."""
@parameterized.product(
hidden_layer_size=(0, 8, 16),
additive_residual=(True, False),
pooling_method=["average", "max", "swap", "none", None],
)
def test_context_gate(
self, hidden_layer_size, additive_residual, pooling_method
):
"""Test for creation of a context gate layer."""
context_gate = nn_layers.ContextGate(
normalizer_fn=tf.keras.layers.BatchNormalization,
hidden_layer_size=hidden_layer_size,
additive_residual=additive_residual,
pooling_method=pooling_method,
)
if pooling_method is None:
inputs = tf.ones([2, 32], dtype=tf.float32)
elif pooling_method == "none":
inputs = tf.ones([2, 1, 32], dtype=tf.float32)
else:
inputs = tf.ones([2, 24, 32], dtype=tf.float32)
outputs = context_gate(inputs)
self.assertShapeEqual(inputs, outputs)
context_vars_len = 12 if hidden_layer_size else 6
context_trainable_vars_len = 8 if hidden_layer_size else 4
self.assertLen(context_gate.variables, context_vars_len)
self.assertLen(context_gate.trainable_variables, context_trainable_vars_len)
if __name__ == "__main__":
tf.test.main()
......@@ -118,9 +118,6 @@ class VideoClassificationModel(tf.keras.Model):
else None
)
self.head = aggregation_head(
input_specs=layers.InputSpec(
shape=[None, self._params.backbone.get().hidden_size]
),
vocab_size=self._num_classes,
l2_regularizer=l2_regularizer,
**head_cfg.as_dict(),
......@@ -134,10 +131,17 @@ class VideoClassificationModel(tf.keras.Model):
return cls(**config)
def call(
self, inputs: tf.Tensor, training: Any = None, mask: Any = None
self,
inputs: tf.Tensor,
num_frames: Any = None,
training: Any = None,
) -> dict[str, tf.Tensor]:
features = self.backbone(inputs)
outputs = self.head(features)
features = self.backbone(
inputs,
num_frames=num_frames,
training=training,
)
outputs = self.head(features, training=training)
return outputs
@property
......
......@@ -26,26 +26,42 @@ class YT8MNetworkTest(parameterized.TestCase, tf.test.TestCase):
"""Class for testing yt8m network."""
# test_yt8m_network_creation arbitrary params
@parameterized.parameters((32, 1152), (24, 1152)) # 1152 = 1024 + 128
def test_yt8m_network_creation(self, num_frames, feature_dims):
@parameterized.product(
num_sample_frames=(None, 16, 32),
pooling_method=('average', 'max', 'swap'),
)
def test_yt8m_network_creation(
self, num_sample_frames, pooling_method
):
"""Test for creation of a YT8M Model.
Args:
num_frames: number of frames.
feature_dims: indicates total dimension size of the features.
num_sample_frames: number of frames to sample per example; if None, the
padded frames are used without sampling.
pooling_method: the frame pooling method, as a string.
"""
num_frames = 24
feature_dims = 52
num_classes = 45
input_specs = tf.keras.layers.InputSpec(shape=[None, None, feature_dims])
num_classes = 3862
params = yt8m_cfg.YT8MTask().model
params.backbone.dbof.pooling_method = pooling_method
model = yt8m_model.VideoClassificationModel(
params=yt8m_cfg.YT8MTask().model,
params=params,
num_classes=num_classes,
input_specs=input_specs,
)
# batch = 2 -> arbitrary value for test.
inputs = np.random.rand(2, num_frames, feature_dims)
predictions = model(inputs)['predictions']
if num_sample_frames:
inputs = np.random.rand(2, num_sample_frames, feature_dims)
num_frames = tf.constant([num_sample_frames, num_sample_frames])
else:
# Add padding frames.
inputs = np.random.rand(2, num_frames + 4, feature_dims)
num_frames = tf.constant([num_frames, num_frames + 1])
predictions = model(inputs, num_frames=num_frames)['predictions']
self.assertAllEqual([2, num_classes], predictions.numpy().shape)
def test_serialize_deserialize(self):
......
......@@ -14,18 +14,81 @@
"""Contains a collection of util functions for model construction."""
from typing import Any, Dict, Optional, Union
from typing import Optional
import tensorflow as tf
def frame_pooling(frames, method):
def _large_compatible_negative(tensor_type):
"""Large negative number as Tensor.
This function is necessary because the standard value for epsilon
in this module (-1e9) cannot be represented using `tf.float16`.
Args:
tensor_type: A dtype to determine the type.
Returns:
A large negative number.
"""
if tensor_type == tf.float16:
return tf.float16.min
return -1e9
def weighted_average_pooling(features, weights, axis):
"""Weighted average pooling.
Args:
features: a tensor of at least rank 1.
weights: a weight tensor whose shape is broadcast compatible with features.
It doesn't have to be normalized.
axis: the dimensions to reduce.
Returns:
The reduced tensor.
"""
return tf.math.divide_no_nan(
tf.reduce_sum(weights * features, axis), # numerator.
tf.reduce_sum(weights, axis), # denominator.
)
def frame_swap(
frames: tf.Tensor, frame_mask: Optional[tf.Tensor] = None
) -> tf.Tensor:
"""Self-weighted average pooling over all frames of a video.
It does the following operation independently for each feature:
x_pooled = (sum_i x_i * |x_i|) / (sum_i |x_i|).
Basically the weight for the feature in each frame is determined by the
magnitude of the feature itself.
Paper: https://research.google/pubs/pub48351/
Args:
frames: A tensor with shape [batch_size, max_frames, feature_size].
frame_mask: A tensor with shape [batch_size, max_frames, 1].
Returns:
A tensor with shape [batch_size, feature_size].
"""
weights = tf.abs(frames)
if frame_mask is not None:
weights *= tf.cast(frame_mask, weights.dtype)
# We set axis to 1 to reduce the dimension corresponding to max_frames.
return weighted_average_pooling(frames, weights, axis=1)
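
A worked instance of the SWAP formula: because each feature is weighted by its own magnitude, all-zero (padding-like) frames contribute nothing to the pool.

import tensorflow as tf

frames = tf.constant([[[0.0, 0.0, 0.0], [0.0, 1.0, -1.0]]])  # [1, 2, 3]
weights = tf.abs(frames)
pooled = tf.math.divide_no_nan(
    tf.reduce_sum(weights * frames, axis=1), tf.reduce_sum(weights, axis=1))
# pooled == [[0.0, 1.0, -1.0]]: the zero frame is ignored entirely.
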
def frame_pooling(frames, method="average", num_frames=None):
"""Pools over the frames of a video.
Args:
frames: tensor of shape [batch_size, num_frames, feature_size].
method: string indicating pooling method, one of: "average", "max",
"swap", or "none".
num_frames: optional tensor of shape [batch_size] with the valid number of
frames for each video.
Returns:
tensor of shape [batch_size, feature_size] for average, max, or
......@@ -35,119 +98,38 @@ def frame_pooling(frames, method):
ValueError: if method is other than "average", "max", "swap", or
"none".
"""
frame_mask = None
if num_frames is not None:
max_frames = frames.shape.as_list()[1]
# Generate binary mask from number of frames.
frame_mask = tf.sequence_mask(num_frames, max_frames, frames.dtype)
frame_mask = tf.expand_dims(frame_mask, axis=2)
if method == "average":
reduced = tf.reduce_mean(frames, 1)
if num_frames is None:
reduced = tf.reduce_mean(frames, 1)
else:
num_frames = tf.reshape(tf.cast(num_frames, frames.dtype), [-1, 1])
reduced = tf.reduce_sum(frames * frame_mask, 1) / num_frames
elif method == "max":
if num_frames is not None:
frame_mask = tf.cast(frame_mask, tf.bool)
frames = tf.where(
frame_mask,
frames,
tf.ones_like(frames, dtype=frames.dtype)
* _large_compatible_negative(frames.dtype),
)
reduced = tf.reduce_max(frames, 1)
elif method == "swap":
# Note we assume the frames are in the shape of
# [batch_size, num_frames, feature_size]. Otherwise this function might
# fail.
reduced = frame_swap(frames, frame_mask)
elif method == "none":
feature_size = frames.shape_as_list()[2]
feature_size = frames.shape.as_list()[2]
reduced = tf.reshape(frames, [-1, feature_size])
else:
raise ValueError("Unrecognized pooling method: %s" % method)
return reduced
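
A worked instance of the masked branches above: with `num_frames` supplied, padding frames past each video's valid length drop out of the average (and are replaced by a large negative value for max pooling).

import tensorflow as tf

frames = tf.constant([[[2.0], [4.0], [9.0]]])   # [batch=1, max_frames=3, 1]
mask = tf.expand_dims(tf.sequence_mask([2], 3, frames.dtype), 2)
mean = tf.reduce_sum(frames * mask, 1) / 2.0    # [[3.0]], not the unmasked [[5.0]]
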
def context_gate(
input_features,
normalizer_fn=None,
normalizer_params: Optional[Dict[str, Any]] = None,
kernel_initializer: Union[
str, tf.keras.regularizers.Regularizer] = "glorot_uniform",
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_initializer: Union[str, tf.keras.regularizers.Regularizer] = "zeros",
hidden_layer_size: int = 0,
pooling_method: Optional[str] = None,
additive_residual: bool = False):
"""Context Gating.
More details: https://arxiv.org/pdf/1706.06905.pdf.
Args:
input_features: a tensor of at least rank 2.
normalizer_fn: Normalization function to use instead of `biases` (e.g.
tf.contrib.layers.batch_norm). If None, bias is added.
normalizer_params: Normalization function parameters.
kernel_initializer: Weight initializer to use instead of Xavier (e.g.
tf.contrib.layers.variance_scaling_initializer).
kernel_regularizer: Weight regularizer to use instead of None (e.g.,
tf.contrib.layers.l2_regularizer(l2_penalty)).
bias_initializer: Biases initializer to use (default tf.zeros_initializer)
hidden_layer_size: Dimensionality of the context gating hidden layer size,
if any. If None, will apply a fully-connected context gating layer with
shape [input_size x input_size]. If set to an int N, will factorize the
context gating layer into [input_size x N] x [N x input_size] as in the
squeeze-and-excitation block from https://arxiv.org/pdf/1709.01507.pdf.
pooling_method: Whether to perform global pooling of the local features
before applying the context gating layer. This is relevant only if the
input_features tensor has rank > 2, e.g., it's a sequence of frame
features, [batch_size, num_frames, feature_dim], or spatial convolution
features, [batch_size*num_frames, h, w, feature_dim]. If the inputs are a
set of local features and pooling_method is not None, will pool features
across all but the batch_size dimension using the specified pooling
method, and pass the aggregated features as context to the gating layer.
For a list of pooling methods, see the frame_pooling() function.
additive_residual: If true, will use ReLu6-activated (additive) residual
connections instead of Sigmoid-activated (multiplicative) connections when
combining the input_features with the context gating branch.
Returns:
A tensor with the same shape as input_features.
"""
if normalizer_params is None:
normalizer_params = {}
with tf.name_scope("ContextGating"):
num_dimensions = len(input_features.shape.as_list())
feature_size = input_features.shape.as_list()[-1]
if pooling_method:
assert num_dimensions > 2
# Collapse the inner axes of the original features shape into a 3D tensor
original_shape = tf.shape(input_features)
# The last dimension will change after concatenating the context
new_shape = tf.concat(
[original_shape[:-1],
tf.constant([2 * feature_size])], 0)
batch_size = original_shape[0]
reshaped_features = tf.reshape(input_features,
[batch_size, -1, feature_size])
num_features = tf.shape(reshaped_features)[1]
# Pool the feature channels across the inner axes to get global context
context_features = frame_pooling(reshaped_features, pooling_method)
context_features = tf.expand_dims(context_features, 1)
# Replicate the global context features and concat to the local features.
context_features = tf.tile(context_features, [1, num_features, 1])
context_features = tf.concat([reshaped_features, context_features], 2)
context_features = tf.reshape(context_features, shape=new_shape)
else:
context_features = input_features
if hidden_layer_size >= 2:
gates_bottleneck = tf.keras.layers.Dense(
hidden_layer_size,
activation="relu6",
kernel_initializer=kernel_initializer,
bias_initializer=bias_initializer,
kernel_regularizer=kernel_regularizer,
)(context_features)
if normalizer_fn:
gates_bottleneck = normalizer_fn(**normalizer_params)(gates_bottleneck)
else:
gates_bottleneck = context_features
activation_fn = (tf.nn.relu6 if additive_residual else tf.nn.sigmoid)
gates = tf.keras.layers.Dense(
feature_size,
activation=activation_fn,
kernel_initializer=kernel_initializer,
bias_initializer=bias_initializer,
kernel_regularizer=kernel_regularizer,
)(gates_bottleneck)
if normalizer_fn:
gates = normalizer_fn(**normalizer_params)(gates)
if additive_residual:
input_features += tf.cast(gates, input_features.dtype)
else:
input_features *= tf.cast(gates, input_features.dtype)
return input_features
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for YT8M modeling utilities."""
from absl.testing import parameterized
import tensorflow as tf
from official.projects.yt8m.modeling import yt8m_model_utils
class Yt8MModelUtilsTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.product(
frame_pooling=("average", "max", "swap", "none"),
use_frame_mask=(True, False),
)
def test_frame_pooling(self, frame_pooling, use_frame_mask):
frame = tf.constant([
[[0.0, 0.0, 0.0], [0.0, 1.0, -1.0]],
[[0.0, 0.0, 0.0], [0.0, 2.0, -2.0]],
])
num_frames = tf.constant([2, 2]) if use_frame_mask else None
pooled_frame = yt8m_model_utils.frame_pooling(
frame, method=frame_pooling, num_frames=num_frames
)
if frame_pooling == "swap":
self.assertAllClose([[0.0, 1.0, -1.0], [0.0, 2.0, -2.0]], pooled_frame)
elif frame_pooling == "average":
self.assertAllClose([[0.0, 0.5, -0.5], [0.0, 1.0, -1.0]], pooled_frame)
elif frame_pooling == "max":
self.assertAllClose([[0.0, 1.0, 0.0], [0.0, 2.0, 0.0]], pooled_frame)
elif frame_pooling == "none":
self.assertAllClose(
[
[0.0, 0.0, 0.0],
[0.0, 1.0, -1.0],
[0.0, 0.0, 0.0],
[0.0, 2.0, -2.0],
],
pooled_frame,
)
if __name__ == "__main__":
tf.test.main()
......@@ -19,13 +19,13 @@ from absl import logging
import tensorflow as tf
from official.core import base_task
from official.core import input_reader
from official.core import task_factory
from official.modeling import tf_utils
from official.projects.yt8m.configs import yt8m as yt8m_cfg
from official.projects.yt8m.dataloaders import yt8m_input
from official.projects.yt8m.eval_utils import eval_util
from official.projects.yt8m.modeling import yt8m_model
from official.vision.dataloaders.google import input_reader
@task_factory.register_task_cls(yt8m_cfg.YT8MTask)
......@@ -48,7 +48,14 @@ class YT8MTask(base_task.Task):
params=model_config,
input_specs=input_specs,
num_classes=train_cfg.num_classes,
l2_weight_decay=l2_weight_decay)
l2_weight_decay=l2_weight_decay,
)
# Warmup calls to build model variables.
_ = model(
inputs=tf.keras.Input(common_input_shape, dtype=tf.float32),
num_frames=tf.keras.Input([], dtype=tf.float32),
)
non_trainable_batch_norm_variables = []
non_trainable_extra_variables = []
......@@ -114,17 +121,16 @@ class YT8MTask(base_task.Task):
decoder_fn=decoder_fn,
parser_fn=parser_fn,
postprocess_fn=postprocess_fn,
transform_and_batch_fn=batch_fn)
transform_and_batch_fn=batch_fn,
)
dataset = reader.read(input_context=input_context)
return dataset
def build_losses(self,
labels,
model_outputs,
label_weights=None,
aux_losses=None):
def build_losses(
self, labels, model_outputs, label_weights=None, aux_losses=None
):
"""Sigmoid Cross Entropy.
Args:
......@@ -143,7 +149,8 @@ class YT8MTask(base_task.Task):
tf.expand_dims(model_outputs, axis=-1),
from_logits=losses_config.from_logits,
label_smoothing=losses_config.label_smoothing,
axis=-1)
axis=-1,
)
if label_weights is None:
model_loss = tf_utils.safe_mean(model_loss)
else:
......@@ -151,7 +158,8 @@ class YT8MTask(base_task.Task):
# Manually compute the weighted mean loss.
total_loss = tf.reduce_sum(model_loss)
total_weight = tf.cast(
tf.reduce_sum(label_weights), dtype=total_loss.dtype)
tf.reduce_sum(label_weights), dtype=total_loss.dtype
)
model_loss = tf.math.divide_no_nan(total_loss, total_weight)
total_loss = model_loss
......@@ -188,7 +196,8 @@ class YT8MTask(base_task.Task):
top_k = self.task_config.evaluation.average_precision.top_k
top_n = self.task_config.evaluation.average_precision.top_n
self.avg_prec_metric = eval_util.EvaluationMetrics(
num_classes, top_k=top_k, top_n=top_n)
num_classes, top_k=top_k, top_n=top_n
)
return metrics
......@@ -233,17 +242,26 @@ class YT8MTask(base_task.Task):
logs[m.name] = m.result()
return logs
def _preprocess_model_inputs(self,
inputs: dict[str, tf.Tensor],
training: bool = True):
def _preprocess_model_inputs(
self,
inputs: dict[str, tf.Tensor],
require_num_frames: bool = True,
training: bool = True,
):
"""Preprocesses input tensors before model on device."""
del training
return inputs['video_matrix']
extra_inputs = {
'num_frames': (
tf.reshape(inputs['num_frames'], [-1])
if require_num_frames
else None
),
'training': training,
}
return inputs['video_matrix'], extra_inputs
def _preprocess_labels(self,
inputs: dict[str, tf.Tensor],
training: bool = True):
def _preprocess_labels(
self, inputs: dict[str, tf.Tensor], training: bool = True
):
"""Preprocesses labels."""
del training # training is unused in _preprocess_labels in YT8M.
labels = inputs['labels']
......@@ -251,11 +269,9 @@ class YT8MTask(base_task.Task):
return labels, label_weights
def _postprocess_outputs(self,
outputs,
labels,
label_weights,
training: bool = True):
def _postprocess_outputs(
self, outputs, labels, label_weights, training: bool = True
):
"""Postprocess model outputs (inputs / labels / label_weights)."""
if not training and self.task_config.validation_data.segment_labels:
# workaround to ignore the unrated labels.
......@@ -279,25 +295,34 @@ class YT8MTask(base_task.Task):
Returns:
a dictionary of logs.
"""
model_inputs = self._preprocess_model_inputs(inputs, training=True)
# Will require `num_frames` if `num_sample_frames` is None since
# video_matrix is padded to max_frames in this case.
require_num_frames = self.task_config.train_data.num_sample_frames is None
inputs_tensor, extra_inputs = self._preprocess_model_inputs(
inputs,
require_num_frames=require_num_frames,
training=True,
)
labels, label_weights = self._preprocess_labels(inputs, training=True)
num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
with tf.GradientTape() as tape:
outputs = model(model_inputs, training=True)['predictions']
outputs = model(inputs_tensor, **extra_inputs)['predictions']
# Casting the output layer to float32 is necessary when mixed_precision is
# mixed_float16 or mixed_bfloat16, to ensure the output is cast to float32.
outputs = tf.nest.map_structure(lambda x: tf.cast(x, tf.float32), outputs)
# Post-process model / label outputs.
outputs, labels, label_weights = self._postprocess_outputs(
outputs, labels, label_weights, training=True)
outputs, labels, label_weights, training=True
)
# Computes per-replica loss
all_losses = self.build_losses(
model_outputs=outputs,
labels=labels,
label_weights=label_weights,
aux_losses=model.losses)
aux_losses=model.losses,
)
loss = all_losses['total_loss']
# Scales loss as the default gradients allreduce performs sum inside the
......@@ -318,8 +343,9 @@ class YT8MTask(base_task.Task):
# Apply gradient clipping.
if self.task_config.gradient_clip_norm > 0:
grads, _ = tf.clip_by_global_norm(grads,
self.task_config.gradient_clip_norm)
grads, _ = tf.clip_by_global_norm(
grads, self.task_config.gradient_clip_norm
)
optimizer.apply_gradients(list(zip(grads, tvars)))
logs = {self.loss: loss}
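`tf.clip_by_global_norm` above rescales all gradients jointly so that their combined norm never exceeds the configured threshold. A standalone sketch with illustrative numbers:
import tensorflow as tf
grads = [tf.constant([3.0, 4.0]), tf.constant([0.0, 12.0])]
# Global norm = sqrt(3**2 + 4**2 + 12**2) = 13.0.
clipped, global_norm = tf.clip_by_global_norm(grads, clip_norm=6.5)
# Every gradient is scaled by 6.5 / 13.0 = 0.5, preserving direction.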
......@@ -330,7 +356,9 @@ class YT8MTask(base_task.Task):
outputs=outputs,
model_losses=all_losses,
label_weights=label_weights,
training=True))
training=True,
)
)
return logs
def validation_step(self, inputs, model, metrics=None):
......@@ -346,19 +374,26 @@ class YT8MTask(base_task.Task):
Returns:
a dictionary of logs.
"""
model_inputs = self._preprocess_model_inputs(inputs, training=False)
labels, label_weights = self._preprocess_labels(inputs, training=False)
outputs = self.inference_step(model_inputs, model)['predictions']
# `num_frames` is required when `num_sample_frames` is None, since
# `video_matrix` is padded to `max_frames` in that case.
require_num_frames = (
self.task_config.validation_data.num_sample_frames is None
)
outputs = self.inference_step(
model, inputs, require_num_frames=require_num_frames
)['predictions']
outputs = tf.nest.map_structure(lambda x: tf.cast(x, tf.float32), outputs)
labels, label_weights = self._preprocess_labels(inputs, training=False)
outputs, labels, label_weights = self._postprocess_outputs(
outputs, labels, label_weights, training=False)
outputs, labels, label_weights, training=False
)
all_losses = self.build_losses(
labels=labels,
model_outputs=outputs,
label_weights=label_weights,
aux_losses=model.losses)
aux_losses=model.losses,
)
logs = {self.loss: all_losses['total_loss']}
logs.update(
......@@ -368,13 +403,18 @@ class YT8MTask(base_task.Task):
outputs=outputs,
model_losses=all_losses,
label_weights=inputs.get('label_weights', None),
training=False))
training=False,
)
)
return logs
def inference_step(self, inputs, model):
def inference_step(self, model, inputs, require_num_frames=True):
"""Performs the forward step."""
return model(inputs, training=False)
model_inputs, extra_inputs = self._preprocess_model_inputs(
inputs, require_num_frames=require_num_frames, training=False
)
return model(model_inputs, **extra_inputs)
def aggregate_logs(self, state=None, step_logs=None):
if self.task_config.evaluation.average_precision is not None:
......@@ -382,13 +422,15 @@ class YT8MTask(base_task.Task):
state = self.avg_prec_metric
self.avg_prec_metric.accumulate(
labels=step_logs[self.avg_prec_metric.name][0],
predictions=step_logs[self.avg_prec_metric.name][1])
predictions=step_logs[self.avg_prec_metric.name][1],
)
return state
def reduce_aggregated_logs(self, aggregated_logs, global_step=None):
if self.task_config.evaluation.average_precision is not None:
avg_prec_metrics = self.avg_prec_metric.get(
self.task_config.evaluation.average_precision.return_per_class_ap)
self.task_config.evaluation.average_precision.return_per_class_ap
)
self.avg_prec_metric.clear()
return avg_prec_metrics
return None
......@@ -45,25 +45,44 @@ class TrainTest(parameterized.TestCase, tf.test.TestCase):
testcase_name='segment_with_avg_precison',
use_segment_level_labels=True,
use_average_precision_metric=True,
num_sample_frames=24,
),
dict(
testcase_name='video_with_avg_precison',
use_segment_level_labels=False,
use_average_precision_metric=True,
num_sample_frames=24,
),
dict(
testcase_name='segment',
use_segment_level_labels=True,
use_average_precision_metric=False,
num_sample_frames=24,
),
dict(
testcase_name='video',
use_segment_level_labels=False,
use_average_precision_metric=False,
num_sample_frames=24,
),
dict(
testcase_name='segment_without_sampling_frames',
use_segment_level_labels=True,
use_average_precision_metric=False,
num_sample_frames=None,
),
dict(
testcase_name='video_without_sampling_frames',
use_segment_level_labels=False,
use_average_precision_metric=False,
num_sample_frames=None,
),
)
def test_train_and_eval(
self, use_segment_level_labels, use_average_precision_metric
self,
use_segment_level_labels,
use_average_precision_metric,
num_sample_frames,
):
saved_flag_values = flagsaver.save_flag_values()
train_lib.tfm_flags.define_flags()
......@@ -103,11 +122,13 @@ class TrainTest(parameterized.TestCase, tf.test.TestCase):
'train_data': {
'input_path': self._data_path,
'global_batch_size': 4,
'num_sample_frames': num_sample_frames,
},
'validation_data': {
'input_path': self._data_path,
'segment_labels': use_segment_level_labels,
'global_batch_size': 4,
'num_sample_frames': num_sample_frames,
},
'evaluation': {
'average_precision': average_precision,
......
......@@ -152,6 +152,9 @@ class DetectionGenerator(hyperparams.Config):
return_decoded: Optional[bool] = None
# Only works when nms_version='v2'.
use_class_agnostic_nms: Optional[bool] = False
# Weights or scales used when encoding and decoding box coordinates. For
# Faster RCNN, the open-source implementation recommends using
# [10.0, 10.0, 5.0, 5.0].
box_coder_weights: list[float] | None = None
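As a sketch, the new field could be enabled via a config override like the following (hypothetical experiment wiring; only the field itself comes from this diff):
generator_config = DetectionGenerator(
    box_coder_weights=[10.0, 10.0, 5.0, 5.0],  # Faster RCNN-style scaling
)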
@dataclasses.dataclass
......
......@@ -64,6 +64,7 @@ class DataConfig(cfg.DataConfig):
mixup_and_cutmix: Optional[common.MixupAndCutmix] = None
image_field_key: str = 'image/encoded'
label_field_key: str = 'clip/label/index'
input_image_format: str = 'jpeg'
def kinetics400(is_training):
......
......@@ -23,8 +23,12 @@ from official.core import config_definitions as cfg
from official.core import input_reader
InputReader = input_reader.InputReader
def build_weighted_sampling_combine_fn(
weights: Mapping[Any, Any]) -> Callable[[tf.data.Dataset], tf.data.Dataset]:
weights: Mapping[Any, Any], stop_on_empty_dataset=True
) -> Callable[[tf.data.Dataset], tf.data.Dataset]:
"""Builds a combine_fn using weighted sampling."""
def combine_fn(datasets: Mapping[Any, tf.data.Dataset]) -> tf.data.Dataset:
......@@ -35,7 +39,7 @@ def build_weighted_sampling_combine_fn(
ds.append(dataset)
ws.append(weights[k])
return tf.data.Dataset.sample_from_datasets(
ds, ws, stop_on_empty_dataset=True)
ds, ws, stop_on_empty_dataset=stop_on_empty_dataset)
return combine_fn
......@@ -44,6 +48,14 @@ def create_combine_fn(
params: cfg.DataConfig
) -> Union[None, Callable[[tf.data.Dataset], tf.data.Dataset]]:
"""Creates and returns a combine_fn for dataset mixing."""
if (
hasattr(params, 'stop_on_empty_dataset')
and params.stop_on_empty_dataset is not None
):
stop_on_empty_dataset = params.stop_on_empty_dataset
else:
stop_on_empty_dataset = True
if params.is_training and params.weights:
# Combine multiple datasets using weighted sampling.
if (not isinstance(params.input_path, cfg.base_config.Config) or
......@@ -63,7 +75,7 @@ def create_combine_fn(
raise ValueError(
'input_path key \'%s\' does not have a corresponding weight.' % k)
return build_weighted_sampling_combine_fn(weights)
return build_weighted_sampling_combine_fn(weights, stop_on_empty_dataset)
return None
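A minimal sketch of how the returned combine_fn behaves with the new flag (dataset names and weights are illustrative):
import tensorflow as tf
ds_a = tf.data.Dataset.range(100)
ds_b = tf.data.Dataset.range(1000, 1003)  # exhausts after 3 elements
combine_fn = build_weighted_sampling_combine_fn(
    {'a': 0.9, 'b': 0.1}, stop_on_empty_dataset=False
)
mixed = combine_fn({'a': ds_a, 'b': ds_b})
# With stop_on_empty_dataset=False, iteration keeps drawing from ds_a
# after ds_b runs out; with True (the old behavior), the stream ends.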
......
......@@ -45,6 +45,7 @@ class Parser(parser.Parser):
anchor_size,
match_threshold=0.5,
unmatched_threshold=0.5,
box_coder_weights=None,
aug_type=None,
aug_rand_hflip=False,
aug_scale_min=1.0,
......@@ -55,7 +56,8 @@ class Parser(parser.Parser):
max_num_instances=100,
dtype='bfloat16',
resize_first: Optional[bool] = None,
mode=None):
mode=None,
pad=True):
"""Initializes parameters for parsing annotations in the dataset.
Args:
......@@ -78,6 +80,10 @@ class Parser(parser.Parser):
unmatched_threshold: `float` number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
box_coder_weights: Optional `list` of 4 positive floats to scale y, x, h,
and w when encoding box coordinates. If set to None, does not perform
scaling. For Faster RCNN, the open-source implementation recommends
using [10.0, 10.0, 5.0, 5.0].
aug_type: An optional Augmentation object to choose from AutoAugment and
RandAugment.
aug_rand_hflip: `bool`, if True, augment training with random horizontal
......@@ -99,6 +105,15 @@ class Parser(parser.Parser):
augmentations; computationally more efficient.
mode: a ModeKeys. Specifies if this is training, evaluation, prediction or
prediction with ground-truths in the outputs.
pad: A bool indicating whether to pad the input image to make its
size a multiple of 2**max_level. The padded size will be the smallest
rectangle such that each dimension is the smallest multiple of
2**max_level that is no smaller than the desired output size. For example,
if the desired output size = (320, 320) and max_level = 7, the padded
size = (384, 384). This is necessary when using FPN, as it assumes each
lower feature map is 2x the size of its higher neighbor. Without padding,
this relationship may be violated: the backbone may produce consecutive
5x5 and 2x2 feature maps, which does not work with FPN.
"""
self._mode = mode
self._max_num_instances = max_num_instances
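The padded size described in the `pad` docstring can be computed directly; a small sketch of the rounding rule (mirroring the `preprocess_ops.compute_padded_size` calls later in this diff, under the stated assumptions):
import math
def padded_size(desired_size, max_level):
  # Round each dimension up to the nearest multiple of 2**max_level.
  stride = 2 ** max_level
  return [int(math.ceil(d / stride)) * stride for d in desired_size]
padded_size([320, 320], 7)  # -> [384, 384], matching the docstring example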
......@@ -113,6 +128,7 @@ class Parser(parser.Parser):
self._anchor_size = anchor_size
self._match_threshold = match_threshold
self._unmatched_threshold = unmatched_threshold
self._box_coder_weights = box_coder_weights
# Data augmentation.
self._aug_rand_hflip = aug_rand_hflip
......@@ -150,6 +166,10 @@ class Parser(parser.Parser):
# Input pipeline optimization.
self._resize_first = resize_first
# Whether to pad the image so its size is a multiple of 2**max_level.
# This is needed when using the FPN decoder.
self._pad = pad
def _resize_and_crop_image_and_boxes(self, image, boxes, pad=True):
"""Resizes and crops image and boxes, optionally with padding."""
# Resizes and crops image.
......@@ -162,7 +182,8 @@ class Parser(parser.Parser):
self._output_size,
padded_size=padded_size,
aug_scale_min=self._aug_scale_min,
aug_scale_max=self._aug_scale_max)
aug_scale_max=self._aug_scale_max,
)
# Resizes and crops boxes.
image_scale = image_info[2, :]
......@@ -171,7 +192,7 @@ class Parser(parser.Parser):
image_info[1, :], offset)
return image, boxes, image_info
def _parse_train_data(self, data, anchor_labeler=None):
def _parse_train_data(self, data, anchor_labeler=None, input_anchor=None):
"""Parses data for training and evaluation."""
classes = data['groundtruth_classes']
boxes = data['groundtruth_boxes']
......@@ -209,7 +230,8 @@ class Parser(parser.Parser):
resize_first = self._resize_first and less_output_pixels
if resize_first:
image, boxes, image_info = self._resize_and_crop_image_and_boxes(
image, boxes, pad=False)
image, boxes, pad=False
)
image = tf.cast(image, dtype=tf.uint8)
# Apply autoaug or randaug.
......@@ -227,14 +249,23 @@ class Parser(parser.Parser):
# Converts boxes from normalized coordinates to pixel coordinates.
boxes = box_ops.denormalize_boxes(boxes, image_shape)
if not resize_first:
image, boxes, image_info = self._resize_and_crop_image_and_boxes(
image, boxes, pad=True)
if self._pad:
padded_size = preprocess_ops.compute_padded_size(
self._output_size, 2**self._max_level
)
else:
padded_size = preprocess_ops.compute_padded_size(self._output_size,
2**self._max_level)
image = tf.image.pad_to_bounding_box(
image, 0, 0, padded_size[0], padded_size[1])
padded_size = self._output_size
if not resize_first:
image, boxes, image_info = (
self._resize_and_crop_image_and_boxes(image, boxes, pad=self._pad)
)
image = tf.image.pad_to_bounding_box(
image, 0, 0, padded_size[0], padded_size[1]
)
image = tf.ensure_shape(image, padded_size + [3])
image_height, image_width, _ = image.get_shape().as_list()
# Filters out ground-truth boxes that are all zeros.
......@@ -245,16 +276,21 @@ class Parser(parser.Parser):
attributes[k] = tf.gather(v, indices)
# Assigns anchors.
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size)
if input_anchor is None:
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size,
)
anchor_boxes = input_anchor(image_size=(image_height, image_width))
if anchor_labeler is None:
anchor_labeler = anchor.AnchorLabeler(
self._match_threshold, self._unmatched_threshold
match_threshold=self._match_threshold,
unmatched_threshold=self._unmatched_threshold,
box_coder_weights=self._box_coder_weights,
)
(cls_targets, box_targets, att_targets, cls_weights,
box_weights) = anchor_labeler.label_anchors(
......@@ -276,7 +312,7 @@ class Parser(parser.Parser):
labels['attribute_targets'] = att_targets
return image, labels
def _parse_eval_data(self, data, anchor_labeler=None):
def _parse_eval_data(self, data, anchor_labeler=None, input_anchor=None):
"""Parses data for training and evaluation."""
classes = data['groundtruth_classes']
......@@ -296,13 +332,21 @@ class Parser(parser.Parser):
boxes = box_ops.denormalize_boxes(boxes, image_shape)
# Resizes and crops image.
if self._pad:
padded_size = preprocess_ops.compute_padded_size(
self._output_size, 2**self._max_level
)
else:
padded_size = self._output_size
image, image_info = preprocess_ops.resize_and_crop_image(
image,
self._output_size,
padded_size=preprocess_ops.compute_padded_size(self._output_size,
2**self._max_level),
padded_size=padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0)
aug_scale_max=1.0,
)
image = tf.ensure_shape(image, padded_size + [3])
image_height, image_width, _ = image.get_shape().as_list()
# Resizes and crops boxes.
......@@ -318,16 +362,21 @@ class Parser(parser.Parser):
attributes[k] = tf.gather(v, indices)
# Assigns anchors.
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size)
if input_anchor is None:
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size,
)
anchor_boxes = input_anchor(image_size=(image_height, image_width))
if anchor_labeler is None:
anchor_labeler = anchor.AnchorLabeler(
self._match_threshold, self._unmatched_threshold
match_threshold=self._match_threshold,
unmatched_threshold=self._unmatched_threshold,
box_coder_weights=self._box_coder_weights,
)
(cls_targets, box_targets, att_targets, cls_weights,
box_weights) = anchor_labeler.label_anchors(
......
......@@ -45,7 +45,8 @@ def process_image(image: tf.Tensor,
min_area_ratio: float = 0.49,
max_area_ratio: float = 1.0,
augmenter: Optional[augment.ImageAugment] = None,
seed: Optional[int] = None) -> tf.Tensor:
seed: Optional[int] = None,
input_image_format: Optional[str] = 'jpeg') -> tf.Tensor:
"""Processes a serialized image tensor.
Args:
......@@ -78,6 +79,8 @@ def process_image(image: tf.Tensor,
max_area_ratio: The maximum area range for cropping.
augmenter: Image augmenter to distort each image.
seed: A deterministic seed to use when sampling.
input_image_format: The format of the input image, which can be 'jpeg',
'png', or 'none' for unknown or mixed datasets.
Returns:
Processed frames. Tensor of shape
......@@ -93,6 +96,10 @@ def process_image(image: tf.Tensor,
raise ValueError('Random stride range should be >= 0, got {}'.format(
random_stride_range))
if input_image_format not in ('jpeg', 'png', 'none'):
raise ValueError('Unknown input image format: {}'.format(
input_image_format))
if isinstance(crop_size, int):
crop_size = (crop_size, crop_size)
crop_height, crop_width = crop_size
......@@ -120,7 +127,7 @@ def process_image(image: tf.Tensor,
# Decode the raw image string to tf.uint8.
if image.dtype == tf.string:
image = preprocess_ops_3d.decode_jpeg(image, num_channels)
image = preprocess_ops_3d.decode_image(image, num_channels)
if is_training:
# Standard image data augmentation: random resized crop and random flip.
......@@ -295,6 +302,7 @@ class Parser(parser.Parser):
self._max_aspect_ratio = input_params.aug_max_aspect_ratio
self._min_area_ratio = input_params.aug_min_area_ratio
self._max_area_ratio = input_params.aug_max_area_ratio
self._input_image_format = input_params.input_image_format
if self._output_audio:
self._audio_feature = input_params.audio_feature
self._audio_shape = input_params.audio_feature_shape
......@@ -343,7 +351,8 @@ class Parser(parser.Parser):
min_area_ratio=self._min_area_ratio,
max_area_ratio=self._max_area_ratio,
augmenter=self._augmenter,
zero_centering_image=self._zero_centering_image)
zero_centering_image=self._zero_centering_image,
input_image_format=self._input_image_format)
image = tf.cast(image, dtype=self._dtype)
features = {'image': image}
......@@ -378,7 +387,8 @@ class Parser(parser.Parser):
crop_size=self._crop_size,
num_channels=self._num_channels,
num_crops=self._num_crops,
zero_centering_image=self._zero_centering_image)
zero_centering_image=self._zero_centering_image,
input_image_format=self._input_image_format)
image = tf.cast(image, dtype=self._dtype)
features = {'image': image}
......
......@@ -93,18 +93,18 @@ class FPN(tf.keras.Model):
'kernel_regularizer': kernel_regularizer,
'bias_regularizer': bias_regularizer,
}
if use_separable_conv:
conv2d = tf.keras.layers.SeparableConv2D
else:
conv2d = tf.keras.layers.Conv2D
conv2d = (
tf.keras.layers.SeparableConv2D
if use_separable_conv
else tf.keras.layers.Conv2D
)
norm = tf.keras.layers.BatchNormalization
activation_fn = tf_utils.get_activation(activation, use_keras_layer=True)
# Build input feature pyramid.
if tf.keras.backend.image_data_format() == 'channels_last':
bn_axis = -1
else:
bn_axis = 1
bn_axis = (
-1 if tf.keras.backend.image_data_format() == 'channels_last' else 1
)
# Get input feature pyramid from backbone.
logging.info('FPN input_specs: %s', input_specs)
......@@ -191,7 +191,7 @@ class FPN(tf.keras.Model):
for level in range(min_level, max_level + 1)
}
super(FPN, self).__init__(inputs=inputs, outputs=feats, **kwargs)
super().__init__(inputs=inputs, outputs=feats, **kwargs)
def _build_input_pyramid(self, input_specs: Mapping[str, tf.TensorShape],
min_level: int):
......
......@@ -311,10 +311,13 @@ def build_retinanet(
_ = head(decoder_features)
# Add `input_image_size` into `tflite_post_processing_config`.
tflite_post_processing_config = generator_config.tflite_post_processing.as_dict(
tflite_post_processing_config = (
generator_config.tflite_post_processing.as_dict()
)
tflite_post_processing_config['input_image_size'] = (
input_specs.shape[1],
input_specs.shape[2],
)
tflite_post_processing_config['input_image_size'] = (input_specs.shape[1],
input_specs.shape[2])
detection_generator_obj = detection_generator.MultilevelDetectionGenerator(
apply_nms=generator_config.apply_nms,
pre_nms_top_k=generator_config.pre_nms_top_k,
......@@ -327,6 +330,7 @@ def build_retinanet(
tflite_post_processing_config=tflite_post_processing_config,
return_decoded=generator_config.return_decoded,
use_class_agnostic_nms=generator_config.use_class_agnostic_nms,
box_coder_weights=generator_config.box_coder_weights,
)
model = retinanet_model.RetinaNetModel(
......
......@@ -1128,6 +1128,7 @@ class MultilevelDetectionGenerator(tf.keras.layers.Layer):
nms_v3_refinements: Optional[int] = None,
return_decoded: Optional[bool] = None,
use_class_agnostic_nms: Optional[bool] = None,
box_coder_weights: list[float] | None = None,
**kwargs,
):
"""Initializes a multi-level detection generator.
......@@ -1162,6 +1163,10 @@ class MultilevelDetectionGenerator(tf.keras.layers.Layer):
regardless of whether `apply_nms` is True or not.
use_class_agnostic_nms: A `bool` of whether non max suppression is
operated on all the boxes using max scores across all classes.
box_coder_weights: An optional `list` of 4 positive floats to scale y, x,
h, and w when encoding box coordinates. If set to None, does not perform
scaling. For Faster RCNN, the open-source implementation recommends
using [10.0, 10.0, 5.0, 5.0].
**kwargs: Additional keyword arguments passed to Layer.
Raises:
......@@ -1186,6 +1191,7 @@ class MultilevelDetectionGenerator(tf.keras.layers.Layer):
'soft_nms_sigma': soft_nms_sigma,
'return_decoded': return_decoded,
'use_class_agnostic_nms': use_class_agnostic_nms,
'box_coder_weights': box_coder_weights,
}
# Don't store values that were not defined.
if pre_nms_top_k_sharding_block is not None:
......@@ -1257,7 +1263,11 @@ class MultilevelDetectionGenerator(tf.keras.layers.Layer):
raw_boxes_i,
[batch_size, num_locations * num_anchors_per_locations, 4],
)
boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i)
boxes_i = box_ops.decode_boxes(
raw_boxes_i,
anchor_boxes_i,
weights=self._config_dict['box_coder_weights'],
)
# Box clipping.
if image_shape is not None:
......
......@@ -348,6 +348,7 @@ class MultilevelDetectionGeneratorTest(
'tflite_post_processing_config': tflite_post_processing_config,
'return_decoded': False,
'use_class_agnostic_nms': False,
'box_coder_weights': None,
}
generator = detection_generator.MultilevelDetectionGenerator(**kwargs)
......
......@@ -15,6 +15,7 @@
"""Anchor box and labeler definition."""
import collections
import math
from typing import Dict, Optional, Tuple
# Import libraries
......@@ -78,9 +79,10 @@ class Anchor(object):
boxes_all = []
for level in range(self.min_level, self.max_level + 1):
boxes_l = []
feat_size = math.ceil(self.image_size[0] / 2**level)
stride = tf.cast(self.image_size[0] / feat_size, tf.float32)
for scale in range(self.num_scales):
for aspect_ratio in self.aspect_ratios:
stride = 2**level
intermediate_scale = 2 ** (scale / float(self.num_scales))
base_anchor_size = self.anchor_size * stride * intermediate_scale
aspect_x = aspect_ratio**0.5
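The rewritten stride above is derived from the actual feature-map size rather than fixed at 2**level, which matters when the image size is not divisible by 2**level. A quick worked example with illustrative numbers:
import math
image_size, level = 300, 5
feat_size = math.ceil(image_size / 2**level)  # ceil(300 / 32) = 10
stride = image_size / feat_size               # 30.0, vs. the old fixed 32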
......@@ -135,7 +137,12 @@ class Anchor(object):
class AnchorLabeler(object):
"""Labeler for dense object detector."""
def __init__(self, match_threshold=0.5, unmatched_threshold=0.5):
def __init__(
self,
match_threshold=0.5,
unmatched_threshold=0.5,
box_coder_weights=None,
):
"""Constructs anchor labeler to assign labels to anchors.
Args:
......@@ -145,6 +152,10 @@ class AnchorLabeler(object):
unmatched_threshold: a float number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
box_coder_weights: Optional `list` of 4 positive floats to scale y, x, h,
and w when encoding box coordinates. If set to None, does not perform
scaling. For Faster RCNN, the open-source implementation recommends
using [10.0, 10.0, 5.0, 5.0].
"""
self.similarity_calc = iou_similarity.IouSimilarity()
self.target_gather = target_gather.TargetGather()
......@@ -153,7 +164,9 @@ class AnchorLabeler(object):
indicators=[-1, -2, 1],
force_match_for_each_col=True,
)
self.box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder()
self.box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
scale_factors=box_coder_weights,
)
def label_anchors(
self,
......
......@@ -370,6 +370,12 @@ def encode_boxes(boxes, anchors, weights=None):
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
# Avoid inf in log below.
anchor_h += EPSILON
anchor_w += EPSILON
box_h += EPSILON
box_w += EPSILON
encoded_dy = (box_yc - anchor_yc) / anchor_h
encoded_dx = (box_xc - anchor_xc) / anchor_w
encoded_dh = tf.math.log(box_h / anchor_h)
......
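A worked toy example of the Faster RCNN box coder with the [10.0, 10.0, 5.0, 5.0] weights referenced throughout this change (values are illustrative, not repo code):
import math
# One ground-truth box and one anchor, as (ymin, xmin, ymax, xmax).
box = (10.0, 10.0, 50.0, 90.0)   # h=40, w=80, center=(30, 50)
anchor = (0.0, 0.0, 40.0, 80.0)  # h=40, w=80, center=(20, 40)
dy = (30.0 - 20.0) / 40.0        # 0.25
dx = (50.0 - 40.0) / 80.0        # 0.125
dh = math.log(40.0 / 40.0)       # 0.0
dw = math.log(80.0 / 80.0)       # 0.0
weights = (10.0, 10.0, 5.0, 5.0)
encoded = [t * w for t, w in zip((dy, dx, dh, dw), weights)]
# -> [2.5, 1.25, 0.0, 0.0]; decoding divides by the same weights.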
......@@ -29,6 +29,7 @@ MEAN_NORM = (0.485, 0.456, 0.406)
STDDEV_NORM = (0.229, 0.224, 0.225)
MEAN_RGB = tuple(255 * i for i in MEAN_NORM)
STDDEV_RGB = tuple(255 * i for i in STDDEV_NORM)
MEDIAN_RGB = (128.0, 128.0, 128.0)
# Alias for convenience. PLEASE use `box_ops.horizontal_flip_boxes` directly.
horizontal_flip_boxes = box_ops.horizontal_flip_boxes
......
......@@ -198,12 +198,36 @@ def decode_jpeg(image_string: tf.Tensor, channels: int = 0) -> tf.Tensor:
dtype=tf.uint8)
def crop_image(frames: tf.Tensor,
target_height: int,
target_width: int,
random: bool = False,
num_crops: int = 1,
seed: Optional[int] = None) -> tf.Tensor:
def decode_image(image_string: tf.Tensor, channels: int = 0) -> tf.Tensor:
"""Decodes PNG or JPEG raw bytes string into a RGB uint8 Tensor.
Args:
image_string: A `tf.Tensor` of type strings with the raw PNG or JPEG bytes
where the first dimension is timesteps.
channels: Number of channels of the decoded image. Allowed values are 0, 1,
and 3. If 0, the number of channels is determined at runtime and no
static shape is set.
Returns:
A Tensor of shape [T, H, W, C] of type uint8 with the decoded images.
"""
return tf.map_fn(
lambda x: tf.image.decode_image( # pylint: disable=g-long-lambda
x, channels=channels, expand_animations=False),
image_string,
back_prop=False,
dtype=tf.uint8,
)
def crop_image(
frames: tf.Tensor,
target_height: int,
target_width: int,
random: bool = False,
num_crops: int = 1,
seed: Optional[int] = None,
) -> tf.Tensor:
"""Crops the image sequence of images.
If requested size is bigger than image size, image is padded with 0. If not
......
......@@ -96,6 +96,33 @@ class ParserUtilsTest(tf.test.TestCase):
self.assertEqual(decoded_image.shape.as_list()[3], 3)
self.assertAllEqual(decoded_image.shape, (2, 263, 320, 3))
def test_decode_image(self):
# Create a random RGB JPEG image.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
with io.BytesIO() as buffer:
random_image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
raw_image = tf.constant([raw_image_bytes, raw_image_bytes])
decoded_image = preprocess_ops_3d.decode_image(raw_image, 3)
self.assertEqual(decoded_image.shape.as_list()[3], 3)
self.assertAllEqual(decoded_image.shape, (2, 263, 320, 3))
# Create a random RGB PNG image.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
with io.BytesIO() as buffer:
random_image.save(buffer, format='PNG')
raw_image_bytes = buffer.getvalue()
raw_image = tf.constant([raw_image_bytes, raw_image_bytes])
decoded_image = preprocess_ops_3d.decode_image(raw_image, 3)
self.assertEqual(decoded_image.shape.as_list()[3], 3)
self.assertAllEqual(decoded_image.shape, (2, 263, 320, 3))
def test_crop_image(self):
cropped_image_1 = preprocess_ops_3d.crop_image(self._frames, 50, 70)
cropped_image_2 = preprocess_ops_3d.crop_image(self._frames, 200, 200)
......
......@@ -127,6 +127,9 @@ class RetinaNetTask(base_task.Task):
dtype=params.dtype,
match_threshold=params.parser.match_threshold,
unmatched_threshold=params.parser.unmatched_threshold,
box_coder_weights=(
self.task_config.model.detection_generator.box_coder_weights
),
aug_type=params.parser.aug_type,
aug_rand_hflip=params.parser.aug_rand_hflip,
aug_scale_min=params.parser.aug_scale_min,
......