From 441e5ae0ee47d49926e8690a9708e1be5662b9d5 Mon Sep 17 00:00:00 2001
From: Liangzhe Yuan <lzyuan@google.com>
Date: Thu, 27 Jan 2022 17:12:06 -0800
Subject: [PATCH] #movinet Support 'none' squeeze and excitation layers in
 Movinet.

PiperOrigin-RevId: 424743840
---
 official/projects/movinet/modeling/movinet.py | 13 +++---
 .../movinet/modeling/movinet_layers.py        | 29 +++++++------
 .../movinet/modeling/movinet_layers_test.py   | 29 +++++++++++++
 .../projects/movinet/modeling/movinet_test.py | 43 +++++++++++++++++++
 4 files changed, 95 insertions(+), 19 deletions(-)

diff --git a/official/projects/movinet/modeling/movinet.py b/official/projects/movinet/modeling/movinet.py
index 2110f9d2f..f2e46059b 100644
--- a/official/projects/movinet/modeling/movinet.py
+++ b/official/projects/movinet/modeling/movinet.py
@@ -338,7 +338,7 @@ class Movinet(tf.keras.Model):
         3x3 followed by 5x1 conv). '3d_2plus1d' uses (2+1)D convolution with
         Conv3D and no 2D reshaping (e.g., a 5x3x3 kernel becomes 1x3x3 followed
         by 5x1x1 conv).
-      se_type: '3d', '2d', or '2plus3d'. '3d' uses the default 3D
+      se_type: '3d', '2d', '2plus3d' or 'none'. '3d' uses the default 3D
           spatiotemporal global average pooling for squeeze excitation. '2d'
           uses 2D spatial global average pooling  on each frame. '2plus3d'
           concatenates both 3D and 2D global average pooling.
@@ -369,7 +369,7 @@ class Movinet(tf.keras.Model):
 
     if conv_type not in ('3d', '2plus1d', '3d_2plus1d'):
       raise ValueError('Unknown conv type: {}'.format(conv_type))
-    if se_type not in ('3d', '2d', '2plus3d'):
+    if se_type not in ('3d', '2d', '2plus3d', 'none'):
       raise ValueError('Unknown squeeze excitation type: {}'.format(se_type))
 
     self._model_id = model_id
@@ -602,10 +602,11 @@ class Movinet(tf.keras.Model):
                 expand_filters,
             )
 
-          states[f'{prefix}_pool_buffer'] = (
-              input_shape[0], 1, 1, 1, expand_filters,
-          )
-          states[f'{prefix}_pool_frame_count'] = (1,)
+          if '3d' in self._se_type:
+            states[f'{prefix}_pool_buffer'] = (
+                input_shape[0], 1, 1, 1, expand_filters,
+            )
+            states[f'{prefix}_pool_frame_count'] = (1,)
 
           if use_positional_encoding:
             name = f'{prefix}_pos_enc_frame_count'
diff --git a/official/projects/movinet/modeling/movinet_layers.py b/official/projects/movinet/modeling/movinet_layers.py
index fcdfb1f45..360fee866 100644
--- a/official/projects/movinet/modeling/movinet_layers.py
+++ b/official/projects/movinet/modeling/movinet_layers.py
@@ -885,7 +885,8 @@ class MobileBottleneck(tf.keras.layers.Layer):
 
     x = self._expansion_layer(inputs)
     x, states = self._feature_layer(x, states=states)
-    x, states = self._attention_layer(x, states=states)
+    if self._attention_layer is not None:
+      x, states = self._attention_layer(x, states=states)
     x = self._projection_layer(x)
 
     # Add identity so that the ops are ordered as written. This is useful for,
@@ -1136,18 +1137,20 @@ class MovinetBlock(tf.keras.layers.Layer):
         batch_norm_momentum=self._batch_norm_momentum,
         batch_norm_epsilon=self._batch_norm_epsilon,
         name='projection')
-    self._attention = StreamSqueezeExcitation(
-        se_hidden_filters,
-        se_type=se_type,
-        activation=activation,
-        gating_activation=gating_activation,
-        causal=self._causal,
-        conv_type=conv_type,
-        use_positional_encoding=use_positional_encoding,
-        kernel_initializer=kernel_initializer,
-        kernel_regularizer=kernel_regularizer,
-        state_prefix=state_prefix,
-        name='se')
+    self._attention = None
+    if se_type != 'none':
+      self._attention = StreamSqueezeExcitation(
+          se_hidden_filters,
+          se_type=se_type,
+          activation=activation,
+          gating_activation=gating_activation,
+          causal=self._causal,
+          conv_type=conv_type,
+          use_positional_encoding=use_positional_encoding,
+          kernel_initializer=kernel_initializer,
+          kernel_regularizer=kernel_regularizer,
+          state_prefix=state_prefix,
+          name='se')
 
   def get_config(self):
     """Returns a dictionary containing the config used for initialization."""
diff --git a/official/projects/movinet/modeling/movinet_layers_test.py b/official/projects/movinet/modeling/movinet_layers_test.py
index cf661d8aa..9a2aba655 100644
--- a/official/projects/movinet/modeling/movinet_layers_test.py
+++ b/official/projects/movinet/modeling/movinet_layers_test.py
@@ -378,6 +378,35 @@ class MovinetLayersTest(parameterized.TestCase, tf.test.TestCase):
       self.assertEqual(predicted.shape, expected.shape)
       self.assertAllClose(predicted, expected)
 
+  def test_stream_movinet_block_none_se(self):
+    block = movinet_layers.MovinetBlock(
+        out_filters=3,
+        expand_filters=6,
+        kernel_size=(3, 3, 3),
+        strides=(1, 2, 2),
+        causal=True,
+        se_type='none',
+        state_prefix='test',
+    )
+
+    inputs = tf.range(4, dtype=tf.float32) + 1.
+    inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
+    inputs = tf.tile(inputs, [1, 1, 2, 1, 3])
+    expected, expected_states = block(inputs)
+
+    for num_splits in [1, 2, 4]:
+      frames = tf.split(inputs, inputs.shape[1] // num_splits, axis=1)
+      states = {}
+      predicted = []
+      for frame in frames:
+        x, states = block(frame, states=states)
+        predicted.append(x)
+      predicted = tf.concat(predicted, axis=1)
+
+      self.assertEqual(predicted.shape, expected.shape)
+      self.assertAllClose(predicted, expected)
+    self.assertAllEqual(list(expected_states.keys()), ['test_stream_buffer'])
+
   def test_stream_classifier_head(self):
     head = movinet_layers.Head(project_filters=5)
     classifier_head = movinet_layers.ClassifierHead(
diff --git a/official/projects/movinet/modeling/movinet_test.py b/official/projects/movinet/modeling/movinet_test.py
index 55003ab98..b54386c6b 100644
--- a/official/projects/movinet/modeling/movinet_test.py
+++ b/official/projects/movinet/modeling/movinet_test.py
@@ -99,6 +99,49 @@ class MoViNetTest(parameterized.TestCase, tf.test.TestCase):
     self.assertEqual(predicted.shape, expected.shape)
     self.assertAllClose(predicted, expected, 1e-5, 1e-5)
 
+  def test_movinet_stream_nse(self):
+    """Test if the backbone can be run in streaming mode w/o SE layer."""
+    tf.keras.backend.set_image_data_format('channels_last')
+
+    backbone = movinet.Movinet(
+        model_id='a0',
+        causal=True,
+        use_external_states=True,
+        se_type='none',
+    )
+    inputs = tf.ones([1, 5, 128, 128, 3])
+
+    init_states = backbone.init_states(tf.shape(inputs))
+    expected_endpoints, _ = backbone({**init_states, 'image': inputs})
+
+    frames = tf.split(inputs, inputs.shape[1], axis=1)
+
+    states = init_states
+    for frame in frames:
+      output, states = backbone({**states, 'image': frame})
+    predicted_endpoints = output
+
+    predicted = predicted_endpoints['head']
+
+    # The expected final output is simply the mean across frames
+    expected = expected_endpoints['head']
+    expected = tf.reduce_mean(expected, 1, keepdims=True)
+
+    self.assertEqual(predicted.shape, expected.shape)
+    self.assertAllClose(predicted, expected, 1e-5, 1e-5)
+
+    # Check contents in the states dictionary.
+    state_keys = list(init_states.keys())
+    self.assertIn('state_head_pool_buffer', state_keys)
+    self.assertIn('state_head_pool_frame_count', state_keys)
+    state_keys.remove('state_head_pool_buffer')
+    state_keys.remove('state_head_pool_frame_count')
+    # From now on, there are only 'stream_buffer' for the convolutions.
+    for state_key in state_keys:
+      self.assertIn(
+          'stream_buffer', state_key,
+          msg=f'Expecting stream_buffer only, found {state_key}')
+
   def test_movinet_2plus1d_stream(self):
     tf.keras.backend.set_image_data_format('channels_last')
 
-- 
GitLab