diff --git a/official/nlp/modeling/layers/transformer_scaffold.py b/official/nlp/modeling/layers/transformer_scaffold.py
index 8ce8bcf8003777fa92fa6a8793f77254f55ae4dd..6b46a4b8123c24495b888f0cd3245c50615c4aec 100644
--- a/official/nlp/modeling/layers/transformer_scaffold.py
+++ b/official/nlp/modeling/layers/transformer_scaffold.py
@@ -335,7 +335,9 @@ class TransformerScaffold(tf.keras.layers.Layer):
                                                training=training)
         layer_output += source_attention_output
       else:
-        # if not norm_first, assume that the feedforwad does apply layer norm
+        # Attention: if not norm_first, assume that the feedforward block
+        # applies the layer norm and the residual connection. Please see
+        # `GatedFeedforward` for a concrete example.
         layer_output = self._feedforward_block(attention_output,
                                                training=training)
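
The comment above documents a contract: when norm_first is False, TransformerScaffold does not add the residual connection or layer norm around the feedforward block itself, so a custom feedforward block must do both internally (as `GatedFeedforward` does). Below is a minimal sketch, not part of this patch, of a block that satisfies that contract; the class name `ResidualNormFeedforward` and its constructor arguments are hypothetical and only illustrate the expected call signature.

import tensorflow as tf


class ResidualNormFeedforward(tf.keras.layers.Layer):
  """Hypothetical feedforward block that applies residual + layer norm itself."""

  def __init__(self, hidden_size=768, intermediate_size=3072, **kwargs):
    super().__init__(**kwargs)
    self._intermediate = tf.keras.layers.Dense(
        intermediate_size, activation="gelu")
    self._output = tf.keras.layers.Dense(hidden_size)
    self._layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12)

  def call(self, inputs, training=None):
    x = self._intermediate(inputs)
    x = self._output(x)
    # Residual connection followed by layer norm, mirroring what the updated
    # TransformerScaffold comment assumes when norm_first is False.
    return self._layer_norm(x + inputs)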