diff --git a/official/nlp/modeling/layers/transformer_scaffold.py b/official/nlp/modeling/layers/transformer_scaffold.py
index 8ce8bcf8003777fa92fa6a8793f77254f55ae4dd..6b46a4b8123c24495b888f0cd3245c50615c4aec 100644
--- a/official/nlp/modeling/layers/transformer_scaffold.py
+++ b/official/nlp/modeling/layers/transformer_scaffold.py
@@ -335,7 +335,9 @@ class TransformerScaffold(tf.keras.layers.Layer):
                                                training=training)
         layer_output += source_attention_output
       else:
-        # if not norm_first, assume that the feedforwad does apply layer norm
+        # Attention: if not norm_first, assume that the feedforward block
+        # applies the layer norm and the residual connection. Please see
+        # `GatedFeedforward` for a concrete example.
         layer_output = self._feedforward_block(attention_output,
                                                training=training)
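
The comment above documents a contract: when norm_first is False, TransformerScaffold does not add the residual connection or layer norm around the feedforward block itself, so a custom feedforward block must do both internally (as `GatedFeedforward` does). Below is a minimal sketch, not part of this patch, of a block that satisfies that contract; the class name `ResidualNormFeedforward` and its constructor arguments are hypothetical and only illustrate the expected call signature.

import tensorflow as tf


class ResidualNormFeedforward(tf.keras.layers.Layer):
  """Hypothetical feedforward block that applies residual + layer norm itself."""

  def __init__(self, hidden_size=768, intermediate_size=3072, **kwargs):
    super().__init__(**kwargs)
    self._intermediate = tf.keras.layers.Dense(
        intermediate_size, activation="gelu")
    self._output = tf.keras.layers.Dense(hidden_size)
    self._layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12)

  def call(self, inputs, training=None):
    x = self._intermediate(inputs)
    x = self._output(x)
    # Residual connection followed by layer norm, mirroring what the updated
    # TransformerScaffold comment assumes when norm_first is False.
    return self._layer_norm(x + inputs)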