From d1a5cdac08dc9283927c1cccc4db9d1c331ecefd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 17 Nov 2022 14:41:27 -0800
Subject: [PATCH] Internal change

PiperOrigin-RevId: 489312010
---
 official/nlp/modeling/layers/moe.py       | 45 ++++++++++++-------
 official/nlp/modeling/layers/moe_test.py  | 14 +++---
 .../nlp/modeling/networks/sparse_mixer.py |  2 +-
 3 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/official/nlp/modeling/layers/moe.py b/official/nlp/modeling/layers/moe.py
index 06dcbbaee..e1e30dbad 100644
--- a/official/nlp/modeling/layers/moe.py
+++ b/official/nlp/modeling/layers/moe.py
@@ -328,9 +328,9 @@ class FeedForward(tf.keras.layers.Layer):
       self,
       d_ff: int,
       *,
-      dropout_rate: float = 0.1,
-      activation: Callable[[tf.Tensor],
-                           tf.Tensor] = tf.keras.activations.gelu,
+      inner_dropout: float = 0.0,
+      output_dropout: float = 0.0,
+      activation: Callable[[tf.Tensor], tf.Tensor] = tf.keras.activations.gelu,
       kernel_initializer: _InitializerType = _DEFAULT_KERNEL_INITIALIZER,
       bias_initializer: _InitializerType = _DEFAULT_BIAS_INITIALIZER,
       name: str = "feed_forward",
@@ -339,7 +339,9 @@ class FeedForward(tf.keras.layers.Layer):

     Args:
       d_ff: Dimension of feed-forward layer.
-      dropout_rate: The dropout probability.
+      inner_dropout: The dropout probability to be applied after intermediate
+        activations.
+      output_dropout: The dropout probability to be applied after output layer.
       activation: (Nonlinear) transform applied in layer.
       kernel_initializer: Initialization scheme for kernel.
       bias_initializer: Initialization scheme for bias.
@@ -356,7 +358,9 @@ class FeedForward(tf.keras.layers.Layer):
         kernel_initializer=tf_utils.clone_initializer(self.kernel_initializer),
         bias_initializer=tf_utils.clone_initializer(self.bias_initializer),
         name="intermediate")
-    self.dropout_layer = tf.keras.layers.Dropout(dropout_rate)
+    self.inner_dropout_layer = tf.keras.layers.Dropout(
+        inner_dropout)
+    self.output_dropout_layer = tf.keras.layers.Dropout(output_dropout)

   def build(self, input_shape: Tuple[int, int, int]):
     """Creates the input shape dependent output weight variables."""
@@ -383,8 +387,9 @@ class FeedForward(tf.keras.layers.Layer):
     """
     x = self.intermediate_layer(inputs)
     x = self.activation(x)
+    x = self.inner_dropout_layer(x, training=training)
     x = self.output_layer(x)
-    x = self.dropout_layer(x, training=training)
+    x = self.output_dropout_layer(x, training=training)
     return x


@@ -406,9 +411,9 @@ class FeedForwardExperts(tf.keras.layers.Layer):
       num_experts: int,
       d_ff: int,
       *,
-      dropout_rate: float = 0.1,
-      activation: Callable[[tf.Tensor],
-                           tf.Tensor] = tf.keras.activations.gelu,
+      inner_dropout: float = 0.0,
+      output_dropout: float = 0.0,
+      activation: Callable[[tf.Tensor], tf.Tensor] = tf.keras.activations.gelu,
       kernel_initializer: _InitializerType = _DEFAULT_KERNEL_INITIALIZER,
       bias_initializer: _InitializerType = _DEFAULT_BIAS_INITIALIZER,
       name: str = "experts",
@@ -419,7 +424,9 @@ class FeedForwardExperts(tf.keras.layers.Layer):
       num_experts: Number of experts (i.e. number of independent feed-forward
        blocks).
       d_ff: Dimension of feed-forward layer of each expert.
-      dropout_rate: The dropout probability (expert_dropout_rate).
+      inner_dropout: The dropout probability to be applied after intermediate
+        activations.
+      output_dropout: The dropout probability to be applied after output layer.
       activation: (Nonlinear) transform applied in layer.
       kernel_initializer: Initialization scheme for kernel.
       bias_initializer: Initialization scheme for bias.
@@ -439,7 +446,9 @@ class FeedForwardExperts(tf.keras.layers.Layer):
         kernel_initializer=tf_utils.clone_initializer(self.kernel_initializer),
         bias_initializer=tf_utils.clone_initializer(self.bias_initializer),
         name="intermediate")
-    self.dropout_layer = tf.keras.layers.Dropout(dropout_rate)
+    self.inner_dropout_layer = tf.keras.layers.Dropout(
+        inner_dropout)
+    self.output_dropout_layer = tf.keras.layers.Dropout(output_dropout)

   def build(self, input_shape: Tuple[int, int, int, int]):
     """Creates the input shape dependent output weight variables."""
@@ -473,8 +482,9 @@ class FeedForwardExperts(tf.keras.layers.Layer):
     """
     x = self.intermediate_layer(inputs)
     x = self.activation(x)
+    x = self.inner_dropout_layer(x, training=training)
     x = self.output_layer(x)
-    x = self.dropout_layer(x, training=training)
+    x = self.output_dropout_layer(x, training=training)
     return x


@@ -709,7 +719,8 @@ class MoeLayerWithBackbone(tf.keras.layers.Layer):
       moe: MoeLayer,
       backbone_d_ff: int,
       *,
-      dropout_rate: float = 0.1,
+      inner_dropout: float = 0.0,
+      output_dropout: float = 0.0,
       activation: Callable[[tf.Tensor],
                            tf.Tensor] = tf.keras.activations.gelu,
       kernel_initializer: _InitializerType = _DEFAULT_KERNEL_INITIALIZER,
@@ -722,7 +733,10 @@ class MoeLayerWithBackbone(tf.keras.layers.Layer):
       moe: Instance of MoeLayer with experts and router.
       backbone_d_ff: Dimension of feed-forward layer of a lightweight
        backbone, which is evaluated for all tokens.
-      dropout_rate: Dropout rate for the backbone.
+      inner_dropout: The dropout probability to be applied after intermediate
+        activations for the backbone.
+      output_dropout: The dropout probability to be applied after the output
+        of the backbone.
       activation: (Nonlinear) transform applied in the backbone.
       kernel_initializer: Initialization scheme for kernels in the backbone.
       bias_initializer: Initialization scheme for biases in the backbone.
@@ -734,7 +748,8 @@ class MoeLayerWithBackbone(tf.keras.layers.Layer):

     self._backbone = FeedForward(
         backbone_d_ff,
-        dropout_rate=dropout_rate,
+        inner_dropout=inner_dropout,
+        output_dropout=output_dropout,
         activation=activation,
         kernel_initializer=tf_utils.clone_initializer(kernel_initializer),
         bias_initializer=tf_utils.clone_initializer(bias_initializer),
diff --git a/official/nlp/modeling/layers/moe_test.py b/official/nlp/modeling/layers/moe_test.py
index 159076773..af05c3656 100644
--- a/official/nlp/modeling/layers/moe_test.py
+++ b/official/nlp/modeling/layers/moe_test.py
@@ -24,7 +24,7 @@ def small_config():
   """Creates a small model config that can be used by all tests."""
   config = {}
   config['d_ff'] = 32
-  config['dropout_rate'] = 0.1
+  config['output_dropout'] = 0.1
   config['num_experts'] = 2
   config['expert_d_ff'] = 33
   config['expert_dropout_rate'] = 0.1
@@ -131,7 +131,7 @@ class MoeTest(tf.test.TestCase):
   def test_feed_forward_shape_and_vars(self):
     config = small_config()
     layer = moe.FeedForward(
-        d_ff=config['d_ff'], dropout_rate=config['dropout_rate'])
+        d_ff=config['d_ff'], output_dropout=config['output_dropout'])
     inputs = make_input_ones()
     outputs = layer(inputs)
     self.assertAllEqual(tf.shape(inputs), tf.shape(outputs))
@@ -146,7 +146,7 @@ class MoeTest(tf.test.TestCase):
     config = small_config()
     layer = moe.FeedForward(
         d_ff=config['d_ff'],
-        dropout_rate=config['dropout_rate'],
+        output_dropout=config['output_dropout'],
         activation=tf.keras.activations.relu,
         kernel_initializer=tf.keras.initializers.get('ones'),
         bias_initializer=tf.keras.initializers.get('ones'))
@@ -161,7 +161,7 @@ class MoeTest(tf.test.TestCase):
     layer = moe.FeedForwardExperts(
         num_experts=config['num_experts'],
         d_ff=config['expert_d_ff'],
-        dropout_rate=config['expert_dropout_rate'])
+        output_dropout=config['expert_dropout_rate'])
     inputs = make_experts_input_ones()
     outputs = layer(inputs)
     self.assertAllEqual(tf.shape(inputs), tf.shape(outputs))
@@ -176,7 +176,7 @@ class MoeTest(tf.test.TestCase):
     layer = moe.FeedForwardExperts(
         num_experts=1,
         d_ff=config['expert_d_ff'],
-        dropout_rate=config['expert_dropout_rate'],
+        output_dropout=config['expert_dropout_rate'],
         activation=tf.keras.activations.relu,
         kernel_initializer=tf.keras.initializers.get('ones'),
         bias_initializer=tf.keras.initializers.get('ones'))
@@ -191,7 +191,7 @@ class MoeTest(tf.test.TestCase):
     experts = moe.FeedForwardExperts(
         num_experts=config['num_experts'],
         d_ff=config['expert_d_ff'],
-        dropout_rate=config['expert_dropout_rate'])
+        output_dropout=config['expert_dropout_rate'])
     router = moe.ExpertsChooseMaskedRouter(
         config['num_experts'], jitter_noise=config['jitter_noise'])
     moe_layer = moe.MoeLayer(
@@ -233,7 +233,7 @@ class MoeTest(tf.test.TestCase):
     experts = moe.FeedForwardExperts(
         num_experts=config['num_experts'],
         d_ff=config['expert_d_ff'],
-        dropout_rate=config['expert_dropout_rate'])
+        output_dropout=config['expert_dropout_rate'])
     router = moe.ExpertsChooseMaskedRouter(
         config['num_experts'], jitter_noise=config['jitter_noise'])
     moe_layer = moe.MoeLayer(
diff --git a/official/nlp/modeling/networks/sparse_mixer.py b/official/nlp/modeling/networks/sparse_mixer.py
index c69e7940e..1362aff78 100644
--- a/official/nlp/modeling/networks/sparse_mixer.py
+++ b/official/nlp/modeling/networks/sparse_mixer.py
@@ -233,7 +233,7 @@ class SparseMixer(tf.keras.layers.Layer):
           experts=layers.FeedForwardExperts(
               num_experts=num_experts,
               d_ff=hidden_size,
-              dropout_rate=output_dropout,
+              output_dropout=output_dropout,
               activation=inner_activation,
               kernel_initializer=tf_utils.clone_initializer(initializer),
               name='experts'),
-- 
GitLab
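
For context: this patch splits the single `dropout_rate` argument on `FeedForward`, `FeedForwardExperts`, and `MoeLayerWithBackbone` into `inner_dropout` (applied after the intermediate activation) and `output_dropout` (applied after the output projection), and lowers both defaults from 0.1 to 0.0. Callers that previously passed `dropout_rate` must now pass `output_dropout`, which is exactly how the updated tests and `SparseMixer` are migrated. A minimal usage sketch of the layers after this change; the shapes and dropout rates below are illustrative, and it assumes the Model Garden `official` package is on the Python path (the import mirrors moe_test.py):

    import tensorflow as tf

    from official.nlp.modeling.layers import moe

    # Dense feed-forward block. Dropout is now applied twice: once after the
    # intermediate activation (inner_dropout) and once after the output
    # projection (output_dropout).
    ffn = moe.FeedForward(d_ff=32, inner_dropout=0.1, output_dropout=0.1)
    x = tf.ones((2, 8, 16))    # [batch, seq_len, hidden_dim], illustrative
    y = ffn(x, training=True)  # dropout layers are active only in training
    assert y.shape == x.shape  # output projection restores the input shape

    # Per-expert variant: one independent feed-forward block per expert, with
    # the same split dropout arguments. Inputs are 4-D:
    # [num_groups, num_experts, expert_capacity, hidden_dim].
    experts = moe.FeedForwardExperts(
        num_experts=2, d_ff=33, inner_dropout=0.1, output_dropout=0.1)
    xe = tf.ones((2, 2, 4, 16))
    ye = experts(xe, training=True)
    assert ye.shape == xe.shape

Note that with the new 0.0 defaults, a `FeedForward()` or `FeedForwardExperts()` constructed without explicit dropout arguments no longer applies any dropout, whereas it previously applied 0.1 after the output layer.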