Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
曾经的那一瞬间
Models
提交
d1a5cdac
M
Models
项目概览
曾经的那一瞬间
/
Models
11 个月 前同步成功
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
Models
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
d1a5cdac
编写于
11月 17, 2022
作者:
A
A. Unique TensorFlower
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Internal change
PiperOrigin-RevId: 489312010
上级
6f4e62ff
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
38 addition
and
23 deletion
+38
-23
official/nlp/modeling/layers/moe.py
official/nlp/modeling/layers/moe.py
+30
-15
official/nlp/modeling/layers/moe_test.py
official/nlp/modeling/layers/moe_test.py
+7
-7
official/nlp/modeling/networks/sparse_mixer.py
official/nlp/modeling/networks/sparse_mixer.py
+1
-1
未找到文件。
official/nlp/modeling/layers/moe.py
浏览文件 @
d1a5cdac
...
...
@@ -328,9 +328,9 @@ class FeedForward(tf.keras.layers.Layer):
self
,
d_ff
:
int
,
*
,
dropout_rate
:
float
=
0.1
,
activation
:
Callable
[[
tf
.
Tensor
]
,
tf
.
Tensor
]
=
tf
.
keras
.
activations
.
gelu
,
inner_dropout
:
float
=
0.0
,
output_dropout
:
float
=
0.0
,
activation
:
Callable
[[
tf
.
Tensor
],
tf
.
Tensor
]
=
tf
.
keras
.
activations
.
gelu
,
kernel_initializer
:
_InitializerType
=
_DEFAULT_KERNEL_INITIALIZER
,
bias_initializer
:
_InitializerType
=
_DEFAULT_BIAS_INITIALIZER
,
name
:
str
=
"feed_forward"
,
...
...
@@ -339,7 +339,9 @@ class FeedForward(tf.keras.layers.Layer):
Args:
d_ff: Dimension of feed-forward layer.
dropout_rate: The dropout probability.
inner_dropout: The dropout probability to be applied after intermediate
activations.
output_dropout: The dropout probability to be applied after output layer.
activation: (Nonlinear) transform applied in layer.
kernel_initializer: Initialization scheme for kernel.
bias_initializer: Initialization scheme for bias.
...
...
@@ -356,7 +358,9 @@ class FeedForward(tf.keras.layers.Layer):
kernel_initializer
=
tf_utils
.
clone_initializer
(
self
.
kernel_initializer
),
bias_initializer
=
tf_utils
.
clone_initializer
(
self
.
bias_initializer
),
name
=
"intermediate"
)
self
.
dropout_layer
=
tf
.
keras
.
layers
.
Dropout
(
dropout_rate
)
self
.
inner_dropout_layer
=
tf
.
keras
.
layers
.
Dropout
(
inner_dropout
)
self
.
output_dropout_layer
=
tf
.
keras
.
layers
.
Dropout
(
output_dropout
)
def
build
(
self
,
input_shape
:
Tuple
[
int
,
int
,
int
]):
"""Creates the input shape dependent output weight variables."""
...
...
@@ -383,8 +387,9 @@ class FeedForward(tf.keras.layers.Layer):
"""
x
=
self
.
intermediate_layer
(
inputs
)
x
=
self
.
activation
(
x
)
x
=
self
.
inner_dropout_layer
(
x
,
training
=
training
)
x
=
self
.
output_layer
(
x
)
x
=
self
.
dropout_layer
(
x
,
training
=
training
)
x
=
self
.
output_
dropout_layer
(
x
,
training
=
training
)
return
x
...
...
@@ -406,9 +411,9 @@ class FeedForwardExperts(tf.keras.layers.Layer):
num_experts
:
int
,
d_ff
:
int
,
*
,
dropout_rate
:
float
=
0.1
,
activation
:
Callable
[[
tf
.
Tensor
]
,
tf
.
Tensor
]
=
tf
.
keras
.
activations
.
gelu
,
inner_dropout
:
float
=
0.0
,
output_dropout
:
float
=
0.0
,
activation
:
Callable
[[
tf
.
Tensor
],
tf
.
Tensor
]
=
tf
.
keras
.
activations
.
gelu
,
kernel_initializer
:
_InitializerType
=
_DEFAULT_KERNEL_INITIALIZER
,
bias_initializer
:
_InitializerType
=
_DEFAULT_BIAS_INITIALIZER
,
name
:
str
=
"experts"
,
...
...
@@ -419,7 +424,9 @@ class FeedForwardExperts(tf.keras.layers.Layer):
num_experts: Number of experts (i.e. number of independent feed-forward
blocks).
d_ff: Dimension of feed-forward layer of each expert.
dropout_rate: The dropout probability (expert_dropout_rate).
inner_dropout: The dropout probability to be applied after intermediate
activations.
output_dropout: The dropout probability to be applied after output layer.
activation: (Nonlinear) transform applied in layer.
kernel_initializer: Initialization scheme for kernel.
bias_initializer: Initialization scheme for bias.
...
...
@@ -439,7 +446,9 @@ class FeedForwardExperts(tf.keras.layers.Layer):
kernel_initializer
=
tf_utils
.
clone_initializer
(
self
.
kernel_initializer
),
bias_initializer
=
tf_utils
.
clone_initializer
(
self
.
bias_initializer
),
name
=
"intermediate"
)
self
.
dropout_layer
=
tf
.
keras
.
layers
.
Dropout
(
dropout_rate
)
self
.
inner_dropout_layer
=
tf
.
keras
.
layers
.
Dropout
(
inner_dropout
)
self
.
output_dropout_layer
=
tf
.
keras
.
layers
.
Dropout
(
output_dropout
)
def
build
(
self
,
input_shape
:
Tuple
[
int
,
int
,
int
,
int
]):
"""Creates the input shape dependent output weight variables."""
...
...
@@ -473,8 +482,9 @@ class FeedForwardExperts(tf.keras.layers.Layer):
"""
x
=
self
.
intermediate_layer
(
inputs
)
x
=
self
.
activation
(
x
)
x
=
self
.
inner_dropout_layer
(
x
,
training
=
training
)
x
=
self
.
output_layer
(
x
)
x
=
self
.
dropout_layer
(
x
,
training
=
training
)
x
=
self
.
output_
dropout_layer
(
x
,
training
=
training
)
return
x
...
...
@@ -709,7 +719,8 @@ class MoeLayerWithBackbone(tf.keras.layers.Layer):
moe
:
MoeLayer
,
backbone_d_ff
:
int
,
*
,
dropout_rate
:
float
=
0.1
,
inner_dropout
:
float
=
0.0
,
output_dropout
:
float
=
0.0
,
activation
:
Callable
[[
tf
.
Tensor
],
tf
.
Tensor
]
=
tf
.
keras
.
activations
.
gelu
,
kernel_initializer
:
_InitializerType
=
_DEFAULT_KERNEL_INITIALIZER
,
...
...
@@ -722,7 +733,10 @@ class MoeLayerWithBackbone(tf.keras.layers.Layer):
moe: Instance of MoeLayer with experts and router.
backbone_d_ff: Dimension of feed-forward layer of a lightweight backbone,
which is evaluated for all tokens.
dropout_rate: Dropout rate for the backbone.
inner_dropout: The dropout probability to be applied after intermediate
activations for the backbone.
output_dropout: The dropout probability to be applied after the output
of the backbone.
activation: (Nonlinear) transform applied in the backbone.
kernel_initializer: Initialization scheme for kernels in the backbone.
bias_initializer: Initialization scheme for biases in the backbone.
...
...
@@ -734,7 +748,8 @@ class MoeLayerWithBackbone(tf.keras.layers.Layer):
self
.
_backbone
=
FeedForward
(
backbone_d_ff
,
dropout_rate
=
dropout_rate
,
inner_dropout
=
inner_dropout
,
output_dropout
=
output_dropout
,
activation
=
activation
,
kernel_initializer
=
tf_utils
.
clone_initializer
(
kernel_initializer
),
bias_initializer
=
tf_utils
.
clone_initializer
(
bias_initializer
),
...
...
official/nlp/modeling/layers/moe_test.py
浏览文件 @
d1a5cdac
...
...
@@ -24,7 +24,7 @@ def small_config():
"""Creates a small model config that can be used by all tests."""
config
=
{}
config
[
'd_ff'
]
=
32
config
[
'
dropout_rate
'
]
=
0.1
config
[
'
output_dropout
'
]
=
0.1
config
[
'num_experts'
]
=
2
config
[
'expert_d_ff'
]
=
33
...
...
@@ -131,7 +131,7 @@ class MoeTest(tf.test.TestCase):
def
test_feed_forward_shape_and_vars
(
self
):
config
=
small_config
()
layer
=
moe
.
FeedForward
(
d_ff
=
config
[
'd_ff'
],
dropout_rate
=
config
[
'dropout_rate
'
])
d_ff
=
config
[
'd_ff'
],
output_dropout
=
config
[
'output_dropout
'
])
inputs
=
make_input_ones
()
outputs
=
layer
(
inputs
)
self
.
assertAllEqual
(
tf
.
shape
(
inputs
),
tf
.
shape
(
outputs
))
...
...
@@ -146,7 +146,7 @@ class MoeTest(tf.test.TestCase):
config
=
small_config
()
layer
=
moe
.
FeedForward
(
d_ff
=
config
[
'd_ff'
],
dropout_rate
=
config
[
'dropout_rate
'
],
output_dropout
=
config
[
'output_dropout
'
],
activation
=
tf
.
keras
.
activations
.
relu
,
kernel_initializer
=
tf
.
keras
.
initializers
.
get
(
'ones'
),
bias_initializer
=
tf
.
keras
.
initializers
.
get
(
'ones'
))
...
...
@@ -161,7 +161,7 @@ class MoeTest(tf.test.TestCase):
layer
=
moe
.
FeedForwardExperts
(
num_experts
=
config
[
'num_experts'
],
d_ff
=
config
[
'expert_d_ff'
],
dropout_rate
=
config
[
'expert_dropout_rate'
])
output_dropout
=
config
[
'expert_dropout_rate'
])
inputs
=
make_experts_input_ones
()
outputs
=
layer
(
inputs
)
self
.
assertAllEqual
(
tf
.
shape
(
inputs
),
tf
.
shape
(
outputs
))
...
...
@@ -176,7 +176,7 @@ class MoeTest(tf.test.TestCase):
layer
=
moe
.
FeedForwardExperts
(
num_experts
=
1
,
d_ff
=
config
[
'expert_d_ff'
],
dropout_rate
=
config
[
'expert_dropout_rate'
],
output_dropout
=
config
[
'expert_dropout_rate'
],
activation
=
tf
.
keras
.
activations
.
relu
,
kernel_initializer
=
tf
.
keras
.
initializers
.
get
(
'ones'
),
bias_initializer
=
tf
.
keras
.
initializers
.
get
(
'ones'
))
...
...
@@ -191,7 +191,7 @@ class MoeTest(tf.test.TestCase):
experts
=
moe
.
FeedForwardExperts
(
num_experts
=
config
[
'num_experts'
],
d_ff
=
config
[
'expert_d_ff'
],
dropout_rate
=
config
[
'expert_dropout_rate'
])
output_dropout
=
config
[
'expert_dropout_rate'
])
router
=
moe
.
ExpertsChooseMaskedRouter
(
config
[
'num_experts'
],
jitter_noise
=
config
[
'jitter_noise'
])
moe_layer
=
moe
.
MoeLayer
(
...
...
@@ -233,7 +233,7 @@ class MoeTest(tf.test.TestCase):
experts
=
moe
.
FeedForwardExperts
(
num_experts
=
config
[
'num_experts'
],
d_ff
=
config
[
'expert_d_ff'
],
dropout_rate
=
config
[
'expert_dropout_rate'
])
output_dropout
=
config
[
'expert_dropout_rate'
])
router
=
moe
.
ExpertsChooseMaskedRouter
(
config
[
'num_experts'
],
jitter_noise
=
config
[
'jitter_noise'
])
moe_layer
=
moe
.
MoeLayer
(
...
...
official/nlp/modeling/networks/sparse_mixer.py
浏览文件 @
d1a5cdac
...
...
@@ -233,7 +233,7 @@ class SparseMixer(tf.keras.layers.Layer):
experts
=
layers
.
FeedForwardExperts
(
num_experts
=
num_experts
,
d_ff
=
hidden_size
,
dropout_rate
=
output_dropout
,
output_dropout
=
output_dropout
,
activation
=
inner_activation
,
kernel_initializer
=
tf_utils
.
clone_initializer
(
initializer
),
name
=
'experts'
),
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录