Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Oneflow-Inc
OneFlow-Benchmark
提交
f51c43e2
O
OneFlow-Benchmark
项目概览
Oneflow-Inc
/
OneFlow-Benchmark
上一次同步 接近 3 年
通知
1
Star
92
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
OneFlow-Benchmark
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
f51c43e2
编写于
9月 09, 2021
作者:
X
XIE Xuan
提交者:
GitHub
9月 09, 2021
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #218 from Oneflow-Inc/dev_gpt_modify_parameter
change parallel_distribution to nd_sbp
上级
fea3b203
e449b191
变更
4
隐藏空白更改
内联
并排
Showing 4 changed files with 14 additions and 14 deletions
+14
-14
LanguageModeling/GPT/oneflow_gpt/data.py
LanguageModeling/GPT/oneflow_gpt/data.py
+1
-1
LanguageModeling/GPT/oneflow_gpt/distribute.py
LanguageModeling/GPT/oneflow_gpt/distribute.py
+6
-6
LanguageModeling/GPT/oneflow_gpt/model.py
LanguageModeling/GPT/oneflow_gpt/model.py
+6
-6
LanguageModeling/GPT/oneflow_gpt/util.py
LanguageModeling/GPT/oneflow_gpt/util.py
+1
-1
未找到文件。
LanguageModeling/GPT/oneflow_gpt/data.py
浏览文件 @
f51c43e2
...
...
@@ -34,7 +34,7 @@ class GPTDataLoader(object):
             random_seed=self.seed,
             split_sizes=self.split,
             split_index=0,
-            parallel_distribution=distribute.get_data_parallel_dist(),
+            nd_sbp=distribute.get_data_parallel_dist(),
             name=self.name,
         )
...
...
LanguageModeling/GPT/oneflow_gpt/distribute.py
浏览文件 @
f51c43e2
...
...
@@ -235,9 +235,9 @@ def forward_p2b_parallel_cast(x):
         # backward: B -> B, identity
         x = flow.hierarchical_parallel_cast(
             x,
-            parallel_distribution=parallel_dist,
+            nd_sbp=parallel_dist,
             grad_mode="manual",
-            grad_parallel_distribution=parallel_dist,
+            grad_nd_sbp=parallel_dist,
         )
     elif dist_util.is_data_parallel():
         # parallel cast: S(0) -> S(0), identity
...
...
@@ -265,9 +265,9 @@ def backward_p2b_parallel_cast(x):
         # backward: [S(0), P] cast to [S(0), B], for layernorm grad not supporting P, cast from P to B
         x = flow.hierarchical_parallel_cast(
             x,
-            parallel_distribution=parallel_dist,
+            nd_sbp=parallel_dist,
             grad_mode="manual",
-            grad_parallel_distribution=parallel_dist,
+            grad_nd_sbp=parallel_dist,
         )
     elif dist_util.is_data_parallel():
         # parallel cast: S(0) -> S(0), identity
...
...
@@ -288,7 +288,7 @@ def output_parallel_cast(x, device="gpu"):
     dist_util = get_dist_util()
     if dist_util.is_hybrid_parallel():
         with flow.scope.placement(device, dist_util.get_layer_placement(-1)):
-            x = flow.hierarchical_parallel_cast(x, parallel_distribution=["B"])
+            x = flow.hierarchical_parallel_cast(x, nd_sbp=["B"])
     return x
...
...
@@ -297,7 +297,7 @@ def input_data_parallel_cast(x):
     dist_util = get_dist_util()
     if dist_util.is_hybrid_parallel():
         x = flow.hierarchical_parallel_cast(
-            x, parallel_distribution=get_data_parallel_dist(),
+            x, nd_sbp=get_data_parallel_dist(),
         )
     return x
LanguageModeling/GPT/oneflow_gpt/model.py
浏览文件 @
f51c43e2
...
...
@@ -112,13 +112,13 @@ class Embedding(object):
             "wpe",
             shape=(self.seq_length, self.hidden_size),
             initializer=self.wpe_initializer,
-            parallel_distribution=distribute.get_wpe_parallel_dist(),
+            nd_sbp=distribute.get_wpe_parallel_dist(),
         )
         wte = flow.get_variable(
             "wte",
             shape=(self.vocab_size, self.hidden_size),
             initializer=self.wte_initializer,
-            parallel_distribution=distribute.get_wte_parallel_dist(),
+            nd_sbp=distribute.get_wte_parallel_dist(),
         )
         # 2d sbp sig: [B, S(0)] x [S(0), B] -> [S(0), P] -> [S(0), B]
...
...
@@ -569,7 +569,7 @@ def layernorm(
             trainable=True,
             model_name="beta",
             reuse=False,
-            parallel_distribution=params_parallel_dist,
+            nd_sbp=params_parallel_dist,
         )
         gamma = flow.get_variable(
...
...
@@ -580,7 +580,7 @@ def layernorm(
             trainable=True,
             model_name="gamma",
             reuse=False,
-            parallel_distribution=params_parallel_dist,
+            nd_sbp=params_parallel_dist,
         )
     return flow.nn.layer_norm(
...
...
@@ -604,14 +604,14 @@ def get_linear_params(
             shape=(input_size, output_size),
             dtype=dtype,
             initializer=weight_initializer,
-            parallel_distribution=weight_parallel_dist,
+            nd_sbp=weight_parallel_dist,
         )
         bias = flow.get_variable(
             name="bias",
             shape=(output_size,),
             dtype=dtype,
             initializer=bias_initializer,
-            parallel_distribution=bias_parallel_dist,
+            nd_sbp=bias_parallel_dist,
         )
     return weight, bias
...
...
LanguageModeling/GPT/oneflow_gpt/util.py
浏览文件 @
f51c43e2
...
...
@@ -145,7 +145,7 @@ class Metric(object):
             output = outputs[key].numpy()
             assert isinstance(output, np.ndarray)
             if micro_batches is None:
-                micro_batches = output.shape[0]
+                micro_batches = output.shape[0] if output.shape else 1
             else:
                 assert micro_batches == output.shape[0]
             self.kv_store_[key] += output.sum()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录