Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
94c17a0f
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
94c17a0f
编写于
8月 09, 2022
作者:
C
caozhou
提交者:
GitHub
8月 09, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Auto Parallel] Add mul dist op cost (#44973)
* add mul dist op cost * add mul unittest
上级
2c77b575
变更
4
展开全部
显示空白变更内容
内联
并排
Showing
4 changed files
with
763 additions
and
2 deletions
+763
-2
python/paddle/distributed/auto_parallel/operators/dist_matmul.py
...paddle/distributed/auto_parallel/operators/dist_matmul.py
+537
-1
python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py
.../fluid/tests/unittests/auto_parallel/test_dist_op_cost.py
+209
-0
python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py
...luid/tests/unittests/test_auto_parallel_reshard_dpmppp.py
+8
-0
python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
.../fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
+9
-1
未找到文件。
python/paddle/distributed/auto_parallel/operators/dist_matmul.py
浏览文件 @
94c17a0f
此差异已折叠。
点击以展开。
python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py
浏览文件 @
94c17a0f
...
...
@@ -215,6 +215,215 @@ class TestDistOpCost(unittest.TestCase):
dist_context
,
cluster
)
self
.
assertTrue
(
dist_op_cost
)
def test_dist_op_cost_part3(self):
    """Check that every op of a sharded matmul program yields a truthy cost.

    A static program is built (embedding -> transpose -> three
    ``paddle.matmul`` calls -> transpose -> reshapes -> softmax ->
    squared-error loss), handed to ``parallelizer``, and then
    ``calc_cost`` is invoked on the distributed implementation selected
    for each op in the resulting main program.
    """

    def make_program():
        """Build and return ``(main_program, start_program, loss)``."""
        main_program = paddle.static.Program()
        start_program = paddle.static.Program()

        def shard(tensor, dims_mapping):
            # Every tensor in this test is annotated on the same
            # two-process mesh; only the dims_mapping differs.
            auto.shard_tensor(tensor,
                              dist_attr={
                                  "process_mesh":
                                  auto.ProcessMesh([0, 1]),
                                  "dims_mapping": dims_mapping
                              })

        with paddle.static.program_guard(main_program, start_program):
            x = paddle.static.data(name='x', shape=[4], dtype='float32')
            x.stop_gradient = True
            label = paddle.static.data(name="label",
                                       shape=[8, 1],
                                       dtype='float32')
            label.stop_gradient = True
            shard(x, [0])
            shard(label, [0, -1])

            # embedding
            tmp = paddle.fluid.layers.fill_constant_batch_size_like(
                input=x, shape=[4], value=1, dtype='int32')
            embedding = paddle.nn.Embedding(10, 8)
            out = embedding(tmp)
            # row parallel embedding: annotate the weight of the
            # lookup_table_v2 op that the Embedding layer appended.
            for op in main_program.global_block().ops:
                if op.type == "lookup_table_v2":
                    W = main_program.global_block().vars[op.input("W")[0]]
                    shard(W, [0, -1])
            out = paddle.fluid.layers.transpose(out,
                                                [1, 0])  # [8, 2] [-1, 0]

            # matmul_v2
            param1 = paddle.fluid.layers.create_parameter(
                [4, 8], paddle.float32)  # [2, 8] [0, -1]
            shard(param1, [0, -1])
            param2 = paddle.fluid.layers.create_parameter(
                [8, 8], paddle.float32)  # [8, 4] [-1, 0]
            shard(param2, [-1, 0])
            out1 = paddle.matmul(out, param1)  # [8, 8] [-1, -1]
            tmp_param = paddle.fluid.layers.create_parameter(
                [8, 8], paddle.float32)  # [8, 8] [-1, -1]
            # Annotate tmp_param here. The previous code annotated param2
            # a second time with [-1, -1], which conflicted with its
            # [-1, 0] annotation above and left tmp_param unannotated.
            shard(tmp_param, [-1, -1])
            tmp_out = paddle.matmul(out1, tmp_param)
            out2 = paddle.matmul(tmp_out, param2)  # [8, 4] [-1, 0]

            out8 = paddle.fluid.layers.transpose(out2,
                                                 [1, 0])  # [4, 8] [0, -1]

            # reshape
            out9 = paddle.reshape(out8, [8, 2, 4])  # [4, 2, 4] [0, -1, -1]
            tmp_reshape_out = paddle.reshape(out9, [8, 4, 2])
            out10 = paddle.reshape(tmp_reshape_out,
                                   [8, 8])  # [4, 8] [0, -1]

            # softmax
            softmax = paddle.nn.Softmax()
            out11 = softmax(out10)
            error_cost = paddle.nn.functional.square_error_cost(
                out11, label)
            loss = paddle.mean(error_cost)
            return main_program, start_program, loss

    # NOTE(review): the second argument is presumably the rank id --
    # confirm against the definition of `parallelizer`.
    main_program, dist_context = parallelizer(make_program, 0)
    ops = main_program.global_block().ops
    cluster = Cluster()
    cluster.gen_default_config_cluster(device_count=2)
    for op in ops:
        dist_op = dist_context.get_dist_op_for_program(op)
        op_dist_attr = dist_op.dist_attr
        # Elementwise ops are looked up under the "elementwise" container
        # rather than under their own impl_type.
        if is_elementwise_op(op.type):
            container = get_distributed_operator_impl_container(
                "elementwise")
        else:
            container = get_distributed_operator_impl_container(
                op_dist_attr.impl_type)

        dist_impl = container.impls[op_dist_attr.impl_idx]
        dist_op_cost = dist_impl.calc_cost(op.attr('op_role'), dist_op,
                                           dist_context, cluster)
        self.assertTrue(dist_op_cost)
def test_dist_op_cost_part4(self):
    """Check that every op of a sharded ``mul`` program yields a truthy cost.

    A static program is built (embedding -> transpose -> three
    ``paddle.fluid.layers.mul`` calls -> transpose -> reshapes ->
    softmax -> squared-error loss), handed to ``parallelizer``, and then
    ``calc_cost`` is invoked on the distributed implementation selected
    for each op in the resulting main program.
    """

    def make_program():
        """Build and return ``(main_program, start_program, loss)``."""
        main_program = paddle.static.Program()
        start_program = paddle.static.Program()

        def shard(tensor, dims_mapping):
            # Every tensor in this test is annotated on the same
            # two-process mesh; only the dims_mapping differs.
            auto.shard_tensor(tensor,
                              dist_attr={
                                  "process_mesh":
                                  auto.ProcessMesh([0, 1]),
                                  "dims_mapping": dims_mapping
                              })

        with paddle.static.program_guard(main_program, start_program):
            x = paddle.static.data(name='x', shape=[4], dtype='float32')
            x.stop_gradient = True
            label = paddle.static.data(name="label",
                                       shape=[8, 1],
                                       dtype='float32')
            label.stop_gradient = True
            shard(x, [0])
            shard(label, [0, -1])

            # embedding
            tmp = paddle.fluid.layers.fill_constant_batch_size_like(
                input=x, shape=[4], value=1, dtype='int32')
            embedding = paddle.nn.Embedding(10, 8)
            out = embedding(tmp)
            # row parallel embedding: annotate the weight of the
            # lookup_table_v2 op that the Embedding layer appended.
            for op in main_program.global_block().ops:
                if op.type == "lookup_table_v2":
                    W = main_program.global_block().vars[op.input("W")[0]]
                    shard(W, [0, -1])
            out = paddle.fluid.layers.transpose(out,
                                                [1, 0])  # [8, 2] [-1, 0]

            # mul
            param1 = paddle.fluid.layers.create_parameter(
                [4, 8], paddle.float32)  # [2, 8] [0, -1]
            shard(param1, [0, -1])
            param2 = paddle.fluid.layers.create_parameter(
                [8, 8], paddle.float32)  # [8, 4] [-1, 0]
            shard(param2, [-1, 0])
            out1 = paddle.fluid.layers.mul(out,
                                           param1)  # [8, 8] [-1, -1]
            tmp_param = paddle.fluid.layers.create_parameter(
                [8, 8], paddle.float32)  # [8, 8] [-1, -1]
            # Annotate tmp_param here. The previous code annotated param2
            # a second time with [-1, -1], which conflicted with its
            # [-1, 0] annotation above and left tmp_param unannotated.
            shard(tmp_param, [-1, -1])
            tmp_out = paddle.fluid.layers.mul(out1, tmp_param)
            out2 = paddle.fluid.layers.mul(tmp_out,
                                           param2)  # [8, 4] [-1, 0]

            out8 = paddle.fluid.layers.transpose(out2,
                                                 [1, 0])  # [4, 8] [0, -1]

            # reshape
            out9 = paddle.reshape(out8, [8, 2, 4])  # [4, 2, 4] [0, -1, -1]
            tmp_reshape_out = paddle.reshape(out9, [8, 4, 2])
            out10 = paddle.reshape(tmp_reshape_out,
                                   [8, 8])  # [4, 8] [0, -1]

            # softmax
            softmax = paddle.nn.Softmax()
            out11 = softmax(out10)
            error_cost = paddle.nn.functional.square_error_cost(
                out11, label)
            loss = paddle.mean(error_cost)
            return main_program, start_program, loss

    # NOTE(review): the second argument is presumably the rank id --
    # confirm against the definition of `parallelizer`.
    main_program, dist_context = parallelizer(make_program, 0)
    ops = main_program.global_block().ops
    cluster = Cluster()
    cluster.gen_default_config_cluster(device_count=2)
    for op in ops:
        dist_op = dist_context.get_dist_op_for_program(op)
        op_dist_attr = dist_op.dist_attr
        # Elementwise ops are looked up under the "elementwise" container
        # rather than under their own impl_type.
        if is_elementwise_op(op.type):
            container = get_distributed_operator_impl_container(
                "elementwise")
        else:
            container = get_distributed_operator_impl_container(
                op_dist_attr.impl_type)

        dist_impl = container.impls[op_dist_attr.impl_idx]
        dist_op_cost = dist_impl.calc_cost(op.attr('op_role'), dist_op,
                                           dist_context, cluster)
        self.assertTrue(dist_op_cost)
# Run the unittest runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py
浏览文件 @
94c17a0f
...
...
@@ -76,6 +76,14 @@ class MLPLayer(nn.Layer):
out
=
self
.
linear0
(
out
)
out
=
F
.
gelu
(
out
,
approximate
=
True
)
out
=
self
.
linear1
(
out
)
param
=
paddle
.
fluid
.
layers
.
create_parameter
([
1024
,
4096
],
paddle
.
float32
)
auto
.
shard_tensor
(
param
,
dist_attr
=
{
"process_mesh"
:
PP_MESH_1
,
"dims_mapping"
:
[
-
1
,
1
]
})
out
=
paddle
.
fluid
.
layers
.
mul
(
out
,
param
)
return
out
...
...
python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
浏览文件 @
94c17a0f
...
...
@@ -93,6 +93,14 @@ class MLPLayer(nn.Layer):
})
w_out
=
self
.
word_embeddings
(
input
)
out
=
self
.
linear0
(
w_out
)
param
=
paddle
.
fluid
.
layers
.
create_parameter
([
4096
,
4096
],
paddle
.
float32
)
auto
.
shard_tensor
(
param
,
dist_attr
=
{
"process_mesh"
:
PP_MESH_0
,
"dims_mapping"
:
[
0
,
-
1
]
})
out
=
paddle
.
fluid
.
layers
.
mul
(
out
,
param
)
gelu_out
=
F
.
gelu
(
out
,
approximate
=
True
)
out
=
self
.
linear1
(
gelu_out
)
out1
=
self
.
linear2
(
gelu_out
)
...
...
@@ -228,7 +236,7 @@ class TestMLPReshard(unittest.TestCase):
resharder
=
Resharder
(
dist_main_prog
,
dist_startup_prog
,
rank_id
,
dist_context
,
dist_params_grads
)
resharder
.
reshard
()
print_program_with_dist_attr
(
dist_main_prog
,
dist_context
)
# check send and recv result
self
.
assertTrue
(
check_send_recv_result
(
dist_main_prog
,
rank_id
))
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录