Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
294dfd23
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
294dfd23
编写于
6月 16, 2021
作者:
S
ShenLiang
提交者:
GitHub
6月 16, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[HybridParallel]Add SharedLayerDesc for PipelineParallel (#33578)
* add pplayer * add sharedlayerdesc
上级
07197fb9
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
358 addition
and
7 deletion
+358
-7
python/paddle/distributed/collective.py
python/paddle/distributed/collective.py
+3
-1
python/paddle/distributed/fleet/base/topology.py
python/paddle/distributed/fleet/base/topology.py
+8
-4
python/paddle/distributed/fleet/meta_parallel/__init__.py
python/paddle/distributed/fleet/meta_parallel/__init__.py
+1
-0
python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py
...stributed/fleet/meta_parallel/parallel_layers/__init__.py
+1
-0
python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
...tributed/fleet/meta_parallel/parallel_layers/pp_layers.py
+105
-0
python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
...ddle/distributed/fleet/meta_parallel/pipeline_parallel.py
+2
-0
python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
...paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
+2
-2
python/paddle/fluid/tests/unittests/hybrid_parallel_shared_weight.py
...le/fluid/tests/unittests/hybrid_parallel_shared_weight.py
+233
-0
python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py
...ests/unittests/test_parallel_dygraph_pipeline_parallel.py
+3
-0
未找到文件。
python/paddle/distributed/collective.py
浏览文件 @
294dfd23
...
...
@@ -267,7 +267,9 @@ def new_group(ranks=None, backend=None):
# TODO(shenliang03): This is a temporary solution to solve the problem of
# hang caused by cross-creation of new_group
tmp
=
fill_constant
([
0
],
dtype
=
"int32"
,
value
=
"1"
)
tmp
=
paddle
.
to_tensor
(
[
1
],
dtype
=
"int32"
)
if
in_dygraph_mode
()
else
fill_constant
(
[
0
],
dtype
=
"int32"
,
value
=
"1"
)
paddle
.
distributed
.
all_reduce
(
tmp
,
use_calc_stream
=
True
)
paddle
.
distributed
.
wait
(
tmp
)
return
gp
...
...
python/paddle/distributed/fleet/base/topology.py
浏览文件 @
294dfd23
...
...
@@ -107,6 +107,11 @@ class CommunicateTopology(object):
return
all_result
def
get_rank_from_stage
(
self
,
global_rank
,
**
kwargs
):
coord
=
self
.
get_coord
(
global_rank
)
tf
=
coord
.
_replace
(
**
kwargs
).
_asdict
()
return
self
.
get_rank
(
**
tf
)
class
HybridCommunicateGroup
(
object
):
def
__init__
(
self
,
topology
):
...
...
@@ -254,7 +259,6 @@ class HybridCommunicateGroup(object):
def
get_check_parallel_group
(
self
):
return
self
.
_check_comm_group
def
get_rank_from_stage
(
self
,
stage_id
):
coord
=
self
.
_topo
.
get_coord
(
self
.
global_rank
)
tf
=
coord
.
_replace
(
pipe
=
stage_id
).
_asdict
()
return
self
.
_topo
.
get_rank
(
**
tf
)
def
get_rank_from_stage
(
self
,
stage_id
,
**
kwargs
):
return
self
.
_topo
.
get_rank_from_stage
(
self
.
global_rank
,
pipe
=
stage_id
,
**
kwargs
)
python/paddle/distributed/fleet/meta_parallel/__init__.py
浏览文件 @
294dfd23
...
...
@@ -17,6 +17,7 @@ from .parallel_layers import ColumnParallelLinear # noqa: F401
from
.parallel_layers
import
RowParallelLinear
# noqa: F401
from
.parallel_layers
import
ParallelCrossEntropy
# noqa: F401
from
.parallel_layers
import
LayerDesc
# noqa: F401
from
.parallel_layers
import
SharedLayerDesc
# noqa: F401
from
.parallel_layers
import
PipelineLayer
# noqa: F401
from
.parallel_layers
import
RNGStatesTracker
# noqa: F401
from
.parallel_layers
import
model_parallel_random_seed
# noqa: F401
...
...
python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py
浏览文件 @
294dfd23
...
...
@@ -17,6 +17,7 @@ from .mp_layers import ColumnParallelLinear # noqa: F401
from
.mp_layers
import
RowParallelLinear
# noqa: F401
from
.mp_layers
import
ParallelCrossEntropy
# noqa: F401
from
.pp_layers
import
LayerDesc
# noqa: F401
from
.pp_layers
import
SharedLayerDesc
# noqa: F401
from
.pp_layers
import
PipelineLayer
# noqa: F401
from
.random
import
RNGStatesTracker
# noqa: F401
from
.random
import
model_parallel_random_seed
# noqa: F401
...
...
python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
浏览文件 @
294dfd23
...
...
@@ -15,6 +15,7 @@ import math
import
paddle
from
paddle.fluid.dygraph.layers
import
Layer
from
...utils.log_util
import
logger
,
layer_to_str
from
functools
import
partial
__all__
=
[]
...
...
@@ -58,6 +59,20 @@ class LayerDesc(object):
**
self
.
kwargs
)
class
SharedLayerDesc
(
LayerDesc
):
def
__init__
(
self
,
key
,
layer_func
,
forward_func
=
None
,
shared_weight_attr
=
'weight'
,
*
inputs
,
**
kwargs
):
super
(
SharedLayerDesc
,
self
).
__init__
(
layer_func
,
*
inputs
,
**
kwargs
)
self
.
layer_name
=
key
self
.
forward_func
=
forward_func
self
.
shared_weight_attr
=
shared_weight_attr
class
PipelineLayer
(
Layer
):
def
__init__
(
self
,
layers
,
...
...
@@ -104,11 +119,86 @@ class PipelineLayer(Layer):
self
.
_start_pos
=
0
self
.
_end_pos
=
self
.
_num_layers
-
1
self
.
_segment_network
(
seg_method
)
self
.
shared_layers
=
paddle
.
nn
.
LayerDict
()
self
.
shared_weight_attrs
=
{}
# construct layer
self
.
run_function
=
[]
self
.
_build_layer
()
self
.
shared_comm
=
self
.
_construct_shared_comm
()
self
.
_synchronize_shared_weights
()
def
get_stage_from_index
(
self
,
layer_idx
):
assert
0
<=
layer_idx
<
self
.
_num_layers
,
"layer_idx is out of bound"
for
stage
in
range
(
self
.
_topo
.
get_dim
(
'pipe'
)):
if
self
.
segment_parts
[
stage
]
<=
layer_idx
<
self
.
segment_parts
[
stage
+
1
]:
return
stage
def
_construct_shared_comm
(
self
):
shared_comm
=
{}
if
self
.
_topo
.
get_dim
(
"pipe"
)
==
1
:
return
layers_desc
=
self
.
_layers_desc
shared_layer_names
=
set
(
s
.
layer_name
for
s
in
layers_desc
if
isinstance
(
s
,
SharedLayerDesc
))
for
key
in
shared_layer_names
:
shared_layers
=
[]
for
idx
,
layer
in
enumerate
(
layers_desc
):
if
isinstance
(
layer
,
SharedLayerDesc
)
and
layer
.
layer_name
==
key
:
shared_layers
.
append
(
idx
)
shared_stages
=
set
(
self
.
get_stage_from_index
(
idx
)
for
idx
in
shared_layers
)
self
.
_dp_degree
=
self
.
_topo
.
get_dim
(
'data'
)
self
.
_mp_degree
=
self
.
_topo
.
get_dim
(
'model'
)
shared_ranks
=
[]
for
dp
in
range
(
self
.
_dp_degree
):
for
mp
in
range
(
self
.
_mp_degree
):
shared_ranks
=
[]
for
s
in
sorted
(
shared_stages
):
shared_ranks
.
append
(
self
.
_topo
.
get_rank_from_stage
(
self
.
global_rank
,
pipe
=
s
,
data
=
dp
,
model
=
mp
))
group
=
paddle
.
distributed
.
new_group
(
ranks
=
shared_ranks
)
if
self
.
global_rank
in
shared_ranks
:
assert
key
in
self
.
shared_layers
if
key
in
self
.
shared_layers
:
shared_comm
[
key
]
=
{
'ranks'
:
shared_ranks
,
'group'
:
group
,
'weight_attr'
:
self
.
shared_weight_attrs
[
key
],
'layer'
:
self
.
shared_layers
[
key
],
}
return
shared_comm
def
_synchronize_shared_weights
(
self
):
for
key
,
comm
in
self
.
shared_comm
.
items
():
with
paddle
.
framework
.
no_grad
():
paddle
.
distributed
.
broadcast
(
getattr
(
comm
[
'layer'
],
comm
[
'weight_attr'
]),
src
=
min
(
comm
[
'ranks'
]),
group
=
comm
[
'group'
])
def
allreduce_shared_weight_gradients
(
self
):
for
key
,
comm
in
self
.
shared_comm
.
items
():
param
=
getattr
(
self
.
shared_layers
[
key
],
comm
[
'weight_attr'
])
# need use trace_op to allreduce weight
with
paddle
.
framework
.
no_grad
():
paddle
.
fluid
.
framework
.
_dygraph_tracer
().
trace_op
(
type
=
"c_allreduce_sum"
,
inputs
=
{
'X'
:
param
.
_grad_ivar
()},
outputs
=
{
'Out'
:
param
.
_grad_ivar
()},
attrs
=
{
'ring_id'
:
comm
[
'group'
].
id
,
'use_calc_stream'
:
True
})
def
_segment_network
(
self
,
seg_method
):
logger
.
info
(
"start segment network.."
)
seg
=
SegmentLayers
(
...
...
@@ -142,6 +232,21 @@ class PipelineLayer(Layer):
if
isinstance
(
layer
,
Layer
):
self
.
run_function
.
append
(
layer
)
self
.
add_sublayer
(
str
(
layer_index
),
layer
)
elif
isinstance
(
layer
,
SharedLayerDesc
):
if
layer
.
layer_name
not
in
self
.
shared_layers
:
self
.
shared_layers
[
layer
.
layer_name
]
=
layer
.
build_layer
()
self
.
shared_weight_attrs
[
layer
.
layer_name
]
=
layer
.
shared_weight_attr
if
layer
.
forward_func
is
None
:
self
.
run_function
.
append
(
self
.
shared_layers
[
layer
.
layer_name
])
else
:
self
.
run_function
.
append
(
partial
(
layer
.
forward_func
,
self
.
shared_layers
[
layer
.
layer_name
]))
elif
isinstance
(
layer
,
LayerDesc
):
model
=
layer
.
build_layer
()
self
.
run_function
.
append
(
model
)
...
...
python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
浏览文件 @
294dfd23
...
...
@@ -138,6 +138,8 @@ class PipelineParallel(MetaParallelBase):
self
.
_backward
(
cache_id
=
backward_steps
)
backward_steps
+=
1
self
.
_layers
.
allreduce_shared_weight_gradients
()
# optimizer
self
.
_step
()
self
.
train_loss
=
self
.
_reduce_final_loss
()
...
...
python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
浏览文件 @
294dfd23
...
...
@@ -270,8 +270,8 @@ class TestDistTraning(unittest.TestCase):
np
.
testing
.
assert_allclose
(
loss_a
.
numpy
(),
loss_b
.
numpy
())
def
test_parallel_cross_entropy
(
self
):
batch_size
=
2
seq_length
=
1
batch_size
=
8
seq_length
=
1
6
class_size_per_card
=
2
vocab_size
=
class_size_per_card
*
self
.
model_parallel_size
seed
=
1025
...
...
python/paddle/fluid/tests/unittests/hybrid_parallel_shared_weight.py
0 → 100644
浏览文件 @
294dfd23
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
division
from
__future__
import
print_function
import
unittest
import
paddle
import
numpy
as
np
import
random
import
paddle
import
paddle.distributed
as
dist
import
paddle.distributed.fleet
as
fleet
from
paddle.fluid.dygraph.container
import
Sequential
from
paddle.distributed.fleet.meta_parallel
import
PipelineLayer
from
paddle.fluid.dygraph.layers
import
Layer
import
paddle.nn
as
nn
import
paddle.fluid
as
fluid
from
paddle.distributed.fleet.meta_parallel
import
LayerDesc
,
SharedLayerDesc
def
print_hook_fn
(
grad
):
print
(
grad
)
def
set_random_seed
(
seed
,
dp_id
,
rank_id
):
"""Set random seed for reproducability."""
random
.
seed
(
seed
)
np
.
random
.
seed
(
seed
+
dp_id
)
paddle
.
seed
(
seed
+
dp_id
)
batch_size
=
8
micro_batch_size
=
2
vocab_size
=
128
hidden_size
=
16
class
SimpleNet
(
Layer
):
def
__init__
(
self
):
super
(
SimpleNet
,
self
).
__init__
()
self
.
word_embeddings
=
nn
.
Embedding
(
vocab_size
,
hidden_size
)
self
.
softmax_weight
=
self
.
create_parameter
(
shape
=
[
hidden_size
,
vocab_size
])
self
.
softmax_bias
=
self
.
create_parameter
(
shape
=
[
vocab_size
],
is_bias
=
False
)
def
forward
(
self
,
x1
,
x2
,
y1
):
x_emb
=
self
.
word_embeddings
(
x1
)
fc
=
fluid
.
layers
.
matmul
(
x_emb
,
self
.
softmax_weight
)
fc
=
fluid
.
layers
.
elementwise_add
(
fc
,
self
.
softmax_bias
)
projection
=
fluid
.
layers
.
reshape
(
fc
,
shape
=
[
-
1
,
vocab_size
])
projection
=
paddle
.
matmul
(
projection
,
self
.
word_embeddings
.
weight
)
loss
=
fluid
.
layers
.
softmax_with_cross_entropy
(
logits
=
projection
,
label
=
y1
,
soft_label
=
False
)
return
loss
.
mean
()
class
EmbeddingPipe
(
Layer
):
def
__init__
(
self
):
super
(
EmbeddingPipe
,
self
).
__init__
()
self
.
word_embeddings
=
nn
.
Embedding
(
vocab_size
,
hidden_size
)
@
property
def
embedding_weight
(
self
):
return
self
.
word_embeddings
.
weight
def
forward
(
self
,
args
):
x1
,
x2
=
args
x_emb
=
self
.
word_embeddings
(
x1
)
return
x_emb
,
x2
class
MatmulNet
(
Layer
):
def
__init__
(
self
):
super
(
MatmulNet
,
self
).
__init__
()
self
.
softmax_weight
=
self
.
create_parameter
(
shape
=
[
hidden_size
,
vocab_size
])
def
forward
(
self
,
args
):
x1
,
x2
=
args
fc
=
fluid
.
layers
.
matmul
(
x1
,
self
.
softmax_weight
)
return
fc
,
x2
class
BiasNet
(
Layer
):
def
__init__
(
self
):
super
(
BiasNet
,
self
).
__init__
()
self
.
softmax_bias
=
self
.
create_parameter
(
shape
=
[
vocab_size
])
def
forward
(
self
,
args
):
fc
,
x2
=
args
fc
=
fluid
.
layers
.
elementwise_add
(
fc
,
self
.
softmax_bias
)
projection
=
fluid
.
layers
.
reshape
(
fc
,
shape
=
[
-
1
,
vocab_size
])
return
projection
,
x2
class
LossNet
(
Layer
):
def
__init__
(
self
):
super
(
LossNet
,
self
).
__init__
()
def
forward
(
self
,
args
,
y1
):
projection
=
args
loss
=
fluid
.
layers
.
softmax_with_cross_entropy
(
logits
=
projection
,
label
=
y1
[
0
],
soft_label
=
False
)
return
loss
.
mean
()
class
SimpleNetPipe
(
PipelineLayer
):
def
__init__
(
self
,
**
kwargs
):
self
.
descs
=
[]
self
.
descs
.
append
(
SharedLayerDesc
(
'embed'
,
EmbeddingPipe
,
shared_weight_attr
=
'embedding_weight'
))
self
.
descs
.
append
(
LayerDesc
(
MatmulNet
))
self
.
descs
.
append
(
LayerDesc
(
BiasNet
))
def
_logits_helper
(
embedding
,
output
):
return
paddle
.
matmul
(
output
[
0
],
embedding
.
embedding_weight
)
self
.
descs
.
append
(
SharedLayerDesc
(
'embed'
,
EmbeddingPipe
,
forward_func
=
_logits_helper
,
shared_weight_attr
=
'embedding_weight'
))
super
(
SimpleNetPipe
,
self
).
__init__
(
layers
=
self
.
descs
,
loss_fn
=
LossNet
(),
**
kwargs
)
class
TestDistEmbeddingTraning
(
unittest
.
TestCase
):
def
setUp
(
self
):
strategy
=
fleet
.
DistributedStrategy
()
self
.
model_parallel_size
=
1
self
.
data_parallel_size
=
1
self
.
pipeline_parallel_size
=
2
strategy
.
hybrid_configs
=
{
"dp_degree"
:
self
.
data_parallel_size
,
"mp_degree"
:
self
.
model_parallel_size
,
"pp_degree"
:
self
.
pipeline_parallel_size
,
}
strategy
.
pipeline_configs
=
{
"accumulate_steps"
:
batch_size
//
micro_batch_size
,
"micro_batch_size"
:
micro_batch_size
}
fleet
.
init
(
is_collective
=
True
,
strategy
=
strategy
)
def
test_pp_model
(
self
):
hcg
=
fleet
.
get_hybrid_communicate_group
()
word_size
=
hcg
.
get_model_parallel_world_size
()
dp_id
=
hcg
.
get_data_parallel_rank
()
pp_id
=
hcg
.
get_stage_id
()
rank_id
=
dist
.
get_rank
()
set_random_seed
(
1024
,
dp_id
,
rank_id
)
#construct model a
model_a
=
SimpleNet
()
scheduler_a
=
paddle
.
optimizer
.
lr
.
PiecewiseDecay
(
boundaries
=
[
2
,
3
,
4
],
values
=
[
0.01
,
0.02
,
0.03
,
0.04
],
verbose
=
True
)
optimizer_a
=
paddle
.
optimizer
.
SGD
(
learning_rate
=
scheduler_a
,
parameters
=
model_a
.
parameters
())
model_b
=
SimpleNetPipe
(
topology
=
hcg
.
topology
())
scheduler_b
=
paddle
.
optimizer
.
lr
.
PiecewiseDecay
(
boundaries
=
[
2
,
3
,
4
],
values
=
[
0.01
,
0.02
,
0.03
,
0.04
],
verbose
=
True
)
optimizer_b
=
paddle
.
optimizer
.
SGD
(
learning_rate
=
scheduler_b
,
parameters
=
model_b
.
parameters
())
model_b
=
fleet
.
distributed_model
(
model_b
)
optimizer_b
=
fleet
.
distributed_optimizer
(
optimizer_b
)
param_len
=
len
(
model_a
.
parameters
())
parameters
=
[]
for
param
in
model_a
.
parameters
():
parameters
.
append
(
param
.
numpy
())
model_b_params
=
model_b
.
parameters
()
if
pp_id
==
0
:
model_b_params
[
0
].
set_value
(
parameters
[
2
])
model_b_params
[
1
].
set_value
(
parameters
[
0
])
else
:
model_b_params
[
0
].
set_value
(
parameters
[
2
])
model_b_params
[
1
].
set_value
(
parameters
[
1
])
for
step
in
range
(
5
):
x1_data
=
np
.
random
.
randint
(
0
,
vocab_size
,
size
=
[
batch_size
,
1
])
x2_data
=
np
.
random
.
randint
(
0
,
vocab_size
,
size
=
[
batch_size
,
1
])
y1_data
=
np
.
random
.
randint
(
0
,
hidden_size
,
size
=
[
batch_size
,
1
])
x1
=
paddle
.
to_tensor
(
x1_data
)
x2
=
paddle
.
to_tensor
(
x2_data
)
y1
=
paddle
.
to_tensor
(
y1_data
)
x1
.
stop_gradient
=
True
x2
.
stop_gradient
=
True
y1
.
stop_gradient
=
True
loss_a
=
model_a
(
x1
,
x2
,
y1
)
loss_a
.
backward
()
optimizer_a
.
step
()
optimizer_a
.
clear_grad
()
scheduler_a
.
step
()
loss_b
=
model_b
.
train_batch
([(
x1
,
x2
),
(
y1
,
)],
optimizer_b
,
scheduler_b
)
print
(
"loss"
,
loss_a
.
numpy
(),
loss_b
.
numpy
())
np
.
testing
.
assert_allclose
(
loss_a
.
numpy
(),
loss_b
.
numpy
())
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py
浏览文件 @
294dfd23
...
...
@@ -27,6 +27,9 @@ class TestHybridPipeParallel(TestMultipleGpus):
def
test_hybrid_parallel_pp_tuple_inputs
(
self
):
self
.
run_mnist_2gpu
(
'hybrid_parallel_pp_embedding.py'
)
def
test_hybrid_parallel_pp_tuple_inputs
(
self
):
self
.
run_mnist_2gpu
(
'hybrid_parallel_shared_weight.py'
)
if
__name__
==
"__main__"
:
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录