Unverified commit 294dfd23 in PaddlePaddle/Paddle
Authored by ShenLiang on Jun 16, 2021; committed by GitHub on Jun 16, 2021
Parent: 07197fb9

[HybridParallel]Add SharedLayerDesc for PipelineParallel (#33578)

* add pplayer
* add sharedlayerdesc

Showing 9 changed files with 358 additions and 7 deletions (+358, -7)
Changed files (9):

  python/paddle/distributed/collective.py                                          +3   -1
  python/paddle/distributed/fleet/base/topology.py                                 +8   -4
  python/paddle/distributed/fleet/meta_parallel/__init__.py                        +1   -0
  python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py        +1   -0
  python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py       +105 -0
  python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py               +2   -0
  python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py                 +2   -2
  python/paddle/fluid/tests/unittests/hybrid_parallel_shared_weight.py             +233 -0  (new file)
  python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py   +3   -0
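At a glance, SharedLayerDesc lets two pipeline stages declare the same logical layer (for example a tied input/output embedding) under one key; PipelineLayer then builds it once per owning stage, broadcasts the shared weight at construction time, and all-reduces its gradient on every step. A minimal usage sketch, assuming the EmbeddingPipe/MatmulNet/BiasNet modules defined in the new unit test added by this commit:

import paddle
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer, SharedLayerDesc

class TiedPipeModel(PipelineLayer):
    def __init__(self, **kwargs):
        descs = [
            # First occurrence: input embedding, owned by the first stage.
            SharedLayerDesc(
                'embed', EmbeddingPipe, shared_weight_attr='embedding_weight'),
            LayerDesc(MatmulNet),
            LayerDesc(BiasNet),
            # Second occurrence: same key, reuses the embedding weight as the
            # output projection through a custom forward_func.
            SharedLayerDesc(
                'embed',
                EmbeddingPipe,
                forward_func=lambda emb, out: paddle.matmul(
                    out[0], emb.embedding_weight),
                shared_weight_attr='embedding_weight'),
        ]
        super(TiedPipeModel, self).__init__(layers=descs, **kwargs)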
python/paddle/distributed/collective.py
@@ -267,7 +267,9 @@ def new_group(ranks=None, backend=None):
     # TODO(shenliang03): This is a temporary solution to solve the problem of
     # hang caused by cross-creation of new_group
-    tmp = fill_constant([0], dtype="int32", value="1")
+    tmp = paddle.to_tensor(
+        [1], dtype="int32") if in_dygraph_mode() else fill_constant(
+            [0], dtype="int32", value="1")
     paddle.distributed.all_reduce(tmp, use_calc_stream=True)
     paddle.distributed.wait(tmp)
     return gp
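For context, the dummy all_reduce above is a warm-up that forces every rank to initialize the new communicator eagerly, so later collectives on the group do not hang. A hedged usage sketch of the public API involved (ranks and shapes are illustrative; assumes the script is launched on multiple GPUs via paddle.distributed.launch):

import paddle
import paddle.distributed as dist

dist.init_parallel_env()
# new_group() itself performs the warm-up all_reduce shown in the diff above.
group = dist.new_group(ranks=[0, 1])

x = paddle.ones([2, 2])
if dist.get_rank() in [0, 1]:
    dist.all_reduce(x, group=group)  # sums x across ranks 0 and 1 only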
python/paddle/distributed/fleet/base/topology.py
@@ -107,6 +107,11 @@ class CommunicateTopology(object):
         return all_result
 
+    def get_rank_from_stage(self, global_rank, **kwargs):
+        coord = self.get_coord(global_rank)
+        tf = coord._replace(**kwargs)._asdict()
+        return self.get_rank(**tf)
+
 
 class HybridCommunicateGroup(object):
     def __init__(self, topology):
@@ -254,7 +259,6 @@ class HybridCommunicateGroup(object):
     def get_check_parallel_group(self):
         return self._check_comm_group
 
-    def get_rank_from_stage(self, stage_id):
-        coord = self._topo.get_coord(self.global_rank)
-        tf = coord._replace(pipe=stage_id)._asdict()
-        return self._topo.get_rank(**tf)
+    def get_rank_from_stage(self, stage_id, **kwargs):
+        return self._topo.get_rank_from_stage(
+            self.global_rank, pipe=stage_id, **kwargs)
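The `_replace(**kwargs)._asdict()` idiom works because a topology coordinate is a namedtuple: keyword arguments override selected axes and the rest are kept. A self-contained illustration with a hypothetical coordinate type (not the actual Paddle class):

from collections import namedtuple

# Hypothetical coordinate carrying the axes used by the hybrid topology.
Coord = namedtuple('Coord', ['data', 'pipe', 'model'])

coord = Coord(data=1, pipe=3, model=0)
target = coord._replace(pipe=0)._asdict()   # override only the pipeline axis
print(target)  # {'data': 1, 'pipe': 0, 'model': 0} (an OrderedDict on older Pythons)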
python/paddle/distributed/fleet/meta_parallel/__init__.py
@@ -17,6 +17,7 @@ from .parallel_layers import ColumnParallelLinear  # noqa: F401
 from .parallel_layers import RowParallelLinear  # noqa: F401
 from .parallel_layers import ParallelCrossEntropy  # noqa: F401
 from .parallel_layers import LayerDesc  # noqa: F401
+from .parallel_layers import SharedLayerDesc  # noqa: F401
 from .parallel_layers import PipelineLayer  # noqa: F401
 from .parallel_layers import RNGStatesTracker  # noqa: F401
 from .parallel_layers import model_parallel_random_seed  # noqa: F401
python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py
@@ -17,6 +17,7 @@ from .mp_layers import ColumnParallelLinear  # noqa: F401
 from .mp_layers import RowParallelLinear  # noqa: F401
 from .mp_layers import ParallelCrossEntropy  # noqa: F401
 from .pp_layers import LayerDesc  # noqa: F401
+from .pp_layers import SharedLayerDesc  # noqa: F401
 from .pp_layers import PipelineLayer  # noqa: F401
 from .random import RNGStatesTracker  # noqa: F401
 from .random import model_parallel_random_seed  # noqa: F401
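With both re-exports in place, user code can pull the new descriptor from the public fleet namespace, exactly as the new unit test in this commit does:

from paddle.distributed.fleet.meta_parallel import LayerDesc, SharedLayerDesc
from paddle.distributed.fleet.meta_parallel import PipelineLayer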
python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
@@ -15,6 +15,7 @@ import math
 import paddle
 from paddle.fluid.dygraph.layers import Layer
 from ...utils.log_util import logger, layer_to_str
+from functools import partial
 
 __all__ = []
@@ -58,6 +59,20 @@ class LayerDesc(object):
             **self.kwargs)
 
 
+class SharedLayerDesc(LayerDesc):
+    def __init__(self,
+                 key,
+                 layer_func,
+                 forward_func=None,
+                 shared_weight_attr='weight',
+                 *inputs,
+                 **kwargs):
+        super(SharedLayerDesc, self).__init__(layer_func, *inputs, **kwargs)
+        self.layer_name = key
+        self.forward_func = forward_func
+        self.shared_weight_attr = shared_weight_attr
+
+
 class PipelineLayer(Layer):
     def __init__(self,
                  layers,
@@ -104,11 +119,86 @@ class PipelineLayer(Layer):
         self._start_pos = 0
         self._end_pos = self._num_layers - 1
         self._segment_network(seg_method)
+        self.shared_layers = paddle.nn.LayerDict()
+        self.shared_weight_attrs = {}
 
         # construct layer
         self.run_function = []
         self._build_layer()
+
+        self.shared_comm = self._construct_shared_comm()
+        self._synchronize_shared_weights()
+
+    def get_stage_from_index(self, layer_idx):
+        assert 0 <= layer_idx < self._num_layers, "layer_idx is out of bound"
+        for stage in range(self._topo.get_dim('pipe')):
+            if self.segment_parts[stage] <= layer_idx < self.segment_parts[
+                    stage + 1]:
+                return stage
+
+    def _construct_shared_comm(self):
+        shared_comm = {}
+        if self._topo.get_dim("pipe") == 1:
+            return
+
+        layers_desc = self._layers_desc
+        shared_layer_names = set(
+            s.layer_name for s in layers_desc if isinstance(s, SharedLayerDesc))
+        for key in shared_layer_names:
+            shared_layers = []
+            for idx, layer in enumerate(layers_desc):
+                if isinstance(layer,
+                              SharedLayerDesc) and layer.layer_name == key:
+                    shared_layers.append(idx)
+
+            shared_stages = set(
+                self.get_stage_from_index(idx) for idx in shared_layers)
+            self._dp_degree = self._topo.get_dim('data')
+            self._mp_degree = self._topo.get_dim('model')
+
+            shared_ranks = []
+            for dp in range(self._dp_degree):
+                for mp in range(self._mp_degree):
+                    shared_ranks = []
+                    for s in sorted(shared_stages):
+                        shared_ranks.append(
+                            self._topo.get_rank_from_stage(
+                                self.global_rank, pipe=s, data=dp, model=mp))
+
+                    group = paddle.distributed.new_group(ranks=shared_ranks)
+                    if self.global_rank in shared_ranks:
+                        assert key in self.shared_layers
+                        if key in self.shared_layers:
+                            shared_comm[key] = {
+                                'ranks': shared_ranks,
+                                'group': group,
+                                'weight_attr': self.shared_weight_attrs[key],
+                                'layer': self.shared_layers[key],
+                            }
+        return shared_comm
+
+    def _synchronize_shared_weights(self):
+        for key, comm in self.shared_comm.items():
+            with paddle.framework.no_grad():
+                paddle.distributed.broadcast(
+                    getattr(comm['layer'], comm['weight_attr']),
+                    src=min(comm['ranks']),
+                    group=comm['group'])
+
+    def allreduce_shared_weight_gradients(self):
+        for key, comm in self.shared_comm.items():
+            param = getattr(self.shared_layers[key], comm['weight_attr'])
+            # need use trace_op to allreduce weight
+            with paddle.framework.no_grad():
+                paddle.fluid.framework._dygraph_tracer().trace_op(
+                    type="c_allreduce_sum",
+                    inputs={'X': param._grad_ivar()},
+                    outputs={'Out': param._grad_ivar()},
+                    attrs={
+                        'ring_id': comm['group'].id,
+                        'use_calc_stream': True
+                    })
 
     def _segment_network(self, seg_method):
         logger.info("start segment network..")
         seg = SegmentLayers(
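In short, the tying protocol is: build the shared module once per owning stage, broadcast its weight from the lowest owning rank at construction time, then all-reduce its gradient over the owning ranks after every backward pass. A standalone sketch of that pattern using the plain public collectives (illustrative only; the code above routes the reduction through trace_op/c_allreduce_sum so the dygraph tracer records it, and fetches the gradient via param._grad_ivar()):

import paddle
import paddle.distributed as dist

def synchronize_shared_weight(param, ranks, group):
    # One-time sync at construction: every owner starts from identical values.
    with paddle.no_grad():
        dist.broadcast(param, src=min(ranks), group=group)

def allreduce_shared_grad(grad_tensor, group):
    # After backward: sum the gradient contributions of every stage that
    # holds a copy of the shared layer, before the optimizer step.
    with paddle.no_grad():
        dist.all_reduce(grad_tensor, group=group)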
@@ -142,6 +232,21 @@ class PipelineLayer(Layer):
             if isinstance(layer, Layer):
                 self.run_function.append(layer)
                 self.add_sublayer(str(layer_index), layer)
+            elif isinstance(layer, SharedLayerDesc):
+                if layer.layer_name not in self.shared_layers:
+                    self.shared_layers[layer.layer_name] = layer.build_layer()
+                    self.shared_weight_attrs[
+                        layer.layer_name] = layer.shared_weight_attr
+
+                if layer.forward_func is None:
+                    self.run_function.append(self.shared_layers[
+                        layer.layer_name])
+
+                else:
+                    self.run_function.append(
+                        partial(layer.forward_func, self.shared_layers[
+                            layer.layer_name]))
+
             elif isinstance(layer, LayerDesc):
                 model = layer.build_layer()
                 self.run_function.append(model)
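The `partial(layer.forward_func, ...)` entry above simply pre-binds the shared module as the first argument of the user-supplied forward_func, so the pipeline can later call it like any single-argument layer. A tiny self-contained illustration of that binding (names are hypothetical):

from functools import partial

def tied_forward(shared_layer, activations):
    # Stand-in for a real forward_func such as the _logits_helper in the
    # new unit test: it receives the shared module first, activations second.
    return [shared_layer, activations]

run_entry = partial(tied_forward, "shared-embedding")  # binds the first arg
print(run_entry("hidden-states"))  # ['shared-embedding', 'hidden-states']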
python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
@@ -138,6 +138,8 @@ class PipelineParallel(MetaParallelBase):
             self._backward(cache_id=backward_steps)
             backward_steps += 1
 
+        self._layers.allreduce_shared_weight_gradients()
+
         # optimizer
         self._step()
         self.train_loss = self._reduce_final_loss()
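So the new call runs once per global step: after every micro-batch has finished its backward pass and before the single optimizer update. A schematic sketch of that ordering, assuming hypothetical forward/backward callables standing in for the scheduler's per-micro-batch passes:

def train_batch_sketch(layers, optimizer, num_micro_batches, forward, backward):
    # 1. pipelined forward/backward over every micro-batch
    for cache_id in range(num_micro_batches):
        forward(cache_id)
        backward(cache_id)
    # 2. make tied weights consistent: sum their gradients across stages
    layers.allreduce_shared_weight_gradients()
    # 3. one optimizer step over the accumulated gradients
    optimizer.step()
    optimizer.clear_grad()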
python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
@@ -270,8 +270,8 @@ class TestDistTraning(unittest.TestCase):
         np.testing.assert_allclose(loss_a.numpy(), loss_b.numpy())
 
     def test_parallel_cross_entropy(self):
-        batch_size = 2
-        seq_length = 1
+        batch_size = 8
+        seq_length = 16
         class_size_per_card = 2
         vocab_size = class_size_per_card * self.model_parallel_size
         seed = 1025
python/paddle/fluid/tests/unittests/hybrid_parallel_shared_weight.py  (new file, mode 100644)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import division
from __future__ import print_function

import unittest
import paddle
import numpy as np
import random
import paddle
import paddle.distributed as dist
import paddle.distributed.fleet as fleet
from paddle.fluid.dygraph.container import Sequential
from paddle.distributed.fleet.meta_parallel import PipelineLayer
from paddle.fluid.dygraph.layers import Layer
import paddle.nn as nn
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_parallel import LayerDesc, SharedLayerDesc


def print_hook_fn(grad):
    print(grad)


def set_random_seed(seed, dp_id, rank_id):
    """Set random seed for reproducability."""
    random.seed(seed)
    np.random.seed(seed + dp_id)
    paddle.seed(seed + dp_id)


batch_size = 8
micro_batch_size = 2
vocab_size = 128
hidden_size = 16


class SimpleNet(Layer):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)

        self.softmax_weight = self.create_parameter(
            shape=[hidden_size, vocab_size])
        self.softmax_bias = self.create_parameter(
            shape=[vocab_size], is_bias=False)

    def forward(self, x1, x2, y1):
        x_emb = self.word_embeddings(x1)
        fc = fluid.layers.matmul(x_emb, self.softmax_weight)
        fc = fluid.layers.elementwise_add(fc, self.softmax_bias)
        projection = fluid.layers.reshape(fc, shape=[-1, vocab_size])

        projection = paddle.matmul(projection, self.word_embeddings.weight)

        loss = fluid.layers.softmax_with_cross_entropy(
            logits=projection, label=y1, soft_label=False)
        return loss.mean()


class EmbeddingPipe(Layer):
    def __init__(self):
        super(EmbeddingPipe, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)

    @property
    def embedding_weight(self):
        return self.word_embeddings.weight

    def forward(self, args):
        x1, x2 = args
        x_emb = self.word_embeddings(x1)
        return x_emb, x2


class MatmulNet(Layer):
    def __init__(self):
        super(MatmulNet, self).__init__()
        self.softmax_weight = self.create_parameter(
            shape=[hidden_size, vocab_size])

    def forward(self, args):
        x1, x2 = args
        fc = fluid.layers.matmul(x1, self.softmax_weight)
        return fc, x2


class BiasNet(Layer):
    def __init__(self):
        super(BiasNet, self).__init__()
        self.softmax_bias = self.create_parameter(shape=[vocab_size])

    def forward(self, args):
        fc, x2 = args
        fc = fluid.layers.elementwise_add(fc, self.softmax_bias)
        projection = fluid.layers.reshape(fc, shape=[-1, vocab_size])
        return projection, x2


class LossNet(Layer):
    def __init__(self):
        super(LossNet, self).__init__()

    def forward(self, args, y1):
        projection = args
        loss = fluid.layers.softmax_with_cross_entropy(
            logits=projection, label=y1[0], soft_label=False)
        return loss.mean()


class SimpleNetPipe(PipelineLayer):
    def __init__(self, **kwargs):
        self.descs = []
        self.descs.append(
            SharedLayerDesc(
                'embed', EmbeddingPipe, shared_weight_attr='embedding_weight'))
        self.descs.append(LayerDesc(MatmulNet))
        self.descs.append(LayerDesc(BiasNet))

        def _logits_helper(embedding, output):
            return paddle.matmul(output[0], embedding.embedding_weight)

        self.descs.append(
            SharedLayerDesc(
                'embed',
                EmbeddingPipe,
                forward_func=_logits_helper,
                shared_weight_attr='embedding_weight'))

        super(SimpleNetPipe, self).__init__(
            layers=self.descs, loss_fn=LossNet(), **kwargs)


class TestDistEmbeddingTraning(unittest.TestCase):
    def setUp(self):
        strategy = fleet.DistributedStrategy()
        self.model_parallel_size = 1
        self.data_parallel_size = 1
        self.pipeline_parallel_size = 2
        strategy.hybrid_configs = {
            "dp_degree": self.data_parallel_size,
            "mp_degree": self.model_parallel_size,
            "pp_degree": self.pipeline_parallel_size,
        }
        strategy.pipeline_configs = {
            "accumulate_steps": batch_size // micro_batch_size,
            "micro_batch_size": micro_batch_size
        }
        fleet.init(is_collective=True, strategy=strategy)

    def test_pp_model(self):
        hcg = fleet.get_hybrid_communicate_group()
        word_size = hcg.get_model_parallel_world_size()
        dp_id = hcg.get_data_parallel_rank()
        pp_id = hcg.get_stage_id()
        rank_id = dist.get_rank()
        set_random_seed(1024, dp_id, rank_id)

        #construct model a
        model_a = SimpleNet()
        scheduler_a = paddle.optimizer.lr.PiecewiseDecay(
            boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04], verbose=True)
        optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a,
                                           parameters=model_a.parameters())

        model_b = SimpleNetPipe(topology=hcg.topology())

        scheduler_b = paddle.optimizer.lr.PiecewiseDecay(
            boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04], verbose=True)
        optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b,
                                           parameters=model_b.parameters())
        model_b = fleet.distributed_model(model_b)
        optimizer_b = fleet.distributed_optimizer(optimizer_b)

        param_len = len(model_a.parameters())

        parameters = []
        for param in model_a.parameters():
            parameters.append(param.numpy())

        model_b_params = model_b.parameters()

        if pp_id == 0:
            model_b_params[0].set_value(parameters[2])
            model_b_params[1].set_value(parameters[0])
        else:
            model_b_params[0].set_value(parameters[2])
            model_b_params[1].set_value(parameters[1])

        for step in range(5):
            x1_data = np.random.randint(0, vocab_size, size=[batch_size, 1])
            x2_data = np.random.randint(0, vocab_size, size=[batch_size, 1])
            y1_data = np.random.randint(0, hidden_size, size=[batch_size, 1])

            x1 = paddle.to_tensor(x1_data)
            x2 = paddle.to_tensor(x2_data)
            y1 = paddle.to_tensor(y1_data)

            x1.stop_gradient = True
            x2.stop_gradient = True
            y1.stop_gradient = True

            loss_a = model_a(x1, x2, y1)
            loss_a.backward()
            optimizer_a.step()
            optimizer_a.clear_grad()
            scheduler_a.step()

            loss_b = model_b.train_batch([(x1, x2), (y1, )], optimizer_b,
                                         scheduler_b)

            print("loss", loss_a.numpy(), loss_b.numpy())
            np.testing.assert_allclose(loss_a.numpy(), loss_b.numpy())


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py
@@ -27,6 +27,9 @@ class TestHybridPipeParallel(TestMultipleGpus):
     def test_hybrid_parallel_pp_tuple_inputs(self):
         self.run_mnist_2gpu('hybrid_parallel_pp_embedding.py')
 
+    def test_hybrid_parallel_pp_tuple_inputs(self):
+        self.run_mnist_2gpu('hybrid_parallel_shared_weight.py')
+
 
 if __name__ == "__main__":
     unittest.main()