Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
fd53181f
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
fd53181f
编写于
3月 03, 2023
作者:
H
Haohongxiang
提交者:
GitHub
3月 03, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Dygraph] Check NCCL version for p2p communication in MoE/Pipeline Parallelism (#51076)
上级
05d9e622
变更
7
显示空白变更内容
内联
并排
Showing
7 changed files
with
65 additions
and
19 deletions
+65
-19
python/paddle/distributed/fleet/base/topology.py
python/paddle/distributed/fleet/base/topology.py
+2
-0
python/paddle/distributed/utils/nccl_utils.py
python/paddle/distributed/utils/nccl_utils.py
+51
-0
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3.py
...nittests/collective/fleet/dygraph_group_sharded_stage3.py
+2
-6
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3_offload.py
.../collective/fleet/dygraph_group_sharded_stage3_offload.py
+2
-6
python/paddle/fluid/tests/unittests/test_collective_api_base.py
.../paddle/fluid/tests/unittests/test_collective_api_base.py
+2
-5
python/paddle/incubate/distributed/models/moe/moe_layer.py
python/paddle/incubate/distributed/models/moe/moe_layer.py
+4
-0
tools/dockerfile/ci_dockerfile.sh
tools/dockerfile/ci_dockerfile.sh
+2
-2
未找到文件。
python/paddle/distributed/fleet/base/topology.py
浏览文件 @
fd53181f
...
...
@@ -17,6 +17,7 @@ from functools import reduce
from
itertools
import
product
import
paddle
from
paddle.distributed.utils.nccl_utils
import
check_nccl_version_for_p2p
from
..utils.log_util
import
logger
...
...
@@ -188,6 +189,7 @@ class HybridCommunicateGroup:
# create p2p_groups
if
self
.
_pp_degree
>
1
:
check_nccl_version_for_p2p
()
self
.
_set_p2p_group
()
debug_str
=
(
...
...
python/paddle/distributed/utils/nccl_utils.py
0 → 100644
浏览文件 @
fd53181f
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
logging
import
subprocess
def get_nccl_version_str():
    """Return the NCCL version suffix found via ``ldconfig``, e.g. ``"2.8.4"``.

    The value is scraped from the last ``libnccl.so`` entry printed by
    ``ldconfig -v``: everything up to and including the final ``.so.`` is
    stripped and newlines are removed.

    Returns:
        str: The version suffix, or an empty string when no ``libnccl.so``
        entry is listed or the shell pipeline cannot be executed.
    """
    try:
        raw_output = subprocess.check_output(
            r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'",
            stderr=subprocess.DEVNULL,
            shell=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # NOTE: This is a hacking method to get nccl version. When the
        # pipeline cannot run (its tools are missing or it exits with a
        # non-zero status), report "no version found" instead of crashing,
        # so callers can treat the empty string as "unknown".
        return ""
    # An empty output (no libnccl.so entry) stays an empty string here.
    return raw_output.decode('utf-8').replace("\n", "")
def check_nccl_version_for_p2p():
    """Check that the detected NCCL version is at least v2.8.4.

    The version string is obtained from ``get_nccl_version_str()``. When no
    version is found, or the string cannot be parsed into integers, a
    warning is logged and the check is skipped.

    Raises:
        AssertionError: If the detected version is lower than v2.8.4. The
            error is raised explicitly (not via ``assert``) so the check is
            still enforced when Python runs with ``-O``.
    """
    nccl_version_str = get_nccl_version_str()
    if nccl_version_str:
        nccl_version_str = nccl_version_str.replace("\n", "")

    if not nccl_version_str:
        logging.warning("No version for NCCL library found!")
        return

    try:
        nccl_version_int = [int(s) for s in nccl_version_str.split(".")]
    except ValueError:
        # The version comes from scraping ldconfig output; if the suffix is
        # not purely numeric we cannot compare it, so warn instead of
        # failing with an unrelated ValueError.
        logging.warning(
            "Unrecognized NCCL version string %r, skip checking NCCL version.",
            nccl_version_str,
        )
        return

    nccl_version_baseline = [2, 8, 4]
    # Lists of ints compare lexicographically: major, then minor, then patch.
    if nccl_version_int < nccl_version_baseline:
        raise AssertionError(
            "The version of NCCL is required to be at least v2.8.4 while training with "
            "pipeline/MoE parallelism, but we found v{}. The previous version of NCCL has "
            "some bugs in p2p communication, and you can see more detailed description "
            "about this issue from ReleaseNotes of NCCL v2.8.4 "
            "(https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-4.html#rel_2-8-4).".format(
                nccl_version_str
            )
        )
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3.py
浏览文件 @
fd53181f
...
...
@@ -16,7 +16,6 @@
import
os
import
shutil
import
subprocess
import
tempfile
import
numpy
as
np
...
...
@@ -34,6 +33,7 @@ from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import
from
paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils
import
(
GroupShardedScaler
,
)
from
paddle.distributed.utils.nccl_utils
import
get_nccl_version_str
from
paddle.nn
import
Linear
epoch
=
10
...
...
@@ -366,11 +366,7 @@ def test_stage2_stage3():
# bfp16
# NOTE: this is a hack to get int format nccl version, like 2134
# if current platform is not linux, version number will be 0
nccl_version_str
=
subprocess
.
check_output
(
r
"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'"
,
stderr
=
subprocess
.
DEVNULL
,
shell
=
True
,
).
decode
(
'utf-8'
)
nccl_version_str
=
get_nccl_version_str
()
nccl_version
=
(
int
(
""
.
join
(
nccl_version_str
.
split
(
"."
)))
if
nccl_version_str
else
0
)
...
...
python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3_offload.py
浏览文件 @
fd53181f
...
...
@@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
subprocess
import
numpy
as
np
...
...
@@ -26,6 +25,7 @@ from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import
from
paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils
import
(
GroupShardedScaler
,
)
from
paddle.distributed.utils.nccl_utils
import
get_nccl_version_str
from
paddle.nn
import
Linear
epoch
=
10
...
...
@@ -217,11 +217,7 @@ def test_stage3_offload():
# bfp16 offload
# NOTE: this is a hack to get int format nccl version, like 2134
# if current platform is not linux, version number will be 0
nccl_version_str
=
subprocess
.
check_output
(
r
"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'"
,
stderr
=
subprocess
.
DEVNULL
,
shell
=
True
,
).
decode
(
'utf-8'
)
nccl_version_str
=
get_nccl_version_str
()
nccl_version
=
(
int
(
""
.
join
(
nccl_version_str
.
split
(
"."
)))
if
nccl_version_str
else
0
)
...
...
python/paddle/fluid/tests/unittests/test_collective_api_base.py
浏览文件 @
fd53181f
...
...
@@ -26,6 +26,7 @@ from paddle_bfloat import bfloat16
import
paddle
import
paddle.fluid
as
fluid
from
paddle.distributed.utils.nccl_utils
import
get_nccl_version_str
from
paddle.fluid
import
core
...
...
@@ -172,11 +173,7 @@ class TestDistBase(unittest.TestCase):
# NOTE: this is a hack to get int format nccl version, like 2134
# if current platform is not linux, version number will be 0
nccl_version_str
=
subprocess
.
check_output
(
r
"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'"
,
stderr
=
subprocess
.
DEVNULL
,
shell
=
True
,
).
decode
(
'utf-8'
)
nccl_version_str
=
get_nccl_version_str
()
self
.
_nccl_version
=
(
int
(
""
.
join
(
nccl_version_str
.
split
(
"."
)))
if
nccl_version_str
else
0
)
...
...
python/paddle/incubate/distributed/models/moe/moe_layer.py
浏览文件 @
fd53181f
...
...
@@ -25,6 +25,7 @@ import paddle
import
paddle.nn
as
nn
from
paddle.autograd
import
PyLayer
from
paddle.distributed.utils.moe_utils
import
global_gather
,
global_scatter
from
paddle.distributed.utils.nccl_utils
import
check_nccl_version_for_p2p
from
paddle.framework
import
in_dygraph_mode
from
paddle.incubate.distributed.fleet
import
recompute_hybrid
...
...
@@ -351,6 +352,9 @@ class MoELayer(nn.Layer):
assert
experts
is
not
None
self
.
experts
=
experts
if
self
.
world_size
>
1
:
check_nccl_version_for_p2p
()
self
.
mp_group
=
mp_group
self
.
d_model
=
d_model
if
isinstance
(
gate
,
dict
):
...
...
tools/dockerfile/ci_dockerfile.sh
浏览文件 @
fd53181f
...
...
@@ -72,8 +72,8 @@ function make_ubuntu_trt7_dockerfile(){
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc
\\
RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++
\\
ENV PATH=/usr/local/gcc-8.2/bin:
\$
PATH #g"
${
dockerfile_name
}
sed
-i
"s#bash /build_scripts/install_nccl2.sh#wget -q --no-proxy https://nccl2-deb.
cdn.bcebos.com/nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1
-1_amd64.deb
\\
RUN apt remove -y libnccl* --allow-change-held-packages
\&\&
apt-get install -y
libsndfile1 libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1
zstd pigz --allow-change-held-packages #g"
${
dockerfile_name
}
sed
-i
"s#bash /build_scripts/install_nccl2.sh#wget -q --no-proxy https://nccl2-deb.
gz.bcebos.com/nccl-local-repo-ubuntu1604-2.8.4-cuda10.2_1.0
-1_amd64.deb
\\
RUN apt remove -y libnccl* --allow-change-held-packages
\&\&
apt-get install -y
--allow-unauthenticated libsndfile1 libnccl2=2.8.4-1+cuda10.2 libnccl-dev=2.8.4-1+cuda10.2
zstd pigz --allow-change-held-packages #g"
${
dockerfile_name
}
}
function
make_centos_dockerfile
(){
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录