未验证 提交 fd53181f 编写于 作者: H Haohongxiang 提交者: GitHub

[Dygraph] Check NCCL version for p2p communication in MoE/Pipeline Parallelism (#51076)

上级 05d9e622
......@@ -17,6 +17,7 @@ from functools import reduce
from itertools import product
import paddle
from paddle.distributed.utils.nccl_utils import check_nccl_version_for_p2p
from ..utils.log_util import logger
......@@ -188,6 +189,7 @@ class HybridCommunicateGroup:
# create p2p_groups
if self._pp_degree > 1:
check_nccl_version_for_p2p()
self._set_p2p_group()
debug_str = (
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import subprocess
def get_nccl_version_str():
    """Return the NCCL version reported by ``ldconfig`` as a dotted string.

    The version is scraped from the shared-library cache: the last
    ``libnccl.so`` line printed by ``ldconfig -v`` is taken and everything up
    to and including the final ``.so.`` is removed, leaving the suffix of the
    library name (for example ``"2.8.4"``).

    Returns:
        str: The version suffix with surrounding whitespace removed, or an
        empty string when no ``libnccl.so`` entry is found or the shell
        pipeline cannot be run.
    """
    try:
        raw_output = subprocess.check_output(
            r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'",
            stderr=subprocess.DEVNULL,
            shell=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # NOTE: This is a hacking method to get nccl version. The pipeline's
        # exit status is the one of its last stage, so a missing tool (e.g. on
        # a non-Linux platform) makes check_output raise. Report "no version"
        # with an empty string instead of propagating the failure, so callers
        # only need a truthiness test.
        return ""

    # An empty result stays an empty string; otherwise drop the trailing
    # newline emitted by the pipeline.
    return raw_output.decode('utf-8').strip()
def check_nccl_version_for_p2p():
    """Verify that the installed NCCL is at least v2.8.4 before using p2p.

    The version string is obtained from ``get_nccl_version_str()``. When no
    version is found, or the string cannot be parsed into integers, only a
    warning is logged and the check is skipped.

    Raises:
        AssertionError: If the detected NCCL version is lower than 2.8.4.
    """
    nccl_version_str = get_nccl_version_str()
    if not nccl_version_str:
        logging.warning("No version for NCCL library found!")
        return

    try:
        nccl_version_int = [int(s) for s in nccl_version_str.split(".")]
    except ValueError:
        # The version is scraped from a library file name; a suffix that is
        # not purely numeric cannot be compared, so treat it like a missing
        # version instead of crashing the caller.
        logging.warning(
            "Unable to parse NCCL version '%s', skip the version check "
            "for p2p communication.",
            nccl_version_str,
        )
        return

    nccl_version_baseline = [2, 8, 4]
    # Lists compare element by element, so [2, 10] is correctly treated as
    # newer than [2, 8, 4].
    if nccl_version_int < nccl_version_baseline:
        # Raised explicitly (rather than via an ``assert`` statement) so the
        # check is still enforced when Python runs with ``-O``; the exception
        # type is kept as AssertionError for existing callers.
        raise AssertionError(
            "The version of NCCL is required to be at least v2.8.4 while training with "
            "pipeline/MoE parallelism, but we found v{}. The previous version of NCCL has "
            "some bugs in p2p communication, and you can see more detailed description "
            "about this issue from ReleaseNotes of NCCL v2.8.4 "
            "(https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-4.html#rel_2-8-4).".format(
                nccl_version_str
            )
        )
......@@ -16,7 +16,6 @@
import os
import shutil
import subprocess
import tempfile
import numpy as np
......@@ -34,6 +33,7 @@ from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import (
GroupShardedScaler,
)
from paddle.distributed.utils.nccl_utils import get_nccl_version_str
from paddle.nn import Linear
epoch = 10
......@@ -366,11 +366,7 @@ def test_stage2_stage3():
# bfp16
# NOTE: this is a hack to get int format nccl version, like 2134
# if current platform is not linux, version number will be 0
nccl_version_str = subprocess.check_output(
r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'",
stderr=subprocess.DEVNULL,
shell=True,
).decode('utf-8')
nccl_version_str = get_nccl_version_str()
nccl_version = (
int("".join(nccl_version_str.split("."))) if nccl_version_str else 0
)
......
......@@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import subprocess
import numpy as np
......@@ -26,6 +25,7 @@ from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import (
GroupShardedScaler,
)
from paddle.distributed.utils.nccl_utils import get_nccl_version_str
from paddle.nn import Linear
epoch = 10
......@@ -217,11 +217,7 @@ def test_stage3_offload():
# bfp16 offload
# NOTE: this is a hack to get int format nccl version, like 2134
# if current platform is not linux, version number will be 0
nccl_version_str = subprocess.check_output(
r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'",
stderr=subprocess.DEVNULL,
shell=True,
).decode('utf-8')
nccl_version_str = get_nccl_version_str()
nccl_version = (
int("".join(nccl_version_str.split("."))) if nccl_version_str else 0
)
......
......@@ -26,6 +26,7 @@ from paddle_bfloat import bfloat16
import paddle
import paddle.fluid as fluid
from paddle.distributed.utils.nccl_utils import get_nccl_version_str
from paddle.fluid import core
......@@ -172,11 +173,7 @@ class TestDistBase(unittest.TestCase):
# NOTE: this is a hack to get int format nccl version, like 2134
# if current platform is not linux, version number will be 0
nccl_version_str = subprocess.check_output(
r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'",
stderr=subprocess.DEVNULL,
shell=True,
).decode('utf-8')
nccl_version_str = get_nccl_version_str()
self._nccl_version = (
int("".join(nccl_version_str.split("."))) if nccl_version_str else 0
)
......
......@@ -25,6 +25,7 @@ import paddle
import paddle.nn as nn
from paddle.autograd import PyLayer
from paddle.distributed.utils.moe_utils import global_gather, global_scatter
from paddle.distributed.utils.nccl_utils import check_nccl_version_for_p2p
from paddle.framework import in_dygraph_mode
from paddle.incubate.distributed.fleet import recompute_hybrid
......@@ -351,6 +352,9 @@ class MoELayer(nn.Layer):
assert experts is not None
self.experts = experts
if self.world_size > 1:
check_nccl_version_for_p2p()
self.mp_group = mp_group
self.d_model = d_model
if isinstance(gate, dict):
......
......@@ -72,8 +72,8 @@ function make_ubuntu_trt7_dockerfile(){
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \\
RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \\
ENV PATH=/usr/local/gcc-8.2/bin:\$PATH #g" ${dockerfile_name}
sed -i "s#bash /build_scripts/install_nccl2.sh#wget -q --no-proxy https://nccl2-deb.cdn.bcebos.com/nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\
RUN apt remove -y libnccl* --allow-change-held-packages \&\& apt-get install -y libsndfile1 libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 zstd pigz --allow-change-held-packages #g" ${dockerfile_name}
sed -i "s#bash /build_scripts/install_nccl2.sh#wget -q --no-proxy https://nccl2-deb.gz.bcebos.com/nccl-local-repo-ubuntu1604-2.8.4-cuda10.2_1.0-1_amd64.deb \\
RUN apt remove -y libnccl* --allow-change-held-packages \&\& apt-get install -y --allow-unauthenticated libsndfile1 libnccl2=2.8.4-1+cuda10.2 libnccl-dev=2.8.4-1+cuda10.2 zstd pigz --allow-change-held-packages #g" ${dockerfile_name}
}
function make_centos_dockerfile(){
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册