nccl_utils.py 1.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


16
from paddle.fluid import core
17 18


19 20 21 22 23 24 25 26 27 28
def get_nccl_version_str(ver):
    if ver >= 10000:
        NCCL_MAJOR_VERSION = int(ver // 10000)
        ver = ver % 10000
    else:
        NCCL_MAJOR_VERSION = int(ver // 1000)
        ver = ver % 1000

    NCCL_MINOR_VERSION = int(ver // 100)
    NCCL_PATCH_VERSION = int(ver % 100)
29

30 31 32
    return "{}.{}.{}".format(
        NCCL_MAJOR_VERSION, NCCL_MINOR_VERSION, NCCL_PATCH_VERSION
    )
33 34 35


def check_nccl_version_for_p2p():
36 37 38 39 40 41 42 43 44 45
    nccl_version = core.nccl_version()
    nccl_version_str = get_nccl_version_str(nccl_version)
    nccl_version_baseline = 2804
    assert nccl_version >= nccl_version_baseline, (
        "The version of NCCL is required to be at least v2.8.4 while training with "
        "pipeline/MoE parallelism, but we found v{}. The previous version of NCCL has "
        "some bugs in p2p communication, and you can see more detailed description "
        "about this issue from ReleaseNotes of NCCL v2.8.4 "
        "(https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-4.html#rel_2-8-4).".format(
            nccl_version_str
46
        )
47
    )
48 49 50


def check_nccl_version_for_bf16():
51 52 53
    nccl_version = core.nccl_version()
    nccl_version_baseline = 21000
    return nccl_version >= nccl_version_baseline