diff --git a/python/paddle/distributed/utils/nccl_utils.py b/python/paddle/distributed/utils/nccl_utils.py
index 5aafb6ff5a4bed25e22a844c93d0790001d2e589..f9a1b99f91b1796186c5c098b567211f5387bcfd 100644
--- a/python/paddle/distributed/utils/nccl_utils.py
+++ b/python/paddle/distributed/utils/nccl_utils.py
@@ -12,51 +12,42 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
-import subprocess
+from paddle.fluid import core
 
 
-def get_nccl_version_str():
-    nccl_version_str = subprocess.check_output(
-        r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'",
-        stderr=subprocess.DEVNULL,
-        shell=True,
-    ).decode('utf-8')
-    # NOTE: This is a hacking method to get nccl version, but it will return None
-    # if current platform is not Linux. So we only check nccl version for Linux
-    # platform while training with pipeline parallelism.
-    if nccl_version_str:
-        nccl_version_str = nccl_version_str.replace("\n", "")
+def get_nccl_version_str(ver):
+    if ver >= 10000:
+        NCCL_MAJOR_VERSION = int(ver // 10000)
+        ver = ver % 10000
+    else:
+        NCCL_MAJOR_VERSION = int(ver // 1000)
+        ver = ver % 1000
+
+    NCCL_MINOR_VERSION = int(ver // 100)
+    NCCL_PATCH_VERSION = int(ver % 100)
 
-    return nccl_version_str
+    return "{}.{}.{}".format(
+        NCCL_MAJOR_VERSION, NCCL_MINOR_VERSION, NCCL_PATCH_VERSION
+    )
 
 
 def check_nccl_version_for_p2p():
-    nccl_version_str = get_nccl_version_str()
-    if nccl_version_str:
-        nccl_version_str = nccl_version_str.replace("\n", "")
-        nccl_version_int = [int(s) for s in nccl_version_str.split(".")]
-        nccl_version_baseline = [2, 8, 4]
-        assert nccl_version_int >= nccl_version_baseline, (
-            "The version of NCCL is required to be at least v2.8.4 while training with "
-            "pipeline/MoE parallelism, but we found v{}. The previous version of NCCL has "
-            "some bugs in p2p communication, and you can see more detailed description "
-            "about this issue from ReleaseNotes of NCCL v2.8.4 "
-            "(https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-4.html#rel_2-8-4).".format(
-                nccl_version_str
-            )
+    nccl_version = core.nccl_version()
+    nccl_version_str = get_nccl_version_str(nccl_version)
+    nccl_version_baseline = 2804
+    assert nccl_version >= nccl_version_baseline, (
+        "The version of NCCL is required to be at least v2.8.4 while training with "
+        "pipeline/MoE parallelism, but we found v{}. The previous version of NCCL has "
+        "some bugs in p2p communication, and you can see more detailed description "
+        "about this issue from ReleaseNotes of NCCL v2.8.4 "
+        "(https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-4.html#rel_2-8-4).".format(
+            nccl_version_str
        )
-    else:
-        logging.warning("No version for NCCL library found!")
+    )
 
 
 def check_nccl_version_for_bf16():
-    nccl_version_str = get_nccl_version_str()
-    if nccl_version_str:
-        nccl_version_str = nccl_version_str.replace("\n", "")
-        nccl_version_int = [int(s) for s in nccl_version_str.split(".")]
-        nccl_version_baseline = [2, 10, 0]
-        return nccl_version_int >= nccl_version_baseline
-
-    return False
+    nccl_version = core.nccl_version()
+    nccl_version_baseline = 21000
+    return nccl_version >= nccl_version_baseline
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3.py
index e73a79724c38ed6240a0f1d5ca10c1d7fd87aba2..5499968079a2e3ebe4924fd5c73bb3e22a89c078 100644
--- a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3.py
@@ -33,7 +33,7 @@ from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import
 from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import (
     GroupShardedScaler,
 )
-from paddle.distributed.utils.nccl_utils import get_nccl_version_str
+from paddle.fluid import core
 from paddle.nn import Linear
 
 epoch = 10
@@ -119,7 +119,7 @@ class RandomDataset(paddle.io.Dataset):
 
 def optimizer_setting(model, use_pure_fp16, opt_group=False):
     clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
-    optimizer = paddle.optimizer.Momentum(
+    optimizer = paddle.optimizer.AdamW(
         parameters=[{"params": list(model.parameters())}]
         if opt_group
         else list(model.parameters()),
@@ -364,14 +364,9 @@ def test_stage2_stage3():
     )
 
     # bfp16
-    # NOTE: this is a hack to get int format nccl version, like 2134
-    # if current platform is not linux, version number will be 0
-    nccl_version_str = get_nccl_version_str()
-    nccl_version = (
-        int("".join(nccl_version_str.split("."))) if nccl_version_str else 0
-    )
+    nccl_version = core.nccl_version()
 
-    if nccl_version >= 2100:
+    if nccl_version >= 21000:
         stage2_params = train_mlp(
             mlp11,
             sharding_stage=2,
@@ -388,8 +383,8 @@ def test_stage2_stage3():
         )
         for i in range(len(stage2_params)):
             np.testing.assert_allclose(
-                stage2_params[i].numpy(),
-                stage3_params[i].numpy(),
+                stage2_params[i].astype("float32").numpy(),
+                stage3_params[i].astype("float32").numpy(),
                 rtol=1e-4,
                 atol=1e-3,
             )
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3_offload.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3_offload.py
index 3aad77fd788cee88959fd353105a464846e6c1f4..e97a163e42f9afdfb55b511db3ea3a0280429d4c 100644
--- a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3_offload.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3_offload.py
@@ -24,7 +24,7 @@ from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import
 from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import (
     GroupShardedScaler,
 )
-from paddle.distributed.utils.nccl_utils import get_nccl_version_str
+from paddle.fluid import core
 from paddle.nn import Linear
 
 epoch = 10
@@ -214,22 +214,16 @@ def test_stage3_offload():
     )
 
     # bfp16 offload
-    # NOTE: this is a hack to get int format nccl version, like 2134
-    # if current platform is not linux, version number will be 0
-    nccl_version_str = get_nccl_version_str()
-    nccl_version = (
-        int("".join(nccl_version_str.split("."))) if nccl_version_str else 0
-    )
-
-    if nccl_version >= 2100:
+    nccl_version = core.nccl_version()
+    if nccl_version >= 21000:
         stage3_params = train_mlp(mlp7, use_pure_fp16=True, use_bfp16=True)
         stage3_params_offload = train_mlp(
             mlp8, use_pure_fp16=True, offload=True, use_bfp16=True
         )
         for i in range(len(stage3_params)):
             np.testing.assert_allclose(
-                stage3_params[i].numpy(),
-                stage3_params_offload[i].numpy(),
+                stage3_params[i].astype("float32").numpy(),
+                stage3_params_offload[i].astype("float32").numpy(),
                 rtol=1e-2,
                 atol=1e-2,
             )
diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py
index 46dfbbd869bb53d00678e2715ebca267dbb16f71..265fa6052ddf9a8b979619c095d20a91538ef4d6 100644
--- a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py
+++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py
@@ -55,7 +55,7 @@ class TestCollectiveAllgatherAPI(TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             self.check_with_place(
@@ -118,7 +118,7 @@ class TestCollectiveAllgatherAPI(TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             self.check_with_place(
diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py
index 97850b5552b85868ac509348c05d37e1ab7ae559..64d3885e8a6a90902d8e348bd41e7dbe1168f704 100644
--- a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py
+++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py
@@ -46,7 +46,7 @@ class TestCollectiveAllreduceAPI(TestDistBase):
         red_types_to_test = [
             dist.ReduceOp.SUM,
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             for red_type in red_types_to_test:
@@ -107,7 +107,7 @@ class TestCollectiveAllreduceAPI(TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             self.check_with_place(
diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py
index decdd6c29c7447102a3cfb1ca63e5f1a20929908..548ac992bdbf33592affae368116de1c5f8c5708 100644
--- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py
+++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py
@@ -39,7 +39,7 @@ class TestCollectiveAllToAllAPI(TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             self.check_with_place(
diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py
index 9b542def42298f9289b6ede928ed66a6b383d126..97157ea5cb38be16fd001566306004570a9cd9f8 100644
--- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py
+++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py
@@ -32,7 +32,7 @@ class TestCollectiveAllToAllSingleAPI(test_base.TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             self.check_with_place(
diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py
index 16793b72d5841bb5f72fa186123d796f068e0ab4..cadd9def8f816be37d6ae655728bbf9074cbc8d0 100644
--- a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py
+++ b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py
@@ -43,7 +43,7 @@ class TestCollectiveBroadcastAPI(TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             self.check_with_place(
@@ -92,7 +92,7 @@ class TestCollectiveBroadcastAPI(TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             self.check_with_place(
diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_gather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_gather_api.py
index 68b6d12878fb6d456e66afd573de47896eb07ca3..e5eed88ae351161fabccd1f3d28de84026888bd8 100644
--- a/python/paddle/fluid/tests/unittests/collective/test_collective_gather_api.py
+++ b/python/paddle/fluid/tests/unittests/collective/test_collective_gather_api.py
@@ -36,7 +36,7 @@ class TestCollectiveGatherAPI(TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             self.check_with_place(
diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py
index 16c355811be7cca111a6d51492f74ee7e82c6911..9bd1cfb91a30ef72982888bc44bd937075709588 100644
--- a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py
+++ b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py
@@ -32,7 +32,7 @@ class TestCollectiveIsendIrecvAPI(test_base.TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             self.check_with_place(
diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py
index 70ae163054a12887a3da2c9d045fc57d914b99fb..3b3c89344c03d4c4bc3ca19a1cbe7f5bd5bcbc32 100644
--- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py
+++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py
@@ -44,7 +44,7 @@ class TestCollectiveReduceAPI(TestDistBase):
         red_types_to_test = [
             dist.ReduceOp.SUM,
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             if paddle.fluid.core.is_compiled_with_cuda():
@@ -102,7 +102,7 @@ class TestCollectiveReduceAPI(TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             self.check_with_place(
diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py
index ecb579c50470f32d1c9701f9a07da81bc351d357..4fc89d2e18fda8beb4d60fa5857a364625de469e 100644
--- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py
+++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py
@@ -32,7 +32,7 @@ class TestCollectiveReduceScatterAPI(test_base.TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             self.check_with_place(
@@ -54,7 +54,7 @@ class TestCollectiveReduceScatterAPI(test_base.TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             self.check_with_place(
diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py
index 6f463919c44a00ad64d97583272ff79047aba73d..9f56b45b97194fedb6be756ce72c6e410d27d3d8 100644
--- a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py
+++ b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py
@@ -44,7 +44,7 @@ class TestCollectiveScatterAPI(TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             self.check_with_place(
diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py
index 656e18351a2641840760365979e58b75aca336cf..2cbaaa585a294369d9ea4a842876ec32b9cf6427 100644
--- a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py
+++ b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py
@@ -41,7 +41,7 @@ class TestCollectiveSendRecvAPI(TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             if paddle.fluid.core.is_compiled_with_cuda():
@@ -64,7 +64,7 @@ class TestCollectiveSendRecvAPI(TestDistBase):
             "uint8",
             "bool",
         ]
-        if self._nccl_version >= 2100:
+        if self._nccl_version >= 21000:
             dtypes_to_test.append("bfloat16")
         for dtype in dtypes_to_test:
             self.check_with_place(
diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
index fcd58c25e36e1539319a12f4946b44941a1bf0e8..b92f4d342048a2116747123a580c5d2abc646aad 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
@@ -27,7 +27,6 @@ from eager_op_test import convert_float_to_uint16, convert_uint16_to_float
 import paddle
 import paddle.distributed as dist
 from paddle import fluid
-from paddle.distributed.utils.nccl_utils import get_nccl_version_str
 from paddle.fluid import core
 
@@ -194,10 +193,7 @@ class TestDistBase(unittest.TestCase):
 
         # NOTE: this is a hack to get int format nccl version, like 2134
         # if current platform is not linux, version number will be 0
-        nccl_version_str = get_nccl_version_str()
-        self._nccl_version = (
-            int("".join(nccl_version_str.split("."))) if nccl_version_str else 0
-        )
+        self._nccl_version = core.nccl_version()
 
     def tearDown(self):
         self.temp_dir.cleanup()
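
Note on the version encoding: the integer returned by core.nccl_version() matches the convention of NCCL's NCCL_VERSION_CODE, which the patched get_nccl_version_str(ver) decodes. Releases before 2.9 encode major * 1000 + minor * 100 + patch (so v2.8.4 -> 2804), while 2.9 and later encode major * 10000 + minor * 100 + patch (so v2.10.0 -> 21000); that is why the helper branches on ver >= 10000, and why the test baselines move from 2100 (the digits of "2.10.0" concatenated by the old string hack) to 21000. The sketch below mirrors the helper from this patch so the mapping can be sanity-checked without building Paddle; the function name nccl_version_to_str and the sample inputs are illustrative, not part of the patch.

    # Standalone sketch of the decoding done by get_nccl_version_str in this patch.
    def nccl_version_to_str(ver):
        if ver >= 10000:
            # NCCL >= 2.9: major * 10000 + minor * 100 + patch
            major, rest = ver // 10000, ver % 10000
        else:
            # NCCL < 2.9: major * 1000 + minor * 100 + patch
            major, rest = ver // 1000, ver % 1000
        return "{}.{}.{}".format(major, rest // 100, rest % 100)

    assert nccl_version_to_str(2804) == "2.8.4"    # p2p baseline in this patch
    assert nccl_version_to_str(21000) == "2.10.0"  # bfloat16 baseline in this patch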