diff --git a/tensorflow/dtensor/python/tests/BUILD b/tensorflow/dtensor/python/tests/BUILD
index 615baad3085f67cba2a57cc2dfc6fd8ffc33031d..b83fa6e8932782f7a707c0a3ec2a3968d52a2a16 100644
--- a/tensorflow/dtensor/python/tests/BUILD
+++ b/tensorflow/dtensor/python/tests/BUILD
@@ -366,7 +366,7 @@ dtensor_test(
     env = {
         "DTENSOR_GPU_USE_NCCL_COMMUNICATION": "1",
         "NCCL_P2P_DISABLE": "1",  # FIXME(b/251183104): p2p detection in cuda 10.1+ is broken.
-        "NCCL_PROTO": "Simple",  # FIXME(b/272050398): Delete this when the Clang-16/NCCL incompatibility has been resolved.
+        "NCCL_PROTO": "Simple",  # FIXME(b/272050398): Delete this after the migration to LLVM-17.
     },
     tags = [
         "no_windows",
@@ -412,7 +412,7 @@ dtensor_test(
     env = {
         "DTENSOR_GPU_USE_NCCL_COMMUNICATION": "1",
         "NCCL_P2P_DISABLE": "1",  # FIXME(b/251183104): p2p detection in cuda 10.1+ is broken.
-        "NCCL_PROTO": "Simple",  # FIXME(b/272050398): Delete this when the Clang-16/NCCL incompatibility has been resolved.
+        "NCCL_PROTO": "Simple",  # FIXME(b/272050398): Delete this after the migration to LLVM-17.
     },
     tags = [
         "no_windows",
diff --git a/third_party/nccl/archive.BUILD b/third_party/nccl/archive.BUILD
index 915672d04326384dc7e3298a1d0fd000f45f420d..a608faf8a262ef812cf08c99b816a1f5dc05ff85 100644
--- a/third_party/nccl/archive.BUILD
+++ b/third_party/nccl/archive.BUILD
@@ -8,7 +8,6 @@ exports_files(["LICENSE.txt"])
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
     "cuda_library",
-    "if_cuda_clang",
 )
 load(
     "@local_config_nccl//:build_defs.bzl",
@@ -124,19 +123,56 @@ cc_library(
     ],
 )
 
-cc_library(
+alias(
     name = "enqueue",
+    actual = select({
+        "@local_config_cuda//cuda:using_clang": ":enqueue_clang",
+        "@local_config_cuda//cuda:using_nvcc": ":enqueue_nvcc",
+    }),
+)
+
+# Kernels and their names have special treatment in CUDA compilation.
+# Specifically, the host-side kernel launch stub (the host-side representation
+# of the kernel) ends up with a name that does not match the actual kernel
+# name. To refer to the kernel correctly, the referring code must itself be
+# compiled as CUDA.
+cuda_library(
+    name = "enqueue_clang",
+    srcs = [
+        "src/enqueue.cc",
+    ],
+    hdrs = ["src/nccl.h"],
+    copts = [
+        "--cuda-host-only",
+    ],
+    include_prefix = "third_party/nccl",
+    linkopts = ["-lrt"],
+    strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_clang": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":device",
+        ":include_hdrs",
+        ":src_hdrs",
+    ],
+)
+
+cc_library(
+    name = "enqueue_nvcc",
     srcs = [
         "src/enqueue.cc",
     ],
     hdrs = ["src/nccl.h"],
-    copts = if_cuda_clang([
-        "-x",
-        "cuda",
-    ]),
     include_prefix = "third_party/nccl",
     linkopts = ["-lrt"],
     strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_nvcc": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
     visibility = ["//visibility:public"],
     deps = [
         ":device",
diff --git a/third_party/nccl/archive.patch b/third_party/nccl/archive.patch
index 7934adf6e1d2bb7c4c7d4fe4eb36b0d7daccb165..6d40fd869af299f0dd8fc03d7514b076b4b7680d 100644
--- a/third_party/nccl/archive.patch
+++ b/third_party/nccl/archive.patch
@@ -46,12 +46,12 @@ index 985274e..7ebb1e1 100644
 -#define NCCL_PATCH ${nccl:Patch}
 -#define NCCL_SUFFIX "${nccl:Suffix}"
 +#define NCCL_MAJOR 2
-+#define NCCL_MINOR 13
-+#define NCCL_PATCH 4
++#define NCCL_MINOR 16
++#define NCCL_PATCH 5
 +#define NCCL_SUFFIX ""
  
 -#define NCCL_VERSION_CODE ${nccl:Version}
-+#define NCCL_VERSION_CODE 2134
++#define NCCL_VERSION_CODE 2165
  #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
  
  #ifdef __cplusplus
@@ -67,4 +67,18 @@ index 2aeb932..cdc67d2 100644
 +  explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept {
      schema_attr.name = schemaName;
-     schema_attr.entries = entries;
\ No newline at end of file
+     schema_attr.entries = entries;
+diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
+index accf8371a..4ab1bfac6 100644
+--- a/src/collectives/device/common.h
++++ b/src/collectives/device/common.h
+@@ -166,7 +166,8 @@ __device__ void ncclKernel(
+       bytes = 0;
+       break;
+     }
+-    copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
++    if (bytes)
++      copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+   }
+   __syncthreads(); // publish ncclShmem
+ 
\ No newline at end of file
diff --git a/third_party/xla/third_party/nccl/archive.BUILD b/third_party/xla/third_party/nccl/archive.BUILD
index 915672d04326384dc7e3298a1d0fd000f45f420d..a608faf8a262ef812cf08c99b816a1f5dc05ff85 100644
--- a/third_party/xla/third_party/nccl/archive.BUILD
+++ b/third_party/xla/third_party/nccl/archive.BUILD
@@ -8,7 +8,6 @@ exports_files(["LICENSE.txt"])
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
     "cuda_library",
-    "if_cuda_clang",
 )
 load(
     "@local_config_nccl//:build_defs.bzl",
@@ -124,19 +123,56 @@ cc_library(
     ],
 )
 
-cc_library(
+alias(
     name = "enqueue",
+    actual = select({
+        "@local_config_cuda//cuda:using_clang": ":enqueue_clang",
+        "@local_config_cuda//cuda:using_nvcc": ":enqueue_nvcc",
+    }),
+)
+
+# Kernels and their names have special treatment in CUDA compilation.
+# Specifically, the host-side kernel launch stub (the host-side representation
+# of the kernel) ends up with a name that does not match the actual kernel
+# name. To refer to the kernel correctly, the referring code must itself be
+# compiled as CUDA.
+cuda_library(
+    name = "enqueue_clang",
+    srcs = [
+        "src/enqueue.cc",
+    ],
+    hdrs = ["src/nccl.h"],
+    copts = [
+        "--cuda-host-only",
+    ],
+    include_prefix = "third_party/nccl",
+    linkopts = ["-lrt"],
+    strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_clang": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":device",
+        ":include_hdrs",
+        ":src_hdrs",
+    ],
+)
+
+cc_library(
+    name = "enqueue_nvcc",
     srcs = [
         "src/enqueue.cc",
     ],
     hdrs = ["src/nccl.h"],
-    copts = if_cuda_clang([
-        "-x",
-        "cuda",
-    ]),
     include_prefix = "third_party/nccl",
     linkopts = ["-lrt"],
     strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_nvcc": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
     visibility = ["//visibility:public"],
     deps = [
         ":device",
diff --git a/third_party/xla/third_party/nccl/archive.patch b/third_party/xla/third_party/nccl/archive.patch
index 7934adf6e1d2bb7c4c7d4fe4eb36b0d7daccb165..6d40fd869af299f0dd8fc03d7514b076b4b7680d 100644
--- a/third_party/xla/third_party/nccl/archive.patch
+++ b/third_party/xla/third_party/nccl/archive.patch
@@ -46,12 +46,12 @@ index 985274e..7ebb1e1 100644
 -#define NCCL_PATCH ${nccl:Patch}
 -#define NCCL_SUFFIX "${nccl:Suffix}"
 +#define NCCL_MAJOR 2
-+#define NCCL_MINOR 13
-+#define NCCL_PATCH 4
++#define NCCL_MINOR 16
++#define NCCL_PATCH 5
 +#define NCCL_SUFFIX ""
  
 -#define NCCL_VERSION_CODE ${nccl:Version}
-+#define NCCL_VERSION_CODE 2134
++#define NCCL_VERSION_CODE 2165
  #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
  
  #ifdef __cplusplus
@@ -67,4 +67,18 @@ index 2aeb932..cdc67d2 100644
 +  explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept {
      schema_attr.name = schemaName;
-     schema_attr.entries = entries;
\ No newline at end of file
+     schema_attr.entries = entries;
+diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
+index accf8371a..4ab1bfac6 100644
+--- a/src/collectives/device/common.h
++++ b/src/collectives/device/common.h
+@@ -166,7 +166,8 @@ __device__ void ncclKernel(
+       bytes = 0;
+       break;
+     }
+-    copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
++    if (bytes)
++      copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+   }
+   __syncthreads(); // publish ncclShmem
+ 
\ No newline at end of file
diff --git a/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD b/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD
index 915672d04326384dc7e3298a1d0fd000f45f420d..a608faf8a262ef812cf08c99b816a1f5dc05ff85 100644
--- a/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD
+++ b/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD
@@ -8,7 +8,6 @@ exports_files(["LICENSE.txt"])
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
     "cuda_library",
-    "if_cuda_clang",
 )
 load(
     "@local_config_nccl//:build_defs.bzl",
@@ -124,19 +123,56 @@ cc_library(
     ],
 )
 
-cc_library(
+alias(
     name = "enqueue",
+    actual = select({
+        "@local_config_cuda//cuda:using_clang": ":enqueue_clang",
+        "@local_config_cuda//cuda:using_nvcc": ":enqueue_nvcc",
+    }),
+)
+
+# Kernels and their names have special treatment in CUDA compilation.
+# Specifically, the host-side kernel launch stub (the host-side representation
+# of the kernel) ends up with a name that does not match the actual kernel
+# name. To refer to the kernel correctly, the referring code must itself be
+# compiled as CUDA.
+cuda_library(
+    name = "enqueue_clang",
+    srcs = [
+        "src/enqueue.cc",
+    ],
+    hdrs = ["src/nccl.h"],
+    copts = [
+        "--cuda-host-only",
+    ],
+    include_prefix = "third_party/nccl",
+    linkopts = ["-lrt"],
+    strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_clang": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":device",
+        ":include_hdrs",
+        ":src_hdrs",
+    ],
+)
+
+cc_library(
+    name = "enqueue_nvcc",
     srcs = [
         "src/enqueue.cc",
     ],
     hdrs = ["src/nccl.h"],
-    copts = if_cuda_clang([
-        "-x",
-        "cuda",
-    ]),
     include_prefix = "third_party/nccl",
     linkopts = ["-lrt"],
     strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_nvcc": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
     visibility = ["//visibility:public"],
     deps = [
         ":device",
diff --git a/third_party/xla/third_party/tsl/third_party/nccl/archive.patch b/third_party/xla/third_party/tsl/third_party/nccl/archive.patch
index 7934adf6e1d2bb7c4c7d4fe4eb36b0d7daccb165..6d40fd869af299f0dd8fc03d7514b076b4b7680d 100644
--- a/third_party/xla/third_party/tsl/third_party/nccl/archive.patch
+++ b/third_party/xla/third_party/tsl/third_party/nccl/archive.patch
@@ -46,12 +46,12 @@ index 985274e..7ebb1e1 100644
 -#define NCCL_PATCH ${nccl:Patch}
 -#define NCCL_SUFFIX "${nccl:Suffix}"
 +#define NCCL_MAJOR 2
-+#define NCCL_MINOR 13
-+#define NCCL_PATCH 4
++#define NCCL_MINOR 16
++#define NCCL_PATCH 5
 +#define NCCL_SUFFIX ""
  
 -#define NCCL_VERSION_CODE ${nccl:Version}
-+#define NCCL_VERSION_CODE 2134
++#define NCCL_VERSION_CODE 2165
  #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
  
  #ifdef __cplusplus
@@ -67,4 +67,18 @@ index 2aeb932..cdc67d2 100644
 +  explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept {
      schema_attr.name = schemaName;
-     schema_attr.entries = entries;
\ No newline at end of file
+     schema_attr.entries = entries;
+diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
+index accf8371a..4ab1bfac6 100644
+--- a/src/collectives/device/common.h
++++ b/src/collectives/device/common.h
+@@ -166,7 +166,8 @@ __device__ void ncclKernel(
+       bytes = 0;
+       break;
+     }
+-    copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
++    if (bytes)
++      copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+   }
+   __syncthreads(); // publish ncclShmem
+ 
\ No newline at end of file
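
Note on the "host-side kernel launch stub" comment added to the archive.BUILD files above: the sketch below is an illustration only, not part of NCCL or of this change. It shows why a translation unit that refers to a kernel by address (as src/enqueue.cc does) has to be compiled as CUDA rather than plain C++; with clang this is the --cuda-host-only mode used by the enqueue_clang target. The names myKernel and launchMyKernel are hypothetical and exist only for this example.

// Hypothetical example (not NCCL code). When this file is compiled as CUDA,
// &myKernel taken from host code resolves to the registered host-side launch
// stub, which the runtime maps back to the device symbol. Compiled as plain
// C++, the reference would not resolve to a stub matching the kernel's name.
#include <cuda_runtime.h>

__global__ void myKernel(int* out) {  // device kernel (hypothetical)
  out[threadIdx.x] = threadIdx.x;
}

// Host-side launcher living in the same CUDA translation unit.
cudaError_t launchMyKernel(int* d_out, cudaStream_t stream) {
  void* args[] = {&d_out};
  return cudaLaunchKernel(reinterpret_cast<const void*>(&myKernel),
                          dim3(1), dim3(32), args, /*sharedMem=*/0, stream);
}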