Fix NCCL UB issue

This is fixing a UB issue which occurs with newer version of Clang (17+). The fix is also upstreamed through https://github.com/NVIDIA/nccl/pull/916. In addition I'm changing the handling of `enqueue.cc` which needs to be compiled in cuda mode under clang. The previous solution with just passing in the `-x cuda` option fails with CUDA 12+. I'm also correcting the version number that we set in the patch - not sure if this version is reported in some logs, but if it is, it should be correct. PiperOrigin-RevId: 564811002

Fix NCCL UB issue
This is fixing a UB issue which occurs with newer version of Clang (17+). The fix is also upstreamed through https://github.com/NVIDIA/nccl/pull/916. In addition I'm changing the handling of `enqueue.cc` which needs to be compiled in cuda mode under clang. The previous solution with just passing in the `-x cuda` option fails with CUDA 12+. I'm also correcting the version number that we set in the patch - not sure if this version is reported in some logs, but if it is, it should be correct. PiperOrigin-RevId: 564811002
4b8f8a33 · A. Unique TensorFlower · TensorFlower Gardener · c2f38110 · 4b8f8a33 · 4b8f8a33
7 changed file
--- a/tensorflow/dtensor/python/tests/BUILD
+++ b/tensorflow/dtensor/python/tests/BUILD
@@ -366,7 +366,7 @@ dtensor_test(
    env = {
        "DTENSOR_GPU_USE_NCCL_COMMUNICATION": "1",
        "NCCL_P2P_DISABLE": "1",  # FIXME(b/251183104): p2p detection in cuda 10.1+ is broken.
-        "NCCL_PROTO": "Simple",  # FIXME(b/272050398): Delete this when the Clang-16/NCCL incompatibility has been resolved.
+        "NCCL_PROTO": "Simple",  # FIXME(b/272050398): Delete this after the migration to LLVM-17.
    },
    tags = [
        "no_windows",
@@ -412,7 +412,7 @@ dtensor_test(
    env = {
        "DTENSOR_GPU_USE_NCCL_COMMUNICATION": "1",
        "NCCL_P2P_DISABLE": "1",  # FIXME(b/251183104): p2p detection in cuda 10.1+ is broken.
-        "NCCL_PROTO": "Simple",  # FIXME(b/272050398): Delete this when the Clang-16/NCCL incompatibility has been resolved.
+        "NCCL_PROTO": "Simple",  # FIXME(b/272050398): Delete this after the migration to LLVM-17.
    },
    tags = [
        "no_windows",

--- a/third_party/nccl/archive.BUILD
+++ b/third_party/nccl/archive.BUILD
@@ -8,7 +8,6 @@ exports_files(["LICENSE.txt"])
 load(
    "@local_config_cuda//cuda:build_defs.bzl",
    "cuda_library",
-    "if_cuda_clang",
 )
 load(
    "@local_config_nccl//:build_defs.bzl",
@@ -124,19 +123,56 @@ cc_library(
    ],
 )

-cc_library(
+alias(
    name = "enqueue",
+    actual = select({
+        "@local_config_cuda//cuda:using_clang": ":enqueue_clang",
+        "@local_config_cuda//cuda:using_nvcc": ":enqueue_nvcc",
+    }),
+)
+
+# Kernels and their names have special treatment in CUDA compilation.
+# Specifically, the host-side kernel launch stub (host-side representation of
+# the kernel) ends up having the name which does not match the actual kernel
+# name. In order to correctly refer to the kernel the referring code must be
+# compiled as CUDA.
+cuda_library(
+    name = "enqueue_clang",
+    srcs = [
+        "src/enqueue.cc",
+    ],
+    hdrs = ["src/nccl.h"],
+    copts = [
+        "--cuda-host-only",
+    ],
+    include_prefix = "third_party/nccl",
+    linkopts = ["-lrt"],
+    strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_clang": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":device",
+        ":include_hdrs",
+        ":src_hdrs",
+    ],
+)
+
+cc_library(
+    name = "enqueue_nvcc",
    srcs = [
        "src/enqueue.cc",
    ],
    hdrs = ["src/nccl.h"],
-    copts = if_cuda_clang([
-        "-x",
-        "cuda",
-    ]),
    include_prefix = "third_party/nccl",
    linkopts = ["-lrt"],
    strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_nvcc": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
    visibility = ["//visibility:public"],
    deps = [
        ":device",

--- a/third_party/nccl/archive.patch
+++ b/third_party/nccl/archive.patch
@@ -46,12 +46,12 @@ index 985274e..7ebb1e1 100644
 -#define NCCL_PATCH ${nccl:Patch}
 -#define NCCL_SUFFIX "${nccl:Suffix}"
 +#define NCCL_MAJOR 2
-+#define NCCL_MINOR 13
-+#define NCCL_PATCH 4
+#define NCCL_MINOR 16
+#define NCCL_PATCH 5
 +#define NCCL_SUFFIX ""
 
 -#define NCCL_VERSION_CODE ${nccl:Version}
-+#define NCCL_VERSION_CODE 2134
+#define NCCL_VERSION_CODE 2165
 #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
 
 #ifdef __cplusplus
@@ -67,4 +67,18 @@ index 2aeb932..cdc67d2 100644
 +  explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
   {
     schema_attr.name = schemaName;
-     schema_attr.entries = entries;
\ No newline at end of file
+     schema_attr.entries = entries;
+diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
+index accf8371a..4ab1bfac6 100644
+--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
+@@ -166,7 +166,8 @@ __device__ void ncclKernel(
+       bytes = 0;
+       break;
+     }
+-    copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+    if (bytes)
+      copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+   }
+   __syncthreads(); // publish ncclShmem
+ 
\ No newline at end of file
--- a/third_party/xla/third_party/nccl/archive.BUILD
+++ b/third_party/xla/third_party/nccl/archive.BUILD
@@ -8,7 +8,6 @@ exports_files(["LICENSE.txt"])
 load(
    "@local_config_cuda//cuda:build_defs.bzl",
    "cuda_library",
-    "if_cuda_clang",
 )
 load(
    "@local_config_nccl//:build_defs.bzl",
@@ -124,19 +123,56 @@ cc_library(
    ],
 )

-cc_library(
+alias(
    name = "enqueue",
+    actual = select({
+        "@local_config_cuda//cuda:using_clang": ":enqueue_clang",
+        "@local_config_cuda//cuda:using_nvcc": ":enqueue_nvcc",
+    }),
+)
+
+# Kernels and their names have special treatment in CUDA compilation.
+# Specifically, the host-side kernel launch stub (host-side representation of
+# the kernel) ends up having the name which does not match the actual kernel
+# name. In order to correctly refer to the kernel the referring code must be
+# compiled as CUDA.
+cuda_library(
+    name = "enqueue_clang",
+    srcs = [
+        "src/enqueue.cc",
+    ],
+    hdrs = ["src/nccl.h"],
+    copts = [
+        "--cuda-host-only",
+    ],
+    include_prefix = "third_party/nccl",
+    linkopts = ["-lrt"],
+    strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_clang": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":device",
+        ":include_hdrs",
+        ":src_hdrs",
+    ],
+)
+
+cc_library(
+    name = "enqueue_nvcc",
    srcs = [
        "src/enqueue.cc",
    ],
    hdrs = ["src/nccl.h"],
-    copts = if_cuda_clang([
-        "-x",
-        "cuda",
-    ]),
    include_prefix = "third_party/nccl",
    linkopts = ["-lrt"],
    strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_nvcc": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
    visibility = ["//visibility:public"],
    deps = [
        ":device",

--- a/third_party/xla/third_party/nccl/archive.patch
+++ b/third_party/xla/third_party/nccl/archive.patch
@@ -46,12 +46,12 @@ index 985274e..7ebb1e1 100644
 -#define NCCL_PATCH ${nccl:Patch}
 -#define NCCL_SUFFIX "${nccl:Suffix}"
 +#define NCCL_MAJOR 2
-+#define NCCL_MINOR 13
-+#define NCCL_PATCH 4
+#define NCCL_MINOR 16
+#define NCCL_PATCH 5
 +#define NCCL_SUFFIX ""
 
 -#define NCCL_VERSION_CODE ${nccl:Version}
-+#define NCCL_VERSION_CODE 2134
+#define NCCL_VERSION_CODE 2165
 #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
 
 #ifdef __cplusplus
@@ -67,4 +67,18 @@ index 2aeb932..cdc67d2 100644
 +  explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
   {
     schema_attr.name = schemaName;
-     schema_attr.entries = entries;
\ No newline at end of file
+     schema_attr.entries = entries;
+diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
+index accf8371a..4ab1bfac6 100644
+--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
+@@ -166,7 +166,8 @@ __device__ void ncclKernel(
+       bytes = 0;
+       break;
+     }
+-    copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+    if (bytes)
+      copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+   }
+   __syncthreads(); // publish ncclShmem
+ 
\ No newline at end of file
--- a/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD
+++ b/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD
@@ -8,7 +8,6 @@ exports_files(["LICENSE.txt"])
 load(
    "@local_config_cuda//cuda:build_defs.bzl",
    "cuda_library",
-    "if_cuda_clang",
 )
 load(
    "@local_config_nccl//:build_defs.bzl",
@@ -124,19 +123,56 @@ cc_library(
    ],
 )

-cc_library(
+alias(
    name = "enqueue",
+    actual = select({
+        "@local_config_cuda//cuda:using_clang": ":enqueue_clang",
+        "@local_config_cuda//cuda:using_nvcc": ":enqueue_nvcc",
+    }),
+)
+
+# Kernels and their names have special treatment in CUDA compilation.
+# Specifically, the host-side kernel launch stub (host-side representation of
+# the kernel) ends up having the name which does not match the actual kernel
+# name. In order to correctly refer to the kernel the referring code must be
+# compiled as CUDA.
+cuda_library(
+    name = "enqueue_clang",
+    srcs = [
+        "src/enqueue.cc",
+    ],
+    hdrs = ["src/nccl.h"],
+    copts = [
+        "--cuda-host-only",
+    ],
+    include_prefix = "third_party/nccl",
+    linkopts = ["-lrt"],
+    strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_clang": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":device",
+        ":include_hdrs",
+        ":src_hdrs",
+    ],
+)
+
+cc_library(
+    name = "enqueue_nvcc",
    srcs = [
        "src/enqueue.cc",
    ],
    hdrs = ["src/nccl.h"],
-    copts = if_cuda_clang([
-        "-x",
-        "cuda",
-    ]),
    include_prefix = "third_party/nccl",
    linkopts = ["-lrt"],
    strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_nvcc": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
    visibility = ["//visibility:public"],
    deps = [
        ":device",

--- a/third_party/xla/third_party/tsl/third_party/nccl/archive.patch
+++ b/third_party/xla/third_party/tsl/third_party/nccl/archive.patch
@@ -46,12 +46,12 @@ index 985274e..7ebb1e1 100644
 -#define NCCL_PATCH ${nccl:Patch}
 -#define NCCL_SUFFIX "${nccl:Suffix}"
 +#define NCCL_MAJOR 2
-+#define NCCL_MINOR 13
-+#define NCCL_PATCH 4
+#define NCCL_MINOR 16
+#define NCCL_PATCH 5
 +#define NCCL_SUFFIX ""
 
 -#define NCCL_VERSION_CODE ${nccl:Version}
-+#define NCCL_VERSION_CODE 2134
+#define NCCL_VERSION_CODE 2165
 #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
 
 #ifdef __cplusplus
@@ -67,4 +67,18 @@ index 2aeb932..cdc67d2 100644
 +  explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
   {
     schema_attr.name = schemaName;
-     schema_attr.entries = entries;
\ No newline at end of file
+     schema_attr.entries = entries;
+diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
+index accf8371a..4ab1bfac6 100644
+--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
+@@ -166,7 +166,8 @@ __device__ void ncclKernel(
+       bytes = 0;
+       break;
+     }
+-    copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+    if (bytes)
+      copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+   }
+   __syncthreads(); // publish ncclShmem
+ 
\ No newline at end of file