diff --git a/tensorflow/dtensor/python/tests/BUILD b/tensorflow/dtensor/python/tests/BUILD
index 615baad3085f67cba2a57cc2dfc6fd8ffc33031d..b83fa6e8932782f7a707c0a3ec2a3968d52a2a16 100644
--- a/tensorflow/dtensor/python/tests/BUILD
+++ b/tensorflow/dtensor/python/tests/BUILD
@@ -366,7 +366,7 @@ dtensor_test(
     env = {
         "DTENSOR_GPU_USE_NCCL_COMMUNICATION": "1",
         "NCCL_P2P_DISABLE": "1",  # FIXME(b/251183104): p2p detection in cuda 10.1+ is broken.
-        "NCCL_PROTO": "Simple",  # FIXME(b/272050398): Delete this when the Clang-16/NCCL incompatibility has been resolved.
+        "NCCL_PROTO": "Simple",  # FIXME(b/272050398): Delete this after the migration to LLVM-17.
     },
     tags = [
         "no_windows",
@@ -412,7 +412,7 @@ dtensor_test(
     env = {
         "DTENSOR_GPU_USE_NCCL_COMMUNICATION": "1",
         "NCCL_P2P_DISABLE": "1",  # FIXME(b/251183104): p2p detection in cuda 10.1+ is broken.
-        "NCCL_PROTO": "Simple",  # FIXME(b/272050398): Delete this when the Clang-16/NCCL incompatibility has been resolved.
+        "NCCL_PROTO": "Simple",  # FIXME(b/272050398): Delete this after the migration to LLVM-17.
     },
     tags = [
         "no_windows",
diff --git a/third_party/nccl/archive.BUILD b/third_party/nccl/archive.BUILD
index 915672d04326384dc7e3298a1d0fd000f45f420d..a608faf8a262ef812cf08c99b816a1f5dc05ff85 100644
--- a/third_party/nccl/archive.BUILD
+++ b/third_party/nccl/archive.BUILD
@@ -8,7 +8,6 @@ exports_files(["LICENSE.txt"])
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
     "cuda_library",
-    "if_cuda_clang",
 )
 load(
     "@local_config_nccl//:build_defs.bzl",
@@ -124,19 +123,56 @@ cc_library(
     ],
 )
 
-cc_library(
+alias(
     name = "enqueue",
+    actual = select({
+        "@local_config_cuda//cuda:using_clang": ":enqueue_clang",
+        "@local_config_cuda//cuda:using_nvcc": ":enqueue_nvcc",
+    }),
+)
+
+# Kernels and their names have special treatment in CUDA compilation.
+# Specifically, the host-side kernel launch stub (the host-side representation
+# of the kernel) ends up with a name that does not match the actual kernel
+# name. To refer to the kernel correctly, the referring code must itself be
+# compiled as CUDA.
+cuda_library(
+    name = "enqueue_clang",
+    srcs = [
+        "src/enqueue.cc",
+    ],
+    hdrs = ["src/nccl.h"],
+    copts = [
+        "--cuda-host-only",
+    ],
+    include_prefix = "third_party/nccl",
+    linkopts = ["-lrt"],
+    strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_clang": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":device",
+        ":include_hdrs",
+        ":src_hdrs",
+    ],
+)
+
+cc_library(
+    name = "enqueue_nvcc",
     srcs = [
         "src/enqueue.cc",
     ],
     hdrs = ["src/nccl.h"],
-    copts = if_cuda_clang([
-        "-x",
-        "cuda",
-    ]),
     include_prefix = "third_party/nccl",
     linkopts = ["-lrt"],
     strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_nvcc": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
     visibility = ["//visibility:public"],
     deps = [
         ":device",
diff --git a/third_party/nccl/archive.patch b/third_party/nccl/archive.patch
index 7934adf6e1d2bb7c4c7d4fe4eb36b0d7daccb165..6d40fd869af299f0dd8fc03d7514b076b4b7680d 100644
--- a/third_party/nccl/archive.patch
+++ b/third_party/nccl/archive.patch
@@ -46,12 +46,12 @@ index 985274e..7ebb1e1 100644
 -#define NCCL_PATCH ${nccl:Patch}
 -#define NCCL_SUFFIX "${nccl:Suffix}"
 +#define NCCL_MAJOR 2
-+#define NCCL_MINOR 13
-+#define NCCL_PATCH 4
++#define NCCL_MINOR 16
++#define NCCL_PATCH 5
 +#define NCCL_SUFFIX ""
  
 -#define NCCL_VERSION_CODE ${nccl:Version}
-+#define NCCL_VERSION_CODE 2134
++#define NCCL_VERSION_CODE 2165
  #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
  
  #ifdef __cplusplus
@@ -67,4 +67,18 @@ index 2aeb932..cdc67d2 100644
 +  explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept {
      schema_attr.name = schemaName;
-     schema_attr.entries = entries;
\ No newline at end of file
+     schema_attr.entries = entries;
+diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
+index accf8371a..4ab1bfac6 100644
+--- a/src/collectives/device/common.h
++++ b/src/collectives/device/common.h
+@@ -166,7 +166,8 @@ __device__ void ncclKernel(
+       bytes = 0;
+       break;
+     }
+-    copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
++    if (bytes)
++      copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+   }
+   __syncthreads(); // publish ncclShmem
+ 
\ No newline at end of file
diff --git a/third_party/xla/third_party/nccl/archive.BUILD b/third_party/xla/third_party/nccl/archive.BUILD
index 915672d04326384dc7e3298a1d0fd000f45f420d..a608faf8a262ef812cf08c99b816a1f5dc05ff85 100644
--- a/third_party/xla/third_party/nccl/archive.BUILD
+++ b/third_party/xla/third_party/nccl/archive.BUILD
@@ -8,7 +8,6 @@ exports_files(["LICENSE.txt"])
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
     "cuda_library",
-    "if_cuda_clang",
 )
 load(
     "@local_config_nccl//:build_defs.bzl",
@@ -124,19 +123,56 @@ cc_library(
     ],
 )
 
-cc_library(
+alias(
     name = "enqueue",
+    actual = select({
+        "@local_config_cuda//cuda:using_clang": ":enqueue_clang",
+        "@local_config_cuda//cuda:using_nvcc": ":enqueue_nvcc",
+    }),
+)
+
+# Kernels and their names have special treatment in CUDA compilation.
+# Specifically, the host-side kernel launch stub (the host-side representation
+# of the kernel) ends up with a name that does not match the actual kernel
+# name. To refer to the kernel correctly, the referring code must itself be
+# compiled as CUDA.
+cuda_library(
+    name = "enqueue_clang",
+    srcs = [
+        "src/enqueue.cc",
+    ],
+    hdrs = ["src/nccl.h"],
+    copts = [
+        "--cuda-host-only",
+    ],
+    include_prefix = "third_party/nccl",
+    linkopts = ["-lrt"],
+    strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_clang": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":device",
+        ":include_hdrs",
+        ":src_hdrs",
+    ],
+)
+
+cc_library(
+    name = "enqueue_nvcc",
     srcs = [
         "src/enqueue.cc",
     ],
     hdrs = ["src/nccl.h"],
-    copts = if_cuda_clang([
-        "-x",
-        "cuda",
-    ]),
     include_prefix = "third_party/nccl",
     linkopts = ["-lrt"],
     strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_nvcc": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
     visibility = ["//visibility:public"],
     deps = [
         ":device",
diff --git a/third_party/xla/third_party/nccl/archive.patch b/third_party/xla/third_party/nccl/archive.patch
index 7934adf6e1d2bb7c4c7d4fe4eb36b0d7daccb165..6d40fd869af299f0dd8fc03d7514b076b4b7680d 100644
--- a/third_party/xla/third_party/nccl/archive.patch
+++ b/third_party/xla/third_party/nccl/archive.patch
@@ -46,12 +46,12 @@ index 985274e..7ebb1e1 100644
 -#define NCCL_PATCH ${nccl:Patch}
 -#define NCCL_SUFFIX "${nccl:Suffix}"
 +#define NCCL_MAJOR 2
-+#define NCCL_MINOR 13
-+#define NCCL_PATCH 4
++#define NCCL_MINOR 16
++#define NCCL_PATCH 5
 +#define NCCL_SUFFIX ""
  
 -#define NCCL_VERSION_CODE ${nccl:Version}
-+#define NCCL_VERSION_CODE 2134
++#define NCCL_VERSION_CODE 2165
  #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
  
  #ifdef __cplusplus
@@ -67,4 +67,18 @@ index 2aeb932..cdc67d2 100644
 +  explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept {
      schema_attr.name = schemaName;
-     schema_attr.entries = entries;
\ No newline at end of file
+     schema_attr.entries = entries;
+diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
+index accf8371a..4ab1bfac6 100644
+--- a/src/collectives/device/common.h
++++ b/src/collectives/device/common.h
+@@ -166,7 +166,8 @@ __device__ void ncclKernel(
+       bytes = 0;
+       break;
+     }
+-    copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
++    if (bytes)
++      copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+   }
+   __syncthreads(); // publish ncclShmem
+ 
\ No newline at end of file
diff --git a/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD b/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD
index 915672d04326384dc7e3298a1d0fd000f45f420d..a608faf8a262ef812cf08c99b816a1f5dc05ff85 100644
--- a/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD
+++ b/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD
@@ -8,7 +8,6 @@ exports_files(["LICENSE.txt"])
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
     "cuda_library",
-    "if_cuda_clang",
 )
 load(
     "@local_config_nccl//:build_defs.bzl",
@@ -124,19 +123,56 @@ cc_library(
     ],
 )
 
-cc_library(
+alias(
     name = "enqueue",
+    actual = select({
+        "@local_config_cuda//cuda:using_clang": ":enqueue_clang",
+        "@local_config_cuda//cuda:using_nvcc": ":enqueue_nvcc",
+    }),
+)
+
+# Kernels and their names have special treatment in CUDA compilation.
+# Specifically, the host-side kernel launch stub (the host-side representation
+# of the kernel) ends up with a name that does not match the actual kernel
+# name. To refer to the kernel correctly, the referring code must itself be
+# compiled as CUDA.
+cuda_library(
+    name = "enqueue_clang",
+    srcs = [
+        "src/enqueue.cc",
+    ],
+    hdrs = ["src/nccl.h"],
+    copts = [
+        "--cuda-host-only",
+    ],
+    include_prefix = "third_party/nccl",
+    linkopts = ["-lrt"],
+    strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_clang": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":device",
+        ":include_hdrs",
+        ":src_hdrs",
+    ],
+)
+
+cc_library(
+    name = "enqueue_nvcc",
     srcs = [
         "src/enqueue.cc",
     ],
     hdrs = ["src/nccl.h"],
-    copts = if_cuda_clang([
-        "-x",
-        "cuda",
-    ]),
     include_prefix = "third_party/nccl",
     linkopts = ["-lrt"],
     strip_include_prefix = "src",
+    target_compatible_with = select({
+        "@local_config_cuda//cuda:using_nvcc": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
     visibility = ["//visibility:public"],
     deps = [
         ":device",
diff --git a/third_party/xla/third_party/tsl/third_party/nccl/archive.patch b/third_party/xla/third_party/tsl/third_party/nccl/archive.patch
index 7934adf6e1d2bb7c4c7d4fe4eb36b0d7daccb165..6d40fd869af299f0dd8fc03d7514b076b4b7680d 100644
--- a/third_party/xla/third_party/tsl/third_party/nccl/archive.patch
+++ b/third_party/xla/third_party/tsl/third_party/nccl/archive.patch
@@ -46,12 +46,12 @@ index 985274e..7ebb1e1 100644
 -#define NCCL_PATCH ${nccl:Patch}
 -#define NCCL_SUFFIX "${nccl:Suffix}"
 +#define NCCL_MAJOR 2
-+#define NCCL_MINOR 13
-+#define NCCL_PATCH 4
++#define NCCL_MINOR 16
++#define NCCL_PATCH 5
 +#define NCCL_SUFFIX ""
  
 -#define NCCL_VERSION_CODE ${nccl:Version}
-+#define NCCL_VERSION_CODE 2134
++#define NCCL_VERSION_CODE 2165
  #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
  
  #ifdef __cplusplus
@@ -67,4 +67,18 @@ index 2aeb932..cdc67d2 100644
 +  explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept {
      schema_attr.name = schemaName;
-     schema_attr.entries = entries;
\ No newline at end of file
+     schema_attr.entries = entries;
+diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
+index accf8371a..4ab1bfac6 100644
+--- a/src/collectives/device/common.h
++++ b/src/collectives/device/common.h
+@@ -166,7 +166,8 @@ __device__ void ncclKernel(
+       bytes = 0;
+       break;
+     }
+-    copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
++    if (bytes)
++      copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+   }
+   __syncthreads(); // publish ncclShmem
+ 
\ No newline at end of file
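
Note on the "host-side kernel launch stub" comment added to the archive.BUILD files above: the sketch below is an illustration only, not part of NCCL or of this change. It shows why a translation unit that refers to a kernel by address (as src/enqueue.cc does) has to be compiled as CUDA rather than plain C++; with clang this is the --cuda-host-only mode used by the enqueue_clang target. The names myKernel and launchMyKernel are hypothetical and exist only for this example.

// Hypothetical example (not NCCL code). When this file is compiled as CUDA,
// &myKernel taken from host code resolves to the registered host-side launch
// stub, which the runtime maps back to the device symbol. Compiled as plain
// C++, the reference would not resolve to a stub matching the kernel's name.
#include <cuda_runtime.h>

__global__ void myKernel(int* out) {  // device kernel (hypothetical)
  out[threadIdx.x] = threadIdx.x;
}

// Host-side launcher living in the same CUDA translation unit.
cudaError_t launchMyKernel(int* d_out, cudaStream_t stream) {
  void* args[] = {&d_out};
  return cudaLaunchKernel(reinterpret_cast<const void*>(&myKernel),
                          dim3(1), dim3(32), args, /*sharedMem=*/0, stream);
}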