提交 4b8f8a33 编写于 作者: A A. Unique TensorFlower 提交者: TensorFlower Gardener

Fix NCCL UB issue

This is fixing a UB issue which occurs with newer version of Clang (17+).
The fix is also upstreamed through https://github.com/NVIDIA/nccl/pull/916.

In addition I'm changing the handling of `enqueue.cc` which needs to be compiled
in cuda mode under clang. The previous solution with just passing in the `-x cuda` option fails with CUDA 12+.

I'm also correcting the version number that we set in the patch - not sure if this version is reported in some logs, but if it is, it should be correct.

PiperOrigin-RevId: 564811002
上级 c2f38110
......@@ -366,7 +366,7 @@ dtensor_test(
env = {
"DTENSOR_GPU_USE_NCCL_COMMUNICATION": "1",
"NCCL_P2P_DISABLE": "1", # FIXME(b/251183104): p2p detection in cuda 10.1+ is broken.
"NCCL_PROTO": "Simple", # FIXME(b/272050398): Delete this when the Clang-16/NCCL incompatibility has been resolved.
"NCCL_PROTO": "Simple", # FIXME(b/272050398): Delete this after the migration to LLVM-17.
},
tags = [
"no_windows",
......@@ -412,7 +412,7 @@ dtensor_test(
env = {
"DTENSOR_GPU_USE_NCCL_COMMUNICATION": "1",
"NCCL_P2P_DISABLE": "1", # FIXME(b/251183104): p2p detection in cuda 10.1+ is broken.
"NCCL_PROTO": "Simple", # FIXME(b/272050398): Delete this when the Clang-16/NCCL incompatibility has been resolved.
"NCCL_PROTO": "Simple", # FIXME(b/272050398): Delete this after the migration to LLVM-17.
},
tags = [
"no_windows",
......
......@@ -8,7 +8,6 @@ exports_files(["LICENSE.txt"])
load(
"@local_config_cuda//cuda:build_defs.bzl",
"cuda_library",
"if_cuda_clang",
)
load(
"@local_config_nccl//:build_defs.bzl",
......@@ -124,19 +123,56 @@ cc_library(
],
)
cc_library(
alias(
name = "enqueue",
actual = select({
"@local_config_cuda//cuda:using_clang": ":enqueue_clang",
"@local_config_cuda//cuda:using_nvcc": ":enqueue_nvcc",
}),
)
# Kernels and their names have special treatment in CUDA compilation.
# Specifically, the host-side kernel launch stub (host-side representation of
# the kernel) ends up having the name which does not match the actual kernel
# name. In order to correctly refer to the kernel the referring code must be
# compiled as CUDA.
cuda_library(
name = "enqueue_clang",
srcs = [
"src/enqueue.cc",
],
hdrs = ["src/nccl.h"],
copts = [
"--cuda-host-only",
],
include_prefix = "third_party/nccl",
linkopts = ["-lrt"],
strip_include_prefix = "src",
target_compatible_with = select({
"@local_config_cuda//cuda:using_clang": [],
"//conditions:default": ["@platforms//:incompatible"],
}),
visibility = ["//visibility:public"],
deps = [
":device",
":include_hdrs",
":src_hdrs",
],
)
cc_library(
name = "enqueue_nvcc",
srcs = [
"src/enqueue.cc",
],
hdrs = ["src/nccl.h"],
copts = if_cuda_clang([
"-x",
"cuda",
]),
include_prefix = "third_party/nccl",
linkopts = ["-lrt"],
strip_include_prefix = "src",
target_compatible_with = select({
"@local_config_cuda//cuda:using_nvcc": [],
"//conditions:default": ["@platforms//:incompatible"],
}),
visibility = ["//visibility:public"],
deps = [
":device",
......
......@@ -46,12 +46,12 @@ index 985274e..7ebb1e1 100644
-#define NCCL_PATCH ${nccl:Patch}
-#define NCCL_SUFFIX "${nccl:Suffix}"
+#define NCCL_MAJOR 2
+#define NCCL_MINOR 13
+#define NCCL_PATCH 4
+#define NCCL_MINOR 16
+#define NCCL_PATCH 5
+#define NCCL_SUFFIX ""
-#define NCCL_VERSION_CODE ${nccl:Version}
+#define NCCL_VERSION_CODE 2134
+#define NCCL_VERSION_CODE 2165
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
#ifdef __cplusplus
......@@ -67,4 +67,18 @@ index 2aeb932..cdc67d2 100644
+ explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
{
schema_attr.name = schemaName;
schema_attr.entries = entries;
\ No newline at end of file
schema_attr.entries = entries;
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index accf8371a..4ab1bfac6 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -166,7 +166,8 @@ __device__ void ncclKernel(
bytes = 0;
break;
}
- copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+ if (bytes)
+ copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
}
__syncthreads(); // publish ncclShmem
\ No newline at end of file
......@@ -8,7 +8,6 @@ exports_files(["LICENSE.txt"])
load(
"@local_config_cuda//cuda:build_defs.bzl",
"cuda_library",
"if_cuda_clang",
)
load(
"@local_config_nccl//:build_defs.bzl",
......@@ -124,19 +123,56 @@ cc_library(
],
)
cc_library(
alias(
name = "enqueue",
actual = select({
"@local_config_cuda//cuda:using_clang": ":enqueue_clang",
"@local_config_cuda//cuda:using_nvcc": ":enqueue_nvcc",
}),
)
# Kernels and their names have special treatment in CUDA compilation.
# Specifically, the host-side kernel launch stub (host-side representation of
# the kernel) ends up having the name which does not match the actual kernel
# name. In order to correctly refer to the kernel the referring code must be
# compiled as CUDA.
cuda_library(
name = "enqueue_clang",
srcs = [
"src/enqueue.cc",
],
hdrs = ["src/nccl.h"],
copts = [
"--cuda-host-only",
],
include_prefix = "third_party/nccl",
linkopts = ["-lrt"],
strip_include_prefix = "src",
target_compatible_with = select({
"@local_config_cuda//cuda:using_clang": [],
"//conditions:default": ["@platforms//:incompatible"],
}),
visibility = ["//visibility:public"],
deps = [
":device",
":include_hdrs",
":src_hdrs",
],
)
cc_library(
name = "enqueue_nvcc",
srcs = [
"src/enqueue.cc",
],
hdrs = ["src/nccl.h"],
copts = if_cuda_clang([
"-x",
"cuda",
]),
include_prefix = "third_party/nccl",
linkopts = ["-lrt"],
strip_include_prefix = "src",
target_compatible_with = select({
"@local_config_cuda//cuda:using_nvcc": [],
"//conditions:default": ["@platforms//:incompatible"],
}),
visibility = ["//visibility:public"],
deps = [
":device",
......
......@@ -46,12 +46,12 @@ index 985274e..7ebb1e1 100644
-#define NCCL_PATCH ${nccl:Patch}
-#define NCCL_SUFFIX "${nccl:Suffix}"
+#define NCCL_MAJOR 2
+#define NCCL_MINOR 13
+#define NCCL_PATCH 4
+#define NCCL_MINOR 16
+#define NCCL_PATCH 5
+#define NCCL_SUFFIX ""
-#define NCCL_VERSION_CODE ${nccl:Version}
+#define NCCL_VERSION_CODE 2134
+#define NCCL_VERSION_CODE 2165
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
#ifdef __cplusplus
......@@ -67,4 +67,18 @@ index 2aeb932..cdc67d2 100644
+ explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
{
schema_attr.name = schemaName;
schema_attr.entries = entries;
\ No newline at end of file
schema_attr.entries = entries;
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index accf8371a..4ab1bfac6 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -166,7 +166,8 @@ __device__ void ncclKernel(
bytes = 0;
break;
}
- copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+ if (bytes)
+ copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
}
__syncthreads(); // publish ncclShmem
\ No newline at end of file
......@@ -8,7 +8,6 @@ exports_files(["LICENSE.txt"])
load(
"@local_config_cuda//cuda:build_defs.bzl",
"cuda_library",
"if_cuda_clang",
)
load(
"@local_config_nccl//:build_defs.bzl",
......@@ -124,19 +123,56 @@ cc_library(
],
)
cc_library(
alias(
name = "enqueue",
actual = select({
"@local_config_cuda//cuda:using_clang": ":enqueue_clang",
"@local_config_cuda//cuda:using_nvcc": ":enqueue_nvcc",
}),
)
# Kernels and their names have special treatment in CUDA compilation.
# Specifically, the host-side kernel launch stub (host-side representation of
# the kernel) ends up having the name which does not match the actual kernel
# name. In order to correctly refer to the kernel the referring code must be
# compiled as CUDA.
cuda_library(
name = "enqueue_clang",
srcs = [
"src/enqueue.cc",
],
hdrs = ["src/nccl.h"],
copts = [
"--cuda-host-only",
],
include_prefix = "third_party/nccl",
linkopts = ["-lrt"],
strip_include_prefix = "src",
target_compatible_with = select({
"@local_config_cuda//cuda:using_clang": [],
"//conditions:default": ["@platforms//:incompatible"],
}),
visibility = ["//visibility:public"],
deps = [
":device",
":include_hdrs",
":src_hdrs",
],
)
cc_library(
name = "enqueue_nvcc",
srcs = [
"src/enqueue.cc",
],
hdrs = ["src/nccl.h"],
copts = if_cuda_clang([
"-x",
"cuda",
]),
include_prefix = "third_party/nccl",
linkopts = ["-lrt"],
strip_include_prefix = "src",
target_compatible_with = select({
"@local_config_cuda//cuda:using_nvcc": [],
"//conditions:default": ["@platforms//:incompatible"],
}),
visibility = ["//visibility:public"],
deps = [
":device",
......
......@@ -46,12 +46,12 @@ index 985274e..7ebb1e1 100644
-#define NCCL_PATCH ${nccl:Patch}
-#define NCCL_SUFFIX "${nccl:Suffix}"
+#define NCCL_MAJOR 2
+#define NCCL_MINOR 13
+#define NCCL_PATCH 4
+#define NCCL_MINOR 16
+#define NCCL_PATCH 5
+#define NCCL_SUFFIX ""
-#define NCCL_VERSION_CODE ${nccl:Version}
+#define NCCL_VERSION_CODE 2134
+#define NCCL_VERSION_CODE 2165
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
#ifdef __cplusplus
......@@ -67,4 +67,18 @@ index 2aeb932..cdc67d2 100644
+ explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
{
schema_attr.name = schemaName;
schema_attr.entries = entries;
\ No newline at end of file
schema_attr.entries = entries;
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index accf8371a..4ab1bfac6 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -166,7 +166,8 @@ __device__ void ncclKernel(
bytes = 0;
break;
}
- copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+ if (bytes)
+ copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
}
__syncthreads(); // publish ncclShmem
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册