提交 772e0956 编写于 作者: Z zhouwei25 提交者: liuwei1031

Optimize the content of error reporting information, print error code and...

Optimize the content of error reporting information, print error code and official document web sites (#18671)

 Optimize the error-reporting information of the CUDA-related APIs.
 index on develop: 130ac177 Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into develop
上级 ae58afc5
......@@ -79,6 +79,12 @@ DEFINE_string(selected_gpus, "",
namespace paddle {
namespace platform {
// Returns a fixed help string pointing readers at the official CUDA
// runtime error-code reference; it is appended to CUDA failure messages
// raised elsewhere in this file.
inline std::string CudaErrorWebsite() {
  static const char kCudaErrorDoc[] =
      "Please see detail in https://docs.nvidia.com/cuda/cuda-runtime-api"
      "/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c217824"
      "6db0a94a430e0038";
  return std::string(kCudaErrorDoc);
}
static int GetCUDADeviceCountImpl() {
const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES");
if (cuda_visible_devices != nullptr) {
......@@ -92,9 +98,12 @@ static int GetCUDADeviceCountImpl() {
}
int count;
auto error_code = cudaGetDeviceCount(&count);
PADDLE_ENFORCE(
cudaGetDeviceCount(&count),
"cudaGetDeviceCount failed in paddle::platform::GetCUDADeviceCount");
error_code,
"cudaGetDeviceCount failed in "
"paddle::platform::GetCUDADeviceCountImpl, error code : %d, %s",
error_code, CudaErrorWebsite());
return count;
}
......@@ -107,28 +116,33 @@ int GetCUDAComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
cudaDeviceProp device_prop;
auto error_code = cudaGetDeviceProperties(&device_prop, id);
PADDLE_ENFORCE(error_code,
"cudaGetDeviceProperties failed in "
"paddle::platform::GetCUDAComputeCapability, error code : %d",
error_code);
PADDLE_ENFORCE(
error_code,
"cudaGetDeviceProperties failed in "
"paddle::platform::GetCUDAComputeCapability, error code : %d, %s",
error_code, CudaErrorWebsite());
return device_prop.major * 10 + device_prop.minor;
}
// Returns the installed CUDA runtime version as reported by
// cudaRuntimeGetVersion (e.g. 9020 for CUDA 9.2).
// NOTE(review): `id` is only range-checked here; the runtime version is not
// per-device, so the parameter is otherwise unused.
int GetCUDARuntimeVersion(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
  int runtime_version = 0;
  auto error_code = cudaRuntimeGetVersion(&runtime_version);
  // PADDLE_ENFORCE on a cudaError_t fails for anything other than success;
  // the message carries the numeric code plus a doc link for diagnosis.
  PADDLE_ENFORCE(error_code,
                 "cudaRuntimeGetVersion failed in "
                 "paddle::platform::GetCUDARuntimeVersion, error code : %d, %s",
                 error_code, CudaErrorWebsite());
  return runtime_version;
}
// Returns the installed CUDA driver version as reported by
// cudaDriverGetVersion.
// NOTE(review): `id` is only range-checked here; the driver version is not
// per-device, so the parameter is otherwise unused.
int GetCUDADriverVersion(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
  int driver_version = 0;
  auto error_code = cudaDriverGetVersion(&driver_version);
  PADDLE_ENFORCE(error_code,
                 "cudaDriverGetVersion failed in "
                 "paddle::platform::GetCUDADriverVersion, error code : %d, %s",
                 error_code, CudaErrorWebsite());
  return driver_version;
}
......@@ -145,28 +159,35 @@ bool TensorCoreAvailable() {
// Returns the number of streaming multiprocessors on device `id`
// (cudaDevAttrMultiProcessorCount).
int GetCUDAMultiProcessors(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
  int count;
  auto error_code =
      cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id);
  // Fixed: the message previously said "GetCUDAMultiProcess", which is not
  // the name of this function.
  PADDLE_ENFORCE(error_code,
                 "cudaDeviceGetAttribute failed in "
                 "paddle::platform::GetCUDAMultiProcessors, error code : %d, %s",
                 error_code, CudaErrorWebsite());
  return count;
}
// Returns the maximum resident threads per multiprocessor for device `id`
// (cudaDevAttrMaxThreadsPerMultiProcessor).
int GetCUDAMaxThreadsPerMultiProcessor(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
  int count;
  auto error_code = cudaDeviceGetAttribute(
      &count, cudaDevAttrMaxThreadsPerMultiProcessor, id);
  PADDLE_ENFORCE(
      error_code,
      "cudaDeviceGetAttribute failed in paddle::"
      "platform::GetCUDAMaxThreadsPerMultiProcessor, error code : %d, %s",
      error_code, CudaErrorWebsite());
  return count;
}
// Returns the device currently selected for the calling host thread
// (cudaGetDevice).
int GetCurrentDeviceId() {
  int device_id;
  auto error_code = cudaGetDevice(&device_id);
  PADDLE_ENFORCE(error_code,
                 "cudaGetDevice failed in "
                 "paddle::platform::GetCurrentDeviceId, error code : %d, %s",
                 error_code, CudaErrorWebsite());
  return device_id;
}
......@@ -191,13 +212,19 @@ std::vector<int> GetSelectedDevices() {
// Selects device `id` for subsequent CUDA calls on the calling host thread.
void SetDeviceId(int id) {
  // TODO(qijun): find a better way to cache the cuda device count
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
  auto error_code = cudaSetDevice(id);
  // Fixed: the message previously said "SetDeviced" — a typo for this
  // function's actual name.
  PADDLE_ENFORCE(error_code,
                 "cudaSetDevice failed in "
                 "paddle::platform::SetDeviceId, error code : %d, %s",
                 error_code, CudaErrorWebsite());
}
// Queries free and total memory (in bytes) of the current device via
// cudaMemGetInfo, writing the results through `available` and `total`.
void GpuMemoryUsage(size_t *available, size_t *total) {
  auto error_code = cudaMemGetInfo(available, total);
  // Fixed: the message previously said "GetMemoryUsage", which is not the
  // name of this function.
  PADDLE_ENFORCE(error_code,
                 "cudaMemGetInfo failed in "
                 "paddle::platform::GpuMemoryUsage, error code : %d, %s",
                 error_code, CudaErrorWebsite());
}
size_t GpuMaxAllocSize() {
......@@ -224,11 +251,13 @@ size_t GpuInitAllocSize() {
size_t GpuReallocSize() {
if (FLAGS_reallocate_gpu_memory_in_mb > 0ul) {
// Additional memory will be allocated by FLAGS_reallocate_gpu_memory_in_mb
// Additional memory will be allocated by
// FLAGS_reallocate_gpu_memory_in_mb
return static_cast<size_t>(FLAGS_reallocate_gpu_memory_in_mb << 20);
}
// FLAGS_reallocate_gpu_memory_in_mb is 0, additional memory will be allocated
// FLAGS_reallocate_gpu_memory_in_mb is 0, additional memory will be
// allocated
// by fraction
size_t total = 0;
size_t available = 0;
......@@ -268,37 +297,50 @@ size_t GpuMaxChunkSize() {
// Asynchronous memcpy of `count` bytes from `src` to `dst` on `stream`.
// `kind` selects the copy direction (H2D/D2H/D2D/...).
// NOTE(review): for host memory the async copy only truly overlaps when the
// host buffer is pinned — not checked here.
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind, cudaStream_t stream) {
  auto error_code = cudaMemcpyAsync(dst, src, count, kind, stream);
  PADDLE_ENFORCE(error_code,
                 "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync "
                 "(%p -> %p, length: %d) error code : %d, %s",
                 src, dst, static_cast<int>(count), error_code,
                 CudaErrorWebsite());
}
// Blocking memcpy of `count` bytes from `src` to `dst`; `kind` selects the
// copy direction (H2D/D2H/D2D/...).
void GpuMemcpySync(void *dst, const void *src, size_t count,
                   enum cudaMemcpyKind kind) {
  auto error_code = cudaMemcpy(dst, src, count, kind);
  PADDLE_ENFORCE(error_code,
                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync "
                 "(%p -> %p, length: %d) error code : %d, %s",
                 src, dst, static_cast<int>(count), error_code,
                 CudaErrorWebsite());
}
// Asynchronous device-to-device copy of `count` bytes from `src` on
// `src_device` to `dst` on `dst_device`, enqueued on `stream`.
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
                        int src_device, size_t count, cudaStream_t stream) {
  auto error_code =
      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream);
  PADDLE_ENFORCE(
      error_code,
      "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync "
      "error code : %d, %s",
      error_code, CudaErrorWebsite());
}
// Blocking device-to-device copy of `count` bytes from `src` on `src_device`
// to `dst` on `dst_device`.
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
                       int src_device, size_t count) {
  auto error_code = cudaMemcpyPeer(dst, dst_device, src, src_device, count);
  PADDLE_ENFORCE(error_code,
                 "cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync "
                 "error code : %d, %s",
                 error_code, CudaErrorWebsite());
}
// Asynchronously sets `count` bytes at `dst` to the byte value of `value`
// on `stream` (byte-wise fill, like memset — not an element-wise fill).
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
  auto error_code = cudaMemsetAsync(dst, value, count, stream);
  PADDLE_ENFORCE(error_code,
                 "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync "
                 "error code : %d, %s",
                 error_code, CudaErrorWebsite());
}
} // namespace platform
} // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册