Add detailed error message for curandStatus_t, cublasStatus_t, cusolverStatus_t (#30161)

404c1676 · WeiXin · GitHub · 91a8a257 · 404c1676
隐藏空白更改
内联并排

Showing 1 changed file with 64 additions and 29 deletions

paddle/fluid/platform/enforce.h paddle/fluid/platform/enforce.h +64 -29

未找到文件。
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -755,31 +755,37 @@ inline bool is_error(curandStatus_t stat) {
 inline const char* curandGetErrorString(curandStatus_t stat) {
  switch (stat) {
    case CURAND_STATUS_SUCCESS:
-      return "CURAND_STATUS_SUCCESS";
+      return "`CURAND_STATUS_SUCCESS`. No errors.";
    case CURAND_STATUS_VERSION_MISMATCH:
-      return "CURAND_STATUS_VERSION_MISMATCH";
+      return "`CURAND_STATUS_VERSION_MISMATCH`. Header file and linked library "
+             "version do not match.";
    case CURAND_STATUS_NOT_INITIALIZED:
-      return "CURAND_STATUS_NOT_INITIALIZED";
+      return "`CURAND_STATUS_NOT_INITIALIZED`. Generator not initialized.";
    case CURAND_STATUS_ALLOCATION_FAILED:
-      return "CURAND_STATUS_ALLOCATION_FAILED";
+      return "`CURAND_STATUS_ALLOCATION_FAILED`. Memory allocation failed.";
    case CURAND_STATUS_TYPE_ERROR:
-      return "CURAND_STATUS_TYPE_ERROR";
+      return "`CURAND_STATUS_TYPE_ERROR`. Generator is wrong type.";
    case CURAND_STATUS_OUT_OF_RANGE:
-      return "CURAND_STATUS_OUT_OF_RANGE";
+      return "`CURAND_STATUS_OUT_OF_RANGE`. Argument out of range.";
    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
-      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+      return "`CURAND_STATUS_LENGTH_NOT_MULTIPLE`. Length requested is not a "
+             "multiple of dimension.";
    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
-      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+      return "`CURAND_STATUS_DOUBLE_PRECISION_REQUIRED`. GPU does not have "
+             "double precision required by MRG32k3a.";
    case CURAND_STATUS_LAUNCH_FAILURE:
-      return "CURAND_STATUS_LAUNCH_FAILURE";
+      return "`CURAND_STATUS_LAUNCH_FAILURE`. Kernel launch failure.";
    case CURAND_STATUS_PREEXISTING_FAILURE:
-      return "CURAND_STATUS_PREEXISTING_FAILURE";
+      return "`CURAND_STATUS_PREEXISTING_FAILURE`. Preexisting failure on "
+             "library entry.";
    case CURAND_STATUS_INITIALIZATION_FAILED:
-      return "CURAND_STATUS_INITIALIZATION_FAILED";
+      return "`CURAND_STATUS_INITIALIZATION_FAILED`. Initialization of CUDA "
+             "failed.";
    case CURAND_STATUS_ARCH_MISMATCH:
-      return "CURAND_STATUS_ARCH_MISMATCH";
+      return "`CURAND_STATUS_ARCH_MISMATCH`. Architecture mismatch, GPU does "
+             "not support requested feature.";
    case CURAND_STATUS_INTERNAL_ERROR:
-      return "CURAND_STATUS_INTERNAL_ERROR";
+      return "`CURAND_STATUS_INTERNAL_ERROR`. Internal library error.";
    default:
      return "Unknown curand status";
  }
@@ -808,23 +814,37 @@ inline bool is_error(cublasStatus_t stat) {
 inline const char* cublasGetErrorString(cublasStatus_t stat) {
  switch (stat) {
    case CUBLAS_STATUS_NOT_INITIALIZED:
-      return "CUBLAS_STATUS_NOT_INITIALIZED";
+      return "`CUBLAS_STATUS_NOT_INITIALIZED`. The cuBLAS library was not "
+             "initialized.";
    case CUBLAS_STATUS_ALLOC_FAILED:
-      return "CUBLAS_STATUS_ALLOC_FAILED";
+      return "`CUBLAS_STATUS_ALLOC_FAILED`. Resource allocation failed inside "
+             "the cuBLAS library.";
    case CUBLAS_STATUS_INVALID_VALUE:
-      return "CUBLAS_STATUS_INVALID_VALUE";
+      return "`CUBLAS_STATUS_INVALID_VALUE`. An unsupported value or parameter "
+             "was passed to the function (a negative vector size, for "
+             "example).";
    case CUBLAS_STATUS_ARCH_MISMATCH:
-      return "CUBLAS_STATUS_ARCH_MISMATCH";
+      return "`CUBLAS_STATUS_ARCH_MISMATCH`. The function requires a feature "
+             "absent from the device architecture; usually caused by the lack "
+             "of support for double precision.";
    case CUBLAS_STATUS_MAPPING_ERROR:
-      return "CUBLAS_STATUS_MAPPING_ERROR";
+      return "`CUBLAS_STATUS_MAPPING_ERROR`. An access to GPU memory space "
+             "failed, which is usually caused by a failure to bind a texture.";
    case CUBLAS_STATUS_EXECUTION_FAILED:
-      return "CUBLAS_STATUS_EXECUTION_FAILED";
+      return "`CUBLAS_STATUS_EXECUTION_FAILED`. The GPU program failed to "
+             "execute. This is often caused by a launch failure of the kernel "
+             "on the GPU, which can be caused by multiple reasons.";
    case CUBLAS_STATUS_INTERNAL_ERROR:
-      return "CUBLAS_STATUS_INTERNAL_ERROR";
+      return "`CUBLAS_STATUS_INTERNAL_ERROR`. An internal cuBLAS operation "
+             "failed. This error is usually caused by a cudaMemcpyAsync() "
+             "failure.";
    case CUBLAS_STATUS_NOT_SUPPORTED:
-      return "CUBLAS_STATUS_NOT_SUPPORTED";
+      return "`CUBLAS_STATUS_NOT_SUPPORTED`. The functionality requested is "
+             "not supported.";
    case CUBLAS_STATUS_LICENSE_ERROR:
-      return "CUBLAS_STATUS_LICENSE_ERROR";
+      return "`CUBLAS_STATUS_LICENSE_ERROR`. The functionality requested "
+             "requires some license and an error was detected when trying to "
+             "check the current licensing.";
    default:
      return "Unknown cublas status";
  }
@@ -843,19 +863,34 @@ inline bool is_error(cusolverStatus_t stat) {
 inline const char* cusolverGetErrorString(cusolverStatus_t stat) {
  switch (stat) {
    case CUSOLVER_STATUS_NOT_INITIALIZED:
-      return "CUSOLVER_STATUS_NOT_INITIALIZED";
+      return "`CUSOLVER_STATUS_NOT_INITIALIZED`. The cuSolver library was not "
+             "initialized. This is usually caused by the lack of a prior call, "
+             "an error in the CUDA Runtime API called by the cuSolver routine, "
+             "or an error in the hardware setup.";
    case CUSOLVER_STATUS_ALLOC_FAILED:
-      return "CUSOLVER_STATUS_ALLOC_FAILED";
+      return "`CUSOLVER_STATUS_ALLOC_FAILED`. Resource allocation failed "
+             "inside the cuSolver library. This is usually caused by a "
+             "cudaMalloc() failure.";
    case CUSOLVER_STATUS_INVALID_VALUE:
-      return "CUSOLVER_STATUS_INVALID_VALUE";
+      return "`CUSOLVER_STATUS_INVALID_VALUE`. An unsupported value or "
+             "parameter was passed to the function (a negative vector size, "
+             "for example).";
    case CUSOLVER_STATUS_ARCH_MISMATCH:
-      return "CUSOLVER_STATUS_ARCH_MISMATCH";
+      return "`CUSOLVER_STATUS_ARCH_MISMATCH`. The function requires a feature "
+             "absent from the device architecture; usually caused by the lack "
+             "of support for atomic operations or double precision.";
    case CUSOLVER_STATUS_EXECUTION_FAILED:
-      return "CUSOLVER_STATUS_EXECUTION_FAILED";
+      return "`CUSOLVER_STATUS_EXECUTION_FAILED`. The GPU program failed to "
+             "execute. This is often caused by a launch failure of the kernel "
+             "on the GPU, which can be caused by multiple reasons.";
    case CUSOLVER_STATUS_INTERNAL_ERROR:
-      return "CUSOLVER_STATUS_INTERNAL_ERROR";
+      return "`CUSOLVER_STATUS_INTERNAL_ERROR`. An internal cuSolver operation "
+             "failed. This error is usually caused by a cudaMemcpyAsync() "
+             "failure.";
    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
-      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+      return "`CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED`. The matrix type is "
+             "not supported by this function. This is usually caused by "
+             "passing an invalid matrix descriptor to the function.";
    default:
      return "Unknown cusolver status";
  }