提交 772e0956 编写于 作者: Z zhouwei25 提交者: liuwei1031

Optimize the content of error reporting information: print the error code and the official documentation web site.

Optimize the content of error reporting information, print error code and official document web sites (#18671)

 Optimize the error reporting information of CUDA-related APIs.
 index on develop: 130ac177 Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into develop
上级 ae58afc5
...@@ -79,6 +79,12 @@ DEFINE_string(selected_gpus, "", ...@@ -79,6 +79,12 @@ DEFINE_string(selected_gpus, "",
namespace paddle { namespace paddle {
namespace platform { namespace platform {
inline std::string CudaErrorWebsite() {
return "Please see detail in https://docs.nvidia.com/cuda/cuda-runtime-api"
"/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c217824"
"6db0a94a430e0038";
}
static int GetCUDADeviceCountImpl() { static int GetCUDADeviceCountImpl() {
const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES"); const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES");
if (cuda_visible_devices != nullptr) { if (cuda_visible_devices != nullptr) {
...@@ -92,9 +98,12 @@ static int GetCUDADeviceCountImpl() { ...@@ -92,9 +98,12 @@ static int GetCUDADeviceCountImpl() {
} }
int count; int count;
auto error_code = cudaGetDeviceCount(&count);
PADDLE_ENFORCE( PADDLE_ENFORCE(
cudaGetDeviceCount(&count), error_code,
"cudaGetDeviceCount failed in paddle::platform::GetCUDADeviceCount"); "cudaGetDeviceCount failed in "
"paddle::platform::GetCUDADeviceCountImpl, error code : %d, %s",
error_code, CudaErrorWebsite());
return count; return count;
} }
...@@ -107,28 +116,33 @@ int GetCUDAComputeCapability(int id) { ...@@ -107,28 +116,33 @@ int GetCUDAComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
cudaDeviceProp device_prop; cudaDeviceProp device_prop;
auto error_code = cudaGetDeviceProperties(&device_prop, id); auto error_code = cudaGetDeviceProperties(&device_prop, id);
PADDLE_ENFORCE(error_code, PADDLE_ENFORCE(
"cudaGetDeviceProperties failed in " error_code,
"paddle::platform::GetCUDAComputeCapability, error code : %d", "cudaGetDeviceProperties failed in "
error_code); "paddle::platform::GetCUDAComputeCapability, error code : %d, %s",
error_code, CudaErrorWebsite());
return device_prop.major * 10 + device_prop.minor; return device_prop.major * 10 + device_prop.minor;
} }
int GetCUDARuntimeVersion(int id) { int GetCUDARuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
int runtime_version = 0; int runtime_version = 0;
PADDLE_ENFORCE(cudaRuntimeGetVersion(&runtime_version), auto error_code = cudaRuntimeGetVersion(&runtime_version);
PADDLE_ENFORCE(error_code,
"cudaRuntimeGetVersion failed in " "cudaRuntimeGetVersion failed in "
"paddle::platform::cudaRuntimeGetVersion"); "paddle::platform::GetCUDARuntimeVersion, error code : %d, %s",
error_code, CudaErrorWebsite());
return runtime_version; return runtime_version;
} }
int GetCUDADriverVersion(int id) { int GetCUDADriverVersion(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
int driver_version = 0; int driver_version = 0;
PADDLE_ENFORCE(cudaDriverGetVersion(&driver_version), auto error_code = cudaDriverGetVersion(&driver_version);
PADDLE_ENFORCE(error_code,
"cudaDriverGetVersion failed in " "cudaDriverGetVersion failed in "
"paddle::platform::GetCUDADriverVersion"); "paddle::platform::GetCUDADriverVersion, error code : %d, %s",
error_code, CudaErrorWebsite());
return driver_version; return driver_version;
} }
...@@ -145,28 +159,35 @@ bool TensorCoreAvailable() { ...@@ -145,28 +159,35 @@ bool TensorCoreAvailable() {
int GetCUDAMultiProcessors(int id) { int GetCUDAMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
int count; int count;
PADDLE_ENFORCE( auto error_code =
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id), cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id);
"cudaDeviceGetAttribute failed in " PADDLE_ENFORCE(error_code,
"paddle::platform::GetCUDAMultiProcessors"); "cudaDeviceGetAttribute failed in "
"paddle::platform::GetCUDAMultiProcess, error code : %d, %s",
error_code, CudaErrorWebsite());
return count; return count;
} }
int GetCUDAMaxThreadsPerMultiProcessor(int id) { int GetCUDAMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
int count; int count;
PADDLE_ENFORCE(cudaDeviceGetAttribute( auto error_code = cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id), &count, cudaDevAttrMaxThreadsPerMultiProcessor, id);
"cudaDeviceGetAttribute failed in " PADDLE_ENFORCE(
"paddle::platform::GetCUDAMaxThreadsPerMultiProcessor"); error_code,
"cudaDeviceGetAttribute failed in paddle::"
"platform::GetCUDAMaxThreadsPerMultiProcessor, error code : %d, %s",
error_code, CudaErrorWebsite());
return count; return count;
} }
int GetCurrentDeviceId() { int GetCurrentDeviceId() {
int device_id; int device_id;
PADDLE_ENFORCE( auto error_code = cudaGetDevice(&device_id);
cudaGetDevice(&device_id), PADDLE_ENFORCE(error_code,
"cudaGetDevice failed in paddle::platform::GetCurrentDeviceId"); "cudaGetDevice failed in "
"paddle::platform::GetCurrentDeviceId, error code : %d, %s",
error_code, CudaErrorWebsite());
return device_id; return device_id;
} }
...@@ -191,13 +212,19 @@ std::vector<int> GetSelectedDevices() { ...@@ -191,13 +212,19 @@ std::vector<int> GetSelectedDevices() {
void SetDeviceId(int id) { void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count // TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
PADDLE_ENFORCE(cudaSetDevice(id), auto error_code = cudaSetDevice(id);
"cudaSetDevice failed in paddle::platform::SetDeviceId"); PADDLE_ENFORCE(error_code,
"cudaSetDevice failed in "
"paddle::platform::SetDeviced, error code : %d, %s",
error_code, CudaErrorWebsite());
} }
void GpuMemoryUsage(size_t *available, size_t *total) { void GpuMemoryUsage(size_t *available, size_t *total) {
PADDLE_ENFORCE(cudaMemGetInfo(available, total), auto error_code = cudaMemGetInfo(available, total);
"cudaMemGetInfo failed in paddle::platform::GetMemoryUsage"); PADDLE_ENFORCE(error_code,
"cudaMemGetInfo failed in "
"paddle::platform::GetMemoryUsage, error code : %d, %s",
error_code, CudaErrorWebsite());
} }
size_t GpuMaxAllocSize() { size_t GpuMaxAllocSize() {
...@@ -224,11 +251,13 @@ size_t GpuInitAllocSize() { ...@@ -224,11 +251,13 @@ size_t GpuInitAllocSize() {
size_t GpuReallocSize() { size_t GpuReallocSize() {
if (FLAGS_reallocate_gpu_memory_in_mb > 0ul) { if (FLAGS_reallocate_gpu_memory_in_mb > 0ul) {
// Additional memory will be allocated by FLAGS_reallocate_gpu_memory_in_mb // Additional memory will be allocated by
// FLAGS_reallocate_gpu_memory_in_mb
return static_cast<size_t>(FLAGS_reallocate_gpu_memory_in_mb << 20); return static_cast<size_t>(FLAGS_reallocate_gpu_memory_in_mb << 20);
} }
// FLAGS_reallocate_gpu_memory_in_mb is 0, additional memory will be allocated // FLAGS_reallocate_gpu_memory_in_mb is 0, additional memory will be
// allocated
// by fraction // by fraction
size_t total = 0; size_t total = 0;
size_t available = 0; size_t available = 0;
...@@ -268,37 +297,50 @@ size_t GpuMaxChunkSize() { ...@@ -268,37 +297,50 @@ size_t GpuMaxChunkSize() {
void GpuMemcpyAsync(void *dst, const void *src, size_t count, void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream) { enum cudaMemcpyKind kind, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), auto error_code = cudaMemcpyAsync(dst, src, count, kind, stream);
PADDLE_ENFORCE(error_code,
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync " "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync "
"(%p -> %p, length: %d)", "(%p -> %p, length: %d) error code : %d, %s",
src, dst, static_cast<int>(count)); src, dst, static_cast<int>(count), error_code,
CudaErrorWebsite());
} }
void GpuMemcpySync(void *dst, const void *src, size_t count, void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind) { enum cudaMemcpyKind kind) {
PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind), auto error_code = cudaMemcpy(dst, src, count, kind);
"cudaMemcpy failed in paddle::platform::GpuMemcpySync (%p -> " PADDLE_ENFORCE(error_code,
"%p, length: %d)", "cudaMemcpy failed in paddle::platform::GpuMemcpySync "
src, dst, static_cast<int>(count)); "(%p -> %p, length: %d) error code : %d, %s",
src, dst, static_cast<int>(count), error_code,
CudaErrorWebsite());
} }
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, cudaStream_t stream) { int src_device, size_t count, cudaStream_t stream) {
auto error_code =
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream);
PADDLE_ENFORCE( PADDLE_ENFORCE(
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream), error_code,
"cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync"); "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync "
"error code : %d, %s",
error_code, CudaErrorWebsite());
} }
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) { int src_device, size_t count) {
PADDLE_ENFORCE( auto error_code = cudaMemcpyPeer(dst, dst_device, src, src_device, count);
cudaMemcpyPeer(dst, dst_device, src, src_device, count), PADDLE_ENFORCE(error_code,
"cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync"); "cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync "
"error code : %d, %s",
error_code, CudaErrorWebsite());
} }
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream), auto error_code = cudaMemsetAsync(dst, value, count, stream);
"cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync"); PADDLE_ENFORCE(error_code,
"cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync "
"error code : %d, %s",
error_code, CudaErrorWebsite());
} }
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册