未验证 提交 b425215a 编写于 作者: Z Zhou Wei 提交者: GitHub

Unify all external API error message mechanism and enhance third-party API error msg (#33003)

* Unify all external API error message mechanism and enhance third-party API error msg

* fix some comment

* fix some comment
上级 e05a7a49
...@@ -146,11 +146,11 @@ copy(inference_lib_dist ...@@ -146,11 +146,11 @@ copy(inference_lib_dist
SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
DSTS ${dst_dir}) DSTS ${dst_dir})
# Only GPU need cudaErrorMessage.pb # GPU must copy externalErrorMsg.pb
IF(WITH_GPU) IF(WITH_GPU)
set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data") set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/externalError/data")
copy(inference_lib_dist copy(inference_lib_dist
SRCS ${cudaerror_INCLUDE_DIR} SRCS ${externalError_INCLUDE_DIR}
DSTS ${dst_dir}) DSTS ${dst_dir})
ENDIF() ENDIF()
...@@ -259,7 +259,7 @@ copy(fluid_lib_dist ...@@ -259,7 +259,7 @@ copy(fluid_lib_dist
set(module "platform") set(module "platform")
set(platform_lib_deps profiler_proto error_codes_proto) set(platform_lib_deps profiler_proto error_codes_proto)
if(WITH_GPU) if(WITH_GPU)
set(platform_lib_deps ${platform_lib_deps} cuda_error_proto) set(platform_lib_deps ${platform_lib_deps} external_error_proto)
endif(WITH_GPU) endif(WITH_GPU)
add_dependencies(fluid_lib_dist ${platform_lib_deps}) add_dependencies(fluid_lib_dist ${platform_lib_deps})
......
...@@ -111,10 +111,11 @@ FUNCTION(file_download_and_uncompress URL NAME) ...@@ -111,10 +111,11 @@ FUNCTION(file_download_and_uncompress URL NAME)
MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}") MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}")
SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data PARENT_SCOPE) SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data PARENT_SCOPE)
ExternalProject_Add( ExternalProject_Add(
extern_download_${NAME} download_${NAME}
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${THIRD_PARTY_PATH}/${NAME} PREFIX ${THIRD_PARTY_PATH}/${NAME}
URL ${URL} URL ${URL}
TIMEOUT 120
DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/
SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/
DOWNLOAD_NO_PROGRESS 1 DOWNLOAD_NO_PROGRESS 1
...@@ -123,7 +124,7 @@ FUNCTION(file_download_and_uncompress URL NAME) ...@@ -123,7 +124,7 @@ FUNCTION(file_download_and_uncompress URL NAME)
UPDATE_COMMAND "" UPDATE_COMMAND ""
INSTALL_COMMAND "" INSTALL_COMMAND ""
) )
set(third_party_deps ${third_party_deps} extern_download_${NAME} PARENT_SCOPE) set(third_party_deps ${third_party_deps} download_${NAME} PARENT_SCOPE)
ENDFUNCTION() ENDFUNCTION()
...@@ -242,8 +243,20 @@ if(WITH_GPU) ...@@ -242,8 +243,20 @@ if(WITH_GPU)
include(external/cub) # download cub include(external/cub) # download cub
list(APPEND third_party_deps extern_cub) list(APPEND third_party_deps extern_cub)
endif() endif()
set(CUDAERROR_URL "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE) set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz" CACHE STRING "" FORCE)
file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage file_download_and_uncompress(${URL} "externalError") # download file externalErrorMsg.tar.gz
if(WITH_TESTING)
# copy externalErrorMsg.pb for unittest 'enforce_test'
set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data)
if(WIN32 AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja"))
set(DST_DIR ${CMAKE_BINARY_DIR}/paddle/fluid/third_party/externalError/data)
else()
set(DST_DIR ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data)
endif()
add_custom_command(TARGET download_externalError POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR}
COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR}")
endif()
endif(WITH_GPU) endif(WITH_GPU)
if(WITH_XPU) if(WITH_XPU)
......
proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
proto_library(error_codes_proto SRCS error_codes.proto) proto_library(error_codes_proto SRCS error_codes.proto)
if(WITH_GPU) if(WITH_GPU)
proto_library(cuda_error_proto SRCS cuda_error.proto) proto_library(external_error_proto SRCS external_error.proto)
endif(WITH_GPU) endif(WITH_GPU)
if(WITH_XPU) if(WITH_XPU)
...@@ -45,7 +45,7 @@ cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) ...@@ -45,7 +45,7 @@ cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)
set(enforce_deps flags errors boost) set(enforce_deps flags errors boost)
if(WITH_GPU) if(WITH_GPU)
set(enforce_deps ${enforce_deps} cuda_error_proto) set(enforce_deps ${enforce_deps} external_error_proto)
endif() endif()
cc_library(enforce INTERFACE SRCS enforce.cc DEPS ${enforce_deps}) cc_library(enforce INTERFACE SRCS enforce.cc DEPS ${enforce_deps})
cc_library(monitor SRCS monitor.cc) cc_library(monitor SRCS monitor.cc)
......
...@@ -34,35 +34,6 @@ DECLARE_bool(cudnn_deterministic); ...@@ -34,35 +34,6 @@ DECLARE_bool(cudnn_deterministic);
namespace paddle { namespace paddle {
namespace platform { namespace platform {
inline const char* cudnnGetErrorString(cudnnStatus_t status) {
switch (status) {
case CUDNN_STATUS_SUCCESS:
return "CUDNN_STATUS_SUCCESS";
case CUDNN_STATUS_NOT_INITIALIZED:
return "CUDNN_STATUS_NOT_INITIALIZED";
case CUDNN_STATUS_ALLOC_FAILED:
return "CUDNN_STATUS_ALLOC_FAILED";
case CUDNN_STATUS_BAD_PARAM:
return "CUDNN_STATUS_BAD_PARAM";
case CUDNN_STATUS_INTERNAL_ERROR:
return "CUDNN_STATUS_INTERNAL_ERROR";
case CUDNN_STATUS_INVALID_VALUE:
return "CUDNN_STATUS_INVALID_VALUE";
case CUDNN_STATUS_ARCH_MISMATCH:
return "CUDNN_STATUS_ARCH_MISMATCH";
case CUDNN_STATUS_MAPPING_ERROR:
return "CUDNN_STATUS_MAPPING_ERROR";
case CUDNN_STATUS_EXECUTION_FAILED:
return "CUDNN_STATUS_EXECUTION_FAILED";
case CUDNN_STATUS_NOT_SUPPORTED:
return "CUDNN_STATUS_NOT_SUPPORTED";
case CUDNN_STATUS_LICENSE_ERROR:
return "CUDNN_STATUS_LICENSE_ERROR";
default:
return "Unknown cudnn error number";
}
}
#define CUDNN_VERSION_MIN(major, minor, patch) \ #define CUDNN_VERSION_MIN(major, minor, patch) \
(CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
......
此差异已折叠。
...@@ -304,6 +304,7 @@ bool CheckCudaStatusFailure(T value, const std::string& msg) { ...@@ -304,6 +304,7 @@ bool CheckCudaStatusFailure(T value, const std::string& msg) {
return false; return false;
} catch (paddle::platform::EnforceNotMet& error) { } catch (paddle::platform::EnforceNotMet& error) {
std::string ex_msg = error.what(); std::string ex_msg = error.what();
std::cout << ex_msg << std::endl;
return ex_msg.find(msg) != std::string::npos; return ex_msg.find(msg) != std::string::npos;
} }
} }
...@@ -338,30 +339,98 @@ TEST(enforce, hip_success) { ...@@ -338,30 +339,98 @@ TEST(enforce, hip_success) {
#else #else
TEST(enforce, cuda_success) { TEST(enforce, cuda_success) {
EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess)); EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error")); EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "CUDA error"));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "Cuda error"));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "CUDA error"));
EXPECT_TRUE(CheckCudaStatusFailure(
cudaErrorInsufficientDriver,
"This indicates that the installed NVIDIA CUDA driver is older than the "
"CUDA runtime library. This is not a supported configuration.Users "
"should install an updated NVIDIA display driver to allow the "
"application to run"));
EXPECT_TRUE(CheckCudaStatusFailure(
cudaErrorContextIsDestroyed,
"This error indicates that the context current to the calling thread has "
"been destroyed using cuCtxDestroy, or is a primary context which has "
"not yet been initialized"));
EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS)); EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS));
EXPECT_TRUE( EXPECT_TRUE(
CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "Curand error")); CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "CURAND error"));
EXPECT_TRUE( EXPECT_TRUE(
CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "Curand error")); CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "CURAND error"));
EXPECT_TRUE(CheckCudaStatusFailure(
CURAND_STATUS_ARCH_MISMATCH,
"Architecture mismatch, GPU does not support requested feature"));
EXPECT_TRUE(
CheckCudaStatusFailure(CURAND_STATUS_LENGTH_NOT_MULTIPLE,
"Length requested is not a multple of dimension"));
EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS)); EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS));
EXPECT_TRUE( EXPECT_TRUE(
CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "Cudnn error")); CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "CUDNN error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "Cudnn error")); EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "CUDNN error"));
EXPECT_TRUE(CheckCudaStatusFailure(
CUDNN_STATUS_BAD_PARAM,
"An incorrect value or parameter was passed to the function. To correct, "
"ensure that all the parameters being passed have valid values"));
EXPECT_TRUE(CheckCudaStatusFailure(
CUDNN_STATUS_LICENSE_ERROR,
"The functionality requested requires some license and an error was "
"detected when trying to check the current licensing. This error can "
"happen if the license is not present or is expired or if the "
"environment variable NVIDIA_LICENSE_FILE is not set properly"));
EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS)); EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS));
EXPECT_TRUE( EXPECT_TRUE(
CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "Cublas error")); CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "CUBLAS error"));
EXPECT_TRUE(
CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "CUBLAS error"));
EXPECT_TRUE(CheckCudaStatusFailure(
CUBLAS_STATUS_EXECUTION_FAILED,
"The GPU program failed to execute. This is often caused by a launch "
"failure of the kernel on the GPU, which can be caused by multiple "
"reasons. To correct: check that the hardware, an appropriate version "
"of the driver, and the cuBLAS library are correctly installed"));
EXPECT_TRUE(CheckCudaStatusFailure(
CUBLAS_STATUS_MAPPING_ERROR,
"An access to GPU memory space failed, which is usually caused by a "
"failure to bind a texture. To correct: prior to the function call, "
"unbind any previously bound textures"));
EXPECT_TRUE(CheckCudaStatusSuccess(CUSOLVER_STATUS_SUCCESS));
EXPECT_TRUE(CheckCudaStatusFailure(CUSOLVER_STATUS_NOT_INITIALIZED,
"CUSOLVER error"));
EXPECT_TRUE( EXPECT_TRUE(
CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "Cublas error")); CheckCudaStatusFailure(CUSOLVER_STATUS_ALLOC_FAILED, "CUSOLVER error"));
EXPECT_TRUE(CheckCudaStatusFailure(
CUSOLVER_STATUS_INTERNAL_ERROR,
"An internal cuSolver operation failed. This error is usually caused by "
"a cudaMemcpyAsync() failure.To correct: check that the hardware, an "
"appropriate version of the driver, and the cuSolver library are "
"correctly installed. Also, check that the memory passed as a parameter "
"to the routine is not being deallocated prior to the routine’s "
"completion"));
EXPECT_TRUE(CheckCudaStatusFailure(
CUSOLVER_STATUS_INVALID_VALUE,
"An unsupported value or parameter was passed to the function (a "
"negative vector size, for example).To correct: ensure that all the "
"parameters being passed have valid values"));
/*
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Nccl error")); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error"));
EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Nccl error")); EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "NCCL error"));
EXPECT_TRUE(CheckCudaStatusFailure(ncclInternalError,
"An internal check failed. This is either "
"a bug in NCCL or due to memory "
"corruption"));
EXPECT_TRUE(CheckCudaStatusFailure(ncclInvalidUsage,
"The call to NCCL is incorrect. This is "
"usually reflecting a programming error"));
#endif #endif
*/
} }
#endif #endif
#endif #endif
......
...@@ -15,21 +15,32 @@ limitations under the License. */ ...@@ -15,21 +15,32 @@ limitations under the License. */
syntax = "proto2"; syntax = "proto2";
package paddle.platform.proto; package paddle.platform.proto;
// (NOTE:zhouwei): ApiType describes which kind of external third party API
// More external third party API can be added.
enum ApiType {
CUDA = 0;
CURAND = 1;
CUDNN = 2;
CUBLAS = 3;
CUSOLVER = 4;
NCCL = 5;
}
message MessageDesc { message MessageDesc {
// Indicates the type of error // Indicates the code of error
required int32 errorCode = 1; required int32 code = 1;
// Indicates the message of error // Indicates the message of error
required string errorMessage = 2; required string message = 2;
} }
message AllMessageDesc { message AllMessageDesc {
// Version of cuda API // Indicates which kind of third-party API
required int32 version = 1; required ApiType type = 1;
// Error messages of different errortype // Error messages of different errortype
repeated MessageDesc Messages = 2; repeated MessageDesc messages = 2;
} }
message cudaerrorDesc { message ExternalErrorDesc {
// Error messages of different cuda versions(9.0/10.0/10.2) // Error messages of different kind of external third party API
repeated AllMessageDesc AllMessages = 2; repeated AllMessageDesc errors = 1;
} }
\ No newline at end of file
...@@ -76,6 +76,7 @@ if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 ...@@ -76,6 +76,7 @@ if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
rem -------set cache build directory----------- rem -------set cache build directory-----------
rmdir build\python /s/q rmdir build\python /s/q
rmdir build\paddle\third_party\externalError /s/q
rmdir build\paddle\fluid\pybind /s/q rmdir build\paddle\fluid\pybind /s/q
rmdir build\paddle_install_dir /s/q rmdir build\paddle_install_dir /s/q
rmdir build\paddle_inference_install_dir /s/q rmdir build\paddle_inference_install_dir /s/q
...@@ -506,7 +507,6 @@ echo ======================================== ...@@ -506,7 +507,6 @@ echo ========================================
echo Step 4. Running unit tests ... echo Step 4. Running unit tests ...
echo ======================================== echo ========================================
: set CI_SKIP_CPP_TEST if only *.py changed : set CI_SKIP_CPP_TEST if only *.py changed
git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON
......
...@@ -412,7 +412,8 @@ if '${WITH_MKLDNN}' == 'ON': ...@@ -412,7 +412,8 @@ if '${WITH_MKLDNN}' == 'ON':
headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn
if '${WITH_GPU}' == 'ON' or '${WITH_ROCM}' == 'ON': if '${WITH_GPU}' == 'ON' or '${WITH_ROCM}' == 'ON':
headers += list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}')) # errorMessage.pb for errormessage # externalErrorMsg.pb for External Error message
headers += list(find_files('*.pb', '${externalError_INCLUDE_DIR}'))
class InstallCommand(InstallCommandBase): class InstallCommand(InstallCommandBase):
def finalize_options(self): def finalize_options(self):
......
Usage:
Please run:
```
bash start.sh
```
The error message of CUDA9.0 / CUDA10.0 / CUDA-latest-version will be crawled by default.
To crawl a specific version of CUDA, run:
```
bash start.sh <version> <URL(optional)>
```
The URL is derived from the version by default, so you do not have to enter one.
For example:
```
bash start.sh 11.0
```
will capture the error messages of CUDA 11.0 (in the future).
Whenever NVIDIA releases a new major CUDA version, run `bash start.sh` in the current directory and upload the generated cudaErrorMessage.tar.gz to https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ssl
import re
import urllib2
import json
import collections
import sys, getopt
import cuda_error_pb2
def _inline_markup(text, element_pattern, content_pattern):
    """Replace every element matched by ``element_pattern`` with its inner text.

    Both patterns are evaluated against the unmodified ``text`` first; the
    i-th whole element is then replaced by the i-th captured content.
    """
    elements = re.findall(element_pattern, text, re.S | re.M)
    contents = re.findall(content_pattern, text, re.S | re.M)
    assert len(elements) == len(contents)
    for element, content in zip(elements, contents):
        text = text.replace(element, content)
    return text


def parsing(cuda_errorDesc, version, url):
    """Crawl the CUDA error table at ``url`` into ``cuda_errorDesc``.

    Adds one ``AllMessages`` entry whose ``version`` is ``int(version)`` and
    appends one ``Messages`` entry (``errorCode`` / ``errorMessage``) for every
    ``<dt>``/``<dd>`` pair found in the "CUDA error types" section.

    Raises:
        ValueError: if ``url`` has no ``cuda-runtime-api`` path segment, or an
            error code is neither decimal nor ``0x``-prefixed hexadecimal.
        AssertionError: if the markup of a description is unbalanced.
    """
    version_entry = cuda_errorDesc.AllMessages.add()
    version_entry.version = int(version)

    # Certificate verification is switched off process-wide before fetching.
    ssl._create_default_https_context = ssl._create_unverified_context
    page = urllib2.urlopen(url).read()
    enum_sections = re.findall(
        r'<div class="section">.*?<p>CUDA error types </p>.*?</div>.*?<div class="enum-members">(.*?)</div>',
        page, re.S | re.M)

    # The prefix itself is not used below; the lookup is kept because it
    # raises ValueError for URLs without a 'cuda-runtime-api' segment.
    segments = url.split('/')
    url_prefix = '/'.join(segments[0:segments.index('cuda-runtime-api') + 1])

    for section in enum_sections:
        pairs = re.findall(r'<dt>(.*?)</dt>.*?<dd>(.*?)</dd>', section,
                           re.S | re.M)
        for term, description in pairs:
            code_text = re.findall(r'<span class="ph ph apiData">(.*?)</span>',
                                   term, re.S | re.M)[0]

            text = description.replace('\n', '')
            text = _inline_markup(text, r'(<a class=.*?</a>)',
                                  r'<a class=.*?>(.*?)</a>')
            text = text.replace(
                '<h6 class=\"deprecated_header\">Deprecated</h6>', '')
            text = _inline_markup(text, r'(<span class=.*?</span>)',
                                  r'<span class=.*?>(.*?)</span>')
            text = _inline_markup(text, r'(<p>.*?</p>)', r'<p>(.*?)</p>')
            text = text.replace(' ', '')

            entry = version_entry.Messages.add()
            try:
                entry.errorCode = int(code_text)
            except ValueError:
                # Some codes are published as hexadecimal literals.
                if re.match('0x', code_text):
                    entry.errorCode = int(code_text, 16)
                else:
                    raise ValueError
            # Stored in the protobuf that is later written to cudaErrorMessage.pb.
            entry.errorMessage = text
def main(argv):
    """Command-line entry point: crawl the requested CUDA versions.

    Args:
        argv: argument list without the program name. ``-v/--version`` and
            ``-u/--url`` each take a comma-separated list; both lists must
            have the same length. ``-h/--help`` prints the usage line.

    Exits with status 2 on an unknown option or when ``-v`` or ``-u`` is
    missing. On success the serialized protobuf is written to
    ``cudaErrorMessage.pb`` in the current directory.
    """
    usage = 'python spider.py -v <version1,version2,...,> -u <url1,url2,...,>'
    # Strings, not lists: the values are split on ',' below, and a list
    # default made a missing option crash with AttributeError.
    version = ''
    url = ''
    try:
        opts, _ = getopt.getopt(argv, "hv:u:", ["help", "version=", "url="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-v", "--version"):
            version = arg
        elif opt in ("-u", "--url"):
            url = arg
    if not version or not url:
        # Both options are required; report usage instead of failing later.
        print(usage)
        sys.exit(2)
    version = version.split(',')
    url = url.split(',')
    assert len(version) == len(url)
    cuda_errorDesc = cuda_error_pb2.cudaerrorDesc()
    for idx in range(len(version)):
        if version[idx] == "-1":
            # "-1" stands for the latest CUDA version.
            print("crawling errorMessage for CUDA%s from %s" %
                  ("-latest-version", url[idx]))
        else:
            print("crawling errorMessage for CUDA%s from %s" %
                  (version[idx], url[idx]))
        parsing(cuda_errorDesc, version[idx], url[idx])

    serializeToString = cuda_errorDesc.SerializeToString()
    with open("cudaErrorMessage.pb", "wb") as f:
        # save for cudaErrorMessage.pb from python-protobuf interface
        f.write(serializeToString)
    print("crawling errorMessage for CUDA has been done!!!")
if __name__ == "__main__":
main(sys.argv[1:])
Usage:
Please run:
```
bash start.sh
```
To update all external error messages, run `bash start.sh` in the current directory,
and upload the generated file `externalErrorMsg.tar.gz` to https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ssl
import re
import urllib.request
import json
import collections
import sys, getopt
import external_error_pb2
def _fetch_html(url):
    """Download ``url`` and return the response body decoded as UTF-8."""
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def _unwrap_tags(text, outer_pattern, inner_pattern):
    """Replace every element matched by ``outer_pattern`` with its inner text.

    Both patterns are evaluated against the unmodified ``text`` first; the
    i-th whole element is then replaced by the i-th captured content.
    """
    outer = re.findall(outer_pattern, text, re.S | re.M)
    inner = re.findall(inner_pattern, text, re.S | re.M)
    assert len(outer) == len(inner)
    for whole, content in zip(outer, inner):
        text = text.replace(whole, content)
    return text


def _parse_status_code(literal):
    """Convert a decimal or ``0x``-prefixed hexadecimal literal to ``int``."""
    try:
        return int(literal)
    except ValueError:
        if re.match('0x', literal):
            return int(literal, 16)
        # Re-raise the original error so the offending literal stays visible.
        raise


def parsing(externalErrorDesc):
    """Crawl NVIDIA's online docs and fill ``externalErrorDesc`` with messages.

    For each of CUDA, CURAND, CUDNN, CUBLAS, CUSOLVER and NCCL (in that order)
    one entry is added to ``externalErrorDesc.errors``; its ``type`` is the
    matching ``external_error_pb2`` enum value and its ``messages`` receive
    one ``code``/``message`` pair per documented status. Each message is
    formatted as ``"'<STATUS_NAME>'. <description>"``.

    Raises:
        IndexError: if an expected section is not found in a page.
        KeyError: if a page lists a status missing from the local code tables.
        ValueError: if a crawled code is neither decimal nor hexadecimal.
        AssertionError: if the markup of a description is unbalanced.
    """
    #*********************************************************************************************#
    #*********************************** CUDA Error Message **************************************#
    print("start crawling errorMessage for nvidia CUDA API--->")
    url = 'https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038'

    allMessageDesc = externalErrorDesc.errors.add()
    allMessageDesc.type = external_error_pb2.CUDA

    # NOTE(security): this disables TLS certificate verification for every
    # later HTTPS request in this process. Kept to preserve existing
    # behavior; prefer a verified context where the CA bundle allows it.
    ssl._create_default_https_context = ssl._create_unverified_context
    html = _fetch_html(url)
    res_div = r'<div class="section">.*?<p>CUDA error types </p>.*?</div>.*?<div class="enum-members">(.*?)</div>'
    m_div = re.findall(res_div, html, re.S | re.M)[0]

    res_dt = r'<dt>(.*?)</dt>.*?<dd>(.*?)</dd>'
    m_dt = re.findall(res_dt, m_div, re.S | re.M)
    for error in m_dt:
        res_type = r'<span class="enum-member-name-def">(.*?) = <span class="ph ph apiData">(.*?)</span></span>'
        # m_type is (status name, numeric literal).
        m_type = re.findall(res_type, error[0], re.S | re.M)[0]
        m_message = error[1]
        m_message = m_message.replace('\n', '')
        m_message = _unwrap_tags(m_message, r'(<a class=.*?</a>)',
                                 r'<a class=.*?>(.*?)</a>')

        m_message = m_message.replace(
            '<h6 class=\"deprecated_header\">Deprecated</h6>', '')

        m_message = _unwrap_tags(m_message, r'(<span class=.*?</span>)',
                                 r'<span class=.*?>(.*?)</span>')
        m_message = _unwrap_tags(m_message, r'(<p>.*?</p>)', r'<p>(.*?)</p>')

        # NOTE(review): as written this removes every single space; the intent
        # may have been to collapse runs of spaces - confirm before changing.
        m_message = m_message.replace(' ', '')
        _Messages = allMessageDesc.messages.add()
        _Messages.code = _parse_status_code(m_type[1])
        _Messages.message = "'%s'. %s" % (m_type[0], m_message)
    print("End crawling errorMessage for nvidia CUDA API!\n")

    #***********************************************************************************************#
    #*********************************** CURAND Error Message **************************************#
    print("start crawling errorMessage for nvidia CURAND API--->")
    url = 'https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437'

    allMessageDesc = externalErrorDesc.errors.add()
    allMessageDesc.type = external_error_pb2.CURAND

    html = _fetch_html(url)
    res_div = r'<div class="section">.*?<p>CURAND function call status types </p>.*?</div>.*?<div class="enum-members">(.*?)</div>'
    m_div = re.findall(res_div, html, re.S | re.M)[0]

    res_dt = r'<dt>(.*?)</dt>.*?<dd>(.*?)</dd>'
    m_dt = re.findall(res_dt, m_div, re.S | re.M)
    for error in m_dt:
        res_type = r'<span class="enum-member-name-def">(.*?) = <span class="ph ph apiData">(.*?)</span></span>'
        m_type = re.findall(res_type, error[0], re.S | re.M)[0]
        m_message = error[1]

        _Messages = allMessageDesc.messages.add()
        _Messages.code = _parse_status_code(m_type[1])
        _Messages.message = "'%s'. %s" % (m_type[0], m_message)
    print("End crawling errorMessage for nvidia CURAND API!\n")

    #**************************************************************************************************#
    #*********************************** CUDNN Error Message ******************************************#
    # The crawled page yields status names only, so codes come from this table.
    cudnnStatus_t = {
        "CUDNN_STATUS_SUCCESS": 0,
        "CUDNN_STATUS_NOT_INITIALIZED": 1,
        "CUDNN_STATUS_ALLOC_FAILED": 2,
        "CUDNN_STATUS_BAD_PARAM": 3,
        "CUDNN_STATUS_INTERNAL_ERROR": 4,
        "CUDNN_STATUS_INVALID_VALUE": 5,
        "CUDNN_STATUS_ARCH_MISMATCH": 6,
        "CUDNN_STATUS_MAPPING_ERROR": 7,
        "CUDNN_STATUS_EXECUTION_FAILED": 8,
        "CUDNN_STATUS_NOT_SUPPORTED": 9,
        "CUDNN_STATUS_LICENSE_ERROR": 10,
        "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING": 11,
        "CUDNN_STATUS_RUNTIME_IN_PROGRESS": 12,
        "CUDNN_STATUS_RUNTIME_FP_OVERFLOW": 13,
    }

    print("start crawling errorMessage for nvidia CUDNN API--->")
    url = 'https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnStatus_t'

    allMessageDesc = externalErrorDesc.errors.add()
    allMessageDesc.type = external_error_pb2.CUDNN

    html = _fetch_html(url)

    res_div = r'<div class="section" id="cudnnStatus_t__section_lmp_dgr_2jb"><a name="cudnnStatus_t__section_lmp_dgr_2jb" shape="rect">(.*?)</div>'
    m_div = re.findall(res_div, html, re.S | re.M)[0]

    res_dt = r'<dt class="dt dlterm"><samp class="ph codeph">(.*?)</samp></dt>.*?<dd class="dd">(.*?)</dd>'
    m_dt = re.findall(res_dt, m_div, re.S | re.M)
    for error in m_dt:
        m_message = error[1]

        m_message = _unwrap_tags(m_message, r'<p class="p">.*?</p>',
                                 r'<p class="p">(.*?)</p>')
        m_message = _unwrap_tags(m_message, r'(<a class="xref".*?</a>)',
                                 r'<a class="xref".*?>(.*?)</a>')
        m_message = _unwrap_tags(m_message, r'(<span class="ph">.*?</span>)',
                                 r'<span class="ph">(.*?)</span>')
        m_message = _unwrap_tags(m_message,
                                 r'(<samp class="ph codeph">.*?</samp>)',
                                 r'<samp class="ph codeph">(.*?)</samp>')

        m_message = re.sub(r'\n +', ' ', m_message)

        _Messages = allMessageDesc.messages.add()
        _Messages.code = int(cudnnStatus_t[error[0]])
        _Messages.message = "'%s'. %s" % (error[0], m_message)
    print("End crawling errorMessage for nvidia CUDNN API!\n")

    #*************************************************************************************************#
    #*********************************** CUBLAS Error Message ****************************************#
    cublasStatus_t = {
        "CUBLAS_STATUS_SUCCESS": 0,
        "CUBLAS_STATUS_NOT_INITIALIZED": 1,
        "CUBLAS_STATUS_ALLOC_FAILED": 3,
        "CUBLAS_STATUS_INVALID_VALUE": 7,
        "CUBLAS_STATUS_ARCH_MISMATCH": 8,
        "CUBLAS_STATUS_MAPPING_ERROR": 11,
        "CUBLAS_STATUS_EXECUTION_FAILED": 13,
        "CUBLAS_STATUS_INTERNAL_ERROR": 14,
        "CUBLAS_STATUS_NOT_SUPPORTED": 15,
        "CUBLAS_STATUS_LICENSE_ERROR": 16
    }

    print("start crawling errorMessage for nvidia CUBLAS API--->")
    url = 'https://docs.nvidia.com/cuda/cublas/index.html#cublasstatus_t'

    allMessageDesc = externalErrorDesc.errors.add()
    allMessageDesc.type = external_error_pb2.CUBLAS

    html = _fetch_html(url)

    res_div = r'<p class="p">The type is used for function status returns. All cuBLAS library.*?<div class="tablenoborder">(.*?)</div>'
    m_div = re.findall(res_div, html, re.S | re.M)[0]

    res_dt = r'<p class="p"><samp class="ph codeph">(.*?)</samp></p>.*?colspan="1">(.*?)</td>'
    m_dt = re.findall(res_dt, m_div, re.S | re.M)
    for error in m_dt:
        m_message = error[1]
        m_message = re.sub(r'\n +', ' ', m_message)

        m_message = _unwrap_tags(m_message, r'<p class="p">.*?</p>',
                                 r'<p class="p">(.*?)</p>')
        m_message = _unwrap_tags(m_message,
                                 r'<samp class="ph codeph">.*?</samp>',
                                 r'<samp class="ph codeph">(.*?)</samp>')

        _Messages = allMessageDesc.messages.add()
        _Messages.code = int(cublasStatus_t[error[0]])
        _Messages.message = "'%s'. %s" % (error[0], m_message)
    print("End crawling errorMessage for nvidia CUBLAS API!\n")

    #*************************************************************************************************#
    #*********************************** CUSOLVER Error Message **************************************#
    cusolverStatus_t = {
        "CUSOLVER_STATUS_SUCCESS": 0,
        "CUSOLVER_STATUS_NOT_INITIALIZED": 1,
        "CUSOLVER_STATUS_ALLOC_FAILED": 2,
        "CUSOLVER_STATUS_INVALID_VALUE": 3,
        "CUSOLVER_STATUS_ARCH_MISMATCH": 4,
        "CUSOLVER_STATUS_MAPPING_ERROR": 5,
        "CUSOLVER_STATUS_EXECUTION_FAILED": 6,
        "CUSOLVER_STATUS_INTERNAL_ERROR": 7,
        "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED": 8,
        "CUSOLVER_STATUS_NOT_SUPPORTED": 9,
        "CUSOLVER_STATUS_ZERO_PIVOT": 10,
        "CUSOLVER_STATUS_INVALID_LICENSE": 11,
        "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED": 12,
        "CUSOLVER_STATUS_IRS_PARAMS_INVALID": 13,
        "CUSOLVER_STATUS_IRS_INTERNAL_ERROR": 14,
        "CUSOLVER_STATUS_IRS_NOT_SUPPORTED": 15,
        "CUSOLVER_STATUS_IRS_OUT_OF_RANGE": 16,
        "CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES": 17,
        "CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED": 18
    }
    print("start crawling errorMessage for nvidia CUSOLVER API--->")
    url = 'https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverSPstatus'

    allMessageDesc = externalErrorDesc.errors.add()
    allMessageDesc.type = external_error_pb2.CUSOLVER

    html = _fetch_html(url)

    res_div = r'This is a status type returned by the library functions and.*?<div class="tablenoborder">(.*?)</div>'
    m_div = re.findall(res_div, html, re.S | re.M)[0]

    res_dt = r'<samp class="ph codeph">(.*?)</samp></td>.*?colspan="1">(.*?)</td>'
    m_dt = re.findall(res_dt, m_div, re.S | re.M)
    for error in m_dt:
        m_message = error[1]
        m_message = re.sub(r'\n +', '', m_message)
        m_message = re.sub(r'<p class="p"></p>', '', m_message)

        m_message = _unwrap_tags(m_message, r'<p class="p">.*?</p>',
                                 r'<p class="p">(.*?)</p>')
        m_message = _unwrap_tags(m_message,
                                 r'<samp class="ph codeph">.*?</samp>',
                                 r'<samp class="ph codeph">(.*?)</samp>')
        m_message = _unwrap_tags(m_message,
                                 r'<strong class="ph b">.*?</strong>',
                                 r'<strong class="ph b">(.*?)</strong>')

        _Messages = allMessageDesc.messages.add()
        _Messages.code = int(cusolverStatus_t[error[0]])
        _Messages.message = "'%s'. %s" % (error[0], m_message)
    print("End crawling errorMessage for nvidia CUSOLVER API!\n")

    #**********************************************************************************************#
    #*************************************** NCCL error *******************************************#
    print("start crawling errorMessage for nvidia NCCL API--->")
    url = 'https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclresult-t'
    allMessageDesc = externalErrorDesc.errors.add()
    allMessageDesc.type = external_error_pb2.NCCL
    html = _fetch_html(url)
    res_div = r'<code class="descname">ncclResult_t</code>(.*?)</div>'
    m_div = re.findall(res_div, html, re.S | re.M)[0]

    # Each match is (status name, numeric code, description).
    res_dt = r'<code class="descname">(.*?)</code>.*?<span class="pre">(.*?)</span></code>\)(.*?)</p>\n</dd></dl>'
    m_dt = re.findall(res_dt, m_div, re.S | re.M)
    for error in m_dt:
        m_message = re.sub(r'\n', '', error[2])
        _Messages = allMessageDesc.messages.add()
        _Messages.code = int(error[1])
        _Messages.message = "'%s'. %s" % (error[0], m_message)
    print("End crawling errorMessage for nvidia NCCL API!\n")
def main(argv):
    """Command-line entry point: build and save ``externalErrorMsg.pb``.

    Creates an empty ``ExternalErrorDesc`` message, lets ``parsing`` fill it,
    serializes it and writes the bytes to ``externalErrorMsg.pb`` in the
    current working directory.

    Args:
        argv: Command-line arguments without the program name. The only
            recognised options are ``-h`` and ``--help``.

    Raises:
        SystemExit: with status 2 when the options cannot be parsed, and
            with status 0 after printing the usage line for ``-h``/``--help``.
    """
    usage = 'python spider.py'
    try:
        opts, _ = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        # Unknown option: show usage and report a command-line error.
        print(usage)
        sys.exit(2)
    for opt, _ in opts:
        if opt in ("-h", "--help"):
            # An explicit help request is not a failure, so exit with 0.
            print(usage)
            sys.exit(0)

    externalErrorDesc = external_error_pb2.ExternalErrorDesc()
    parsing(externalErrorDesc)
    serializedString = externalErrorDesc.SerializeToString()
    with open("externalErrorMsg.pb", "wb") as f:
        # Saved here through the Python protobuf interface; the file is
        # meant to be loaded through the C++ protobuf interface to look up
        # error messages.
        f.write(serializedString)
    print(
        "Generating data file [externalErrorMsg.pb] for external third_party API error has been done!"
    )
# Run the generator only when this file is executed as a script; the program
# name (sys.argv[0]) is dropped before the arguments are handed to main().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
...@@ -29,19 +29,7 @@ else ...@@ -29,19 +29,7 @@ else
echo "please run on Mac/Linux" echo "please run on Mac/Linux"
exit 1 exit 1
fi fi
protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/cuda_error.proto protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/external_error.proto
version=90,100,-1 # -1 represent the latest cuda-version python3.7 spider.py
url=https://docs.nvidia.com/cuda/archive/9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/archive/10.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038 tar czvf externalErrorMsg.tar.gz externalErrorMsg.pb
if [ "$1" != "" ]; then
version=$version,$(($1*10))
if [ "$2" != "" ]; then
url=$url,$2
else
url=$url,https://docs.nvidia.com/cuda/archive/$1/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
fi
fi
python spider.py --version=$version --url=$url
tar czf cudaErrorMessage.tar.gz cudaErrorMessage.pb
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册