Unverified commit 3f4678c9 authored by Zhou Wei, committed by GitHub

[cherry-pick2.0] Optimize the error messages of paddle CUDA API (#23849)

* cherry-pick: Optimize the error messages of paddle CUDA API

* fix the error messages of paddle CUDA API

* Refactoring PADDLE_ENFORCE_CUDA_SUCCESS, and apply to curand/cudnn/cublas/NCCL

* remove build_ex_string
Parent 30e4cacd
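A minimal before/after sketch of the call-site pattern this commit changes (based on the CUDADeviceContextAllocator hunk below; this sketch itself is not part of the original diff):

// Before this commit: every call site supplied its own error message.
PADDLE_ENFORCE_CUDA_SUCCESS(
    cudaEventRecord(event_, default_stream_),
    "Failed to record event in CUDADeviceContextAllocator");

// After this commit: only the CUDA/curand/cudnn/cublas/NCCL call is passed.
// The macro builds the message itself (error code, cudaGetErrorString() text,
// and an "[Advise: ...]" hint looked up from the downloaded cudaErrorMessage.pb).
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_));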
@@ -135,6 +135,12 @@ copy(inference_lib_dist
     SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
     DSTS ${dst_dir})
+set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
+copy(inference_lib_dist
+    SRCS ${cudaerror_INCLUDE_DIR}
+    DSTS ${dst_dir})
 # CMakeCache Info
 copy(inference_lib_dist
     SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
     DSTS ${FLUID_INFERENCE_INSTALL_DIR})
@@ -184,7 +190,7 @@ copy(fluid_lib_dist
 )
 set(module "framework")
-set(framework_lib_deps framework_proto)
+set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto)
 add_dependencies(fluid_lib_dist ${framework_lib_deps})
 copy(fluid_lib_dist
     SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
@@ -204,11 +210,11 @@ copy(fluid_lib_dist
 )
 set(module "platform")
-set(platform_lib_deps profiler_proto)
+set(platform_lib_deps profiler_proto error_codes_proto cuda_error_proto)
 add_dependencies(fluid_lib_dist ${platform_lib_deps})
 copy(fluid_lib_dist
-    SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/error_codes.pb.h
-    DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}
+    SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h
+    DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module}
 )
 set(module "string")
@@ -249,6 +255,7 @@ copy(inference_lib_dist
     SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
     DSTS ${dst_dir} ${dst_dir}/lib)
 # CMakeCache Info
 copy(fluid_lib_dist
     SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
...
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+include(ExternalProject)
 # Creat a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac)
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@@ -21,6 +22,7 @@ set(THIRD_PARTY_CACHE_PATH "${CMAKE_SOURCE_DIR}" CACHE STRING
     "A path cache third party source code to avoid repeated download.")
 set(THIRD_PARTY_BUILD_TYPE Release)
+set(third_party_deps)
 # cache funciton to avoid repeat download code of third_party.
 # This function has 4 parameters, URL / REPOSITOR / TAG / DIR:
@@ -100,6 +102,32 @@ MACRO(UNSET_VAR VAR_NAME)
     UNSET(${VAR_NAME})
 ENDMACRO()
+# Funciton to Download the dependencies during compilation
+# This function has 2 parameters, URL / DIRNAME:
+# 1. URL:  The download url of 3rd dependencies
+# 2. NAME: The name of file, that determin the dirname
+#
+MACRO(file_download_and_uncompress URL NAME)
+  MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}")
+  SET(EXTERNAL_PROJECT_NAME "extern_download_${NAME}")
+  SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data)
+  ExternalProject_Add(
+      ${EXTERNAL_PROJECT_NAME}
+      ${EXTERNAL_PROJECT_LOG_ARGS}
+      PREFIX                ${THIRD_PARTY_PATH}/${NAME}
+      URL                   ${URL}
+      DOWNLOAD_DIR          ${THIRD_PARTY_PATH}/${NAME}/data/
+      SOURCE_DIR            ${THIRD_PARTY_PATH}/${NAME}/data/
+      DOWNLOAD_NO_PROGRESS  1
+      CONFIGURE_COMMAND     ""
+      BUILD_COMMAND         ""
+      UPDATE_COMMAND        ""
+      INSTALL_COMMAND       ""
+    )
+  list(APPEND third_party_deps ${EXTERNAL_PROJECT_NAME})
+ENDMACRO()
 # Correction of flags on different Platform(WIN/MAC) and Print Warning Message
 if (APPLE)
     if(WITH_MKL)
@@ -178,10 +206,13 @@ include(external/dlpack)    # download dlpack
 include(external/xxhash)    # download, build, install xxhash
 include(external/warpctc)   # download, build, install warpctc
-set(third_party_deps)
 list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
 list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool)
+# download file
+set(CUDAERROR_URL "https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
+file_download_and_uncompress(${CUDAERROR_URL} "cudaerror")
 if(WITH_AMD_GPU)
     include(external/rocprim)   # download, build, install rocprim
     list(APPEND third_party_deps extern_rocprim)
@@ -274,4 +305,4 @@ if (WITH_LITE)
     include(external/lite)
 endif (WITH_LITE)
-add_custom_target(third_party DEPENDS ${third_party_deps})
+add_custom_target(third_party ALL DEPENDS ${third_party_deps})
@@ -152,9 +152,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
-                        cudaMemcpyHostToDevice, dev_ctx->stream()),
-        platform::errors::External(
-            "Async cudaMemcpy op_var info to gpu failed."));
+                        cudaMemcpyHostToDevice, dev_ctx->stream()));
   } else {  // get
     auto iter = op_var2gpu_str.find(op_var);
     PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true,
...
@@ -124,12 +124,9 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
   float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
   float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
   float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      cudaMemcpyAsync(output_ptrs, h_odatas,
-                      d_output_ptrs_.size() * sizeof(float*),
-                      cudaMemcpyHostToDevice, stream),
-      platform::errors::External(
-          "CUDA Memcpy failed during split plugin run."));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+      output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*),
+      cudaMemcpyHostToDevice, stream));
   int outer_rows = outer_rows_ * batchSize;
@@ -244,12 +241,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
   float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
   float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      cudaMemcpyAsync(output_ptrs, h_odatas,
-                      d_output_ptrs.size() * sizeof(float*),
-                      cudaMemcpyHostToDevice, stream),
-      platform::errors::External(
-          "CUDA Memcpy failed during split plugin run."));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+      output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*),
+      cudaMemcpyHostToDevice, stream));
   split_kernel<<<grid, block, 0, stream>>>(
       d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
@@ -263,12 +257,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
   half* const* h_odatas = reinterpret_cast<half* const*>(outputs);
   half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      cudaMemcpyAsync(output_ptrs, h_odatas,
-                      d_output_ptrs.size() * sizeof(half*),
-                      cudaMemcpyHostToDevice, stream),
-      platform::errors::External(
-          "CUDA Memcpy failed during split plugin run."));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+      output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*),
+      cudaMemcpyHostToDevice, stream));
   split_kernel<<<grid, block, 0, stream>>>(
       d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
...
@@ -80,17 +80,13 @@ class CUDADeviceContextAllocator : public Allocator {
       : place_(place), default_stream_(default_stream) {
     platform::CUDADeviceGuard guard(place_.device);
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventCreate(&event_, cudaEventDisableTiming),
-        platform::errors::External(
-            "Create event failed in CUDADeviceContextAllocator"));
+        cudaEventCreate(&event_, cudaEventDisableTiming));
   }
   ~CUDADeviceContextAllocator() {
     if (event_) {
       platform::CUDADeviceGuard guard(place_.device);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventDestroy(event_),
-          "Destory event failed in CUDADeviceContextAllocator destroctor");
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
     }
   }
@@ -103,12 +99,9 @@ class CUDADeviceContextAllocator : public Allocator {
     auto allocation =
         new CUDADeviceContextAllocation(memory::Alloc(place_, size));
     // Wait for the event on stream
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventRecord(event_, default_stream_),
-        "Failed to record event in CUDADeviceContextAllocator");
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaStreamWaitEvent(default_stream_, event_, 0),
-        "Failed to wait event in CUDADeviceContextAllocator");
+        cudaStreamWaitEvent(default_stream_, event_, 0));
     return allocation;
   }
...
@@ -141,12 +141,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
         num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
         cu_stream);
   }
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      err,
-      "ArgSortOP failed as could not launch "
-      "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate"
-      "temp_storage_bytes, status:%s.",
-      temp_storage_bytes, cudaGetErrorString(err));
+  PADDLE_ENFORCE_CUDA_SUCCESS(err);
   Tensor temp_storage;
   temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
@@ -165,12 +160,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
         cu_stream);
   }
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      err,
-      "ArgSortOP failed as could not launch "
-      "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
-      "temp_storage_bytes:%d status:%s.",
-      temp_storage_bytes, cudaGetErrorString(err));
+  PADDLE_ENFORCE_CUDA_SUCCESS(err);
 }
 template <typename T, typename IndType>
...
@@ -167,13 +167,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     conv_desc.descriptor<T>(padding_common, strides, dilations);
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnSetConvolutionGroupCount(cudnn_conv_desc,
-                                                         groups),
-        platform::errors::External(
-            "Call of cudnnSetConvolutionGroupCount(cudnn_conv_desc, groups) "
-            "failed, where cudnn_conv_desc is configured: padding = [%s], "
-            "strides = [%s], dilations = [%s]; groups = %d",
-            framework::make_ddim(padding_common), framework::make_ddim(strides),
-            framework::make_ddim(dilations), groups));
+                                                         groups));
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize<int>(transformed_input.dims()));
@@ -204,15 +198,8 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     auto handle = dev_ctx.cudnn_handle();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSetConvolutionMathType(cudnn_conv_desc,
-                                                       CUDNN_DEFAULT_MATH),
-        platform::errors::External(
-            "Call of cudnnSetConvolutionMathType(cudnn_conv_desc, "
-            "CUDNN_DEFAULT_MATH) failed, where cudnn_conv_desc is configured: "
-            "padding = %d, strides = %d, dilations = %d.",
-            framework::make_ddim(padding_common), framework::make_ddim(strides),
-            framework::make_ddim(dilations)));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+        cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     auto x_dims = framework::vectorize(transformed_input.dims());
     auto f_dims = framework::vectorize(filter->dims());
@@ -221,9 +208,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
           platform::dynload::cudnnGetConvolutionForwardAlgorithm(
               handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
               cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &algo),
-          platform::errors::External(
-              "Call of cudnnGetConvolutionForwardAlgorithm failed."));
+              workspace_size_limit, &algo));
       VLOG(3) << "cuDNN forward algo " << algo;
     } else {
       std::function<cudnnConvolutionFwdAlgo_t()> search_func =
@@ -237,9 +222,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
                 handle, cudnn_input_desc, input_data, cudnn_filter_desc,
                 filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
                 kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit),
-            platform::errors::External(
-                "Call of cudnnFindConvolutionForwardAlgorithmEx failed."));
+                fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
       };
       workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
       VLOG(3) << "Perf result: (algo: stat, time, memory)";
@@ -273,9 +256,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
             handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-            cudnn_output_desc, algo, &workspace_size_in_bytes),
-        platform::errors::External(
-            "Call of cudnnGetConvolutionForwardWorkspaceSize failed."));
+            cudnn_output_desc, algo, &workspace_size_in_bytes));
     PADDLE_ENFORCE_LE(
         workspace_size_in_bytes, workspace_size_limit,
         platform::errors::InvalidArgument(
@@ -292,20 +273,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     // ------------- cudnn conv forward and bias add ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     auto cudnn_func = [&](void* cudnn_workspace) {
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnConvolutionForward(
-              handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
-              filter_data, cudnn_conv_desc, algo, cudnn_workspace,
-              workspace_size_in_bytes, &beta, cudnn_output_desc, output_data),
-          platform::errors::External(
-              "Call of cudnnConvolutionForward failed."));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionForward(
+          handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
+          filter_data, cudnn_conv_desc, algo, cudnn_workspace,
+          workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
     };
     workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnAddTensor(handle, &alpha, cudnn_bias_desc,
-                                          bias_data, &alpha,
-                                          cudnn_output_desc, output_data),
-        platform::errors::External("Call of cudnnAddTensor failed."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnAddTensor(
+        handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
+        output_data));
   } else {
     if (activation == "identity") {
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
@@ -320,9 +296,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
                 cudnn_filter_desc, filter_data, cudnn_conv_desc, algo,
                 cudnn_workspace, workspace_size_in_bytes, &alpha2,
                 cudnn_output_desc, residual_data, cudnn_bias_desc, bias_data,
-                cudnn_act_desc, cudnn_output_desc, output_data),
-            platform::errors::External(
-                "Call of cudnnConvolutionBiasActivationForward failed."));
+                cudnn_act_desc, cudnn_output_desc, output_data));
       };
       workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
...
@@ -108,32 +108,21 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
     cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&data_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&bn_param_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
     VLOG(3) << "Setting descriptors.";
     std::vector<int> dims = {N, C, H, W, D};
     std::vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSetTensorNdDescriptor(
-            data_desc_, CudnnDataType<T>::type,
-            x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
-        platform::errors::External(
-            "The error has happened when calling cudnnSetTensorNdDescriptor."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+        data_desc_, CudnnDataType<T>::type,
+        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
-                                                         data_desc_, mode_),
-        platform::errors::External("The error has happened when calling "
-                                   "cudnnDeriveBNTensorDescriptor."));
+                                                         data_desc_, mode_));
     double this_factor = 1. - momentum;
     cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
@@ -166,10 +155,7 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
             /*yDesc=*/data_desc_,
             /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
             /*activationDesc=*/activation_desc_,
-            /*sizeInBytes=*/&workspace_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize."));
+            /*sizeInBytes=*/&workspace_size));
     // -------------- cudnn batchnorm reserve space --------------
     PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -179,10 +165,7 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
             /*bnOps=*/bnOps_,
             /*activationDesc=*/activation_desc_,
             /*xDesc=*/data_desc_,
-            /*sizeInBytes=*/&reserve_space_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnGetBatchNormalizationTrainingExReserveSpaceSize."));
+            /*sizeInBytes=*/&reserve_space_size));
     reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(),
                                                     reserve_space_size);
@@ -204,22 +187,13 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
             saved_variance->template mutable_data<BatchNormParamType<T>>(
                 ctx.GetPlace()),
             activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr,
-            reserve_space_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnBatchNormalizationForwardTrainingEx."));
+            reserve_space_size));
     // clean when exit.
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(data_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(bn_param_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
   }
 };
@@ -298,15 +272,9 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
     cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&data_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnCreateTensorDescriptor(&bn_param_desc_)."));
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
     if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
       LOG(ERROR) << "Provided epsilon is smaller than "
                  << "CUDNN_BN_MIN_EPSILON. Setting it to "
@@ -314,17 +282,12 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
     }
     epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSetTensorNdDescriptor(
-            data_desc_, CudnnDataType<T>::type,
-            x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
-        platform::errors::External(
-            "The error has happened when calling cudnnSetTensorNdDescriptor."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+        data_desc_, CudnnDataType<T>::type,
+        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
-                                                         data_desc_, mode_),
-        platform::errors::External("The error has happened when calling "
-                                   "cudnnDeriveBNTensorDescriptor."));
+                                                         data_desc_, mode_));
     const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
     const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
@@ -354,10 +317,7 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
             /*dxDesc=*/data_desc_,
             /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
             /*activationDesc=*/activation_desc_,
-            /*sizeInBytes=*/&workspace_size),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnGetBatchNormalizationBackwardExWorkspaceSize."));
+            /*sizeInBytes=*/&workspace_size));
     workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
                                                   workspace_size);
@@ -395,21 +355,13 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
             /*workspace=*/workspace_ptr,
             /*workSpaceSizeInBytes=*/workspace_size,
             /*reserveSpace=*/const_cast<T *>(reserve_space->template data<T>()),
-            /*reserveSpaceSizeInBytes=*/reserve_space_size),
-        platform::errors::External("The error has happened when calling "
-                                   "cudnnBatchNormalizationBackwardEx."));
+            /*reserveSpaceSizeInBytes=*/reserve_space_size));
     // clean when exit.
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(data_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
-        platform::errors::External(
-            "The error has happened when calling "
-            "cudnnDestroyTensorDescriptor(bn_param_desc_)."));
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
   }
 };
...
@@ -46,13 +46,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
     cudnnTensorDescriptor_t in_desc;
     cudnnTensorDescriptor_t out_desc;
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&in_desc),
-        platform::errors::External("Create cudnn tensor descriptor failed in "
-                                   "transpose_flatten_concat_fusion op."));
+        platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&out_desc),
-        platform::errors::External("Create cudnn tensor descriptor failed in "
-                                   "transpose_flatten_concat_fusion op."));
+        platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
     cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
@@ -91,24 +87,15 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
         dims_y[i] = 1;
       }
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnSetTensorNdDescriptor(
-              in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()),
-          platform::errors::External("Create cudnn tensorNd descriptor failed "
-                                     "in transpose_flatten_concat op."));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnSetTensorNdDescriptor(
-              out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()),
-          platform::errors::External("Create cudnn tensorNd descriptor failed "
-                                     "in transpose_flatten_concat op."));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnTransformTensor(
-              handle, CudnnDataType<T>::kOne(), in_desc,
-              static_cast<const void*>(ins[k]->data<T>()),
-              CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)),
-          platform::errors::External("Create cudnn transform tensor failed in "
-                                     "transpose_flatten_concat op."));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+          in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+          out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor(
+          handle, CudnnDataType<T>::kOne(), in_desc,
+          static_cast<const void*>(ins[k]->data<T>()),
+          CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
       if (concat_axis == 0) {
         odata += osize;
       } else {
@@ -117,13 +104,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
       }
     }
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(in_desc),
-        platform::errors::External(
-            "Destory cudnn descriptor failed in transpose_flatten_concat op."));
+        platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(out_desc),
-        platform::errors::External(
-            "Destory cudnn descriptor failed in transpose_flatten_concat op."));
+        platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
   }
 };
...
@@ -60,13 +60,10 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
         DataLayout::kNCHW, framework::vectorize<int>(output->dims()));
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnSpatialTfSamplerForward(
-            handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
-            input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
-            output_data),
-        platform::errors::InvalidArgument(
-            "cudnnSpatialTfSamplerForward in Op(grid_sampler) failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward(
+        handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
+        input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
+        output_data));
   }
 };
@@ -122,9 +119,7 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
             input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
             input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
             output_grad_data, grid_data, CudnnDataType<T>::kZero(),
-            grid_grad_data),
-        platform::errors::InvalidArgument(
-            "cudnnSpatialTfSamplerBackward in Op(grid_sampler) failed"));
+            grid_grad_data));
   }
 };
...
@@ -41,16 +41,12 @@ struct CUBlas<float> {
   template <typename... ARGS>
   static void SCAL(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasSscal(args...),
-        platform::errors::External("dynload cublasSscal lib failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSscal(args...));
   }
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasScopy(args...),
-        platform::errors::External("dynload cublasScopy lib failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasScopy(args...));
   }
   template <typename... ARGS>
@@ -108,16 +104,12 @@ struct CUBlas<double> {
   template <typename... ARGS>
   static void SCAL(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasDscal(args...),
-        platform::errors::External("dynload cublasDscal lib failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDscal(args...));
   }
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cublasDcopy(args...),
-        platform::errors::External("dynload cublasDcopy lib failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDcopy(args...));
  }
   template <typename... ARGS>
...
@@ -59,20 +59,14 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
     auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, trans_x,
                                       out_data, size_prob, stream);
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        err, platform::errors::External(
-                 "MeanOP failed to get reduce workspace size %s.",
-                 cudaGetErrorString(err)));
+    PADDLE_ENFORCE_CUDA_SUCCESS(err);
     framework::Tensor tmp;
     auto* temp_storage = tmp.mutable_data<uint8_t>(
         framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
         context.GetPlace());
     err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, trans_x,
                                  out_data, size_prob, stream);
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        err, platform::errors::External(
-                 "MeanOP failed to run CUDA reduce computation: %s.",
-                 cudaGetErrorString(err)));
+    PADDLE_ENFORCE_CUDA_SUCCESS(err);
   }
 };
...
@@ -104,13 +104,9 @@ void BufferedReader::ReadAsync(size_t i) {
       // gpu memory immediately without waiting gpu kernel ends
      platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
       PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventRecord(events_[i].get(), compute_stream_),
-          platform::errors::Fatal(
-              "cudaEventRecord raises unexpected exception"));
+          cudaEventRecord(events_[i].get(), compute_stream_));
       PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0),
-          platform::errors::Fatal(
-              "cudaStreamWaitEvent raises unexpected exception"));
+          cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0));
       platform::RecordEvent record_event("BufferedReader:MemoryCopy");
       for (size_t i = 0; i < cpu.size(); ++i) {
@@ -138,17 +134,11 @@ void BufferedReader::ReadAsync(size_t i) {
                        size);
           memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                        cuda_pinned_place, cuda_pinned_ptr, size, stream_.get());
-          PADDLE_ENFORCE_CUDA_SUCCESS(
-              cudaStreamSynchronize(stream_.get()),
-              platform::errors::Fatal(
-                  "cudaStreamSynchronize raises unexpected exception"));
+          PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get()));
         }
         gpu[i].set_lod(cpu[i].lod());
       }
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaStreamSynchronize(stream_.get()),
-          platform::errors::Fatal(
-              "cudaStreamSynchronize raises unexpected exception"));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get()));
     }
 #endif
     return i;
...
@@ -191,12 +191,9 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx,
   if (comm) {
     int dtype = platform::ToNCCLDataType(mean_out->type());
     // In-place operation
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1,
-                                         static_cast<ncclDataType_t>(dtype),
-                                         ncclSum, comm, stream),
-        platform::errors::InvalidArgument(
-            "ncclAllReduce in Op(sync_batch_norm) failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+        stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
+        comm, stream));
   }
 #endif
@@ -468,12 +465,9 @@ void SyncBatchNormGradFunctor(
   if (comm) {
     int dtype = platform::ToNCCLDataType(scale->type());
     // In-place operation
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1,
-                                         static_cast<ncclDataType_t>(dtype),
-                                         ncclSum, comm, stream),
-        platform::errors::InvalidArgument(
-            "ncclAllReduce in Op(sync_batch_norm) failed"));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+        stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
+        comm, stream));
   }
 #endif
...
 proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
 proto_library(error_codes_proto SRCS error_codes.proto)
+proto_library(cuda_error_proto SRCS cuda_error.proto)
 if (WITH_PYTHON)
   py_proto_compile(profiler_py_proto SRCS profiler.proto)
@@ -28,7 +28,7 @@ cc_library(flags SRCS flags.cc DEPS gflags)
 cc_library(errors SRCS errors.cc DEPS error_codes_proto)
 cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)
-cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors)
+cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors cuda_error_proto)
 cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)
 set(CPU_INFO_DEPS gflags glog enforce)
...
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto2";
+
+package paddle.platform.proto;
+
+message MessageDesc {
+  // Indicates the type of error
+  required int32 errorCode = 1;
+  // Indicates the message of error
+  required string errorMessage = 2;
+}
+
+message AllMessageDesc {
+  // Version of cuda API
+  required int32 version = 1;
+  // Error messages of different errortype
+  repeated MessageDesc Messages = 2;
+}
+
+message cudaerrorDesc {
+  // Error messages of different cuda versions(9.0/10.0/10.2)
+  repeated AllMessageDesc AllMessages = 2;
+}
\ No newline at end of file
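A minimal sketch of how the serialized cudaErrorMessage.pb file is meant to be consumed (the real lookup lives in enforce.h below; the accessor names follow the proto2 fields above, and LookupAdvise is a hypothetical helper introduced here only for illustration):

#include <fstream>
#include <string>
#include "paddle/fluid/platform/cuda_error.pb.h"

// Returns the advice string for a given CUDA runtime version (e.g. 100 for
// 10.x, 90 for 9.x) and error code, or "" if no matching entry is found.
std::string LookupAdvise(const std::string& pb_path, int version, int code) {
  paddle::platform::proto::cudaerrorDesc all;
  std::ifstream fin(pb_path, std::ios::in | std::ios::binary);
  if (!all.ParseFromIstream(&fin)) return "";
  for (int i = 0; i < all.allmessages_size(); ++i) {
    if (all.allmessages(i).version() != version) continue;
    for (int j = 0; j < all.allmessages(i).messages_size(); ++j) {
      if (all.allmessages(i).messages(j).errorcode() == code) {
        return all.allmessages(i).messages(j).errormessage();
      }
    }
  }
  return "";
}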
@@ -29,14 +29,7 @@ namespace platform {
 class CublasHandleHolder {
  public:
   CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cublasCreate(&handle_),
-        platform::errors::External(
-            "The cuBLAS library was not initialized. This is usually caused by "
-            "an error in the CUDA Runtime API called by the cuBLAS routine, or "
-            "an error in the hardware setup.\n"
-            "To correct: check that the hardware, an appropriate version of "
-            "the driver, and the cuBLAS library are correctly installed."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
 #if CUDA_VERSION >= 9000
     if (math_type == CUBLAS_TENSOR_OP_MATH) {
...
@@ -27,18 +27,13 @@ CudaStreamResourcePool::CudaStreamResourcePool() {
       platform::SetDeviceId(dev_idx);
       cudaStream_t stream;
       PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking),
-          platform::errors::Fatal(
-              "cudaStreamCreateWithFlags raises unexpected exception"));
+          cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
       return stream;
     };
     auto deleter = [dev_idx](cudaStream_t stream) {
       platform::SetDeviceId(dev_idx);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaStreamDestroy(stream),
-          platform::errors::Fatal(
-              "cudaStreamDestroy raises unexpected exception"));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream));
     };
     pool_.emplace_back(
@@ -72,18 +67,13 @@ CudaEventResourcePool::CudaEventResourcePool() {
       platform::SetDeviceId(dev_idx);
       cudaEvent_t event;
       PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventCreateWithFlags(&event, cudaEventDisableTiming),
-          platform::errors::Fatal(
-              "cudaEventCreateWithFlags raises unexpected exception"));
+          cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
       return event;
     };
     auto deleter = [dev_idx](cudaEvent_t event) {
       platform::SetDeviceId(dev_idx);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventDestroy(event),
-          platform::errors::Fatal(
-              "cudaEventDestroy raises unexpected exception"));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event));
     };
     pool_.emplace_back(ResourcePool<CudaEventObject>::Create(creator, deleter));
...
@@ -278,12 +278,9 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
         << "Please recompile or reinstall Paddle with compatible CUDNN "
            "version.";
   }
+  PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
   PADDLE_ENFORCE_CUDA_SUCCESS(
-      dynload::cudnnCreate(&cudnn_handle_),
-      "Failed to create Cudnn handle in DeviceContext");
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      dynload::cudnnSetStream(cudnn_handle_, stream_),
-      "Failed to set stream for Cudnn handle in DeviceContext");
+      dynload::cudnnSetStream(cudnn_handle_, stream_));
 } else {
   cudnn_handle_ = nullptr;
 }
@@ -302,8 +299,7 @@ CUDADeviceContext::~CUDADeviceContext() {
   eigen_device_.reset();
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
   if (cudnn_handle_) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_),
-                                "Failed to destory Cudnn handle");
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_));
   }
 #if defined(PADDLE_WITH_NCCL)
   if (nccl_comm_) {
@@ -325,10 +321,7 @@ void CUDADeviceContext::Wait() const {
   }
 #endif
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      e_sync, platform::errors::Fatal(
-                  "cudaStreamSynchronize raises error: %s, errono: %d",
-                  cudaGetErrorString(e_sync), static_cast<int>(e_sync)));
+  PADDLE_ENFORCE_CUDA_SUCCESS(e_sync);
 }
 int CUDADeviceContext::GetComputeCapability() const {
...
@@ -18,6 +18,13 @@ limitations under the License. */
 #include <cxxabi.h>  // for __cxa_demangle
 #endif  // __GNUC__
+#if !defined(_WIN32)
+#include <dlfcn.h>    // dladdr
+#else                 // _WIN32
+#define NOMINMAX      // msvc max/min macro conflict with std::min/max
+#include <windows.h>  // GetModuleFileName
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 #include <cublas_v2.h>
 #include <cudnn.h>
@@ -38,6 +45,7 @@ limitations under the License. */
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "glog/logging.h"
+#include "paddle/fluid/platform/cuda_error.pb.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/port.h"
@@ -220,10 +228,6 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
 inline bool is_error(bool stat) { return !stat; }
-inline std::string build_ex_string(bool stat, const std::string& msg) {
-  return msg;
-}
-
 inline void throw_on_error(bool stat, const std::string& msg) {
 #ifndef REPLACE_ENFORCE_GLOG
   throw std::runtime_error(msg);
@@ -284,23 +288,21 @@ struct EnforceNotMet : public std::exception {
     }                                                                     \
   } while (0)
 #else
-#define PADDLE_ENFORCE(COND, ...)                                           \
-  do {                                                                      \
-    auto __cond__ = (COND);                                                 \
-    if (UNLIKELY(::paddle::platform::is_error(__cond__))) {                 \
-      try {                                                                 \
-        ::paddle::platform::throw_on_error(                                 \
-            __cond__,                                                       \
-            ::paddle::platform::build_ex_string(                            \
-                __cond__,                                                   \
-                ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString())); \
-      } catch (...) {                                                       \
-        HANDLE_THE_ERROR                                                    \
-        throw ::paddle::platform::EnforceNotMet(std::current_exception(),   \
-                                                __FILE__, __LINE__);        \
-        END_HANDLE_THE_ERROR                                                \
-      }                                                                     \
-    }                                                                       \
-  } while (0)
+#define PADDLE_ENFORCE(COND, ...)                                         \
+  do {                                                                    \
+    auto __cond__ = (COND);                                               \
+    if (UNLIKELY(::paddle::platform::is_error(__cond__))) {               \
+      try {                                                               \
+        ::paddle::platform::throw_on_error(                               \
+            __cond__,                                                     \
+            ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString());    \
+      } catch (...) {                                                     \
+        HANDLE_THE_ERROR                                                  \
+        throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
+                                                __FILE__, __LINE__);      \
+        END_HANDLE_THE_ERROR                                              \
+      }                                                                   \
+    }                                                                     \
+  } while (0)
 #endif
@@ -464,30 +466,148 @@ struct EOFException : public std::exception {
   } while (0)
 /** CUDA PADDLE ENFORCE FUNCTIONS AND MACROS **/
 #ifdef PADDLE_WITH_CUDA
+/***** CUDA ERROR *****/
 inline bool is_error(cudaError_t e) { return e != cudaSuccess; }
-inline std::string build_ex_string(cudaError_t e, const std::string& msg) {
-  return msg;
-}
+inline std::string GetCudaErrorWebsite(int32_t cuda_version) {
+  std::ostringstream webstr;
+  webstr << "https://docs.nvidia.com/cuda/";
+  if (cuda_version != -1) {
+    double version = cuda_version / 10;
+    webstr << "archive/" << std::fixed << std::setprecision(1) << version;
+  }
+  webstr << "/cuda-runtime-api/group__CUDART__TYPES.html"
+            "#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038";
+  return webstr.str();
+}
+
+inline std::string build_nvidia_error_msg(cudaError_t e) {
+#if CUDA_VERSION >= 10000 && CUDA_VERSION < 11000
+  int32_t cuda_version = 100;
+#elif CUDA_VERSION >= 9000
+  int32_t cuda_version = 90;
+#else
+  int32_t cuda_version = -1;
+#endif
+  std::ostringstream sout;
+  sout << " Cuda error(" << e << "), " << cudaGetErrorString(e) << ".";
+  static platform::proto::cudaerrorDesc cudaerror;
+  static bool _initSucceed = false;
+  if (cudaerror.ByteSizeLong() == 0) {
+    std::string filePath;
+#if !defined(_WIN32)
+    Dl_info info;
+    if (dladdr(reinterpret_cast<void*>(GetCudaErrorWebsite), &info)) {
+      std::string strModule(info.dli_fname);
+      const size_t last_slash_idx = strModule.find_last_of("/");
+      std::string compare_path = strModule.substr(strModule.length() - 6);
+      if (std::string::npos != last_slash_idx) {
+        strModule.erase(last_slash_idx, std::string::npos);
+      }
+      if (compare_path.compare("avx.so") == 0) {
+        filePath = strModule +
+                   "/../include/third_party/cudaerror/data/cudaErrorMessage.pb";
+      } else {
+        filePath =
+            strModule + "/../../thirl_party/cudaerror/data/cudaErrorMessage.pb";
+      }
+    }
+#else
+    char buf[100];
+    MEMORY_BASIC_INFORMATION mbi;
+    HMODULE h_module =
+        (::VirtualQuery(GetCudaErrorWebsite, &mbi, sizeof(mbi)) != 0)
+            ? (HMODULE)mbi.AllocationBase
+            : NULL;
+    GetModuleFileName(h_module, buf, 100);
+    std::string strModule(buf);
+    const size_t last_slash_idx = strModule.find_last_of("\\");
+    std::string compare_path = strModule.substr(strModule.length() - 7);
+    if (std::string::npos != last_slash_idx) {
+      strModule.erase(last_slash_idx, std::string::npos);
+    }
+    if (compare_path.compare("avx.pyd") == 0) {
+      filePath =
+          strModule +
+          "\\..\\include\\third_party\\cudaerror\\data\\cudaErrorMessage.pb";
+    } else {
+      filePath =
+          strModule + "\\..\\third_party\\cudaerror\\data\\cudaErrorMessage.pb";
+    }
+#endif
+    std::ifstream fin(filePath, std::ios::in | std::ios::binary);
+    _initSucceed = cudaerror.ParseFromIstream(&fin);
+  }
+  if (_initSucceed) {
+    for (int i = 0; i < cudaerror.allmessages_size(); ++i) {
+      if (cuda_version == cudaerror.allmessages(i).version()) {
+        for (int j = 0; j < cudaerror.allmessages(i).messages_size(); ++j) {
+          if (e == cudaerror.allmessages(i).messages(j).errorcode()) {
+            sout << "\n  [Advise: "
+                 << cudaerror.allmessages(i).messages(j).errormessage() << "]";
+            return sout.str();
+          }
+        }
+      }
+    }
+  }
+  sout << "\n  [Advise: Please search for the error code(" << e
+       << ") on website( " << GetCudaErrorWebsite(cuda_version)
+       << " ) to get Nvidia's official solution about CUDA Error.]";
+  return sout.str();
+}
 inline void throw_on_error(cudaError_t e, const std::string& msg) {
 #ifndef REPLACE_ENFORCE_GLOG
-  throw thrust::system_error(e, thrust::cuda_category(), msg);
+  throw std::runtime_error(msg);
 #else
   LOG(FATAL) << msg;
 #endif
 }
+/** curand ERROR **/
 inline bool is_error(curandStatus_t stat) {
   return stat != CURAND_STATUS_SUCCESS;
 }
-inline std::string build_ex_string(curandStatus_t stat,
-                                   const std::string& msg) {
-  return msg;
-}
+inline const char* curandGetErrorString(curandStatus_t stat) {
+  switch (stat) {
+    case CURAND_STATUS_SUCCESS:
+      return "CURAND_STATUS_SUCCESS";
+    case CURAND_STATUS_VERSION_MISMATCH:
+      return "CURAND_STATUS_VERSION_MISMATCH";
+    case CURAND_STATUS_NOT_INITIALIZED:
+      return "CURAND_STATUS_NOT_INITIALIZED";
+    case CURAND_STATUS_ALLOCATION_FAILED:
+      return "CURAND_STATUS_ALLOCATION_FAILED";
+    case CURAND_STATUS_TYPE_ERROR:
+      return "CURAND_STATUS_TYPE_ERROR";
+    case CURAND_STATUS_OUT_OF_RANGE:
+      return "CURAND_STATUS_OUT_OF_RANGE";
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+    case CURAND_STATUS_LAUNCH_FAILURE:
+      return "CURAND_STATUS_LAUNCH_FAILURE";
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+      return "CURAND_STATUS_PREEXISTING_FAILURE";
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+      return "CURAND_STATUS_INITIALIZATION_FAILED";
+    case CURAND_STATUS_ARCH_MISMATCH:
+      return "CURAND_STATUS_ARCH_MISMATCH";
+    case CURAND_STATUS_INTERNAL_ERROR:
+      return "CURAND_STATUS_INTERNAL_ERROR";
+    default:
+      return "Unknown curand status";
+  }
+}
+
+inline std::string build_nvidia_error_msg(curandStatus_t stat) {
+  std::string msg(" Curand error, ");
  return msg + curandGetErrorString(stat) + " ";
+}
 inline void throw_on_error(curandStatus_t stat, const std::string& msg) {
@@ -499,13 +619,14 @@ inline void throw_on_error(curandStatus_t stat, const std::string& msg) {
 #endif
 }
+/***** CUDNN ERROR *****/
 inline bool is_error(cudnnStatus_t stat) {
   return stat != CUDNN_STATUS_SUCCESS;
 }
-inline std::string build_ex_string(cudnnStatus_t stat, const std::string& msg) {
-  return msg + "\n  [Hint: " + platform::dynload::cudnnGetErrorString(stat) +
-         "]";
-}
+inline std::string build_nvidia_error_msg(cudnnStatus_t stat) {
+  std::string msg(" Cudnn error, ");
+  return msg + platform::dynload::cudnnGetErrorString(stat) + " ";
+}
 inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) {
@@ -516,33 +637,39 @@ inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) {
 #endif
 }
+/***** CUBLAS ERROR *****/
 inline bool is_error(cublasStatus_t stat) {
   return stat != CUBLAS_STATUS_SUCCESS;
 }
-inline std::string build_ex_string(cublasStatus_t stat,
-                                   const std::string& msg) {
-  std::string err;
-  if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
-    err = "CUBLAS_STATUS_NOT_INITIALIZED";
+inline const char* cublasGetErrorString(cublasStatus_t stat) {
+  switch (stat) {
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      return "CUBLAS_STATUS_NOT_INITIALIZED";
+    case CUBLAS_STATUS_ALLOC_FAILED:
} else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { return "CUBLAS_STATUS_ALLOC_FAILED";
err = "CUBLAS_STATUS_ALLOC_FAILED"; case CUBLAS_STATUS_INVALID_VALUE:
} else if (stat == CUBLAS_STATUS_INVALID_VALUE) { return "CUBLAS_STATUS_INVALID_VALUE";
err = "CUBLAS_STATUS_INVALID_VALUE"; case CUBLAS_STATUS_ARCH_MISMATCH:
} else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) { return "CUBLAS_STATUS_ARCH_MISMATCH";
err = "CUBLAS_STATUS_ARCH_MISMATCH"; case CUBLAS_STATUS_MAPPING_ERROR:
} else if (stat == CUBLAS_STATUS_MAPPING_ERROR) { return "CUBLAS_STATUS_MAPPING_ERROR";
err = "CUBLAS_STATUS_MAPPING_ERROR"; case CUBLAS_STATUS_EXECUTION_FAILED:
} else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) { return "CUBLAS_STATUS_EXECUTION_FAILED";
err = "CUBLAS_STATUS_EXECUTION_FAILED"; case CUBLAS_STATUS_INTERNAL_ERROR:
} else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) { return "CUBLAS_STATUS_INTERNAL_ERROR";
err = "CUBLAS_STATUS_INTERNAL_ERROR"; case CUBLAS_STATUS_NOT_SUPPORTED:
} else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) { return "CUBLAS_STATUS_NOT_SUPPORTED";
err = "CUBLAS_STATUS_NOT_SUPPORTED"; case CUBLAS_STATUS_LICENSE_ERROR:
} else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { return "CUBLAS_STATUS_LICENSE_ERROR";
err = "CUBLAS_STATUS_LICENSE_ERROR"; default:
return "Unknown cublas status";
} }
return msg + "\n [Hint: " + err + "]"; }
inline std::string build_nvidia_error_msg(cublasStatus_t stat) {
std::string msg(" Cublas error, ");
return msg + cublasGetErrorString(stat) + " ";
} }
inline void throw_on_error(cublasStatus_t stat, const std::string& msg) { inline void throw_on_error(cublasStatus_t stat, const std::string& msg) {
...@@ -553,15 +680,15 @@ inline void throw_on_error(cublasStatus_t stat, const std::string& msg) { ...@@ -553,15 +680,15 @@ inline void throw_on_error(cublasStatus_t stat, const std::string& msg) {
#endif #endif
} }
/****** NCCL ERROR ******/
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
inline bool is_error(ncclResult_t nccl_result) { inline bool is_error(ncclResult_t nccl_result) {
return nccl_result != ncclSuccess; return nccl_result != ncclSuccess;
} }
inline std::string build_ex_string(ncclResult_t nccl_result, inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) {
const std::string& msg) { std::string msg(" Nccl error, ");
return msg + "\n [" + platform::dynload::ncclGetErrorString(nccl_result) + return msg + platform::dynload::ncclGetErrorString(nccl_result) + " ";
"]";
} }
inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) { inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) {
...@@ -571,11 +698,8 @@ inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) { ...@@ -571,11 +698,8 @@ inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) {
LOG(FATAL) << msg; LOG(FATAL) << msg;
#endif #endif
} }
#endif // __APPLE__ and windows #endif // not(__APPLE__) and PADDLE_WITH_NCCL
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
namespace details { namespace details {
template <typename T> template <typename T>
...@@ -598,30 +722,28 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); ...@@ -598,30 +722,28 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
#endif #endif
} // namespace details } // namespace details
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA #define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \
#define PADDLE_ENFORCE_CUDA_SUCCESS(COND, ...) \ do { \
do { \ auto __cond__ = (COND); \
auto __cond__ = (COND); \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \
using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ constexpr auto __success_type__ = \
constexpr auto __success_type__ = \ ::paddle::platform::details::CudaStatusType< \
::paddle::platform::details::CudaStatusType< \ __CUDA_STATUS_TYPE__>::kSuccess; \
__CUDA_STATUS_TYPE__>::kSuccess; \ if (UNLIKELY(__cond__ != __success_type__)) { \
if (UNLIKELY(__cond__ != __success_type__)) { \ try { \
try { \ ::paddle::platform::throw_on_error( \
::paddle::platform::throw_on_error( \ __cond__, \
__cond__, \ ::paddle::platform::errors::External( \
::paddle::platform::build_ex_string( \ ::paddle::platform::build_nvidia_error_msg(__cond__)) \
__cond__, \ .ToString()); \
::paddle::platform::ErrorSummary(__VA_ARGS__).ToString())); \ } catch (...) { \
} catch (...) { \ HANDLE_THE_ERROR \
HANDLE_THE_ERROR \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \
__FILE__, __LINE__); \ END_HANDLE_THE_ERROR \
END_HANDLE_THE_ERROR \ } \
} \ } \
} \
} while (0) } while (0)
#undef DEFINE_CUDA_STATUS_TYPE #undef DEFINE_CUDA_STATUS_TYPE
......
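The refactored macro above keys on `decltype` plus the `CudaStatusType` trait, so one spelling covers cudaError_t, curandStatus_t, cudnnStatus_t, cublasStatus_t and ncclResult_t. A minimal, CUDA-free sketch of that dispatch trick — the enum and macro names here are illustrative stand-ins, not Paddle's real types:
```
// Hedged sketch of the decltype + trait dispatch used by
// PADDLE_ENFORCE_CUDA_SUCCESS: each status enum is mapped to its own
// success value, so a single macro handles every CUDA library.
#include <iostream>

enum FakeCudaError { kFakeSuccess = 0, kFakeInvalidValue = 1 };

template <typename T>
struct StatusType;

template <>
struct StatusType<FakeCudaError> {
  static constexpr FakeCudaError kSuccess = kFakeSuccess;
};

#define ENFORCE_SUCCESS(COND)                          \
  do {                                                 \
    auto __cond__ = (COND);                            \
    using __T__ = decltype(__cond__);                  \
    if (__cond__ != StatusType<__T__>::kSuccess) {     \
      std::cerr << "status check failed" << std::endl; \
    }                                                  \
  } while (0)

int main() {
  ENFORCE_SUCCESS(kFakeSuccess);       // passes silently
  ENFORCE_SUCCESS(kFakeInvalidValue);  // prints the failure line
  return 0;
}
```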
...@@ -261,15 +261,14 @@ TEST(EOF_EXCEPTION, THROW_EOF) { ...@@ -261,15 +261,14 @@ TEST(EOF_EXCEPTION, THROW_EOF) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
template <typename T> template <typename T>
bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") { bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") {
PADDLE_ENFORCE_CUDA_SUCCESS(value, msg); PADDLE_ENFORCE_CUDA_SUCCESS(value);
return true; return true;
} }
template <typename T> template <typename T>
bool CheckCudaStatusFailure( bool CheckCudaStatusFailure(T value, const std::string& msg) {
T value, const std::string& msg = "self-defined cuda status failed") {
try { try {
PADDLE_ENFORCE_CUDA_SUCCESS(value, msg); PADDLE_ENFORCE_CUDA_SUCCESS(value);
return false; return false;
} catch (paddle::platform::EnforceNotMet& error) { } catch (paddle::platform::EnforceNotMet& error) {
std::string ex_msg = error.what(); std::string ex_msg = error.what();
...@@ -279,24 +278,31 @@ bool CheckCudaStatusFailure( ...@@ -279,24 +278,31 @@ bool CheckCudaStatusFailure(
TEST(enforce, cuda_success) { TEST(enforce, cuda_success) {
EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess)); EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue)); EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error"));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation)); EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "Cuda error"));
int count;
PADDLE_ENFORCE(cudaGetDeviceCount(&count));
EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS)); EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS));
EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH)); EXPECT_TRUE(
EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED)); CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "Curand error"));
EXPECT_TRUE(
CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "Curand error"));
EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS)); EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS));
EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED)); EXPECT_TRUE(
EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED)); CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "Cudnn error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "Cudnn error"));
EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS)); EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS));
EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED)); EXPECT_TRUE(
EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE)); CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "Cublas error"));
EXPECT_TRUE(
CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "Cublas error"));
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Nccl error"));
EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError)); EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Nccl error"));
#endif #endif
} }
#endif #endif
......
...@@ -16,7 +16,6 @@ limitations under the License. */ ...@@ -16,7 +16,6 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <cstdlib> #include <cstdlib>
#include <memory> #include <memory>
#include <string>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
...@@ -42,18 +41,13 @@ faster way to query device properties. You can see details in ...@@ -42,18 +41,13 @@ faster way to query device properties. You can see details in
https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/ https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
*/ */
inline std::string CudaErrorWebsite() {
return "Please see detail in https://docs.nvidia.com/cuda/cuda-runtime-api"
"/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c217824"
"6db0a94a430e0038";
}
static int GetCUDADeviceCountImpl() { static int GetCUDADeviceCountImpl() {
int driverVersion = 0; int driverVersion = 0;
cudaError_t status = cudaDriverGetVersion(&driverVersion); cudaError_t status = cudaDriverGetVersion(&driverVersion);
if (!(status == cudaSuccess && driverVersion != 0)) { if (!(status == cudaSuccess && driverVersion != 0)) {
// No GPU driver // No GPU driver
VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!";
return 0; return 0;
} }
...@@ -67,14 +61,8 @@ static int GetCUDADeviceCountImpl() { ...@@ -67,14 +61,8 @@ static int GetCUDADeviceCountImpl() {
return 0; return 0;
} }
} }
int count; int count;
auto error_code = cudaGetDeviceCount(&count); PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDeviceCount(&count));
PADDLE_ENFORCE(
error_code,
"cudaGetDeviceCount failed in "
"paddle::platform::GetCUDADeviceCountImpl, error code : %d, %s",
error_code, CudaErrorWebsite());
return count; return count;
} }
...@@ -84,72 +72,63 @@ int GetCUDADeviceCount() { ...@@ -84,72 +72,63 @@ int GetCUDADeviceCount() {
} }
int GetCUDAComputeCapability(int id) { int GetCUDAComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int major, minor; int major, minor;
auto major_error_code = auto major_error_code =
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
auto minor_error_code = auto minor_error_code =
cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id); cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_CUDA_SUCCESS(major_error_code);
major_error_code, 0, PADDLE_ENFORCE_CUDA_SUCCESS(minor_error_code);
"cudaDevAttrComputeCapabilityMajor failed in "
"paddle::platform::GetCUDAComputeCapability, error code : %d, %s",
major_error_code, CudaErrorWebsite());
PADDLE_ENFORCE_EQ(
minor_error_code, 0,
"cudaDevAttrComputeCapabilityMinor failed in "
"paddle::platform::GetCUDAComputeCapability, error code : %d, %s",
minor_error_code, CudaErrorWebsite());
return major * 10 + minor; return major * 10 + minor;
} }
dim3 GetGpuMaxGridDimSize(int id) { dim3 GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
dim3 ret; dim3 ret;
int size; int size;
auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
PADDLE_ENFORCE_EQ(error_code_x, 0, PADDLE_ENFORCE_CUDA_SUCCESS(error_code_x);
"cudaDevAttrMaxGridDimX failed in "
"paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
error_code_x, CudaErrorWebsite());
ret.x = size; ret.x = size;
auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
PADDLE_ENFORCE_EQ(error_code_y, 0, PADDLE_ENFORCE_CUDA_SUCCESS(error_code_y);
"cudaDevAttrMaxGridDimY failed in "
"paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
error_code_y, CudaErrorWebsite());
ret.y = size; ret.y = size;
auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
PADDLE_ENFORCE_EQ(error_code_z, 0, PADDLE_ENFORCE_CUDA_SUCCESS(error_code_z);
"cudaDevAttrMaxGridDimZ failed in "
"paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
error_code_z, CudaErrorWebsite());
ret.z = size; ret.z = size;
return ret; return ret;
} }
int GetCUDARuntimeVersion(int id) { int GetCUDARuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int runtime_version = 0; int runtime_version = 0;
auto error_code = cudaRuntimeGetVersion(&runtime_version); PADDLE_ENFORCE_CUDA_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
PADDLE_ENFORCE(error_code,
"cudaRuntimeGetVersion failed in "
"paddle::platform::GetCUDARuntimeVersion, error code : %d, %s",
error_code, CudaErrorWebsite());
return runtime_version; return runtime_version;
} }
int GetCUDADriverVersion(int id) { int GetCUDADriverVersion(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int driver_version = 0; int driver_version = 0;
auto error_code = cudaDriverGetVersion(&driver_version); PADDLE_ENFORCE_CUDA_SUCCESS(cudaDriverGetVersion(&driver_version));
PADDLE_ENFORCE(error_code,
"cudaDriverGetVersion failed in "
"paddle::platform::GetCUDADriverVersion, error code : %d, %s",
error_code, CudaErrorWebsite());
return driver_version; return driver_version;
} }
...@@ -164,56 +143,44 @@ bool TensorCoreAvailable() { ...@@ -164,56 +143,44 @@ bool TensorCoreAvailable() {
} }
int GetCUDAMultiProcessors(int id) { int GetCUDAMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int count; int count;
auto error_code = PADDLE_ENFORCE_CUDA_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id); cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
PADDLE_ENFORCE(error_code,
"cudaDeviceGetAttribute failed in "
"paddle::platform::GetCUDAMultiProcess, error code : %d, %s",
error_code, CudaErrorWebsite());
return count; return count;
} }
int GetCUDAMaxThreadsPerMultiProcessor(int id) { int GetCUDAMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int count; int count;
auto error_code = cudaDeviceGetAttribute( PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id); &count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
PADDLE_ENFORCE(
error_code,
"cudaDeviceGetAttribute failed in paddle::"
"platform::GetCUDAMaxThreadsPerMultiProcessor, error code : %d, %s",
error_code, CudaErrorWebsite());
return count; return count;
} }
int GetCUDAMaxThreadsPerBlock(int id) { int GetCUDAMaxThreadsPerBlock(int id) {
PADDLE_ENFORCE_LT( PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
id, GetCUDADeviceCount(), platform::errors::InvalidArgument(
platform::errors::InvalidArgument( "Device id must be less than GPU count, "
"Device id must less than GPU count, but received id is:%d, " "but received id is: %d. GPU count is: %d.",
"GPU count is: %d.", id, GetCUDADeviceCount()));
id, GetCUDADeviceCount()));
int count; int count;
auto error_code = PADDLE_ENFORCE_CUDA_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id); cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id));
PADDLE_ENFORCE_EQ(
error_code, 0,
platform::errors::InvalidArgument(
"cudaDeviceGetAttribute returned error code should be 0, "
"but received error code is: %d, %s",
error_code, CudaErrorWebsite()));
return count; return count;
} }
int GetCurrentDeviceId() { int GetCurrentDeviceId() {
int device_id; int device_id;
auto error_code = cudaGetDevice(&device_id); PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDevice(&device_id));
PADDLE_ENFORCE(error_code,
"cudaGetDevice failed in "
"paddle::platform::GetCurrentDeviceId, error code : %d, %s",
error_code, CudaErrorWebsite());
return device_id; return device_id;
} }
...@@ -237,12 +204,12 @@ std::vector<int> GetSelectedDevices() { ...@@ -237,12 +204,12 @@ std::vector<int> GetSelectedDevices() {
void SetDeviceId(int id) { void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count // TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
auto error_code = cudaSetDevice(id); platform::errors::InvalidArgument(
PADDLE_ENFORCE(error_code, "Device id must be less than GPU count, "
"cudaSetDevice failed in " "but received id is: %d. GPU count is: %d.",
"paddle::platform::SetDeviced, error code : %d, %s", id, GetCUDADeviceCount()));
error_code, CudaErrorWebsite()); PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(id));
} }
void GpuMemoryUsage(size_t *available, size_t *total) { void GpuMemoryUsage(size_t *available, size_t *total) {
...@@ -306,74 +273,44 @@ size_t GpuMaxChunkSize() { ...@@ -306,74 +273,44 @@ size_t GpuMaxChunkSize() {
void GpuMemcpyAsync(void *dst, const void *src, size_t count, void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream) { enum cudaMemcpyKind kind, cudaStream_t stream) {
auto error_code = cudaMemcpyAsync(dst, src, count, kind, stream); PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream));
PADDLE_ENFORCE(error_code,
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync "
"(%p -> %p, length: %d) error code : %d, %s",
src, dst, static_cast<int>(count), error_code,
CudaErrorWebsite());
} }
void GpuMemcpySync(void *dst, const void *src, size_t count, void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind) { enum cudaMemcpyKind kind) {
auto error_code = cudaMemcpy(dst, src, count, kind); PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(dst, src, count, kind));
PADDLE_ENFORCE(error_code,
"cudaMemcpy failed in paddle::platform::GpuMemcpySync "
"(%p -> %p, length: %d) error code : %d, %s",
src, dst, static_cast<int>(count), error_code,
CudaErrorWebsite());
} }
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, cudaStream_t stream) { int src_device, size_t count, cudaStream_t stream) {
auto error_code = PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream); cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
PADDLE_ENFORCE(
error_code,
"cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync "
"error code : %d, %s",
error_code, CudaErrorWebsite());
} }
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) { int src_device, size_t count) {
auto error_code = cudaMemcpyPeer(dst, dst_device, src, src_device, count); PADDLE_ENFORCE_CUDA_SUCCESS(
PADDLE_ENFORCE(error_code, cudaMemcpyPeer(dst, dst_device, src, src_device, count));
"cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync "
"error code : %d, %s",
error_code, CudaErrorWebsite());
} }
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
auto error_code = cudaMemsetAsync(dst, value, count, stream); PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(dst, value, count, stream));
PADDLE_ENFORCE(error_code,
"cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync "
"error code : %d, %s",
error_code, CudaErrorWebsite());
} }
void GpuStreamSync(cudaStream_t stream) { void GpuStreamSync(cudaStream_t stream) {
auto error_code = cudaStreamSynchronize(stream); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
PADDLE_ENFORCE_CUDA_SUCCESS(
error_code,
platform::errors::External(
"cudaStreamSynchronize failed in paddle::platform::GpuStreamSync "
"error code : %d, %s",
error_code, CudaErrorWebsite()));
} }
static void RaiseNonOutOfMemoryError(cudaError_t *status) { static void RaiseNonOutOfMemoryError(cudaError_t *status) {
if (*status == cudaErrorMemoryAllocation) { if (*status == cudaErrorMemoryAllocation) {
*status = cudaSuccess; *status = cudaSuccess;
} }
PADDLE_ENFORCE_CUDA_SUCCESS(*status); PADDLE_ENFORCE_CUDA_SUCCESS(*status);
*status = cudaGetLastError(); *status = cudaGetLastError();
if (*status == cudaErrorMemoryAllocation) { if (*status == cudaErrorMemoryAllocation) {
*status = cudaSuccess; *status = cudaSuccess;
} }
PADDLE_ENFORCE_CUDA_SUCCESS(*status); PADDLE_ENFORCE_CUDA_SUCCESS(*status);
} }
...@@ -450,8 +387,7 @@ class RecordedCudaMallocHelper { ...@@ -450,8 +387,7 @@ class RecordedCudaMallocHelper {
CUDADeviceGuard guard(dev_id_); CUDADeviceGuard guard(dev_id_);
auto err = cudaFree(ptr); auto err = cudaFree(ptr);
if (err != cudaErrorCudartUnloading) { if (err != cudaErrorCudartUnloading) {
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(err);
err, platform::errors::External("cudaFree raises unexpected error"));
if (NeedRecord()) { if (NeedRecord()) {
std::lock_guard<std::mutex> guard(*mtx_); std::lock_guard<std::mutex> guard(*mtx_);
cur_size_ -= size; cur_size_ -= size;
......
...@@ -117,10 +117,7 @@ void SynchronizeAllDevice() { ...@@ -117,10 +117,7 @@ void SynchronizeAllDevice() {
int count = GetCUDADeviceCount(); int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
SetDeviceId(i); SetDeviceId(i);
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize());
cudaDeviceSynchronize(),
platform::errors::External(
"Device synchronize failed in cudaDeviceSynchronize()"));
} }
#endif #endif
} }
......
...@@ -213,12 +213,14 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: ...@@ -213,12 +213,14 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
# the prefix is sys.prefix which should always be usr # the prefix is sys.prefix which should always be usr
paddle_bins = '' paddle_bins = ''
if not '${WIN32}': if not '${WIN32}':
paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]} package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]}
if '${HAS_NOAVX_CORE}' == 'ON': if '${HAS_NOAVX_CORE}' == 'ON':
package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')] package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')]
package_dir={ package_dir={
'': '${PADDLE_BINARY_DIR}/python', '': '${PADDLE_BINARY_DIR}/python',
# The paddle.fluid.proto will be generated while compiling. # The paddle.fluid.proto will be generated while compiling.
...@@ -329,6 +331,7 @@ headers = ( ...@@ -329,6 +331,7 @@ headers = (
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) +
list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) + list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) +
list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) + list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) +
list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}')) + # cudaErrorMessage.pb for CUDA error messages
['${EIGEN_INCLUDE_DIR}/Eigen/Core'] + # eigen ['${EIGEN_INCLUDE_DIR}/Eigen/Core'] + # eigen
list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) + # eigen list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) + # eigen
list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) + # eigen list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) + # eigen
...@@ -400,7 +403,9 @@ class InstallHeaders(Command): ...@@ -400,7 +403,9 @@ class InstallHeaders(Command):
return self.copy_file(header, install_dir) return self.copy_file(header, install_dir)
def run(self): def run(self):
# on Mac or Windows, only copy the third_party cudaErrorMessage.pb used for CUDA error messages
if os.name == 'nt' or sys.platform == 'darwin': if os.name == 'nt' or sys.platform == 'darwin':
self.mkdir_and_copy_file('${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb')
return return
hdrs = self.distribution.headers hdrs = self.distribution.headers
if not hdrs: if not hdrs:
......
...@@ -172,8 +172,8 @@ if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then ...@@ -172,8 +172,8 @@ if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
check_approval 1 6836917 47554610 22561442 check_approval 1 6836917 47554610 22561442
fi fi
ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true` ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true`
VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true` VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true`
INVALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" |grep -vxF "$VALID_PADDLE_CHECK" || true` INVALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" |grep -vxF "$VALID_PADDLE_CHECK" || true`
if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
echo_line="The error message you wrote in PADDLE_ENFORCE{_**} or PADDLE_THROW does not meet our error message writing specification. Possible errors include 1. the error message is empty / 2. the error message is too short / 3. the error type is not specified. Please read the specification [ https://github.com/PaddlePaddle/Paddle/wiki/Paddle-Error-Message-Writing-Specification ], then refine the error message. If it is a mismatch, please specify chenwhql (Recommend), luotao1 or lanxianghit review and approve.\nThe PADDLE_ENFORCE{_**} or PADDLE_THROW entries that do not meet the specification are as follows:\n${INVALID_PADDLE_CHECK}\n" echo_line="The error message you wrote in PADDLE_ENFORCE{_**} or PADDLE_THROW does not meet our error message writing specification. Possible errors include 1. the error message is empty / 2. the error message is too short / 3. the error type is not specified. Please read the specification [ https://github.com/PaddlePaddle/Paddle/wiki/Paddle-Error-Message-Writing-Specification ], then refine the error message. If it is a mismatch, please specify chenwhql (Recommend), luotao1 or lanxianghit review and approve.\nThe PADDLE_ENFORCE{_**} or PADDLE_THROW entries that do not meet the specification are as follows:\n${INVALID_PADDLE_CHECK}\n"
......
#!/bin/bash #!/bin/bash
ALL_PADDLE_CHECK=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" ../paddle/fluid || true` ALL_PADDLE_CHECK=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" ../paddle/fluid || true`
ALL_PADDLE_CHECK_CNT=`echo "$ALL_PADDLE_CHECK" | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true` ALL_PADDLE_CHECK_CNT=`echo "$ALL_PADDLE_CHECK" | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true`
VALID_PADDLE_CHECK_CNT=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true` VALID_PADDLE_CHECK_CNT=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true`
echo "----------------------------" echo "----------------------------"
echo "PADDLE ENFORCE & THROW COUNT" echo "PADDLE ENFORCE & THROW COUNT"
......
Usage:
Please run:
```
bash start.sh
```
The error messages of CUDA 9.0 / CUDA 10.0 / the latest CUDA version will be crawled by default.
If you want to crawl a specific CUDA version, please run:
```
bash start.sh <version> <URL(optional)>
```
The URL can be derived from the version, so you don't have to enter one.
For example:
```
bash start.sh 11.0
```
This will capture the error messages of CUDA 11.0 (once released).
Every time Nvidia upgrades the CUDA major version, you need to run `bash start.sh` in the current directory and upload cudaErrorMessage.tar.gz to https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ssl
import re
import urllib2
import json
import collections
import sys, getopt
import cuda_error_pb2
def parsing(cuda_errorDesc, version, url):
All_Messages = cuda_errorDesc.AllMessages.add()
All_Messages.version = int(version)
ssl._create_default_https_context = ssl._create_unverified_context
html = urllib2.urlopen(url).read()
res_div = r'<div class="section">.*?<p>CUDA error types </p>.*?</div>.*?<div class="enum-members">(.*?)</div>'
m_div = re.findall(res_div, html, re.S | re.M)
url_list = url.split('/')
url_prefix = '/'.join(url_list[0:url_list.index('cuda-runtime-api') + 1])
dic = collections.OrderedDict()
dic_message = collections.OrderedDict()
for line in m_div:
res_dt = r'<dt>(.*?)</dt>.*?<dd>(.*?)</dd>'
m_dt = re.findall(res_dt, line, re.S | re.M)
for error in m_dt:
res_type = r'<span class="ph ph apiData">(.*?)</span>'
m_type = re.findall(res_type, error[0], re.S | re.M)[0]
m_message = error[1]
m_message = m_message.replace('\n', '')
res_a = r'(<a class=.*?</a>)'
res_shape = r'<a class=.*?>(.*?)</a>'
list_a = re.findall(res_a, m_message, re.S | re.M)
list_shape = re.findall(res_shape, m_message, re.S | re.M)
assert len(list_a) == len(list_shape)
for idx in range(len(list_a)):
m_message = m_message.replace(list_a[idx], list_shape[idx])
m_message = m_message.replace(
'<h6 class=\"deprecated_header\">Deprecated</h6>', '')
res_span = r'(<span class=.*?</span>)'
res_span_detail = r'<span class=.*?>(.*?)</span>'
list_span = re.findall(res_span, m_message, re.S | re.M)
list_span_detail = re.findall(res_span_detail, m_message, re.S |
re.M)
assert len(list_span) == len(list_span_detail)
for idx in range(len(list_span)):
m_message = m_message.replace(list_span[idx],
list_span_detail[idx])
res_p = r'(<p>.*?</p>)'
res_p_detail = r'<p>(.*?)</p>'
list_p = re.findall(res_p, m_message, re.S | re.M)
list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M)
assert len(list_p) == len(list_p_detail)
for idx in range(len(list_p)):
m_message = m_message.replace(list_p[idx], list_p_detail[idx])
m_message = m_message.replace(' ', '')
_Messages = All_Messages.Messages.add()
try:
_Messages.errorCode = int(m_type)
except ValueError:
if re.match('0x', m_type):
_Messages.errorCode = int(m_type, 16)
else:
raise ValueError
_Messages.errorMessage = m_message # save for cudaErrorMessage.pb from python-protobuf interface
def main(argv):
version = []
url = []
try:
opts, args = getopt.getopt(argv, "hv:u:", ["help", "version=", "url="])
except getopt.GetoptError:
print 'python spider.py -v <version1,version2,...,> -u <url1,url2,...,>'
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
print 'python spider.py -v <version1,version2,...,> -u <url1,url2,...,>'
sys.exit()
elif opt in ("-v", "--version"):
version = arg
elif opt in ("-u", "--url"):
url = arg
version = version.split(',')
url = url.split(',')
assert len(version) == len(url)
cuda_errorDesc = cuda_error_pb2.cudaerrorDesc()
for idx in range(len(version)):
if version[idx] == "-1":
print("crawling errorMessage for CUDA%s from %s" %
("-latest-version", url[idx]))
else:
print("crawling errorMessage for CUDA%s from %s" %
(version[idx], url[idx]))
parsing(cuda_errorDesc, version[idx], url[idx])
serializeToString = cuda_errorDesc.SerializeToString()
with open("cudaErrorMessage.pb", "wb") as f:
f.write(serializeToString
) # save for cudaErrorMessage.pb from python-protobuf interface
print("crawling errorMessage for CUDA has been done!!!")
if __name__ == "__main__":
main(sys.argv[1:])
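For completeness, a small consumer sketch showing how the generated cudaErrorMessage.pb can be read back from C++. The include path and the `paddle::platform::proto` namespace are assumptions inferred from the enforce.h hunk earlier in this diff, which uses the same `allmessages`/`messages` accessors:
```
// Hedged sketch: dump what the crawler wrote into cudaErrorMessage.pb.
// Assumes the classes generated from cuda_error.proto; the field accessors
// match the ones used in build_nvidia_error_msg() earlier in this diff.
#include <fstream>
#include <iostream>

#include "paddle/fluid/platform/cuda_error.pb.h"  // assumed generated header

int main() {
  paddle::platform::proto::cudaerrorDesc desc;
  std::ifstream fin("cudaErrorMessage.pb", std::ios::in | std::ios::binary);
  if (!desc.ParseFromIstream(&fin)) {
    std::cerr << "failed to parse cudaErrorMessage.pb" << std::endl;
    return 1;
  }
  for (int i = 0; i < desc.allmessages_size(); ++i) {
    std::cout << "CUDA version tag: " << desc.allmessages(i).version() << "\n";
    for (int j = 0; j < desc.allmessages(i).messages_size(); ++j) {
      const auto& m = desc.allmessages(i).messages(j);
      std::cout << "  code " << m.errorcode() << ": " << m.errormessage()
                << "\n";
    }
  }
  return 0;
}
```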
#!/usr/bin/env bash
set -ex
SYSTEM=`uname -s`
rm -f protoc-3.11.3-linux-x86_64.*
if [ "$SYSTEM" == "Linux" ]; then
wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v3.11.3/protoc-3.11.3-linux-x86_64.zip
unzip -d protobuf -o protoc-3.11.3-linux-x86_64.zip
rm protoc-3.11.3-linux-x86_64.*
elif [ "$SYSTEM" == "Darwin" ]; then
wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v3.11.3/protoc-3.11.3-osx-x86_64.zip
unzip -d protobuf -o protoc-3.11.3-osx-x86_64.zip
rm protoc-3.11.3-osx-x86_64.*
else
echo "please run on Mac/Linux"
exit 1
fi
protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/cuda_error.proto
version=90,100,-1 # -1 represents the latest CUDA version
url=https://docs.nvidia.com/cuda/archive/9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/archive/10.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
if [ "$1" != "" ]; then
version=$version,$(($1*10))
if [ "$2" != "" ]; then
url=$url,$2
else
url=$url,https://docs.nvidia.com/cuda/archive/$1/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
fi
fi
python spider.py --version=$version --url=$url
tar czf cudaErrorMessage.tar.gz cudaErrorMessage.pb