Unverified commit 3f4678c9 authored by Zhou Wei, committed by GitHub

[cherry-pick2.0] Optimize the error messages of paddle CUDA API (#23849)

* cherry-pick: optimize the error messages of paddle CUDA API

* fix the error messages of paddle CUDA API

* Refactor PADDLE_ENFORCE_CUDA_SUCCESS and apply it to curand/cudnn/cublas/NCCL

* remove build_ex_string
Parent 30e4cacd
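The call-site change is mechanical throughout this diff: PADDLE_ENFORCE_CUDA_SUCCESS now takes only the status expression and composes the error message itself. A minimal before/after sketch (the buffer names here are illustrative, not from the diff):

// Before: every call site carried its own ad-hoc message.
PADDLE_ENFORCE_CUDA_SUCCESS(
    cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream),
    platform::errors::External("Async cudaMemcpy failed."));

// After: the macro derives a uniform message (error code, the string from
// cudaGetErrorString, plus an advise line) from the status value alone.
PADDLE_ENFORCE_CUDA_SUCCESS(
    cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream));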
......@@ -135,6 +135,12 @@ copy(inference_lib_dist
SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
DSTS ${dst_dir})
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
copy(inference_lib_dist
SRCS ${cudaerror_INCLUDE_DIR}
DSTS ${dst_dir})
# CMakeCache Info
copy(inference_lib_dist
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR})
......@@ -184,7 +190,7 @@ copy(fluid_lib_dist
)
set(module "framework")
set(framework_lib_deps framework_proto)
set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto)
add_dependencies(fluid_lib_dist ${framework_lib_deps})
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
......@@ -204,11 +210,11 @@ copy(fluid_lib_dist
)
set(module "platform")
set(platform_lib_deps profiler_proto)
set(platform_lib_deps profiler_proto error_codes_proto cuda_error_proto)
add_dependencies(fluid_lib_dist ${platform_lib_deps})
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/error_codes.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module}
)
set(module "string")
......@@ -249,6 +255,7 @@ copy(inference_lib_dist
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
# CMakeCache Info
copy(fluid_lib_dist
SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
include(ExternalProject)
# Create a target named "third_party", which can compile external dependencies on all platforms (Windows/Linux/Mac)
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
......@@ -21,6 +22,7 @@ set(THIRD_PARTY_CACHE_PATH "${CMAKE_SOURCE_DIR}" CACHE STRING
"A path cache third party source code to avoid repeated download.")
set(THIRD_PARTY_BUILD_TYPE Release)
set(third_party_deps)
# cache function to avoid repeated downloads of third_party code.
# This function has 4 parameters, URL / REPOSITORY / TAG / DIR:
......@@ -100,6 +102,32 @@ MACRO(UNSET_VAR VAR_NAME)
UNSET(${VAR_NAME})
ENDMACRO()
# Function to download the dependencies during compilation
# This function has 2 parameters, URL / NAME:
# 1. URL: The download URL of the third-party dependency
# 2. NAME: The name of the file, which determines the dirname
#
MACRO(file_download_and_uncompress URL NAME)
MESSAGE(STATUS "Download dependency[${NAME}] from ${URL}")
SET(EXTERNAL_PROJECT_NAME "extern_download_${NAME}")
SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data)
ExternalProject_Add(
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${THIRD_PARTY_PATH}/${NAME}
URL ${URL}
DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/
SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND ""
)
list(APPEND third_party_deps ${EXTERNAL_PROJECT_NAME})
ENDMACRO()
# Correct flags on different platforms (WIN/MAC) and print warning messages
if (APPLE)
if(WITH_MKL)
......@@ -178,10 +206,13 @@ include(external/dlpack) # download dlpack
include(external/xxhash) # download, build, install xxhash
include(external/warpctc) # download, build, install warpctc
set(third_party_deps)
list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool)
# download file
set(CUDAERROR_URL "https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
file_download_and_uncompress(${CUDAERROR_URL} "cudaerror")
if(WITH_AMD_GPU)
include(external/rocprim) # download, build, install rocprim
list(APPEND third_party_deps extern_rocprim)
......@@ -274,4 +305,4 @@ if (WITH_LITE)
include(external/lite)
endif (WITH_LITE)
add_custom_target(third_party DEPENDS ${third_party_deps})
add_custom_target(third_party ALL DEPENDS ${third_party_deps})
......@@ -152,9 +152,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
cudaMemcpyHostToDevice, dev_ctx->stream()),
platform::errors::External(
"Async cudaMemcpy op_var info to gpu failed."));
cudaMemcpyHostToDevice, dev_ctx->stream()));
} else { // get
auto iter = op_var2gpu_str.find(op_var);
PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true,
......
......@@ -124,12 +124,9 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(output_ptrs, h_odatas,
d_output_ptrs_.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream),
platform::errors::External(
"CUDA Memcpy failed during split plugin run."));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream));
int outer_rows = outer_rows_ * batchSize;
......@@ -244,12 +241,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(output_ptrs, h_odatas,
d_output_ptrs.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream),
platform::errors::External(
"CUDA Memcpy failed during split plugin run."));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream));
split_kernel<<<grid, block, 0, stream>>>(
d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
......@@ -263,12 +257,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
half* const* h_odatas = reinterpret_cast<half* const*>(outputs);
half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(output_ptrs, h_odatas,
d_output_ptrs.size() * sizeof(half*),
cudaMemcpyHostToDevice, stream),
platform::errors::External(
"CUDA Memcpy failed during split plugin run."));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*),
cudaMemcpyHostToDevice, stream));
split_kernel<<<grid, block, 0, stream>>>(
d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
......
......@@ -80,17 +80,13 @@ class CUDADeviceContextAllocator : public Allocator {
: place_(place), default_stream_(default_stream) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreate(&event_, cudaEventDisableTiming),
platform::errors::External(
"Create event failed in CUDADeviceContextAllocator"));
cudaEventCreate(&event_, cudaEventDisableTiming));
}
~CUDADeviceContextAllocator() {
if (event_) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventDestroy(event_),
"Destory event failed in CUDADeviceContextAllocator destroctor");
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
}
}
......@@ -103,12 +99,9 @@ class CUDADeviceContextAllocator : public Allocator {
auto allocation =
new CUDADeviceContextAllocation(memory::Alloc(place_, size));
// Wait for the event on stream
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(event_, default_stream_),
"Failed to record event in CUDADeviceContextAllocator");
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(default_stream_, event_, 0),
"Failed to wait event in CUDADeviceContextAllocator");
cudaStreamWaitEvent(default_stream_, event_, 0));
return allocation;
}
......
......@@ -141,12 +141,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
cu_stream);
}
PADDLE_ENFORCE_CUDA_SUCCESS(
err,
"ArgSortOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate"
"temp_storage_bytes, status:%s.",
temp_storage_bytes, cudaGetErrorString(err));
PADDLE_ENFORCE_CUDA_SUCCESS(err);
Tensor temp_storage;
temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
......@@ -165,12 +160,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
cu_stream);
}
PADDLE_ENFORCE_CUDA_SUCCESS(
err,
"ArgSortOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
"temp_storage_bytes:%d status:%s.",
temp_storage_bytes, cudaGetErrorString(err));
PADDLE_ENFORCE_CUDA_SUCCESS(err);
}
template <typename T, typename IndType>
......
......@@ -167,13 +167,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
conv_desc.descriptor<T>(padding_common, strides, dilations);
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetConvolutionGroupCount(cudnn_conv_desc,
groups),
platform::errors::External(
"Call of cudnnSetConvolutionGroupCount(cudnn_conv_desc, groups) "
"failed, where cudnn_conv_desc is configured: padding = [%s], "
"strides = [%s], dilations = [%s]; groups = %d",
framework::make_ddim(padding_common), framework::make_ddim(strides),
framework::make_ddim(dilations), groups));
groups));
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize<int>(transformed_input.dims()));
......@@ -204,15 +198,8 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetConvolutionMathType(cudnn_conv_desc,
CUDNN_DEFAULT_MATH),
platform::errors::External(
"Call of cudnnSetConvolutionMathType(cudnn_conv_desc, "
"CUDNN_DEFAULT_MATH) failed, where cudnn_conv_desc is configured: "
"padding = %d, strides = %d, dilations = %d.",
framework::make_ddim(padding_common), framework::make_ddim(strides),
framework::make_ddim(dilations)));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
auto x_dims = framework::vectorize(transformed_input.dims());
auto f_dims = framework::vectorize(filter->dims());
......@@ -221,9 +208,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo),
platform::errors::External(
"Call of cudnnGetConvolutionForwardAlgorithm failed."));
workspace_size_limit, &algo));
VLOG(3) << "cuDNN forward algo " << algo;
} else {
std::function<cudnnConvolutionFwdAlgo_t()> search_func =
......@@ -237,9 +222,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
handle, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit),
platform::errors::External(
"Call of cudnnFindConvolutionForwardAlgorithmEx failed."));
fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
};
workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)";
......@@ -273,9 +256,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes),
platform::errors::External(
"Call of cudnnGetConvolutionForwardWorkspaceSize failed."));
cudnn_output_desc, algo, &workspace_size_in_bytes));
PADDLE_ENFORCE_LE(
workspace_size_in_bytes, workspace_size_limit,
platform::errors::InvalidArgument(
......@@ -292,20 +273,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
// ------------- cudnn conv forward and bias add ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnConvolutionForward(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data),
platform::errors::External(
"Call of cudnnConvolutionForward failed."));
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnAddTensor(handle, &alpha, cudnn_bias_desc,
bias_data, &alpha,
cudnn_output_desc, output_data),
platform::errors::External("Call of cudnnAddTensor failed."));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnAddTensor(
handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
output_data));
} else {
if (activation == "identity") {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
......@@ -320,9 +296,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_filter_desc, filter_data, cudnn_conv_desc, algo,
cudnn_workspace, workspace_size_in_bytes, &alpha2,
cudnn_output_desc, residual_data, cudnn_bias_desc, bias_data,
cudnn_act_desc, cudnn_output_desc, output_data),
platform::errors::External(
"Call of cudnnConvolutionBiasActivationForward failed."));
cudnn_act_desc, cudnn_output_desc, output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
}
......
......@@ -108,32 +108,21 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&data_desc_)."));
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&bn_param_desc_)."));
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
VLOG(3) << "Setting descriptors.";
std::vector<int> dims = {N, C, H, W, D};
std::vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetTensorNdDescriptor(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
platform::errors::External(
"The error has happened when calling cudnnSetTensorNdDescriptor."));
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
data_desc_, mode_),
platform::errors::External("The error has happened when calling "
"cudnnDeriveBNTensorDescriptor."));
data_desc_, mode_));
double this_factor = 1. - momentum;
cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
......@@ -166,10 +155,7 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
/*yDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/activation_desc_,
/*sizeInBytes=*/&workspace_size),
platform::errors::External(
"The error has happened when calling "
"cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize."));
/*sizeInBytes=*/&workspace_size));
// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_CUDA_SUCCESS(
......@@ -179,10 +165,7 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
/*bnOps=*/bnOps_,
/*activationDesc=*/activation_desc_,
/*xDesc=*/data_desc_,
/*sizeInBytes=*/&reserve_space_size),
platform::errors::External(
"The error has happened when calling "
"cudnnGetBatchNormalizationTrainingExReserveSpaceSize."));
/*sizeInBytes=*/&reserve_space_size));
reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(),
reserve_space_size);
......@@ -204,22 +187,13 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr,
reserve_space_size),
platform::errors::External(
"The error has happened when calling "
"cudnnBatchNormalizationForwardTrainingEx."));
reserve_space_size));
// clean when exit.
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(data_desc_)."));
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(bn_param_desc_)."));
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
}
};
......@@ -298,15 +272,9 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&data_desc_)."));
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnCreateTensorDescriptor(&bn_param_desc_)."));
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
......@@ -314,17 +282,12 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetTensorNdDescriptor(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()),
platform::errors::External(
"The error has happened when calling cudnnSetTensorNdDescriptor."));
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
data_desc_, mode_),
platform::errors::External("The error has happened when calling "
"cudnnDeriveBNTensorDescriptor."));
data_desc_, mode_));
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
......@@ -354,10 +317,7 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
/*dxDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/activation_desc_,
/*sizeInBytes=*/&workspace_size),
platform::errors::External(
"The error has happened when calling "
"cudnnGetBatchNormalizationBackwardExWorkspaceSize."));
/*sizeInBytes=*/&workspace_size));
workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
workspace_size);
......@@ -395,21 +355,13 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
/*workspace=*/workspace_ptr,
/*workSpaceSizeInBytes=*/workspace_size,
/*reserveSpace=*/const_cast<T *>(reserve_space->template data<T>()),
/*reserveSpaceSizeInBytes=*/reserve_space_size),
platform::errors::External("The error has happened when calling "
"cudnnBatchNormalizationBackwardEx."));
/*reserveSpaceSizeInBytes=*/reserve_space_size));
// clean when exit.
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(data_desc_)."));
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_),
platform::errors::External(
"The error has happened when calling "
"cudnnDestroyTensorDescriptor(bn_param_desc_)."));
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
}
};
......
......@@ -46,13 +46,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
cudnnTensorDescriptor_t in_desc;
cudnnTensorDescriptor_t out_desc;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&in_desc),
platform::errors::External("Create cudnn tensor descriptor failed in "
"transpose_flatten_concat_fusion op."));
platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&out_desc),
platform::errors::External("Create cudnn tensor descriptor failed in "
"transpose_flatten_concat_fusion op."));
platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
......@@ -91,24 +87,15 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
dims_y[i] = 1;
}
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetTensorNdDescriptor(
in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()),
platform::errors::External("Create cudnn tensorNd descriptor failed "
"in transpose_flatten_concat op."));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSetTensorNdDescriptor(
out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()),
platform::errors::External("Create cudnn tensorNd descriptor failed "
"in transpose_flatten_concat op."));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnTransformTensor(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor(
handle, CudnnDataType<T>::kOne(), in_desc,
static_cast<const void*>(ins[k]->data<T>()),
CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)),
platform::errors::External("Create cudnn transform tensor failed in "
"transpose_flatten_concat op."));
CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
if (concat_axis == 0) {
odata += osize;
} else {
......@@ -117,13 +104,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
}
}
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(in_desc),
platform::errors::External(
"Destory cudnn descriptor failed in transpose_flatten_concat op."));
platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(out_desc),
platform::errors::External(
"Destory cudnn descriptor failed in transpose_flatten_concat op."));
platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
}
};
......
......@@ -60,13 +60,10 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize<int>(output->dims()));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnSpatialTfSamplerForward(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward(
handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
output_data),
platform::errors::InvalidArgument(
"cudnnSpatialTfSamplerForward in Op(grid_sampler) failed"));
output_data));
}
};
......@@ -122,9 +119,7 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
output_grad_data, grid_data, CudnnDataType<T>::kZero(),
grid_grad_data),
platform::errors::InvalidArgument(
"cudnnSpatialTfSamplerBackward in Op(grid_sampler) failed"));
grid_grad_data));
}
};
......
......@@ -41,16 +41,12 @@ struct CUBlas<float> {
template <typename... ARGS>
static void SCAL(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasSscal(args...),
platform::errors::External("dynload cublasSscal lib failed"));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSscal(args...));
}
template <typename... ARGS>
static void VCOPY(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasScopy(args...),
platform::errors::External("dynload cublasScopy lib failed"));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasScopy(args...));
}
template <typename... ARGS>
......@@ -108,16 +104,12 @@ struct CUBlas<double> {
template <typename... ARGS>
static void SCAL(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasDscal(args...),
platform::errors::External("dynload cublasDscal lib failed"));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDscal(args...));
}
template <typename... ARGS>
static void VCOPY(ARGS... args) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cublasDcopy(args...),
platform::errors::External("dynload cublasDcopy lib failed"));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDcopy(args...));
}
template <typename... ARGS>
......
......@@ -59,20 +59,14 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, trans_x,
out_data, size_prob, stream);
PADDLE_ENFORCE_CUDA_SUCCESS(
err, platform::errors::External(
"MeanOP failed to get reduce workspace size %s.",
cudaGetErrorString(err)));
PADDLE_ENFORCE_CUDA_SUCCESS(err);
framework::Tensor tmp;
auto* temp_storage = tmp.mutable_data<uint8_t>(
framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
context.GetPlace());
err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, trans_x,
out_data, size_prob, stream);
PADDLE_ENFORCE_CUDA_SUCCESS(
err, platform::errors::External(
"MeanOP failed to run CUDA reduce computation: %s.",
cudaGetErrorString(err)));
PADDLE_ENFORCE_CUDA_SUCCESS(err);
}
};
......
......@@ -104,13 +104,9 @@ void BufferedReader::ReadAsync(size_t i) {
// gpu memory immediately without waiting gpu kernel ends
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(events_[i].get(), compute_stream_),
platform::errors::Fatal(
"cudaEventRecord raises unexpected exception"));
cudaEventRecord(events_[i].get(), compute_stream_));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0),
platform::errors::Fatal(
"cudaStreamWaitEvent raises unexpected exception"));
cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0));
platform::RecordEvent record_event("BufferedReader:MemoryCopy");
for (size_t i = 0; i < cpu.size(); ++i) {
......@@ -138,17 +134,11 @@ void BufferedReader::ReadAsync(size_t i) {
size);
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
cuda_pinned_place, cuda_pinned_ptr, size, stream_.get());
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamSynchronize(stream_.get()),
platform::errors::Fatal(
"cudaStreamSynchronize raises unexpected exception"));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get()));
}
gpu[i].set_lod(cpu[i].lod());
}
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamSynchronize(stream_.get()),
platform::errors::Fatal(
"cudaStreamSynchronize raises unexpected exception"));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get()));
}
#endif
return i;
......
......@@ -191,12 +191,9 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx,
if (comm) {
int dtype = platform::ToNCCLDataType(mean_out->type());
// In-place operation
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1,
static_cast<ncclDataType_t>(dtype),
ncclSum, comm, stream),
platform::errors::InvalidArgument(
"ncclAllReduce in Op(sync_batch_norm) failed"));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
comm, stream));
}
#endif
......@@ -468,12 +465,9 @@ void SyncBatchNormGradFunctor(
if (comm) {
int dtype = platform::ToNCCLDataType(scale->type());
// In-place operation
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1,
static_cast<ncclDataType_t>(dtype),
ncclSum, comm, stream),
platform::errors::InvalidArgument(
"ncclAllReduce in Op(sync_batch_norm) failed"));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
comm, stream));
}
#endif
......
proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
proto_library(error_codes_proto SRCS error_codes.proto)
proto_library(cuda_error_proto SRCS cuda_error.proto)
if (WITH_PYTHON)
py_proto_compile(profiler_py_proto SRCS profiler.proto)
......@@ -28,7 +28,7 @@ cc_library(flags SRCS flags.cc DEPS gflags)
cc_library(errors SRCS errors.cc DEPS error_codes_proto)
cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)
cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors)
cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors cuda_error_proto)
cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)
set(CPU_INFO_DEPS gflags glog enforce)
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.platform.proto;
message MessageDesc {
// Indicates the error code
required int32 errorCode = 1;
// Indicates the error message
required string errorMessage = 2;
}
message AllMessageDesc {
// Version of cuda API
required int32 version = 1;
// Error messages of different error types
repeated MessageDesc Messages = 2;
}
message cudaerrorDesc {
// Error messages of different cuda versions (9.0/10.0/10.2)
repeated AllMessageDesc AllMessages = 2;
}
\ No newline at end of file
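A minimal C++ sketch of reading this schema back, mirroring the lookup that build_nvidia_error_msg performs in enforce.h below (the path argument is illustrative; real callers locate cudaErrorMessage.pb relative to the installed module):

#include <fstream>
#include <string>

#include "paddle/fluid/platform/cuda_error.pb.h"

// Returns the advise string for a given CUDA version and error code,
// or an empty string when the file is missing or no entry matches.
std::string LookupAdvise(const std::string& pb_path, int version, int code) {
  paddle::platform::proto::cudaerrorDesc all;
  std::ifstream fin(pb_path, std::ios::in | std::ios::binary);
  if (!all.ParseFromIstream(&fin)) return "";
  for (int i = 0; i < all.allmessages_size(); ++i) {
    if (all.allmessages(i).version() != version) continue;
    for (int j = 0; j < all.allmessages(i).messages_size(); ++j) {
      if (all.allmessages(i).messages(j).errorcode() == code) {
        return all.allmessages(i).messages(j).errormessage();
      }
    }
  }
  return "";
}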
......@@ -29,14 +29,7 @@ namespace platform {
class CublasHandleHolder {
public:
CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cublasCreate(&handle_),
platform::errors::External(
"The cuBLAS library was not initialized. This is usually caused by "
"an error in the CUDA Runtime API called by the cuBLAS routine, or "
"an error in the hardware setup.\n"
"To correct: check that the hardware, an appropriate version of "
"the driver, and the cuBLAS library are correctly installed."));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
#if CUDA_VERSION >= 9000
if (math_type == CUBLAS_TENSOR_OP_MATH) {
......
......@@ -27,18 +27,13 @@ CudaStreamResourcePool::CudaStreamResourcePool() {
platform::SetDeviceId(dev_idx);
cudaStream_t stream;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking),
platform::errors::Fatal(
"cudaStreamCreateWithFlags raises unexpected exception"));
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
return stream;
};
auto deleter = [dev_idx](cudaStream_t stream) {
platform::SetDeviceId(dev_idx);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamDestroy(stream),
platform::errors::Fatal(
"cudaStreamDestroy raises unexpected exception"));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream));
};
pool_.emplace_back(
......@@ -72,18 +67,13 @@ CudaEventResourcePool::CudaEventResourcePool() {
platform::SetDeviceId(dev_idx);
cudaEvent_t event;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event, cudaEventDisableTiming),
platform::errors::Fatal(
"cudaEventCreateWithFlags raises unexpected exception"));
cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
return event;
};
auto deleter = [dev_idx](cudaEvent_t event) {
platform::SetDeviceId(dev_idx);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventDestroy(event),
platform::errors::Fatal(
"cudaEventDestroy raises unexpected exception"));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event));
};
pool_.emplace_back(ResourcePool<CudaEventObject>::Create(creator, deleter));
......
......@@ -278,12 +278,9 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
<< "Please recompile or reinstall Paddle with compatible CUDNN "
"version.";
}
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnCreate(&cudnn_handle_),
"Failed to create Cudnn handle in DeviceContext");
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnSetStream(cudnn_handle_, stream_),
"Failed to set stream for Cudnn handle in DeviceContext");
dynload::cudnnSetStream(cudnn_handle_, stream_));
} else {
cudnn_handle_ = nullptr;
}
......@@ -302,8 +299,7 @@ CUDADeviceContext::~CUDADeviceContext() {
eigen_device_.reset();
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
if (cudnn_handle_) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_),
"Failed to destory Cudnn handle");
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_));
}
#if defined(PADDLE_WITH_NCCL)
if (nccl_comm_) {
......@@ -325,10 +321,7 @@ void CUDADeviceContext::Wait() const {
}
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(
e_sync, platform::errors::Fatal(
"cudaStreamSynchronize raises error: %s, errono: %d",
cudaGetErrorString(e_sync), static_cast<int>(e_sync)));
PADDLE_ENFORCE_CUDA_SUCCESS(e_sync);
}
int CUDADeviceContext::GetComputeCapability() const {
......
......@@ -18,6 +18,13 @@ limitations under the License. */
#include <cxxabi.h> // for __cxa_demangle
#endif // __GNUC__
#if !defined(_WIN32)
#include <dlfcn.h> // dladdr
#else // _WIN32
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#include <windows.h> // GetModuleFileName
#endif
#ifdef PADDLE_WITH_CUDA
#include <cublas_v2.h>
#include <cudnn.h>
......@@ -38,6 +45,7 @@ limitations under the License. */
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "glog/logging.h"
#include "paddle/fluid/platform/cuda_error.pb.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/port.h"
......@@ -220,10 +228,6 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
inline bool is_error(bool stat) { return !stat; }
inline std::string build_ex_string(bool stat, const std::string& msg) {
return msg;
}
inline void throw_on_error(bool stat, const std::string& msg) {
#ifndef REPLACE_ENFORCE_GLOG
throw std::runtime_error(msg);
......@@ -291,9 +295,7 @@ struct EnforceNotMet : public std::exception {
try { \
::paddle::platform::throw_on_error( \
__cond__, \
::paddle::platform::build_ex_string( \
__cond__, \
::paddle::platform::ErrorSummary(__VA_ARGS__).ToString())); \
::paddle::platform::ErrorSummary(__VA_ARGS__).ToString()); \
} catch (...) { \
HANDLE_THE_ERROR \
throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
......@@ -464,30 +466,148 @@ struct EOFException : public std::exception {
} while (0)
/** CUDA PADDLE ENFORCE FUNCTIONS AND MACROS **/
#ifdef PADDLE_WITH_CUDA
/***** CUDA ERROR *****/
inline bool is_error(cudaError_t e) { return e != cudaSuccess; }
inline std::string build_ex_string(cudaError_t e, const std::string& msg) {
return msg;
inline std::string GetCudaErrorWebsite(int32_t cuda_version) {
std::ostringstream webstr;
webstr << "https://docs.nvidia.com/cuda/";
if (cuda_version != -1) {
double version = cuda_version / 10;
webstr << "archive/" << std::fixed << std::setprecision(1) << version;
}
webstr << "/cuda-runtime-api/group__CUDART__TYPES.html"
"#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038";
return webstr.str();
}
inline std::string build_nvidia_error_msg(cudaError_t e) {
#if CUDA_VERSION >= 10000 && CUDA_VERSION < 11000
int32_t cuda_version = 100;
#elif CUDA_VERSION >= 9000
int32_t cuda_version = 90;
#else
int32_t cuda_version = -1;
#endif
std::ostringstream sout;
sout << " Cuda error(" << e << "), " << cudaGetErrorString(e) << ".";
static platform::proto::cudaerrorDesc cudaerror;
static bool _initSucceed = false;
if (cudaerror.ByteSizeLong() == 0) {
std::string filePath;
#if !defined(_WIN32)
Dl_info info;
if (dladdr(reinterpret_cast<void*>(GetCudaErrorWebsite), &info)) {
std::string strModule(info.dli_fname);
const size_t last_slash_idx = strModule.find_last_of("/");
std::string compare_path = strModule.substr(strModule.length() - 6);
if (std::string::npos != last_slash_idx) {
strModule.erase(last_slash_idx, std::string::npos);
}
if (compare_path.compare("avx.so") == 0) {
filePath = strModule +
"/../include/third_party/cudaerror/data/cudaErrorMessage.pb";
} else {
filePath =
strModule + "/../../thirl_party/cudaerror/data/cudaErrorMessage.pb";
}
}
#else
char buf[100];
MEMORY_BASIC_INFORMATION mbi;
HMODULE h_module =
(::VirtualQuery(GetCudaErrorWebsite, &mbi, sizeof(mbi)) != 0)
? (HMODULE)mbi.AllocationBase
: NULL;
GetModuleFileName(h_module, buf, 100);
std::string strModule(buf);
const size_t last_slash_idx = strModule.find_last_of("\\");
std::string compare_path = strModule.substr(strModule.length() - 7);
if (std::string::npos != last_slash_idx) {
strModule.erase(last_slash_idx, std::string::npos);
}
if (compare_path.compare("avx.pyd") == 0) {
filePath =
strModule +
"\\..\\include\\third_party\\cudaerror\\data\\cudaErrorMessage.pb";
} else {
filePath =
strModule + "\\..\\third_party\\cudaerror\\data\\cudaErrorMessage.pb";
}
#endif
std::ifstream fin(filePath, std::ios::in | std::ios::binary);
_initSucceed = cudaerror.ParseFromIstream(&fin);
}
if (_initSucceed) {
for (int i = 0; i < cudaerror.allmessages_size(); ++i) {
if (cuda_version == cudaerror.allmessages(i).version()) {
for (int j = 0; j < cudaerror.allmessages(i).messages_size(); ++j) {
if (e == cudaerror.allmessages(i).messages(j).errorcode()) {
sout << "\n [Advise: "
<< cudaerror.allmessages(i).messages(j).errormessage() << "]";
return sout.str();
}
}
}
}
}
sout << "\n [Advise: Please search for the error code(" << e
<< ") on website( " << GetCudaErrorWebsite(cuda_version)
<< " ) to get Nvidia's official solution about CUDA Error.]";
return sout.str();
}
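For reference, the composed message has roughly the following shape; a hedged sketch, where the angle-bracket placeholders stand in for the actual runtime values:

// With cudaErrorMessage.pb loaded successfully:
//
//    Cuda error(<e>), <cudaGetErrorString(e)>.
//    [Advise: <errormessage entry for <e> from cudaErrorMessage.pb>]
//
// With the file missing or unparsable, the fallback is:
//
//    Cuda error(<e>), <cudaGetErrorString(e)>.
//    [Advise: Please search for the error code(<e>) on website(
//    <GetCudaErrorWebsite(cuda_version)> ) to get Nvidia's official
//    solution about CUDA Error.]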
inline void throw_on_error(cudaError_t e, const std::string& msg) {
#ifndef REPLACE_ENFORCE_GLOG
throw thrust::system_error(e, thrust::cuda_category(), msg);
throw std::runtime_error(msg);
#else
LOG(FATAL) << msg;
#endif
}
/** curand ERROR **/
inline bool is_error(curandStatus_t stat) {
return stat != CURAND_STATUS_SUCCESS;
}
inline std::string build_ex_string(curandStatus_t stat,
const std::string& msg) {
return msg;
inline const char* curandGetErrorString(curandStatus_t stat) {
switch (stat) {
case CURAND_STATUS_SUCCESS:
return "CURAND_STATUS_SUCCESS";
case CURAND_STATUS_VERSION_MISMATCH:
return "CURAND_STATUS_VERSION_MISMATCH";
case CURAND_STATUS_NOT_INITIALIZED:
return "CURAND_STATUS_NOT_INITIALIZED";
case CURAND_STATUS_ALLOCATION_FAILED:
return "CURAND_STATUS_ALLOCATION_FAILED";
case CURAND_STATUS_TYPE_ERROR:
return "CURAND_STATUS_TYPE_ERROR";
case CURAND_STATUS_OUT_OF_RANGE:
return "CURAND_STATUS_OUT_OF_RANGE";
case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case CURAND_STATUS_LAUNCH_FAILURE:
return "CURAND_STATUS_LAUNCH_FAILURE";
case CURAND_STATUS_PREEXISTING_FAILURE:
return "CURAND_STATUS_PREEXISTING_FAILURE";
case CURAND_STATUS_INITIALIZATION_FAILED:
return "CURAND_STATUS_INITIALIZATION_FAILED";
case CURAND_STATUS_ARCH_MISMATCH:
return "CURAND_STATUS_ARCH_MISMATCH";
case CURAND_STATUS_INTERNAL_ERROR:
return "CURAND_STATUS_INTERNAL_ERROR";
default:
return "Unknown curand status";
}
}
inline std::string build_nvidia_error_msg(curandStatus_t stat) {
std::string msg(" Curand error, ");
return msg + curandGetErrorString(stat) + " ";
}
inline void throw_on_error(curandStatus_t stat, const std::string& msg) {
......@@ -499,13 +619,14 @@ inline void throw_on_error(curandStatus_t stat, const std::string& msg) {
#endif
}
/***** CUDNN ERROR *****/
inline bool is_error(cudnnStatus_t stat) {
return stat != CUDNN_STATUS_SUCCESS;
}
inline std::string build_ex_string(cudnnStatus_t stat, const std::string& msg) {
return msg + "\n [Hint: " + platform::dynload::cudnnGetErrorString(stat) +
"]";
inline std::string build_nvidia_error_msg(cudnnStatus_t stat) {
std::string msg(" Cudnn error, ");
return msg + platform::dynload::cudnnGetErrorString(stat) + " ";
}
inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) {
......@@ -516,33 +637,39 @@ inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) {
#endif
}
/***** CUBLAS ERROR *****/
inline bool is_error(cublasStatus_t stat) {
return stat != CUBLAS_STATUS_SUCCESS;
}
inline std::string build_ex_string(cublasStatus_t stat,
const std::string& msg) {
std::string err;
if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
err = "CUBLAS_STATUS_NOT_INITIALIZED";
} else if (stat == CUBLAS_STATUS_ALLOC_FAILED) {
err = "CUBLAS_STATUS_ALLOC_FAILED";
} else if (stat == CUBLAS_STATUS_INVALID_VALUE) {
err = "CUBLAS_STATUS_INVALID_VALUE";
} else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) {
err = "CUBLAS_STATUS_ARCH_MISMATCH";
} else if (stat == CUBLAS_STATUS_MAPPING_ERROR) {
err = "CUBLAS_STATUS_MAPPING_ERROR";
} else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) {
err = "CUBLAS_STATUS_EXECUTION_FAILED";
} else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) {
err = "CUBLAS_STATUS_INTERNAL_ERROR";
} else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) {
err = "CUBLAS_STATUS_NOT_SUPPORTED";
} else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
err = "CUBLAS_STATUS_LICENSE_ERROR";
inline const char* cublasGetErrorString(cublasStatus_t stat) {
switch (stat) {
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
default:
return "Unknown cublas status";
}
return msg + "\n [Hint: " + err + "]";
}
inline std::string build_nvidia_error_msg(cublasStatus_t stat) {
std::string msg(" Cublas error, ");
return msg + cublasGetErrorString(stat) + " ";
}
inline void throw_on_error(cublasStatus_t stat, const std::string& msg) {
......@@ -553,15 +680,15 @@ inline void throw_on_error(cublasStatus_t stat, const std::string& msg) {
#endif
}
/****** NCCL ERROR ******/
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
inline bool is_error(ncclResult_t nccl_result) {
return nccl_result != ncclSuccess;
}
inline std::string build_ex_string(ncclResult_t nccl_result,
const std::string& msg) {
return msg + "\n [" + platform::dynload::ncclGetErrorString(nccl_result) +
"]";
inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) {
std::string msg(" Nccl error, ");
return msg + platform::dynload::ncclGetErrorString(nccl_result) + " ";
}
inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) {
......@@ -571,11 +698,8 @@ inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) {
LOG(FATAL) << msg;
#endif
}
#endif // __APPLE__ and windows
#endif // PADDLE_WITH_CUDA
#endif // not(__APPLE__) and PADDLE_WITH_NCCL
#ifdef PADDLE_WITH_CUDA
namespace details {
template <typename T>
......@@ -598,10 +722,8 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
#endif
} // namespace details
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
#define PADDLE_ENFORCE_CUDA_SUCCESS(COND, ...) \
#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __CUDA_STATUS_TYPE__ = decltype(__cond__); \
......@@ -612,9 +734,9 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
try { \
::paddle::platform::throw_on_error( \
__cond__, \
::paddle::platform::build_ex_string( \
__cond__, \
::paddle::platform::ErrorSummary(__VA_ARGS__).ToString())); \
::paddle::platform::errors::External( \
::paddle::platform::build_nvidia_error_msg(__cond__)) \
.ToString()); \
} catch (...) { \
HANDLE_THE_ERROR \
throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
......
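Because is_error and build_nvidia_error_msg are overloaded per status type above, the single-argument macro now dispatches uniformly across the CUDA runtime, cuBLAS, cuDNN, and NCCL; a short sketch assembled from call sites in this diff:

PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(id));                    // cudaError_t
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasCreate(&handle_));      // cublasStatus_t
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_)); // cudnnStatus_t
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
    stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype),
    ncclSum, comm, stream));                                       // ncclResult_t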
......@@ -261,15 +261,14 @@ TEST(EOF_EXCEPTION, THROW_EOF) {
#ifdef PADDLE_WITH_CUDA
template <typename T>
bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") {
PADDLE_ENFORCE_CUDA_SUCCESS(value, msg);
PADDLE_ENFORCE_CUDA_SUCCESS(value);
return true;
}
template <typename T>
bool CheckCudaStatusFailure(
T value, const std::string& msg = "self-defined cuda status failed") {
bool CheckCudaStatusFailure(T value, const std::string& msg) {
try {
PADDLE_ENFORCE_CUDA_SUCCESS(value, msg);
PADDLE_ENFORCE_CUDA_SUCCESS(value);
return false;
} catch (paddle::platform::EnforceNotMet& error) {
std::string ex_msg = error.what();
......@@ -279,24 +278,31 @@ bool CheckCudaStatusFailure(
TEST(enforce, cuda_success) {
EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error"));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "Cuda error"));
int count;
PADDLE_ENFORCE(cudaGetDeviceCount(&count));
EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS));
EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH));
EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED));
EXPECT_TRUE(
CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "Curand error"));
EXPECT_TRUE(
CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "Curand error"));
EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS));
EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED));
EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED));
EXPECT_TRUE(
CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "Cudnn error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "Cudnn error"));
EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS));
EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED));
EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE));
EXPECT_TRUE(
CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "Cublas error"));
EXPECT_TRUE(
CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "Cublas error"));
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError));
EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError));
EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Nccl error"));
EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Nccl error"));
#endif
}
#endif
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <algorithm>
#include <cstdlib>
#include <memory>
#include <string>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
......@@ -42,18 +41,13 @@ faster way to query device properties. You can see details in
https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
*/
inline std::string CudaErrorWebsite() {
return "Please see detail in https://docs.nvidia.com/cuda/cuda-runtime-api"
"/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c217824"
"6db0a94a430e0038";
}
static int GetCUDADeviceCountImpl() {
int driverVersion = 0;
cudaError_t status = cudaDriverGetVersion(&driverVersion);
if (!(status == cudaSuccess && driverVersion != 0)) {
// No GPU driver
VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!";
return 0;
}
......@@ -67,14 +61,8 @@ static int GetCUDADeviceCountImpl() {
return 0;
}
}
int count;
auto error_code = cudaGetDeviceCount(&count);
PADDLE_ENFORCE(
error_code,
"cudaGetDeviceCount failed in "
"paddle::platform::GetCUDADeviceCountImpl, error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDeviceCount(&count));
return count;
}
......@@ -84,72 +72,63 @@ int GetCUDADeviceCount() {
}
int GetCUDAComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int major, minor;
auto major_error_code =
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
auto minor_error_code =
cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id);
PADDLE_ENFORCE_EQ(
major_error_code, 0,
"cudaDevAttrComputeCapabilityMajor failed in "
"paddle::platform::GetCUDAComputeCapability, error code : %d, %s",
major_error_code, CudaErrorWebsite());
PADDLE_ENFORCE_EQ(
minor_error_code, 0,
"cudaDevAttrComputeCapabilityMinor failed in "
"paddle::platform::GetCUDAComputeCapability, error code : %d, %s",
minor_error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(major_error_code);
PADDLE_ENFORCE_CUDA_SUCCESS(minor_error_code);
return major * 10 + minor;
}
dim3 GetGpuMaxGridDimSize(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
dim3 ret;
int size;
auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
PADDLE_ENFORCE_EQ(error_code_x, 0,
"cudaDevAttrMaxGridDimX failed in "
"paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
error_code_x, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(error_code_x);
ret.x = size;
auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
PADDLE_ENFORCE_EQ(error_code_y, 0,
"cudaDevAttrMaxGridDimY failed in "
"paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
error_code_y, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(error_code_y);
ret.y = size;
auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
PADDLE_ENFORCE_EQ(error_code_z, 0,
"cudaDevAttrMaxGridDimZ failed in "
"paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
error_code_z, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(error_code_z);
ret.z = size;
return ret;
}
int GetCUDARuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int runtime_version = 0;
auto error_code = cudaRuntimeGetVersion(&runtime_version);
PADDLE_ENFORCE(error_code,
"cudaRuntimeGetVersion failed in "
"paddle::platform::GetCUDARuntimeVersion, error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
return runtime_version;
}
int GetCUDADriverVersion(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int driver_version = 0;
auto error_code = cudaDriverGetVersion(&driver_version);
PADDLE_ENFORCE(error_code,
"cudaDriverGetVersion failed in "
"paddle::platform::GetCUDADriverVersion, error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaDriverGetVersion(&driver_version));
return driver_version;
}
......@@ -164,56 +143,44 @@ bool TensorCoreAvailable() {
}
int GetCUDAMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int count;
auto error_code =
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id);
PADDLE_ENFORCE(error_code,
"cudaDeviceGetAttribute failed in "
"paddle::platform::GetCUDAMultiProcess, error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
return count;
}
int GetCUDAMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int count;
auto error_code = cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id);
PADDLE_ENFORCE(
error_code,
"cudaDeviceGetAttribute failed in paddle::"
"platform::GetCUDAMaxThreadsPerMultiProcessor, error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
return count;
}
int GetCUDAMaxThreadsPerBlock(int id) {
PADDLE_ENFORCE_LT(
id, GetCUDADeviceCount(),
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must less than GPU count, but received id is:%d, "
"GPU count is: %d.",
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
int count;
auto error_code =
cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id);
PADDLE_ENFORCE_EQ(
error_code, 0,
platform::errors::InvalidArgument(
"cudaDeviceGetAttribute returned error code should be 0, "
"but received error code is: %d, %s",
error_code, CudaErrorWebsite()));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id));
return count;
}
int GetCurrentDeviceId() {
int device_id;
auto error_code = cudaGetDevice(&device_id);
PADDLE_ENFORCE(error_code,
"cudaGetDevice failed in "
"paddle::platform::GetCurrentDeviceId, error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDevice(&device_id));
return device_id;
}
......@@ -237,12 +204,12 @@ std::vector<int> GetSelectedDevices() {
void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
auto error_code = cudaSetDevice(id);
PADDLE_ENFORCE(error_code,
"cudaSetDevice failed in "
"paddle::platform::SetDeviced, error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d.",
id, GetCUDADeviceCount()));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(id));
}
void GpuMemoryUsage(size_t *available, size_t *total) {
......@@ -306,74 +273,44 @@ size_t GpuMaxChunkSize() {
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream) {
auto error_code = cudaMemcpyAsync(dst, src, count, kind, stream);
PADDLE_ENFORCE(error_code,
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync "
"(%p -> %p, length: %d) error code : %d, %s",
src, dst, static_cast<int>(count), error_code,
CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream));
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind) {
auto error_code = cudaMemcpy(dst, src, count, kind);
PADDLE_ENFORCE(error_code,
"cudaMemcpy failed in paddle::platform::GpuMemcpySync "
"(%p -> %p, length: %d) error code : %d, %s",
src, dst, static_cast<int>(count), error_code,
CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(dst, src, count, kind));
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, cudaStream_t stream) {
auto error_code =
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream);
PADDLE_ENFORCE(
error_code,
"cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync "
"error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
auto error_code = cudaMemcpyPeer(dst, dst_device, src, src_device, count);
PADDLE_ENFORCE(error_code,
"cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync "
"error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyPeer(dst, dst_device, src, src_device, count));
}
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
auto error_code = cudaMemsetAsync(dst, value, count, stream);
PADDLE_ENFORCE(error_code,
"cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync "
"error code : %d, %s",
error_code, CudaErrorWebsite());
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(dst, value, count, stream));
}
void GpuStreamSync(cudaStream_t stream) {
auto error_code = cudaStreamSynchronize(stream);
PADDLE_ENFORCE_CUDA_SUCCESS(
error_code,
platform::errors::External(
"cudaStreamSynchronize failed in paddle::platform::GpuStreamSync "
"error code : %d, %s",
error_code, CudaErrorWebsite()));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}
static void RaiseNonOutOfMemoryError(cudaError_t *status) {
if (*status == cudaErrorMemoryAllocation) {
*status = cudaSuccess;
}
PADDLE_ENFORCE_CUDA_SUCCESS(*status);
*status = cudaGetLastError();
if (*status == cudaErrorMemoryAllocation) {
*status = cudaSuccess;
}
PADDLE_ENFORCE_CUDA_SUCCESS(*status);
}
......@@ -450,8 +387,7 @@ class RecordedCudaMallocHelper {
CUDADeviceGuard guard(dev_id_);
auto err = cudaFree(ptr);
if (err != cudaErrorCudartUnloading) {
PADDLE_ENFORCE_CUDA_SUCCESS(
err, platform::errors::External("cudaFree raises unexpected error"));
PADDLE_ENFORCE_CUDA_SUCCESS(err);
if (NeedRecord()) {
std::lock_guard<std::mutex> guard(*mtx_);
cur_size_ -= size;
......
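All of the gpu_info.cc hunks above apply the same refactoring: each raw CUDA runtime call is handed directly to PADDLE_ENFORCE_CUDA_SUCCESS, which decodes the failure itself, instead of every call site hand-formatting an error code and a website pointer. As a rough illustration of the pattern (a minimal sketch only; the macro name CHECK_CUDA_SUCCESS and the plain std::runtime_error are stand-ins, not Paddle's actual implementation):
```
// Minimal sketch of a CUDA-success-enforcing macro (illustrative only).
#include <cuda_runtime.h>

#include <sstream>
#include <stdexcept>

#define CHECK_CUDA_SUCCESS(expr)                                       \
  do {                                                                 \
    cudaError_t err_ = (expr);                                         \
    if (err_ != cudaSuccess) {                                         \
      std::ostringstream oss_;                                         \
      oss_ << #expr << " failed: CUDA error "                          \
           << static_cast<int>(err_) << " (" << cudaGetErrorName(err_) \
           << "): " << cudaGetErrorString(err_) << " at " << __FILE__  \
           << ":" << __LINE__;                                         \
      throw std::runtime_error(oss_.str());                            \
    }                                                                  \
  } while (0)

// Call sites then shrink to the form seen in the diff, e.g.:
//   CHECK_CUDA_SUCCESS(cudaSetDevice(id));
```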
......@@ -117,10 +117,7 @@ void SynchronizeAllDevice() {
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
SetDeviceId(i);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaDeviceSynchronize(),
platform::errors::External(
"Device synchronize failed in cudaDeviceSynchronize()"));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize());
}
#endif
}
......
......@@ -213,12 +213,14 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
# the prefix is sys.prefix which should always be usr
paddle_bins = ''
if not '${WIN32}':
paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]}
if '${HAS_NOAVX_CORE}' == 'ON':
package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')]
package_dir={
'': '${PADDLE_BINARY_DIR}/python',
# The paddle.fluid.proto will be generated while compiling.
......@@ -329,6 +331,7 @@ headers = (
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) +
list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) +
list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) +
list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}')) + # errorMessage.pb for errormessage
['${EIGEN_INCLUDE_DIR}/Eigen/Core'] + # eigen
list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) + # eigen
list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) + # eigen
......@@ -400,7 +403,9 @@ class InstallHeaders(Command):
return self.copy_file(header, install_dir)
def run(self):
# On Mac or Windows, copy only third_party/cudaErrorMessage.pb (the CUDA error message data) and skip the headers
if os.name == 'nt' or sys.platform == 'darwin':
self.mkdir_and_copy_file('${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb')
return
hdrs = self.distribution.headers
if not hdrs:
......
......@@ -172,8 +172,8 @@ if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
check_approval 1 6836917 47554610 22561442
fi
ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true`
VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true`
ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true`
VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true`
INVALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" |grep -vxF "$VALID_PADDLE_CHECK" || true`
if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
echo_line="The error message you wrote in PADDLE_ENFORCE{_**} or PADDLE_THROW does not meet our error message writing specification. Possible errors include 1. the error message is empty / 2. the error message is too short / 3. the error type is not specified. Please read the specification [ https://github.com/PaddlePaddle/Paddle/wiki/Paddle-Error-Message-Writing-Specification ], then refine the error message. If it is a mismatch, please specify chenwhql (Recommend), luotao1 or lanxianghit review and approve.\nThe PADDLE_ENFORCE{_**} or PADDLE_THROW entries that do not meet the specification are as follows:\n${INVALID_PADDLE_CHECK}\n"
......
#!/bin/bash
ALL_PADDLE_CHECK=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" ../paddle/fluid || true`
ALL_PADDLE_CHECK=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" ../paddle/fluid || true`
ALL_PADDLE_CHECK_CNT=`echo "$ALL_PADDLE_CHECK" | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true`
VALID_PADDLE_CHECK_CNT=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true`
VALID_PADDLE_CHECK_CNT=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true`
echo "----------------------------"
echo "PADDLE ENFORCE & THROW COUNT"
......
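One detail worth spelling out in the two script hunks above: the quantifier change from [A-Z_]* to [A-Z_]{0,9} keeps matching every existing macro suffix of up to nine characters (_EQ, _LT, _NOT_NULL, ...), but it can no longer bridge the thirteen-character suffix _CUDA_SUCCESS to the opening parenthesis; presumably this is so the refactored PADDLE_ENFORCE_CUDA_SUCCESS calls, which intentionally carry no hand-written message, stop being flagged by the message-specification check. A hypothetical stand-alone demo of that behavior (not part of the repo):
```
// Hypothetical demo: the bounded quantifier skips the new macro.
#include <iostream>
#include <regex>
#include <string>

int main() {
  // Same macro-name core as the shell scripts' pattern, minus the
  // argument-matching tail.
  const std::regex pat("(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\\(");
  for (const std::string line : {"PADDLE_ENFORCE_NOT_NULL(ptr, msg);",
                                 "PADDLE_ENFORCE_CUDA_SUCCESS(err);"}) {
    std::cout << line << " -> "
              << (std::regex_search(line, pat) ? "checked" : "skipped")
              << "\n";  // _NOT_NULL (9 chars) matches; _CUDA_SUCCESS (13) cannot
  }
  return 0;
}
```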
Usage:
Please run:
```
bash start.sh
```
By default, the error messages of CUDA 9.0, CUDA 10.0, and the latest CUDA version will be crawled.
If you want to crawl a specific CUDA version, please run:
```
bash start.sh <version> <URL(optional)>
```
The URL can be derived from the version by default, so you usually don't have to enter one.
For example:
```
bash start.sh 11.0
```
will crawl the error messages of CUDA 11.0 (once it is released).
Whenever Nvidia upgrades the CUDA major version, run `bash start.sh` in the current directory and upload the resulting cudaErrorMessage.tar.gz to https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ssl
import re
import urllib2
import json
import collections
import sys, getopt
import cuda_error_pb2
def parsing(cuda_errorDesc, version, url):
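# Scrape the "CUDA error types" enum table from one CUDA runtime API
# documentation page and append each (errorCode, errorMessage) pair
# under the given CUDA version.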
All_Messages = cuda_errorDesc.AllMessages.add()
All_Messages.version = int(version)
ssl._create_default_https_context = ssl._create_unverified_context
html = urllib2.urlopen(url).read()
res_div = r'<div class="section">.*?<p>CUDA error types </p>.*?</div>.*?<div class="enum-members">(.*?)</div>'
m_div = re.findall(res_div, html, re.S | re.M)
url_list = url.split('/')
url_prefix = '/'.join(url_list[0:url_list.index('cuda-runtime-api') + 1])
dic = collections.OrderedDict()
dic_message = collections.OrderedDict()
for line in m_div:
res_dt = r'<dt>(.*?)</dt>.*?<dd>(.*?)</dd>'
m_dt = re.findall(res_dt, line, re.S | re.M)
for error in m_dt:
res_type = r'<span class="ph ph apiData">(.*?)</span>'
m_type = re.findall(res_type, error[0], re.S | re.M)[0]
m_message = error[1]
m_message = m_message.replace('\n', '')
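# The scraped description still contains HTML markup; the blocks below
# strip <a>, <span> and <p> tags, keeping only their visible text.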
res_a = r'(<a class=.*?</a>)'
res_shape = r'<a class=.*?>(.*?)</a>'
list_a = re.findall(res_a, m_message, re.S | re.M)
list_shape = re.findall(res_shape, m_message, re.S | re.M)
assert len(list_a) == len(list_shape)
for idx in range(len(list_a)):
m_message = m_message.replace(list_a[idx], list_shape[idx])
m_message = m_message.replace(
'<h6 class=\"deprecated_header\">Deprecated</h6>', '')
res_span = r'(<span class=.*?</span>)'
res_span_detail = r'<span class=.*?>(.*?)</span>'
list_span = re.findall(res_span, m_message, re.S | re.M)
list_span_detail = re.findall(res_span_detail, m_message, re.S |
re.M)
assert len(list_span) == len(list_span_detail)
for idx in range(len(list_span)):
m_message = m_message.replace(list_span[idx],
list_span_detail[idx])
res_p = r'(<p>.*?</p>)'
res_p_detail = r'<p>(.*?)</p>'
list_p = re.findall(res_p, m_message, re.S | re.M)
list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M)
assert len(list_p) == len(list_p_detail)
for idx in range(len(list_p)):
m_message = m_message.replace(list_p[idx], list_p_detail[idx])
m_message = m_message.replace(' ', '')
_Messages = All_Messages.Messages.add()
try:
_Messages.errorCode = int(m_type)
except ValueError:
if re.match('0x', m_type):
_Messages.errorCode = int(m_type, 16)
else:
raise ValueError
_Messages.errorMessage = m_message # cleaned message text, saved into cudaErrorMessage.pb via the python-protobuf interface
def main(argv):
version = []
url = []
try:
opts, args = getopt.getopt(argv, "hv:u:", ["help", "version=", "url="])
except getopt.GetoptError:
print 'python spider.py -v <version1,version2,...,> -u <url1,url2,...,>'
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
print 'python spider.py -v <version1,version2,...,> -u <url1,url2,...,>'
sys.exit()
elif opt in ("-v", "--version"):
version = arg
elif opt in ("-u", "--url"):
url = arg
version = version.split(',')
url = url.split(',')
assert len(version) == len(url)
cuda_errorDesc = cuda_error_pb2.cudaerrorDesc()
for idx in range(len(version)):
if version[idx] == "-1":
print("crawling errorMessage for CUDA%s from %s" %
("-latest-version", url[idx]))
else:
print("crawling errorMessage for CUDA%s from %s" %
(version[idx], url[idx]))
parsing(cuda_errorDesc, version[idx], url[idx])
serializeToString = cuda_errorDesc.SerializeToString()
with open("cudaErrorMessage.pb", "wb") as f:
f.write(serializeToString) # save cudaErrorMessage.pb via the python-protobuf interface
print("crawling errorMessage for CUDA has been done!!!")
if __name__ == "__main__":
main(sys.argv[1:])
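The field layout the script relies on (a cudaerrorDesc whose repeated AllMessages each hold a version plus repeated Messages with errorCode and errorMessage) makes the serialized file straightforward to query. The sketch below is hypothetical: it assumes C++ code generated from cuda_error.proto via protoc --cpp_out with no package statement, and relies on protoc's rule of lowercasing field names for C++ accessors; the real consumer inside Paddle may look quite different.
```
// Hypothetical lookup over the serialized cudaErrorMessage.pb
// (all generated names below are assumptions, see the note above).
#include <fstream>
#include <string>

#include "cuda_error.pb.h"  // assumed output of protoc --cpp_out

std::string LookupCudaErrorMessage(int version, int error_code) {
  cudaerrorDesc desc;
  std::ifstream in("cudaErrorMessage.pb", std::ios::binary);
  if (!in || !desc.ParseFromIstream(&in)) return "";
  for (const auto& per_version : desc.allmessages()) {  // field: AllMessages
    if (per_version.version() != version) continue;
    for (const auto& msg : per_version.messages()) {  // field: Messages
      if (msg.errorcode() == error_code) return msg.errormessage();
    }
  }
  return "";  // unknown version or error code
}
```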
#!/usr/bin/env bash
set -ex
SYSTEM=`uname -s`
rm -f protoc-3.11.3-linux-x86_64.*
if [ "$SYSTEM" == "Linux" ]; then
wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v3.11.3/protoc-3.11.3-linux-x86_64.zip
unzip -d protobuf -o protoc-3.11.3-linux-x86_64.zip
rm protoc-3.11.3-linux-x86_64.*
elif [ "$SYSTEM" == "Darwin" ]; then
wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v3.11.3/protoc-3.11.3-osx-x86_64.zip
unzip -d protobuf -o protoc-3.11.3-osx-x86_64.zip
rm protoc-3.11.3-osx-x86_64.*
else
echo "please run on Mac/Linux"
exit 1
fi
protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/cuda_error.proto
version=90,100,-1 # -1 represents the latest CUDA version
url=https://docs.nvidia.com/cuda/archive/9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/archive/10.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
if [ "$1" != "" ]; then
version=$version,$(($1*10))
if [ "$2" != "" ]; then
url=$url,$2
else
url=$url,https://docs.nvidia.com/cuda/archive/$1/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
fi
fi
python spider.py --version=$version --url=$url
tar czf cudaErrorMessage.tar.gz cudaErrorMessage.pb