提交 20aac5bb 编写于 作者: L liaogang

Add style check for *.cc files in cuda directory

上级 77ddce0f
...@@ -81,5 +81,8 @@ else() ...@@ -81,5 +81,8 @@ else()
add_library(paddle_cuda ${CUDA_SOURCES}) add_library(paddle_cuda ${CUDA_SOURCES})
endif() endif()
add_style_check_target(paddle_cuda ${CUDA_SOURCES}) add_style_check_target(paddle_cuda
add_style_check_target(paddle_cuda ${CUDA_HEADERS}) ${CUDA_SOURCES}
${CUDA_HEADERS}
${CUDA_DSO_SOURCES}
${CUDA_CXX_WITH_GPU_SOURCES})
...@@ -104,7 +104,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) ...@@ -104,7 +104,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
#endif #endif
const char* hl_cublas_get_error_string(cublasStatus_t status) { const char* hl_cublas_get_error_string(cublasStatus_t status) {
switch(status) { switch (status) {
case CUBLAS_STATUS_NOT_INITIALIZED: case CUBLAS_STATUS_NOT_INITIALIZED:
return "[cublas status]: not initialized"; return "[cublas status]: not initialized";
case CUBLAS_STATUS_ALLOC_FAILED: case CUBLAS_STATUS_ALLOC_FAILED:
...@@ -181,7 +181,7 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { ...@@ -181,7 +181,7 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
real **inout_d = (real **)hl_malloc_device(sizeof(real *)); real **inout_d = (real **)hl_malloc_device(sizeof(real *));
hl_memcpy(inout_d, inout_h, sizeof(real *)); hl_memcpy(inout_d, inout_h, sizeof(real *));
int *pivot_d = (int *)hl_malloc_device(dimN*sizeof(int)); int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int));
int *info_d = (int *)t_resource.gpu_mem; int *info_d = (int *)t_resource.gpu_mem;
/* Note: cublasSgetrfBatched is used to calculate a number of /* Note: cublasSgetrfBatched is used to calculate a number of
...@@ -189,10 +189,9 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { ...@@ -189,10 +189,9 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
the API for better performance. the API for better performance.
*/ */
CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle, CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle,
dimN, inout_d, lda, pivot_d, dimN, inout_d, lda, pivot_d, info_d, 1));
info_d, 1));
int info_h; int info_h;
hl_memcpy(&info_h, info_d, sizeof(int)); hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) { if (info_h != 0) {
LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n"; LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
...@@ -204,8 +203,8 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { ...@@ -204,8 +203,8 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
hl_memcpy(out_d, out_h, sizeof(real *)); hl_memcpy(out_d, out_h, sizeof(real *));
CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle, CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
dimN, (const real **)inout_d, lda, pivot_d, dimN, (const real **)inout_d, lda, pivot_d,
out_d, ldc, info_d, 1)); out_d, ldc, info_d, 1));
hl_memcpy(&info_h, info_d, sizeof(int)); hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) { if (info_h != 0) {
...@@ -215,7 +214,7 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { ...@@ -215,7 +214,7 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
hl_free_mem_device(inout_d); hl_free_mem_device(inout_d);
hl_free_mem_device(pivot_d); hl_free_mem_device(pivot_d);
hl_free_mem_device(out_d); hl_free_mem_device(out_d);
CHECK_SYNC("hl_matrix_inverse failed"); CHECK_SYNC("hl_matrix_inverse failed");
} }
......
...@@ -159,13 +159,11 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -159,13 +159,11 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
bool g_is_libcudnn_init = false; bool g_is_libcudnn_init = false;
int g_cudnn_lib_version = 0; int g_cudnn_lib_version = 0;
void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) {
{
CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc)); CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
} }
void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream) void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream) {
{
size_t cudnn_dso_ver = dynload::cudnnGetVersion(); size_t cudnn_dso_ver = dynload::cudnnGetVersion();
size_t cudnn_dso_major = cudnn_dso_ver / 1000; size_t cudnn_dso_major = cudnn_dso_ver / 1000;
size_t cudnn_cuh_major = CUDNN_VERSION / 1000; size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
...@@ -212,13 +210,18 @@ void hl_conv_workspace(hl_tensor_descriptor input, ...@@ -212,13 +210,18 @@ void hl_conv_workspace(hl_tensor_descriptor input,
CHECK_NOTNULL(conv); CHECK_NOTNULL(conv);
// Specify workspace limit directly // Specify workspace limit directly
size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb; size_t memoryLimitBytes =
(1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
// cudnn convolution forward configuration // cudnn convolution forward configuration
cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input); cudnnTensorDescriptor_t fwd_src_desc =
cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output); GET_TENSOR_DESCRIPTOR(input);
cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter); cudnnTensorDescriptor_t fwd_dest_desc =
cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); GET_TENSOR_DESCRIPTOR(output);
cudnnFilterDescriptor_t fwd_filter_desc =
GET_FILTER_DESCRIPTOR(filter);
cudnnConvolutionDescriptor_t fwd_conv_desc =
GET_CONVOLUTION_DESCRIPTOR(conv);
CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm( CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
t_resource.cudnn_handle, t_resource.cudnn_handle,
...@@ -250,23 +253,23 @@ void hl_conv_workspace(hl_tensor_descriptor input, ...@@ -250,23 +253,23 @@ void hl_conv_workspace(hl_tensor_descriptor input,
GET_CONVOLUTION_DESCRIPTOR(conv); GET_CONVOLUTION_DESCRIPTOR(conv);
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm( CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
t_resource.cudnn_handle, t_resource.cudnn_handle,
bwd_data_filter_desc, bwd_data_filter_desc,
bwd_data_diff_desc, bwd_data_diff_desc,
bwd_data_conv_desc, bwd_data_conv_desc,
bwd_data_grad_desc, bwd_data_grad_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes, memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo))); reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
t_resource.cudnn_handle, t_resource.cudnn_handle,
bwd_data_filter_desc, bwd_data_filter_desc,
bwd_data_diff_desc, bwd_data_diff_desc,
bwd_data_conv_desc, bwd_data_conv_desc,
bwd_data_grad_desc, bwd_data_grad_desc,
static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo), static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
bwdDataLimitBytes)); bwdDataLimitBytes));
// cudnn convolution backward filter configuration // cudnn convolution backward filter configuration
cudnnTensorDescriptor_t bwd_filter_src_desc = cudnnTensorDescriptor_t bwd_filter_src_desc =
...@@ -279,21 +282,21 @@ void hl_conv_workspace(hl_tensor_descriptor input, ...@@ -279,21 +282,21 @@ void hl_conv_workspace(hl_tensor_descriptor input,
GET_FILTER_DESCRIPTOR(filter); GET_FILTER_DESCRIPTOR(filter);
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm( CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
t_resource.cudnn_handle, t_resource.cudnn_handle,
bwd_filter_src_desc, bwd_filter_src_desc,
bwd_filter_diff_desc, bwd_filter_diff_desc,
bwd_filter_conv_desc, bwd_filter_conv_desc,
bwd_filter_grad_desc, bwd_filter_grad_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes, memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo))); reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
t_resource.cudnn_handle, bwd_filter_src_desc, t_resource.cudnn_handle, bwd_filter_src_desc,
bwd_filter_diff_desc, bwd_filter_conv_desc, bwd_filter_diff_desc, bwd_filter_conv_desc,
bwd_filter_grad_desc, bwd_filter_grad_desc,
static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo), static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
bwdFilterLimitBytes)); bwdFilterLimitBytes));
#endif #endif
} }
...@@ -302,8 +305,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc, ...@@ -302,8 +305,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
int batch_size, int batch_size,
int feature_maps, int feature_maps,
int height, int height,
int width) int width) {
{
CHECK_NOTNULL(image_desc); CHECK_NOTNULL(image_desc);
cudnn_tensor_descriptor hl_desc = cudnn_tensor_descriptor hl_desc =
...@@ -359,8 +361,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc, ...@@ -359,8 +361,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
int batch_size, int batch_size,
int feature_maps, int feature_maps,
int height, int height,
int width) int width) {
{
const int stride_w = 1; const int stride_w = 1;
const int stride_h = width * stride_w; const int stride_h = width * stride_w;
const int stride_c = height * stride_h; const int stride_c = height * stride_h;
...@@ -384,8 +385,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc, ...@@ -384,8 +385,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
int nStride, int nStride,
int cStride, int cStride,
int hStride, int hStride,
int wStride) int wStride) {
{
CHECK_NOTNULL(image_desc); CHECK_NOTNULL(image_desc);
cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
...@@ -408,8 +408,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc, ...@@ -408,8 +408,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
hl_desc->width = width; hl_desc->width = width;
} }
void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {
{
CHECK_NOTNULL(image_desc); CHECK_NOTNULL(image_desc);
cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
...@@ -430,11 +429,9 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc, ...@@ -430,11 +429,9 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
int height_padding, int height_padding,
int width_padding, int width_padding,
int stride_height, int stride_height,
int stride_width) int stride_width) {
{
cudnnPoolingMode_t cudnn_mode; cudnnPoolingMode_t cudnn_mode;
switch (mode) switch (mode) {
{
case HL_POOLING_MAX: case HL_POOLING_MAX:
cudnn_mode = CUDNN_POOLING_MAX; cudnn_mode = CUDNN_POOLING_MAX;
break; break;
...@@ -478,13 +475,13 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc, ...@@ -478,13 +475,13 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
*pooling_desc = (hl_pooling_descriptor)hl_pooling_desc; *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
} }
void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {
{
CHECK_NOTNULL(pooling_desc); CHECK_NOTNULL(pooling_desc);
cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc; cudnn_pooling_descriptor hl_pooling =
CHECK_NOTNULL(hl_pooling->desc); (cudnn_pooling_descriptor)pooling_desc;
CHECK_NOTNULL(hl_pooling->desc);
CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc)); CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
hl_pooling->desc = NULL; hl_pooling->desc = NULL;
...@@ -496,8 +493,7 @@ void hl_pooling_forward(hl_tensor_descriptor input, ...@@ -496,8 +493,7 @@ void hl_pooling_forward(hl_tensor_descriptor input,
real* input_image, real* input_image,
hl_tensor_descriptor output, hl_tensor_descriptor output,
real* output_image, real* output_image,
hl_pooling_descriptor pooling) hl_pooling_descriptor pooling) {
{
cudnnPoolingDescriptor_t pooling_desc; cudnnPoolingDescriptor_t pooling_desc;
cudnnTensorDescriptor_t input_desc; cudnnTensorDescriptor_t input_desc;
cudnnTensorDescriptor_t output_desc; cudnnTensorDescriptor_t output_desc;
...@@ -531,8 +527,7 @@ void hl_pooling_backward(hl_tensor_descriptor input, ...@@ -531,8 +527,7 @@ void hl_pooling_backward(hl_tensor_descriptor input,
hl_tensor_descriptor output, hl_tensor_descriptor output,
real* output_image, real* output_image,
real* output_image_grad, real* output_image_grad,
hl_pooling_descriptor pooling) hl_pooling_descriptor pooling) {
{
cudnnPoolingDescriptor_t pooling_desc; cudnnPoolingDescriptor_t pooling_desc;
cudnnTensorDescriptor_t input_desc; cudnnTensorDescriptor_t input_desc;
cudnnTensorDescriptor_t output_desc; cudnnTensorDescriptor_t output_desc;
...@@ -571,8 +566,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter, ...@@ -571,8 +566,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
int input_feature_maps, int input_feature_maps,
int output_feature_maps, int output_feature_maps,
int height, int height,
int width) int width) {
{
CHECK_NOTNULL(filter); CHECK_NOTNULL(filter);
cudnn_filter_descriptor hl_filter = cudnn_filter_descriptor hl_filter =
...@@ -607,8 +601,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter, ...@@ -607,8 +601,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
} }
void hl_destroy_filter_descriptor(hl_filter_descriptor filter) void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {
{
CHECK_NOTNULL(filter); CHECK_NOTNULL(filter);
cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter; cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
...@@ -627,14 +620,13 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv, ...@@ -627,14 +620,13 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_height, int padding_height,
int padding_width, int padding_width,
int stride_height, int stride_height,
int stride_width) int stride_width) {
{
CHECK_NOTNULL(conv); CHECK_NOTNULL(conv);
cudnn_convolution_descriptor hl_conv = cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)
(cudnn_convolution_descriptor)malloc(sizeof(_cudnn_convolution_descriptor)); malloc(sizeof(_cudnn_convolution_descriptor));
CHECK_NOTNULL(hl_conv);
CHECK_NOTNULL(hl_conv);
CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc)); CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
...@@ -667,8 +659,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, ...@@ -667,8 +659,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_height, int padding_height,
int padding_width, int padding_width,
int stride_height, int stride_height,
int stride_width) int stride_width) {
{
CHECK_NOTNULL(conv); CHECK_NOTNULL(conv);
CHECK_NOTNULL(image); CHECK_NOTNULL(image);
CHECK_NOTNULL(filter); CHECK_NOTNULL(filter);
...@@ -697,8 +688,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, ...@@ -697,8 +688,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
hl_conv->mode = mode; hl_conv->mode = mode;
} }
void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {
{
CHECK_NOTNULL(conv); CHECK_NOTNULL(conv);
cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv; cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
...@@ -753,8 +743,7 @@ void hl_convolution_forward(hl_tensor_descriptor input, ...@@ -753,8 +743,7 @@ void hl_convolution_forward(hl_tensor_descriptor input,
void hl_convolution_forward_add_bias(hl_tensor_descriptor bias, void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
real* bias_data, real* bias_data,
hl_tensor_descriptor output, hl_tensor_descriptor output,
real* output_data) real* output_data) {
{
CHECK_NOTNULL(bias); CHECK_NOTNULL(bias);
CHECK_NOTNULL(output); CHECK_NOTNULL(output);
CHECK_NOTNULL(bias_data); CHECK_NOTNULL(bias_data);
...@@ -782,8 +771,7 @@ void hl_convolution_forward_add_bias(hl_tensor_descriptor bias, ...@@ -782,8 +771,7 @@ void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
void hl_convolution_backward_bias(hl_tensor_descriptor bias, void hl_convolution_backward_bias(hl_tensor_descriptor bias,
real* bias_grad_data, real* bias_grad_data,
hl_tensor_descriptor output, hl_tensor_descriptor output,
real* output_grad_data) real* output_grad_data) {
{
CHECK_NOTNULL(bias); CHECK_NOTNULL(bias);
CHECK_NOTNULL(output); CHECK_NOTNULL(output);
CHECK_NOTNULL(bias_grad_data); CHECK_NOTNULL(bias_grad_data);
...@@ -814,7 +802,6 @@ void hl_convolution_backward_filter(hl_tensor_descriptor input, ...@@ -814,7 +802,6 @@ void hl_convolution_backward_filter(hl_tensor_descriptor input,
void* gpuWorkSpace, void* gpuWorkSpace,
size_t sizeInBytes, size_t sizeInBytes,
int convBwdFilterAlgo) { int convBwdFilterAlgo) {
CHECK_NOTNULL(input); CHECK_NOTNULL(input);
CHECK_NOTNULL(output); CHECK_NOTNULL(output);
CHECK_NOTNULL(filter); CHECK_NOTNULL(filter);
...@@ -889,8 +876,7 @@ void hl_convolution_backward_data(hl_tensor_descriptor input, ...@@ -889,8 +876,7 @@ void hl_convolution_backward_data(hl_tensor_descriptor input,
void hl_softmax_forward(real *input, void hl_softmax_forward(real *input,
real *output, real *output,
int height, int height,
int width) int width) {
{
#ifndef PADDLE_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT; cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else #else
...@@ -923,8 +909,7 @@ void hl_softmax_forward(real *input, ...@@ -923,8 +909,7 @@ void hl_softmax_forward(real *input,
void hl_softmax_backward(real *output_value, void hl_softmax_backward(real *output_value,
real *output_grad, real *output_grad,
int height, int height,
int width) int width) {
{
#ifndef PADDLE_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT; cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else #else
......
...@@ -203,8 +203,8 @@ inline pid_t gettid() { ...@@ -203,8 +203,8 @@ inline pid_t gettid() {
#endif #endif
pid_t tid = syscall(__NR_gettid); pid_t tid = syscall(__NR_gettid);
#endif #endif
CHECK_NE(tid, -1); CHECK_NE((int)tid, -1);
return tid; return tid;
} }
void hl_init(int device) { void hl_init(int device) {
...@@ -355,7 +355,8 @@ void* hl_malloc_host(size_t size) { ...@@ -355,7 +355,8 @@ void* hl_malloc_host(size_t size) {
void *dest_h; void *dest_h;
CHECK(size) << __func__ << ": the size for device memory is 0, please check."; CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
CHECK_CUDA(dynload::cudaHostAlloc((void**)&dest_h, size, cudaHostAllocDefault)); CHECK_CUDA(dynload::cudaHostAlloc(
(void**)&dest_h, size, cudaHostAllocDefault));
return dest_h; return dest_h;
} }
...@@ -364,7 +365,7 @@ void hl_free_mem_host(void *dest_h) { ...@@ -364,7 +365,7 @@ void hl_free_mem_host(void *dest_h) {
CHECK_NOTNULL(dest_h); CHECK_NOTNULL(dest_h);
cudaError_t err = dynload::cudaFreeHost(dest_h); cudaError_t err = dynload::cudaFreeHost(dest_h);
CHECK (cudaSuccess == err || cudaErrorCudartUnloading == err) CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
<< hl_get_device_error_string(); << hl_get_device_error_string();
} }
...@@ -502,7 +503,8 @@ int hl_get_cuda_version() { ...@@ -502,7 +503,8 @@ int hl_get_cuda_version() {
return g_cuda_lib_version; return g_cuda_lib_version;
} }
void hl_create_thread_resources(int device, thread_device_resources device_res) { void hl_create_thread_resources(int device,
thread_device_resources device_res) {
CHECK_CUDA(dynload::cudaSetDevice(device)); CHECK_CUDA(dynload::cudaSetDevice(device));
/* create thread stream */ /* create thread stream */
......
...@@ -78,48 +78,38 @@ __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, ...@@ -78,48 +78,38 @@ __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
dim3 blockDim, dim3 blockDim,
void **args, void **args,
size_t sharedMem, size_t sharedMem,
cudaStream_t stream) cudaStream_t stream) {
{ return dynload::cudaLaunchKernel(func, gridDim, blockDim,
return dynload::cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream); args, sharedMem, stream);
} }
#endif /* CUDART_VERSION >= 7000 */ #endif /* CUDART_VERSION >= 7000 */
__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
{
return dynload::cudaLaunch(func); return dynload::cudaLaunch(func);
} }
__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
size_t size, size_t size,
size_t offset) size_t offset) {
{
return dynload::cudaSetupArgument(arg, size, offset); return dynload::cudaSetupArgument(arg, size, offset);
} }
__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
dim3 blockDim, dim3 blockDim,
size_t sharedMem, size_t sharedMem,
cudaStream_t stream) cudaStream_t stream) {
{
return dynload::cudaConfigureCall(gridDim, blockDim, return dynload::cudaConfigureCall(gridDim, blockDim,
sharedMem, stream); sharedMem, stream);
} }
extern "C" { extern "C" {
void** CUDARTAPI __cudaRegisterFatBinary( void** CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
void *fatCubin
)
{
return dynload::__cudaRegisterFatBinary(fatCubin); return dynload::__cudaRegisterFatBinary(fatCubin);
} }
void CUDARTAPI __cudaUnregisterFatBinary( void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
void **fatCubinHandle
)
{
return dynload::__cudaUnregisterFatBinary(fatCubinHandle); return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
} }
......
...@@ -19,17 +19,18 @@ limitations under the License. */ ...@@ -19,17 +19,18 @@ limitations under the License. */
P_DEFINE_string(cudnn_dir, "", P_DEFINE_string(cudnn_dir, "",
"Specify path for loading libcudnn.so. For instance, " "Specify path for loading libcudnn.so. For instance, "
"/usr/local/cudnn/lib64. If empty [default], dlopen will search " "/usr/local/cudnn/lib64. If empty [default], dlopen "
"cudnn from LD_LIBRARY_PATH"); "will search cudnn from LD_LIBRARY_PATH");
P_DEFINE_string(cuda_dir, "", P_DEFINE_string(cuda_dir, "",
"Specify path for loading cuda library, such as libcublas, " "Specify path for loading cuda library, such as libcublas, "
"libcurand. For instance, /usr/local/cuda/lib64. " "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
"(Note: libcudart can not be specified by cuda_dir, since some " "libcudart can not be specified by cuda_dir, since some "
"build-in function in cudart already ran before main entry). " "build-in function in cudart already ran before main entry). "
"If empty [default], dlopen will search cuda from LD_LIBRARY_PATH"); "If default, dlopen will search cuda from LD_LIBRARY_PATH");
static inline std::string join(const std::string& part1, const std::string& part2) { static inline std::string join(const std::string& part1,
const std::string& part2) {
// directory separator // directory separator
const char sep = '/'; const char sep = '/';
...@@ -49,10 +50,10 @@ static inline std::string join(const std::string& part1, const std::string& part ...@@ -49,10 +50,10 @@ static inline std::string join(const std::string& part1, const std::string& part
static inline void GetDsoHandleFromDefaultPath( static inline void GetDsoHandleFromDefaultPath(
std::string& dso_path, void** dso_handle, int dynload_flags) { std::string& dso_path, void** dso_handle, int dynload_flags) {
VLOG(3) << "Try to find cuda library: " << dso_path VLOG(3) << "Try to find cuda library: " << dso_path
<< " from default system path."; << " from default system path.";
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
*dso_handle = dlopen(dso_path.c_str(), dynload_flags); *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
// bring System Integrity Projection (SIP), if dso_handle // bring System Integrity Projection (SIP), if dso_handle
// is null, search from default package path in Mac OS. // is null, search from default package path in Mac OS.
...@@ -62,13 +63,13 @@ static inline void GetDsoHandleFromDefaultPath( ...@@ -62,13 +63,13 @@ static inline void GetDsoHandleFromDefaultPath(
*dso_handle = dlopen(dso_path.c_str(), dynload_flags); *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
if (nullptr == *dso_handle) { if (nullptr == *dso_handle) {
if (dso_path == "libcudnn.dylib") { if (dso_path == "libcudnn.dylib") {
LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT
<< "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C " << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C " // NOLINT
<< "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h " << "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h " // NOLINT
<< "/usr/local/cuda/lib/libcudnn*"; << "/usr/local/cuda/lib/libcudnn*";
} }
} }
} }
#endif #endif
} }
...@@ -96,19 +97,19 @@ static inline void GetDsoHandleFromSearchPath( ...@@ -96,19 +97,19 @@ static inline void GetDsoHandleFromSearchPath(
CHECK(nullptr != *dso_handle) CHECK(nullptr != *dso_handle)
<< "Failed to find cuda library: " << dlPath << std::endl << "Failed to find cuda library: " << dlPath << std::endl
<< "Please specify its path correctly using one of the following ideas: \n" << "Please specify its path correctly using one of the following ways: \n" // NOLINT
<< "Idea 1. set cuda and cudnn lib path at runtime. " << "Method 1. set cuda and cudnn lib path at runtime. "
<< "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n" << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n" // NOLINT
<< "For instance, issue command: paddle train --use_gpu=1 " << "For instance, issue command: paddle train --use_gpu=1 "
<< "--cuda_dir=/usr/local/cudnn/lib --cudnn_dir=/usr/local/cudnn/lib ...\n" << "--cuda_dir=/usr/local/cuda/lib64 --cudnn_dir=/usr/local/cudnn/lib ...\n" // NOLINT
<< "Idea 2. set environment variable LD_LIBRARY_PATH on Linux or " << "Method 2. set environment variable LD_LIBRARY_PATH on Linux or "
<< "DYLD_LIBRARY_PATH on Mac OS. \n" << "DYLD_LIBRARY_PATH on Mac OS. \n"
<< "For instance, issue command: export LD_LIBRARY_PATH=... \n" << "For instance, issue command: export LD_LIBRARY_PATH=... \n"
<< "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible " << "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible "
<< "unless System Integrity Protection (SIP) is disabled. However, @Idea 1" << "unless System Integrity Protection (SIP) is disabled. However, method 1 " // NOLINT
<< "always work well."; << "always work well.";
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册