提交 0bd7613e 编写于 作者: T Tao Luo 提交者: GitHub

Merge pull request #539 from gangliao/stylecheckcc

Add style check for *.cc files in cuda directory
......@@ -81,5 +81,8 @@ else()
add_library(paddle_cuda ${CUDA_SOURCES})
endif()
add_style_check_target(paddle_cuda ${CUDA_SOURCES})
add_style_check_target(paddle_cuda ${CUDA_HEADERS})
add_style_check_target(paddle_cuda
${CUDA_SOURCES}
${CUDA_HEADERS}
${CUDA_DSO_SOURCES}
${CUDA_CXX_WITH_GPU_SOURCES})
......@@ -104,7 +104,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
#endif
const char* hl_cublas_get_error_string(cublasStatus_t status) {
switch(status) {
switch (status) {
case CUBLAS_STATUS_NOT_INITIALIZED:
return "[cublas status]: not initialized";
case CUBLAS_STATUS_ALLOC_FAILED:
......@@ -181,7 +181,7 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
real **inout_d = (real **)hl_malloc_device(sizeof(real *));
hl_memcpy(inout_d, inout_h, sizeof(real *));
int *pivot_d = (int *)hl_malloc_device(dimN*sizeof(int));
int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int));
int *info_d = (int *)t_resource.gpu_mem;
/* Note: cublasSgetrfBatched is used to calculate a number of
......@@ -189,10 +189,9 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
the API for better performance.
*/
CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle,
dimN, inout_d, lda, pivot_d,
info_d, 1));
dimN, inout_d, lda, pivot_d, info_d, 1));
int info_h;
int info_h;
hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) {
LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
......@@ -204,8 +203,8 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
hl_memcpy(out_d, out_h, sizeof(real *));
CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
dimN, (const real **)inout_d, lda, pivot_d,
out_d, ldc, info_d, 1));
dimN, (const real **)inout_d, lda, pivot_d,
out_d, ldc, info_d, 1));
hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) {
......@@ -215,7 +214,7 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
hl_free_mem_device(inout_d);
hl_free_mem_device(pivot_d);
hl_free_mem_device(out_d);
CHECK_SYNC("hl_matrix_inverse failed");
}
......
......@@ -159,13 +159,11 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
bool g_is_libcudnn_init = false;
int g_cudnn_lib_version = 0;
void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc)
{
void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) {
CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
}
void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream)
{
void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream) {
size_t cudnn_dso_ver = dynload::cudnnGetVersion();
size_t cudnn_dso_major = cudnn_dso_ver / 1000;
size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
......@@ -212,13 +210,18 @@ void hl_conv_workspace(hl_tensor_descriptor input,
CHECK_NOTNULL(conv);
// Specify workspace limit directly
size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
size_t memoryLimitBytes =
(1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
// cudnn convolution forward configuration
cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
cudnnTensorDescriptor_t fwd_src_desc =
GET_TENSOR_DESCRIPTOR(input);
cudnnTensorDescriptor_t fwd_dest_desc =
GET_TENSOR_DESCRIPTOR(output);
cudnnFilterDescriptor_t fwd_filter_desc =
GET_FILTER_DESCRIPTOR(filter);
cudnnConvolutionDescriptor_t fwd_conv_desc =
GET_CONVOLUTION_DESCRIPTOR(conv);
CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
t_resource.cudnn_handle,
......@@ -250,23 +253,23 @@ void hl_conv_workspace(hl_tensor_descriptor input,
GET_CONVOLUTION_DESCRIPTOR(conv);
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
t_resource.cudnn_handle,
bwd_data_filter_desc,
bwd_data_diff_desc,
bwd_data_conv_desc,
bwd_data_grad_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
t_resource.cudnn_handle,
bwd_data_filter_desc,
bwd_data_diff_desc,
bwd_data_conv_desc,
bwd_data_grad_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
t_resource.cudnn_handle,
bwd_data_filter_desc,
bwd_data_diff_desc,
bwd_data_conv_desc,
bwd_data_grad_desc,
static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
bwdDataLimitBytes));
t_resource.cudnn_handle,
bwd_data_filter_desc,
bwd_data_diff_desc,
bwd_data_conv_desc,
bwd_data_grad_desc,
static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
bwdDataLimitBytes));
// cudnn convolution backward filter configuration
cudnnTensorDescriptor_t bwd_filter_src_desc =
......@@ -279,21 +282,21 @@ void hl_conv_workspace(hl_tensor_descriptor input,
GET_FILTER_DESCRIPTOR(filter);
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
t_resource.cudnn_handle,
bwd_filter_src_desc,
bwd_filter_diff_desc,
bwd_filter_conv_desc,
bwd_filter_grad_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
t_resource.cudnn_handle,
bwd_filter_src_desc,
bwd_filter_diff_desc,
bwd_filter_conv_desc,
bwd_filter_grad_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
t_resource.cudnn_handle, bwd_filter_src_desc,
bwd_filter_diff_desc, bwd_filter_conv_desc,
bwd_filter_grad_desc,
static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
bwdFilterLimitBytes));
t_resource.cudnn_handle, bwd_filter_src_desc,
bwd_filter_diff_desc, bwd_filter_conv_desc,
bwd_filter_grad_desc,
static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
bwdFilterLimitBytes));
#endif
}
......@@ -302,8 +305,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
int batch_size,
int feature_maps,
int height,
int width)
{
int width) {
CHECK_NOTNULL(image_desc);
cudnn_tensor_descriptor hl_desc =
......@@ -359,8 +361,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
int batch_size,
int feature_maps,
int height,
int width)
{
int width) {
const int stride_w = 1;
const int stride_h = width * stride_w;
const int stride_c = height * stride_h;
......@@ -384,8 +385,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
int nStride,
int cStride,
int hStride,
int wStride)
{
int wStride) {
CHECK_NOTNULL(image_desc);
cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
......@@ -408,8 +408,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
hl_desc->width = width;
}
void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc)
{
void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {
CHECK_NOTNULL(image_desc);
cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
......@@ -430,11 +429,9 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
int height_padding,
int width_padding,
int stride_height,
int stride_width)
{
int stride_width) {
cudnnPoolingMode_t cudnn_mode;
switch (mode)
{
switch (mode) {
case HL_POOLING_MAX:
cudnn_mode = CUDNN_POOLING_MAX;
break;
......@@ -478,13 +475,13 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
*pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
}
void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc)
{
void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {
CHECK_NOTNULL(pooling_desc);
cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
CHECK_NOTNULL(hl_pooling->desc);
cudnn_pooling_descriptor hl_pooling =
(cudnn_pooling_descriptor)pooling_desc;
CHECK_NOTNULL(hl_pooling->desc);
CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
hl_pooling->desc = NULL;
......@@ -496,8 +493,7 @@ void hl_pooling_forward(hl_tensor_descriptor input,
real* input_image,
hl_tensor_descriptor output,
real* output_image,
hl_pooling_descriptor pooling)
{
hl_pooling_descriptor pooling) {
cudnnPoolingDescriptor_t pooling_desc;
cudnnTensorDescriptor_t input_desc;
cudnnTensorDescriptor_t output_desc;
......@@ -531,8 +527,7 @@ void hl_pooling_backward(hl_tensor_descriptor input,
hl_tensor_descriptor output,
real* output_image,
real* output_image_grad,
hl_pooling_descriptor pooling)
{
hl_pooling_descriptor pooling) {
cudnnPoolingDescriptor_t pooling_desc;
cudnnTensorDescriptor_t input_desc;
cudnnTensorDescriptor_t output_desc;
......@@ -571,8 +566,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
int input_feature_maps,
int output_feature_maps,
int height,
int width)
{
int width) {
CHECK_NOTNULL(filter);
cudnn_filter_descriptor hl_filter =
......@@ -607,8 +601,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
}
void hl_destroy_filter_descriptor(hl_filter_descriptor filter)
{
void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {
CHECK_NOTNULL(filter);
cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
......@@ -627,14 +620,13 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_height,
int padding_width,
int stride_height,
int stride_width)
{
int stride_width) {
CHECK_NOTNULL(conv);
cudnn_convolution_descriptor hl_conv =
(cudnn_convolution_descriptor)malloc(sizeof(_cudnn_convolution_descriptor));
CHECK_NOTNULL(hl_conv);
cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)
malloc(sizeof(_cudnn_convolution_descriptor));
CHECK_NOTNULL(hl_conv);
CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
......@@ -667,8 +659,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_height,
int padding_width,
int stride_height,
int stride_width)
{
int stride_width) {
CHECK_NOTNULL(conv);
CHECK_NOTNULL(image);
CHECK_NOTNULL(filter);
......@@ -697,8 +688,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
hl_conv->mode = mode;
}
void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv)
{
void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {
CHECK_NOTNULL(conv);
cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
......@@ -753,8 +743,7 @@ void hl_convolution_forward(hl_tensor_descriptor input,
void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
real* bias_data,
hl_tensor_descriptor output,
real* output_data)
{
real* output_data) {
CHECK_NOTNULL(bias);
CHECK_NOTNULL(output);
CHECK_NOTNULL(bias_data);
......@@ -782,8 +771,7 @@ void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
void hl_convolution_backward_bias(hl_tensor_descriptor bias,
real* bias_grad_data,
hl_tensor_descriptor output,
real* output_grad_data)
{
real* output_grad_data) {
CHECK_NOTNULL(bias);
CHECK_NOTNULL(output);
CHECK_NOTNULL(bias_grad_data);
......@@ -814,7 +802,6 @@ void hl_convolution_backward_filter(hl_tensor_descriptor input,
void* gpuWorkSpace,
size_t sizeInBytes,
int convBwdFilterAlgo) {
CHECK_NOTNULL(input);
CHECK_NOTNULL(output);
CHECK_NOTNULL(filter);
......@@ -889,8 +876,7 @@ void hl_convolution_backward_data(hl_tensor_descriptor input,
void hl_softmax_forward(real *input,
real *output,
int height,
int width)
{
int width) {
#ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
......@@ -923,8 +909,7 @@ void hl_softmax_forward(real *input,
void hl_softmax_backward(real *output_value,
real *output_grad,
int height,
int width)
{
int width) {
#ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
......
......@@ -203,8 +203,8 @@ inline pid_t gettid() {
#endif
pid_t tid = syscall(__NR_gettid);
#endif
CHECK_NE(tid, -1);
return tid;
CHECK_NE((int)tid, -1);
return tid;
}
void hl_init(int device) {
......@@ -355,7 +355,8 @@ void* hl_malloc_host(size_t size) {
void *dest_h;
CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
CHECK_CUDA(dynload::cudaHostAlloc((void**)&dest_h, size, cudaHostAllocDefault));
CHECK_CUDA(dynload::cudaHostAlloc(
(void**)&dest_h, size, cudaHostAllocDefault));
return dest_h;
}
......@@ -364,7 +365,7 @@ void hl_free_mem_host(void *dest_h) {
CHECK_NOTNULL(dest_h);
cudaError_t err = dynload::cudaFreeHost(dest_h);
CHECK (cudaSuccess == err || cudaErrorCudartUnloading == err)
CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
<< hl_get_device_error_string();
}
......@@ -502,7 +503,8 @@ int hl_get_cuda_version() {
return g_cuda_lib_version;
}
void hl_create_thread_resources(int device, thread_device_resources device_res) {
void hl_create_thread_resources(int device,
thread_device_resources device_res) {
CHECK_CUDA(dynload::cudaSetDevice(device));
/* create thread stream */
......
......@@ -78,48 +78,38 @@ __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
dim3 blockDim,
void **args,
size_t sharedMem,
cudaStream_t stream)
{
return dynload::cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);
cudaStream_t stream) {
return dynload::cudaLaunchKernel(func, gridDim, blockDim,
args, sharedMem, stream);
}
#endif /* CUDART_VERSION >= 7000 */
__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func)
{
__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
return dynload::cudaLaunch(func);
}
__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
size_t size,
size_t offset)
{
size_t offset) {
return dynload::cudaSetupArgument(arg, size, offset);
}
__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
dim3 blockDim,
size_t sharedMem,
cudaStream_t stream)
{
cudaStream_t stream) {
return dynload::cudaConfigureCall(gridDim, blockDim,
sharedMem, stream);
}
extern "C" {
void** CUDARTAPI __cudaRegisterFatBinary(
void *fatCubin
)
{
void** CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
return dynload::__cudaRegisterFatBinary(fatCubin);
}
void CUDARTAPI __cudaUnregisterFatBinary(
void **fatCubinHandle
)
{
void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
}
......
......@@ -19,17 +19,18 @@ limitations under the License. */
P_DEFINE_string(cudnn_dir, "",
"Specify path for loading libcudnn.so. For instance, "
"/usr/local/cudnn/lib64. If empty [default], dlopen will search "
"cudnn from LD_LIBRARY_PATH");
"/usr/local/cudnn/lib64. If empty [default], dlopen "
"will search cudnn from LD_LIBRARY_PATH");
P_DEFINE_string(cuda_dir, "",
"Specify path for loading cuda library, such as libcublas, "
"libcurand. For instance, /usr/local/cuda/lib64. "
"(Note: libcudart can not be specified by cuda_dir, since some "
"libcurand. For instance, /usr/local/cuda/lib64. (Note: "
"libcudart can not be specified by cuda_dir, since some "
"build-in function in cudart already ran before main entry). "
"If empty [default], dlopen will search cuda from LD_LIBRARY_PATH");
"If default, dlopen will search cuda from LD_LIBRARY_PATH");
static inline std::string join(const std::string& part1, const std::string& part2) {
static inline std::string join(const std::string& part1,
const std::string& part2) {
// directory separator
const char sep = '/';
......@@ -49,10 +50,10 @@ static inline std::string join(const std::string& part1, const std::string& part
static inline void GetDsoHandleFromDefaultPath(
std::string& dso_path, void** dso_handle, int dynload_flags) {
VLOG(3) << "Try to find cuda library: " << dso_path
<< " from default system path.";
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
<< " from default system path.";
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 by
// System Integrity Protection (SIP); if dso_handle
// is null, search from the default package path in Mac OS.
......@@ -62,13 +63,13 @@ static inline void GetDsoHandleFromDefaultPath(
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
if (nullptr == *dso_handle) {
if (dso_path == "libcudnn.dylib") {
LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"
<< "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C "
<< "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h "
LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT
<< "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C " // NOLINT
<< "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h " // NOLINT
<< "/usr/local/cuda/lib/libcudnn*";
}
}
}
}
}
#endif
}
......@@ -96,19 +97,19 @@ static inline void GetDsoHandleFromSearchPath(
CHECK(nullptr != *dso_handle)
<< "Failed to find cuda library: " << dlPath << std::endl
<< "Please specify its path correctly using one of the following ideas: \n"
<< "Please specify its path correctly using one of the following ways: \n" // NOLINT
<< "Idea 1. set cuda and cudnn lib path at runtime. "
<< "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n"
<< "Method 1. set cuda and cudnn lib path at runtime. "
<< "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n" // NOLINT
<< "For instance, issue command: paddle train --use_gpu=1 "
<< "--cuda_dir=/usr/local/cudnn/lib --cudnn_dir=/usr/local/cudnn/lib ...\n"
<< "--cuda_dir=/usr/local/cuda/lib64 --cudnn_dir=/usr/local/cudnn/lib ...\n" // NOLINT
<< "Idea 2. set environment variable LD_LIBRARY_PATH on Linux or "
<< "Method 2. set environment variable LD_LIBRARY_PATH on Linux or "
<< "DYLD_LIBRARY_PATH on Mac OS. \n"
<< "For instance, issue command: export LD_LIBRARY_PATH=... \n"
<< "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible "
<< "unless System Integrity Protection (SIP) is disabled. However, @Idea 1"
<< "unless System Integrity Protection (SIP) is disabled. However, method 1 " // NOLINT
<< "always work well.";
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册