Commit 20aac5bb
Authored Nov 20, 2016 by liaogang

Add style check for *.cc files in cuda directory

Parent: 77ddce0f
Showing 6 changed files with 112 additions and 132 deletions (+112 -132):
paddle/cuda/CMakeLists.txt          +5   -2
paddle/cuda/src/hl_cuda_cublas.cc   +7   -8
paddle/cuda/src/hl_cuda_cudnn.cc    +64  -79
paddle/cuda/src/hl_cuda_device.cc   +7   -5
paddle/cuda/src/hl_cudart_wrap.cc   +8   -18
paddle/cuda/src/hl_dso_loader.cc    +21  -20
paddle/cuda/CMakeLists.txt

```diff
@@ -81,5 +81,8 @@ else()
     add_library(paddle_cuda ${CUDA_SOURCES})
 endif()
 
-add_style_check_target(paddle_cuda ${CUDA_SOURCES})
-add_style_check_target(paddle_cuda ${CUDA_HEADERS})
+add_style_check_target(paddle_cuda
+                       ${CUDA_SOURCES}
+                       ${CUDA_HEADERS}
+                       ${CUDA_DSO_SOURCES}
+                       ${CUDA_CXX_WITH_GPU_SOURCES})
```
paddle/cuda/src/hl_cuda_cublas.cc

```diff
@@ -104,7 +104,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
 #endif
 
 const char* hl_cublas_get_error_string(cublasStatus_t status) {
-  switch(status) {
+  switch (status) {
     case CUBLAS_STATUS_NOT_INITIALIZED:
       return "[cublas status]: not initialized";
     case CUBLAS_STATUS_ALLOC_FAILED:
@@ -181,7 +181,7 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
   real **inout_d = (real **)hl_malloc_device(sizeof(real *));
   hl_memcpy(inout_d, inout_h, sizeof(real *));
 
-  int *pivot_d = (int *)hl_malloc_device(dimN*sizeof(int));
+  int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int));
   int *info_d = (int *)t_resource.gpu_mem;
 
   /* Note: cublasSgetrfBatched is used to calculate a number of
@@ -189,10 +189,9 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
      the API for better performance.
   */
   CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle,
-               dimN, inout_d, lda, pivot_d, info_d, 1));
-
-  int info_h;
+                            dimN, inout_d, lda, pivot_d, info_d, 1));
+  int info_h;
   hl_memcpy(&info_h, info_d, sizeof(int));
 
   if (info_h != 0) {
     LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n"
@@ -204,8 +203,8 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
   hl_memcpy(out_d, out_h, sizeof(real *));
 
   CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
-               dimN, (const real **)inout_d, lda, pivot_d,
-               out_d, ldc, info_d, 1));
+                            dimN, (const real **)inout_d, lda, pivot_d,
+                            out_d, ldc, info_d, 1));
 
   hl_memcpy(&info_h, info_d, sizeof(int));
   if (info_h != 0) {
@@ -215,7 +214,7 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
   hl_free_mem_device(inout_d);
   hl_free_mem_device(pivot_d);
   hl_free_mem_device(out_d);
 
   CHECK_SYNC("hl_matrix_inverse failed");
 }
```
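The hunk comment above notes that cublasSgetrfBatched is used (with a batch count of 1) because cuBLAS exposes matrix inversion only through its batched LU routines. Below is a minimal standalone sketch of that factorize-then-invert pattern with raw cuBLAS calls; it assumes Paddle's CUBLAS_GETRF/CUBLAS_GETRI macros map to cublasSgetrfBatched/cublasSgetriBatched in single precision, and the helper name invert_on_device is illustrative, not Paddle API.

```cpp
#include <cassert>
#include <cublas_v2.h>
#include <cuda_runtime.h>

// Invert one n x n float matrix A_d into C_d (both device pointers).
void invert_on_device(cublasHandle_t handle, float* A_d, float* C_d, int n) {
  // The batched API takes device-side arrays of matrix pointers.
  float** A_array = nullptr;
  float** C_array = nullptr;
  int* pivot_d = nullptr;
  int* info_d = nullptr;
  cudaMalloc(&A_array, sizeof(float*));
  cudaMalloc(&C_array, sizeof(float*));
  cudaMalloc(&pivot_d, n * sizeof(int));
  cudaMalloc(&info_d, sizeof(int));
  cudaMemcpy(A_array, &A_d, sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpy(C_array, &C_d, sizeof(float*), cudaMemcpyHostToDevice);

  // In-place LU factorization, batch size 1.
  cublasStatus_t st =
      cublasSgetrfBatched(handle, n, A_array, n, pivot_d, info_d, 1);
  assert(st == CUBLAS_STATUS_SUCCESS);

  // A zero info value means the factorization succeeded (non-singular).
  int info_h = -1;
  cudaMemcpy(&info_h, info_d, sizeof(int), cudaMemcpyDeviceToHost);
  assert(info_h == 0 && "matrix may be singular");

  // Compute the inverse from the LU factors into C_d.
  st = cublasSgetriBatched(handle, n, (const float**)A_array, n, pivot_d,
                           C_array, n, info_d, 1);
  assert(st == CUBLAS_STATUS_SUCCESS);

  cudaFree(A_array);
  cudaFree(C_array);
  cudaFree(pivot_d);
  cudaFree(info_d);
}
```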
paddle/cuda/src/hl_cuda_cudnn.cc

```diff
@@ -159,13 +159,11 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
 bool g_is_libcudnn_init = false;
 int g_cudnn_lib_version = 0;
 
-void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc)
-{
+void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) {
   CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
 }
 
-void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream)
-{
+void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) {
   size_t cudnn_dso_ver = dynload::cudnnGetVersion();
   size_t cudnn_dso_major = cudnn_dso_ver / 1000;
   size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
@@ -212,13 +210,18 @@ void hl_conv_workspace(hl_tensor_descriptor input,
   CHECK_NOTNULL(conv);
 
   // Specify workspace limit directly
-  size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
+  size_t memoryLimitBytes =
+      (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
 
   // cudnn convolution forward configuration
-  cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+  cudnnTensorDescriptor_t fwd_src_desc =
+      GET_TENSOR_DESCRIPTOR(input);
+  cudnnTensorDescriptor_t fwd_dest_desc =
+      GET_TENSOR_DESCRIPTOR(output);
+  cudnnFilterDescriptor_t fwd_filter_desc =
+      GET_FILTER_DESCRIPTOR(filter);
+  cudnnConvolutionDescriptor_t fwd_conv_desc =
+      GET_CONVOLUTION_DESCRIPTOR(conv);
 
   CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
       t_resource.cudnn_handle,
@@ -250,23 +253,23 @@ void hl_conv_workspace(hl_tensor_descriptor input,
       GET_CONVOLUTION_DESCRIPTOR(conv);
 
   CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-              t_resource.cudnn_handle,
-              bwd_data_filter_desc,
-              bwd_data_diff_desc,
-              bwd_data_conv_desc,
-              bwd_data_grad_desc,
-              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-              memoryLimitBytes,
-              reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
+      t_resource.cudnn_handle,
+      bwd_data_filter_desc,
+      bwd_data_diff_desc,
+      bwd_data_conv_desc,
+      bwd_data_grad_desc,
+      CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+      memoryLimitBytes,
+      reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
 
   CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
-              t_resource.cudnn_handle,
-              bwd_data_filter_desc,
-              bwd_data_diff_desc,
-              bwd_data_conv_desc,
-              bwd_data_grad_desc,
-              static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
-              bwdDataLimitBytes));
+      t_resource.cudnn_handle,
+      bwd_data_filter_desc,
+      bwd_data_diff_desc,
+      bwd_data_conv_desc,
+      bwd_data_grad_desc,
+      static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
+      bwdDataLimitBytes));
 
   // cudnn convolution backward filter configuration
   cudnnTensorDescriptor_t bwd_filter_src_desc =
@@ -279,21 +282,21 @@ void hl_conv_workspace(hl_tensor_descriptor input,
       GET_FILTER_DESCRIPTOR(filter);
 
   CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-              t_resource.cudnn_handle,
-              bwd_filter_src_desc,
-              bwd_filter_diff_desc,
-              bwd_filter_conv_desc,
-              bwd_filter_grad_desc,
-              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-              memoryLimitBytes,
-              reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
+      t_resource.cudnn_handle,
+      bwd_filter_src_desc,
+      bwd_filter_diff_desc,
+      bwd_filter_conv_desc,
+      bwd_filter_grad_desc,
+      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+      memoryLimitBytes,
+      reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
 
   CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
-              t_resource.cudnn_handle,
-              bwd_filter_src_desc,
-              bwd_filter_diff_desc,
-              bwd_filter_conv_desc,
-              bwd_filter_grad_desc,
-              static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
-              bwdFilterLimitBytes));
+      t_resource.cudnn_handle,
+      bwd_filter_src_desc,
+      bwd_filter_diff_desc,
+      bwd_filter_conv_desc,
+      bwd_filter_grad_desc,
+      static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
+      bwdFilterLimitBytes));
 
 #endif
 }
@@ -302,8 +305,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
                                  int batch_size,
                                  int feature_maps,
                                  int height,
-                                 int width)
-{
+                                 int width) {
   CHECK_NOTNULL(image_desc);
 
   cudnn_tensor_descriptor hl_desc =
@@ -359,8 +361,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
                        int batch_size,
                        int feature_maps,
                        int height,
-                       int width)
-{
+                       int width) {
   const int stride_w = 1;
   const int stride_h = width * stride_w;
   const int stride_c = height * stride_h;
@@ -384,8 +385,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
                        int nStride,
                        int cStride,
                        int hStride,
-                       int wStride)
-{
+                       int wStride) {
   CHECK_NOTNULL(image_desc);
 
   cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
@@ -408,8 +408,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
   hl_desc->width = width;
 }
 
-void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc)
-{
+void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {
   CHECK_NOTNULL(image_desc);
 
   cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
@@ -430,11 +429,9 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
                                   int height_padding,
                                   int width_padding,
                                   int stride_height,
-                                  int stride_width)
-{
+                                  int stride_width) {
   cudnnPoolingMode_t cudnn_mode;
-  switch (mode)
-  {
+  switch (mode) {
     case HL_POOLING_MAX:
       cudnn_mode = CUDNN_POOLING_MAX;
       break;
@@ -478,13 +475,13 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
   *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
 }
 
-void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc)
-{
+void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {
   CHECK_NOTNULL(pooling_desc);
 
-  cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
-  CHECK_NOTNULL(hl_pooling->desc) ;
+  cudnn_pooling_descriptor hl_pooling =
+      (cudnn_pooling_descriptor)pooling_desc;
+  CHECK_NOTNULL(hl_pooling->desc);
 
   CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
   hl_pooling->desc = NULL;
@@ -496,8 +493,7 @@ void hl_pooling_forward(hl_tensor_descriptor input,
                         real* input_image,
                         hl_tensor_descriptor output,
                         real* output_image,
-                        hl_pooling_descriptor pooling)
-{
+                        hl_pooling_descriptor pooling) {
   cudnnPoolingDescriptor_t pooling_desc;
   cudnnTensorDescriptor_t input_desc;
   cudnnTensorDescriptor_t output_desc;
@@ -531,8 +527,7 @@ void hl_pooling_backward(hl_tensor_descriptor input,
                          hl_tensor_descriptor output,
                          real* output_image,
                          real* output_image_grad,
-                         hl_pooling_descriptor pooling)
-{
+                         hl_pooling_descriptor pooling) {
   cudnnPoolingDescriptor_t pooling_desc;
   cudnnTensorDescriptor_t input_desc;
   cudnnTensorDescriptor_t output_desc;
@@ -571,8 +566,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
                                 int input_feature_maps,
                                 int output_feature_maps,
                                 int height,
-                                int width)
-{
+                                int width) {
   CHECK_NOTNULL(filter);
 
   cudnn_filter_descriptor hl_filter =
@@ -607,8 +601,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
 }
 
-void hl_destroy_filter_descriptor(hl_filter_descriptor filter)
-{
+void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {
   CHECK_NOTNULL(filter);
 
   cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
@@ -627,14 +620,13 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                       int padding_height,
                                       int padding_width,
                                       int stride_height,
-                                      int stride_width)
-{
+                                      int stride_width) {
   CHECK_NOTNULL(conv);
 
-  cudnn_convolution_descriptor hl_conv =
-      (cudnn_convolution_descriptor)malloc(sizeof(_cudnn_convolution_descriptor));
-  CHECK_NOTNULL(hl_conv);
+  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
+      sizeof(_cudnn_convolution_descriptor));
+  CHECK_NOTNULL(hl_conv);
 
   CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
   cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
@@ -667,8 +659,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                     int padding_height,
                                     int padding_width,
                                     int stride_height,
-                                    int stride_width)
-{
+                                    int stride_width) {
   CHECK_NOTNULL(conv);
   CHECK_NOTNULL(image);
   CHECK_NOTNULL(filter);
@@ -697,8 +688,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
   hl_conv->mode = mode;
 }
 
-void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv)
-{
+void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {
   CHECK_NOTNULL(conv);
 
   cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
@@ -753,8 +743,7 @@ void hl_convolution_forward(hl_tensor_descriptor input,
 void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
                                      real* bias_data,
                                      hl_tensor_descriptor output,
-                                     real* output_data)
-{
+                                     real* output_data) {
   CHECK_NOTNULL(bias);
   CHECK_NOTNULL(output);
   CHECK_NOTNULL(bias_data);
@@ -782,8 +771,7 @@ void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
 void hl_convolution_backward_bias(hl_tensor_descriptor bias,
                                   real* bias_grad_data,
                                   hl_tensor_descriptor output,
-                                  real* output_grad_data)
-{
+                                  real* output_grad_data) {
   CHECK_NOTNULL(bias);
   CHECK_NOTNULL(output);
   CHECK_NOTNULL(bias_grad_data);
@@ -814,7 +802,6 @@ void hl_convolution_backward_filter(hl_tensor_descriptor input,
                                     void* gpuWorkSpace,
                                     size_t sizeInBytes,
                                     int convBwdFilterAlgo) {
-
   CHECK_NOTNULL(input);
   CHECK_NOTNULL(output);
   CHECK_NOTNULL(filter);
@@ -889,8 +876,7 @@ void hl_convolution_backward_data(hl_tensor_descriptor input,
 void hl_softmax_forward(real *input,
                         real *output,
                         int height,
-                        int width)
-{
+                        int width) {
 #ifndef PADDLE_TYPE_DOUBLE
   cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
@@ -923,8 +909,7 @@ void hl_softmax_forward(real *input,
 void hl_softmax_backward(real *output_value,
                          real *output_grad,
                          int height,
-                         int width)
-{
+                         int width) {
 #ifndef PADDLE_TYPE_DOUBLE
   cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
```
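hl_conv_workspace asks cuDNN for the fastest algorithm whose workspace fits under FLAGS_cudnn_conv_workspace_limit_in_mb, then queries the workspace that choice actually needs. Below is a condensed sketch of the forward-path half of that pattern, written against the cuDNN v4/v5-era API this file targets (cudnnGetConvolutionForwardAlgorithm was removed in cuDNN 8); pick_fwd_algo and its parameter names are illustrative, not Paddle API.

```cpp
#include <cudnn.h>

// Pick a forward conv algorithm under a workspace budget and return the
// workspace size it needs. All descriptors are assumed already configured.
size_t pick_fwd_algo(cudnnHandle_t handle,
                     cudnnTensorDescriptor_t src,
                     cudnnFilterDescriptor_t filter,
                     cudnnConvolutionDescriptor_t conv,
                     cudnnTensorDescriptor_t dest,
                     size_t limit_mb,
                     cudnnConvolutionFwdAlgo_t* algo) {
  // Same MB-to-bytes conversion the diff uses for memoryLimitBytes.
  size_t limit_bytes = (1LL << 20) * limit_mb;

  // Fastest algorithm whose workspace fits under the limit...
  cudnnGetConvolutionForwardAlgorithm(
      handle, src, filter, conv, dest,
      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, limit_bytes, algo);

  // ...then the exact workspace that choice requires.
  size_t workspace = 0;
  cudnnGetConvolutionForwardWorkspaceSize(handle, src, filter, conv, dest,
                                          *algo, &workspace);
  return workspace;
}
```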
paddle/cuda/src/hl_cuda_device.cc

```diff
@@ -203,8 +203,8 @@ inline pid_t gettid() {
 #endif
   pid_t tid = syscall(__NR_gettid);
 #endif
-  CHECK_NE(tid, -1);
-  return tid;
+  CHECK_NE((int)tid, -1);
+  return tid;
 }
 
 void hl_init(int device) {
@@ -355,7 +355,8 @@ void* hl_malloc_host(size_t size) {
   void* dest_h;
   CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(dynload::cudaHostAlloc((void**)&dest_h, size, cudaHostAllocDefault));
+  CHECK_CUDA(dynload::cudaHostAlloc(
+      (void**)&dest_h, size, cudaHostAllocDefault));
 
   return dest_h;
 }
@@ -364,7 +365,7 @@ void hl_free_mem_host(void *dest_h) {
   CHECK_NOTNULL(dest_h);
 
   cudaError_t err = dynload::cudaFreeHost(dest_h);
   CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
       << hl_get_device_error_string();
 }
@@ -502,7 +503,8 @@ int hl_get_cuda_version() {
   return g_cuda_lib_version;
 }
 
-void hl_create_thread_resources(int device, thread_device_resources device_res) {
+void hl_create_thread_resources(int device,
+                                thread_device_resources device_res) {
   CHECK_CUDA(dynload::cudaSetDevice(device));
 
   /* create thread stream */
```
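The gettid() change casts the syscall result so both arguments of the CHECK_NE comparison share a type. A standalone sketch of the same helper, assuming Linux, where glibc before 2.30 shipped no gettid(3) wrapper and the thread id must come from a raw syscall; assert stands in for glog's CHECK_NE here.

```cpp
#include <cassert>
#include <cstdio>
#include <sys/syscall.h>
#include <unistd.h>

static pid_t gettid_compat() {
  // SYS_gettid is the <sys/syscall.h> spelling of __NR_gettid in the diff.
  pid_t tid = (pid_t)syscall(SYS_gettid);
  // Cast mirrors the commit: keep both sides of the comparison as int.
  assert((int)tid != -1);
  return tid;
}

int main() {
  std::printf("thread id: %d\n", (int)gettid_compat());
  return 0;
}
```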
paddle/cuda/src/hl_cudart_wrap.cc

```diff
@@ -78,48 +78,38 @@ __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
                                                 dim3 gridDim,
                                                 dim3 blockDim,
                                                 void **args,
                                                 size_t sharedMem,
-                                                cudaStream_t stream)
-{
-  return dynload::cudaLaunchKernel(func, gridDim, blockDim,
-                                   args, sharedMem, stream);
+                                                cudaStream_t stream) {
+  return dynload::cudaLaunchKernel(
+      func, gridDim, blockDim, args, sharedMem, stream);
 }
 #endif /* CUDART_VERSION >= 7000 */
 
-__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func)
-{
+__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
   return dynload::cudaLaunch(func);
 }
 
 __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
                                                  size_t size,
-                                                 size_t offset)
-{
+                                                 size_t offset) {
   return dynload::cudaSetupArgument(arg, size, offset);
 }
 
 __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
                                                  dim3 blockDim,
                                                  size_t sharedMem,
-                                                 cudaStream_t stream)
-{
+                                                 cudaStream_t stream) {
   return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
 }
 
 extern "C" {
 
-void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin)
-{
+void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
   return dynload::__cudaRegisterFatBinary(fatCubin);
 }
 
-void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle)
-{
+void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
   return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
 }
```
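Every wrapper in hl_cudart_wrap.cc forwards to a dynload:: symbol that is resolved at runtime instead of being linked against libcudart directly. A sketch of that resolve-and-forward idea with plain dlopen/dlsym (not Paddle's actual DYNAMIC_LOAD macros); compile with -ldl, and note the function-pointer type is an assumption that approximates cudaError_t as int.

```cpp
#include <dlfcn.h>
#include <cstdio>
#include <cstdlib>

// Resolve `name` from an already dlopen()ed library handle, or abort.
template <typename Func>
Func resolve(void* lib, const char* name) {
  void* sym = dlsym(lib, name);
  if (sym == nullptr) {
    std::fprintf(stderr, "failed to resolve %s: %s\n", name, dlerror());
    std::exit(1);
  }
  return reinterpret_cast<Func>(sym);
}

int main() {
  void* cudart = dlopen("libcudart.so", RTLD_LAZY | RTLD_GLOBAL);
  if (cudart == nullptr) return 1;  // no CUDA runtime on this machine

  // Illustrative only: forward one call the way the dynload:: wrappers do.
  using GetDeviceCount = int (*)(int*);  // cudaError_t approximated as int
  auto get_count = resolve<GetDeviceCount>(cudart, "cudaGetDeviceCount");
  int n = 0;
  get_count(&n);
  std::printf("devices: %d\n", n);

  dlclose(cudart);
  return 0;
}
```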
paddle/cuda/src/hl_dso_loader.cc

```diff
@@ -19,17 +19,18 @@ limitations under the License. */
 P_DEFINE_string(cudnn_dir, "",
                 "Specify path for loading libcudnn.so. For instance, "
-                "/usr/local/cudnn/lib64. If empty [default], dlopen will search "
-                "cudnn from LD_LIBRARY_PATH");
+                "/usr/local/cudnn/lib64. If empty [default], dlopen "
+                "will search cudnn from LD_LIBRARY_PATH");
 
 P_DEFINE_string(cuda_dir, "",
                 "Specify path for loading cuda library, such as libcublas, "
-                "libcurand. For instance, /usr/local/cuda/lib64. "
-                "(Note: libcudart can not be specified by cuda_dir, since some "
+                "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
+                "libcudart can not be specified by cuda_dir, since some "
                 "build-in function in cudart already ran before main entry). "
-                "If empty [default], dlopen will search cuda from LD_LIBRARY_PATH");
+                "If default, dlopen will search cuda from LD_LIBRARY_PATH");
 
-static inline std::string join(const std::string& part1, const std::string& part2) {
+static inline std::string join(const std::string& part1,
+                               const std::string& part2) {
   // directory separator
   const char sep = '/';
@@ -49,10 +50,10 @@ static inline std::string join(const std::string& part1, const std::string& part
 static inline void GetDsoHandleFromDefaultPath(
     std::string& dso_path, void** dso_handle, int dynload_flags) {
   VLOG(3) << "Try to find cuda library: " << dso_path
-      << " from default system path.";
-  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+          << " from default system path.";
+  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
   *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
 
   // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
   // bring System Integrity Projection (SIP), if dso_handle
   // is null, search from default package path in Mac OS.
@@ -62,13 +63,13 @@ static inline void GetDsoHandleFromDefaultPath(
         *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
         if (nullptr == *dso_handle) {
           if (dso_path == "libcudnn.dylib") {
-            LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/\n"
-                       << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C "
-                       << "/usr/local\n sudo chmod a+r /usr/local/cuda/include/cudnn.h "
+            LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/\n"  // NOLINT
+                       << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C "  // NOLINT
+                       << "/usr/local\n sudo chmod a+r /usr/local/cuda/include/cudnn.h "  // NOLINT
                        << "/usr/local/cuda/lib/libcudnn*";
           }
         }
       }
     }
   }
 #endif
 }
@@ -96,19 +97,19 @@ static inline void GetDsoHandleFromSearchPath(
   CHECK(nullptr != *dso_handle)
       << "Failed to find cuda library: " << dlPath << std::endl
-      << "Please specify its path correctly using one of the following ideas: \n"
-      << " Idea 1. set cuda and cudnn lib path at runtime. "
-      << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n"
+      << "Please specify its path correctly using one of the following ways: \n"  // NOLINT
+      << " Method 1. set cuda and cudnn lib path at runtime. "
+      << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n"  // NOLINT
       << "For instance, issue command: paddle train --use_gpu=1 "
-      << "--cuda_dir=/usr/local/cudnn/lib --cudnn_dir=/usr/local/cudnn/lib ...\n"
-      << " Idea 2. set environment variable LD_LIBRARY_PATH on Linux or "
+      << "--cuda_dir=/usr/local/cuda/lib64 --cudnn_dir=/usr/local/cudnn/lib ...\n"  // NOLINT
+      << " Method 2. set environment variable LD_LIBRARY_PATH on Linux or "
       << "DYLD_LIBRARY_PATH on Mac OS. \n"
       << "For instance, issue command: export LD_LIBRARY_PATH=... \n"
       << "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible "
-      << "unless System Integrity Protection (SIP) is disabled. However, @Idea 1"
+      << "unless System Integrity Protection (SIP) is disabled. However, method 1 "  // NOLINT
      << "always work well.";
 }
```
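GetDsoHandleFromSearchPath ties the flags above together: try the user-supplied directory first, then fall back to a bare library name so dlopen consults LD_LIBRARY_PATH (or DYLD_LIBRARY_PATH on Mac OS). A minimal sketch of that search order under those assumptions; open_dso is an illustrative name, not Paddle's.

```cpp
#include <dlfcn.h>
#include <string>

// Try dir/name first (mirroring the cudnn_dir / cuda_dir flags); fall back
// to the bare name so dlopen searches LD_LIBRARY_PATH / DYLD_LIBRARY_PATH.
void* open_dso(const std::string& dir, const std::string& name) {
  const int flags = RTLD_LAZY | RTLD_GLOBAL;
  if (!dir.empty()) {
    std::string path = dir + "/" + name;  // the join() helper from the diff
    if (void* handle = dlopen(path.c_str(), flags)) {
      return handle;
    }
  }
  return dlopen(name.c_str(), flags);
}
```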