diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index cdb730bb3cec7a32fa42cf4c6738d575b76c6032..11dbfb54b268774405ade1e532bef9a0e8c7ada9 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -81,5 +81,8 @@ else()
     add_library(paddle_cuda ${CUDA_SOURCES})
 endif()
 
-add_style_check_target(paddle_cuda ${CUDA_SOURCES})
-add_style_check_target(paddle_cuda ${CUDA_HEADERS})
+add_style_check_target(paddle_cuda
+                       ${CUDA_SOURCES}
+                       ${CUDA_HEADERS}
+                       ${CUDA_DSO_SOURCES}
+                       ${CUDA_CXX_WITH_GPU_SOURCES})
diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc
index f16376ec937d3a397d9e7117de528c304f8403ee..abf6afadc218f615dc6b3cf734d09f072214be40 100644
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -104,7 +104,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
 #endif
 
 const char* hl_cublas_get_error_string(cublasStatus_t status) {
-  switch(status) {
+  switch (status) {
      case CUBLAS_STATUS_NOT_INITIALIZED:
         return "[cublas status]: not initialized";
      case CUBLAS_STATUS_ALLOC_FAILED:
@@ -181,7 +181,7 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
   real **inout_d = (real **)hl_malloc_device(sizeof(real *));
   hl_memcpy(inout_d, inout_h, sizeof(real *));
 
-  int *pivot_d = (int *)hl_malloc_device(dimN*sizeof(int));  
+  int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int));
   int *info_d = (int *)t_resource.gpu_mem;
 
   /* Note: cublasSgetrfBatched is used to calculate a number of
@@ -189,10 +189,9 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
      the API for better performance.
    */
   CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle,
-	       dimN, inout_d, lda, pivot_d,
-               info_d, 1));
+      dimN, inout_d, lda, pivot_d, info_d, 1));
 
-  int info_h; 
+  int info_h;
   hl_memcpy(&info_h, info_d, sizeof(int));
   if (info_h != 0) {
       LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
@@ -204,8 +203,8 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
   hl_memcpy(out_d, out_h, sizeof(real *));
 
   CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
-	       dimN, (const real **)inout_d, lda, pivot_d,
-	       out_d, ldc, info_d, 1));
+      dimN, (const real **)inout_d, lda, pivot_d,
+      out_d, ldc, info_d, 1));
 
   hl_memcpy(&info_h, info_d, sizeof(int));
   if (info_h != 0) {
@@ -215,7 +214,7 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
   hl_free_mem_device(inout_d);
   hl_free_mem_device(pivot_d);
   hl_free_mem_device(out_d);
-  
+
   CHECK_SYNC("hl_matrix_inverse failed");
 }
 
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 92b28e4345c3d4d306e6ee2a7f9f50189454f951..1829fe23ac594e63253df23b350b16cb28eaebc1 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -159,13 +159,11 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
 bool g_is_libcudnn_init = false;
 int g_cudnn_lib_version = 0;
 
-void hl_cudnn_desc_init(cudnnTensorDescriptor_t*  cudnn_desc)
-{
+void hl_cudnn_desc_init(cudnnTensorDescriptor_t*  cudnn_desc) {
     CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
 }
 
-void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream)
-{
+void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream) {
     size_t cudnn_dso_ver = dynload::cudnnGetVersion();
     size_t cudnn_dso_major = cudnn_dso_ver / 1000;
     size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
@@ -212,13 +210,18 @@ void hl_conv_workspace(hl_tensor_descriptor input,
     CHECK_NOTNULL(conv);
 
     // Specify workspace limit directly
-    size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
+    size_t memoryLimitBytes =
+        (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
 
     // cudnn convolution forward configuration
-    cudnnTensorDescriptor_t       fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
-    cudnnTensorDescriptor_t       fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
-    cudnnFilterDescriptor_t       fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
-    cudnnConvolutionDescriptor_t  fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+    cudnnTensorDescriptor_t       fwd_src_desc =
+                                        GET_TENSOR_DESCRIPTOR(input);
+    cudnnTensorDescriptor_t       fwd_dest_desc =
+                                        GET_TENSOR_DESCRIPTOR(output);
+    cudnnFilterDescriptor_t       fwd_filter_desc =
+                                        GET_FILTER_DESCRIPTOR(filter);
+    cudnnConvolutionDescriptor_t  fwd_conv_desc =
+                                        GET_CONVOLUTION_DESCRIPTOR(conv);
 
     CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
              t_resource.cudnn_handle,
@@ -250,23 +253,23 @@ void hl_conv_workspace(hl_tensor_descriptor input,
                                           GET_CONVOLUTION_DESCRIPTOR(conv);
 
     CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-             t_resource.cudnn_handle,
-             bwd_data_filter_desc,
-             bwd_data_diff_desc,
-             bwd_data_conv_desc,
-             bwd_data_grad_desc,
-             CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-             memoryLimitBytes,
-             reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
+        t_resource.cudnn_handle,
+        bwd_data_filter_desc,
+        bwd_data_diff_desc,
+        bwd_data_conv_desc,
+        bwd_data_grad_desc,
+        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
 
     CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
-             t_resource.cudnn_handle,
-             bwd_data_filter_desc,
-             bwd_data_diff_desc,
-             bwd_data_conv_desc,
-             bwd_data_grad_desc,
-             static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
-             bwdDataLimitBytes));
+        t_resource.cudnn_handle,
+        bwd_data_filter_desc,
+        bwd_data_diff_desc,
+        bwd_data_conv_desc,
+        bwd_data_grad_desc,
+        static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
+        bwdDataLimitBytes));
 
     // cudnn convolution backward filter configuration
     cudnnTensorDescriptor_t       bwd_filter_src_desc =
@@ -279,21 +282,21 @@ void hl_conv_workspace(hl_tensor_descriptor input,
                                       GET_FILTER_DESCRIPTOR(filter);
 
     CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-             t_resource.cudnn_handle,
-             bwd_filter_src_desc,
-             bwd_filter_diff_desc,
-             bwd_filter_conv_desc,
-             bwd_filter_grad_desc,
-             CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-             memoryLimitBytes,
-             reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
+        t_resource.cudnn_handle,
+        bwd_filter_src_desc,
+        bwd_filter_diff_desc,
+        bwd_filter_conv_desc,
+        bwd_filter_grad_desc,
+        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
 
     CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
-             t_resource.cudnn_handle, bwd_filter_src_desc,
-             bwd_filter_diff_desc, bwd_filter_conv_desc,
-             bwd_filter_grad_desc,
-             static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
-             bwdFilterLimitBytes));
+        t_resource.cudnn_handle, bwd_filter_src_desc,
+        bwd_filter_diff_desc, bwd_filter_conv_desc,
+        bwd_filter_grad_desc,
+        static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
+        bwdFilterLimitBytes));
 
 #endif
 }
@@ -302,8 +305,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
                                  int batch_size,
                                  int feature_maps,
                                  int height,
-                                 int width)
-{
+                                 int width) {
     CHECK_NOTNULL(image_desc);
 
     cudnn_tensor_descriptor hl_desc =
@@ -359,8 +361,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
                        int batch_size,
                        int feature_maps,
                        int height,
-                       int width)
-{
+                       int width) {
     const int stride_w = 1;
     const int stride_h = width * stride_w;
     const int stride_c = height * stride_h;
@@ -384,8 +385,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
                        int nStride,
                        int cStride,
                        int hStride,
-                       int wStride)
-{
+                       int wStride) {
     CHECK_NOTNULL(image_desc);
 
     cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
@@ -408,8 +408,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
     hl_desc->width = width;
 }
 
-void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc)
-{
+void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {
     CHECK_NOTNULL(image_desc);
 
     cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
@@ -430,11 +429,9 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
                                   int height_padding,
                                   int width_padding,
                                   int stride_height,
-                                  int stride_width)
-{
+                                  int stride_width) {
     cudnnPoolingMode_t cudnn_mode;
-    switch (mode)
-    {
+    switch (mode) {
         case HL_POOLING_MAX:
             cudnn_mode = CUDNN_POOLING_MAX;
             break;
@@ -478,13 +475,13 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
     *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
 }
 
-void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc)
-{
+void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {
     CHECK_NOTNULL(pooling_desc);
 
-    cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
-    CHECK_NOTNULL(hl_pooling->desc);
+    cudnn_pooling_descriptor hl_pooling =
+        (cudnn_pooling_descriptor)pooling_desc;
 
+    CHECK_NOTNULL(hl_pooling->desc);
     CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
 
     hl_pooling->desc = NULL;
@@ -496,8 +493,7 @@ void hl_pooling_forward(hl_tensor_descriptor input,
                         real* input_image,
                         hl_tensor_descriptor output,
                         real* output_image,
-                        hl_pooling_descriptor pooling)
-{
+                        hl_pooling_descriptor pooling) {
     cudnnPoolingDescriptor_t    pooling_desc;
     cudnnTensorDescriptor_t     input_desc;
     cudnnTensorDescriptor_t     output_desc;
@@ -531,8 +527,7 @@ void hl_pooling_backward(hl_tensor_descriptor input,
                          hl_tensor_descriptor output,
                          real* output_image,
                          real* output_image_grad,
-                         hl_pooling_descriptor pooling)
-{
+                         hl_pooling_descriptor pooling) {
     cudnnPoolingDescriptor_t    pooling_desc;
     cudnnTensorDescriptor_t     input_desc;
     cudnnTensorDescriptor_t     output_desc;
@@ -571,8 +566,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
                                  int input_feature_maps,
                                  int output_feature_maps,
                                  int height,
-                                 int width)
-{
+                                 int width) {
     CHECK_NOTNULL(filter);
 
     cudnn_filter_descriptor hl_filter =
@@ -607,8 +601,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
 }
 
 
-void hl_destroy_filter_descriptor(hl_filter_descriptor filter)
-{
+void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {
     CHECK_NOTNULL(filter);
 
     cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
@@ -627,14 +620,13 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                       int padding_height,
                                       int padding_width,
                                       int stride_height,
-                                      int stride_width)
-{
+                                      int stride_width) {
     CHECK_NOTNULL(conv);
 
-    cudnn_convolution_descriptor hl_conv =
-        (cudnn_convolution_descriptor)malloc(sizeof(_cudnn_convolution_descriptor));
-    CHECK_NOTNULL(hl_conv);
+    cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)
+        malloc(sizeof(_cudnn_convolution_descriptor));
 
+    CHECK_NOTNULL(hl_conv);
     CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
 
     cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
@@ -667,8 +659,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                      int padding_height,
                                      int padding_width,
                                      int stride_height,
-                                     int stride_width)
-{
+                                     int stride_width) {
     CHECK_NOTNULL(conv);
     CHECK_NOTNULL(image);
     CHECK_NOTNULL(filter);
@@ -697,8 +688,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
     hl_conv->mode = mode;
 }
 
-void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv)
-{
+void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {
     CHECK_NOTNULL(conv);
 
     cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
@@ -753,8 +743,7 @@ void hl_convolution_forward(hl_tensor_descriptor input,
 void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
                                      real* bias_data,
                                      hl_tensor_descriptor output,
-                                     real* output_data)
-{
+                                     real* output_data) {
     CHECK_NOTNULL(bias);
     CHECK_NOTNULL(output);
     CHECK_NOTNULL(bias_data);
@@ -782,8 +771,7 @@ void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
 void hl_convolution_backward_bias(hl_tensor_descriptor bias,
                                   real* bias_grad_data,
                                   hl_tensor_descriptor output,
-                                  real* output_grad_data)
-{
+                                  real* output_grad_data) {
     CHECK_NOTNULL(bias);
     CHECK_NOTNULL(output);
     CHECK_NOTNULL(bias_grad_data);
@@ -814,7 +802,6 @@ void hl_convolution_backward_filter(hl_tensor_descriptor input,
                                     void* gpuWorkSpace,
                                     size_t sizeInBytes,
                                     int convBwdFilterAlgo) {
-
     CHECK_NOTNULL(input);
     CHECK_NOTNULL(output);
     CHECK_NOTNULL(filter);
@@ -889,8 +876,7 @@ void hl_convolution_backward_data(hl_tensor_descriptor input,
 void hl_softmax_forward(real *input,
                         real *output,
                         int height,
-                        int width)
-{
+                        int width) {
 #ifndef PADDLE_TYPE_DOUBLE
     cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
@@ -923,8 +909,7 @@ void hl_softmax_forward(real *input,
 void hl_softmax_backward(real *output_value,
                          real *output_grad,
                          int height,
-                         int width)
-{
+                         int width) {
 #ifndef PADDLE_TYPE_DOUBLE
     cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index 3ea2c91bd5a41e0cd6ece0605a25e645676faa40..ca19f210c5c9d5151b01ce81a4f44663e2df97cc 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -203,8 +203,8 @@ inline pid_t gettid() {
   #endif
   pid_t tid = syscall(__NR_gettid);
 #endif
-  CHECK_NE(tid, -1);
-  return tid;    
+  CHECK_NE((int)tid, -1);
+  return tid;
 }
 
 void hl_init(int device) {
@@ -355,7 +355,8 @@ void* hl_malloc_host(size_t size) {
   void *dest_h;
 
   CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(dynload::cudaHostAlloc((void**)&dest_h, size, cudaHostAllocDefault));
+  CHECK_CUDA(dynload::cudaHostAlloc(
+    (void**)&dest_h, size, cudaHostAllocDefault));
 
   return dest_h;
 }
@@ -364,7 +365,7 @@ void hl_free_mem_host(void *dest_h) {
   CHECK_NOTNULL(dest_h);
 
   cudaError_t err = dynload::cudaFreeHost(dest_h);
-  CHECK (cudaSuccess == err || cudaErrorCudartUnloading == err)
+  CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
     << hl_get_device_error_string();
 }
 
@@ -502,7 +503,8 @@ int hl_get_cuda_version() {
   return g_cuda_lib_version;
 }
 
-void hl_create_thread_resources(int device, thread_device_resources device_res) {
+void hl_create_thread_resources(int device,
+  thread_device_resources device_res) {
   CHECK_CUDA(dynload::cudaSetDevice(device));
 
   /* create thread stream */
diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc
index 27bbd03bc328293d978867c6badddc13a754ece2..fe755b8c2606dffeeff2ea1549180ca8b134c251 100644
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ b/paddle/cuda/src/hl_cudart_wrap.cc
@@ -78,48 +78,38 @@ __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
                                                 dim3 blockDim,
                                                 void **args,
                                                 size_t sharedMem,
-                                                cudaStream_t stream)
-{
-  return dynload::cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);
+                                                cudaStream_t stream) {
+  return dynload::cudaLaunchKernel(func, gridDim, blockDim,
+                                   args, sharedMem, stream);
 }
 #endif /* CUDART_VERSION >= 7000 */
 
 
-__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func)
-{
+__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
   return dynload::cudaLaunch(func);
 }
 
 __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
                                                  size_t size,
-                                                 size_t offset)
-{
+                                                 size_t offset) {
   return dynload::cudaSetupArgument(arg, size, offset);
 }
 
 __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
                                                  dim3 blockDim,
                                                  size_t sharedMem,
-                                                 cudaStream_t stream)
-{
+                                                 cudaStream_t stream) {
   return dynload::cudaConfigureCall(gridDim, blockDim,
                                     sharedMem, stream);
 }
 
 extern "C" {
 
-void** CUDARTAPI __cudaRegisterFatBinary(
-  void *fatCubin
-)
-{
+void** CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
   return dynload::__cudaRegisterFatBinary(fatCubin);
-
 }
 
-void CUDARTAPI __cudaUnregisterFatBinary(
-  void **fatCubinHandle
-)
-{
+void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
   return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
 }
 
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc
index b564b969033680a001577de25ceb84dae391754a..5cb16cfbb372209a6cac83cdaace9afbf590e0fe 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -19,17 +19,18 @@ limitations under the License. */
 
 P_DEFINE_string(cudnn_dir, "",
                 "Specify path for loading libcudnn.so. For instance, "
-                "/usr/local/cudnn/lib64. If empty [default], dlopen will search "
-                "cudnn from LD_LIBRARY_PATH");
+                "/usr/local/cudnn/lib64. If empty [default], dlopen "
+                "will search cudnn from LD_LIBRARY_PATH");
 
 P_DEFINE_string(cuda_dir, "",
                 "Specify path for loading cuda library, such as libcublas, "
-                "libcurand. For instance, /usr/local/cuda/lib64. "
-                "(Note: libcudart can not be specified by cuda_dir, since some "
+                "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
+                "libcudart can not be specified by cuda_dir, since some "
                 "build-in function in cudart already ran before main entry). "
-                "If empty [default], dlopen will search cuda from LD_LIBRARY_PATH");
+                "If default, dlopen will search cuda from LD_LIBRARY_PATH");
 
-static inline std::string join(const std::string& part1, const std::string& part2) {
+static inline std::string join(const std::string& part1,
+                               const std::string& part2) {
   // directory separator
   const char sep = '/';
 
@@ -49,10 +50,10 @@ static inline std::string join(const std::string& part1, const std::string& part
 static inline void GetDsoHandleFromDefaultPath(
         std::string& dso_path, void** dso_handle, int dynload_flags) {
     VLOG(3) << "Try to find cuda library: " << dso_path
-              << " from default system path.";
-    // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH 
+            << " from default system path.";
+    // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
     *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-    
+
     // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
     // bring System Integrity Projection (SIP), if dso_handle
     // is null, search from default package path in Mac OS.
@@ -62,13 +63,13 @@ static inline void GetDsoHandleFromDefaultPath(
         *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
         if (nullptr == *dso_handle) {
             if (dso_path == "libcudnn.dylib") {
-                LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"
-                << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C "
-                << "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h "
+                LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT
+                << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C "    // NOLINT
+                << "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h "    // NOLINT
                 << "/usr/local/cuda/lib/libcudnn*";
             }
-        } 
-    }   
+        }
+    }
     #endif
 }
 
@@ -96,19 +97,19 @@ static inline void GetDsoHandleFromSearchPath(
 
     CHECK(nullptr != *dso_handle)
       << "Failed to find cuda library: " << dlPath << std::endl
-      << "Please specify its path correctly using one of the following ideas: \n"
+      << "Please specify its path correctly using one of the following ways: \n"    // NOLINT
 
-      << "Idea 1. set cuda and cudnn lib path at runtime. "
-      << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n"
+      << "Method 1. set cuda and cudnn lib path at runtime. "
+      << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n" // NOLINT
       << "For instance, issue command: paddle train --use_gpu=1 "
-      << "--cuda_dir=/usr/local/cudnn/lib --cudnn_dir=/usr/local/cudnn/lib ...\n"
+      << "--cuda_dir=/usr/local/cuda/lib64 --cudnn_dir=/usr/local/cudnn/lib ...\n"  // NOLINT
 
-      << "Idea 2. set environment variable LD_LIBRARY_PATH on Linux or "
+      << "Method 2. set environment variable LD_LIBRARY_PATH on Linux or "
       << "DYLD_LIBRARY_PATH on Mac OS. \n"
       << "For instance, issue command: export LD_LIBRARY_PATH=... \n"
 
       << "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible "
-      << "unless System Integrity Protection (SIP) is disabled. However, @Idea 1"
+      << "unless System Integrity Protection (SIP) is disabled. However, method 1 " // NOLINT
       << "always work well.";
 }