diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index cdb730bb3cec7a32fa42cf4c6738d575b76c6032..11dbfb54b268774405ade1e532bef9a0e8c7ada9 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -81,5 +81,8 @@ else() add_library(paddle_cuda ${CUDA_SOURCES}) endif() -add_style_check_target(paddle_cuda ${CUDA_SOURCES}) -add_style_check_target(paddle_cuda ${CUDA_HEADERS}) +add_style_check_target(paddle_cuda + ${CUDA_SOURCES} + ${CUDA_HEADERS} + ${CUDA_DSO_SOURCES} + ${CUDA_CXX_WITH_GPU_SOURCES}) diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc index f16376ec937d3a397d9e7117de528c304f8403ee..abf6afadc218f615dc6b3cf734d09f072214be40 100644 --- a/paddle/cuda/src/hl_cuda_cublas.cc +++ b/paddle/cuda/src/hl_cuda_cublas.cc @@ -104,7 +104,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) #endif const char* hl_cublas_get_error_string(cublasStatus_t status) { - switch(status) { + switch (status) { case CUBLAS_STATUS_NOT_INITIALIZED: return "[cublas status]: not initialized"; case CUBLAS_STATUS_ALLOC_FAILED: @@ -181,7 +181,7 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { real **inout_d = (real **)hl_malloc_device(sizeof(real *)); hl_memcpy(inout_d, inout_h, sizeof(real *)); - int *pivot_d = (int *)hl_malloc_device(dimN*sizeof(int)); + int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int)); int *info_d = (int *)t_resource.gpu_mem; /* Note: cublasSgetrfBatched is used to calculate a number of @@ -189,10 +189,9 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { the API for better performance. */ CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle, - dimN, inout_d, lda, pivot_d, - info_d, 1)); + dimN, inout_d, lda, pivot_d, info_d, 1)); - int info_h; + int info_h; hl_memcpy(&info_h, info_d, sizeof(int)); if (info_h != 0) { LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n"; @@ -204,8 +203,8 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { hl_memcpy(out_d, out_h, sizeof(real *)); CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle, - dimN, (const real **)inout_d, lda, pivot_d, - out_d, ldc, info_d, 1)); + dimN, (const real **)inout_d, lda, pivot_d, + out_d, ldc, info_d, 1)); hl_memcpy(&info_h, info_d, sizeof(int)); if (info_h != 0) { @@ -215,7 +214,7 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { hl_free_mem_device(inout_d); hl_free_mem_device(pivot_d); hl_free_mem_device(out_d); - + CHECK_SYNC("hl_matrix_inverse failed"); } diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index 92b28e4345c3d4d306e6ee2a7f9f50189454f951..1829fe23ac594e63253df23b350b16cb28eaebc1 100644 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -159,13 +159,11 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP) bool g_is_libcudnn_init = false; int g_cudnn_lib_version = 0; -void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) -{ +void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) { CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc)); } -void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream) -{ +void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream) { size_t cudnn_dso_ver = dynload::cudnnGetVersion(); size_t cudnn_dso_major = cudnn_dso_ver / 1000; size_t cudnn_cuh_major = CUDNN_VERSION / 1000; @@ -212,13 +210,18 @@ void hl_conv_workspace(hl_tensor_descriptor input, CHECK_NOTNULL(conv); // Specify workspace limit directly - size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb; + size_t memoryLimitBytes = + (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb; // cudnn convolution forward configuration - cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input); - cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter); - cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + cudnnTensorDescriptor_t fwd_src_desc = + GET_TENSOR_DESCRIPTOR(input); + cudnnTensorDescriptor_t fwd_dest_desc = + GET_TENSOR_DESCRIPTOR(output); + cudnnFilterDescriptor_t fwd_filter_desc = + GET_FILTER_DESCRIPTOR(filter); + cudnnConvolutionDescriptor_t fwd_conv_desc = + GET_CONVOLUTION_DESCRIPTOR(conv); CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm( t_resource.cudnn_handle, @@ -250,23 +253,23 @@ void hl_conv_workspace(hl_tensor_descriptor input, GET_CONVOLUTION_DESCRIPTOR(conv); CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm( - t_resource.cudnn_handle, - bwd_data_filter_desc, - bwd_data_diff_desc, - bwd_data_conv_desc, - bwd_data_grad_desc, - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - memoryLimitBytes, - reinterpret_cast(convBwdDataAlgo))); + t_resource.cudnn_handle, + bwd_data_filter_desc, + bwd_data_diff_desc, + bwd_data_conv_desc, + bwd_data_grad_desc, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + memoryLimitBytes, + reinterpret_cast(convBwdDataAlgo))); CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( - t_resource.cudnn_handle, - bwd_data_filter_desc, - bwd_data_diff_desc, - bwd_data_conv_desc, - bwd_data_grad_desc, - static_cast(*convBwdDataAlgo), - bwdDataLimitBytes)); + t_resource.cudnn_handle, + bwd_data_filter_desc, + bwd_data_diff_desc, + bwd_data_conv_desc, + bwd_data_grad_desc, + static_cast(*convBwdDataAlgo), + bwdDataLimitBytes)); // cudnn convolution backward filter configuration cudnnTensorDescriptor_t bwd_filter_src_desc = @@ -279,21 +282,21 @@ void hl_conv_workspace(hl_tensor_descriptor input, GET_FILTER_DESCRIPTOR(filter); CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - t_resource.cudnn_handle, - bwd_filter_src_desc, - bwd_filter_diff_desc, - bwd_filter_conv_desc, - bwd_filter_grad_desc, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - memoryLimitBytes, - reinterpret_cast(convBwdFilterAlgo))); + t_resource.cudnn_handle, + bwd_filter_src_desc, + bwd_filter_diff_desc, + bwd_filter_conv_desc, + bwd_filter_grad_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + memoryLimitBytes, + reinterpret_cast(convBwdFilterAlgo))); CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( - t_resource.cudnn_handle, bwd_filter_src_desc, - bwd_filter_diff_desc, bwd_filter_conv_desc, - bwd_filter_grad_desc, - static_cast(*convBwdFilterAlgo), - bwdFilterLimitBytes)); + t_resource.cudnn_handle, bwd_filter_src_desc, + bwd_filter_diff_desc, bwd_filter_conv_desc, + bwd_filter_grad_desc, + static_cast(*convBwdFilterAlgo), + bwdFilterLimitBytes)); #endif } @@ -302,8 +305,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc, int batch_size, int feature_maps, int height, - int width) -{ + int width) { CHECK_NOTNULL(image_desc); cudnn_tensor_descriptor hl_desc = @@ -359,8 +361,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc, int batch_size, int feature_maps, int height, - int width) -{ + int width) { const int stride_w = 1; const int stride_h = width * stride_w; const int stride_c = height * stride_h; @@ -384,8 +385,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc, int nStride, int cStride, int hStride, - int wStride) -{ + int wStride) { CHECK_NOTNULL(image_desc); cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; @@ -408,8 +408,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc, hl_desc->width = width; } -void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) -{ +void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) { CHECK_NOTNULL(image_desc); cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; @@ -430,11 +429,9 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc, int height_padding, int width_padding, int stride_height, - int stride_width) -{ + int stride_width) { cudnnPoolingMode_t cudnn_mode; - switch (mode) - { + switch (mode) { case HL_POOLING_MAX: cudnn_mode = CUDNN_POOLING_MAX; break; @@ -478,13 +475,13 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc, *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc; } -void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) -{ +void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) { CHECK_NOTNULL(pooling_desc); - cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc; - CHECK_NOTNULL(hl_pooling->desc); + cudnn_pooling_descriptor hl_pooling = + (cudnn_pooling_descriptor)pooling_desc; + CHECK_NOTNULL(hl_pooling->desc); CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc)); hl_pooling->desc = NULL; @@ -496,8 +493,7 @@ void hl_pooling_forward(hl_tensor_descriptor input, real* input_image, hl_tensor_descriptor output, real* output_image, - hl_pooling_descriptor pooling) -{ + hl_pooling_descriptor pooling) { cudnnPoolingDescriptor_t pooling_desc; cudnnTensorDescriptor_t input_desc; cudnnTensorDescriptor_t output_desc; @@ -531,8 +527,7 @@ void hl_pooling_backward(hl_tensor_descriptor input, hl_tensor_descriptor output, real* output_image, real* output_image_grad, - hl_pooling_descriptor pooling) -{ + hl_pooling_descriptor pooling) { cudnnPoolingDescriptor_t pooling_desc; cudnnTensorDescriptor_t input_desc; cudnnTensorDescriptor_t output_desc; @@ -571,8 +566,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter, int input_feature_maps, int output_feature_maps, int height, - int width) -{ + int width) { CHECK_NOTNULL(filter); cudnn_filter_descriptor hl_filter = @@ -607,8 +601,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter, } -void hl_destroy_filter_descriptor(hl_filter_descriptor filter) -{ +void hl_destroy_filter_descriptor(hl_filter_descriptor filter) { CHECK_NOTNULL(filter); cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter; @@ -627,14 +620,13 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv, int padding_height, int padding_width, int stride_height, - int stride_width) -{ + int stride_width) { CHECK_NOTNULL(conv); - cudnn_convolution_descriptor hl_conv = - (cudnn_convolution_descriptor)malloc(sizeof(_cudnn_convolution_descriptor)); - CHECK_NOTNULL(hl_conv); + cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor) + malloc(sizeof(_cudnn_convolution_descriptor)); + CHECK_NOTNULL(hl_conv); CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc)); cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; @@ -667,8 +659,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, int padding_height, int padding_width, int stride_height, - int stride_width) -{ + int stride_width) { CHECK_NOTNULL(conv); CHECK_NOTNULL(image); CHECK_NOTNULL(filter); @@ -697,8 +688,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, hl_conv->mode = mode; } -void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) -{ +void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) { CHECK_NOTNULL(conv); cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv; @@ -753,8 +743,7 @@ void hl_convolution_forward(hl_tensor_descriptor input, void hl_convolution_forward_add_bias(hl_tensor_descriptor bias, real* bias_data, hl_tensor_descriptor output, - real* output_data) -{ + real* output_data) { CHECK_NOTNULL(bias); CHECK_NOTNULL(output); CHECK_NOTNULL(bias_data); @@ -782,8 +771,7 @@ void hl_convolution_forward_add_bias(hl_tensor_descriptor bias, void hl_convolution_backward_bias(hl_tensor_descriptor bias, real* bias_grad_data, hl_tensor_descriptor output, - real* output_grad_data) -{ + real* output_grad_data) { CHECK_NOTNULL(bias); CHECK_NOTNULL(output); CHECK_NOTNULL(bias_grad_data); @@ -814,7 +802,6 @@ void hl_convolution_backward_filter(hl_tensor_descriptor input, void* gpuWorkSpace, size_t sizeInBytes, int convBwdFilterAlgo) { - CHECK_NOTNULL(input); CHECK_NOTNULL(output); CHECK_NOTNULL(filter); @@ -889,8 +876,7 @@ void hl_convolution_backward_data(hl_tensor_descriptor input, void hl_softmax_forward(real *input, real *output, int height, - int width) -{ + int width) { #ifndef PADDLE_TYPE_DOUBLE cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else @@ -923,8 +909,7 @@ void hl_softmax_forward(real *input, void hl_softmax_backward(real *output_value, real *output_grad, int height, - int width) -{ + int width) { #ifndef PADDLE_TYPE_DOUBLE cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc index 3ea2c91bd5a41e0cd6ece0605a25e645676faa40..ca19f210c5c9d5151b01ce81a4f44663e2df97cc 100644 --- a/paddle/cuda/src/hl_cuda_device.cc +++ b/paddle/cuda/src/hl_cuda_device.cc @@ -203,8 +203,8 @@ inline pid_t gettid() { #endif pid_t tid = syscall(__NR_gettid); #endif - CHECK_NE(tid, -1); - return tid; + CHECK_NE((int)tid, -1); + return tid; } void hl_init(int device) { @@ -355,7 +355,8 @@ void* hl_malloc_host(size_t size) { void *dest_h; CHECK(size) << __func__ << ": the size for device memory is 0, please check."; - CHECK_CUDA(dynload::cudaHostAlloc((void**)&dest_h, size, cudaHostAllocDefault)); + CHECK_CUDA(dynload::cudaHostAlloc( + (void**)&dest_h, size, cudaHostAllocDefault)); return dest_h; } @@ -364,7 +365,7 @@ void hl_free_mem_host(void *dest_h) { CHECK_NOTNULL(dest_h); cudaError_t err = dynload::cudaFreeHost(dest_h); - CHECK (cudaSuccess == err || cudaErrorCudartUnloading == err) + CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) << hl_get_device_error_string(); } @@ -502,7 +503,8 @@ int hl_get_cuda_version() { return g_cuda_lib_version; } -void hl_create_thread_resources(int device, thread_device_resources device_res) { +void hl_create_thread_resources(int device, + thread_device_resources device_res) { CHECK_CUDA(dynload::cudaSetDevice(device)); /* create thread stream */ diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc index 27bbd03bc328293d978867c6badddc13a754ece2..fe755b8c2606dffeeff2ea1549180ca8b134c251 100644 --- a/paddle/cuda/src/hl_cudart_wrap.cc +++ b/paddle/cuda/src/hl_cudart_wrap.cc @@ -78,48 +78,38 @@ __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 blockDim, void **args, size_t sharedMem, - cudaStream_t stream) -{ - return dynload::cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream); + cudaStream_t stream) { + return dynload::cudaLaunchKernel(func, gridDim, blockDim, + args, sharedMem, stream); } #endif /* CUDART_VERSION >= 7000 */ -__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) -{ +__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) { return dynload::cudaLaunch(func); } __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, - size_t offset) -{ + size_t offset) { return dynload::cudaSetupArgument(arg, size, offset); } __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, - cudaStream_t stream) -{ + cudaStream_t stream) { return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream); } extern "C" { -void** CUDARTAPI __cudaRegisterFatBinary( - void *fatCubin -) -{ +void** CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) { return dynload::__cudaRegisterFatBinary(fatCubin); - } -void CUDARTAPI __cudaUnregisterFatBinary( - void **fatCubinHandle -) -{ +void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) { return dynload::__cudaUnregisterFatBinary(fatCubinHandle); } diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc index b564b969033680a001577de25ceb84dae391754a..5cb16cfbb372209a6cac83cdaace9afbf590e0fe 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -19,17 +19,18 @@ limitations under the License. */ P_DEFINE_string(cudnn_dir, "", "Specify path for loading libcudnn.so. For instance, " - "/usr/local/cudnn/lib64. If empty [default], dlopen will search " - "cudnn from LD_LIBRARY_PATH"); + "/usr/local/cudnn/lib64. If empty [default], dlopen " + "will search cudnn from LD_LIBRARY_PATH"); P_DEFINE_string(cuda_dir, "", "Specify path for loading cuda library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. " - "(Note: libcudart can not be specified by cuda_dir, since some " + "libcurand. For instance, /usr/local/cuda/lib64. (Note: " + "libcudart can not be specified by cuda_dir, since some " "build-in function in cudart already ran before main entry). " - "If empty [default], dlopen will search cuda from LD_LIBRARY_PATH"); + "If default, dlopen will search cuda from LD_LIBRARY_PATH"); -static inline std::string join(const std::string& part1, const std::string& part2) { +static inline std::string join(const std::string& part1, + const std::string& part2) { // directory separator const char sep = '/'; @@ -49,10 +50,10 @@ static inline std::string join(const std::string& part1, const std::string& part static inline void GetDsoHandleFromDefaultPath( std::string& dso_path, void** dso_handle, int dynload_flags) { VLOG(3) << "Try to find cuda library: " << dso_path - << " from default system path."; - // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH + << " from default system path."; + // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH *dso_handle = dlopen(dso_path.c_str(), dynload_flags); - + // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to // bring System Integrity Projection (SIP), if dso_handle // is null, search from default package path in Mac OS. @@ -62,13 +63,13 @@ static inline void GetDsoHandleFromDefaultPath( *dso_handle = dlopen(dso_path.c_str(), dynload_flags); if (nullptr == *dso_handle) { if (dso_path == "libcudnn.dylib") { - LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" - << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C " - << "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h " + LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT + << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C " // NOLINT + << "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h " // NOLINT << "/usr/local/cuda/lib/libcudnn*"; } - } - } + } + } #endif } @@ -96,19 +97,19 @@ static inline void GetDsoHandleFromSearchPath( CHECK(nullptr != *dso_handle) << "Failed to find cuda library: " << dlPath << std::endl - << "Please specify its path correctly using one of the following ideas: \n" + << "Please specify its path correctly using one of the following ways: \n" // NOLINT - << "Idea 1. set cuda and cudnn lib path at runtime. " - << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n" + << "Method 1. set cuda and cudnn lib path at runtime. " + << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n" // NOLINT << "For instance, issue command: paddle train --use_gpu=1 " - << "--cuda_dir=/usr/local/cudnn/lib --cudnn_dir=/usr/local/cudnn/lib ...\n" + << "--cuda_dir=/usr/local/cuda/lib64 --cudnn_dir=/usr/local/cudnn/lib ...\n" // NOLINT - << "Idea 2. set environment variable LD_LIBRARY_PATH on Linux or " + << "Method 2. set environment variable LD_LIBRARY_PATH on Linux or " << "DYLD_LIBRARY_PATH on Mac OS. \n" << "For instance, issue command: export LD_LIBRARY_PATH=... \n" << "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible " - << "unless System Integrity Protection (SIP) is disabled. However, @Idea 1" + << "unless System Integrity Protection (SIP) is disabled. However, method 1 " // NOLINT << "always work well."; }