diff --git a/cmake/util.cmake b/cmake/util.cmake index 7da52bb758a3ab28db85f7c6c4d3a99b141e6342..24ad5c815ca20d9b6b317b1be4d2dc93a9e06fba 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -108,6 +108,7 @@ function(link_paddle_exe TARGET_NAME) endif() if(WITH_GPU) + target_link_libraries(${TARGET_NAME} ${CUDA_CUDART_LIBRARY}) if(NOT WITH_DSO OR WITH_METRIC) target_link_libraries(${TARGET_NAME} ${CUDNN_LIBRARY} diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in index e11ee920362aed3ec79a2e62d447d7dde4a99248..82f45ba6ccec49eb190d1814a67a575f311689e8 100644 --- a/paddle/api/paddle_api_config.py.in +++ b/paddle/api/paddle_api_config.py.in @@ -13,5 +13,5 @@ GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@" GFLAGS_LOCATION="@GFLAGS_LOCATION@" CBLAS_LIBRARIES="@CBLAS_LIBRARIES@" -CUDA_LIBRARIES="@CUDA_cudart_shared_LIBRARY@" +CUDA_LIBRARIES="@CUDA_CUDART_LIBRARY@" WITH_COVERALLS="@ON_COVERALLS@" diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index 57fb89608f4bcf3e6829fe850a61c2a626adfbdc..a28ccd6f07cfd56b7f1978f67fdcf6e7e5fe6337 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -15,7 +15,6 @@ else() endif() set(CUDA_CXX_WITH_GPU_SOURCES - src/hl_cudart_wrap.cc src/hl_cuda_cublas.cc src/hl_cuda_cudnn.cc src/hl_cuda_device.cc) diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h index 20c13f21e61a92b0635b686f6f724ae2b44518cc..276a07d3c735c771c851e8b4bd14c720f9ab6569 100644 --- a/paddle/cuda/include/hl_dso_loader.h +++ b/paddle/cuda/include/hl_dso_loader.h @@ -36,14 +36,6 @@ void GetCublasDsoHandle(void** dso_handle); */ void GetCudnnDsoHandle(void** dso_handle); -/** - * @brief load the DSO of CUDA Run Time - * - * @param **dso_handle dso handler - * - */ -void GetCudartDsoHandle(void** dso_handle); - /** * @brief load the DSO of CURAND * diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc index a71eecba2736234dafaf6b67e5efac5358a30871..6dfb12e00b80db36ad2e53326b880c7d1ed59263 100644 --- a/paddle/cuda/src/hl_cuda_device.cc +++ b/paddle/cuda/src/hl_cuda_device.cc @@ -22,10 +22,9 @@ limitations under the License. */ #include #include #include -#include "hl_cuda.h" #include "hl_cuda.ph" -#include "hl_dso_loader.h" #include "hl_thread.ph" +#include "hl_dso_loader.h" #include "paddle/utils/Logging.h" // clang-format on @@ -77,78 +76,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) #undef CURAND_RAND_ROUTINE_EACH #undef DYNAMIC_LOAD_CURAND_WRAP -std::once_flag cudart_dso_flag; -void *cudart_dso_handle = nullptr; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load cuda routine - * via operator overloading. - * - * note: default dynamic linked libs - */ -#ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUDART_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using cudart_func = decltype(__name(args...)) (*)(Args...); \ - std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \ - void *p_##__name = dlsym(cudart_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ -#else -#define DYNAMIC_LOAD_CUDART_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - return __name(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ -#endif - -/* include all needed cuda functions in HPPL */ -// clang-format off -#define CUDA_ROUTINE_EACH(__macro) \ - __macro(cudaMalloc) \ - __macro(cudaHostAlloc) \ - __macro(cudaFree) \ - __macro(cudaFreeHost) \ - __macro(cudaMemcpy) \ - __macro(cudaMemset) \ - __macro(cudaMemcpyAsync) \ - __macro(cudaSetDevice) \ - __macro(cudaGetDevice) \ - __macro(cudaGetDeviceCount) \ - __macro(cudaGetDeviceProperties) \ - __macro(cudaDeviceSynchronize) \ - __macro(cudaDeviceCanAccessPeer) \ - __macro(cudaDeviceEnablePeerAccess) \ - __macro(cudaStreamCreate) \ - __macro(cudaStreamDestroy) \ - __macro(cudaStreamSynchronize) \ - __macro(cudaStreamWaitEvent) \ - __macro(cudaEventCreate) \ - __macro(cudaEventRecord) \ - __macro(cudaEventQuery) \ - __macro(cudaEventDestroy) \ - __macro(cudaEventSynchronize) \ - __macro(cudaEventElapsedTime) \ - __macro(cudaSetDeviceFlags) \ - __macro(cudaGetLastError) \ - __macro(cudaFuncSetCacheConfig) \ - __macro(cudaRuntimeGetVersion) \ - __macro(cudaGetErrorString) \ - __macro(cudaProfilerStart) \ - __macro(cudaProfilerStop) -// clang-format on - -CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP) - -#undef CUDA_ROUNTINE_EACH -#undef DYNAMIC_LOAD_CUDART_WRAP - } /* namespace dynload */ /** @@ -171,11 +98,11 @@ int g_cuda_lib_version = 0; * Check build-in cuda function using glog and it **does not** * support << operator for more details error info. */ -#define CHECK_CUDA(cudaFunc) \ - do { \ - cudaError_t cudaStat = cudaFunc; \ - CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \ - << dynload::cudaGetErrorString(cudaStat); \ +#define CHECK_CUDA(cudaFunc) \ + do { \ + cudaError_t cudaStat = cudaFunc; \ + CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \ + << cudaGetErrorString(cudaStat); \ } while (0) /** @@ -284,13 +211,13 @@ void hl_fini() { tmp_stream = (char *)t_device[dev]->stream; } for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) { - CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j])); + CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j])); } /* free device memory */ hl_free_mem_device(t_device[dev]->gpu_mem); hl_free_mem_host(t_device[dev]->cpu_mem); - CHECK_CUDA(dynload::cudaEventDestroy(t_device[dev]->mem_event)); + CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event)); } free(tmp); @@ -308,7 +235,7 @@ void hl_set_device(int device) { CHECK(device >= 0 && device < g_system_device_num && g_device[device]) << "Device: " << device << " is not specified in startup."; - CHECK_CUDA(dynload::cudaSetDevice(device)); + CHECK_CUDA(cudaSetDevice(device)); /* switch thread stream */ for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) { @@ -336,7 +263,7 @@ void hl_set_device(int device) { int hl_get_device() { int device; - CHECK_CUDA(dynload::cudaGetDevice(&device)); + CHECK_CUDA(cudaGetDevice(&device)); return device; } @@ -344,7 +271,7 @@ void *hl_malloc_device(size_t size) { void *dest_d; CHECK(size) << __func__ << ": the size for device memory is 0, please check."; - CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size)); + CHECK_CUDA(cudaMalloc((void **)&dest_d, size)); return dest_d; } @@ -352,7 +279,7 @@ void *hl_malloc_device(size_t size) { void hl_free_mem_device(void *dest_d) { CHECK_NOTNULL(dest_d); - cudaError_t err = dynload::cudaFree(dest_d); + cudaError_t err = cudaFree(dest_d); CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) << hl_get_device_error_string(); } @@ -361,8 +288,7 @@ void *hl_malloc_host(size_t size) { void *dest_h; CHECK(size) << __func__ << ": the size for device memory is 0, please check."; - CHECK_CUDA( - dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault)); + CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault)); return dest_h; } @@ -370,7 +296,7 @@ void *hl_malloc_host(size_t size) { void hl_free_mem_host(void *dest_h) { CHECK_NOTNULL(dest_h); - cudaError_t err = dynload::cudaFreeHost(dest_h); + cudaError_t err = cudaFreeHost(dest_h); CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) << hl_get_device_error_string(); } @@ -381,11 +307,11 @@ void hl_memcpy(void *dst, void *src, size_t size) { } CHECK_NOTNULL(dst); CHECK_NOTNULL(src); - CHECK_CUDA(dynload::cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); } void hl_memset_device(void *dest_d, int value, size_t size) { - CHECK_CUDA(dynload::cudaMemset(dest_d, value, size)); + CHECK_CUDA(cudaMemset(dest_d, value, size)); } void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) { @@ -394,7 +320,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) { } CHECK_NOTNULL(src_h); CHECK_NOTNULL(dest_d); - CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice)); } void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) { @@ -403,7 +329,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) { } CHECK_NOTNULL(dest_h); CHECK_NOTNULL(src_d); - CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost)); + CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost)); } void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) { @@ -412,8 +338,7 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) { } CHECK_NOTNULL(dest_d); CHECK_NOTNULL(src_d); - CHECK_CUDA( - dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice)); + CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice)); } void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) { @@ -427,8 +352,7 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) { CHECK_LT(stream, HPPL_STREAM_END); cu_stream = t_resource.stream[stream]; - CHECK_CUDA( - dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream)); + CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream)); } void hl_start() { @@ -439,8 +363,7 @@ void hl_start() { bool hl_device_can_access_peer(int device, int peerDevice) { int canAccessPeer; - CHECK_CUDA( - dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice)); + CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice)); if (canAccessPeer == 1) { return true; @@ -450,9 +373,9 @@ bool hl_device_can_access_peer(int device, int peerDevice) { } void hl_device_enable_peer_access(int peerDevice) { - cudaError_t err = dynload::cudaDeviceEnablePeerAccess(peerDevice, 0); + cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0); if (cudaErrorPeerAccessAlreadyEnabled == err) { - dynload::cudaGetLastError(); + cudaGetLastError(); } else { CHECK_CUDA(err); } @@ -463,9 +386,9 @@ void hl_create_global_resources(hl_device_prop device_prop) { int device = device_prop->device; global_device_resources device_res = device_prop->device_resources; - CHECK_CUDA(dynload::cudaSetDevice(device)); + CHECK_CUDA(cudaSetDevice(device)); /* device properties */ - CHECK_CUDA(dynload::cudaGetDeviceProperties(&cu_prop, device)); + CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device)); device_prop->major = cu_prop.major; device_prop->minor = cu_prop.minor; @@ -474,7 +397,7 @@ void hl_create_global_resources(hl_device_prop device_prop) { /* create device stream */ for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) { - CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j])); + CHECK_CUDA(cudaStreamCreate(&device_res->stream[j])); } /* cublas init */ @@ -501,18 +424,18 @@ void hl_create_global_resources(hl_device_prop device_prop) { device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t))); pthread_mutex_init(device_res->gen_mutex, NULL); - CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version)); + CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version)); } int hl_get_cuda_version() { return g_cuda_lib_version; } void hl_create_thread_resources(int device, thread_device_resources device_res) { - CHECK_CUDA(dynload::cudaSetDevice(device)); + CHECK_CUDA(cudaSetDevice(device)); /* create thread stream */ for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) { - CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j])); + CHECK_CUDA(cudaStreamCreate(&device_res->stream[j])); } /* allocation device memory */ @@ -521,14 +444,14 @@ void hl_create_thread_resources(int device, /* allocation host memory */ device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE); - CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event)); + CHECK_CUDA(cudaEventCreate(&device_res->mem_event)); } void hl_specify_devices_start(int *device, int number) { if (hl_start_flag) return; /* 1. get the number of devices */ - CHECK_CUDA(dynload::cudaGetDeviceCount(&g_system_device_num)); + CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num)); CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device"; if (device == NULL) { number = g_system_device_num; @@ -640,7 +563,7 @@ void hl_stream_synchronize(hl_stream_t stream) { << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; - CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream)); + CHECK_CUDA(cudaStreamSynchronize(cu_stream)); } void hl_create_event(hl_event_t *event) { @@ -649,7 +572,7 @@ void hl_create_event(hl_event_t *event) { struct _hl_event_st *st_event = (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st)); - CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event)); + CHECK_CUDA(cudaEventCreate(&st_event->cu_event)); *event = st_event; } @@ -659,8 +582,7 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) { CHECK_NOTNULL(start); CHECK_NOTNULL(end); - CHECK_CUDA( - dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event)); + CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event)); return time; } @@ -672,7 +594,7 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) { << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; - CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream)); + CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream)); } void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) { @@ -683,12 +605,12 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) { << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; - CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0)); + CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0)); } void hl_destroy_event(hl_event_t event) { CHECK_NOTNULL(event); - CHECK_CUDA(dynload::cudaEventDestroy(event->cu_event)); + CHECK_CUDA(cudaEventDestroy(event->cu_event)); free(event); event = NULL; @@ -696,7 +618,7 @@ void hl_destroy_event(hl_event_t event) { void hl_event_synchronize(hl_event_t event) { CHECK_NOTNULL(event); - CHECK_CUDA(dynload::cudaEventSynchronize(event->cu_event)); + CHECK_CUDA(cudaEventSynchronize(event->cu_event)); } void hl_get_device_name(char *name, int len, int device) { @@ -725,24 +647,24 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) { *minor = g_device[device]->minor; } -int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); } +int hl_get_device_last_error() { return (int)cudaGetLastError(); } const char *hl_get_device_error_string() { - cudaError_t err = dynload::cudaGetLastError(); - return dynload::cudaGetErrorString(err); + cudaError_t err = cudaGetLastError(); + return cudaGetErrorString(err); } const char *hl_get_device_error_string(size_t err) { - return dynload::cudaGetErrorString((cudaError_t)err); + return cudaGetErrorString((cudaError_t)err); } -void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); } +void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); } void hl_set_device_flags_block() { - CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); + CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); } bool hl_cuda_event_is_ready(hl_event_t event) { - cudaError_t err = dynload::cudaEventQuery(event->cu_event); + cudaError_t err = cudaEventQuery(event->cu_event); CHECK(cudaSuccess == err || cudaErrorNotReady == err); if (cudaErrorNotReady == err) { @@ -751,6 +673,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) { return true; } -void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); } +void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); } -void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); } +void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); } diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc deleted file mode 100644 index ecc03a729dde2f2b4f8f004234a47d9272997a50..0000000000000000000000000000000000000000 --- a/paddle/cuda/src/hl_cudart_wrap.cc +++ /dev/null @@ -1,200 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_USE_DSO - -#include -#include -#include "hl_dso_loader.h" - -/** - * cudart wrapper: for dynamic load libcudart.so. - * When nvcc compile cuda kernels, it will insert - * some build-in runtime routines, which must be - * provided by us if PADDLE_USE_DSO is true. If - * PADDLE_USE_DSO is false, all of them must be - * ignored to avoid multiple definitions. - */ -namespace dynload { - -extern std::once_flag cudart_dso_flag; -extern void *cudart_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load cuda routine - * via operator overloading. - **/ -#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \ - struct DynLoad__##__name { \ - template \ - __type operator()(Args... args) { \ - typedef __type (*cudartFunc)(Args...); \ - std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \ - void *p_##__name = dlsym(cudart_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ - -/* include all needed cuda functions in HPPL */ -// clang-format off -#define CUDA_ROUTINE_EACH(__macro) \ - __macro(cudaLaunch, cudaError_t) \ - __macro(cudaSetupArgument, cudaError_t) \ - __macro(cudaConfigureCall, cudaError_t) \ - __macro(__cudaRegisterFatBinary, void**) \ - __macro(__cudaUnregisterFatBinary, void) \ - __macro(__cudaRegisterFunction, void) \ - __macro(__cudaRegisterVar, void) \ - __macro(__cudaRegisterManagedVar, void) \ - __macro(__cudaInitModule, char) \ - __macro(__cudaRegisterTexture, void) \ - __macro(__cudaRegisterSurface, void) -// clang-format on - -CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP) - -#if CUDART_VERSION >= 7000 -DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t) -#endif - -#undef CUDA_ROUNTINE_EACH - -} /* namespace dynload */ - -#if CUDART_VERSION >= 7000 -__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, - dim3 gridDim, - dim3 blockDim, - void **args, - size_t sharedMem, - cudaStream_t stream) { - return dynload::cudaLaunchKernel( - func, gridDim, blockDim, args, sharedMem, stream); -} -#endif /* CUDART_VERSION >= 7000 */ - -__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) { - return dynload::cudaLaunch(func); -} - -__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, - size_t size, - size_t offset) { - return dynload::cudaSetupArgument(arg, size, offset); -} - -__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, - dim3 blockDim, - size_t sharedMem, - cudaStream_t stream) { - return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream); -} - -extern "C" { - -void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) { - return dynload::__cudaRegisterFatBinary(fatCubin); -} - -void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) { - return dynload::__cudaUnregisterFatBinary(fatCubinHandle); -} - -void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle, - const char *hostFun, - char *deviceFun, - const char *deviceName, - int thread_limit, - uint3 *tid, - uint3 *bid, - dim3 *bDim, - dim3 *gDim, - int *wSize) { - return dynload::__cudaRegisterFunction(fatCubinHandle, - hostFun, - deviceFun, - deviceName, - thread_limit, - tid, - bid, - bDim, - gDim, - wSize); -} - -void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle, - char *hostVar, - char *deviceAddress, - const char *deviceName, - int ext, - int size, - int constant, - int global) { - return dynload::__cudaRegisterVar(fatCubinHandle, - hostVar, - deviceAddress, - deviceName, - ext, - size, - constant, - global); -} - -extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle, - void **hostVarPtrAddress, - char *deviceAddress, - const char *deviceName, - int ext, - int size, - int constant, - int global) { - return dynload::__cudaRegisterManagedVar(fatCubinHandle, - hostVarPtrAddress, - deviceAddress, - deviceName, - ext, - size, - constant, - global); -} - -char CUDARTAPI __cudaInitModule(void **fatCubinHandle) { - return dynload::__cudaInitModule(fatCubinHandle); -} - -void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle, - const struct textureReference *hostVar, - const void **deviceAddress, - const char *deviceName, - int dim, - int norm, - int ext) { - return dynload::__cudaRegisterTexture( - fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext); -} - -void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle, - const struct surfaceReference *hostVar, - const void **deviceAddress, - const char *deviceName, - int dim, - int ext) { - return dynload::__cudaRegisterSurface( - fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext); -} - -} /* extern "C" */ - -#endif diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc index c92909de534a875028d6d4784b02f08648c85a9a..53164dd27c7c5f5254e743b6fcf1d7b6fc895e31 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -25,10 +25,8 @@ DEFINE_string(cudnn_dir, DEFINE_string(cuda_dir, "", "Specify path for loading cuda library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. (Note: " - "libcudart can not be specified by cuda_dir, since some " - "build-in function in cudart already ran before main entry). " - "If default, dlopen will search cuda from LD_LIBRARY_PATH"); + "libcurand. For instance, /usr/local/cuda/lib64. If default, " + "dlopen will search cuda from LD_LIBRARY_PATH"); DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); @@ -147,14 +145,6 @@ void GetCudnnDsoHandle(void** dso_handle) { #endif } -void GetCudartDsoHandle(void** dso_handle) { -#if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle); -#else - GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle); -#endif -} - void GetCurandDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);