提交 de27ee7d 编写于 作者: G gangliao 提交者: GitHub

Merge pull request #1115 from gangliao/cudart

Fix cudart bugs before initMain
...@@ -108,6 +108,7 @@ function(link_paddle_exe TARGET_NAME) ...@@ -108,6 +108,7 @@ function(link_paddle_exe TARGET_NAME)
endif() endif()
if(WITH_GPU) if(WITH_GPU)
target_link_libraries(${TARGET_NAME} ${CUDA_CUDART_LIBRARY})
if(NOT WITH_DSO OR WITH_METRIC) if(NOT WITH_DSO OR WITH_METRIC)
target_link_libraries(${TARGET_NAME} target_link_libraries(${TARGET_NAME}
${CUDNN_LIBRARY} ${CUDNN_LIBRARY}
......
...@@ -13,5 +13,5 @@ GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@" ...@@ -13,5 +13,5 @@ GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
GFLAGS_LOCATION="@GFLAGS_LOCATION@" GFLAGS_LOCATION="@GFLAGS_LOCATION@"
CBLAS_LIBRARIES="@CBLAS_LIBRARIES@" CBLAS_LIBRARIES="@CBLAS_LIBRARIES@"
CUDA_LIBRARIES="@CUDA_cudart_shared_LIBRARY@" CUDA_LIBRARIES="@CUDA_CUDART_LIBRARY@"
WITH_COVERALLS="@ON_COVERALLS@" WITH_COVERALLS="@ON_COVERALLS@"
...@@ -15,7 +15,6 @@ else() ...@@ -15,7 +15,6 @@ else()
endif() endif()
set(CUDA_CXX_WITH_GPU_SOURCES set(CUDA_CXX_WITH_GPU_SOURCES
src/hl_cudart_wrap.cc
src/hl_cuda_cublas.cc src/hl_cuda_cublas.cc
src/hl_cuda_cudnn.cc src/hl_cuda_cudnn.cc
src/hl_cuda_device.cc) src/hl_cuda_device.cc)
......
...@@ -36,14 +36,6 @@ void GetCublasDsoHandle(void** dso_handle); ...@@ -36,14 +36,6 @@ void GetCublasDsoHandle(void** dso_handle);
*/ */
void GetCudnnDsoHandle(void** dso_handle); void GetCudnnDsoHandle(void** dso_handle);
/**
 * @brief Load the dynamic shared object (DSO) of the CUDA runtime
 *        library (libcudart).
 *
 * @param[out] dso_handle  address of the handle variable that receives
 *                         the loaded library handle; callers later pass
 *                         that handle to dlsym().
 *
 */
void GetCudartDsoHandle(void** dso_handle);
/** /**
* @brief load the DSO of CURAND * @brief load the DSO of CURAND
* *
......
...@@ -22,10 +22,9 @@ limitations under the License. */ ...@@ -22,10 +22,9 @@ limitations under the License. */
#include <sys/time.h> #include <sys/time.h>
#include <unistd.h> #include <unistd.h>
#include <mutex> #include <mutex>
#include "hl_cuda.h"
#include "hl_cuda.ph" #include "hl_cuda.ph"
#include "hl_dso_loader.h"
#include "hl_thread.ph" #include "hl_thread.ph"
#include "hl_dso_loader.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
// clang-format on // clang-format on
...@@ -77,78 +76,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) ...@@ -77,78 +76,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
#undef CURAND_RAND_ROUTINE_EACH #undef CURAND_RAND_ROUTINE_EACH
#undef DYNAMIC_LOAD_CURAND_WRAP #undef DYNAMIC_LOAD_CURAND_WRAP
/* Guard ensuring the CUDA runtime library is opened at most once. */
std::once_flag cudart_dso_flag;
/* Handle of the CUDA runtime library; stays nullptr until the first wrapped
 * call runs GetCudartDsoHandle through std::call_once. */
void *cudart_dso_handle = nullptr;
/**
 * DYNAMIC_LOAD_CUDART_WRAP(__name) defines a struct DynLoad__<name> together
 * with an object called <name>. The object's variadic operator() stands in
 * for the CUDA runtime function of the same name, and its return type is
 * taken from that function via decltype(__name(args...)).
 *
 * Two variants are provided:
 *  - PADDLE_USE_DSO defined: the first call opens the library once
 *    (std::call_once + GetCudartDsoHandle); every call then resolves the
 *    symbol with dlsym() and invokes it through a pointer whose parameter
 *    list is built from the deduced argument types.
 *    NOTE(review): the dlsym() result is not checked before it is called.
 *  - otherwise: the call is forwarded directly to the function itself.
 *
 * NOTE(review): the original comment said "default dynamic linked libs";
 * presumably PADDLE_USE_DSO is the default build configuration -- confirm.
 */
#ifdef PADDLE_USE_DSO
/* Variant that resolves each routine from the loaded library at call time. */
#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using cudart_func = decltype(__name(args...)) (*)(Args...); \
std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
void *p_##__name = dlsym(cudart_dso_handle, #__name); \
return reinterpret_cast<cudart_func>(p_##__name)(args...); \
} \
} __name; /* struct DynLoad__##__name */
#else
/* Variant that simply forwards to the function of the same name. */
#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
return __name(args...); \
} \
} __name; /* struct DynLoad__##__name */
#endif
/**
 * List of every CUDA runtime routine used by HPPL. CUDA_ROUTINE_EACH applies
 * the given macro to each routine name; expanding it with
 * DYNAMIC_LOAD_CUDART_WRAP creates one wrapper object per routine.
 */
// clang-format off
#define CUDA_ROUTINE_EACH(__macro)        \
  __macro(cudaMalloc)                     \
  __macro(cudaHostAlloc)                  \
  __macro(cudaFree)                       \
  __macro(cudaFreeHost)                   \
  __macro(cudaMemcpy)                     \
  __macro(cudaMemset)                     \
  __macro(cudaMemcpyAsync)                \
  __macro(cudaSetDevice)                  \
  __macro(cudaGetDevice)                  \
  __macro(cudaGetDeviceCount)             \
  __macro(cudaGetDeviceProperties)        \
  __macro(cudaDeviceSynchronize)          \
  __macro(cudaDeviceCanAccessPeer)        \
  __macro(cudaDeviceEnablePeerAccess)     \
  __macro(cudaStreamCreate)               \
  __macro(cudaStreamDestroy)              \
  __macro(cudaStreamSynchronize)          \
  __macro(cudaStreamWaitEvent)            \
  __macro(cudaEventCreate)                \
  __macro(cudaEventRecord)                \
  __macro(cudaEventQuery)                 \
  __macro(cudaEventDestroy)               \
  __macro(cudaEventSynchronize)           \
  __macro(cudaEventElapsedTime)           \
  __macro(cudaSetDeviceFlags)             \
  __macro(cudaGetLastError)               \
  __macro(cudaFuncSetCacheConfig)         \
  __macro(cudaRuntimeGetVersion)          \
  __macro(cudaGetErrorString)             \
  __macro(cudaProfilerStart)              \
  __macro(cudaProfilerStop)
// clang-format on
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
/* Both helper macros are only needed for the expansion above. The previous
 * code spelled the first name "CUDA_ROUNTINE_EACH", so CUDA_ROUTINE_EACH was
 * never actually undefined. */
#undef CUDA_ROUTINE_EACH
#undef DYNAMIC_LOAD_CUDART_WRAP
} /* namespace dynload */ } /* namespace dynload */
/** /**
...@@ -171,11 +98,11 @@ int g_cuda_lib_version = 0; ...@@ -171,11 +98,11 @@ int g_cuda_lib_version = 0;
* Check build-in cuda function using glog and it **does not** * Check build-in cuda function using glog and it **does not**
* support << operator for more details error info. * support << operator for more details error info.
*/ */
#define CHECK_CUDA(cudaFunc) \ #define CHECK_CUDA(cudaFunc) \
do { \ do { \
cudaError_t cudaStat = cudaFunc; \ cudaError_t cudaStat = cudaFunc; \
CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \ CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \
<< dynload::cudaGetErrorString(cudaStat); \ << cudaGetErrorString(cudaStat); \
} while (0) } while (0)
/** /**
...@@ -284,13 +211,13 @@ void hl_fini() { ...@@ -284,13 +211,13 @@ void hl_fini() {
tmp_stream = (char *)t_device[dev]->stream; tmp_stream = (char *)t_device[dev]->stream;
} }
for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) { for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j])); CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j]));
} }
/* free device memory */ /* free device memory */
hl_free_mem_device(t_device[dev]->gpu_mem); hl_free_mem_device(t_device[dev]->gpu_mem);
hl_free_mem_host(t_device[dev]->cpu_mem); hl_free_mem_host(t_device[dev]->cpu_mem);
CHECK_CUDA(dynload::cudaEventDestroy(t_device[dev]->mem_event)); CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event));
} }
free(tmp); free(tmp);
...@@ -308,7 +235,7 @@ void hl_set_device(int device) { ...@@ -308,7 +235,7 @@ void hl_set_device(int device) {
CHECK(device >= 0 && device < g_system_device_num && g_device[device]) CHECK(device >= 0 && device < g_system_device_num && g_device[device])
<< "Device: " << device << " is not specified in startup."; << "Device: " << device << " is not specified in startup.";
CHECK_CUDA(dynload::cudaSetDevice(device)); CHECK_CUDA(cudaSetDevice(device));
/* switch thread stream */ /* switch thread stream */
for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) { for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) {
...@@ -336,7 +263,7 @@ void hl_set_device(int device) { ...@@ -336,7 +263,7 @@ void hl_set_device(int device) {
int hl_get_device() { int hl_get_device() {
int device; int device;
CHECK_CUDA(dynload::cudaGetDevice(&device)); CHECK_CUDA(cudaGetDevice(&device));
return device; return device;
} }
...@@ -344,7 +271,7 @@ void *hl_malloc_device(size_t size) { ...@@ -344,7 +271,7 @@ void *hl_malloc_device(size_t size) {
void *dest_d; void *dest_d;
CHECK(size) << __func__ << ": the size for device memory is 0, please check."; CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size)); CHECK_CUDA(cudaMalloc((void **)&dest_d, size));
return dest_d; return dest_d;
} }
...@@ -352,7 +279,7 @@ void *hl_malloc_device(size_t size) { ...@@ -352,7 +279,7 @@ void *hl_malloc_device(size_t size) {
void hl_free_mem_device(void *dest_d) { void hl_free_mem_device(void *dest_d) {
CHECK_NOTNULL(dest_d); CHECK_NOTNULL(dest_d);
cudaError_t err = dynload::cudaFree(dest_d); cudaError_t err = cudaFree(dest_d);
CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
<< hl_get_device_error_string(); << hl_get_device_error_string();
} }
...@@ -361,8 +288,7 @@ void *hl_malloc_host(size_t size) { ...@@ -361,8 +288,7 @@ void *hl_malloc_host(size_t size) {
void *dest_h; void *dest_h;
CHECK(size) << __func__ << ": the size for device memory is 0, please check."; CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
CHECK_CUDA( CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
return dest_h; return dest_h;
} }
...@@ -370,7 +296,7 @@ void *hl_malloc_host(size_t size) { ...@@ -370,7 +296,7 @@ void *hl_malloc_host(size_t size) {
void hl_free_mem_host(void *dest_h) { void hl_free_mem_host(void *dest_h) {
CHECK_NOTNULL(dest_h); CHECK_NOTNULL(dest_h);
cudaError_t err = dynload::cudaFreeHost(dest_h); cudaError_t err = cudaFreeHost(dest_h);
CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
<< hl_get_device_error_string(); << hl_get_device_error_string();
} }
...@@ -381,11 +307,11 @@ void hl_memcpy(void *dst, void *src, size_t size) { ...@@ -381,11 +307,11 @@ void hl_memcpy(void *dst, void *src, size_t size) {
} }
CHECK_NOTNULL(dst); CHECK_NOTNULL(dst);
CHECK_NOTNULL(src); CHECK_NOTNULL(src);
CHECK_CUDA(dynload::cudaMemcpy(dst, src, size, cudaMemcpyDefault)); CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
} }
void hl_memset_device(void *dest_d, int value, size_t size) { void hl_memset_device(void *dest_d, int value, size_t size) {
CHECK_CUDA(dynload::cudaMemset(dest_d, value, size)); CHECK_CUDA(cudaMemset(dest_d, value, size));
} }
void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) { void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
...@@ -394,7 +320,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) { ...@@ -394,7 +320,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
} }
CHECK_NOTNULL(src_h); CHECK_NOTNULL(src_h);
CHECK_NOTNULL(dest_d); CHECK_NOTNULL(dest_d);
CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice)); CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
} }
void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) { void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
...@@ -403,7 +329,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) { ...@@ -403,7 +329,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
} }
CHECK_NOTNULL(dest_h); CHECK_NOTNULL(dest_h);
CHECK_NOTNULL(src_d); CHECK_NOTNULL(src_d);
CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost)); CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
} }
void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) { void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
...@@ -412,8 +338,7 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) { ...@@ -412,8 +338,7 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
} }
CHECK_NOTNULL(dest_d); CHECK_NOTNULL(dest_d);
CHECK_NOTNULL(src_d); CHECK_NOTNULL(src_d);
CHECK_CUDA( CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
} }
void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) { void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
...@@ -427,8 +352,7 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) { ...@@ -427,8 +352,7 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
CHECK_LT(stream, HPPL_STREAM_END); CHECK_LT(stream, HPPL_STREAM_END);
cu_stream = t_resource.stream[stream]; cu_stream = t_resource.stream[stream];
CHECK_CUDA( CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
} }
void hl_start() { void hl_start() {
...@@ -439,8 +363,7 @@ void hl_start() { ...@@ -439,8 +363,7 @@ void hl_start() {
bool hl_device_can_access_peer(int device, int peerDevice) { bool hl_device_can_access_peer(int device, int peerDevice) {
int canAccessPeer; int canAccessPeer;
CHECK_CUDA( CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
if (canAccessPeer == 1) { if (canAccessPeer == 1) {
return true; return true;
...@@ -450,9 +373,9 @@ bool hl_device_can_access_peer(int device, int peerDevice) { ...@@ -450,9 +373,9 @@ bool hl_device_can_access_peer(int device, int peerDevice) {
} }
void hl_device_enable_peer_access(int peerDevice) { void hl_device_enable_peer_access(int peerDevice) {
cudaError_t err = dynload::cudaDeviceEnablePeerAccess(peerDevice, 0); cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0);
if (cudaErrorPeerAccessAlreadyEnabled == err) { if (cudaErrorPeerAccessAlreadyEnabled == err) {
dynload::cudaGetLastError(); cudaGetLastError();
} else { } else {
CHECK_CUDA(err); CHECK_CUDA(err);
} }
...@@ -463,9 +386,9 @@ void hl_create_global_resources(hl_device_prop device_prop) { ...@@ -463,9 +386,9 @@ void hl_create_global_resources(hl_device_prop device_prop) {
int device = device_prop->device; int device = device_prop->device;
global_device_resources device_res = device_prop->device_resources; global_device_resources device_res = device_prop->device_resources;
CHECK_CUDA(dynload::cudaSetDevice(device)); CHECK_CUDA(cudaSetDevice(device));
/* device properties */ /* device properties */
CHECK_CUDA(dynload::cudaGetDeviceProperties(&cu_prop, device)); CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device));
device_prop->major = cu_prop.major; device_prop->major = cu_prop.major;
device_prop->minor = cu_prop.minor; device_prop->minor = cu_prop.minor;
...@@ -474,7 +397,7 @@ void hl_create_global_resources(hl_device_prop device_prop) { ...@@ -474,7 +397,7 @@ void hl_create_global_resources(hl_device_prop device_prop) {
/* create device stream */ /* create device stream */
for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) { for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) {
CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j])); CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
} }
/* cublas init */ /* cublas init */
...@@ -501,18 +424,18 @@ void hl_create_global_resources(hl_device_prop device_prop) { ...@@ -501,18 +424,18 @@ void hl_create_global_resources(hl_device_prop device_prop) {
device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t))); device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
pthread_mutex_init(device_res->gen_mutex, NULL); pthread_mutex_init(device_res->gen_mutex, NULL);
CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version)); CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version));
} }
int hl_get_cuda_version() { return g_cuda_lib_version; } int hl_get_cuda_version() { return g_cuda_lib_version; }
void hl_create_thread_resources(int device, void hl_create_thread_resources(int device,
thread_device_resources device_res) { thread_device_resources device_res) {
CHECK_CUDA(dynload::cudaSetDevice(device)); CHECK_CUDA(cudaSetDevice(device));
/* create thread stream */ /* create thread stream */
for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) { for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j])); CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
} }
/* allocation device memory */ /* allocation device memory */
...@@ -521,14 +444,14 @@ void hl_create_thread_resources(int device, ...@@ -521,14 +444,14 @@ void hl_create_thread_resources(int device,
/* allocation host memory */ /* allocation host memory */
device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE); device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event)); CHECK_CUDA(cudaEventCreate(&device_res->mem_event));
} }
void hl_specify_devices_start(int *device, int number) { void hl_specify_devices_start(int *device, int number) {
if (hl_start_flag) return; if (hl_start_flag) return;
/* 1. get the number of devices */ /* 1. get the number of devices */
CHECK_CUDA(dynload::cudaGetDeviceCount(&g_system_device_num)); CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num));
CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device"; CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device";
if (device == NULL) { if (device == NULL) {
number = g_system_device_num; number = g_system_device_num;
...@@ -640,7 +563,7 @@ void hl_stream_synchronize(hl_stream_t stream) { ...@@ -640,7 +563,7 @@ void hl_stream_synchronize(hl_stream_t stream) {
<< ": the parameter stream is error."; << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream]; cu_stream = t_resource.stream[stream];
CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream)); CHECK_CUDA(cudaStreamSynchronize(cu_stream));
} }
void hl_create_event(hl_event_t *event) { void hl_create_event(hl_event_t *event) {
...@@ -649,7 +572,7 @@ void hl_create_event(hl_event_t *event) { ...@@ -649,7 +572,7 @@ void hl_create_event(hl_event_t *event) {
struct _hl_event_st *st_event = struct _hl_event_st *st_event =
(struct _hl_event_st *)malloc(sizeof(struct _hl_event_st)); (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event)); CHECK_CUDA(cudaEventCreate(&st_event->cu_event));
*event = st_event; *event = st_event;
} }
...@@ -659,8 +582,7 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) { ...@@ -659,8 +582,7 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
CHECK_NOTNULL(start); CHECK_NOTNULL(start);
CHECK_NOTNULL(end); CHECK_NOTNULL(end);
CHECK_CUDA( CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
return time; return time;
} }
...@@ -672,7 +594,7 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) { ...@@ -672,7 +594,7 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
<< ": the parameter stream is error."; << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream]; cu_stream = t_resource.stream[stream];
CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream)); CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream));
} }
void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) { void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
...@@ -683,12 +605,12 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) { ...@@ -683,12 +605,12 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
<< ": the parameter stream is error."; << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream]; cu_stream = t_resource.stream[stream];
CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0)); CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
} }
void hl_destroy_event(hl_event_t event) { void hl_destroy_event(hl_event_t event) {
CHECK_NOTNULL(event); CHECK_NOTNULL(event);
CHECK_CUDA(dynload::cudaEventDestroy(event->cu_event)); CHECK_CUDA(cudaEventDestroy(event->cu_event));
free(event); free(event);
event = NULL; event = NULL;
...@@ -696,7 +618,7 @@ void hl_destroy_event(hl_event_t event) { ...@@ -696,7 +618,7 @@ void hl_destroy_event(hl_event_t event) {
void hl_event_synchronize(hl_event_t event) { void hl_event_synchronize(hl_event_t event) {
CHECK_NOTNULL(event); CHECK_NOTNULL(event);
CHECK_CUDA(dynload::cudaEventSynchronize(event->cu_event)); CHECK_CUDA(cudaEventSynchronize(event->cu_event));
} }
void hl_get_device_name(char *name, int len, int device) { void hl_get_device_name(char *name, int len, int device) {
...@@ -725,24 +647,24 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) { ...@@ -725,24 +647,24 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) {
*minor = g_device[device]->minor; *minor = g_device[device]->minor;
} }
int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); } int hl_get_device_last_error() { return (int)cudaGetLastError(); }
const char *hl_get_device_error_string() { const char *hl_get_device_error_string() {
cudaError_t err = dynload::cudaGetLastError(); cudaError_t err = cudaGetLastError();
return dynload::cudaGetErrorString(err); return cudaGetErrorString(err);
} }
const char *hl_get_device_error_string(size_t err) { const char *hl_get_device_error_string(size_t err) {
return dynload::cudaGetErrorString((cudaError_t)err); return cudaGetErrorString((cudaError_t)err);
} }
void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); } void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); }
void hl_set_device_flags_block() { void hl_set_device_flags_block() {
CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
} }
bool hl_cuda_event_is_ready(hl_event_t event) { bool hl_cuda_event_is_ready(hl_event_t event) {
cudaError_t err = dynload::cudaEventQuery(event->cu_event); cudaError_t err = cudaEventQuery(event->cu_event);
CHECK(cudaSuccess == err || cudaErrorNotReady == err); CHECK(cudaSuccess == err || cudaErrorNotReady == err);
if (cudaErrorNotReady == err) { if (cudaErrorNotReady == err) {
...@@ -751,6 +673,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) { ...@@ -751,6 +673,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
return true; return true;
} }
void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); } void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); }
void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); } void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); }
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_USE_DSO
#include <cuda_runtime.h>
#include <mutex>
#include "hl_dso_loader.h"
/**
 * cudart wrapper: dynamically loads libcudart.so.
 * When nvcc compiles CUDA kernels, it inserts calls to
 * some built-in runtime routines, which must be
 * provided by us if PADDLE_USE_DSO is defined. If
 * PADDLE_USE_DSO is not defined, all of them must be
 * omitted to avoid multiple definitions.
 */
namespace dynload {

/* Only declared here; this file does not define them. The flag guards the
 * one-time load of the CUDA runtime library and the handle is what dlsym()
 * searches afterwards. */
extern std::once_flag cudart_dso_flag;
extern void *cudart_dso_handle;

/**
 * DYNAMIC_LOAD_CUDART_WRAP(__name, __type) defines a struct DynLoad__<name>
 * and an object called <name> whose variadic operator() returns __type.
 *
 * On each call the operator:
 *  1. runs GetCudartDsoHandle once (via std::call_once) to fill
 *     cudart_dso_handle,
 *  2. looks the symbol "<name>" up with dlsym(),
 *  3. invokes it through a function pointer whose parameter list is built
 *     from the deduced argument types, so callers must pass arguments with
 *     exactly the types the real routine expects.
 *
 * NOTE(review): the dlsym() result is not checked before it is called.
 **/
#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type)                             \
  struct DynLoad__##__name {                                                 \
    template <typename... Args>                                              \
    __type operator()(Args... args) {                                        \
      typedef __type (*cudartFunc)(Args...);                                 \
      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
      void *p_##__name = dlsym(cudart_dso_handle, #__name);                  \
      return reinterpret_cast<cudartFunc>(p_##__name)(args...);              \
    }                                                                        \
  } __name; /* struct DynLoad__##__name */

/* Runtime routines wrapped by this file, each paired with its return type. */
// clang-format off
#define CUDA_ROUTINE_EACH(__macro)             \
  __macro(cudaLaunch, cudaError_t)             \
  __macro(cudaSetupArgument, cudaError_t)      \
  __macro(cudaConfigureCall, cudaError_t)      \
  __macro(__cudaRegisterFatBinary, void**)     \
  __macro(__cudaUnregisterFatBinary, void)     \
  __macro(__cudaRegisterFunction, void)        \
  __macro(__cudaRegisterVar, void)             \
  __macro(__cudaRegisterManagedVar, void)      \
  __macro(__cudaInitModule, char)              \
  __macro(__cudaRegisterTexture, void)         \
  __macro(__cudaRegisterSurface, void)
// clang-format on

CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)

/* cudaLaunchKernel is wrapped only when CUDART_VERSION is at least 7000. */
#if CUDART_VERSION >= 7000
DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
#endif

/* The helper macros are not needed past this point. The previous code spelled
 * the first name "CUDA_ROUNTINE_EACH" and never undefined the second, so both
 * macros stayed defined for the rest of the file. */
#undef CUDA_ROUTINE_EACH
#undef DYNAMIC_LOAD_CUDART_WRAP

} /* namespace dynload */
#if CUDART_VERSION >= 7000
/**
 * Definition of cudaLaunchKernel supplied by this file; it hands every
 * argument, unchanged, to the dynload wrapper of the same name and returns
 * that wrapper's result.
 */
__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
                                                dim3 gridDim,
                                                dim3 blockDim,
                                                void **args,
                                                size_t sharedMem,
                                                cudaStream_t stream) {
  const cudaError_t status = dynload::cudaLaunchKernel(
      func, gridDim, blockDim, args, sharedMem, stream);
  return status;
}
#endif /* CUDART_VERSION >= 7000 */
/**
 * Definition of cudaLaunch supplied by this file; forwards `func` to
 * dynload::cudaLaunch and returns its result.
 */
__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
  const cudaError_t status = dynload::cudaLaunch(func);
  return status;
}
/**
 * Definition of cudaSetupArgument supplied by this file; forwards its three
 * arguments to dynload::cudaSetupArgument and returns its result.
 */
__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
                                                 size_t size,
                                                 size_t offset) {
  const cudaError_t status = dynload::cudaSetupArgument(arg, size, offset);
  return status;
}
/**
 * Definition of cudaConfigureCall supplied by this file; forwards the launch
 * configuration to dynload::cudaConfigureCall and returns its result.
 */
__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
                                                 dim3 blockDim,
                                                 size_t sharedMem,
                                                 cudaStream_t stream) {
  const cudaError_t status =
      dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
  return status;
}
/*
 * C-linkage definitions of the __cuda* registration routines. Each one passes
 * its arguments, unchanged and in order, to the dynload wrapper of the same
 * name; the two non-void routines return the wrapper's result.
 */
extern "C" {

/* Forwards to dynload::__cudaRegisterFatBinary and returns its result. */
void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
  void **handle = dynload::__cudaRegisterFatBinary(fatCubin);
  return handle;
}

/* Forwards to dynload::__cudaUnregisterFatBinary. */
void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
  dynload::__cudaUnregisterFatBinary(fatCubinHandle);
}

/* Forwards to dynload::__cudaRegisterFunction. */
void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle,
                                      const char *hostFun,
                                      char *deviceFun,
                                      const char *deviceName,
                                      int thread_limit,
                                      uint3 *tid,
                                      uint3 *bid,
                                      dim3 *bDim,
                                      dim3 *gDim,
                                      int *wSize) {
  dynload::__cudaRegisterFunction(fatCubinHandle,
                                  hostFun,
                                  deviceFun,
                                  deviceName,
                                  thread_limit,
                                  tid,
                                  bid,
                                  bDim,
                                  gDim,
                                  wSize);
}

/* Forwards to dynload::__cudaRegisterVar. */
void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle,
                                 char *hostVar,
                                 char *deviceAddress,
                                 const char *deviceName,
                                 int ext,
                                 int size,
                                 int constant,
                                 int global) {
  dynload::__cudaRegisterVar(fatCubinHandle,
                             hostVar,
                             deviceAddress,
                             deviceName,
                             ext,
                             size,
                             constant,
                             global);
}

/* Forwards to dynload::__cudaRegisterManagedVar. */
extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle,
                                               void **hostVarPtrAddress,
                                               char *deviceAddress,
                                               const char *deviceName,
                                               int ext,
                                               int size,
                                               int constant,
                                               int global) {
  dynload::__cudaRegisterManagedVar(fatCubinHandle,
                                    hostVarPtrAddress,
                                    deviceAddress,
                                    deviceName,
                                    ext,
                                    size,
                                    constant,
                                    global);
}

/* Forwards to dynload::__cudaInitModule and returns its result. */
char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
  const char result = dynload::__cudaInitModule(fatCubinHandle);
  return result;
}

/* Forwards to dynload::__cudaRegisterTexture. */
void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle,
                                     const struct textureReference *hostVar,
                                     const void **deviceAddress,
                                     const char *deviceName,
                                     int dim,
                                     int norm,
                                     int ext) {
  dynload::__cudaRegisterTexture(
      fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext);
}

/* Forwards to dynload::__cudaRegisterSurface. */
void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle,
                                     const struct surfaceReference *hostVar,
                                     const void **deviceAddress,
                                     const char *deviceName,
                                     int dim,
                                     int ext) {
  dynload::__cudaRegisterSurface(
      fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext);
}

} /* extern "C" */
#endif
...@@ -25,10 +25,8 @@ DEFINE_string(cudnn_dir, ...@@ -25,10 +25,8 @@ DEFINE_string(cudnn_dir,
DEFINE_string(cuda_dir, DEFINE_string(cuda_dir,
"", "",
"Specify path for loading cuda library, such as libcublas, " "Specify path for loading cuda library, such as libcublas, "
"libcurand. For instance, /usr/local/cuda/lib64. (Note: " "libcurand. For instance, /usr/local/cuda/lib64. If default, "
"libcudart can not be specified by cuda_dir, since some " "dlopen will search cuda from LD_LIBRARY_PATH");
"build-in function in cudart already ran before main entry). "
"If default, dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
...@@ -147,14 +145,6 @@ void GetCudnnDsoHandle(void** dso_handle) { ...@@ -147,14 +145,6 @@ void GetCudnnDsoHandle(void** dso_handle) {
#endif #endif
} }
/**
 * Load the CUDA runtime library and store its handle in *dso_handle.
 *
 * The library file name is chosen at compile time: "libcudart.dylib" when
 * __APPLE__ or __OSX__ is defined, "libcudart.so" otherwise. An empty string
 * is passed as the search root to GetDsoHandleFromSearchPath.
 * NOTE(review): presumably an empty root means the default dlopen search
 * order is used -- confirm against GetDsoHandleFromSearchPath.
 */
void GetCudartDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
  const char* cudart_lib_name = "libcudart.dylib";
#else
  const char* cudart_lib_name = "libcudart.so";
#endif
  GetDsoHandleFromSearchPath("", cudart_lib_name, dso_handle);
}
void GetCurandDsoHandle(void** dso_handle) { void GetCurandDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册