Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
de27ee7d
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
de27ee7d
编写于
1月 11, 2017
作者:
G
gangliao
提交者:
GitHub
1月 11, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1115 from gangliao/cudart
Fix cudart bugs before initMain
上级
441d00e6
eda4254a
变更
7
显示空白变更内容
内联
并排
Showing
7 changed files
with
51 additions
and
347 deletions
+51
-347
cmake/util.cmake
cmake/util.cmake
+1
-0
paddle/api/paddle_api_config.py.in
paddle/api/paddle_api_config.py.in
+1
-1
paddle/cuda/CMakeLists.txt
paddle/cuda/CMakeLists.txt
+0
-1
paddle/cuda/include/hl_dso_loader.h
paddle/cuda/include/hl_dso_loader.h
+0
-8
paddle/cuda/src/hl_cuda_device.cc
paddle/cuda/src/hl_cuda_device.cc
+47
-125
paddle/cuda/src/hl_cudart_wrap.cc
paddle/cuda/src/hl_cudart_wrap.cc
+0
-200
paddle/cuda/src/hl_dso_loader.cc
paddle/cuda/src/hl_dso_loader.cc
+2
-12
未找到文件。
cmake/util.cmake
浏览文件 @
de27ee7d
...
...
@@ -108,6 +108,7 @@ function(link_paddle_exe TARGET_NAME)
endif
()
if
(
WITH_GPU
)
target_link_libraries
(
${
TARGET_NAME
}
${
CUDA_CUDART_LIBRARY
}
)
if
(
NOT WITH_DSO OR WITH_METRIC
)
target_link_libraries
(
${
TARGET_NAME
}
${
CUDNN_LIBRARY
}
...
...
paddle/api/paddle_api_config.py.in
浏览文件 @
de27ee7d
...
...
@@ -13,5 +13,5 @@ GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
GFLAGS_LOCATION="@GFLAGS_LOCATION@"
CBLAS_LIBRARIES="@CBLAS_LIBRARIES@"
CUDA_LIBRARIES="@CUDA_
cudart_shared
_LIBRARY@"
CUDA_LIBRARIES="@CUDA_
CUDART
_LIBRARY@"
WITH_COVERALLS="@ON_COVERALLS@"
paddle/cuda/CMakeLists.txt
浏览文件 @
de27ee7d
...
...
@@ -15,7 +15,6 @@ else()
endif
()
set
(
CUDA_CXX_WITH_GPU_SOURCES
src/hl_cudart_wrap.cc
src/hl_cuda_cublas.cc
src/hl_cuda_cudnn.cc
src/hl_cuda_device.cc
)
...
...
paddle/cuda/include/hl_dso_loader.h
浏览文件 @
de27ee7d
...
...
@@ -36,14 +36,6 @@ void GetCublasDsoHandle(void** dso_handle);
*/
void
GetCudnnDsoHandle
(
void
**
dso_handle
);
/**
* @brief load the DSO of CUDA Run Time
*
* @param **dso_handle dso handler
*
*/
void
GetCudartDsoHandle
(
void
**
dso_handle
);
/**
* @brief load the DSO of CURAND
*
...
...
paddle/cuda/src/hl_cuda_device.cc
浏览文件 @
de27ee7d
...
...
@@ -22,10 +22,9 @@ limitations under the License. */
#include <sys/time.h>
#include <unistd.h>
#include <mutex>
#include "hl_cuda.h"
#include "hl_cuda.ph"
#include "hl_dso_loader.h"
#include "hl_thread.ph"
#include "hl_dso_loader.h"
#include "paddle/utils/Logging.h"
// clang-format on
...
...
@@ -77,78 +76,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
#undef CURAND_RAND_ROUTINE_EACH
#undef DYNAMIC_LOAD_CURAND_WRAP
/* Once-flag and handle used by the wrappers below to load cudart one time. */
std::once_flag cudart_dso_flag;
void* cudart_dso_handle = nullptr;

/**
 * DYNAMIC_LOAD_CUDART_WRAP(__name) defines a struct DynLoad__<name> and an
 * object called __name whose operator() forwards its arguments to the CUDA
 * runtime routine of the same name.
 *
 * - With PADDLE_USE_DSO: GetCudartDsoHandle is invoked once (guarded by
 *   cudart_dso_flag) with &cudart_dso_handle, then the routine is looked up
 *   with dlsym() on every call and invoked through the returned pointer.
 *   The dlsym() result is not checked before it is called.
 * - Otherwise: the routine is called directly.
 *
 * note: default dynamic linked libs
 */
#ifdef PADDLE_USE_DSO
#define DYNAMIC_LOAD_CUDART_WRAP(__name)                                       \
  struct DynLoad__##__name {                                                   \
    template <typename... Args>                                                \
    auto operator()(Args... args) -> decltype(__name(args...)) {               \
      using cudart_func = decltype(__name(args...)) (*)(Args...);              \
      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
      return reinterpret_cast<cudart_func>(p_##__name)(args...);               \
    }                                                                          \
  } __name; /* struct DynLoad__##__name */
#else
#define DYNAMIC_LOAD_CUDART_WRAP(__name)                         \
  struct DynLoad__##__name {                                     \
    template <typename... Args>                                  \
    auto operator()(Args... args) -> decltype(__name(args...)) { \
      return __name(args...);                                    \
    }                                                            \
  } __name; /* struct DynLoad__##__name */
#endif

/* include all needed cuda functions in HPPL */
// clang-format off
#define CUDA_ROUTINE_EACH(__macro) \
  __macro(cudaMalloc) \
  __macro(cudaHostAlloc) \
  __macro(cudaFree) \
  __macro(cudaFreeHost) \
  __macro(cudaMemcpy) \
  __macro(cudaMemset) \
  __macro(cudaMemcpyAsync) \
  __macro(cudaSetDevice) \
  __macro(cudaGetDevice) \
  __macro(cudaGetDeviceCount) \
  __macro(cudaGetDeviceProperties) \
  __macro(cudaDeviceSynchronize) \
  __macro(cudaDeviceCanAccessPeer) \
  __macro(cudaDeviceEnablePeerAccess) \
  __macro(cudaStreamCreate) \
  __macro(cudaStreamDestroy) \
  __macro(cudaStreamSynchronize) \
  __macro(cudaStreamWaitEvent) \
  __macro(cudaEventCreate) \
  __macro(cudaEventRecord) \
  __macro(cudaEventQuery) \
  __macro(cudaEventDestroy) \
  __macro(cudaEventSynchronize) \
  __macro(cudaEventElapsedTime) \
  __macro(cudaSetDeviceFlags) \
  __macro(cudaGetLastError) \
  __macro(cudaFuncSetCacheConfig) \
  __macro(cudaRuntimeGetVersion) \
  __macro(cudaGetErrorString) \
  __macro(cudaProfilerStart) \
  __macro(cudaProfilerStop)
// clang-format on

/* Instantiate one wrapper object per routine listed above. */
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)

/* Undefine both helper macros so neither leaks past this point. */
#undef CUDA_ROUTINE_EACH
#undef DYNAMIC_LOAD_CUDART_WRAP
}
/* namespace dynload */
/**
...
...
@@ -175,7 +102,7 @@ int g_cuda_lib_version = 0;
do { \
cudaError_t cudaStat = cudaFunc; \
CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \
<<
dynload::
cudaGetErrorString(cudaStat); \
<< cudaGetErrorString(cudaStat); \
} while (0)
/**
...
...
@@ -284,13 +211,13 @@ void hl_fini() {
tmp_stream
=
(
char
*
)
t_device
[
dev
]
->
stream
;
}
for
(
int
j
=
0
;
j
<
NUMBER_OF_THREAD_STREAM
;
j
++
)
{
CHECK_CUDA
(
dynload
::
cudaStreamDestroy
(
t_device
[
dev
]
->
stream
[
j
]));
CHECK_CUDA
(
cudaStreamDestroy
(
t_device
[
dev
]
->
stream
[
j
]));
}
/* free device memory */
hl_free_mem_device
(
t_device
[
dev
]
->
gpu_mem
);
hl_free_mem_host
(
t_device
[
dev
]
->
cpu_mem
);
CHECK_CUDA
(
dynload
::
cudaEventDestroy
(
t_device
[
dev
]
->
mem_event
));
CHECK_CUDA
(
cudaEventDestroy
(
t_device
[
dev
]
->
mem_event
));
}
free
(
tmp
);
...
...
@@ -308,7 +235,7 @@ void hl_set_device(int device) {
CHECK
(
device
>=
0
&&
device
<
g_system_device_num
&&
g_device
[
device
])
<<
"Device: "
<<
device
<<
" is not specified in startup."
;
CHECK_CUDA
(
dynload
::
cudaSetDevice
(
device
));
CHECK_CUDA
(
cudaSetDevice
(
device
));
/* switch thread stream */
for
(
int
i
=
0
;
i
<
NUMBER_OF_GLOBAL_STREAM
;
i
++
)
{
...
...
@@ -336,7 +263,7 @@ void hl_set_device(int device) {
int
hl_get_device
()
{
int
device
;
CHECK_CUDA
(
dynload
::
cudaGetDevice
(
&
device
));
CHECK_CUDA
(
cudaGetDevice
(
&
device
));
return
device
;
}
...
...
@@ -344,7 +271,7 @@ void *hl_malloc_device(size_t size) {
void
*
dest_d
;
CHECK
(
size
)
<<
__func__
<<
": the size for device memory is 0, please check."
;
CHECK_CUDA
(
dynload
::
cudaMalloc
((
void
**
)
&
dest_d
,
size
));
CHECK_CUDA
(
cudaMalloc
((
void
**
)
&
dest_d
,
size
));
return
dest_d
;
}
...
...
@@ -352,7 +279,7 @@ void *hl_malloc_device(size_t size) {
void
hl_free_mem_device
(
void
*
dest_d
)
{
CHECK_NOTNULL
(
dest_d
);
cudaError_t
err
=
dynload
::
cudaFree
(
dest_d
);
cudaError_t
err
=
cudaFree
(
dest_d
);
CHECK
(
cudaSuccess
==
err
||
cudaErrorCudartUnloading
==
err
)
<<
hl_get_device_error_string
();
}
...
...
@@ -361,8 +288,7 @@ void *hl_malloc_host(size_t size) {
void
*
dest_h
;
CHECK
(
size
)
<<
__func__
<<
": the size for device memory is 0, please check."
;
CHECK_CUDA
(
dynload
::
cudaHostAlloc
((
void
**
)
&
dest_h
,
size
,
cudaHostAllocDefault
));
CHECK_CUDA
(
cudaHostAlloc
((
void
**
)
&
dest_h
,
size
,
cudaHostAllocDefault
));
return
dest_h
;
}
...
...
@@ -370,7 +296,7 @@ void *hl_malloc_host(size_t size) {
void
hl_free_mem_host
(
void
*
dest_h
)
{
CHECK_NOTNULL
(
dest_h
);
cudaError_t
err
=
dynload
::
cudaFreeHost
(
dest_h
);
cudaError_t
err
=
cudaFreeHost
(
dest_h
);
CHECK
(
cudaSuccess
==
err
||
cudaErrorCudartUnloading
==
err
)
<<
hl_get_device_error_string
();
}
...
...
@@ -381,11 +307,11 @@ void hl_memcpy(void *dst, void *src, size_t size) {
}
CHECK_NOTNULL
(
dst
);
CHECK_NOTNULL
(
src
);
CHECK_CUDA
(
dynload
::
cudaMemcpy
(
dst
,
src
,
size
,
cudaMemcpyDefault
));
CHECK_CUDA
(
cudaMemcpy
(
dst
,
src
,
size
,
cudaMemcpyDefault
));
}
void
hl_memset_device
(
void
*
dest_d
,
int
value
,
size_t
size
)
{
CHECK_CUDA
(
dynload
::
cudaMemset
(
dest_d
,
value
,
size
));
CHECK_CUDA
(
cudaMemset
(
dest_d
,
value
,
size
));
}
void
hl_memcpy_host2device
(
void
*
dest_d
,
void
*
src_h
,
size_t
size
)
{
...
...
@@ -394,7 +320,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
}
CHECK_NOTNULL
(
src_h
);
CHECK_NOTNULL
(
dest_d
);
CHECK_CUDA
(
dynload
::
cudaMemcpy
(
dest_d
,
src_h
,
size
,
cudaMemcpyHostToDevice
));
CHECK_CUDA
(
cudaMemcpy
(
dest_d
,
src_h
,
size
,
cudaMemcpyHostToDevice
));
}
void
hl_memcpy_device2host
(
void
*
dest_h
,
void
*
src_d
,
size_t
size
)
{
...
...
@@ -403,7 +329,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
}
CHECK_NOTNULL
(
dest_h
);
CHECK_NOTNULL
(
src_d
);
CHECK_CUDA
(
dynload
::
cudaMemcpy
(
dest_h
,
src_d
,
size
,
cudaMemcpyDeviceToHost
));
CHECK_CUDA
(
cudaMemcpy
(
dest_h
,
src_d
,
size
,
cudaMemcpyDeviceToHost
));
}
void
hl_memcpy_device2device
(
void
*
dest_d
,
void
*
src_d
,
size_t
size
)
{
...
...
@@ -412,8 +338,7 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
}
CHECK_NOTNULL
(
dest_d
);
CHECK_NOTNULL
(
src_d
);
CHECK_CUDA
(
dynload
::
cudaMemcpy
(
dest_d
,
src_d
,
size
,
cudaMemcpyDeviceToDevice
));
CHECK_CUDA
(
cudaMemcpy
(
dest_d
,
src_d
,
size
,
cudaMemcpyDeviceToDevice
));
}
void
hl_memcpy_async
(
void
*
dst
,
void
*
src
,
size_t
size
,
hl_stream_t
stream
)
{
...
...
@@ -427,8 +352,7 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
CHECK_LT
(
stream
,
HPPL_STREAM_END
);
cu_stream
=
t_resource
.
stream
[
stream
];
CHECK_CUDA
(
dynload
::
cudaMemcpyAsync
(
dst
,
src
,
size
,
cudaMemcpyDefault
,
cu_stream
));
CHECK_CUDA
(
cudaMemcpyAsync
(
dst
,
src
,
size
,
cudaMemcpyDefault
,
cu_stream
));
}
void
hl_start
()
{
...
...
@@ -439,8 +363,7 @@ void hl_start() {
bool
hl_device_can_access_peer
(
int
device
,
int
peerDevice
)
{
int
canAccessPeer
;
CHECK_CUDA
(
dynload
::
cudaDeviceCanAccessPeer
(
&
canAccessPeer
,
device
,
peerDevice
));
CHECK_CUDA
(
cudaDeviceCanAccessPeer
(
&
canAccessPeer
,
device
,
peerDevice
));
if
(
canAccessPeer
==
1
)
{
return
true
;
...
...
@@ -450,9 +373,9 @@ bool hl_device_can_access_peer(int device, int peerDevice) {
}
void
hl_device_enable_peer_access
(
int
peerDevice
)
{
cudaError_t
err
=
dynload
::
cudaDeviceEnablePeerAccess
(
peerDevice
,
0
);
cudaError_t
err
=
cudaDeviceEnablePeerAccess
(
peerDevice
,
0
);
if
(
cudaErrorPeerAccessAlreadyEnabled
==
err
)
{
dynload
::
cudaGetLastError
();
cudaGetLastError
();
}
else
{
CHECK_CUDA
(
err
);
}
...
...
@@ -463,9 +386,9 @@ void hl_create_global_resources(hl_device_prop device_prop) {
int
device
=
device_prop
->
device
;
global_device_resources
device_res
=
device_prop
->
device_resources
;
CHECK_CUDA
(
dynload
::
cudaSetDevice
(
device
));
CHECK_CUDA
(
cudaSetDevice
(
device
));
/* device properties */
CHECK_CUDA
(
dynload
::
cudaGetDeviceProperties
(
&
cu_prop
,
device
));
CHECK_CUDA
(
cudaGetDeviceProperties
(
&
cu_prop
,
device
));
device_prop
->
major
=
cu_prop
.
major
;
device_prop
->
minor
=
cu_prop
.
minor
;
...
...
@@ -474,7 +397,7 @@ void hl_create_global_resources(hl_device_prop device_prop) {
/* create device stream */
for
(
int
j
=
0
;
j
<
NUMBER_OF_GLOBAL_STREAM
;
j
++
)
{
CHECK_CUDA
(
dynload
::
cudaStreamCreate
(
&
device_res
->
stream
[
j
]));
CHECK_CUDA
(
cudaStreamCreate
(
&
device_res
->
stream
[
j
]));
}
/* cublas init */
...
...
@@ -501,18 +424,18 @@ void hl_create_global_resources(hl_device_prop device_prop) {
device_res
->
gen_mutex
=
(
pthread_mutex_t
*
)(
malloc
(
sizeof
(
pthread_mutex_t
)));
pthread_mutex_init
(
device_res
->
gen_mutex
,
NULL
);
CHECK_CUDA
(
dynload
::
cudaRuntimeGetVersion
(
&
g_cuda_lib_version
));
CHECK_CUDA
(
cudaRuntimeGetVersion
(
&
g_cuda_lib_version
));
}
int
hl_get_cuda_version
()
{
return
g_cuda_lib_version
;
}
void
hl_create_thread_resources
(
int
device
,
thread_device_resources
device_res
)
{
CHECK_CUDA
(
dynload
::
cudaSetDevice
(
device
));
CHECK_CUDA
(
cudaSetDevice
(
device
));
/* create thread stream */
for
(
int
j
=
0
;
j
<
NUMBER_OF_THREAD_STREAM
;
j
++
)
{
CHECK_CUDA
(
dynload
::
cudaStreamCreate
(
&
device_res
->
stream
[
j
]));
CHECK_CUDA
(
cudaStreamCreate
(
&
device_res
->
stream
[
j
]));
}
/* allocation device memory */
...
...
@@ -521,14 +444,14 @@ void hl_create_thread_resources(int device,
/* allocation host memory */
device_res
->
cpu_mem
=
(
real
*
)
hl_malloc_host
(
HPPL_GPU_MEMORY_SIZE
);
CHECK_CUDA
(
dynload
::
cudaEventCreate
(
&
device_res
->
mem_event
));
CHECK_CUDA
(
cudaEventCreate
(
&
device_res
->
mem_event
));
}
void
hl_specify_devices_start
(
int
*
device
,
int
number
)
{
if
(
hl_start_flag
)
return
;
/* 1. get the number of devices */
CHECK_CUDA
(
dynload
::
cudaGetDeviceCount
(
&
g_system_device_num
));
CHECK_CUDA
(
cudaGetDeviceCount
(
&
g_system_device_num
));
CHECK_NE
(
g_system_device_num
,
0
)
<<
"[Start failed] there is no GPU device"
;
if
(
device
==
NULL
)
{
number
=
g_system_device_num
;
...
...
@@ -640,7 +563,7 @@ void hl_stream_synchronize(hl_stream_t stream) {
<<
": the parameter stream is error."
;
cu_stream
=
t_resource
.
stream
[
stream
];
CHECK_CUDA
(
dynload
::
cudaStreamSynchronize
(
cu_stream
));
CHECK_CUDA
(
cudaStreamSynchronize
(
cu_stream
));
}
void
hl_create_event
(
hl_event_t
*
event
)
{
...
...
@@ -649,7 +572,7 @@ void hl_create_event(hl_event_t *event) {
struct
_hl_event_st
*
st_event
=
(
struct
_hl_event_st
*
)
malloc
(
sizeof
(
struct
_hl_event_st
));
CHECK_CUDA
(
dynload
::
cudaEventCreate
(
&
st_event
->
cu_event
));
CHECK_CUDA
(
cudaEventCreate
(
&
st_event
->
cu_event
));
*
event
=
st_event
;
}
...
...
@@ -659,8 +582,7 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
CHECK_NOTNULL
(
start
);
CHECK_NOTNULL
(
end
);
CHECK_CUDA
(
dynload
::
cudaEventElapsedTime
(
&
time
,
start
->
cu_event
,
end
->
cu_event
));
CHECK_CUDA
(
cudaEventElapsedTime
(
&
time
,
start
->
cu_event
,
end
->
cu_event
));
return
time
;
}
...
...
@@ -672,7 +594,7 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
<<
": the parameter stream is error."
;
cu_stream
=
t_resource
.
stream
[
stream
];
CHECK_CUDA
(
dynload
::
cudaEventRecord
(
event
->
cu_event
,
cu_stream
));
CHECK_CUDA
(
cudaEventRecord
(
event
->
cu_event
,
cu_stream
));
}
void
hl_stream_wait_event
(
hl_stream_t
stream
,
hl_event_t
event
)
{
...
...
@@ -683,12 +605,12 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
<<
": the parameter stream is error."
;
cu_stream
=
t_resource
.
stream
[
stream
];
CHECK_CUDA
(
dynload
::
cudaStreamWaitEvent
(
cu_stream
,
event
->
cu_event
,
0
));
CHECK_CUDA
(
cudaStreamWaitEvent
(
cu_stream
,
event
->
cu_event
,
0
));
}
void
hl_destroy_event
(
hl_event_t
event
)
{
CHECK_NOTNULL
(
event
);
CHECK_CUDA
(
dynload
::
cudaEventDestroy
(
event
->
cu_event
));
CHECK_CUDA
(
cudaEventDestroy
(
event
->
cu_event
));
free
(
event
);
event
=
NULL
;
...
...
@@ -696,7 +618,7 @@ void hl_destroy_event(hl_event_t event) {
void
hl_event_synchronize
(
hl_event_t
event
)
{
CHECK_NOTNULL
(
event
);
CHECK_CUDA
(
dynload
::
cudaEventSynchronize
(
event
->
cu_event
));
CHECK_CUDA
(
cudaEventSynchronize
(
event
->
cu_event
));
}
void
hl_get_device_name
(
char
*
name
,
int
len
,
int
device
)
{
...
...
@@ -725,24 +647,24 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) {
*
minor
=
g_device
[
device
]
->
minor
;
}
int
hl_get_device_last_error
()
{
return
(
int
)
dynload
::
cudaGetLastError
();
}
int
hl_get_device_last_error
()
{
return
(
int
)
cudaGetLastError
();
}
const
char
*
hl_get_device_error_string
()
{
cudaError_t
err
=
dynload
::
cudaGetLastError
();
return
dynload
::
cudaGetErrorString
(
err
);
cudaError_t
err
=
cudaGetLastError
();
return
cudaGetErrorString
(
err
);
}
const
char
*
hl_get_device_error_string
(
size_t
err
)
{
return
dynload
::
cudaGetErrorString
((
cudaError_t
)
err
);
return
cudaGetErrorString
((
cudaError_t
)
err
);
}
void
hl_device_synchronize
()
{
CHECK_CUDA
(
dynload
::
cudaDeviceSynchronize
());
}
void
hl_device_synchronize
()
{
CHECK_CUDA
(
cudaDeviceSynchronize
());
}
void
hl_set_device_flags_block
()
{
CHECK_CUDA
(
dynload
::
cudaSetDeviceFlags
(
cudaDeviceScheduleBlockingSync
));
CHECK_CUDA
(
cudaSetDeviceFlags
(
cudaDeviceScheduleBlockingSync
));
}
bool
hl_cuda_event_is_ready
(
hl_event_t
event
)
{
cudaError_t
err
=
dynload
::
cudaEventQuery
(
event
->
cu_event
);
cudaError_t
err
=
cudaEventQuery
(
event
->
cu_event
);
CHECK
(
cudaSuccess
==
err
||
cudaErrorNotReady
==
err
);
if
(
cudaErrorNotReady
==
err
)
{
...
...
@@ -751,6 +673,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
return
true
;
}
void
hl_profiler_start
()
{
CHECK_CUDA
(
dynload
::
cudaProfilerStart
());
}
void
hl_profiler_start
()
{
CHECK_CUDA
(
cudaProfilerStart
());
}
void
hl_profiler_end
()
{
CHECK_CUDA
(
dynload
::
cudaProfilerStop
());
}
void
hl_profiler_end
()
{
CHECK_CUDA
(
cudaProfilerStop
());
}
paddle/cuda/src/hl_cudart_wrap.cc
已删除
100644 → 0
浏览文件 @
441d00e6
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_USE_DSO
#include <cuda_runtime.h>
#include <mutex>
#include "hl_dso_loader.h"
/**
* cudart wrapper: dynamically loads libcudart.so.
* When nvcc compiles CUDA kernels, it inserts calls to
* some built-in runtime routines, which must be
* provided by us if PADDLE_USE_DSO is true. If
* PADDLE_USE_DSO is false, all of them must be
* omitted to avoid multiple definitions.
*/
namespace dynload {

/* Declared here, defined in another translation unit. */
extern std::once_flag cudart_dso_flag;
extern void* cudart_dso_handle;

/**
 * DYNAMIC_LOAD_CUDART_WRAP(__name, __type) defines a struct
 * DynLoad__<name> and an object called __name whose operator() returns
 * __type. Each call:
 *   1. invokes GetCudartDsoHandle(&cudart_dso_handle) at most once, guarded
 *      by cudart_dso_flag;
 *   2. looks the routine up by name with dlsym() on cudart_dso_handle;
 *   3. calls it through a pointer typed from the deduced argument pack.
 * The dlsym() result is not checked before it is called.
 **/
#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type)                               \
  struct DynLoad__##__name {                                                   \
    template <typename... Args>                                                \
    __type operator()(Args... args) {                                          \
      typedef __type (*cudartFunc)(Args...);                                   \
      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
      return reinterpret_cast<cudartFunc>(p_##__name)(args...);                \
    }                                                                          \
  } __name; /* struct DynLoad__##__name */

/* include all needed cuda functions in HPPL */
// clang-format off
#define CUDA_ROUTINE_EACH(__macro) \
  __macro(cudaLaunch, cudaError_t) \
  __macro(cudaSetupArgument, cudaError_t) \
  __macro(cudaConfigureCall, cudaError_t) \
  __macro(__cudaRegisterFatBinary, void**) \
  __macro(__cudaUnregisterFatBinary, void) \
  __macro(__cudaRegisterFunction, void) \
  __macro(__cudaRegisterVar, void) \
  __macro(__cudaRegisterManagedVar, void) \
  __macro(__cudaInitModule, char) \
  __macro(__cudaRegisterTexture, void) \
  __macro(__cudaRegisterSurface, void)
// clang-format on

/* Instantiate one wrapper object per (routine, return type) pair above. */
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)

#if CUDART_VERSION >= 7000
DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
#endif

/* Undefine both helper macros; nothing after this point uses them. */
#undef CUDA_ROUTINE_EACH
#undef DYNAMIC_LOAD_CUDART_WRAP

} /* namespace dynload */
#if CUDART_VERSION >= 7000
/**
 * Global-scope definition of cudaLaunchKernel, compiled only when
 * CUDART_VERSION >= 7000. Every argument is handed unchanged to
 * dynload::cudaLaunchKernel and its result is returned.
 */
__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
                                                dim3 gridDim,
                                                dim3 blockDim,
                                                void **args,
                                                size_t sharedMem,
                                                cudaStream_t stream) {
  const cudaError_t status = dynload::cudaLaunchKernel(
      func, gridDim, blockDim, args, sharedMem, stream);
  return status;
}
#endif /* CUDART_VERSION >= 7000 */
/**
 * Global-scope definition of cudaLaunch that delegates to
 * dynload::cudaLaunch and returns its result.
 */
__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
  const cudaError_t status = dynload::cudaLaunch(func);
  return status;
}
/**
 * Global-scope definition of cudaSetupArgument that delegates to
 * dynload::cudaSetupArgument with the same (arg, size, offset) triple and
 * returns its result.
 */
__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
                                                 size_t size,
                                                 size_t offset) {
  const cudaError_t status = dynload::cudaSetupArgument(arg, size, offset);
  return status;
}
/**
 * Global-scope definition of cudaConfigureCall that delegates to
 * dynload::cudaConfigureCall with the same arguments and returns its result.
 */
__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
                                                 dim3 blockDim,
                                                 size_t sharedMem,
                                                 cudaStream_t stream) {
  const cudaError_t status =
      dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
  return status;
}
extern
"C"
{
/**
 * Delegates to dynload::__cudaRegisterFatBinary and returns the handle it
 * yields.
 */
void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
  void **handle = dynload::__cudaRegisterFatBinary(fatCubin);
  return handle;
}
/**
 * Delegates to dynload::__cudaUnregisterFatBinary; nothing is returned.
 */
void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
  dynload::__cudaUnregisterFatBinary(fatCubinHandle);
}
/**
 * Delegates to dynload::__cudaRegisterFunction, passing all ten arguments
 * through in their original order; nothing is returned.
 */
void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle,
                                      const char *hostFun,
                                      char *deviceFun,
                                      const char *deviceName,
                                      int thread_limit,
                                      uint3 *tid,
                                      uint3 *bid,
                                      dim3 *bDim,
                                      dim3 *gDim,
                                      int *wSize) {
  dynload::__cudaRegisterFunction(fatCubinHandle,
                                  hostFun,
                                  deviceFun,
                                  deviceName,
                                  thread_limit,
                                  tid,
                                  bid,
                                  bDim,
                                  gDim,
                                  wSize);
}
/**
 * Delegates to dynload::__cudaRegisterVar, passing all eight arguments
 * through in their original order; nothing is returned.
 */
void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle,
                                 char *hostVar,
                                 char *deviceAddress,
                                 const char *deviceName,
                                 int ext,
                                 int size,
                                 int constant,
                                 int global) {
  dynload::__cudaRegisterVar(fatCubinHandle,
                             hostVar,
                             deviceAddress,
                             deviceName,
                             ext,
                             size,
                             constant,
                             global);
}
/**
 * Delegates to dynload::__cudaRegisterManagedVar, passing all eight
 * arguments through in their original order; nothing is returned.
 */
extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle,
                                               void **hostVarPtrAddress,
                                               char *deviceAddress,
                                               const char *deviceName,
                                               int ext,
                                               int size,
                                               int constant,
                                               int global) {
  dynload::__cudaRegisterManagedVar(fatCubinHandle,
                                    hostVarPtrAddress,
                                    deviceAddress,
                                    deviceName,
                                    ext,
                                    size,
                                    constant,
                                    global);
}
/**
 * Delegates to dynload::__cudaInitModule and returns the char it yields.
 */
char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
  const char result = dynload::__cudaInitModule(fatCubinHandle);
  return result;
}
/**
 * Delegates to dynload::__cudaRegisterTexture, passing all seven arguments
 * through in their original order; nothing is returned.
 */
void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle,
                                     const struct textureReference *hostVar,
                                     const void **deviceAddress,
                                     const char *deviceName,
                                     int dim,
                                     int norm,
                                     int ext) {
  dynload::__cudaRegisterTexture(fatCubinHandle,
                                 hostVar,
                                 deviceAddress,
                                 deviceName,
                                 dim,
                                 norm,
                                 ext);
}
/**
 * Delegates to dynload::__cudaRegisterSurface, passing all six arguments
 * through in their original order; nothing is returned.
 */
void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle,
                                     const struct surfaceReference *hostVar,
                                     const void **deviceAddress,
                                     const char *deviceName,
                                     int dim,
                                     int ext) {
  dynload::__cudaRegisterSurface(fatCubinHandle,
                                 hostVar,
                                 deviceAddress,
                                 deviceName,
                                 dim,
                                 ext);
}
}
/* extern "C" */
#endif
paddle/cuda/src/hl_dso_loader.cc
浏览文件 @
de27ee7d
...
...
@@ -25,10 +25,8 @@ DEFINE_string(cudnn_dir,
DEFINE_string
(
cuda_dir
,
""
,
"Specify path for loading cuda library, such as libcublas, "
"libcurand. For instance, /usr/local/cuda/lib64. (Note: "
"libcudart can not be specified by cuda_dir, since some "
"build-in function in cudart already ran before main entry). "
"If default, dlopen will search cuda from LD_LIBRARY_PATH"
);
"libcurand. For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH"
);
DEFINE_string
(
warpctc_dir
,
""
,
"Specify path for loading libwarpctc.so."
);
...
...
@@ -147,14 +145,6 @@ void GetCudnnDsoHandle(void** dso_handle) {
#endif
}
/**
 * Asks GetDsoHandleFromSearchPath for the CUDA runtime library, using an
 * empty search root and a platform-specific file name: "libcudart.dylib"
 * when __APPLE__ or __OSX__ is defined, "libcudart.so" otherwise.
 *
 * @param dso_handle passed through to GetDsoHandleFromSearchPath
 *                   (presumably receives the loaded handle; that helper is
 *                   defined elsewhere -- confirm there).
 */
void GetCudartDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
  const char* const cudart_lib = "libcudart.dylib";
#else
  const char* const cudart_lib = "libcudart.so";
#endif
  GetDsoHandleFromSearchPath("", cudart_lib, dso_handle);
}
void
GetCurandDsoHandle
(
void
**
dso_handle
)
{
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath
(
FLAGS_cuda_dir
,
"libcurand.dylib"
,
dso_handle
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录