Commit 93c1d9e7 (unverified)
Authored on Feb 08, 2021 by Qi Li; committed via GitHub on Feb 08, 2021.
[ROCM] update fluid platform for rocm39 (part3), test=develop (#30913)
Parent: 15297a06
Showing 19 changed files with 750 additions and 110 deletions (+750, -110)
Changed files:

paddle/fluid/platform/CMakeLists.txt           +43   -16
paddle/fluid/platform/collective_helper.cc      +3    -2
paddle/fluid/platform/collective_helper.h       +2    -2
paddle/fluid/platform/cuda_device_function.h   +27   -12
paddle/fluid/platform/cuda_helper.h            +21    -1
paddle/fluid/platform/cuda_helper_test.cu      +60    -0
paddle/fluid/platform/cuda_primitives.h        +14    -9
paddle/fluid/platform/cuda_resource_pool.cc    +23    -5
paddle/fluid/platform/cuda_resource_pool.h     +10    -2
paddle/fluid/platform/cudnn_desc_test.cc        +4    -0
paddle/fluid/platform/device_code.cc          +151   -10
paddle/fluid/platform/device_code.h            +14    -1
paddle/fluid/platform/device_code_test.cc      +17    -1
paddle/fluid/platform/device_context.cc        +47   -14
paddle/fluid/platform/device_context.h         +74    -9
paddle/fluid/platform/device_context_test.cu    +4    -0
paddle/fluid/platform/enforce.h                 +1    -7
paddle/fluid/platform/miopen_desc.h           +221    -0
tools/dockerfile/Dockerfile.rocm               +14   -19
paddle/fluid/platform/CMakeLists.txt

@@ -52,7 +52,12 @@ ENDIF()
 cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
-nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
+IF(WITH_GPU)
+    nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
+ENDIF()
+IF(WITH_ROCM)
+    hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
+ENDIF()
 cc_library(place SRCS place.cc DEPS enforce boost)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)

@@ -72,7 +77,7 @@ IF(WITH_DGC)
     set(dgc_deps dgc)
 ENDIF()
-IF(WITH_GPU)
+IF(WITH_GPU OR WITH_ROCM)
     set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
 ENDIF()

@@ -81,9 +86,14 @@ IF(WITH_MKLDNN)
 ELSE()
     set(MKLDNN_CTX_DEPS)
 ENDIF()
-nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
+IF(WITH_GPU)
+    nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
+ENDIF()
+IF(WITH_ROCM)
+    hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
+ENDIF()
 IF(WITH_GPU OR WITH_ROCM)
     set(STREAM_CALLBACK_DEPS stream_callback_manager)
 ELSE()
     set(STREAM_CALLBACK_DEPS)

@@ -103,18 +113,26 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool
 cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
-if(WITH_GPU)
+if(WITH_GPU OR WITH_ROCM)
     cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)
     target_link_libraries(device_context cuda_resource_pool)
 endif()
-nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 cc_test(init_test SRCS init_test.cc DEPS device_context)
-nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
-nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
-nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
+if(WITH_GPU)
+    nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
+    nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
+    nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
+    nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
+endif()
+if(WITH_ROCM)
+    hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
+    hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
+    hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda tensor)
+    hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
+endif()
 cc_library(timer SRCS timer.cc)
 cc_test(timer_test SRCS timer_test.cc DEPS timer)

@@ -127,25 +145,34 @@ if(WITH_GPU)
     nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda)
     nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
     nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
+elseif(WITH_ROCM)
+    hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
+    hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
+    hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
 else()
     cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
     cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
 endif()

 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
-nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
 cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor)
-nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
-nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
+IF(WITH_GPU)
+    nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
+    nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
+    nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
+ENDIF()
+IF(WITH_ROCM)
+    hip_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
+    hip_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
+    hip_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
+ENDIF()

 if(NOT APPLE AND NOT WIN32)
     cc_library(device_code SRCS device_code.cc DEPS device_context)
-    if(WITH_GPU)
+    if(WITH_GPU OR WITH_ROCM)
         cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor)
     endif()
 endif()
paddle/fluid/platform/collective_helper.cc

@@ -13,10 +13,11 @@
 // limitations under the License.

 #include "paddle/fluid/platform/collective_helper.h"
+#include <utility>

 namespace paddle {
 namespace platform {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class NCCLCommImpl : public NCCLComm {
  public:
   void set_ring_id(int ring_id) { ring_id_ = ring_id; }

@@ -35,7 +36,7 @@ class NCCLCommImpl : public NCCLComm {
   void set_comm(ncclComm_t comm) { comm_ = comm; }
   ncclComm_t comm() const override { return comm_; }

-  cudaStream_t stream() const override { return dev_ctx_->stream(); }
+  gpuStream_t stream() const override { return dev_ctx_->stream(); }

   void set_dev_ctx(std::unique_ptr<CUDADeviceContext>&& dev_ctx) {
     dev_ctx_ = std::move(dev_ctx);
paddle/fluid/platform/collective_helper.h

@@ -27,7 +27,7 @@
 namespace paddle {
 namespace platform {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 // In order to apply hierarchical communication with NCCL, we need
 // a communication ring contains NCCL communicators associated to a global
 // ncclUniqueId. E.g. for a hierarchical case,

@@ -56,7 +56,7 @@ class NCCLComm {
   virtual int rank() const = 0;
   virtual int device_id() const = 0;
   virtual ncclComm_t comm() const = 0;
-  virtual cudaStream_t stream() const = 0;
+  virtual gpuStream_t stream() const = 0;
   virtual CUDADeviceContext* dev_context() const = 0;
   virtual ~NCCLComm() = default;
 };
paddle/fluid/platform/cuda_device_function.h

@@ -14,10 +14,8 @@ limitations under the License. */
 #pragma once
-#include <cuda.h>

 // NOTE(): support float16 to half in header file.
 #define PADDLE_CUDA_FP16
-#include <cuda_fp16.h>
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
 #include "paddle/fluid/platform/float16.h"

@@ -25,6 +23,9 @@ limitations under the License. */
 namespace paddle {
 namespace platform {

+#ifdef PADDLE_WITH_HIP
+#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
+#else
 #if CUDA_VERSION < 9000
 #define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
 #else

@@ -32,6 +33,7 @@ namespace platform {
 #define CREATE_SHFL_MASK(mask, predicate) \
   mask = __ballot_sync(FULL_WARP_MASK, (predicate))
 #endif
+#endif

 inline static int RoundToPowerOfTwo(int dim) {
   if (dim > 512) {

@@ -67,7 +69,7 @@ template <typename T>
 __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
                                                  int delta,
                                                  int width = warpSize) {
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
   return __shfl_down(val, delta, width);
 #else
   return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);

@@ -77,7 +79,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
 template <typename T>
 __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
                                                 int width = warpSize) {
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
   return __shfl_xor(val, width);
 #else
   return __shfl_xor_sync(mask, val, width);

@@ -85,18 +87,27 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
 }

 // CUDA 9.0 have native compatible float16 shfl_down
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
 template <>
 __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
                                                        float16 val, int delta,
                                                        int width) {
+#ifdef PADDLE_WITH_HIP
+  return float16(__shfl_down(static_cast<float>(val),
+                             static_cast<unsigned>(delta), width));
+#else
   return float16(
       __shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
+#endif
 }
+
 template <>
 __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
                                                       float16 val, int width) {
+#ifdef PADDLE_WITH_HIP
+  return float16(__shfl_xor(static_cast<float>(val), width));
+#else
   return float16(__shfl_xor(static_cast<half>(val), width));
+#endif
 }
 #else
 template <>

@@ -159,7 +170,7 @@ __forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync(
 template <typename T>
 __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
                                              int width = 32) {
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
   return __shfl(val, src_line, width);
 #else
   return __shfl_sync(mask, val, src_line, width);

@@ -173,13 +184,17 @@ HOSTDEVICE T Infinity() {
 template <typename T>
 __device__ T reduceSum(T val, int tid, int len) {
-  // NOTE(zcd): The warp size should be taken from the
-  // parameters of the GPU but not specified as 32 simply.
-  // To make the reduceSum more efficiently,
-  // I use Warp-Level Parallelism and assume the Warp size
-  // is 32 which may be different for different GPU,
-  // but most card's warp size is 32.
+// NOTE(zcd): The warp size should be taken from the
+// parameters of the GPU but not specified as 32 simply.
+// To make the reduceSum more efficiently,
+// I use Warp-Level Parallelism and assume the Warp size
+// is 32 which may be different for different GPU,
+// but most card's warp size is 32.
+#ifdef PADDLE_WITH_HIP
+  const int warpSize = 64;
+#else
   const int warpSize = 32;
+#endif
   __shared__ T shm[warpSize];
   unsigned mask = 0u;
   CREATE_SHFL_MASK(mask, tid < len);
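Note on the reduceSum hunk above: AMD wavefronts are 64 lanes wide and HIP's __ballot produces a 64-bit mask, which is why the HIP branch pins warpSize to 64 where CUDA uses 32. As a minimal standalone sketch of the shuffle-down reduction these primitives support (illustrative only, not Paddle's exact reduceSum; WarpReduceSum is a hypothetical name, and the block assumes the CREATE_SHFL_MASK and CudaShuffleDownSync definitions above are in scope):

template <typename T>
__device__ T WarpReduceSum(T val, int tid, int len) {
#ifdef PADDLE_WITH_HIP
  const int kWarpSize = 64;  // AMD wavefront width
#else
  const int kWarpSize = 32;  // NVIDIA warp width
#endif
  unsigned mask = 0u;
  CREATE_SHFL_MASK(mask, tid < len);
  // Fold the upper half of the active lanes onto the lower half each step.
  for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
    val += CudaShuffleDownSync(mask, val, offset);
  }
  return val;  // lane 0 now holds the warp-wide sum
}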
paddle/fluid/platform/cuda_helper.h

@@ -16,11 +16,16 @@
 #include <mutex>  // NOLINT

+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cublas.h"
+#endif
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/dynload/rocblas.h"
+#endif
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"

-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000
 enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
 #endif

@@ -77,6 +82,12 @@ namespace platform {
 class CublasHandleHolder {
  public:
+#ifdef PADDLE_WITH_HIP
+  explicit CublasHandleHolder(hipStream_t stream) {
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_));
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream));
+  }
+#else
   CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
     PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
     PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));

@@ -92,9 +103,14 @@ class CublasHandleHolder {
     }
 #endif  // CUDA_VERSION >= 9000
   }
+#endif

   ~CublasHandleHolder() PADDLE_MAY_THROW {
+#ifdef PADDLE_WITH_HIP
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_));
+#else
     PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
+#endif
   }

   template <typename Callback>

@@ -106,7 +122,11 @@ class CublasHandleHolder {
  private:
   DISABLE_COPY_AND_ASSIGN(CublasHandleHolder);

+#ifdef PADDLE_WITH_HIP
+  rocblas_handle handle_;
+#else
   cublasHandle_t handle_;
+#endif
   mutable std::mutex mtx_;
 };
paddle/fluid/platform/cuda_helper_test.cu

@@ -47,8 +47,13 @@ void TestCase(size_t num) {
   T *in1, *in2, *out;
   T *d_in1, *d_in2;
   size_t size = sizeof(T) * num;
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void**>(&d_in1), size);
+  hipMalloc(reinterpret_cast<void**>(&d_in2), size);
+#else
   cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
   cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+#endif
   in1 = reinterpret_cast<T*>(malloc(size));
   in2 = reinterpret_cast<T*>(malloc(size));
   out = reinterpret_cast<T*>(malloc(size));

@@ -58,12 +63,22 @@ void TestCase(size_t num) {
     in1[i] = static_cast<T>(dist(engine));
     in2[i] = static_cast<T>(dist(engine));
   }
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
+  hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<T>), dim3(1),
+                     dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2, num);
+  hipDeviceSynchronize();
+  hipMemcpy(out, d_in2, size, hipMemcpyDeviceToHost);
+  hipDeviceSynchronize();
+#else
   cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
   cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
   AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
   cudaDeviceSynchronize();
   cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
+#endif
   for (size_t i = 0; i < num; ++i) {
     // NOTE(dzhwinter): the float16 add has small underflow/overflow
     // so we use EXPECT_NEAR to check the result.

@@ -73,8 +88,13 @@ void TestCase(size_t num) {
   free(in1);
   free(in2);
   free(out);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in1);
+  hipFree(d_in2);
+#else
   cudaFree(d_in1);
   cudaFree(d_in2);
+#endif
 }

 // cuda primitives

@@ -103,8 +123,13 @@ void TestUnalign(size_t num, const int shift_bit) {
   size_t size = sizeof(uint8_t) * (num + shift_bit);
   size_t array_size = sizeof(float16) * (num / 2);
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void**>(&d_in1), size);
+  hipMalloc(reinterpret_cast<void**>(&d_in2), size);
+#else
   cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
   cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+#endif
   in1 = reinterpret_cast<float16*>(malloc(size));
   in2 = reinterpret_cast<float16*>(malloc(size));
   out = reinterpret_cast<float16*>(malloc(size));

@@ -121,12 +146,23 @@ void TestUnalign(size_t num, const int shift_bit) {
     r_in1[i] = static_cast<float16>(dist(engine));
     r_in2[i] = static_cast<float16>(dist(engine));
   }
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in1, r_in1, array_size, hipMemcpyHostToDevice);
+  hipMemcpy(d_in2, r_in2, array_size, hipMemcpyHostToDevice);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<float16>), dim3(1),
+                     dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2,
+                     num / 2);
+  hipDeviceSynchronize();
+  hipMemcpy(out, d_in2, array_size, hipMemcpyDeviceToHost);
+  hipDeviceSynchronize();
+#else
   cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
   cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
   AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
   cudaDeviceSynchronize();
   cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
+#endif
   for (size_t i = 0; i < num / 2; ++i) {
     // NOTE(dzhwinter): the float16 add has small truncate error.
     // so we use EXPECT_NEAR to check the result.

@@ -137,8 +173,13 @@ void TestUnalign(size_t num, const int shift_bit) {
   free(in1);
   free(in2);
   free(out);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in1);
+  hipFree(d_in2);
+#else
   cudaFree(d_in1);
   cudaFree(d_in2);
+#endif
 }

 TEST(CudaAtomic, float16Unalign) {

@@ -203,8 +244,13 @@ void TestReduce(size_t num, float atol = 0.01) {
   T* in1;
   T *d_in1, *d_in2;
   size_t size = sizeof(T) * num;
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void**>(&d_in1), size);
+  hipMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
+#else
   cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
   cudaMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
+#endif
   in1 = reinterpret_cast<T*>(malloc(size));
   std::minstd_rand engine;
   std::uniform_real_distribution<double> dist(0.0, 1.0);

@@ -212,17 +258,31 @@ void TestReduce(size_t num, float atol = 0.01) {
     in1[i] = static_cast<T>(dist(engine));
   }
   auto out = std::accumulate(in1, in1 + num, static_cast<T>(0));
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
+  hipDeviceSynchronize();
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(DeviceReduceSum<T>), dim3(1),
+                     dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2, num);
+  hipMemcpy(in1, d_in2, sizeof(T), hipMemcpyDeviceToHost);
+  hipDeviceSynchronize();
+#else
   cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
   cudaDeviceSynchronize();
   DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
   cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
+#endif
   // NOTE(dzhwinter): the float16 add has small underflow/overflow
   // so we use EXPECT_NEAR to check the result.
   EXPECT_NEAR(static_cast<float>(in1[0]), static_cast<float>(out), atol);
   free(in1);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in1);
+  hipFree(d_in2);
+#else
   cudaFree(d_in1);
   cudaFree(d_in2);
+#endif
 }

 TEST(CudaShuffleSync, float16) {
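The HIP branches in this test replace CUDA's triple-chevron launches with hipLaunchKernelGGL, wrapping template kernels in HIP_KERNEL_NAME so the comma in AddKernel<T> survives macro expansion. A self-contained sketch of the same mapping on a plain kernel (ScaleKernel and LaunchScale are hypothetical names, not part of this test):

#include <hip/hip_runtime.h>

__global__ void ScaleKernel(float* data, float factor, size_t n) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

// CUDA would write: ScaleKernel<<<blocks, threads, 0, stream>>>(d, f, n);
// HIP spells the same launch as an ordinary macro call:
void LaunchScale(float* d, float f, size_t n, hipStream_t stream) {
  const unsigned threads = 256;
  const unsigned blocks = static_cast<unsigned>((n + threads - 1) / threads);
  hipLaunchKernelGGL(ScaleKernel, dim3(blocks), dim3(threads),
                     0 /* shared memory bytes */, stream, d, f, n);
}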
paddle/fluid/platform/cuda_primitives.h

@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once

+#ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
 #include <stdio.h>
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"

@@ -50,7 +55,7 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) {
       static_cast<unsigned long long int>(val));  // NOLINT
 }

-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600)
 USE_CUDA_ATOMIC(Add, double);
 #else
 CUDA_ATOMIC_WRAPPER(Add, double) {

@@ -149,12 +154,12 @@ USE_CUDA_ATOMIC(Max, int);
 USE_CUDA_ATOMIC(Max, unsigned int);
 // CUDA API uses unsigned long long int, we cannot use uint64_t here.
 // It because unsigned long long int is not necessarily uint64_t
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
 USE_CUDA_ATOMIC(Max, unsigned long long int);  // NOLINT
 #else
 CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) {  // NOLINT
   if (*address >= val) {
-    return;
+    return *address;
   }

   unsigned long long int old = *address, assumed;  // NOLINT

@@ -181,7 +186,7 @@ CUDA_ATOMIC_WRAPPER(Max, int64_t) {
 CUDA_ATOMIC_WRAPPER(Max, float) {
   if (*address >= val) {
-    return;
+    return *address;
   }

   int* const address_as_i = reinterpret_cast<int*>(address);

@@ -199,7 +204,7 @@ CUDA_ATOMIC_WRAPPER(Max, float) {
 CUDA_ATOMIC_WRAPPER(Max, double) {
   if (*address >= val) {
-    return;
+    return *address;
   }

   unsigned long long int* const address_as_ull =  // NOLINT

@@ -221,12 +226,12 @@ USE_CUDA_ATOMIC(Min, int);
 USE_CUDA_ATOMIC(Min, unsigned int);
 // CUDA API uses unsigned long long int, we cannot use uint64_t here.
 // It because unsigned long long int is not necessarily uint64_t
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
 USE_CUDA_ATOMIC(Min, unsigned long long int);  // NOLINT
 #else
 CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) {  // NOLINT
   if (*address <= val) {
-    return;
+    return *address;
   }
   unsigned long long int old = *address, assumed;  // NOLINT

@@ -253,7 +258,7 @@ CUDA_ATOMIC_WRAPPER(Min, int64_t) {
 CUDA_ATOMIC_WRAPPER(Min, float) {
   if (*address <= val) {
-    return;
+    return *address;
   }
   int* const address_as_i = reinterpret_cast<int*>(address);

@@ -271,7 +276,7 @@ CUDA_ATOMIC_WRAPPER(Min, float) {
 CUDA_ATOMIC_WRAPPER(Min, double) {
   if (*address <= val) {
-    return;
+    return *address;
   }
   unsigned long long int* const address_as_ull =  // NOLINT
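The `return;` to `return *address;` changes above fix early exits in wrappers that must return the observed value. The compare-and-swap retry loops that follow each early exit are collapsed in this view; a sketch of how such a body is conventionally completed, shown for Max on double and assuming CUDA_ATOMIC_WRAPPER expands to a device function taking (T* address, T val):

CUDA_ATOMIC_WRAPPER(Max, double) {
  if (*address >= val) {
    return *address;
  }
  unsigned long long int* const address_as_ull =           // NOLINT
      reinterpret_cast<unsigned long long int*>(address);  // NOLINT
  unsigned long long int old = *address_as_ull, assumed;   // NOLINT
  do {
    assumed = old;
    if (__longlong_as_double(assumed) >= val) {
      break;  // another thread already stored something >= val
    }
    // Retry until no other thread mutated the word between read and CAS.
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
  } while (assumed != old);
  return __longlong_as_double(old);
}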
paddle/fluid/platform/cuda_resource_pool.cc

@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_resource_pool.h"
 #include "paddle/fluid/platform/gpu_info.h"

@@ -25,15 +25,24 @@ CudaStreamResourcePool::CudaStreamResourcePool() {
   for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
     auto creator = [dev_idx] {
       platform::SetDeviceId(dev_idx);
-      cudaStream_t stream;
+      gpuStream_t stream;
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+#endif
       return stream;
     };

-    auto deleter = [dev_idx](cudaStream_t stream) {
+    auto deleter = [dev_idx](gpuStream_t stream) {
       platform::SetDeviceId(dev_idx);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream));
+#endif
     };

     pool_.emplace_back(

@@ -65,15 +74,24 @@ CudaEventResourcePool::CudaEventResourcePool() {
   for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
     auto creator = [dev_idx] {
       platform::SetDeviceId(dev_idx);
-      cudaEvent_t event;
+      gpuEvent_t event;
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipEventCreateWithFlags(&event, hipEventDisableTiming));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+#endif
       return event;
     };

-    auto deleter = [dev_idx](cudaEvent_t event) {
+    auto deleter = [dev_idx](gpuEvent_t event) {
       platform::SetDeviceId(dev_idx);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event));
+#endif
     };

     pool_.emplace_back(ResourcePool<CudaEventObject>::Create(creator, deleter));
paddle/fluid/platform/cuda_resource_pool.h

@@ -14,9 +14,17 @@
 #pragma once

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

+#ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif

 #include <memory>
 #include <type_traits>
 #include <vector>

@@ -26,8 +34,8 @@
 namespace paddle {
 namespace platform {

-using CudaStreamObject = std::remove_pointer<cudaStream_t>::type;
-using CudaEventObject = std::remove_pointer<cudaEvent_t>::type;
+using CudaStreamObject = std::remove_pointer<gpuStream_t>::type;
+using CudaEventObject = std::remove_pointer<gpuEvent_t>::type;

 class CudaStreamResourcePool {
  public:
paddle/fluid/platform/cudnn_desc_test.cc

@@ -12,7 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/miopen_desc.h"
+#else
 #include "paddle/fluid/platform/cudnn_desc.h"
+#endif

 #include <gtest/gtest.h>
paddle/fluid/platform/device_code.cc

@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/fluid/platform/device_code.h"
+
 #include <sys/stat.h>
 #include <algorithm>
 #include <set>
 #include <utility>
-#include "paddle/fluid/platform/device_code.h"
 #include "paddle/fluid/platform/enforce.h"

 DECLARE_string(cuda_dir);

@@ -71,26 +72,35 @@ DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) {
   }

   for (auto& p : set) {
     if (is_gpu_place(p)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       device_codes_.emplace(p, DeviceCodeMap());
 #else
       PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "CUDAPlace is not supported, please re-compile with WITH_GPU=ON."));
+          "CUDAPlace or HIPPlace is not supported, please re-compile with "
+          "WITH_GPU=ON or WITH_ROCM=ON."));
 #endif
     }
   }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   CUDADeviceCode::CheckAvailableStatus();
 #endif
 }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#ifdef PADDLE_WITH_HIP
+static bool CheckCUDADriverResult(hipError_t result, std::string caller,
+                                  std::string kernel_name = "") {
+  if (result != hipSuccess) {
+    const char* error = nullptr;
+    error = dynload::hipGetErrorString(result);
+#else
 static bool CheckCUDADriverResult(CUresult result, std::string caller,
                                   std::string kernel_name = "") {
   if (result != CUDA_SUCCESS) {
     const char* error = nullptr;
     dynload::cuGetErrorString(result, &error);
+#endif
     LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name
                             << " > failed: " << error << " (" << result << ")";
     return false;

@@ -109,13 +119,23 @@ void CUDADeviceCode::CheckAvailableStatus() {
   int nvrtc_major = 0;
   int nvrtc_minor = 0;
+#ifdef PADDLE_WITH_HIP
+  hiprtcResult nvrtc_result =
+      dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor);
+#else
   nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor);
+#endif

   int driver_version = 0;
   int dirver_major = 0;
   int driver_minor = 0;
+#ifdef PADDLE_WITH_HIP
+  hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version);
+  if (driver_result == hipSuccess) {
+#else
   CUresult driver_result = dynload::cuDriverGetVersion(&driver_version);
   if (driver_result == CUDA_SUCCESS) {
+#endif
     dirver_major = driver_version / 1000;
     driver_minor = (driver_version % 1000) / 10;
   }

@@ -123,13 +143,22 @@ void CUDADeviceCode::CheckAvailableStatus() {
   LOG_FIRST_N(INFO, 1) << "CUDA Driver Version: " << dirver_major << "."
                        << driver_minor << "; NVRTC Version: " << nvrtc_major
                        << "." << nvrtc_minor;
+#ifdef PADDLE_WITH_HIP
+  if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) {
+#else
   if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) {
+#endif
     return;
   }

   int count = 0;
+#ifdef PADDLE_WITH_HIP
+  if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count),
+                            "hipGetDeviceCount")) {
+#else
   if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count),
                             "cuDeviceGetCount")) {
+#endif
     available_ = true;
   }
 }

@@ -163,14 +192,20 @@ static std::string FindCUDAIncludePath() {
     }
   }

+#ifdef PADDLE_WITH_HIP
+  cuda_include_path = "/opt/rocm/include";
+#else
   cuda_include_path = "/usr/local/cuda/include";
+#endif
+
   if (stat(cuda_include_path.c_str(), &st) == 0) {
     return cuda_include_path;
   }
-  LOG(WARNING) << "Cannot find CUDA include path."
-               << "Please check whether CUDA is installed in the default "
-                  "installation path, or specify it by export "
-                  "FLAGS_cuda_dir=xxx.";
+  LOG(WARNING) << "Cannot find CUDA or ROCM include path."
+               << "Please check whether CUDA or ROCM is installed in the "
+                  "default installation path, or specify it by export "
+                  "FLAGS_cuda_dir=xxx.";
   return "";
 }

@@ -183,7 +218,11 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
   place_ = place;
   name_ = name;
+#ifdef PADDLE_WITH_HIP
+  kernel_ = "#include <hip/hip_runtime.h>\n" + kernel;
+#else
   kernel_ = kernel;
+#endif
 }

 bool CUDADeviceCode::Compile(bool include_path) {

@@ -193,7 +232,84 @@ bool CUDADeviceCode::Compile(bool include_path) {
         << "NVRTC and CUDA driver are need for JIT compiling of CUDA code.";
     return false;
   }
+#ifdef PADDLE_WITH_HIP
+  hiprtcProgram program;
+  if (!CheckNVRTCResult(dynload::hiprtcCreateProgram(&program,
+                                                     kernel_.c_str(),  // buffer
+                                                     name_.c_str(),    // name
+                                                     0,         // numHeaders
+                                                     nullptr,   // headers
+                                                     nullptr),  // includeNames
+                        "hiprtcCreateProgram")) {
+    return false;
+  }
+
+  // Compile the program for specified compute_capability
+  auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
+      DeviceContextPool::Instance().Get(place_));
+  int compute_capability = dev_ctx->GetComputeCapability();
+  std::vector<const char*> options = {"-std=c++11", "--amdgpu-target=gfx906"};
+  std::string include_option;
+  if (include_path) {
+    std::string cuda_include_path = FindCUDAIncludePath();
+    if (!cuda_include_path.empty()) {
+      include_option = "--include-path=" + cuda_include_path;
+      options.push_back(include_option.c_str());
+    }
+  }
+  hiprtcResult compile_result =
+      dynload::hiprtcCompileProgram(program,          // program
+                                    options.size(),   // numOptions
+                                    options.data());  // options
+  if (compile_result == HIPRTC_ERROR_COMPILATION) {
+    // Obtain compilation log from the program
+    size_t log_size;
+    if (!CheckNVRTCResult(dynload::hiprtcGetProgramLogSize(program, &log_size),
+                          "hiprtcGetProgramLogSize")) {
+      return false;
+    }
+    std::vector<char> log;
+    log.resize(log_size + 1);
+    if (!CheckNVRTCResult(dynload::hiprtcGetProgramLog(program, log.data()),
+                          "hiprtcGetProgramLog")) {
+      return false;
+    }
+    LOG(WARNING) << "JIT compiling of ROCM GPU code failed:"
+                 << "\n  Kernel name: " << name_
+                 << "\n  Kernel body:\n" << kernel_
+                 << "\n  Compiling log: " << log.data();
+    return false;
+  }
+
+  // Obtain PTX from the program for cuda
+  // Obtain Code from the program for hip
+  size_t ptx_size;
+  if (!CheckNVRTCResult(dynload::hiprtcGetCodeSize(program, &ptx_size),
+                        "hiprtcGetCodeSize")) {
+    return false;
+  }
+  ptx_.resize(ptx_size + 1);
+  if (!CheckNVRTCResult(dynload::hiprtcGetCode(program, ptx_.data()),
+                        "hiprtcGetCode")) {
+    return false;
+  }
+
+  if (!CheckNVRTCResult(dynload::hiprtcDestroyProgram(&program),
+                        "hiprtcDestroyProgram")) {
+    return false;
+  }
+
+  if (!CheckCUDADriverResult(dynload::hipModuleLoadData(&module_, ptx_.data()),
+                             "hipModuleLoadData")) {
+    return false;
+  }
+
+  if (!CheckCUDADriverResult(
+          dynload::hipModuleGetFunction(&function_, module_, name_.c_str()),
+          "hipModuleGetFunction")) {
+    return false;
+  }
+#else
   nvrtcProgram program;
   if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program,
                                                     kernel_.c_str(),  // buffer

@@ -271,6 +387,7 @@ bool CUDADeviceCode::Compile(bool include_path) {
           "cuModuleGetFunction", name_)) {
     return false;
   }
+#endif

   max_threads_ = dev_ctx->GetMaxPhysicalThreadCount();
   is_compiled_ = true;

@@ -291,6 +408,18 @@ void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
   auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
       DeviceContextPool::Instance().Get(place_));
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_EQ(
+      dynload::hipModuleLaunchKernel(function_, num_blocks, 1, 1,  // grid dim
+                                     num_threads_, 1, 1,           // block dim
+                                     0,                  // shared memory
+                                     dev_ctx->stream(),  // stream
+                                     args->data(),       // arguments
+                                     nullptr),
+      hipSuccess,
+      errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)",
+                       name_.c_str()));
+#else
   PADDLE_ENFORCE_EQ(
       dynload::cuLaunchKernel(function_, num_blocks, 1, 1,  // grid dim
                               num_threads_, 1, 1,           // block dim

@@ -301,8 +430,19 @@ void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
       CUDA_SUCCESS,
       errors::External("Fail to launch kernel %s (in cuLaunchKernel.)",
                        name_.c_str()));
+#endif
 }

+#ifdef PADDLE_WITH_HIP
+bool CUDADeviceCode::CheckNVRTCResult(hiprtcResult result,
+                                      std::string function) {
+  if (result != HIPRTC_SUCCESS) {
+    LOG_FIRST_N(WARNING, 1)
+        << "Call " << function << " for < " << name_
+        << " > failed: " << dynload::hiprtcGetErrorString(result);
+    return false;
+  }
+#else
 bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result,
                                       std::string function) {
   if (result != NVRTC_SUCCESS) {

@@ -311,6 +451,7 @@ bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result,
         << " > failed: " << dynload::nvrtcGetErrorString(result);
     return false;
   }
+#endif
   return true;
 }
 #endif
paddle/fluid/platform/device_code.h

@@ -25,6 +25,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cuda_driver.h"
 #include "paddle/fluid/platform/dynload/nvrtc.h"
 #endif
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/dynload/hiprtc.h"
+#include "paddle/fluid/platform/dynload/rocm_driver.h"
+#endif

 namespace paddle {
 namespace platform {

@@ -44,7 +48,7 @@ class DeviceCode {
   std::string kernel_;
 };

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 class CUDADeviceCode : public DeviceCode {
  public:
   explicit CUDADeviceCode(const Place& place, const std::string& name,

@@ -61,7 +65,11 @@ class CUDADeviceCode : public DeviceCode {
   static bool IsAvailable() { return available_; }

  private:
+#ifdef PADDLE_WITH_HIP
+  bool CheckNVRTCResult(hiprtcResult result, std::string function);
+#else
   bool CheckNVRTCResult(nvrtcResult result, std::string function);
+#endif

   static bool available_;

@@ -70,8 +78,13 @@ class CUDADeviceCode : public DeviceCode {
   int num_threads_{1024};
   int workload_per_thread_{1};
   std::vector<char> ptx_;
+#ifdef PADDLE_WITH_HIP
+  hipModule_t module_;
+  hipFunction_t function_;
+#else
   CUmodule module_;
   CUfunction function_;
+#endif
 };
 #endif
paddle/fluid/platform/device_code_test.cc

@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/platform/device_code.h"
+#include <utility>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/init.h"

+#ifdef PADDLE_WITH_CUDA
 constexpr auto saxpy_code = R"(
 extern "C" __global__
 void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {

@@ -26,8 +28,22 @@ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {
   }
 }
 )";
+#endif

-#ifdef PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_HIP
+constexpr auto saxpy_code = R"(
+#include <hip/hip_runtime.h>
+extern "C" __global__
+void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {
+  for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n;
+       tid += blockDim.x * gridDim.x) {
+    z[tid] = a * x[tid] + y[tid];
+  }
+}
+)";
+#endif
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(DeviceCode, cuda) {
   if (!paddle::platform::dynload::HasNVRTC() ||
       !paddle::platform::dynload::HasCUDADriver()) {
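For orientation, a hedged sketch of how the JIT pieces fit together at a call site. The constructor, Compile(), and Launch() signatures come from device_code.h/.cc in this commit; buffer allocation is elided, and RunJitSaxpy is a hypothetical helper rather than part of the actual test body, which is collapsed above:

#include <vector>
#include "paddle/fluid/platform/device_code.h"

void RunJitSaxpy(float* x, float* y, float* z, size_t n) {
  paddle::platform::CUDAPlace place(0);
  // "saxpy_kernel" must match the extern "C" name inside saxpy_code.
  paddle::platform::CUDADeviceCode code(place, "saxpy_kernel", saxpy_code);
  if (!code.Compile(/*include_path=*/true)) {
    return;  // NVRTC/HIPRTC or the driver is unavailable on this machine
  }
  float a = 2.0f;
  // x, y, z are assumed to be device pointers allocated by the caller.
  std::vector<void*> args = {&a, &x, &y, &z, &n};
  code.Launch(n, &args);
}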
paddle/fluid/platform/device_context.cc

@@ -12,7 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include <set>

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif

@@ -29,7 +29,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
   }

   if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
         platform::DeviceContextPool::Instance().Get(place));
     auto& desired_dev_ctx =

@@ -65,7 +65,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
 namespace paddle {
 namespace platform {

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 bool allow_tf32_cublas = true;
 void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
 bool AllowTF32Cublas() { return allow_tf32_cublas; }

@@ -122,7 +122,7 @@ DeviceContextPool::DeviceContextPool(
       EmplaceDeviceContext<CPUDeviceContext, CPUPlace>(&device_contexts_, p);
 #endif
     } else if (platform::is_gpu_place(p)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       EmplaceDeviceContext<CUDADeviceContext, CUDAPlace>(&device_contexts_, p);
 #else
       PADDLE_THROW(

@@ -130,7 +130,7 @@ DeviceContextPool::DeviceContextPool(
           "re-compile with WITH_GPU option."));
 #endif
     } else if (platform::is_cuda_pinned_place(p)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       EmplaceDeviceContext<CUDAPinnedDeviceContext, CUDAPinnedPlace>(
           &device_contexts_, p);
 #else

@@ -229,7 +229,7 @@ Place XPUDeviceContext::GetPlace() const { return place_; }
 xpu::Context* XPUDeviceContext::x_context() const { return context_; }
 #endif

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

 class EigenCudaStreamDevice : public Eigen::StreamInterface {
  public:

@@ -238,15 +238,19 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   }
   ~EigenCudaStreamDevice() override {}

-  void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) {
+  void Reinitialize(const gpuStream_t* cuda_stream, CUDAPlace place) {
     stream_ = cuda_stream;
     place_ = place;
     device_prop_ = &Eigen::m_deviceProperties[place.device];
   }

-  const cudaStream_t& stream() const override { return *stream_; }
+  const gpuStream_t& stream() const override { return *stream_; }

+#ifdef PADDLE_WITH_HIP
+  const hipDeviceProp_t& deviceProperties() const override {
+#else
   const cudaDeviceProp& deviceProperties() const override {
+#endif
     return *device_prop_;
   }

@@ -295,16 +299,25 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
       char* scratch =
           static_cast<char*>(scratchpad()) + Eigen::kGpuScratchSize;
 #endif
       semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
+#endif
     }
     return semaphore_;
   }

  private:
   CUDAPlace place_;
-  const cudaStream_t* stream_;  // not owned;
+  const gpuStream_t* stream_;  // not owned;
+#ifdef PADDLE_WITH_HIP
+  const hipDeviceProp_t* device_prop_;
+#else
   const cudaDeviceProp* device_prop_;  // not owned;
+#endif
   mutable void* scratch_;
   mutable unsigned int* semaphore_;
   mutable std::mutex mtx_;  // to protect allocations_

@@ -339,14 +352,18 @@ CUDAContext::CUDAContext(const CUDAPlace& place,
   InitEigenContext();
   InitCuBlasContext();
   InitCuDNNContext();
+#ifndef PADDLE_WITH_HIP
   InitCuSolverContext();
+#endif
 }

 CUDAContext::~CUDAContext() {
   CUDADeviceGuard guard(place_.device);
   DestoryCuDNNContext();
   DestoryCuBlasContext();
+#ifndef PADDLE_WITH_HIP
   DestoryCuSolverContext();
+#endif
 }

 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {

@@ -369,17 +386,29 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
                           << ", Runtime API Version: "
                           << runtime_version_ / 1000 << "."
                           << (runtime_version_ % 100) / 10;
+#ifdef PADDLE_WITH_HIP
+  size_t version_major, version_minor, version_patch;
+  PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
+      &version_major, &version_minor, &version_patch));
+  LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
+                          << ", MIOpen Version: " << version_major << "."
+                          << version_minor << "." << version_patch;
+#else
   size_t cudnn_dso_ver = dynload::cudnnGetVersion();
   LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
                           << ", cuDNN Version: " << cudnn_dso_ver / 1000
                           << "." << (cudnn_dso_ver % 1000) / 100 << ".";
+#endif

   {
     // Check CUDA/CUDNN version compatiblity
     auto local_cuda_version =
         (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
+#ifdef PADDLE_WITH_HIP
+    auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10);
+#else
     auto compile_cuda_version =
         (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
+#endif
     if (local_cuda_version < compile_cuda_version) {
       LOG_FIRST_N(WARNING, 1)
           << "WARNING: device: " << place_.device

@@ -397,7 +426,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
 CUDADeviceContext::~CUDADeviceContext() {
   SetDeviceId(place_.device);
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   if (nccl_comm_) {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
   }

@@ -434,7 +463,11 @@ dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
   return max_grid_dim_size_;
 }

+#ifdef PADDLE_WITH_HIP
+miopenHandle_t CUDADeviceContext::cudnn_handle() const {
+#else
 cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
+#endif
   return context()->CudnnHandle();
 }

@@ -442,13 +475,13 @@ CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
   return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
 }

+#ifndef PADDLE_WITH_HIP
 cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
   return context()->CusolverDnHandle();
 }
+#endif

-cudaStream_t CUDADeviceContext::stream() const { return context()->RawStream(); }
+gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); }

 CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
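The compatibility check above folds each toolkit version into a single major*10+minor integer before comparing. A worked check of that arithmetic; the CUDA_VERSION encoding (major*1000 + minor*10) is standard, while the HIP_VERSION encoding of major*100 + minor (e.g. 309 for HIP 3.9) is an assumption here:

#include <cassert>

int main() {
  // CUDA 10.2: CUDA_VERSION == 10020 -> 10 * 10 + 20 / 10 == 102
  int cuda_version = 10020;
  assert((cuda_version / 1000) * 10 + (cuda_version % 100) / 10 == 102);

  // HIP 3.9: assumed HIP_VERSION == 309 -> 3 * 10 + 9 == 39
  int hip_version = 309;
  assert((hip_version / 100) * 10 + (hip_version % 10) == 39);
  return 0;
}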
paddle/fluid/platform/device_context.h

@@ -30,6 +30,16 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #endif

+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/cuda_helper.h"  // NOLINT
+#include "paddle/fluid/platform/dynload/miopen.h"
+#include "paddle/fluid/platform/dynload/rocblas.h"
+#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/platform/dynload/rccl.h"
+#endif
+#include "paddle/fluid/platform/gpu_info.h"  // NOLINT
+#endif
+
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "xpu/bkcl.h"
 #endif

@@ -44,7 +54,7 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/stream/cuda_stream.h"
 #endif
 #include "unsupported/Eigen/CXX11/Tensor"

@@ -62,7 +72,7 @@ struct GpuDevice;
 namespace paddle {
 namespace platform {

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 /*Set the value of the global variable allow_tf32_cublas*/
 void SetAllowTF32Cublas(bool active);
 /*Get the global variable allow_tf32_cublas value*/

@@ -153,7 +163,7 @@ struct DefaultDeviceContextType<platform::XPUPlace> {
 };
 #endif

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 class CudnnWorkspaceHandle;
 class EigenCudaStreamDevice;

@@ -179,13 +189,19 @@ class CUDAContext {
   const std::unique_ptr<stream::CUDAStream>& Stream() const { return stream_; }

-  const cudaStream_t& RawStream() { return stream_->raw_stream(); }
+  const gpuStream_t& RawStream() { return stream_->raw_stream(); }

+#ifdef PADDLE_WITH_HIP
+  const miopenHandle_t& CudnnHandle() const { return cudnn_handle_; }
+#else
   const cudnnHandle_t& CudnnHandle() const { return cudnn_handle_; }
+#endif

+#ifndef PADDLE_WITH_HIP
   const cusolverDnHandle_t& CusolverDnHandle() const {
     return cusolver_dn_handle_;
   }
+#endif

   const std::unique_ptr<CublasHandleHolder>& CublasHandle() const {
     return cublas_handle_;

@@ -222,6 +238,11 @@ class CUDAContext {
  private:
   void InitEigenContext();

+#ifdef PADDLE_WITH_HIP
+  void InitCuBlasContext() {
+    cublas_handle_.reset(new CublasHandleHolder(RawStream()));
+  }
+#else
   void InitCuBlasContext() {
     cublas_handle_.reset(
         new CublasHandleHolder(RawStream(), CUBLAS_DEFAULT_MATH));

@@ -236,9 +257,32 @@ class CUDAContext {
 #endif  // CUDA_VERSION >= 9000
     }
   }
+#endif

   void InitCuDNNContext() {
     if (dynload::HasCUDNN()) {
+#ifdef PADDLE_WITH_HIP
+      size_t miopen_major, miopen_minor, miopen_patch;
+      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
+          &miopen_major, &miopen_minor, &miopen_patch));
+      auto local_miopen_version =
+          (miopen_major * 1000 + miopen_minor * 100 + miopen_patch) / 100;
+      auto compile_miopen_version = MIOPEN_VERSION / 100;
+      if (local_miopen_version < static_cast<size_t>(compile_miopen_version)) {
+        LOG_FIRST_N(WARNING, 1)
+            << "WARNING: device: " << place_.device
+            << ". The installed Paddle is compiled with MIOPEN "
+            << compile_miopen_version / 10 << "."
+            << compile_miopen_version % 10
+            << ", but MIOPEN version in your machine is "
+            << local_miopen_version / 10 << "." << local_miopen_version % 10
+            << ", which may cause serious incompatible bug. "
+            << "Please recompile or reinstall Paddle with compatible MIOPEN "
+               "version.";
+      }
+      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreate(&cudnn_handle_));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          dynload::miopenSetStream(cudnn_handle_, RawStream()));
+#else
       auto local_cudnn_version = dynload::cudnnGetVersion() / 100;
       auto compile_cudnn_version = CUDNN_VERSION / 100;
       if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {

@@ -255,20 +299,27 @@ class CUDAContext {
       PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
       PADDLE_RETRY_CUDA_SUCCESS(
          dynload::cudnnSetStream(cudnn_handle_, RawStream()));
+#endif
     } else {
       cudnn_handle_ = nullptr;
     }
   }

+#ifndef PADDLE_WITH_HIP
   void InitCuSolverContext() {
     PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_));
     PADDLE_RETRY_CUDA_SUCCESS(
         dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream()));
   }
+#endif

   void DestoryCuDNNContext() {
     if (cudnn_handle_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroy(cudnn_handle_));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_));
+#endif
     }
     cudnn_handle_ = nullptr;
   }

@@ -279,22 +330,30 @@ class CUDAContext {
     cublas_tf32_tensor_core_handle_.reset();
   }

+#ifndef PADDLE_WITH_HIP
   void DestoryCuSolverContext() {
     if (cusolver_dn_handle_) {
       PADDLE_ENFORCE_CUDA_SUCCESS(
           dynload::cusolverDnDestroy(cusolver_dn_handle_));
     }
   }
+#endif

   CUDAPlace place_;
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
   std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
   std::unique_ptr<stream::CUDAStream> stream_;
+#ifdef PADDLE_WITH_HIP
+  miopenHandle_t cudnn_handle_;
+#else
   cudnnHandle_t cudnn_handle_;
+#endif
   std::unique_ptr<CublasHandleHolder> cublas_handle_;
   std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
   std::unique_ptr<CublasHandleHolder> cublas_tf32_tensor_core_handle_;
+#ifndef PADDLE_WITH_HIP
   cusolverDnHandle_t cusolver_dn_handle_;
+#endif
   DISABLE_COPY_AND_ASSIGN(CUDAContext);
 };

@@ -343,8 +402,12 @@ class CUDADeviceContext : public DeviceContext {
     return context()->TensorCoreCublasCallIfAvailable(callback);
   }

-  /*! \brief  Return cudnn handle in the device context. */
+/*! \brief  Return cudnn handle in the device context. */
+#ifdef PADDLE_WITH_HIP
+  miopenHandle_t cudnn_handle() const;
+#else
   cudnnHandle_t cudnn_handle() const;
+#endif

   /*! \brief  Return a cudnn workspace handle to call multiple cudnn
    *  functions without interrupting by other threads.

@@ -355,12 +418,14 @@ class CUDADeviceContext : public DeviceContext {
    *  sequential cudnn function calls. */
   CudnnWorkspaceHandle cudnn_workspace_handle() const;

+#ifndef PADDLE_WITH_HIP
   cusolverDnHandle_t cusolver_dn_handle() const;
+#endif

   /*! \brief  Return cuda stream in the device context. */
-  cudaStream_t stream() const;
+  gpuStream_t stream() const;

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   /*! \brief  Return nccl communicators. */
   ncclComm_t nccl_comm() const { return nccl_comm_; }

@@ -369,7 +434,7 @@ class CUDADeviceContext : public DeviceContext {
 #endif

   template <typename Callback>
-  void RecordEvent(cudaEvent_t ev, Callback callback) const {
+  void RecordEvent(gpuEvent_t ev, Callback callback) const {
     return context()->Stream()->RecordEvent(ev, callback);
   }

@@ -411,7 +476,7 @@ class CUDADeviceContext : public DeviceContext {
   mutable std::mutex cudnn_handle_mtx_;

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   // NCCL communicator (single process version) for NCCL collective operations.
   // NCCL collective operations provides fast collectives over multiple GPUs
   // both within and across nodes.
paddle/fluid/platform/device_context_test.cu

@@ -41,7 +41,11 @@ TEST(Device, CUDADeviceContext) {
     CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
     Eigen::GpuDevice* gpu_device = device_context->eigen_device();
     ASSERT_NE(nullptr, gpu_device);
+#ifdef PADDLE_WITH_HIP
+    miopenHandle_t cudnn_handle = device_context->cudnn_handle();
+#else
     cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
+#endif
     ASSERT_NE(nullptr, cudnn_handle);
     delete device_context;
   }
paddle/fluid/platform/enforce.h

@@ -42,8 +42,7 @@ limitations under the License. */
 #include <miopen/miopen.h>
 #include <rocblas.h>
 #include <thrust/system/hip/error.h>
-#include <thrust/system_error.h>                  // NOLINT
-#include "paddle/fluid/platform/cuda_error.pb.h"  // NOLINT
+#include <thrust/system_error.h>  // NOLINT
 #endif

 #include <fstream>

@@ -1034,11 +1033,6 @@ inline void retry_sleep(unsigned milliseconds) {
 inline bool is_error(hipError_t e) { return e != hipSuccess; }

 inline std::string build_rocm_error_msg(hipError_t e) {
-#if defined(PADDLE_WITH_HIP)
-  int32_t cuda_version = 100;
-#else
-  int32_t cuda_version = -1;
-#endif
   std::ostringstream sout;
   sout << " Hip error(" << e << "), " << hipGetErrorString(e) << ".";
   return sout.str();
paddle/fluid/platform/miopen_desc.h (new file, mode 100644)

// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <functional>
#include <iostream>
#include <iterator>
#include <memory>
#include <numeric>
#include <string>
#include <vector>

#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/miopen_helper.h"

namespace paddle {
namespace framework {
class Tensor;
}  // namespace framework
}  // namespace paddle

namespace paddle {
namespace platform {
using framework::Tensor;

template <typename T>
inline miopenDataType_t ToMIOpenDataType(const T& t) {
  auto type = framework::ToDataType(t);
  return ToMIOpenDataType(type);
}

inline std::vector<int> TransformDimOrder(const std::vector<int>& dims) {
  std::vector<int> transformed_dims(dims.begin(), dims.end());
  int H, W, D, C;
  if (dims.size() == 4) {
    H = dims[1];
    W = dims[2];
    C = dims[3];
    transformed_dims[1] = C;
    transformed_dims[2] = H;
    transformed_dims[3] = W;
  } else {
    D = dims[1];
    H = dims[2];
    W = dims[3];
    C = dims[4];
    transformed_dims[1] = C;
    transformed_dims[2] = D;
    transformed_dims[3] = H;
    transformed_dims[4] = W;
  }
  return transformed_dims;
}

template <>
inline miopenDataType_t ToMIOpenDataType(
    const framework::proto::VarType::Type& t) {
  miopenDataType_t type = miopenFloat;
  switch (t) {
    case framework::proto::VarType::FP16:
      type = miopenHalf;
      break;
    case framework::proto::VarType::FP32:
      type = miopenFloat;
      break;
    default:
      break;
  }
  return type;
}

class ActivationDescriptor {
 public:
  ActivationDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(
        dynload::miopenCreateActivationDescriptor(&desc_));
  }
  ~ActivationDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(
        dynload::miopenDestroyActivationDescriptor(desc_));
  }
  template <typename T>
  void set(miopenActivationMode_t mode, const T& coef) {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor(
        desc_, mode, static_cast<double>(coef), 0.0, 0.0));
  }

  miopenActivationDescriptor_t desc() { return desc_; }
  miopenActivationDescriptor_t desc() const { return desc_; }

 private:
  miopenActivationDescriptor_t desc_;
};

class TensorDescriptor {
 public:
  TensorDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
  }
  ~TensorDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
  }
  miopenTensorDescriptor_t desc() { return desc_; }
  miopenTensorDescriptor_t desc() const { return desc_; }

  void set(const Tensor& tensor, const int groups = 1) {
    auto dims = framework::vectorize<int>(tensor.dims());
    std::vector<int> strides(dims.size());
    strides[dims.size() - 1] = 1;
    for (int i = dims.size() - 2; i >= 0; i--) {
      strides[i] = dims[i + 1] * strides[i + 1];
    }
    std::vector<int> dims_with_group(dims.begin(), dims.end());
    if (groups > 1) {
      dims_with_group[1] = dims_with_group[1] / groups;
    }
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
        desc_, ToMIOpenDataType(tensor.type()),
        static_cast<int>(dims_with_group.size()),
        const_cast<int*>(dims_with_group.data()),
        const_cast<int*>(strides.data())));
  }

  void set(const Tensor& tensor, const miopenTensorFormat_t format) {
    const int groups = 1;
    auto dims = framework::vectorize<int>(tensor.dims());
    std::vector<int> strides(dims.size());
    strides[dims.size() - 1] = 1;
    for (int i = dims.size() - 2; i >= 0; i--) {
      strides[i] = dims[i + 1] * strides[i + 1];
    }
    std::vector<int> dims_with_group(dims.begin(), dims.end());
    if (groups > 1) {
      dims_with_group[1] = dims_with_group[1] / groups;
    }
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
        desc_, ToMIOpenDataType(tensor.type()),
        static_cast<int>(dims_with_group.size()),
        const_cast<int*>(dims_with_group.data()),
        const_cast<int*>(strides.data())));
  }

 private:
  miopenTensorDescriptor_t desc_;
};

class FilterDescriptor {
 public:
  FilterDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
  }
  ~FilterDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
  }
  miopenTensorDescriptor_t desc() { return desc_; }
  miopenTensorDescriptor_t desc() const { return desc_; }

  void set(const Tensor& tensor, const miopenTensorFormat_t format,
           const int groups = 1) {
    auto dims = framework::vectorize<int>(tensor.dims());
    std::vector<int> transformed_dims;
    PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW,
                      platform::errors::InvalidArgument(
                          "format should ONLY be NCHW in MIOPEN."));
    transformed_dims = dims;
    if (groups > 1) {
      transformed_dims[1] = transformed_dims[1] / groups;
    }
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
        desc_, ToMIOpenDataType(tensor.type()),
        static_cast<int>(transformed_dims.size()),
        const_cast<int*>(transformed_dims.data()), nullptr));
  }

 private:
  miopenTensorDescriptor_t desc_;
};

class ConvolutionDescriptor {
 public:
  ConvolutionDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(
        dynload::miopenCreateConvolutionDescriptor(&desc_));
  }
  ~ConvolutionDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(
        dynload::miopenDestroyConvolutionDescriptor(desc_));
  }
  miopenConvolutionDescriptor_t desc() { return desc_; }
  miopenConvolutionDescriptor_t desc() const { return desc_; }

  void set(miopenDataType_t dtype, const std::vector<int>& pads,
           const std::vector<int>& strides, const std::vector<int>& dilations,
           bool allow_tf32, const int groups = 1) {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor(
        desc_, static_cast<int>(pads.size()), const_cast<int*>(pads.data()),
        const_cast<int*>(strides.data()), const_cast<int*>(dilations.data()),
        miopenConvolution));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::miopenSetConvolutionGroupCount(desc_, groups));
  }

 private:
  miopenConvolutionDescriptor_t desc_;
};

}  // namespace platform
}  // namespace paddle
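A standalone check of the row-major stride computation used by TensorDescriptor::set above: strides are built innermost-out, so an NCHW tensor of dims {N, C, H, W} gets strides {C*H*W, H*W, W, 1}. The concrete dims below are hypothetical:

#include <cassert>
#include <vector>

int main() {
  std::vector<int> dims = {8, 3, 32, 32};  // N, C, H, W
  std::vector<int> strides(dims.size());
  strides[dims.size() - 1] = 1;
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; i--) {
    strides[i] = dims[i + 1] * strides[i + 1];
  }
  assert((strides == std::vector<int>{3 * 32 * 32, 32 * 32, 32, 1}));
  return 0;
}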
tools/dockerfile/Dockerfile.rocm

 # A image for building paddle binaries
 # Use rocm-terminal base image for both rocm environment
 # When you modify it, please be aware of rocm version
 #
-# Build: ROCM 3.5.1
+# Build: ROCM 3.9
 # cd Paddle/tools/dockerfile
 # docker build -f Dockerfile.rocm \
-#        --build-arg ROCM_VERSION=3.5.1 \
-#        --build-arg CENTOS_VERSION=7.7.1908 \
-#        -t paddlepaddle/paddle-centos-rocm35-dev:latest .
-#
-# Build: ROCM 3.9.1
-# cd Paddle/tools/dockerfile
-# docker build -f Dockerfile.rocm \
-#        --build-arg ROCM_VERSION=3.9.1 \
-#        --build-arg CENTOS_VERSION=7.8.2003 \
+#        --build-arg ROCM_VERSION=3.9 \
 #        -t paddlepaddle/paddle-centos-rocm39-dev:latest .
 #
-# Run: ROCM 3.5.1
 # docker run -it --device=/dev/kfd --device=/dev/dri \
 #            --security-opt seccomp=unconfined --group-add video \
-#            paddlepaddle/paddle-centos-rocm35-dev:latest /bin/bash
+#            paddlepaddle/paddle-centos-rocm39-dev:latest /bin/bash

-ARG CENTOS_VERSION
-FROM centos:${CENTOS_VERSION}
-ARG CENTOS_VERSION
+FROM centos:7.8.2003

 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>

 ENV LC_ALL en_US.UTF-8

@@ -34,7 +23,7 @@ RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlit
     zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \
     make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel

-# Install devtoolset-7 for ROCM 3.5/3.9
+# Install devtoolset-7
 RUN yum install -y yum-utils centos-release-scl && \
     yum-config-manager --enable rhel-server-rhscl-7-rpms && \
     yum-config-manager --enable rhel-7-server-rpms && \

@@ -70,10 +59,8 @@ ENV ROCM_PATH=/opt/rocm
 ENV HIP_PATH=/opt/rocm/hip
 ENV HIP_CLANG_PATH=/opt/rocm/llvm/bin
 ENV PATH=/opt/rocm/bin:$PATH
-ENV PATH=/opt/rocm/hcc/bin:$PATH
-ENV PATH=/opt/rocm/hip/bin:$PATH
-ENV PATH=/opt/rocm/opencl/bin:$PATH
+ENV PATH=/opt/rocm/llvm/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH

 # git 2.17.1
 RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \

@@ -146,4 +133,12 @@ RUN cd /opt && wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
     ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \
     cd .. && rm -rf ccache-3.7.9.tar.gz && rm -rf ccache-3.7.9

+# configure ssh
+RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \
+    sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \
+    sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config && \
+    sed -i "s/#UseDNS .*/UseDNS no/" /etc/ssh/sshd_config
+
+RUN ssh-keygen -A
+
 EXPOSE 22