机器未来 / Paddle (forked from PaddlePaddle / Paddle)

Unverified commit 93c1d9e7
Authored Feb 08, 2021 by Qi Li; committed via GitHub on Feb 08, 2021.
[ROCM] update fluid platform for rocm39 (part3), test=develop (#30913)
Parent: 15297a06
Showing 19 changed files with 750 additions and 110 deletions (+750 −110).
paddle/fluid/platform/CMakeLists.txt          +43  −16
paddle/fluid/platform/collective_helper.cc     +3   −2
paddle/fluid/platform/collective_helper.h      +2   −2
paddle/fluid/platform/cuda_device_function.h  +27  −12
paddle/fluid/platform/cuda_helper.h           +21   −1
paddle/fluid/platform/cuda_helper_test.cu     +60   −0
paddle/fluid/platform/cuda_primitives.h       +14   −9
paddle/fluid/platform/cuda_resource_pool.cc   +23   −5
paddle/fluid/platform/cuda_resource_pool.h    +10   −2
paddle/fluid/platform/cudnn_desc_test.cc       +4   −0
paddle/fluid/platform/device_code.cc         +151  −10
paddle/fluid/platform/device_code.h           +14   −1
paddle/fluid/platform/device_code_test.cc     +17   −1
paddle/fluid/platform/device_context.cc       +47  −14
paddle/fluid/platform/device_context.h        +74   −9
paddle/fluid/platform/device_context_test.cu   +4   −0
paddle/fluid/platform/enforce.h                +1   −7
paddle/fluid/platform/miopen_desc.h          +221   −0
tools/dockerfile/Dockerfile.rocm              +14  −19
paddle/fluid/platform/CMakeLists.txt (view file @ 93c1d9e7)

@@ -52,7 +52,12 @@ ENDIF()
 cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
-nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
+IF(WITH_GPU)
+    nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
+ENDIF()
+IF(WITH_ROCM)
+    hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
+ENDIF()
 
 cc_library(place SRCS place.cc DEPS enforce boost)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)

@@ -72,7 +77,7 @@ IF(WITH_DGC)
     set(dgc_deps dgc)
 ENDIF()
 
-IF(WITH_GPU)
+IF(WITH_GPU OR WITH_ROCM)
     set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
 ENDIF()

@@ -81,9 +86,14 @@ IF(WITH_MKLDNN)
 ELSE()
     set(MKLDNN_CTX_DEPS)
 ENDIF()
 
-nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
+IF(WITH_GPU)
+    nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
+ENDIF()
+IF(WITH_ROCM)
+    hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
+ENDIF()
 IF(WITH_GPU OR WITH_ROCM)
     set(STREAM_CALLBACK_DEPS stream_callback_manager)
 ELSE()
     set(STREAM_CALLBACK_DEPS)

@@ -103,18 +113,26 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool
 cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
 
-if(WITH_GPU)
+if(WITH_GPU OR WITH_ROCM)
     cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)
     target_link_libraries(device_context cuda_resource_pool)
 endif()
 
-nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 cc_test(init_test SRCS init_test.cc DEPS device_context)
-nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
-nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
-nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
+
+if(WITH_GPU)
+    nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
+    nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
+    nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
+    nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
+endif()
+
+if(WITH_ROCM)
+    hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
+    hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
+    hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda tensor)
+    hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
+endif()
 
 cc_library(timer SRCS timer.cc)
 cc_test(timer_test SRCS timer_test.cc DEPS timer)

@@ -127,25 +145,34 @@ if(WITH_GPU)
     nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda)
     nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
     nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
+elseif(WITH_ROCM)
+    hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
+    hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
+    hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
 else()
     cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
     cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
 endif()
 
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
-nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
 cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor)
-nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
-nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
+IF(WITH_GPU)
+    nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
+    nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
+    nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
+ENDIF()
+IF(WITH_ROCM)
+    hip_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
+    hip_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
+    hip_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
+ENDIF()
 
 if(NOT APPLE AND NOT WIN32)
     cc_library(device_code SRCS device_code.cc DEPS device_context)
-    if(WITH_GPU)
+    if(WITH_GPU OR WITH_ROCM)
         cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor)
     endif()
 endif()
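The CMake changes above follow one recipe: every nv_* target gains a hip_* twin behind WITH_ROCM, and logic shared by both backends moves under WITH_GPU OR WITH_ROCM. In the C++ sources these build options surface as the PADDLE_WITH_CUDA / PADDLE_WITH_HIP macros that drive the hunks below; a minimal sketch of that dispatch pattern (the gpuError_t alias is purely illustrative, not part of this commit):

    // Sketch of the source-level counterpart of WITH_GPU / WITH_ROCM.
    // gpuError_t here is a hypothetical alias used only for illustration.
    #if defined(PADDLE_WITH_CUDA)
    #include <cuda_runtime.h>
    using gpuError_t = cudaError_t;
    #elif defined(PADDLE_WITH_HIP)
    #include <hip/hip_runtime.h>
    using gpuError_t = hipError_t;
    #endif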
paddle/fluid/platform/collective_helper.cc (view file @ 93c1d9e7)

@@ -13,10 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/collective_helper.h"
+#include <utility>
 
 namespace paddle {
 namespace platform {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class NCCLCommImpl : public NCCLComm {
  public:
   void set_ring_id(int ring_id) { ring_id_ = ring_id; }

@@ -35,7 +36,7 @@ class NCCLCommImpl : public NCCLComm {
   void set_comm(ncclComm_t comm) { comm_ = comm; }
   ncclComm_t comm() const override { return comm_; }
 
-  cudaStream_t stream() const override { return dev_ctx_->stream(); }
+  gpuStream_t stream() const override { return dev_ctx_->stream(); }
 
   void set_dev_ctx(std::unique_ptr<CUDADeviceContext>&& dev_ctx) {
     dev_ctx_ = std::move(dev_ctx);
paddle/fluid/platform/collective_helper.h (view file @ 93c1d9e7)

@@ -27,7 +27,7 @@
 namespace paddle {
 namespace platform {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 // In order to apply hierarchical communication with NCCL, we need
 // a communication ring contains NCCL communicators associated to a global
 // ncclUniqueId. E.g. for a hierarchical case,

@@ -56,7 +56,7 @@ class NCCLComm {
   virtual int rank() const = 0;
   virtual int device_id() const = 0;
   virtual ncclComm_t comm() const = 0;
-  virtual cudaStream_t stream() const = 0;
+  virtual gpuStream_t stream() const = 0;
   virtual CUDADeviceContext* dev_context() const = 0;
   virtual ~NCCLComm() = default;
 };
paddle/fluid/platform/cuda_device_function.h (view file @ 93c1d9e7)

@@ -14,10 +14,8 @@ limitations under the License. */
 #pragma once
 
-#include <cuda.h>
 // NOTE(): support float16 to half in header file.
 #define PADDLE_CUDA_FP16
-#include <cuda_fp16.h>
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
 #include "paddle/fluid/platform/float16.h"

@@ -25,6 +23,9 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
+#ifdef PADDLE_WITH_HIP
+#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
+#else
 #if CUDA_VERSION < 9000
 #define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
 #else

@@ -32,6 +33,7 @@ namespace platform {
 #define CREATE_SHFL_MASK(mask, predicate) \
   mask = __ballot_sync(FULL_WARP_MASK, (predicate))
 #endif
+#endif
 
 inline static int RoundToPowerOfTwo(int dim) {
   if (dim > 512) {

@@ -67,7 +69,7 @@ template <typename T>
 __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
                                                  int delta,
                                                  int width = warpSize) {
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
   return __shfl_down(val, delta, width);
 #else
   return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);

@@ -77,7 +79,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
 template <typename T>
 __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
                                                 int width = warpSize) {
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
   return __shfl_xor(val, width);
 #else
   return __shfl_xor_sync(mask, val, width);

@@ -85,18 +87,27 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
 }
 
 // CUDA 9.0 have native compatible float16 shfl_down
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
 template <>
 __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
                                                        float16 val, int delta,
                                                        int width) {
+#ifdef PADDLE_WITH_HIP
+  return float16(__shfl_down(static_cast<float>(val),
+                             static_cast<unsigned>(delta), width));
+#else
   return float16(
       __shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
+#endif
 }
 template <>
 __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
                                                       float16 val, int width) {
+#ifdef PADDLE_WITH_HIP
+  return float16(__shfl_xor(static_cast<float>(val), width));
+#else
   return float16(__shfl_xor(static_cast<half>(val), width));
+#endif
 }
 #else
 template <>

@@ -159,7 +170,7 @@ __forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync(
 template <typename T>
 __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
                                              int width = 32) {
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
   return __shfl(val, src_line, width);
 #else
   return __shfl_sync(mask, val, src_line, width);

@@ -173,13 +184,17 @@ HOSTDEVICE T Infinity() {
 template <typename T>
 __device__ T reduceSum(T val, int tid, int len) {
-  // NOTE(zcd): The warp size should be taken from the
-  // parameters of the GPU but not specified as 32 simply.
-  // To make the reduceSum more efficiently,
-  // I use Warp-Level Parallelism and assume the Warp size
-  // is 32 which may be different for different GPU,
-  // but most card's warp size is 32.
+// NOTE(zcd): The warp size should be taken from the
+// parameters of the GPU but not specified as 32 simply.
+// To make the reduceSum more efficiently,
+// I use Warp-Level Parallelism and assume the Warp size
+// is 32 which may be different for different GPU,
+// but most card's warp size is 32.
+#ifdef PADDLE_WITH_HIP
+  const int warpSize = 64;
+#else
   const int warpSize = 32;
+#endif
   __shared__ T shm[warpSize];
   unsigned mask = 0u;
   CREATE_SHFL_MASK(mask, tid < len);
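The guards above make the shuffle wrappers usable under HIP, where the masked __shfl_*_sync intrinsics do not exist and a wavefront is 64 lanes rather than 32. A short illustrative kernel fragment (not part of the commit) showing how CREATE_SHFL_MASK and CudaShuffleDownSync are typically combined into a warp-level sum:

    // Illustrative warp-sum built on the wrappers patched above.
    template <typename T>
    __device__ T WarpReduceSum(T val, int tid, int len) {
    #ifdef PADDLE_WITH_HIP
      const int kWarpSize = 64;  // AMD wavefront width
    #else
      const int kWarpSize = 32;
    #endif
      unsigned mask = 0u;
      // __ballot on HIP, __ballot_sync on CUDA >= 9.0, 0u on older CUDA
      CREATE_SHFL_MASK(mask, tid < len);
      for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
        val += paddle::platform::CudaShuffleDownSync(mask, val, offset);
      }
      return val;  // lane 0 now holds the warp-wide sum
    }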
paddle/fluid/platform/cuda_helper.h (view file @ 93c1d9e7)

@@ -16,11 +16,16 @@
 #include <mutex>  // NOLINT
 
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cublas.h"
+#endif
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/dynload/rocblas.h"
+#endif
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000
 enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
 #endif

@@ -77,6 +82,12 @@ namespace platform {
 class CublasHandleHolder {
  public:
+#ifdef PADDLE_WITH_HIP
+  explicit CublasHandleHolder(hipStream_t stream) {
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_));
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream));
+  }
+#else
   CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
     PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
     PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));

@@ -92,9 +103,14 @@ class CublasHandleHolder {
     }
 #endif  // CUDA_VERSION >= 9000
   }
+#endif
 
   ~CublasHandleHolder() PADDLE_MAY_THROW {
+#ifdef PADDLE_WITH_HIP
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_));
+#else
     PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
+#endif
   }
 
   template <typename Callback>

@@ -106,7 +122,11 @@ class CublasHandleHolder {
  private:
   DISABLE_COPY_AND_ASSIGN(CublasHandleHolder);
 
+#ifdef PADDLE_WITH_HIP
+  rocblas_handle handle_;
+#else
   cublasHandle_t handle_;
+#endif
   mutable std::mutex mtx_;
 };
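CublasHandleHolder now owns either a rocblas_handle or a cublasHandle_t behind the same interface. The truncated `template <typename Callback>` fragment above belongs to its mutex-guarded call method; a hedged sketch of what that method presumably looks like (the name Call and its body are assumptions, not shown in the hunk):

    // Sketch only: method name and body assumed from the visible fragment.
    template <typename Callback>
    inline void Call(Callback&& callback) const {
      std::lock_guard<std::mutex> guard(mtx_);  // serialize BLAS calls
      callback(handle_);  // handle_ is rocblas_handle or cublasHandle_t
    }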
paddle/fluid/platform/cuda_helper_test.cu (view file @ 93c1d9e7)

@@ -47,8 +47,13 @@ void TestCase(size_t num) {
   T *in1, *in2, *out;
   T *d_in1, *d_in2;
   size_t size = sizeof(T) * num;
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void **>(&d_in1), size);
+  hipMalloc(reinterpret_cast<void **>(&d_in2), size);
+#else
   cudaMalloc(reinterpret_cast<void **>(&d_in1), size);
   cudaMalloc(reinterpret_cast<void **>(&d_in2), size);
+#endif
   in1 = reinterpret_cast<T *>(malloc(size));
   in2 = reinterpret_cast<T *>(malloc(size));
   out = reinterpret_cast<T *>(malloc(size));

@@ -58,12 +63,22 @@ void TestCase(size_t num) {
     in1[i] = static_cast<T>(dist(engine));
     in2[i] = static_cast<T>(dist(engine));
   }
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
+  hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<T>), dim3(1),
+                     dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2, num);
+  hipDeviceSynchronize();
+  hipMemcpy(out, d_in2, size, hipMemcpyDeviceToHost);
+  hipDeviceSynchronize();
+#else
   cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
   cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
   AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
   cudaDeviceSynchronize();
   cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
+#endif
   for (size_t i = 0; i < num; ++i) {
     // NOTE(dzhwinter): the float16 add has small underflow/overflow
     // so we use EXPECT_NEAR to check the result.

@@ -73,8 +88,13 @@ void TestCase(size_t num) {
   free(in1);
   free(in2);
   free(out);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in1);
+  hipFree(d_in2);
+#else
   cudaFree(d_in1);
   cudaFree(d_in2);
+#endif
 }
 
 // cuda primitives

@@ -103,8 +123,13 @@ void TestUnalign(size_t num, const int shift_bit) {
   size_t size = sizeof(uint8_t) * (num + shift_bit);
   size_t array_size = sizeof(float16) * (num / 2);
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void **>(&d_in1), size);
+  hipMalloc(reinterpret_cast<void **>(&d_in2), size);
+#else
   cudaMalloc(reinterpret_cast<void **>(&d_in1), size);
   cudaMalloc(reinterpret_cast<void **>(&d_in2), size);
+#endif
   in1 = reinterpret_cast<float16 *>(malloc(size));
   in2 = reinterpret_cast<float16 *>(malloc(size));
   out = reinterpret_cast<float16 *>(malloc(size));

@@ -121,12 +146,23 @@ void TestUnalign(size_t num, const int shift_bit) {
     r_in1[i] = static_cast<float16>(dist(engine));
     r_in2[i] = static_cast<float16>(dist(engine));
   }
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in1, r_in1, array_size, hipMemcpyHostToDevice);
+  hipMemcpy(d_in2, r_in2, array_size, hipMemcpyHostToDevice);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<float16>), dim3(1),
+                     dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2,
+                     num / 2);
+  hipDeviceSynchronize();
+  hipMemcpy(out, d_in2, array_size, hipMemcpyDeviceToHost);
+  hipDeviceSynchronize();
+#else
   cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
   cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
   AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
   cudaDeviceSynchronize();
   cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
+#endif
   for (size_t i = 0; i < num / 2; ++i) {
     // NOTE(dzhwinter): the float16 add has small truncate error.
     // so we use EXPECT_NEAR to check the result.

@@ -137,8 +173,13 @@ void TestUnalign(size_t num, const int shift_bit) {
   free(in1);
   free(in2);
   free(out);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in1);
+  hipFree(d_in2);
+#else
   cudaFree(d_in1);
   cudaFree(d_in2);
+#endif
 }
 
 TEST(CudaAtomic, float16Unalign) {

@@ -203,8 +244,13 @@ void TestReduce(size_t num, float atol = 0.01) {
   T *in1;
   T *d_in1, *d_in2;
   size_t size = sizeof(T) * num;
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void **>(&d_in1), size);
+  hipMalloc(reinterpret_cast<void **>(&d_in2), sizeof(T));
+#else
   cudaMalloc(reinterpret_cast<void **>(&d_in1), size);
   cudaMalloc(reinterpret_cast<void **>(&d_in2), sizeof(T));
+#endif
   in1 = reinterpret_cast<T *>(malloc(size));
   std::minstd_rand engine;
   std::uniform_real_distribution<double> dist(0.0, 1.0);

@@ -212,17 +258,31 @@ void TestReduce(size_t num, float atol = 0.01) {
     in1[i] = static_cast<T>(dist(engine));
   }
   auto out = std::accumulate(in1, in1 + num, static_cast<T>(0));
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
+  hipDeviceSynchronize();
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(DeviceReduceSum<T>), dim3(1),
+                     dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2, num);
+  hipMemcpy(in1, d_in2, sizeof(T), hipMemcpyDeviceToHost);
+  hipDeviceSynchronize();
+#else
   cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
   cudaDeviceSynchronize();
   DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
   cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
+#endif
   // NOTE(dzhwinter): the float16 add has small underflow/overflow
   // so we use EXPECT_NEAR to check the result.
   EXPECT_NEAR(static_cast<float>(in1[0]), static_cast<float>(out), atol);
   free(in1);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in1);
+  hipFree(d_in2);
+#else
   cudaFree(d_in1);
   cudaFree(d_in2);
+#endif
 }
 
 TEST(CudaShuffleSync, float16) {
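The test changes above replace CUDA's triple-chevron launches with hipLaunchKernelGGL, which passes grid/block as dim3 values plus explicit shared-memory and stream arguments. The two launch forms side by side, as a sketch:

    // Equivalent launches of AddKernel<T> on one block of PADDLE_CUDA_NUM_THREADS:
    #ifdef PADDLE_WITH_HIP
      hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<T>), dim3(1),
                         dim3(PADDLE_CUDA_NUM_THREADS),
                         0 /*shared mem*/, 0 /*default stream*/,
                         d_in1, d_in2, num);
    #else
      AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS,
                     0 /*shared mem*/, 0 /*default stream*/>>>(d_in1, d_in2, num);
    #endif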
paddle/fluid/platform/cuda_primitives.h (view file @ 93c1d9e7)

@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
+#ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
 #include <stdio.h>
 
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"

@@ -50,7 +55,7 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) {
       static_cast<unsigned long long int>(val));  // NOLINT
 }
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600)
 USE_CUDA_ATOMIC(Add, double);
 #else
 CUDA_ATOMIC_WRAPPER(Add, double) {

@@ -149,12 +154,12 @@ USE_CUDA_ATOMIC(Max, int);
 USE_CUDA_ATOMIC(Max, unsigned int);
 // CUDA API uses unsigned long long int, we cannot use uint64_t here.
 // It because unsigned long long int is not necessarily uint64_t
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
 USE_CUDA_ATOMIC(Max, unsigned long long int);  // NOLINT
 #else
 CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) {  // NOLINT
   if (*address >= val) {
-    return;
+    return *address;
   }
 
   unsigned long long int old = *address, assumed;  // NOLINT

@@ -181,7 +186,7 @@ CUDA_ATOMIC_WRAPPER(Max, int64_t) {
 CUDA_ATOMIC_WRAPPER(Max, float) {
   if (*address >= val) {
-    return;
+    return *address;
   }
 
   int *const address_as_i = reinterpret_cast<int *>(address);

@@ -199,7 +204,7 @@ CUDA_ATOMIC_WRAPPER(Max, float) {
 CUDA_ATOMIC_WRAPPER(Max, double) {
   if (*address >= val) {
-    return;
+    return *address;
   }
 
   unsigned long long int *const address_as_ull =  // NOLINT

@@ -221,12 +226,12 @@ USE_CUDA_ATOMIC(Min, int);
 USE_CUDA_ATOMIC(Min, unsigned int);
 // CUDA API uses unsigned long long int, we cannot use uint64_t here.
 // It because unsigned long long int is not necessarily uint64_t
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
 USE_CUDA_ATOMIC(Min, unsigned long long int);  // NOLINT
 #else
 CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) {  // NOLINT
   if (*address <= val) {
-    return;
+    return *address;
   }
 
   unsigned long long int old = *address, assumed;  // NOLINT

@@ -253,7 +258,7 @@ CUDA_ATOMIC_WRAPPER(Min, int64_t) {
 CUDA_ATOMIC_WRAPPER(Min, float) {
   if (*address <= val) {
-    return;
+    return *address;
   }
 
   int *const address_as_i = reinterpret_cast<int *>(address);

@@ -271,7 +276,7 @@ CUDA_ATOMIC_WRAPPER(Min, float) {
 CUDA_ATOMIC_WRAPPER(Min, double) {
   if (*address <= val) {
-    return;
+    return *address;
   }
 
   unsigned long long int *const address_as_ull =  // NOLINT
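Besides widening the architecture guards to __HIPCC__, these hunks fix a genuine bug: CUDA_ATOMIC_WRAPPER(Max, ...) expands to a function that returns T, so the early exit must `return *address;` rather than a bare `return;`. The fallback the wrapper guards is the usual compare-and-swap loop; a standalone sketch of the float case:

    // Sketch of the CAS fallback behind CUDA_ATOMIC_WRAPPER(Max, float).
    __device__ float AtomicMaxFloat(float* address, float val) {
      if (*address >= val) {
        return *address;  // the early-return fix made in this commit
      }
      int* const address_as_i = reinterpret_cast<int*>(address);
      int old = *address_as_i, assumed;
      do {
        assumed = old;
        if (__int_as_float(assumed) >= val) break;  // another thread won
        old = atomicCAS(address_as_i, assumed, __float_as_int(val));
      } while (assumed != old);
      return __int_as_float(old);
    }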
paddle/fluid/platform/cuda_resource_pool.cc (view file @ 93c1d9e7)

@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_resource_pool.h"
 #include "paddle/fluid/platform/gpu_info.h"

@@ -25,15 +25,24 @@ CudaStreamResourcePool::CudaStreamResourcePool() {
   for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
     auto creator = [dev_idx] {
       platform::SetDeviceId(dev_idx);
-      cudaStream_t stream;
+      gpuStream_t stream;
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+#endif
       return stream;
     };
 
-    auto deleter = [dev_idx](cudaStream_t stream) {
+    auto deleter = [dev_idx](gpuStream_t stream) {
       platform::SetDeviceId(dev_idx);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream));
+#endif
     };
 
     pool_.emplace_back(

@@ -65,15 +74,24 @@ CudaEventResourcePool::CudaEventResourcePool() {
   for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
     auto creator = [dev_idx] {
       platform::SetDeviceId(dev_idx);
-      cudaEvent_t event;
+      gpuEvent_t event;
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipEventCreateWithFlags(&event, hipEventDisableTiming));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+#endif
       return event;
     };
 
-    auto deleter = [dev_idx](cudaEvent_t event) {
+    auto deleter = [dev_idx](gpuEvent_t event) {
       platform::SetDeviceId(dev_idx);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event));
+#endif
     };
 
     pool_.emplace_back(ResourcePool<CudaEventObject>::Create(creator, deleter));
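Keeping the CUDA/HIP difference inside the creator/deleter lambdas leaves the surrounding ResourcePool machinery backend-neutral. A hedged usage sketch (Instance()/New() are inferred from this file's surroundings, not shown in the hunks):

    // Hedged sketch: borrow a non-blocking stream for device 0 from the pool;
    // the shared_ptr's custom deleter returns it when the holder dies.
    auto holder = paddle::platform::CudaStreamResourcePool::Instance().New(0);
    gpuStream_t stream = holder.get();  // CudaStreamObject* is exactly gpuStream_t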
paddle/fluid/platform/cuda_resource_pool.h (view file @ 93c1d9e7)

@@ -14,9 +14,17 @@
 #pragma once
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
+#ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
 
 #include <memory>
 #include <type_traits>
 #include <vector>

@@ -26,8 +34,8 @@
 namespace paddle {
 namespace platform {
 
-using CudaStreamObject = std::remove_pointer<cudaStream_t>::type;
-using CudaEventObject = std::remove_pointer<cudaEvent_t>::type;
+using CudaStreamObject = std::remove_pointer<gpuStream_t>::type;
+using CudaEventObject = std::remove_pointer<gpuEvent_t>::type;
 
 class CudaStreamResourcePool {
  public:
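The using-declarations now key off gpuStream_t / gpuEvent_t, unified aliases that presumably resolve to the CUDA or HIP handle types depending on the build. Their definitions live elsewhere in the platform headers; the assumed shape:

    // Assumed definitions of the unified aliases (not part of this diff).
    #ifdef PADDLE_WITH_HIP
    using gpuStream_t = hipStream_t;
    using gpuEvent_t = hipEvent_t;
    #else
    using gpuStream_t = cudaStream_t;
    using gpuEvent_t = cudaEvent_t;
    #endif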
paddle/fluid/platform/cudnn_desc_test.cc (view file @ 93c1d9e7)

@@ -12,7 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/miopen_desc.h"
+#else
 #include "paddle/fluid/platform/cudnn_desc.h"
+#endif
 
 #include <gtest/gtest.h>
paddle/fluid/platform/device_code.cc (view file @ 93c1d9e7)

@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/platform/device_code.h"
+#include <sys/stat.h>
 #include <algorithm>
 #include <set>
 #include <utility>
-#include "paddle/fluid/platform/device_code.h"
 #include "paddle/fluid/platform/enforce.h"
 
 DECLARE_string(cuda_dir);

@@ -71,26 +72,35 @@ DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) {
   }
 
   for (auto& p : set) {
     if (is_gpu_place(p)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       device_codes_.emplace(p, DeviceCodeMap());
 #else
       PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "CUDAPlace is not supported, please re-compile with WITH_GPU=ON."));
+          "CUDAPlace or HIPPlace is not supported, please re-compile with "
+          "WITH_GPU=ON or WITH_ROCM=ON."));
 #endif
     }
   }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   CUDADeviceCode::CheckAvailableStatus();
 #endif
 }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#ifdef PADDLE_WITH_HIP
+static bool CheckCUDADriverResult(hipError_t result, std::string caller,
+                                  std::string kernel_name = "") {
+  if (result != hipSuccess) {
+    const char* error = nullptr;
+    error = dynload::hipGetErrorString(result);
+#else
 static bool CheckCUDADriverResult(CUresult result, std::string caller,
                                   std::string kernel_name = "") {
   if (result != CUDA_SUCCESS) {
     const char* error = nullptr;
     dynload::cuGetErrorString(result, &error);
+#endif
     LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name
                             << " > failed: " << error << " (" << result << ")";
     return false;

@@ -109,13 +119,23 @@ void CUDADeviceCode::CheckAvailableStatus() {
   int nvrtc_major = 0;
   int nvrtc_minor = 0;
+#ifdef PADDLE_WITH_HIP
+  hiprtcResult nvrtc_result =
+      dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor);
+#else
   nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor);
+#endif
 
   int driver_version = 0;
   int dirver_major = 0;
   int driver_minor = 0;
+#ifdef PADDLE_WITH_HIP
+  hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version);
+  if (driver_result == hipSuccess) {
+#else
   CUresult driver_result = dynload::cuDriverGetVersion(&driver_version);
   if (driver_result == CUDA_SUCCESS) {
+#endif
     dirver_major = driver_version / 1000;
     driver_minor = (driver_version % 1000) / 10;
   }

@@ -123,13 +143,22 @@ void CUDADeviceCode::CheckAvailableStatus() {
   LOG_FIRST_N(INFO, 1) << "CUDA Driver Version: " << dirver_major << "."
                        << driver_minor << "; NVRTC Version: " << nvrtc_major
                        << "." << nvrtc_minor;
+#ifdef PADDLE_WITH_HIP
+  if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) {
+#else
   if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) {
+#endif
     return;
   }
 
   int count = 0;
+#ifdef PADDLE_WITH_HIP
+  if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count),
+                            "hipGetDeviceCount")) {
+#else
   if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count),
                             "cuDeviceGetCount")) {
+#endif
     available_ = true;
   }
 }

@@ -163,14 +192,20 @@ static std::string FindCUDAIncludePath() {
     }
   }
 
+#ifdef PADDLE_WITH_HIP
+  cuda_include_path = "/opt/rocm/include";
+#else
   cuda_include_path = "/usr/local/cuda/include";
+#endif
+
   if (stat(cuda_include_path.c_str(), &st) == 0) {
     return cuda_include_path;
   }
-  LOG(WARNING) << "Cannot find CUDA include path."
-               << "Please check whether CUDA is installed in the default "
-                  "installation path, or specify it by export "
-                  "FLAGS_cuda_dir=xxx.";
+  LOG(WARNING) << "Cannot find CUDA or ROCM include path."
+               << "Please check whether CUDA or ROCM is installed in the "
+                  "default installation path, or specify it by export "
+                  "FLAGS_cuda_dir=xxx.";
   return "";
 }

@@ -183,7 +218,11 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
   place_ = place;
   name_ = name;
+#ifdef PADDLE_WITH_HIP
+  kernel_ = "#include <hip/hip_runtime.h>\n" + kernel;
+#else
   kernel_ = kernel;
+#endif
 }
 
 bool CUDADeviceCode::Compile(bool include_path) {

@@ -193,7 +232,84 @@ bool CUDADeviceCode::Compile(bool include_path) {
         << "NVRTC and CUDA driver are need for JIT compiling of CUDA code.";
     return false;
   }
+#ifdef PADDLE_WITH_HIP
+  hiprtcProgram program;
+  if (!CheckNVRTCResult(dynload::hiprtcCreateProgram(&program,
+                                                     kernel_.c_str(),  // buffer
+                                                     name_.c_str(),    // name
+                                                     0,         // numHeaders
+                                                     nullptr,   // headers
+                                                     nullptr),  // includeNames
+                        "hiprtcCreateProgram")) {
+    return false;
+  }
+
+  // Compile the program for specified compute_capability
+  auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
+      DeviceContextPool::Instance().Get(place_));
+  int compute_capability = dev_ctx->GetComputeCapability();
+  std::vector<const char*> options = {"-std=c++11", "--amdgpu-target=gfx906"};
+  std::string include_option;
+  if (include_path) {
+    std::string cuda_include_path = FindCUDAIncludePath();
+    if (!cuda_include_path.empty()) {
+      include_option = "--include-path=" + cuda_include_path;
+      options.push_back(include_option.c_str());
+    }
+  }
+  hiprtcResult compile_result =
+      dynload::hiprtcCompileProgram(program,          // program
+                                    options.size(),   // numOptions
+                                    options.data());  // options
+  if (compile_result == HIPRTC_ERROR_COMPILATION) {
+    // Obtain compilation log from the program
+    size_t log_size;
+    if (!CheckNVRTCResult(dynload::hiprtcGetProgramLogSize(program, &log_size),
+                          "hiprtcGetProgramLogSize")) {
+      return false;
+    }
+    std::vector<char> log;
+    log.resize(log_size + 1);
+    if (!CheckNVRTCResult(dynload::hiprtcGetProgramLog(program, log.data()),
+                          "hiprtcGetProgramLog")) {
+      return false;
+    }
+    LOG(WARNING) << "JIT compiling of ROCM GPU code failed:"
+                 << "\n  Kernel name: " << name_
+                 << "\n  Kernel body:\n" << kernel_
+                 << "\n  Compiling log: " << log.data();
+    return false;
+  }
+
+  // Obtain PTX from the program for cuda
+  // Obtain Code from the program for hip
+  size_t ptx_size;
+  if (!CheckNVRTCResult(dynload::hiprtcGetCodeSize(program, &ptx_size),
+                        "hiprtcGetCodeSize")) {
+    return false;
+  }
+  ptx_.resize(ptx_size + 1);
+  if (!CheckNVRTCResult(dynload::hiprtcGetCode(program, ptx_.data()),
+                        "hiprtcGetCode")) {
+    return false;
+  }
+
+  if (!CheckNVRTCResult(dynload::hiprtcDestroyProgram(&program),
+                        "hiprtcDestroyProgram")) {
+    return false;
+  }
+
+  if (!CheckCUDADriverResult(dynload::hipModuleLoadData(&module_, ptx_.data()),
+                             "hipModuleLoadData")) {
+    return false;
+  }
+
+  if (!CheckCUDADriverResult(
+          dynload::hipModuleGetFunction(&function_, module_, name_.c_str()),
+          "hipModuleGetFunction")) {
+    return false;
+  }
+#else
   nvrtcProgram program;
   if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program,
                                                     kernel_.c_str(),  // buffer

@@ -271,6 +387,7 @@ bool CUDADeviceCode::Compile(bool include_path) {
                              "cuModuleGetFunction", name_)) {
     return false;
   }
+#endif
 
   max_threads_ = dev_ctx->GetMaxPhysicalThreadCount();
   is_compiled_ = true;

@@ -291,6 +408,18 @@ void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
   auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
       DeviceContextPool::Instance().Get(place_));
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_EQ(
+      dynload::hipModuleLaunchKernel(function_, num_blocks, 1, 1,  // grid dim
+                                     num_threads_, 1, 1,           // block dim
+                                     0,                  // shared memory
+                                     dev_ctx->stream(),  // stream
+                                     args->data(),       // arguments
+                                     nullptr),
+      hipSuccess,
+      errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)",
+                       name_.c_str()));
+#else
   PADDLE_ENFORCE_EQ(
       dynload::cuLaunchKernel(function_, num_blocks, 1, 1,  // grid dim
                               num_threads_, 1, 1,           // block dim

@@ -301,8 +430,19 @@ void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
       CUDA_SUCCESS,
       errors::External("Fail to launch kernel %s (in cuLaunchKernel.)",
                        name_.c_str()));
+#endif
 }
 
+#ifdef PADDLE_WITH_HIP
+bool CUDADeviceCode::CheckNVRTCResult(hiprtcResult result,
+                                      std::string function) {
+  if (result != HIPRTC_SUCCESS) {
+    LOG_FIRST_N(WARNING, 1)
+        << "Call " << function << " for < " << name_
+        << " > failed: " << dynload::hiprtcGetErrorString(result);
+    return false;
+  }
+#else
 bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result,
                                       std::string function) {
   if (result != NVRTC_SUCCESS) {

@@ -311,6 +451,7 @@ bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result,
         << " > failed: " << dynload::nvrtcGetErrorString(result);
     return false;
   }
+#endif
   return true;
 }
 #endif
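With the hiprtc path in place, JIT compilation is symmetric across backends. A hedged end-to-end sketch based on the constructor, Compile(), and Launch() shown above (the third constructor argument is assumed to be the kernel source; kernel_source, d_x, d_y, d_z, and n are placeholders):

    // Hedged sketch: JIT-compile and launch a kernel via CUDADeviceCode.
    paddle::platform::CUDAPlace place(0);
    paddle::platform::CUDADeviceCode code(place, "saxpy_kernel", kernel_source);
    if (code.Compile(/*include_path=*/true)) {
      float a = 2.0f;
      std::vector<void*> args = {&a, &d_x, &d_y, &d_z, &n};  // device buffers
      code.Launch(n, &args);  // hipModuleLaunchKernel / cuLaunchKernel inside
    }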
paddle/fluid/platform/device_code.h (view file @ 93c1d9e7)

@@ -25,6 +25,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cuda_driver.h"
 #include "paddle/fluid/platform/dynload/nvrtc.h"
 #endif
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/dynload/hiprtc.h"
+#include "paddle/fluid/platform/dynload/rocm_driver.h"
+#endif
 
 namespace paddle {
 namespace platform {

@@ -44,7 +48,7 @@ class DeviceCode {
   std::string kernel_;
 };
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 class CUDADeviceCode : public DeviceCode {
  public:
   explicit CUDADeviceCode(const Place& place, const std::string& name,

@@ -61,7 +65,11 @@ class CUDADeviceCode : public DeviceCode {
   static bool IsAvailable() { return available_; }
 
  private:
+#ifdef PADDLE_WITH_HIP
+  bool CheckNVRTCResult(hiprtcResult result, std::string function);
+#else
   bool CheckNVRTCResult(nvrtcResult result, std::string function);
+#endif
 
   static bool available_;

@@ -70,8 +78,13 @@ class CUDADeviceCode : public DeviceCode {
   int num_threads_{1024};
   int workload_per_thread_{1};
   std::vector<char> ptx_;
+#ifdef PADDLE_WITH_HIP
+  hipModule_t module_;
+  hipFunction_t function_;
+#else
   CUmodule module_;
   CUfunction function_;
+#endif
 };
 #endif
paddle/fluid/platform/device_code_test.cc (view file @ 93c1d9e7)

@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/device_code.h"
 #include <utility>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/init.h"
 
+#ifdef PADDLE_WITH_CUDA
 constexpr auto saxpy_code = R"(
 extern "C" __global__
 void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {

@@ -26,8 +28,22 @@ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {
   }
 }
 )";
+#endif
 
-#ifdef PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_HIP
+constexpr auto saxpy_code = R"(
+#include <hip/hip_runtime.h>
+extern "C" __global__
+void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {
+  for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n;
+       tid += blockDim.x * gridDim.x) {
+    z[tid] = a * x[tid] + y[tid];
+  }
+}
+)";
+#endif
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(DeviceCode, cuda) {
   if (!paddle::platform::dynload::HasNVRTC() ||
       !paddle::platform::dynload::HasCUDADriver()) {
paddle/fluid/platform/device_context.cc (view file @ 93c1d9e7)

@@ -12,7 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include <set>
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif

@@ -29,7 +29,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
   }
 
   if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
         platform::DeviceContextPool::Instance().Get(place));
     auto& desired_dev_ctx =

@@ -65,7 +65,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
 namespace paddle {
 namespace platform {
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 bool allow_tf32_cublas = true;
 void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
 bool AllowTF32Cublas() { return allow_tf32_cublas; }

@@ -122,7 +122,7 @@ DeviceContextPool::DeviceContextPool(
       EmplaceDeviceContext<CPUDeviceContext, CPUPlace>(&device_contexts_, p);
 #endif
     } else if (platform::is_gpu_place(p)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       EmplaceDeviceContext<CUDADeviceContext, CUDAPlace>(&device_contexts_, p);
 #else
       PADDLE_THROW(

@@ -130,7 +130,7 @@ DeviceContextPool::DeviceContextPool(
           "re-compile with WITH_GPU option."));
 #endif
     } else if (platform::is_cuda_pinned_place(p)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       EmplaceDeviceContext<CUDAPinnedDeviceContext, CUDAPinnedPlace>(
           &device_contexts_, p);
 #else

@@ -229,7 +229,7 @@ Place XPUDeviceContext::GetPlace() const { return place_; }
 xpu::Context* XPUDeviceContext::x_context() const { return context_; }
 #endif
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
 class EigenCudaStreamDevice : public Eigen::StreamInterface {
  public:

@@ -238,15 +238,19 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   }
   ~EigenCudaStreamDevice() override {}
 
-  void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) {
+  void Reinitialize(const gpuStream_t* cuda_stream, CUDAPlace place) {
     stream_ = cuda_stream;
     place_ = place;
     device_prop_ = &Eigen::m_deviceProperties[place.device];
   }
 
-  const cudaStream_t& stream() const override { return *stream_; }
+  const gpuStream_t& stream() const override { return *stream_; }
 
+#ifdef PADDLE_WITH_HIP
+  const hipDeviceProp_t& deviceProperties() const override {
+#else
   const cudaDeviceProp& deviceProperties() const override {
+#endif
     return *device_prop_;
   }

@@ -295,16 +299,25 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
       char* scratch = static_cast<char*>(scratchpad()) + Eigen::kGpuScratchSize;
 #endif
       semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
+#endif
     }
     return semaphore_;
   }
 
  private:
   CUDAPlace place_;
-  const cudaStream_t* stream_;  // not owned;
+  const gpuStream_t* stream_;  // not owned;
+#ifdef PADDLE_WITH_HIP
+  const hipDeviceProp_t* device_prop_;
+#else
   const cudaDeviceProp* device_prop_;  // not owned;
+#endif
   mutable void* scratch_;
   mutable unsigned int* semaphore_;
   mutable std::mutex mtx_;  // to protect allocations_

@@ -339,14 +352,18 @@ CUDAContext::CUDAContext(const CUDAPlace& place,
   InitEigenContext();
   InitCuBlasContext();
   InitCuDNNContext();
+#ifndef PADDLE_WITH_HIP
   InitCuSolverContext();
+#endif
 }
 
 CUDAContext::~CUDAContext() {
   CUDADeviceGuard guard(place_.device);
   DestoryCuDNNContext();
   DestoryCuBlasContext();
+#ifndef PADDLE_WITH_HIP
   DestoryCuSolverContext();
+#endif
 }
 
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {

@@ -369,17 +386,29 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
                           << ", Runtime API Version: "
                           << runtime_version_ / 1000 << "."
                           << (runtime_version_ % 100) / 10;
+#ifdef PADDLE_WITH_HIP
+  size_t version_major, version_minor, version_patch;
+  PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
+      &version_major, &version_minor, &version_patch));
+  LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
+                          << ", MIOpen Version: " << version_major << "."
+                          << version_minor << "." << version_patch;
+#else
   size_t cudnn_dso_ver = dynload::cudnnGetVersion();
   LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
                           << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
                           << (cudnn_dso_ver % 1000) / 100 << ".";
+#endif
   {
     // Check CUDA/CUDNN version compatiblity
     auto local_cuda_version =
         (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
+#ifdef PADDLE_WITH_HIP
+    auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10);
+#else
     auto compile_cuda_version =
         (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
+#endif
     if (local_cuda_version < compile_cuda_version) {
       LOG_FIRST_N(WARNING, 1)
           << "WARNING: device: " << place_.device

@@ -397,7 +426,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
 CUDADeviceContext::~CUDADeviceContext() {
   SetDeviceId(place_.device);
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   if (nccl_comm_) {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
   }

@@ -434,7 +463,11 @@ dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
   return max_grid_dim_size_;
 }
 
+#ifdef PADDLE_WITH_HIP
+miopenHandle_t CUDADeviceContext::cudnn_handle() const {
+#else
 cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
+#endif
   return context()->CudnnHandle();
 }

@@ -442,13 +475,13 @@ CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
   return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
 }
 
+#ifndef PADDLE_WITH_HIP
 cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
   return context()->CusolverDnHandle();
 }
+#endif
 
-cudaStream_t CUDADeviceContext::stream() const {
-  return context()->RawStream();
-}
+gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); }
 
 CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
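The version logging above packs versions into integers of the form major*1000 + minor*100 (+ patch). A worked example of the cuDNN branch:

    // cudnnGetVersion() encodes e.g. cuDNN 7.6.5 as 7605:
    size_t cudnn_dso_ver = 7605;
    size_t major = cudnn_dso_ver / 1000;          // 7
    size_t minor = (cudnn_dso_ver % 1000) / 100;  // 6
    // logged as "cuDNN Version: 7.6."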
paddle/fluid/platform/device_context.h (view file @ 93c1d9e7)

@@ -30,6 +30,16 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
 
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/cuda_helper.h"  // NOLINT
+#include "paddle/fluid/platform/dynload/miopen.h"
+#include "paddle/fluid/platform/dynload/rocblas.h"
+#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/platform/dynload/rccl.h"
+#endif
+#include "paddle/fluid/platform/gpu_info.h"  // NOLINT
+#endif
+
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "xpu/bkcl.h"
 #endif

@@ -44,7 +54,7 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/stream/cuda_stream.h"
 #endif
 #include "unsupported/Eigen/CXX11/Tensor"

@@ -62,7 +72,7 @@ struct GpuDevice;
 namespace paddle {
 namespace platform {
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 /*Set the value of the global variable allow_tf32_cublas*/
 void SetAllowTF32Cublas(bool active);
 /*Get the global variable allow_tf32_cublas value*/

@@ -153,7 +163,7 @@ struct DefaultDeviceContextType<platform::XPUPlace> {
 };
 #endif
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 class CudnnWorkspaceHandle;
 class EigenCudaStreamDevice;

@@ -179,13 +189,19 @@ class CUDAContext {
   const std::unique_ptr<stream::CUDAStream>& Stream() const { return stream_; }
 
-  const cudaStream_t& RawStream() { return stream_->raw_stream(); }
+  const gpuStream_t& RawStream() { return stream_->raw_stream(); }
 
+#ifdef PADDLE_WITH_HIP
+  const miopenHandle_t& CudnnHandle() const { return cudnn_handle_; }
+#else
   const cudnnHandle_t& CudnnHandle() const { return cudnn_handle_; }
+#endif
 
+#ifndef PADDLE_WITH_HIP
   const cusolverDnHandle_t& CusolverDnHandle() const {
     return cusolver_dn_handle_;
   }
+#endif
 
   const std::unique_ptr<CublasHandleHolder>& CublasHandle() const {
     return cublas_handle_;

@@ -222,6 +238,11 @@ class CUDAContext {
  private:
   void InitEigenContext();
 
+#ifdef PADDLE_WITH_HIP
+  void InitCuBlasContext() {
+    cublas_handle_.reset(new CublasHandleHolder(RawStream()));
+  }
+#else
   void InitCuBlasContext() {
     cublas_handle_.reset(
         new CublasHandleHolder(RawStream(), CUBLAS_DEFAULT_MATH));

@@ -236,9 +257,32 @@ class CUDAContext {
 #endif  // CUDA_VERSION >= 9000
     }
   }
+#endif
 
   void InitCuDNNContext() {
     if (dynload::HasCUDNN()) {
+#ifdef PADDLE_WITH_HIP
+      size_t miopen_major, miopen_minor, miopen_patch;
+      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
+          &miopen_major, &miopen_minor, &miopen_patch));
+      auto local_miopen_version =
+          (miopen_major * 1000 + miopen_minor * 100 + miopen_patch) / 100;
+      auto compile_miopen_version = MIOPEN_VERSION / 100;
+      if (local_miopen_version < static_cast<size_t>(compile_miopen_version)) {
+        LOG_FIRST_N(WARNING, 1)
+            << "WARNING: device: " << place_.device
+            << ". The installed Paddle is compiled with MIOPEN "
+            << compile_miopen_version / 10 << "."
+            << compile_miopen_version % 10
+            << ", but MIOPEN version in your machine is "
+            << local_miopen_version / 10 << "." << local_miopen_version % 10
+            << ", which may cause serious incompatible bug. "
+            << "Please recompile or reinstall Paddle with compatible MIOPEN "
+               "version.";
+      }
+      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreate(&cudnn_handle_));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          dynload::miopenSetStream(cudnn_handle_, RawStream()));
+#else
       auto local_cudnn_version = dynload::cudnnGetVersion() / 100;
       auto compile_cudnn_version = CUDNN_VERSION / 100;
       if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {

@@ -255,20 +299,27 @@ class CUDAContext {
       PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
       PADDLE_RETRY_CUDA_SUCCESS(
           dynload::cudnnSetStream(cudnn_handle_, RawStream()));
+#endif
     } else {
       cudnn_handle_ = nullptr;
     }
   }
 
+#ifndef PADDLE_WITH_HIP
   void InitCuSolverContext() {
     PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_));
     PADDLE_RETRY_CUDA_SUCCESS(
         dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream()));
   }
+#endif
 
   void DestoryCuDNNContext() {
     if (cudnn_handle_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroy(cudnn_handle_));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_));
+#endif
     }
     cudnn_handle_ = nullptr;
   }

@@ -279,22 +330,30 @@ class CUDAContext {
     cublas_tf32_tensor_core_handle_.reset();
   }
 
+#ifndef PADDLE_WITH_HIP
   void DestoryCuSolverContext() {
     if (cusolver_dn_handle_) {
      PADDLE_ENFORCE_CUDA_SUCCESS(
          dynload::cusolverDnDestroy(cusolver_dn_handle_));
     }
   }
+#endif
 
   CUDAPlace place_;
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
   std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
   std::unique_ptr<stream::CUDAStream> stream_;
+#ifdef PADDLE_WITH_HIP
+  miopenHandle_t cudnn_handle_;
+#else
   cudnnHandle_t cudnn_handle_;
+#endif
   std::unique_ptr<CublasHandleHolder> cublas_handle_;
   std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
   std::unique_ptr<CublasHandleHolder> cublas_tf32_tensor_core_handle_;
+#ifndef PADDLE_WITH_HIP
   cusolverDnHandle_t cusolver_dn_handle_;
+#endif
   DISABLE_COPY_AND_ASSIGN(CUDAContext);
 };

@@ -343,8 +402,12 @@ class CUDADeviceContext : public DeviceContext {
     return context()->TensorCoreCublasCallIfAvailable(callback);
   }
 
-  /*! \brief  Return cudnn handle in the device context. */
+/*! \brief  Return cudnn handle in the device context. */
+#ifdef PADDLE_WITH_HIP
+  miopenHandle_t cudnn_handle() const;
+#else
   cudnnHandle_t cudnn_handle() const;
+#endif
 
   /*! \brief  Return a cudnn workspace handle to call multiple cudnn
    *  functions without interrupting by other threads.

@@ -355,12 +418,14 @@ class CUDADeviceContext : public DeviceContext {
    *  sequential cudnn function calls. */
   CudnnWorkspaceHandle cudnn_workspace_handle() const;
 
+#ifndef PADDLE_WITH_HIP
   cusolverDnHandle_t cusolver_dn_handle() const;
+#endif
 
   /*! \brief  Return cuda stream in the device context. */
-  cudaStream_t stream() const;
+  gpuStream_t stream() const;
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   /*! \brief  Return nccl communicators. */
   ncclComm_t nccl_comm() const { return nccl_comm_; }

@@ -369,7 +434,7 @@ class CUDADeviceContext : public DeviceContext {
 #endif
 
   template <typename Callback>
-  void RecordEvent(cudaEvent_t ev, Callback callback) const {
+  void RecordEvent(gpuEvent_t ev, Callback callback) const {
     return context()->Stream()->RecordEvent(ev, callback);
   }

@@ -411,7 +476,7 @@ class CUDADeviceContext : public DeviceContext {
   mutable std::mutex cudnn_handle_mtx_;
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   // NCCL communicator (single process version) for NCCL collective operations.
   // NCCL collective operations provides fast collectives over multiple GPUs
   // both within and across nodes.
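SetAllowTF32Cublas / AllowTF32Cublas, declared above and defined in device_context.cc, are now compiled for both backends. Usage is a plain process-wide toggle:

    // Disable TF32 math before cuBLAS/rocBLAS handles are (re)configured.
    paddle::platform::SetAllowTF32Cublas(false);
    bool tf32_enabled = paddle::platform::AllowTF32Cublas();  // now false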
paddle/fluid/platform/device_context_test.cu (view file @ 93c1d9e7)

@@ -41,7 +41,11 @@ TEST(Device, CUDADeviceContext) {
     CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
     Eigen::GpuDevice* gpu_device = device_context->eigen_device();
     ASSERT_NE(nullptr, gpu_device);
+#ifdef PADDLE_WITH_HIP
+    miopenHandle_t cudnn_handle = device_context->cudnn_handle();
+#else
     cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
+#endif
     ASSERT_NE(nullptr, cudnn_handle);
     delete device_context;
   }
paddle/fluid/platform/enforce.h (view file @ 93c1d9e7)

@@ -42,8 +42,7 @@ limitations under the License. */
 #include <miopen/miopen.h>
 #include <rocblas.h>
 #include <thrust/system/hip/error.h>
-#include <thrust/system_error.h>                  // NOLINT
-#include "paddle/fluid/platform/cuda_error.pb.h"  // NOLINT
+#include <thrust/system_error.h>  // NOLINT
 #endif
 
 #include <fstream>

@@ -1034,11 +1033,6 @@ inline void retry_sleep(unsigned milliseconds) {
 inline bool is_error(hipError_t e) { return e != hipSuccess; }
 
 inline std::string build_rocm_error_msg(hipError_t e) {
-#if defined(PADDLE_WITH_HIP)
-  int32_t cuda_version = 100;
-#else
-  int32_t cuda_version = -1;
-#endif
   std::ostringstream sout;
   sout << " Hip error(" << e << "), " << hipGetErrorString(e) << ".";
   return sout.str();
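The deleted cuda_version variable was dead code: the HIP error message no longer interpolates it. What remains for ROCm error reporting is the is_error / build_rocm_error_msg pair; a sketch of how an enforce-style check presumably composes them (the macro internals are not shown in this hunk):

    // Sketch of the check pattern behind the CUDA/HIP enforce macros.
    hipError_t e = hipFree(ptr);
    if (paddle::platform::is_error(e)) {
      // message looks like: " Hip error(<code>), <hipGetErrorString(e)>."
      throw std::runtime_error(paddle::platform::build_rocm_error_msg(e));
    }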
paddle/fluid/platform/miopen_desc.h (new file, mode 100644) (view file @ 93c1d9e7)

// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <functional>
#include <iostream>
#include <iterator>
#include <memory>
#include <numeric>
#include <string>
#include <vector>

#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/miopen_helper.h"

namespace paddle {
namespace framework {
class Tensor;
}  // namespace framework
}  // namespace paddle

namespace paddle {
namespace platform {
using framework::Tensor;

template <typename T>
inline miopenDataType_t ToMIOpenDataType(const T& t) {
  auto type = framework::ToDataType(t);
  return ToMIOpenDataType(type);
}

inline std::vector<int> TransformDimOrder(const std::vector<int>& dims) {
  std::vector<int> transformed_dims(dims.begin(), dims.end());
  int H, W, D, C;
  if (dims.size() == 4) {
    H = dims[1];
    W = dims[2];
    C = dims[3];
    transformed_dims[1] = C;
    transformed_dims[2] = H;
    transformed_dims[3] = W;
  } else {
    D = dims[1];
    H = dims[2];
    W = dims[3];
    C = dims[4];
    transformed_dims[1] = C;
    transformed_dims[2] = D;
    transformed_dims[3] = H;
    transformed_dims[4] = W;
  }
  return transformed_dims;
}

template <>
inline miopenDataType_t ToMIOpenDataType(
    const framework::proto::VarType::Type& t) {
  miopenDataType_t type = miopenFloat;
  switch (t) {
    case framework::proto::VarType::FP16:
      type = miopenHalf;
      break;
    case framework::proto::VarType::FP32:
      type = miopenFloat;
      break;
    default:
      break;
  }
  return type;
}

class ActivationDescriptor {
 public:
  ActivationDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(
        dynload::miopenCreateActivationDescriptor(&desc_));
  }
  ~ActivationDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(
        dynload::miopenDestroyActivationDescriptor(desc_));
  }
  template <typename T>
  void set(miopenActivationMode_t mode, const T& coef) {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor(
        desc_, mode, static_cast<double>(coef), 0.0, 0.0));
  }

  miopenActivationDescriptor_t desc() { return desc_; }
  miopenActivationDescriptor_t desc() const { return desc_; }

 private:
  miopenActivationDescriptor_t desc_;
};

class TensorDescriptor {
 public:
  TensorDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
  }
  ~TensorDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
  }
  miopenTensorDescriptor_t desc() { return desc_; }
  miopenTensorDescriptor_t desc() const { return desc_; }

  void set(const Tensor& tensor, const int groups = 1) {
    auto dims = framework::vectorize<int>(tensor.dims());
    std::vector<int> strides(dims.size());
    strides[dims.size() - 1] = 1;
    for (int i = dims.size() - 2; i >= 0; i--) {
      strides[i] = dims[i + 1] * strides[i + 1];
    }
    std::vector<int> dims_with_group(dims.begin(), dims.end());
    if (groups > 1) {
      dims_with_group[1] = dims_with_group[1] / groups;
    }
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
        desc_, ToMIOpenDataType(tensor.type()),
        static_cast<int>(dims_with_group.size()),
        const_cast<int*>(dims_with_group.data()),
        const_cast<int*>(strides.data())));
  }

  void set(const Tensor& tensor, const miopenTensorFormat_t format) {
    const int groups = 1;
    auto dims = framework::vectorize<int>(tensor.dims());
    std::vector<int> strides(dims.size());
    strides[dims.size() - 1] = 1;
    for (int i = dims.size() - 2; i >= 0; i--) {
      strides[i] = dims[i + 1] * strides[i + 1];
    }
    std::vector<int> dims_with_group(dims.begin(), dims.end());
    if (groups > 1) {
      dims_with_group[1] = dims_with_group[1] / groups;
    }
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
        desc_, ToMIOpenDataType(tensor.type()),
        static_cast<int>(dims_with_group.size()),
        const_cast<int*>(dims_with_group.data()),
        const_cast<int*>(strides.data())));
  }

 private:
  miopenTensorDescriptor_t desc_;
};

class FilterDescriptor {
 public:
  FilterDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
  }
  ~FilterDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
  }
  miopenTensorDescriptor_t desc() { return desc_; }
  miopenTensorDescriptor_t desc() const { return desc_; }

  void set(const Tensor& tensor, const miopenTensorFormat_t format,
           const int groups = 1) {
    auto dims = framework::vectorize<int>(tensor.dims());
    std::vector<int> transformed_dims;
    PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW,
                      platform::errors::InvalidArgument(
                          "format should ONLY be NCHW in MIOPEN."));
    transformed_dims = dims;
    if (groups > 1) {
      transformed_dims[1] = transformed_dims[1] / groups;
    }
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
        desc_, ToMIOpenDataType(tensor.type()),
        static_cast<int>(transformed_dims.size()),
        const_cast<int*>(transformed_dims.data()), nullptr));
  }

 private:
  miopenTensorDescriptor_t desc_;
};

class ConvolutionDescriptor {
 public:
  ConvolutionDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(
        dynload::miopenCreateConvolutionDescriptor(&desc_));
  }
  ~ConvolutionDescriptor() {
    PADDLE_ENFORCE_CUDA_SUCCESS(
        dynload::miopenDestroyConvolutionDescriptor(desc_));
  }
  miopenConvolutionDescriptor_t desc() { return desc_; }
  miopenConvolutionDescriptor_t desc() const { return desc_; }

  void set(miopenDataType_t dtype, const std::vector<int>& pads,
           const std::vector<int>& strides, const std::vector<int>& dilations,
           bool allow_tf32, const int groups = 1) {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor(
        desc_, static_cast<int>(pads.size()), const_cast<int*>(pads.data()),
        const_cast<int*>(strides.data()), const_cast<int*>(dilations.data()),
        miopenConvolution));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::miopenSetConvolutionGroupCount(desc_, groups));
  }

 private:
  miopenConvolutionDescriptor_t desc_;
};

}  // namespace platform
}  // namespace paddle
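The new header mirrors cudnn_desc.h so descriptor-based MIOpen calls can reuse the same RAII style as the cuDNN path. A hedged usage sketch (input_tensor is a placeholder; miopenActivationRELU is a standard MIOpen enum value):

    // Hedged sketch: build descriptors for a MIOpen activation call.
    paddle::platform::TensorDescriptor input_desc;
    input_desc.set(input_tensor);  // dims + row-major strides from tensor.dims()
    paddle::platform::ActivationDescriptor act_desc;
    act_desc.set(miopenActivationRELU, /*coef=*/0.0);
    // input_desc.desc() and act_desc.desc() then feed dynload::miopen* calls.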
tools/dockerfile/Dockerfile.rocm (view file @ 93c1d9e7)

 # A image for building paddle binaries
 # Use rocm-terminal base image for both rocm environment
 # When you modify it, please be aware of rocm version
 #
-# Build: ROCM 3.5.1
-# cd Paddle/tools/dockerfile
-# docker build -f Dockerfile.rocm \
-#        --build-arg ROCM_VERSION=3.5.1 \
-#        --build-arg CENTOS_VERSION=7.7.1908 \
-#        -t paddlepaddle/paddle-centos-rocm35-dev:latest .
-#
-# Build: ROCM 3.9.1
+# Build: ROCM 3.9
 # cd Paddle/tools/dockerfile
 # docker build -f Dockerfile.rocm \
-#        --build-arg ROCM_VERSION=3.9.1 \
-#        --build-arg CENTOS_VERSION=7.8.2003 \
+#        --build-arg ROCM_VERSION=3.9 \
 #        -t paddlepaddle/paddle-centos-rocm39-dev:latest .
 #
-# Run: ROCM 3.5.1
 # docker run -it --device=/dev/kfd --device=/dev/dri \
 #            --security-opt seccomp=unconfined --group-add video \
-#            paddlepaddle/paddle-centos-rocm35-dev:latest /bin/bash
+#            paddlepaddle/paddle-centos-rocm39-dev:latest /bin/bash
 
-ARG CENTOS_VERSION
-FROM centos:${CENTOS_VERSION}
-ARG CENTOS_VERSION
+FROM centos:7.8.2003
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 
 ENV LC_ALL en_US.UTF-8

@@ -34,7 +23,7 @@ RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlit
     zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \
     make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel
 
-# Install devtoolset-7 for ROCM 3.5/3.9
+# Install devtoolset-7
 RUN yum install -y yum-utils centos-release-scl && \
     yum-config-manager --enable rhel-server-rhscl-7-rpms && \
     yum-config-manager --enable rhel-7-server-rpms && \

@@ -70,10 +59,8 @@ ENV ROCM_PATH=/opt/rocm
 ENV HIP_PATH=/opt/rocm/hip
 ENV HIP_CLANG_PATH=/opt/rocm/llvm/bin
 ENV PATH=/opt/rocm/bin:$PATH
-ENV PATH=/opt/rocm/hcc/bin:$PATH
-ENV PATH=/opt/rocm/hip/bin:$PATH
 ENV PATH=/opt/rocm/opencl/bin:$PATH
 ENV PATH=/opt/rocm/llvm/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
 
 # git 2.17.1
 RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \

@@ -146,4 +133,12 @@ RUN cd /opt && wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
     ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \
     cd .. && rm -rf ccache-3.7.9.tar.gz && rm -rf ccache-3.7.9
 
+# configure ssh
+RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \
+    sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \
+    sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config && \
+    sed -i "s/#UseDNS .*/UseDNS no/" /etc/ssh/sshd_config
+
+RUN ssh-keygen -A
+
 EXPOSE 22