Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
607eec30
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
607eec30
编写于
3月 12, 2018
作者:
K
kexinzhao
提交者:
GitHub
3月 12, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #8946 from kexinzhao/fix_cuda_arch_fp16
Add GPU compute capability check for float16 math function test
上级
b5ef315c
c88f58db
变更
6
隐藏空白更改
内联
并排
Showing
6 changed files
with
50 additions
and
0 deletions
+50
-0
paddle/fluid/operators/math/math_function.cu
paddle/fluid/operators/math/math_function.cu
+9
-0
paddle/fluid/operators/math/math_function_test.cu
paddle/fluid/operators/math/math_function_test.cu
+20
-0
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+5
-0
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+4
-0
paddle/fluid/platform/gpu_info.cc
paddle/fluid/platform/gpu_info.cc
+9
-0
paddle/fluid/platform/gpu_info.h
paddle/fluid/platform/gpu_info.h
+3
-0
未找到文件。
paddle/fluid/operators/math/math_function.cu
浏览文件 @
607eec30
...
@@ -45,6 +45,9 @@ void gemm<platform::CUDADeviceContext, float16>(
...
@@ -45,6 +45,9 @@ void gemm<platform::CUDADeviceContext, float16>(
const
half
*
h_B
=
reinterpret_cast
<
const
half
*>
(
B
);
const
half
*
h_B
=
reinterpret_cast
<
const
half
*>
(
B
);
half
*
h_C
=
reinterpret_cast
<
half
*>
(
C
);
half
*
h_C
=
reinterpret_cast
<
half
*>
(
C
);
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE
(
context
.
GetComputeCapability
(),
53
,
"cublas Hgemm requires GPU compute capability >= 53"
);
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasHgemm
(
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasHgemm
(
context
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
h_B
,
ldb
,
context
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
h_B
,
ldb
,
h_A
,
lda
,
&
h_beta
,
h_C
,
N
));
h_A
,
lda
,
&
h_beta
,
h_C
,
N
));
...
@@ -106,6 +109,9 @@ void gemm<platform::CUDADeviceContext, float16>(
...
@@ -106,6 +109,9 @@ void gemm<platform::CUDADeviceContext, float16>(
const
half
*
h_B
=
reinterpret_cast
<
const
half
*>
(
B
);
const
half
*
h_B
=
reinterpret_cast
<
const
half
*>
(
B
);
half
*
h_C
=
reinterpret_cast
<
half
*>
(
C
);
half
*
h_C
=
reinterpret_cast
<
half
*>
(
C
);
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE
(
context
.
GetComputeCapability
(),
53
,
"cublas Hgemm requires GPU compute capability >= 53"
);
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasHgemm
(
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasHgemm
(
context
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
h_B
,
ldb
,
context
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
h_B
,
ldb
,
h_A
,
lda
,
&
h_beta
,
h_C
,
ldc
));
h_A
,
lda
,
&
h_beta
,
h_C
,
ldc
));
...
@@ -251,6 +257,9 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
...
@@ -251,6 +257,9 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
const
half
*
h_B
=
reinterpret_cast
<
const
half
*>
(
B
);
const
half
*
h_B
=
reinterpret_cast
<
const
half
*>
(
B
);
half
*
h_C
=
reinterpret_cast
<
half
*>
(
C
);
half
*
h_C
=
reinterpret_cast
<
half
*>
(
C
);
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE
(
context
.
GetComputeCapability
(),
53
,
"cublas Hgemm requires GPU compute capability >= 53"
);
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasHgemmStridedBatched
(
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasHgemmStridedBatched
(
context
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
h_B
,
ldb
,
context
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
h_B
,
ldb
,
strideB
,
h_A
,
lda
,
strideA
,
&
h_beta
,
h_C
,
ldc
,
strideC
,
batchCount
));
strideB
,
h_A
,
lda
,
strideA
,
&
h_beta
,
h_C
,
ldc
,
strideC
,
batchCount
));
...
...
paddle/fluid/operators/math/math_function_test.cu
浏览文件 @
607eec30
...
@@ -72,6 +72,11 @@ TEST(math_function, notrans_mul_trans_fp16) {
...
@@ -72,6 +72,11 @@ TEST(math_function, notrans_mul_trans_fp16) {
CUDAPlace
gpu_place
(
0
);
CUDAPlace
gpu_place
(
0
);
CUDADeviceContext
context
(
gpu_place
);
CUDADeviceContext
context
(
gpu_place
);
// fp16 GEMM in cublas requires GPU compute capability >= 53
if
(
context
.
GetComputeCapability
()
<
53
)
{
return
;
}
float16
*
input1_ptr
=
input1
.
mutable_data
<
float16
>
({
2
,
3
},
cpu_place
);
float16
*
input1_ptr
=
input1
.
mutable_data
<
float16
>
({
2
,
3
},
cpu_place
);
fill_fp16_data
(
input1_ptr
,
input1
.
numel
(),
{
0
,
1
,
2
,
3
,
4
,
5
});
fill_fp16_data
(
input1_ptr
,
input1
.
numel
(),
{
0
,
1
,
2
,
3
,
4
,
5
});
...
@@ -149,6 +154,11 @@ TEST(math_function, trans_mul_notrans_fp16) {
...
@@ -149,6 +154,11 @@ TEST(math_function, trans_mul_notrans_fp16) {
CUDAPlace
gpu_place
(
0
);
CUDAPlace
gpu_place
(
0
);
CUDADeviceContext
context
(
gpu_place
);
CUDADeviceContext
context
(
gpu_place
);
// fp16 GEMM in cublas requires GPU compute capability >= 53
if
(
context
.
GetComputeCapability
()
<
53
)
{
return
;
}
float16
*
input1_ptr
=
input1
.
mutable_data
<
float16
>
({
2
,
3
},
cpu_place
);
float16
*
input1_ptr
=
input1
.
mutable_data
<
float16
>
({
2
,
3
},
cpu_place
);
fill_fp16_data
(
input1_ptr
,
input1
.
numel
(),
{
0
,
1
,
2
,
3
,
4
,
5
});
fill_fp16_data
(
input1_ptr
,
input1
.
numel
(),
{
0
,
1
,
2
,
3
,
4
,
5
});
...
@@ -248,6 +258,11 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
...
@@ -248,6 +258,11 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
CUDAPlace
gpu_place
(
0
);
CUDAPlace
gpu_place
(
0
);
CUDADeviceContext
context
(
gpu_place
);
CUDADeviceContext
context
(
gpu_place
);
// fp16 GEMM in cublas requires GPU compute capability >= 53
if
(
context
.
GetComputeCapability
()
<
53
)
{
return
;
}
int
m
=
2
;
int
m
=
2
;
int
n
=
3
;
int
n
=
3
;
int
k
=
3
;
int
k
=
3
;
...
@@ -355,6 +370,11 @@ TEST(math_function, gemm_trans_cublas_fp16) {
...
@@ -355,6 +370,11 @@ TEST(math_function, gemm_trans_cublas_fp16) {
CUDAPlace
gpu_place
(
0
);
CUDAPlace
gpu_place
(
0
);
CUDADeviceContext
context
(
gpu_place
);
CUDADeviceContext
context
(
gpu_place
);
// fp16 GEMM in cublas requires GPU compute capability >= 53
if
(
context
.
GetComputeCapability
()
<
53
)
{
return
;
}
int
m
=
2
;
int
m
=
2
;
int
n
=
3
;
int
n
=
3
;
int
k
=
3
;
int
k
=
3
;
...
...
paddle/fluid/platform/device_context.cc
浏览文件 @
607eec30
...
@@ -127,6 +127,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
...
@@ -127,6 +127,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
CUDADeviceContext
::
CUDADeviceContext
(
CUDAPlace
place
)
:
place_
(
place
)
{
CUDADeviceContext
::
CUDADeviceContext
(
CUDAPlace
place
)
:
place_
(
place
)
{
SetDeviceId
(
place_
.
device
);
SetDeviceId
(
place_
.
device
);
compute_capability
=
GetCUDAComputeCapability
(
place_
.
device
);
multi_process
=
GetCUDAMultiProcessors
(
place_
.
device
);
multi_process
=
GetCUDAMultiProcessors
(
place_
.
device
);
max_threads_per_mp
=
GetCUDAMaxThreadsPerMultiProcessor
(
place_
.
device
);
max_threads_per_mp
=
GetCUDAMaxThreadsPerMultiProcessor
(
place_
.
device
);
PADDLE_ENFORCE
(
cudaStreamCreate
(
&
stream_
));
PADDLE_ENFORCE
(
cudaStreamCreate
(
&
stream_
));
...
@@ -162,6 +163,10 @@ void CUDADeviceContext::Wait() const {
...
@@ -162,6 +163,10 @@ void CUDADeviceContext::Wait() const {
PADDLE_ENFORCE
(
cudaGetLastError
());
PADDLE_ENFORCE
(
cudaGetLastError
());
}
}
int
CUDADeviceContext
::
GetComputeCapability
()
const
{
return
compute_capability
;
}
int
CUDADeviceContext
::
GetMaxPhysicalThreadCount
()
const
{
int
CUDADeviceContext
::
GetMaxPhysicalThreadCount
()
const
{
return
multi_process
*
max_threads_per_mp
;
return
multi_process
*
max_threads_per_mp
;
}
}
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
607eec30
...
@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
...
@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return place in the device context. */
/*! \brief Return place in the device context. */
Place
GetPlace
()
const
override
;
Place
GetPlace
()
const
override
;
/*! \brief Return compute capability in the device context. */
int
GetComputeCapability
()
const
;
/*! \brief Return the max physical thread count in the device context */
/*! \brief Return the max physical thread count in the device context */
int
GetMaxPhysicalThreadCount
()
const
;
int
GetMaxPhysicalThreadCount
()
const
;
...
@@ -104,6 +107,7 @@ class CUDADeviceContext : public DeviceContext {
...
@@ -104,6 +107,7 @@ class CUDADeviceContext : public DeviceContext {
cudnnHandle_t
cudnn_handle_
;
cudnnHandle_t
cudnn_handle_
;
cublasHandle_t
cublas_handle_
;
cublasHandle_t
cublas_handle_
;
int
compute_capability
;
int
multi_process
;
int
multi_process
;
int
max_threads_per_mp
;
int
max_threads_per_mp
;
};
};
...
...
paddle/fluid/platform/gpu_info.cc
浏览文件 @
607eec30
...
@@ -33,6 +33,15 @@ int GetCUDADeviceCount() {
...
@@ -33,6 +33,15 @@ int GetCUDADeviceCount() {
return
count
;
return
count
;
}
}
int
GetCUDAComputeCapability
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetCUDADeviceCount
(),
"id must less than GPU count"
);
cudaDeviceProp
device_prop
;
PADDLE_ENFORCE
(
cudaGetDeviceProperties
(
&
device_prop
,
id
),
"cudaGetDeviceProperties failed in "
"paddle::platform::GetCUDAComputeCapability"
);
return
device_prop
.
major
*
10
+
device_prop
.
minor
;
}
int
GetCUDAMultiProcessors
(
int
id
)
{
int
GetCUDAMultiProcessors
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetCUDADeviceCount
(),
"id must less than GPU count"
);
PADDLE_ENFORCE_LT
(
id
,
GetCUDADeviceCount
(),
"id must less than GPU count"
);
int
count
;
int
count
;
...
...
paddle/fluid/platform/gpu_info.h
浏览文件 @
607eec30
...
@@ -30,6 +30,9 @@ const std::string kEnvFractionGpuMemoryToUse =
...
@@ -30,6 +30,9 @@ const std::string kEnvFractionGpuMemoryToUse =
//! Get the total number of GPU devices in system.
//! Get the total number of GPU devices in system.
int
GetCUDADeviceCount
();
int
GetCUDADeviceCount
();
//! Get the compute capability of the ith GPU (format: major * 10 + minor)
int
GetCUDAComputeCapability
(
int
i
);
//! Get the MultiProcessors of the ith GPU.
//! Get the MultiProcessors of the ith GPU.
int
GetCUDAMultiProcessors
(
int
i
);
int
GetCUDAMultiProcessors
(
int
i
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录