Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
6d3da458
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2299
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
6d3da458
编写于
7月 31, 2018
作者:
D
dzhwinter
提交者:
GitHub
7月 31, 2018
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix/float16 style (#12446)
* "rewrite the test case" * "follow comment"
上级
91fb0156
变更
2
隐藏空白更改
内联
并排
Showing 2 changed files with 119 additions and 84 deletions
+119
-84
paddle/fluid/platform/cuda_helper_test.cu
paddle/fluid/platform/cuda_helper_test.cu
+109
-74
paddle/fluid/platform/cuda_primitives.h
paddle/fluid/platform/cuda_primitives.h
+10
-10
未找到文件。
paddle/fluid/platform/cuda_helper_test.cu
浏览文件 @
6d3da458
...
...
@@ -13,7 +13,6 @@
// limitations under the License.
#include <gtest/gtest.h>
#include <bitset>
#include <iostream>
#include <random>
...
...
@@ -25,13 +24,13 @@
using
paddle
::
platform
::
PADDLE_CUDA_NUM_THREADS
;
using
paddle
::
platform
::
float16
;
// Grid-stride kernel: every thread atomically accumulates its slice of
// data_a into data_b via paddle::platform::CudaAtomicAdd, so the kernel is
// correct for any <<<grid, block>>> configuration (including a single block).
//
// Fix over the original: the loop index was a signed `int` compared against
// the `size_t` parameter `num`, which is a sign-compare hazard and silently
// truncates for num > INT_MAX. Use size_t throughout.
template <typename T>
__global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
       i += stride) {
    paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
  }
}
template
<
typename
T
>
struct
AddFunctor
{
...
...
@@ -39,80 +38,116 @@ struct AddFunctor {
};
// Host-side reference functor computing a - b; used to build the expected
// value that the device result is compared against.
template <typename T>
struct SubFunctor {
  T operator()(const T& a, const T& b) { return a - b; }
};
// End-to-end check of CudaAtomicAdd for element type T:
//   1. fill two host buffers with uniform random values in [0, 1),
//   2. copy them to the device and launch AddKernel on a single block,
//   3. copy the accumulated result back and compare element-wise against
//      the host-side AddFunctor<T> reference.
// NOTE(dzhwinter): the float16 add has small underflow/overflow,
// so we use EXPECT_NEAR instead of exact equality to check the result.
template <typename T>
void TestCase(size_t num) {
  T *in1, *in2, *out;
  T *d_in1, *d_in2;
  const size_t size = sizeof(T) * num;
  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
  in1 = reinterpret_cast<T*>(malloc(size));
  in2 = reinterpret_cast<T*>(malloc(size));
  out = reinterpret_cast<T*>(malloc(size));
  std::minstd_rand engine;
  std::uniform_real_distribution<double> dist(0.0, 1.0);
  for (size_t i = 0; i < num; ++i) {
    in1[i] = static_cast<T>(dist(engine));
    in2[i] = static_cast<T>(dist(engine));
  }
  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
  AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
  cudaDeviceSynchronize();
  cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();
  for (size_t i = 0; i < num; ++i) {
    // NOTE(dzhwinter): the float16 add has small underflow/overflow
    // so we use EXPECT_NEAR to check the result.
    EXPECT_NEAR(static_cast<float>(out[i]),
                static_cast<float>(AddFunctor<T>()(in1[i], in2[i])), 0.001);
  }
  free(in1);
  free(in2);
  free(out);
  cudaFree(d_in1);
  cudaFree(d_in2);
}
// cuda primitives
// Exercise CudaAtomicAdd for float and double at a small size and at a
// size large enough (1M elements) to span many grid-stride iterations.
TEST(CudaAtomic, Add) {
  TestCase<float>(static_cast<size_t>(10));
  TestCase<float>(static_cast<size_t>(1024 * 1024));

  TestCase<double>(static_cast<size_t>(10));
  TestCase<double>(static_cast<size_t>(1024 * 1024));
}
// CudaAtomicAdd for float16. Sizes 1/2/3 cover the odd/even tail cases of
// the 32-bit CAS-based half-word atomic (the value may sit in either the
// low or the high 16 bits of the aligned 32-bit word); 10 and 1M cover the
// general path.
TEST(CudaAtomic, float16) {
  TestCase<float16>(static_cast<size_t>(1));
  TestCase<float16>(static_cast<size_t>(2));
  TestCase<float16>(static_cast<size_t>(3));
  TestCase<float16>(static_cast<size_t>(10));
  TestCase<float16>(static_cast<size_t>(1024 * 1024));
}
// unalignment of uint8
// Same round-trip as TestCase<float16>, but the host buffers are
// deliberately misaligned: each float16 array is viewed through a base
// pointer shifted by `shift_bit` bytes, mimicking an unaligned address so
// the CAS-based float16 atomic is exercised on both halves of a 32-bit
// word. `num` counts bytes of payload; num / 2 float16 elements are used,
// hence the even-size precondition.
void TestUnalign(size_t num, const int shift_bit) {
  PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2");
  float16 *in1, *in2, *out;
  float16 *d_in1, *d_in2;
  // raw allocation is num payload bytes plus room for the byte shift
  size_t size = sizeof(uint8_t) * (num + shift_bit);
  size_t array_size = sizeof(float16) * (num / 2);
  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
  in1 = reinterpret_cast<float16*>(malloc(size));
  in2 = reinterpret_cast<float16*>(malloc(size));
  out = reinterpret_cast<float16*>(malloc(size));
  // right shift 1, mimic the unalignment of address
  float16* r_in1 =
      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in1) + shift_bit);
  float16* r_in2 =
      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in2) + shift_bit);
  std::minstd_rand engine;
  std::uniform_real_distribution<double> dist(0.0, 1.0);
  for (size_t i = 0; i < num / 2; ++i) {
    r_in1[i] = static_cast<float16>(dist(engine));
    r_in2[i] = static_cast<float16>(dist(engine));
  }
  cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
  AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
  cudaDeviceSynchronize();
  cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();
  for (size_t i = 0; i < num / 2; ++i) {
    // NOTE(dzhwinter): the float16 add has small underflow/overflow
    // so we use EXPECT_NEAR to check the result.
    EXPECT_NEAR(static_cast<float>(out[i]),
                static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
                0.001);
  }
  free(in1);
  free(in2);
  free(out);
  cudaFree(d_in1);
  cudaFree(d_in2);
}
// Unaligned float16 atomics: every payload size is tried with byte shifts
// of 1, 2, and 3 to hit all misalignment offsets within a 32-bit word.
TEST(CudaAtomic, float16Unalign) {
  // same with float16 testcase
  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 2);
  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 2);
  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 2);

  // shift the address.
  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 1);
  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 1);
  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 1);

  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 3);
  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
}
paddle/fluid/platform/cuda_primitives.h
浏览文件 @
6d3da458
...
...
@@ -79,41 +79,41 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
// convert the value into float and do the add arithmetic.
// then store the result into a uint32.
// Adds x to the float16 packed in the LOW 16 bits of val, leaving the high
// 16 bits untouched. The add itself is done in float (via the float16's
// raw-bits member .x) and the result is re-packed into a uint32 suitable
// for atomicCAS.
inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) {
  float16 low_half;
  // the float16 in lower 16bits
  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
  low_half = static_cast<float16>(static_cast<float>(low_half) + x);
  return (val & 0xFFFF0000u) | low_half.x;
}
// Adds x to the float16 packed in the HIGH 16 bits of val, leaving the low
// 16 bits untouched. Mirror of add_to_low_half: compute in float, then
// re-pack the raw half bits shifted back into the upper half-word.
inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
  float16 high_half;
  // the float16 in higher 16bits
  high_half.x = static_cast<uint16_t>(val >> 16);
  high_half = static_cast<float16>(static_cast<float>(high_half) + x);
  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
CUDA_ATOMIC_WRAPPER
(
Add
,
float16
)
{
// concrete packed float16 value may exsits in lower or higher 16bits
// of the 32bits address.
uint32_t
*
address_as_ui
=
reinterpret_cast
<
uint32_t
*>
(
reinterpret_cast
<
char
*>
(
address
)
-
(
reinterpret_cast
<
size_t
>
(
address
)
&
2
));
uint32_t
*
address_as_ui
=
reinterpret_cast
<
uint32_t
*>
(
reinterpret_cast
<
char
*>
(
address
)
-
(
reinterpret_cast
<
uintptr_t
>
(
address
)
&
0x0
2
));
float
val_f
=
static_cast
<
float
>
(
val
);
uint32_t
old
=
*
address_as_ui
;
uint32_t
sum
;
uint32_t
newval
;
uint32_t
assumed
;
if
(((
size_t
)
address
&
2
)
==
0
)
{
if
(((
uintptr_t
)
address
&
0x0
2
)
==
0
)
{
// the float16 value stay at lower 16 bits of the address.
do
{
assumed
=
old
;
old
=
atomicCAS
(
address_as_ui
,
assumed
,
add_to_low_half
(
assumed
,
val_f
));
}
while
(
old
!=
assumed
);
float16
ret
;
ret
.
x
=
old
&
0x
ffff
u
;
ret
.
x
=
old
&
0x
FFFF
u
;
return
ret
;
}
else
{
// the float16 value stay at higher 16 bits of the address.
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录