Commit 41bd1f91

Authored Nov 28, 2017 by Kexin Zhao

fix gpu test, clean code and add cmake

Parent: a5feb771
Showing 5 changed files with 109 additions and 212 deletions (+109 -212)
CMakeLists.txt                       +1   -0
cmake/configure.cmake                +5   -0
paddle/math/float16.h                +61  -156
paddle/math/tests/test_float16.cpp   +0   -8
paddle/math/tests/test_float16.cu    +42  -48
CMakeLists.txt

@@ -56,6 +56,7 @@ option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
 option(GLIDE_INSTALL "Download and install go dependencies " ON)
 option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
 option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
+option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
cmake/configure.cmake

@@ -24,6 +24,11 @@ if(WITH_DOUBLE)
     add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)
 
+if(WITH_ARM_FP16)
+    add_definitions(-DPADDLE_ARM_FP16)
+    add_definitions("-march=armv8.2-a+fp16+simd")
+endif(WITH_ARM_FP16)
+
 if(WITH_TESTING)
     add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
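Note that configure.cmake only defines the PADDLE_ARM_FP16 macro and adds the -march=armv8.2-a+fp16+simd flag; whether the native __fp16 path is actually taken is decided in the header, presumably via the PADDLE_WITH_NATIVE_FP16 checks visible in the float16.h diff below. As a rough, hypothetical sketch (not code from this commit) of how such a compile definition is typically consumed:

// Illustrative only: code touching the ARM native half type stays behind
// the macro defined by configure.cmake, so builds configured without
// -DWITH_ARM_FP16=ON never attempt to compile __fp16 arithmetic.
#ifdef PADDLE_ARM_FP16
#include <arm_neon.h>  // provides float16_t, an alias for __fp16
inline float16_t double_half(float16_t v) { return v + v; }
#else
inline float double_half(float v) { return v + v; }
#endif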
paddle/math/float16.h

@@ -14,7 +14,7 @@ limitations under the License. */
 #pragma once
 
-#include <cstdint>
+#include <stdint.h>
 
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
@@ -71,6 +71,7 @@ struct PADDLE_ALIGN(2) float16 {
 public:
   uint16_t x;
 
+  // Constructors
   HOSTDEVICE inline float16() : x(0) {}
 
   HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
@@ -89,8 +90,7 @@
 #ifdef PADDLE_WITH_NATIVE_FP16
   // __fp16 is a native half precision data type for arm cpu,
-  // float16_t is an alias for __fp16 in arm_fp16.h,
-  // which is included in arm_neon.h.
+  // float16_t is an alias for __fp16
   HOSTDEVICE inline explicit float16(const float16_t& h) {
     x = *reinterpret_cast<const uint16_t*>(&h);
   }
@@ -141,6 +141,7 @@
     return *this;
   }
 
+  // Assignment operators
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline float16& operator=(const half& rhs) {
 #if CUDA_VERSION >= 9000
@@ -219,6 +220,7 @@
     return *this;
   }
 
+  // Conversion opertors
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline explicit operator half() const {
 #if CUDA_VERSION >= 9000
@@ -353,27 +355,54 @@
 // CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are
 // for users to write similar CUDA code in CUDA 7.5 and 8.0 as in
 // CUDA 9.0 regarding the half data type.
-#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && \
-    __CUDA_ARCH__ >= 530 && CUDA_VERSION < 9000
+#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
 DEVICE inline half operator+(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hadd(a, b);
+#else
+  float res = float(float16(a)) + float(float16(b));
+  return half(float16(res));
+#endif
 }
 
 DEVICE inline half operator-(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hsub(a, b);
+#else
+  float res = float(float16(a)) - float(float16(b));
+  return half(float16(res));
+#endif
 }
 
 DEVICE inline half operator*(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hmul(a, b);
+#else
+  float res = float(float16(a)) * float(float16(b));
+  return half(float16(res));
+#endif
 }
 
 DEVICE inline half operator/(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   float num = __half2float(a);
   float denom = __half2float(b);
   return __float2half(num / denom);
+#else
+  float res = float(float16(a)) / float(float16(b));
+  return half(float16(res));
+#endif
 }
 
-DEVICE inline half operator-(const half& a) { return __hneg(a); }
+DEVICE inline half operator-(const half& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hneg(a);
+#else
+  float res = -float(float16(a));
+  return half(float16(res));
+#endif
+}
 
 DEVICE inline half& operator+=(half& a, const half& b) {
   a = a + b;
@@ -396,99 +425,57 @@ DEVICE inline half& operator/=(half& a, const half& b) {
 }
 
 DEVICE inline bool operator==(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __heq(a, b);
+#else
+  return float(float16(a)) == float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator!=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hne(a, b);
+#else
+  return float(float16(a)) != float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator<(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hlt(a, b);
+#else
+  return float(float16(a)) < float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator<=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hle(a, b);
+#else
+  return float(float16(a)) <= float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator>(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hgt(a, b);
+#else
+  return float(float16(a)) > float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator>=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hge(a, b);
+#else
+  return float(float16(a)) >= float(float16(b));
+#endif
 }
 
-/*
-DEVICE inline float16 operator+(const float16& a, const float16& b) {
-  return float16(__hadd(half(a), half(b)));
-}
-DEVICE inline float16 operator-(const float16& a, const float16& b) {
-  return float16(__hsub(half(a), half(b)));
-}
-DEVICE inline float16 operator*(const float16& a, const float16& b) {
-  return float16(__hmul(half(a), half(b)));
-}
-DEVICE inline float16 operator/(const float16& a, const float16& b) {
-  float num = __half2float(half(a));
-  float denom = __half2float(half(b));
-  return float16(num / denom);
-}
-DEVICE inline float16 operator-(const float16& a) {
-  return float16(__hneg(half(a)));
-}
-DEVICE inline float16& operator+=(float16& a, const float16& b) {
-  a = a + b;
-  return a;
-}
-DEVICE inline float16& operator-=(float16& a, const float16& b) {
-  a = a - b;
-  return a;
-}
-DEVICE inline float16& operator*=(float16& a, const float16& b) {
-  a = a * b;
-  return a;
-}
-DEVICE inline float16& operator/=(float16& a, const float16& b) {
-  a = a / b;
-  return a;
-}
-DEVICE inline bool operator==(const float16& a, const float16& b) {
-  return __heq(half(a), half(b));
-}
-DEVICE inline bool operator!=(const float16& a, const float16& b) {
-  return __hne(half(a), half(b));
-}
-DEVICE inline bool operator<(const float16& a, const float16& b) {
-  return __hlt(half(a), half(b));
-}
-DEVICE inline bool operator<=(const float16& a, const float16& b) {
-  return __hle(half(a), half(b));
-}
-DEVICE inline bool operator>(const float16& a, const float16& b) {
-  return __hgt(half(a), half(b));
-}
-DEVICE inline bool operator>=(const float16& a, const float16& b) {
-  return __hge(half(a), half(b));
-}
-*/
+#endif  // PADDLE_CUDA_FP16
 
 // Arithmetic operators on ARMv8.2-A CPU
-#elif defined(PADDLE_WITH_NATIVE_FP16)
+#if defined(PADDLE_WITH_NATIVE_FP16)
 HOST inline float16 operator+(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
@@ -681,88 +668,6 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-/*
-HOST inline float16 operator+(const float16& a, const float16& b) {
-  return float16(vaddh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline float16 operator-(const float16& a, const float16& b) {
-  return float16(vsubh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline float16 operator*(const float16& a, const float16& b) {
-  return float16(vmulh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline float16 operator/(const float16& a, const float16& b) {
-  return float16(vdivh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline float16 operator-(const float16& a) {
-  return float16(vnegh_f16(float16_t(a)));
-}
-HOST inline float16& operator+=(float16& a, const float16& b) {
-  a = a + b;
-  return a;
-}
-HOST inline float16& operator-=(float16& a, const float16& b) {
-  a = a - b;
-  return a;
-}
-HOST inline float16& operator*=(float16& a, const float16& b) {
-  a = a * b;
-  return a;
-}
-HOST inline float16& operator/=(float16& a, const float16& b) {
-  a = a / b;
-  return a;
-}
-HOST inline bool operator==(const float16& a, const float16& b) {
-  return static_cast<bool>(vceqh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline bool operator!=(const float16& a, const float16& b) {
-  return !(a == b);
-}
-HOST inline bool operator<(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vclth_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) < float(b);
-#endif  // PADDLE_NEON_64
-}
-HOST inline bool operator<=(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vcleh_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) <= float(b);
-#endif  // PADDLE_NEON_64
-}
-HOST inline bool operator>(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vcgth_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) > float(b);
-#endif  // PADDLE_NEON_64
-}
-HOST inline bool operator>=(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vcgeh_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) >= float(b);
-#endif  // PADDLE_NEON_64
-}
-*/
 
 // Arithmetic operators, software emulated on other CPU
 #else
 HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
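The pattern introduced in the arithmetic hunks above is worth spelling out: __CUDA_ARCH__ is only defined during NVCC's device compilation passes, so the old file-scope guard (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) removed these operators entirely from the host pass and from lower-architecture passes. The new code keeps the operators visible whenever CUDA_VERSION < 9000 and checks the architecture inside each body, falling back to float arithmetic. A minimal standalone sketch of the same idiom, with hypothetical names rather than code from this commit:

#include <cuda_fp16.h>

// Hypothetical helper illustrating the per-body __CUDA_ARCH__ guard:
// native fp16 math on sm_53 and newer, fp32 emulation everywhere else.
__device__ inline half half_add(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return __hadd(a, b);                                     // native half add
#else
  return __float2half(__half2float(a) + __half2float(b));  // emulate in float
#endif
}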
paddle/math/tests/test_float16.cpp

@@ -54,14 +54,6 @@ TEST(float16, conversion_cpu) {
   EXPECT_EQ(float16(true).x, 0x3c00);
   EXPECT_EQ(float16(false).x, 0x0000);
 
-  // Implicit conversion to and from Eigen::half
-  /*
-  Eigen::half tmp = float16(1.0f);
-  float16 v_conv = tmp;
-  EXPECT_EQ(tmp.x, 0x3c00);
-  EXPECT_EQ(v_conv.x, 0x3c00);
-  */
-
   // Default constructor
   float16 v_def;
   EXPECT_EQ(v_def.x, 0x0000);
paddle/math/tests/test_float16.cu

@@ -16,40 +16,37 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 
 #define ARITHMETIC_KERNEL(op_type, sign) \
-  __global__ void op_type( \
-      const float16* in1, const float16* in2, float16* out) { \
+  __global__ void op_type(const half* in1, const half* in2, half* out) { \
     out[0] = in1[0] sign in2[0]; \
   }
 
 #define COMPOUND_KERNEL(op_type, sign) \
-  __global__ void op_type(float16* in1, const float16* in2) { \
-    in1[0] sign in2[0]; \
-  }
+  __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; }
 
 #define COMPARISON_KERNEL(op_type, sign) \
-  __global__ void op_type(const float16* in1, const float16* in2, bool* out) { \
+  __global__ void op_type(const half* in1, const half* in2, bool* out) { \
     out[0] = in1[0] sign in2[0]; \
   }
 
 #define ARITHMETIC_KERNEL_LAUNCH(op_type) \
   void Test##op_type(float v_in1, float v_in2, float v_out) { \
     LOG(INFO) << "Test " << #op_type << " on GPU!"; \
-    float16 *in1, *in2, *out; \
-    float16 *d_in1, *d_in2, *d_out; \
-    int size = sizeof(float16); \
+    half *in1, *in2, *out; \
+    half *d_in1, *d_in2, *d_out; \
+    int size = sizeof(half); \
     cudaMalloc((void**)&d_in1, size); \
    cudaMalloc((void**)&d_in2, size); \
    cudaMalloc((void**)&d_out, size); \
-    in1 = (float16*)malloc(size); \
-    in2 = (float16*)malloc(size); \
-    out = (float16*)malloc(size); \
-    in1[0] = float16(v_in1); \
-    in2[0] = float16(v_in2); \
+    in1 = (half*)malloc(size); \
+    in2 = (half*)malloc(size); \
+    out = (half*)malloc(size); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
     cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
     op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
     cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \
-    EXPECT_EQ(float(out[0]), v_out); \
+    EXPECT_EQ(float(float16(out[0])), v_out); \
     free(in1); \
     free(in2); \
     free(out); \
@@ -61,20 +58,20 @@ limitations under the License. */
 #define COMPOUND_KERNEL_LAUNCH(op_type) \
   void Test##op_type(float v_in1, float v_in2, float v_out) { \
     LOG(INFO) << "Test " << #op_type << " on GPU!"; \
-    float16 *in1, *in2; \
-    float16 *d_in1, *d_in2; \
-    int size = sizeof(float16); \
+    half *in1, *in2; \
+    half *d_in1, *d_in2; \
+    int size = sizeof(half); \
     cudaMalloc((void**)&d_in1, size); \
     cudaMalloc((void**)&d_in2, size); \
-    in1 = (float16*)malloc(size); \
-    in2 = (float16*)malloc(size); \
-    in1[0] = float16(v_in1); \
-    in2[0] = float16(v_in2); \
+    in1 = (half*)malloc(size); \
+    in2 = (half*)malloc(size); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
     cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
     op_type<<<1, 1>>>(d_in1, d_in2); \
     cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \
-    EXPECT_EQ(float(in1[0]), v_out); \
+    EXPECT_EQ(float(float16(in1[0])), v_out); \
     free(in1); \
     free(in2); \
     cudaFree(d_in1); \
@@ -84,18 +81,18 @@ limitations under the License. */
 #define COMPARISON_KERNEL_LAUNCH(op_type) \
   void Test##op_type(float v_in1, float v_in2, bool v_out) { \
     LOG(INFO) << "Test " << #op_type << " on GPU!"; \
-    float16 *in1, *in2; \
-    float16 *d_in1, *d_in2; \
+    half *in1, *in2; \
+    half *d_in1, *d_in2; \
     bool *out, *d_out; \
-    int size = sizeof(float16); \
+    int size = sizeof(half); \
     cudaMalloc((void**)&d_in1, size); \
     cudaMalloc((void**)&d_in2, size); \
     cudaMalloc((void**)&d_out, 1); \
-    in1 = (float16*)malloc(size); \
-    in2 = (float16*)malloc(size); \
+    in1 = (half*)malloc(size); \
+    in2 = (half*)malloc(size); \
     out = (bool*)malloc(1); \
-    in1[0] = float16(v_in1); \
-    in2[0] = float16(v_in2); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
     cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
     op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
@@ -112,6 +109,7 @@ limitations under the License. */
 #ifdef PADDLE_CUDA_FP16
 namespace paddle {
 
+#if CUDA_VERSION < 9000
 ARITHMETIC_KERNEL(Add, +)
 ARITHMETIC_KERNEL(Sub, -)
 ARITHMETIC_KERNEL(Mul, *)
@@ -123,19 +121,19 @@ ARITHMETIC_KERNEL_LAUNCH(Mul)
 ARITHMETIC_KERNEL_LAUNCH(Div)
 
 // Negative sign kernel
-__global__ void Neg(float16* in) { in[0] = -in[0]; }
+__global__ void Neg(half* in) { in[0] = -in[0]; }
 
 void TestNeg(float v_in, float v_out) {
   LOG(INFO) << "Test Neg on GPU!";
-  float16 *in, *d_in;
-  int size = sizeof(float16);
+  half *in, *d_in;
+  int size = sizeof(half);
   cudaMalloc((void**)&d_in, size);
-  in = (float16*)malloc(size);
-  in[0] = float16(v_in);
+  in = (half*)malloc(size);
+  in[0] = half(float16(v_in));
   cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
   Neg<<<1, 1>>>(d_in);
   cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
-  EXPECT_EQ(float(in[0]), v_out);
+  EXPECT_EQ(float(float16(in[0])), v_out);
   free(in);
   cudaFree(d_in);
 }
@@ -193,6 +191,7 @@ TEST(float16, comparision_on_gpu) {
   TestGreaterEqual(4, 4, true);
   TestGreaterEqual(4, 5, false);
 }
+#endif  // CUDA_VERSION
 
 TEST(float16, conversion_on_gpu) {
   // Explicit conversion to and from cuda half
@@ -204,16 +203,11 @@ TEST(float16, conversion_on_gpu) {
   EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff);
   EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00);
 
-  // Implicit conversion to and from cuda half
-  half tmp = float16(1.0f);
-  float16 val = tmp;
-  EXPECT_EQ(val.x, 0x3c00);
-
   // Assignment operator
   float16 v_assign;
-  v_assign = tmp;
+  v_assign = half(float16(1.0f));
   EXPECT_EQ(v_assign.x, 0x3c00);
 }
 
 }  // namespace paddle
 
 #endif  // PADDLE_CUDA_FP16
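For readers not used to the macro-generated launchers, each Test##op_type follows the same host-staging pattern: allocate one element on host and device, copy the inputs in, run a <<<1, 1>>> kernel, copy the result back, and compare. A stripped-down standalone version using plain float, so the sketch does not depend on the float16/half conversions above, might look like this:

#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>

// Mirrors ARITHMETIC_KERNEL(Add, +) but on plain float, so the sketch
// stands alone without paddle's float16/half plumbing.
__global__ void AddOne(const float* in1, const float* in2, float* out) {
  out[0] = in1[0] + in2[0];
}

int main() {
  float h_in1 = 1.0f, h_in2 = 2.0f, h_out = 0.0f;
  float *d_in1, *d_in2, *d_out;
  int size = sizeof(float);
  cudaMalloc((void**)&d_in1, size);
  cudaMalloc((void**)&d_in2, size);
  cudaMalloc((void**)&d_out, size);
  cudaMemcpy(d_in1, &h_in1, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, &h_in2, size, cudaMemcpyHostToDevice);
  AddOne<<<1, 1>>>(d_in1, d_in2, d_out);
  cudaMemcpy(&h_out, d_out, size, cudaMemcpyDeviceToHost);
  assert(h_out == 3.0f);  // the real test uses EXPECT_EQ on the converted half
  printf("1 + 2 = %f\n", h_out);
  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_out);
  return 0;
}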