Commit 41bd1f91 authored by Kexin Zhao

fix gpu test, clean code and add cmake

Parent a5feb771
......@@ -56,6 +56,7 @@ option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......
......@@ -24,6 +24,11 @@ if(WITH_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE)
endif(WITH_DOUBLE)
if(WITH_ARM_FP16)
add_definitions(-DPADDLE_ARM_FP16)
add_definitions("-march=armv8.2-a+fp16+simd")
endif(WITH_ARM_FP16)
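# Illustrative usage (hypothetical invocation, not part of this commit):
#   cmake .. -DWITH_ARM_FP16=ON
# which adds -DPADDLE_ARM_FP16 and -march=armv8.2-a+fp16+simd to the compile flags.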
if(WITH_TESTING)
add_definitions(-DPADDLE_WITH_TESTING)
endif(WITH_TESTING)
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#include <stdint.h>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
......@@ -71,6 +71,7 @@ struct PADDLE_ALIGN(2) float16 {
public:
uint16_t x;
// Constructors
HOSTDEVICE inline float16() : x(0) {}
HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
......@@ -89,8 +90,7 @@ public:
#ifdef PADDLE_WITH_NATIVE_FP16
// __fp16 is a native half precision data type for arm cpu,
// float16_t is an alias for __fp16
HOSTDEVICE inline explicit float16(const float16_t& h) {
x = *reinterpret_cast<const uint16_t*>(&h);
}
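// Illustration (not part of this commit): because the constructor above
// reinterprets the fp16 bit pattern, float16(float16_t(1.0f)).x == 0x3c00,
// the IEEE half-precision encoding of 1.0.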
......@@ -141,6 +141,7 @@ public:
return *this;
}
// Assignment operators
#ifdef PADDLE_CUDA_FP16
HOSTDEVICE inline float16& operator=(const half& rhs) {
#if CUDA_VERSION >= 9000
......@@ -219,6 +220,7 @@ public:
return *this;
}
// Conversion operators
#ifdef PADDLE_CUDA_FP16
HOSTDEVICE inline explicit operator half() const {
#if CUDA_VERSION >= 9000
......@@ -353,27 +355,54 @@ private:
// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are
// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in
// CUDA 9.0 regarding the half data type.
#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
DEVICE inline half operator+(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hadd(a, b);
#else
float res = float(float16(a)) + float(float16(b));
return half(float16(res));
#endif
}
DEVICE inline half operator-(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hsub(a, b);
#else
float res = float(float16(a)) - float(float16(b));
return half(float16(res));
#endif
}
DEVICE inline half operator*(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hmul(a, b);
#else
float res = float(float16(a)) * float(float16(b));
return half(float16(res));
#endif
}
DEVICE inline half operator/(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
float num = __half2float(a);
float denom = __half2float(b);
return __float2half(num / denom);
#else
float res = float(float16(a)) / float(float16(b));
return half(float16(res));
#endif
}
DEVICE inline half operator-(const half& a) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hneg(a);
#else
float res = -float(float16(a));
return half(float16(res));
#endif
}
DEVICE inline half& operator+=(half& a, const half& b) {
a = a + b;
......@@ -396,99 +425,57 @@ DEVICE inline half& operator/=(half& a, const half& b) {
}
DEVICE inline bool operator==(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __heq(a, b);
#else
return float(float16(a)) == float(float16(b));
#endif
}
DEVICE inline bool operator!=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hne(a, b);
#else
return float(float16(a)) != float(float16(b));
#endif
}
DEVICE inline bool operator<(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hlt(a, b);
#else
return float(float16(a)) < float(float16(b));
#endif
}
DEVICE inline bool operator<=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hle(a, b);
#else
return float(float16(a)) <= float(float16(b));
#endif
}
DEVICE inline bool operator>(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hgt(a, b);
#else
return float(float16(a)) > float(float16(b));
#endif
}
DEVICE inline bool operator>=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hge(a, b);
#else
return float(float16(a)) >= float(float16(b));
#endif
}
/*
DEVICE inline float16 operator+(const float16& a, const float16& b) {
return float16(__hadd(half(a), half(b)));
}
DEVICE inline float16 operator-(const float16& a, const float16& b) {
return float16(__hsub(half(a), half(b)));
}
DEVICE inline float16 operator*(const float16& a, const float16& b) {
return float16(__hmul(half(a), half(b)));
}
DEVICE inline float16 operator/(const float16& a, const float16& b) {
float num = __half2float(half(a));
float denom = __half2float(half(b));
return float16(num / denom);
}
DEVICE inline float16 operator-(const float16& a) {
return float16(__hneg(half(a)));
}
DEVICE inline float16& operator+=(float16& a, const float16& b) {
a = a + b;
return a;
}
DEVICE inline float16& operator-=(float16& a, const float16& b) {
a = a - b;
return a;
}
DEVICE inline float16& operator*=(float16& a, const float16& b) {
a = a * b;
return a;
}
DEVICE inline float16& operator/=(float16& a, const float16& b) {
a = a / b;
return a;
}
DEVICE inline bool operator==(const float16& a, const float16& b) {
return __heq(half(a), half(b));
}
DEVICE inline bool operator!=(const float16& a, const float16& b) {
return __hne(half(a), half(b));
}
DEVICE inline bool operator<(const float16& a, const float16& b) {
return __hlt(half(a), half(b));
}
DEVICE inline bool operator<=(const float16& a, const float16& b) {
return __hle(half(a), half(b));
}
DEVICE inline bool operator>(const float16& a, const float16& b) {
return __hgt(half(a), half(b));
}
DEVICE inline bool operator>=(const float16& a, const float16& b) {
return __hge(half(a), half(b));
}
*/
#endif // PADDLE_CUDA_FP16
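// Illustrative sketch (hypothetical, not part of this commit): with the
// overloads above, a .cu file that includes this header can use CUDA 9.0
// style half arithmetic while building with CUDA 7.5/8.0. The kernel name
// and signature below are invented for illustration only.
#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
__global__ void ScaleHalfExample(const half* x, half* y, half alpha, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Resolves to the operator* defined above; on devices with compute
    // capability below 5.3 it falls back to float arithmetic.
    y[i] = alpha * x[i];
  }
}
#endif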
// Arithmetic operators on ARMv8.2-A CPU
#if defined(PADDLE_WITH_NATIVE_FP16)
HOST inline float16 operator+(const float16& a, const float16& b) {
float16 res;
asm volatile(
......@@ -681,88 +668,6 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
return (res & 0xffff) != 0;
}
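// Illustration (not part of this commit): with PADDLE_WITH_NATIVE_FP16 defined,
// the overloads above give scalar fp16 arithmetic on the host, e.g.
//   float16 a(1.0f), b(2.0f);
//   float16 c = a + b;   // single fp16 add via the inline assembly above
//   bool lt = a < b;     // fp16 compare; true when the low 16 result bits are nonzero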
/*
HOST inline float16 operator+(const float16& a, const float16& b) {
return float16(vaddh_f16(float16_t(a), float16_t(b)));
}
HOST inline float16 operator-(const float16& a, const float16& b) {
return float16(vsubh_f16(float16_t(a), float16_t(b)));
}
HOST inline float16 operator*(const float16& a, const float16& b) {
return float16(vmulh_f16(float16_t(a), float16_t(b)));
}
HOST inline float16 operator/(const float16& a, const float16& b) {
return float16(vdivh_f16(float16_t(a), float16_t(b)));
}
HOST inline float16 operator-(const float16& a) {
return float16(vnegh_f16(float16_t(a)));
}
HOST inline float16& operator+=(float16& a, const float16& b) {
a = a + b;
return a;
}
HOST inline float16& operator-=(float16& a, const float16& b) {
a = a - b;
return a;
}
HOST inline float16& operator*=(float16& a, const float16& b) {
a = a * b;
return a;
}
HOST inline float16& operator/=(float16& a, const float16& b) {
a = a / b;
return a;
}
HOST inline bool operator==(const float16& a, const float16& b) {
return static_cast<bool>(vceqh_f16(float16_t(a), float16_t(b)));
}
HOST inline bool operator!=(const float16& a, const float16& b) {
return !(a == b);
}
HOST inline bool operator<(const float16& a, const float16& b) {
#ifdef PADDLE_NEON_64
return static_cast<bool>(vclth_f16(float16_t(a), float16_t(b)));
#else
return float(a) < float(b);
#endif // PADDLE_NEON_64
}
HOST inline bool operator<=(const float16& a, const float16& b) {
#ifdef PADDLE_NEON_64
return static_cast<bool>(vcleh_f16(float16_t(a), float16_t(b)));
#else
return float(a) <= float(b);
#endif // PADDLE_NEON_64
}
HOST inline bool operator>(const float16& a, const float16& b) {
#ifdef PADDLE_NEON_64
return static_cast<bool>(vcgth_f16(float16_t(a), float16_t(b)));
#else
return float(a) > float(b);
#endif // PADDLE_NEON_64
}
HOST inline bool operator>=(const float16& a, const float16& b) {
#ifdef PADDLE_NEON_64
return static_cast<bool>(vcgeh_f16(float16_t(a), float16_t(b)));
#else
return float(a) >= float(b);
#endif // PADDLE_NEON_64
}
*/
// Arithmetic operators, software emulated on other CPU
#else
HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
......
......@@ -54,14 +54,6 @@ TEST(float16, conversion_cpu) {
EXPECT_EQ(float16(true).x, 0x3c00);
EXPECT_EQ(float16(false).x, 0x0000);
// Implicit conversion to and from Eigen::half
/*
Eigen::half tmp = float16(1.0f);
float16 v_conv = tmp;
EXPECT_EQ(tmp.x, 0x3c00);
EXPECT_EQ(v_conv.x, 0x3c00);
*/
// Default constructor
float16 v_def;
EXPECT_EQ(v_def.x, 0x0000);
......
......@@ -16,40 +16,37 @@ limitations under the License. */
#include "paddle/utils/Logging.h"
#define ARITHMETIC_KERNEL(op_type, sign) \
__global__ void op_type(const half* in1, const half* in2, half* out) { \
out[0] = in1[0] sign in2[0]; \
}
#define COMPOUND_KERNEL(op_type, sign) \
__global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; }
#define COMPARISON_KERNEL(op_type, sign) \
__global__ void op_type(const half* in1, const half* in2, bool* out) { \
out[0] = in1[0] sign in2[0]; \
}
#define ARITHMETIC_KERNEL_LAUNCH(op_type) \
void Test##op_type(float v_in1, float v_in2, float v_out) { \
LOG(INFO) << "Test " << #op_type << " on GPU!"; \
half *in1, *in2, *out; \
half *d_in1, *d_in2, *d_out; \
int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \
cudaMalloc((void**)&d_in2, size); \
cudaMalloc((void**)&d_out, size); \
in1 = (half*)malloc(size); \
in2 = (half*)malloc(size); \
out = (half*)malloc(size); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \
EXPECT_EQ(float(float16(out[0])), v_out); \
free(in1); \
free(in2); \
free(out); \
......@@ -61,20 +58,20 @@ limitations under the License. */
#define COMPOUND_KERNEL_LAUNCH(op_type) \
void Test##op_type(float v_in1, float v_in2, float v_out) { \
LOG(INFO) << "Test " << #op_type << " on GPU!"; \
half *in1, *in2; \
half *d_in1, *d_in2; \
int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \
cudaMalloc((void**)&d_in2, size); \
in1 = (half*)malloc(size); \
in2 = (half*)malloc(size); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op_type<<<1, 1>>>(d_in1, d_in2); \
cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \
EXPECT_EQ(float(float16(in1[0])), v_out); \
free(in1); \
free(in2); \
cudaFree(d_in1); \
......@@ -84,18 +81,18 @@ limitations under the License. */
#define COMPARISON_KERNEL_LAUNCH(op_type) \
void Test##op_type(float v_in1, float v_in2, bool v_out) { \
LOG(INFO) << "Test " << #op_type << " on GPU!"; \
half *in1, *in2; \
half *d_in1, *d_in2; \
bool *out, *d_out; \
int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \
cudaMalloc((void**)&d_in2, size); \
cudaMalloc((void**)&d_out, 1); \
in1 = (half*)malloc(size); \
in2 = (half*)malloc(size); \
out = (bool*)malloc(1); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
......@@ -112,6 +109,7 @@ limitations under the License. */
#ifdef PADDLE_CUDA_FP16
namespace paddle {
#if CUDA_VERSION < 9000
ARITHMETIC_KERNEL(Add, +)
ARITHMETIC_KERNEL(Sub, -)
ARITHMETIC_KERNEL(Mul, *)
......@@ -123,19 +121,19 @@ ARITHMETIC_KERNEL_LAUNCH(Mul)
ARITHMETIC_KERNEL_LAUNCH(Div)
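// Editorial note (not part of this commit): ARITHMETIC_KERNEL(Add, +) above
// expands to a kernel
//   __global__ void Add(const half* in1, const half* in2, half* out)
// and ARITHMETIC_KERNEL_LAUNCH(Add) to a host helper
//   void TestAdd(float v_in1, float v_in2, float v_out)
// that copies the half inputs to the GPU, launches Add<<<1, 1>>>, copies the
// result back, and checks EXPECT_EQ(float(float16(out[0])), v_out),
// e.g. TestAdd(1, 2, 3).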
// Negative sign kernel
__global__ void Neg(half* in) { in[0] = -in[0]; }
void TestNeg(float v_in, float v_out) {
LOG(INFO) << "Test Neg on GPU!";
half *in, *d_in;
int size = sizeof(half);
cudaMalloc((void**)&d_in, size);
in = (half*)malloc(size);
in[0] = half(float16(v_in));
cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
Neg<<<1, 1>>>(d_in);
cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
EXPECT_EQ(float(float16(in[0])), v_out);
free(in);
cudaFree(d_in);
}
......@@ -193,6 +191,7 @@ TEST(float16, comparision_on_gpu) {
TestGreaterEqual(4, 4, true);
TestGreaterEqual(4, 5, false);
}
#endif // CUDA_VERSION
TEST(float16, conversion_on_gpu) {
// Explicit conversion to and from cuda half
......@@ -204,16 +203,11 @@ TEST(float16, conversion_on_gpu) {
EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff);
EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00);
// Assignment operator
float16 v_assign;
v_assign = half(float16(1.0f));
EXPECT_EQ(v_assign.x, 0x3c00);
}
} // namespace paddle
#endif  // PADDLE_CUDA_FP16