Commit 41bd1f91 authored by Kexin Zhao

fix gpu test, clean code and add cmake

Parent a5feb771
@@ -56,6 +56,7 @@ option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
 option(GLIDE_INSTALL "Download and install go dependencies " ON)
 option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
 option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
+option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
......
@@ -24,6 +24,11 @@ if(WITH_DOUBLE)
     add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)
+if(WITH_ARM_FP16)
+    add_definitions(-DPADDLE_ARM_FP16)
+    add_definitions("-march=armv8.2-a+fp16+simd")
+endif(WITH_ARM_FP16)
 if(WITH_TESTING)
     add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
......
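For context on the two CMake hunks above (the example below is not part of the commit): enabling WITH_ARM_FP16 defines PADDLE_ARM_FP16 and compiles with -march=armv8.2-a+fp16+simd, which is what exposes the native __fp16 type used later in float16.h. A minimal sketch of code guarded the same way, assuming an aarch64 toolchain with the ARMv8.2-A FP16 extension; the function name is illustrative:

// Hypothetical standalone sketch, not part of this commit.
// Assumed build flags: -DPADDLE_ARM_FP16 -march=armv8.2-a+fp16+simd on aarch64.
#include <cstdio>

#ifdef PADDLE_ARM_FP16
// __fp16 is ARM's half-precision storage type: values occupy 16 bits, while
// arithmetic promotes to float; +fp16 also enables hardware float<->half
// conversion instructions.
float half_sum(float a, float b) {
  __fp16 ha = static_cast<__fp16>(a);
  __fp16 hb = static_cast<__fp16>(b);
  return static_cast<float>(ha + hb);
}
#else
// Fallback for toolchains or targets without the ARM FP16 extension.
float half_sum(float a, float b) { return a + b; }
#endif

int main() {
  std::printf("%f\n", half_sum(1.5f, 2.25f));  // prints 3.750000 either way
  return 0;
}

Configuring with cmake -DWITH_ARM_FP16=ON (assuming the usual out-of-tree build) is what would switch this path on.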
@@ -14,7 +14,7 @@ limitations under the License. */
 #pragma once
-#include <cstdint>
+#include <stdint.h>
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
@@ -71,6 +71,7 @@ struct PADDLE_ALIGN(2) float16 {
 public:
   uint16_t x;
+  // Constructors
   HOSTDEVICE inline float16() : x(0) {}
   HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
@@ -89,8 +90,7 @@ public:
 #ifdef PADDLE_WITH_NATIVE_FP16
   // __fp16 is a native half precision data type for arm cpu,
-  // float16_t is an alias for __fp16 in arm_fp16.h,
-  // which is included in arm_neon.h.
+  // float16_t is an alias for __fp16
   HOSTDEVICE inline explicit float16(const float16_t& h) {
     x = *reinterpret_cast<const uint16_t*>(&h);
   }
@@ -141,6 +141,7 @@ public:
     return *this;
   }
+  // Assignment operators
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline float16& operator=(const half& rhs) {
 #if CUDA_VERSION >= 9000
@@ -219,6 +220,7 @@ public:
     return *this;
   }
+  // Conversion opertors
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline explicit operator half() const {
 #if CUDA_VERSION >= 9000
@@ -353,27 +355,54 @@ private:
 // CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are
 // for users to write similar CUDA code in CUDA 7.5 and 8.0 as in
 // CUDA 9.0 regarding the half data type.
-#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && \
-    __CUDA_ARCH__ >= 530 && CUDA_VERSION < 9000
+#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
 DEVICE inline half operator+(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hadd(a, b);
+#else
+  float res = float(float16(a)) + float(float16(b));
+  return half(float16(res));
+#endif
 }
 DEVICE inline half operator-(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hsub(a, b);
+#else
+  float res = float(float16(a)) - float(float16(b));
+  return half(float16(res));
+#endif
 }
 DEVICE inline half operator*(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hmul(a, b);
+#else
+  float res = float(float16(a)) * float(float16(b));
+  return half(float16(res));
+#endif
 }
 DEVICE inline half operator/(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   float num = __half2float(a);
   float denom = __half2float(b);
   return __float2half(num / denom);
+#else
+  float res = float(float16(a)) / float(float16(b));
+  return half(float16(res));
+#endif
 }
-DEVICE inline half operator-(const half& a) { return __hneg(a); }
+DEVICE inline half operator-(const half& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hneg(a);
+#else
+  float res = -float(float16(a));
+  return half(float16(res));
+#endif
+}
 DEVICE inline half& operator+=(half& a, const half& b) {
   a = a + b;
@@ -396,99 +425,57 @@ DEVICE inline half& operator/=(half& a, const half& b) {
 }
 DEVICE inline bool operator==(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __heq(a, b);
+#else
+  return float(float16(a)) == float(float16(b));
+#endif
 }
 DEVICE inline bool operator!=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hne(a, b);
+#else
+  return float(float16(a)) != float(float16(b));
+#endif
 }
 DEVICE inline bool operator<(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hlt(a, b);
+#else
+  return float(float16(a)) < float(float16(b));
+#endif
 }
 DEVICE inline bool operator<=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hle(a, b);
+#else
+  return float(float16(a)) <= float(float16(b));
+#endif
 }
 DEVICE inline bool operator>(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hgt(a, b);
+#else
+  return float(float16(a)) > float(float16(b));
+#endif
 }
 DEVICE inline bool operator>=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hge(a, b);
+#else
+  return float(float16(a)) >= float(float16(b));
+#endif
 }
-/*
-DEVICE inline float16 operator+(const float16& a, const float16& b) {
-  return float16(__hadd(half(a), half(b)));
-}
-DEVICE inline float16 operator-(const float16& a, const float16& b) {
-  return float16(__hsub(half(a), half(b)));
-}
-DEVICE inline float16 operator*(const float16& a, const float16& b) {
-  return float16(__hmul(half(a), half(b)));
-}
-DEVICE inline float16 operator/(const float16& a, const float16& b) {
-  float num = __half2float(half(a));
-  float denom = __half2float(half(b));
-  return float16(num / denom);
-}
-DEVICE inline float16 operator-(const float16& a) {
-  return float16(__hneg(half(a)));
-}
-DEVICE inline float16& operator+=(float16& a, const float16& b) {
-  a = a + b;
-  return a;
-}
-DEVICE inline float16& operator-=(float16& a, const float16& b) {
-  a = a - b;
-  return a;
-}
-DEVICE inline float16& operator*=(float16& a, const float16& b) {
-  a = a * b;
-  return a;
-}
-DEVICE inline float16& operator/=(float16& a, const float16& b) {
-  a = a / b;
-  return a;
-}
-DEVICE inline bool operator==(const float16& a, const float16& b) {
-  return __heq(half(a), half(b));
-}
-DEVICE inline bool operator!=(const float16& a, const float16& b) {
-  return __hne(half(a), half(b));
-}
-DEVICE inline bool operator<(const float16& a, const float16& b) {
-  return __hlt(half(a), half(b));
-}
-DEVICE inline bool operator<=(const float16& a, const float16& b) {
-  return __hle(half(a), half(b));
-}
-DEVICE inline bool operator>(const float16& a, const float16& b) {
-  return __hgt(half(a), half(b));
-}
-DEVICE inline bool operator>=(const float16& a, const float16& b) {
-  return __hge(half(a), half(b));
-}
-*/
+#endif // PADDLE_CUDA_FP16
 // Arithmetic operators on ARMv8.2-A CPU
-#elif defined(PADDLE_WITH_NATIVE_FP16)
+#if defined(PADDLE_WITH_NATIVE_FP16)
 HOST inline float16 operator+(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
@@ -681,88 +668,6 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
-/*
-HOST inline float16 operator+(const float16& a, const float16& b) {
-  return float16(vaddh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline float16 operator-(const float16& a, const float16& b) {
-  return float16(vsubh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline float16 operator*(const float16& a, const float16& b) {
-  return float16(vmulh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline float16 operator/(const float16& a, const float16& b) {
-  return float16(vdivh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline float16 operator-(const float16& a) {
-  return float16(vnegh_f16(float16_t(a)));
-}
-HOST inline float16& operator+=(float16& a, const float16& b) {
-  a = a + b;
-  return a;
-}
-HOST inline float16& operator-=(float16& a, const float16& b) {
-  a = a - b;
-  return a;
-}
-HOST inline float16& operator*=(float16& a, const float16& b) {
-  a = a * b;
-  return a;
-}
-HOST inline float16& operator/=(float16& a, const float16& b) {
-  a = a / b;
-  return a;
-}
-HOST inline bool operator==(const float16& a, const float16& b) {
-  return static_cast<bool>(vceqh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline bool operator!=(const float16& a, const float16& b) {
-  return !(a == b);
-}
-HOST inline bool operator<(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vclth_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) < float(b);
-#endif // PADDLE_NEON_64
-}
-HOST inline bool operator<=(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vcleh_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) <= float(b);
-#endif // PADDLE_NEON_64
-}
-HOST inline bool operator>(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vcgth_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) > float(b);
-#endif // PADDLE_NEON_64
-}
-HOST inline bool operator>=(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vcgeh_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) >= float(b);
-#endif // PADDLE_NEON_64
-}
-*/
 // Arithmetic operators, software emulated on other CPU
 #else
 HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
......
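To make the intent of the reworked guards in float16.h concrete (the sketch below is not from the commit): with CUDA older than 9.0, the operator overloads above let device code use ordinary arithmetic and comparison syntax on half, and on architectures below sm_53 they fall back to float arithmetic via float16. A minimal kernel relying on that, with an assumed include path and illustrative names:

// Hypothetical CUDA sketch; "scale_half" and the include path are assumptions.
#include <cuda_fp16.h>
#include "paddle/math/float16.h"  // assumed location of the overloads above

// Ordinary operator syntax on half works in device code: CUDA 9.0+ uses the
// built-in half operators, while CUDA 7.5/8.0 resolves to the overloads in
// float16.h, which fall back to float math below sm_53.
__global__ void scale_half(const half* in, half factor, half* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = in[i] * factor;
  }
}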
@@ -54,14 +54,6 @@ TEST(float16, conversion_cpu) {
   EXPECT_EQ(float16(true).x, 0x3c00);
   EXPECT_EQ(float16(false).x, 0x0000);
-  // Implicit conversion to and from Eigen::half
-  /*
-  Eigen::half tmp = float16(1.0f);
-  float16 v_conv = tmp;
-  EXPECT_EQ(tmp.x, 0x3c00);
-  EXPECT_EQ(v_conv.x, 0x3c00);
-  */
   // Default constructor
   float16 v_def;
   EXPECT_EQ(v_def.x, 0x0000);
......
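A side note on the constants these tests check (the snippet below is illustrative, not part of the commit): 0x3c00 is the IEEE 754 binary16 encoding of 1.0 (sign 0, biased exponent 15, zero mantissa), and 0x0000 encodes 0.0. A small standalone check that assembles those patterns by hand; the helper name is hypothetical:

// Hypothetical helper, for illustration only.
#include <cassert>
#include <cstdint>

// Assemble a binary16 bit pattern from sign, biased exponent, and mantissa.
static uint16_t make_half(uint16_t sign, uint16_t exponent, uint16_t mantissa) {
  return static_cast<uint16_t>((sign << 15) | (exponent << 10) | mantissa);
}

int main() {
  // 1.0 = (-1)^0 * 2^(15-15) * 1.0 -> sign 0, exponent 15, mantissa 0.
  assert(make_half(0, 15, 0) == 0x3c00);
  // 0.0 -> all bits zero, matching EXPECT_EQ(float16(false).x, 0x0000).
  assert(make_half(0, 0, 0) == 0x0000);
  return 0;
}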
@@ -16,40 +16,37 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 #define ARITHMETIC_KERNEL(op_type, sign) \
-  __global__ void op_type( \
-      const float16* in1, const float16* in2, float16* out) { \
+  __global__ void op_type(const half* in1, const half* in2, half* out) { \
     out[0] = in1[0] sign in2[0]; \
   }
 #define COMPOUND_KERNEL(op_type, sign) \
-  __global__ void op_type(float16* in1, const float16* in2) { \
-    in1[0] sign in2[0]; \
-  }
+  __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; }
 #define COMPARISON_KERNEL(op_type, sign) \
-  __global__ void op_type(const float16* in1, const float16* in2, bool* out) { \
+  __global__ void op_type(const half* in1, const half* in2, bool* out) { \
     out[0] = in1[0] sign in2[0]; \
   }
 #define ARITHMETIC_KERNEL_LAUNCH(op_type) \
   void Test##op_type(float v_in1, float v_in2, float v_out) { \
     LOG(INFO) << "Test " << #op_type << " on GPU!"; \
-    float16 *in1, *in2, *out; \
-    float16 *d_in1, *d_in2, *d_out; \
-    int size = sizeof(float16); \
+    half *in1, *in2, *out; \
+    half *d_in1, *d_in2, *d_out; \
+    int size = sizeof(half); \
     cudaMalloc((void**)&d_in1, size); \
     cudaMalloc((void**)&d_in2, size); \
     cudaMalloc((void**)&d_out, size); \
-    in1 = (float16*)malloc(size); \
-    in2 = (float16*)malloc(size); \
-    out = (float16*)malloc(size); \
-    in1[0] = float16(v_in1); \
-    in2[0] = float16(v_in2); \
+    in1 = (half*)malloc(size); \
+    in2 = (half*)malloc(size); \
+    out = (half*)malloc(size); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
     cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
     op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
     cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \
-    EXPECT_EQ(float(out[0]), v_out); \
+    EXPECT_EQ(float(float16(out[0])), v_out); \
     free(in1); \
     free(in2); \
     free(out); \
@@ -61,20 +58,20 @@ limitations under the License. */
 #define COMPOUND_KERNEL_LAUNCH(op_type) \
   void Test##op_type(float v_in1, float v_in2, float v_out) { \
     LOG(INFO) << "Test " << #op_type << " on GPU!"; \
-    float16 *in1, *in2; \
-    float16 *d_in1, *d_in2; \
-    int size = sizeof(float16); \
+    half *in1, *in2; \
+    half *d_in1, *d_in2; \
+    int size = sizeof(half); \
     cudaMalloc((void**)&d_in1, size); \
     cudaMalloc((void**)&d_in2, size); \
-    in1 = (float16*)malloc(size); \
-    in2 = (float16*)malloc(size); \
-    in1[0] = float16(v_in1); \
-    in2[0] = float16(v_in2); \
+    in1 = (half*)malloc(size); \
+    in2 = (half*)malloc(size); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
     cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
     op_type<<<1, 1>>>(d_in1, d_in2); \
     cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \
-    EXPECT_EQ(float(in1[0]), v_out); \
+    EXPECT_EQ(float(float16(in1[0])), v_out); \
     free(in1); \
     free(in2); \
     cudaFree(d_in1); \
@@ -84,18 +81,18 @@ limitations under the License. */
 #define COMPARISON_KERNEL_LAUNCH(op_type) \
   void Test##op_type(float v_in1, float v_in2, bool v_out) { \
     LOG(INFO) << "Test " << #op_type << " on GPU!"; \
-    float16 *in1, *in2; \
-    float16 *d_in1, *d_in2; \
+    half *in1, *in2; \
+    half *d_in1, *d_in2; \
     bool *out, *d_out; \
-    int size = sizeof(float16); \
+    int size = sizeof(half); \
     cudaMalloc((void**)&d_in1, size); \
     cudaMalloc((void**)&d_in2, size); \
     cudaMalloc((void**)&d_out, 1); \
-    in1 = (float16*)malloc(size); \
-    in2 = (float16*)malloc(size); \
+    in1 = (half*)malloc(size); \
+    in2 = (half*)malloc(size); \
     out = (bool*)malloc(1); \
-    in1[0] = float16(v_in1); \
-    in2[0] = float16(v_in2); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
     cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
     op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
@@ -112,6 +109,7 @@ limitations under the License. */
 #ifdef PADDLE_CUDA_FP16
 namespace paddle {
+#if CUDA_VERSION < 9000
 ARITHMETIC_KERNEL(Add, +)
 ARITHMETIC_KERNEL(Sub, -)
 ARITHMETIC_KERNEL(Mul, *)
@@ -123,19 +121,19 @@ ARITHMETIC_KERNEL_LAUNCH(Mul)
 ARITHMETIC_KERNEL_LAUNCH(Div)
 // Negative sign kernel
-__global__ void Neg(float16* in) { in[0] = -in[0]; }
+__global__ void Neg(half* in) { in[0] = -in[0]; }
 void TestNeg(float v_in, float v_out) {
   LOG(INFO) << "Test Neg on GPU!";
-  float16 *in, *d_in;
-  int size = sizeof(float16);
+  half *in, *d_in;
+  int size = sizeof(half);
   cudaMalloc((void**)&d_in, size);
-  in = (float16*)malloc(size);
-  in[0] = float16(v_in);
+  in = (half*)malloc(size);
+  in[0] = half(float16(v_in));
   cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
   Neg<<<1, 1>>>(d_in);
   cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
-  EXPECT_EQ(float(in[0]), v_out);
+  EXPECT_EQ(float(float16(in[0])), v_out);
   free(in);
   cudaFree(d_in);
 }
@@ -193,6 +191,7 @@ TEST(float16, comparision_on_gpu) {
   TestGreaterEqual(4, 4, true);
   TestGreaterEqual(4, 5, false);
 }
+#endif // CUDA_VERSION
 TEST(float16, conversion_on_gpu) {
   // Explicit conversion to and from cuda half
@@ -204,16 +203,11 @@ TEST(float16, conversion_on_gpu) {
   EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff);
   EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00);
-  // Implicit conversion to and from cuda half
-  half tmp = float16(1.0f);
-  float16 val = tmp;
-  EXPECT_EQ(val.x, 0x3c00);
   // Assignment operator
   float16 v_assign;
-  v_assign = tmp;
+  v_assign = half(float16(1.0f));
   EXPECT_EQ(v_assign.x, 0x3c00);
 }
 } // namespace paddle
-#endif
+#endif // PADDLE_CUDA_FP16
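Finally, a note on the conversion pattern the updated GPU test uses, e.g. in1[0] = half(float16(v_in1)) and float(float16(out[0])) (the sketch below is illustrative, not part of the commit): host code avoids arithmetic on half directly and instead moves between float, float16, and half, where the float16/half conversions simply reinterpret the same 16 bits. A minimal host-side round trip, assuming the header path used in this tree:

// Hypothetical host-side sketch; the include path is an assumption.
#include <cuda_fp16.h>
#include "paddle/math/float16.h"

using paddle::float16;

int main() {
  float16 fp = float16(1.0f);      // bit pattern 0x3c00
  half h = half(fp);               // same 16 bits, viewed as a CUDA half
  float back = float(float16(h));  // convert back on the host via float16
  return back == 1.0f ? 0 : 1;     // 1.0 round-trips exactly through half
}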