Unverified commit 6d3da458, authored by dzhwinter, committed by GitHub

Fix/float16 style (#12446)

* "rewrite the test case"

* "follow comment"
Parent 91fb0156
@@ -13,7 +13,6 @@
 // limitations under the License.
 #include <gtest/gtest.h>
-#include <bitset>
 #include <iostream>
 #include <random>
@@ -25,13 +24,13 @@
 using paddle::platform::PADDLE_CUDA_NUM_THREADS;
 using paddle::platform::float16;

-#define CUDA_ATOMIC_KERNEL(op, T)                                      \
-  __global__ void op##Kernel(const T* data_a, T* data_b, size_t num) { \
-    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;       \
-         i += blockDim.x * gridDim.x) {                                \
-      paddle::platform::CudaAtomic##op(&data_b[i], data_a[i]);         \
-    }                                                                  \
-  }
+template <typename T>
+__global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
+  }
+}

 template <typename T>
 struct AddFunctor {
@@ -39,80 +38,116 @@ struct AddFunctor {
 };

 template <typename T>
-struct SubFunctor {
-  T operator()(const T& a, const T& b) { return a - b; }
-};
-
-// NOTE(dzhwinter): the float16 add has small underflow/overflow
-// so we use EXPECT_NEAR to check the result.
-#define ARITHMETIC_KERNEL_LAUNCH(op, T)                                 \
-  void Test##T##op(size_t num) {                                        \
-    T *in1, *in2, *out;                                                 \
-    T *d_in1, *d_in2;                                                   \
-    size_t size = sizeof(T) * num;                                      \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);                 \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);                 \
-    in1 = reinterpret_cast<T*>(malloc(size));                           \
-    in2 = reinterpret_cast<T*>(malloc(size));                           \
-    out = reinterpret_cast<T*>(malloc(size));                           \
-    std::minstd_rand engine;                                            \
-    std::uniform_real_distribution<double> dist(0.0, 1.0);              \
-    for (size_t i = 0; i < num; ++i) {                                  \
-      in1[i] = static_cast<T>(dist(engine));                            \
-      in2[i] = static_cast<T>(dist(engine));                            \
-    }                                                                   \
-    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);               \
-    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);               \
-    op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);      \
-    cudaDeviceSynchronize();                                            \
-    cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);               \
-    cudaDeviceSynchronize();                                            \
-    for (size_t i = 0; i < num; ++i) {                                  \
-      EXPECT_NEAR(static_cast<float>(out[i]),                           \
-                  static_cast<float>(op##Functor<T>()(in1[i], in2[i])), \
-                  0.001);                                               \
-    }                                                                   \
-    free(in1);                                                          \
-    free(in2);                                                          \
-    free(out);                                                          \
-    cudaFree(d_in1);                                                    \
-    cudaFree(d_in2);                                                    \
-  }
-
-CUDA_ATOMIC_KERNEL(Add, float);
-CUDA_ATOMIC_KERNEL(Add, double);
-CUDA_ATOMIC_KERNEL(Add, float16);
-
-ARITHMETIC_KERNEL_LAUNCH(Add, float);
-ARITHMETIC_KERNEL_LAUNCH(Add, double);
-ARITHMETIC_KERNEL_LAUNCH(Add, float16);
-
-namespace paddle {
-namespace platform {
-USE_CUDA_ATOMIC(Sub, int);
-};
-};
-CUDA_ATOMIC_KERNEL(Sub, int);
-ARITHMETIC_KERNEL_LAUNCH(Sub, int);
+void TestCase(size_t num) {
+  T *in1, *in2, *out;
+  T *d_in1, *d_in2;
+  size_t size = sizeof(T) * num;
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+  in1 = reinterpret_cast<T*>(malloc(size));
+  in2 = reinterpret_cast<T*>(malloc(size));
+  out = reinterpret_cast<T*>(malloc(size));
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num; ++i) {
+    in1[i] = static_cast<T>(dist(engine));
+    in2[i] = static_cast<T>(dist(engine));
+  }
+  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
+  AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
+  cudaDeviceSynchronize();
+  cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  for (size_t i = 0; i < num; ++i) {
+    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // so we use EXPECT_NEAR to check the result.
+    EXPECT_NEAR(static_cast<float>(out[i]),
+                static_cast<float>(AddFunctor<T>()(in1[i], in2[i])), 0.001);
+  }
+  free(in1);
+  free(in2);
+  free(out);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}

 // cuda primitives
 TEST(CudaAtomic, Add) {
-  TestfloatAdd(static_cast<size_t>(10));
-  TestfloatAdd(static_cast<size_t>(1024 * 1024));
-  TestdoubleAdd(static_cast<size_t>(10));
-  TestdoubleAdd(static_cast<size_t>(1024 * 1024));
-}
-
-TEST(CudaAtomic, Sub) {
-  TestintSub(static_cast<size_t>(10));
-  TestintSub(static_cast<size_t>(1024 * 1024));
+  TestCase<float>(static_cast<size_t>(10));
+  TestCase<float>(static_cast<size_t>(1024 * 1024));
+
+  TestCase<double>(static_cast<size_t>(10));
+  TestCase<double>(static_cast<size_t>(1024 * 1024));
 }

 TEST(CudaAtomic, float16) {
-  using paddle::platform::float16;
-  Testfloat16Add(static_cast<size_t>(1));
-  Testfloat16Add(static_cast<size_t>(2));
-  Testfloat16Add(static_cast<size_t>(3));
-
-  Testfloat16Add(static_cast<size_t>(10));
-  Testfloat16Add(static_cast<size_t>(1024 * 1024));
+  TestCase<float16>(static_cast<size_t>(1));
+  TestCase<float16>(static_cast<size_t>(2));
+  TestCase<float16>(static_cast<size_t>(3));
+
+  TestCase<float16>(static_cast<size_t>(10));
+  TestCase<float16>(static_cast<size_t>(1024 * 1024));
+}
+
+// unalignment of uint8
+void TestUnalign(size_t num, const int shift_bit) {
+  PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2");
+  float16 *in1, *in2, *out;
+  float16 *d_in1, *d_in2;
+  size_t size = sizeof(uint8_t) * (num + shift_bit);
+  size_t array_size = sizeof(float16) * (num / 2);
+
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+  in1 = reinterpret_cast<float16*>(malloc(size));
+  in2 = reinterpret_cast<float16*>(malloc(size));
+  out = reinterpret_cast<float16*>(malloc(size));
+
+  // right shift 1, mimic the unalignment of address
+  float16* r_in1 =
+      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in1) + shift_bit);
+  float16* r_in2 =
+      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in2) + shift_bit);
+
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num / 2; ++i) {
+    r_in1[i] = static_cast<float16>(dist(engine));
+    r_in2[i] = static_cast<float16>(dist(engine));
+  }
+  cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
+  AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
+  cudaDeviceSynchronize();
+  cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  for (size_t i = 0; i < num / 2; ++i) {
+    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // so we use EXPECT_NEAR to check the result.
+    EXPECT_NEAR(static_cast<float>(out[i]),
+                static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
+                0.001);
+  }
+  free(in1);
+  free(in2);
+  free(out);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
+TEST(CudaAtomic, float16Unalign) {
+  // same with float16 testcase
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 2);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 2);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 2);
+
+  // shift the address.
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 1);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 1);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 1);
+
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 3);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
 }
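Two editorial notes on the new tests (added in this write-up, not part of the commit). First, the 0.001 tolerance used with EXPECT_NEAR is comfortable for operands drawn from [0, 1): a single rounding of a result below 2.0 to float16 introduces at most 2^-11 ≈ 4.9e-4 of error, so the device result and the host reference stay within the tolerance. Second, the low/high-half distinction that TestUnalign stresses is decided purely by bit 1 of an element's address, which is exactly the check performed by the atomic wrapper in the next file. The sketch below is plain host C++ with names of its own; it only demonstrates that for a float16-sized array starting on a 4-byte boundary, even indices fall in the low 16 bits and odd indices in the high 16 bits of their containing 32-bit word, so both branches of the wrapper are exercised.

#include <cstdint>
#include <cstdio>

int main() {
  // Four consecutive 16-bit slots standing in for a float16 array that starts
  // on a 4-byte boundary.
  alignas(4) std::uint16_t data[4] = {};
  for (int i = 0; i < 4; ++i) {
    std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(&data[i]);
    bool low_half = (addr & 0x02) == 0;  // same test as CUDA_ATOMIC_WRAPPER
    std::printf("element %d -> %s half of its 32-bit word\n", i,
                low_half ? "low" : "high");
  }
  return 0;
}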
@@ -79,41 +79,41 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
 // convert the value into float and do the add arithmetic.
 // then store the result into a uint32.
-inline __device__ uint32_t add_to_low_half(uint32_t val, float x) {
+inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) {
   float16 low_half;
   // the float16 in lower 16bits
-  low_half.x = static_cast<uint16_t>(val & 0xffffu);
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
   low_half = static_cast<float16>(static_cast<float>(low_half) + x);
-  return (val & 0xffff0000u) | low_half.x;
+  return (val & 0xFFFF0000u) | low_half.x;
 }

-inline __device__ uint32_t add_to_high_half(uint32_t val, float x) {
+inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
   float16 high_half;
   // the float16 in higher 16bits
   high_half.x = static_cast<uint16_t>(val >> 16);
   high_half = static_cast<float16>(static_cast<float>(high_half) + x);
-  return (val & 0xffffu) | (static_cast<uint32_t>(high_half.x) << 16);
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
 }

 CUDA_ATOMIC_WRAPPER(Add, float16) {
   // concrete packed float16 value may exsits in lower or higher 16bits
   // of the 32bits address.
-  uint32_t *address_as_ui =
-      reinterpret_cast<uint32_t *>(reinterpret_cast<char *>(address) -
-                                   (reinterpret_cast<size_t>(address) & 2));
+  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+      reinterpret_cast<char *>(address) -
+      (reinterpret_cast<uintptr_t>(address) & 0x02));
   float val_f = static_cast<float>(val);
   uint32_t old = *address_as_ui;
   uint32_t sum;
   uint32_t newval;
   uint32_t assumed;
-  if (((size_t)address & 2) == 0) {
+  if (((uintptr_t)address & 0x02) == 0) {
     // the float16 value stay at lower 16 bits of the address.
     do {
       assumed = old;
       old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f));
     } while (old != assumed);
     float16 ret;
-    ret.x = old & 0xffffu;
+    ret.x = old & 0xFFFFu;
     return ret;
   } else {
     // the float16 value stay at higher 16 bits of the address.
...
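The hunk above is cut off before the high-half branch of the wrapper. As an editorial aside, here is a minimal, standalone sketch of the same compare-and-swap technique, written against CUDA's own __half type so it compiles without Paddle's headers. It is not Paddle's code: the function name AtomicAddHalfSketch and the use of the cuda_fp16.h intrinsics are assumptions of this sketch, and its high-half path simply mirrors the low-half path shown in the diff (the role played by add_to_high_half).

#include <cuda_fp16.h>
#include <cstdint>

// Emulate an atomic fp16 add with a 32-bit CAS on the aligned word that
// contains the half. "address" is assumed to be at least 2-byte aligned,
// as in the wrapper above.
__device__ __half AtomicAddHalfSketch(__half* address, __half val) {
  std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(address);
  // Step down to the enclosing 4-byte word and remember which half we own.
  unsigned int* word = reinterpret_cast<unsigned int*>(addr - (addr & 0x02));
  const bool low_half = (addr & 0x02) == 0;

  float val_f = __half2float(val);
  unsigned int old = *word, assumed, newval;
  do {
    assumed = old;
    // Extract the 16 bits holding our value, add in float, pack them back.
    unsigned short bits =
        low_half ? static_cast<unsigned short>(assumed & 0xFFFFu)
                 : static_cast<unsigned short>(assumed >> 16);
    unsigned short new_bits = __half_as_ushort(
        __float2half(__half2float(__ushort_as_half(bits)) + val_f));
    newval = low_half
                 ? ((assumed & 0xFFFF0000u) | new_bits)
                 : ((assumed & 0xFFFFu) |
                    (static_cast<unsigned int>(new_bits) << 16));
    old = atomicCAS(word, assumed, newval);
  } while (old != assumed);

  // Like atomicAdd, return the value previously stored at *address.
  unsigned short old_bits =
      low_half ? static_cast<unsigned short>(old & 0xFFFFu)
               : static_cast<unsigned short>(old >> 16);
  return __ushort_as_half(old_bits);
}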