未验证 提交 bf0ec9b8 编写于 作者: A AshburnLee 提交者: GitHub

Add Bfloat16 support on Ampere GPU with CUDA 11 (#32132)

上级 b47dd158
...@@ -190,6 +190,7 @@ cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) ...@@ -190,6 +190,7 @@ cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor)
IF(WITH_GPU) IF(WITH_GPU)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
nv_test(bfloat16_gpu_test SRCS bfloat16_test.cu DEPS lod_tensor)
nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
ENDIF() ENDIF()
......
...@@ -21,6 +21,15 @@ ...@@ -21,6 +21,15 @@
#include <iostream> #include <iostream>
#include <limits> #include <limits>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif
#if defined(__CUDACC__) && CUDA_VERSION >= 11000
#define PADDLE_CUDA_BF16
#include <cuda_bf16.h>
#endif
#if !defined(_WIN32) #if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x))) #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else #else
...@@ -44,6 +53,7 @@ struct PADDLE_ALIGN(2) bfloat16 { ...@@ -44,6 +53,7 @@ struct PADDLE_ALIGN(2) bfloat16 {
public: public:
uint16_t x; uint16_t x;
// Constructors
bfloat16() = default; bfloat16() = default;
bfloat16(const bfloat16& o) = default; bfloat16(const bfloat16& o) = default;
bfloat16& operator=(const bfloat16& o) = default; bfloat16& operator=(const bfloat16& o) = default;
...@@ -60,15 +70,34 @@ struct PADDLE_ALIGN(2) bfloat16 { ...@@ -60,15 +70,34 @@ struct PADDLE_ALIGN(2) bfloat16 {
tempRes = reinterpret_cast<uint32_t*>(&val); tempRes = reinterpret_cast<uint32_t*>(&val);
res = *tempRes; res = *tempRes;
x = res >> 16; x = res >> 16;
#else
#if defined(PADDLE_CUDA_BF16)
__nv_bfloat16 tmp = __float2bfloat16(val);
x = *reinterpret_cast<uint16_t*>(&tmp);
#else #else
std::memcpy(&x, reinterpret_cast<char*>(&val) + 2, 2); std::memcpy(&x, reinterpret_cast<char*>(&val) + 2, 2);
#endif #endif
#endif
}
#if defined(PADDLE_CUDA_BF16)
HOSTDEVICE inline explicit bfloat16(const __nv_bfloat16& val) {
x = *reinterpret_cast<const unsigned short*>(&val);
} }
#endif
template <class T> template <class T>
HOSTDEVICE inline explicit bfloat16(const T& val) HOSTDEVICE inline explicit bfloat16(const T& val)
: x(bfloat16(static_cast<float>(val)).x) {} : x(bfloat16(static_cast<float>(val)).x) {}
// Assignment operators
#if defined(PADDLE_CUDA_BF16)
HOSTDEVICE inline bfloat16& operator=(const __nv_bfloat16& val) {
x = *reinterpret_cast<const unsigned short*>(&val);
return *this;
}
#endif
HOSTDEVICE inline bfloat16& operator=(bool b) { HOSTDEVICE inline bfloat16& operator=(bool b) {
x = b ? 0x3f80 : 0; x = b ? 0x3f80 : 0;
return *this; return *this;
...@@ -124,13 +153,24 @@ struct PADDLE_ALIGN(2) bfloat16 { ...@@ -124,13 +153,24 @@ struct PADDLE_ALIGN(2) bfloat16 {
return *this; return *this;
} }
// Conversion opertors
HOSTDEVICE inline explicit operator float() const { HOSTDEVICE inline explicit operator float() const {
#ifdef PADDLE_CUDA_BF16
return __bfloat162float(*reinterpret_cast<const __nv_bfloat16*>(&x));
#else
float val = 0.f; float val = 0.f;
uint16_t temp = x; uint16_t temp = x;
memcpy(reinterpret_cast<char*>(&val) + 2, reinterpret_cast<char*>(&temp), memcpy(reinterpret_cast<char*>(&val) + 2, reinterpret_cast<char*>(&temp),
2); 2);
return val; return val;
#endif
}
#ifdef PADDLE_CUDA_BF16
HOSTDEVICE inline explicit operator __nv_bfloat16() const {
return *reinterpret_cast<const __nv_bfloat16*>(&x);
} }
#endif
HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
...@@ -223,6 +263,7 @@ HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) { ...@@ -223,6 +263,7 @@ HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) {
return res; return res;
} }
// Comparison operators
HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) { HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) {
return static_cast<float>(a) == static_cast<float>(b); return static_cast<float>(a) == static_cast<float>(b);
} }
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/bfloat16.h"
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <iostream>
#include "paddle/fluid/framework/lod_tensor.h"
#if defined(PADDLE_CUDA_BF16)
namespace paddle {
namespace platform {
TEST(bfloat16, convert_float32_to_bfloat16_on_gpu) {
  // Verify that the float32 -> bfloat16 constructor produces the expected
  // 16-bit raw patterns (stored in the public `x` member).
  EXPECT_EQ((bfloat16(0.0f)).x, 0x0000);
  EXPECT_EQ((bfloat16(-0.0f)).x, 0x8000);
  EXPECT_EQ((bfloat16(0.33333f)).x, 0x3eab);
  EXPECT_EQ((bfloat16(0.5f)).x, 0x3f00);
  EXPECT_EQ((bfloat16(1.0f)).x, 0x3f80);
  EXPECT_EQ((bfloat16(65536.0f)).x, 0x4780);
}
TEST(bfloat16, assignment_operator_on_gpu) {
  // Exercise assignment both from the CUDA-native bfloat16 type and from a
  // double literal; check the resulting raw bits.
  bfloat16 result;
  result = nv_bfloat16(bfloat16(1.0f));  // via __nv_bfloat16 (typedef nv_bfloat16)
  EXPECT_EQ(result.x, 0x3f80);
  result = 0.33333;
  EXPECT_EQ(result.x, 0x3eab);
}
TEST(bfloat16, convert_bfloat16_to_float32_on_gpu) {
  // Exercise the explicit conversion operators back to host scalar types.
  EXPECT_EQ(static_cast<bool>(bfloat16(true)), true);
  EXPECT_EQ(static_cast<int>(bfloat16(-1)), -1);
  EXPECT_EQ(static_cast<float>(bfloat16(0.5f)), 0.5f);
  // bfloat16 has only ~8 bits of mantissa, so compare with a tolerance.
  EXPECT_NEAR(static_cast<double>(bfloat16(0.33333)), 0.33333, 0.01);
}
TEST(bfloat16, lod_tensor_on_gpu) {
  // Round-trip a small bfloat16 tensor CPU -> GPU -> CPU and verify the raw
  // 16-bit payloads survive both copies unchanged.
  constexpr int kNumel = 4;  // 2 x 2 tensor
  framework::LoDTensor src_tensor;
  framework::LoDTensor gpu_tensor;
  framework::LoDTensor dst_tensor;
  bfloat16 *src_ptr = src_tensor.mutable_data<bfloat16>(
      framework::make_ddim({2, 2}), CPUPlace());
  bfloat16 arr[kNumel] = {bfloat16(1.0f), bfloat16(0.5f), bfloat16(0.33333f),
                          bfloat16(0.0f)};
  // sizeof(arr) keeps the copy size in sync with the array definition.
  memcpy(src_ptr, arr, sizeof(arr));

  // CPU LoDTensor to GPU LoDTensor
  CUDAPlace gpu_place(0);
  CUDADeviceContext gpu_ctx(gpu_place);
  framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);

  // GPU LoDTensor to CPU LoDTensor
  framework::TensorCopy(gpu_tensor, CPUPlace(), gpu_ctx, &dst_tensor);

  // TensorCopy is asynchronous on the context's stream; wait for both copies
  // to finish before reading the destination buffer on the host.
  gpu_ctx.Wait();

  const bfloat16 *dst_ptr = dst_tensor.data<bfloat16>();
  ASSERT_NE(src_ptr, dst_ptr);  // must be distinct buffers, not aliased
  for (int i = 0; i < kNumel; ++i) {
    EXPECT_EQ(src_ptr[i].x, dst_ptr[i].x);
  }
}
TEST(bfloat16, isinf) {
  // 0x7f80 is the bfloat16 bit pattern for +infinity; the same value should
  // also be reachable via construction and static_cast from INFINITY.
  bfloat16 from_bits;
  from_bits.x = 0x7f80;
  bfloat16 constructed = bfloat16(INFINITY);
  bfloat16 converted = static_cast<bfloat16>(INFINITY);
  EXPECT_EQ(std::isinf(from_bits), true);
  EXPECT_EQ(std::isinf(constructed), true);
  EXPECT_EQ(std::isinf(converted), true);
}
TEST(bfloat16, isnan) {
  // 0x7fff has a nonzero mantissa with an all-ones exponent, i.e. a NaN;
  // construction and static_cast from NAN should also yield NaN values.
  bfloat16 from_bits;
  from_bits.x = 0x7fff;
  bfloat16 constructed = bfloat16(NAN);
  bfloat16 converted = static_cast<bfloat16>(NAN);
  EXPECT_EQ(std::isnan(from_bits), true);
  EXPECT_EQ(std::isnan(constructed), true);
  EXPECT_EQ(std::isnan(converted), true);
}
TEST(bfloat16, cast) {
  bfloat16 raw;
  raw.x = 0x0070;
  auto value = raw;
  {
    // Reinterpret the storage as unsigned and back again; only the type
    // changes, the value must compare equal.
    bfloat16 same = reinterpret_cast<bfloat16 &>(reinterpret_cast<unsigned &>(value));
    EXPECT_EQ(value, same);
  }
  {
    // Keep the bfloat16 payload in the low 16 bits of a uint32_t and
    // reconstruct a bfloat16 from it.
    uint32_t bits = reinterpret_cast<uint32_t &>(value);
    bfloat16 rebuilt;
    rebuilt.x = bits;
    EXPECT_EQ(value, rebuilt);
  }
}
} // namespace platform
} // namespace paddle
#endif
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册