diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 5e646a5b93e05972d87c26f920e1626ad358ab37..0827d6a5ae7644579ffc2ab502893ec1e6ab1ee2 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -190,6 +190,7 @@ cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor)
 
 IF(WITH_GPU)
     nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
+    nv_test(bfloat16_gpu_test SRCS bfloat16_test.cu DEPS lod_tensor)
     nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
     nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
 ENDIF()
diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h
index 6cb4901f1dde326f5da5f41dfdfeb29d43595884..a362e2903f24560fa7ba3fbae4833929ccb8b9e3 100644
--- a/paddle/fluid/platform/bfloat16.h
+++ b/paddle/fluid/platform/bfloat16.h
@@ -21,6 +21,15 @@
 #include <iostream>
 #include <limits>
 
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#endif
+
+#if defined(__CUDACC__) && CUDA_VERSION >= 11000
+#define PADDLE_CUDA_BF16
+#include <cuda_bf16.h>
+#endif
+
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
@@ -44,6 +53,7 @@ struct PADDLE_ALIGN(2) bfloat16 {
  public:
   uint16_t x;
 
+  // Constructors
   bfloat16() = default;
   bfloat16(const bfloat16& o) = default;
   bfloat16& operator=(const bfloat16& o) = default;
@@ -60,15 +70,34 @@ struct PADDLE_ALIGN(2) bfloat16 {
     tempRes = reinterpret_cast<uint32_t*>(&val);
     res = *tempRes;
     x = res >> 16;
+#else
+#if defined(PADDLE_CUDA_BF16)
+    __nv_bfloat16 tmp = __float2bfloat16(val);
+    x = *reinterpret_cast<uint16_t*>(&tmp);
 #else
     std::memcpy(&x, reinterpret_cast<char*>(&val) + 2, 2);
 #endif
+#endif
   }
+
+#if defined(PADDLE_CUDA_BF16)
+  HOSTDEVICE inline explicit bfloat16(const __nv_bfloat16& val) {
+    x = *reinterpret_cast<const uint16_t*>(&val);
+  }
+#endif
 
   template <class T>
   HOSTDEVICE inline explicit bfloat16(const T& val)
       : x(bfloat16(static_cast<float>(val)).x) {}
 
+// Assignment operators
+#if defined(PADDLE_CUDA_BF16)
+  HOSTDEVICE inline bfloat16& operator=(const __nv_bfloat16& val) {
+    x = *reinterpret_cast<const uint16_t*>(&val);
+    return *this;
+  }
+#endif
+
   HOSTDEVICE inline bfloat16& operator=(bool b) {
     x = b ? 0x3f80 : 0;
     return *this;
@@ -124,13 +153,24 @@ struct PADDLE_ALIGN(2) bfloat16 {
     return *this;
   }
 
+  // Conversion opertors
   HOSTDEVICE inline explicit operator float() const {
+#ifdef PADDLE_CUDA_BF16
+    return __bfloat162float(*reinterpret_cast<const __nv_bfloat16*>(&x));
+#else
     float val = 0.f;
     uint16_t temp = x;
     memcpy(reinterpret_cast<char*>(&val) + 2, reinterpret_cast<char*>(&temp),
            2);
     return val;
+#endif
   }
 
+#ifdef PADDLE_CUDA_BF16
+  HOSTDEVICE inline explicit operator __nv_bfloat16() const {
+    return *reinterpret_cast<const __nv_bfloat16*>(&x);
+  }
+#endif
+
   HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
 
@@ -223,6 +263,7 @@ HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) {
   return res;
 }
 
+// Comparison operators
 HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) {
   return static_cast<float>(a) == static_cast<float>(b);
 }
diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..dbbb72920a53b06fa9dfdd75e12df68317025b70
--- /dev/null
+++ b/paddle/fluid/platform/bfloat16_test.cu
@@ -0,0 +1,124 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/bfloat16.h"
+
+#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <iostream>
+#include "paddle/fluid/framework/lod_tensor.h"
+
+#if defined(PADDLE_CUDA_BF16)
+namespace paddle {
+namespace platform {
+
+TEST(bfloat16, convert_float32_to_bfloat16_on_gpu) {
+  // Convert float32 to bfloat16
+  EXPECT_EQ((bfloat16(1.0f)).x, 0x3f80);
+  EXPECT_EQ((bfloat16(0.5f)).x, 0x3f00);
+  EXPECT_EQ((bfloat16(0.33333f)).x, 0x3eab);
+  EXPECT_EQ((bfloat16(0.0f)).x, 0x0000);
+  EXPECT_EQ((bfloat16(-0.0f)).x, 0x8000);
+  EXPECT_EQ((bfloat16(65536.0f)).x, 0x4780);
+}
+
+TEST(bfloat16, assignment_operator_on_gpu) {
+  // Assignment operator
+  bfloat16 v_assign;
+  v_assign = nv_bfloat16(bfloat16(1.0f));
+  EXPECT_EQ(v_assign.x, 0x3f80);
+  v_assign = 0.33333;
+  EXPECT_EQ(v_assign.x, 0x3eab);
+}
+
+TEST(bfloat16, convert_bfloat16_to_float32_on_gpu) {
+  // Conversion operator
+  EXPECT_EQ(static_cast<float>(bfloat16(0.5f)), 0.5f);
+  EXPECT_NEAR(static_cast<double>(bfloat16(0.33333)), 0.33333, 0.01);
+  EXPECT_EQ(static_cast<int>(bfloat16(-1)), -1);
+  EXPECT_EQ(static_cast<bool>(bfloat16(true)), true);
+}
+
+TEST(bfloat16, lod_tensor_on_gpu) {
+  framework::LoDTensor src_tensor;
+  framework::LoDTensor gpu_tensor;
+  framework::LoDTensor dst_tensor;
+
+  bfloat16 *src_ptr = src_tensor.mutable_data<bfloat16>(
+      framework::make_ddim({2, 2}), CPUPlace());
+
+  bfloat16 arr[4] = {bfloat16(1.0f), bfloat16(0.5f), bfloat16(0.33333f),
+                     bfloat16(0.0f)};
+  memcpy(src_ptr, arr, 4 * sizeof(bfloat16));
+
+  // CPU LoDTensor to GPU LoDTensor
+  CUDAPlace gpu_place(0);
+  CUDADeviceContext gpu_ctx(gpu_place);
+  framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
+
+  // GPU LoDTensor to CPU LoDTensor
+  framework::TensorCopy(gpu_tensor, CPUPlace(), gpu_ctx, &dst_tensor);
+
+  // Sync before comparing LoDTensors
+  gpu_ctx.Wait();
+  const bfloat16 *dst_ptr = dst_tensor.data<bfloat16>();
+  ASSERT_NE(src_ptr, dst_ptr);
+  for (size_t i = 0; i < 4; ++i) {
+    EXPECT_EQ(src_ptr[i].x, dst_ptr[i].x);
+  }
+}
+
+TEST(bfloat16, isinf) {
+  bfloat16 a;
+  a.x = 0x7f80;
+  bfloat16 b = bfloat16(INFINITY);
+  bfloat16 c = static_cast<bfloat16>(INFINITY);
+  EXPECT_EQ(std::isinf(a), true);
+  EXPECT_EQ(std::isinf(b), true);
+  EXPECT_EQ(std::isinf(c), true);
+}
+
+TEST(bfloat16, isnan) {
+  bfloat16 a;
+  a.x = 0x7fff;
+  bfloat16 b = bfloat16(NAN);
+  bfloat16 c = static_cast<bfloat16>(NAN);
+  EXPECT_EQ(std::isnan(a), true);
+  EXPECT_EQ(std::isnan(b), true);
+  EXPECT_EQ(std::isnan(c), true);
+}
+
+TEST(bfloat16, cast) {
+  bfloat16 a;
+  a.x = 0x0070;
+  auto b = a;
+  {
+    // change semantic, keep the same value
+    bfloat16 c = reinterpret_cast<bfloat16&>(reinterpret_cast<unsigned&>(b));
+    EXPECT_EQ(b, c);
+  }
+
+  {
+    // use uint32 low 16 bit store float16
+    uint32_t c = reinterpret_cast<uint32_t&>(b);
+    bfloat16 d;
+    d.x = c;
+    EXPECT_EQ(b, d);
+  }
+}
+
+}  // namespace platform
+}  // namespace paddle
+#endif