From 95e1434bb2fc8fd43a519cfa60ae36845a0cf2ef Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Thu, 3 Sep 2020 06:42:46 +0200 Subject: [PATCH] Add bfloat16 data type (#25402) --- .../fluid/framework/data_layout_transform.cc | 2 + .../fluid/framework/data_layout_transform.h | 6 +- .../framework/data_layout_transform_test.cc | 14 + paddle/fluid/framework/data_type.cc | 1 + paddle/fluid/framework/data_type.h | 21 +- paddle/fluid/framework/data_type_test.cc | 22 + paddle/fluid/framework/data_type_transform.cc | 4 + .../framework/data_type_transform_test.cc | 121 +++++ .../framework/details/nan_inf_utils_detail.cc | 2 + paddle/fluid/framework/dlpack_tensor.cc | 1 + .../analysis/passes/memory_optimize_pass.cc | 28 +- paddle/fluid/inference/lite/test_engine.cc | 5 +- .../fluid/operators/math/concat_and_split.h | 21 +- paddle/fluid/operators/math/math_function.cc | 23 +- paddle/fluid/platform/CMakeLists.txt | 2 + paddle/fluid/platform/bfloat16.h | 439 ++++++++++++++++++ paddle/fluid/platform/bfloat16_test.cc | 162 +++++++ paddle/fluid/platform/mkldnn_helper.h | 6 + paddle/fluid/pybind/tensor_py.h | 15 +- 19 files changed, 832 insertions(+), 63 deletions(-) create mode 100644 paddle/fluid/platform/bfloat16.h create mode 100644 paddle/fluid/platform/bfloat16_test.cc diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 3cea7a66d01..f757e244e38 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -116,6 +116,8 @@ void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) { return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::s32: return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::bf16: + return platform::to_void_cast(tensor.data()); default: PADDLE_THROW( platform::errors::InvalidArgument("Wrong mkldnn type provided.")); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 6eb84ef9d7c..b92c47c2eb0 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -61,7 +61,8 @@ inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { {DataTypeTrait::DataType(), MKLDNNDataType::f32}, {DataTypeTrait::DataType(), MKLDNNDataType::s8}, {DataTypeTrait::DataType(), MKLDNNDataType::u8}, - {DataTypeTrait::DataType(), MKLDNNDataType::s32}}; + {DataTypeTrait::DataType(), MKLDNNDataType::s32}, + {DataTypeTrait::DataType(), MKLDNNDataType::bf16}}; auto iter = dict.find(static_cast(type)); if (iter != dict.end()) return iter->second; return MKLDNNDataType::undef; @@ -74,6 +75,9 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, const Tensor& in, Tensor* out); + +void* GetDataFromTensor(const Tensor& tensor, MKLDNNDataType type); + #endif std::vector GetAxis(const DataLayout& from, const DataLayout& to); diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc index a0d08826b85..8dfad23db65 100644 --- a/paddle/fluid/framework/data_layout_transform_test.cc +++ b/paddle/fluid/framework/data_layout_transform_test.cc @@ -43,3 +43,17 @@ TEST(DataTransform, DataLayoutFunction) { EXPECT_TRUE(in.layout() == paddle::framework::DataLayout::kNHWC); EXPECT_TRUE(in.dims() == paddle::framework::make_ddim({2, 3, 1, 2})); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(DataTransform, GetDataFromTensorDNNL) { + auto place = paddle::platform::CPUPlace(); + paddle::framework::Tensor in = paddle::framework::Tensor(); + in.mutable_data( + paddle::framework::make_ddim({2, 3, 1, 2}), place); + + void* in_data = + paddle::framework::GetDataFromTensor(in, dnnl::memory::data_type::bf16); + EXPECT_EQ(in_data, paddle::platform::to_void_cast( + in.data())); +} +#endif diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index f479d92483c..8188d5cde1b 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -18,6 +18,7 @@ #include using float16 = paddle::platform::float16; +using bfloat16 = paddle::platform::bfloat16; namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 2c4a7b4d027..720e422e114 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -17,6 +17,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -36,15 +38,16 @@ struct DataTypeTrait { #define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); -#define _ForEachDataType_(callback) \ - _ForEachDataTypeHelper_(callback, float, FP32); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ - _ForEachDataTypeHelper_(callback, double, FP64); \ - _ForEachDataTypeHelper_(callback, int, INT32); \ - _ForEachDataTypeHelper_(callback, int64_t, INT64); \ - _ForEachDataTypeHelper_(callback, bool, BOOL); \ - _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ - _ForEachDataTypeHelper_(callback, int16_t, INT16); \ +#define _ForEachDataType_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ _ForEachDataTypeHelper_(callback, int8_t, INT8) #define _ForEachDataTypeSmall_(callback) \ diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 2a380201f29..331596da33a 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -38,3 +38,25 @@ TEST(DataType, float16) { std::string type = "::paddle::platform::float16"; EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); } + +TEST(DataType, bfloat16) { + using paddle::framework::Tensor; + using paddle::platform::CPUPlace; + using paddle::platform::bfloat16; + namespace f = paddle::framework; + f::proto::VarType::Type dtype = f::proto::VarType::BF16; + + Tensor tensor; + CPUPlace cpu; + tensor.mutable_data(cpu, dtype); + + // test bf16 tensor + EXPECT_EQ(tensor.type(), f::ToDataType(typeid(bfloat16))); + + // test bf16 size + EXPECT_EQ(f::SizeOfType(dtype), 2u); + + // test debug info + std::string type = "::paddle::platform::bfloat16"; + EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); +} diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 44542f05d9d..3d56152c237 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -77,6 +77,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var, framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); break; + case proto::VarType::BF16: + framework::VisitDataType(dst_type, + CastDataType(in, out, ctx)); + break; case proto::VarType::FP32: framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); break; diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc index bbebea9f13f..ea7a665bcbe 100644 --- a/paddle/fluid/framework/data_type_transform_test.cc +++ b/paddle/fluid/framework/data_type_transform_test.cc @@ -24,6 +24,11 @@ TEST(DataTypeTransform, CPUTransform) { paddle::framework::DataLayout::kAnyLayout, paddle::framework::LibraryType::kPlain); + auto kernel_bf16 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::BF16, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + auto kernel_fp32 = paddle::framework::OpKernelType( paddle::framework::proto::VarType::FP32, place, paddle::framework::DataLayout::kAnyLayout, @@ -189,4 +194,120 @@ TEST(DataTypeTransform, CPUTransform) { static_cast(in_data_bool[i]).x); } } + + // data type transform from/to bfloat16 + { + paddle::framework::Tensor in; + paddle::framework::Tensor out; + + paddle::platform::bfloat16* ptr = + in.mutable_data( + paddle::framework::make_ddim({2, 3}), place); + int data_number = 2 * 3; + + for (int i = 0; i < data_number; ++i) { + ptr[i] = i; + } + + // transform from bfloat16 to other data types + paddle::framework::TransDataType(kernel_bf16, kernel_fp32, in, &out); + float* out_data_float = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_float[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_bf16, kernel_fp64, in, &out); + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_bf16, kernel_int32, in, &out); + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_bf16, kernel_int64, in, &out); + int64_t* out_data_int64 = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int64[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_bf16, kernel_bool, in, &out); + bool* out_data_bool = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_bool[i], static_cast(ptr[i])); + } + + // transform float to bfloat16 + float* in_data_float = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_float[i] = i; + } + + paddle::framework::TransDataType(kernel_fp32, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_float[i]).x); + } + + // transform double to bfloat16 + double* in_data_double = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_double[i] = i; + } + + paddle::framework::TransDataType(kernel_fp64, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_double[i]).x); + } + + // transform int to bfloat16 + int* in_data_int = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int[i] = i; + } + + paddle::framework::TransDataType(kernel_int32, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_int[i]).x); + } + + // transform int64 to bfloat16 + int64_t* in_data_int64 = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int64[i] = i; + } + + paddle::framework::TransDataType(kernel_int64, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_int64[i]).x); + } + + // transform bool to bfloat16 + bool* in_data_bool = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_bool[i] = i; + } + + paddle::framework::TransDataType(kernel_bool, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_bool[i]).x); + } + } } diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 956b099e883..67a4c0caf8e 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -167,6 +167,8 @@ static void PrintNanInf(const T* value, const size_t numel, int print_num, // more detail see: 180 page of // https://www.openmp.org/wp-content/uploads/OpenMP4.0.0.pdf #pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in) +#pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += \ + omp_in) #endif template diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 180b33d0cb7..915589b3242 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -23,6 +23,7 @@ template static ::DLDataType GetDLDataTypeCode() { ::DLDataType dtype; if (std::is_same::value || + std::is_same::value || std::is_floating_point::value) { dtype.code = kDLFloat; } else if (std::is_unsigned::value) { diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 6fbf880356c..9eb84785157 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -90,32 +90,6 @@ void MemoryOptimizePass::CollectLifeCycle( } } -// TODO(Superjomn) Make this a general help method. -int DataTypeToSpace(framework::proto::VarType_Type type) { - switch (type) { - case framework::proto::VarType_Type_BOOL: - return sizeof(bool); - case framework::proto::VarType_Type_FP32: - return sizeof(float); - case framework::proto::VarType_Type_INT32: - return sizeof(int32_t); - case framework::proto::VarType_Type_INT64: - return sizeof(int64_t); - case framework::proto::VarType_Type_INT16: - return sizeof(int16_t); - case framework::proto::VarType_Type_FP16: - return sizeof(int16_t); - case framework::proto::VarType_Type_FP64: - return sizeof(double); - case framework::proto::VarType_Type_UINT8: - return sizeof(unsigned char); - case framework::proto::VarType_Type_INT8: - return sizeof(int8_t); - default: - PADDLE_THROW("Unknown data type"); - } -} - void MemoryOptimizePass::CollectVarMemorySize( space_table_t* space_table) const { const int fake_batch_size = 1; @@ -163,7 +137,7 @@ void MemoryOptimizePass::CollectVarMemorySize( int size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); (*space_table)[node->Var()->Name()] = - size * DataTypeToSpace(node->Var()->GetDataType()); + size * paddle::framework::SizeOfType(node->Var()->GetDataType()); } } } diff --git a/paddle/fluid/inference/lite/test_engine.cc b/paddle/fluid/inference/lite/test_engine.cc index 325c7ab2539..d29bcb76be7 100644 --- a/paddle/fluid/inference/lite/test_engine.cc +++ b/paddle/fluid/inference/lite/test_engine.cc @@ -14,15 +14,16 @@ #include -#include "paddle/fluid/inference/lite/engine.h" #include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/operators/lite/ut_helper.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/lite/engine.h" +#include "paddle/fluid/operators/lite/ut_helper.h" + namespace paddle { namespace inference { namespace lite { diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h index 3a5eddcbf4a..18d9a6310dd 100644 --- a/paddle/fluid/operators/math/concat_and_split.h +++ b/paddle/fluid/operators/math/concat_and_split.h @@ -65,13 +65,14 @@ class SplitFunctor { } // namespace operators } // namespace paddle -#define FOR_ALL_TYPES(macro) \ - macro(int); \ - macro(float); \ - macro(double); \ - macro(bool); \ - macro(int64_t); \ - macro(int16_t); \ - macro(uint8_t); \ - macro(int8_t); \ - macro(::paddle::platform::float16) +#define FOR_ALL_TYPES(macro) \ + macro(int); \ + macro(float); \ + macro(double); \ + macro(bool); \ + macro(int64_t); \ + macro(int16_t); \ + macro(uint8_t); \ + macro(int8_t); \ + macro(::paddle::platform::float16); \ + macro(::paddle::platform::bfloat16) diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 6748d0ab43f..824e66b1eb4 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -34,6 +34,7 @@ namespace math { using float16 = paddle::platform::float16; template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -41,16 +42,18 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ template struct Transpose; DEFINE_CPU_TRANS(1); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 652b4dd47da..ef827fd7490 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -136,6 +136,8 @@ cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) +cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) + nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h new file mode 100644 index 00000000000..742329abb2d --- /dev/null +++ b/paddle/fluid/platform/bfloat16.h @@ -0,0 +1,439 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#include +#include "paddle/fluid/platform/hostdevice.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace platform { + +struct PADDLE_ALIGN(2) bfloat16 { + public: + uint16_t x; + + bfloat16() = default; + bfloat16(const bfloat16& o) = default; + bfloat16& operator=(const bfloat16& o) = default; + bfloat16(bfloat16&& o) = default; + bfloat16& operator=(bfloat16&& o) = default; + ~bfloat16() = default; + + HOSTDEVICE inline explicit bfloat16(float val) { + std::memcpy(&x, reinterpret_cast(&val) + 2, 2); + } + + template + HOSTDEVICE inline explicit bfloat16(const T& val) + : x(bfloat16(static_cast(val)).x) {} + + HOSTDEVICE inline bfloat16& operator=(bool b) { + x = b ? 0x3f80 : 0; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int8_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint8_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int16_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint16_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int32_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint32_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int64_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint64_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(float val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(double val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline explicit operator float() const { + float val = 0.f; + uint16_t temp = x; + memcpy(reinterpret_cast(&val) + 2, reinterpret_cast(&temp), + 2); + return val; + } + + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(static_cast(*this)); + } +}; + +HOSTDEVICE inline bfloat16 operator+(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) + static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator-(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) - static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator*(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) * static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator/(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) / static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator-(const bfloat16& a) { + bfloat16 res; + res.x = a.x ^ 0x8000; + return res; +} + +HOSTDEVICE inline bfloat16& operator+=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) + static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator-=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) - static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator*=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) * static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator/=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) / static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) { + bfloat16 res; + res.x = a; + return res; +} + +HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) { + return static_cast(a) == static_cast(b); +} + +HOSTDEVICE inline bool operator!=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) != static_cast(b); +} + +HOSTDEVICE inline bool operator<(const bfloat16& a, const bfloat16& b) { + return static_cast(a) < static_cast(b); +} + +HOSTDEVICE inline bool operator<=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) <= static_cast(b); +} + +HOSTDEVICE inline bool operator>(const bfloat16& a, const bfloat16& b) { + return static_cast(a) > static_cast(b); +} + +HOSTDEVICE inline bool operator>=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) >= static_cast(b); +} + +HOSTDEVICE inline bool(isnan)(const bfloat16& a) { + return (a.x & 0x7FFF) > 0x7F80; +} + +HOSTDEVICE inline bool(isinf)(const bfloat16& a) { + return (a.x & 0x7F80) == 0x7F80; +} + +HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { + return !((isnan)(a)) && !((isinf)(a)); +} + +inline std::ostream& operator<<(std::ostream& os, const bfloat16& a) { + os << a.x; + return os; +} + +} // namespace platform +} // namespace paddle + +namespace std { + +template <> +struct is_pod { + static const bool value = + is_trivial::value && + is_standard_layout::value; +}; + +template <> +struct is_floating_point + : std::integral_constant< + bool, std::is_same::type>::value> {}; +template <> +struct is_signed { + static const bool value = true; +}; + +template <> +struct is_unsigned { + static const bool value = false; +}; + +inline bool isnan(const paddle::platform::bfloat16& a) { + return paddle::platform::isnan(a); +} + +inline bool isinf(const paddle::platform::bfloat16& a) { + return paddle::platform::isinf(a); +} + +template <> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 8; + static const int digits10 = 2; + static const int max_digits10 = 9; + static const int radix = 2; + static const int min_exponent = -125; + static const int min_exponent10 = -37; + static const int max_exponent = 128; + static const int max_exponent10 = 38; + static const bool traps = true; + static const bool tinyness_before = false; + + static paddle::platform::bfloat16(min)() { + return paddle::platform::raw_uint16_to_bfloat16(0x007f); + } + static paddle::platform::bfloat16 lowest() { + return paddle::platform::raw_uint16_to_bfloat16(0xff7f); + } + static paddle::platform::bfloat16(max)() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f7f); + } + static paddle::platform::bfloat16 epsilon() { + return paddle::platform::raw_uint16_to_bfloat16(0x3400); + } + static paddle::platform::bfloat16 round_error() { + return paddle::platform::bfloat16(0.5); + } + static paddle::platform::bfloat16 infinity() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f80); + } + static paddle::platform::bfloat16 quiet_NaN() { + return paddle::platform::raw_uint16_to_bfloat16(0xffc1); + } + static paddle::platform::bfloat16 signaling_NaN() { + return paddle::platform::raw_uint16_to_bfloat16(0xff81); + } + static paddle::platform::bfloat16 denorm_min() { + return paddle::platform::raw_uint16_to_bfloat16(0x0001); + } +}; + +} // namespace std + +namespace Eigen { + +using bfloat16 = paddle::platform::bfloat16; + +template <> +struct NumTraits : GenericNumTraits { + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + HOSTDEVICE static inline bfloat16 epsilon() { + return paddle::platform::raw_uint16_to_bfloat16(0x3400); + } + HOSTDEVICE static inline bfloat16 dummy_precision() { + return bfloat16(1e-5f); + } + HOSTDEVICE static inline bfloat16 highest() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f7f); + } + HOSTDEVICE static inline bfloat16 lowest() { + return paddle::platform::raw_uint16_to_bfloat16(0xff7f); + } + HOSTDEVICE static inline bfloat16 infinity() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f80); + } + HOSTDEVICE static inline bfloat16 quiet_NaN() { + return paddle::platform::raw_uint16_to_bfloat16(0xffc1); + } +}; +namespace numext { + +template <> +HOSTDEVICE inline bool(isnan)(const bfloat16& a) { + return (paddle::platform::isnan)(a); +} + +template <> +HOSTDEVICE inline bool(isinf)(const bfloat16& a) { + return (paddle::platform::isinf)(a); +} + +template <> +HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { + return (paddle::platform::isfinite)(a); +} + +template <> +HOSTDEVICE inline bfloat16 exp(const bfloat16& a) { + return bfloat16(::expf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 erf(const bfloat16& a) { + return bfloat16(::erff(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 log(const bfloat16& a) { + return bfloat16(::logf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 tanh(const bfloat16& a) { + return bfloat16(::tanhf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 sqrt(const bfloat16& a) { + return bfloat16(::sqrtf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 ceil(const bfloat16& a) { + return bfloat16(::ceilf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 floor(const bfloat16& a) { + return bfloat16(::floorf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 round(const bfloat16& a) { + return bfloat16(::roundf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 pow(const bfloat16& a, const bfloat16& b) { + return bfloat16(::powf(static_cast(a), static_cast(b))); +} + +template <> +HOSTDEVICE inline bfloat16 abs(const bfloat16& a) { + return bfloat16(::fabs(static_cast(a))); +} + +} // namespace numext +} // namespace Eigen diff --git a/paddle/fluid/platform/bfloat16_test.cc b/paddle/fluid/platform/bfloat16_test.cc new file mode 100644 index 00000000000..bdb508ee336 --- /dev/null +++ b/paddle/fluid/platform/bfloat16_test.cc @@ -0,0 +1,162 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/bfloat16.h" + +#include + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/init.h" + +namespace paddle { +namespace platform { + +using bfloat16 = paddle::platform::bfloat16; + +TEST(bfloat16, conversion_cpu) { + // Conversion from float + EXPECT_EQ(bfloat16(1.0f).x, 0x3f80); + EXPECT_EQ(bfloat16(0.5f).x, 0x3f00); + EXPECT_EQ(bfloat16(0.33333f).x, 0x3eaa); + EXPECT_EQ(bfloat16(0.0f).x, 0x0000); + EXPECT_EQ(bfloat16(-0.0f).x, 0x8000); + EXPECT_EQ(bfloat16(65504.0f).x, 0x477f); + EXPECT_EQ(bfloat16(65536.0f).x, 0x4780); + + // Conversion from double + EXPECT_EQ(bfloat16(1.0).x, 0x3f80); + EXPECT_EQ(bfloat16(0.5).x, 0x3f00); + EXPECT_EQ(bfloat16(0.33333).x, 0x3eaa); + EXPECT_EQ(bfloat16(0.0).x, 0x0000); + EXPECT_EQ(bfloat16(-0.0).x, 0x8000); + EXPECT_EQ(bfloat16(65504.0).x, 0x477f); + EXPECT_EQ(bfloat16(65536.0).x, 0x4780); + + // Conversion from int + EXPECT_EQ(bfloat16(-1).x, 0xbf80); + EXPECT_EQ(bfloat16(0).x, 0x0000); + EXPECT_EQ(bfloat16(1).x, 0x3f80); + EXPECT_EQ(bfloat16(2).x, 0x4000); + EXPECT_EQ(bfloat16(3).x, 0x4040); + + // Conversion from bool + EXPECT_EQ(bfloat16(true).x, 0x3f80); + EXPECT_EQ(bfloat16(false).x, 0x0000); + + // Assignment operator + bfloat16 v_assign; + v_assign = bfloat16(0.f); + EXPECT_EQ(v_assign.x, 0x0000); + v_assign = 0.5f; + EXPECT_EQ(v_assign.x, 0x3f00); + v_assign = 0.33333; + EXPECT_EQ(v_assign.x, 0x3eaa); + v_assign = -1; + EXPECT_EQ(v_assign.x, 0xbf80); + + // Conversion operator + EXPECT_EQ(static_cast(bfloat16(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(bfloat16(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(bfloat16(-1)), -1); + EXPECT_EQ(static_cast(bfloat16(true)), true); +} + +TEST(bfloat16, arithmetic_cpu) { + EXPECT_NEAR(static_cast(bfloat16(1) + bfloat16(1)), 2, 0.001); + EXPECT_EQ(static_cast(bfloat16(5) + bfloat16(-5)), 0); + EXPECT_NEAR(static_cast(bfloat16(0.33333f) + bfloat16(0.66667f)), 1.0f, + 0.01); + EXPECT_EQ(static_cast(bfloat16(3) - bfloat16(5)), -2); + EXPECT_NEAR(static_cast(bfloat16(0.66667f) - bfloat16(0.33333f)), + 0.33334f, 0.01); + EXPECT_NEAR(static_cast(bfloat16(3.3f) * bfloat16(2.0f)), 6.6f, 0.01); + EXPECT_NEAR(static_cast(bfloat16(-2.1f) * bfloat16(-3.0f)), 6.3f, 0.1); + EXPECT_NEAR(static_cast(bfloat16(2.0f) / bfloat16(3.0f)), 0.66667f, + 0.01); + EXPECT_EQ(static_cast(bfloat16(1.0f) / bfloat16(2.0f)), 0.5f); + EXPECT_EQ(static_cast(-bfloat16(512.0f)), -512.0f); + EXPECT_EQ(static_cast(-bfloat16(-512.0f)), 512.0f); +} + +TEST(bfloat16, comparison_cpu) { + EXPECT_TRUE(bfloat16(1.0f) == bfloat16(1.0f)); + EXPECT_FALSE(bfloat16(-1.0f) == bfloat16(-0.5f)); + EXPECT_TRUE(bfloat16(1.0f) != bfloat16(0.5f)); + EXPECT_FALSE(bfloat16(-1.0f) != bfloat16(-1.0f)); + EXPECT_TRUE(bfloat16(1.0f) < bfloat16(2.0f)); + EXPECT_FALSE(bfloat16(-1.0f) < bfloat16(-1.0f)); + EXPECT_TRUE(bfloat16(1.0f) <= bfloat16(1.0f)); + EXPECT_TRUE(bfloat16(2.0f) > bfloat16(1.0f)); + EXPECT_FALSE(bfloat16(-2.0f) > bfloat16(-2.0f)); + EXPECT_TRUE(bfloat16(2.0f) >= bfloat16(2.0f)); +} + +TEST(bfloat16, lod_tensor_cpu) { + framework::LoDTensor lod_tensor; + + std::vector input_data = {bfloat16(1.0f), bfloat16(0.5f), + bfloat16(0.33333f), bfloat16(0.0f)}; + EXPECT_EQ(input_data[0].x, 0x3f80); + EXPECT_EQ(input_data[1].x, 0x3f00); + EXPECT_EQ(input_data[2].x, 0x3eaa); + EXPECT_EQ(input_data[3].x, 0x0000); + + lod_tensor.Resize({4, 1}); + lod_tensor.set_lod(framework::LoD({{0, 2, 4}})); + bfloat16* data_ptr = lod_tensor.mutable_data(CPUPlace()); + + EXPECT_NE(data_ptr, nullptr); + EXPECT_EQ(input_data.size(), static_cast(lod_tensor.numel())); + for (size_t i = 0; i < input_data.size(); ++i) { + data_ptr[i] = input_data[i]; + EXPECT_EQ(data_ptr[i].x, input_data[i].x); + } +} + +TEST(bfloat16, floating) { + // compile time assert. + PADDLE_ENFORCE_EQ( + std::is_floating_point::value, true, + platform::errors::Fatal("std::is_floating_point with bfloat16 data type " + "should be equal to true but it is not")); +} + +TEST(bfloat16, print) { + bfloat16 a = bfloat16(1.0f); + std::cout << a << std::endl; +} + +// CPU test +TEST(bfloat16, isinf) { + bfloat16 a; + a.x = 0x7f80; + bfloat16 b = bfloat16(INFINITY); + bfloat16 c = static_cast(INFINITY); + EXPECT_EQ(std::isinf(a), true); + EXPECT_EQ(std::isinf(b), true); + EXPECT_EQ(std::isinf(c), true); +} + +TEST(bfloat16, isnan) { + bfloat16 a; + a.x = 0x7fff; + bfloat16 b = bfloat16(NAN); + bfloat16 c = static_cast(NAN); + EXPECT_EQ(std::isnan(a), true); + EXPECT_EQ(std::isnan(b), true); + EXPECT_EQ(std::isnan(c), true); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 3782eb684f2..8fb66c6f34b 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -161,6 +161,12 @@ inline mkldnn::memory::data_type MKLDNNGetDataType() { return mkldnn::memory::data_type::u8; } +template <> +inline mkldnn::memory::data_type +MKLDNNGetDataType() { + return mkldnn::memory::data_type::bf16; +} + inline void Reorder(mkldnn::memory src, mkldnn::memory dst, const mkldnn::engine& engine) { auto reorder_prim = mkldnn::reorder(src, dst); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 4377a8c2cef..5ee15073267 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" #include "pybind11/numpy.h" @@ -104,6 +105,7 @@ struct ValidDTypeToPyArrayChecker { } DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::float16); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::bfloat16); DECLARE_VALID_DTYPE_TO_PY_ARRAY(float); DECLARE_VALID_DTYPE_TO_PY_ARRAY(double); DECLARE_VALID_DTYPE_TO_PY_ARRAY(bool); @@ -119,6 +121,9 @@ inline std::string TensorDTypeToPyDTypeStr( if (type == proto_type) { \ if (std::is_same::value) { \ return "e"; \ + } else if (std::is_same::value) { \ + /* NumPy character code of uint16 due to no support for bfloat16 */ \ + return "H"; \ } else { \ constexpr auto kIsValidDType = ValidDTypeToPyArrayChecker::kValue; \ PADDLE_ENFORCE_EQ( \ @@ -262,10 +267,10 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj, SetTensorFromPyArrayT(self, array, place, zero_copy); } else if (py::isinstance>(array)) { - // TODO(cql): temporary keeping uint16, which is used for casting float16 - // before. It should be depracated later. - SetTensorFromPyArrayT(self, array, place, - zero_copy); + // since there is still no support for bfloat16 in NumPy, + // uint16 is used for casting bfloat16 + SetTensorFromPyArrayT(self, array, place, + zero_copy); } else if (py::isinstance>(array)) { SetTensorFromPyArrayT(self, array, place, zero_copy); } else { @@ -479,6 +484,8 @@ inline framework::Tensor *_sliceTensor(const framework::Tensor &self, switch (src_type) { case framework::proto::VarType::FP16: return _sliceAndConcat(self, obj, dim); + case framework::proto::VarType::BF16: + return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::FP32: return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::FP64: -- GitLab