Unverified commit f3436af1 authored by Adam Osewski, committed by GitHub

[cherry-pick] Sum kernel for CPU supporting BF16 and SelectedRows (#32631) (#32755)

Parent 21448525
@@ -15,6 +15,7 @@
#ifdef PADDLE_WITH_MKLML
#include <mkl.h>
#endif
+#include <algorithm>
#include <cmath>
#include <limits>
@@ -28,6 +29,19 @@
namespace paddle {
namespace operators {
namespace math {
+namespace detail {
+
+template <typename T>
+static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
+                 const int incy) {
+  // Y = Y + alpha * X
+  while (n-- > 0) {
+    *y += alpha * *x;
+    y = y + incy;
+    x = x + incx;
+  }
+}
+}  // namespace detail
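The new detail::axpy is a plain strided loop used as a fallback where no vendor routine exists; the CBlas<platform::bfloat16> specialization below routes AXPY to it, presumably because MKL ships no bfloat16 axpy. A minimal standalone sketch (hypothetical data and names, not part of the patch) of how incx/incy stride through the arrays:

    // Hypothetical standalone illustration of Y = Y + alpha * X with strides.
    #include <cstdio>

    int main() {
      const float x[4] = {1.f, 2.f, 3.f, 4.f};
      float y[4] = {0.f, 0.f, 0.f, 0.f};
      int n = 2;                 // number of updates
      const float alpha = 0.5f;
      const float* xp = x;
      float* yp = y;
      while (n-- > 0) {          // same loop shape as detail::axpy
        *yp += alpha * *xp;
        yp += 2;                 // incy = 2: write every second y
        xp += 2;                 // incx = 2: read every second x
      }
      std::printf("%g %g\n", y[0], y[2]);  // prints 0.5 1.5
    }

With incx == incy == 1 this degenerates to y[i] += alpha * x[i] over n contiguous elements, which is how the sum path uses it.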
template <typename T>
struct CBlas;
@@ -43,6 +57,11 @@ struct CBlas<int8_t> {
template <>
struct CBlas<platform::bfloat16> {
+  template <typename... ARGS>
+  static void AXPY(ARGS... args) {
+    detail::axpy(args...);
+  }
+
  template <typename... ARGS>
  static void VCOPY(ARGS... args) {
    PADDLE_THROW(platform::errors::Unimplemented(
......
@@ -285,6 +285,8 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
+template struct SelectedRowsAddToTensor<platform::CPUDeviceContext,
+                                        platform::bfloat16>;
// This is a separated namespace for manipulate SelectedRows typed
// data. Like merge duplicated rows, adding two SelectedRows etc.
@@ -294,21 +296,17 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
// add or mul.
namespace scatter {
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_floating_point<T>::value &&
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
-                   size_t data_len, const T* in, T* out) {
-  blas->AXPY(data_len, 1., in, out);
+template <typename T>
+typename std::enable_if<std::is_floating_point<T>::value>::type
+elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
+                   const T* in, T* out) {
+  blas->AXPY(data_len, T(1.f), in, out);
}
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    !std::is_floating_point<T>::value &&
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
-                   size_t data_len, const T* in, T* out) {
+template <typename T>
+typename std::enable_if<std::is_integral<T>::value>::type elementwise_add_to(
+    BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len, const T* in,
+    T* out) {
  for (size_t i = 0; i < data_len; i++) {
    out[i] += in[i];
  }
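The refactor drops the unused DeviceContext parameter and lets std::enable_if pick an overload from the element type alone: floating-point types go through blas->AXPY, integral types through the plain loop. A compilable sketch of the same dispatch pattern (the names are hypothetical stand-ins, not the patch's code):

    #include <cstddef>
    #include <cstdio>
    #include <type_traits>

    // Floating-point element types take the BLAS-style path.
    template <typename T>
    typename std::enable_if<std::is_floating_point<T>::value>::type
    add_to(std::size_t n, const T* in, T* out) {
      // Stand-in for blas->AXPY(n, T(1.f), in, out).
      for (std::size_t i = 0; i < n; ++i) out[i] += in[i];
    }

    // Integral element types get a plain loop; BLAS has no integer axpy.
    template <typename T>
    typename std::enable_if<std::is_integral<T>::value>::type
    add_to(std::size_t n, const T* in, T* out) {
      for (std::size_t i = 0; i < n; ++i) out[i] += in[i];
    }

    int main() {
      double a[2] = {1.0, 2.0}, b[2] = {3.0, 4.0};
      int c[2] = {1, 2}, d[2] = {3, 4};
      add_to(2, a, b);  // selects the floating-point overload
      add_to(2, c, d);  // selects the integral overload
      std::printf("%g %d\n", b[0], d[0]);  // prints 4 4
    }

Note that platform::bfloat16 reaches the floating-point overload only if std::is_floating_point reports true for it, which Paddle's bfloat16 header is expected to specialize; that is what makes the new CBlas<platform::bfloat16>::AXPY fallback reachable from here.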
@@ -412,7 +410,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
    out.set_rows(merge_rows);
    math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
-    constant_functor(context, out.mutable_value(), 0.0);
+    constant_functor(context, out.mutable_value(), static_cast<T>(0.f));
    std::unordered_map<int64_t, size_t> rows_to_id;
    for (size_t i = 0; i < merge_rows.size(); ++i) {
@@ -429,9 +427,9 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
      for (size_t i = 0; i < input_rows.size(); i++) {
        size_t out_i = rows_to_id[input_rows[i]];
-        elementwise_add_to<platform::CPUDeviceContext, T>(
-            context, &blas, static_cast<size_t>(input_width),
-            &input_data[i * input_width], &out_data[out_i * input_width]);
+        elementwise_add_to<T>(&blas, static_cast<size_t>(input_width),
+                              &input_data[i * input_width],
+                              &out_data[out_i * input_width]);
      }
    }
  }
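For context, MergeAdd deduplicates the row index list of a SelectedRows and sums the value rows that share an index; rows_to_id maps each merged index to its output slot, and elementwise_add_to does the accumulation. A self-contained sketch of those semantics, with plain containers standing in for SelectedRows (illustrative only):

    #include <cstdint>
    #include <cstdio>
    #include <unordered_map>
    #include <vector>

    int main() {
      // A SelectedRows-like input: row indices may repeat.
      const int64_t width = 2;
      std::vector<int64_t> rows = {4, 1, 4};
      std::vector<float> data = {1, 1,   // values for row 4
                                 2, 2,   // values for row 1
                                 3, 3};  // more values for row 4
      // Merged row list keeps each index once (here in first-seen order).
      std::vector<int64_t> merge_rows = {4, 1};
      std::vector<float> out(merge_rows.size() * width, 0.f);  // zero-filled
      std::unordered_map<int64_t, size_t> rows_to_id;
      for (size_t i = 0; i < merge_rows.size(); ++i) rows_to_id[merge_rows[i]] = i;
      // Accumulate every input row into its merged slot (elementwise_add_to).
      for (size_t i = 0; i < rows.size(); ++i) {
        size_t out_i = rows_to_id[rows[i]];
        for (int64_t j = 0; j < width; ++j)
          out[out_i * width + j] += data[i * width + j];
      }
      std::printf("row 4 -> {%g, %g}, row 1 -> {%g, %g}\n",
                  out[0], out[1], out[2], out[3]);  // {4, 4} and {2, 2}
    }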
@@ -524,9 +522,9 @@ struct MergeAverage<platform::CPUDeviceContext, T> {
      for (size_t i = 0; i < input_rows.size(); i++) {
        size_t out_i = rows_to_id[input_rows[i]];
-        elementwise_add_to<platform::CPUDeviceContext, T>(
-            context, &blas, static_cast<size_t>(input_width),
-            &input_data[i * input_width], &out_data[out_i * input_width]);
+        elementwise_add_to<T>(&blas, static_cast<size_t>(input_width),
+                              &input_data[i * input_width],
+                              &out_data[out_i * input_width]);
      }
    }
    size_t input_width_cast = static_cast<size_t>(input_width);
@@ -547,6 +545,8 @@ template struct MergeAdd<platform::CPUDeviceContext,
                         paddle::platform::complex64>;
template struct MergeAdd<platform::CPUDeviceContext,
                         paddle::platform::complex128>;
+template struct MergeAdd<platform::CPUDeviceContext,
+                         paddle::platform::bfloat16>;
template struct MergeAverage<platform::CPUDeviceContext, int>;
template struct MergeAverage<platform::CPUDeviceContext, int64_t>;
......
@@ -326,4 +326,6 @@ REGISTER_OP_CPU_KERNEL(
    sum, ops::SumKernel<paddle::platform::CPUDeviceContext, float>,
    ops::SumKernel<paddle::platform::CPUDeviceContext, double>,
    ops::SumKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SumKernel<paddle::platform::CPUDeviceContext,
+                   paddle::platform::bfloat16>,
    ops::SumKernel<paddle::platform::CPUDeviceContext, int64_t>);
@@ -76,8 +76,7 @@ class TestSparseSGDOpBF16(unittest.TestCase):
        grad_selected_rows = scope.var('Grad').get_selected_rows()
        grad_selected_rows.set_height(height)
        grad_selected_rows.set_rows(rows)
-        # grad_array = np.random.random((len(rows), row_numel)).astype('float32')
-        grad_array = np.full((len(rows), row_numel), 2, np.float32)
+        grad_array = np.random.random((len(rows), row_numel)).astype('float32')
        np_array_bf16 = convert_float_to_uint16(grad_array)
        grad_tensor = grad_selected_rows.get_tensor()
@@ -87,8 +86,7 @@ class TestSparseSGDOpBF16(unittest.TestCase):
    def create_dense_param_var(self, scope, place, height, width):
        param_tensor = scope.var('Param').get_tensor()
-        # param_array = np.random.random((height, width)).astype('float32')
-        param_array = np.full((height, width), 5, np.float32)
+        param_array = np.random.random((height, width)).astype('float32')
        param_array_bf16 = convert_float_to_uint16(param_array)
        param_tensor.set(param_array_bf16, place)
@@ -109,8 +107,7 @@ class TestSparseSGDOpBF16(unittest.TestCase):
    def create_dense_lr_var(self, scope, place):
        lr_tensor = scope.var('LearningRate').get_tensor()
-        # lr_value = np.random.uniform()
-        lr_value = 2
+        lr_value = np.random.uniform()
        lr_array = np.full((1), lr_value, np.float32)
        lr_array_bf16 = convert_float_to_uint16(lr_array)
        lr_tensor.set(lr_array_bf16, place)
......
@@ -18,9 +18,12 @@ import unittest
import numpy as np
-from op_test import OpTest
import paddle
+from paddle import enable_static
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
+from paddle.fluid.tests.unittests.op_test import (
+    OpTest, convert_float_to_uint16, convert_uint16_to_float)
class TestSumOp(OpTest):
@@ -141,6 +144,73 @@ class TestSelectedRowsSumOp(unittest.TestCase):
                self.check_with_place(place, inplace)


class TestSelectedRowsSumOpInt(TestSelectedRowsSumOp):
    def init_kernel_type(self):
        self.dtype = np.int32
+@unittest.skipIf(not core.supports_bfloat16(),
+                 'place does not support BF16 evaluation')
+class TestSelectedRowsSumBF16Op(TestSelectedRowsSumOp):
+    def setUp(self):
+        self.height = 10
+        self.row_numel = 12
+        self.rows = [0, 1, 2, 3, 4, 5, 6]
+        self.dtype = np.uint16
+        self.init_kernel_type()
+        np.random.seed(12345)
+        self.data = np.random.random((len(self.rows),
+                                      self.row_numel)).astype(np.float32)
+
+    def _get_array(self, rows, row_numel):
+        if len(rows) > 0:
+            return convert_float_to_uint16(self.data)
+        else:
+            return np.ndarray((0, row_numel), dtype=self.dtype)
+
+    def check_input_and_optput(self,
+                               scope,
+                               place,
+                               inplace,
+                               w1_has_data=False,
+                               w2_has_data=False,
+                               w3_has_data=False):
+        self.create_selected_rows(scope, place, "W1", w1_has_data)
+        self.create_selected_rows(scope, place, "W2", w2_has_data)
+        self.create_selected_rows(scope, place, "W3", w3_has_data)
+
+        # create Out Variable
+        if inplace:
+            out_var_name = "W1"
+        else:
+            out_var_name = "Out"
+        out = scope.var(out_var_name).get_selected_rows()
+
+        # create and run sum operator
+        sum_op = Operator("sum", X=["W1", "W2", "W3"], Out=out_var_name)
+        sum_op.run(scope, place)
+
+        has_data_w_num = 0
+        for has_data in [w1_has_data, w2_has_data, w3_has_data]:
+            if has_data:
+                has_data_w_num += 1
+
+        if has_data_w_num > 0:
+            self.assertEqual(len(out.rows()), 7)
+            out_bf16 = np.array(out.get_tensor())
+            out_fp32 = convert_uint16_to_float(out_bf16)
+            ref_fp32 = convert_uint16_to_float(
+                self._get_array(self.rows, self.row_numel)) * has_data_w_num
+            np.testing.assert_allclose(out_fp32, ref_fp32, atol=0, rtol=0.95e-2)
+        else:
+            self.assertEqual(len(out.rows()), 0)
+
+    def test_w_is_selected_rows(self):
+        for inplace in [True, False]:
+            self.check_with_place(core.CPUPlace(), inplace)
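The check converts both sides back to fp32 and compares with rtol=0.95e-2, which matches bfloat16 precision: with only 7 stored mantissa bits, a single rounding step can introduce a relative error of up to about 2^-8 ≈ 0.39%, so summing up to three such tensors stays inside the tolerance. The convert_float_to_uint16/convert_uint16_to_float helpers treat bf16 as the upper 16 bits of an IEEE fp32; a sketch of that round trip (truncation shown for simplicity; the real helpers may round to nearest):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // fp32 -> bf16: keep sign, 8-bit exponent and the top 7 mantissa bits.
    uint16_t float_to_bf16(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));
      return static_cast<uint16_t>(bits >> 16);  // drop the low 16 bits
    }

    // bf16 -> fp32: put the 16 bits back on top, zero-fill the low mantissa.
    float bf16_to_float(uint16_t h) {
      uint32_t bits = static_cast<uint32_t>(h) << 16;
      float f;
      std::memcpy(&f, &bits, sizeof(bits));
      return f;
    }

    int main() {
      const float x = 1.2345f;
      const float r = bf16_to_float(float_to_bf16(x));
      // Relative error is bounded by ~2^-8, which the test's rtol absorbs.
      std::printf("%.6f -> %.6f\n", x, r);
    }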
class TestLoDTensorAndSelectedRowsOp(TestSelectedRowsSumOp):
    def setUp(self):
        self.height = 10
@@ -324,4 +394,5 @@ create_test_sum_fp16_class(TestSelectedRowsSumOp)
create_test_sum_fp16_class(TestLoDTensorAndSelectedRowsOp)
if __name__ == "__main__":
+    enable_static()
    unittest.main()