From f3436af1ba8403f59fba592857e7582713a30011 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 6 May 2021 14:07:14 +0200
Subject: [PATCH] [cherry-pick] Sum kernel for CPU supporting BF16 and
 SelectedRows (#32631) (#32755)

---
 paddle/fluid/operators/math/blas_impl.h       | 19 +++
 .../operators/math/selected_rows_functor.cc   | 40 +++++------
 paddle/fluid/operators/sum_op.cc              |  2 +
 .../fluid/tests/unittests/test_sgd_op_bf16.py |  9 +--
 .../fluid/tests/unittests/test_sum_op.py      | 71 +++++++++++++++++++
 5 files changed, 115 insertions(+), 26 deletions(-)

diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 64b533de098..05d42f02c10 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -15,6 +15,7 @@
 #ifdef PADDLE_WITH_MKLML
 #include <mkl.h>
 #endif
+
 #include <cmath>
 #include <limits>
 #include <vector>
@@ -28,6 +29,19 @@
 namespace paddle {
 namespace operators {
 namespace math {
+namespace detail {
+
+template <typename T>
+static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
+                 const int incy) {
+  // Y = Y + alpha * X
+  while (n-- > 0) {
+    *y += alpha * *x;
+    y = y + incy;
+    x = x + incx;
+  }
+}
+}  // namespace detail
 
 template <typename T>
 struct CBlas;
@@ -43,6 +57,11 @@
 template <>
 struct CBlas<platform::bfloat16> {
+  template <typename... ARGS>
+  static void AXPY(ARGS... args) {
+    detail::axpy(args...);
+  }
+
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
     PADDLE_THROW(platform::errors::Unimplemented(
         "Blas VCOPY do not supported on CPU, please check your code"));
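NOTE (illustration, not part of the patch): the new CBlas<platform::bfloat16>::AXPY
forwards to detail::axpy, a naive strided loop computing Y = Y + alpha * X. MKL's
cblas_?axpy family covers float/double/complex only, so bfloat16 needs this
handwritten fallback. A minimal self-contained sketch of the same loop, shown on
plain float for clarity:

    #include <iostream>

    // Reference AXPY: y[i * incy] += alpha * x[i * incx] for n elements.
    template <typename T>
    static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
                     const int incy) {
      while (n-- > 0) {
        *y += alpha * *x;
        y = y + incy;
        x = x + incx;
      }
    }

    int main() {
      float x[4] = {1.f, 2.f, 3.f, 4.f};
      float y[4] = {10.f, 20.f, 30.f, 40.f};
      axpy(4, 0.5f, x, 1, y, 1);  // y becomes {10.5, 21, 31.5, 42}
      for (float v : y) std::cout << v << ' ';
      std::cout << '\n';
      return 0;
    }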
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index f7b16453e01..b9a1854a661 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -285,6 +285,8 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, float>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
+template struct SelectedRowsAddToTensor<platform::CPUDeviceContext,
+                                        platform::bfloat16>;
 
 // This is a separated namespace for manipulate SelectedRows typed
 // data. Like merge duplicated rows, adding two SelectedRows etc.
@@ -294,21 +296,17 @@ // add or mul.
 namespace scatter {
 
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_floating_point<T>::value &&
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
-                   size_t data_len, const T* in, T* out) {
-  blas->AXPY(data_len, 1., in, out);
+template <typename T>
+typename std::enable_if<std::is_floating_point<T>::value>::type
+elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
+                   const T* in, T* out) {
+  blas->AXPY(data_len, T(1.f), in, out);
 }
 
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    !std::is_floating_point<T>::value &&
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
-                   size_t data_len, const T* in, T* out) {
+template <typename T>
+typename std::enable_if<!std::is_floating_point<T>::value>::type elementwise_add_to(
+    BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len, const T* in,
+    T* out) {
   for (size_t i = 0; i < data_len; i++) {
     out[i] += in[i];
   }
 }
@@ -412,7 +410,7 @@ struct MergeAdd {
     out.set_rows(merge_rows);
 
     math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
-    constant_functor(context, out.mutable_value(), 0.0);
+    constant_functor(context, out.mutable_value(), static_cast<T>(0.f));
 
     std::unordered_map<int64_t, size_t> rows_to_id;
     for (size_t i = 0; i < merge_rows.size(); ++i) {
@@ -429,9 +427,9 @@ struct MergeAdd {
 
       for (size_t i = 0; i < input_rows.size(); i++) {
         size_t out_i = rows_to_id[input_rows[i]];
-        elementwise_add_to<platform::CPUDeviceContext, T>(
-            context, &blas, static_cast<size_t>(input_width),
-            &input_data[i * input_width], &out_data[out_i * input_width]);
+        elementwise_add_to<T>(&blas, static_cast<size_t>(input_width),
+                              &input_data[i * input_width],
+                              &out_data[out_i * input_width]);
       }
     }
   }
@@ -524,9 +522,9 @@ struct MergeAverage {
 
      for (size_t i = 0; i < input_rows.size(); i++) {
        size_t out_i = rows_to_id[input_rows[i]];
-       elementwise_add_to<platform::CPUDeviceContext, T>(
-           context, &blas, static_cast<size_t>(input_width),
-           &input_data[i * input_width], &out_data[out_i * input_width]);
+       elementwise_add_to<T>(&blas, static_cast<size_t>(input_width),
+                             &input_data[i * input_width],
+                             &out_data[out_i * input_width]);
      }
    }
    size_t input_width_cast = static_cast<size_t>(input_width);
@@ -547,6 +545,8 @@
 
 template struct MergeAdd<platform::CPUDeviceContext, int>;
 template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
+template struct MergeAdd<platform::CPUDeviceContext,
+                         platform::bfloat16>;
 
 template struct MergeAverage<platform::CPUDeviceContext, int>;
 template struct MergeAverage<platform::CPUDeviceContext, int64_t>;
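NOTE (illustration, not part of the patch): the rewritten elementwise_add_to
dispatches on std::is_floating_point<T> alone: floating-point element types go
through BLAS AXPY with alpha = 1, everything else is accumulated with an explicit
loop. For BF16 to take the AXPY path, platform::bfloat16 must report true from
std::is_floating_point; Paddle provides such specializations for its custom float
types, and the CBlas<platform::bfloat16>::AXPY fallback added in blas_impl.h makes
that path actually work. A standalone sketch of the dispatch, with a hypothetical
FakeBlas standing in for Paddle's BlasT:

    #include <cstddef>
    #include <iostream>
    #include <type_traits>

    struct FakeBlas {
      template <typename T>
      void AXPY(size_t n, T alpha, const T *x, T *y) {
        for (size_t i = 0; i < n; ++i) y[i] += alpha * x[i];
        std::cout << "AXPY path\n";
      }
    };

    // Floating-point types (including a bfloat16 that specializes
    // std::is_floating_point) are handed to BLAS AXPY with alpha = 1.
    template <typename T>
    typename std::enable_if<std::is_floating_point<T>::value>::type
    elementwise_add_to(FakeBlas *blas, size_t n, const T *in, T *out) {
      blas->AXPY(n, T(1.f), in, out);
    }

    // Everything else (int, int64_t, ...) is accumulated by hand.
    template <typename T>
    typename std::enable_if<!std::is_floating_point<T>::value>::type
    elementwise_add_to(FakeBlas *, size_t n, const T *in, T *out) {
      for (size_t i = 0; i < n; ++i) out[i] += in[i];
      std::cout << "loop path\n";
    }

    int main() {
      FakeBlas blas;
      float fin[2] = {1.f, 2.f}, fout[2] = {0.f, 0.f};
      int iin[2] = {1, 2}, iout[2] = {0, 0};
      elementwise_add_to(&blas, 2, fin, fout);  // prints "AXPY path"
      elementwise_add_to(&blas, 2, iin, iout);  // prints "loop path"
      return 0;
    }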
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index 741f86f3584..0f520adba57 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -326,4 +326,6 @@ REGISTER_OP_CPU_KERNEL(
     sum, ops::SumKernel<paddle::platform::CPUDeviceContext, float>,
     ops::SumKernel<paddle::platform::CPUDeviceContext, double>,
     ops::SumKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SumKernel<paddle::platform::CPUDeviceContext,
+                   paddle::platform::bfloat16>,
     ops::SumKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
index 0717ec80f6a..fa8ff4effcf 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
@@ -76,8 +76,7 @@ class TestSparseSGDOpBF16(unittest.TestCase):
         grad_selected_rows = scope.var('Grad').get_selected_rows()
         grad_selected_rows.set_height(height)
         grad_selected_rows.set_rows(rows)
-        # grad_array = np.random.random((len(rows), row_numel)).astype('float32')
-        grad_array = np.full((len(rows), row_numel), 2, np.float32)
+        grad_array = np.random.random((len(rows), row_numel)).astype('float32')
         np_array_bf16 = convert_float_to_uint16(grad_array)
 
         grad_tensor = grad_selected_rows.get_tensor()
@@ -87,8 +86,7 @@ class TestSparseSGDOpBF16(unittest.TestCase):
 
     def create_dense_param_var(self, scope, place, height, width):
         param_tensor = scope.var('Param').get_tensor()
-        # param_array = np.random.random((height, width)).astype('float32')
-        param_array = np.full((height, width), 5, np.float32)
+        param_array = np.random.random((height, width)).astype('float32')
         param_array_bf16 = convert_float_to_uint16(param_array)
         param_tensor.set(param_array_bf16, place)
 
@@ -109,8 +107,7 @@ class TestSparseSGDOpBF16(unittest.TestCase):
 
     def create_dense_lr_var(self, scope, place):
         lr_tensor = scope.var('LearningRate').get_tensor()
-        # lr_value = np.random.uniform()
-        lr_value = 2
+        lr_value = np.random.uniform()
         lr_array = np.full((1), lr_value, np.float32)
         lr_array_bf16 = convert_float_to_uint16(lr_array)
         lr_tensor.set(lr_array_bf16, place)
diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py
index 35dc92ffb08..f9e40cf8133 100644
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -18,9 +18,12 @@ import unittest
 import numpy as np
 from op_test import OpTest
 import paddle
+from paddle import enable_static
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
+from paddle.fluid.tests.unittests.op_test import (
+    OpTest, convert_float_to_uint16, convert_uint16_to_float)
 
 
 class TestSumOp(OpTest):
@@ -141,6 +144,73 @@ class TestSelectedRowsSumOp(unittest.TestCase):
             self.check_with_place(place, inplace)
 
 
+class TestSelectedRowsSumOpInt(TestSelectedRowsSumOp):
+    def init_kernel_type(self):
+        self.dtype = np.int32
+
+
+@unittest.skipIf(not core.supports_bfloat16(),
+                 'place does not support BF16 evaluation')
+class TestSelectedRowsSumBF16Op(TestSelectedRowsSumOp):
+    def setUp(self):
+        self.height = 10
+        self.row_numel = 12
+        self.rows = [0, 1, 2, 3, 4, 5, 6]
+        self.dtype = np.uint16
+        self.init_kernel_type()
+        np.random.seed(12345)
+        self.data = np.random.random((len(self.rows),
+                                      self.row_numel)).astype(np.float32)
+
+    def _get_array(self, rows, row_numel):
+        if len(rows) > 0:
+            return convert_float_to_uint16(self.data)
+        else:
+            return np.ndarray((0, row_numel), dtype=self.dtype)
+
+    def check_input_and_optput(self,
+                               scope,
+                               place,
+                               inplace,
+                               w1_has_data=False,
+                               w2_has_data=False,
+                               w3_has_data=False):
+
+        self.create_selected_rows(scope, place, "W1", w1_has_data)
+        self.create_selected_rows(scope, place, "W2", w2_has_data)
+        self.create_selected_rows(scope, place, "W3", w3_has_data)
+
+        # create Out Variable
+        if inplace:
+            out_var_name = "W1"
+        else:
+            out_var_name = "Out"
+        out = scope.var(out_var_name).get_selected_rows()
+
+        # create and run sum operator
+        sum_op = Operator("sum", X=["W1", "W2", "W3"], Out=out_var_name)
+        sum_op.run(scope, place)
+
+        has_data_w_num = 0
+        for has_data in [w1_has_data, w2_has_data, w3_has_data]:
+            if has_data:
+                has_data_w_num += 1
+
+        if has_data_w_num > 0:
+            self.assertEqual(len(out.rows()), 7)
+            out_bf16 = np.array(out.get_tensor())
+            out_fp32 = convert_uint16_to_float(out_bf16)
+            ref_fp32 = convert_uint16_to_float(
+                self._get_array(self.rows, self.row_numel)) * has_data_w_num
+            np.testing.assert_allclose(out_fp32, ref_fp32, atol=0, rtol=0.95e-2)
+        else:
+            self.assertEqual(len(out.rows()), 0)
+
+    def test_w_is_selected_rows(self):
+        for inplace in [True, False]:
+            self.check_with_place(core.CPUPlace(), inplace)
+
+
 class TestLoDTensorAndSelectedRowsOp(TestSelectedRowsSumOp):
     def setUp(self):
         self.height = 10
@@ -324,4 +394,5 @@ create_test_sum_fp16_class(TestSelectedRowsSumOp)
 create_test_sum_fp16_class(TestLoDTensorAndSelectedRowsOp)
 
 if __name__ == "__main__":
+    enable_static()
     unittest.main()
-- 
GitLab
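NOTE (illustration, not part of the patch): TestSelectedRowsSumBF16Op compares in
float32 with rtol=0.95e-2 because bfloat16 keeps only the upper 16 bits of an
IEEE-754 float (8 exponent bits, 7 explicit mantissa bits), i.e. roughly 2-3
significant decimal digits. The helpers convert_float_to_uint16 and
convert_uint16_to_float are assumed here to implement exactly that bit-level
truncation and zero-extension; a minimal C++ sketch of such a conversion:

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // bfloat16 stored as uint16: the high half of a float's bit pattern.
    static uint16_t float_to_bf16(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));
      return static_cast<uint16_t>(bits >> 16);  // drop low 16 mantissa bits
    }

    static float bf16_to_float(uint16_t h) {
      uint32_t bits = static_cast<uint32_t>(h) << 16;  // zero-extend low half
      float f;
      std::memcpy(&f, &bits, sizeof(f));
      return f;
    }

    int main() {
      float v = 0.3f;
      float r = bf16_to_float(float_to_bf16(v));
      // With a 7-bit mantissa the round-trip error stays within about 2^-8
      // relative, which is why the test tolerates rtol on the order of 1e-2.
      std::cout << v << " -> " << r << '\n';  // prints 0.3 -> 0.298828
      return 0;
    }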