Sum kernel for CPU supporting BF16 and SelectedRows (#32631)

9599c3b3 · Adam Osewski · GitHub · 8b1b214f · 9599c3b3 · 9599c3b3
6 changed files
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -15,6 +15,7 @@
 #ifdef PADDLE_WITH_MKLML
 #include <mkl.h>
 #endif
 #include <algorithm>
 #include <cmath>
 #include <limits>
@@ -28,6 +29,19 @@
 namespace paddle {
 namespace operators {
 namespace math {
+namespace detail {
+template <typename T>
+static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
+                 const int incy) {
+  // Y = Y + alpha * X
+  while (n-- > 0) {
+    *y += alpha * *x;
+    y = y + incy;
+    x = x + incx;
+  }
+}
+}  // namespace detail
 template <typename T>
 struct CBlas;
@@ -43,6 +57,11 @@ struct CBlas<int8_t> {
 template <>
 struct CBlas<platform::bfloat16> {
+  template <typename... ARGS>
+  static void AXPY(ARGS... args) {
+    detail::axpy(args...);
+  }
  template <typename... ARGS>
  static void VCOPY(ARGS... args) {
    PADDLE_THROW(platform::errors::Unimplemented(

--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -285,6 +285,8 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, float>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
+template struct SelectedRowsAddToTensor<platform::CPUDeviceContext,
+                                        platform::bfloat16>;
 // This is a separate namespace for manipulating SelectedRows-typed
 // data, such as merging duplicated rows, adding two SelectedRows, etc.
@@ -294,21 +296,17 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
 // add or mul.
 namespace scatter {
-template <typename DeviceContext, typename T>
+template <typename T>
-typename std::enable_if<
+typename std::enable_if<std::is_floating_point<T>::value>::type
-    std::is_floating_point<T>::value &&
+elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+                   const T* in, T* out) {
-elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
+  blas->AXPY(data_len, T(1.f), in, out);
-                   size_t data_len, const T* in, T* out) {
-  blas->AXPY(data_len, 1., in, out);
 }
-template <typename DeviceContext, typename T>
+template <typename T>
-typename std::enable_if<
+typename std::enable_if<std::is_integral<T>::value>::type elementwise_add_to(
-    !std::is_floating_point<T>::value &&
+    BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len, const T* in,
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+    T* out) {
-elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
-                   size_t data_len, const T* in, T* out) {
  for (size_t i = 0; i < data_len; i++) {
    out[i] += in[i];
  }
@@ -412,7 +410,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
      out.set_rows(merge_rows);
      math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
-      constant_functor(context, out.mutable_value(), 0.0);
+      constant_functor(context, out.mutable_value(), static_cast<T>(0.f));
      std::unordered_map<int64_t, size_t> rows_to_id;
      for (size_t i = 0; i < merge_rows.size(); ++i) {
@@ -429,9 +427,9 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
        for (size_t i = 0; i < input_rows.size(); i++) {
          size_t out_i = rows_to_id[input_rows[i]];
-          elementwise_add_to<platform::CPUDeviceContext, T>(
+          elementwise_add_to<T>(&blas, static_cast<size_t>(input_width),
-              context, &blas, static_cast<size_t>(input_width),
+                                &input_data[i * input_width],
-              &input_data[i * input_width], &out_data[out_i * input_width]);
+                                &out_data[out_i * input_width]);
        }
      }
    }
@@ -524,9 +522,9 @@ struct MergeAverage<platform::CPUDeviceContext, T> {
      for (size_t i = 0; i < input_rows.size(); i++) {
        size_t out_i = rows_to_id[input_rows[i]];
-        elementwise_add_to<platform::CPUDeviceContext, T>(
+        elementwise_add_to<T>(&blas, static_cast<size_t>(input_width),
-            context, &blas, static_cast<size_t>(input_width),
+                              &input_data[i * input_width],
-            &input_data[i * input_width], &out_data[out_i * input_width]);
+                              &out_data[out_i * input_width]);
      }
    }
    size_t input_width_cast = static_cast<size_t>(input_width);
@@ -547,6 +545,8 @@ template struct MergeAdd<platform::CPUDeviceContext,
                         paddle::platform::complex64>;
 template struct MergeAdd<platform::CPUDeviceContext,
                         paddle::platform::complex128>;
+template struct MergeAdd<platform::CPUDeviceContext,
+                         paddle::platform::bfloat16>;
 template struct MergeAverage<platform::CPUDeviceContext, int>;
 template struct MergeAverage<platform::CPUDeviceContext, int64_t>;

--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -326,4 +326,6 @@ REGISTER_OP_CPU_KERNEL(
    sum, ops::SumKernel<paddle::platform::CPUDeviceContext, float>,
    ops::SumKernel<paddle::platform::CPUDeviceContext, double>,
    ops::SumKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SumKernel<paddle::platform::CPUDeviceContext,
+                   paddle::platform::bfloat16>,
    ops::SumKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -640,7 +640,7 @@ class BroadcastDataMKLDNNHandler
                             platform::Place cpu_place, const Tensor* x,
                             const Tensor* y, float scale_x, float scale_y,
                             const std::string& uniq_name,
-                             std::vector<int64_t>& input_dims)
+                             const std::vector<int64_t>& input_dims)
      : platform::MKLDNNHandlerT<T, dnnl::binary>(
            dev_ctx, engine, cpu_place,
            platform::CreateKey(dev_ctx, framework::vectorize(x->dims()),

--- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
@@ -76,8 +76,7 @@ class TestSparseSGDOpBF16(unittest.TestCase):
        grad_selected_rows = scope.var('Grad').get_selected_rows()
        grad_selected_rows.set_height(height)
        grad_selected_rows.set_rows(rows)
-        # grad_array = np.random.random((len(rows), row_numel)).astype('float32')
+        grad_array = np.random.random((len(rows), row_numel)).astype('float32')
-        grad_array = np.full((len(rows), row_numel), 2, np.float32)
        np_array_bf16 = convert_float_to_uint16(grad_array)
        grad_tensor = grad_selected_rows.get_tensor()
@@ -87,8 +86,7 @@ class TestSparseSGDOpBF16(unittest.TestCase):
    def create_dense_param_var(self, scope, place, height, width):
        param_tensor = scope.var('Param').get_tensor()
-        # param_array = np.random.random((height, width)).astype('float32')
+        param_array = np.random.random((height, width)).astype('float32')
-        param_array = np.full((height, width), 5, np.float32)
        param_array_bf16 = convert_float_to_uint16(param_array)
        param_tensor.set(param_array_bf16, place)
@@ -109,8 +107,7 @@ class TestSparseSGDOpBF16(unittest.TestCase):
    def create_dense_lr_var(self, scope, place):
        lr_tensor = scope.var('LearningRate').get_tensor()
-        # lr_value = np.random.uniform()
+        lr_value = np.random.uniform()
-        lr_value = 2
        lr_array = np.full((1), lr_value, np.float32)
        lr_array_bf16 = convert_float_to_uint16(lr_array)
        lr_tensor.set(lr_array_bf16, place)

--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -18,9 +18,12 @@ import unittest
 import numpy as np
 from op_test import OpTest
 import paddle
+from paddle import enable_static
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
+from paddle.fluid.tests.unittests.op_test import (
+    OpTest, convert_float_to_uint16, convert_uint16_to_float)
 class TestSumOp(OpTest):
@@ -141,6 +144,73 @@ class TestSelectedRowsSumOp(unittest.TestCase):
                self.check_with_place(place, inplace)
+class TestSelectedRowsSumOpInt(TestSelectedRowsSumOp):
+    def init_kernel_type(self):
+        self.dtype = np.int32
+@unittest.skipIf(not core.supports_bfloat16(),
+                 'place does not support BF16 evaluation')
+class TestSelectedRowsSumBF16Op(TestSelectedRowsSumOp):
+    def setUp(self):
+        self.height = 10
+        self.row_numel = 12
+        self.rows = [0, 1, 2, 3, 4, 5, 6]
+        self.dtype = np.uint16
+        self.init_kernel_type()
+        np.random.seed(12345)
+        self.data = np.random.random((len(self.rows),
+                                      self.row_numel)).astype(np.float32)
+    def _get_array(self, rows, row_numel):
+        if len(rows) > 0:
+            return convert_float_to_uint16(self.data)
+        else:
+            return np.ndarray((0, row_numel), dtype=self.dtype)
+    def check_input_and_optput(self,
+                               scope,
+                               place,
+                               inplace,
+                               w1_has_data=False,
+                               w2_has_data=False,
+                               w3_has_data=False):
+        self.create_selected_rows(scope, place, "W1", w1_has_data)
+        self.create_selected_rows(scope, place, "W2", w2_has_data)
+        self.create_selected_rows(scope, place, "W3", w3_has_data)
+        # create Out Variable
+        if inplace:
+            out_var_name = "W1"
+        else:
+            out_var_name = "Out"
+        out = scope.var(out_var_name).get_selected_rows()
+        # create and run sum operator
+        sum_op = Operator("sum", X=["W1", "W2", "W3"], Out=out_var_name)
+        sum_op.run(scope, place)
+        has_data_w_num = 0
+        for has_data in [w1_has_data, w2_has_data, w3_has_data]:
+            if has_data:
+                has_data_w_num += 1
+        if has_data_w_num > 0:
+            self.assertEqual(len(out.rows()), 7)
+            out_bf16 = np.array(out.get_tensor())
+            out_fp32 = convert_uint16_to_float(out_bf16)
+            ref_fp32 = convert_uint16_to_float(
+                self._get_array(self.rows, self.row_numel)) * has_data_w_num
+            np.testing.assert_allclose(out_fp32, ref_fp32, atol=0, rtol=0.95e-2)
+        else:
+            self.assertEqual(len(out.rows()), 0)
+    def test_w_is_selected_rows(self):
+        for inplace in [True, False]:
+            self.check_with_place(core.CPUPlace(), inplace)
 class TestLoDTensorAndSelectedRowsOp(TestSelectedRowsSumOp):
    def setUp(self):
        self.height = 10
@@ -324,4 +394,5 @@ create_test_sum_fp16_class(TestSelectedRowsSumOp)
 create_test_sum_fp16_class(TestLoDTensorAndSelectedRowsOp)
 if __name__ == "__main__":
+    enable_static()
    unittest.main()