From 773aabc7710f030b7b2e83d30637cd94c3379194 Mon Sep 17 00:00:00 2001
From: lidanqing
Date: Mon, 21 Jun 2021 18:28:25 +0800
Subject: [PATCH] Add AXPY oneDNN handler (#33632)

* Add oneDNN AXPY handler.
* Add fallback for small tensors.
* Fix ifdefs
* Remove unnecessary namespace prefixes and add missing headers.
* Guard handler_axpy with proper ifdefs.
* Compilation of this function is possible only when Paddle is not built with CUDA nor HIP.
* Move AXPY handler code to separate files.
* Use oneDNN AXPY handler in SGD op.
* Use axpy handler only when Paddle is built with oneDNN.
* Add test for SUM BF16 with big rows.
* Fix SFINAE rules for elementwise_add_to.
* Add test case for SGD with big rows.
* update
* update

Co-authored-by: Adam Osewski
---
 paddle/fluid/operators/CMakeLists.txt         |   3 +
 paddle/fluid/operators/math/CMakeLists.txt    |   8 +-
 .../operators/math/selected_rows_functor.cc   |  23 +++
 paddle/fluid/operators/mkldnn/CMakeLists.txt  |   1 +
 paddle/fluid/operators/mkldnn/axpy_handler.cc | 152 ++++++++++++++++++
 paddle/fluid/operators/mkldnn/axpy_handler.h  |  33 ++++
 paddle/fluid/operators/optimizers/sgd_op.h    |   9 ++
 .../fluid/tests/unittests/test_sgd_op_bf16.py |   9 +-
 .../fluid/tests/unittests/test_sum_op.py      |   5 +
 9 files changed, 240 insertions(+), 3 deletions(-)
 create mode 100644 paddle/fluid/operators/mkldnn/CMakeLists.txt
 create mode 100644 paddle/fluid/operators/mkldnn/axpy_handler.cc
 create mode 100644 paddle/fluid/operators/mkldnn/axpy_handler.h

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 14912ac3a7d..0956410041b 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -18,6 +18,9 @@ add_subdirectory(optimizers)
 add_subdirectory(reduce_ops)
 add_subdirectory(sequence_ops)
 add_subdirectory(jit)
+if(WITH_MKLDNN)
+  add_subdirectory(mkldnn)
+endif()
 
 if(WITH_DISTRIBUTE)
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index fdbc0c68525..a13fffe15cf 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -56,7 +56,13 @@ cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
 math_library(math_function DEPS blas)
 math_library(maxouting)
 math_library(pooling)
-math_library(selected_rows_functor DEPS selected_rows math_function blas)
+
+if(WITH_MKLDNN)
+  math_library(selected_rows_functor DEPS selected_rows math_function blas mkldnn_axpy_handler)
+else()
+  math_library(selected_rows_functor DEPS selected_rows math_function blas)
+endif()
+
 math_library(sequence2batch)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index ee405be5ae9..a72bdec05d7 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -14,6 +14,10 @@ limitations under the License.
*/ #include "paddle/fluid/operators/math/selected_rows_functor.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/operators/mkldnn/axpy_handler.h" +#endif + namespace paddle { namespace operators { namespace math { @@ -296,6 +300,24 @@ template struct SelectedRowsAddToTensor +typename std::enable_if::value || + std::is_same::value>::type +elementwise_add_to(BlasT* blas, size_t data_len, + const T* in, T* out) { + onednn_handler_axpy(data_len, T(1.f), in, out); +} + +template +typename std::enable_if::value || + std::is_same>::value || + std::is_same>::value>::type +elementwise_add_to(BlasT* blas, size_t data_len, + const T* in, T* out) { + blas->AXPY(data_len, T(1.f), in, out); +} +#else template typename std::enable_if::value || std::is_same>::value || @@ -304,6 +326,7 @@ elementwise_add_to(BlasT* blas, size_t data_len, const T* in, T* out) { blas->AXPY(data_len, T(1.f), in, out); } +#endif template typename std::enable_if::value>::type elementwise_add_to( diff --git a/paddle/fluid/operators/mkldnn/CMakeLists.txt b/paddle/fluid/operators/mkldnn/CMakeLists.txt new file mode 100644 index 00000000000..ce95ec560c2 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(mkldnn_axpy_handler SRCS axpy_handler.cc DEPS place device_context enforce) diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc new file mode 100644 index 00000000000..76101f19ab6 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+
+#include <cinttypes>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+
+namespace plat = paddle::platform;
+
+namespace {
+
+template <typename T>
+class AXPYMKLDNNHandler : public plat::MKLDNNHandlerT<T, dnnl::reorder> {
+ public:
+  AXPYMKLDNNHandler(const plat::MKLDNNDeviceContext &dev_ctx,
+                    const dnnl::engine mkldnn_engine, plat::Place cpu_place,
+                    int n, float alpha)
+      : plat::MKLDNNHandlerT<T, dnnl::reorder>(
+            dev_ctx, mkldnn_engine, cpu_place,
+            plat::CreateKey(dev_ctx, static_cast<int64_t>(n),
+                            plat::MKLDNNGetDataType<T>(), alpha, "-axpy")),
+        alpha_(alpha),
+        n_(n) {}
+
+  std::shared_ptr<dnnl::memory> AcquireMemory(void *ptr,
+                                              const std::string &suffix) {
+    /*Generate key*/
+    auto local_key = this->key_ + suffix;
+    auto mem_p = std::static_pointer_cast<dnnl::memory>(
+        this->dev_ctx_.GetBlob(local_key));
+    if (mem_p == nullptr) {
+      auto md = dnnl::memory::desc({n_}, plat::MKLDNNGetDataType<T>(),
+                                   dnnl::memory::format_tag::x);
+      mem_p = std::make_shared<dnnl::memory>(md, this->engine_, ptr);
+      this->dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireSrcMemory(const T *x) {
+    return this->AcquireMemory(plat::to_void_cast(x), "@user_src_mem_p");
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireDstMemory(T *y) {
+    return this->AcquireMemory(y, "@user_dst_mem_p");
+  }
+
+  std::shared_ptr<dnnl::reorder> AcquireReorder(
+      std::shared_ptr<dnnl::memory> dst_memory_p,
+      std::shared_ptr<dnnl::memory> src_memory_p) {
+    auto prim_key = this->key_ + "@reorder_p";
+    auto reorder_p = std::static_pointer_cast<dnnl::reorder>(
+        this->dev_ctx_.GetBlob(prim_key));
+    if (reorder_p == nullptr) {
+      // Here we pass post-ops to mimic y -> a*X + y
+      dnnl::primitive_attr reorder_attr;
+      dnnl::post_ops post_operations;
+      if (this->alpha_ != 1.f) {
+        std::vector<float> scales(1, this->alpha_);
+        reorder_attr.set_output_scales(0, scales);
+      }
+      post_operations.append_sum(1.0f);
+
+      reorder_attr.set_post_ops(post_operations);
+      reorder_p = std::make_shared<dnnl::reorder>(
+          *(src_memory_p), *(dst_memory_p), reorder_attr);
+      this->dev_ctx_.SetBlob(prim_key, reorder_p);
+    }
+    return reorder_p;
+  }
+
+ private:
+  float alpha_;
+  int n_;
+};
+
+template class AXPYMKLDNNHandler<float>;
+template class AXPYMKLDNNHandler<plat::bfloat16>;
+
+}  // anonymous namespace
+
+template <typename T>
+static void naive_axpy(int n, T alpha, const T *x, T *y) {
+  while (n-- > 0) {
+    *y += alpha * *x;
+    ++y;
+    ++x;
+  }
+}
+
+template <typename T>
+void onednn_handler_axpy(int n, T alpha, const T *x, T *y) {
+  // fallback to naive version
+  if (n < 100) {
+    naive_axpy(n, alpha, x, y);
+    return;
+  }
+
+  auto &pool = plat::DeviceContextPool::Instance();
+  auto cpu_place = plat::CPUPlace();
+  auto *dev_ctx =
+      dynamic_cast<plat::MKLDNNDeviceContext *>(pool.Get(cpu_place));
+  auto &cpu_engine = dev_ctx->GetEngine();
+
+  AXPYMKLDNNHandler<T> handler(*dev_ctx, cpu_engine, cpu_place, n,
+                               static_cast<float>(alpha));
+
+  auto reorder_src_memory_p = handler.AcquireSrcMemory(x);
+  auto reorder_dst_memory_p = handler.AcquireDstMemory(y);
+  auto reorder_p =
+      handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
+
+  auto &astream = plat::MKLDNNDeviceContext::tls().get_stream();
+  plat::RecordEvent record_reorder("axpy_int_reorder",
+                                   plat::EventRole::kUniqueOp);
+  reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+  astream.wait();
+}
+
+template void onednn_handler_axpy<float>(int, float, const float *, float *);
+template void onednn_handler_axpy<plat::bfloat16>(int, plat::bfloat16,
+                                                  const plat::bfloat16 *,
+                                                  plat::bfloat16 *);
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.h b/paddle/fluid/operators/mkldnn/axpy_handler.h
new file mode 100644
index 00000000000..8f0fdeb5c02
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/axpy_handler.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+namespace paddle {
+namespace operators {
+
+///
+/// @brief Helper function to execute AXPY using oneDNN.
+///
+/// @param[in] n     The number of elements in the tensor (assumed 1-D).
+/// @param[in] alpha The alpha coefficient.
+/// @param[in] x     The pointer to the input X tensor.
+/// @param     y     The pointer to the output Y tensor.
+///
+/// @tparam T Data type.
+///
+template <typename T>
+void onednn_handler_axpy(int n, T alpha, const T *x, T *y);
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h
index 076121c0e27..076afdc6553 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ b/paddle/fluid/operators/optimizers/sgd_op.h
@@ -19,6 +19,9 @@ limitations under the License.
*/ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/operators/jit/kernels.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/operators/mkldnn/axpy_handler.h" +#endif #include "paddle/fluid/platform/bfloat16.h" namespace paddle { @@ -139,9 +142,15 @@ struct sgd_dense_param_kernel< "Got [%s], but expected less than [%s]", grad_rows[i], grad_height)); const int64_t row = grad_rows[i]; +#ifdef PADDLE_WITH_MKLDNN + operators::onednn_handler_axpy(grad_width, -lr[0], + grad_data + i * grad_width, + out_data + row * grad_width); +#else for (int64_t j = 0; j < grad_width; ++j) { out_data[row * grad_width + j] -= lr[0] * grad_data[i * grad_width + j]; } +#endif } } }; diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py index fa8ff4effcf..e60b04257db 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py @@ -158,6 +158,13 @@ class TestSparseGradSGDOpBF16Case2(TestSparseGradSGDOpBF16): self.grad_row_numel = 16 +class TestSparseGradSGDOpBF16Case3(TestSparseGradSGDOpBF16): + def setup_params(self): + self.grad_height = 10 + self.grad_rows = [0, 4, 7] + self.grad_row_numel = 120 + + @unittest.skipIf(not core.supports_bfloat16(), 'place does not support BF16 evaluation') class TestSparseGradParamSGDOpBF16(TestSparseSGDOpBF16): @@ -194,8 +201,6 @@ class TestSparseGradParamSGDOpBF16(TestSparseSGDOpBF16): self.check_output(output, reference, atol=5e-3, rtol=1e-1) -@unittest.skipIf(not core.supports_bfloat16(), - 'place does not support BF16 evaluation') class TestSparseGradParamSGDOpBF16Case2(TestSparseGradParamSGDOpBF16): def setup_params(self): self.grad_height = 14 diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index f9e40cf8133..f0fbd143c5a 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -211,6 +211,11 @@ class TestSelectedRowsSumBF16Op(TestSelectedRowsSumOp): self.check_with_place(core.CPUPlace(), inplace) +class TestSelectedRowsSumBF16OpBigRow(TestSelectedRowsSumBF16Op): + def init_kernel_type(self): + self.row_numel = 102 + + class TestLoDTensorAndSelectedRowsOp(TestSelectedRowsSumOp): def setUp(self): self.height = 10 -- GitLab