Add AXPY oneDNN handler (#33632)

* Add oneDNN AXPY handler. * Add fallback for small tensors. * Fix ifdefs * Remove unnecessary namespace prefixes and add missing headers. * Guard handler_axpy with proper ifdefs. * Compilation of this function is possible only when Paddle is built with neither CUDA nor HIP. * Move AXPY handler code to separate files. * Use oneDNN AXPY handler in SGD op. * Use axpy handler only when Paddle is built with oneDNN. * Add test for SUM BF16 with big rows. * Fix SFINAE rules for elementwise_add_to. * Add test case for SGD with big rows. * update * update Co-authored-by: N Adam Osewski <adam.osewski@intel.com>

Add AXPY oneDNN handler (#33632)
* Add oneDNN AXPY handler. * Add fallback for small tensors. * Fix ifdefs * Remove unnecessary namespace prefixes and add missing headers. * Guard handler_axpy with proper ifdefs. * Compilation of this function is possible only when Paddle is built with neither CUDA nor HIP. * Move AXPY handler code to separate files. * Use oneDNN AXPY handler in SGD op. * Use axpy handler only when Paddle is built with oneDNN. * Add test for SUM BF16 with big rows. * Fix SFINAE rules for elementwise_add_to. * Add test case for SGD with big rows. * update * update Co-authored-by: N Adam Osewski <adam.osewski@intel.com>
773aabc7 · lidanqing · GitHub · e0e0c0fa · 773aabc7 · 773aabc7
9 changed files
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -18,6 +18,9 @@ add_subdirectory(optimizers)
 add_subdirectory(reduce_ops)
 add_subdirectory(sequence_ops)
 add_subdirectory(jit)
+if(WITH_MKLDNN)
+    add_subdirectory(mkldnn)
+endif()


 if(WITH_DISTRIBUTE)

--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -56,7 +56,13 @@ cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
 math_library(math_function DEPS blas)
 math_library(maxouting)
 math_library(pooling)
-math_library(selected_rows_functor DEPS selected_rows math_function blas)
+
+if(WITH_MKLDNN)
+    math_library(selected_rows_functor DEPS selected_rows math_function blas mkldnn_axpy_handler)
+else()
+    math_library(selected_rows_functor DEPS selected_rows math_function blas)
+endif()
+
 math_library(sequence2batch)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)

--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -14,6 +14,10 @@ limitations under the License. */

 #include "paddle/fluid/operators/math/selected_rows_functor.h"

+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
+#endif
+
 namespace paddle {
 namespace operators {
 namespace math {
@@ -296,6 +300,24 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext,
 // add or mul.
 namespace scatter {

+#ifdef PADDLE_WITH_MKLDNN
+template <typename T>  // oneDNN build: overload enabled only for float and bfloat16
+typename std::enable_if<std::is_same<T, float>::value ||
+                        std::is_same<T, platform::bfloat16>::value>::type
+elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
+                   const T* in, T* out) {  // `blas` is not used by this overload
+  onednn_handler_axpy(data_len, T(1.f), in, out);  // AXPY with alpha = 1: out += in
+}  // NOTE(review): size_t data_len is narrowed to the helper's `int n` parameter
+
+template <typename T>  // oneDNN build: double and complex types stay on the BLAS path
+typename std::enable_if<std::is_same<T, double>::value ||
+                        std::is_same<T, platform::complex<float>>::value ||
+                        std::is_same<T, platform::complex<double>>::value>::type
+elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
+                   const T* in, T* out) {
+  blas->AXPY(data_len, T(1.f), in, out);  // presumably out += in (AXPY with alpha = 1)
+}
+#else
 template <typename T>
 typename std::enable_if<std::is_floating_point<T>::value ||
                        std::is_same<T, platform::complex<float>>::value ||
@@ -304,6 +326,7 @@ elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
                   const T* in, T* out) {
  blas->AXPY(data_len, T(1.f), in, out);
 }
+#endif

 template <typename T>
 typename std::enable_if<std::is_integral<T>::value>::type elementwise_add_to(

--- a/paddle/fluid/operators/mkldnn/CMakeLists.txt
+++ b/paddle/fluid/operators/mkldnn/CMakeLists.txt
+# Library target for the oneDNN AXPY helper implemented in axpy_handler.cc.
+cc_library(mkldnn_axpy_handler SRCS axpy_handler.cc DEPS place device_context enforce)
--- a/paddle/fluid/operators/mkldnn/axpy_handler.cc
+++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cinttypes>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+
+namespace plat = paddle::platform;
+
+namespace {
+
+// Handler that expresses AXPY (y = alpha * x + y) as a oneDNN reorder from x
+// into y, with an optional output scale and a "sum" post-op. The memory
+// objects and the reorder primitive it creates are stored in, and looked up
+// from, the device context blob store under string keys derived from key_.
+template <typename T>
+class AXPYMKLDNNHandler : public plat::MKLDNNHandlerT<T, dnnl::reorder> {
+ public:
+  // Builds the base handler with a key made from the device context, the
+  // element count n, the oneDNN data type of T, alpha and the "-axpy" tag,
+  // then remembers alpha and n for later primitive/memory creation.
+  AXPYMKLDNNHandler(const plat::MKLDNNDeviceContext &dev_ctx,
+                    const dnnl::engine mkldnn_engine, plat::Place cpu_place,
+                    int n, float alpha)
+      : plat::MKLDNNHandlerT<T, dnnl::reorder>(
+            dev_ctx, mkldnn_engine, cpu_place,
+            plat::CreateKey(dev_ctx, static_cast<int64_t>(n),
+                            plat::MKLDNNGetDataType<T>(), alpha, "-axpy")),
+        alpha_(alpha),
+        n_(n) {}
+
+  // Returns a dnnl::memory object that wraps `ptr`. The object is looked up
+  // in the blob store under key_ + suffix; on a miss a new one-dimensional
+  // memory of n_ elements of type T is created and stored, on a hit the
+  // stored object is re-pointed at `ptr`.
+  std::shared_ptr<dnnl::memory> AcquireMemory(void *ptr,
+                                              const std::string &suffix) {
+    /* Key of this memory object: the handler key plus the caller's suffix. */
+    auto local_key = this->key_ + suffix;
+    auto mem_p = std::static_pointer_cast<dnnl::memory>(
+        this->dev_ctx_.GetBlob(local_key));
+    if (mem_p == nullptr) {
+      // Not stored yet: describe a flat {n_} tensor (format tag x) over ptr.
+      auto md = dnnl::memory::desc({n_}, plat::MKLDNNGetDataType<T>(),
+                                   dnnl::memory::format_tag::x);
+      mem_p = std::make_shared<dnnl::memory>(md, this->engine_, ptr);
+      this->dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      // Already stored: keep the descriptor, only swap the data pointer.
+      mem_p->set_data_handle(ptr);
+    }
+    return mem_p;
+  }
+
+  // Memory object for the input x, stored under the "@user_src_mem_p" suffix.
+  // The const pointer is converted with plat::to_void_cast before wrapping.
+  std::shared_ptr<dnnl::memory> AcquireSrcMemory(const T *x) {
+    return this->AcquireMemory(plat::to_void_cast(x), "@user_src_mem_p");
+  }
+
+  // Memory object for the output y, stored under the "@user_dst_mem_p" suffix.
+  std::shared_ptr<dnnl::memory> AcquireDstMemory(T *y) {
+    return this->AcquireMemory(y, "@user_dst_mem_p");
+  }
+
+  // Returns the reorder primitive stored under key_ + "@reorder_p", creating
+  // and storing it on first use. Note the parameter order: destination
+  // memory first, source memory second.
+  std::shared_ptr<dnnl::reorder> AcquireReorder(
+      std::shared_ptr<dnnl::memory> dst_memory_p,
+      std::shared_ptr<dnnl::memory> src_memory_p) {
+    auto prim_key = this->key_ + "@reorder_p";
+    auto reorder_p = std::static_pointer_cast<dnnl::reorder>(
+        this->dev_ctx_.GetBlob(prim_key));
+    if (reorder_p == nullptr) {
+      // Here we pass post-ops to mimic y -> a*X + y
+      dnnl::primitive_attr reorder_attr;
+      dnnl::post_ops post_operations;
+      // The output scale carries alpha; it is skipped when alpha is exactly 1.
+      if (this->alpha_ != 1.f) {
+        std::vector<float> scales(1, this->alpha_);
+        reorder_attr.set_output_scales(0, scales);
+      }
+      // Sum post-op with scale 1: the reorder result is added to the existing
+      // destination contents instead of overwriting them.
+      post_operations.append_sum(1.0f);
+
+      reorder_attr.set_post_ops(post_operations);
+      reorder_p = std::make_shared<dnnl::reorder>(
+          *(src_memory_p), *(dst_memory_p), reorder_attr);
+      this->dev_ctx_.SetBlob(prim_key, reorder_p);
+    }
+    return reorder_p;
+  }
+
+ private:
+  float alpha_;  // AXPY coefficient, also part of the handler key
+  int n_;        // number of elements in x and y
+};
+
+template class AXPYMKLDNNHandler<float>;
+template class AXPYMKLDNNHandler<plat::bfloat16>;
+
+}  // anonymous namespace
+
+// Scalar-loop AXPY: for each of the first n elements, y[i] += alpha * x[i].
+// Does nothing when n <= 0. Both pointers are advanced locally; the caller's
+// pointers are not affected because they are passed by value.
+template <typename T>
+static void naive_axpy(int n, T alpha, const T *x, T *y) {
+  while (n-- > 0) {
+    *y += alpha * *x;
+    ++y;
+    ++x;
+  }
+}
+
+// Computes y = alpha * x + y over n elements, writing the result into y.
+//
+// Inputs with fewer than 100 elements are handled by naive_axpy. Larger
+// inputs go through AXPYMKLDNNHandler: x and y are wrapped in oneDNN memory
+// objects and a reorder primitive (scale + sum post-op) is executed on the
+// stream returned by MKLDNNDeviceContext::tls(); the call returns only after
+// the stream has been waited on. alpha is converted to float for the handler.
+template <typename T>
+void onednn_handler_axpy(int n, T alpha, const T *x, T *y) {
+  // fallback to naive version
+  // (the threshold of 100 is hard-coded; its rationale is not stated here)
+  if (n < 100) {
+    naive_axpy(n, alpha, x, y);
+    return;
+  }
+
+  auto &pool = plat::DeviceContextPool::Instance();
+  auto cpu_place = plat::CPUPlace();
+  // NOTE(review): the dynamic_cast result is dereferenced below without a
+  // null check; this assumes the CPU context in the pool is always an
+  // MKLDNNDeviceContext -- confirm.
+  auto *dev_ctx =
+      dynamic_cast<plat::MKLDNNDeviceContext *>(pool.Get(cpu_place));
+  auto &cpu_engine = dev_ctx->GetEngine();
+
+  AXPYMKLDNNHandler<T> handler(*dev_ctx, cpu_engine, cpu_place, n,
+                               static_cast<float>(alpha));
+
+  auto reorder_src_memory_p = handler.AcquireSrcMemory(x);
+  auto reorder_dst_memory_p = handler.AcquireDstMemory(y);
+  // AcquireReorder takes the destination first, then the source.
+  auto reorder_p =
+      handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
+
+  auto &astream = plat::MKLDNNDeviceContext::tls().get_stream();
+  // Profiler event covering the reorder execution and the wait below.
+  plat::RecordEvent record_reorder("axpy_int_reorder",
+                                   plat::EventRole::kUniqueOp);
+  reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+  astream.wait();
+}
+
+template void onednn_handler_axpy<float>(int, float, const float *, float *);
+template void onednn_handler_axpy<plat::bfloat16>(int, plat::bfloat16,
+                                                  const plat::bfloat16 *,
+                                                  plat::bfloat16 *);
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/mkldnn/axpy_handler.h
+++ b/paddle/fluid/operators/mkldnn/axpy_handler.h
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+namespace paddle {
+namespace operators {
+
+///
+/// @brief      Helper function to execute AXPY (y = alpha * x + y) using
+///             oneDNN. The result is accumulated into y in place.
+///
+/// Only the declaration is provided here. The definition in axpy_handler.cc
+/// is explicitly instantiated for float and platform::bfloat16; other types
+/// are not instantiated there. Inputs with n < 100 are processed by a plain
+/// loop instead of a oneDNN primitive.
+///
+/// @param[in]      n      The number of elements in tensor (assumed 1D)
+/// @param[in]      alpha  The alpha coefficient.
+/// @param[in]      x      The pointer to input X tensor.
+/// @param[in,out]  y      The pointer to output Y tensor; read and updated.
+///
+/// @tparam         T      Data type.
+///
+template <typename T>
+void onednn_handler_axpy(int n, T alpha, const T *x, T *y);
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ b/paddle/fluid/operators/optimizers/sgd_op.h
@@ -19,6 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/jit/kernels.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
+#endif
 #include "paddle/fluid/platform/bfloat16.h"

 namespace paddle {
@@ -139,9 +142,15 @@ struct sgd_dense_param_kernel<
              "Got [%s], but expected less than [%s]",
              grad_rows[i], grad_height));
      const int64_t row = grad_rows[i];
+#ifdef PADDLE_WITH_MKLDNN
+      operators::onednn_handler_axpy(grad_width, -lr[0],
+                                     grad_data + i * grad_width,
+                                     out_data + row * grad_width);
+#else
      for (int64_t j = 0; j < grad_width; ++j) {
        out_data[row * grad_width + j] -= lr[0] * grad_data[i * grad_width + j];
      }
+#endif
    }
  }
 };

--- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
@@ -158,6 +158,13 @@ class TestSparseGradSGDOpBF16Case2(TestSparseGradSGDOpBF16):
        self.grad_row_numel = 16


+class TestSparseGradSGDOpBF16Case3(TestSparseGradSGDOpBF16):  # "big rows" variant
+    def setup_params(self):
+        self.grad_height = 10
+        self.grad_rows = [0, 4, 7]
+        self.grad_row_numel = 120  # presumably chosen to exceed the n < 100 naive fallback; TODO confirm
+
+
 @unittest.skipIf(not core.supports_bfloat16(),
                 'place does not support BF16 evaluation')
 class TestSparseGradParamSGDOpBF16(TestSparseSGDOpBF16):
@@ -194,8 +201,6 @@ class TestSparseGradParamSGDOpBF16(TestSparseSGDOpBF16):
        self.check_output(output, reference, atol=5e-3, rtol=1e-1)


-@unittest.skipIf(not core.supports_bfloat16(),
-                 'place does not support BF16 evaluation')
 class TestSparseGradParamSGDOpBF16Case2(TestSparseGradParamSGDOpBF16):
    def setup_params(self):
        self.grad_height = 14

--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -211,6 +211,11 @@ class TestSelectedRowsSumBF16Op(TestSelectedRowsSumOp):
            self.check_with_place(core.CPUPlace(), inplace)


+class TestSelectedRowsSumBF16OpBigRow(TestSelectedRowsSumBF16Op):  # "big rows" variant
+    def init_kernel_type(self):  # NOTE(review): overridden only to set row_numel; confirm the parent hook sets nothing else needed
+        self.row_numel = 102  # presumably chosen to exceed the n < 100 naive fallback; TODO confirm
+
+
 class TestLoDTensorAndSelectedRowsOp(TestSelectedRowsSumOp):
    def setUp(self):
        self.height = 10