[cherry-pick] [PHI] Migrate sgd and stack oneDNN kernels (#46374) (#46729)

* [PHI] Migrate sgd and stack oneDNN kernels (#46374) * Convert slice+grad oneDNN fluid kernels to PHI * Change mutable_data to Alloc * Refactor licences * update dependencies Co-authored-by: N Piotr Paturej <48731682+piotrekobi@users.noreply.github.com>

[cherry-pick] [PHI] Migrate sgd and stack oneDNN kernels (#46374) (#46729)
* [PHI] Migrate sgd and stack oneDNN kernels (#46374) * Convert slice+grad oneDNN fluid kernels to PHI * Change mutable_data to Alloc * Refactor licences * update dependencies Co-authored-by: N Piotr Paturej <48731682+piotrekobi@users.noreply.github.com>
25d61cd1 · Sławomir Siwek · GitHub · 51a91fee · 51a91fee · 25d61cd1
6 changed files
--- a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc
+++ b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <cstring>
-#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
-#include "paddle/fluid/operators/optimizers/sgd_op.h"
-namespace pplat = paddle::platform;
-namespace paddle {
-namespace operators {
-template <typename T>
-class SGDOneDNNKernel : public SGDOpKernel<phi::CPUContext, T> {
- protected:
-  void dense_param_and_grad_kernel(
-      const framework::ExecutionContext &ctx) const override {
-    VLOG(4) << "[ONEDNN]: sgd_dense_param_kernel<T, LodTensor>";
-    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
-    const auto *param = ctx.Input<framework::Tensor>("Param");
-    auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
-    const auto *grad = ctx.Input<framework::Tensor>("Grad");
-    auto *out_data = param_out->mutable_data<T>(ctx.GetPlace());
-    const T *param_data = param->data<T>();
-    const auto *grad_data = grad->data<T>();
-    const auto *lr = learning_rate->data<T>();
-    // Since denese SGD is not in place operation, first copy params to output
-    // tensor and then update it.
-    std::memcpy(out_data, param_data, param->memory_size());
-    OneDNNAXPYHandler<T>(param_out->numel(), -lr[0])(grad_data, out_data);
-  }
-  void dense_param_sparse_grad_kernel(
-      const framework::ExecutionContext &ctx) const override {
-    VLOG(4) << "[ONEDNN]: sgd_dense_param_kernel<T, SelectedRows>";
-    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
-    auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
-    const auto *grad = ctx.Input<phi::SelectedRows>("Grad");
-    const auto &grad_value = grad->value();
-    const auto &grad_rows = grad->rows();
-    const auto grad_height = grad->height();
-    const int64_t grad_val_height = static_cast<int64_t>(grad_rows.size());
-    const auto grad_width = grad_value.numel() / grad_val_height;
-    const auto *grad_data = grad_value.data<T>();
-    auto *out_data = param_out->data<T>();
-    const auto *lr = learning_rate->data<T>();
-    OneDNNAXPYHandler<T> axpy_handler(grad_width, -lr[0]);
-    for (size_t i = 0; i < grad_rows.size(); ++i) {
-      PADDLE_ENFORCE_LT(
-          grad_rows[i],
-          grad_height,
-          pplat::errors::OutOfRange(
-              "Grad rows index value should be less than grad height."
-              "Got [%s], but expected less than [%s]",
-              grad_rows[i],
-              grad_height));
-      const int64_t row = grad_rows[i];
-      const auto *src = grad_data + i * grad_width;
-      auto *dst = out_data + row * grad_width;
-      axpy_handler(src, dst);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(sgd,
-                   MKLDNN,
-                   pplat::CPUPlace,
-                   ops::SGDOneDNNKernel<float>,
-                   ops::SGDOneDNNKernel<pplat::bfloat16>);
--- a/paddle/phi/backends/CMakeLists.txt
+++ b/paddle/phi/backends/CMakeLists.txt
@@ -21,6 +21,7 @@ endif()
 if(WITH_MKLDNN)
  list(APPEND BACKENDS_SRCS onednn/onednn_context.cc)
+  list(APPEND BACKENDS_SRCS onednn/axpy_handler.cc)
  list(APPEND BACKENDS_DEPS mkldnn)
 endif()

--- a/paddle/phi/backends/onednn/axpy_handler.cc
+++ b/paddle/phi/backends/onednn/axpy_handler.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/backends/onednn/axpy_handler.h"
+#include <cinttypes>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/phi/backends/onednn/onednn_helper.h"
+namespace phi {
+namespace funcs {
+/// Wraps a oneDNN reorder primitive configured to compute
+/// y = alpha * x + y (AXPY) over a 1-D buffer of `n` elements of type T.
+///
+/// The primitive is created once in the constructor. Both memory objects are
+/// created without a data handle (DNNL_MEMORY_NONE) and are pointed at the
+/// caller's buffers through AcquireSrcMemory() / AcquireDstMemory().
+template <typename T>
+class AXPYHandler {
+ public:
+  /// @param[in] onednn_engine  Engine the memories and the reorder are bound to.
+  /// @param[in] n              Number of elements in the 1-D tensor. Taken as
+  ///                           int64_t so that the 64-bit count passed by
+  ///                           OneDNNAXPYHandler<T>::Impl is not truncated.
+  /// @param[in] alpha          Scale applied to the source values.
+  AXPYHandler(const dnnl::engine onednn_engine, int64_t n, float alpha) {
+    OneDNNContext::tls().log_lib_version();
+    auto md = dnnl::memory::desc(
+        {n}, OneDNNGetDataType<T>(), dnnl::memory::format_tag::x);
+    src_mem_ = dnnl::memory(md, onednn_engine, DNNL_MEMORY_NONE);
+    dst_mem_ = dnnl::memory(md, onednn_engine, DNNL_MEMORY_NONE);
+    dnnl::primitive_attr reorder_attr;
+    dnnl::post_ops post_operations;
+    // A scale of exactly 1 is skipped; otherwise a single scale (mask 0) is
+    // applied to the reordered source.
+    if (alpha != 1.f) {
+      std::vector<float> scales(1, alpha);
+      reorder_attr.set_output_scales(0, scales);
+    }
+    // The sum post-op adds the scaled source to the destination's current
+    // contents instead of overwriting them.
+    post_operations.append_sum(1.0f);
+    reorder_attr.set_post_ops(post_operations);
+    reorder_p_ = dnnl::reorder(src_mem_, dst_mem_, reorder_attr);
+  }
+  /// Points the source memory at `x` and returns it.
+  dnnl::memory &AcquireSrcMemory(const T *x) {
+    src_mem_.set_data_handle(to_void_cast<T>(x));
+    return src_mem_;
+  }
+  /// Points the destination memory at `y` and returns it.
+  dnnl::memory &AcquireDstMemory(T *y) {
+    dst_mem_.set_data_handle(y);
+    return dst_mem_;
+  }
+  /// Returns the reorder primitive built in the constructor.
+  const dnnl::reorder &AcquireReorder() { return reorder_p_; }
+ private:
+  dnnl::memory src_mem_;
+  dnnl::memory dst_mem_;
+  dnnl::reorder reorder_p_;
+};
+template class AXPYHandler<float>;
+template class AXPYHandler<phi::dtype::bfloat16>;
+/// Scalar AXPY fallback: y[i] += alpha * x[i] for every i in [0, n).
+/// A non-positive `n` leaves `y` untouched.
+template <typename T>
+static void naive_axpy(int n, T alpha, const T *x, T *y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] += alpha * x[i];
+  }
+}
+/// Private implementation behind OneDNNAXPYHandler<T>: stores the element
+/// count, the coefficient and the AXPYHandler created for them.
+template <typename T>
+class OneDNNAXPYHandler<T>::Impl {
+  // State is listed first; members of a `class` are private by default.
+  std::unique_ptr<AXPYHandler<T>> handler_;  // built in the constructor
+  int64_t n_;                                // number of elements per call
+  T alpha_;                                  // AXPY coefficient
+ public:
+  /// Stores `n` and `alpha` and creates the AXPYHandler for them.
+  Impl(int64_t n, T alpha, const dnnl::engine onednn_engine);
+  /// Runs AXPY, reading from `x` and updating `y` in place.
+  void operator()(const T *x, T *y);
+};
+/// Creates the AXPYHandler for `n` elements and remembers `n` and `alpha`.
+/// The coefficient is handed to the handler as a float.
+template <typename T>
+OneDNNAXPYHandler<T>::Impl::Impl(int64_t n,
+                                 T alpha,
+                                 const dnnl::engine onednn_engine)
+    : handler_{std::make_unique<AXPYHandler<T>>(
+          onednn_engine, n, static_cast<float>(alpha))},
+      n_{n},
+      alpha_{alpha} {}
+/// Computes y = alpha * x + y over the stored number of elements.
+/// Buffers shorter than 100 elements go through the scalar loop; longer ones
+/// are dispatched to the oneDNN reorder primitive and waited on.
+template <typename T>
+void OneDNNAXPYHandler<T>::Impl::operator()(const T *x, T *y) {
+  constexpr int64_t kNaiveLimit = 100;
+  if (n_ >= kNaiveLimit) {
+    dnnl::memory &src_mem = handler_->AcquireSrcMemory(x);
+    dnnl::memory &dst_mem = handler_->AcquireDstMemory(y);
+    auto reorder = handler_->AcquireReorder();
+    auto &stream = OneDNNContext::tls().get_stream();
+    reorder.execute(stream, src_mem, dst_mem);
+    // Block until the primitive has finished writing to `y`.
+    stream.wait();
+    return;
+  }
+  naive_axpy(n_, alpha_, x, y);
+}
+/// Builds the private implementation and logs the element type, `n` and
+/// `alpha` at verbosity level 4.
+template <typename T>
+OneDNNAXPYHandler<T>::OneDNNAXPYHandler(int64_t n,
+                                        T alpha,
+                                        const dnnl::engine onednn_engine)
+    : pimpl_(new Impl(n, alpha, onednn_engine),
+             [](Impl *owned) { delete owned; }) {
+  VLOG(4) << "[OneDNN] OneDNNAXPYHandler<" << typeid(T).name() << ">, "
+          << "n: " << n << ", alpha: " << alpha;
+}
+/// Forwards the call to the private implementation.
+template <typename T>
+void OneDNNAXPYHandler<T>::operator()(const T *x, T *y) {
+  (*pimpl_)(x, y);
+}
+// Explicit instantiations for the element types used with this handler.
+template class OneDNNAXPYHandler<float>;
+template class OneDNNAXPYHandler<dtype::bfloat16>;
+}  // namespace funcs
+}  // namespace phi
--- a/paddle/phi/backends/onednn/axpy_handler.h
+++ b/paddle/phi/backends/onednn/axpy_handler.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include "dnnl.hpp"  // NOLINT
+namespace phi {
+namespace funcs {
+///
+/// @brief      Helper class for AXPY (y = alpha * x + y) execution using the
+///             oneDNN library.
+///
+/// @tparam     T     Data type.
+///
+template <typename T>
+class OneDNNAXPYHandler {
+ public:
+  // The handler is neither copyable nor movable. The copy operations use the
+  // canonical const-reference signatures so they are recognised as the
+  // class's copy constructor / copy assignment for const sources as well.
+  OneDNNAXPYHandler(const OneDNNAXPYHandler&) = delete;
+  OneDNNAXPYHandler(OneDNNAXPYHandler&&) = delete;
+  OneDNNAXPYHandler& operator=(const OneDNNAXPYHandler&) = delete;
+  OneDNNAXPYHandler& operator=(OneDNNAXPYHandler&&) = delete;
+  ///
+  /// @brief      Constructor.
+  ///
+  /// @param[in]  n              The number of elements in tensor (assumed 1D
+  /// tensor)
+  /// @param[in]  alpha          The alpha coefficient.
+  /// @param[in]  onednn_engine  The oneDNN engine.
+  ///
+  OneDNNAXPYHandler(int64_t n, T alpha, dnnl::engine onednn_engine);
+  ///
+  /// @brief      Executes AXPY.
+  ///
+  /// @param[in]  x     The pointer to input X tensor data.
+  /// @param[out] y     The pointer to output Y tensor data; updated in place.
+  ///
+  void operator()(const T* x, T* y);
+ private:
+  OneDNNAXPYHandler() = delete;
+  // (arogowie-intel) Private implementation idiom: the implementation class
+  // is only declared here and defined in the source file.
+  class Impl;
+  // A custom deleter is needed because std::unique_ptr's default deleter
+  // cannot be instantiated for Impl while it is still an incomplete type.
+  std::unique_ptr<Impl, void (*)(Impl*)> pimpl_;
+};
+}  // namespace funcs
+}  // namespace phi
--- a/paddle/phi/kernels/onednn/sgd_kernel.cc
+++ b/paddle/phi/kernels/onednn/sgd_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/sgd_kernel.h"
+#include <cstring>
+#include "paddle/phi/backends/onednn/axpy_handler.h"
+#include "paddle/phi/backends/onednn/onednn_reuse.h"
+#include "paddle/phi/core/kernel_registry.h"
+namespace phi {
+/// Dense-parameter / dense-gradient SGD step on the oneDNN backend:
+///   param_out = param - learning_rate[0] * grad
+///
+/// @param[in]  dev_ctx           Device context; allocates the output and
+///                               supplies the oneDNN engine.
+/// @param[in]  param             Current parameter values.
+/// @param[in]  learning_rate     Tensor whose first element is the step size.
+/// @param[in]  grad              Gradient; `param_out->numel()` elements are
+///                               read from it.
+/// @param[in]  master_param      Not used by this kernel.
+/// @param[in]  multi_precision   Not used by this kernel.
+/// @param[out] param_out         Receives the updated parameters.
+/// @param[out] master_param_out  Not used by this kernel.
+template <typename T, typename Context>
+void SGDDenseKernel(const Context& dev_ctx,
+                    const DenseTensor& param,
+                    const DenseTensor& learning_rate,
+                    const DenseTensor& grad,
+                    const paddle::optional<DenseTensor>& master_param,
+                    bool multi_precision,
+                    DenseTensor* param_out,
+                    DenseTensor* master_param_out) {
+  auto* out_data = dev_ctx.template Alloc<T>(param_out);
+  const T* param_data = param.data<T>();
+  const auto* grad_data = grad.data<T>();
+  const auto* lr = learning_rate.data<T>();
+  // Seed the output with the current parameters, then apply the update on top
+  // of it. When the output shares the parameter buffer (in-place update) the
+  // copy is redundant, and std::memcpy on overlapping ranges is undefined
+  // behaviour, so it is skipped in that case.
+  if (out_data != param_data) {
+    std::memcpy(out_data, param_data, param.memory_size());
+  }
+  funcs::OneDNNAXPYHandler<T>(param_out->numel(), -lr[0], dev_ctx.GetEngine())(
+      grad_data, out_data);
+}
+/// Dense-parameter / sparse-gradient (SelectedRows) SGD step on the oneDNN
+/// backend. For every row index r stored in `grad`, the matching row of
+/// `param_out` is updated in place:
+///   param_out[r, :] -= learning_rate[0] * grad_value[i, :]
+///
+/// `param_out` is read and written through data<T>() without allocation or a
+/// copy from `param`, so it must already hold the current parameter values.
+///
+/// @param[in]     dev_ctx           Device context; supplies the oneDNN engine.
+/// @param[in]     param             Not read by this kernel.
+/// @param[in]     learning_rate     Tensor whose first element is the step size.
+/// @param[in]     grad              Sparse gradient: row indices plus values.
+/// @param[in]     master_param      Not used by this kernel.
+/// @param[in]     multi_precision   Not used by this kernel.
+/// @param[in,out] param_out         Parameters, updated row by row.
+/// @param[out]    master_param_out  Not used by this kernel.
+///
+/// Raises OutOfRange when a row index is not less than `grad.height()`.
+template <typename T, typename Context>
+void SGDDenseParamSparseGradKernel(
+    const Context& dev_ctx,
+    const DenseTensor& param,
+    const DenseTensor& learning_rate,
+    const SelectedRows& grad,
+    const paddle::optional<DenseTensor>& master_param,
+    bool multi_precision,
+    DenseTensor* param_out,
+    DenseTensor* master_param_out) {
+  const auto& grad_value = grad.value();
+  const auto& grad_rows = grad.rows();
+  // No rows means nothing to update; returning here also keeps the row-width
+  // computation below from dividing by zero.
+  if (grad_rows.size() == 0) {
+    return;
+  }
+  const auto grad_height = grad.height();
+  const int64_t grad_val_height = static_cast<int64_t>(grad_rows.size());
+  // Number of elements per gradient row.
+  const auto grad_width = grad_value.numel() / grad_val_height;
+  const auto* grad_data = grad_value.data<T>();
+  auto* out_data = param_out->data<T>();
+  const auto* lr = learning_rate.data<T>();
+  // One handler sized for a single row is reused for every row.
+  funcs::OneDNNAXPYHandler<T> axpy_handler(
+      grad_width, -lr[0], dev_ctx.GetEngine());
+  for (size_t i = 0; i < grad_rows.size(); ++i) {
+    PADDLE_ENFORCE_LT(
+        grad_rows[i],
+        grad_height,
+        errors::OutOfRange(
+            "Grad rows index value should be less than grad height. "
+            "Got [%s], but expected less than [%s]",
+            grad_rows[i],
+            grad_height));
+    const int64_t row = grad_rows[i];
+    const auto* src = grad_data + i * grad_width;
+    auto* dst = out_data + row * grad_width;
+    axpy_handler(src, dst);
+  }
+}
+}  // namespace phi
+// Registers phi::SGDDenseKernel as the `sgd` kernel for the OneDNN backend,
+// for all layouts, with float and bfloat16 element types.
+PD_REGISTER_KERNEL(
+    sgd, OneDNN, ALL_LAYOUT, phi::SGDDenseKernel, float, phi::dtype::bfloat16) {
+}
+// Registers phi::SGDDenseParamSparseGradKernel as the
+// `sgd_dense_param_sparse_grad` kernel with the same backend, layout and
+// element types.
+PD_REGISTER_KERNEL(sgd_dense_param_sparse_grad,
+                   OneDNN,
+                   ALL_LAYOUT,
+                   phi::SGDDenseParamSparseGradKernel,
+                   float,
+                   phi::dtype::bfloat16) {}
--- a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
-Licensed under the Apache License, Version 2.0 (the "License");
+// Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
+// you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+// You may obtain a copy of the License at
+//
-    http://www.apache.org/licenses/LICENSE-2.0
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
-Unless required by applicable law or agreed to in writing, software
+// Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
+// distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
+// See the License for the specific language governing permissions and
-limitations under the License. */
+// limitations under the License.
-#include "paddle/fluid/operators/utils.h"
+#include "paddle/phi/kernels/stack_kernel.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-namespace paddle {
+#include "paddle/phi/backends/onednn/onednn_reuse.h"
-namespace operators {
+#include "paddle/phi/core/kernel_registry.h"
-using dnnl::concat;
+namespace phi {
-using dnnl::memory;
-using dnnl::primitive;
+namespace funcs {
-using dnnl::stream;
-using framework::DataLayout;
-using framework::LoDTensor;
-using framework::Tensor;
-using platform::to_void_cast;
 template <typename T>
-class StackMKLDNNHandler
+class StackOneDNNHandler : public OneDNNHandlerNoCachingT<T, dnnl::concat> {
-    : public platform::MKLDNNHandlerNoCachingT<T, dnnl::concat> {
 public:
-  StackMKLDNNHandler(const framework::ExecutionContext& ctx,
+  StackOneDNNHandler(const Place& cpu_place,
-                     const dnnl::engine mkldnn_engine,
+                     int stack_axis,
-                     const std::vector<const Tensor*>& inputs,
+                     const dnnl::engine onednn_engine,
-                     Tensor* output)
+                     const std::vector<const DenseTensor*>& inputs,
-      : platform::MKLDNNHandlerNoCachingT<T, dnnl::concat>(mkldnn_engine,
+                     DenseTensor* output)
-                                                           ctx.GetPlace()) {
+      : OneDNNHandlerNoCachingT<T, dnnl::concat>(onednn_engine, cpu_place) {
-    int stack_axis = ctx.Attr<int>("axis");
    int ndims = inputs[0]->dims().size();
    if (stack_axis < 0) {
@@ -45,13 +36,12 @@ class StackMKLDNNHandler
    }
    // in stack op all inputs must have same dims
-    auto input_dims = phi::vectorize<int64_t>(inputs[0]->dims());
+    auto input_dims = vectorize<int64_t>(inputs[0]->dims());
-    memory::data_type dt = framework::ToMKLDNNDataType(
+    dnnl::memory::data_type dt = ToOneDNNDataType(inputs[0]->dtype());
-        framework::TransToProtoVarType(inputs[0]->dtype()));
    std::vector<memory::desc> srcs_md;
-    memory::desc dst_md;
+    dnnl::memory::desc dst_md;
-    MKLDNNMemoryFormat dst_fmt;
+    OneDNNMemoryFormat dst_fmt;
    srcs_md.reserve(inputs.size());
@@ -64,9 +54,9 @@ class StackMKLDNNHandler
      }
      input_dims[stack_axis] *= inputs.size();
-      dst_md = memory::desc(input_dims, dt, MKLDNNMemoryFormat::any);
+      dst_md = dnnl::memory::desc(input_dims, dt, OneDNNMemoryFormat::any);
    } else {
-      auto extended_input_dims = phi::vectorize<int64_t>(output->dims());
+      auto extended_input_dims = vectorize<int64_t>(output->dims());
      extended_input_dims[stack_axis] = 1;
      for (size_t i = 0; i < inputs.size(); ++i) {
@@ -76,8 +66,8 @@ class StackMKLDNNHandler
      // concat primitive choses suboptimal format tag because it cannot
      // distinguish between f.e. abcd and abdc if last dim is equal to 1 so
      // enforcing is needed for better performance
-      dst_fmt = platform::GetPlainMKLDNNFormat(extended_input_dims.size());
+      dst_fmt = GetPlainOneDNNFormat(extended_input_dims.size());
-      dst_md = memory::desc(phi::vectorize(output->dims()), dt, dst_fmt);
+      dst_md = dnnl::memory::desc(vectorize(output->dims()), dt, dst_fmt);
    }
    this->AcquireForwardPrimitiveDescriptor(dst_md, stack_axis, srcs_md);
@@ -93,35 +83,33 @@ class StackMKLDNNHandler
        dst_md, stack_axis, srcs_md, this->engine_));
  }
-  std::shared_ptr<dnnl::memory> AcquireSrcMemory(const Tensor& input, int i) {
+  std::shared_ptr<dnnl::memory> AcquireSrcMemory(const DenseTensor& input,
+                                                 int i) {
    const T* input_data = input.data<T>();
    return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src_desc(i),
                                            to_void_cast<T>(input_data));
  }
 };
+}  // namespace funcs
-template <typename T>
+template <typename T, typename Context>
-class StackMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+void StackKernel(const Context& dev_ctx,
- public:
+                 const std::vector<const DenseTensor*>& multi_input,
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+                 int axis,
-    auto& dev_ctx =
+                 DenseTensor* output) {
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
+  const auto& onednn_engine = dev_ctx.GetEngine();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-    auto multi_input = ctx.MultiInput<Tensor>("X");
-    Tensor* output = ctx.Output<Tensor>("Y");
-    StackMKLDNNHandler<T> handler(ctx, mkldnn_engine, multi_input, output);
+  funcs::StackOneDNNHandler<T> handler(
+      dev_ctx.GetPlace(), axis, onednn_engine, multi_input, output);
-    std::vector<std::shared_ptr<memory>> srcs;
+  std::vector<std::shared_ptr<dnnl::memory>> srcs;
  srcs.reserve(multi_input.size());
  auto dst_mem = handler.AcquireDstMemory(output);
  auto concat_p = handler.AcquireForwardPrimitive();
-    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+  auto& astream = OneDNNContext::tls().get_stream();
-    std::unordered_map<int, memory> args;
+  std::unordered_map<int, dnnl::memory> args;
  for (size_t i = 0; i < multi_input.size(); ++i) {
    srcs.push_back(handler.AcquireSrcMemory(*(multi_input[i]), i));
    args.insert({DNNL_ARG_MULTIPLE_SRC + i, *(srcs.at(i))});
@@ -131,16 +119,9 @@ class StackMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  concat_p->execute(astream, args);
  astream.wait();
-    output->set_mem_desc(
+  output->set_mem_desc(dst_mem->get_desc().reshape(vectorize(output->dims())));
-        dst_mem->get_desc().reshape(phi::vectorize(output->dims())));
+}
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
+}  // namespace phi
-REGISTER_OP_KERNEL(stack,
+PD_REGISTER_KERNEL(stack, OneDNN, ALL_LAYOUT, phi::StackKernel, float) {}
-                   MKLDNN,
-                   ::paddle::platform::CPUPlace,
-                   ops::StackMKLDNNOpKernel<float>);