Unverified commit 519e7426, authored by jakpiase and committed by GitHub

Added optimization pass for oneDNN layernorm kernel (#47782)

* optimization for ln

* fix

* added output to gpd

* added formatting

* fix
Parent 626d7bcb
@@ -178,6 +178,7 @@ if(WITH_MKLDNN)
pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(matmul_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(matmul_activation_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(layer_norm_onednn_optimization_pass inference DIR mkldnn)
pass_library(operator_scale_onednn_fuse_pass inference DIR mkldnn)
pass_library(squeeze2_transpose2_onednn_fuse_pass inference DIR mkldnn)
pass_library(operator_unsqueeze2_onednn_fuse_pass inference DIR mkldnn)
......
@@ -940,6 +940,29 @@ PDNode *patterns::ConvBN::operator()(paddle::framework::ir::PDNode *conv_input,
return bn_out_var;
}
PDNode *patterns::LayerNormShiftScale::operator()() {
auto layer_norm_in = pattern->NewNode(layer_norm_in_repr())
->AsInput()
->assert_is_op_input("layer_norm", "X");
auto layer_norm_bias = pattern->NewNode(layer_norm_bias_repr())
->AsInput()
->assert_is_op_input("layer_norm", "Bias");
auto layer_norm_scale = pattern->NewNode(layer_norm_scale_repr())
->AsInput()
->assert_is_op_input("layer_norm", "Scale");
auto layer_norm_op =
pattern->NewNode(layer_norm_op_repr())->assert_is_op("layer_norm");
auto layer_norm_out = pattern->NewNode(layer_norm_out_repr())
->assert_is_op_output("layer_norm", "Y")
->AsOutput();
layer_norm_op->LinksFrom({layer_norm_in, layer_norm_bias, layer_norm_scale})
.LinksTo({layer_norm_out});
return layer_norm_out;
}
PDNode *patterns::OperatorActivation::operator()(
const std::string &operator_type, const std::string &activation_type) {
auto *preceding_op =
......
@@ -526,6 +526,19 @@ struct ConvBN : public PatternBase {
PATTERN_DECL_NODE(bn_saved_variance);
};
struct LayerNormShiftScale : public PatternBase {
LayerNormShiftScale(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "layer_norm_shift_scale") {}
PDNode* operator()();
PATTERN_DECL_NODE(layer_norm_in);
PATTERN_DECL_NODE(layer_norm_op);
PATTERN_DECL_NODE(layer_norm_bias);
PATTERN_DECL_NODE(layer_norm_scale);
PATTERN_DECL_NODE(layer_norm_out);
};
struct OperatorActivation : public PatternBase {
OperatorActivation(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "operator_activation") {}
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
namespace ir {
using string::PrettyLogDetail;
void LayerNormOneDNNOptimizationPass::ApplyImpl(Graph *graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
FusePassBase::Init("layer_norm_onednn_optimization_pass", graph);
GraphPatternDetector gpd;
patterns::LayerNormShiftScale layer_norm_shift_scale_pattern(
gpd.mutable_pattern(), "layer_norm_onednn_optimization_pass");
layer_norm_shift_scale_pattern();
int found_layer_norm = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
GET_IR_NODE_FROM_SUBGRAPH(
layer_norm_op, layer_norm_op, layer_norm_shift_scale_pattern);
GET_IR_NODE_FROM_SUBGRAPH(
layer_norm_bias, layer_norm_bias, layer_norm_shift_scale_pattern);
GET_IR_NODE_FROM_SUBGRAPH(
layer_norm_scale, layer_norm_scale, layer_norm_shift_scale_pattern);
if (layer_norm_op->Op()->HasAttr("use_mkldnn") &&
!(PADDLE_GET_CONST(bool, layer_norm_op->Op()->GetAttr("use_mkldnn")))) {
    VLOG(4) << "Only the oneDNN version of layer_norm can be optimized to "
               "include Scale and Shift in a single tensor.";
return;
}
auto *scope = param_scope();
auto ln_bias_name = layer_norm_op->Op()->Input("Bias");
auto ln_scale_name = layer_norm_op->Op()->Input("Scale");
auto *ln_bias_tensor =
scope->FindVar(ln_bias_name[0])->GetMutable<phi::DenseTensor>();
auto *ln_scale_tensor =
scope->FindVar(ln_scale_name[0])->GetMutable<phi::DenseTensor>();
const int channels = ln_bias_tensor->dims()[0];
VarDesc scale_shift_desc(patterns::PDNodeName(
"layer_norm_onednn_optimization_pass", "ScaleShift"));
scale_shift_desc.SetShape({channels * 2});
scale_shift_desc.SetDataType(
framework::TransToProtoVarType(ln_bias_tensor->dtype()));
scale_shift_desc.SetPersistable(true);
auto scale_shift_node = g->CreateVarNode(&scale_shift_desc);
auto *scale_shift_tensor =
scope->Var(scale_shift_node->Name())->GetMutable<phi::DenseTensor>();
scale_shift_tensor->Resize(phi::make_ddim({channels * 2}));
memcpy(scale_shift_tensor->mutable_data<float>(platform::CPUPlace()),
ln_scale_tensor->data<float>(),
channels * sizeof(float));
memcpy(scale_shift_tensor->data<float>() + channels,
ln_bias_tensor->data<float>(),
channels * sizeof(float));
layer_norm_op->Op()->SetInput("ScaleShift", {scale_shift_node->Name()});
IR_NODE_LINK_TO(scale_shift_node, layer_norm_op);
found_layer_norm++;
};
gpd(graph, handler);
AddStatis(found_layer_norm);
if ((!Has("disable_logs") || !Get<bool>("disable_logs")) &&
found_layer_norm > 0)
PrettyLogDetail("--- optimized %d layer_norms by merging Scale and Bias",
found_layer_norm);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(layer_norm_onednn_optimization_pass,
paddle::framework::ir::LayerNormOneDNNOptimizationPass);
REGISTER_PASS_CAPABILITY(layer_norm_onednn_optimization_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination().GE(
"layer_norm", 0));
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace ir {
class LayerNormOneDNNOptimizationPass : public FusePassBase {
public:
virtual ~LayerNormOneDNNOptimizationPass() {}
protected:
void ApplyImpl(Graph *graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
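For quick reference, here is a standalone sketch of the buffer layout the pass handler builds with its two memcpy calls: the first `channels` floats hold Scale and the next `channels` floats hold Bias, which is what the new ScaleShift input stores. The helper name `MergeScaleShift` is hypothetical and only illustrates the layout; it is not part of the patch.

```cpp
#include <cstring>
#include <vector>

// Illustrative only: pack scale followed by bias into one contiguous buffer of
// size 2 * channels, the layout written into the ScaleShift tensor by the pass.
std::vector<float> MergeScaleShift(const std::vector<float>& scale,
                                   const std::vector<float>& bias) {
  const std::size_t channels = scale.size();
  std::vector<float> scale_shift(2 * channels);
  std::memcpy(scale_shift.data(), scale.data(), channels * sizeof(float));
  std::memcpy(scale_shift.data() + channels,
              bias.data(),
              channels * sizeof(float));
  return scale_shift;
}
```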
@@ -346,6 +346,7 @@ void CpuPassStrategy::EnableMKLDNN() {
"softplus_activation_mkldnn_fuse_pass", //
"shuffle_channel_mkldnn_detect_pass", //
"elt_act_mkldnn_fuse_pass", //
"layer_norm_onednn_optimization_pass", //
"operator_scale_onednn_fuse_pass", //
"operator_unsqueeze2_onednn_fuse_pass", //
"operator_reshape2_onednn_fuse_pass", //
@@ -443,6 +444,7 @@ void CpuPassStrategy::EnableMkldnnInt8() {
passes_.push_back("scale_matmul_fuse_pass");
passes_.push_back("reshape_transpose_matmul_mkldnn_fuse_pass");
passes_.push_back("matmul_elementwise_add_mkldnn_fuse_pass");
passes_.push_back("layer_norm_onednn_optimization_pass");
passes_.push_back("operator_scale_onednn_fuse_pass");
passes_.push_back("operator_unsqueeze2_onednn_fuse_pass");
passes_.push_back("operator_reshape2_onednn_fuse_pass");
......
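Because the new pass is appended to both CpuPassStrategy::EnableMKLDNN() and EnableMkldnnInt8(), it should run automatically whenever oneDNN is enabled for CPU inference. A rough usage sketch follows, assuming the standard Paddle Inference C++ API (paddle_infer::Config, CreatePredictor) and a hypothetical model directory; treat it as an illustration rather than part of this patch.

```cpp
#include <memory>

#include "paddle_inference_api.h"  // assumed public Paddle Inference header

int main() {
  // Assumed usage: enabling oneDNN selects the CPU pass list shown above,
  // which now includes layer_norm_onednn_optimization_pass.
  paddle_infer::Config config("./model_dir");  // hypothetical model directory
  config.EnableMKLDNN();       // enable oneDNN kernels and oneDNN IR passes
  config.SwitchIrOptim(true);  // let the IR optimization passes run
  auto predictor = paddle_infer::CreatePredictor(config);
  return predictor != nullptr ? 0 : 1;
}
```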
@@ -41,19 +41,32 @@ class LayerNormOneDNNHandler
}
std::shared_ptr<dnnl::memory> AcquireScaleShiftMemory(
-      const phi::DenseTensor* scale, const phi::DenseTensor* shift) {
-    // OneDNN requires a single piece of memory for scale and shift data
-    const unsigned int C = phi::vectorize(scale->dims())[0];
-    auto scaleshift_memory =
-        this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc());
-    auto mem_data_handle =
-        reinterpret_cast<float*>(scaleshift_memory->get_data_handle());
-    std::copy(scale->data<float>(), scale->data<float>() + C, mem_data_handle);
-    std::copy(
-        shift->data<float>(), shift->data<float>() + C, mem_data_handle + C);
-    return scaleshift_memory;
+      const phi::DenseTensor* scale,
+      const phi::DenseTensor* shift,
+      const framework::ExecutionContext& ctx) {
+    // OneDNN requires a single piece of memory for scale and shift data. During
+    // inference both pieces of memory are merged inside
+    // layer_norm_onednn_optimization_pass, but during training we have to
+    // manually copy them into a new memory buffer.
+    auto* scaleshift = ctx.Input<phi::DenseTensor>("ScaleShift");
+    if (scaleshift) {
+      return this->AcquireMemoryFromPrimitive(
+          this->fwd_pd_->weights_desc(),
+          platform::to_void_cast(scaleshift->data<float>()));
+    } else {
+      const unsigned int C = phi::vectorize(scale->dims())[0];
+      auto scaleshift_memory =
+          this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc());
+      auto mem_data_handle =
+          reinterpret_cast<float*>(scaleshift_memory->get_data_handle());
+      std::copy(
+          scale->data<float>(), scale->data<float>() + C, mem_data_handle);
+      std::copy(
+          shift->data<float>(), shift->data<float>() + C, mem_data_handle + C);
+      return scaleshift_memory;
+    }
}
std::shared_ptr<dnnl::memory> AcquireMeanMemory(phi::DenseTensor* mean) {
@@ -77,9 +90,9 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
-    auto* out = ctx.Output<phi::DenseTensor>("Y");
auto* scale = ctx.Input<phi::DenseTensor>("Scale");
auto* bias = ctx.Input<phi::DenseTensor>("Bias");
+    auto* out = ctx.Output<phi::DenseTensor>("Y");
const float epsilon = ctx.Attr<float>("epsilon");
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
@@ -129,7 +142,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
if (with_scaleshift) {
std::shared_ptr<dnnl::memory> scaleshift_memory =
-            handler.AcquireScaleShiftMemory(scale, bias);
+            handler.AcquireScaleShiftMemory(scale, bias, ctx);
args.insert({DNNL_ARG_SCALE_SHIFT, *scaleshift_memory});
}
......
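A practical note on the kernel change above: when the ScaleShift input exists (the inference case, where the pass has already merged the tensors), AcquireScaleShiftMemory binds the pre-merged buffer directly and skips the per-run std::copy of Scale and Bias. The copy branch remains for training, presumably because Scale and Bias are updated every step and therefore cannot be merged once at graph-optimization time.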