Unverified commit 968f7f24 authored by Sławomir Siwek, committed by GitHub

Fused elementwise_(mul/div) (#50428)

* extract Op and OPMaker to .h

* extend pattern for fused_op

* set "with_residual" default to false

* adjust fuse passes

* remove fc+eltwise flag

* fused_output_scale

* activation attrs

* remove extra attrs

* fix int8/bf16 unit tests

* simplify RecomputeOutputDims

* remove unused method

* Add description for attributes

* add extra check

* adjust op compats

* update quantize test

* fix protobuf parsing error

* fix int8 performance

* fused elementwises

* merge develop

* remove activation

* restore activation for existing add/sub ops
Parent 14abafa1
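
In short, each fused op evaluates the binary elementwise op together with the post-ops that the oneDNN fuse passes attach to it. A sketch of the computed function, inferred from the kernel added below (act denotes the optional fused activation; the optional fused_unsqueeze2_axes reshape of the output is omitted):

$$Out = \texttt{fused\_output\_scale} \cdot \mathrm{act}(X \odot Y), \qquad \odot \in \{\times, \div\}$$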
......
@@ -87,7 +87,12 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
       return;
     }
-    ConvertToFusedOp(op->Op());
+    // Remove this condition when all fused_elementwise ops are merged
+    if (!(op->Op()->Type() == "elementwise_add" ||
+          op->Op()->Type() == "elementwise_sub" ||
+          op->Op()->Type() == "elementwise_mul")) {
+      ConvertToFusedOp(op->Op());
+    }
     op->Op()->SetAttr("mkldnn_data_type", std::string("int8"));
   };
   gpd(graph, handler);
......
......
@@ -148,6 +148,8 @@ inline void ConvertToFusedOp(OpDesc* op) {
   const std::map<std::string, std::string> fused_ops = {
       {"conv2d", "fused_conv2d"},
       {"depthwise_conv2d", "fused_conv2d"},
+      {"elementwise_mul", "fused_elementwise_mul"},
+      {"elementwise_div", "fused_elementwise_div"},
       {"matmul", "fused_matmul"},
       {"matmul_v2", "fused_matmul"},
       {"softplus", "fused_softplus"},
......
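
For illustration, a minimal standalone sketch of the table-driven rename that `ConvertToFusedOp` performs; `ToFusedType` and `main` are hypothetical stand-ins, since the real helper rewrites the `OpDesc` type in place:

```cpp
#include <iostream>
#include <map>
#include <string>

// Hypothetical stand-in for ConvertToFusedOp: map an op type to its fused
// oneDNN counterpart, leaving unknown types untouched.
std::string ToFusedType(const std::string& type) {
  static const std::map<std::string, std::string> fused_ops = {
      {"elementwise_mul", "fused_elementwise_mul"},
      {"elementwise_div", "fused_elementwise_div"},
      {"matmul", "fused_matmul"}};
  const auto it = fused_ops.find(type);
  return it == fused_ops.end() ? type : it->second;
}

int main() {
  std::cout << ToFusedType("elementwise_mul") << '\n';  // fused_elementwise_mul
  std::cout << ToFusedType("relu") << '\n';             // relu (pass-through)
}
```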
......
@@ -31,6 +31,8 @@ void FuseOperatorScaleOneDNNPass::ApplyImpl(Graph *graph) const {
       "fused_matmul",
       "matmul",
       "matmul_v2",
+      "fused_elementwise_mul",
+      "fused_elementwise_div",
       "elementwise_add",
       "elementwise_sub",
       "elementwise_mul",
......
@@ -87,10 +89,7 @@ void FuseOperatorScaleOneDNNPass::FuseScale(Graph *graph,
     scale = *(scale_tensor->data<float>());
   }
-  if (op_type == "matmul" || op_type == "matmul_v2") {
-    ConvertToFusedOp(operator_op->Op());
-  }
+  ConvertToFusedOp(operator_op->Op());
   operator_op->Op()->SetAttr("fused_output_scale", scale);
   operator_op->Op()->SetOutput("Out", {scale_out->Name()});
......
@@ -120,6 +119,8 @@ REGISTER_PASS_CAPABILITY(operator_scale_onednn_fuse_pass)
             .EQ("fused_matmul", 0)
             .LE("matmul", 1)
             .EQ("matmul_v2", 0)
+            .EQ("fused_elementwise_mul", 0)
+            .EQ("fused_elementwise_div", 0)
             .LE("elementwise_add", 1)
             .LE("elementwise_sub", 1)
             .LE("elementwise_mul", 1)
......
......
@@ -27,7 +27,10 @@ using string::PrettyLogDetail;
 void FuseOperatorUnsqueeze2OneDNNPass::ApplyImpl(Graph *graph) const {
   std::vector<std::pair<std::string, int>> ops_and_outputs = {
-      {"fused_transpose", 2}, {"transpose2", 2}, {"elementwise_mul", 1}};
+      {"fused_transpose", 2},
+      {"transpose2", 2},
+      {"fused_elementwise_mul", 1},
+      {"elementwise_mul", 1}};
   for (const auto &op_and_outputs : ops_and_outputs)
     FuseUnsqueeze2(graph, op_and_outputs.first, op_and_outputs.second);
......
type: "fused_elementwise_div"
def {
inputs {
name: "X"
}
inputs {
name: "Y"
}
outputs {
name: "Out"
}
attrs {
name: "axis"
type: INT
}
}
extra {
attrs {
name: "fuse_activation"
type: STRING
}
attrs {
name: "fuse_alpha"
type: FLOAT
}
attrs {
name: "fuse_beta"
type: FLOAT
}
attrs {
name: "fused_output_scale"
type: FLOAT
}
attrs {
name: "fused_unsqueeze2_axes"
type: INTS
}
attrs {
name: "scale_x"
type: FLOAT
}
attrs {
name: "scale_y"
type: FLOAT
}
attrs {
name: "scale_out"
type: FLOAT
}
}
type: "fused_elementwise_mul"
def {
inputs {
name: "X"
}
inputs {
name: "Y"
}
outputs {
name: "Out"
}
attrs {
name: "axis"
type: INT
}
}
extra {
attrs {
name: "fuse_activation"
type: STRING
}
attrs {
name: "fuse_alpha"
type: FLOAT
}
attrs {
name: "fuse_beta"
type: FLOAT
}
attrs {
name: "fused_output_scale"
type: FLOAT
}
attrs {
name: "fused_unsqueeze2_axes"
type: INTS
}
attrs {
name: "scale_x"
type: FLOAT
}
attrs {
name: "scale_y"
type: FLOAT
}
attrs {
name: "scale_out"
type: FLOAT
}
}
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
class FusedElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() final {
AddInput("X", "The first input tensor of elementwise op.");
AddInput("Y", "The second input tensor of elementwise op.");
AddOutput("Out", "A location into which the result is stored.");
AddAttr<int>(
"axis",
"If X.dimension != Y.dimension, Y.dimension must be a "
"subsequence of X.dimension. And axis is the start dimension index "
"for broadcasting Y onto X.")
.SetDefault(-1);
AddAttr<std::string>(
"fuse_activation",
"Activation type from elementwise_act_onednn_fuse_pass")
.SetDefault("");
AddAttr<float>("fuse_alpha",
"Activation alpha from elementwise_act_onednn_fuse_pass")
.SetDefault(0.0f);
AddAttr<float>("fuse_beta",
"Activation beta from elementwise_act_onednn_fuse_pass")
.SetDefault(0.0f);
AddAttr<float>("fused_output_scale",
"Obtained from operator_scale_onednn_fuse_pass")
.SetDefault(1.0f);
AddAttr<std::vector<int>>(
"fused_unsqueeze2_axes",
"Obtained from operator_unsqueeze2_onednn_fuse_pass")
.SetDefault({});
AddAttr<float>("scale_x", "Elementwise X input quantization scale")
.SetDefault(1.0f);
AddAttr<float>("scale_y", "Elementwise Y input quantization scale")
.SetDefault(1.0f);
AddAttr<float>("scale_out", "Elementwise Out output quantization scale")
.SetDefault(1.0f);
AddComment(
R"DOC(Elementwise operator extended with oneDNN-specific fusion logic.)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
fused_elementwise_mul,
ops::ElementwiseOp,
ops::FusedElementwiseOpMaker,
ops::ElementwiseOpInferVarType,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(
fused_elementwise_div,
ops::ElementwiseOp,
ops::FusedElementwiseOpMaker,
ops::ElementwiseOpInferVarType,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
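
As a usage note, a hedged sketch of how a fuse pass might populate the attributes declared above on an existing node (illustrative values only; `op` is assumed to be a `framework::OpDesc*` already present in the graph):

```cpp
// Illustrative only: turn an elementwise_mul node into its fused variant
// and record the results of the activation and scale fuse passes.
op->SetType("fused_elementwise_mul");
op->SetAttr("fuse_activation", std::string("relu"));   // elementwise_act fuse
op->SetAttr("fused_output_scale", 0.5f);               // operator_scale fuse
op->SetAttr("fused_unsqueeze2_axes", std::vector<int>{2, 3});
```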
......
@@ -109,7 +109,6 @@ const std::unordered_map<std::string, ExtraAttrPropertySet>
       {"Scale_weights", ExtraAttrProperty::ONEDNN},
       {"x_data_format", ExtraAttrProperty::ONEDNN},
       {"y_data_format", ExtraAttrProperty::ONEDNN},
-      {"fused_unsqueeze2_axes", ExtraAttrProperty::ONEDNN},
       {"fused_reshape2_shape", ExtraAttrProperty::ONEDNN},
       // ONEDNN pass dedicated attributes
       {"Activation_scale", ExtraAttrProperty::ONEDNN},
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/onednn/onednn_reuse.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, dnnl::algorithm BINARY_OP>
void FusedElementwiseKernel(const OneDNNContext& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const int axis,
const std::string& fuse_activation,
const float fuse_alpha,
const float fuse_beta,
const float fused_output_scale,
const std::vector<int>& fused_unsqueeze2_axes,
const float scale_x,
const float scale_y,
const float scale_out,
DenseTensor* out) {
const auto& onednn_engine = dev_ctx.GetEngine();
dnnl::post_ops post_operations;
funcs::AppendActivation(
dev_ctx, post_operations, 1.0f, fuse_activation, fuse_alpha, fuse_beta);
if (fused_output_scale != 1.0) {
post_operations.append_eltwise(
1.0, dnnl::algorithm::eltwise_linear, fused_output_scale, 0.0f);
}
auto* non_const_x = &x;
auto* non_const_y = &y;
funcs::BinaryOneDNNHandler<T> handler(BINARY_OP,
axis,
onednn_engine,
dev_ctx.GetPlace(),
non_const_x,
non_const_y,
out,
scale_x,
scale_y,
scale_out,
true,
post_operations);
  // oneDNN's binary primitive is optimized for broadcasting y into x, so in
  // the opposite case we swap the tensors to achieve optimal performance
if (x.numel() < y.numel()) {
std::swap(non_const_x, non_const_y);
}
const auto src_x_memory = handler.AcquireSrcMemory(non_const_x);
const auto src_y_memory = handler.AcquireSecondSrcMemory(non_const_y);
  // For in-place execution, src and dst must be the same memory object, so x
  // would have to share its buffer with out. The UT mechanics test in-place
  // execution for this op without checking whether x must be broadcast to
  // match the shape of y. That is wrong: when x is broadcast, out takes the
  // shape of y, which is bigger than x, so if x is smaller than out and they
  // share a buffer (of x's size), that buffer is not big enough to hold the
  // result of the elementwise operation. Hence x's memory is reused only when
  // the element counts match.
const bool reuse_x_memory = non_const_x->numel() == out->numel() &&
non_const_x->IsSharedBufferWith(*out);
std::shared_ptr<dnnl::memory> dst_memory;
if (reuse_x_memory) {
dst_memory = src_x_memory;
    // NOTE(chenfeiyu): when the output reuses memory from another tensor
    // rather than allocating its own, its data type still has to be set.
    // Unfortunately, Paddle's operators infer only the output's shape, not
    // its data type. Alloc<T> normally takes care of both allocation and the
    // data type, but if the memory is already allocated and there is no need
    // to re-allocate, it just sets the data type. It is called here to get
    // the right data type.
dev_ctx.template Alloc<T>(out);
} else {
dst_memory = handler.AcquireDstMemory(out);
}
const auto binary_prim = handler.AcquireForwardPrimitive();
auto& astream = OneDNNContext::tls().get_stream();
const std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC_0, *src_x_memory},
{DNNL_ARG_SRC_1, *src_y_memory},
{DNNL_ARG_DST, *dst_memory}};
binary_prim->execute(astream, args);
astream.wait();
auto out_md = dst_memory->get_desc();
if (handler.use_broadcasting_hack) {
auto dims = out_md.dims();
dims.insert(dims.begin(), non_const_x->dims()[0]);
dims[1] /= dims[0];
out_md = out_md.reshape(dims);
}
if (fused_unsqueeze2_axes.empty()) {
out->set_mem_desc(out_md);
} else {
funcs::SetOutMemDescWithUnsqueeze2FuseSupport(
fused_unsqueeze2_axes, out, out_md);
}
}
#define DEFINE_ONEDNN_ELEMENTWISE_KERNEL(name, algorithm) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
const int axis, \
const std::string& fuse_activation, \
const float fuse_alpha, \
const float fuse_beta, \
const float fused_output_scale, \
const std::vector<int>& fused_unsqueeze2_axes, \
const float scale_x, \
const float scale_y, \
const float scale_out, \
DenseTensor* out) { \
FusedElementwiseKernel<T, algorithm>(dev_ctx, \
x, \
y, \
axis, \
fuse_activation, \
fuse_alpha, \
fuse_beta, \
fused_output_scale, \
fused_unsqueeze2_axes, \
scale_x, \
scale_y, \
scale_out, \
out); \
}
DEFINE_ONEDNN_ELEMENTWISE_KERNEL(FusedMultiply, dnnl::algorithm::binary_mul)
DEFINE_ONEDNN_ELEMENTWISE_KERNEL(FusedDivide, dnnl::algorithm::binary_div)
} // namespace phi
PD_REGISTER_KERNEL(fused_elementwise_mul,
OneDNN,
ONEDNN,
phi::FusedMultiplyKernel,
float,
phi::dtype::bfloat16,
int8_t,
uint8_t) {}
PD_REGISTER_KERNEL(fused_elementwise_div,
OneDNN,
ONEDNN,
phi::FusedDivideKernel,
float,
phi::dtype::bfloat16,
int8_t,
uint8_t) {}
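
For reference, a minimal standalone sketch of the post-op chain this kernel assembles, using the same oneDNN 2.x `append_eltwise` signature as the code above (ReLU stands in for whatever `fuse_activation` resolves to):

```cpp
#include "oneapi/dnnl/dnnl.hpp"

// Optional fused activation followed by a linear eltwise that multiplies
// the primitive's result by fused_output_scale (out = scale * out + 0).
dnnl::post_ops MakeBinaryPostOps(float fused_output_scale) {
  dnnl::post_ops ops;
  ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f);
  if (fused_output_scale != 1.0f) {
    ops.append_eltwise(
        1.0f, dnnl::algorithm::eltwise_linear, fused_output_scale, 0.0f);
  }
  return ops;
}
```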
......
@@ -122,18 +122,7 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx,
     out_md = out_md.reshape(dims);
   }
-  const auto fused_unsqueeze2_axes =
-      dev_ctx.HasDnnAttr("fused_unsqueeze2_axes")
-          ? PADDLE_GET_CONST(std::vector<int>,
-                             dev_ctx.GetDnnAttr("fused_unsqueeze2_axes"))
-          : std::vector<int>();
-  if (!fused_unsqueeze2_axes.empty()) {
-    funcs::SetOutMemDescWithUnsqueeze2FuseSupport(
-        fused_unsqueeze2_axes, out, out_md);
-  } else {
-    out->set_mem_desc(out_md);
-  }
+  out->set_mem_desc(out_md);
 }
#define DEFINE_ONEDNN_ELEMENTWISE_KERNEL(name, algorithm) \
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature FusedElementwiseMulOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("fused_elementwise_mul",
{"X", "Y"},
{"axis",
"fuse_activation",
"fuse_alpha",
"fuse_beta",
"fused_output_scale",
"fused_unsqueeze2_axes",
"scale_x",
"scale_y",
"scale_out"},
{"Out"});
}
KernelSignature FusedElementwiseDivOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("fused_elementwise_div",
{"X", "Y"},
{"axis",
"fuse_activation",
"fuse_alpha",
"fuse_beta",
"fused_output_scale",
"fused_unsqueeze2_axes",
"scale_x",
"scale_y",
"scale_out"},
{"Out"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(fused_elementwise_mul,
phi::FusedElementwiseMulOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(fused_elementwise_div,
phi::FusedElementwiseDivOpArgumentMapping);
......
@@ -143,7 +143,7 @@ class TestElementwiseMulUnsqueeze2OneDNNFusePass(PassAutoScanTest):
                 "operator_unsqueeze2_onednn_fuse_pass",
             ],
         )
-        yield config, ["elementwise_mul"], (1e-5, 1e-5)
+        yield config, ["fused_elementwise_mul"], (1e-5, 1e-5)
def test(self):
self.run_and_statis(
......