From 41c4f7238f7ca6bf35cd6741762831e3e619a087 Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Fri, 30 Jul 2021 09:51:41 +0200 Subject: [PATCH] Added expand_v2 BF16/FP32 FWD/BWD kernels (#34284) * added expand_v2 bf16/fp32 kernel * minor change * CI fix * added missing test file * added formatting * reduced binary size * CI fix --- paddle/fluid/operators/expand_v2_op.cc | 36 +++- .../operators/mkldnn/expand_v2_mkldnn_op.cc | 161 ++++++++++++++++++ .../reduce_ops/mkldnn/reduce_mkldnn_op.h | 12 +- paddle/fluid/platform/mkldnn_reuse.h | 37 ++-- .../mkldnn/test_expand_v2_mkldnn_op.py | 107 ++++++++++++ 5 files changed, 316 insertions(+), 37 deletions(-) create mode 100644 paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index 618c1560c5e..3c2b939e799 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -89,9 +89,17 @@ class ExpandV2Op : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( @@ -130,6 +138,14 @@ class ExpandV2OpMaker : public framework::OpProtoAndCheckerMaker { "the corresponding value given by Attr(expand_times)."); AddAttr>("shape", "The expanded shape for each dimension.") .SetDefault({}); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "bfloat16"}); AddComment(R"DOC( Expand the input to the given shape. The rank of X should be in [1, 6] and size of 'shape' must be in [1, 6] also. @@ -200,9 +216,17 @@ class ExpandV2GradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context()); + auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc new file mode 100644 index 00000000000..ffd64a841ec --- /dev/null +++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc @@ -0,0 +1,161 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace {
+
+using paddle::framework::Tensor;
+using paddle::framework::vectorize;
+using paddle::framework::GradVarName;
+using paddle::framework::ExecutionContext;
+using paddle::platform::MKLDNNDeviceContext;
+
+template <typename T>
+class ExpandMKLDNNKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const ExecutionContext& ctx) const override {
+    this->RunKernel(ctx);
+  }
+
+  void RunKernel(const ExecutionContext& ctx) const {
+    const auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& onednn_engine = dev_ctx.GetEngine();
+
+    const auto* x = ctx.Input<Tensor>("X");
+    auto* out = ctx.Output<Tensor>("Out");
+
+    auto x_vec_dims = vectorize(x->dims());
+    auto out_vec_dims = vectorize(out->dims());
+
+    dnnl::memory::format_tag x_format_tag = x->format();
+    if (x_vec_dims.size() != out_vec_dims.size()) {
+      x_format_tag =
+          GetExtendedFormatTag(x_vec_dims, out_vec_dims.size(), x_format_tag);
+    }
+
+    out->set_format(x_format_tag);
+
+    paddle::platform::BroadcastDataMKLDNNHandler<T> handler(
+        dnnl::algorithm::binary_add, dev_ctx, onednn_engine, ctx.GetPlace(),
+        out, x, 0.0f, 1.0f, ctx.InputName("X"), x_vec_dims);
+
+    auto src_memory_p = handler.AcquireSrcMemory(x);
+    auto dst_memory_p = handler.AcquireDstMemory(out);
+    auto binary_p = handler.AcquireForwardPrimitive();
+
+    const std::unordered_map<int, dnnl::memory> args = {
+        {DNNL_ARG_SRC_0, *dst_memory_p},
+        {DNNL_ARG_SRC_1, *src_memory_p},
+        {DNNL_ARG_DST, *dst_memory_p}};
+
+    auto& astream = MKLDNNDeviceContext::tls().get_stream();
+    binary_p->execute(astream, args);
+    astream.wait();
+
+    out->set_layout(paddle::framework::DataLayout::kMKLDNN);
+    out->set_format(paddle::platform::GetMKLDNNFormat(*dst_memory_p));
+  }
+
+ private:
+  dnnl::memory::format_tag GetExtendedFormatTag(
+      std::vector<int64_t>& dims, int new_size,
+      mkldnn::memory::format_tag format_tag) const {
+    mkldnn::memory::desc md(dims, paddle::platform::MKLDNNGetDataType<T>(),
+                            format_tag);
+    std::vector<int64_t> new_dims(new_size, 1);
+    std::copy(dims.begin(), dims.end(),
+              new_dims.begin() + new_size - dims.size());
+
+    dims = std::move(new_dims);
+    return paddle::platform::GetMKLDNNFormat(md.reshape(dims));
+  }
+};
+
+template <typename T>
+class ExpandGradMKLDNNKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const ExecutionContext& ctx) const override {
+    this->RunKernel(ctx);
+  }
+
+  void RunKernel(const ExecutionContext& ctx) const {
+    const auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& onednn_engine = dev_ctx.GetEngine();
+
+    auto* dout = ctx.Input<Tensor>(GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(GradVarName("X"));
+
+    auto dx_vec_dims = vectorize(dx->dims());
+    auto dout_vec_dims = vectorize(dout->dims());
+
+    if (dx_vec_dims.size() != dout_vec_dims.size()) {
+      dx_vec_dims.insert(dx_vec_dims.begin(),
+                         dout_vec_dims.size() - dx_vec_dims.size(), 1);
+    }
+
+    auto& astream = MKLDNNDeviceContext::tls().get_stream();
+    if (dout_vec_dims == dx_vec_dims) {
+      mkldnn::memory::data_type dout_type =
+          paddle::framework::ToMKLDNNDataType(dout->type());
+      std::string key = paddle::platform::CreateKey(
+          dev_ctx, dout_vec_dims, dout->format(), dout->format(), dout_type);
+      paddle::platform::ReorderMKLDNNHandler reorder_handler(
+          dout_vec_dims, dout->type(), dout_type, dev_ctx, onednn_engine, key);
+
+      auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
+          dout->format(), paddle::platform::to_void_cast(dout->data<T>()));
+
+      auto reorder_dst_memory_p =
+          reorder_handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace());
+
+      auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p,
+                                                      reorder_dst_memory_p);
+
+      reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+      astream.wait();
+
+      dx->set_layout(paddle::framework::DataLayout::kMKLDNN);
+      dx->set_format(
+          paddle::platform::GetMKLDNNFormat(reorder_dst_memory_p->get_desc()));
+    } else {
+      paddle::platform::ReductionMKLDNNHandler<T> handler(
+          dnnl::algorithm::reduction_sum, 0.0f, 0.0f, dev_ctx, onednn_engine,
+          ctx.GetPlace(), dout, dx, ctx.InputName("X"), dx_vec_dims);
+
+      auto src_memory_p = handler.AcquireSrcMemory(dout);
+      auto dst_memory_p = handler.AcquireDstMemory(dx);
+
+      std::unordered_map<int, dnnl::memory> reduction_args = {
+          {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}};
+
+      auto reduction_p = handler.AcquireForwardPrimitive();
+
+      reduction_p->execute(astream, reduction_args);
+      astream.wait();
+      dx->set_layout(paddle::framework::DataLayout::kMKLDNN);
+      dx->set_format(paddle::platform::GetMKLDNNFormat(
+          dst_memory_p->get_desc().reshape(vectorize(dx->dims()))));
+    }
+  }
+};
+}  // anonymous namespace
+
+REGISTER_OP_KERNEL(expand_v2, MKLDNN, paddle::platform::CPUPlace,
+                   ExpandMKLDNNKernel<float>,
+                   ExpandMKLDNNKernel<paddle::platform::bfloat16>);
+
+REGISTER_OP_KERNEL(expand_v2_grad, MKLDNN, paddle::platform::CPUPlace,
+                   ExpandGradMKLDNNKernel<float>,
+                   ExpandGradMKLDNNKernel<paddle::platform::bfloat16>);
diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h
index 40cd3ba974f..6a9aae046f3 100644
--- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h
+++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h
@@ -165,23 +165,21 @@ class ReduceGradMKLDNNKernel : public framework::OpKernel<T> {
       x_format_tag = getPlainFormatTag(output_dx);
     }
 
-    output_dx->mutable_data<T>(ctx.GetPlace());
     output_dx->set_format(x_format_tag);
-    output_dx->set_layout(input_dy->layout());
 
     platform::BroadcastDataMKLDNNHandler<T> handler(
         binary_type, dev_ctx, onednn_engine, ctx.GetPlace(), output_dx,
         input_dy, scale_x, scale_y,
         ctx.InputName(framework::GradVarName("Out")), input_dims);
 
-    const auto src_dx_memory = handler.AcquireSrcMemory(output_dx);
-    const auto src_dy_memory = handler.AcquireSecondSrcMemory(input_dy);
+    const auto src_memory_p = handler.AcquireSrcMemory(input_dy);
+    const auto dst_memory_p = handler.AcquireDstMemory(output_dx);
 
     const auto binary_prim = handler.AcquireForwardPrimitive();
 
     const std::unordered_map<int, dnnl::memory> args = {
-        {DNNL_ARG_SRC_0, *src_dx_memory},
-        {DNNL_ARG_SRC_1, *src_dy_memory},
-        {DNNL_ARG_DST, *src_dx_memory}};
+        {DNNL_ARG_SRC_0, *dst_memory_p},
+        {DNNL_ARG_SRC_1, *src_memory_p},
+        {DNNL_ARG_DST, *dst_memory_p}};
 
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     binary_prim->execute(astream, args);
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 58622fb2529..f63d45d7ff6 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -695,8 +695,8 @@ class
BroadcastDataMKLDNNHandler BroadcastDataMKLDNNHandler(const dnnl::algorithm algo, const MKLDNNDeviceContext& dev_ctx, const mkldnn::engine engine, - platform::Place cpu_place, const Tensor* x, - const Tensor* y, float scale_x, float scale_y, + platform::Place cpu_place, const Tensor* out, + const Tensor* x, float scale_x, float scale_y, const std::string& uniq_name, const std::vector& input_dims) : platform::MKLDNNHandlerT( @@ -711,19 +711,12 @@ class BroadcastDataMKLDNNHandler x->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument("Wrong format set for X tensor.")); - PADDLE_ENFORCE_EQ( - y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for Y tensor.")); - PADDLE_ENFORCE_NE( - y->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for Y tensor.")); - - const auto src0_tz = framework::vectorize(x->dims()); + const auto src0_tz = framework::vectorize(out->dims()); const auto src0_md = dnnl::memory::desc( - src0_tz, platform::MKLDNNGetDataType(), x->format()); + src0_tz, platform::MKLDNNGetDataType(), out->format()); const auto src1_md = dnnl::memory::desc( - input_dims, platform::MKLDNNGetDataType(), x->format()); + input_dims, platform::MKLDNNGetDataType(), out->format()); dnnl::primitive_attr attributes; attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); @@ -734,18 +727,14 @@ class BroadcastDataMKLDNNHandler } } - std::shared_ptr AcquireSrcMemory(framework::Tensor* input) { - T* input_data = input->data(); - memset(input_data, 0, this->fwd_pd_->src_desc().get_size()); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->src_desc(), to_void_cast(input_data), "@src0_mem_p"); - } - - std::shared_ptr AcquireSecondSrcMemory( - const framework::Tensor* input) { - const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->src1_desc(), to_void_cast(input_data), "@src1_mem_p"); + template + std::shared_ptr AcquireDstMemory(framework::Tensor* output) { + T_out* ptr = output->mutable_data( + this->place_, this->fwd_pd_->dst_desc().get_size()); + ; + memset(ptr, 0, this->fwd_pd_->dst_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr, + "@dst_mem_p"); } }; diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py new file mode 100644 index 00000000000..51d7fe97167 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py @@ -0,0 +1,107 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard, core +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16 + + +@OpTestTool.skip_if(core.is_compiled_with_cuda(), + "CUDA required dygraph so oneDNN UT must be skipped") +class TestExpandV2OneDNNOp(OpTest): + def setUp(self): + self.op_type = "expand_v2" + self.init_data() + self.x = np.random.random(self.ori_shape).astype("float32") + self.set_inputs() + self.attrs = {'shape': self.shape, 'use_mkldnn': True} + output = np.tile(self.x, self.expand_times) + self.outputs = {'Out': output} + + def set_inputs(self): + self.inputs = {'X': self.x} + + def init_data(self): + self.ori_shape = [1, 140] + self.shape = [12, 140] + self.expand_times = [12, 1] + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + self.check_grad_with_place(core.CPUPlace(), ["X"], "Out") + + +class TestExpandV2ExpandDimOneDNNOp(TestExpandV2OneDNNOp): + def init_data(self): + self.ori_shape = [120] + self.shape = [2, 120] + self.expand_times = [2, 1] + + +class TestExpandV2CopyScenarioOneDNNOp(TestExpandV2OneDNNOp): + def init_data(self): + self.ori_shape = (2, 10, 5) + self.shape = (2, 10, 5) + self.expand_times = (1, 1, 1) + + +class TestExpandV2CopyScenarioShapeNotGivenOneDNNOp(TestExpandV2OneDNNOp): + def init_data(self): + self.ori_shape = (2, 4, 5, 7) + self.shape = (-1, -1, -1, -1) + self.expand_times = (1, 1, 1, 1) + + +# BF16 TESTS +def create_expand_v2_bf16_test_class(parent): + @OpTestTool.skip_if_not_cpu_bf16() + class TestExpandV2BF16OneDNNOp(parent): + def set_inputs(self): + self.inputs = {"X": convert_float_to_uint16(self.x)} + + def calculate_grads(self): + self.dout = self.outputs['Out'] + self.dx = self.dout.copy() + + for i in range(len(self.shape)): + if self.expand_times[i] != 1: + self.dx = np.sum(self.dx, axis=i, keepdims=True) + + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + user_defined_grads=[convert_float_to_uint16(self.dx)], + user_defined_grad_outputs=[self.dout]) + + cls_name = "{0}_{1}".format(parent.__name__, "Expand_v2_BF16") + TestExpandV2BF16OneDNNOp.__name__ = cls_name + globals()[cls_name] = TestExpandV2BF16OneDNNOp + + +create_expand_v2_bf16_test_class(TestExpandV2OneDNNOp) +create_expand_v2_bf16_test_class(TestExpandV2ExpandDimOneDNNOp) +create_expand_v2_bf16_test_class(TestExpandV2CopyScenarioOneDNNOp) +create_expand_v2_bf16_test_class(TestExpandV2CopyScenarioShapeNotGivenOneDNNOp) + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() -- GitLab
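Below is an illustrative usage sketch, not part of the patch, showing how the expand_v2 oneDNN kernels added above would be exercised from Python on CPU. It assumes a Paddle build compiled with oneDNN (MKLDNN) support; the use of FLAGS_use_mkldnn, the shapes, and the variable names are illustrative and mirror the unit test in this change.

# Illustrative only: run expand_v2 on CPU so that, in an oneDNN-enabled build
# with FLAGS_use_mkldnn set, the oneDNN kernel added by this patch is selected.
import numpy as np
import paddle

paddle.enable_static()
paddle.set_flags({'FLAGS_use_mkldnn': True})  # assumes an oneDNN (MKLDNN) build

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[1, 140], dtype='float32')
    out = paddle.expand(x, shape=[12, 140])  # lowers to the expand_v2 op

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup_prog)
res, = exe.run(main_prog,
               feed={'x': np.random.random([1, 140]).astype('float32')},
               fetch_list=[out])
print(res.shape)  # (12, 140)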