Added split op bf16/fp32 oneDNN kernel (#33584)

* base changes for split op * 90% of split functionality added * full fp32 functionality * added bf16 test * added submemory caching * added bf test to static mode whitelist * minor change * enabled split op for inference * minor fix * minor fix

Added split op bf16/fp32 oneDNN kernel (#33584)
* base changes for split op * 90% of split functionality added * full fp32 functionality * added bf16 test * added submemory caching * added bf test to static mode whitelist * minor change * enabled split op for inference * minor fix * minor fix
68106509 · jakpiase · GitHub · 5d2eb678 · 68106509 · 68106509
8 changed file
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -2266,7 +2266,7 @@ PDNode *patterns::Bfloat16Placement::operator()(
          {"concat", "conv2d", "conv2d_transpose", "elementwise_add",
           "elementwise_mul", "fc", "fusion_gru", "fusion_lstm", "gelu",
           "layer_norm", "matmul", "pool2d", "relu", "reshape2", "softmax",
-           "sum", "transpose2"});
+           "split", "sum", "transpose2"});
  if (!bfloat16_enabled_op_types.empty()) {
    supported_op_types = bfloat16_enabled_op_types;
  }

--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */

 #include <memory>
 #include "paddle/fluid/operators/concat_op.h"
+#include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"

@@ -156,6 +157,17 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
            "The axis is expected to be in range of [%d, %d), but got %d",
            -rank, rank, concat_axis));
    platform::MKLDNNDeviceContext::tls().log_lib_version();
+
+    if (ctx.HasInput("AxisTensor")) {
+      auto* axis_tensor = ctx.Input<Tensor>("AxisTensor");
+      concat_axis = GetDataFromTensor(axis_tensor)[0];
+      auto out_dims = multi_input[0]->dims();
+      for (size_t i = 1; i < multi_input.size(); ++i) {
+        out_dims[concat_axis] += multi_input[i]->dims()[concat_axis];
+      }
+      output->Resize(out_dims);
+    }
+
    if (concat_axis < 0) {
      concat_axis = concat_axis + rank;
    }

--- a/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/utils.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+
+static inline std::vector<std::vector<int64_t>> CalculateOutsDims(
+    const framework::DDim& in_dims, const size_t num,
+    const std::vector<int>& sections, const size_t axis,
+    const int outs_number) {
+  std::vector<std::vector<int64_t>> outs_dims(outs_number,
+                                              framework::vectorize(in_dims));
+
+  if (num > 0) {
+    PADDLE_ENFORCE_EQ(in_dims[axis] % num, 0,
+                      platform::errors::InvalidArgument(
+                          "The input's size along the split dimension "
+                          "must be evenly divisible by Attr(num_or_sections). "
+                          "But received Attr(num_or_sections) "
+                          "= %d, input(X)'s shape = [%s], Attr(dim) = %d.",
+                          num, in_dims, axis));
+
+    const size_t out_axis_dim = in_dims[axis] / num;
+
+    for (auto& out_dim : outs_dims) out_dim[axis] = out_axis_dim;
+  } else {
+    for (size_t i = 0; i < outs_dims.size(); ++i)
+      outs_dims[i][axis] = sections[i];
+  }
+  return outs_dims;
+}
+
+template <typename T>
+class SplitMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    this->RunKernel(ctx);
+  }
+
+  void RunKernel(const framework::ExecutionContext& ctx) const {
+    const auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& onednn_engine = dev_ctx.GetEngine();
+
+    const auto* x = ctx.Input<Tensor>("X");
+    auto outs = ctx.MultiOutput<Tensor>("Out");
+
+    int num = ctx.Attr<int>("num");
+    auto sections = ctx.Attr<std::vector<int>>("sections");
+    int axis = ctx.Attr<int>("axis");
+    auto outs_number = outs.size();
+    const auto x_dims = x->dims();
+
+    bool need_resize = false;
+    if (ctx.HasInput("AxisTensor")) {
+      auto* axis_tensor = ctx.Input<Tensor>("AxisTensor");
+      axis = GetDataFromTensor(axis_tensor)[0];
+      need_resize = true;
+    }
+
+    auto sections_tensor_list = ctx.MultiInput<Tensor>("SectionsTensorList");
+    if (sections_tensor_list.size() > 0) {
+      sections = GetDataFromTensorList(sections_tensor_list);
+      need_resize = true;
+    }
+
+    if (need_resize) {
+      const auto outs_dims =
+          CalculateOutsDims(x->dims(), num, sections, axis, outs_number);
+      for (size_t i = 0; i < outs.size(); ++i) {
+        outs[i]->Resize(framework::make_ddim(outs_dims[i]));
+      }
+    }
+
+    auto x_vec_dims = framework::vectorize(x_dims);
+
+    mkldnn::memory::data_type x_type = framework::ToMKLDNNDataType(x->type());
+    auto key = platform::CreateKey(dev_ctx, x_vec_dims, axis, num, sections,
+                                   x->format(), x_type);
+
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+
+    std::vector<int64_t> offset(x_vec_dims.size(), 0);
+
+    platform::ReorderMKLDNNHandler reorder_handler(
+        x_vec_dims, x->type(), x_type, dev_ctx, onednn_engine, key);
+    auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
+        x->format(), platform::to_void_cast(x->data<T>()));
+
+    for (size_t i = 0; i < outs_number; ++i) {
+      auto out_vec_dims = framework::vectorize(outs[i]->dims());
+      auto slice_mem_p = reorder_handler.AcquireSrcSubmemory(
+          out_vec_dims, offset, reorder_src_memory_p, i);
+
+      auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
+          outs[i], out_vec_dims, i, x->format(), ctx.GetPlace());
+      auto reorder_p =
+          reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p, i);
+
+      reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p);
+
+      offset[axis] += num > 0 ? x->dims()[axis] / num : sections[i];
+
+      outs[i]->set_layout(framework::DataLayout::kMKLDNN);
+      outs[i]->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
+    }
+    astream.wait();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_KERNEL(split, MKLDNN, paddle::platform::CPUPlace,
+                   ops::SplitMKLDNNKernel<float>,
+                   ops::SplitMKLDNNKernel<paddle::platform::bfloat16>);
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -73,8 +73,17 @@ class SplitOp : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
+    auto input_data_type =
+        framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
  }

  framework::OpKernelType GetKernelTypeForVar(
@@ -136,6 +145,14 @@ Example:
                 "(int, default 0) "
                 "The axis which the input will be split on.")
        .SetDefault(0);
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "bfloat16"});
  }
 };


--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -1023,6 +1023,27 @@ class ReorderMKLDNNHandler : public MKLDNNHandler {
    return this->AcquireMemory(dims_, dtype_, fmt, ptr, "@user_src_mem_p");
  }

+  std::shared_ptr<mkldnn::memory> AcquireSrcSubmemory(
+      const std::vector<int64_t>& dims, const std::vector<int64_t>& offset,
+      const std::shared_ptr<mkldnn::memory>& mem_p, int submemory_number) {
+    std::string local_key = key_;
+    local_key.append("@submem")
+        .append(std::to_string(submemory_number))
+        .append("_p");
+
+    auto sub_mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    if (sub_mem_p == nullptr) {
+      auto sub_md = mem_p->get_desc().submemory_desc(dims, {offset});
+      sub_mem_p = std::make_shared<mkldnn::memory>(sub_md, engine_,
+                                                   mem_p->get_data_handle());
+      dev_ctx_.SetBlob(local_key, sub_mem_p);
+    } else {
+      sub_mem_p->set_data_handle(mem_p->get_data_handle());
+    }
+    return sub_mem_p;
+  }
+
  std::shared_ptr<mkldnn::memory> AcquireDstMemory(
      framework::Tensor* output, const MKLDNNMemoryFormat& fmt,
      platform::Place place) {
@@ -1045,6 +1066,44 @@ class ReorderMKLDNNHandler : public MKLDNNHandler {
    return mem_p;
  }

+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(
+      framework::Tensor* output, const std::vector<int64_t>& dims,
+      const int memory_number, const MKLDNNMemoryFormat& fmt,
+      platform::Place place) {
+    auto local_key =
+        key_ + "@user_dst_mem" + std::to_string(memory_number) + "_p";
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    if (mem_p == nullptr) {
+      auto dst_md = platform::MKLDNNMemDesc(dims, dtype_dst_, fmt);
+      auto dst_data =
+          output->mutable_data(place, vtype_dst_, dst_md.get_size());
+
+      mem_p = std::make_shared<mkldnn::memory>(dst_md, engine_, dst_data);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      // Even if memory object exists , we may be using it for diffrent tensor
+      auto dst_data =
+          output->mutable_data(place, vtype_dst_, mem_p->get_desc().get_size());
+      mem_p->set_data_handle(dst_data);
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<mkldnn::reorder> AcquireReorder(
+      std::shared_ptr<mkldnn::memory> dst_memory_p,
+      std::shared_ptr<mkldnn::memory> src_memory_p, int reorder_number) {
+    auto prim_key = key_ + "@reorder" + std::to_string(reorder_number) + "_p";
+    auto reorder_p =
+        std::static_pointer_cast<mkldnn::reorder>(dev_ctx_.GetBlob(prim_key));
+    if (reorder_p == nullptr) {
+      reorder_p =
+          std::make_shared<mkldnn::reorder>(*(src_memory_p), *(dst_memory_p));
+      dev_ctx_.SetBlob(prim_key, reorder_p);
+    }
+    return reorder_p;
+  }
+
  std::shared_ptr<mkldnn::reorder> AcquireReorder(
      std::shared_ptr<mkldnn::memory> dst_memory_p,
      std::shared_ptr<mkldnn::memory> src_memory_p) {

--- a/python/paddle/fluid/tests/unittests/mkldnn/test_split_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_split_bf16_mkldnn_op.py
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard, core
+from paddle.fluid.tests.unittests.op_test import OpTest
+
+
+@unittest.skipIf(not core.supports_bfloat16(),
+                 "place does not support BF16 evaluation")
+@unittest.skipIf(core.is_compiled_with_cuda(),
+                 "core is compiled with CUDA which has no BF implementation")
+class TestSplitSectionsBF16OneDNNOp(OpTest):
+    def init_data(self):
+        self.x = np.random.random((4, 5, 6)).astype("uint16")
+        self.axis = 1
+        self.sections = [2, 1, 2]
+        indices_or_sections = [2, 3]  # sections
+        np_sections = [2, 3]
+        self.out = np.split(self.x, np_sections, self.axis)
+
+    def setUp(self):
+        self.op_type = "split"
+        self.axis_tensor = None
+        self.sections_tensor_list = None
+        self.num = 0
+        self.init_data()
+        self.inputs = {'X': self.x}
+        self.attrs = {
+            'use_mkldnn': True,
+            'num': self.num,
+            'mkldnn_data_type': "bfloat16"
+        }
+
+        if self.axis is not None:
+            self.attrs['axis'] = self.axis
+        if self.sections is not None:
+            self.attrs['sections'] = self.sections
+        if self.axis_tensor is not None:
+            self.inputs['AxisTensor'] = self.axis_tensor
+        if self.sections_tensor_list is not None:
+            self.inputs['SectionsTensorList'] = self.sections_tensor_list
+
+        self.outputs = {'Out': [('out%d' % i, self.out[i]) \
+            for i in range(len(self.out))]}
+
+    def test_check_output(self):
+        self.check_output_with_place(core.CPUPlace())
+
+
+# TODO jakpiase enable grad check(concat op)
+#    def test_check_grad(self):
+#        self.check_grad_with_place(
+#            core.CPUPlace(), ["X"],
+#            "Out",
+#            chck_dgrph=
+#            user_defined_grads=[self.inputs['X']],
+#            user_defined_grad_outputs=self.out[0])
+
+
+class TestSplitNumBF16OneDNNOp(TestSplitSectionsBF16OneDNNOp):
+    def init_data(self):
+        self.x = np.random.random((4, 8, 5, 3)).astype("uint16")
+        self.axis = 1
+        self.sections = []
+        self.num = 4
+        indices_or_sections = 4  #indices
+        self.out = np.split(self.x, indices_or_sections, self.axis)
+
+
+class TestSplitNumAxisTensorBF16OneDNNOp(TestSplitSectionsBF16OneDNNOp):
+    def init_data(self):
+        self.x = np.random.random((4, 5, 6)).astype("uint16")
+        self.axis = None
+        self.sections = []
+        self.num = 3
+        indices_or_sections = 3  #indices
+        self.axis_tensor = np.array([2]).astype("int32")
+        self.out = np.split(self.x, indices_or_sections, 2)
+
+
+class TestSplitSectionsTensorBF16OneDNNOp(TestSplitSectionsBF16OneDNNOp):
+    def init_data(self):
+        self.x = np.random.random((4, 5, 6)).astype("uint16")
+        self.axis = 1
+        self.sections = [2, 1, 2]
+        self.sections_tensor_list = []
+        for index, ele in enumerate(self.sections):
+            self.sections_tensor_list.append(("x" + str(index), np.ones(
+                (1)).astype('int32') * ele))
+        self.sections = [-1, -1, -1]
+        indices_or_sections = [2, 3]  #sections
+        self.out = np.split(self.x, indices_or_sections, self.axis)
+
+
+class TestSplitOpUnknownSectionBF16OneDNNOp(TestSplitSectionsBF16OneDNNOp):
+    def init_data(self):
+        self.x = np.random.random((4, 5, 6)).astype("uint16")
+        self.axis = 2
+        self.sections = [2, 2, -1]
+        indices_or_sections = [2, 4]  #sections
+        self.out = np.split(self.x, indices_or_sections, self.axis)
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_split_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_split_mkldnn_op.py
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard, core
+from paddle.fluid.tests.unittests.op_test import OpTest
+
+
+class TestSplitSectionsOneDNNOp(OpTest):
+    def init_data(self):
+        self.x = np.random.random((4, 5, 6)).astype("float32")
+        self.axis = 1
+        self.sections = [2, 1, 2]
+        indices_or_sections = [2, 3]  # sections
+        np_sections = [2, 3]
+        self.out = np.split(self.x, np_sections, self.axis)
+
+    def setUp(self):
+        self.op_type = "split"
+        self.axis_tensor = None
+        self.sections_tensor_list = None
+        self.num = 0
+        self.init_data()
+        self.inputs = {'X': self.x}
+        self.attrs = {'use_mkldnn': True, 'num': self.num}
+
+        if self.axis is not None:
+            self.attrs['axis'] = self.axis
+        if self.sections is not None:
+            self.attrs['sections'] = self.sections
+        if self.axis_tensor is not None:
+            self.inputs['AxisTensor'] = self.axis_tensor
+        if self.sections_tensor_list is not None:
+            self.inputs['SectionsTensorList'] = self.sections_tensor_list
+
+        self.outputs = {'Out': [('out%d' % i, self.out[i]) \
+            for i in range(len(self.out))]}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], ['out0', 'out1', 'out2'])
+
+
+# test with attr(num)
+class TestSplitNumOneDNNOp(TestSplitSectionsOneDNNOp):
+    def init_data(self):
+        self.x = np.random.random((4, 8, 5, 3)).astype("float32")
+        self.axis = 1
+        self.sections = []
+        self.num = 4
+        indices_or_sections = 4  #indices
+        self.out = np.split(self.x, indices_or_sections, self.axis)
+
+    def test_check_grad(self):
+        self.check_grad(['X'], ['out0', 'out1', 'out2', 'out3'])
+
+
+class TestSplitNumAxisTensorOneDNNOp(TestSplitSectionsOneDNNOp):
+    def init_data(self):
+        self.x = np.random.random((4, 5, 6)).astype("float32")
+        self.axis = None
+        self.sections = []
+        self.num = 3
+        indices_or_sections = 3  #indices
+        self.axis_tensor = np.array([2]).astype("int32")
+        self.out = np.split(self.x, indices_or_sections, 2)
+
+
+# attr(sections) is list containing Tensor
+class TestSplitSectionsTensorOneDNNOp(TestSplitSectionsOneDNNOp):
+    def init_data(self):
+        self.x = np.random.random((4, 5, 6)).astype("float32")
+        self.axis = 1
+        self.sections = [2, 1, 2]
+        self.sections_tensor_list = []
+        for index, ele in enumerate(self.sections):
+            self.sections_tensor_list.append(("x" + str(index), np.ones(
+                (1)).astype('int32') * ele))
+        self.sections = [-1, -1, -1]
+        indices_or_sections = [2, 3]  #sections
+        self.out = np.split(self.x, indices_or_sections, self.axis)
+
+
+class TestSplitOpUnknownSectionOneDNNOp(TestSplitSectionsOneDNNOp):
+    def init_data(self):
+        self.x = np.random.random((4, 5, 6)).astype("float32")
+        self.axis = 2
+        self.sections = [2, 2, -1]
+        indices_or_sections = [2, 4]  #sections
+        self.out = np.split(self.x, indices_or_sections, self.axis)
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -475,6 +475,8 @@ STATIC_MODE_TESTING_LIST = [
    'test_split_and_merge_lod_tensor_op',
    'test_split_ids_op',
    'test_split_op',
+    'test_split_mkldnn_op',
+    'test_split_bf16_mkldnn_op',
    'test_spp_op',
    'test_square_error_cost',
    'test_squared_l2_norm_op',