diff --git a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e16c41829b1a657082325024924ca1a134988c67
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc
@@ -0,0 +1,175 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+
+// oneDNN kernel for the slice op: reorders the region of "Input" selected by
+// the "axes"/"starts"/"ends" attributes into "Out".
+template <typename T>
+class SliceMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    this->RunKernel(ctx);
+  }
+
+  void RunKernel(const framework::ExecutionContext& ctx) const {
+    const auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& onednn_engine = dev_ctx.GetEngine();
+
+    auto* x = ctx.Input<Tensor>("Input");
+    auto* out = ctx.Output<Tensor>("Out");
+
+    auto x_vec_dims = framework::vectorize(x->dims());
+    auto out_vec_dims = framework::vectorize(out->dims());
+
+    // Look each attribute up once, then widen the values to int64_t.
+    const auto& axes_int = ctx.Attr<std::vector<int>>("axes");
+    const auto& starts_int = ctx.Attr<std::vector<int>>("starts");
+    const auto& ends_int = ctx.Attr<std::vector<int>>("ends");
+
+    std::vector<int64_t> axes(axes_int.begin(), axes_int.end());
+    std::vector<int64_t> starts(starts_int.begin(), starts_int.end());
+    std::vector<int64_t> ends(ends_int.begin(), ends_int.end());
+
+    std::vector<int64_t> offsets(x_vec_dims.size(), 0);
+    std::vector<int64_t> slice_dims(x_vec_dims);
+
+    // Negative starts/ends are relative to the end of the axis; non-negative
+    // ends are clamped to the axis length.
+    for (size_t i = 0; i < axes.size(); ++i) {
+      starts[i] = starts[i] < 0 ? x_vec_dims[axes[i]] + starts[i] : starts[i];
+      ends[i] = ends[i] < 0 ? x_vec_dims[axes[i]] + ends[i]
+                            : std::min(ends[i], x_vec_dims[axes[i]]);
+      offsets[axes[i]] = starts[i];
+      slice_dims[axes[i]] = ends[i] - starts[i];
+    }
+
+    mkldnn::memory::data_type x_type = framework::ToMKLDNNDataType(x->type());
+    auto key = platform::CreateKey(dev_ctx, x_vec_dims, axes, starts, ends,
+                                   x->format(), x_type);
+
+    platform::ReorderMKLDNNHandler reorder_handler(
+        x_vec_dims, x->type(), x_type, dev_ctx, onednn_engine, key);
+
+    auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
+        x->format(), platform::to_void_cast(x->data<T>()));
+    auto slice_mem_p = reorder_handler.AcquireSubmemory(slice_dims, offsets,
+                                                        reorder_src_memory_p);
+    auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
+        out, slice_dims, 0, x->format(), ctx.GetPlace());
+
+    auto reorder_p =
+        reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+    reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p);
+    astream.wait();
+
+    out->set_layout(framework::DataLayout::kMKLDNN);
+    out->set_format(platform::GetMKLDNNFormat(
+        reorder_dst_memory_p->get_desc().reshape(out_vec_dims)));
+  }
+};
+
+// oneDNN kernel for slice_grad: zero-fills the gradient of "Input" and
+// reorders the gradient of "Out" into the region selected by the attributes.
+template <typename T>
+class SliceGradMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    this->RunKernel(ctx);
+  }
+
+  void RunKernel(const framework::ExecutionContext& ctx) const {
+    const auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& onednn_engine = dev_ctx.GetEngine();
+
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("Input"));
+
+    auto dx_vec_dims = framework::vectorize(dx->dims());
+    auto dout_vec_dims = framework::vectorize(dout->dims());
+
+    // Look each attribute up once, then widen the values to int64_t.
+    const auto& axes_int = ctx.Attr<std::vector<int>>("axes");
+    const auto& starts_int = ctx.Attr<std::vector<int>>("starts");
+    const auto& ends_int = ctx.Attr<std::vector<int>>("ends");
+
+    std::vector<int64_t> axes(axes_int.begin(), axes_int.end());
+    std::vector<int64_t> starts(starts_int.begin(), starts_int.end());
+    std::vector<int64_t> ends(ends_int.begin(), ends_int.end());
+
+    std::vector<int64_t> offsets(dx_vec_dims.size(), 0);
+    std::vector<int64_t> slice_dims(dx_vec_dims);
+
+    // Negative starts/ends are relative to the end of the axis; non-negative
+    // ends are clamped to the axis length.
+    for (size_t i = 0; i < axes.size(); ++i) {
+      starts[i] = starts[i] < 0 ? dx_vec_dims[axes[i]] + starts[i] : starts[i];
+      ends[i] = ends[i] < 0 ? dx_vec_dims[axes[i]] + ends[i]
+                            : std::min(ends[i], dx_vec_dims[axes[i]]);
+      offsets[axes[i]] = starts[i];
+      slice_dims[axes[i]] = ends[i] - starts[i];
+    }
+
+    mkldnn::memory::data_type dout_type =
+        framework::ToMKLDNNDataType(dout->type());
+    mkldnn::memory::desc md(dout_vec_dims, platform::MKLDNNGetDataType<T>(),
+                            dout->format());
+    mkldnn::memory::format_tag reorder_format_tag =
+        platform::GetMKLDNNFormat(md.reshape(slice_dims));
+
+    auto key = platform::CreateKey(dev_ctx, dout_vec_dims, axes, starts, ends,
+                                   reorder_format_tag, dout_type);
+
+    platform::ReorderMKLDNNHandler reorder_handler(
+        slice_dims, dout->type(), dout_type, dev_ctx, onednn_engine, key);
+
+    auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
+        reorder_format_tag, platform::to_void_cast(dout->data<T>()));
+    auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
+        dx, dx_vec_dims, 0, reorder_format_tag, ctx.GetPlace());
+    // Zero the whole buffer first; the reorder below writes only the slice.
+    memset(dx->data<T>(), 0, reorder_dst_memory_p->get_desc().get_size());
+
+    auto slice_mem_p = reorder_handler.AcquireSubmemory(slice_dims, offsets,
+                                                        reorder_dst_memory_p);
+
+    auto reorder_p =
+        reorder_handler.AcquireReorder(slice_mem_p, reorder_src_memory_p);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+    reorder_p->execute(astream, *reorder_src_memory_p, *slice_mem_p);
+    astream.wait();
+
+    dx->set_layout(framework::DataLayout::kMKLDNN);
+    dx->set_format(reorder_format_tag);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_KERNEL(slice, MKLDNN, paddle::platform::CPUPlace,
+                   ops::SliceMKLDNNKernel<float>,
+                   ops::SliceMKLDNNKernel<paddle::platform::bfloat16>);
+
+REGISTER_OP_KERNEL(slice_grad, MKLDNN, paddle::platform::CPUPlace,
+                   ops::SliceGradMKLDNNKernel<float>,
+                   ops::SliceGradMKLDNNKernel<paddle::platform::bfloat16>);
diff --git a/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc
index afbe330305b7e10123a07e9b1418fe33064f76e8..8a58d9f26f87bcc1e22356239346f725bfac6083 100644
--- a/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc
@@ -105,7 +105,7 @@ class SplitMKLDNNKernel : public framework::OpKernel<T> {
     for (size_t i = 0; i < outs_number; ++i) {
       auto out_vec_dims = framework::vectorize(outs[i]->dims());
 
-      auto slice_mem_p = reorder_handler.AcquireSrcSubmemory(
+      auto slice_mem_p = reorder_handler.AcquireSubmemory(
           out_vec_dims, offset, reorder_src_memory_p, i);
 
       auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc
index a55959385f6276a4a01057b5fd9c51b04ea12d9e..ac50ccea9eee4682992f102a2f1aa4fcbbf30fff 100644
--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
@@ -132,6 +132,26 @@ class SliceOp : public framework::OperatorWithKernel {
       if (platform::is_cuda_pinned_place(in_tensor.place())) {
         return framework::OpKernelType(in_tensor.type(), ctx.device_context());
       }
+
+#ifdef PADDLE_WITH_MKLDNN
+      auto input_data_type =
+          framework::OperatorWithKernel::IndicateVarDataType(ctx, "Input");
+
+      if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+        // OneDNN uses blocking format, which cannot be always supported with
+        // reorders, because if blocked dimension is not divisible by 8 or
+        // 16(depending on which blocking format is used) submemory cannot be
+        // created, so in that scenario a fallback is needed
+        auto tmp_md = dnnl::memory::desc(
+            framework::vectorize(ctx.Input<Tensor>("Input")->dims()),
+            dnnl::memory::data_type::f32, ctx.Input<Tensor>("Input")->format());
+        if (tmp_md.data.format_desc.blocking.inner_nblks == 0)
+          return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                         framework::DataLayout::kMKLDNN,
+                                         framework::LibraryType::kMKLDNN);
+      }
+#endif
+
       return framework::OpKernelType(in_tensor.type(), in_tensor.place());
     }
     return framework::OpKernelType(
@@ -216,6 +236,14 @@ class SliceOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault({});
     AddAttr<std::vector<int>>("decrease_axis", "(list<int>) decrease_axis")
         .SetDefault({});
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "bfloat16"});
     AddComment(R"DOC(
 Slice Operator.
 
@@ -278,12 +306,32 @@ class SliceOpGrad : public framework::OperatorWithKernel {
       ctx->SetOutputDim(x_grad_name, x_dims);
     }
   }
+
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
-                                       ctx, framework::GradVarName("Out")),
-                                   ctx.device_context());
+    auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
+        ctx, framework::GradVarName("Out"));
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      // OneDNN uses blocking format, which cannot be always supported with
+      // reorders, because if blocked dimension is not divisible by 8 or
+      // 16(depending on which blocking format is used) submemory cannot be
+      // created, so in that scenario a fallback is needed
+      auto tmp_md = dnnl::memory::desc(
+          framework::vectorize(
+              ctx.Input<Tensor>(framework::GradVarName("Out"))->dims()),
+          dnnl::memory::data_type::f32,
+          ctx.Input<Tensor>(framework::GradVarName("Out"))->format());
+      if (tmp_md.data.format_desc.blocking.inner_nblks == 0)
+        return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                       framework::DataLayout::kMKLDNN,
+                                       framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
+
   framework::OpKernelType GetKernelTypeForVar(
       const std::string &var_name, const Tensor &tensor,
       const framework::OpKernelType &expected_kernel_type) const override {
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index f81ac8882d1076a0999acc0810a0a387028d6c7c..5bd699e08abbcad5524a500c28ec7d7768dc18f0 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -78,10 +78,9 @@ class SplitOp : public framework::OperatorWithKernel {
 
 #ifdef PADDLE_WITH_MKLDNN
     if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
-      // OneDNN uses blocking format, which cannot be always
-      // supported with reorders, because if blocked dimension is not divisible
-      // by
-      // 8 or 16(depending on which blocking format is used) submemory cannot be
+      // OneDNN uses blocking format, which cannot be always supported with
+      // reorders, because if blocked dimension is not divisible by 8 or
+      // 16(depending on which blocking format is used) submemory cannot be
       // created, so in that scenario a fallback is needed
       auto tmp_md = dnnl::memory::desc(
           framework::vectorize(ctx.Input<Tensor>("X")->dims()),
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index e6442ded6b5aeca1b1b2aad8b1578697bea617d2..370d9b3925226249130559ccca90c26af4af44d4 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -1090,9 +1090,9 @@ class ReorderMKLDNNHandler : public MKLDNNHandler {
     return this->AcquireMemory(dims_, dtype_, fmt, ptr, "@user_src_mem_p");
   }
 
-  std::shared_ptr<mkldnn::memory> AcquireSrcSubmemory(
+  std::shared_ptr<mkldnn::memory> AcquireSubmemory(
       const std::vector<int64_t>& dims, const std::vector<int64_t>& offset,
-      const std::shared_ptr<mkldnn::memory>& mem_p, int submemory_number) {
+      const std::shared_ptr<mkldnn::memory>& mem_p, int submemory_number = 0) {
     std::string local_key = key_;
     local_key.append("@submem")
         .append(std::to_string(submemory_number))
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_slice_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_slice_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..caebcffd0e966af1dd55eec9fb4b900673c8e66d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_slice_mkldnn_op.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import paddle
+
+
+@OpTestTool.skip_if(core.is_compiled_with_cuda(),
+                    "CUDA required dygraph so oneDNN UT must be skipped")
+class TestSliceOneDNNOp(OpTest):
+    def setUp(self):
+        self.op_type = "slice"
+        self.config()
+        self.set_inputs()
+        self.outputs = {'Out': self.out}
+        self.attrs = {
+            'axes': self.axes,
+            'starts': self.starts,
+            'ends': self.ends,
+            'infer_flags': self.infer_flags,
+            'use_mkldnn': True
+        }
+        self.set_attrs()
+
+    def set_inputs(self):
+        self.inputs = {'Input': self.input}
+
+    def set_attrs(self):
+        pass
+
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [1, 0, 2]
+        self.ends = [3, 3, 4]
+        self.axes = [0, 1, 2]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[1:3, 0:3, 2:4, :]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Input'], 'Out')
+
+
+class TestSliceOneDNNOp1(TestSliceOneDNNOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 2]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-3:3, 0:100, 2:-1, :]
+
+
+class TestSliceOneDNNOp2(TestSliceOneDNNOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-3:3, 0:100, :, 2:-1]
+
+
+class TestSliceDecrease1AxisOneDNNOp(TestSliceOneDNNOp):
+    def set_attrs(self):
+        self.attrs['decrease_axis'] = self.decrease_axis
+
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [1, 0, 2]
+        self.ends = [2, 3, 4]
+        self.axes = [0, 1, 2]
+        self.decrease_axis = [0]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[1, 0:3, 2:4, :]
+
+
+class TestSliceDecrease2AxesOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [1, 0, 2]
+        self.ends = [2, 1, 4]
+        self.axes = [0, 1, 2]
+        self.decrease_axis = [0, 1]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[1, 0, 2:4, :]
+
+
+class TestSliceDecrease3AxesOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [-1, 0, 2]
+        self.ends = [1000000, 1, 4]
+        self.axes = [0, 1, 2]
+        self.decrease_axis = [0, 1]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-1, 0, 2:4, :]
+
+
+class TestSliceDecrease4AxesOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 7]).astype("float32")
+        self.starts = [0, 1, 2, 3]
+        self.ends = [1, 2, 3, 4]
+        self.axes = [0, 1, 2, 3]
+        self.decrease_axis = [0, 1, 2, 3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[0, 1, 2, 3:4]
+
+
+class TestSlice5DOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6, 7]).astype("float32")
+        self.starts = [-1]
+        self.ends = [1000000]
+        self.axes = [4]
+        self.decrease_axis = [4]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[:, :, :, :, -1]
+
+
+class TestSlice3DOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+    def config(self):
+        self.input = np.random.random([5, 4, 5]).astype("float32")
+        self.starts = [-1]
+        self.ends = [1000000]
+        self.axes = [2]
+        self.decrease_axis = [2]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[:, :, -1]
+
+
+# BF16 TESTS
+def create_bf16_test_class(parent):
+    @OpTestTool.skip_if_not_cpu_bf16()
+    class TestSliceBF16OneDNNOp(parent):
+        def set_inputs(self):
+            self.dtype = np.uint16
+            self.inputs = {'Input': convert_float_to_uint16(self.input)}
+
+        def calculate_grads(self):
+            self.dout = self.out
+            self.dx = np.zeros(shape=self.input.shape)
+
+            begin = [None] * self.input.ndim
+            end = [None] * self.input.ndim
+
+            for i in range(len(self.axes)):
+                begin[self.axes[i]] = self.starts[i]
+                end[self.axes[i]] = self.ends[i]
+            # One slice per axis (None bounds keep the whole axis), for any rank.
+            self.dx[tuple(slice(b, e) for b, e in zip(begin, end))] = self.dout
+
+        def test_check_output(self):
+            self.check_output_with_place(core.CPUPlace())
+
+        def test_check_grad(self):
+            self.calculate_grads()
+            self.check_grad_with_place(
+                core.CPUPlace(), ["Input"],
+                "Out",
+                user_defined_grads=[self.dx],
+                user_defined_grad_outputs=[convert_float_to_uint16(self.dout)])
+
+    cls_name = "{0}_{1}".format(parent.__name__, "BF16")
+    TestSliceBF16OneDNNOp.__name__ = cls_name
+    globals()[cls_name] = TestSliceBF16OneDNNOp
+
+
+create_bf16_test_class(TestSliceOneDNNOp)
+create_bf16_test_class(TestSliceOneDNNOp1)
+create_bf16_test_class(TestSliceDecrease1AxisOneDNNOp)
+create_bf16_test_class(TestSliceDecrease2AxesOneDNNOp)
+create_bf16_test_class(TestSliceDecrease3AxesOneDNNOp)
+create_bf16_test_class(TestSliceDecrease4AxesOneDNNOp)
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py b/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py
index 8d665a1746816650a68fddfbf68e51a1ea27ad96..bc8c3cc5b23e547cf38910c45b7d974a5cf7473c 100644
--- a/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py
+++ b/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py
@@ -81,10 +81,13 @@ class TestSaveLoadBF16(unittest.TestCase):
                 y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                 x_data = x_data.reshape((-1, num_steps, 1))
                 y_data = y_data.reshape((-1, 1))
+                # TODO: investigate initializing model with "float32" instead of "uint16", as it was before the
+                # slice_op PR (datatypes in model graph are different than datatypes during runtime because of that)
                 init_hidden_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                    (num_layers, batch_size, hidden_size), dtype='uint16')
                 init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                    (num_layers, batch_size, hidden_size), dtype='uint16')
+
                 fetch_list = [static_loss, static_last_hidden, static_last_cell]
                 out = exe.run(fluid.default_main_program(),
                               feed={