diff --git a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..94cf3747581c1bd65f349f4c2e6cb06e13ede519
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc
@@ -0,0 +1,143 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/utils.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+namespace paddle {
+namespace operators {
+
+using framework::DataLayout;
+using framework::Tensor;
+using framework::LoDTensor;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::concat;
+using mkldnn::stream;
+using platform::to_void_cast;
+
+// Builds the oneDNN concat primitive that implements stack: N inputs of
+// identical dims are concatenated along the (possibly extended) stack axis.
+template <typename T>
+class StackMKLDNNHandler
+    : public platform::MKLDNNHandlerNoCachingT<T, dnnl::concat> {
+ public:
+  StackMKLDNNHandler(const framework::ExecutionContext& ctx,
+                     const mkldnn::engine mkldnn_engine,
+                     const std::vector<const Tensor*>& inputs, Tensor* output)
+      : platform::MKLDNNHandlerNoCachingT<T, dnnl::concat>(mkldnn_engine,
+                                                           ctx.GetPlace()) {
+    int stack_axis = ctx.Attr<int>("axis");
+
+    int ndims = inputs[0]->dims().size();
+
+    if (stack_axis < 0) {
+      stack_axis = ndims + 1 + stack_axis;  // +1 to match output's ndims
+    }
+
+    // in stack op all inputs must have same dims
+    auto input_dims = framework::vectorize(inputs[0]->dims());
+
+    memory::data_type dt = framework::ToMKLDNNDataType(inputs[0]->type());
+    std::vector<memory::desc> srcs_md;
+    memory::desc dst_md;
+    MKLDNNMemoryFormat dst_fmt;
+
+    srcs_md.reserve(inputs.size());
+
+    // if stack is not done on last(non existing) axis, then we can optimize
+    // concat primitive by not adding additional dimension, since it causes
+    // wrong output format deduction and suboptimal performance as a result
+    if (stack_axis != ndims) {
+      for (size_t i = 0; i < inputs.size(); ++i) {
+        srcs_md.emplace_back(memory::desc(input_dims, dt, inputs[i]->format()));
+      }
+
+      input_dims[stack_axis] *= inputs.size();
+      dst_md = memory::desc(input_dims, dt, MKLDNNMemoryFormat::any);
+    } else {
+      auto extended_input_dims = framework::vectorize(output->dims());
+      extended_input_dims[stack_axis] = 1;
+
+      for (size_t i = 0; i < inputs.size(); ++i) {
+        srcs_md.emplace_back(memory::desc(input_dims, dt, inputs[i]->format())
+                                 .reshape(extended_input_dims));
+      }
+
+      // concat primitive choses suboptimal format tag because it cannot
+      // distinguish between f.e. abcd and abdc if last dim is equal to 1 so
+      // enforcing is needed for better performance
+      dst_fmt = platform::GetPlainMKLDNNFormat(extended_input_dims.size());
+      dst_md = memory::desc(framework::vectorize(output->dims()), dt, dst_fmt);
+    }
+
+    this->AcquireForwardPrimitiveDescriptor(dst_md, stack_axis, srcs_md);
+  }
+
+  // concat oneDNN prim is not having .desc attribute so we cannot use default
+  // AcquireForwardPrimitiveDescriptor
+  void AcquireForwardPrimitiveDescriptor(
+      const memory::desc& dst_md, const int stack_axis,
+      const std::vector<memory::desc>& srcs_md) {
+    this->fwd_pd_.reset(new dnnl::concat::primitive_desc(
+        dst_md, stack_axis, srcs_md, this->engine_));
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(const Tensor& input, int i) {
+    const T* input_data = input.data<T>();
+    return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src_desc(i),
+                                            to_void_cast<T>(input_data));
+  }
+};
+
+template <typename T>
+class StackMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto multi_input = ctx.MultiInput<Tensor>("X");
+
+    Tensor* output = ctx.Output<Tensor>("Y");
+
+    StackMKLDNNHandler<T> handler(ctx, mkldnn_engine, multi_input, output);
+
+    std::vector<std::shared_ptr<mkldnn::memory>> srcs;
+    srcs.reserve(multi_input.size());
+
+    auto dst_mem = handler.AcquireDstMemory(output);
+    auto concat_p = handler.AcquireForwardPrimitive();
+
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+    std::unordered_map<int, memory> args;
+    for (size_t i = 0; i < multi_input.size(); ++i) {
+      srcs.push_back(handler.AcquireSrcMemory(*(multi_input[i]), i));
+      args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, *(srcs.at(i))});
+    }
+    args.insert({MKLDNN_ARG_DST, *dst_mem});
+
+    concat_p->execute(astream, args);
+    astream.wait();
+
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(platform::GetMKLDNNFormat(
+        dst_mem->get_desc().reshape(framework::vectorize(output->dims()))));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(stack, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::StackMKLDNNOpKernel<float>);
diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc
index fbd8d8b2e0727e9736f903464d1fedeb76bdaf71..0a813759aa3ecab38864b01586d13ce2aa4d6a73 100644
--- a/paddle/fluid/operators/stack_op.cc
+++ b/paddle/fluid/operators/stack_op.cc
@@ -71,6 +71,21 @@
     vec.insert(vec.begin() + axis, input_dims.size());
     ctx->SetOutputDim("Y", framework::make_ddim(vec));
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class StackOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -81,6 +96,11 @@
     AddAttr<int>("axis",
                  "The axis along which all of the Inputs(X) should be stacked.")
         .SetDefault(0);
+    AddAttr<bool>(
+        "use_mkldnn",
+        "(bool, default false) Indicates if MKL-DNN kernel will be used")
+        .SetDefault(false)
+        .AsExtra();
     AddComment(R"DOC(
 Stack Operator.
 Stack all of the Inputs(X) into one tensor along Attr(axis). The dims of all Inputs(X) must be the same.
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 37fa58e423db779eec3f08aaeb1b71fb49c5c8d6..9236521fe1d95f9b97127322bcd4f16ce003a9d2 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -333,6 +333,43 @@ inline mkldnn::memory::format_tag GetMKLDNNFormat(const mkldnn::memory memory) {
   return GetMKLDNNFormat(mem_desc);
 }
 
+inline mkldnn::memory::format_tag GetPlainMKLDNNFormat(int tensor_rank) {
+  switch (tensor_rank) {
+    case 1:
+      return mkldnn::memory::format_tag::a;
+      break;
+    case 2:
+      return mkldnn::memory::format_tag::ab;
+      break;
+    case 3:
+      return mkldnn::memory::format_tag::abc;
+      break;
+    case 4:
+      return mkldnn::memory::format_tag::abcd;
+      break;
+    case 5:
+      return mkldnn::memory::format_tag::abcde;
+      break;
+    case 6:
+      return mkldnn::memory::format_tag::abcdef;
+      break;
+    case 7:
+      return mkldnn::memory::format_tag::abcdefg;
+      break;
+    case 8:
+      return mkldnn::memory::format_tag::abcdefgh;
+      break;
+    case 9:
+      return mkldnn::memory::format_tag::abcdefghi;
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Paddle support tensors with rank in range <1, 9>, but received "
+          "tensor with rank: %d",
+          tensor_rank));
+  }
+}
+
 inline MKLDNNMemoryFormat MKLDNNFormatForSize(size_t dims_size,
                                               MKLDNNMemoryFormat data_format) {
   if (dims_size == 1) {
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_stack_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_stack_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7424014c21116e17ba897dc6417dff904559584
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_stack_mkldnn_op.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, skip_check_grad_ci
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+
+@OpTestTool.skip_if_not_cpu()
+class TestStack2DOneDNNOp(OpTest):
+    def initDefaultParameters(self):
+        self.num_inputs = 4
+        self.input_dim = (2, 2)
+        self.axis = 1
+        self.dtype = np.float32
+
+    def initParameters(self):
+        pass
+
+    def getInputNames(self):
+        input_names = []
+        for i in range(self.num_inputs):
+            input_names.append('x{}'.format(i))
+        return input_names
+
+    def setUp(self):
+        self.initDefaultParameters()
+        self.initParameters()
+        self.op_type = 'stack'
+        self.op_inputs = []
+
+        for i in range(self.num_inputs):
+            self.op_inputs.append(
+                np.random.random(size=self.input_dim).astype(np.float32))
+
+        input_list = []
+        input_names = self.getInputNames()
+        for i in range(self.num_inputs):
+            input_list.append((input_names[i], self.op_inputs[i]))
+
+        self.inputs = {'X': input_list}
+        self.outputs = {'Y': np.stack(self.op_inputs, axis=self.axis)}
+        self.attrs = {'axis': self.axis, 'use_mkldnn': True}
+
+    def test_check_output(self):
+        self.check_output_with_place(core.CPUPlace())
+
+    # JUST FOR CI TO PASS, GRAD IS NOT IMPLEMENTED YET
+    def test_check_grad(self):
+        pass
+
+
+class TestStack1DOneDNNOp(TestStack2DOneDNNOp):
+    def initParameters(self):
+        self.input_dim = (100)
+        self.axis = 0
+
+
+class TestStack1DAxis1OneDNNOp(TestStack2DOneDNNOp):
+    def initParameters(self):
+        self.input_dim = (100)
+        self.axis = 1
+
+
+class TestStack2DAxisLastOneDNNOp(TestStack2DOneDNNOp):
+    def initParameters(self):
+        self.input_dim = (13, 24)
+        self.num_inputs = 5
+        self.axis = -1
+
+
+class TestStack3DAxisNegativeOneDNNOp(TestStack2DOneDNNOp):
+    def initParameters(self):
+        self.input_dim = (10, 128, 128)
+        self.axis = -2
+
+
+class TestStack3DOneDNNOp(TestStack2DOneDNNOp):
+    def initParameters(self):
+        self.input_dim = (10, 128, 128)
+        self.num_inputs = 3
+        self.axis = 1
+
+
+class TestStack4DOneDNNOp(TestStack2DOneDNNOp):
+    def initParameters(self):
+        self.input_dim = (2, 2, 2, 2)
+        self.num_inputs = 3
+        self.axis = 4
+
+
+class TestStack5DOneDNNOp(TestStack2DOneDNNOp):
+    def initParameters(self):
+        self.input_dim = (2, 3, 4, 5, 6)
+        self.num_inputs = 6
+        self.axis = 0
+
+
+if __name__ == "__main__":
+    paddle.enable_static()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 41fd0b442fe1c50e5cdf26df814874f164cffae5..a3e1650c131cdede0c7cb5dc004ef67d84c671b5 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -1832,3 +1832,9 @@ class OpTestTool:
             not (isinstance(_current_expected_place(), core.CPUPlace) and
                  core.supports_bfloat16()),
             "Place does not support BF16 evaluation")
+
+    @classmethod
+    def skip_if_not_cpu(cls):
+        return OpTestTool.skip_if(
+            not isinstance(_current_expected_place(), core.CPUPlace),
+            "OneDNN supports only CPU for now")