From 24783c841d47b338c57a4d45680d552a6746cf66 Mon Sep 17 00:00:00 2001
From: jakpiase <62569058+jakpiase@users.noreply.github.com>
Date: Wed, 30 Jun 2021 13:51:32 +0200
Subject: [PATCH] Added matmul_v2 BF16/FP32 FWD kernel (#33750)

* added matmul_v2 bf16/fp32 FWD kernel

added matmul_v2 bf16/fp32 FWD kernel

* added formatting

* removed some tests due to timeout in CI

* refactored tests

* merged tests classes into one file

* minor change

* removed test guard for CUDA

* remove skipIf

* changes after review

* formated one file

* minor change

* added skipping UT in CUDA place
---
 .../framework/ir/graph_pattern_detector.cc    |   4 +-
 paddle/fluid/operators/matmul_v2_op.cc        |  20 +-
 .../operators/mkldnn/matmul_v2_mkldnn_op.cc   | 205 +++++++++++++
 .../mkldnn/test_matmul_v2_mkldnn_op.py        | 288 ++++++++++++++++++
 tools/static_mode_white_list.py               |   1 +
 5 files changed, 514 insertions(+), 4 deletions(-)
 create mode 100644 paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
 create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py

diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index b542fe49af..37a8ec1268 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -2265,8 +2265,8 @@ PDNode *patterns::Bfloat16Placement::operator()(
       std::unordered_set<std::string>(
           {"concat", "conv2d", "conv2d_transpose", "elementwise_add",
            "elementwise_mul", "fc", "fusion_gru", "fusion_lstm", "gelu",
-           "layer_norm", "matmul", "pool2d", "relu", "reshape2", "softmax",
-           "split", "sum", "transpose2"});
+           "layer_norm", "matmul", "matmul_v2", "pool2d", "relu", "reshape2",
+           "softmax", "split", "sum", "transpose2"});
   if (!bfloat16_enabled_op_types.empty()) {
     supported_op_types = bfloat16_enabled_op_types;
   }
diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc
index 82706fd487..8ac81596a3 100644
--- a/paddle/fluid/operators/matmul_v2_op.cc
+++ b/paddle/fluid/operators/matmul_v2_op.cc
@@ -85,9 +85,17 @@ class MatMulV2Op : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto data_type =
+    auto input_data_type =
         OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y");
-    return framework::OpKernelType(data_type, ctx.device_context());
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 
   framework::OpKernelType GetKernelTypeForVar(
@@ -118,6 +126,14 @@ class MatMulV2OpMaker : public framework::OpProtoAndCheckerMaker {
               "Set true to transpose the last two dimensions of Y before "
               "doing multiplication")
         .SetDefault(false);
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "bfloat16"});
     AddComment(
         R"DOC(Matrix multiplication Out = X * Y. A has shape (d0, d1 ... M, K), 
         B has shape (d0, d1 ... K, N), Out has shape ((d0, d1 ... M, N)). 
diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
new file mode 100644
index 0000000000..50afd41717
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
@@ -0,0 +1,205 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using dnnl::memory;
+using dnnl::primitive;
+using framework::DataLayout;
+using framework::ExecutionContext;
+using platform::GetMKLDNNFormat;
+using platform::MKLDNNDeviceContext;
+using platform::MKLDNNGetDataType;
+using platform::to_void_cast;
+using Tensor = framework::Tensor;
+
+template <typename T>
+class MatMulV2MKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::matmul> {
+ public:
+  MatMulV2MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
+                        const mkldnn::engine engine, platform::Place cpu_place,
+                        std::vector<int64_t>& x_dims, bool trans_x,
+                        std::vector<int64_t>& y_dims, bool trans_y,
+                        const std::string& uniq_name)
+      : platform::MKLDNNHandlerT<T, dnnl::matmul>(
+            dev_ctx, engine, cpu_place,
+            platform::CreateKey(dev_ctx, x_dims, uniq_name)) {
+    if (!this->isCached()) {
+      // M X K * K X N
+      const int MB_idx = x_dims.size() - 3;
+      const int H_idx = x_dims.size() - 2;
+      const int W_idx = x_dims.size() - 1;
+
+      if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]);
+      if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]);
+
+      const memory::dim M = x_dims[H_idx];
+      const memory::dim K = x_dims[W_idx];
+      const memory::dim N = y_dims[W_idx];
+
+      std::vector<int64_t> x_strides(x_dims.size() - 3, 1);
+      std::vector<int64_t> y_strides(x_dims.size() - 3, 1);
+      std::vector<int64_t> out_strides(x_dims.size() - 3, 1);
+      std::vector<int64_t> out_ddims(x_dims.size() - 3, 1);
+
+      x_strides.reserve(x_dims.size());
+      y_strides.reserve(x_dims.size());
+      out_strides.reserve(x_dims.size());
+
+      if (!trans_x) {
+        x_strides.insert(x_strides.end(), {M * K, K, 1});
+      } else {
+        x_strides.insert(x_strides.end(), {M * K, 1, M});
+      }
+
+      if (!trans_y) {
+        y_strides.insert(y_strides.end(), {N * K, N, 1});
+      } else {
+        y_strides.insert(y_strides.end(), {N * K, 1, K});
+      }
+
+      out_strides.insert(out_strides.end(), {M * N, N, 1});
+      out_ddims.insert(out_ddims.end(),
+                       {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N});
+
+      for (int i = x_dims.size() - 4; i >= 0; --i) {
+        out_ddims[i] = std::max(x_dims[i], y_dims[i]);
+        x_strides[i] = x_dims[i + 1] * x_strides[i + 1];
+        y_strides[i] = y_dims[i + 1] * y_strides[i + 1];
+        out_strides[i] = out_ddims[i + 1] * out_strides[i + 1];
+      }
+
+      auto x_md = memory::desc(x_dims, MKLDNNGetDataType<T>(), x_strides);
+      auto y_md = memory::desc(y_dims, MKLDNNGetDataType<T>(), y_strides);
+      auto out_md =
+          memory::desc(out_ddims, MKLDNNGetDataType<T>(), out_strides);
+
+      this->AcquireForwardPrimitiveDescriptor(x_md, y_md, out_md);
+    }
+  }
+
+  std::shared_ptr<memory> AcquireWeightsMemory(const Tensor* input) {
+    const T* input_data = input->data<T>();
+    return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(),
+                                            to_void_cast<T>(input_data),
+                                            "@weights_mem_p");
+  }
+};
+
+template <typename T>
+class MatMulV2MKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); }
+
+ private:
+  void CalculateMatrixDims(const ExecutionContext& ctx,
+                           const std::vector<int64_t>& x_dims,
+                           const std::vector<int64_t>& y_dims,
+                           std::vector<int64_t>& x_bd_dims,
+                           std::vector<int64_t>& y_bd_dims,
+                           std::vector<int64_t>& out_dims, Tensor* out) const {
+    if (x_dims.size() == 1) {
+      x_bd_dims[x_bd_dims.size() - 1] = x_dims[0];
+    } else {
+      for (size_t i = 0; i < x_dims.size(); ++i) {
+        x_bd_dims[i] = x_dims[i];
+      }
+    }
+    if (y_dims.size() == 1) {
+      y_bd_dims[x_bd_dims.size() - 2] = y_dims[0];
+    } else {
+      for (size_t i = 0; i < y_dims.size(); ++i) {
+        y_bd_dims[i] = y_dims[i];
+      }
+    }
+
+    if ((y_dims.size() == x_dims.size()) && y_dims.size() > 2) {
+      for (size_t i = 0; i < x_dims.size() - 2; ++i) {
+        PADDLE_ENFORCE_EQ(
+            x_dims[i] == y_dims[i] || x_dims[i] == 1 || y_dims[i] == 1, true,
+            platform::errors::InvalidArgument(
+                "Tensor dimensions are incorrect for broadcasting."
+                "Dimensions in X and Y must be same or equal to 1, but "
+                "received x_dim[%d]=%d and y_dims[%d]= %d",
+                i, x_dims[i], i, y_dims[i]));
+        out_dims[i] = std::max(x_dims[i], y_dims[i]);
+      }
+      out->Resize(framework::make_ddim(out_dims));
+    }
+  }
+
+  void RunKernel(const ExecutionContext& ctx) const {
+    const auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& onednn_engine = dev_ctx.GetEngine();
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Output<Tensor>("Out");
+    bool trans_x = ctx.Attr<bool>("trans_x");
+    bool trans_y = ctx.Attr<bool>("trans_y");
+
+    auto x_dims = framework::vectorize(x->dims());
+    auto y_dims = framework::vectorize(y->dims());
+    auto out_dims = framework::vectorize(out->dims());
+
+    int ndims = std::max(x->dims().size(), y->dims().size());
+    ndims = std::max(ndims, 3);
+
+    std::vector<int64_t> x_bd_dims(ndims, 1);
+    std::vector<int64_t> y_bd_dims(ndims, 1);
+
+    CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, out_dims,
+                        out);
+
+    MatMulV2MKLDNNHandler<T> handler(dev_ctx, onednn_engine, ctx.GetPlace(),
+                                     x_bd_dims, trans_x, y_bd_dims, trans_y,
+                                     ctx.InputName("X"));
+
+    const auto src_memory_p = handler.AcquireSrcMemory(x);
+    const auto weights_memory_p = handler.AcquireWeightsMemory(y);
+    const auto dst_memory_p = handler.AcquireDstMemory(out);
+
+    auto matmul_p = handler.AcquireForwardPrimitive();
+
+    std::unordered_map<int, memory> matmul_args = {
+        {DNNL_ARG_SRC, *src_memory_p},
+        {DNNL_ARG_WEIGHTS, *weights_memory_p},
+        {DNNL_ARG_DST, *dst_memory_p}};
+
+    auto& astream = MKLDNNDeviceContext::tls().get_stream();
+    matmul_p->execute(astream, matmul_args);
+    astream.wait();
+
+    out->set_layout(framework::DataLayout::kMKLDNN);
+    out->set_format(
+        GetMKLDNNFormat(dst_memory_p->get_desc().reshape(out_dims)));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(matmul_v2, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::MatMulV2MKLDNNKernel<float>,
+                   ops::MatMulV2MKLDNNKernel<paddle::platform::bfloat16>);
+
+// REGISTER_OP_KERNEL(matmul_grad_v2, MKLDNN, ::paddle::platform::CPUPlace,
+//                    ops::MatMulV2GradMKLDNNKernel<float>,
+//                    ops::MatMulV2GradMKLDNNKernel<paddle::platform::bfloat16>);
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py
new file mode 100644
index 0000000000..11b111310d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py
@@ -0,0 +1,288 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16
+import paddle.fluid.core as core
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.framework as framework
+
+
+def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
+    """Reference forward implementation using np.matmul."""
+    # np.matmul does not support the transpose flags, so we manually
+    # transpose X and Y appropriately.
+    if transpose_X:
+        if X.ndim == 1:
+            X = X.reshape((X.size, ))
+        elif X.ndim == 2:
+            X = X.T
+        else:
+            dim = [i for i in range(len(X.shape))]
+            dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1]
+            X = np.transpose(X, tuple(dim))
+    if transpose_Y:
+        if Y.ndim == 1:
+            Y = Y.reshape((Y.size, ))
+        else:
+            dim = [i for i in range(len(Y.shape))]
+            dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1]
+            Y = np.transpose(Y, tuple(dim))
+
+    Out = np.atleast_1d(np.matmul(X, Y))
+    return Out
+
+
+class TestMatMulV2VectorXVectorOneDNNOp(OpTest):
+    def config(self):
+        self.x_shape = (100, )
+        self.y_shape = (100, )
+        self.trans_x = False
+        self.trans_y = False
+
+    def set_inputs(self, x, y):
+        self.inputs = {'X': x, 'Y': y}
+
+    def set_dtype_attr(self):
+        self.attrs['mkldnn_data_type'] = "float32"
+
+    def setUp(self):
+        self.config()
+        self.op_type = "matmul_v2"
+        x = np.random.random(self.x_shape).astype("float32")
+        y = np.random.random(self.y_shape).astype("float32")
+        # -0.1 ~ 0.1
+        x = -0.1 + 0.2 * x
+        y = -0.1 + 0.2 * y
+        result = reference_matmul(x, y, self.trans_x,
+                                  self.trans_y).astype("float32")
+
+        self.set_inputs(x, y)
+        self.attrs = {
+            'trans_x': self.trans_x,
+            'trans_y': self.trans_y,
+            'use_mkldnn': True
+        }
+        self.set_dtype_attr()
+        self.outputs = {'Out': result}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X', 'Y'], 'Out')
+
+
+class TestMatMulV2VectorXMatrixTransposeYOneDNNOp(
+        TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (100, )
+        self.y_shape = (1, 3, 2, 100)
+        self.trans_x = False
+        self.trans_y = True
+
+
+class TestMatMulV2VectorXMatrixOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (100, )
+        self.y_shape = (1, 1, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMulV2MatrixXVectorTransposeXOneDNNOp(
+        TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (1, 1, 100, 1)
+        self.y_shape = (100, )
+        self.trans_x = True
+        self.trans_y = False
+
+
+class TestMatMulV2MatrixXVectorOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (1, 2, 1, 100)
+        self.y_shape = (100, )
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMulV2MatrixXMatrixOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (1, 1, 2, 100)
+        self.y_shape = (1, 1, 100, 1)
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMulV2MatrixXMatrixTransposeYOneDNNOp(
+        TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (1, 1, 1, 100)
+        self.y_shape = (2, 1, 2, 100)
+        self.trans_x = False
+        self.trans_y = True
+
+
+class TestMatMulV2MatrixXMatrix2OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (1, 1, 12, 4)
+        self.y_shape = (1, 2, 4, 12)
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMulV2MatrixXMatrix3OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (2, 1, 2, 100)
+        self.y_shape = (1, 1, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMulV2MatrixXMatrixTranposeXOneDNNOp2(
+        TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (2, 1, 4, 25)
+        self.y_shape = (1, 1, 4, 25)
+        self.trans_x = True
+        self.trans_y = False
+
+
+class TestMatMulV2MatrixXMatrixTranposeX2OneDNNOp3(
+        TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (2, 2, 5, 4)
+        self.y_shape = (2, 2, 5, 3)
+        self.trans_x = True
+        self.trans_y = False
+
+
+class TestMatMulV2MatrixXMatrixTransposeX3OneDNNOp(
+        TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (3, 1, 6, 5)
+        self.y_shape = (1, 2, 6, 9)
+        self.trans_x = True
+        self.trans_y = False
+
+
+class TestMatMulV2MatrixXMatrix4OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (3, 1, 6, 6)
+        self.y_shape = (1, 2, 6, 9)
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMulV2VectorXMatrix5DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (100)
+        self.y_shape = (1, 2, 2, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMulV2Matrix3DXVectorOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (2, 1, 40)
+        self.y_shape = (40)
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp(
+        TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (3, 1, 10, 8)
+        self.y_shape = (1, 2, 9, 10)
+        self.trans_x = True
+        self.trans_y = True
+
+
+class TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp(
+        TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (3, 1, 10, 10)
+        self.y_shape = (1, 2, 9, 10)
+        self.trans_x = False
+        self.trans_y = True
+
+
+class TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp(
+        TestMatMulV2VectorXVectorOneDNNOp):
+    def config(self):
+        self.x_shape = (1, 3, 1, 10, 10)
+        self.y_shape = (3, 1, 2, 9, 10)
+        self.trans_x = False
+        self.trans_y = True
+
+
+# BF16 TESTS
+def create_bf16_test_class(parent):
+    class TestMatMulV2Bf16OneDNNOp(parent):
+        def set_inputs(self, x, y):
+            self.inputs = {
+                'X': convert_float_to_uint16(x),
+                'Y': convert_float_to_uint16(y)
+            }
+
+        def set_dtype_attr(self):
+            self.attrs['mkldnn_data_type'] = "bfloat16"
+
+        def test_check_output(self):
+            if core.is_compiled_with_cuda():
+                self.skipTest(
+                    "OneDNN doesn't support bf16 with CUDA, skipping UT" +
+                    self.__class__.__name__)
+            elif not core.supports_bfloat16():
+                self.skipTest("Core doesn't support bf16, skipping UT" +
+                              self.__class__.__name__)
+            else:
+                self.check_output_with_place(core.CPUPlace())
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}".format(parent.__name__, "BF16")
+    TestMatMulV2Bf16OneDNNOp.__name__ = cls_name
+    globals()[cls_name] = TestMatMulV2Bf16OneDNNOp
+
+
+create_bf16_test_class(TestMatMulV2VectorXMatrixTransposeYOneDNNOp)
+create_bf16_test_class(TestMatMulV2VectorXMatrixOneDNNOp)
+create_bf16_test_class(TestMatMulV2MatrixXVectorTransposeXOneDNNOp)
+create_bf16_test_class(TestMatMulV2MatrixXVectorOneDNNOp)
+create_bf16_test_class(TestMatMulV2MatrixXMatrixOneDNNOp)
+create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeYOneDNNOp)
+create_bf16_test_class(TestMatMulV2MatrixXMatrix2OneDNNOp)
+create_bf16_test_class(TestMatMulV2MatrixXMatrix3OneDNNOp)
+create_bf16_test_class(TestMatMulV2MatrixXMatrixTranposeXOneDNNOp2)
+create_bf16_test_class(TestMatMulV2MatrixXMatrixTranposeX2OneDNNOp3)
+create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeX3OneDNNOp)
+create_bf16_test_class(TestMatMulV2MatrixXMatrix4OneDNNOp)
+create_bf16_test_class(TestMatMulV2VectorXMatrix5DOneDNNOp)
+create_bf16_test_class(TestMatMulV2Matrix3DXVectorOneDNNOp)
+create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp)
+create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp)
+create_bf16_test_class(TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp)
+
+if __name__ == "__main__":
+    paddle.enable_static()
+    unittest.main()
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index 7b38f39976..09029b6ad8 100644
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -625,6 +625,7 @@ STATIC_MODE_TESTING_LIST = [
     'test_lrn_mkldnn_op',
     'test_matmul_mkldnn_op',
     'test_matmul_bf16_mkldnn_op',
+    'test_matmul_v2_mkldnn_op',
     'test_mul_int8_mkldnn_op',
     'test_multi_gru_mkldnn_op',
     'test_multi_gru_fuse_pass',
--
GitLab
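
For context, a minimal usage sketch follows; it is not part of the patch itself. It assumes a CPU build of Paddle compiled with oneDNN support and relies on the standard FLAGS_use_mkldnn global flag, which makes the static-graph executor set use_mkldnn=True on supported ops so that matmul_v2 dispatches to the kernel registered above. The shapes and variable names are illustrative only.

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
# Assumption: Paddle was built with oneDNN (MKL-DNN) and runs on CPUPlace.
# FLAGS_use_mkldnn asks the executor to enable use_mkldnn on supported ops,
# so matmul_v2 can pick the oneDNN FP32 forward kernel added by this patch.
fluid.set_flags({'FLAGS_use_mkldnn': True})

x = paddle.static.data(name='x', shape=[2, 100, 4], dtype='float32')
y = paddle.static.data(name='y', shape=[2, 4, 100], dtype='float32')
out = paddle.matmul(x, y)  # lowered to a matmul_v2 op in the static graph

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(paddle.static.default_startup_program())
result, = exe.run(feed={'x': np.random.rand(2, 100, 4).astype('float32'),
                        'y': np.random.rand(2, 4, 100).astype('float32')},
                  fetch_list=[out])
print(result.shape)  # (2, 100, 100)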