Unverified commit 24783c84, authored by jakpiase, committed by GitHub

Added matmul_v2 BF16/FP32 FWD kernel (#33750)

* added matmul_v2 bf16/fp32 FWD kernel

* added formatting

* removed some tests due to timeout in CI

* refactored tests

* merged tests classes into one file

* minor change

* removed test guard for CUDA

* remove skipIf

* changes after review

* formatted one file

* minor change

* added skipping UT in CUDA place
Parent commit: defae0ef
@@ -2265,8 +2265,8 @@ PDNode *patterns::Bfloat16Placement::operator()(
std::unordered_set<std::string>(
{"concat", "conv2d", "conv2d_transpose", "elementwise_add",
"elementwise_mul", "fc", "fusion_gru", "fusion_lstm", "gelu",
"layer_norm", "matmul", "pool2d", "relu", "reshape2", "softmax",
"split", "sum", "transpose2"});
"layer_norm", "matmul", "matmul_v2", "pool2d", "relu", "reshape2",
"softmax", "split", "sum", "transpose2"});
if (!bfloat16_enabled_op_types.empty()) {
supported_op_types = bfloat16_enabled_op_types;
}
@@ -85,9 +85,17 @@ class MatMulV2Op : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
-    auto data_type =
+    auto input_data_type =
OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y");
-    return framework::OpKernelType(data_type, ctx.device_context());
+#ifdef PADDLE_WITH_MKLDNN
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
framework::OpKernelType GetKernelTypeForVar(
@@ -118,6 +126,14 @@ class MatMulV2OpMaker : public framework::OpProtoAndCheckerMaker {
"Set true to transpose the last two dimensions of Y before "
"doing multiplication")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"mkldnn_data_type",
"(string, default \"float32\"). Data type of mkldnn kernel")
.SetDefault("float32")
.InEnum({"float32", "bfloat16"});
AddComment(
R"DOC(Matrix multiplication Out = X * Y. X has shape (d0, d1 ... M, K),
Y has shape (d0, d1 ... K, N), Out has shape (d0, d1 ... M, N).
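For context, a minimal sketch of how the new oneDNN path is exercised from static mode. Assumptions not shown in this patch: a CPU build of Paddle with oneDNN support, the global FLAGS_use_mkldnn flag, and the illustrative shapes and names below.

import numpy as np
import paddle

paddle.enable_static()
# Assumption: CPU build with oneDNN; this flag routes eligible ops
# (now including matmul_v2) to their oneDNN kernels.
paddle.set_flags({'FLAGS_use_mkldnn': True})

x = paddle.static.data(name='x', shape=[2, 100], dtype='float32')
y = paddle.static.data(name='y', shape=[100, 3], dtype='float32')
out = paddle.matmul(x, y)  # lowered to the matmul_v2 op in static graphs

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(paddle.static.default_startup_program())
res, = exe.run(feed={'x': np.random.rand(2, 100).astype('float32'),
                     'y': np.random.rand(100, 3).astype('float32')},
               fetch_list=[out])
print(res.shape)  # (2, 3)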
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
using dnnl::memory;
using dnnl::primitive;
using framework::DataLayout;
using framework::ExecutionContext;
using platform::GetMKLDNNFormat;
using platform::MKLDNNDeviceContext;
using platform::MKLDNNGetDataType;
using platform::to_void_cast;
using Tensor = framework::Tensor;
template <typename T>
class MatMulV2MKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::matmul> {
public:
MatMulV2MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
const mkldnn::engine engine, platform::Place cpu_place,
std::vector<int64_t>& x_dims, bool trans_x,
std::vector<int64_t>& y_dims, bool trans_y,
const std::string& uniq_name)
: platform::MKLDNNHandlerT<T, dnnl::matmul>(
dev_ctx, engine, cpu_place,
platform::CreateKey(dev_ctx, x_dims, uniq_name)) {
if (!this->isCached()) {
// M X K * K X N
const int MB_idx = x_dims.size() - 3;
const int H_idx = x_dims.size() - 2;
const int W_idx = x_dims.size() - 1;
if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]);
if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]);
const memory::dim M = x_dims[H_idx];
const memory::dim K = x_dims[W_idx];
const memory::dim N = y_dims[W_idx];
std::vector<int64_t> x_strides(x_dims.size() - 3, 1);
std::vector<int64_t> y_strides(x_dims.size() - 3, 1);
std::vector<int64_t> out_strides(x_dims.size() - 3, 1);
std::vector<int64_t> out_ddims(x_dims.size() - 3, 1);
x_strides.reserve(x_dims.size());
y_strides.reserve(x_dims.size());
out_strides.reserve(x_dims.size());
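// The two trailing strides either describe the plain row-major (M, K) / (K, N)
// layout or, when trans_x / trans_y is set, a transposed view over the
// original row-major buffer -- no data is actually rearranged.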
if (!trans_x) {
x_strides.insert(x_strides.end(), {M * K, K, 1});
} else {
x_strides.insert(x_strides.end(), {M * K, 1, M});
}
if (!trans_y) {
y_strides.insert(y_strides.end(), {N * K, N, 1});
} else {
y_strides.insert(y_strides.end(), {N * K, 1, K});
}
out_strides.insert(out_strides.end(), {M * N, N, 1});
out_ddims.insert(out_ddims.end(),
{std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N});
for (int i = x_dims.size() - 4; i >= 0; --i) {
out_ddims[i] = std::max(x_dims[i], y_dims[i]);
x_strides[i] = x_dims[i + 1] * x_strides[i + 1];
y_strides[i] = y_dims[i + 1] * y_strides[i + 1];
out_strides[i] = out_ddims[i + 1] * out_strides[i + 1];
}
auto x_md = memory::desc(x_dims, MKLDNNGetDataType<T>(), x_strides);
auto y_md = memory::desc(y_dims, MKLDNNGetDataType<T>(), y_strides);
auto out_md =
memory::desc(out_ddims, MKLDNNGetDataType<T>(), out_strides);
this->AcquireForwardPrimitiveDescriptor(x_md, y_md, out_md);
}
}
std::shared_ptr<memory> AcquireWeightsMemory(const Tensor* input) {
const T* input_data = input->data<T>();
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(),
to_void_cast<T>(input_data),
"@weights_mem_p");
}
};
template <typename T>
class MatMulV2MKLDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); }
private:
void CalculateMatrixDims(const ExecutionContext& ctx,
const std::vector<int64_t>& x_dims,
const std::vector<int64_t>& y_dims,
std::vector<int64_t>& x_bd_dims,
std::vector<int64_t>& y_bd_dims,
std::vector<int64_t>& out_dims, Tensor* out) const {
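// Expands x_dims / y_dims into ndims-sized broadcast shapes (a 1-D X is
// treated as a row vector, a 1-D Y as a column vector); when both inputs
// share a rank greater than two, the batch dims are broadcast into out_dims
// (elementwise max) and Out is resized accordingly.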
if (x_dims.size() == 1) {
x_bd_dims[x_bd_dims.size() - 1] = x_dims[0];
} else {
for (size_t i = 0; i < x_dims.size(); ++i) {
x_bd_dims[i] = x_dims[i];
}
}
if (y_dims.size() == 1) {
y_bd_dims[x_bd_dims.size() - 2] = y_dims[0];
} else {
for (size_t i = 0; i < y_dims.size(); ++i) {
y_bd_dims[i] = y_dims[i];
}
}
if ((y_dims.size() == x_dims.size()) && y_dims.size() > 2) {
for (size_t i = 0; i < x_dims.size() - 2; ++i) {
PADDLE_ENFORCE_EQ(
x_dims[i] == y_dims[i] || x_dims[i] == 1 || y_dims[i] == 1, true,
platform::errors::InvalidArgument(
"Tensor dimensions are incorrect for broadcasting."
"Dimensions in X and Y must be same or equal to 1, but "
"received x_dim[%d]=%d and y_dims[%d]= %d",
i, x_dims[i], i, y_dims[i]));
out_dims[i] = std::max(x_dims[i], y_dims[i]);
}
out->Resize(framework::make_ddim(out_dims));
}
}
void RunKernel(const ExecutionContext& ctx) const {
const auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& onednn_engine = dev_ctx.GetEngine();
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Output<Tensor>("Out");
bool trans_x = ctx.Attr<bool>("trans_x");
bool trans_y = ctx.Attr<bool>("trans_y");
auto x_dims = framework::vectorize(x->dims());
auto y_dims = framework::vectorize(y->dims());
auto out_dims = framework::vectorize(out->dims());
int ndims = std::max(x->dims().size(), y->dims().size());
ndims = std::max(ndims, 3);
std::vector<int64_t> x_bd_dims(ndims, 1);
std::vector<int64_t> y_bd_dims(ndims, 1);
CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, out_dims,
out);
MatMulV2MKLDNNHandler<T> handler(dev_ctx, onednn_engine, ctx.GetPlace(),
x_bd_dims, trans_x, y_bd_dims, trans_y,
ctx.InputName("X"));
const auto src_memory_p = handler.AcquireSrcMemory(x);
const auto weights_memory_p = handler.AcquireWeightsMemory(y);
const auto dst_memory_p = handler.AcquireDstMemory(out);
auto matmul_p = handler.AcquireForwardPrimitive();
std::unordered_map<int, memory> matmul_args = {
{DNNL_ARG_SRC, *src_memory_p},
{DNNL_ARG_WEIGHTS, *weights_memory_p},
{DNNL_ARG_DST, *dst_memory_p}};
auto& astream = MKLDNNDeviceContext::tls().get_stream();
matmul_p->execute(astream, matmul_args);
astream.wait();
out->set_layout(framework::DataLayout::kMKLDNN);
out->set_format(
GetMKLDNNFormat(dst_memory_p->get_desc().reshape(out_dims)));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(matmul_v2, MKLDNN, ::paddle::platform::CPUPlace,
ops::MatMulV2MKLDNNKernel<float>,
ops::MatMulV2MKLDNNKernel<paddle::platform::bfloat16>);
// REGISTER_OP_KERNEL(matmul_grad_v2, MKLDNN, ::paddle::platform::CPUPlace,
// ops::MatMulV2GradMKLDNNKernel<float>,
// ops::MatMulV2GradMKLDNNKernel<paddle::platform::bfloat16>);
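For reference, the forward semantics the registered kernels implement (batch-dimension broadcasting plus optional transposition of the two trailing dims) mirror numpy's batched matmul, which the unit tests below use as ground truth. A small illustrative numpy sketch with made-up shapes:

import numpy as np

# Batch dims (2, 1) and (1, 2) broadcast to (2, 2); the trailing dims contract over K = 100.
x = np.random.rand(2, 1, 2, 100).astype(np.float32)
y = np.random.rand(1, 2, 100, 5).astype(np.float32)
print(np.matmul(x, y).shape)  # (2, 2, 2, 5)

# trans_y=True corresponds to transposing the last two dims of Y before multiplying.
y_t = np.random.rand(1, 2, 5, 100).astype(np.float32)
print(np.matmul(x, np.swapaxes(y_t, -1, -2)).shape)  # (2, 2, 2, 5)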
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16
import paddle.fluid.core as core
import paddle
import paddle.fluid as fluid
import paddle.fluid.framework as framework
def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
"""Reference forward implementation using np.matmul."""
# np.matmul does not support the transpose flags, so we manually
# transpose X and Y appropriately.
if transpose_X:
if X.ndim == 1:
X = X.reshape((X.size, ))
elif X.ndim == 2:
X = X.T
else:
dim = [i for i in range(len(X.shape))]
dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1]
X = np.transpose(X, tuple(dim))
if transpose_Y:
if Y.ndim == 1:
Y = Y.reshape((Y.size, ))
else:
dim = [i for i in range(len(Y.shape))]
dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1]
Y = np.transpose(Y, tuple(dim))
Out = np.atleast_1d(np.matmul(X, Y))
return Out
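# For example, reference_matmul(np.ones((2, 5)), np.ones((2, 5)), transpose_Y=True)
# contracts the shared K = 5 dimension and returns a (2, 2) array filled with 5.0.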
class TestMatMulV2VectorXVectorOneDNNOp(OpTest):
def config(self):
self.x_shape = (100, )
self.y_shape = (100, )
self.trans_x = False
self.trans_y = False
def set_inputs(self, x, y):
self.inputs = {'X': x, 'Y': y}
def set_dtype_attr(self):
self.attrs['mkldnn_data_type'] = "float32"
def setUp(self):
self.config()
self.op_type = "matmul_v2"
x = np.random.random(self.x_shape).astype("float32")
y = np.random.random(self.y_shape).astype("float32")
# rescale values into the range [-0.1, 0.1]
x = -0.1 + 0.2 * x
y = -0.1 + 0.2 * y
result = reference_matmul(x, y, self.trans_x,
self.trans_y).astype("float32")
self.set_inputs(x, y)
self.attrs = {
'trans_x': self.trans_x,
'trans_y': self.trans_y,
'use_mkldnn': True
}
self.set_dtype_attr()
self.outputs = {'Out': result}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X', 'Y'], 'Out')
class TestMatMulV2VectorXMatrixTransposeYOneDNNOp(
TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (100, )
self.y_shape = (1, 3, 2, 100)
self.trans_x = False
self.trans_y = True
class TestMatMulV2VectorXMatrixOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (100, )
self.y_shape = (1, 1, 100, 2)
self.trans_x = False
self.trans_y = False
class TestMatMulV2MatrixXVectorTransposeXOneDNNOp(
TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (1, 1, 100, 1)
self.y_shape = (100, )
self.trans_x = True
self.trans_y = False
class TestMatMulV2MatrixXVectorOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (1, 2, 1, 100)
self.y_shape = (100, )
self.trans_x = False
self.trans_y = False
class TestMatMulV2MatrixXMatrixOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (1, 1, 2, 100)
self.y_shape = (1, 1, 100, 1)
self.trans_x = False
self.trans_y = False
class TestMatMulV2MatrixXMatrixTransposeYOneDNNOp(
TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (1, 1, 1, 100)
self.y_shape = (2, 1, 2, 100)
self.trans_x = False
self.trans_y = True
class TestMatMulV2MatrixXMatrix2OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (1, 1, 12, 4)
self.y_shape = (1, 2, 4, 12)
self.trans_x = False
self.trans_y = False
class TestMatMulV2MatrixXMatrix3OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (2, 1, 2, 100)
self.y_shape = (1, 1, 100, 2)
self.trans_x = False
self.trans_y = False
class TestMatMulV2MatrixXMatrixTranposeXOneDNNOp2(
TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (2, 1, 4, 25)
self.y_shape = (1, 1, 4, 25)
self.trans_x = True
self.trans_y = False
class TestMatMulV2MatrixXMatrixTranposeX2OneDNNOp3(
TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (2, 2, 5, 4)
self.y_shape = (2, 2, 5, 3)
self.trans_x = True
self.trans_y = False
class TestMatMulV2MatrixXMatrixTransposeX3OneDNNOp(
TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (3, 1, 6, 5)
self.y_shape = (1, 2, 6, 9)
self.trans_x = True
self.trans_y = False
class TestMatMulV2MatrixXMatrix4OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (3, 1, 6, 6)
self.y_shape = (1, 2, 6, 9)
self.trans_x = False
self.trans_y = False
class TestMatMulV2VectorXMatrix5DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (100)
self.y_shape = (1, 2, 2, 100, 2)
self.trans_x = False
self.trans_y = False
class TestMatMulV2Matrix3DXVectorOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (2, 1, 40)
self.y_shape = (40)
self.trans_x = False
self.trans_y = False
class TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp(
TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (3, 1, 10, 8)
self.y_shape = (1, 2, 9, 10)
self.trans_x = True
self.trans_y = True
class TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp(
TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (3, 1, 10, 10)
self.y_shape = (1, 2, 9, 10)
self.trans_x = False
self.trans_y = True
class TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp(
TestMatMulV2VectorXVectorOneDNNOp):
def config(self):
self.x_shape = (1, 3, 1, 10, 10)
self.y_shape = (3, 1, 2, 9, 10)
self.trans_x = False
self.trans_y = True
# BF16 TESTS
def create_bf16_test_class(parent):
class TestMatMulV2Bf16OneDNNOp(parent):
def set_inputs(self, x, y):
self.inputs = {
'X': convert_float_to_uint16(x),
'Y': convert_float_to_uint16(y)
}
def set_dtype_attr(self):
self.attrs['mkldnn_data_type'] = "bfloat16"
def test_check_output(self):
if core.is_compiled_with_cuda():
self.skipTest(
"OneDNN doesn't support bf16 with CUDA, skipping UT" +
self.__class__.__name__)
elif not core.supports_bfloat16():
self.skipTest("Core doesn't support bf16, skipping UT" +
self.__class__.__name__)
else:
self.check_output_with_place(core.CPUPlace())
def test_check_grad(self):
pass
cls_name = "{0}_{1}".format(parent.__name__, "BF16")
TestMatMulV2Bf16OneDNNOp.__name__ = cls_name
globals()[cls_name] = TestMatMulV2Bf16OneDNNOp
create_bf16_test_class(TestMatMulV2VectorXMatrixTransposeYOneDNNOp)
create_bf16_test_class(TestMatMulV2VectorXMatrixOneDNNOp)
create_bf16_test_class(TestMatMulV2MatrixXVectorTransposeXOneDNNOp)
create_bf16_test_class(TestMatMulV2MatrixXVectorOneDNNOp)
create_bf16_test_class(TestMatMulV2MatrixXMatrixOneDNNOp)
create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeYOneDNNOp)
create_bf16_test_class(TestMatMulV2MatrixXMatrix2OneDNNOp)
create_bf16_test_class(TestMatMulV2MatrixXMatrix3OneDNNOp)
create_bf16_test_class(TestMatMulV2MatrixXMatrixTranposeXOneDNNOp2)
create_bf16_test_class(TestMatMulV2MatrixXMatrixTranposeX2OneDNNOp3)
create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeX3OneDNNOp)
create_bf16_test_class(TestMatMulV2MatrixXMatrix4OneDNNOp)
create_bf16_test_class(TestMatMulV2VectorXMatrix5DOneDNNOp)
create_bf16_test_class(TestMatMulV2Matrix3DXVectorOneDNNOp)
create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp)
create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp)
create_bf16_test_class(TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp)
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
@@ -625,6 +625,7 @@ STATIC_MODE_TESTING_LIST = [
'test_lrn_mkldnn_op',
'test_matmul_mkldnn_op',
'test_matmul_bf16_mkldnn_op',
'test_matmul_v2_mkldnn_op',
'test_mul_int8_mkldnn_op',
'test_multi_gru_mkldnn_op',
'test_multi_gru_fuse_pass',