Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
338cbeaa
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
338cbeaa
编写于
1月 04, 2023
作者:
S
Sławomir Siwek
提交者:
GitHub
1月 04, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Revert "Replace matmul with matmul_v2 during oneDNN fuse passes (#49108)" (#49524)
This reverts commit
2c444dfa
.
上级
49f5a97b
变更
16
隐藏空白更改
内联
并排
Showing
16 changed file
with
1304 addition
and
60 deletion
+1304
-60
paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc
...framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc
+0
-10
paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc
...work/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc
+0
-10
paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc
...rk/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc
+0
-9
paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc
...id/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc
+0
-11
paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc
...rk/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc
+0
-9
paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
...kldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
+1
-1
paddle/fluid/operators/matmul_op.cc
paddle/fluid/operators/matmul_op.cc
+30
-2
paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
+941
-0
paddle/fluid/operators/ops_extra_info.h
paddle/fluid/operators/ops_extra_info.h
+1
-1
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py
...s/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py
+1
-1
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py
...est_mkldnn_matmul_elementwise_add_activation_fuse_pass.py
+1
-1
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py
...inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py
+1
-1
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
...ference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
+1
-1
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py
...ence/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py
+10
-1
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py
...ference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py
+1
-1
python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
...dle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
+316
-1
未找到文件。
paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc
浏览文件 @
338cbeaa
...
@@ -77,16 +77,6 @@ void MatmulActivationMkldnnFusePass::FuseMatmulAct(
...
@@ -77,16 +77,6 @@ void MatmulActivationMkldnnFusePass::FuseMatmulAct(
?
"gelu_tanh"
?
"gelu_tanh"
:
"gelu_erf"
;
:
"gelu_erf"
;
}
}
if
(
matmul_type
==
"matmul"
)
{
matmul_op
->
SetType
(
"matmul_v2"
);
matmul_op
->
SetAttr
(
"trans_x"
,
matmul_op
->
GetAttr
(
"transpose_X"
));
matmul_op
->
SetAttr
(
"trans_y"
,
matmul_op
->
GetAttr
(
"transpose_Y"
));
auto
matmul_alpha
=
matmul_op
->
GetAttrIfExists
<
float
>
(
"alpha"
);
if
(
matmul_alpha
!=
1.0
f
)
{
matmul_op
->
SetAttr
(
"alpha"
,
matmul_alpha
);
}
}
matmul_op
->
SetAttr
(
"fuse_activation"
,
act_type
);
matmul_op
->
SetAttr
(
"fuse_activation"
,
act_type
);
matmul_op
->
SetOutput
(
"Out"
,
{
activation_out
->
Name
()});
matmul_op
->
SetOutput
(
"Out"
,
{
activation_out
->
Name
()});
...
...
paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc
浏览文件 @
338cbeaa
...
@@ -65,16 +65,6 @@ void MatmulElementwiseAddMKLDNNFusePass::FuseMatmulElementwiseAdd(
...
@@ -65,16 +65,6 @@ void MatmulElementwiseAddMKLDNNFusePass::FuseMatmulElementwiseAdd(
return
;
return
;
}
}
if
(
matmul_type
==
"matmul"
)
{
matmul
->
Op
()
->
SetType
(
"matmul_v2"
);
matmul
->
Op
()
->
SetAttr
(
"trans_x"
,
matmul
->
Op
()
->
GetAttr
(
"transpose_X"
));
matmul
->
Op
()
->
SetAttr
(
"trans_y"
,
matmul
->
Op
()
->
GetAttr
(
"transpose_Y"
));
auto
matmul_alpha
=
matmul
->
Op
()
->
GetAttrIfExists
<
float
>
(
"alpha"
);
if
(
matmul_alpha
!=
1.0
f
)
{
matmul
->
Op
()
->
SetAttr
(
"alpha"
,
matmul_alpha
);
}
}
matmul
->
Op
()
->
SetInput
(
"ResidualData"
,
{
elementwise_addend
->
Name
()});
matmul
->
Op
()
->
SetInput
(
"ResidualData"
,
{
elementwise_addend
->
Name
()});
matmul
->
Op
()
->
SetOutput
(
"Out"
,
{
elementwise_add_out
->
Name
()});
matmul
->
Op
()
->
SetOutput
(
"Out"
,
{
elementwise_add_out
->
Name
()});
...
...
paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc
浏览文件 @
338cbeaa
...
@@ -84,15 +84,6 @@ void MatmulTransposeReshapeMKLDNNPass::Fuse(
...
@@ -84,15 +84,6 @@ void MatmulTransposeReshapeMKLDNNPass::Fuse(
}
}
OpDesc
*
matmul_desc
=
matmul_op
->
Op
();
OpDesc
*
matmul_desc
=
matmul_op
->
Op
();
if
(
matmul_type
==
"matmul"
)
{
matmul_desc
->
SetType
(
"matmul_v2"
);
matmul_desc
->
SetAttr
(
"trans_x"
,
matmul_desc
->
GetAttr
(
"transpose_X"
));
matmul_desc
->
SetAttr
(
"trans_y"
,
matmul_desc
->
GetAttr
(
"transpose_Y"
));
auto
matmul_alpha
=
matmul_desc
->
GetAttrIfExists
<
float
>
(
"alpha"
);
if
(
matmul_alpha
!=
1.0
f
)
{
matmul_desc
->
SetAttr
(
"alpha"
,
matmul_alpha
);
}
}
matmul_desc
->
SetOutput
(
"Out"
,
{
reshape_out
->
Name
()});
matmul_desc
->
SetOutput
(
"Out"
,
{
reshape_out
->
Name
()});
matmul_desc
->
SetAttr
(
"fused_reshape_Out"
,
reshape_shape
);
matmul_desc
->
SetAttr
(
"fused_reshape_Out"
,
reshape_shape
);
matmul_desc
->
SetAttr
(
"fused_transpose_Out"
,
transpose_axis
);
matmul_desc
->
SetAttr
(
"fused_transpose_Out"
,
transpose_axis
);
...
...
paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc
浏览文件 @
338cbeaa
...
@@ -85,17 +85,6 @@ void FuseOperatorScaleOneDNNPass::FuseScale(Graph *graph,
...
@@ -85,17 +85,6 @@ void FuseOperatorScaleOneDNNPass::FuseScale(Graph *graph,
scale
=
*
(
scale_tensor
->
data
<
float
>
());
scale
=
*
(
scale_tensor
->
data
<
float
>
());
}
}
if
(
op_type
==
"matmul"
)
{
operator_op
->
Op
()
->
SetType
(
"matmul_v2"
);
operator_op
->
Op
()
->
SetAttr
(
"trans_x"
,
operator_op
->
Op
()
->
GetAttr
(
"transpose_X"
));
operator_op
->
Op
()
->
SetAttr
(
"trans_y"
,
operator_op
->
Op
()
->
GetAttr
(
"transpose_Y"
));
auto
matmul_alpha
=
operator_op
->
Op
()
->
GetAttrIfExists
<
float
>
(
"alpha"
);
if
(
matmul_alpha
!=
1.0
f
)
{
operator_op
->
Op
()
->
SetAttr
(
"alpha"
,
matmul_alpha
);
}
}
operator_op
->
Op
()
->
SetAttr
(
"fused_output_scale"
,
scale
);
operator_op
->
Op
()
->
SetAttr
(
"fused_output_scale"
,
scale
);
operator_op
->
Op
()
->
SetOutput
(
"Out"
,
{
scale_out
->
Name
()});
operator_op
->
Op
()
->
SetOutput
(
"Out"
,
{
scale_out
->
Name
()});
...
...
paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc
浏览文件 @
338cbeaa
...
@@ -123,15 +123,6 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse(
...
@@ -123,15 +123,6 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse(
return
;
return
;
}
}
if
(
matmul_type
==
"matmul"
)
{
matmul_desc
->
SetType
(
"matmul_v2"
);
matmul_desc
->
SetAttr
(
"trans_x"
,
matmul_desc
->
GetAttr
(
"transpose_X"
));
matmul_desc
->
SetAttr
(
"trans_y"
,
matmul_desc
->
GetAttr
(
"transpose_Y"
));
auto
matmul_alpha
=
matmul_desc
->
GetAttrIfExists
<
float
>
(
"alpha"
);
if
(
matmul_alpha
!=
1.0
f
)
{
matmul_desc
->
SetAttr
(
"alpha"
,
matmul_alpha
);
}
}
matmul_desc
->
SetInput
(
matmul_input_name
,
{(
reshape_in
)
->
Name
()});
matmul_desc
->
SetInput
(
matmul_input_name
,
{(
reshape_in
)
->
Name
()});
matmul_desc
->
SetAttr
(
"fused_reshape_"
+
matmul_input_name
,
reshape_shape
);
matmul_desc
->
SetAttr
(
"fused_reshape_"
+
matmul_input_name
,
reshape_shape
);
matmul_desc
->
SetAttr
(
"fused_transpose_"
+
matmul_input_name
,
matmul_desc
->
SetAttr
(
"fused_transpose_"
+
matmul_input_name
,
...
...
paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
浏览文件 @
338cbeaa
...
@@ -97,7 +97,7 @@ void TestMain(const std::string& op_name, bool with_xshapes) {
...
@@ -97,7 +97,7 @@ void TestMain(const std::string& op_name, bool with_xshapes) {
int
removed
=
8
;
// 2* reshape, reshape_out, transpose, transpose_out
int
removed
=
8
;
// 2* reshape, reshape_out, transpose, transpose_out
if
(
with_xshapes
)
removed
+=
2
;
// transpose_xshape, reshape_xshape
if
(
with_xshapes
)
removed
+=
2
;
// transpose_xshape, reshape_xshape
EXPECT_EQ
(
total_nodes_before
-
removed
,
total_nodes_after
);
EXPECT_EQ
(
total_nodes_before
-
removed
,
total_nodes_after
);
auto
*
matmul_op_desc
=
GetOpNodes
(
graph
,
"matmul_v2"
).
at
(
0
)
->
Op
();
auto
*
matmul_op_desc
=
GetOpNodes
(
graph
,
op_name
).
at
(
0
)
->
Op
();
auto
check
=
[
&
matmul_op_desc
](
std
::
string
a
)
{
auto
check
=
[
&
matmul_op_desc
](
std
::
string
a
)
{
std
::
string
shape_str
=
"fused_reshape_"
+
a
;
std
::
string
shape_str
=
"fused_reshape_"
+
a
;
...
...
paddle/fluid/operators/matmul_op.cc
浏览文件 @
338cbeaa
...
@@ -345,6 +345,26 @@ class MatMulGradKernel : public framework::OpKernel<T> {
...
@@ -345,6 +345,26 @@ class MatMulGradKernel : public framework::OpKernel<T> {
}
}
};
};
// Returns the dims of input `input_name` for shape inference, with the fused
// reshape + transpose applied when both fusion attributes are non-empty.
//
// ctx        - shape-inference context supplying the attributes and input dims.
// input_name - name of the operator input; also used as the suffix of the
//              "fused_reshape_<name>" and "fused_transpose_<name>" attributes.
//
// Raises InvalidArgument when the input's dim.size() is not greater than 0.
framework::DDim GetDimForInput(const framework::InferShapeContext &ctx,
                               std::string input_name) {
  // Kept as mutable locals (not const) so they can be handed to
  // DDim::reshape/transpose exactly as before.
  auto shape =
      ctx.Attrs().Get<std::vector<int>>("fused_reshape_" + input_name);
  auto axis =
      ctx.Attrs().Get<std::vector<int>>("fused_transpose_" + input_name);
  auto dim = ctx.GetInputDim(input_name);

  // The message has three placeholders: input name, input name, shape.
  // Previously only `dim` was supplied, leaving two placeholders unmatched.
  PADDLE_ENFORCE_GT(dim.size(),
                    0,
                    platform::errors::InvalidArgument(
                        "The Input(%s) has not been initialized properly. The "
                        "shape of Input(%s) = [%s].",
                        input_name,
                        input_name,
                        dim));

  // The fused layout is only applied when both attributes are present.
  if (!shape.empty() && !axis.empty()) {
    dim = dim.reshape(shape).transpose(axis);
  }
  return dim;
}
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
MatMulDoubleGradKernel
:
public
framework
::
OpKernel
<
T
>
{
class
MatMulDoubleGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
...
@@ -559,8 +579,8 @@ class MatMulOp : public framework::OperatorWithKernel {
...
@@ -559,8 +579,8 @@ class MatMulOp : public framework::OperatorWithKernel {
OP_INOUT_CHECK
(
context
->
HasInput
(
"Y"
),
"Input"
,
"Y"
,
"matmul"
);
OP_INOUT_CHECK
(
context
->
HasInput
(
"Y"
),
"Input"
,
"Y"
,
"matmul"
);
OP_INOUT_CHECK
(
context
->
HasOutput
(
"Out"
),
"Output"
,
"Out"
,
"matmul"
);
OP_INOUT_CHECK
(
context
->
HasOutput
(
"Out"
),
"Output"
,
"Out"
,
"matmul"
);
auto
dim_x
=
context
->
GetInputDim
(
"X"
);
auto
dim_x
=
GetDimForInput
(
*
context
,
"X"
);
auto
dim_y
=
context
->
GetInputDim
(
"Y"
);
auto
dim_y
=
GetDimForInput
(
*
context
,
"Y"
);
#ifdef PADDLE_WITH_MKLDNN
#ifdef PADDLE_WITH_MKLDNN
// (jczaja): For NHWC execution output shape needs
// (jczaja): For NHWC execution output shape needs
...
@@ -661,6 +681,14 @@ class MatMulOp : public framework::OperatorWithKernel {
...
@@ -661,6 +681,14 @@ class MatMulOp : public framework::OperatorWithKernel {
framework
::
DDim
ddim_out
=
phi
::
make_ddim
(
dim_out
);
framework
::
DDim
ddim_out
=
phi
::
make_ddim
(
dim_out
);
#ifdef PADDLE_WITH_MKLDNN
auto
shape
=
context
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"fused_reshape_Out"
);
auto
axis
=
context
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"fused_transpose_Out"
);
if
(
!
shape
.
empty
()
&&
!
axis
.
empty
())
{
ddim_out
=
ddim_out
.
transpose
(
axis
).
reshape
(
shape
);
}
#endif
context
->
SetOutputDim
(
"Out"
,
ddim_out
);
context
->
SetOutputDim
(
"Out"
,
ddim_out
);
context
->
ShareLoD
(
"X"
,
"Out"
);
context
->
ShareLoD
(
"X"
,
"Out"
);
}
}
...
...
paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc
→
paddle/fluid/operators/mkldnn/matmul_
v2_
mkldnn_op.cc
浏览文件 @
338cbeaa
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/phi/backends/onednn/onednn_reuse.h"
#include "paddle/phi/backends/onednn/onednn_reuse.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
...
@@ -20,14 +21,13 @@ namespace {
...
@@ -20,14 +21,13 @@ namespace {
using
dnnl
::
memory
;
using
dnnl
::
memory
;
using
paddle
::
framework
::
ExecutionContext
;
using
paddle
::
framework
::
ExecutionContext
;
using
paddle
::
framework
::
GradVarName
;
using
paddle
::
framework
::
GradVarName
;
using
phi
::
DenseTensor
;
using
phi
::
OneDNNContext
;
using
phi
::
OneDNNContext
;
using
phi
::
vectorize
;
using
phi
::
vectorize
;
using
phi
::
funcs
::
OneDNNGetDataType
;
using
phi
::
funcs
::
OneDNNGetDataType
;
// Reshape a rank-3 tensor from P x M x N to (P * M) x N.
// Reshape a rank-3 tensor from P x M x N to (P * M) x N.
// Identity op if the tensor is not of rank 3.
// Identity op if the tensor is not of rank 3.
static
DenseTensor
FoldOuterDims
(
const
DenseTensor
&
input
)
{
static
phi
::
DenseTensor
FoldOuterDims
(
const
phi
::
DenseTensor
&
input
)
{
auto
output
=
input
;
auto
output
=
input
;
auto
in_dims
=
input
.
dims
();
auto
in_dims
=
input
.
dims
();
if
(
in_dims
.
size
()
==
3
)
{
if
(
in_dims
.
size
()
==
3
)
{
...
@@ -40,14 +40,14 @@ static DenseTensor FoldOuterDims(const DenseTensor &input) {
...
@@ -40,14 +40,14 @@ static DenseTensor FoldOuterDims(const DenseTensor &input) {
// (Warning: This requires transposing data and writes into new memory.)
// (Warning: This requires transposing data and writes into new memory.)
// Identity op if the tensor is not of rank 3.
// Identity op if the tensor is not of rank 3.
template
<
typename
T
>
template
<
typename
T
>
static
DenseTensor
FoldFirstAndLastDims
(
const
OneDNNContext
&
dev_ctx
,
static
phi
::
DenseTensor
FoldFirstAndLastDims
(
const
OneDNNContext
&
dev_ctx
,
const
DenseTensor
*
input
)
{
const
phi
::
DenseTensor
*
input
)
{
auto
input_dims
=
vectorize
(
input
->
dims
());
auto
input_dims
=
vectorize
(
input
->
dims
());
if
(
input_dims
.
size
()
!=
3
)
{
if
(
input_dims
.
size
()
!=
3
)
{
return
*
input
;
return
*
input
;
}
}
DenseTensor
output
;
phi
::
DenseTensor
output
;
output
.
Resize
({
input_dims
[
1
],
input_dims
[
0
],
input_dims
[
2
]});
output
.
Resize
({
input_dims
[
1
],
input_dims
[
0
],
input_dims
[
2
]});
auto
output_dims
=
vectorize
(
output
.
dims
());
auto
output_dims
=
vectorize
(
output
.
dims
());
...
@@ -71,15 +71,30 @@ static DenseTensor FoldFirstAndLastDims(const OneDNNContext &dev_ctx,
...
@@ -71,15 +71,30 @@ static DenseTensor FoldFirstAndLastDims(const OneDNNContext &dev_ctx,
return
output
;
return
output
;
}
}
// Returns the dims of the runtime input tensor `input_name`. When both the
// "fused_reshape_<name>" and "fused_transpose_<name>" attributes are
// non-empty, the dims are reshaped and then transposed accordingly;
// otherwise the tensor's own dims are returned unchanged.
phi::DDim GetDimForInput(const ExecutionContext &ctx, std::string input_name) {
  auto fused_shape =
      ctx.Attr<std::vector<int>>("fused_reshape_" + input_name);
  auto fused_axis =
      ctx.Attr<std::vector<int>>("fused_transpose_" + input_name);
  auto tensor_dims = ctx.Input<phi::DenseTensor>(input_name)->dims();

  // Nothing fused on this input: report the dims as they are.
  if (fused_shape.empty() || fused_axis.empty()) {
    return tensor_dims;
  }
  return tensor_dims.reshape(fused_shape).transpose(fused_axis);
}
template
<
typename
XT
,
typename
YT
,
typename
OT
>
template
<
typename
XT
,
typename
YT
,
typename
OT
>
class
MatMulV
1One
DNNHandler
class
MatMulV
2MKL
DNNHandler
:
public
phi
::
funcs
::
OneDNNHandlerNoCachingT
<
XT
,
dnnl
::
matmul
>
{
:
public
phi
::
funcs
::
OneDNNHandlerNoCachingT
<
XT
,
dnnl
::
matmul
>
{
public:
public:
MatMulV
1One
DNNHandler
(
const
ExecutionContext
&
ctx
,
MatMulV
2MKL
DNNHandler
(
const
ExecutionContext
&
ctx
,
const
dnnl
::
engine
engine
,
const
dnnl
::
engine
engine
,
p
hi
::
Place
cpu_place
,
p
addle
::
platform
::
Place
cpu_place
,
const
std
::
vector
<
int64_t
>
&
x_org_dims
,
const
std
::
vector
<
int64_t
>
&
x_org_dims
,
const
std
::
vector
<
int64_t
>
&
y_org_dims
)
bool
trans_x
,
const
std
::
vector
<
int64_t
>
&
y_org_dims
,
bool
trans_y
,
bool
is_output_fused
,
const
std
::
vector
<
int64_t
>
&
x_strides_override
,
const
std
::
vector
<
int64_t
>
&
y_strides_override
)
:
phi
::
funcs
::
OneDNNHandlerNoCachingT
<
XT
,
dnnl
::
matmul
>
(
engine
,
:
phi
::
funcs
::
OneDNNHandlerNoCachingT
<
XT
,
dnnl
::
matmul
>
(
engine
,
cpu_place
)
{
cpu_place
)
{
// M X K * K X N
// M X K * K X N
...
@@ -90,8 +105,6 @@ class MatMulV1OneDNNHandler
...
@@ -90,8 +105,6 @@ class MatMulV1OneDNNHandler
const
int
H_idx
=
x_dims
.
size
()
-
2
;
const
int
H_idx
=
x_dims
.
size
()
-
2
;
const
int
W_idx
=
x_dims
.
size
()
-
1
;
const
int
W_idx
=
x_dims
.
size
()
-
1
;
auto
trans_x
=
ctx
.
Attr
<
bool
>
(
"transpose_X"
);
auto
trans_y
=
ctx
.
Attr
<
bool
>
(
"transpose_Y"
);
if
(
trans_x
)
std
::
swap
(
x_dims
[
H_idx
],
x_dims
[
W_idx
]);
if
(
trans_x
)
std
::
swap
(
x_dims
[
H_idx
],
x_dims
[
W_idx
]);
if
(
trans_y
)
std
::
swap
(
y_dims
[
H_idx
],
y_dims
[
W_idx
]);
if
(
trans_y
)
std
::
swap
(
y_dims
[
H_idx
],
y_dims
[
W_idx
]);
...
@@ -108,16 +121,24 @@ class MatMulV1OneDNNHandler
...
@@ -108,16 +121,24 @@ class MatMulV1OneDNNHandler
y_strides
.
reserve
(
x_dims
.
size
());
y_strides
.
reserve
(
x_dims
.
size
());
out_strides
.
reserve
(
x_dims
.
size
());
out_strides
.
reserve
(
x_dims
.
size
());
if
(
trans_x
)
{
if
(
!
x_strides_override
.
empty
()
)
{
x_strides
.
insert
(
x_strides
.
end
(),
{
M
*
K
,
1
,
M
})
;
x_strides
=
x_strides_override
;
}
else
{
}
else
{
x_strides
.
insert
(
x_strides
.
end
(),
{
M
*
K
,
K
,
1
});
if
(
!
trans_x
)
{
x_strides
.
insert
(
x_strides
.
end
(),
{
M
*
K
,
K
,
1
});
}
else
{
x_strides
.
insert
(
x_strides
.
end
(),
{
M
*
K
,
1
,
M
});
}
}
}
if
(
trans_y
)
{
if
(
!
y_strides_override
.
empty
()
)
{
y_strides
.
insert
(
y_strides
.
end
(),
{
N
*
K
,
1
,
K
})
;
y_strides
=
y_strides_override
;
}
else
{
}
else
{
y_strides
.
insert
(
y_strides
.
end
(),
{
N
*
K
,
N
,
1
});
if
(
!
trans_y
)
{
y_strides
.
insert
(
y_strides
.
end
(),
{
N
*
K
,
N
,
1
});
}
else
{
y_strides
.
insert
(
y_strides
.
end
(),
{
N
*
K
,
1
,
K
});
}
}
}
out_strides
.
insert
(
out_strides
.
end
(),
{
M
*
N
,
N
,
1
});
out_strides
.
insert
(
out_strides
.
end
(),
{
M
*
N
,
N
,
1
});
...
@@ -126,11 +147,20 @@ class MatMulV1OneDNNHandler
...
@@ -126,11 +147,20 @@ class MatMulV1OneDNNHandler
for
(
int
i
=
x_dims
.
size
()
-
4
;
i
>=
0
;
--
i
)
{
for
(
int
i
=
x_dims
.
size
()
-
4
;
i
>=
0
;
--
i
)
{
out_ddims
[
i
]
=
std
::
max
(
x_dims
[
i
],
y_dims
[
i
]);
out_ddims
[
i
]
=
std
::
max
(
x_dims
[
i
],
y_dims
[
i
]);
x_strides
[
i
]
=
x_dims
[
i
+
1
]
*
x_strides
[
i
+
1
];
if
(
x_strides_override
.
empty
())
{
y_strides
[
i
]
=
y_dims
[
i
+
1
]
*
y_strides
[
i
+
1
];
x_strides
[
i
]
=
x_dims
[
i
+
1
]
*
x_strides
[
i
+
1
];
}
if
(
y_strides_override
.
empty
())
{
y_strides
[
i
]
=
y_dims
[
i
+
1
]
*
y_strides
[
i
+
1
];
}
out_strides
[
i
]
=
out_ddims
[
i
+
1
]
*
out_strides
[
i
+
1
];
out_strides
[
i
]
=
out_ddims
[
i
+
1
]
*
out_strides
[
i
+
1
];
}
}
// TODO(jczaja): Why not for int8??
if
(
!
phi
::
funcs
::
is_int8
<
OT
>
()
&&
is_output_fused
)
{
out_strides
=
FakeTransposeStrides
(
out_ddims
);
}
auto
x_md
=
auto
x_md
=
memory
::
desc
(
x_dims
,
phi
::
funcs
::
OneDNNGetDataType
<
XT
>
(),
x_strides
);
memory
::
desc
(
x_dims
,
phi
::
funcs
::
OneDNNGetDataType
<
XT
>
(),
x_strides
);
auto
y_md
=
auto
y_md
=
...
@@ -138,25 +168,164 @@ class MatMulV1OneDNNHandler
...
@@ -138,25 +168,164 @@ class MatMulV1OneDNNHandler
auto
out_md
=
memory
::
desc
(
auto
out_md
=
memory
::
desc
(
out_ddims
,
phi
::
funcs
::
OneDNNGetDataType
<
OT
>
(),
out_strides
);
out_ddims
,
phi
::
funcs
::
OneDNNGetDataType
<
OT
>
(),
out_strides
);
const
dnnl
::
primitive_attr
matmul_attrs
=
CreateMatmulAttrs
(
ctx
);
this
->
AcquireForwardPrimitiveDescriptor
(
matmul_attrs
,
x_md
,
y_md
,
out_md
);
}
// Appends the activation named by the "fuse_activation" attribute to
// `post_ops` as oneDNN eltwise post-op(s).
//
// ctx              - execution context the attributes are read from.
// post_ops         - post-op chain that is extended in place.
// activation_scale - scale passed to every appended eltwise post-op.
//
// Does nothing when "fuse_activation" is absent or empty. "fuse_alpha" and
// "fuse_beta" default to 0.0f when absent. Raises InvalidArgument when the
// activation name is not in the lookup table.
void AppendActivation(const ExecutionContext &ctx,
                      dnnl::post_ops &post_ops,  // NOLINT
                      float activation_scale = 1.0f) {
  if (!ctx.HasAttr("fuse_activation")) return;

  // Read the attribute once and reuse it for the emptiness check and lookup.
  const auto fuse_activation = ctx.Attr<std::string>("fuse_activation");
  if (fuse_activation.empty()) return;

  const auto fuse_alpha =
      ctx.HasAttr("fuse_alpha") ? ctx.Attr<float>("fuse_alpha") : 0.0f;
  const auto fuse_beta =
      ctx.HasAttr("fuse_beta") ? ctx.Attr<float>("fuse_beta") : 0.0f;

  if (fuse_activation == "hard_sigmoid") {
    // hard_sigmoid is emitted as two post-ops: a linear transform with
    // (fuse_alpha, fuse_beta) followed by a clip to [0, 1].
    post_ops.append_eltwise(activation_scale,
                            dnnl::algorithm::eltwise_linear,
                            fuse_alpha,
                            fuse_beta);
    post_ops.append_eltwise(
        activation_scale, dnnl::algorithm::eltwise_clip, 0.0f, 1.0f);
    return;
  }

  // Built once instead of on every call; the contents never change.
  static const std::unordered_map<std::string, dnnl::algorithm>
      activation_map = {
          {"abs", dnnl::algorithm::eltwise_abs},
          {"clip", dnnl::algorithm::eltwise_clip},
          {"gelu", dnnl::algorithm::eltwise_gelu_erf},
          {"gelu_erf", dnnl::algorithm::eltwise_gelu_erf},
          {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh},
          {"hard_swish", dnnl::algorithm::eltwise_hardswish},
          {"leaky_relu", dnnl::algorithm::eltwise_relu},
          {"mish", dnnl::algorithm::eltwise_mish},
          {"relu", dnnl::algorithm::eltwise_relu},
          {"relu6", dnnl::algorithm::eltwise_bounded_relu},
          {"sigmoid", dnnl::algorithm::eltwise_logistic},
          {"sqrt", dnnl::algorithm::eltwise_sqrt},
          {"swish", dnnl::algorithm::eltwise_swish},
          {"tanh", dnnl::algorithm::eltwise_tanh}};

  const auto activation_type = activation_map.find(fuse_activation);

  PADDLE_ENFORCE_NE(
      activation_type,
      activation_map.end(),
      phi::errors::InvalidArgument(
          "Activation '%s' not found in oneDNN algorithms mapper",
          fuse_activation));

  post_ops.append_eltwise(
      activation_scale, activation_type->second, fuse_alpha, fuse_beta);
}
// Computes the scale applied to the matmul output.
//
// Starts from the "alpha" attribute (1.0f when absent). If "Scale_x",
// "Scale_y" and "Scale_out" are all present, the result is additionally
// multiplied by Scale_out / (Scale_x * Scale_y), where Scale_out is replaced
// by 1 when "force_fp32_output" is present and true.
float ComputeOutputScale(const ExecutionContext &ctx) {
  const float base_alpha =
      ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;

  const bool has_all_scales = ctx.HasAttr("Scale_x") &&
                              ctx.HasAttr("Scale_y") &&
                              ctx.HasAttr("Scale_out");
  if (!has_all_scales) {
    return base_alpha;
  }

  const float in_scale_x = ctx.Attr<float>("Scale_x");
  const float in_scale_y = ctx.Attr<float>("Scale_y");

  bool keep_fp32_output = false;
  if (ctx.HasAttr("force_fp32_output")) {
    keep_fp32_output = ctx.Attr<bool>("force_fp32_output");
  }

  // Scale_out is only read when the output is not forced to fp32.
  const float out_scale =
      keep_fp32_output ? 1.f : ctx.Attr<float>("Scale_out");

  return base_alpha * (out_scale / (in_scale_x * in_scale_y));
}
dnnl
::
primitive_attr
CreateMatmulAttrs
(
const
ExecutionContext
&
ctx
)
{
dnnl
::
primitive_attr
matmul_attrs
;
dnnl
::
primitive_attr
matmul_attrs
;
dnnl
::
post_ops
post_operations
;
dnnl
::
post_ops
post_operations
;
float
scale_out
=
ComputeOutputScale
(
ctx
);
float
scale_out
=
ComputeOutputScale
(
ctx
);
if
(
scale_out
!=
1.0
f
)
{
if
(
scale_out
!=
1.0
f
)
{
matmul_attrs
.
set_output_scales
(
0
,
{
scale_out
});
matmul_attrs
.
set_output_scales
(
0
,
{
scale_out
});
}
}
if
(
ctx
.
HasInput
(
"ResidualData"
))
{
auto
*
residual_data
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"ResidualData"
);
auto
residual_data_tz
=
phi
::
vectorize
(
residual_data
->
dims
());
auto
residual_data_md
=
memory
::
desc
(
residual_data_tz
,
phi
::
funcs
::
OneDNNGetDataType
<
OT
>
(),
dnnl
::
memory
::
format_tag
::
any
);
post_operations
.
append_binary
(
dnnl
::
algorithm
::
binary_add
,
residual_data_md
);
if
(
ctx
.
HasAttr
(
"Scale_in_eltwise"
))
{
float
sum_scale
=
scale_out
/
ctx
.
Attr
<
float
>
(
"Scale_in_eltwise"
);
post_operations
.
append_sum
(
sum_scale
);
}
}
AppendActivation
(
ctx
,
post_operations
);
if
(
ctx
.
HasAttr
(
"fused_output_scale"
))
{
float
scale_alpha
=
ctx
.
Attr
<
float
>
(
"fused_output_scale"
);
post_operations
.
append_eltwise
(
1.0
,
dnnl
::
algorithm
::
eltwise_linear
,
scale_alpha
,
0.0
f
);
}
matmul_attrs
.
set_post_ops
(
post_operations
);
matmul_attrs
.
set_post_ops
(
post_operations
);
return
matmul_attrs
;
}
this
->
AcquireForwardPrimitiveDescriptor
(
matmul_attrs
,
x_md
,
y_md
,
out_md
);
// Builds output strides that make the matmul write its result as if it had
// already been transposed with axis order {0, 2, 1, 3}.
//
// matmul_out_dims - dims of the matmul output; indices 0..3 are read, so the
//                   caller must pass at least four entries.
//
// Returns a 4-element stride vector indexed by the original (untransposed)
// axis.
std::vector<int64_t> FakeTransposeStrides(
    const std::vector<int64_t> &matmul_out_dims) const {
  // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and
  // transpose axis are: {0, 2, 1, 3}
  constexpr int kNdims = 4;
  constexpr int kTransposeAxis[kNdims] = {0, 2, 1, 3};

  std::vector<int64_t> fake_strides(kNdims);

  // Accumulate in int64_t: the dims are int64_t, and a plain int running
  // product would narrow and could overflow for large tensors.
  int64_t total_stride = 1;
  for (int i = kNdims - 1; i >= 0; --i) {
    const int axis = kTransposeAxis[i];
    fake_strides[axis] = total_stride;
    total_stride *= matmul_out_dims[axis];
  }

  return fake_strides;
}
}
MatMulV1OneDNNHandler
(
const
dnnl
::
engine
engine
,
std
::
shared_ptr
<
memory
>
AcquireWeightsMemory
(
const
phi
::
DenseTensor
*
input
)
{
phi
::
Place
cpu_place
,
const
YT
*
input_data
=
input
->
data
<
YT
>
();
DenseTensor
*
x
,
return
this
->
AcquireMemoryFromPrimitive
(
bool
trans_x
,
this
->
fwd_pd_
->
weights_desc
(),
DenseTensor
*
y
,
phi
::
funcs
::
to_void_cast
<
YT
>
(
input_data
));
bool
trans_y
,
}
DenseTensor
*
out
,
float
scale
)
// Allocates the output tensor's buffer on this handler's place and wraps it
// in a oneDNN memory object described by the primitive's dst descriptor.
std::shared_ptr<dnnl::memory> AcquireDstMemory(phi::DenseTensor *output) {
  // The base AcquireDstMemory cannot be used here because it makes an
  // allocation request based on the DST memory primitive size. That is fine
  // in general, but in MatMul the primitive covers only one batch of data and
  // the pointer is shifted for every new batch. Hence the phi::DenseTensor is
  // bigger than the dst memory primitive, so the request would ask for less
  // memory than is there and trigger an assertion. Since there is no 'any'
  // format here, the default size of phi::DenseTensor as computed in
  // ComputeInferShape can be kept.
  OT *dst_buffer = output->mutable_data<OT>(this->place_);
  const auto dst_md = this->fwd_pd_->dst_desc();
  return this->AcquireMemoryFromPrimitive(dst_md, dst_buffer);
}
};
template
<
typename
XT
,
typename
YT
,
typename
OT
>
class
MatMulMKLDNNHandler
:
public
phi
::
funcs
::
OneDNNHandlerNoCachingT
<
XT
,
dnnl
::
matmul
>
{
public:
MatMulMKLDNNHandler
(
const
dnnl
::
engine
engine
,
paddle
::
platform
::
Place
cpu_place
,
phi
::
DenseTensor
*
x
,
bool
trans_x
,
phi
::
DenseTensor
*
y
,
bool
trans_y
,
phi
::
DenseTensor
*
out
,
float
scale
)
:
phi
::
funcs
::
OneDNNHandlerNoCachingT
<
XT
,
dnnl
::
matmul
>
(
engine
,
:
phi
::
funcs
::
OneDNNHandlerNoCachingT
<
XT
,
dnnl
::
matmul
>
(
engine
,
cpu_place
)
{
cpu_place
)
{
auto
mat_dim_x
=
phi
::
funcs
::
CreateMatrixDescriptor
(
x
->
dims
(),
0
,
trans_x
);
auto
mat_dim_x
=
phi
::
funcs
::
CreateMatrixDescriptor
(
x
->
dims
(),
0
,
trans_x
);
...
@@ -175,10 +344,10 @@ class MatMulV1OneDNNHandler
...
@@ -175,10 +344,10 @@ class MatMulV1OneDNNHandler
memory
::
dims
out_dims
=
{
out_bs
,
M
,
N
};
memory
::
dims
out_dims
=
{
out_bs
,
M
,
N
};
memory
::
dims
x_strides
=
memory
::
dims
x_strides
=
trans_x
?
memory
::
dims
{
M
*
K
,
1
,
M
}
:
memory
::
dims
{
M
*
K
,
K
,
1
};
!
trans_x
?
memory
::
dims
{
M
*
K
,
K
,
1
}
:
memory
::
dims
{
M
*
K
,
1
,
M
};
memory
::
dims
y_strides
=
memory
::
dims
y_strides
=
trans_y
?
memory
::
dims
{
N
*
K
,
1
,
K
}
:
memory
::
dims
{
N
*
K
,
N
,
1
};
!
trans_y
?
memory
::
dims
{
N
*
K
,
N
,
1
}
:
memory
::
dims
{
N
*
K
,
1
,
K
};
memory
::
dims
out_strides
=
memory
::
dims
{
M
*
N
,
N
,
1
};
memory
::
dims
out_strides
=
memory
::
dims
{
M
*
N
,
N
,
1
};
auto
x_md
=
memory
::
desc
(
x_dims
,
OneDNNGetDataType
<
XT
>
(),
x_strides
);
auto
x_md
=
memory
::
desc
(
x_dims
,
OneDNNGetDataType
<
XT
>
(),
x_strides
);
...
@@ -191,41 +360,65 @@ class MatMulV1OneDNNHandler
...
@@ -191,41 +360,65 @@ class MatMulV1OneDNNHandler
this
->
AcquireForwardPrimitiveDescriptor
(
attrs
,
x_md
,
y_md
,
out_md
);
this
->
AcquireForwardPrimitiveDescriptor
(
attrs
,
x_md
,
y_md
,
out_md
);
}
}
float
ComputeOutputScale
(
const
ExecutionContext
&
ctx
)
{
std
::
shared_ptr
<
memory
>
AcquireWeightsMemory
(
const
phi
::
DenseTensor
*
input
)
{
float
alpha
=
ctx
.
Attr
<
float
>
(
"alpha"
);
if
(
ctx
.
HasAttr
(
"Scale_x"
)
&&
ctx
.
HasAttr
(
"Scale_y"
)
&&
ctx
.
HasAttr
(
"Scale_out"
))
{
float
scale_x
=
ctx
.
Attr
<
float
>
(
"Scale_x"
);
float
scale_y
=
ctx
.
Attr
<
float
>
(
"Scale_y"
);
bool
force_fp32_out
=
ctx
.
HasAttr
(
"force_fp32_output"
)
?
ctx
.
Attr
<
bool
>
(
"force_fp32_output"
)
:
false
;
float
scale_out
=
force_fp32_out
?
1.
f
:
ctx
.
Attr
<
float
>
(
"Scale_out"
);
alpha
*=
scale_out
/
(
scale_x
*
scale_y
);
}
return
alpha
;
}
std
::
shared_ptr
<
memory
>
AcquireWeightsMemory
(
const
DenseTensor
*
input
)
{
const
YT
*
input_data
=
input
->
data
<
YT
>
();
const
YT
*
input_data
=
input
->
data
<
YT
>
();
return
this
->
AcquireMemoryFromPrimitive
(
return
this
->
AcquireMemoryFromPrimitive
(
this
->
fwd_pd_
->
weights_desc
(),
this
->
fwd_pd_
->
weights_desc
(),
phi
::
funcs
::
to_void_cast
<
YT
>
(
input_data
));
phi
::
funcs
::
to_void_cast
<
YT
>
(
input_data
));
}
}
std
::
shared_ptr
<
memory
>
AcquireDstMemory
(
DenseTensor
*
output
)
{
public:
// Runs the matmul primitive once per batch, advancing the src, weights and
// dst data handles by the stored byte offsets after every execution, then
// waits on the stream and records the dst memory descriptor (reshaped to the
// output tensor's dims) on `out`.
//
// x   - tensor bound to DNNL_ARG_SRC.
// y   - tensor bound to DNNL_ARG_WEIGHTS.
// out - tensor bound to DNNL_ARG_DST; its buffer is acquired here.
void Execute(const phi::DenseTensor *x,
             const phi::DenseTensor *y,
             phi::DenseTensor *out) {
  const auto src_mem = this->AcquireSrcMemory(x);
  const auto wei_mem = this->AcquireWeightsMemory(y);
  const auto dst_mem = this->AcquireDstMemory(out);

  auto primitive = this->AcquireForwardPrimitive();

  std::unordered_map<int, dnnl::memory> exec_args = {
      {DNNL_ARG_SRC, *src_mem},
      {DNNL_ARG_WEIGHTS, *wei_mem},
      {DNNL_ARG_DST, *dst_mem}};

  auto &stream = OneDNNContext::tls().get_stream();

  // Simulate batch matmul by processing in loop
  char *src_cursor = static_cast<char *>(src_mem->get_data_handle());
  char *wei_cursor = static_cast<char *>(wei_mem->get_data_handle());
  char *dst_cursor = static_cast<char *>(dst_mem->get_data_handle());

  // Snapshot the per-batch byte offsets before entering the loop.
  const auto src_step = x_offset_;
  const auto wei_step = y_offset_;
  const auto dst_step = out_offset_;

  for (uint16_t batch = 0; batch < batch_size_; ++batch) {
    src_mem->set_data_handle(src_cursor);
    wei_mem->set_data_handle(wei_cursor);
    dst_mem->set_data_handle(dst_cursor);

    primitive->execute(stream, exec_args);

    src_cursor += src_step;
    wei_cursor += wei_step;
    dst_cursor += dst_step;
  }
  stream.wait();

  out->set_mem_desc(dst_mem->get_desc().reshape(out->dims()));
}
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireDstMemory
(
phi
::
DenseTensor
*
output
)
{
// We cannot use base AcquireDstMemory as it makes an allocation request
// We cannot use base AcquireDstMemory as it makes an allocation request
// base on DST memory primitive size. This is fine in general, but in MatMul
// base on DST memory primitive size. This is fine in general, but in MatMul
// we have primitive that covers only one batch of Data and then shift
// we have primitive that covers only one batch of Data and then shift
// pointer for every new batch. Hence DenseTensor size is bigger that
// pointer for every new batch. Hence
phi::
DenseTensor size is bigger that
// dst memory primitive size. So would we request less memory that is there
// dst memory primitive size. So would we request less memory that is there
// and it triggers an assertion. So as there is no 'any' format here we can
// and it triggers an assertion. So as there is no 'any' format here we can
// leave default size of DenseTensor as computed in ComputeInferShape
// leave default size of
phi::
DenseTensor as computed in ComputeInferShape
OT
*
ptr
=
output
->
mutable_data
<
OT
>
(
this
->
place_
);
OT
*
ptr
=
output
->
mutable_data
<
OT
>
(
this
->
place_
);
return
this
->
AcquireMemoryFromPrimitive
(
this
->
fwd_pd_
->
dst_desc
(),
ptr
);
return
this
->
AcquireMemoryFromPrimitive
(
this
->
fwd_pd_
->
dst_desc
(),
ptr
);
}
}
private:
private:
uint32_t
x_offset_
;
uint32_t
y_offset_
;
uint32_t
out_offset_
;
uint16_t
batch_size_
;
uint16_t
batch_size_
;
};
};
...
@@ -236,7 +429,7 @@ class MatMulV1OneDNNHandler
...
@@ -236,7 +429,7 @@ class MatMulV1OneDNNHandler
* If transposed, `H,W` will be swapped.
* If transposed, `H,W` will be swapped.
*/
*/
static
void
ReshapeTensorToMatrixSequence
(
static
void
ReshapeTensorToMatrixSequence
(
DenseTensor
*
x
,
const
phi
::
funcs
::
MatDescriptor
&
descriptor
)
{
phi
::
DenseTensor
*
x
,
const
phi
::
funcs
::
MatDescriptor
&
descriptor
)
{
int64_t
h
,
w
;
int64_t
h
,
w
;
h
=
descriptor
.
height_
;
h
=
descriptor
.
height_
;
w
=
descriptor
.
width_
;
w
=
descriptor
.
width_
;
...
@@ -264,9 +457,9 @@ static void ReshapeTensorToMatrixSequence(
...
@@ -264,9 +457,9 @@ static void ReshapeTensorToMatrixSequence(
* If any of `X` and `Y` has batch size BatchSize, the out will have the
* If any of `X` and `Y` has batch size BatchSize, the out will have the
* BatchSize.
* BatchSize.
*/
*/
static
void
ReshapeXYOutToMatrixSequence
(
DenseTensor
*
x
,
static
void
ReshapeXYOutToMatrixSequence
(
phi
::
DenseTensor
*
x
,
DenseTensor
*
y
,
phi
::
DenseTensor
*
y
,
DenseTensor
*
out
,
phi
::
DenseTensor
*
out
,
bool
trans_x
,
bool
trans_x
,
bool
trans_y
)
{
bool
trans_y
)
{
auto
x_dim
=
phi
::
funcs
::
RowMatrixDimsFromVector
(
x
->
dims
());
auto
x_dim
=
phi
::
funcs
::
RowMatrixDimsFromVector
(
x
->
dims
());
...
@@ -293,22 +486,22 @@ std::vector<int64_t> Transpose(const std::vector<int64_t> &x,
...
@@ -293,22 +486,22 @@ std::vector<int64_t> Transpose(const std::vector<int64_t> &x,
auto
axis_set
=
std
::
set
<
int
>
(
axis
.
begin
(),
axis
.
end
());
auto
axis_set
=
std
::
set
<
int
>
(
axis
.
begin
(),
axis
.
end
());
PADDLE_ENFORCE_EQ
(
axis_set
.
size
(),
PADDLE_ENFORCE_EQ
(
axis_set
.
size
(),
axis_size
,
axis_size
,
p
hi
::
errors
::
InvalidArgument
(
p
addle
::
platform
::
errors
::
InvalidArgument
(
"In an axis array, elements must be unique."
));
"In an axis array, elements must be unique."
));
PADDLE_ENFORCE_EQ
(
PADDLE_ENFORCE_EQ
(
in_rank
,
in_rank
,
axis_size
,
axis_size
,
paddle
::
platform
::
errors
::
InvalidArgument
(
phi
::
errors
::
InvalidArgument
(
"The input dimension's size "
"The input dimension's size "
"should be equal to the axis's size. "
"should be equal to the axis's size. "
"But received dimension is %d, "
"But received dimension is %d, "
"axis's size is %d"
,
"axis's size is %d"
,
in_rank
,
in_rank
,
axis_size
));
axis_size
));
PADDLE_ENFORCE_LT
(
*
std
::
max_element
(
axis
.
begin
(),
axis
.
end
()),
PADDLE_ENFORCE_LT
(
*
std
::
max_element
(
axis
.
begin
(),
axis
.
end
()),
axis_size
,
axis_size
,
p
hi
::
errors
::
InvalidArgument
(
p
addle
::
platform
::
errors
::
InvalidArgument
(
"Axis values must be ranging from 0 to (dims - 1)."
));
"Axis values must be ranging from 0 to (dims - 1)."
));
std
::
vector
<
int64_t
>
new_x
(
x
.
size
());
std
::
vector
<
int64_t
>
new_x
(
x
.
size
());
...
@@ -318,16 +511,73 @@ std::vector<int64_t> Transpose(const std::vector<int64_t> &x,
...
@@ -318,16 +511,73 @@ std::vector<int64_t> Transpose(const std::vector<int64_t> &x,
return
new_x
;
return
new_x
;
}
}
std
::
vector
<
int64_t
>
GetInputStrides
(
const
ExecutionContext
&
ctx
,
const
std
::
string
input_name
)
{
auto
shape
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"fused_reshape_"
+
input_name
);
auto
axis
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"fused_transpose_"
+
input_name
);
auto
input_dims
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
input_name
)
->
dims
();
auto
new_dims
=
input_dims
;
if
(
!
shape
.
empty
()
&&
!
axis
.
empty
())
{
new_dims
=
input_dims
.
reshape
(
shape
).
transpose
(
axis
);
}
auto
&
MatrixDimsFromVector
=
input_name
==
"X"
?
phi
::
funcs
::
RowMatrixDimsFromVector
:
phi
::
funcs
::
ColumnMatrixDimsFromVector
;
phi
::
funcs
::
MatDescriptor
mat_dim
=
phi
::
funcs
::
CreateMatrixDescriptor
(
MatrixDimsFromVector
(
new_dims
),
0
,
ctx
.
HasAttr
(
"trans_x"
)
?
ctx
.
Attr
<
bool
>
(
std
::
string
(
"trans_"
)
+
static_cast
<
char
>
(
std
::
tolower
(
input_name
[
0
])))
:
ctx
.
Attr
<
bool
>
(
std
::
string
(
"transpose_"
)
+
input_name
[
0
]));
std
::
vector
<
int64_t
>
strides
;
if
(
!
shape
.
empty
())
{
auto
shape2
=
input_dims
.
reshape
(
shape
);
strides
.
push_back
(
1
);
for
(
auto
i
=
shape2
.
size
()
-
1
;
i
>
0
;
--
i
)
{
strides
.
insert
(
strides
.
begin
(),
strides
.
front
()
*
static_cast
<
int64_t
>
(
shape2
[
i
]));
}
strides
=
Transpose
(
strides
,
axis
);
if
(
shape
.
size
()
==
2
)
strides
.
insert
(
strides
.
begin
(),
static_cast
<
int64_t
>
(
shape
[
0
]
*
shape
[
1
]));
mat_dim
.
stride_
=
strides
[
0
];
if
(
mat_dim
.
trans_
)
std
::
swap
(
*
strides
.
rbegin
(),
*
(
++
strides
.
rbegin
()));
}
return
strides
;
}
bool
IsOutputFused
(
const
ExecutionContext
&
ctx
)
{
auto
&
fused_reshape_Out
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"fused_reshape_Out"
);
auto
&
fused_transpose_Out
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"fused_transpose_Out"
);
return
!
fused_reshape_Out
.
empty
()
&&
!
fused_transpose_Out
.
empty
();
}
template
<
typename
T
,
typename
T_out
>
template
<
typename
T
,
typename
T_out
>
void
ExecuteMatMul
(
const
ExecutionContext
&
ctx
,
void
ExecuteMatMulV2
(
const
ExecutionContext
&
ctx
,
const
DenseTensor
*
x
,
const
dnnl
::
engine
onednn_engine
,
const
std
::
vector
<
int64_t
>
&
x_dims
,
const
phi
::
DenseTensor
*
x
,
const
DenseTensor
*
y
,
const
std
::
vector
<
int64_t
>
&
x_dims
,
const
std
::
vector
<
int64_t
>
&
y_dims
,
bool
trans_x
,
DenseTensor
*
out
)
{
const
phi
::
DenseTensor
*
y
,
const
auto
&
dev_ctx
=
ctx
.
template
device_context
<
OneDNNContext
>();
const
std
::
vector
<
int64_t
>
&
y_dims
,
MatMulV1OneDNNHandler
<
T
,
T
,
T_out
>
handler
(
bool
trans_y
,
ctx
,
dev_ctx
.
GetEngine
(),
ctx
.
GetPlace
(),
x_dims
,
y_dims
);
phi
::
DenseTensor
*
out
)
{
std
::
vector
<
int64_t
>
x_strides_override
=
GetInputStrides
(
ctx
,
"X"
);
std
::
vector
<
int64_t
>
y_strides_override
=
GetInputStrides
(
ctx
,
"Y"
);
MatMulV2MKLDNNHandler
<
T
,
T
,
T_out
>
handler
(
ctx
,
onednn_engine
,
ctx
.
GetPlace
(),
x_dims
,
trans_x
,
y_dims
,
trans_y
,
IsOutputFused
(
ctx
),
x_strides_override
,
y_strides_override
);
const
auto
src_memory_p
=
handler
.
AcquireSrcMemory
(
x
);
const
auto
src_memory_p
=
handler
.
AcquireSrcMemory
(
x
);
const
auto
weights_memory_p
=
handler
.
AcquireWeightsMemory
(
y
);
const
auto
weights_memory_p
=
handler
.
AcquireWeightsMemory
(
y
);
...
@@ -340,23 +590,38 @@ void ExecuteMatMul(const ExecutionContext &ctx,
...
@@ -340,23 +590,38 @@ void ExecuteMatMul(const ExecutionContext &ctx,
{
DNNL_ARG_WEIGHTS
,
*
weights_memory_p
},
{
DNNL_ARG_WEIGHTS
,
*
weights_memory_p
},
{
DNNL_ARG_DST
,
*
dst_memory_p
}};
{
DNNL_ARG_DST
,
*
dst_memory_p
}};
if
(
ctx
.
HasInput
(
"ResidualData"
))
{
auto
*
residual_data
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"ResidualData"
);
const
auto
residual_data_memory_p
=
handler
.
AcquireSrcMemory
(
residual_data
);
matmul_args
.
insert
({
DNNL_ARG_ATTR_MULTIPLE_POST_OP
(
0
)
|
DNNL_ARG_SRC_1
,
*
residual_data_memory_p
});
}
auto
&
astream
=
OneDNNContext
::
tls
().
get_stream
();
auto
&
astream
=
OneDNNContext
::
tls
().
get_stream
();
matmul_p
->
execute
(
astream
,
matmul_args
);
matmul_p
->
execute
(
astream
,
matmul_args
);
astream
.
wait
();
astream
.
wait
();
out
->
set_mem_desc
(
// TODO(jczaja): Explain why int8 format of dst is ABCD and do not need
dst_memory_p
->
get_desc
().
reshape
(
vectorize
<
int64_t
>
(
out
->
dims
())));
// permute
if
(
IsOutputFused
(
ctx
)
&&
!
phi
::
funcs
::
is_int8
<
T_out
>
())
{
auto
axis
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"fused_transpose_Out"
);
auto
permuted_md
=
dst_memory_p
->
get_desc
().
permute_axes
(
axis
);
out
->
set_mem_desc
(
permuted_md
.
reshape
(
vectorize
<
int64_t
>
(
out
->
dims
())));
}
else
{
out
->
set_mem_desc
(
dst_memory_p
->
get_desc
().
reshape
(
vectorize
<
int64_t
>
(
out
->
dims
())));
}
}
}
template
<
typename
T
>
template
<
typename
T
>
class
MatMul
V1One
DNNKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
class
MatMul
MKL
DNNKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
ExecutionContext
&
ctx
)
const
override
{
if
(
ctx
.
HasAttr
(
"head_number"
))
{
if
(
ctx
.
HasAttr
(
"head_number"
))
{
PADDLE_ENFORCE_EQ
(
PADDLE_ENFORCE_EQ
(
ctx
.
Attr
<
int
>
(
"head_number"
),
ctx
.
Attr
<
int
>
(
"head_number"
),
1
,
1
,
p
hi
::
errors
::
Unimplemented
(
p
addle
::
platform
::
errors
::
Unimplemented
(
"oneDNN matmul doesn't support multiple heads. Expected "
"oneDNN matmul doesn't support multiple heads. Expected "
"head_number=1. But received `head_number` is %d"
,
"head_number=1. But received `head_number` is %d"
,
ctx
.
Attr
<
int
>
(
"head_number"
)));
ctx
.
Attr
<
int
>
(
"head_number"
)));
...
@@ -368,12 +633,19 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -368,12 +633,19 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
:
false
;
:
false
;
constexpr
bool
fuse_relu
=
false
;
// TODO(intel): Enable eltwise fuses
constexpr
bool
fuse_relu
=
false
;
// TODO(intel): Enable eltwise fuses
auto
*
x
=
ctx
.
Input
<
DenseTensor
>
(
"X"
);
const
auto
&
dev_ctx
=
ctx
.
template
device_context
<
OneDNNContext
>();
auto
*
y
=
ctx
.
Input
<
DenseTensor
>
(
"Y"
);
const
auto
&
onednn_engine
=
dev_ctx
.
GetEngine
();
auto
*
out
=
ctx
.
Output
<
DenseTensor
>
(
"Out"
);
auto
*
x
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"X"
);
auto
*
y
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Y"
);
auto
*
out
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
"Out"
);
bool
trans_x
=
ctx
.
HasAttr
(
"trans_x"
)
?
ctx
.
Attr
<
bool
>
(
"trans_x"
)
:
ctx
.
Attr
<
bool
>
(
"transpose_X"
);
bool
trans_y
=
ctx
.
HasAttr
(
"trans_y"
)
?
ctx
.
Attr
<
bool
>
(
"trans_y"
)
:
ctx
.
Attr
<
bool
>
(
"transpose_Y"
);
auto
x_dims
=
vectorize
(
x
->
dims
(
));
auto
x_dims
=
vectorize
(
GetDimForInput
(
ctx
,
"X"
));
auto
y_dims
=
vectorize
(
y
->
dims
(
));
auto
y_dims
=
vectorize
(
GetDimForInput
(
ctx
,
"Y"
));
int
ndims
=
std
::
max
(
x_dims
.
size
(),
y_dims
.
size
());
int
ndims
=
std
::
max
(
x_dims
.
size
(),
y_dims
.
size
());
ndims
=
std
::
max
(
ndims
,
3
);
ndims
=
std
::
max
(
ndims
,
3
);
...
@@ -381,26 +653,58 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -381,26 +653,58 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
std
::
vector
<
int64_t
>
x_bd_dims
(
ndims
,
1
);
std
::
vector
<
int64_t
>
x_bd_dims
(
ndims
,
1
);
std
::
vector
<
int64_t
>
y_bd_dims
(
ndims
,
1
);
std
::
vector
<
int64_t
>
y_bd_dims
(
ndims
,
1
);
CalculateMatrixDims
(
x_dims
,
y_dims
,
&
x_bd_dims
,
&
y_bd_dims
,
out
);
CalculateMatrixDims
(
ctx
,
x_dims
,
y_dims
,
&
x_bd_dims
,
&
y_bd_dims
,
out
);
if
(
force_fp32_output
||
((
!
is_int8
)
&&
(
!
is_bfloat16
)))
{
if
(
force_fp32_output
||
((
!
is_int8
)
&&
(
!
is_bfloat16
)))
{
ExecuteMatMul
<
T
,
float
>
(
ctx
,
x
,
x_bd_dims
,
y
,
y_bd_dims
,
out
);
ExecuteMatMulV2
<
T
,
float
>
(
ctx
,
onednn_engine
,
x
,
x_bd_dims
,
trans_x
,
y
,
y_bd_dims
,
trans_y
,
out
);
}
else
if
(
is_bfloat16
)
{
}
else
if
(
is_bfloat16
)
{
ExecuteMatMul
<
T
,
phi
::
dtype
::
bfloat16
>
(
ExecuteMatMulV2
<
T
,
paddle
::
platform
::
bfloat16
>
(
ctx
,
ctx
,
x
,
x_bd_dims
,
y
,
y_bd_dims
,
out
);
onednn_engine
,
x
,
x_bd_dims
,
trans_x
,
y
,
y_bd_dims
,
trans_y
,
out
);
}
else
if
(
fuse_relu
)
{
}
else
if
(
fuse_relu
)
{
ExecuteMatMul
<
T
,
uint8_t
>
(
ctx
,
x
,
x_bd_dims
,
y
,
y_bd_dims
,
out
);
ExecuteMatMulV2
<
T
,
uint8_t
>
(
ctx
,
onednn_engine
,
x
,
x_bd_dims
,
trans_x
,
y
,
y_bd_dims
,
trans_y
,
out
);
}
else
{
}
else
{
ExecuteMatMul
<
T
,
int8_t
>
(
ctx
,
x
,
x_bd_dims
,
y
,
y_bd_dims
,
out
);
ExecuteMatMulV2
<
T
,
int8_t
>
(
ctx
,
onednn_engine
,
x
,
x_bd_dims
,
trans_x
,
y
,
y_bd_dims
,
trans_y
,
out
);
}
}
}
}
private:
private:
void
CalculateMatrixDims
(
const
std
::
vector
<
int64_t
>
&
x_dims
,
void
CalculateMatrixDims
(
const
ExecutionContext
&
ctx
,
const
std
::
vector
<
int64_t
>
&
x_dims
,
const
std
::
vector
<
int64_t
>
&
y_dims
,
const
std
::
vector
<
int64_t
>
&
y_dims
,
std
::
vector
<
int64_t
>
*
x_bd_dims
,
std
::
vector
<
int64_t
>
*
x_bd_dims
,
std
::
vector
<
int64_t
>
*
y_bd_dims
,
std
::
vector
<
int64_t
>
*
y_bd_dims
,
DenseTensor
*
out
)
const
{
phi
::
DenseTensor
*
out
)
const
{
if
(
x_dims
.
size
()
==
1
)
{
if
(
x_dims
.
size
()
==
1
)
{
(
*
x_bd_dims
)[(
*
x_bd_dims
).
size
()
-
1
]
=
x_dims
[
0
];
(
*
x_bd_dims
)[(
*
x_bd_dims
).
size
()
-
1
]
=
x_dims
[
0
];
}
else
if
(
x_dims
.
size
()
==
2
)
{
}
else
if
(
x_dims
.
size
()
==
2
)
{
...
@@ -422,15 +726,15 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -422,15 +726,15 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
}
}
}
}
if
(
x_dims
.
size
()
>
2
&&
y_dims
.
size
()
>
2
)
{
if
(
!
IsOutputFused
(
ctx
)
&&
x_dims
.
size
()
>
2
&&
y_dims
.
size
()
>
2
)
{
auto
out_dims
=
vectorize
(
out
->
dims
());
auto
out_dims
=
vectorize
(
out
->
dims
());
for
(
size_t
i
=
0
;
i
<
(
*
x_bd_dims
).
size
()
-
2
;
++
i
)
{
for
(
size_t
i
=
0
;
i
<
(
*
x_bd_dims
).
size
()
-
2
;
++
i
)
{
PADDLE_ENFORCE_EQ
(
PADDLE_ENFORCE_EQ
(
(
*
x_bd_dims
)[
i
]
==
(
*
y_bd_dims
)[
i
]
||
(
*
x_bd_dims
)[
i
]
==
1
||
(
*
x_bd_dims
)[
i
]
==
(
*
y_bd_dims
)[
i
]
||
(
*
x_bd_dims
)[
i
]
==
1
||
(
*
y_bd_dims
)[
i
]
==
1
,
(
*
y_bd_dims
)[
i
]
==
1
,
true
,
true
,
p
hi
::
errors
::
InvalidArgument
(
p
addle
::
platform
::
errors
::
InvalidArgument
(
"DenseTensor dimensions are incorrect for broadcasting."
"
phi::
DenseTensor dimensions are incorrect for broadcasting."
"Dimensions in X and Y must be same or equal to 1, but "
"Dimensions in X and Y must be same or equal to 1, but "
"received x_dim[%d]=%d and y_dims[%d]= %d"
,
"received x_dim[%d]=%d and y_dims[%d]= %d"
,
i
,
i
,
...
@@ -445,14 +749,14 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -445,14 +749,14 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
};
};
template
<
typename
T
>
template
<
typename
T
>
class
MatMul
V1GradOne
DNNKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
class
MatMul
GradMKL
DNNKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
ExecutionContext
&
ctx
)
const
override
{
if
(
ctx
.
HasAttr
(
"head_number"
))
{
if
(
ctx
.
HasAttr
(
"head_number"
))
{
PADDLE_ENFORCE_EQ
(
PADDLE_ENFORCE_EQ
(
ctx
.
Attr
<
int
>
(
"head_number"
),
ctx
.
Attr
<
int
>
(
"head_number"
),
1
,
1
,
p
hi
::
errors
::
Unimplemented
(
p
addle
::
platform
::
errors
::
Unimplemented
(
"oneDNN matmul doesn't support multiple heads. Expected "
"oneDNN matmul doesn't support multiple heads. Expected "
"head_number=1. But received `head_number` is %d"
,
"head_number=1. But received `head_number` is %d"
,
ctx
.
Attr
<
int
>
(
"head_number"
)));
ctx
.
Attr
<
int
>
(
"head_number"
)));
...
@@ -461,18 +765,25 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -461,18 +765,25 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
const
auto
&
dev_ctx
=
ctx
.
template
device_context
<
OneDNNContext
>();
const
auto
&
dev_ctx
=
ctx
.
template
device_context
<
OneDNNContext
>();
const
auto
&
onednn_engine
=
dev_ctx
.
GetEngine
();
const
auto
&
onednn_engine
=
dev_ctx
.
GetEngine
();
auto
x
=
*
ctx
.
Input
<
DenseTensor
>
(
"X"
);
auto
x
=
*
ctx
.
Input
<
phi
::
DenseTensor
>
(
"X"
);
auto
y
=
*
ctx
.
Input
<
DenseTensor
>
(
"Y"
);
auto
y
=
*
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Y"
);
auto
dout
=
*
ctx
.
Input
<
DenseTensor
>
(
paddle
::
framework
::
GradVarName
(
"Out"
));
auto
dout
=
auto
*
dx
=
ctx
.
Output
<
DenseTensor
>
(
paddle
::
framework
::
GradVarName
(
"X"
));
*
ctx
.
Input
<
phi
::
DenseTensor
>
(
paddle
::
framework
::
GradVarName
(
"Out"
));
auto
*
dy
=
ctx
.
Output
<
DenseTensor
>
(
paddle
::
framework
::
GradVarName
(
"Y"
));
auto
*
dx
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
paddle
::
framework
::
GradVarName
(
"X"
));
bool
transpose_x
=
ctx
.
Attr
<
bool
>
(
"transpose_X"
);
auto
*
dy
=
bool
transpose_y
=
ctx
.
Attr
<
bool
>
(
"transpose_Y"
);
ctx
.
Output
<
phi
::
DenseTensor
>
(
paddle
::
framework
::
GradVarName
(
"Y"
));
bool
transpose_x
=
ctx
.
HasAttr
(
"transpose_X"
)
?
ctx
.
Attr
<
bool
>
(
"transpose_X"
)
:
ctx
.
Attr
<
bool
>
(
"trans_x"
);
bool
transpose_y
=
ctx
.
HasAttr
(
"transpose_Y"
)
?
ctx
.
Attr
<
bool
>
(
"transpose_Y"
)
:
ctx
.
Attr
<
bool
>
(
"trans_y"
);
ReshapeXYOutToMatrixSequence
(
&
x
,
&
y
,
&
dout
,
transpose_x
,
transpose_y
);
ReshapeXYOutToMatrixSequence
(
&
x
,
&
y
,
&
dout
,
transpose_x
,
transpose_y
);
p
hi
::
DDim
dx_dims
;
p
addle
::
framework
::
DDim
dx_dims
;
if
(
dx
)
{
if
(
dx
)
{
dx_dims
=
dx
->
dims
();
dx_dims
=
dx
->
dims
();
if
(
dx_dims
!=
x
.
dims
())
{
if
(
dx_dims
!=
x
.
dims
())
{
...
@@ -480,7 +791,7 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -480,7 +791,7 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
}
}
}
}
p
hi
::
DDim
dy_dims
;
p
addle
::
framework
::
DDim
dy_dims
;
if
(
dy
)
{
if
(
dy
)
{
dy_dims
=
dy
->
dims
();
dy_dims
=
dy
->
dims
();
if
(
dy_dims
!=
y
.
dims
())
{
if
(
dy_dims
!=
y
.
dims
())
{
...
@@ -560,38 +871,38 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -560,38 +871,38 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
void
ExecuteMatMulGrad
(
const
ExecutionContext
&
ctx
,
void
ExecuteMatMulGrad
(
const
ExecutionContext
&
ctx
,
const
OneDNNContext
&
dev_ctx
,
const
OneDNNContext
&
dev_ctx
,
const
dnnl
::
engine
&
engine
,
const
dnnl
::
engine
&
engine
,
DenseTensor
*
x
,
phi
::
DenseTensor
*
x
,
bool
trans_x
,
bool
trans_x
,
bool
is_fold_init_dims_x
,
bool
is_fold_init_dims_x
,
DenseTensor
*
y
,
phi
::
DenseTensor
*
y
,
bool
trans_y
,
bool
trans_y
,
bool
is_fold_init_dims_y
,
bool
is_fold_init_dims_y
,
DenseTensor
*
out
)
const
{
phi
::
DenseTensor
*
out
)
const
{
// gradient is calculated in a different way when broadcasting is used
// gradient is calculated in a different way when broadcasting is used
bool
need_combine
=
(
x
->
dims
().
size
()
==
3
||
y
->
dims
().
size
()
==
3
)
&&
bool
need_combine
=
(
x
->
dims
().
size
()
==
3
||
y
->
dims
().
size
()
==
3
)
&&
out
->
dims
().
size
()
==
2
;
out
->
dims
().
size
()
==
2
;
DenseTensor
x_combined
,
y_combined
;
phi
::
DenseTensor
x_combined
,
y_combined
;
if
(
need_combine
)
{
if
(
!
need_combine
)
{
x_combined
=
*
x
;
y_combined
=
*
y
;
}
else
{
x_combined
=
is_fold_init_dims_x
?
FoldOuterDims
(
*
x
)
x_combined
=
is_fold_init_dims_x
?
FoldOuterDims
(
*
x
)
:
FoldFirstAndLastDims
<
T
>
(
dev_ctx
,
x
);
:
FoldFirstAndLastDims
<
T
>
(
dev_ctx
,
x
);
y_combined
=
is_fold_init_dims_y
?
FoldOuterDims
(
*
y
)
y_combined
=
is_fold_init_dims_y
?
FoldOuterDims
(
*
y
)
:
FoldFirstAndLastDims
<
T
>
(
dev_ctx
,
y
);
:
FoldFirstAndLastDims
<
T
>
(
dev_ctx
,
y
);
}
else
{
x_combined
=
*
x
;
y_combined
=
*
y
;
}
}
float
alpha
=
ctx
.
Attr
<
float
>
(
"alpha"
)
;
float
alpha
=
ctx
.
HasAttr
(
"alpha"
)
?
ctx
.
Attr
<
float
>
(
"alpha"
)
:
1.0
f
;
MatMul
V1One
DNNHandler
<
T
,
T
,
T
>
handler
(
engine
,
MatMul
MKL
DNNHandler
<
T
,
T
,
T
>
handler
(
engine
,
ctx
.
GetPlace
(),
ctx
.
GetPlace
(),
&
x_combined
,
&
x_combined
,
trans_x
,
trans_x
,
&
y_combined
,
&
y_combined
,
trans_y
,
trans_y
,
out
,
out
,
alpha
);
alpha
);
const
auto
src_memory_p
=
handler
.
AcquireSrcMemory
(
&
x_combined
);
const
auto
src_memory_p
=
handler
.
AcquireSrcMemory
(
&
x_combined
);
const
auto
weights_memory_p
=
handler
.
AcquireWeightsMemory
(
&
y_combined
);
const
auto
weights_memory_p
=
handler
.
AcquireWeightsMemory
(
&
y_combined
);
...
@@ -599,7 +910,7 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -599,7 +910,7 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
auto
matmul_p
=
handler
.
AcquireForwardPrimitive
();
auto
matmul_p
=
handler
.
AcquireForwardPrimitive
();
std
::
unordered_map
<
int
,
memory
>
matmul_args
=
{
std
::
unordered_map
<
int
,
dnnl
::
memory
>
matmul_args
=
{
{
DNNL_ARG_SRC
,
*
src_memory_p
},
{
DNNL_ARG_SRC
,
*
src_memory_p
},
{
DNNL_ARG_WEIGHTS
,
*
weights_memory_p
},
{
DNNL_ARG_WEIGHTS
,
*
weights_memory_p
},
{
DNNL_ARG_DST
,
*
dst_memory_p
}};
{
DNNL_ARG_DST
,
*
dst_memory_p
}};
...
@@ -618,13 +929,13 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -618,13 +929,13 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
REGISTER_OP_KERNEL
(
matmul
,
REGISTER_OP_KERNEL
(
matmul
,
MKLDNN
,
MKLDNN
,
::
phi
::
CPUPlace
,
::
phi
::
CPUPlace
,
MatMul
V1One
DNNKernel
<
float
>
,
MatMul
MKL
DNNKernel
<
float
>
,
MatMul
V1OneDNNKernel
<
phi
::
dtype
::
bfloat16
>
,
MatMul
MKLDNNKernel
<
paddle
::
platform
::
bfloat16
>
,
MatMul
V1One
DNNKernel
<
int8_t
>
,
MatMul
MKL
DNNKernel
<
int8_t
>
,
MatMul
V1One
DNNKernel
<
uint8_t
>
);
MatMul
MKL
DNNKernel
<
uint8_t
>
);
REGISTER_OP_KERNEL
(
matmul_grad
,
REGISTER_OP_KERNEL
(
matmul_grad
,
MKLDNN
,
MKLDNN
,
::
phi
::
CPUPlace
,
::
phi
::
CPUPlace
,
MatMul
V1GradOne
DNNKernel
<
float
>
,
MatMul
GradMKL
DNNKernel
<
float
>
,
MatMul
V1GradOneDNNKernel
<
phi
::
dtype
::
bfloat16
>
);
MatMul
GradMKLDNNKernel
<
paddle
::
platform
::
bfloat16
>
);
paddle/fluid/operators/ops_extra_info.h
浏览文件 @
338cbeaa
...
@@ -99,7 +99,7 @@ const std::unordered_map<std::string, ExtraAttrPropertySet>
...
@@ -99,7 +99,7 @@ const std::unordered_map<std::string, ExtraAttrPropertySet>
{
"fuse_alpha"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_alpha"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_beta"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_beta"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_relu"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_relu"
,
ExtraAttrProperty
::
ONEDNN
},
{
"
alpha
"
,
ExtraAttrProperty
::
ONEDNN
},
{
"
fused_output_scale
"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_residual_connection"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_residual_connection"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_with_relu"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_with_relu"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fused_reshape_Out"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fused_reshape_Out"
,
ExtraAttrProperty
::
ONEDNN
},
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py
浏览文件 @
338cbeaa
...
@@ -146,7 +146,7 @@ class TestMatmulActivationMkldnnFusePass(PassAutoScanTest):
...
@@ -146,7 +146,7 @@ class TestMatmulActivationMkldnnFusePass(PassAutoScanTest):
'operator_scale_onednn_fuse_pass'
,
'operator_scale_onednn_fuse_pass'
,
],
],
)
)
yield
config
,
[
'matmul
_v2
'
],
(
1e-5
,
1e-5
)
yield
config
,
[
'matmul'
],
(
1e-5
,
1e-5
)
def
test
(
self
):
def
test
(
self
):
self
.
run_and_statis
(
self
.
run_and_statis
(
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py
浏览文件 @
338cbeaa
...
@@ -137,7 +137,7 @@ class TestMatmulElementwiseAddActivationMkldnnFusePass(PassAutoScanTest):
...
@@ -137,7 +137,7 @@ class TestMatmulElementwiseAddActivationMkldnnFusePass(PassAutoScanTest):
'matmul_activation_mkldnn_fuse_pass'
,
'matmul_activation_mkldnn_fuse_pass'
,
],
],
)
)
yield
config
,
[
'matmul
_v2
'
],
(
1e-5
,
1e-5
)
yield
config
,
[
'matmul'
],
(
1e-5
,
1e-5
)
def
test
(
self
):
def
test
(
self
):
self
.
run_and_statis
(
self
.
run_and_statis
(
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py
浏览文件 @
338cbeaa
...
@@ -76,7 +76,7 @@ class TestMatmulElementwiseAddMkldnnFusePass(PassAutoScanTest):
...
@@ -76,7 +76,7 @@ class TestMatmulElementwiseAddMkldnnFusePass(PassAutoScanTest):
config
=
self
.
create_inference_config
(
config
=
self
.
create_inference_config
(
use_mkldnn
=
True
,
passes
=
[
'matmul_elementwise_add_mkldnn_fuse_pass'
]
use_mkldnn
=
True
,
passes
=
[
'matmul_elementwise_add_mkldnn_fuse_pass'
]
)
)
yield
config
,
[
'matmul
_v2
'
],
(
1e-5
,
1e-5
)
yield
config
,
[
'matmul'
],
(
1e-5
,
1e-5
)
def
test
(
self
):
def
test
(
self
):
self
.
run_and_statis
(
self
.
run_and_statis
(
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
浏览文件 @
338cbeaa
...
@@ -116,7 +116,7 @@ class TestMatmulTransposeReshapeMkldnnFusePass(PassAutoScanTest):
...
@@ -116,7 +116,7 @@ class TestMatmulTransposeReshapeMkldnnFusePass(PassAutoScanTest):
def
sample_predictor_configs
(
self
,
program_config
):
def
sample_predictor_configs
(
self
,
program_config
):
config
=
self
.
create_inference_config
(
use_mkldnn
=
True
)
config
=
self
.
create_inference_config
(
use_mkldnn
=
True
)
yield
config
,
[
"matmul
_v2
"
],
(
1e-5
,
1e-5
)
yield
config
,
[
"matmul"
],
(
1e-5
,
1e-5
)
def
test
(
self
):
def
test
(
self
):
self
.
run_and_statis
(
self
.
run_and_statis
(
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py
浏览文件 @
338cbeaa
...
@@ -135,8 +135,17 @@ class TestMatmulv2TransposeReshapeMkldnnFusePass(PassAutoScanTest):
...
@@ -135,8 +135,17 @@ class TestMatmulv2TransposeReshapeMkldnnFusePass(PassAutoScanTest):
return
program_config
return
program_config
def
sample_predictor_configs
(
self
,
program_config
):
def
sample_predictor_configs
(
self
,
program_config
):
# gpu_cpu_map_matmul_v2_to_matmul_pass will affect the type of final fused op
fused_op
=
"matmul_v2"
input1_dim1
=
program_config
.
inputs
[
"input_data1"
].
shape
[
0
]
input2_dim1
=
program_config
.
inputs
[
"input_data2"
].
shape
[
0
]
input1_dim2
=
program_config
.
inputs
[
"input_data1"
].
shape
[
1
]
input2_dim2
=
program_config
.
inputs
[
"input_data2"
].
shape
[
1
]
if
input1_dim1
==
input2_dim1
and
input1_dim2
==
input2_dim2
:
fused_op
=
"matmul"
config
=
self
.
create_inference_config
(
use_mkldnn
=
True
)
config
=
self
.
create_inference_config
(
use_mkldnn
=
True
)
yield
config
,
[
"matmul_v2"
],
(
1e-5
,
1e-5
)
yield
config
,
[
fused_op
],
(
1e-5
,
1e-5
)
def
test
(
self
):
def
test
(
self
):
self
.
run_and_statis
(
self
.
run_and_statis
(
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py
浏览文件 @
338cbeaa
...
@@ -153,7 +153,7 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest):
...
@@ -153,7 +153,7 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest):
def
sample_predictor_configs
(
self
,
program_config
):
def
sample_predictor_configs
(
self
,
program_config
):
config
=
self
.
create_inference_config
(
use_mkldnn
=
True
)
config
=
self
.
create_inference_config
(
use_mkldnn
=
True
)
yield
config
,
[
"matmul
_v2
"
],
(
1e-5
,
1e-5
)
yield
config
,
[
"matmul"
],
(
1e-5
,
1e-5
)
def
test
(
self
):
def
test
(
self
):
self
.
run_and_statis
(
self
.
run_and_statis
(
...
...
python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
浏览文件 @
338cbeaa
...
@@ -17,7 +17,7 @@ import unittest
...
@@ -17,7 +17,7 @@ import unittest
import
numpy
as
np
import
numpy
as
np
from
paddle.fluid.tests.unittests.op_test
import
OpTest
from
paddle.fluid.tests.unittests.op_test
import
OpTest
,
skip_check_grad_ci
class
TestDnnlMatMulOp
(
OpTest
):
class
TestDnnlMatMulOp
(
OpTest
):
...
@@ -254,6 +254,321 @@ class TestDnnlMatMulOpInt8ForceFP32BasicScales(TestDnnlMatMulOp):
...
@@ -254,6 +254,321 @@ class TestDnnlMatMulOpInt8ForceFP32BasicScales(TestDnnlMatMulOp):
self
.
attrs
=
{
'force_fp32_output'
:
True
}
self
.
attrs
=
{
'force_fp32_output'
:
True
}
@
skip_check_grad_ci
(
reason
=
"DNNL's MatMul doesn't implement grad kernel."
)
class
TestReshapeTransposeMatMulOp
(
OpTest
):
def
init_data_type
(
self
):
self
.
data_type_
=
'float32'
def
generate_data
(
self
):
self
.
x
=
(
np
.
random
.
random
([
2
,
128
,
768
])
.
astype
(
"float32"
)
.
reshape
([
2
,
128
,
12
,
64
])
.
transpose
([
0
,
2
,
1
,
3
])
)
self
.
y
=
(
np
.
random
.
random
([
2
,
128
,
768
])
.
astype
(
"float32"
)
.
reshape
([
2
,
128
,
12
,
64
])
.
transpose
([
0
,
2
,
1
,
3
])
)
self
.
out
=
np
.
matmul
(
self
.
x
,
self
.
y
.
transpose
([
0
,
1
,
3
,
2
]))
self
.
fused_reshape_X
=
[]
self
.
fused_transpose_X
=
[]
self
.
fused_reshape_Y
=
[]
self
.
fused_transpose_Y
=
[]
def
set_op_type_and_transpose_y_name
(
self
):
self
.
op_type
=
"matmul"
self
.
transpose_y_name
=
"transpose_Y"
def
setUp
(
self
):
self
.
set_op_type_and_transpose_y_name
()
self
.
_cpu_only
=
True
self
.
use_mkldnn
=
True
self
.
transpose_y
=
True
self
.
init_data_type
()
self
.
generate_data
()
self
.
inputs
=
{
'X'
:
self
.
x
,
'Y'
:
self
.
y
}
self
.
attrs
=
{
'use_mkldnn'
:
self
.
use_mkldnn
,
self
.
transpose_y_name
:
self
.
transpose_y
,
}
if
len
(
self
.
fused_transpose_X
)
>
0
:
self
.
attrs
[
'fused_transpose_X'
]
=
self
.
fused_transpose_X
if
len
(
self
.
fused_transpose_Y
)
>
0
:
self
.
attrs
[
'fused_transpose_Y'
]
=
self
.
fused_transpose_Y
if
len
(
self
.
fused_reshape_X
)
>
0
:
self
.
attrs
[
'fused_reshape_X'
]
=
self
.
fused_reshape_X
if
len
(
self
.
fused_reshape_Y
)
>
0
:
self
.
attrs
[
'fused_reshape_Y'
]
=
self
.
fused_reshape_Y
self
.
outputs
=
{
'Out'
:
self
.
out
}
def
test_check_output
(
self
):
self
.
check_output
()
class
TestReshapeTransposeMatMulOp4DXFloat
(
TestReshapeTransposeMatMulOp
):
def
generate_data
(
self
):
self
.
x
=
np
.
random
.
random
([
2
,
128
,
768
]).
astype
(
"float32"
)
self
.
y
=
(
np
.
random
.
random
([
2
,
128
,
768
])
.
astype
(
"float32"
)
.
reshape
([
2
,
128
,
12
,
64
])
.
transpose
([
0
,
2
,
1
,
3
])
)
self
.
fused_transpose_X
=
[
0
,
2
,
1
,
3
]
self
.
fused_reshape_X
=
[
0
,
0
,
12
,
64
]
self
.
fused_transpose_Y
=
[]
self
.
fused_reshape_Y
=
[]
self
.
out
=
np
.
matmul
(
self
.
x
.
reshape
([
2
,
128
,
12
,
64
]).
transpose
([
0
,
2
,
1
,
3
]),
self
.
y
.
transpose
([
0
,
1
,
3
,
2
]),
)
class
TestReshapeTransposeMatMulOp4DXInt8
(
TestReshapeTransposeMatMulOp4DXFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
'int8'
class
TestReshapeTransposeMatMulOp4DYFloat
(
TestReshapeTransposeMatMulOp
):
def
generate_data
(
self
):
self
.
x
=
(
np
.
random
.
random
([
2
,
128
,
768
])
.
astype
(
"float32"
)
.
reshape
([
2
,
128
,
12
,
64
])
.
transpose
([
0
,
2
,
1
,
3
])
)
self
.
y
=
np
.
random
.
random
([
2
,
128
,
768
]).
astype
(
"float32"
)
self
.
fused_transpose_X
=
[]
self
.
fused_reshape_X
=
[]
self
.
fused_transpose_Y
=
[
0
,
2
,
1
,
3
]
self
.
fused_reshape_Y
=
[
0
,
0
,
12
,
64
]
self
.
out
=
np
.
matmul
(
self
.
x
,
self
.
y
.
reshape
([
2
,
128
,
12
,
64
]).
transpose
([
0
,
2
,
3
,
1
])
)
class
TestReshapeTransposeMatMulOp4DYInt8
(
TestReshapeTransposeMatMulOp4DYFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
'int8'
class
TestReshapeTransposeMatMulOp4DXYFloat
(
TestReshapeTransposeMatMulOp
):
def
generate_data
(
self
):
self
.
x
=
np
.
random
.
random
([
2
,
128
,
768
]).
astype
(
"float32"
)
self
.
y
=
np
.
random
.
random
([
2
,
128
,
768
]).
astype
(
"float32"
)
self
.
fused_transpose_X
=
[
0
,
2
,
1
,
3
]
self
.
fused_reshape_X
=
[
0
,
0
,
12
,
64
]
self
.
fused_transpose_Y
=
[
0
,
2
,
1
,
3
]
self
.
fused_reshape_Y
=
[
0
,
0
,
12
,
64
]
self
.
out
=
np
.
matmul
(
self
.
x
.
reshape
([
2
,
128
,
12
,
64
]).
transpose
([
0
,
2
,
1
,
3
]),
self
.
y
.
reshape
([
2
,
128
,
12
,
64
]).
transpose
([
0
,
2
,
3
,
1
]),
)
class
TestReshapeTransposeMatMulOp4DXYInt8
(
TestReshapeTransposeMatMulOp4DXYFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
'int8'
class
TestReshapeTransposeMatMulOp2DXFloat
(
TestReshapeTransposeMatMulOp
):
def
generate_data
(
self
):
self
.
x
=
np
.
random
.
random
([
2
,
5
,
10
]).
astype
(
"float32"
)
self
.
y
=
(
np
.
random
.
random
([
2
,
5
,
10
])
.
astype
(
"float32"
)
.
reshape
([
10
,
10
])
.
transpose
([
1
,
0
])
)
self
.
fused_transpose_X
=
[
1
,
0
]
self
.
fused_reshape_X
=
[
10
,
10
]
self
.
fused_transpose_Y
=
[]
self
.
fused_reshape_Y
=
[]
self
.
out
=
np
.
matmul
(
self
.
x
.
reshape
([
10
,
10
]).
transpose
([
1
,
0
]),
self
.
y
.
transpose
([
1
,
0
])
)
class
TestReshapeTransposeMatMulOp2DXInt8
(
TestReshapeTransposeMatMulOp2DXFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
'int8'
class
TestReshapeTransposeMatMulOp2DYFloat
(
TestReshapeTransposeMatMulOp
):
def
generate_data
(
self
):
self
.
x
=
(
np
.
random
.
random
([
2
,
5
,
10
])
.
astype
(
"float32"
)
.
reshape
([
10
,
10
])
.
transpose
([
1
,
0
])
)
self
.
y
=
np
.
random
.
random
([
2
,
5
,
10
]).
astype
(
"float32"
)
self
.
fused_transpose_X
=
[]
self
.
fused_reshape_X
=
[]
self
.
fused_transpose_Y
=
[
1
,
0
]
self
.
fused_reshape_Y
=
[
10
,
10
]
self
.
out
=
np
.
matmul
(
self
.
x
,
self
.
y
.
reshape
([
10
,
10
]))
class
TestReshapeTransposeMatMulOp2DYInt8
(
TestReshapeTransposeMatMulOp2DYFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
'int8'
class
TestReshapeTransposeMatMulOp3DXFloat
(
TestReshapeTransposeMatMulOp
):
def
generate_data
(
self
):
self
.
x
=
np
.
random
.
random
([
2
,
2
,
5
,
5
]).
astype
(
"float32"
)
self
.
y
=
(
np
.
random
.
random
([
2
,
2
,
5
,
5
])
.
astype
(
"float32"
)
.
reshape
([
2
,
10
,
5
])
.
transpose
([
0
,
2
,
1
])
)
self
.
fused_transpose_X
=
[
0
,
2
,
1
]
self
.
fused_reshape_X
=
[
2
,
10
,
5
]
self
.
fused_transpose_Y
=
[]
self
.
fused_reshape_Y
=
[]
self
.
out
=
np
.
matmul
(
self
.
x
.
reshape
([
2
,
10
,
5
]).
transpose
(
0
,
2
,
1
),
self
.
y
.
transpose
(
0
,
2
,
1
),
)
class
TestReshapeTransposeMatMulOp3DXInt8
(
TestReshapeTransposeMatMulOp3DXFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
'int8'
class
TestReshapeTransposeMatMulOp3DYFloat
(
TestReshapeTransposeMatMulOp
):
def
generate_data
(
self
):
self
.
x
=
(
np
.
random
.
random
([
2
,
2
,
5
,
5
])
.
astype
(
self
.
data_type_
)
.
reshape
([
2
,
10
,
5
])
.
transpose
([
0
,
2
,
1
])
)
self
.
y
=
np
.
random
.
random
([
2
,
2
,
5
,
5
]).
astype
(
self
.
data_type_
)
self
.
fused_transpose_X
=
[]
self
.
fused_reshape_X
=
[]
self
.
fused_transpose_Y
=
[
0
,
2
,
1
]
self
.
fused_reshape_Y
=
[
2
,
10
,
5
]
self
.
out
=
np
.
matmul
(
self
.
x
,
self
.
y
.
reshape
([
2
,
10
,
5
]))
class
TestReshapeTransposeMatMulOp3DYInt8
(
TestReshapeTransposeMatMulOp3DYFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
'int8'
@skip_check_grad_ci(reason="Tests inference only optimization.")
class TestMatMulOpTransposeReshapeEmptyFloat(OpTest):
    """``matmul`` op test with ``use_mkldnn`` enabled and no fused output reshape/transpose.

    Subclasses customise the case by overriding ``init_data_type``,
    ``generate_data`` and ``init_params_and_out``. ``setUp`` only adds the
    ``fused_reshape_Out`` / ``fused_transpose_Out`` attributes when the
    corresponding list is non-empty; in this base case both are empty.
    """

    def init_data_type(self):
        """Select the NumPy dtype the input arrays are cast to."""
        self.data_type_ = np.float32

    def generate_data(self):
        """Create random X ``[bs, 128, 128]`` and Y ``[bs, 128, 64]`` with ``bs = 1``."""
        self.bs = 1
        self.x = np.random.random([self.bs, 128, 128]).astype(self.data_type_)
        self.y = np.random.random([self.bs, 128, 64]).astype(self.data_type_)

    def init_params_and_out(self):
        """Set the (empty) fused output lists and the plain matmul reference result."""
        self.transpose_out = []
        self.reshape_out = []
        self.out = np.matmul(self.x, self.y)

    def set_op_type(self):
        """Name the operator under test."""
        self.op_type = "matmul"

    def setUp(self):
        """Assemble op type, attributes, inputs and expected outputs for the test."""
        self.set_op_type()
        self._cpu_only = True
        self.use_mkldnn = True
        self.init_data_type()
        self.generate_data()
        self.init_params_and_out()

        self.attrs = {'use_mkldnn': self.use_mkldnn}
        # The fused attributes are optional: leave them out when the lists are empty.
        if len(self.reshape_out) > 0:
            self.attrs['fused_reshape_Out'] = self.reshape_out
        if len(self.transpose_out) > 0:
            self.attrs['fused_transpose_Out'] = self.transpose_out

        # Assigned once; the original set the identical dict twice.
        self.inputs = {'X': self.x, 'Y': self.y}
        self.outputs = {'Out': self.out}

    def test_check_output(self):
        """Run the output check."""
        self.check_output()

    def check_raise_error(self, msg):
        """Run ``check_output`` and translate an expected failure.

        If ``check_output`` raises and ``msg`` occurs in the exception text, an
        ``AttributeError`` is raised instead; any other exception is printed
        and swallowed.
        """
        try:
            self.check_output()
        except Exception as e:
            if msg in str(e):
                raise AttributeError
            else:
                print(e)
class TestMatMulOpTransposeReshapeIntEmptyInt(TestMatMulOpTransposeReshapeEmptyFloat):
    """Same case as the parent, with inputs cast to ``np.int8``."""

    def init_data_type(self):
        # The parent's generate_data casts both inputs to this dtype.
        self.data_type_ = np.int8
class TestMatMulOpTransposeReshapeBasicFloat(TestMatMulOpTransposeReshapeEmptyFloat):
    """4-D matmul whose output is transposed with ``[0, 2, 1, 3]`` and then reshaped.

    Overrides the data and parameter hooks of the parent so that non-empty
    ``transpose_out`` / ``reshape_out`` lists are produced.
    """

    def generate_data(self):
        """Create random X ``[bs, 12, 128, 128]`` and Y ``[bs, 12, 128, 64]`` with ``bs = 8``."""
        self.bs = 8
        x_shape = [self.bs, 12, 128, 128]
        y_shape = [self.bs, 12, 128, 64]
        self.x = np.random.random(x_shape).astype(self.data_type_)
        self.y = np.random.random(y_shape).astype(self.data_type_)

    def init_params_and_out(self):
        """Record the fused transpose/reshape lists and compute the reference output."""
        # Size of the last output axis: X's axis 1 times Y's last axis.
        merged_dim = self.x.shape[1] * self.y.shape[-1]

        self.transpose_out = [0, 2, 1, 3]
        # NOTE(review): the leading zeros are presumably "keep this dimension"
        # markers for the fused reshape -- confirm against the op's definition.
        self.reshape_out = [0, 0, merged_dim]

        product = np.matmul(self.x, self.y)
        permuted = product.transpose([0, 2, 1, 3])
        self.out = permuted.reshape([self.bs, -1, merged_dim])
class TestMatMulOpTransposeReshapeBasicInt(TestMatMulOpTransposeReshapeBasicFloat):
    """Same case as the parent, with inputs cast to ``np.int8``."""

    def init_data_type(self):
        # The parent's generate_data casts both inputs to this dtype.
        self.data_type_ = np.int8
class TestMatMulOpTransposeReshapeOtherDimFloat(TestMatMulOpTransposeReshapeBasicFloat):
    """Parent's transpose/reshape case with different operand dimensions."""

    def generate_data(self):
        """Create random X ``[bs, 12, 14, 18]`` and Y ``[bs, 12, 18, 13]`` with ``bs = 11``."""
        self.bs = 11
        x_shape = [self.bs, 12, 14, 18]
        y_shape = [self.bs, 12, 18, 13]
        self.x = np.random.random(x_shape).astype(self.data_type_)
        self.y = np.random.random(y_shape).astype(self.data_type_)
class TestMatMulOpTransposeReshapeOtherDimInt(TestMatMulOpTransposeReshapeOtherDimFloat):
    """Same case as the parent, with inputs cast to ``np.int8``."""

    def init_data_type(self):
        # The parent's generate_data casts both inputs to this dtype.
        self.data_type_ = np.int8
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
from
paddle
import
enable_static
from
paddle
import
enable_static
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录