Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
338cbeaa
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
338cbeaa
编写于
1月 04, 2023
作者:
S
Sławomir Siwek
提交者:
GitHub
1月 04, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Revert "Replace matmul with matmul_v2 during oneDNN fuse passes (#49108)" (#49524)
This reverts commit
2c444dfa
.
上级
49f5a97b
变更
16
隐藏空白更改
内联
并排
Showing
16 changed file
with
1304 addition
and
60 deletion
+1304
-60
paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc
...framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc
+0
-10
paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc
...work/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc
+0
-10
paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc
...rk/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc
+0
-9
paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc
...id/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc
+0
-11
paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc
...rk/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc
+0
-9
paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
...kldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
+1
-1
paddle/fluid/operators/matmul_op.cc
paddle/fluid/operators/matmul_op.cc
+30
-2
paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
+941
-0
paddle/fluid/operators/ops_extra_info.h
paddle/fluid/operators/ops_extra_info.h
+1
-1
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py
...s/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py
+1
-1
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py
...est_mkldnn_matmul_elementwise_add_activation_fuse_pass.py
+1
-1
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py
...inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py
+1
-1
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
...ference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
+1
-1
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py
...ence/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py
+10
-1
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py
...ference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py
+1
-1
python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
...dle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
+316
-1
未找到文件。
paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc
浏览文件 @
338cbeaa
...
@@ -77,16 +77,6 @@ void MatmulActivationMkldnnFusePass::FuseMatmulAct(
...
@@ -77,16 +77,6 @@ void MatmulActivationMkldnnFusePass::FuseMatmulAct(
?
"gelu_tanh"
?
"gelu_tanh"
:
"gelu_erf"
;
:
"gelu_erf"
;
}
}
if
(
matmul_type
==
"matmul"
)
{
matmul_op
->
SetType
(
"matmul_v2"
);
matmul_op
->
SetAttr
(
"trans_x"
,
matmul_op
->
GetAttr
(
"transpose_X"
));
matmul_op
->
SetAttr
(
"trans_y"
,
matmul_op
->
GetAttr
(
"transpose_Y"
));
auto
matmul_alpha
=
matmul_op
->
GetAttrIfExists
<
float
>
(
"alpha"
);
if
(
matmul_alpha
!=
1.0
f
)
{
matmul_op
->
SetAttr
(
"alpha"
,
matmul_alpha
);
}
}
matmul_op
->
SetAttr
(
"fuse_activation"
,
act_type
);
matmul_op
->
SetAttr
(
"fuse_activation"
,
act_type
);
matmul_op
->
SetOutput
(
"Out"
,
{
activation_out
->
Name
()});
matmul_op
->
SetOutput
(
"Out"
,
{
activation_out
->
Name
()});
...
...
paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc
浏览文件 @
338cbeaa
...
@@ -65,16 +65,6 @@ void MatmulElementwiseAddMKLDNNFusePass::FuseMatmulElementwiseAdd(
...
@@ -65,16 +65,6 @@ void MatmulElementwiseAddMKLDNNFusePass::FuseMatmulElementwiseAdd(
return
;
return
;
}
}
if
(
matmul_type
==
"matmul"
)
{
matmul
->
Op
()
->
SetType
(
"matmul_v2"
);
matmul
->
Op
()
->
SetAttr
(
"trans_x"
,
matmul
->
Op
()
->
GetAttr
(
"transpose_X"
));
matmul
->
Op
()
->
SetAttr
(
"trans_y"
,
matmul
->
Op
()
->
GetAttr
(
"transpose_Y"
));
auto
matmul_alpha
=
matmul
->
Op
()
->
GetAttrIfExists
<
float
>
(
"alpha"
);
if
(
matmul_alpha
!=
1.0
f
)
{
matmul
->
Op
()
->
SetAttr
(
"alpha"
,
matmul_alpha
);
}
}
matmul
->
Op
()
->
SetInput
(
"ResidualData"
,
{
elementwise_addend
->
Name
()});
matmul
->
Op
()
->
SetInput
(
"ResidualData"
,
{
elementwise_addend
->
Name
()});
matmul
->
Op
()
->
SetOutput
(
"Out"
,
{
elementwise_add_out
->
Name
()});
matmul
->
Op
()
->
SetOutput
(
"Out"
,
{
elementwise_add_out
->
Name
()});
...
...
paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc
浏览文件 @
338cbeaa
...
@@ -84,15 +84,6 @@ void MatmulTransposeReshapeMKLDNNPass::Fuse(
...
@@ -84,15 +84,6 @@ void MatmulTransposeReshapeMKLDNNPass::Fuse(
}
}
OpDesc
*
matmul_desc
=
matmul_op
->
Op
();
OpDesc
*
matmul_desc
=
matmul_op
->
Op
();
if
(
matmul_type
==
"matmul"
)
{
matmul_desc
->
SetType
(
"matmul_v2"
);
matmul_desc
->
SetAttr
(
"trans_x"
,
matmul_desc
->
GetAttr
(
"transpose_X"
));
matmul_desc
->
SetAttr
(
"trans_y"
,
matmul_desc
->
GetAttr
(
"transpose_Y"
));
auto
matmul_alpha
=
matmul_desc
->
GetAttrIfExists
<
float
>
(
"alpha"
);
if
(
matmul_alpha
!=
1.0
f
)
{
matmul_desc
->
SetAttr
(
"alpha"
,
matmul_alpha
);
}
}
matmul_desc
->
SetOutput
(
"Out"
,
{
reshape_out
->
Name
()});
matmul_desc
->
SetOutput
(
"Out"
,
{
reshape_out
->
Name
()});
matmul_desc
->
SetAttr
(
"fused_reshape_Out"
,
reshape_shape
);
matmul_desc
->
SetAttr
(
"fused_reshape_Out"
,
reshape_shape
);
matmul_desc
->
SetAttr
(
"fused_transpose_Out"
,
transpose_axis
);
matmul_desc
->
SetAttr
(
"fused_transpose_Out"
,
transpose_axis
);
...
...
paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc
浏览文件 @
338cbeaa
...
@@ -85,17 +85,6 @@ void FuseOperatorScaleOneDNNPass::FuseScale(Graph *graph,
...
@@ -85,17 +85,6 @@ void FuseOperatorScaleOneDNNPass::FuseScale(Graph *graph,
scale
=
*
(
scale_tensor
->
data
<
float
>
());
scale
=
*
(
scale_tensor
->
data
<
float
>
());
}
}
if
(
op_type
==
"matmul"
)
{
operator_op
->
Op
()
->
SetType
(
"matmul_v2"
);
operator_op
->
Op
()
->
SetAttr
(
"trans_x"
,
operator_op
->
Op
()
->
GetAttr
(
"transpose_X"
));
operator_op
->
Op
()
->
SetAttr
(
"trans_y"
,
operator_op
->
Op
()
->
GetAttr
(
"transpose_Y"
));
auto
matmul_alpha
=
operator_op
->
Op
()
->
GetAttrIfExists
<
float
>
(
"alpha"
);
if
(
matmul_alpha
!=
1.0
f
)
{
operator_op
->
Op
()
->
SetAttr
(
"alpha"
,
matmul_alpha
);
}
}
operator_op
->
Op
()
->
SetAttr
(
"fused_output_scale"
,
scale
);
operator_op
->
Op
()
->
SetAttr
(
"fused_output_scale"
,
scale
);
operator_op
->
Op
()
->
SetOutput
(
"Out"
,
{
scale_out
->
Name
()});
operator_op
->
Op
()
->
SetOutput
(
"Out"
,
{
scale_out
->
Name
()});
...
...
paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc
浏览文件 @
338cbeaa
...
@@ -123,15 +123,6 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse(
...
@@ -123,15 +123,6 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse(
return
;
return
;
}
}
if
(
matmul_type
==
"matmul"
)
{
matmul_desc
->
SetType
(
"matmul_v2"
);
matmul_desc
->
SetAttr
(
"trans_x"
,
matmul_desc
->
GetAttr
(
"transpose_X"
));
matmul_desc
->
SetAttr
(
"trans_y"
,
matmul_desc
->
GetAttr
(
"transpose_Y"
));
auto
matmul_alpha
=
matmul_desc
->
GetAttrIfExists
<
float
>
(
"alpha"
);
if
(
matmul_alpha
!=
1.0
f
)
{
matmul_desc
->
SetAttr
(
"alpha"
,
matmul_alpha
);
}
}
matmul_desc
->
SetInput
(
matmul_input_name
,
{(
reshape_in
)
->
Name
()});
matmul_desc
->
SetInput
(
matmul_input_name
,
{(
reshape_in
)
->
Name
()});
matmul_desc
->
SetAttr
(
"fused_reshape_"
+
matmul_input_name
,
reshape_shape
);
matmul_desc
->
SetAttr
(
"fused_reshape_"
+
matmul_input_name
,
reshape_shape
);
matmul_desc
->
SetAttr
(
"fused_transpose_"
+
matmul_input_name
,
matmul_desc
->
SetAttr
(
"fused_transpose_"
+
matmul_input_name
,
...
...
paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
浏览文件 @
338cbeaa
...
@@ -97,7 +97,7 @@ void TestMain(const std::string& op_name, bool with_xshapes) {
...
@@ -97,7 +97,7 @@ void TestMain(const std::string& op_name, bool with_xshapes) {
int
removed
=
8
;
// 2* reshape, reshape_out, transpose, transpose_out
int
removed
=
8
;
// 2* reshape, reshape_out, transpose, transpose_out
if
(
with_xshapes
)
removed
+=
2
;
// transpose_xshape, reshape_xshape
if
(
with_xshapes
)
removed
+=
2
;
// transpose_xshape, reshape_xshape
EXPECT_EQ
(
total_nodes_before
-
removed
,
total_nodes_after
);
EXPECT_EQ
(
total_nodes_before
-
removed
,
total_nodes_after
);
auto
*
matmul_op_desc
=
GetOpNodes
(
graph
,
"matmul_v2"
).
at
(
0
)
->
Op
();
auto
*
matmul_op_desc
=
GetOpNodes
(
graph
,
op_name
).
at
(
0
)
->
Op
();
auto
check
=
[
&
matmul_op_desc
](
std
::
string
a
)
{
auto
check
=
[
&
matmul_op_desc
](
std
::
string
a
)
{
std
::
string
shape_str
=
"fused_reshape_"
+
a
;
std
::
string
shape_str
=
"fused_reshape_"
+
a
;
...
...
paddle/fluid/operators/matmul_op.cc
浏览文件 @
338cbeaa
...
@@ -345,6 +345,26 @@ class MatMulGradKernel : public framework::OpKernel<T> {
...
@@ -345,6 +345,26 @@ class MatMulGradKernel : public framework::OpKernel<T> {
}
}
};
};
framework
::
DDim
GetDimForInput
(
const
framework
::
InferShapeContext
&
ctx
,
std
::
string
input_name
)
{
auto
shape
=
ctx
.
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"fused_reshape_"
+
input_name
);
auto
axis
=
ctx
.
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"fused_transpose_"
+
input_name
);
auto
dim
=
ctx
.
GetInputDim
(
input_name
);
PADDLE_ENFORCE_GT
(
dim
.
size
(),
0
,
platform
::
errors
::
InvalidArgument
(
"The Input(%s) has not been initialized properly. The "
"shape of Input(%s) = [%s]."
,
dim
));
if
(
!
shape
.
empty
()
&&
!
axis
.
empty
())
{
dim
=
dim
.
reshape
(
shape
).
transpose
(
axis
);
}
return
dim
;
}
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
MatMulDoubleGradKernel
:
public
framework
::
OpKernel
<
T
>
{
class
MatMulDoubleGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
...
@@ -559,8 +579,8 @@ class MatMulOp : public framework::OperatorWithKernel {
...
@@ -559,8 +579,8 @@ class MatMulOp : public framework::OperatorWithKernel {
OP_INOUT_CHECK
(
context
->
HasInput
(
"Y"
),
"Input"
,
"Y"
,
"matmul"
);
OP_INOUT_CHECK
(
context
->
HasInput
(
"Y"
),
"Input"
,
"Y"
,
"matmul"
);
OP_INOUT_CHECK
(
context
->
HasOutput
(
"Out"
),
"Output"
,
"Out"
,
"matmul"
);
OP_INOUT_CHECK
(
context
->
HasOutput
(
"Out"
),
"Output"
,
"Out"
,
"matmul"
);
auto
dim_x
=
context
->
GetInputDim
(
"X"
);
auto
dim_x
=
GetDimForInput
(
*
context
,
"X"
);
auto
dim_y
=
context
->
GetInputDim
(
"Y"
);
auto
dim_y
=
GetDimForInput
(
*
context
,
"Y"
);
#ifdef PADDLE_WITH_MKLDNN
#ifdef PADDLE_WITH_MKLDNN
// (jczaja): For NHWC execution output shape needs
// (jczaja): For NHWC execution output shape needs
...
@@ -661,6 +681,14 @@ class MatMulOp : public framework::OperatorWithKernel {
...
@@ -661,6 +681,14 @@ class MatMulOp : public framework::OperatorWithKernel {
framework
::
DDim
ddim_out
=
phi
::
make_ddim
(
dim_out
);
framework
::
DDim
ddim_out
=
phi
::
make_ddim
(
dim_out
);
#ifdef PADDLE_WITH_MKLDNN
auto
shape
=
context
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"fused_reshape_Out"
);
auto
axis
=
context
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"fused_transpose_Out"
);
if
(
!
shape
.
empty
()
&&
!
axis
.
empty
())
{
ddim_out
=
ddim_out
.
transpose
(
axis
).
reshape
(
shape
);
}
#endif
context
->
SetOutputDim
(
"Out"
,
ddim_out
);
context
->
SetOutputDim
(
"Out"
,
ddim_out
);
context
->
ShareLoD
(
"X"
,
"Out"
);
context
->
ShareLoD
(
"X"
,
"Out"
);
}
}
...
...
paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc
→
paddle/fluid/operators/mkldnn/matmul_
v2_
mkldnn_op.cc
浏览文件 @
338cbeaa
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/phi/backends/onednn/onednn_reuse.h"
#include "paddle/phi/backends/onednn/onednn_reuse.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
...
@@ -20,14 +21,13 @@ namespace {
...
@@ -20,14 +21,13 @@ namespace {
using
dnnl
::
memory
;
using
dnnl
::
memory
;
using
paddle
::
framework
::
ExecutionContext
;
using
paddle
::
framework
::
ExecutionContext
;
using
paddle
::
framework
::
GradVarName
;
using
paddle
::
framework
::
GradVarName
;
using
phi
::
DenseTensor
;
using
phi
::
OneDNNContext
;
using
phi
::
OneDNNContext
;
using
phi
::
vectorize
;
using
phi
::
vectorize
;
using
phi
::
funcs
::
OneDNNGetDataType
;
using
phi
::
funcs
::
OneDNNGetDataType
;
// Reshape a rank-3 tensor from P x M x N to (P * M) x N.
// Reshape a rank-3 tensor from P x M x N to (P * M) x N.
// Identity op if the tensor is not of rank 3.
// Identity op if the tensor is not of rank 3.
static
DenseTensor
FoldOuterDims
(
const
DenseTensor
&
input
)
{
static
phi
::
DenseTensor
FoldOuterDims
(
const
phi
::
DenseTensor
&
input
)
{
auto
output
=
input
;
auto
output
=
input
;
auto
in_dims
=
input
.
dims
();
auto
in_dims
=
input
.
dims
();
if
(
in_dims
.
size
()
==
3
)
{
if
(
in_dims
.
size
()
==
3
)
{
...
@@ -40,14 +40,14 @@ static DenseTensor FoldOuterDims(const DenseTensor &input) {
...
@@ -40,14 +40,14 @@ static DenseTensor FoldOuterDims(const DenseTensor &input) {
// (Warning: This requires transposing data and writes into new memory.)
// (Warning: This requires transposing data and writes into new memory.)
// Identity op if the tensor is not of rank 3.
// Identity op if the tensor is not of rank 3.
template
<
typename
T
>
template
<
typename
T
>
static
DenseTensor
FoldFirstAndLastDims
(
const
OneDNNContext
&
dev_ctx
,
static
phi
::
DenseTensor
FoldFirstAndLastDims
(
const
OneDNNContext
&
dev_ctx
,
const
DenseTensor
*
input
)
{
const
phi
::
DenseTensor
*
input
)
{
auto
input_dims
=
vectorize
(
input
->
dims
());
auto
input_dims
=
vectorize
(
input
->
dims
());
if
(
input_dims
.
size
()
!=
3
)
{
if
(
input_dims
.
size
()
!=
3
)
{
return
*
input
;
return
*
input
;
}
}
DenseTensor
output
;
phi
::
DenseTensor
output
;
output
.
Resize
({
input_dims
[
1
],
input_dims
[
0
],
input_dims
[
2
]});
output
.
Resize
({
input_dims
[
1
],
input_dims
[
0
],
input_dims
[
2
]});
auto
output_dims
=
vectorize
(
output
.
dims
());
auto
output_dims
=
vectorize
(
output
.
dims
());
...
@@ -71,15 +71,30 @@ static DenseTensor FoldFirstAndLastDims(const OneDNNContext &dev_ctx,
...
@@ -71,15 +71,30 @@ static DenseTensor FoldFirstAndLastDims(const OneDNNContext &dev_ctx,
return
output
;
return
output
;
}
}
phi
::
DDim
GetDimForInput
(
const
ExecutionContext
&
ctx
,
std
::
string
input_name
)
{
auto
shape
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"fused_reshape_"
+
input_name
);
auto
axis
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"fused_transpose_"
+
input_name
);
auto
input_dims
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
input_name
)
->
dims
();
if
(
!
shape
.
empty
()
&&
!
axis
.
empty
())
{
return
input_dims
.
reshape
(
shape
).
transpose
(
axis
);
}
return
input_dims
;
}
template
<
typename
XT
,
typename
YT
,
typename
OT
>
template
<
typename
XT
,
typename
YT
,
typename
OT
>
class
MatMulV
1One
DNNHandler
class
MatMulV
2MKL
DNNHandler
:
public
phi
::
funcs
::
OneDNNHandlerNoCachingT
<
XT
,
dnnl
::
matmul
>
{
:
public
phi
::
funcs
::
OneDNNHandlerNoCachingT
<
XT
,
dnnl
::
matmul
>
{
public:
public:
MatMulV
1One
DNNHandler
(
const
ExecutionContext
&
ctx
,
MatMulV
2MKL
DNNHandler
(
const
ExecutionContext
&
ctx
,
const
dnnl
::
engine
engine
,
const
dnnl
::
engine
engine
,
p
hi
::
Place
cpu_place
,
p
addle
::
platform
::
Place
cpu_place
,
const
std
::
vector
<
int64_t
>
&
x_org_dims
,
const
std
::
vector
<
int64_t
>
&
x_org_dims
,
const
std
::
vector
<
int64_t
>
&
y_org_dims
)
bool
trans_x
,
const
std
::
vector
<
int64_t
>
&
y_org_dims
,
bool
trans_y
,
bool
is_output_fused
,
const
std
::
vector
<
int64_t
>
&
x_strides_override
,
const
std
::
vector
<
int64_t
>
&
y_strides_override
)
:
phi
::
funcs
::
OneDNNHandlerNoCachingT
<
XT
,
dnnl
::
matmul
>
(
engine
,
:
phi
::
funcs
::
OneDNNHandlerNoCachingT
<
XT
,
dnnl
::
matmul
>
(
engine
,
cpu_place
)
{
cpu_place
)
{
// M X K * K X N
// M X K * K X N
...
@@ -90,8 +105,6 @@ class MatMulV1OneDNNHandler
...
@@ -90,8 +105,6 @@ class MatMulV1OneDNNHandler
const
int
H_idx
=
x_dims
.
size
()
-
2
;
const
int
H_idx
=
x_dims
.
size
()
-
2
;
const
int
W_idx
=
x_dims
.
size
()
-
1
;
const
int
W_idx
=
x_dims
.
size
()
-
1
;
auto
trans_x
=
ctx
.
Attr
<
bool
>
(
"transpose_X"
);
auto
trans_y
=
ctx
.
Attr
<
bool
>
(
"transpose_Y"
);
if
(
trans_x
)
std
::
swap
(
x_dims
[
H_idx
],
x_dims
[
W_idx
]);
if
(
trans_x
)
std
::
swap
(
x_dims
[
H_idx
],
x_dims
[
W_idx
]);
if
(
trans_y
)
std
::
swap
(
y_dims
[
H_idx
],
y_dims
[
W_idx
]);
if
(
trans_y
)
std
::
swap
(
y_dims
[
H_idx
],
y_dims
[
W_idx
]);
...
@@ -108,16 +121,24 @@ class MatMulV1OneDNNHandler
...
@@ -108,16 +121,24 @@ class MatMulV1OneDNNHandler
y_strides
.
reserve
(
x_dims
.
size
());
y_strides
.
reserve
(
x_dims
.
size
());
out_strides
.
reserve
(
x_dims
.
size
());
out_strides
.
reserve
(
x_dims
.
size
());
if
(
trans_x
)
{
if
(
!
x_strides_override
.
empty
()
)
{
x_strides
.
insert
(
x_strides
.
end
(),
{
M
*
K
,
1
,
M
})
;
x_strides
=
x_strides_override
;
}
else
{
}
else
{
x_strides
.
insert
(
x_strides
.
end
(),
{
M
*
K
,
K
,
1
});
if
(
!
trans_x
)
{
x_strides
.
insert
(
x_strides
.
end
(),
{
M
*
K
,
K
,
1
});
}
else
{
x_strides
.
insert
(
x_strides
.
end
(),
{
M
*
K
,
1
,
M
});
}
}
}
if
(
trans_y
)
{
if
(
!
y_strides_override
.
empty
()
)
{
y_strides
.
insert
(
y_strides
.
end
(),
{
N
*
K
,
1
,
K
})
;
y_strides
=
y_strides_override
;
}
else
{
}
else
{
y_strides
.
insert
(
y_strides
.
end
(),
{
N
*
K
,
N
,
1
});
if
(
!
trans_y
)
{
y_strides
.
insert
(
y_strides
.
end
(),
{
N
*
K
,
N
,
1
});
}
else
{
y_strides
.
insert
(
y_strides
.
end
(),
{
N
*
K
,
1
,
K
});
}
}
}
out_strides
.
insert
(
out_strides
.
end
(),
{
M
*
N
,
N
,
1
});
out_strides
.
insert
(
out_strides
.
end
(),
{
M
*
N
,
N
,
1
});
...
@@ -126,11 +147,20 @@ class MatMulV1OneDNNHandler
...
@@ -126,11 +147,20 @@ class MatMulV1OneDNNHandler
for
(
int
i
=
x_dims
.
size
()
-
4
;
i
>=
0
;
--
i
)
{
for
(
int
i
=
x_dims
.
size
()
-
4
;
i
>=
0
;
--
i
)
{
out_ddims
[
i
]
=
std
::
max
(
x_dims
[
i
],
y_dims
[
i
]);
out_ddims
[
i
]
=
std
::
max
(
x_dims
[
i
],
y_dims
[
i
]);
x_strides
[
i
]
=
x_dims
[
i
+
1
]
*
x_strides
[
i
+
1
];
if
(
x_strides_override
.
empty
())
{
y_strides
[
i
]
=
y_dims
[
i
+
1
]
*
y_strides
[
i
+
1
];
x_strides
[
i
]
=
x_dims
[
i
+
1
]
*
x_strides
[
i
+
1
];
}
if
(
y_strides_override
.
empty
())
{
y_strides
[
i
]
=
y_dims
[
i
+
1
]
*
y_strides
[
i
+
1
];
}
out_strides
[
i
]
=
out_ddims
[
i
+
1
]
*
out_strides
[
i
+
1
];
out_strides
[
i
]
=
out_ddims
[
i
+
1
]
*
out_strides
[
i
+
1
];
}
}
// TODO(jczaja): Why not for int8??
if
(
!
phi
::
funcs
::
is_int8
<
OT
>
()
&&
is_output_fused
)
{
out_strides
=
FakeTransposeStrides
(
out_ddims
);
}
auto
x_md
=
auto
x_md
=
memory
::
desc
(
x_dims
,
phi
::
funcs
::
OneDNNGetDataType
<
XT
>
(),
x_strides
);
memory
::
desc
(
x_dims
,
phi
::
funcs
::
OneDNNGetDataType
<
XT
>
(),
x_strides
);
auto
y_md
=
auto
y_md
=
...
@@ -138,25 +168,164 @@ class MatMulV1OneDNNHandler
...
@@ -138,25 +168,164 @@ class MatMulV1OneDNNHandler
auto
out_md
=
memory
::
desc
(
auto
out_md
=
memory
::
desc
(
out_ddims
,
phi
::
funcs
::
OneDNNGetDataType
<
OT
>
(),
out_strides
);
out_ddims
,
phi
::
funcs
::
OneDNNGetDataType
<
OT
>
(),
out_strides
);
const
dnnl
::
primitive_attr
matmul_attrs
=
CreateMatmulAttrs
(
ctx
);
this
->
AcquireForwardPrimitiveDescriptor
(
matmul_attrs
,
x_md
,
y_md
,
out_md
);
}
void
AppendActivation
(
const
ExecutionContext
&
ctx
,
dnnl
::
post_ops
&
post_ops
,
// NOLINT
float
activation_scale
=
1.0
f
)
{
const
auto
invalid_attribute
=
ctx
.
HasAttr
(
"fuse_activation"
)
?
ctx
.
Attr
<
std
::
string
>
(
"fuse_activation"
).
empty
()
:
true
;
if
(
invalid_attribute
)
return
;
const
auto
fuse_activation
=
ctx
.
Attr
<
std
::
string
>
(
"fuse_activation"
);
const
auto
fuse_alpha
=
ctx
.
HasAttr
(
"fuse_alpha"
)
?
ctx
.
Attr
<
float
>
(
"fuse_alpha"
)
:
0.0
f
;
const
auto
fuse_beta
=
ctx
.
HasAttr
(
"fuse_beta"
)
?
ctx
.
Attr
<
float
>
(
"fuse_beta"
)
:
0.0
f
;
if
(
fuse_activation
==
"hard_sigmoid"
)
{
post_ops
.
append_eltwise
(
activation_scale
,
dnnl
::
algorithm
::
eltwise_linear
,
fuse_alpha
,
fuse_beta
);
post_ops
.
append_eltwise
(
activation_scale
,
dnnl
::
algorithm
::
eltwise_clip
,
0.0
f
,
1.0
f
);
}
else
{
const
std
::
unordered_map
<
std
::
string
,
dnnl
::
algorithm
>
activation_map
=
{
{
"abs"
,
dnnl
::
algorithm
::
eltwise_abs
},
{
"clip"
,
dnnl
::
algorithm
::
eltwise_clip
},
{
"gelu"
,
dnnl
::
algorithm
::
eltwise_gelu_erf
},
{
"gelu_erf"
,
dnnl
::
algorithm
::
eltwise_gelu_erf
},
{
"gelu_tanh"
,
dnnl
::
algorithm
::
eltwise_gelu_tanh
},
{
"hard_swish"
,
dnnl
::
algorithm
::
eltwise_hardswish
},
{
"leaky_relu"
,
dnnl
::
algorithm
::
eltwise_relu
},
{
"mish"
,
dnnl
::
algorithm
::
eltwise_mish
},
{
"relu"
,
dnnl
::
algorithm
::
eltwise_relu
},
{
"relu6"
,
dnnl
::
algorithm
::
eltwise_bounded_relu
},
{
"sigmoid"
,
dnnl
::
algorithm
::
eltwise_logistic
},
{
"sqrt"
,
dnnl
::
algorithm
::
eltwise_sqrt
},
{
"swish"
,
dnnl
::
algorithm
::
eltwise_swish
},
{
"tanh"
,
dnnl
::
algorithm
::
eltwise_tanh
}};
const
auto
&
activation_type
=
activation_map
.
find
(
fuse_activation
);
PADDLE_ENFORCE_NE
(
activation_type
,
activation_map
.
end
(),
phi
::
errors
::
InvalidArgument
(
"Activation '%s' not found in oneDNN algorithms mapper"
,
fuse_activation
));
post_ops
.
append_eltwise
(
activation_scale
,
activation_type
->
second
,
fuse_alpha
,
fuse_beta
);
}
}
float
ComputeOutputScale
(
const
ExecutionContext
&
ctx
)
{
float
alpha
=
ctx
.
HasAttr
(
"alpha"
)
?
ctx
.
Attr
<
float
>
(
"alpha"
)
:
1.0
f
;
if
(
ctx
.
HasAttr
(
"Scale_x"
)
&&
ctx
.
HasAttr
(
"Scale_y"
)
&&
ctx
.
HasAttr
(
"Scale_out"
))
{
float
scale_x
=
ctx
.
Attr
<
float
>
(
"Scale_x"
);
float
scale_y
=
ctx
.
Attr
<
float
>
(
"Scale_y"
);
bool
force_fp32_out
=
ctx
.
HasAttr
(
"force_fp32_output"
)
?
ctx
.
Attr
<
bool
>
(
"force_fp32_output"
)
:
false
;
float
scale_out
=
force_fp32_out
?
1.
f
:
ctx
.
Attr
<
float
>
(
"Scale_out"
);
alpha
*=
scale_out
/
(
scale_x
*
scale_y
);
}
return
alpha
;
}
dnnl
::
primitive_attr
CreateMatmulAttrs
(
const
ExecutionContext
&
ctx
)
{
dnnl
::
primitive_attr
matmul_attrs
;
dnnl
::
primitive_attr
matmul_attrs
;
dnnl
::
post_ops
post_operations
;
dnnl
::
post_ops
post_operations
;
float
scale_out
=
ComputeOutputScale
(
ctx
);
float
scale_out
=
ComputeOutputScale
(
ctx
);
if
(
scale_out
!=
1.0
f
)
{
if
(
scale_out
!=
1.0
f
)
{
matmul_attrs
.
set_output_scales
(
0
,
{
scale_out
});
matmul_attrs
.
set_output_scales
(
0
,
{
scale_out
});
}
}
if
(
ctx
.
HasInput
(
"ResidualData"
))
{
auto
*
residual_data
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"ResidualData"
);
auto
residual_data_tz
=
phi
::
vectorize
(
residual_data
->
dims
());
auto
residual_data_md
=
memory
::
desc
(
residual_data_tz
,
phi
::
funcs
::
OneDNNGetDataType
<
OT
>
(),
dnnl
::
memory
::
format_tag
::
any
);
post_operations
.
append_binary
(
dnnl
::
algorithm
::
binary_add
,
residual_data_md
);
if
(
ctx
.
HasAttr
(
"Scale_in_eltwise"
))
{
float
sum_scale
=
scale_out
/
ctx
.
Attr
<
float
>
(
"Scale_in_eltwise"
);
post_operations
.
append_sum
(
sum_scale
);
}
}
AppendActivation
(
ctx
,
post_operations
);
if
(
ctx
.
HasAttr
(
"fused_output_scale"
))
{
float
scale_alpha
=
ctx
.
Attr
<
float
>
(
"fused_output_scale"
);
post_operations
.
append_eltwise
(
1.0
,
dnnl
::
algorithm
::
eltwise_linear
,
scale_alpha
,
0.0
f
);
}
matmul_attrs
.
set_post_ops
(
post_operations
);
matmul_attrs
.
set_post_ops
(
post_operations
);
return
matmul_attrs
;
}
this
->
AcquireForwardPrimitiveDescriptor
(
matmul_attrs
,
x_md
,
y_md
,
out_md
);
std
::
vector
<
int64_t
>
FakeTransposeStrides
(
const
std
::
vector
<
int64_t
>
&
matmul_out_dims
)
const
{
// fuse matmul_v2 + transpose + reshape guarantees that output is 4D and
// transpose axis are: {0, 2, 1, 3}
std
::
vector
<
int64_t
>
transpose_axis
=
{
0
,
2
,
1
,
3
};
std
::
vector
<
int64_t
>
fake_strides
(
transpose_axis
.
size
());
int
ndims
=
static_cast
<
int
>
(
transpose_axis
.
size
());
int
total_stride
=
1
;
for
(
int
i
=
ndims
-
1
;
i
>=
0
;
--
i
)
{
fake_strides
[
transpose_axis
[
i
]]
=
total_stride
;
total_stride
*=
matmul_out_dims
[
transpose_axis
[
i
]];
}
return
fake_strides
;
}
}
MatMulV1OneDNNHandler
(
const
dnnl
::
engine
engine
,
std
::
shared_ptr
<
memory
>
AcquireWeightsMemory
(
const
phi
::
DenseTensor
*
input
)
{
phi
::
Place
cpu_place
,
const
YT
*
input_data
=
input
->
data
<
YT
>
();
DenseTensor
*
x
,
return
this
->
AcquireMemoryFromPrimitive
(
bool
trans_x
,
this
->
fwd_pd_
->
weights_desc
(),
DenseTensor
*
y
,
phi
::
funcs
::
to_void_cast
<
YT
>
(
input_data
));
bool
trans_y
,
}
DenseTensor
*
out
,
float
scale
)
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireDstMemory
(
phi
::
DenseTensor
*
output
)
{
// We cannot use base AcquireDstMemory as it makes an allocation request
// base on DST memory primitive size. This is fine in general, but in MatMul
// we have primitive that covers only one batch of Data and then shift
// pointer for every new batch. Hence phi::DenseTensor size is bigger that
// dst memory primitive size. So would we request less memory that is there
// and it triggers an assertion. So as there is no 'any' format here we can
// leave default size of phi::DenseTensor as computed in ComputeInferShape
OT
*
ptr
=
output
->
mutable_data
<
OT
>
(
this
->
place_
);
return
this
->
AcquireMemoryFromPrimitive
(
this
->
fwd_pd_
->
dst_desc
(),
ptr
);
}
};
template
<
typename
XT
,
typename
YT
,
typename
OT
>
class
MatMulMKLDNNHandler
:
public
phi
::
funcs
::
OneDNNHandlerNoCachingT
<
XT
,
dnnl
::
matmul
>
{
public:
MatMulMKLDNNHandler
(
const
dnnl
::
engine
engine
,
paddle
::
platform
::
Place
cpu_place
,
phi
::
DenseTensor
*
x
,
bool
trans_x
,
phi
::
DenseTensor
*
y
,
bool
trans_y
,
phi
::
DenseTensor
*
out
,
float
scale
)
:
phi
::
funcs
::
OneDNNHandlerNoCachingT
<
XT
,
dnnl
::
matmul
>
(
engine
,
:
phi
::
funcs
::
OneDNNHandlerNoCachingT
<
XT
,
dnnl
::
matmul
>
(
engine
,
cpu_place
)
{
cpu_place
)
{
auto
mat_dim_x
=
phi
::
funcs
::
CreateMatrixDescriptor
(
x
->
dims
(),
0
,
trans_x
);
auto
mat_dim_x
=
phi
::
funcs
::
CreateMatrixDescriptor
(
x
->
dims
(),
0
,
trans_x
);
...
@@ -175,10 +344,10 @@ class MatMulV1OneDNNHandler
...
@@ -175,10 +344,10 @@ class MatMulV1OneDNNHandler
memory
::
dims
out_dims
=
{
out_bs
,
M
,
N
};
memory
::
dims
out_dims
=
{
out_bs
,
M
,
N
};
memory
::
dims
x_strides
=
memory
::
dims
x_strides
=
trans_x
?
memory
::
dims
{
M
*
K
,
1
,
M
}
:
memory
::
dims
{
M
*
K
,
K
,
1
};
!
trans_x
?
memory
::
dims
{
M
*
K
,
K
,
1
}
:
memory
::
dims
{
M
*
K
,
1
,
M
};
memory
::
dims
y_strides
=
memory
::
dims
y_strides
=
trans_y
?
memory
::
dims
{
N
*
K
,
1
,
K
}
:
memory
::
dims
{
N
*
K
,
N
,
1
};
!
trans_y
?
memory
::
dims
{
N
*
K
,
N
,
1
}
:
memory
::
dims
{
N
*
K
,
1
,
K
};
memory
::
dims
out_strides
=
memory
::
dims
{
M
*
N
,
N
,
1
};
memory
::
dims
out_strides
=
memory
::
dims
{
M
*
N
,
N
,
1
};
auto
x_md
=
memory
::
desc
(
x_dims
,
OneDNNGetDataType
<
XT
>
(),
x_strides
);
auto
x_md
=
memory
::
desc
(
x_dims
,
OneDNNGetDataType
<
XT
>
(),
x_strides
);
...
@@ -191,41 +360,65 @@ class MatMulV1OneDNNHandler
...
@@ -191,41 +360,65 @@ class MatMulV1OneDNNHandler
this
->
AcquireForwardPrimitiveDescriptor
(
attrs
,
x_md
,
y_md
,
out_md
);
this
->
AcquireForwardPrimitiveDescriptor
(
attrs
,
x_md
,
y_md
,
out_md
);
}
}
float
ComputeOutputScale
(
const
ExecutionContext
&
ctx
)
{
std
::
shared_ptr
<
memory
>
AcquireWeightsMemory
(
const
phi
::
DenseTensor
*
input
)
{
float
alpha
=
ctx
.
Attr
<
float
>
(
"alpha"
);
if
(
ctx
.
HasAttr
(
"Scale_x"
)
&&
ctx
.
HasAttr
(
"Scale_y"
)
&&
ctx
.
HasAttr
(
"Scale_out"
))
{
float
scale_x
=
ctx
.
Attr
<
float
>
(
"Scale_x"
);
float
scale_y
=
ctx
.
Attr
<
float
>
(
"Scale_y"
);
bool
force_fp32_out
=
ctx
.
HasAttr
(
"force_fp32_output"
)
?
ctx
.
Attr
<
bool
>
(
"force_fp32_output"
)
:
false
;
float
scale_out
=
force_fp32_out
?
1.
f
:
ctx
.
Attr
<
float
>
(
"Scale_out"
);
alpha
*=
scale_out
/
(
scale_x
*
scale_y
);
}
return
alpha
;
}
std
::
shared_ptr
<
memory
>
AcquireWeightsMemory
(
const
DenseTensor
*
input
)
{
const
YT
*
input_data
=
input
->
data
<
YT
>
();
const
YT
*
input_data
=
input
->
data
<
YT
>
();
return
this
->
AcquireMemoryFromPrimitive
(
return
this
->
AcquireMemoryFromPrimitive
(
this
->
fwd_pd_
->
weights_desc
(),
this
->
fwd_pd_
->
weights_desc
(),
phi
::
funcs
::
to_void_cast
<
YT
>
(
input_data
));
phi
::
funcs
::
to_void_cast
<
YT
>
(
input_data
));
}
}
std
::
shared_ptr
<
memory
>
AcquireDstMemory
(
DenseTensor
*
output
)
{
public:
void
Execute
(
const
phi
::
DenseTensor
*
x
,
const
phi
::
DenseTensor
*
y
,
phi
::
DenseTensor
*
out
)
{
const
auto
src_memory_p
=
this
->
AcquireSrcMemory
(
x
);
const
auto
weights_memory_p
=
this
->
AcquireWeightsMemory
(
y
);
const
auto
dst_memory_p
=
this
->
AcquireDstMemory
(
out
);
auto
matmul_p
=
this
->
AcquireForwardPrimitive
();
std
::
unordered_map
<
int
,
dnnl
::
memory
>
matmul_args
=
{
{
DNNL_ARG_SRC
,
*
src_memory_p
},
{
DNNL_ARG_WEIGHTS
,
*
weights_memory_p
},
{
DNNL_ARG_DST
,
*
dst_memory_p
}};
auto
&
astream
=
OneDNNContext
::
tls
().
get_stream
();
// Simulate batch matmul by processing in loop
void
*
x_ptr
=
src_memory_p
->
get_data_handle
();
void
*
y_ptr
=
weights_memory_p
->
get_data_handle
();
void
*
out_ptr
=
dst_memory_p
->
get_data_handle
();
auto
offsets
=
std
::
make_tuple
(
x_offset_
,
y_offset_
,
out_offset_
);
for
(
uint16_t
i
=
0
;
i
<
batch_size_
;
++
i
)
{
src_memory_p
->
set_data_handle
(
x_ptr
);
weights_memory_p
->
set_data_handle
(
y_ptr
);
dst_memory_p
->
set_data_handle
(
out_ptr
);
matmul_p
->
execute
(
astream
,
matmul_args
);
x_ptr
=
static_cast
<
char
*>
(
x_ptr
)
+
std
::
get
<
0
>
(
offsets
);
y_ptr
=
static_cast
<
char
*>
(
y_ptr
)
+
std
::
get
<
1
>
(
offsets
);
out_ptr
=
static_cast
<
char
*>
(
out_ptr
)
+
std
::
get
<
2
>
(
offsets
);
}
astream
.
wait
();
out
->
set_mem_desc
(
dst_memory_p
->
get_desc
().
reshape
(
out
->
dims
()));
}
std
::
shared_ptr
<
dnnl
::
memory
>
AcquireDstMemory
(
phi
::
DenseTensor
*
output
)
{
// We cannot use base AcquireDstMemory as it makes an allocation request
// We cannot use base AcquireDstMemory as it makes an allocation request
// base on DST memory primitive size. This is fine in general, but in MatMul
// base on DST memory primitive size. This is fine in general, but in MatMul
// we have primitive that covers only one batch of Data and then shift
// we have primitive that covers only one batch of Data and then shift
// pointer for every new batch. Hence DenseTensor size is bigger that
// pointer for every new batch. Hence
phi::
DenseTensor size is bigger that
// dst memory primitive size. So would we request less memory that is there
// dst memory primitive size. So would we request less memory that is there
// and it triggers an assertion. So as there is no 'any' format here we can
// and it triggers an assertion. So as there is no 'any' format here we can
// leave default size of DenseTensor as computed in ComputeInferShape
// leave default size of
phi::
DenseTensor as computed in ComputeInferShape
OT
*
ptr
=
output
->
mutable_data
<
OT
>
(
this
->
place_
);
OT
*
ptr
=
output
->
mutable_data
<
OT
>
(
this
->
place_
);
return
this
->
AcquireMemoryFromPrimitive
(
this
->
fwd_pd_
->
dst_desc
(),
ptr
);
return
this
->
AcquireMemoryFromPrimitive
(
this
->
fwd_pd_
->
dst_desc
(),
ptr
);
}
}
private:
private:
uint32_t
x_offset_
;
uint32_t
y_offset_
;
uint32_t
out_offset_
;
uint16_t
batch_size_
;
uint16_t
batch_size_
;
};
};
...
@@ -236,7 +429,7 @@ class MatMulV1OneDNNHandler
...
@@ -236,7 +429,7 @@ class MatMulV1OneDNNHandler
* If transposed, `H,W` will be swapped.
* If transposed, `H,W` will be swapped.
*/
*/
static
void
ReshapeTensorToMatrixSequence
(
static
void
ReshapeTensorToMatrixSequence
(
DenseTensor
*
x
,
const
phi
::
funcs
::
MatDescriptor
&
descriptor
)
{
phi
::
DenseTensor
*
x
,
const
phi
::
funcs
::
MatDescriptor
&
descriptor
)
{
int64_t
h
,
w
;
int64_t
h
,
w
;
h
=
descriptor
.
height_
;
h
=
descriptor
.
height_
;
w
=
descriptor
.
width_
;
w
=
descriptor
.
width_
;
...
@@ -264,9 +457,9 @@ static void ReshapeTensorToMatrixSequence(
...
@@ -264,9 +457,9 @@ static void ReshapeTensorToMatrixSequence(
* If any of `X` and `Y` has batch size BatchSize, the out will have the
* If any of `X` and `Y` has batch size BatchSize, the out will have the
* BatchSize.
* BatchSize.
*/
*/
static
void
ReshapeXYOutToMatrixSequence
(
DenseTensor
*
x
,
static
void
ReshapeXYOutToMatrixSequence
(
phi
::
DenseTensor
*
x
,
DenseTensor
*
y
,
phi
::
DenseTensor
*
y
,
DenseTensor
*
out
,
phi
::
DenseTensor
*
out
,
bool
trans_x
,
bool
trans_x
,
bool
trans_y
)
{
bool
trans_y
)
{
auto
x_dim
=
phi
::
funcs
::
RowMatrixDimsFromVector
(
x
->
dims
());
auto
x_dim
=
phi
::
funcs
::
RowMatrixDimsFromVector
(
x
->
dims
());
...
@@ -293,22 +486,22 @@ std::vector<int64_t> Transpose(const std::vector<int64_t> &x,
...
@@ -293,22 +486,22 @@ std::vector<int64_t> Transpose(const std::vector<int64_t> &x,
auto
axis_set
=
std
::
set
<
int
>
(
axis
.
begin
(),
axis
.
end
());
auto
axis_set
=
std
::
set
<
int
>
(
axis
.
begin
(),
axis
.
end
());
PADDLE_ENFORCE_EQ
(
axis_set
.
size
(),
PADDLE_ENFORCE_EQ
(
axis_set
.
size
(),
axis_size
,
axis_size
,
p
hi
::
errors
::
InvalidArgument
(
p
addle
::
platform
::
errors
::
InvalidArgument
(
"In an axis array, elements must be unique."
));
"In an axis array, elements must be unique."
));
PADDLE_ENFORCE_EQ
(
PADDLE_ENFORCE_EQ
(
in_rank
,
in_rank
,
axis_size
,
axis_size
,
paddle
::
platform
::
errors
::
InvalidArgument
(
phi
::
errors
::
InvalidArgument
(
"The input dimension's size "
"The input dimension's size "
"should be equal to the axis's size. "
"should be equal to the axis's size. "
"But received dimension is %d, "
"But received dimension is %d, "
"axis's size is %d"
,
"axis's size is %d"
,
in_rank
,
in_rank
,
axis_size
));
axis_size
));
PADDLE_ENFORCE_LT
(
*
std
::
max_element
(
axis
.
begin
(),
axis
.
end
()),
PADDLE_ENFORCE_LT
(
*
std
::
max_element
(
axis
.
begin
(),
axis
.
end
()),
axis_size
,
axis_size
,
p
hi
::
errors
::
InvalidArgument
(
p
addle
::
platform
::
errors
::
InvalidArgument
(
"Axis values must be ranging from 0 to (dims - 1)."
));
"Axis values must be ranging from 0 to (dims - 1)."
));
std
::
vector
<
int64_t
>
new_x
(
x
.
size
());
std
::
vector
<
int64_t
>
new_x
(
x
.
size
());
...
@@ -318,16 +511,73 @@ std::vector<int64_t> Transpose(const std::vector<int64_t> &x,
...
@@ -318,16 +511,73 @@ std::vector<int64_t> Transpose(const std::vector<int64_t> &x,
return
new_x
;
return
new_x
;
}
}
std
::
vector
<
int64_t
>
GetInputStrides
(
const
ExecutionContext
&
ctx
,
const
std
::
string
input_name
)
{
auto
shape
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"fused_reshape_"
+
input_name
);
auto
axis
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"fused_transpose_"
+
input_name
);
auto
input_dims
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
input_name
)
->
dims
();
auto
new_dims
=
input_dims
;
if
(
!
shape
.
empty
()
&&
!
axis
.
empty
())
{
new_dims
=
input_dims
.
reshape
(
shape
).
transpose
(
axis
);
}
auto
&
MatrixDimsFromVector
=
input_name
==
"X"
?
phi
::
funcs
::
RowMatrixDimsFromVector
:
phi
::
funcs
::
ColumnMatrixDimsFromVector
;
phi
::
funcs
::
MatDescriptor
mat_dim
=
phi
::
funcs
::
CreateMatrixDescriptor
(
MatrixDimsFromVector
(
new_dims
),
0
,
ctx
.
HasAttr
(
"trans_x"
)
?
ctx
.
Attr
<
bool
>
(
std
::
string
(
"trans_"
)
+
static_cast
<
char
>
(
std
::
tolower
(
input_name
[
0
])))
:
ctx
.
Attr
<
bool
>
(
std
::
string
(
"transpose_"
)
+
input_name
[
0
]));
std
::
vector
<
int64_t
>
strides
;
if
(
!
shape
.
empty
())
{
auto
shape2
=
input_dims
.
reshape
(
shape
);
strides
.
push_back
(
1
);
for
(
auto
i
=
shape2
.
size
()
-
1
;
i
>
0
;
--
i
)
{
strides
.
insert
(
strides
.
begin
(),
strides
.
front
()
*
static_cast
<
int64_t
>
(
shape2
[
i
]));
}
strides
=
Transpose
(
strides
,
axis
);
if
(
shape
.
size
()
==
2
)
strides
.
insert
(
strides
.
begin
(),
static_cast
<
int64_t
>
(
shape
[
0
]
*
shape
[
1
]));
mat_dim
.
stride_
=
strides
[
0
];
if
(
mat_dim
.
trans_
)
std
::
swap
(
*
strides
.
rbegin
(),
*
(
++
strides
.
rbegin
()));
}
return
strides
;
}
bool
IsOutputFused
(
const
ExecutionContext
&
ctx
)
{
auto
&
fused_reshape_Out
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"fused_reshape_Out"
);
auto
&
fused_transpose_Out
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"fused_transpose_Out"
);
return
!
fused_reshape_Out
.
empty
()
&&
!
fused_transpose_Out
.
empty
();
}
template
<
typename
T
,
typename
T_out
>
template
<
typename
T
,
typename
T_out
>
void
ExecuteMatMul
(
const
ExecutionContext
&
ctx
,
void
ExecuteMatMulV2
(
const
ExecutionContext
&
ctx
,
const
DenseTensor
*
x
,
const
dnnl
::
engine
onednn_engine
,
const
std
::
vector
<
int64_t
>
&
x_dims
,
const
phi
::
DenseTensor
*
x
,
const
DenseTensor
*
y
,
const
std
::
vector
<
int64_t
>
&
x_dims
,
const
std
::
vector
<
int64_t
>
&
y_dims
,
bool
trans_x
,
DenseTensor
*
out
)
{
const
phi
::
DenseTensor
*
y
,
const
auto
&
dev_ctx
=
ctx
.
template
device_context
<
OneDNNContext
>();
const
std
::
vector
<
int64_t
>
&
y_dims
,
MatMulV1OneDNNHandler
<
T
,
T
,
T_out
>
handler
(
bool
trans_y
,
ctx
,
dev_ctx
.
GetEngine
(),
ctx
.
GetPlace
(),
x_dims
,
y_dims
);
phi
::
DenseTensor
*
out
)
{
std
::
vector
<
int64_t
>
x_strides_override
=
GetInputStrides
(
ctx
,
"X"
);
std
::
vector
<
int64_t
>
y_strides_override
=
GetInputStrides
(
ctx
,
"Y"
);
MatMulV2MKLDNNHandler
<
T
,
T
,
T_out
>
handler
(
ctx
,
onednn_engine
,
ctx
.
GetPlace
(),
x_dims
,
trans_x
,
y_dims
,
trans_y
,
IsOutputFused
(
ctx
),
x_strides_override
,
y_strides_override
);
const
auto
src_memory_p
=
handler
.
AcquireSrcMemory
(
x
);
const
auto
src_memory_p
=
handler
.
AcquireSrcMemory
(
x
);
const
auto
weights_memory_p
=
handler
.
AcquireWeightsMemory
(
y
);
const
auto
weights_memory_p
=
handler
.
AcquireWeightsMemory
(
y
);
...
@@ -340,23 +590,38 @@ void ExecuteMatMul(const ExecutionContext &ctx,
...
@@ -340,23 +590,38 @@ void ExecuteMatMul(const ExecutionContext &ctx,
{
DNNL_ARG_WEIGHTS
,
*
weights_memory_p
},
{
DNNL_ARG_WEIGHTS
,
*
weights_memory_p
},
{
DNNL_ARG_DST
,
*
dst_memory_p
}};
{
DNNL_ARG_DST
,
*
dst_memory_p
}};
if
(
ctx
.
HasInput
(
"ResidualData"
))
{
auto
*
residual_data
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"ResidualData"
);
const
auto
residual_data_memory_p
=
handler
.
AcquireSrcMemory
(
residual_data
);
matmul_args
.
insert
({
DNNL_ARG_ATTR_MULTIPLE_POST_OP
(
0
)
|
DNNL_ARG_SRC_1
,
*
residual_data_memory_p
});
}
auto
&
astream
=
OneDNNContext
::
tls
().
get_stream
();
auto
&
astream
=
OneDNNContext
::
tls
().
get_stream
();
matmul_p
->
execute
(
astream
,
matmul_args
);
matmul_p
->
execute
(
astream
,
matmul_args
);
astream
.
wait
();
astream
.
wait
();
out
->
set_mem_desc
(
// TODO(jczaja): Explain why int8 format of dst is ABCD and do not need
dst_memory_p
->
get_desc
().
reshape
(
vectorize
<
int64_t
>
(
out
->
dims
())));
// permute
if
(
IsOutputFused
(
ctx
)
&&
!
phi
::
funcs
::
is_int8
<
T_out
>
())
{
auto
axis
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"fused_transpose_Out"
);
auto
permuted_md
=
dst_memory_p
->
get_desc
().
permute_axes
(
axis
);
out
->
set_mem_desc
(
permuted_md
.
reshape
(
vectorize
<
int64_t
>
(
out
->
dims
())));
}
else
{
out
->
set_mem_desc
(
dst_memory_p
->
get_desc
().
reshape
(
vectorize
<
int64_t
>
(
out
->
dims
())));
}
}
}
template
<
typename
T
>
template
<
typename
T
>
class
MatMul
V1One
DNNKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
class
MatMul
MKL
DNNKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
ExecutionContext
&
ctx
)
const
override
{
if
(
ctx
.
HasAttr
(
"head_number"
))
{
if
(
ctx
.
HasAttr
(
"head_number"
))
{
PADDLE_ENFORCE_EQ
(
PADDLE_ENFORCE_EQ
(
ctx
.
Attr
<
int
>
(
"head_number"
),
ctx
.
Attr
<
int
>
(
"head_number"
),
1
,
1
,
p
hi
::
errors
::
Unimplemented
(
p
addle
::
platform
::
errors
::
Unimplemented
(
"oneDNN matmul doesn't support multiple heads. Expected "
"oneDNN matmul doesn't support multiple heads. Expected "
"head_number=1. But received `head_number` is %d"
,
"head_number=1. But received `head_number` is %d"
,
ctx
.
Attr
<
int
>
(
"head_number"
)));
ctx
.
Attr
<
int
>
(
"head_number"
)));
...
@@ -368,12 +633,19 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -368,12 +633,19 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
:
false
;
:
false
;
constexpr
bool
fuse_relu
=
false
;
// TODO(intel): Enable eltwise fuses
constexpr
bool
fuse_relu
=
false
;
// TODO(intel): Enable eltwise fuses
auto
*
x
=
ctx
.
Input
<
DenseTensor
>
(
"X"
);
const
auto
&
dev_ctx
=
ctx
.
template
device_context
<
OneDNNContext
>();
auto
*
y
=
ctx
.
Input
<
DenseTensor
>
(
"Y"
);
const
auto
&
onednn_engine
=
dev_ctx
.
GetEngine
();
auto
*
out
=
ctx
.
Output
<
DenseTensor
>
(
"Out"
);
auto
*
x
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"X"
);
auto
*
y
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Y"
);
auto
*
out
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
"Out"
);
bool
trans_x
=
ctx
.
HasAttr
(
"trans_x"
)
?
ctx
.
Attr
<
bool
>
(
"trans_x"
)
:
ctx
.
Attr
<
bool
>
(
"transpose_X"
);
bool
trans_y
=
ctx
.
HasAttr
(
"trans_y"
)
?
ctx
.
Attr
<
bool
>
(
"trans_y"
)
:
ctx
.
Attr
<
bool
>
(
"transpose_Y"
);
auto
x_dims
=
vectorize
(
x
->
dims
(
));
auto
x_dims
=
vectorize
(
GetDimForInput
(
ctx
,
"X"
));
auto
y_dims
=
vectorize
(
y
->
dims
(
));
auto
y_dims
=
vectorize
(
GetDimForInput
(
ctx
,
"Y"
));
int
ndims
=
std
::
max
(
x_dims
.
size
(),
y_dims
.
size
());
int
ndims
=
std
::
max
(
x_dims
.
size
(),
y_dims
.
size
());
ndims
=
std
::
max
(
ndims
,
3
);
ndims
=
std
::
max
(
ndims
,
3
);
...
@@ -381,26 +653,58 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -381,26 +653,58 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
std
::
vector
<
int64_t
>
x_bd_dims
(
ndims
,
1
);
std
::
vector
<
int64_t
>
x_bd_dims
(
ndims
,
1
);
std
::
vector
<
int64_t
>
y_bd_dims
(
ndims
,
1
);
std
::
vector
<
int64_t
>
y_bd_dims
(
ndims
,
1
);
CalculateMatrixDims
(
x_dims
,
y_dims
,
&
x_bd_dims
,
&
y_bd_dims
,
out
);
CalculateMatrixDims
(
ctx
,
x_dims
,
y_dims
,
&
x_bd_dims
,
&
y_bd_dims
,
out
);
if
(
force_fp32_output
||
((
!
is_int8
)
&&
(
!
is_bfloat16
)))
{
if
(
force_fp32_output
||
((
!
is_int8
)
&&
(
!
is_bfloat16
)))
{
ExecuteMatMul
<
T
,
float
>
(
ctx
,
x
,
x_bd_dims
,
y
,
y_bd_dims
,
out
);
ExecuteMatMulV2
<
T
,
float
>
(
ctx
,
onednn_engine
,
x
,
x_bd_dims
,
trans_x
,
y
,
y_bd_dims
,
trans_y
,
out
);
}
else
if
(
is_bfloat16
)
{
}
else
if
(
is_bfloat16
)
{
ExecuteMatMul
<
T
,
phi
::
dtype
::
bfloat16
>
(
ExecuteMatMulV2
<
T
,
paddle
::
platform
::
bfloat16
>
(
ctx
,
ctx
,
x
,
x_bd_dims
,
y
,
y_bd_dims
,
out
);
onednn_engine
,
x
,
x_bd_dims
,
trans_x
,
y
,
y_bd_dims
,
trans_y
,
out
);
}
else
if
(
fuse_relu
)
{
}
else
if
(
fuse_relu
)
{
ExecuteMatMul
<
T
,
uint8_t
>
(
ctx
,
x
,
x_bd_dims
,
y
,
y_bd_dims
,
out
);
ExecuteMatMulV2
<
T
,
uint8_t
>
(
ctx
,
onednn_engine
,
x
,
x_bd_dims
,
trans_x
,
y
,
y_bd_dims
,
trans_y
,
out
);
}
else
{
}
else
{
ExecuteMatMul
<
T
,
int8_t
>
(
ctx
,
x
,
x_bd_dims
,
y
,
y_bd_dims
,
out
);
ExecuteMatMulV2
<
T
,
int8_t
>
(
ctx
,
onednn_engine
,
x
,
x_bd_dims
,
trans_x
,
y
,
y_bd_dims
,
trans_y
,
out
);
}
}
}
}
private:
private:
void
CalculateMatrixDims
(
const
std
::
vector
<
int64_t
>
&
x_dims
,
void
CalculateMatrixDims
(
const
ExecutionContext
&
ctx
,
const
std
::
vector
<
int64_t
>
&
x_dims
,
const
std
::
vector
<
int64_t
>
&
y_dims
,
const
std
::
vector
<
int64_t
>
&
y_dims
,
std
::
vector
<
int64_t
>
*
x_bd_dims
,
std
::
vector
<
int64_t
>
*
x_bd_dims
,
std
::
vector
<
int64_t
>
*
y_bd_dims
,
std
::
vector
<
int64_t
>
*
y_bd_dims
,
DenseTensor
*
out
)
const
{
phi
::
DenseTensor
*
out
)
const
{
if
(
x_dims
.
size
()
==
1
)
{
if
(
x_dims
.
size
()
==
1
)
{
(
*
x_bd_dims
)[(
*
x_bd_dims
).
size
()
-
1
]
=
x_dims
[
0
];
(
*
x_bd_dims
)[(
*
x_bd_dims
).
size
()
-
1
]
=
x_dims
[
0
];
}
else
if
(
x_dims
.
size
()
==
2
)
{
}
else
if
(
x_dims
.
size
()
==
2
)
{
...
@@ -422,15 +726,15 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -422,15 +726,15 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
}
}
}
}
if
(
x_dims
.
size
()
>
2
&&
y_dims
.
size
()
>
2
)
{
if
(
!
IsOutputFused
(
ctx
)
&&
x_dims
.
size
()
>
2
&&
y_dims
.
size
()
>
2
)
{
auto
out_dims
=
vectorize
(
out
->
dims
());
auto
out_dims
=
vectorize
(
out
->
dims
());
for
(
size_t
i
=
0
;
i
<
(
*
x_bd_dims
).
size
()
-
2
;
++
i
)
{
for
(
size_t
i
=
0
;
i
<
(
*
x_bd_dims
).
size
()
-
2
;
++
i
)
{
PADDLE_ENFORCE_EQ
(
PADDLE_ENFORCE_EQ
(
(
*
x_bd_dims
)[
i
]
==
(
*
y_bd_dims
)[
i
]
||
(
*
x_bd_dims
)[
i
]
==
1
||
(
*
x_bd_dims
)[
i
]
==
(
*
y_bd_dims
)[
i
]
||
(
*
x_bd_dims
)[
i
]
==
1
||
(
*
y_bd_dims
)[
i
]
==
1
,
(
*
y_bd_dims
)[
i
]
==
1
,
true
,
true
,
p
hi
::
errors
::
InvalidArgument
(
p
addle
::
platform
::
errors
::
InvalidArgument
(
"DenseTensor dimensions are incorrect for broadcasting."
"
phi::
DenseTensor dimensions are incorrect for broadcasting."
"Dimensions in X and Y must be same or equal to 1, but "
"Dimensions in X and Y must be same or equal to 1, but "
"received x_dim[%d]=%d and y_dims[%d]= %d"
,
"received x_dim[%d]=%d and y_dims[%d]= %d"
,
i
,
i
,
...
@@ -445,14 +749,14 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -445,14 +749,14 @@ class MatMulV1OneDNNKernel : public paddle::framework::OpKernel<T> {
};
};
template
<
typename
T
>
template
<
typename
T
>
class
MatMul
V1GradOne
DNNKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
class
MatMul
GradMKL
DNNKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
ExecutionContext
&
ctx
)
const
override
{
if
(
ctx
.
HasAttr
(
"head_number"
))
{
if
(
ctx
.
HasAttr
(
"head_number"
))
{
PADDLE_ENFORCE_EQ
(
PADDLE_ENFORCE_EQ
(
ctx
.
Attr
<
int
>
(
"head_number"
),
ctx
.
Attr
<
int
>
(
"head_number"
),
1
,
1
,
p
hi
::
errors
::
Unimplemented
(
p
addle
::
platform
::
errors
::
Unimplemented
(
"oneDNN matmul doesn't support multiple heads. Expected "
"oneDNN matmul doesn't support multiple heads. Expected "
"head_number=1. But received `head_number` is %d"
,
"head_number=1. But received `head_number` is %d"
,
ctx
.
Attr
<
int
>
(
"head_number"
)));
ctx
.
Attr
<
int
>
(
"head_number"
)));
...
@@ -461,18 +765,25 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -461,18 +765,25 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
const
auto
&
dev_ctx
=
ctx
.
template
device_context
<
OneDNNContext
>();
const
auto
&
dev_ctx
=
ctx
.
template
device_context
<
OneDNNContext
>();
const
auto
&
onednn_engine
=
dev_ctx
.
GetEngine
();
const
auto
&
onednn_engine
=
dev_ctx
.
GetEngine
();
auto
x
=
*
ctx
.
Input
<
DenseTensor
>
(
"X"
);
auto
x
=
*
ctx
.
Input
<
phi
::
DenseTensor
>
(
"X"
);
auto
y
=
*
ctx
.
Input
<
DenseTensor
>
(
"Y"
);
auto
y
=
*
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Y"
);
auto
dout
=
*
ctx
.
Input
<
DenseTensor
>
(
paddle
::
framework
::
GradVarName
(
"Out"
));
auto
dout
=
auto
*
dx
=
ctx
.
Output
<
DenseTensor
>
(
paddle
::
framework
::
GradVarName
(
"X"
));
*
ctx
.
Input
<
phi
::
DenseTensor
>
(
paddle
::
framework
::
GradVarName
(
"Out"
));
auto
*
dy
=
ctx
.
Output
<
DenseTensor
>
(
paddle
::
framework
::
GradVarName
(
"Y"
));
auto
*
dx
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
paddle
::
framework
::
GradVarName
(
"X"
));
bool
transpose_x
=
ctx
.
Attr
<
bool
>
(
"transpose_X"
);
auto
*
dy
=
bool
transpose_y
=
ctx
.
Attr
<
bool
>
(
"transpose_Y"
);
ctx
.
Output
<
phi
::
DenseTensor
>
(
paddle
::
framework
::
GradVarName
(
"Y"
));
bool
transpose_x
=
ctx
.
HasAttr
(
"transpose_X"
)
?
ctx
.
Attr
<
bool
>
(
"transpose_X"
)
:
ctx
.
Attr
<
bool
>
(
"trans_x"
);
bool
transpose_y
=
ctx
.
HasAttr
(
"transpose_Y"
)
?
ctx
.
Attr
<
bool
>
(
"transpose_Y"
)
:
ctx
.
Attr
<
bool
>
(
"trans_y"
);
ReshapeXYOutToMatrixSequence
(
&
x
,
&
y
,
&
dout
,
transpose_x
,
transpose_y
);
ReshapeXYOutToMatrixSequence
(
&
x
,
&
y
,
&
dout
,
transpose_x
,
transpose_y
);
p
hi
::
DDim
dx_dims
;
p
addle
::
framework
::
DDim
dx_dims
;
if
(
dx
)
{
if
(
dx
)
{
dx_dims
=
dx
->
dims
();
dx_dims
=
dx
->
dims
();
if
(
dx_dims
!=
x
.
dims
())
{
if
(
dx_dims
!=
x
.
dims
())
{
...
@@ -480,7 +791,7 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -480,7 +791,7 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
}
}
}
}
p
hi
::
DDim
dy_dims
;
p
addle
::
framework
::
DDim
dy_dims
;
if
(
dy
)
{
if
(
dy
)
{
dy_dims
=
dy
->
dims
();
dy_dims
=
dy
->
dims
();
if
(
dy_dims
!=
y
.
dims
())
{
if
(
dy_dims
!=
y
.
dims
())
{
...
@@ -560,38 +871,38 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -560,38 +871,38 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
void
ExecuteMatMulGrad
(
const
ExecutionContext
&
ctx
,
void
ExecuteMatMulGrad
(
const
ExecutionContext
&
ctx
,
const
OneDNNContext
&
dev_ctx
,
const
OneDNNContext
&
dev_ctx
,
const
dnnl
::
engine
&
engine
,
const
dnnl
::
engine
&
engine
,
DenseTensor
*
x
,
phi
::
DenseTensor
*
x
,
bool
trans_x
,
bool
trans_x
,
bool
is_fold_init_dims_x
,
bool
is_fold_init_dims_x
,
DenseTensor
*
y
,
phi
::
DenseTensor
*
y
,
bool
trans_y
,
bool
trans_y
,
bool
is_fold_init_dims_y
,
bool
is_fold_init_dims_y
,
DenseTensor
*
out
)
const
{
phi
::
DenseTensor
*
out
)
const
{
// gradient is calculated in a different way when broadcasting is used
// gradient is calculated in a different way when broadcasting is used
bool
need_combine
=
(
x
->
dims
().
size
()
==
3
||
y
->
dims
().
size
()
==
3
)
&&
bool
need_combine
=
(
x
->
dims
().
size
()
==
3
||
y
->
dims
().
size
()
==
3
)
&&
out
->
dims
().
size
()
==
2
;
out
->
dims
().
size
()
==
2
;
DenseTensor
x_combined
,
y_combined
;
phi
::
DenseTensor
x_combined
,
y_combined
;
if
(
need_combine
)
{
if
(
!
need_combine
)
{
x_combined
=
*
x
;
y_combined
=
*
y
;
}
else
{
x_combined
=
is_fold_init_dims_x
?
FoldOuterDims
(
*
x
)
x_combined
=
is_fold_init_dims_x
?
FoldOuterDims
(
*
x
)
:
FoldFirstAndLastDims
<
T
>
(
dev_ctx
,
x
);
:
FoldFirstAndLastDims
<
T
>
(
dev_ctx
,
x
);
y_combined
=
is_fold_init_dims_y
?
FoldOuterDims
(
*
y
)
y_combined
=
is_fold_init_dims_y
?
FoldOuterDims
(
*
y
)
:
FoldFirstAndLastDims
<
T
>
(
dev_ctx
,
y
);
:
FoldFirstAndLastDims
<
T
>
(
dev_ctx
,
y
);
}
else
{
x_combined
=
*
x
;
y_combined
=
*
y
;
}
}
float
alpha
=
ctx
.
Attr
<
float
>
(
"alpha"
)
;
float
alpha
=
ctx
.
HasAttr
(
"alpha"
)
?
ctx
.
Attr
<
float
>
(
"alpha"
)
:
1.0
f
;
MatMul
V1One
DNNHandler
<
T
,
T
,
T
>
handler
(
engine
,
MatMul
MKL
DNNHandler
<
T
,
T
,
T
>
handler
(
engine
,
ctx
.
GetPlace
(),
ctx
.
GetPlace
(),
&
x_combined
,
&
x_combined
,
trans_x
,
trans_x
,
&
y_combined
,
&
y_combined
,
trans_y
,
trans_y
,
out
,
out
,
alpha
);
alpha
);
const
auto
src_memory_p
=
handler
.
AcquireSrcMemory
(
&
x_combined
);
const
auto
src_memory_p
=
handler
.
AcquireSrcMemory
(
&
x_combined
);
const
auto
weights_memory_p
=
handler
.
AcquireWeightsMemory
(
&
y_combined
);
const
auto
weights_memory_p
=
handler
.
AcquireWeightsMemory
(
&
y_combined
);
...
@@ -599,7 +910,7 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -599,7 +910,7 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
auto
matmul_p
=
handler
.
AcquireForwardPrimitive
();
auto
matmul_p
=
handler
.
AcquireForwardPrimitive
();
std
::
unordered_map
<
int
,
memory
>
matmul_args
=
{
std
::
unordered_map
<
int
,
dnnl
::
memory
>
matmul_args
=
{
{
DNNL_ARG_SRC
,
*
src_memory_p
},
{
DNNL_ARG_SRC
,
*
src_memory_p
},
{
DNNL_ARG_WEIGHTS
,
*
weights_memory_p
},
{
DNNL_ARG_WEIGHTS
,
*
weights_memory_p
},
{
DNNL_ARG_DST
,
*
dst_memory_p
}};
{
DNNL_ARG_DST
,
*
dst_memory_p
}};
...
@@ -618,13 +929,13 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
...
@@ -618,13 +929,13 @@ class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel<T> {
REGISTER_OP_KERNEL
(
matmul
,
REGISTER_OP_KERNEL
(
matmul
,
MKLDNN
,
MKLDNN
,
::
phi
::
CPUPlace
,
::
phi
::
CPUPlace
,
MatMul
V1One
DNNKernel
<
float
>
,
MatMul
MKL
DNNKernel
<
float
>
,
MatMul
V1OneDNNKernel
<
phi
::
dtype
::
bfloat16
>
,
MatMul
MKLDNNKernel
<
paddle
::
platform
::
bfloat16
>
,
MatMul
V1One
DNNKernel
<
int8_t
>
,
MatMul
MKL
DNNKernel
<
int8_t
>
,
MatMul
V1One
DNNKernel
<
uint8_t
>
);
MatMul
MKL
DNNKernel
<
uint8_t
>
);
REGISTER_OP_KERNEL
(
matmul_grad
,
REGISTER_OP_KERNEL
(
matmul_grad
,
MKLDNN
,
MKLDNN
,
::
phi
::
CPUPlace
,
::
phi
::
CPUPlace
,
MatMul
V1GradOne
DNNKernel
<
float
>
,
MatMul
GradMKL
DNNKernel
<
float
>
,
MatMul
V1GradOneDNNKernel
<
phi
::
dtype
::
bfloat16
>
);
MatMul
GradMKLDNNKernel
<
paddle
::
platform
::
bfloat16
>
);
paddle/fluid/operators/ops_extra_info.h
浏览文件 @
338cbeaa
...
@@ -99,7 +99,7 @@ const std::unordered_map<std::string, ExtraAttrPropertySet>
...
@@ -99,7 +99,7 @@ const std::unordered_map<std::string, ExtraAttrPropertySet>
{
"fuse_alpha"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_alpha"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_beta"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_beta"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_relu"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_relu"
,
ExtraAttrProperty
::
ONEDNN
},
{
"
alpha
"
,
ExtraAttrProperty
::
ONEDNN
},
{
"
fused_output_scale
"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_residual_connection"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_residual_connection"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_with_relu"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fuse_with_relu"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fused_reshape_Out"
,
ExtraAttrProperty
::
ONEDNN
},
{
"fused_reshape_Out"
,
ExtraAttrProperty
::
ONEDNN
},
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py
浏览文件 @
338cbeaa
...
@@ -146,7 +146,7 @@ class TestMatmulActivationMkldnnFusePass(PassAutoScanTest):
...
@@ -146,7 +146,7 @@ class TestMatmulActivationMkldnnFusePass(PassAutoScanTest):
'operator_scale_onednn_fuse_pass'
,
'operator_scale_onednn_fuse_pass'
,
],
],
)
)
yield
config
,
[
'matmul
_v2
'
],
(
1e-5
,
1e-5
)
yield
config
,
[
'matmul'
],
(
1e-5
,
1e-5
)
def
test
(
self
):
def
test
(
self
):
self
.
run_and_statis
(
self
.
run_and_statis
(
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py
浏览文件 @
338cbeaa
...
@@ -137,7 +137,7 @@ class TestMatmulElementwiseAddActivationMkldnnFusePass(PassAutoScanTest):
...
@@ -137,7 +137,7 @@ class TestMatmulElementwiseAddActivationMkldnnFusePass(PassAutoScanTest):
'matmul_activation_mkldnn_fuse_pass'
,
'matmul_activation_mkldnn_fuse_pass'
,
],
],
)
)
yield
config
,
[
'matmul
_v2
'
],
(
1e-5
,
1e-5
)
yield
config
,
[
'matmul'
],
(
1e-5
,
1e-5
)
def
test
(
self
):
def
test
(
self
):
self
.
run_and_statis
(
self
.
run_and_statis
(
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py
浏览文件 @
338cbeaa
...
@@ -76,7 +76,7 @@ class TestMatmulElementwiseAddMkldnnFusePass(PassAutoScanTest):
...
@@ -76,7 +76,7 @@ class TestMatmulElementwiseAddMkldnnFusePass(PassAutoScanTest):
config
=
self
.
create_inference_config
(
config
=
self
.
create_inference_config
(
use_mkldnn
=
True
,
passes
=
[
'matmul_elementwise_add_mkldnn_fuse_pass'
]
use_mkldnn
=
True
,
passes
=
[
'matmul_elementwise_add_mkldnn_fuse_pass'
]
)
)
yield
config
,
[
'matmul
_v2
'
],
(
1e-5
,
1e-5
)
yield
config
,
[
'matmul'
],
(
1e-5
,
1e-5
)
def
test
(
self
):
def
test
(
self
):
self
.
run_and_statis
(
self
.
run_and_statis
(
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
浏览文件 @
338cbeaa
...
@@ -116,7 +116,7 @@ class TestMatmulTransposeReshapeMkldnnFusePass(PassAutoScanTest):
...
@@ -116,7 +116,7 @@ class TestMatmulTransposeReshapeMkldnnFusePass(PassAutoScanTest):
def
sample_predictor_configs
(
self
,
program_config
):
def
sample_predictor_configs
(
self
,
program_config
):
config
=
self
.
create_inference_config
(
use_mkldnn
=
True
)
config
=
self
.
create_inference_config
(
use_mkldnn
=
True
)
yield
config
,
[
"matmul
_v2
"
],
(
1e-5
,
1e-5
)
yield
config
,
[
"matmul"
],
(
1e-5
,
1e-5
)
def
test
(
self
):
def
test
(
self
):
self
.
run_and_statis
(
self
.
run_and_statis
(
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py
浏览文件 @
338cbeaa
...
@@ -135,8 +135,17 @@ class TestMatmulv2TransposeReshapeMkldnnFusePass(PassAutoScanTest):
...
@@ -135,8 +135,17 @@ class TestMatmulv2TransposeReshapeMkldnnFusePass(PassAutoScanTest):
return
program_config
return
program_config
def
sample_predictor_configs
(
self
,
program_config
):
def
sample_predictor_configs
(
self
,
program_config
):
# gpu_cpu_map_matmul_v2_to_matmul_pass will affect the type of final fused op
fused_op
=
"matmul_v2"
input1_dim1
=
program_config
.
inputs
[
"input_data1"
].
shape
[
0
]
input2_dim1
=
program_config
.
inputs
[
"input_data2"
].
shape
[
0
]
input1_dim2
=
program_config
.
inputs
[
"input_data1"
].
shape
[
1
]
input2_dim2
=
program_config
.
inputs
[
"input_data2"
].
shape
[
1
]
if
input1_dim1
==
input2_dim1
and
input1_dim2
==
input2_dim2
:
fused_op
=
"matmul"
config
=
self
.
create_inference_config
(
use_mkldnn
=
True
)
config
=
self
.
create_inference_config
(
use_mkldnn
=
True
)
yield
config
,
[
"matmul_v2"
],
(
1e-5
,
1e-5
)
yield
config
,
[
fused_op
],
(
1e-5
,
1e-5
)
def
test
(
self
):
def
test
(
self
):
self
.
run_and_statis
(
self
.
run_and_statis
(
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py
浏览文件 @
338cbeaa
...
@@ -153,7 +153,7 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest):
...
@@ -153,7 +153,7 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest):
def
sample_predictor_configs
(
self
,
program_config
):
def
sample_predictor_configs
(
self
,
program_config
):
config
=
self
.
create_inference_config
(
use_mkldnn
=
True
)
config
=
self
.
create_inference_config
(
use_mkldnn
=
True
)
yield
config
,
[
"matmul
_v2
"
],
(
1e-5
,
1e-5
)
yield
config
,
[
"matmul"
],
(
1e-5
,
1e-5
)
def
test
(
self
):
def
test
(
self
):
self
.
run_and_statis
(
self
.
run_and_statis
(
...
...
python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
浏览文件 @
338cbeaa
...
@@ -17,7 +17,7 @@ import unittest
...
@@ -17,7 +17,7 @@ import unittest
import
numpy
as
np
import
numpy
as
np
from
paddle.fluid.tests.unittests.op_test
import
OpTest
from
paddle.fluid.tests.unittests.op_test
import
OpTest
,
skip_check_grad_ci
class
TestDnnlMatMulOp
(
OpTest
):
class
TestDnnlMatMulOp
(
OpTest
):
...
@@ -254,6 +254,321 @@ class TestDnnlMatMulOpInt8ForceFP32BasicScales(TestDnnlMatMulOp):
...
@@ -254,6 +254,321 @@ class TestDnnlMatMulOpInt8ForceFP32BasicScales(TestDnnlMatMulOp):
self
.
attrs
=
{
'force_fp32_output'
:
True
}
self
.
attrs
=
{
'force_fp32_output'
:
True
}
@
skip_check_grad_ci
(
reason
=
"DNNL's MatMul doesn't implement grad kernel."
)
class
TestReshapeTransposeMatMulOp
(
OpTest
):
def
init_data_type
(
self
):
self
.
data_type_
=
'float32'
def
generate_data
(
self
):
self
.
x
=
(
np
.
random
.
random
([
2
,
128
,
768
])
.
astype
(
"float32"
)
.
reshape
([
2
,
128
,
12
,
64
])
.
transpose
([
0
,
2
,
1
,
3
])
)
self
.
y
=
(
np
.
random
.
random
([
2
,
128
,
768
])
.
astype
(
"float32"
)
.
reshape
([
2
,
128
,
12
,
64
])
.
transpose
([
0
,
2
,
1
,
3
])
)
self
.
out
=
np
.
matmul
(
self
.
x
,
self
.
y
.
transpose
([
0
,
1
,
3
,
2
]))
self
.
fused_reshape_X
=
[]
self
.
fused_transpose_X
=
[]
self
.
fused_reshape_Y
=
[]
self
.
fused_transpose_Y
=
[]
def
set_op_type_and_transpose_y_name
(
self
):
self
.
op_type
=
"matmul"
self
.
transpose_y_name
=
"transpose_Y"
def
setUp
(
self
):
self
.
set_op_type_and_transpose_y_name
()
self
.
_cpu_only
=
True
self
.
use_mkldnn
=
True
self
.
transpose_y
=
True
self
.
init_data_type
()
self
.
generate_data
()
self
.
inputs
=
{
'X'
:
self
.
x
,
'Y'
:
self
.
y
}
self
.
attrs
=
{
'use_mkldnn'
:
self
.
use_mkldnn
,
self
.
transpose_y_name
:
self
.
transpose_y
,
}
if
len
(
self
.
fused_transpose_X
)
>
0
:
self
.
attrs
[
'fused_transpose_X'
]
=
self
.
fused_transpose_X
if
len
(
self
.
fused_transpose_Y
)
>
0
:
self
.
attrs
[
'fused_transpose_Y'
]
=
self
.
fused_transpose_Y
if
len
(
self
.
fused_reshape_X
)
>
0
:
self
.
attrs
[
'fused_reshape_X'
]
=
self
.
fused_reshape_X
if
len
(
self
.
fused_reshape_Y
)
>
0
:
self
.
attrs
[
'fused_reshape_Y'
]
=
self
.
fused_reshape_Y
self
.
outputs
=
{
'Out'
:
self
.
out
}
def
test_check_output
(
self
):
self
.
check_output
()
class
TestReshapeTransposeMatMulOp4DXFloat
(
TestReshapeTransposeMatMulOp
):
def
generate_data
(
self
):
self
.
x
=
np
.
random
.
random
([
2
,
128
,
768
]).
astype
(
"float32"
)
self
.
y
=
(
np
.
random
.
random
([
2
,
128
,
768
])
.
astype
(
"float32"
)
.
reshape
([
2
,
128
,
12
,
64
])
.
transpose
([
0
,
2
,
1
,
3
])
)
self
.
fused_transpose_X
=
[
0
,
2
,
1
,
3
]
self
.
fused_reshape_X
=
[
0
,
0
,
12
,
64
]
self
.
fused_transpose_Y
=
[]
self
.
fused_reshape_Y
=
[]
self
.
out
=
np
.
matmul
(
self
.
x
.
reshape
([
2
,
128
,
12
,
64
]).
transpose
([
0
,
2
,
1
,
3
]),
self
.
y
.
transpose
([
0
,
1
,
3
,
2
]),
)
class
TestReshapeTransposeMatMulOp4DXInt8
(
TestReshapeTransposeMatMulOp4DXFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
'int8'
class
TestReshapeTransposeMatMulOp4DYFloat
(
TestReshapeTransposeMatMulOp
):
def
generate_data
(
self
):
self
.
x
=
(
np
.
random
.
random
([
2
,
128
,
768
])
.
astype
(
"float32"
)
.
reshape
([
2
,
128
,
12
,
64
])
.
transpose
([
0
,
2
,
1
,
3
])
)
self
.
y
=
np
.
random
.
random
([
2
,
128
,
768
]).
astype
(
"float32"
)
self
.
fused_transpose_X
=
[]
self
.
fused_reshape_X
=
[]
self
.
fused_transpose_Y
=
[
0
,
2
,
1
,
3
]
self
.
fused_reshape_Y
=
[
0
,
0
,
12
,
64
]
self
.
out
=
np
.
matmul
(
self
.
x
,
self
.
y
.
reshape
([
2
,
128
,
12
,
64
]).
transpose
([
0
,
2
,
3
,
1
])
)
class
TestReshapeTransposeMatMulOp4DYInt8
(
TestReshapeTransposeMatMulOp4DYFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
'int8'
class
TestReshapeTransposeMatMulOp4DXYFloat
(
TestReshapeTransposeMatMulOp
):
def
generate_data
(
self
):
self
.
x
=
np
.
random
.
random
([
2
,
128
,
768
]).
astype
(
"float32"
)
self
.
y
=
np
.
random
.
random
([
2
,
128
,
768
]).
astype
(
"float32"
)
self
.
fused_transpose_X
=
[
0
,
2
,
1
,
3
]
self
.
fused_reshape_X
=
[
0
,
0
,
12
,
64
]
self
.
fused_transpose_Y
=
[
0
,
2
,
1
,
3
]
self
.
fused_reshape_Y
=
[
0
,
0
,
12
,
64
]
self
.
out
=
np
.
matmul
(
self
.
x
.
reshape
([
2
,
128
,
12
,
64
]).
transpose
([
0
,
2
,
1
,
3
]),
self
.
y
.
reshape
([
2
,
128
,
12
,
64
]).
transpose
([
0
,
2
,
3
,
1
]),
)
class
TestReshapeTransposeMatMulOp4DXYInt8
(
TestReshapeTransposeMatMulOp4DXYFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
'int8'
class
TestReshapeTransposeMatMulOp2DXFloat
(
TestReshapeTransposeMatMulOp
):
def
generate_data
(
self
):
self
.
x
=
np
.
random
.
random
([
2
,
5
,
10
]).
astype
(
"float32"
)
self
.
y
=
(
np
.
random
.
random
([
2
,
5
,
10
])
.
astype
(
"float32"
)
.
reshape
([
10
,
10
])
.
transpose
([
1
,
0
])
)
self
.
fused_transpose_X
=
[
1
,
0
]
self
.
fused_reshape_X
=
[
10
,
10
]
self
.
fused_transpose_Y
=
[]
self
.
fused_reshape_Y
=
[]
self
.
out
=
np
.
matmul
(
self
.
x
.
reshape
([
10
,
10
]).
transpose
([
1
,
0
]),
self
.
y
.
transpose
([
1
,
0
])
)
class
TestReshapeTransposeMatMulOp2DXInt8
(
TestReshapeTransposeMatMulOp2DXFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
'int8'
class
TestReshapeTransposeMatMulOp2DYFloat
(
TestReshapeTransposeMatMulOp
):
def
generate_data
(
self
):
self
.
x
=
(
np
.
random
.
random
([
2
,
5
,
10
])
.
astype
(
"float32"
)
.
reshape
([
10
,
10
])
.
transpose
([
1
,
0
])
)
self
.
y
=
np
.
random
.
random
([
2
,
5
,
10
]).
astype
(
"float32"
)
self
.
fused_transpose_X
=
[]
self
.
fused_reshape_X
=
[]
self
.
fused_transpose_Y
=
[
1
,
0
]
self
.
fused_reshape_Y
=
[
10
,
10
]
self
.
out
=
np
.
matmul
(
self
.
x
,
self
.
y
.
reshape
([
10
,
10
]))
class
TestReshapeTransposeMatMulOp2DYInt8
(
TestReshapeTransposeMatMulOp2DYFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
'int8'
class
TestReshapeTransposeMatMulOp3DXFloat
(
TestReshapeTransposeMatMulOp
):
def
generate_data
(
self
):
self
.
x
=
np
.
random
.
random
([
2
,
2
,
5
,
5
]).
astype
(
"float32"
)
self
.
y
=
(
np
.
random
.
random
([
2
,
2
,
5
,
5
])
.
astype
(
"float32"
)
.
reshape
([
2
,
10
,
5
])
.
transpose
([
0
,
2
,
1
])
)
self
.
fused_transpose_X
=
[
0
,
2
,
1
]
self
.
fused_reshape_X
=
[
2
,
10
,
5
]
self
.
fused_transpose_Y
=
[]
self
.
fused_reshape_Y
=
[]
self
.
out
=
np
.
matmul
(
self
.
x
.
reshape
([
2
,
10
,
5
]).
transpose
(
0
,
2
,
1
),
self
.
y
.
transpose
(
0
,
2
,
1
),
)
class
TestReshapeTransposeMatMulOp3DXInt8
(
TestReshapeTransposeMatMulOp3DXFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
'int8'
class
TestReshapeTransposeMatMulOp3DYFloat
(
TestReshapeTransposeMatMulOp
):
def
generate_data
(
self
):
self
.
x
=
(
np
.
random
.
random
([
2
,
2
,
5
,
5
])
.
astype
(
self
.
data_type_
)
.
reshape
([
2
,
10
,
5
])
.
transpose
([
0
,
2
,
1
])
)
self
.
y
=
np
.
random
.
random
([
2
,
2
,
5
,
5
]).
astype
(
self
.
data_type_
)
self
.
fused_transpose_X
=
[]
self
.
fused_reshape_X
=
[]
self
.
fused_transpose_Y
=
[
0
,
2
,
1
]
self
.
fused_reshape_Y
=
[
2
,
10
,
5
]
self
.
out
=
np
.
matmul
(
self
.
x
,
self
.
y
.
reshape
([
2
,
10
,
5
]))
class
TestReshapeTransposeMatMulOp3DYInt8
(
TestReshapeTransposeMatMulOp3DYFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
'int8'
@
skip_check_grad_ci
(
reason
=
"Tests inference only optimization."
)
class
TestMatMulOpTransposeReshapeEmptyFloat
(
OpTest
):
def
init_data_type
(
self
):
self
.
data_type_
=
np
.
float32
def
generate_data
(
self
):
self
.
bs
=
1
self
.
x
=
np
.
random
.
random
([
self
.
bs
,
128
,
128
]).
astype
(
self
.
data_type_
)
self
.
y
=
np
.
random
.
random
([
self
.
bs
,
128
,
64
]).
astype
(
self
.
data_type_
)
def
init_params_and_out
(
self
):
self
.
transpose_out
=
[]
self
.
reshape_out
=
[]
self
.
out
=
np
.
matmul
(
self
.
x
,
self
.
y
)
def
set_op_type
(
self
):
self
.
op_type
=
"matmul"
def
setUp
(
self
):
self
.
set_op_type
()
self
.
_cpu_only
=
True
self
.
use_mkldnn
=
True
self
.
init_data_type
()
self
.
generate_data
()
self
.
init_params_and_out
()
self
.
inputs
=
{
'X'
:
self
.
x
,
'Y'
:
self
.
y
}
self
.
attrs
=
{
'use_mkldnn'
:
self
.
use_mkldnn
}
if
len
(
self
.
reshape_out
)
>
0
:
self
.
attrs
[
'fused_reshape_Out'
]
=
self
.
reshape_out
if
len
(
self
.
transpose_out
)
>
0
:
self
.
attrs
[
'fused_transpose_Out'
]
=
self
.
transpose_out
self
.
inputs
=
{
'X'
:
self
.
x
,
'Y'
:
self
.
y
}
self
.
outputs
=
{
'Out'
:
self
.
out
}
def
test_check_output
(
self
):
self
.
check_output
()
def
check_raise_error
(
self
,
msg
):
try
:
self
.
check_output
()
except
Exception
as
e
:
if
msg
in
str
(
e
):
raise
AttributeError
else
:
print
(
e
)
class
TestMatMulOpTransposeReshapeIntEmptyInt
(
TestMatMulOpTransposeReshapeEmptyFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
np
.
int8
class
TestMatMulOpTransposeReshapeBasicFloat
(
TestMatMulOpTransposeReshapeEmptyFloat
):
def
generate_data
(
self
):
self
.
bs
=
8
self
.
x
=
np
.
random
.
random
([
self
.
bs
,
12
,
128
,
128
]).
astype
(
self
.
data_type_
)
self
.
y
=
np
.
random
.
random
([
self
.
bs
,
12
,
128
,
64
]).
astype
(
self
.
data_type_
)
def
init_params_and_out
(
self
):
self
.
transpose_out
=
[
0
,
2
,
1
,
3
]
self
.
reshape_out
=
[
0
,
0
,
self
.
x
.
shape
[
1
]
*
self
.
y
.
shape
[
-
1
]]
self
.
out
=
(
np
.
matmul
(
self
.
x
,
self
.
y
)
.
transpose
([
0
,
2
,
1
,
3
])
.
reshape
([
self
.
bs
,
-
1
,
self
.
x
.
shape
[
1
]
*
self
.
y
.
shape
[
-
1
]])
)
class
TestMatMulOpTransposeReshapeBasicInt
(
TestMatMulOpTransposeReshapeBasicFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
np
.
int8
class
TestMatMulOpTransposeReshapeOtherDimFloat
(
TestMatMulOpTransposeReshapeBasicFloat
):
def
generate_data
(
self
):
self
.
bs
=
11
self
.
x
=
np
.
random
.
random
([
self
.
bs
,
12
,
14
,
18
]).
astype
(
self
.
data_type_
)
self
.
y
=
np
.
random
.
random
([
self
.
bs
,
12
,
18
,
13
]).
astype
(
self
.
data_type_
)
class
TestMatMulOpTransposeReshapeOtherDimInt
(
TestMatMulOpTransposeReshapeOtherDimFloat
):
def
init_data_type
(
self
):
self
.
data_type_
=
np
.
int8
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
from
paddle
import
enable_static
from
paddle
import
enable_static
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录