Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
7575d37c
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
7575d37c
编写于
12月 06, 2022
作者:
S
Sławomir Siwek
提交者:
GitHub
12月 06, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[PHI] Migrate elementwise_(add/mul) kernels (#48625)
* remove fluid code * init * typo * fix merge conflicts
上级
8de336f9
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
69 addition
and
495 deletion
+69
-495
paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
...id/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
+1
-1
paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
...e/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
+1
-1
paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
...operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+0
-27
paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
...luid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
+0
-415
paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
...operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
+0
-27
paddle/fluid/operators/elementwise/unity_build_rule.cmake
paddle/fluid/operators/elementwise/unity_build_rule.cmake
+0
-2
paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
+2
-2
paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
+1
-1
paddle/phi/kernels/elementwise_kernel.cc
paddle/phi/kernels/elementwise_kernel.cc
+0
-14
paddle/phi/kernels/onednn/elementwise_kernel.cc
paddle/phi/kernels/onednn/elementwise_kernel.cc
+64
-5
未找到文件。
paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
浏览文件 @
7575d37c
...
...
@@ -36,7 +36,7 @@ PD_DECLARE_KERNEL(batch_norm, OneDNN, ONEDNN);
USE_OP_ITSELF
(
conv2d_transpose
);
PD_DECLARE_KERNEL
(
conv2d_transpose
,
OneDNN
,
ONEDNN
);
USE_OP_ITSELF
(
elementwise_add
);
USE_OP_DEVICE_KERNEL
(
elementwise_add
,
MKL
DNN
);
PD_DECLARE_KERNEL
(
add_raw
,
OneDNN
,
ONE
DNN
);
USE_OP_ITSELF
(
gelu
);
PD_DECLARE_KERNEL
(
gelu
,
OneDNN
,
ONEDNN
);
PD_DECLARE_ARG_MAPPING_FN
(
gelu
);
...
...
paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
浏览文件 @
7575d37c
...
...
@@ -25,7 +25,7 @@
USE_OP_ITSELF
(
softmax
);
PD_DECLARE_KERNEL
(
softmax
,
OneDNN
,
ONEDNN
);
USE_OP_ITSELF
(
elementwise_add
);
USE_OP_DEVICE_KERNEL
(
elementwise_add
,
MKL
DNN
);
PD_DECLARE_KERNEL
(
add_raw
,
OneDNN
,
ONE
DNN
);
USE_OP_ITSELF
(
leaky_relu
);
PD_DECLARE_KERNEL
(
leaky_relu
,
OneDNN
,
ONEDNN
);
USE_OP_ITSELF
(
gelu
);
...
...
paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
已删除
100644 → 0
浏览文件 @
8de336f9
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_KERNEL
(
elementwise_add
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
ops
::
EltwiseMKLDNNKernel
<
float
,
dnnl
::
algorithm
::
binary_add
>
,
ops
::
EltwiseMKLDNNKernel
<
paddle
::
platform
::
bfloat16
,
dnnl
::
algorithm
::
binary_add
>
,
ops
::
EltwiseMKLDNNKernel
<
int8_t
,
dnnl
::
algorithm
::
binary_add
>
,
ops
::
EltwiseMKLDNNKernel
<
uint8_t
,
dnnl
::
algorithm
::
binary_add
>
)
paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
已删除
100644 → 0
浏览文件 @
8de336f9
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace
paddle
{
namespace
operators
{
using
dnnl
::
memory
;
using
dnnl
::
primitive
;
using
dnnl
::
stream
;
using
phi
::
DataLayout
;
using
phi
::
OneDNNContext
;
using
phi
::
funcs
::
BinaryOneDNNHandler
;
inline
std
::
vector
<
int64_t
>
CalculateBroadcastedDims
(
const
phi
::
DenseTensor
*
x
,
const
phi
::
DenseTensor
*
y
)
{
const
auto
src_tz
=
phi
::
vectorize
(
x
->
dims
());
const
auto
dst_tz
=
phi
::
vectorize
(
y
->
dims
());
std
::
vector
<
int64_t
>
dst_tz_ex
(
src_tz
.
size
(),
1
);
if
(
src_tz
.
size
()
==
dst_tz
.
size
())
{
for
(
size_t
i
=
0
;
i
<
src_tz
.
size
();
i
++
)
{
dst_tz_ex
[
i
]
=
(
src_tz
[
i
]
==
dst_tz
[
i
])
?
dst_tz
[
i
]
:
1
;
}
}
else
{
size_t
j
=
0
;
for
(
size_t
i
=
0
;
i
<
src_tz
.
size
();
i
++
)
{
dst_tz_ex
[
i
]
=
(
src_tz
[
i
]
!=
dst_tz
[
j
])
?
1
:
dst_tz
[
j
++
];
if
(
j
==
dst_tz
.
size
())
break
;
}
}
return
dst_tz_ex
;
}
inline
void
AddSubNonBroadcast
(
phi
::
funcs
::
ReorderOneDNNHandler
*
reorder_handler
,
phi
::
DenseTensor
*
grad_tensor
,
const
std
::
shared_ptr
<
dnnl
::
memory
>&
src_memory
,
const
std
::
shared_ptr
<
dnnl
::
memory
>&
dst_memory
,
const
std
::
vector
<
float
>&
scales
)
{
dnnl
::
primitive_attr
reorder_attr
;
reorder_attr
.
set_output_scales
(
0
,
scales
);
auto
reorder_p
=
reorder_handler
->
AcquireReorder
(
dst_memory
,
src_memory
,
reorder_attr
);
reorder_p
->
execute
(
OneDNNContext
::
tls
().
get_stream
(),
*
src_memory
,
*
dst_memory
);
}
template
<
typename
T
>
inline
void
BroadcastReduction
(
const
framework
::
ExecutionContext
&
ctx
,
const
dnnl
::
engine
&
onednn_engine
,
phi
::
DenseTensor
*
grad_tensor
,
const
phi
::
DenseTensor
*
dout
,
const
std
::
shared_ptr
<
dnnl
::
memory
>&
src_memory
,
std
::
shared_ptr
<
dnnl
::
memory
>
dst_memory
,
const
std
::
vector
<
float
>&
scales
,
const
bool
is_sub
)
{
dnnl
::
primitive_attr
broadcast_reduction_attr
;
// Broadcasting
if
(
is_sub
)
{
dnnl
::
post_ops
po
;
po
.
append_eltwise
(
1.0
f
,
dnnl
::
algorithm
::
eltwise_linear
,
scales
[
0
],
0
);
broadcast_reduction_attr
.
set_post_ops
(
po
);
}
phi
::
funcs
::
ReductionOneDNNHandler
<
T
>
reduction_handler
(
dnnl
::
algorithm
::
reduction_sum
,
0.0
f
,
0.0
f
,
onednn_engine
,
ctx
.
GetPlace
(),
dout
,
grad_tensor
,
CalculateBroadcastedDims
(
dout
,
grad_tensor
),
broadcast_reduction_attr
);
dst_memory
=
reduction_handler
.
AcquireDstMemory
(
grad_tensor
);
auto
reduction_p
=
reduction_handler
.
AcquireForwardPrimitive
();
auto
astream
=
OneDNNContext
::
tls
().
get_stream
();
reduction_p
->
execute
(
astream
,
{
{
DNNL_ARG_SRC
,
*
src_memory
},
{
DNNL_ARG_DST
,
*
dst_memory
},
});
astream
.
wait
();
grad_tensor
->
set_mem_desc
(
dst_memory
->
get_desc
().
reshape
(
phi
::
vectorize
<
int64_t
>
(
grad_tensor
->
dims
())));
}
template
<
typename
T
,
dnnl
::
algorithm
BINARY_OP
>
class
EltwiseMKLDNNKernel
:
public
framework
::
OpKernel
<
T
>
{
private:
dnnl
::
post_ops
get_post_ops
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
dnnl
::
post_ops
post_operations
;
platform
::
AppendActivation
(
ctx
,
post_operations
);
if
(
ctx
.
HasAttr
(
"fused_output_scale"
))
{
float
scale_alpha
=
ctx
.
Attr
<
float
>
(
"fused_output_scale"
);
post_operations
.
append_eltwise
(
1.0
,
dnnl
::
algorithm
::
eltwise_linear
,
scale_alpha
,
0.0
f
);
}
return
post_operations
;
}
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
&
dev_ctx
=
ctx
.
template
device_context
<
OneDNNContext
>();
const
auto
&
mkldnn_engine
=
dev_ctx
.
GetEngine
();
auto
*
x
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"X"
);
auto
*
y
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Y"
);
auto
*
z
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
"Out"
);
float
scale_x
=
ctx
.
Attr
<
float
>
(
"Scale_x"
);
float
scale_y
=
ctx
.
Attr
<
float
>
(
"Scale_y"
);
float
scale_o
=
ctx
.
Attr
<
float
>
(
"Scale_out"
);
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
BinaryOneDNNHandler
<
T
>
handler
(
BINARY_OP
,
axis
,
mkldnn_engine
,
ctx
.
GetPlace
(),
x
,
y
,
z
,
scale_x
,
scale_y
,
scale_o
,
true
,
get_post_ops
(
ctx
));
// oneDNN's binary is optimized for broadcasting y into x, so in other case
// we have to swap tensors to achieve optimal performance
if
(
x
->
numel
()
<
y
->
numel
())
{
std
::
swap
(
x
,
y
);
}
const
auto
src_x_memory
=
handler
.
AcquireSrcMemory
(
x
);
const
auto
src_y_memory
=
handler
.
AcquireSecondSrcMemory
(
y
);
// (jczaja) For Inplace src and dst should be the same memory object.
// So x should share buffer with z. But UT mechanics is testing inplace
// execution for this op not checking that x can be bradcasted to match in
// shape y tensor.
// This is wrong as when x is to be broadcasted then z(out) will match the
// shape of y which is bigger than x. Hence if x is smaller in shape than z
// and they share a buffer (of
// shape x) then this buffer is not big enough to hold result of elementwise
// operation.
const
bool
reuse_x_memopry
=
x
->
numel
()
==
z
->
numel
()
&&
x
->
IsSharedBufferWith
(
*
z
);
std
::
shared_ptr
<
dnnl
::
memory
>
dst_memory
;
if
(
reuse_x_memopry
)
{
dst_memory
=
src_x_memory
;
// NOTE(chenfeiyu): when the output reuses memory from other tensor rather
// than allocate its own, it's still need to take care of its data type.
// Unfortunately, paddle's operator only infers the output' shape, but not
// the data type. mutable_data<T> takes care of allocation and data type
// normally, but if the memory is already allocated and there is no need
// to re-allocate, it just set the data type. So this it added there to
// get the right data type.
z
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
}
else
{
dst_memory
=
handler
.
AcquireDstMemory
(
z
);
}
const
auto
binary_prim
=
handler
.
AcquireForwardPrimitive
();
auto
&
astream
=
OneDNNContext
::
tls
().
get_stream
();
const
std
::
unordered_map
<
int
,
dnnl
::
memory
>
args
=
{
{
DNNL_ARG_SRC_0
,
*
src_x_memory
},
{
DNNL_ARG_SRC_1
,
*
src_y_memory
},
{
DNNL_ARG_DST
,
*
dst_memory
}};
binary_prim
->
execute
(
astream
,
args
);
astream
.
wait
();
if
(
handler
.
use_broadcasting_hack
==
false
)
{
platform
::
SetOutMemDescWithLogicalLayoutFusesSupport
(
ctx
,
z
,
dst_memory
->
get_desc
());
}
else
{
auto
dims
=
dst_memory
->
get_desc
().
dims
();
dims
.
insert
(
dims
.
begin
(),
x
->
dims
()[
0
]);
dims
[
1
]
/=
dims
[
0
];
platform
::
SetOutMemDescWithLogicalLayoutFusesSupport
(
ctx
,
z
,
dst_memory
->
get_desc
().
reshape
(
dims
));
}
}
};
template
<
typename
T
,
dnnl
::
algorithm
BINARY_OP
>
class
EltwiseMKLDNNGradKernel
:
public
ElemwiseGradKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
ElemwiseGradKernel
<
T
>::
Compute
(
ctx
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
OneDNNContext
>();
const
auto
&
onednn_engine
=
dev_ctx
.
GetEngine
();
auto
*
x
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"X"
);
auto
*
y
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Y"
);
auto
*
out
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Out"
);
auto
*
dx
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Y"
));
auto
*
dout
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Out"
));
// oneDNN's binary is optimized for broadcasting y into x, so in other case
// we have to swap tensors to achieve optimal performance
bool
swap_x_y
=
false
;
if
(
x
->
numel
()
<
y
->
numel
())
{
std
::
swap
(
x
,
y
);
std
::
swap
(
dx
,
dy
);
swap_x_y
=
true
;
}
std
::
vector
<
float
>
scales
{
1.0
};
if
(
swap_x_y
)
{
scales
[
0
]
=
(
BINARY_OP
==
dnnl
::
algorithm
::
binary_add
)
?
1
:
-
1
;
}
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
auto
tz
=
phi
::
vectorize
<
int64_t
>
(
dout
->
dims
());
auto
dout_type
=
phi
::
funcs
::
ToOneDNNDataType
(
dout
->
dtype
());
phi
::
funcs
::
ReorderOneDNNHandler
reorder_handler
(
tz
,
dout
->
dtype
(),
dout_type
,
onednn_engine
);
auto
reorder_src_memory
=
reorder_handler
.
AcquireSrcMemory
(
dout
->
mem_desc
(),
phi
::
funcs
::
to_void_cast
(
dout
->
data
<
T
>
()));
std
::
shared_ptr
<
dnnl
::
memory
>
dst_memory
;
std
::
shared_ptr
<
dnnl
::
memory
>
broadcast_src_memory
=
reorder_src_memory
;
auto
&
astream
=
OneDNNContext
::
tls
().
get_stream
();
if
(
dx
)
{
// elementwise_add & elementwise_sub
if
(
BINARY_OP
==
dnnl
::
algorithm
::
binary_add
||
BINARY_OP
==
dnnl
::
algorithm
::
binary_sub
)
{
if
(
dout
->
dims
()
==
dx
->
dims
())
{
dst_memory
=
reorder_handler
.
AcquireDstMemory
(
dx
,
dout
->
mem_desc
(),
ctx
.
GetPlace
());
AddSubNonBroadcast
(
&
reorder_handler
,
dx
,
reorder_src_memory
,
dst_memory
,
scales
);
}
}
else
{
// elementwise_mul & elementwise_div
BinaryOneDNNHandler
<
T
>
binary_handler
(
BINARY_OP
,
axis
,
onednn_engine
,
ctx
.
GetPlace
(),
dout
,
y
,
dx
,
1.0
f
,
1.0
f
,
1.0
f
,
false
);
const
auto
src_dout_memory
=
binary_handler
.
AcquireSrcMemory
(
dout
);
const
auto
src_y_memory
=
binary_handler
.
AcquireSecondSrcMemory
(
y
);
dst_memory
=
binary_handler
.
AcquireDstMemory
(
dx
);
const
auto
binary_prim
=
binary_handler
.
AcquireForwardPrimitive
();
const
std
::
unordered_map
<
int
,
dnnl
::
memory
>
args
=
{
{
DNNL_ARG_SRC_0
,
*
src_dout_memory
},
{
DNNL_ARG_SRC_1
,
*
src_y_memory
},
{
DNNL_ARG_DST
,
*
dst_memory
}};
binary_prim
->
execute
(
astream
,
args
);
}
astream
.
wait
();
if
(
dout
->
dims
()
!=
dx
->
dims
())
{
BroadcastReduction
<
T
>
(
ctx
,
onednn_engine
,
dx
,
dout
,
broadcast_src_memory
,
dst_memory
,
scales
,
BINARY_OP
==
dnnl
::
algorithm
::
binary_sub
);
}
else
{
dx
->
set_mem_desc
(
dst_memory
->
get_desc
());
}
}
if
(
dy
)
{
// elementwise_add & elementwise_sub
if
(
BINARY_OP
==
dnnl
::
algorithm
::
binary_add
||
BINARY_OP
==
dnnl
::
algorithm
::
binary_sub
)
{
if
(
dout
->
dims
()
==
dy
->
dims
())
{
dst_memory
=
reorder_handler
.
AcquireDstMemory
(
dy
,
dout
->
mem_desc
(),
ctx
.
GetPlace
());
AddSubNonBroadcast
(
&
reorder_handler
,
dy
,
reorder_src_memory
,
dst_memory
,
scales
);
}
}
else
{
// elementwise_mul & elementwise_div
std
::
unordered_map
<
int
,
dnnl
::
memory
>
args
;
std
::
shared_ptr
<
dnnl
::
binary
>
binary_prim
;
std
::
shared_ptr
<
dnnl
::
memory
>
post_op_memory
;
std
::
shared_ptr
<
dnnl
::
memory
>
src_0_memory
;
std
::
shared_ptr
<
dnnl
::
memory
>
src_1_memory
;
BinaryOneDNNHandler
<
T
>
binary_handler
(
dnnl
::
algorithm
::
binary_mul
,
axis
,
onednn_engine
,
ctx
.
GetPlace
(),
dout
,
x
,
nullptr
,
1.0
f
,
1.0
f
,
1.0
f
,
false
);
src_1_memory
=
binary_handler
.
AcquireSecondSrcMemory
(
x
);
if
(
BINARY_OP
==
dnnl
::
algorithm
::
binary_div
)
{
BinaryOneDNNHandler
<
T
>
post_op_binary_handler
(
dnnl
::
algorithm
::
binary_div
,
axis
,
onednn_engine
,
ctx
.
GetPlace
(),
y
,
y
,
nullptr
,
1.0
f
,
1.0
f
,
1.0
f
,
false
);
post_op_memory
=
post_op_binary_handler
.
AcquireSrcMemory
(
y
);
dnnl
::
post_ops
po
;
po
.
append_binary
(
dnnl
::
algorithm
::
binary_div
,
post_op_memory
->
get_desc
());
binary_handler
=
BinaryOneDNNHandler
<
T
>
(
dnnl
::
algorithm
::
binary_mul
,
axis
,
onednn_engine
,
ctx
.
GetPlace
(),
dout
,
out
,
nullptr
,
-
1.0
f
,
1.0
f
,
1.0
f
,
false
,
po
);
src_1_memory
=
binary_handler
.
AcquireSecondSrcMemory
(
out
);
}
src_0_memory
=
binary_handler
.
AcquireSrcMemory
(
dout
);
const
auto
dst_dy_memory
=
(
dout
->
dims
()
==
dy
->
dims
())
?
binary_handler
.
AcquireDstMemory
(
dy
)
:
binary_handler
.
AcquireDstMemory
();
binary_prim
=
binary_handler
.
AcquireForwardPrimitive
();
args
=
{{
DNNL_ARG_SRC_0
,
*
src_0_memory
},
{
DNNL_ARG_SRC_1
,
*
src_1_memory
},
{
DNNL_ARG_DST
,
*
dst_dy_memory
}};
if
(
BINARY_OP
==
dnnl
::
algorithm
::
binary_div
)
args
.
insert
({
DNNL_ARG_ATTR_MULTIPLE_POST_OP
(
0
)
|
DNNL_ARG_SRC_1
,
*
post_op_memory
});
binary_prim
->
execute
(
astream
,
args
);
broadcast_src_memory
=
dst_dy_memory
;
dst_memory
=
dst_dy_memory
;
}
astream
.
wait
();
if
(
dout
->
dims
()
!=
dy
->
dims
())
{
BroadcastReduction
<
T
>
(
ctx
,
onednn_engine
,
dy
,
dout
,
broadcast_src_memory
,
dst_memory
,
scales
,
BINARY_OP
==
dnnl
::
algorithm
::
binary_sub
);
}
else
{
dy
->
set_mem_desc
(
dst_memory
->
get_desc
());
}
}
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
已删除
100644 → 0
浏览文件 @
8de336f9
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_KERNEL
(
elementwise_mul
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
ops
::
EltwiseMKLDNNKernel
<
float
,
dnnl
::
algorithm
::
binary_mul
>
,
ops
::
EltwiseMKLDNNKernel
<
paddle
::
platform
::
bfloat16
,
dnnl
::
algorithm
::
binary_mul
>
,
ops
::
EltwiseMKLDNNKernel
<
int8_t
,
dnnl
::
algorithm
::
binary_mul
>
,
ops
::
EltwiseMKLDNNKernel
<
uint8_t
,
dnnl
::
algorithm
::
binary_mul
>
)
paddle/fluid/operators/elementwise/unity_build_rule.cmake
浏览文件 @
7575d37c
...
...
@@ -7,14 +7,12 @@
register_unity_group
(
cc
elementwise_add_op.cc
mkldnn/elementwise_add_mkldnn_op.cc
elementwise_div_op.cc
elementwise_floordiv_op.cc
elementwise_max_op.cc
elementwise_min_op.cc
elementwise_mod_op.cc
elementwise_mul_op.cc
mkldnn/elementwise_mul_mkldnn_op.cc
elementwise_pow_op.cc
elementwise_sub_op.cc
)
register_unity_group
(
...
...
paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
浏览文件 @
7575d37c
...
...
@@ -28,9 +28,9 @@
#include "paddle/phi/core/kernel_registry.h"
USE_OP_ITSELF
(
elementwise_add
);
USE_OP_DEVICE_KERNEL
(
elementwise_add
,
MKL
DNN
);
PD_DECLARE_KERNEL
(
add_raw
,
OneDNN
,
ONE
DNN
);
USE_OP_ITSELF
(
elementwise_mul
);
USE_OP_DEVICE_KERNEL
(
elementwise_mul
,
MKL
DNN
);
PD_DECLARE_KERNEL
(
multiply_raw
,
OneDNN
,
ONE
DNN
);
USE_OP_ITSELF
(
relu
);
PD_DECLARE_KERNEL
(
relu
,
OneDNN
,
ONEDNN
);
USE_OP_ITSELF
(
softmax
);
...
...
paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
浏览文件 @
7575d37c
...
...
@@ -28,7 +28,7 @@
#include "paddle/phi/core/kernel_registry.h"
USE_OP_ITSELF
(
elementwise_add
);
USE_OP_DEVICE_KERNEL
(
elementwise_add
,
MKL
DNN
);
PD_DECLARE_KERNEL
(
add_raw
,
OneDNN
,
ONE
DNN
);
USE_OP_ITSELF
(
relu
);
PD_DECLARE_KERNEL
(
relu
,
OneDNN
,
ONEDNN
);
USE_OP_ITSELF
(
softmax
);
...
...
paddle/phi/kernels/elementwise_kernel.cc
浏览文件 @
7575d37c
...
...
@@ -414,17 +414,3 @@ PD_REGISTER_KERNEL(elementwise_pow,
float
,
phi
::
dtype
::
float16
)
{}
#endif
#if defined PADDLE_WITH_MKLDNN
PD_REGISTER_KERNEL
(
subtract
,
OneDNN
,
ONEDNN
,
phi
::
SubtractKernel
,
float
,
phi
::
dtype
::
bfloat16
,
int8_t
,
uint8_t
)
{}
PD_REGISTER_KERNEL
(
divide
,
OneDNN
,
ONEDNN
,
phi
::
DivideKernel
,
float
,
phi
::
dtype
::
bfloat16
)
{}
#endif
paddle/phi/kernels/onednn/elementwise_kernel.cc
浏览文件 @
7575d37c
...
...
@@ -32,14 +32,14 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx,
float
scale_x
=
dev_ctx
.
HasDnnAttr
(
"Scale_x"
)
?
PADDLE_GET_CONST
(
float
,
dev_ctx
.
GetDnnAttr
(
"Scale_x"
))
:
1
;
:
1
.0
f
;
float
scale_y
=
dev_ctx
.
HasDnnAttr
(
"Scale_y"
)
?
PADDLE_GET_CONST
(
float
,
dev_ctx
.
GetDnnAttr
(
"Scale_y"
))
:
1
;
:
1
.0
f
;
float
scale_out
=
dev_ctx
.
HasDnnAttr
(
"Scale_out"
)
?
PADDLE_GET_CONST
(
float
,
dev_ctx
.
GetDnnAttr
(
"Scale_out"
))
:
1
;
:
1
.0
f
;
dnnl
::
post_ops
post_operations
;
funcs
::
AppendActivation
(
dev_ctx
,
post_operations
);
...
...
@@ -114,12 +114,14 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx,
astream
.
wait
();
if
(
handler
.
use_broadcasting_hack
==
false
)
{
out
->
set_mem_desc
(
dst_memory
->
get_desc
());
funcs
::
SetOutMemDescWithLogicalLayoutFusesSupport
(
dev_ctx
,
out
,
dst_memory
->
get_desc
());
}
else
{
auto
dims
=
dst_memory
->
get_desc
().
dims
();
dims
.
insert
(
dims
.
begin
(),
non_const_x
->
dims
()[
0
]);
dims
[
1
]
/=
dims
[
0
];
out
->
set_mem_desc
(
dst_memory
->
get_desc
().
reshape
(
dims
));
funcs
::
SetOutMemDescWithLogicalLayoutFusesSupport
(
dev_ctx
,
out
,
dst_memory
->
get_desc
().
reshape
(
dims
));
}
}
...
...
@@ -131,13 +133,40 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx,
int axis, \
DenseTensor* out) { \
ElementwiseKernel<T, algorithm>(dev_ctx, x, y, axis, out); \
} \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
DenseTensor* out) { \
ElementwiseKernel<T, algorithm>(dev_ctx, x, y, -1, out); \
}
DEFINE_ONEDNN_ELEMENTWISE_KERNEL
(
Add
,
dnnl
::
algorithm
::
binary_add
)
DEFINE_ONEDNN_ELEMENTWISE_KERNEL
(
Subtract
,
dnnl
::
algorithm
::
binary_sub
)
DEFINE_ONEDNN_ELEMENTWISE_KERNEL
(
Multiply
,
dnnl
::
algorithm
::
binary_mul
)
DEFINE_ONEDNN_ELEMENTWISE_KERNEL
(
Divide
,
dnnl
::
algorithm
::
binary_div
)
}
// namespace phi
PD_REGISTER_KERNEL
(
add_raw
,
OneDNN
,
ONEDNN
,
phi
::
AddRawKernel
,
float
,
phi
::
dtype
::
bfloat16
,
int8_t
,
uint8_t
)
{}
PD_REGISTER_KERNEL
(
add
,
OneDNN
,
ONEDNN
,
phi
::
AddKernel
,
float
,
phi
::
dtype
::
bfloat16
,
int8_t
,
uint8_t
)
{}
PD_REGISTER_KERNEL
(
subtract_raw
,
OneDNN
,
ONEDNN
,
...
...
@@ -147,9 +176,39 @@ PD_REGISTER_KERNEL(subtract_raw,
int8_t
,
uint8_t
)
{}
PD_REGISTER_KERNEL
(
subtract
,
OneDNN
,
ONEDNN
,
phi
::
SubtractKernel
,
float
,
phi
::
dtype
::
bfloat16
,
int8_t
,
uint8_t
)
{}
PD_REGISTER_KERNEL
(
multiply_raw
,
OneDNN
,
ONEDNN
,
phi
::
MultiplyRawKernel
,
float
,
phi
::
dtype
::
bfloat16
,
int8_t
,
uint8_t
)
{}
PD_REGISTER_KERNEL
(
multiply
,
OneDNN
,
ONEDNN
,
phi
::
MultiplyKernel
,
float
,
phi
::
dtype
::
bfloat16
,
int8_t
,
uint8_t
)
{}
PD_REGISTER_KERNEL
(
divide_raw
,
OneDNN
,
ONEDNN
,
phi
::
DivideRawKernel
,
float
,
phi
::
dtype
::
bfloat16
)
{}
PD_REGISTER_KERNEL
(
divide
,
OneDNN
,
ONEDNN
,
phi
::
DivideKernel
,
float
,
phi
::
dtype
::
bfloat16
)
{}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录