Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
78b30e97
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
78b30e97
编写于
11月 22, 2022
作者:
P
Piotr Paturej
提交者:
GitHub
11月 22, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[PHI] Migrate elementwise_div + all elementwise grad kernels (#48210)
* Migrate elementwise_div * Migrate elementwise grad kernels
上级
27f49254
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
505 addition
and
56 deletion
+505
-56
paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
...operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+0
-8
paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc
...operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc
+0
-32
paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
...operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
+0
-8
paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc
...operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc
+0
-8
paddle/phi/kernels/elementwise_kernel.cc
paddle/phi/kernels/elementwise_kernel.cc
+5
-0
paddle/phi/kernels/onednn/elementwise_grad_kernel.cc
paddle/phi/kernels/onednn/elementwise_grad_kernel.cc
+361
-0
paddle/phi/kernels/onednn/elementwise_kernel.cc
paddle/phi/kernels/onednn/elementwise_kernel.cc
+139
-0
未找到文件。
paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
浏览文件 @
78b30e97
...
...
@@ -25,11 +25,3 @@ REGISTER_OP_KERNEL(
dnnl
::
algorithm
::
binary_add
>
,
ops
::
EltwiseMKLDNNKernel
<
int8_t
,
dnnl
::
algorithm
::
binary_add
>
,
ops
::
EltwiseMKLDNNKernel
<
uint8_t
,
dnnl
::
algorithm
::
binary_add
>
)
// Registers the MKLDNN CPU gradient kernels of elementwise_add, instantiated
// for bfloat16 and float.
REGISTER_OP_KERNEL(
    elementwise_add_grad,
    MKLDNN,
    ::paddle::platform::CPUPlace,
    ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
                                 dnnl::algorithm::binary_add>,
    ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_add>)
paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc
已删除
100644 → 0
浏览文件 @
27f49254
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
namespace ops = paddle::operators;

// Forward MKLDNN CPU kernels of elementwise_div, instantiated for float and
// bfloat16.
REGISTER_OP_KERNEL(
    elementwise_div,
    MKLDNN,
    paddle::platform::CPUPlace,
    ops::EltwiseMKLDNNKernel<float, dnnl::algorithm::binary_div>,
    ops::EltwiseMKLDNNKernel<paddle::platform::bfloat16,
                             dnnl::algorithm::binary_div>)

// Gradient MKLDNN CPU kernels of elementwise_div, instantiated for bfloat16
// and float.
REGISTER_OP_KERNEL(
    elementwise_div_grad,
    MKLDNN,
    paddle::platform::CPUPlace,
    ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
                                 dnnl::algorithm::binary_div>,
    ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_div>)
paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
浏览文件 @
78b30e97
...
...
@@ -25,11 +25,3 @@ REGISTER_OP_KERNEL(
dnnl
::
algorithm
::
binary_mul
>
,
ops
::
EltwiseMKLDNNKernel
<
int8_t
,
dnnl
::
algorithm
::
binary_mul
>
,
ops
::
EltwiseMKLDNNKernel
<
uint8_t
,
dnnl
::
algorithm
::
binary_mul
>
)
// Registers the MKLDNN CPU gradient kernels of elementwise_mul, instantiated
// for bfloat16 and float.
REGISTER_OP_KERNEL(
    elementwise_mul_grad,
    MKLDNN,
    ::paddle::platform::CPUPlace,
    ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
                                 dnnl::algorithm::binary_mul>,
    ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_mul>)
paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc
浏览文件 @
78b30e97
...
...
@@ -25,11 +25,3 @@ REGISTER_OP_KERNEL(
dnnl
::
algorithm
::
binary_sub
>
,
ops
::
EltwiseMKLDNNKernel
<
int8_t
,
dnnl
::
algorithm
::
binary_sub
>
,
ops
::
EltwiseMKLDNNKernel
<
uint8_t
,
dnnl
::
algorithm
::
binary_sub
>
)
// Registers the MKLDNN CPU gradient kernels of elementwise_sub, instantiated
// for bfloat16 and float.
REGISTER_OP_KERNEL(
    elementwise_sub_grad,
    MKLDNN,
    ::paddle::platform::CPUPlace,
    ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
                                 dnnl::algorithm::binary_sub>,
    ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_sub>)
paddle/phi/kernels/elementwise_kernel.cc
浏览文件 @
78b30e97
...
...
@@ -414,3 +414,8 @@ PD_REGISTER_KERNEL(elementwise_pow,
float
,
phi
::
dtype
::
float16
)
{}
#endif
#ifdef PADDLE_WITH_MKLDNN
// Only when built with MKLDNN: register phi::DivideKernel for the OneDNN
// backend, instantiated for float and bfloat16.
PD_REGISTER_KERNEL(
    divide, OneDNN, ONEDNN, phi::DivideKernel, float, phi::dtype::bfloat16) {}
#endif
paddle/phi/kernels/onednn/elementwise_grad_kernel.cc
0 → 100644
浏览文件 @
78b30e97
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/elementwise_add_grad_kernel.h"
#include "paddle/phi/kernels/elementwise_divide_grad_kernel.h"
#include "paddle/phi/kernels/elementwise_multiply_grad_kernel.h"
#include "paddle/phi/kernels/elementwise_subtract_grad_kernel.h"
#include "paddle/phi/backends/onednn/onednn_reuse.h"
#include "paddle/phi/core/kernel_registry.h"
namespace
phi
{
namespace
funcs
{
/**
 * Computes a dimension vector with the same rank as `x` that describes `y`
 * aligned against `x`'s shape.
 *
 * The result starts as all ones. When both tensors have the same rank, every
 * axis on which the two sizes agree takes that size. When the ranks differ,
 * `y`'s dims are matched against `x`'s from left to right: each axis of `x`
 * equal to the next unmatched dim of `y` takes that dim, every other axis
 * stays 1.
 *
 * @param x tensor whose rank determines the length of the result
 * @param y tensor whose dims are placed into the result
 * @return vector of `x`'s rank holding either 1 or a dim of `y` per axis
 */
inline std::vector<int64_t> CalculateBroadcastedDims(
    const phi::DenseTensor* x, const phi::DenseTensor* y) {
  const auto src_tz = phi::vectorize(x->dims());
  const auto dst_tz = phi::vectorize(y->dims());

  std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);

  if (src_tz.size() == dst_tz.size()) {
    for (size_t i = 0; i < src_tz.size(); ++i) {
      if (src_tz[i] == dst_tz[i]) {
        dst_tz_ex[i] = dst_tz[i];
      }
    }
  } else {
    // `j` is bounded in the loop condition so that dst_tz[j] is never read
    // once all dims of `y` are consumed -- in particular not when `y` has no
    // dims at all, where the very first read would be out of bounds.
    for (size_t i = 0, j = 0; i < src_tz.size() && j < dst_tz.size(); ++i) {
      if (src_tz[i] == dst_tz[j]) {
        dst_tz_ex[i] = dst_tz[j++];
      }
    }
  }

  return dst_tz_ex;
}
/**
 * Executes a oneDNN reorder from `src_memory` into `dst_memory`, with
 * `scales` installed as the reorder's output scales (mask 0).
 *
 * @param reorder_handler handler that supplies the reorder primitive
 * @param grad_tensor     not read by this helper
 * @param src_memory      source memory of the reorder
 * @param dst_memory      destination memory of the reorder
 * @param scales          output scales set on the reorder attributes
 */
inline void AddSubNonBroadcast(
    ReorderOneDNNHandler* reorder_handler,
    phi::DenseTensor* grad_tensor,
    const std::shared_ptr<dnnl::memory>& src_memory,
    const std::shared_ptr<dnnl::memory>& dst_memory,
    const std::vector<float>& scales) {
  dnnl::primitive_attr scaled_attr;
  scaled_attr.set_output_scales(0, scales);

  auto reorder_prim =
      reorder_handler->AcquireReorder(dst_memory, src_memory, scaled_attr);

  // A RecordEvent tagged "int_reorder" stays alive while the reorder runs.
  paddle::platform::RecordEvent reorder_event(
      "int_reorder",
      paddle::platform::TracerEventType::UserDefined,
      2,
      paddle::platform::EventRole::kUniqueOp);

  reorder_prim->execute(
      OneDNNContext::tls().get_stream(), *src_memory, *dst_memory);
}
/**
 * Sum-reduces `src_memory` (laid out like `dout`) into `grad_tensor` with a
 * oneDNN reduction primitive, then stores on `grad_tensor` the destination
 * memory descriptor reshaped to `grad_tensor`'s own dims.
 *
 * @param place         place handed to the reduction handler
 * @param onednn_engine engine handed to the reduction handler
 * @param grad_tensor   tensor receiving the reduced gradient
 * @param dout          tensor describing the shape of the reduction source
 * @param src_memory    memory bound as DNNL_ARG_SRC
 * @param dst_memory    taken by value and overwritten locally with the
 *                      handler's destination memory; the caller's pointer is
 *                      not modified
 * @param scales        scales[0] is the alpha of the eltwise_linear post-op
 *                      that is appended when `is_sub` is true
 * @param is_sub        whether to append that eltwise_linear post-op
 */
template <typename T>
inline void BroadcastReduction(const Place& place,
                               const dnnl::engine& onednn_engine,
                               phi::DenseTensor* grad_tensor,
                               const phi::DenseTensor* dout,
                               const std::shared_ptr<dnnl::memory>& src_memory,
                               std::shared_ptr<dnnl::memory> dst_memory,
                               const std::vector<float>& scales,
                               const bool is_sub) {
  dnnl::primitive_attr broadcast_reduction_attr;

  // For subtraction the reduced result is additionally passed through a
  // linear post-op: alpha = scales[0], beta = 0.
  if (is_sub) {
    dnnl::post_ops po;
    po.append_eltwise(
        1.0f, dnnl::algorithm::eltwise_linear, scales[0], 0);
    broadcast_reduction_attr.set_post_ops(po);
  }

  ReductionOneDNNHandler<T> reduction_handler(
      dnnl::algorithm::reduction_sum,
      0.0f,
      0.0f,
      onednn_engine,
      place,
      dout,
      grad_tensor,
      CalculateBroadcastedDims(dout, grad_tensor),
      broadcast_reduction_attr);
  dst_memory = reduction_handler.AcquireDstMemory(grad_tensor);

  auto reduction_p = reduction_handler.AcquireForwardPrimitive();

  // Bind the thread-local stream by reference, as the kernels in this file
  // do, instead of copying the stream handle.
  auto& astream = OneDNNContext::tls().get_stream();
  reduction_p->execute(astream,
                       {
                           {DNNL_ARG_SRC, *src_memory},
                           {DNNL_ARG_DST, *dst_memory},
                       });
  astream.wait();

  grad_tensor->set_mem_desc(dst_memory->get_desc().reshape(
      phi::vectorize<int64_t>(grad_tensor->dims())));
}
}
// namespace funcs
template
<
typename
T
,
dnnl
::
algorithm
BINARY_OP
>
void
ElementwiseGradKernel
(
const
OneDNNContext
&
dev_ctx
,
const
DenseTensor
&
x
,
const
DenseTensor
&
y
,
const
DenseTensor
*
out
,
const
DenseTensor
&
dout
,
int
axis
,
DenseTensor
*
dx
,
DenseTensor
*
dy
)
{
const
auto
&
onednn_engine
=
dev_ctx
.
GetEngine
();
// oneDNN's binary is optimized for broadcasting y into x, so in other case
// we have to swap tensors to achieve optimal performance
bool
swap_x_y
=
false
;
auto
*
non_const_x
=
&
x
;
auto
*
non_const_y
=
&
y
;
if
(
x
.
numel
()
<
y
.
numel
())
{
std
::
swap
(
non_const_x
,
non_const_y
);
std
::
swap
(
dx
,
dy
);
swap_x_y
=
true
;
}
std
::
vector
<
float
>
scales
{
1.0
};
if
(
swap_x_y
)
{
scales
[
0
]
=
(
BINARY_OP
==
dnnl
::
algorithm
::
binary_add
)
?
1
:
-
1
;
}
auto
tz
=
phi
::
vectorize
<
int64_t
>
(
dout
.
dims
());
funcs
::
ReorderOneDNNHandler
reorder_handler
(
tz
,
dout
.
dtype
(),
funcs
::
ToOneDNNDataType
(
dout
.
dtype
()),
onednn_engine
);
auto
reorder_src_memory
=
reorder_handler
.
AcquireSrcMemory
(
dout
.
mem_desc
(),
funcs
::
to_void_cast
(
dout
.
data
<
T
>
()));
std
::
shared_ptr
<
dnnl
::
memory
>
dst_memory
;
std
::
shared_ptr
<
dnnl
::
memory
>
broadcast_src_memory
=
reorder_src_memory
;
auto
&
astream
=
OneDNNContext
::
tls
().
get_stream
();
if
(
dx
)
{
// elementwise_add & elementwise_sub
if
(
BINARY_OP
==
dnnl
::
algorithm
::
binary_add
||
BINARY_OP
==
dnnl
::
algorithm
::
binary_sub
)
{
if
(
dout
.
dims
()
==
dx
->
dims
())
{
dst_memory
=
reorder_handler
.
AcquireDstMemory
(
dx
,
dout
.
mem_desc
(),
dev_ctx
.
GetPlace
());
AddSubNonBroadcast
(
&
reorder_handler
,
dx
,
reorder_src_memory
,
dst_memory
,
scales
);
}
}
else
{
// elementwise_mul & elementwise_div
funcs
::
BinaryOneDNNHandler
<
T
>
binary_handler
(
BINARY_OP
,
axis
,
onednn_engine
,
dev_ctx
.
GetPlace
(),
&
dout
,
non_const_y
,
dx
,
1.0
f
,
1.0
f
,
1.0
f
,
false
);
const
auto
src_dout_memory
=
binary_handler
.
AcquireSrcMemory
(
&
dout
);
const
auto
src_y_memory
=
binary_handler
.
AcquireSecondSrcMemory
(
non_const_y
);
dst_memory
=
binary_handler
.
AcquireDstMemory
(
dx
);
const
auto
binary_prim
=
binary_handler
.
AcquireForwardPrimitive
();
const
std
::
unordered_map
<
int
,
dnnl
::
memory
>
args
=
{
{
DNNL_ARG_SRC_0
,
*
src_dout_memory
},
{
DNNL_ARG_SRC_1
,
*
src_y_memory
},
{
DNNL_ARG_DST
,
*
dst_memory
}};
binary_prim
->
execute
(
astream
,
args
);
}
astream
.
wait
();
if
(
dout
.
dims
()
!=
dx
->
dims
())
{
funcs
::
BroadcastReduction
<
T
>
(
dev_ctx
.
GetPlace
(),
onednn_engine
,
dx
,
&
dout
,
broadcast_src_memory
,
dst_memory
,
scales
,
BINARY_OP
==
dnnl
::
algorithm
::
binary_sub
);
}
else
{
dx
->
set_mem_desc
(
dst_memory
->
get_desc
());
}
}
if
(
dy
)
{
// elementwise_add & elementwise_sub
if
(
BINARY_OP
==
dnnl
::
algorithm
::
binary_add
||
BINARY_OP
==
dnnl
::
algorithm
::
binary_sub
)
{
if
(
dout
.
dims
()
==
dy
->
dims
())
{
dst_memory
=
reorder_handler
.
AcquireDstMemory
(
dy
,
dout
.
mem_desc
(),
dev_ctx
.
GetPlace
());
AddSubNonBroadcast
(
&
reorder_handler
,
dy
,
reorder_src_memory
,
dst_memory
,
scales
);
}
}
else
{
// elementwise_mul & elementwise_div
std
::
unordered_map
<
int
,
dnnl
::
memory
>
args
;
std
::
shared_ptr
<
dnnl
::
binary
>
binary_prim
;
std
::
shared_ptr
<
dnnl
::
memory
>
post_op_memory
;
std
::
shared_ptr
<
dnnl
::
memory
>
src_0_memory
;
std
::
shared_ptr
<
dnnl
::
memory
>
src_1_memory
;
funcs
::
BinaryOneDNNHandler
<
T
>
binary_handler
(
dnnl
::
algorithm
::
binary_mul
,
axis
,
onednn_engine
,
dev_ctx
.
GetPlace
(),
&
dout
,
non_const_x
,
nullptr
,
1.0
f
,
1.0
f
,
1.0
f
,
false
);
src_1_memory
=
binary_handler
.
AcquireSecondSrcMemory
(
non_const_x
);
if
(
BINARY_OP
==
dnnl
::
algorithm
::
binary_div
)
{
funcs
::
BinaryOneDNNHandler
<
T
>
post_op_binary_handler
(
dnnl
::
algorithm
::
binary_div
,
axis
,
onednn_engine
,
dev_ctx
.
GetPlace
(),
non_const_y
,
non_const_y
,
nullptr
,
1.0
f
,
1.0
f
,
1.0
f
,
false
);
post_op_memory
=
post_op_binary_handler
.
AcquireSrcMemory
(
non_const_y
);
dnnl
::
post_ops
po
;
po
.
append_binary
(
dnnl
::
algorithm
::
binary_div
,
post_op_memory
->
get_desc
());
binary_handler
=
funcs
::
BinaryOneDNNHandler
<
T
>
(
dnnl
::
algorithm
::
binary_mul
,
axis
,
onednn_engine
,
dev_ctx
.
GetPlace
(),
&
dout
,
out
,
nullptr
,
-
1.0
f
,
1.0
f
,
1.0
f
,
false
,
po
);
src_1_memory
=
binary_handler
.
AcquireSecondSrcMemory
(
out
);
}
src_0_memory
=
binary_handler
.
AcquireSrcMemory
(
&
dout
);
const
auto
dst_dy_memory
=
(
dout
.
dims
()
==
dy
->
dims
())
?
binary_handler
.
AcquireDstMemory
(
dy
)
:
binary_handler
.
AcquireDstMemory
();
binary_prim
=
binary_handler
.
AcquireForwardPrimitive
();
args
=
{{
DNNL_ARG_SRC_0
,
*
src_0_memory
},
{
DNNL_ARG_SRC_1
,
*
src_1_memory
},
{
DNNL_ARG_DST
,
*
dst_dy_memory
}};
if
(
BINARY_OP
==
dnnl
::
algorithm
::
binary_div
)
args
.
insert
({
DNNL_ARG_ATTR_MULTIPLE_POST_OP
(
0
)
|
DNNL_ARG_SRC_1
,
*
post_op_memory
});
binary_prim
->
execute
(
astream
,
args
);
broadcast_src_memory
=
dst_dy_memory
;
dst_memory
=
dst_dy_memory
;
}
astream
.
wait
();
if
(
dout
.
dims
()
!=
dy
->
dims
())
{
funcs
::
BroadcastReduction
<
T
>
(
dev_ctx
.
GetPlace
(),
onednn_engine
,
dy
,
&
dout
,
broadcast_src_memory
,
dst_memory
,
scales
,
BINARY_OP
==
dnnl
::
algorithm
::
binary_sub
);
}
else
{
dy
->
set_mem_desc
(
dst_memory
->
get_desc
());
}
}
}
#define DEFINE_ONEDNN_ELEMENTWISE_GRAD_KERNEL(name, algorithm) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
const DenseTensor& dout, \
int axis, \
DenseTensor* dx, \
DenseTensor* dy) { \
ElementwiseGradKernel<T, algorithm>( \
dev_ctx, x, y, nullptr, dout, axis, dx, dy); \
}
DEFINE_ONEDNN_ELEMENTWISE_GRAD_KERNEL
(
Add
,
dnnl
::
algorithm
::
binary_add
)
DEFINE_ONEDNN_ELEMENTWISE_GRAD_KERNEL
(
Subtract
,
dnnl
::
algorithm
::
binary_sub
)
DEFINE_ONEDNN_ELEMENTWISE_GRAD_KERNEL
(
Multiply
,
dnnl
::
algorithm
::
binary_mul
)
template
<
typename
T
,
typename
Context
>
void
DivideGradKernel
(
const
Context
&
dev_ctx
,
const
DenseTensor
&
x
,
const
DenseTensor
&
y
,
const
DenseTensor
&
out
,
const
DenseTensor
&
dout
,
int
axis
,
DenseTensor
*
dx
,
DenseTensor
*
dy
)
{
ElementwiseGradKernel
<
T
,
dnnl
::
algorithm
::
binary_div
>
(
dev_ctx
,
x
,
y
,
&
out
,
dout
,
axis
,
dx
,
dy
);
}
}
// namespace phi
// OneDNN registrations of the four elementwise gradient kernels, each
// instantiated for float and bfloat16. The empty braces are the (unused)
// per-kernel customization body of the registration macro.
PD_REGISTER_KERNEL(add_grad,
                   OneDNN,
                   ONEDNN,
                   phi::AddGradKernel,
                   float,
                   phi::dtype::bfloat16) {}

PD_REGISTER_KERNEL(subtract_grad,
                   OneDNN,
                   ONEDNN,
                   phi::SubtractGradKernel,
                   float,
                   phi::dtype::bfloat16) {}

PD_REGISTER_KERNEL(multiply_grad,
                   OneDNN,
                   ONEDNN,
                   phi::MultiplyGradKernel,
                   float,
                   phi::dtype::bfloat16) {}

PD_REGISTER_KERNEL(divide_grad,
                   OneDNN,
                   ONEDNN,
                   phi::DivideGradKernel,
                   float,
                   phi::dtype::bfloat16) {}
paddle/phi/kernels/onednn/elementwise_kernel.cc
0 → 100644
浏览文件 @
78b30e97
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/elementwise_add_kernel.h"
#include "paddle/phi/kernels/elementwise_divide_kernel.h"
#include "paddle/phi/kernels/elementwise_multiply_kernel.h"
#include "paddle/phi/kernels/elementwise_subtract_kernel.h"
#include "paddle/phi/backends/onednn/onednn_reuse.h"
#include "paddle/phi/core/kernel_registry.h"
namespace
phi
{
template
<
typename
T
,
dnnl
::
algorithm
BINARY_OP
>
void
ElementwiseKernel
(
const
OneDNNContext
&
dev_ctx
,
const
DenseTensor
&
x
,
const
DenseTensor
&
y
,
int
axis
,
DenseTensor
*
out
)
{
const
auto
&
onednn_engine
=
dev_ctx
.
GetEngine
();
float
scale_x
=
dev_ctx
.
HasDnnAttr
(
"Scale_x"
)
?
PADDLE_GET_CONST
(
float
,
dev_ctx
.
GetDnnAttr
(
"Scale_x"
))
:
1
;
float
scale_y
=
dev_ctx
.
HasDnnAttr
(
"Scale_y"
)
?
PADDLE_GET_CONST
(
float
,
dev_ctx
.
GetDnnAttr
(
"Scale_y"
))
:
1
;
float
scale_out
=
dev_ctx
.
HasDnnAttr
(
"Scale_out"
)
?
PADDLE_GET_CONST
(
float
,
dev_ctx
.
GetDnnAttr
(
"Scale_out"
))
:
1
;
dnnl
::
post_ops
post_operations
;
funcs
::
AppendActivation
(
dev_ctx
,
post_operations
);
auto
*
non_const_x
=
&
x
;
auto
*
non_const_y
=
&
y
;
funcs
::
BinaryOneDNNHandler
<
T
>
handler
(
BINARY_OP
,
axis
,
onednn_engine
,
dev_ctx
.
GetPlace
(),
non_const_x
,
non_const_y
,
out
,
scale_x
,
scale_y
,
scale_out
,
true
,
post_operations
);
// oneDNN's binary is optimized for broadcasting y into x, so in other case
// we have to swap tensors to achieve optimal performance
if
(
x
.
numel
()
<
y
.
numel
())
{
std
::
swap
(
non_const_x
,
non_const_y
);
}
const
auto
src_x_memory
=
handler
.
AcquireSrcMemory
(
non_const_x
);
const
auto
src_y_memory
=
handler
.
AcquireSecondSrcMemory
(
non_const_y
);
// (jczaja) For Inplace src and dst should be the same memory object.
// So x should share buffer with z. But UT mechanics is testing inplace
// execution for this op not checking that x can be bradcasted to match in
// shape y tensor.
// This is wrong as when x is to be broadcasted then z(out) will match the
// shape of y which is bigger than x. Hence if x is smaller in shape than z
// and they share a buffer (of
// shape x) then this buffer is not big enough to hold result of elementwise
// operation.
const
bool
reuse_x_memory
=
non_const_x
->
numel
()
==
out
->
numel
()
&&
non_const_x
->
IsSharedBufferWith
(
*
out
);
std
::
shared_ptr
<
dnnl
::
memory
>
dst_memory
;
if
(
reuse_x_memory
)
{
dst_memory
=
src_x_memory
;
// NOTE(chenfeiyu): when the output reuses memory from other tensor rather
// than allocate its own, it's still need to take care of its data type.
// Unfortunately, paddle's operator only infers the output' shape, but not
// the data type. Alloc<T> takes care of allocation and data type
// normally, but if the memory is already allocated and there is no need
// to re-allocate, it just set the data type. So this it added there to
// get the right data type.
dev_ctx
.
template
Alloc
<
T
>(
out
);
}
else
{
dst_memory
=
handler
.
AcquireDstMemory
(
out
);
}
const
auto
binary_prim
=
handler
.
AcquireForwardPrimitive
();
auto
&
astream
=
OneDNNContext
::
tls
().
get_stream
();
const
std
::
unordered_map
<
int
,
dnnl
::
memory
>
args
=
{
{
DNNL_ARG_SRC_0
,
*
src_x_memory
},
{
DNNL_ARG_SRC_1
,
*
src_y_memory
},
{
DNNL_ARG_DST
,
*
dst_memory
}};
binary_prim
->
execute
(
astream
,
args
);
astream
.
wait
();
if
(
handler
.
use_broadcasting_hack
==
false
)
{
out
->
set_mem_desc
(
dst_memory
->
get_desc
());
}
else
{
auto
dims
=
dst_memory
->
get_desc
().
dims
();
dims
.
insert
(
dims
.
begin
(),
non_const_x
->
dims
()[
0
]);
dims
[
1
]
/=
dims
[
0
];
out
->
set_mem_desc
(
dst_memory
->
get_desc
().
reshape
(
dims
));
}
}
// Generates `<op_name>RawKernel<T, Context>`: a wrapper that forwards to
// ElementwiseKernel<T, binary_algo>.
#define DEFINE_ONEDNN_ELEMENTWISE_KERNEL(op_name, binary_algo)        \
  template <typename T, typename Context>                             \
  void op_name##RawKernel(const Context& dev_ctx,                     \
                          const DenseTensor& x,                       \
                          const DenseTensor& y,                       \
                          int axis,                                   \
                          DenseTensor* out) {                         \
    ElementwiseKernel<T, binary_algo>(dev_ctx, x, y, axis, out);      \
  }

// DivideRawKernel
DEFINE_ONEDNN_ELEMENTWISE_KERNEL(Divide, dnnl::algorithm::binary_div)
}
// namespace phi
// OneDNN registration of phi::DivideRawKernel, instantiated for float and
// bfloat16.
PD_REGISTER_KERNEL(divide_raw,
                   OneDNN,
                   ONEDNN,
                   phi::DivideRawKernel,
                   float,
                   phi::dtype::bfloat16) {}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录