Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
39a5424e
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
39a5424e
编写于
3月 09, 2021
作者:
J
Jacek Czaja
提交者:
GitHub
3月 09, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[oneDNN] elementwise add bf16 grad kernel with broadcasting (#31385)
上级
5f621321
变更
7
隐藏空白更改
内联
并排
Showing
7 changed files
with
131 additions
and
19 deletions
+131
-19
paddle/fluid/operators/elementwise/elementwise_op.h
paddle/fluid/operators/elementwise/elementwise_op.h
+4
-1
paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
...operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+23
-8
paddle/fluid/platform/mkldnn_reuse.h
paddle/fluid/platform/mkldnn_reuse.h
+44
-0
python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_bf16_mkldnn_op.py
...s/unittests/mkldnn/test_elementwise_add_bf16_mkldnn_op.py
+37
-4
python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
.../tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
+9
-3
python/paddle/fluid/tests/unittests/mkldnn/test_reshape_bf16_op.py
...ddle/fluid/tests/unittests/mkldnn/test_reshape_bf16_op.py
+4
-3
python/paddle/fluid/tests/unittests/op_test.py
python/paddle/fluid/tests/unittests/op_test.py
+10
-0
未找到文件。
paddle/fluid/operators/elementwise/elementwise_op.h
浏览文件 @
39a5424e
...
...
@@ -277,7 +277,10 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
#ifdef PADDLE_WITH_MKLDNN
// If broadcasting is needed, use native implementation
// Predicate deciding whether the oneDNN elementwise_add grad kernel may be
// selected for the current inputs. It compares only the innermost (last)
// dimension of "X" and "Y".
// The former unconditional `return (X dims == Y dims);` has been dropped: it
// preceded the statements below and made the last-dimension check
// unreachable.
auto CanMKLDNNElementwiseAddGradBeUsed = [&]() {
  auto dx_dims = ctx.Input<Tensor>("X")->dims();
  auto dy_dims = ctx.Input<Tensor>("Y")->dims();
  // No broadcast or broadcasting of data on inner dims is supported
  return (dx_dims[dx_dims.size() - 1] == dy_dims[dy_dims.size() - 1]);
};
if
(
this
->
CanMKLDNNBeUsed
(
ctx
,
input_data_type
)
&&
...
...
paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
浏览文件 @
39a5424e
...
...
@@ -64,14 +64,29 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
}
if
(
dy
)
{
auto
reorder_dst_memory_p
=
handler
.
AcquireDstMemory
(
dy
,
dout
->
format
(),
ctx
.
GetPlace
());
auto
reorder_p
=
handler
.
AcquireReorder
(
reorder_dst_memory_p
,
reorder_src_memory_p
);
platform
::
RecordEvent
record_reorder
(
"int_reorder"
,
platform
::
EventRole
::
kUniqueOp
);
reorder_p
->
execute
(
astream
,
*
reorder_src_memory_p
,
*
reorder_dst_memory_p
);
astream
.
wait
();
// Direct copy
if
(
dout
->
dims
()
==
dy
->
dims
())
{
auto
reorder_dst_memory_p
=
handler
.
AcquireDstMemory
(
dy
,
dout
->
format
(),
ctx
.
GetPlace
());
auto
reorder_p
=
handler
.
AcquireReorder
(
reorder_dst_memory_p
,
reorder_src_memory_p
);
platform
::
RecordEvent
record_reorder
(
"int_reorder"
,
platform
::
EventRole
::
kUniqueOp
);
reorder_p
->
execute
(
astream
,
*
reorder_src_memory_p
,
*
reorder_dst_memory_p
);
astream
.
wait
();
}
else
{
// Broadcasting
platform
::
ReductionMKLDNNHandler
<
T
>
handler_sum
(
dnnl
::
algorithm
::
reduction_sum
,
0.0
f
,
0.0
f
,
dev_ctx
,
onednn_engine
,
ctx
.
GetPlace
(),
dout
,
dy
,
ctx
.
InputName
(
framework
::
GradVarName
(
"Out"
)));
auto
dy_memory_p
=
handler_sum
.
AcquireDstMemory
(
dy
);
auto
reduction_p
=
handler_sum
.
AcquireForwardPrimitive
();
reduction_p
->
execute
(
astream
,
{{
DNNL_ARG_SRC
,
*
reorder_src_memory_p
},
{
DNNL_ARG_DST
,
*
dy_memory_p
}});
astream
.
wait
();
}
}
}
};
...
...
paddle/fluid/platform/mkldnn_reuse.h
浏览文件 @
39a5424e
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <memory>
#include <sstream>
#include <string>
...
...
@@ -621,6 +622,49 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::binary> {
}
};
// Handler for a oneDNN reduction primitive that reduces tensor `x` to the
// (rank-extended) shape of tensor `y`.
//
// Constructor parameters:
//   algo      - reduction algorithm; also folded into the handler key.
//   p, eps    - forwarded unchanged to AcquireForwardPrimitiveDescriptor.
//   dev_ctx, engine, cpu_place - forwarded to the MKLDNNHandlerT base.
//   x         - source tensor; must have kMKLDNN layout and a defined format.
//   y         - destination tensor; only its dims are read here.
//   uniq_name - name mixed into the handler key.
//
// NOTE(review): the key is built from x's dims, uniq_name and algo only; y's
// dims are not part of it - confirm that is sufficient for all callers.
template <typename T>
class ReductionMKLDNNHandler
    : public platform::MKLDNNHandlerT<T, dnnl::reduction> {
 public:
  ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p,
                         const float eps, const MKLDNNDeviceContext& dev_ctx,
                         const mkldnn::engine engine,
                         platform::Place cpu_place, const Tensor* x,
                         const Tensor* y, const std::string& uniq_name)
      : platform::MKLDNNHandlerT<T, dnnl::reduction>(
            dev_ctx, engine, cpu_place,
            platform::CreateKey(dev_ctx, framework::vectorize(x->dims()),
                                uniq_name,
                                (std::to_string(static_cast<int>(algo))))) {
    // The descriptor is only built when the base handler does not report a
    // cached one.
    if (!this->isCached()) {
      PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN,
                        platform::errors::InvalidArgument(
                            "Wrong layout set for X tensor."));
      PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef,
                        platform::errors::InvalidArgument(
                            "Wrong format set for X tensor."));

      const auto src_tz = framework::vectorize(x->dims());
      const auto dst_tz = framework::vectorize(y->dims());

      // For oneDNN dimensionality should match so we need to
      // extend Y tensor dims with values of 1 (before and after pattern).
      // Y's dims are matched greedily, left to right, against X's dims;
      // every unmatched position keeps the initial value 1.
      // The `j < dst_tz.size()` bound stops the scan once all of Y's dims
      // are placed. Without it dst_tz[j] was read past the end whenever Y's
      // last dim matched before X's last dim (e.g. X=[2,100,100], Y=[100]).
      // NOTE(review): greedy matching is ambiguous when equal extents repeat
      // in X; the intended alignment may need to come from the op's axis.
      std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
      size_t j = 0;
      for (size_t i = 0; i < src_tz.size() && j < dst_tz.size(); ++i) {
        if (src_tz[i] == dst_tz[j]) {
          dst_tz_ex[i] = dst_tz[j++];
        }
      }

      // Source and destination descriptors share T's data type and x's
      // format.
      const auto src_md = dnnl::memory::desc(
          src_tz, platform::MKLDNNGetDataType<T>(), x->format());
      const auto dst_md = memory::desc(
          dst_tz_ex, platform::MKLDNNGetDataType<T>(), x->format());

      this->AcquireForwardPrimitiveDescriptor(algo, src_md, dst_md, p, eps);
    }
  }
};
template
<
typename
T
>
class
ActivationMKLDNNHandler
:
public
MKLDNNHandlerT
<
T
,
mkldnn
::
eltwise_forward
,
...
...
python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_bf16_mkldnn_op.py
浏览文件 @
39a5424e
...
...
@@ -45,13 +45,13 @@ class TestElementwiseAddBf16MklDNNOp(OpTest):
def test_check_output(self):
    """Run the forward-output check on a CPU place."""
    cpu_place = core.CPUPlace()
    self.check_output_with_place(cpu_place)
# elementwise_add grad is just passing upper gradients to either X or Y or both
# elementwise_add grad
(no broadcasting)
is just passing upper gradients to either X or Y or both
def test_check_grad_normal(self):
    """Check gradients w.r.t. both X and Y on a CPU place.

    The upstream gradient fed in is ``self.x_bf16``; the expected gradients
    for X and Y are both ``self.x``.

    ``user_defined_grads`` used to be passed twice (once with the bf16
    arrays and once with the fp32 arrays). A repeated keyword argument is a
    SyntaxError, so only the fp32 value is kept.
    """
    self.check_grad_with_place(
        core.CPUPlace(), ["X", "Y"],
        "Out",
        check_dygraph=False,
        user_defined_grads=[self.x, self.x],
        user_defined_grad_outputs=[self.x_bf16])
def
test_check_grad_ingore_x
(
self
):
...
...
@@ -59,7 +59,7 @@ class TestElementwiseAddBf16MklDNNOp(OpTest):
core
.
CPUPlace
(),
[
"Y"
],
"Out"
,
check_dygraph
=
False
,
user_defined_grads
=
[
self
.
y
_bf16
],
user_defined_grads
=
[
self
.
y
],
user_defined_grad_outputs
=
[
self
.
y_bf16
])
def
test_check_grad_ingore_y
(
self
):
...
...
@@ -67,7 +67,40 @@ class TestElementwiseAddBf16MklDNNOp(OpTest):
core
.
CPUPlace
(),
[
"X"
],
"Out"
,
check_dygraph
=
False
,
user_defined_grads
=
[
self
.
x_bf16
],
user_defined_grads
=
[
self
.
x
],
user_defined_grad_outputs
=
[
self
.
x_bf16
])
class TestElementwiseAddBroadCastingBf16MklDNNOp(
        TestElementwiseAddBf16MklDNNOp):
    """Variant of the bf16 elementwise_add test with a broadcast Y.

    X has shape [2, 3, 4, 100] and Y has shape [100], so the expected Y
    gradient is the upstream gradient summed over the first three axes.
    """

    def generate_data(self):
        """Create fp32 inputs X, Y and the reference output X + Y."""
        x_shape = [2, 3, 4, 100]
        y_shape = [100]
        self.x = np.random.uniform(1, 2, x_shape).astype(np.float32)
        self.y = np.random.uniform(1, 2, y_shape).astype(np.float32)
        self.out = np.add(self.x, self.y)

    def compute_reduced_gradients(self, out_grads):
        """Sum ``out_grads`` along axes 0, 1 and 2 and return it flattened."""
        # Compute partial sums along all axes but last one
        reduced = out_grads
        for axis in (0, 1, 2):
            reduced = np.add.reduceat(reduced, [0], axis=axis)
        return reduced.flatten()

    def test_check_grad_normal(self):
        """Check gradients w.r.t. X and Y; Y's gradient is the reduced one."""
        expected_grads = [self.x, self.compute_reduced_gradients(self.x)]
        self.check_grad_with_place(
            core.CPUPlace(), ["X", "Y"],
            "Out",
            check_dygraph=False,
            user_defined_grads=expected_grads,
            user_defined_grad_outputs=[self.x_bf16])

    def test_check_grad_ingore_x(self):
        """Check the gradient w.r.t. Y only."""
        expected_grads = [self.compute_reduced_gradients(self.x)]
        self.check_grad_with_place(
            core.CPUPlace(), ["Y"],
            "Out",
            check_dygraph=False,
            user_defined_grads=expected_grads,
            user_defined_grad_outputs=[self.x_bf16])
...
...
python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
浏览文件 @
39a5424e
...
...
@@ -17,6 +17,7 @@ import unittest
import
numpy
as
np
from
paddle.fluid.tests.unittests.op_test
import
skip_check_grad_ci
from
paddle.fluid.tests.unittests.test_elementwise_add_op
import
TestElementwiseAddOp
from
paddle
import
enable_static
class
TestMKLDNNElementwiseAddOp
(
TestElementwiseAddOp
):
...
...
@@ -51,13 +52,17 @@ class TestMKLDNNElementwiseAddOp4(TestMKLDNNElementwiseAddOp):
def
test_check_grad_normal
(
self
):
pass
def
test_check_grad_ingore_x
(
self
):
pass
def
test_check_grad_ingore_y
(
self
):
pass
class TestMKLDNNElementwiseAddOp5(TestMKLDNNElementwiseAddOp):
    """elementwise_add case where Y ([100]) is broadcast over X ([2, 3, 4, 100])."""

    def init_input_output(self):
        """Create X, Y in ``self.dtype`` and the reference output X + Y."""
        x_shape = [2, 3, 4, 100]
        y_shape = [100]
        self.x = np.random.uniform(1, 2, x_shape).astype(self.dtype)
        self.y = np.random.uniform(1, 2, y_shape).astype(self.dtype)
        self.out = np.add(self.x, self.y)
class
TestMKLDNNElementwiseAddOp_broadcast_3
(
TestMKLDNNElementwiseAddOp
):
def
init_input_output
(
self
):
self
.
x
=
np
.
random
.
rand
(
2
,
10
,
12
,
3
).
astype
(
self
.
dtype
)
...
...
@@ -150,4 +155,5 @@ class TestUint8Scales(TestInt8Scales):
# Script entry point: call enable_static() first, then start the unittest runner.
if
__name__
==
'__main__'
:
enable_static
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/mkldnn/test_reshape_bf16_op.py
浏览文件 @
39a5424e
...
...
@@ -50,8 +50,9 @@ class TestReshapeBf16Op(OpTest):
self
.
infered_shape
=
(
10
,
2
,
3
,
-
1
)
def init_input_data(self):
    """Create the random input in fp32 and its uint16 (bf16) counterpart.

    ``self.input_data_fp32`` holds the fp32 values and ``self.input_data``
    is produced from it with ``convert_float_to_uint16``, so both describe
    the same data. The earlier assignment of ``self.input_data`` from a
    separate random draw was a dead store (immediately overwritten) and has
    been removed.
    """
    self.input_data_fp32 = np.random.random(self.ori_shape).astype(
        np.float32)
    self.input_data = convert_float_to_uint16(self.input_data_fp32)
def test_check_output(self):
    """Run the forward-output check on a CPU place, skipping 'XShape'."""
    cpu_place = core.CPUPlace()
    skipped_outputs = ['XShape']
    self.check_output_with_place(cpu_place, no_check_set=skipped_outputs)
...
...
@@ -61,7 +62,7 @@ class TestReshapeBf16Op(OpTest):
core
.
CPUPlace
(),
[
"X"
],
"Out"
,
check_dygraph
=
False
,
user_defined_grads
=
[
self
.
input
s
[
"X"
]
],
user_defined_grads
=
[
self
.
input
_data_fp32
],
user_defined_grad_outputs
=
[
self
.
inputs
[
"X"
].
reshape
(
self
.
infered_shape
)
])
...
...
python/paddle/fluid/tests/unittests/op_test.py
浏览文件 @
39a5424e
...
...
@@ -1452,6 +1452,16 @@ class OpTest(unittest.TestCase):
analytic_grads
=
self
.
_get_gradient
(
inputs_to_check
,
place
,
output_names
,
no_grad_set
,
user_defined_grad_outputs
)
# comparison of bf16 results will happen as fp32
# loop over list of grads and convert bf16 to fp32
fp32_grads
=
[]
for
grad
in
analytic_grads
:
if
grad
.
dtype
==
np
.
uint16
:
grad
=
convert_uint16_to_float
(
grad
)
max_relative_error
=
0.03
fp32_grads
.
append
(
grad
)
analytic_grads
=
fp32_grads
self
.
_assert_is_close
(
numeric_grads
,
analytic_grads
,
inputs_to_check
,
max_relative_error
,
"Gradient Check On %s"
%
str
(
place
))
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录