Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
f3436af1
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
f3436af1
编写于
5月 06, 2021
作者:
A
Adam Osewski
提交者:
GitHub
5月 06, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[cherry-pick] Sum kernel for CPU supporting BF16 and SelectedRows (#32631) (#32755)
上级
21448525
变更
5
显示空白变更内容
内联
并排
Showing
5 changed file
with
115 addition
and
26 deletion
+115
-26
paddle/fluid/operators/math/blas_impl.h
paddle/fluid/operators/math/blas_impl.h
+19
-0
paddle/fluid/operators/math/selected_rows_functor.cc
paddle/fluid/operators/math/selected_rows_functor.cc
+20
-20
paddle/fluid/operators/sum_op.cc
paddle/fluid/operators/sum_op.cc
+2
-0
python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
+3
-6
python/paddle/fluid/tests/unittests/test_sum_op.py
python/paddle/fluid/tests/unittests/test_sum_op.py
+71
-0
未找到文件。
paddle/fluid/operators/math/blas_impl.h
浏览文件 @
f3436af1
...
...
@@ -15,6 +15,7 @@
#ifdef PADDLE_WITH_MKLML
#include <mkl.h>
#endif
#include <algorithm>
#include <cmath>
#include <limits>
...
...
@@ -28,6 +29,19 @@
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
detail
{
/// Generic strided AXPY fallback: for each of the n elements, performs
/// `*y += alpha * *x` and then steps y by incy and x by incx.
///
/// @param n     Number of elements to update; the loop does not run if n <= 0.
/// @param alpha Scalar multiplied with every element read through x.
/// @param x     Pointer to the first input element.
/// @param incx  Step, in elements, applied to x after each update.
/// @param y     Pointer to the first element that is updated in place.
/// @param incy  Step, in elements, applied to y after each update.
template <typename T>
static void axpy(int n, const T alpha, const T* x, const int incx, T* y,
                 const int incy) {
  // Y = Y + alpha * X
  for (int remaining = n; remaining > 0; --remaining) {
    *y += alpha * *x;
    // Advance both cursors by their own stride, y first as before.
    y += incy;
    x += incx;
  }
}
}
// namespace detail
template
<
typename
T
>
struct
CBlas
;
...
...
@@ -43,6 +57,11 @@ struct CBlas<int8_t> {
template
<
>
struct
CBlas
<
platform
::
bfloat16
>
{
template
<
typename
...
ARGS
>
static
void
AXPY
(
ARGS
...
args
)
{
detail
::
axpy
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
VCOPY
(
ARGS
...
args
)
{
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
...
...
paddle/fluid/operators/math/selected_rows_functor.cc
浏览文件 @
f3436af1
...
...
@@ -285,6 +285,8 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, float>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CPUDeviceContext
,
double
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CPUDeviceContext
,
int
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CPUDeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CPUDeviceContext
,
platform
::
bfloat16
>;
// This is a separate namespace for manipulating SelectedRows-typed
// data, e.g. merging duplicated rows, adding two SelectedRows, etc.
...
...
@@ -294,21 +296,17 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
// add or mul.
namespace
scatter
{
template
<
typename
DeviceContext
,
typename
T
>
typename
std
::
enable_if
<
std
::
is_floating_point
<
T
>::
value
&&
std
::
is_same
<
DeviceContext
,
platform
::
CPUDeviceContext
>::
value
>::
type
elementwise_add_to
(
const
DeviceContext
&
ctx
,
BlasT
<
DeviceContext
,
T
>*
blas
,
size_t
data_len
,
const
T
*
in
,
T
*
out
)
{
blas
->
AXPY
(
data_len
,
1.
,
in
,
out
);
// Floating-point overload (enabled only when std::is_floating_point<T> holds):
// forwards to the BLAS wrapper's AXPY with a scale factor of one over
// data_len elements of `in` and `out`.
// NOTE(review): presumably this yields out[i] += in[i]; confirm against
// BlasT::AXPY, which is not visible here.
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value>::type
elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
                   const T* in, T* out) {
  const T unit_scale = static_cast<T>(1.f);
  blas->AXPY(data_len, unit_scale, in, out);
}
template
<
typename
DeviceContext
,
typename
T
>
typename
std
::
enable_if
<
!
std
::
is_floating_point
<
T
>::
value
&&
std
::
is_same
<
DeviceContext
,
platform
::
CPUDeviceContext
>::
value
>::
type
elementwise_add_to
(
const
DeviceContext
&
ctx
,
BlasT
<
DeviceContext
,
T
>*
blas
,
size_t
data_len
,
const
T
*
in
,
T
*
out
)
{
template
<
typename
T
>
typename
std
::
enable_if
<
std
::
is_integral
<
T
>::
value
>::
type
elementwise_add_to
(
BlasT
<
platform
::
CPUDeviceContext
,
T
>*
blas
,
size_t
data_len
,
const
T
*
in
,
T
*
out
)
{
for
(
size_t
i
=
0
;
i
<
data_len
;
i
++
)
{
out
[
i
]
+=
in
[
i
];
}
...
...
@@ -412,7 +410,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
out
.
set_rows
(
merge_rows
);
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
,
out
.
mutable_value
(),
0.0
);
constant_functor
(
context
,
out
.
mutable_value
(),
static_cast
<
T
>
(
0.
f
)
);
std
::
unordered_map
<
int64_t
,
size_t
>
rows_to_id
;
for
(
size_t
i
=
0
;
i
<
merge_rows
.
size
();
++
i
)
{
...
...
@@ -429,9 +427,9 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
for
(
size_t
i
=
0
;
i
<
input_rows
.
size
();
i
++
)
{
size_t
out_i
=
rows_to_id
[
input_rows
[
i
]];
elementwise_add_to
<
platform
::
CPUDeviceContext
,
T
>
(
context
,
&
blas
,
static_cast
<
size_t
>
(
input_width
)
,
&
input_data
[
i
*
input_width
],
&
out_data
[
out_i
*
input_width
]);
elementwise_add_to
<
T
>
(
&
blas
,
static_cast
<
size_t
>
(
input_width
),
&
input_data
[
i
*
input_width
]
,
&
out_data
[
out_i
*
input_width
]);
}
}
}
...
...
@@ -524,9 +522,9 @@ struct MergeAverage<platform::CPUDeviceContext, T> {
for
(
size_t
i
=
0
;
i
<
input_rows
.
size
();
i
++
)
{
size_t
out_i
=
rows_to_id
[
input_rows
[
i
]];
elementwise_add_to
<
platform
::
CPUDeviceContext
,
T
>
(
context
,
&
blas
,
static_cast
<
size_t
>
(
input_width
)
,
&
input_data
[
i
*
input_width
],
&
out_data
[
out_i
*
input_width
]);
elementwise_add_to
<
T
>
(
&
blas
,
static_cast
<
size_t
>
(
input_width
),
&
input_data
[
i
*
input_width
]
,
&
out_data
[
out_i
*
input_width
]);
}
}
size_t
input_width_cast
=
static_cast
<
size_t
>
(
input_width
);
...
...
@@ -547,6 +545,8 @@ template struct MergeAdd<platform::CPUDeviceContext,
paddle
::
platform
::
complex64
>;
template
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
paddle
::
platform
::
complex128
>;
template
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
paddle
::
platform
::
bfloat16
>;
template
struct
MergeAverage
<
platform
::
CPUDeviceContext
,
int
>;
template
struct
MergeAverage
<
platform
::
CPUDeviceContext
,
int64_t
>;
...
...
paddle/fluid/operators/sum_op.cc
浏览文件 @
f3436af1
...
...
@@ -326,4 +326,6 @@ REGISTER_OP_CPU_KERNEL(
sum
,
ops
::
SumKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CPUDeviceContext
,
paddle
::
platform
::
bfloat16
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int64_t
>
);
python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
浏览文件 @
f3436af1
...
...
@@ -76,8 +76,7 @@ class TestSparseSGDOpBF16(unittest.TestCase):
grad_selected_rows
=
scope
.
var
(
'Grad'
).
get_selected_rows
()
grad_selected_rows
.
set_height
(
height
)
grad_selected_rows
.
set_rows
(
rows
)
# grad_array = np.random.random((len(rows), row_numel)).astype('float32')
grad_array
=
np
.
full
((
len
(
rows
),
row_numel
),
2
,
np
.
float32
)
grad_array
=
np
.
random
.
random
((
len
(
rows
),
row_numel
)).
astype
(
'float32'
)
np_array_bf16
=
convert_float_to_uint16
(
grad_array
)
grad_tensor
=
grad_selected_rows
.
get_tensor
()
...
...
@@ -87,8 +86,7 @@ class TestSparseSGDOpBF16(unittest.TestCase):
def
create_dense_param_var
(
self
,
scope
,
place
,
height
,
width
):
param_tensor
=
scope
.
var
(
'Param'
).
get_tensor
()
# param_array = np.random.random((height, width)).astype('float32')
param_array
=
np
.
full
((
height
,
width
),
5
,
np
.
float32
)
param_array
=
np
.
random
.
random
((
height
,
width
)).
astype
(
'float32'
)
param_array_bf16
=
convert_float_to_uint16
(
param_array
)
param_tensor
.
set
(
param_array_bf16
,
place
)
...
...
@@ -109,8 +107,7 @@ class TestSparseSGDOpBF16(unittest.TestCase):
def
create_dense_lr_var
(
self
,
scope
,
place
):
lr_tensor
=
scope
.
var
(
'LearningRate'
).
get_tensor
()
# lr_value = np.random.uniform()
lr_value
=
2
lr_value
=
np
.
random
.
uniform
()
lr_array
=
np
.
full
((
1
),
lr_value
,
np
.
float32
)
lr_array_bf16
=
convert_float_to_uint16
(
lr_array
)
lr_tensor
.
set
(
lr_array_bf16
,
place
)
...
...
python/paddle/fluid/tests/unittests/test_sum_op.py
浏览文件 @
f3436af1
...
...
@@ -18,9 +18,12 @@ import unittest
import
numpy
as
np
from
op_test
import
OpTest
import
paddle
from
paddle
import
enable_static
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.op
import
Operator
from
paddle.fluid.tests.unittests.op_test
import
(
OpTest
,
convert_float_to_uint16
,
convert_uint16_to_float
)
class
TestSumOp
(
OpTest
):
...
...
@@ -141,6 +144,73 @@ class TestSelectedRowsSumOp(unittest.TestCase):
self
.
check_with_place
(
place
,
inplace
)
class TestSelectedRowsSumOpInt(TestSelectedRowsSumOp):
    """Re-runs the inherited SelectedRows sum checks with int32 data."""

    def init_kernel_type(self):
        """Select int32 as the element type used by the inherited tests."""
        self.dtype = np.int32
@unittest.skipIf(not core.supports_bfloat16(),
                 'place does not support BF16 evaluation')
class TestSelectedRowsSumBF16Op(TestSelectedRowsSumOp):
    """SelectedRows `sum` test whose inputs are float32 data converted with
    convert_float_to_uint16; skipped when core.supports_bfloat16() is false."""

    def setUp(self):
        """Fix the geometry, dtype and a seeded float32 payload for the test."""
        self.height = 10
        self.row_numel = 12
        self.rows = [0, 1, 2, 3, 4, 5, 6]
        self.dtype = np.uint16
        # NOTE(review): init_kernel_type() is inherited and not visible here;
        # confirm it does not overwrite the uint16 dtype set just above.
        self.init_kernel_type()
        np.random.seed(12345)
        data_shape = (len(self.rows), self.row_numel)
        self.data = np.random.random(data_shape).astype(np.float32)

    def _get_array(self, rows, row_numel):
        """Return the converted payload, or an empty (0, row_numel) array when
        `rows` is empty."""
        if len(rows) <= 0:
            return np.ndarray((0, row_numel), dtype=self.dtype)
        return convert_float_to_uint16(self.data)

    def check_input_and_optput(self,
                               scope,
                               place,
                               inplace,
                               w1_has_data=False,
                               w2_has_data=False,
                               w3_has_data=False):
        """Run `sum` over W1, W2 and W3 and compare the result with the payload
        scaled by the number of inputs that carry data."""
        data_flags = (w1_has_data, w2_has_data, w3_has_data)
        for var_name, flag in zip(("W1", "W2", "W3"), data_flags):
            self.create_selected_rows(scope, place, var_name, flag)

        # create Out Variable
        out_var_name = "W1" if inplace else "Out"
        out = scope.var(out_var_name).get_selected_rows()

        # create and run sum operator
        sum_op = Operator("sum", X=["W1", "W2", "W3"], Out=out_var_name)
        sum_op.run(scope, place)

        has_data_w_num = sum(1 for flag in data_flags if flag)

        if has_data_w_num <= 0:
            # No input carried data, so the output must have no rows.
            self.assertEqual(len(out.rows()), 0)
            return

        self.assertEqual(len(out.rows()), 7)
        out_fp32 = convert_uint16_to_float(np.array(out.get_tensor()))
        payload_fp32 = convert_uint16_to_float(
            self._get_array(self.rows, self.row_numel))
        ref_fp32 = payload_fp32 * has_data_w_num
        np.testing.assert_allclose(out_fp32, ref_fp32, atol=0, rtol=0.95e-2)

    def test_w_is_selected_rows(self):
        """Exercise the in-place and out-of-place variants on CPUPlace."""
        for inplace in (True, False):
            self.check_with_place(core.CPUPlace(), inplace)
class
TestLoDTensorAndSelectedRowsOp
(
TestSelectedRowsSumOp
):
def
setUp
(
self
):
self
.
height
=
10
...
...
@@ -324,4 +394,5 @@ create_test_sum_fp16_class(TestSelectedRowsSumOp)
create_test_sum_fp16_class
(
TestLoDTensorAndSelectedRowsOp
)
if
__name__
==
"__main__"
:
enable_static
()
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录