Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
cbabbe2e
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
cbabbe2e
编写于
9月 02, 2022
作者:
A
Aurelius84
提交者:
GitHub
9月 02, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[XPU]Migrate Adam XPU kernel into Phi (#45572)
* [XPU]Migrate Adam XPU kernel into Phi * test=kunlun
上级
e3e92c9a
变更
6
展开全部
隐藏空白更改
内联
并排
Showing
6 changed file
with
703 addition
and
664 deletion
+703
-664
paddle/fluid/operators/math/selected_rows_functor.cc
paddle/fluid/operators/math/selected_rows_functor.cc
+5
-5
paddle/fluid/operators/optimizers/adam_op_xpu.cc
paddle/fluid/operators/optimizers/adam_op_xpu.cc
+0
-643
paddle/phi/kernels/CMakeLists.txt
paddle/phi/kernels/CMakeLists.txt
+4
-16
paddle/phi/kernels/funcs/adam_functors.h
paddle/phi/kernels/funcs/adam_functors.h
+134
-0
paddle/phi/kernels/selected_rows/xpu/adam_kernel.cc
paddle/phi/kernels/selected_rows/xpu/adam_kernel.cc
+308
-0
paddle/phi/kernels/xpu/adam_kernel.cc
paddle/phi/kernels/xpu/adam_kernel.cc
+252
-0
未找到文件。
paddle/fluid/operators/math/selected_rows_functor.cc
浏览文件 @
cbabbe2e
...
@@ -569,8 +569,8 @@ TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(platform::complex<double>)
...
@@ -569,8 +569,8 @@ TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(platform::complex<double>)
#ifdef PADDLE_WITH_XPU
#ifdef PADDLE_WITH_XPU
template
<
typename
T
>
template
<
typename
T
>
struct
MergeAdd
<
p
latform
::
XPUDevice
Context
,
T
>
{
struct
MergeAdd
<
p
hi
::
XPU
Context
,
T
>
{
phi
::
SelectedRows
operator
()(
const
p
latform
::
XPUDevice
Context
&
context
,
phi
::
SelectedRows
operator
()(
const
p
hi
::
XPU
Context
&
context
,
const
phi
::
SelectedRows
&
input
,
const
phi
::
SelectedRows
&
input
,
const
bool
sorted_result
=
false
)
{
const
bool
sorted_result
=
false
)
{
phi
::
SelectedRows
out
;
phi
::
SelectedRows
out
;
...
@@ -578,7 +578,7 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
...
@@ -578,7 +578,7 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
return
out
;
return
out
;
}
}
void
operator
()(
const
p
latform
::
XPUDevice
Context
&
context
,
void
operator
()(
const
p
hi
::
XPU
Context
&
context
,
const
phi
::
SelectedRows
&
input
,
const
phi
::
SelectedRows
&
input
,
phi
::
SelectedRows
*
output
,
phi
::
SelectedRows
*
output
,
const
bool
sorted_result
=
false
)
{
const
bool
sorted_result
=
false
)
{
...
@@ -633,7 +633,7 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
...
@@ -633,7 +633,7 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"merge_dup_rows"
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"merge_dup_rows"
);
}
}
void
operator
()(
const
p
latform
::
XPUDevice
Context
&
context
,
void
operator
()(
const
p
hi
::
XPU
Context
&
context
,
const
std
::
vector
<
const
phi
::
SelectedRows
*>&
inputs
,
const
std
::
vector
<
const
phi
::
SelectedRows
*>&
inputs
,
phi
::
SelectedRows
*
output
,
phi
::
SelectedRows
*
output
,
const
bool
sorted_result
=
false
)
{
const
bool
sorted_result
=
false
)
{
...
@@ -838,7 +838,7 @@ struct MergeAverage<phi::CPUContext, T> {
...
@@ -838,7 +838,7 @@ struct MergeAverage<phi::CPUContext, T> {
};
};
#ifdef PADDLE_WITH_XPU
#ifdef PADDLE_WITH_XPU
template
struct
MergeAdd
<
p
latform
::
XPUDevice
Context
,
float
>;
template
struct
MergeAdd
<
p
hi
::
XPU
Context
,
float
>;
#endif
#endif
template
struct
MergeAverage
<
phi
::
CPUContext
,
int
>;
template
struct
MergeAverage
<
phi
::
CPUContext
,
int
>;
...
...
paddle/fluid/operators/optimizers/adam_op_xpu.cc
已删除
100644 → 0
浏览文件 @
e3e92c9a
此差异已折叠。
点击以展开。
paddle/phi/kernels/CMakeLists.txt
浏览文件 @
cbabbe2e
...
@@ -22,6 +22,7 @@ set_property(GLOBAL PROPERTY PHI_KERNELS "")
...
@@ -22,6 +22,7 @@ set_property(GLOBAL PROPERTY PHI_KERNELS "")
# [ 1. Common kernel compilation dependencies ]
# [ 1. Common kernel compilation dependencies ]
set
(
COMMON_KERNEL_DEPS
set
(
COMMON_KERNEL_DEPS
dense_tensor
dense_tensor
string_tensor
sparse_coo_tensor
sparse_coo_tensor
sparse_csr_tensor
sparse_csr_tensor
kernel_context
kernel_context
...
@@ -30,6 +31,7 @@ set(COMMON_KERNEL_DEPS
...
@@ -30,6 +31,7 @@ set(COMMON_KERNEL_DEPS
convert_utils
convert_utils
lod_utils
lod_utils
custom_kernel
custom_kernel
string_infermeta
phi_tensor_utils
)
phi_tensor_utils
)
set
(
COMMON_KERNEL_DEPS
set
(
COMMON_KERNEL_DEPS
${
COMMON_KERNEL_DEPS
}
${
COMMON_KERNEL_DEPS
}
...
@@ -67,21 +69,7 @@ set(COMMON_KERNEL_DEPS
...
@@ -67,21 +69,7 @@ set(COMMON_KERNEL_DEPS
sequence_padding
sequence_padding
sequence_scale
sequence_scale
fft
fft
phi_data_layout_transform
)
phi_data_layout_transform
set
(
COMMON_KERNEL_DEPS
${
COMMON_KERNEL_DEPS
}
dense_tensor
string_tensor
sparse_coo_tensor
sparse_csr_tensor
kernel_context
kernel_factory
arg_map_context
convert_utils
lod_utils
custom_kernel
string_infermeta
gpc
gpc
utf8proc
utf8proc
device_memory_aligment
)
device_memory_aligment
)
...
@@ -136,7 +124,7 @@ else()
...
@@ -136,7 +124,7 @@ else()
"strings/cpu/*.cc"
)
"strings/cpu/*.cc"
)
endif
()
endif
()
file
(
GLOB kernel_xpu
"xpu/*.cc"
)
file
(
GLOB kernel_xpu
"xpu/*.cc"
"selected_rows/xpu/*.cc"
)
add_library
(
phi_cpu
${
kernel_cc
}
)
add_library
(
phi_cpu
${
kernel_cc
}
)
kernel_declare
(
"
${
kernel_cc
}
"
)
kernel_declare
(
"
${
kernel_cc
}
"
)
...
...
paddle/phi/kernels/funcs/adam_functors.h
浏览文件 @
cbabbe2e
...
@@ -19,8 +19,142 @@
...
@@ -19,8 +19,142 @@
#include "paddle/phi/kernels/funcs/algorithm.h"
#include "paddle/phi/kernels/funcs/algorithm.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_header.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/memcpy.h"
#endif
namespace
phi
{
namespace
phi
{
namespace
funcs
{
namespace
funcs
{
using
float16
=
dtype
::
float16
;
#ifdef PADDLE_WITH_XPU
template
<
typename
Context
,
typename
T1
,
typename
T2
>
static
int
ConvertDataByType
(
const
T1
*
x
,
T2
**
y
,
int
len
,
bool
allocateFlag
,
const
Context
&
dev_ctx
)
{
if
(
nullptr
==
x
||
nullptr
==
y
||
len
<=
0
)
return
xpu
::
Error_t
::
INVALID_PARAM
;
int
r
=
0
;
if
(
allocateFlag
)
{
r
=
xpu_malloc
(
reinterpret_cast
<
void
**>
(
y
),
sizeof
(
T2
)
*
len
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"adam"
);
}
T1
*
cpu_data
=
reinterpret_cast
<
T1
*>
(
malloc
(
sizeof
(
T1
)
*
len
));
paddle
::
memory
::
Copy
(
CPUPlace
(),
cpu_data
,
dev_ctx
.
GetPlace
(),
x
,
len
*
sizeof
(
T1
));
T2
*
cpu_real_data
=
reinterpret_cast
<
T2
*>
(
malloc
(
sizeof
(
T2
)
*
len
));
for
(
int
i
=
0
;
i
<
len
;
i
++
)
cpu_real_data
[
i
]
=
static_cast
<
T2
>
(
cpu_data
[
i
]);
paddle
::
memory
::
Copy
(
dev_ctx
.
GetPlace
(),
*
y
,
CPUPlace
(),
cpu_real_data
,
len
*
sizeof
(
T2
));
free
(
cpu_data
);
free
(
cpu_real_data
);
return
xpu
::
Error_t
::
SUCCESS
;
}
template
<
typename
Context
,
typename
T
>
static
void
GetDataPointer
(
const
phi
::
DenseTensor
&
tensorData
,
T
**
result
,
const
Context
&
dev_ctx
)
{
if
(
tensorData
.
dtype
()
==
DataType
::
FLOAT16
)
{
const
float16
*
real_data
=
tensorData
.
template
data
<
float16
>();
int
len
=
tensorData
.
numel
();
int
r
=
ConvertDataByType
<
Context
,
float16
,
T
>
(
real_data
,
result
,
len
,
true
,
dev_ctx
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"adam"
);
}
}
template
<
typename
Context
,
typename
T
>
static
void
GetOutDataPointer
(
DenseTensor
*
tensorData
,
DenseTensor
*
out
,
T
**
result
,
const
Context
&
dev_ctx
)
{
if
(
tensorData
->
dtype
()
==
DataType
::
FLOAT16
)
{
*
result
=
dev_ctx
.
template
Alloc
<
T
>(
out
);
}
else
{
*
result
=
dev_ctx
.
template
Alloc
<
T
>(
tensorData
);
}
}
template
<
typename
Context
,
typename
T
>
static
void
CopyOutData
(
const
DenseTensor
&
srcTensor
,
phi
::
DenseTensor
*
dstTensor
,
const
Context
&
dev_ctx
)
{
if
(
dstTensor
->
dtype
()
==
DataType
::
FLOAT16
)
{
const
T
*
xpu_out_data
=
srcTensor
.
template
data
<
T
>();
float16
*
out_data
=
dev_ctx
.
template
Alloc
<
float16
>(
dstTensor
);
int
len
=
srcTensor
.
numel
();
int
r
=
ConvertDataByType
<
Context
,
T
,
float16
>
(
xpu_out_data
,
&
out_data
,
len
,
false
,
dev_ctx
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"adam"
);
}
}
template
<
typename
T
>
static
void
FreeData
(
const
phi
::
DenseTensor
&
tensorData
,
T
*
dataPtr
)
{
if
(
tensorData
.
dtype
()
==
DataType
::
FLOAT16
)
xpu_free
(
dataPtr
);
}
template
<
typename
Context
,
typename
T
>
static
void
SetBetaData
(
const
phi
::
DenseTensor
&
beta_pow
,
phi
::
DenseTensor
*
beta_pow_out
,
const
T
&
beta
,
const
Context
&
dev_ctx
)
{
if
(
beta_pow
.
dtype
()
==
DataType
::
FLOAT16
)
{
const
float16
*
beta_pow_p
=
beta_pow
.
template
data
<
float16
>();
dev_ctx
.
template
HostAlloc
<
float16
>(
beta_pow_out
)[
0
]
=
static_cast
<
float16
>
(
beta
)
*
beta_pow_p
[
0
];
}
else
{
const
T
*
beta_pow_p
=
beta_pow
.
template
data
<
T
>();
dev_ctx
.
template
HostAlloc
<
T
>(
beta_pow_out
)[
0
]
=
beta
*
beta_pow_p
[
0
];
}
}
template
<
typename
Context
,
typename
T
>
static
void
Scale
(
phi
::
DenseTensor
*
beta_pow_out
,
const
phi
::
DenseTensor
&
beta_pow
,
T
*
beta_pow_ptr
,
const
T
&
beta
,
const
Context
&
dev_ctx
)
{
float16
*
beta_pow_out_p2
=
dev_ctx
.
template
Alloc
<
float16
>(
beta_pow_out
);
DenseTensor
xpu_beta_pow_out
;
const
phi
::
DenseTensorMeta
meta_beta_pow_out
(
DataType
::
FLOAT32
,
beta_pow_out
->
dims
());
xpu_beta_pow_out
.
set_meta
(
meta_beta_pow_out
);
T
*
beta_pow_out_ptr
=
dev_ctx
.
template
Alloc
<
T
>(
&
xpu_beta_pow_out
);
int
r
=
xpu
::
scale
(
dev_ctx
.
x_context
(),
beta_pow_ptr
,
beta_pow_out_ptr
,
beta_pow
.
numel
(),
false
,
beta
,
0.0
f
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"adam"
);
const
float
*
xpu_beta_pow_out_data
=
dev_ctx
.
template
Alloc
<
T
>(
&
xpu_beta_pow_out
);
int
len
=
xpu_beta_pow_out
.
numel
();
r
=
ConvertDataByType
<
Context
,
T
,
float16
>
(
xpu_beta_pow_out_data
,
&
beta_pow_out_p2
,
len
,
false
,
dev_ctx
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"adam"
);
}
#endif
struct
GPUAdam
;
struct
GPUAdam
;
struct
CPUAdam
;
struct
CPUAdam
;
...
...
paddle/phi/kernels/selected_rows/xpu/adam_kernel.cc
0 → 100644
浏览文件 @
cbabbe2e
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/selected_rows/adam_kernel.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace
phi
{
namespace
sr
{
using
float16
=
dtype
::
float16
;
template
<
typename
T
,
typename
Context
>
void
AdamDenseParamSparseGradKernel
(
const
Context
&
dev_ctx
,
const
DenseTensor
&
param
,
const
SelectedRows
&
grad
,
const
DenseTensor
&
learning_rate
,
const
DenseTensor
&
moment1
,
const
DenseTensor
&
moment2
,
const
DenseTensor
&
beta1_pow
,
const
DenseTensor
&
beta2_pow
,
const
paddle
::
optional
<
DenseTensor
>&
master_param
,
const
paddle
::
optional
<
DenseTensor
>&
skip_update
,
const
Scalar
&
beta1
,
const
Scalar
&
beta2
,
const
Scalar
&
epsilon
,
bool
lazy_mode
,
int64_t
min_row_size_to_use_multithread
,
bool
multi_precision
,
bool
use_global_beta_pow
,
DenseTensor
*
param_out
,
DenseTensor
*
moment1_out
,
DenseTensor
*
moment2_out
,
DenseTensor
*
beta1_pow_out
,
DenseTensor
*
beta2_pow_out
,
DenseTensor
*
master_param_outs
)
{
float
*
param_ptr
=
nullptr
;
funcs
::
GetDataPointer
<
Context
,
float
>
(
param
,
&
param_ptr
,
dev_ctx
);
float
*
mom1_ptr
=
nullptr
;
funcs
::
GetDataPointer
<
Context
,
float
>
(
moment1
,
&
mom1_ptr
,
dev_ctx
);
float
*
mom2_ptr
=
nullptr
;
funcs
::
GetDataPointer
<
Context
,
float
>
(
moment2
,
&
mom2_ptr
,
dev_ctx
);
float
*
lr_ptr
=
nullptr
;
funcs
::
GetDataPointer
<
Context
,
float
>
(
learning_rate
,
&
lr_ptr
,
dev_ctx
);
float
*
beta1_pow_ptr
=
nullptr
;
const
float
*
beta1_const_pow_ptr
=
nullptr
;
if
(
beta1_pow
.
place
()
==
CPUPlace
())
{
DenseTensor
xpu_beta1_pow
;
phi
::
Copy
(
dev_ctx
,
beta1_pow
,
beta1_pow
.
place
(),
false
,
&
xpu_beta1_pow
);
if
(
xpu_beta1_pow
.
dtype
()
==
DataType
::
FLOAT16
)
funcs
::
GetDataPointer
<
Context
,
float
>
(
xpu_beta1_pow
,
&
beta1_pow_ptr
,
dev_ctx
);
else
beta1_const_pow_ptr
=
xpu_beta1_pow
.
template
data
<
float
>();
}
else
{
if
(
beta1_pow
.
dtype
()
==
DataType
::
FLOAT16
)
funcs
::
GetDataPointer
<
Context
,
float
>
(
beta1_pow
,
&
beta1_pow_ptr
,
dev_ctx
);
else
beta1_const_pow_ptr
=
beta1_pow
.
template
data
<
float
>();
}
float
*
beta2_pow_ptr
=
nullptr
;
const
float
*
beta2_const_pow_ptr
=
nullptr
;
if
(
beta2_pow
.
place
()
==
CPUPlace
())
{
DenseTensor
xpu_beta2_pow
;
phi
::
Copy
(
dev_ctx
,
beta2_pow
,
beta2_pow
.
place
(),
false
,
&
xpu_beta2_pow
);
if
(
xpu_beta2_pow
.
dtype
()
==
DataType
::
FLOAT16
)
funcs
::
GetDataPointer
<
Context
,
float
>
(
xpu_beta2_pow
,
&
beta2_pow_ptr
,
dev_ctx
);
else
beta2_const_pow_ptr
=
xpu_beta2_pow
.
template
data
<
float
>();
}
else
{
if
(
beta2_pow
.
dtype
()
==
DataType
::
FLOAT16
)
funcs
::
GetDataPointer
<
Context
,
float
>
(
beta2_pow
,
&
beta2_pow_ptr
,
dev_ctx
);
else
beta2_const_pow_ptr
=
beta2_pow
.
template
data
<
float
>();
}
DenseTensor
xpu_param_out
;
float
*
param_out_ptr
=
nullptr
;
const
phi
::
DenseTensorMeta
meta_param
(
DataType
::
FLOAT32
,
param_out
->
dims
());
xpu_param_out
.
set_meta
(
meta_param
);
funcs
::
GetOutDataPointer
<
Context
,
float
>
(
param_out
,
&
xpu_param_out
,
&
param_out_ptr
,
dev_ctx
);
DenseTensor
xpu_mom1_out
;
float
*
mom1_out_ptr
=
nullptr
;
const
phi
::
DenseTensorMeta
meta_mom1
(
DataType
::
FLOAT32
,
moment1_out
->
dims
());
xpu_mom1_out
.
set_meta
(
meta_mom1
);
funcs
::
GetOutDataPointer
<
Context
,
float
>
(
moment1_out
,
&
xpu_mom1_out
,
&
mom1_out_ptr
,
dev_ctx
);
DenseTensor
xpu_mom2_out
;
float
*
mom2_out_ptr
=
nullptr
;
const
phi
::
DenseTensorMeta
meta_mom2
(
DataType
::
FLOAT32
,
moment2_out
->
dims
());
xpu_mom2_out
.
set_meta
(
meta_mom2
);
funcs
::
GetOutDataPointer
<
Context
,
float
>
(
moment2_out
,
&
xpu_mom2_out
,
&
mom2_out_ptr
,
dev_ctx
);
bool
skip_update_
=
false
;
if
(
skip_update
.
is_initialized
())
{
PADDLE_ENFORCE_EQ
(
skip_update
->
numel
(),
1
,
errors
::
InvalidArgument
(
"Input(SkipUpdate) size must be 1, but get %d"
,
skip_update
->
numel
()));
std
::
vector
<
bool
>
skip_update_vec
;
paddle
::
framework
::
TensorToVector
(
*
skip_update
,
dev_ctx
,
&
skip_update_vec
);
skip_update_
=
skip_update_vec
[
0
];
}
if
(
skip_update_
)
{
VLOG
(
4
)
<<
"Adam skip update"
;
phi
::
Copy
(
dev_ctx
,
param
,
dev_ctx
.
GetPlace
(),
false
,
param_out
);
phi
::
Copy
(
dev_ctx
,
moment1
,
dev_ctx
.
GetPlace
(),
false
,
moment1_out
);
phi
::
Copy
(
dev_ctx
,
moment2
,
dev_ctx
.
GetPlace
(),
false
,
moment2_out
);
phi
::
Copy
(
dev_ctx
,
beta1_pow
,
beta1_pow
.
place
(),
false
,
beta1_pow_out
);
phi
::
Copy
(
dev_ctx
,
beta2_pow
,
beta2_pow
.
place
(),
false
,
beta2_pow_out
);
return
;
}
PADDLE_ENFORCE_EQ
(
beta1_pow_out
->
numel
(),
1
,
errors
::
InvalidArgument
(
"Tensor holds the wrong size, Expected beta1 pow "
"output size is 1, but received "
"value is:%d."
,
beta1_pow_out
->
numel
()));
PADDLE_ENFORCE_EQ
(
beta2_pow_out
->
numel
(),
1
,
errors
::
InvalidArgument
(
"Tensor holds the wrong size, Expected beta2 pow "
"output size is 1, but received "
"value is:%d."
,
beta2_pow_out
->
numel
()));
VLOG
(
4
)
<<
"use_global_beta_pow:"
<<
use_global_beta_pow
;
auto
beta1_
=
beta1
.
to
<
float
>
();
auto
beta2_
=
beta2
.
to
<
float
>
();
auto
epsilon_
=
epsilon
.
to
<
float
>
();
float
*
grad_c
=
nullptr
;
if
(
grad
.
rows
().
size
()
==
0
)
{
VLOG
(
3
)
<<
"grad row size is 0!!"
;
return
;
}
std
::
vector
<
int64_t
>
cpu_rows
(
grad
.
rows
().
begin
(),
grad
.
rows
().
end
());
bool
is_strict_sorted
=
true
;
for
(
size_t
i
=
1
;
i
<
cpu_rows
.
size
();
++
i
)
{
if
(
cpu_rows
[
i
-
1
]
>=
cpu_rows
[
i
])
{
is_strict_sorted
=
false
;
break
;
}
}
SelectedRows
tmp_grad_merge
;
const
SelectedRows
*
grad_merge_ptr
;
if
(
is_strict_sorted
)
{
grad_merge_ptr
=
&
grad
;
}
else
{
paddle
::
operators
::
math
::
scatter
::
MergeAdd
<
Context
,
float
>
merge_func
;
merge_func
(
dev_ctx
,
grad
,
&
tmp_grad_merge
,
true
);
xpu_wait
(
dev_ctx
.
x_context
()
->
xpu_stream
);
grad_merge_ptr
=
&
tmp_grad_merge
;
}
auto
&
grad_merge
=
*
grad_merge_ptr
;
auto
&
grad_tensor
=
grad_merge
.
value
();
funcs
::
GetDataPointer
<
Context
,
float
>
(
grad_tensor
,
&
grad_c
,
dev_ctx
);
int
row_count
=
grad_merge
.
rows
().
size
();
std
::
vector
<
int
>
rows
(
row_count
);
xpu
::
ctx_guard
RAII_GUARD
(
dev_ctx
.
x_context
());
int
*
xpu_rows
=
RAII_GUARD
.
alloc_l3_or_gm
<
int
>
(
row_count
);
std
::
vector
<
int64_t
>
merge_rows
(
grad_merge
.
rows
().
begin
(),
grad_merge
.
rows
().
end
());
for
(
size_t
i
=
0
;
i
<
grad_merge
.
rows
().
size
();
++
i
)
{
rows
[
i
]
=
static_cast
<
int
>
(
merge_rows
[
i
]);
}
xpu_wait
(
dev_ctx
.
x_context
()
->
xpu_stream
);
paddle
::
memory
::
Copy
(
dev_ctx
.
GetPlace
(),
xpu_rows
,
CPUPlace
(),
rows
.
data
(),
row_count
*
sizeof
(
int
));
auto
row_numel
=
grad_tensor
.
numel
()
/
grad_merge
.
rows
().
size
();
auto
ori_rows
=
param
.
numel
()
/
row_numel
;
int
r
=
xpu
::
sparse_adam
(
dev_ctx
.
x_context
(),
grad_c
!=
nullptr
?
grad_c
:
grad_tensor
.
template
data
<
float
>(),
mom1_ptr
!=
nullptr
?
mom1_ptr
:
moment1
.
template
data
<
float
>(),
mom2_ptr
!=
nullptr
?
mom2_ptr
:
moment2
.
template
data
<
float
>(),
param_ptr
!=
nullptr
?
param_ptr
:
param
.
template
data
<
float
>(),
beta1_pow_ptr
!=
nullptr
?
beta1_pow_ptr
:
beta1_const_pow_ptr
,
beta2_pow_ptr
!=
nullptr
?
beta2_pow_ptr
:
beta2_const_pow_ptr
,
lr_ptr
!=
nullptr
?
lr_ptr
:
learning_rate
.
template
data
<
float
>(),
mom1_out_ptr
,
mom2_out_ptr
,
param_out_ptr
,
beta1_
,
beta2_
,
epsilon_
,
ori_rows
,
xpu_rows
,
row_numel
,
grad_merge
.
rows
().
size
(),
lazy_mode
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"adam"
);
funcs
::
FreeData
<
float
>
(
grad_tensor
,
grad_c
);
funcs
::
CopyOutData
<
Context
,
float
>
(
xpu_mom1_out
,
moment1_out
,
dev_ctx
);
funcs
::
CopyOutData
<
Context
,
float
>
(
xpu_mom2_out
,
moment1_out
,
dev_ctx
);
funcs
::
CopyOutData
<
Context
,
float
>
(
xpu_param_out
,
moment1_out
,
dev_ctx
);
if
(
!
use_global_beta_pow
)
{
// update in cpu and then copy to xpu
if
(
beta1_pow
.
place
()
==
CPUPlace
()
&&
beta2_pow
.
place
()
==
CPUPlace
())
{
funcs
::
SetBetaData
<
Context
,
float
>
(
beta1_pow
,
beta1_pow_out
,
beta1_
,
dev_ctx
);
funcs
::
SetBetaData
<
Context
,
float
>
(
beta2_pow
,
beta2_pow_out
,
beta2_
,
dev_ctx
);
}
else
{
float
*
beta1_pow_out_p1
=
nullptr
;
if
(
beta1_pow_out
->
dtype
()
==
DataType
::
FLOAT16
)
{
funcs
::
Scale
<
Context
,
float
>
(
beta1_pow_out
,
beta1_pow
,
beta1_pow_ptr
,
beta1_
,
dev_ctx
);
}
else
{
const
float
*
beta1_pow_data
=
beta1_pow
.
template
data
<
float
>();
beta1_pow_out_p1
=
dev_ctx
.
template
Alloc
<
float
>(
beta1_pow_out
);
r
=
xpu
::
scale
(
dev_ctx
.
x_context
(),
beta1_pow_data
,
beta1_pow_out_p1
,
beta1_pow
.
numel
(),
false
,
beta1_
,
0.0
f
);
xpu_wait
(
dev_ctx
.
x_context
()
->
xpu_stream
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"adam"
);
}
float
*
beta2_pow_out_p1
=
nullptr
;
if
(
beta2_pow_out
->
dtype
()
==
DataType
::
FLOAT16
)
{
funcs
::
Scale
<
Context
,
float
>
(
beta2_pow_out
,
beta2_pow
,
beta2_pow_ptr
,
beta2_
,
dev_ctx
);
}
else
{
const
float
*
beta2_pow_data
=
beta2_pow
.
template
data
<
float
>();
beta2_pow_out_p1
=
dev_ctx
.
template
Alloc
<
float
>(
beta2_pow_out
);
r
=
xpu
::
scale
(
dev_ctx
.
x_context
(),
beta2_pow_data
,
beta2_pow_out_p1
,
beta2_pow
.
numel
(),
false
,
beta2_
,
0.0
f
);
xpu_wait
(
dev_ctx
.
x_context
()
->
xpu_stream
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"adam"
);
}
}
}
funcs
::
FreeData
<
float
>
(
param
,
param_ptr
);
funcs
::
FreeData
<
float
>
(
moment1
,
mom1_ptr
);
funcs
::
FreeData
<
float
>
(
moment2
,
mom2_ptr
);
funcs
::
FreeData
<
float
>
(
learning_rate
,
lr_ptr
);
}
}
// namespace sr
}
// namespace phi
PD_REGISTER_KERNEL
(
adam_dense_param_sparse_grad
,
XPU
,
ALL_LAYOUT
,
phi
::
sr
::
AdamDenseParamSparseGradKernel
,
float
,
phi
::
dtype
::
float16
)
{
// Skip beta1_pow, beta2_pow, skip_update data transform
kernel
->
InputAt
(
5
).
SetBackend
(
phi
::
Backend
::
ALL_BACKEND
);
kernel
->
InputAt
(
6
).
SetBackend
(
phi
::
Backend
::
ALL_BACKEND
);
kernel
->
InputAt
(
8
).
SetBackend
(
phi
::
Backend
::
ALL_BACKEND
);
}
paddle/phi/kernels/xpu/adam_kernel.cc
0 → 100644
浏览文件 @
cbabbe2e
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/adam_kernel.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
namespace
phi
{
using
float16
=
dtype
::
float16
;
template
<
typename
T
,
typename
Context
>
void
AdamDenseKernel
(
const
Context
&
dev_ctx
,
const
DenseTensor
&
param
,
const
DenseTensor
&
grad
,
const
DenseTensor
&
learning_rate
,
const
DenseTensor
&
moment1
,
const
DenseTensor
&
moment2
,
const
DenseTensor
&
beta1_pow
,
const
DenseTensor
&
beta2_pow
,
const
paddle
::
optional
<
DenseTensor
>&
master_param
,
const
paddle
::
optional
<
DenseTensor
>&
skip_update
,
const
Scalar
&
beta1
,
const
Scalar
&
beta2
,
const
Scalar
&
epsilon
,
bool
lazy_mode
,
int64_t
min_row_size_to_use_multithread
,
bool
multi_precision
,
bool
use_global_beta_pow
,
DenseTensor
*
param_out
,
DenseTensor
*
moment1_out
,
DenseTensor
*
moment2_out
,
DenseTensor
*
beta1_pow_out
,
DenseTensor
*
beta2_pow_out
,
DenseTensor
*
master_param_outs
)
{
float
*
param_ptr
=
nullptr
;
funcs
::
GetDataPointer
<
Context
,
float
>
(
param
,
&
param_ptr
,
dev_ctx
);
float
*
mom1_ptr
=
nullptr
;
funcs
::
GetDataPointer
<
Context
,
float
>
(
moment1
,
&
mom1_ptr
,
dev_ctx
);
float
*
mom2_ptr
=
nullptr
;
funcs
::
GetDataPointer
<
Context
,
float
>
(
moment2
,
&
mom2_ptr
,
dev_ctx
);
float
*
lr_ptr
=
nullptr
;
funcs
::
GetDataPointer
<
Context
,
float
>
(
learning_rate
,
&
lr_ptr
,
dev_ctx
);
float
*
beta1_pow_ptr
=
nullptr
;
const
float
*
beta1_const_pow_ptr
=
nullptr
;
if
(
beta1_pow
.
place
()
==
CPUPlace
())
{
DenseTensor
xpu_beta1_pow
;
phi
::
Copy
(
dev_ctx
,
beta1_pow
,
beta1_pow
.
place
(),
false
,
&
xpu_beta1_pow
);
if
(
xpu_beta1_pow
.
dtype
()
==
DataType
::
FLOAT16
)
funcs
::
GetDataPointer
<
Context
,
float
>
(
xpu_beta1_pow
,
&
beta1_pow_ptr
,
dev_ctx
);
else
beta1_const_pow_ptr
=
xpu_beta1_pow
.
template
data
<
float
>();
}
else
{
if
(
beta1_pow
.
dtype
()
==
DataType
::
FLOAT16
)
funcs
::
GetDataPointer
<
Context
,
float
>
(
beta1_pow
,
&
beta1_pow_ptr
,
dev_ctx
);
else
beta1_const_pow_ptr
=
beta1_pow
.
template
data
<
float
>();
}
float
*
beta2_pow_ptr
=
nullptr
;
const
float
*
beta2_const_pow_ptr
=
nullptr
;
if
(
beta2_pow
.
place
()
==
CPUPlace
())
{
DenseTensor
xpu_beta2_pow
;
phi
::
Copy
(
dev_ctx
,
beta2_pow
,
beta2_pow
.
place
(),
false
,
&
xpu_beta2_pow
);
if
(
xpu_beta2_pow
.
dtype
()
==
DataType
::
FLOAT16
)
funcs
::
GetDataPointer
<
Context
,
float
>
(
xpu_beta2_pow
,
&
beta2_pow_ptr
,
dev_ctx
);
else
beta2_const_pow_ptr
=
xpu_beta2_pow
.
template
data
<
float
>();
}
else
{
if
(
beta2_pow
.
dtype
()
==
DataType
::
FLOAT16
)
funcs
::
GetDataPointer
<
Context
,
float
>
(
beta2_pow
,
&
beta2_pow_ptr
,
dev_ctx
);
else
beta2_const_pow_ptr
=
beta2_pow
.
template
data
<
float
>();
}
DenseTensor
xpu_param_out
;
float
*
param_out_ptr
=
nullptr
;
const
phi
::
DenseTensorMeta
meta_param
(
DataType
::
FLOAT32
,
param_out
->
dims
());
xpu_param_out
.
set_meta
(
meta_param
);
funcs
::
GetOutDataPointer
<
Context
,
float
>
(
param_out
,
&
xpu_param_out
,
&
param_out_ptr
,
dev_ctx
);
DenseTensor
xpu_mom1_out
;
float
*
mom1_out_ptr
=
nullptr
;
const
phi
::
DenseTensorMeta
meta_mom1
(
DataType
::
FLOAT32
,
moment1_out
->
dims
());
xpu_mom1_out
.
set_meta
(
meta_mom1
);
funcs
::
GetOutDataPointer
<
Context
,
float
>
(
moment1_out
,
&
xpu_mom1_out
,
&
mom1_out_ptr
,
dev_ctx
);
DenseTensor
xpu_mom2_out
;
float
*
mom2_out_ptr
=
nullptr
;
const
phi
::
DenseTensorMeta
meta_mom2
(
DataType
::
FLOAT32
,
moment2_out
->
dims
());
xpu_mom2_out
.
set_meta
(
meta_mom2
);
funcs
::
GetOutDataPointer
<
Context
,
float
>
(
moment2_out
,
&
xpu_mom2_out
,
&
mom2_out_ptr
,
dev_ctx
);
bool
skip_update_
=
false
;
if
(
skip_update
.
is_initialized
())
{
PADDLE_ENFORCE_EQ
(
skip_update
->
numel
(),
1
,
errors
::
InvalidArgument
(
"Input(SkipUpdate) size must be 1, but get %d"
,
skip_update
->
numel
()));
std
::
vector
<
bool
>
skip_update_vec
;
paddle
::
framework
::
TensorToVector
(
*
skip_update
,
dev_ctx
,
&
skip_update_vec
);
skip_update_
=
skip_update_vec
[
0
];
}
if
(
skip_update_
)
{
VLOG
(
4
)
<<
"Adam skip update"
;
phi
::
Copy
(
dev_ctx
,
param
,
dev_ctx
.
GetPlace
(),
false
,
param_out
);
phi
::
Copy
(
dev_ctx
,
moment1
,
dev_ctx
.
GetPlace
(),
false
,
moment1_out
);
phi
::
Copy
(
dev_ctx
,
moment2
,
dev_ctx
.
GetPlace
(),
false
,
moment2_out
);
phi
::
Copy
(
dev_ctx
,
beta1_pow
,
beta1_pow
.
place
(),
false
,
beta1_pow_out
);
phi
::
Copy
(
dev_ctx
,
beta2_pow
,
beta2_pow
.
place
(),
false
,
beta2_pow_out
);
return
;
}
PADDLE_ENFORCE_EQ
(
beta1_pow_out
->
numel
(),
1
,
errors
::
InvalidArgument
(
"Tensor holds the wrong size, Expected beta1 pow "
"output size is 1, but received "
"value is:%d."
,
beta1_pow_out
->
numel
()));
PADDLE_ENFORCE_EQ
(
beta2_pow_out
->
numel
(),
1
,
errors
::
InvalidArgument
(
"Tensor holds the wrong size, Expected beta2 pow "
"output size is 1, but received "
"value is:%d."
,
beta2_pow_out
->
numel
()));
VLOG
(
4
)
<<
"use_global_beta_pow:"
<<
use_global_beta_pow
;
auto
beta1_
=
beta1
.
to
<
float
>
();
auto
beta2_
=
beta2
.
to
<
float
>
();
auto
epsilon_
=
epsilon
.
to
<
float
>
();
float
*
grad_c
=
nullptr
;
funcs
::
GetDataPointer
<
Context
,
float
>
(
grad
,
&
grad_c
,
dev_ctx
);
int
r
=
xpu
::
adam
(
dev_ctx
.
x_context
(),
grad_c
!=
nullptr
?
grad_c
:
grad
.
template
data
<
float
>(),
mom1_ptr
!=
nullptr
?
mom1_ptr
:
moment1
.
template
data
<
float
>(),
mom2_ptr
!=
nullptr
?
mom2_ptr
:
moment2
.
template
data
<
float
>(),
param_ptr
!=
nullptr
?
param_ptr
:
param
.
template
data
<
float
>(),
beta1_pow_ptr
!=
nullptr
?
beta1_pow_ptr
:
beta1_const_pow_ptr
,
beta2_pow_ptr
!=
nullptr
?
beta2_pow_ptr
:
beta2_const_pow_ptr
,
lr_ptr
!=
nullptr
?
lr_ptr
:
learning_rate
.
template
data
<
float
>(),
mom1_out_ptr
,
mom2_out_ptr
,
param_out_ptr
,
beta1_
,
beta2_
,
epsilon_
,
param
.
numel
());
xpu_wait
(
dev_ctx
.
x_context
()
->
xpu_stream
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"adam"
);
funcs
::
FreeData
<
float
>
(
grad
,
grad_c
);
funcs
::
CopyOutData
<
Context
,
float
>
(
xpu_mom1_out
,
moment1_out
,
dev_ctx
);
funcs
::
CopyOutData
<
Context
,
float
>
(
xpu_mom2_out
,
moment2_out
,
dev_ctx
);
funcs
::
CopyOutData
<
Context
,
float
>
(
xpu_param_out
,
param_out
,
dev_ctx
);
if
(
!
use_global_beta_pow
)
{
// update in cpu and then copy to xpu
if
(
beta1_pow
.
place
()
==
CPUPlace
()
&&
beta2_pow
.
place
()
==
CPUPlace
())
{
funcs
::
SetBetaData
<
Context
,
float
>
(
beta1_pow
,
beta1_pow_out
,
beta1_
,
dev_ctx
);
funcs
::
SetBetaData
<
Context
,
float
>
(
beta2_pow
,
beta2_pow_out
,
beta2_
,
dev_ctx
);
}
else
{
float
*
beta1_pow_out_p1
=
nullptr
;
if
(
beta1_pow_out
->
dtype
()
==
DataType
::
FLOAT16
)
{
funcs
::
Scale
<
Context
,
float
>
(
beta1_pow_out
,
beta1_pow
,
beta1_pow_ptr
,
beta1_
,
dev_ctx
);
}
else
{
const
float
*
beta1_pow_data
=
beta1_pow
.
template
data
<
float
>();
beta1_pow_out_p1
=
dev_ctx
.
template
Alloc
<
float
>(
beta1_pow_out
);
r
=
xpu
::
scale
(
dev_ctx
.
x_context
(),
beta1_pow_data
,
beta1_pow_out_p1
,
beta1_pow
.
numel
(),
false
,
beta1_
,
0.0
f
);
xpu_wait
(
dev_ctx
.
x_context
()
->
xpu_stream
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"adam"
);
}
float
*
beta2_pow_out_p1
=
nullptr
;
if
(
beta2_pow_out
->
dtype
()
==
DataType
::
FLOAT16
)
{
funcs
::
Scale
<
Context
,
float
>
(
beta2_pow_out
,
beta2_pow
,
beta2_pow_ptr
,
beta2_
,
dev_ctx
);
}
else
{
const
float
*
beta2_pow_data
=
beta2_pow
.
template
data
<
float
>();
beta2_pow_out_p1
=
dev_ctx
.
template
Alloc
<
float
>(
beta2_pow_out
);
r
=
xpu
::
scale
(
dev_ctx
.
x_context
(),
beta2_pow_data
,
beta2_pow_out_p1
,
beta2_pow
.
numel
(),
false
,
beta2_
,
0.0
f
);
xpu_wait
(
dev_ctx
.
x_context
()
->
xpu_stream
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"adam"
);
}
}
}
funcs
::
FreeData
<
float
>
(
param
,
param_ptr
);
funcs
::
FreeData
<
float
>
(
moment1
,
mom1_ptr
);
funcs
::
FreeData
<
float
>
(
moment2
,
mom2_ptr
);
funcs
::
FreeData
<
float
>
(
learning_rate
,
lr_ptr
);
}
}
// namespace phi
PD_REGISTER_KERNEL
(
adam
,
XPU
,
ALL_LAYOUT
,
phi
::
AdamDenseKernel
,
float
,
phi
::
dtype
::
float16
)
{
// Skip beta1_pow, beta2_pow, skip_update data transform
kernel
->
InputAt
(
5
).
SetBackend
(
phi
::
Backend
::
ALL_BACKEND
);
kernel
->
InputAt
(
6
).
SetBackend
(
phi
::
Backend
::
ALL_BACKEND
);
kernel
->
InputAt
(
8
).
SetBackend
(
phi
::
Backend
::
ALL_BACKEND
);
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录