Unverified commit 6b0c57cf
Authored by zhangbo9674 on Jan 20, 2022; committed via GitHub on Jan 20, 2022

Fix master weight bug for multi_tensor optimizer(momentum, adam) (#38991)
* fix mp
* support merged_momentum for mp
Parent: c0f27282
Showing 3 changed files with 68 additions and 60 deletions (+68 −60)
paddle/fluid/operators/optimizers/merged_momentum_op.h   +60 −50
python/paddle/optimizer/adam.py                          +4 −5
python/paddle/optimizer/momentum.py                      +4 −5
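The Python side of the bug is easy to see in isolation: _append_optimize_multi_tensor_op walks the parameter groups in the order ['FP32_LODTensor', 'FP16_LODTensor'], and the old code set self._multi_precision = False when it met the FP32 group, clobbering the optimizer-wide flag so the FP16 group that follows also lost its master weights. A minimal standalone sketch of the two behaviors (hypothetical names, not Paddle code):

# Minimal standalone sketch of the flag handling fixed below; not Paddle code.
multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor']
param_dict = {'FP32_LODTensor': ['w_fp32'], 'FP16_LODTensor': ['w_fp16']}  # hypothetical groups

def old_flags(multi_precision):
    flags = {}
    for key in multi_tensor_list:
        if len(param_dict[key]) > 0:
            if key == 'FP32_LODTensor':
                multi_precision = False   # clobbers the shared flag...
            flags[key] = multi_precision  # ...so the FP16 group sees False too
    return flags

def new_flags(multi_precision):
    flags = {}
    for key in multi_tensor_list:
        if len(param_dict[key]) > 0:
            # per-group decision; the shared flag is left untouched
            flags[key] = multi_precision and key == 'FP16_LODTensor'
    return flags

print(old_flags(True))  # {'FP32_LODTensor': False, 'FP16_LODTensor': False} <- bug
print(new_flags(True))  # {'FP32_LODTensor': False, 'FP16_LODTensor': True}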
paddle/fluid/operators/optimizers/merged_momentum_op.h
@@ -48,13 +48,13 @@ struct MergedMomentumKernelParam
   T* PADDLE_RESTRICT params[N];
   const T* PADDLE_RESTRICT grads[N];
   MT* PADDLE_RESTRICT velocitys[N];
-  const MT* PADDLE_RESTRICT lr;
+  const MultiPrecisionType<MT>* PADDLE_RESTRICT lr;
   MT mu;
   MT rescale_grad;
   uint32_t param_num;

   HOSTDEVICE void operator()(size_t i) const {
-    const auto lr_val = *lr;
+    const MT lr_val = static_cast<MT>(*lr);
     for (uint32_t idx = 0; idx < param_num; ++idx) {
       auto size = sizes[idx];
       if (i >= size) continue;
@@ -81,8 +81,22 @@ struct MergedMomentumKernelParam
 template <typename DeviceContext, typename T>
 class MergedMomentumOpKernel : public framework::OpKernel<T> {
+  using MPType = typename operators::details::MPTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const bool multi_precision = ctx.Attr<bool>("multi_precision");
+    if (multi_precision) {
+      InnerCompute<MPType>(ctx, multi_precision);
+    } else {
+      InnerCompute<T>(ctx, multi_precision);
+    }
+  }
+
+ private:
+  template <typename MT>
+  void InnerCompute(const framework::ExecutionContext& ctx,
+                    const bool multi_precision) const {
     auto params = ctx.MultiInput<framework::Tensor>("Param");
     auto params_out = ctx.MultiOutput<framework::Tensor>("ParamOut");
     size_t n = params.size();
@@ -133,7 +147,6 @@ class MergedMomentumOpKernel : public framework::OpKernel<T> {
     auto master_params = ctx.MultiInput<framework::Tensor>("MasterParam");
     auto master_params_out =
         ctx.MultiOutput<framework::Tensor>("MasterParamOut");
-    auto multi_precision = ctx.Attr<bool>("multi_precision");
     if (multi_precision) {
       PADDLE_ENFORCE_EQ(n, master_params.size(),
@@ -206,39 +219,37 @@ class MergedMomentumOpKernel : public framework::OpKernel<T> {
             << ", regularization_coeffs.size(): "
             << regularization_coeffs.size();
-    using MPType = typename operators::details::MPTypeTrait<T>::Type;
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     if (lrs.size() == 1 && use_nesterov == false &&
         regularization_methods.size() == 0) {
-#define PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(kMultiPrecision)                \
-  MergedMomentumKernelParam<T, MPType, kMultiPrecision> kernel_params;       \
-  constexpr auto kMaxMergedNum = decltype(kernel_params)::N;                 \
-  size_t kernel_num = (n + kMaxMergedNum - 1) / kMaxMergedNum;               \
-  kernel_params.mu = static_cast<MPType>(mu);                                \
-  kernel_params.rescale_grad = static_cast<MPType>(rescale_grad);            \
-  kernel_params.lr = lrs[0]->data<MPType>();                                 \
-  for (size_t i = 0; i < kernel_num; ++i) {                                  \
-    size_t start = i * kMaxMergedNum;                                        \
-    size_t end = std::min((i + 1) * kMaxMergedNum, n);                       \
-    kernel_params.param_num = static_cast<uint32_t>(end - start);            \
-    size_t max_size = 0;                                                     \
-    for (size_t j = 0; j < kernel_params.param_num; ++j) {                   \
-      auto size = static_cast<size_t>(params_out[j + start]->numel());       \
-      max_size = std::max(max_size, size);                                   \
-      kernel_params.sizes[j] = size;                                         \
-      kernel_params.params[j] = params_out[j + start]->data<T>();            \
-      kernel_params.grads[j] = grads[j + start]->data<T>();                  \
-      kernel_params.velocitys[j] = velocitys_out[j + start]->data<MPType>(); \
-      kernel_params.SetMasterParam(                                          \
-          j, kMultiPrecision ? master_params_out[j + start]->data<MPType>()  \
-                             : nullptr);                                     \
-    }                                                                        \
-    platform::ForRange<DeviceContext> for_range(dev_ctx, max_size);          \
-    for_range(kernel_params);                                                \
-    VLOG(10) << "Launch MergedMomentum kernel " << i << " "                  \
-             << kernel_params.param_num;                                     \
-  }
+#define PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(kMultiPrecision)             \
+  MergedMomentumKernelParam<T, MT, kMultiPrecision> kernel_params;        \
+  constexpr auto kMaxMergedNum = decltype(kernel_params)::N;              \
+  size_t kernel_num = (n + kMaxMergedNum - 1) / kMaxMergedNum;            \
+  kernel_params.mu = static_cast<MT>(mu);                                 \
+  kernel_params.rescale_grad = static_cast<MT>(rescale_grad);             \
+  kernel_params.lr = lrs[0]->data<MPType>();                              \
+  for (size_t i = 0; i < kernel_num; ++i) {                               \
+    size_t start = i * kMaxMergedNum;                                     \
+    size_t end = std::min((i + 1) * kMaxMergedNum, n);                    \
+    kernel_params.param_num = static_cast<uint32_t>(end - start);         \
+    size_t max_size = 0;                                                  \
+    for (size_t j = 0; j < kernel_params.param_num; ++j) {                \
+      auto size = static_cast<size_t>(params_out[j + start]->numel());    \
+      max_size = std::max(max_size, size);                                \
+      kernel_params.sizes[j] = size;                                      \
+      kernel_params.params[j] = params_out[j + start]->data<T>();         \
+      kernel_params.grads[j] = grads[j + start]->data<T>();               \
+      kernel_params.velocitys[j] = velocitys_out[j + start]->data<MT>();  \
+      kernel_params.SetMasterParam(                                       \
+          j, kMultiPrecision ? master_params_out[j + start]->data<MT>()   \
+                             : nullptr);                                  \
+    }                                                                     \
+    platform::ForRange<DeviceContext> for_range(dev_ctx, max_size);       \
+    for_range(kernel_params);                                             \
+    VLOG(10) << "Launch MergedMomentum kernel " << i << " "               \
+             << kernel_params.param_num;                                  \
+  }
       if (multi_precision) {
         PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(true);
@@ -254,34 +265,33 @@ class MergedMomentumOpKernel : public framework::OpKernel<T> {
                 ? RegularizationType::kL2DECAY
                 : RegularizationType::kNONE;
-      MPType regularization_coeff = static_cast<MPType>(0.0);
+      MT regularization_coeff = static_cast<MT>(0.0);
       if (regularization_coeffs.size() != 0) {
-        regularization_coeff = static_cast<MPType>(regularization_coeffs[idx]);
+        regularization_coeff = static_cast<MT>(regularization_coeffs[idx]);
       }
       auto lr_temp = lrs.size() > 1 ? lrs[idx] : lrs[0];

-      const MPType* master_in_data =
-          multi_precision ? master_params[idx]->data<MPType>() : nullptr;
-      MPType* master_out_data =
-          multi_precision ? master_params_out[idx]->data<MPType>() : nullptr;
+      const MT* master_in_data =
+          multi_precision ? master_params[idx]->data<MT>() : nullptr;
+      MT* master_out_data =
+          multi_precision ? master_params_out[idx]->data<MT>() : nullptr;
       if (platform::is_cpu_place(ctx.GetPlace())) {
-        CPUDenseMomentumFunctor<MPType> functor;
-        functor(params[idx], grads[idx], velocitys[idx], lr_temp, mu,
-                use_nesterov, regularization_flag, regularization_coeff,
-                params_out[idx], velocitys_out[idx]);
+        CPUDenseMomentumFunctor<MT> functor;
+        functor(params[idx], grads[idx], velocitys[idx], lr_temp,
+                static_cast<MT>(mu), use_nesterov, regularization_flag,
+                regularization_coeff, params_out[idx], velocitys_out[idx]);
         VLOG(10) << "Launch MergedMomentum cpu kernel.";
       } else if (platform::is_gpu_place(ctx.GetPlace())) {
         platform::ForRange<DeviceContext> for_range(
             static_cast<const DeviceContext&>(ctx.device_context()),
             params[idx]->numel());
-#define PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(__nesterov, __reg_type)          \
-  DenseMomentumFunctor<T, MPType, __reg_type, __nesterov> functor(             \
-      params[idx]->data<T>(), grads[idx]->data<T>(),                           \
-      velocitys[idx]->data<MPType>(), lr_temp->data<MPType>(), master_in_data, \
-      mu, rescale_grad, params[idx]->numel(), regularization_coeff,            \
-      params_out[idx]->data<T>(), velocitys_out[idx]->data<MPType>(),          \
-      master_out_data);                                                        \
+#define PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(__nesterov, __reg_type)         \
+  DenseMomentumFunctor<T, MT, __reg_type, __nesterov> functor(                \
+      params[idx]->data<T>(), grads[idx]->data<T>(),                          \
+      velocitys[idx]->data<MT>(), lr_temp->data<MPType>(), master_in_data,    \
+      static_cast<MT>(mu), static_cast<MT>(rescale_grad),                     \
+      params[idx]->numel(), regularization_coeff, params_out[idx]->data<T>(), \
+      velocitys_out[idx]->data<MT>(), master_out_data);                       \
   for_range(functor);
       if (use_nesterov) {
         if (regularization_flag == RegularizationType::kL2DECAY) {
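For context on the MT split introduced above: with multi_precision on, MT is float32 while the stored params and grads stay float16 (T); the velocity, learning rate, and master weights live in MT and the update math runs entirely in MT, with the fp16 param written back as a cast-down copy of the fp32 master weight. A rough NumPy sketch of the momentum step under those assumptions (plain momentum, no Nesterov or L2 decay, matching the fast path above; illustrative only, not the kernel):

# Rough NumPy sketch of one multi-precision merged-momentum step; not Paddle code.
import numpy as np

def merged_momentum_step(params16, grads16, velocitys32, masters32,
                         lr, mu, rescale_grad=1.0):
    for i in range(len(params16)):
        g = grads16[i].astype(np.float32) * rescale_grad   # read grad in MT
        velocitys32[i] = mu * velocitys32[i] + g           # velocity kept in MT
        masters32[i] -= lr * velocitys32[i]                # master weight update in MT
        params16[i] = masters32[i].astype(np.float16)      # param = cast-down copy

params = [np.ones(4, np.float16)]
grads = [np.full(4, 0.5, np.float16)]
velocitys = [np.zeros(4, np.float32)]
masters = [p.astype(np.float32) for p in params]
merged_momentum_step(params, grads, velocitys, masters, lr=0.1, mu=0.9)
print(params[0].dtype, masters[0])  # float16 [0.95 0.95 0.95 0.95]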
python/paddle/optimizer/adam.py

@@ -551,8 +551,7 @@ class Adam(Optimizer):
         multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor']
         for key in multi_tensor_list:
             if len(self._param_dict[key]) > 0:
-                if key == 'FP32_LODTensor':
-                    self._multi_precision = False
+                find_master = self._multi_precision and key == 'FP16_LODTensor'
                 _beta1 = self._beta1 if not isinstance(
                     self._beta1, Variable) else self._beta1.numpy().item(0)
@@ -571,7 +570,7 @@ class Adam(Optimizer):
                         self._beta2_pow_acc_dict[key],
                         self._master_weight_dict[key], 'epsilon', self._epsilon,
                         'beta1', _beta1, 'beta2', _beta2, 'multi_precision',
-                        self._multi_precision)
+                        find_master)
                 else:
                     inputs = {
                         "Param": self._param_dict[key],
@@ -594,11 +593,11 @@ class Adam(Optimizer):
                         "beta1": _beta1,
                         "beta2": _beta2
                     }
-                    if self._multi_precision:
+                    if find_master:
                         inputs["MasterParam"] = self._master_weight_dict[key]
                         outputs["MasterParamOut"] = self._master_weight_dict[
                             key]
-                        attrs["multi_precision"] = self._multi_precision
+                        attrs["multi_precision"] = find_master
                     target_block.append_op(
                         type="merged_adam",
                         inputs=inputs,
python/paddle/optimizer/momentum.py

@@ -464,8 +464,7 @@ class Momentum(Optimizer):
         multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor']
         for key in multi_tensor_list:
             if len(self._param_dict[key]) > 0:
-                if key == 'FP32_LODTensor':
-                    self._multi_precision = False
+                find_master = self._multi_precision and key == 'FP16_LODTensor'
                 if framework.in_dygraph_mode():
                     _, _, _ = _C_ops.merged_momentum(
@@ -478,7 +477,7 @@ class Momentum(Optimizer):
                         self._regularization_method_dict[key],
                         'regularization_coeff',
                         self._regularization_coeff_dict[key], 'multi_precision',
-                        self._multi_precision)
+                        find_master)
                 else:
                     inputs = {
                         "Param": self._param_dict[key],
@@ -498,11 +497,11 @@ class Momentum(Optimizer):
                         "regularization_coeff":
                         self._regularization_coeff_dict[key],
                     }
-                    if self._multi_precision:
+                    if find_master:
                         inputs["MasterParam"] = self._master_weight_dict[key]
                         outputs["MasterParamOut"] = self._master_weight_dict[
                             key]
-                        attrs["multi_precision"] = self._multi_precision
+                        attrs["multi_precision"] = find_master
                     target_block.append_op(
                         type="merged_momentum",
                         inputs=inputs,
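With the per-group find_master flag restored, exercising the fixed path from user code is just a matter of the optimizer arguments. A hedged usage sketch, assuming the multi_precision and use_multi_tensor arguments as exposed by paddle.optimizer.Momentum around the time of this commit (the same flags apply to Adam):

# Hedged usage sketch: fp16 parameters with fp32 master weights via merged_momentum.
import paddle

model = paddle.nn.Linear(16, 16)
model = paddle.amp.decorate(models=model, level='O2')  # cast params to fp16

opt = paddle.optimizer.Momentum(
    learning_rate=0.1,
    parameters=model.parameters(),
    multi_precision=True,   # keep fp32 master weights for fp16 params
    use_multi_tensor=True)  # route updates through the merged_momentum op

scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
x = paddle.randn([4, 16])
with paddle.amp.auto_cast(level='O2'):
    loss = model(x).mean()
scaled = scaler.scale(loss)
scaled.backward()
# After this fix, the FP16 group is dispatched with multi_precision=True again.
scaler.minimize(opt, scaled)
opt.clear_grad()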