PaddlePaddle / Paddle — commit 4d805e6a (unverified)
Authored Jun 03, 2021 by Yuang Liu; committed via GitHub on Jun 03, 2021.
multi precision for lars op and lars optimizer (#33280)
Parent: fc5b3a99
Showing 6 changed files with 271 additions and 55 deletions (+271 −55):
paddle/fluid/operators/optimizers/lars_momentum_op.cc                  +14   -0
paddle/fluid/operators/optimizers/lars_momentum_op.cu                  +89  -30
paddle/fluid/operators/optimizers/momentum_op.h                         +3   -0
python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py   +15   -7
python/paddle/fluid/optimizer.py                                       +92  -18
python/paddle/fluid/tests/unittests/test_momentum_op.py                +58   -0
paddle/fluid/operators/optimizers/lars_momentum_op.cc (file mode 100755 → 100644)

@@ -34,6 +34,7 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("LearningRate",
              "(LoDTensor, default LoDTensor<float>) "
              "Input learning rate");
+    AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
     AddOutput("ParamOut",
               "(LoDTensor) This output is updated parameter. "
@@ -41,6 +42,10 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("VelocityOut",
               "(LoDTensor) This output is updated velocity. "
               "It shared memory with Input(Velocity).");
+    AddOutput("MasterParamOut",
+              "The updated FP32 master weight for AMP. "
+              "It shared memory with Input(MasterParam).")
+        .AsDispensable();
     AddAttr<float>("mu", "(float) Momentum coefficient");
     AddAttr<float>("lars_coeff", "(float, default 0.001) LARS coefficient.")
@@ -51,6 +56,15 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<float>("epsilon",
                    "(float, default 0.0) epsilon to avoid Division by Zero.")
         .SetDefault(0.0);
+    AddAttr<bool>("multi_precision",
+                  "(bool, default false) "
+                  "Whether to use multi-precision during weight updating.")
+        .SetDefault(false);
+    AddAttr<float>(
+        "rescale_grad",
+        "(float, default 1.0) Multiply the gradient with `rescale_grad`"
+        "before updating. Often choose to be `1.0/batch_size`.")
+        .SetDefault(1.0f);
     AddComment(R"DOC(
 Lars Momentum Optimizer.
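For orientation before the kernel changes below: the new `epsilon` attribute enters the LARS local learning rate as a guard on the denominator, and `rescale_grad` scales the gradient before the update. A minimal NumPy sketch of that local-learning-rate formula (the helper name `lars_local_lr` is illustrative only, not a Paddle API):

import numpy as np

def lars_local_lr(param, grad, lr, lars_coeff=0.001,
                  lars_weight_decay=0.0005, epsilon=0.0):
    # local_lr = lr * lars_coeff * ||param|| /
    #            (||grad|| + lars_weight_decay * ||param|| + epsilon)
    p_norm = np.sqrt(np.square(param).sum())
    g_norm = np.sqrt(np.square(grad).sum())
    if lars_weight_decay > 0 and p_norm > 0 and g_norm > 0:
        return lr * lars_coeff * p_norm / (
            g_norm + lars_weight_decay * p_norm + epsilon)
    return lr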
paddle/fluid/operators/optimizers/lars_momentum_op.cu

@@ -13,36 +13,64 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/optimizers/lars_momentum_op.h"

 namespace paddle {
 namespace operators {

-template <typename T>
-__global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
-                                   const T* learning_rate, const T mu,
-                                   const int64_t num, const T lars_coeff,
-                                   const T lars_weight_decay, const T* p_norm,
-                                   const T* g_norm, T* p_out, T* v_out,
-                                   const T epsilon) {
-  T lr = learning_rate[0];
-  T local_lr = learning_rate[0];
+template <typename T>
+using MultiPrecisionType = typename details::MPTypeTrait<T>::Type;
+
+template <typename T, typename MT>
+__global__ void MomentumLarsKernel(
+    const T* p, const T* g, const MT* v,
+    const MultiPrecisionType<T>* learning_rate, const MT mu,
+    const int64_t num, const MT lars_coeff, const MT lars_weight_decay,
+    const MultiPrecisionType<T>* p_norm, const MultiPrecisionType<T>* g_norm,
+    T* p_out, MT* v_out, const MT epsilon, const MT* master_p,
+    MT* master_p_out, const MultiPrecisionType<T> rescale_grad) {
+  const MT lr = static_cast<MT>(learning_rate[0]);
+  MT local_lr = lr;
+  const MT p_n = static_cast<MT>(p_norm[0]);
+  const MT g_n = static_cast<MT>(g_norm[0]);
+  if (lars_weight_decay > static_cast<MT>(0) && p_n > static_cast<MT>(0) &&
+      g_n > static_cast<MT>(0)) {
+    local_lr =
+        lr * lars_coeff * p_n / (g_n + lars_weight_decay * p_n + epsilon);
+  }
   CUDA_KERNEL_LOOP(i, num) {
-    if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) {
-      local_lr = lr * lars_coeff * p_norm[0] /
-                 (g_norm[0] + lars_weight_decay * p_norm[0] + epsilon);
-    }
-    T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
+    MT grad = static_cast<MT>(g[i]) * static_cast<MT>(rescale_grad);
+    MT param = master_p ? master_p[i] : static_cast<MT>(p[i]);
+    MT v_new = v[i] * mu + local_lr * (grad + lars_weight_decay * param);
+    MT p_new = param - v_new;
     v_out[i] = v_new;
-    p_out[i] = p[i] - v_new;
+    p_out[i] = static_cast<T>(p_new);
+    if (master_p_out) master_p_out[i] = p_new;
   }
 }

 template <typename DeviceContext, typename T>
 class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
+  using MPDType = MultiPrecisionType<T>;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const bool multi_precision = ctx.Attr<bool>("multi_precision");
+    if (multi_precision) {
+      InnerCompute<MPDType>(ctx, multi_precision);
+    } else {
+      InnerCompute<T>(ctx, multi_precision);
+    }
+  }
+
+ private:
+  template <typename MT>
+  void InnerCompute(const framework::ExecutionContext& ctx,
+                    const bool multi_precision) const {
     auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
     auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
     auto param = ctx.Input<framework::LoDTensor>("Param");
@@ -50,18 +78,40 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
     auto grad = ctx.Input<framework::LoDTensor>("Grad");
     auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");

+    const framework::Tensor* master_param = nullptr;
+    framework::Tensor* master_param_out = nullptr;
+    if (multi_precision) {
+      bool has_master =
+          ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut");
+      PADDLE_ENFORCE_EQ(has_master, true,
+                        platform::errors::InvalidArgument(
+                            "The Input(MasterParam) and Output(MasterParamOut) "
+                            "should not be null when "
+                            "the attr `multi_precision` is true"));
+      master_param = ctx.Input<framework::Tensor>("MasterParam");
+      master_param_out = ctx.Output<framework::Tensor>("MasterParamOut");
+    }
+    const MT* master_p = multi_precision ? master_param->data<MT>() : nullptr;
+    MT* master_p_out = multi_precision
+                           ? master_param_out->mutable_data<MT>(ctx.GetPlace())
+                           : nullptr;
+
     T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
-    T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
+    MT* v_out = velocity_out->mutable_data<MT>(ctx.GetPlace());

-    T mu = static_cast<T>(ctx.Attr<float>("mu"));
-    T lars_coeff = ctx.Attr<float>("lars_coeff");
-    T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
-    T epsilon = ctx.Attr<float>("epsilon");
+    MT mu = static_cast<MT>(ctx.Attr<float>("mu"));
+    MT lars_coeff = static_cast<MT>(ctx.Attr<float>("lars_coeff"));
+    MT lars_weight_decay = static_cast<MT>(ctx.Attr<float>("lars_weight_decay"));
+    MT epsilon = static_cast<MT>(ctx.Attr<float>("epsilon"));
+    MPDType rescale_grad = static_cast<MPDType>(ctx.Attr<float>("rescale_grad"));

     auto* p = param->data<T>();
-    auto* v = velocity->data<T>();
     auto* g = grad->data<T>();
-    auto* lr = learning_rate->data<T>();
+    auto* v = velocity->data<MT>();
+    auto* lr = learning_rate->data<MPDType>();

     int block = 512;
     int grid = (param->numel() + block - 1) / block;
@@ -72,17 +122,24 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
     framework::Tensor p_norm_t, g_norm_t;
     p_norm_t.Resize({1});
     g_norm_t.Resize({1});
-    auto* p_norm_data = p_norm_t.mutable_data<T>(ctx.GetPlace());
-    auto* g_norm_data = g_norm_t.mutable_data<T>(ctx.GetPlace());
-    auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
-    auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
+    auto* p_norm_data = p_norm_t.mutable_data<MPDType>(ctx.GetPlace());
+    auto* g_norm_data = g_norm_t.mutable_data<MPDType>(ctx.GetPlace());
+    auto ep_norm = framework::EigenScalar<MPDType>::From(p_norm_t);
+    auto eg_norm = framework::EigenScalar<MPDType>::From(g_norm_t);

     auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-    ep_norm.device(*place) = eigen_p.square().sum().sqrt();
-    eg_norm.device(*place) = eigen_g.square().sum().sqrt();
-    MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
+
+    // eigen unsupport fp16 l2-norm
+    ep_norm.device(*place) =
+        eigen_p.template cast<MPDType>().square().sum().sqrt();
+    eg_norm.device(*place) =
+        (eigen_g.template cast<MPDType>() * rescale_grad).square().sum().sqrt();
+
+    MomentumLarsKernel<
+        T, MT><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
         p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
-        p_norm_data, g_norm_data, p_out, v_out, epsilon);
+        p_norm_data, g_norm_data, p_out, v_out, epsilon, master_p,
+        master_p_out, rescale_grad);
   }
 };
@@ -93,4 +150,6 @@ namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     lars_momentum,
     ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::float16>);
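Taken together, the rewritten kernel keeps the FP16 parameter and gradient tensors as its inputs, but computes the norms, the velocity update and the parameter update in FP32, writes the FP32 result to the master copy, and hands a down-cast copy back to the FP16 parameter. A minimal NumPy sketch of that control flow (illustrative only, not the CUDA code; the function name `lars_momentum_mp_step` is made up for this example):

import numpy as np

def lars_momentum_mp_step(p_fp16, g_fp16, v_fp32, master_p_fp32, lr,
                          mu=0.0001, lars_coeff=0.001,
                          lars_weight_decay=0.0005, epsilon=0.0,
                          rescale_grad=1.0):
    # Norms and the whole update run in FP32, mirroring the MPDType buffers.
    grad = g_fp16.astype(np.float32) * rescale_grad
    p_norm = np.sqrt(np.square(master_p_fp32).sum())
    g_norm = np.sqrt(np.square(grad).sum())
    local_lr = lr
    if lars_weight_decay > 0 and p_norm > 0 and g_norm > 0:
        local_lr = lr * lars_coeff * p_norm / (
            g_norm + lars_weight_decay * p_norm + epsilon)
    v_new = v_fp32 * mu + local_lr * (grad + lars_weight_decay * master_p_fp32)
    p_new = master_p_fp32 - v_new
    # Only the tensor handed back to the FP16 model is down-cast.
    return p_new.astype(np.float16), v_new, p_new  # ParamOut, VelocityOut, MasterParamOut

p = np.random.random((8, 4)).astype(np.float16)
g = np.random.random((8, 4)).astype(np.float16)
param_out, velocity_out, master_out = lars_momentum_mp_step(
    p, g, np.zeros((8, 4), dtype=np.float32), p.astype(np.float32), lr=0.001)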
paddle/fluid/operators/optimizers/momentum_op.h

@@ -135,6 +135,9 @@ class MomentumOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dim);
     ctx->SetOutputDim("VelocityOut", param_dim);
+    if (ctx->HasOutput("MasterParamOut")) {
+      ctx->SetOutputDim("MasterParamOut", param_dim);
+    }
   }

   framework::OpKernelType GetExpectedKernelType(
python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py

@@ -73,7 +73,7 @@ def resnet_cifar10(input, depth=32):
     return pool


-def train(use_pure_fp16=True, use_nesterov=False, use_adam=False):
+def train(use_pure_fp16=True, use_nesterov=False, optimizer=""):
     classdim = 10
     data_shape = [3, 32, 32]
     BATCH_SIZE = 32
@@ -96,12 +96,17 @@ def train(use_pure_fp16=True, use_nesterov=False, use_adam=False):
     # Test program
     test_program = train_program.clone(for_test=True)

-    if use_adam:
+    if optimizer == "Adam":
         optimizer = paddle.optimizer.AdamW(
             learning_rate=0.001,
             epsilon=1e-8,
             weight_decay=0.0,
             multi_precision=True)
+    elif optimizer == "Lars":
+        optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer(
+            learning_rate=0.001,
+            momentum=0.9,
+            multi_precision=use_pure_fp16)
     else:
         optimizer = paddle.optimizer.Momentum(
             learning_rate=0.001,
@@ -169,9 +174,11 @@ class TestImageMultiPrecision(unittest.TestCase):
         if not fluid.core.is_compiled_with_cuda():
             return

-        def do_test(use_nesterov=False, use_adam=False):
-            if use_adam:
+        def do_test(use_nesterov=False, optimizer=""):
+            if optimizer == "Adam":
                 suffix = "use Adam"
+            elif optimizer == "Lars":
+                suffix = "use Lars"
             else:
                 suffix = "with Nesterov" if use_nesterov else "without Nesterov"
             with self.scope_prog_guard():
@@ -180,14 +187,14 @@ class TestImageMultiPrecision(unittest.TestCase):
                 train_loss_fp16, test_loss_fp16 = train(
                     use_pure_fp16=True,
                     use_nesterov=use_nesterov,
-                    use_adam=use_adam)
+                    optimizer=optimizer)
             with self.scope_prog_guard():
                 print("-----------------FP32 Train {}-----------------".format(
                     suffix))
                 train_loss_fp32, test_loss_fp32 = train(
                     use_pure_fp16=False,
                     use_nesterov=use_nesterov,
-                    use_adam=use_adam)
+                    optimizer=optimizer)

                 self.assertTrue(
                     np.allclose(
@@ -208,7 +215,8 @@ class TestImageMultiPrecision(unittest.TestCase):
         do_test(use_nesterov=False)
         do_test(use_nesterov=True)
-        do_test(use_adam=True)
+        do_test(optimizer="Adam")
+        do_test(optimizer="Lars")

     @contextlib.contextmanager
     def scope_prog_guard(self):
python/paddle/fluid/optimizer.py

@@ -1725,6 +1725,9 @@ class LarsMomentumOptimizer(Optimizer):
             For details, please refer to :ref:`api_guide_Name`. Default is None.
         exclude_from_weight_decay (list[str], optional): Name string of layers which will be exclude from lars weight decay. Default is None.
         epsilon (float, optional): Epsilon to avoid Division by Zero when calculate local lr. Default is 0.
+        multi_precision (bool, optional): Whether to use multi-precision during weight updating.
+        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` \
+            before updating. Often choose to be `1.0/batch_size`.

     Examples:
         .. code-block:: python
@@ -1758,7 +1761,9 @@ class LarsMomentumOptimizer(Optimizer):
                  grad_clip=None,
                  name=None,
                  exclude_from_weight_decay=None,
-                 epsilon=0):
+                 epsilon=0,
+                 multi_precision=False,
+                 rescale_grad=1.0):
         assert learning_rate is not None
         assert momentum is not None
         super(LarsMomentumOptimizer, self).__init__(
@@ -1776,16 +1781,70 @@ class LarsMomentumOptimizer(Optimizer):
             self._exclude_from_weight_decay = []
         else:
             self._exclude_from_weight_decay = exclude_from_weight_decay
+        self._multi_precision = multi_precision
+        self._rescale_grad = float(rescale_grad)
+        self._master_weights = {}
+
+    def _create_master_weight(self, param):
+        assert isinstance(self.helper, LayerHelper)
+
+        var_name = param.name + '_fp32_master'
+        var_name = unique_name.generate(var_name)
+        var = layers.create_global_var(
+            name=var_name,
+            shape=param.shape,
+            value=0,
+            dtype='float32',
+            persistable=True)
+        block = self.helper.startup_program.global_block()
+        block.append_op(
+            type="cast",
+            inputs={"X": [param]},
+            outputs={"Out": [var]},
+            attrs={
+                "in_dtype": param.dtype,
+                "out_dtype": core.VarDesc.VarType.FP32
+            })
+        self._master_weights[param.name] = var
+        return var
+
+    def _get_accumulator(self, name, param):
+        """Utility function to fetch an accumulator for a parameter
+
+        Args:
+            name: name of the accumulator
+            param: parameter variable for which accumulator is to be fetched
+
+        Returns:
+            accumulator variable for the parameter
+        """
+        if self._name is not None:
+            name = self._name + "_" + name
+        find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
+        target_param = self._master_weights[
+            param.name] if find_master else param
+        target_name = target_param.name
+        if (name not in self._accumulators or
+                target_name not in self._accumulators[name]):
+            raise Exception("Accumulator {} does not exist for parameter {}".
+                            format(name, target_name))
+        return self._accumulators[name][target_name]

     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)

         for p in parameters:
+            if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
+                master_p = self._create_master_weight(p)
+                self._add_accumulator(self._velocity_acc_str, master_p)
+                continue
+            if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
+                warnings.warn(
+                    "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
+                    "Consider using multi_precision=True option of the Lars optimizer."
+                )
             self._add_accumulator(self._velocity_acc_str, p)

     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
         _lars_weight_decay = self._lars_weight_decay
         param_name = param_and_grad[0].name
         if len(self._exclude_from_weight_decay) > 0:
@@ -1796,25 +1855,40 @@ class LarsMomentumOptimizer(Optimizer):
         velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                              param_and_grad[0])
+        lr = self._create_param_lr(param_and_grad)
+
+        find_master = self._multi_precision and param_and_grad[
+            0].dtype == core.VarDesc.VarType.FP16
+        master_weight = (self._master_weights[param_and_grad[0].name]
+                         if find_master else None)
+
+        attrs = {
+            "mu": self._momentum,
+            "lars_coeff": self._lars_coeff,
+            "lars_weight_decay": _lars_weight_decay,
+            "multi_precision": find_master,
+            "rescale_grad": self._rescale_grad
+        }
+
+        inputs = {
+            "Param": param_and_grad[0],
+            "Grad": param_and_grad[1],
+            "Velocity": velocity_acc,
+            "LearningRate": lr
+        }
+
+        outputs = {"ParamOut": param_and_grad[0], "VelocityOut": velocity_acc}
+
+        if find_master:
+            inputs["MasterParam"] = master_weight
+            outputs["MasterParamOut"] = master_weight
+
         # create the momentum optimize op
         momentum_op = block.append_op(
             type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "Velocity": velocity_acc,
-                "LearningRate": self._create_param_lr(param_and_grad)
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "VelocityOut": velocity_acc
-            },
-            attrs={
-                "mu": self._momentum,
-                "lars_coeff": self._lars_coeff,
-                "lars_weight_decay": _lars_weight_decay,
-                "epsilon": self._epsilon
-            },
+            inputs=inputs,
+            outputs=outputs,
+            attrs=attrs,
             stop_gradient=True)

         return momentum_op
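With these changes, `LarsMomentumOptimizer` can be asked to keep FP32 master weights for FP16 parameters via `multi_precision=True` and to scale gradients via `rescale_grad`. Below is a minimal static-graph usage sketch, assuming a Paddle build from around this commit; the toy network is purely illustrative (an FP32 regression layer), so the master-weight path only engages for real FP16 parameters such as those produced by the AMP utilities exercised in test_multi_precision_fp16_train.py:

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name='x', shape=[None, 13], dtype='float32')
    y = fluid.data(name='y', shape=[None, 1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(pred, y))
    opt = fluid.optimizer.LarsMomentumOptimizer(
        learning_rate=0.001,
        momentum=0.9,
        multi_precision=True,   # keep an FP32 master copy for FP16 parameters
        rescale_grad=1.0)       # often chosen as 1.0 / batch_size
    opt.minimize(loss)

place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
exe.run(main_prog,
        feed={'x': np.random.rand(4, 13).astype('float32'),
              'y': np.random.rand(4, 1).astype('float32')},
        fetch_list=[loss])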
python/paddle/fluid/tests/unittests/test_momentum_op.py

@@ -134,6 +134,64 @@ class TestMomentumOp2(OpTest):
         self.check_output()


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestLarsMomentumOpWithMP(OpTest):
+    def setUp(self):
+        self.op_type = "lars_momentum"
+        master_param = np.random.random((123, 321)).astype("float32")
+        param = master_param.astype("float16")
+        grad = np.random.random((123, 321)).astype("float16")
+        velocity = np.zeros((123, 321)).astype("float32")
+        learning_rate = np.array([0.001]).astype("float32")
+        mu = 0.0001
+        lars_coeff = 0.001
+        lars_weight_decay = 0.0005
+        rescale_grad = 1.0
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Velocity': velocity,
+            'LearningRate': learning_rate,
+            'MasterParam': master_param,
+        }
+
+        self.attrs = {
+            'mu': mu,
+            'lars_coeff': lars_coeff,
+            'lars_weight_decay': lars_weight_decay,
+            'multi_precision': True,
+            'rescale_grad': rescale_grad
+        }
+
+        fp32_grad = grad.astype("float32")
+        pnorm = np.sqrt(np.square(master_param).sum())
+        gnorm = np.sqrt(np.square(fp32_grad).sum())
+        local_lr = learning_rate * lars_coeff * pnorm / (
+            gnorm + lars_weight_decay * pnorm)
+        fp32_grad = fp32_grad * rescale_grad
+        velocity_out = mu * velocity + local_lr * (
+            fp32_grad + lars_weight_decay * master_param)
+        p_new = master_param - velocity_out
+        param_out = p_new.astype("float16")
+        master_param_out = p_new
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'VelocityOut': velocity_out,
+            'MasterParamOut': master_param_out
+        }
+
+    def test_check_output(self):
+        paddle.enable_static()
+        if core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place)
+
+
 class TestLarsMomentumOp(OpTest):
     def setUp(self):
         self.op_type = "lars_momentum"