Commit 4d805e6a (unverified)
Authored on June 03, 2021 by Yuang Liu; committed via GitHub on June 03, 2021.
multi pricison for lars op and lars optimizer (#33280)
Parent: fc5b3a99

Showing 6 changed files with 271 additions and 55 deletions (+271, -55):
paddle/fluid/operators/optimizers/lars_momentum_op.cc                 +14   -0
paddle/fluid/operators/optimizers/lars_momentum_op.cu                 +89  -30
paddle/fluid/operators/optimizers/momentum_op.h                        +3   -0
python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py  +15   -7
python/paddle/fluid/optimizer.py                                      +92  -18
python/paddle/fluid/tests/unittests/test_momentum_op.py               +58   -0
paddle/fluid/operators/optimizers/lars_momentum_op.cc  (+14, -0; mode changed 100755 → 100644)

@@ -34,6 +34,7 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("LearningRate",
              "(LoDTensor, default LoDTensor<float>) "
              "Input learning rate");
+    AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
     AddOutput("ParamOut",
               "(LoDTensor) This output is updated parameter. "
@@ -41,6 +42,10 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("VelocityOut",
               "(LoDTensor) This output is updated velocity. "
               "It shared memory with Input(Velocity).");
+    AddOutput("MasterParamOut",
+              "The updated FP32 master weight for AMP. "
+              "It shared memory with Input(MasterParam).")
+        .AsDispensable();
     AddAttr<float>("mu", "(float) Momentum coefficient");
     AddAttr<float>("lars_coeff", "(float, default 0.001) LARS coefficient.")
@@ -51,6 +56,15 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<float>("epsilon",
                    "(float, default 0.0) epsilon to avoid Division by Zero.")
         .SetDefault(0.0);
+    AddAttr<bool>("multi_precision",
+                  "(bool, default false) "
+                  "Whether to use multi-precision during weight updating.")
+        .SetDefault(false);
+    AddAttr<float>("rescale_grad",
+                   "(float, default 1.0) Multiply the gradient with `rescale_grad`"
+                   "before updating. Often choose to be `1.0/batch_size`.")
+        .SetDefault(1.0f);
     AddComment(R"DOC(
 Lars Momentum Optimizer.
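For context, `lars_coeff`, `lars_weight_decay`, and `epsilon` enter the LARS local learning rate that the CUDA kernel below computes per parameter tensor:

    local_lr = learning_rate * lars_coeff * ||param|| /
               (||grad|| + lars_weight_decay * ||param|| + epsilon)

and this rule is only applied when `lars_weight_decay`, `||param||`, and `||grad||` are all positive; otherwise the base learning rate is used unchanged.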
paddle/fluid/operators/optimizers/lars_momentum_op.cu  (+89, -30)

@@ -13,36 +13,64 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
 
 namespace paddle {
 namespace operators {
 
 template <typename T>
-__global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
-                                   const T* learning_rate, const T mu,
-                                   const int64_t num, const T lars_coeff,
-                                   const T lars_weight_decay, const T* p_norm,
-                                   const T* g_norm, T* p_out, T* v_out,
-                                   const T epsilon) {
-  T lr = learning_rate[0];
-  T local_lr = learning_rate[0];
-  CUDA_KERNEL_LOOP(i, num) {
-    if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) {
-      local_lr = lr * lars_coeff * p_norm[0] /
-                 (g_norm[0] + lars_weight_decay * p_norm[0] + epsilon);
-    }
-    T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
-    v_out[i] = v_new;
-    p_out[i] = p[i] - v_new;
+using MultiPrecisionType = typename details::MPTypeTrait<T>::Type;
+
+template <typename T, typename MT>
+__global__ void MomentumLarsKernel(
+    const T* p, const T* g, const MT* v,
+    const MultiPrecisionType<T>* learning_rate, const MT mu, const int64_t num,
+    const MT lars_coeff, const MT lars_weight_decay,
+    const MultiPrecisionType<T>* p_norm, const MultiPrecisionType<T>* g_norm,
+    T* p_out, MT* v_out, const MT epsilon, const MT* master_p,
+    MT* master_p_out, const MultiPrecisionType<T> rescale_grad) {
+  const MT lr = static_cast<MT>(learning_rate[0]);
+  MT local_lr = lr;
+  const MT p_n = static_cast<MT>(p_norm[0]);
+  const MT g_n = static_cast<MT>(g_norm[0]);
+
+  if (lars_weight_decay > static_cast<MT>(0) && p_n > static_cast<MT>(0) &&
+      g_n > static_cast<MT>(0)) {
+    local_lr =
+        lr * lars_coeff * p_n / (g_n + lars_weight_decay * p_n + epsilon);
+  }
+  CUDA_KERNEL_LOOP(i, num) {
+    MT grad = static_cast<MT>(g[i]) * static_cast<MT>(rescale_grad);
+    MT param = master_p ? master_p[i] : static_cast<MT>(p[i]);
+
+    MT v_new = v[i] * mu + local_lr * (grad + lars_weight_decay * param);
+    MT p_new = param - v_new;
+
+    v_out[i] = v_new;
+    p_out[i] = static_cast<T>(p_new);
+    if (master_p_out) master_p_out[i] = p_new;
   }
 }
 
 template <typename DeviceContext, typename T>
 class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
+  using MPDType = MultiPrecisionType<T>;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const bool multi_precision = ctx.Attr<bool>("multi_precision");
+    if (multi_precision) {
+      InnerCompute<MPDType>(ctx, multi_precision);
+    } else {
+      InnerCompute<T>(ctx, multi_precision);
+    }
+  }
+
+ private:
+  template <typename MT>
+  void InnerCompute(const framework::ExecutionContext& ctx,
+                    const bool multi_precision) const {
     auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
     auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
     auto param = ctx.Input<framework::LoDTensor>("Param");
@@ -50,18 +78,40 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
     auto grad = ctx.Input<framework::LoDTensor>("Grad");
     auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
 
+    const framework::Tensor* master_param = nullptr;
+    framework::Tensor* master_param_out = nullptr;
+    if (multi_precision) {
+      bool has_master =
+          ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut");
+      PADDLE_ENFORCE_EQ(has_master, true,
+                        platform::errors::InvalidArgument(
+                            "The Input(MasterParam) and Output(MasterParamOut) "
+                            "should not be null when "
+                            "the attr `multi_precision` is true"));
+      master_param = ctx.Input<framework::Tensor>("MasterParam");
+      master_param_out = ctx.Output<framework::Tensor>("MasterParamOut");
+    }
+    const MT* master_p = multi_precision ? master_param->data<MT>() : nullptr;
+    MT* master_p_out = multi_precision
+                           ? master_param_out->mutable_data<MT>(ctx.GetPlace())
+                           : nullptr;
+
     T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
-    T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
+    MT* v_out = velocity_out->mutable_data<MT>(ctx.GetPlace());
 
-    T mu = static_cast<T>(ctx.Attr<float>("mu"));
-    T lars_coeff = ctx.Attr<float>("lars_coeff");
-    T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
-    T epsilon = ctx.Attr<float>("epsilon");
+    MT mu = static_cast<MT>(ctx.Attr<float>("mu"));
+    MT lars_coeff = static_cast<MT>(ctx.Attr<float>("lars_coeff"));
+    MT lars_weight_decay =
+        static_cast<MT>(ctx.Attr<float>("lars_weight_decay"));
+    MT epsilon = static_cast<MT>(ctx.Attr<float>("epsilon"));
+    MPDType rescale_grad =
+        static_cast<MPDType>(ctx.Attr<float>("rescale_grad"));
 
     auto* p = param->data<T>();
-    auto* v = velocity->data<T>();
     auto* g = grad->data<T>();
-    auto* lr = learning_rate->data<T>();
+    auto* v = velocity->data<MT>();
+    auto* lr = learning_rate->data<MPDType>();
 
     int block = 512;
     int grid = (param->numel() + block - 1) / block;
@@ -72,17 +122,24 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
     framework::Tensor p_norm_t, g_norm_t;
     p_norm_t.Resize({1});
     g_norm_t.Resize({1});
-    auto* p_norm_data = p_norm_t.mutable_data<T>(ctx.GetPlace());
-    auto* g_norm_data = g_norm_t.mutable_data<T>(ctx.GetPlace());
-    auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
-    auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
+    auto* p_norm_data = p_norm_t.mutable_data<MPDType>(ctx.GetPlace());
+    auto* g_norm_data = g_norm_t.mutable_data<MPDType>(ctx.GetPlace());
+    auto ep_norm = framework::EigenScalar<MPDType>::From(p_norm_t);
+    auto eg_norm = framework::EigenScalar<MPDType>::From(g_norm_t);
 
     auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-    ep_norm.device(*place) = eigen_p.square().sum().sqrt();
-    eg_norm.device(*place) = eigen_g.square().sum().sqrt();
-    MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
-        p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
-        p_norm_data, g_norm_data, p_out, v_out, epsilon);
+
+    // eigen unsupport fp16 l2-norm
+    ep_norm.device(*place) =
+        eigen_p.template cast<MPDType>().square().sum().sqrt();
+    eg_norm.device(*place) =
+        (eigen_g.template cast<MPDType>() * rescale_grad).square().sum().sqrt();
+
+    MomentumLarsKernel<
+        T, MT><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
+        p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
+        p_norm_data, g_norm_data, p_out, v_out, epsilon, master_p,
+        master_p_out, rescale_grad);
   }
 };
@@ -93,4 +150,6 @@ namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     lars_momentum,
     ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::float16>);
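As a cross-check, here is a minimal NumPy sketch of the per-element update the new kernel performs on the multi-precision path (FP16 param/grad with an FP32 master copy). The function and argument names are illustrative, not part of the diff:

    import numpy as np

    def lars_momentum_mp_step(param_fp16, grad_fp16, velocity, master_param,
                              lr, mu, lars_coeff, lars_weight_decay,
                              epsilon=0.0, rescale_grad=1.0):
        # Norms and arithmetic happen in FP32, mirroring MPDType in the kernel
        # ("eigen unsupport fp16 l2-norm").
        grad = grad_fp16.astype("float32") * rescale_grad
        p_norm = np.sqrt(np.square(master_param).sum())
        g_norm = np.sqrt(np.square(grad).sum())
        local_lr = lr
        if lars_weight_decay > 0 and p_norm > 0 and g_norm > 0:
            local_lr = lr * lars_coeff * p_norm / (
                g_norm + lars_weight_decay * p_norm + epsilon)
        velocity_out = velocity * mu + local_lr * (
            grad + lars_weight_decay * master_param)
        master_out = master_param - velocity_out
        # ParamOut is the FP16 cast of the FP32 master update.
        return master_out.astype("float16"), velocity_out, master_out

With rescale_grad=1.0 this reproduces the expected outputs constructed in the new TestLarsMomentumOpWithMP case further down.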
paddle/fluid/operators/optimizers/momentum_op.h  (+3, -0)

@@ -135,6 +135,9 @@ class MomentumOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dim);
     ctx->SetOutputDim("VelocityOut", param_dim);
+    if (ctx->HasOutput("MasterParamOut")) {
+      ctx->SetOutputDim("MasterParamOut", param_dim);
+    }
   }
 
   framework::OpKernelType GetExpectedKernelType(
python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py  (+15, -7)

@@ -73,7 +73,7 @@ def resnet_cifar10(input, depth=32):
     return pool
 
 
-def train(use_pure_fp16=True, use_nesterov=False, use_adam=False):
+def train(use_pure_fp16=True, use_nesterov=False, optimizer=""):
     classdim = 10
     data_shape = [3, 32, 32]
     BATCH_SIZE = 32
@@ -96,12 +96,17 @@ def train(use_pure_fp16=True, use_nesterov=False, use_adam=False):
     # Test program
     test_program = train_program.clone(for_test=True)
 
-    if use_adam:
+    if optimizer == "Adam":
         optimizer = paddle.optimizer.AdamW(
             learning_rate=0.001,
             epsilon=1e-8,
             weight_decay=0.0,
             multi_precision=True)
+    elif optimizer == "Lars":
+        optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer(
+            learning_rate=0.001,
+            momentum=0.9,
+            multi_precision=use_pure_fp16)
     else:
         optimizer = paddle.optimizer.Momentum(
             learning_rate=0.001,
@@ -169,9 +174,11 @@ class TestImageMultiPrecision(unittest.TestCase):
         if not fluid.core.is_compiled_with_cuda():
             return
 
-        def do_test(use_nesterov=False, use_adam=False):
-            if use_adam:
+        def do_test(use_nesterov=False, optimizer=""):
+            if optimizer == "Adam":
                 suffix = "use Adam"
+            elif optimizer == "Lars":
+                suffix = "use Lars"
             else:
                 suffix = "with Nesterov" if use_nesterov else "without Nesterov"
             with self.scope_prog_guard():
@@ -180,14 +187,14 @@ class TestImageMultiPrecision(unittest.TestCase):
                 train_loss_fp16, test_loss_fp16 = train(
                     use_pure_fp16=True,
                     use_nesterov=use_nesterov,
-                    use_adam=use_adam)
+                    optimizer=optimizer)
             with self.scope_prog_guard():
                 print("-----------------FP32 Train {}-----------------".format(
                     suffix))
                 train_loss_fp32, test_loss_fp32 = train(
                     use_pure_fp16=False,
                     use_nesterov=use_nesterov,
-                    use_adam=use_adam)
+                    optimizer=optimizer)
 
             self.assertTrue(
                 np.allclose(
@@ -208,7 +215,8 @@ class TestImageMultiPrecision(unittest.TestCase):
         do_test(use_nesterov=False)
         do_test(use_nesterov=True)
-        do_test(use_adam=True)
+        do_test(optimizer="Adam")
+        do_test(optimizer="Lars")
 
     @contextlib.contextmanager
     def scope_prog_guard(self):
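With the string-based selector, exercising the new LARS branch from this test module looks like the following sketch (it assumes the module's `train` helper above and a CUDA build of Paddle):

    # Compare an FP16 run against an FP32 run with the LARS optimizer selected by name.
    train_loss_fp16, test_loss_fp16 = train(
        use_pure_fp16=True, use_nesterov=False, optimizer="Lars")
    train_loss_fp32, test_loss_fp32 = train(
        use_pure_fp16=False, use_nesterov=False, optimizer="Lars")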
python/paddle/fluid/optimizer.py  (+92, -18)

@@ -1725,6 +1725,9 @@ class LarsMomentumOptimizer(Optimizer):
             For details, please refer to :ref:`api_guide_Name`. Default is None.
         exclude_from_weight_decay (list[str], optional): Name string of layers which will be exclude from lars weight decay. Default is None.
         epsilon (float, optional): Epsilon to avoid Division by Zero when calculate local lr. Default is 0.
+        multi_precision (bool, optional): Whether to use multi-precision during weight updating.
+        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` \
+            before updating. Often choose to be `1.0/batch_size`.
 
     Examples:
         .. code-block:: python
@@ -1758,7 +1761,9 @@ class LarsMomentumOptimizer(Optimizer):
                  grad_clip=None,
                  name=None,
                  exclude_from_weight_decay=None,
-                 epsilon=0):
+                 epsilon=0,
+                 multi_precision=False,
+                 rescale_grad=1.0):
         assert learning_rate is not None
         assert momentum is not None
         super(LarsMomentumOptimizer, self).__init__(
@@ -1776,16 +1781,70 @@ class LarsMomentumOptimizer(Optimizer):
             self._exclude_from_weight_decay = []
         else:
             self._exclude_from_weight_decay = exclude_from_weight_decay
+        self._multi_precision = multi_precision
+        self._rescale_grad = float(rescale_grad)
+        self._master_weights = {}
+
+    def _create_master_weight(self, param):
+        assert isinstance(self.helper, LayerHelper)
+
+        var_name = param.name + '_fp32_master'
+        var_name = unique_name.generate(var_name)
+        var = layers.create_global_var(
+            name=var_name,
+            shape=param.shape,
+            value=0,
+            dtype='float32',
+            persistable=True)
+        block = self.helper.startup_program.global_block()
+        block.append_op(
+            type="cast",
+            inputs={"X": [param]},
+            outputs={"Out": [var]},
+            attrs={
+                "in_dtype": param.dtype,
+                "out_dtype": core.VarDesc.VarType.FP32
+            })
+        self._master_weights[param.name] = var
+        return var
+
+    def _get_accumulator(self, name, param):
+        """Utility function to fetch an accumulator for a parameter
+
+        Args:
+            name: name of the accumulator
+            param: parameter variable for which accumulator is to be fetched
+
+        Returns:
+            accumulator variable for the parameter
+        """
+        if self._name is not None:
+            name = self._name + "_" + name
+        find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
+        target_param = self._master_weights[
+            param.name] if find_master else param
+        target_name = target_param.name
+        if (name not in self._accumulators or
+                target_name not in self._accumulators[name]):
+            raise Exception("Accumulator {} does not exist for parameter {}".
+                            format(name, target_name))
+        return self._accumulators[name][target_name]
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
 
         for p in parameters:
+            if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
+                master_p = self._create_master_weight(p)
+                self._add_accumulator(self._velocity_acc_str, master_p)
+                continue
+            if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
+                warnings.warn(
+                    "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
+                    "Consider using multi_precision=True option of the Lars optimizer."
+                )
             self._add_accumulator(self._velocity_acc_str, p)
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
         _lars_weight_decay = self._lars_weight_decay
         param_name = param_and_grad[0].name
         if len(self._exclude_from_weight_decay) > 0:
@@ -1796,25 +1855,40 @@ class LarsMomentumOptimizer(Optimizer):
         velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                              param_and_grad[0])
-        # create the momentum optimize op
-        momentum_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "Velocity": velocity_acc,
-                "LearningRate": self._create_param_lr(param_and_grad)
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "VelocityOut": velocity_acc
-            },
-            attrs={
-                "mu": self._momentum,
-                "lars_coeff": self._lars_coeff,
-                "lars_weight_decay": _lars_weight_decay,
-                "epsilon": self._epsilon
-            },
+        lr = self._create_param_lr(param_and_grad)
+
+        find_master = self._multi_precision and param_and_grad[
+            0].dtype == core.VarDesc.VarType.FP16
+        master_weight = (self._master_weights[param_and_grad[0].name]
+                         if find_master else None)
+
+        attrs = {
+            "mu": self._momentum,
+            "lars_coeff": self._lars_coeff,
+            "lars_weight_decay": _lars_weight_decay,
+            "multi_precision": find_master,
+            "rescale_grad": self._rescale_grad
+        }
+
+        inputs = {
+            "Param": param_and_grad[0],
+            "Grad": param_and_grad[1],
+            "Velocity": velocity_acc,
+            "LearningRate": lr
+        }
+
+        outputs = {"ParamOut": param_and_grad[0], "VelocityOut": velocity_acc}
+
+        if find_master:
+            inputs["MasterParam"] = master_weight
+            outputs["MasterParamOut"] = master_weight
+
+        # create the momentum optimize op
+        momentum_op = block.append_op(
+            type=self.type,
+            inputs=inputs,
+            outputs=outputs,
+            attrs=attrs,
             stop_gradient=True)
 
         return momentum_op
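A minimal usage sketch of the two new constructor arguments in static-graph mode follows. The toy network and numbers are illustrative assumptions, not taken from the diff; master weights only come into play for FP16 parameters, e.g. in an AMP/pure-FP16 program like the test above:

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    main_prog, startup_prog = fluid.Program(), fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        x = fluid.data(name="x", shape=[None, 32], dtype="float32")
        y = fluid.data(name="y", shape=[None, 1], dtype="float32")
        pred = fluid.layers.fc(input=x, size=1)
        loss = fluid.layers.reduce_mean(fluid.layers.square(pred - y))

        optimizer = fluid.optimizer.LarsMomentumOptimizer(
            learning_rate=0.001,
            momentum=0.9,
            lars_weight_decay=0.0005,
            multi_precision=True,   # keep FP32 master weights for FP16 params
            rescale_grad=1.0 / 32)  # often 1.0 / batch_size
        optimizer.minimize(loss)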
python/paddle/fluid/tests/unittests/test_momentum_op.py  (+58, -0)

@@ -134,6 +134,64 @@ class TestMomentumOp2(OpTest):
         self.check_output()
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestLarsMomentumOpWithMP(OpTest):
+    def setUp(self):
+        self.op_type = "lars_momentum"
+        master_param = np.random.random((123, 321)).astype("float32")
+        param = master_param.astype("float16")
+        grad = np.random.random((123, 321)).astype("float16")
+        velocity = np.zeros((123, 321)).astype("float32")
+        learning_rate = np.array([0.001]).astype("float32")
+        mu = 0.0001
+        lars_coeff = 0.001
+        lars_weight_decay = 0.0005
+        rescale_grad = 1.0
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Velocity': velocity,
+            'LearningRate': learning_rate,
+            'MasterParam': master_param,
+        }
+
+        self.attrs = {
+            'mu': mu,
+            'lars_coeff': lars_coeff,
+            'lars_weight_decay': lars_weight_decay,
+            'multi_precision': True,
+            'rescale_grad': rescale_grad
+        }
+
+        fp32_grad = grad.astype("float32")
+        pnorm = np.sqrt(np.square(master_param).sum())
+        gnorm = np.sqrt(np.square(fp32_grad).sum())
+        local_lr = learning_rate * lars_coeff * pnorm / (
+            gnorm + lars_weight_decay * pnorm)
+        fp32_grad = fp32_grad * rescale_grad
+        velocity_out = mu * velocity + local_lr * (
+            fp32_grad + lars_weight_decay * master_param)
+        p_new = master_param - velocity_out
+        param_out = p_new.astype("float16")
+        master_param_out = p_new
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'VelocityOut': velocity_out,
+            'MasterParamOut': master_param_out
+        }
+
+    def test_check_output(self):
+        paddle.enable_static()
+        if core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place)
+
+
 class TestLarsMomentumOp(OpTest):
     def setUp(self):
         self.op_type = "lars_momentum"
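The new case can be run on its own with the standard unittest runner; a hypothetical driver, assuming it is executed from python/paddle/fluid/tests/unittests against a CUDA-enabled build:

    import unittest
    from test_momentum_op import TestLarsMomentumOpWithMP

    suite = unittest.TestLoader().loadTestsFromTestCase(TestLarsMomentumOpWithMP)
    unittest.TextTestRunner(verbosity=2).run(suite)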