Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
54003b87
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
54003b87
编写于
8月 12, 2020
作者:
J
JZ-LIANG
提交者:
GitHub
8月 12, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
【paddle.fleet】add lamb to fleet meta optimizer (#26025)
add lamb to fleet meta optimizer
上级
1be6bf45
变更
8
显示空白变更内容
内联
并排
Showing
8 changed files
with
292 additions
and
45 deletions
+292
-45
paddle/fluid/framework/distributed_strategy.proto
paddle/fluid/framework/distributed_strategy.proto
+2
-3
python/paddle/fleet/base/distributed_strategy.py
python/paddle/fleet/base/distributed_strategy.py
+9
-0
python/paddle/fleet/meta_optimizers/__init__.py
python/paddle/fleet/meta_optimizers/__init__.py
+2
-0
python/paddle/fleet/meta_optimizers/lamb_optimizer.py
python/paddle/fleet/meta_optimizers/lamb_optimizer.py
+99
-0
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+2
-0
python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
...le/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
+36
-22
python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
...e/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
+108
-0
python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
...e/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
+34
-20
未找到文件。
paddle/fluid/framework/distributed_strategy.proto
浏览文件 @
54003b87
...
...
@@ -55,9 +55,8 @@ message LarsConfig {
}
message
LambConfig
{
optional
float
beta1
=
1
[
default
=
0.001
];
optional
float
beta2
=
2
[
default
=
0.999
];
optional
float
epsilon
=
3
[
default
=
0.000001
];
optional
float
lamb_weight_decay
=
1
[
default
=
0.01
];
repeated
string
exclude_from_weight_decay
=
2
;
}
message
BuildStrategy
{
...
...
python/paddle/fleet/base/distributed_strategy.py
100644 → 100755
浏览文件 @
54003b87
...
...
@@ -627,6 +627,15 @@ class DistributedStrategy(object):
else
:
print
(
"WARNING: lamb should have value of bool type"
)
@property
def lamb_configs(self):
    """Return the strategy's ``lamb_configs`` message converted by ``get_msg_dict``."""
    lamb_msg = self.strategy.lamb_configs
    return get_msg_dict(lamb_msg)

@lamb_configs.setter
def lamb_configs(self, configs):
    """Check ``configs`` against the ``lamb_configs`` message, then assign it.

    Args:
        configs: the new settings; passed first to ``check_configs_key`` and
            then to ``assign_configs_value`` together with the message.
    """
    lamb_msg = self.strategy.lamb_configs
    # Check first so nothing is assigned if check_configs_key raises.
    check_configs_key(lamb_msg, configs, "lamb_configs")
    assign_configs_value(lamb_msg, configs)
@
property
def
elastic
(
self
):
return
self
.
strategy
.
elastic
...
...
python/paddle/fleet/meta_optimizers/__init__.py
浏览文件 @
54003b87
...
...
@@ -21,6 +21,7 @@ from .localsgd_optimizer import LocalSGDOptimizer
from
.lars_optimizer
import
LarsOptimizer
from
.async_graph_execution_optimizer
import
AsyncGraphExecutionOptimizer
from
.dgc_optimizer
import
DGCOptimizer
from
.lamb_optimizer
import
LambOptimizer
__all__
=
[
'AMPOptimizer'
,
...
...
@@ -33,4 +34,5 @@ __all__ = [
'LarsOptimizer'
,
'AsyncGraphExecutionOptimizer'
,
'DGCOptimizer'
,
'LambOptimizer'
,
]
python/paddle/fleet/meta_optimizers/lamb_optimizer.py
0 → 100755
浏览文件 @
54003b87
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
paddle.fluid.optimizer
import
AdamOptimizer
from
paddle.fluid.optimizer
import
LambOptimizer
as
LAMB
from
.meta_optimizer_base
import
MetaOptimizerBase
import
logging
__all__
=
[
"LambOptimizer"
]
class LambOptimizer(MetaOptimizerBase):
    """Meta optimizer that replaces a user-supplied Adam optimizer with LAMB.

    When the strategy's ``lamb`` flag is set and the wrapped optimizer is an
    ``AdamOptimizer``, a fluid ``LambOptimizer`` (imported here as ``LAMB``)
    is built from the Adam optimizer's hyper-parameters together with
    ``strategy.lamb_configs``. ``backward`` and ``minimize_impl`` then
    delegate to that LAMB instance.
    """

    def __init__(self, optimizer):
        """Wrap ``optimizer``; the LAMB instance is created later.

        Args:
            optimizer: the user-defined inner optimizer.
        """
        super(LambOptimizer, self).__init__(optimizer)
        self.inner_opt = optimizer
        # Built in _set_basic_info; stays None when inner_opt is not Adam.
        self.lamb_opt = None
        # we do not allow meta optimizer to be inner optimizer currently
        self.meta_optimizers_white_list = []

    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                        user_defined_strategy):
        """Forward the basic info to the base class and build ``self.lamb_opt``.

        Nothing is built when the inner optimizer is not an ``AdamOptimizer``;
        ``self.lamb_opt`` then remains ``None``.
        """
        super(LambOptimizer, self)._set_basic_info(
            loss, role_maker, user_defined_optimizer, user_defined_strategy)

        opt = self.inner_opt
        if not isinstance(opt, AdamOptimizer):
            return

        configs = self.user_defined_strategy.lamb_configs

        # Snapshot the suffix list once; str.endswith accepts a tuple, so the
        # predicate needs no explicit loop.
        exclude_suffixes = tuple(configs['exclude_from_weight_decay'])
        if exclude_suffixes:

            def _exclude_from_weight_decay_fn(param):
                # True when the parameter name ends with any listed suffix.
                return param.name.endswith(exclude_suffixes)
        else:
            _exclude_from_weight_decay_fn = None

        # Reuse Adam's hyper-parameters; only the weight-decay settings come
        # from the strategy's lamb_configs.
        self.lamb_opt = LAMB(
            learning_rate=opt._learning_rate,
            lamb_weight_decay=configs['lamb_weight_decay'],
            beta1=opt._beta1,
            beta2=opt._beta2,
            epsilon=opt._epsilon,
            parameter_list=opt._parameter_list,
            regularization=opt.regularization,
            grad_clip=opt._grad_clip,
            exclude_from_weight_decay_fn=_exclude_from_weight_decay_fn,
            name=opt._name)

    def _can_apply(self):
        """Return True only if ``lamb`` is enabled and the inner optimizer is Adam.

        A warning is logged when ``lamb`` is enabled but the inner optimizer
        is of another type.
        """
        if not self.user_defined_strategy.lamb:
            return False

        if not isinstance(self.inner_opt, AdamOptimizer):
            # logging.warn is a deprecated alias; use logging.warning.
            logging.warning(
                "lamb need the inner optimizer to be AdamOptimizer optimizer but got {}.".
                format(self.inner_opt.type))
            return False

        return True

    def _disable_strategy(self, dist_strategy):
        """Turn ``lamb`` off on ``dist_strategy`` and reset ``lamb_configs``."""
        dist_strategy.lamb = False
        dist_strategy.lamb_configs = {
            'lamb_weight_decay': 0.01,
            'exclude_from_weight_decay': [],
        }

    def backward(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None,
                 callbacks=None):
        """Delegate ``backward`` to the LAMB optimizer built in ``_set_basic_info``."""
        return self.lamb_opt.backward(loss, startup_program, parameter_list,
                                      no_grad_set, callbacks)

    def minimize_impl(self,
                      loss,
                      startup_program=None,
                      parameter_list=None,
                      no_grad_set=None):
        """Delegate ``minimize`` to the LAMB optimizer.

        Returns:
            The ``(optimize_ops, params_grads)`` pair returned by
            ``self.lamb_opt.minimize``.
        """
        optimize_ops, params_grads = self.lamb_opt.minimize(
            loss, startup_program, parameter_list, no_grad_set)
        return optimize_ops, params_grads
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
54003b87
...
...
@@ -40,6 +40,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_private_function
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor
)
...
...
@@ -386,6 +387,7 @@ if(WITH_DISTRIBUTE)
if
(
NOT WIN32
)
py_test_modules
(
test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS
${
dist_ENVS
}
)
endif
(
NOT WIN32
)
endif
(
NOT APPLE
)
if
(
WITH_DGC
)
...
...
python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
100644 → 100755
浏览文件 @
54003b87
...
...
@@ -14,6 +14,7 @@
import
unittest
import
paddle
from
paddle
import
fluid
import
os
import
paddle.fleet
as
fleet
import
paddle.fluid.incubate.fleet.base.role_maker
as
role_maker
...
...
@@ -25,16 +26,23 @@ class TestFleetDGCOptimizer(unittest.TestCase):
os
.
environ
[
"PADDLE_TRAINER_ENDPOINTS"
]
=
"127.0.0.1:36001,127.0.0.1:36002"
def
net
(
self
):
def
net
(
self
,
main_prog
,
startup_prog
):
with
fluid
.
program_guard
(
main_prog
,
startup_prog
):
with
fluid
.
unique_name
.
guard
():
role
=
role_maker
.
PaddleCloudRoleMaker
(
is_collective
=
True
)
fleet
.
init
(
role
)
input_x
=
paddle
.
fluid
.
layers
.
data
(
name
=
"x"
,
shape
=
[
32
],
dtype
=
'float32'
)
input_y
=
paddle
.
fluid
.
layers
.
data
(
name
=
"y"
,
shape
=
[
1
],
dtype
=
'int64'
)
input_y
=
paddle
.
fluid
.
layers
.
data
(
name
=
"y"
,
shape
=
[
1
],
dtype
=
'int64'
)
fc_1
=
paddle
.
fluid
.
layers
.
fc
(
input
=
input_x
,
size
=
64
,
act
=
'tanh'
)
fc_1
=
paddle
.
fluid
.
layers
.
fc
(
input
=
input_x
,
size
=
64
,
act
=
'tanh'
)
fc_2
=
paddle
.
fluid
.
layers
.
fc
(
input
=
fc_1
,
size
=
256
,
act
=
'tanh'
)
prediction
=
paddle
.
fluid
.
layers
.
fc
(
input
=
[
fc_2
],
size
=
2
,
act
=
'softmax'
)
prediction
=
paddle
.
fluid
.
layers
.
fc
(
input
=
[
fc_2
],
size
=
2
,
act
=
'softmax'
)
cost
=
paddle
.
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
input_y
)
avg_cost
=
paddle
.
fluid
.
layers
.
mean
(
x
=
cost
)
...
...
@@ -49,7 +57,9 @@ class TestFleetDGCOptimizer(unittest.TestCase):
return
avg_cost
,
strategy
def
test_dgc_optimizer
(
self
):
avg_cost
,
strategy
=
self
.
net
()
startup_prog
=
fluid
.
Program
()
train_prog
=
fluid
.
Program
()
avg_cost
,
strategy
=
self
.
net
(
train_prog
,
startup_prog
)
optimizer
=
paddle
.
optimizer
.
Momentum
(
learning_rate
=
0.01
,
momentum
=
0.9
)
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
strategy
)
optimizer
.
minimize
(
avg_cost
)
...
...
@@ -59,7 +69,9 @@ class TestFleetDGCOptimizer(unittest.TestCase):
self
.
assertIn
(
'dgc_momentum'
,
ops
)
def
test_dgc_not_apply_with_adam
(
self
):
avg_cost
,
strategy
=
self
.
net
()
startup_prog
=
fluid
.
Program
()
train_prog
=
fluid
.
Program
()
avg_cost
,
strategy
=
self
.
net
(
train_prog
,
startup_prog
)
optimizer
=
paddle
.
optimizer
.
Adam
(
learning_rate
=
0.01
)
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
strategy
)
optimizer
.
minimize
(
avg_cost
)
...
...
@@ -72,7 +84,9 @@ class TestFleetDGCOptimizer(unittest.TestCase):
os
.
environ
[
"PADDLE_TRAINER_ID"
]
=
"0"
os
.
environ
[
"PADDLE_TRAINER_ENDPOINTS"
]
=
"127.0.0.1:36001"
avg_cost
,
strategy
=
self
.
net
()
startup_prog
=
fluid
.
Program
()
train_prog
=
fluid
.
Program
()
avg_cost
,
strategy
=
self
.
net
(
train_prog
,
startup_prog
)
optimizer
=
paddle
.
optimizer
.
Momentum
(
learning_rate
=
0.01
,
momentum
=
0.9
)
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
strategy
)
optimizer
.
minimize
(
avg_cost
)
...
...
python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
0 → 100755
浏览文件 @
54003b87
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
paddle
from
paddle
import
fluid
import
os
import
paddle.fleet
as
fleet
import
paddle.fluid.incubate.fleet.base.role_maker
as
role_maker
class TestFleetLambMetaOptimizer(unittest.TestCase):
    """Checks that the fleet LAMB meta optimizer is applied only on top of Adam."""

    def setUp(self):
        """Set the environment variables used by these tests."""
        os.environ.update({
            "POD_IP": "127.0.0.1",
            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001",
            "PADDLE_TRAINERS_NUM": "2",
            "PADDLE_PSERVERS_IP_PORT_LIST":
            "127.0.0.1:36001,127.0.0.2:36001",
        })

    def net(self, main_prog, startup_prog):
        """Build a three-layer FC classifier in ``main_prog``.

        Returns:
            ``(avg_cost, strategy)`` where ``strategy`` has ``lamb`` enabled
            and ``lamb_configs`` set with an empty exclude list.
        """
        layers = paddle.fluid.layers
        with fluid.program_guard(main_prog, startup_prog):
            with fluid.unique_name.guard():
                input_x = layers.data(name="x", shape=[32], dtype='float32')
                input_y = layers.data(name="y", shape=[1], dtype='int64')

                hidden_1 = layers.fc(input=input_x, size=64, act='tanh')
                hidden_2 = layers.fc(input=hidden_1, size=256, act='tanh')
                prediction = layers.fc(input=[hidden_2],
                                       size=2,
                                       act='softmax')
                cost = layers.cross_entropy(input=prediction, label=input_y)
                avg_cost = layers.mean(x=cost)

                strategy = paddle.fleet.DistributedStrategy()
                strategy.lamb = True
                strategy.lamb_configs = {
                    'lamb_weight_decay': 0.01,
                    'exclude_from_weight_decay': [],
                }

        return avg_cost, strategy

    def _minimize(self, make_optimizer, lamb_configs=None):
        """Init fleet, build the net, minimize, and return the block's ops.

        Args:
            make_optimizer: zero-argument callable returning the inner optimizer.
            lamb_configs: optional replacement for ``strategy.lamb_configs``,
                applied before the optimizer is wrapped.
        """
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        startup_prog = fluid.Program()
        train_prog = fluid.Program()
        avg_cost, strategy = self.net(train_prog, startup_prog)

        inner_optimizer = make_optimizer()
        if lamb_configs is not None:
            strategy.lamb_configs = lamb_configs

        dist_optimizer = fleet.distributed_optimizer(
            inner_optimizer, strategy=strategy)
        dist_optimizer.minimize(avg_cost)
        return avg_cost.block.ops

    def test_lamb_optimizer(self):
        """With Adam as the inner optimizer, a ``lamb`` op is in the block."""
        block_ops = self._minimize(
            lambda: paddle.optimizer.Adam(learning_rate=0.01))
        op_types = [op.type for op in block_ops]
        self.assertIn('lamb', op_types)

    def test_lamb_not_apply_with_momentum(self):
        """With Momentum as the inner optimizer, no ``lamb`` op is in the block."""
        block_ops = self._minimize(
            lambda: paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9))
        op_types = [op.type for op in block_ops]
        self.assertNotIn('lamb', op_types)

    def test_lamb_exclude_fn(self):
        """Lamb ops whose first ``op_role_var`` ends in ``.b_0`` have weight_decay 0."""
        block_ops = self._minimize(
            lambda: paddle.optimizer.Adam(learning_rate=0.01),
            lamb_configs={
                'lamb_weight_decay': 0.01,
                'exclude_from_weight_decay': ['.b_0'],
            })

        ops_with_bias = []
        for op in block_ops:
            if op.type != 'lamb':
                continue
            if op.attr('op_role_var')[0].endswith('.b_0'):
                ops_with_bias.append(op)

        for op in ops_with_bias:
            self.assertEqual(op.attr('weight_decay'), 0)
# Run the test cases when this module is executed directly.
if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
浏览文件 @
54003b87
...
...
@@ -14,6 +14,7 @@
import
unittest
import
paddle
from
paddle
import
fluid
import
os
import
paddle.fleet
as
fleet
import
paddle.fluid.incubate.fleet.base.role_maker
as
role_maker
...
...
@@ -27,16 +28,21 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
os
.
environ
[
"PADDLE_PSERVERS_IP_PORT_LIST"
]
=
\
"127.0.0.1:36001,127.0.0.2:36001"
def
net
(
self
):
role
=
role_maker
.
PaddleCloudRoleMaker
(
is_collective
=
True
)
fleet
.
init
(
role
)
def
net
(
self
,
main_prog
,
startup_prog
):
with
fluid
.
program_guard
(
main_prog
,
startup_prog
):
with
fluid
.
unique_name
.
guard
():
input_x
=
paddle
.
fluid
.
layers
.
data
(
name
=
"x"
,
shape
=
[
32
],
dtype
=
'float32'
)
input_y
=
paddle
.
fluid
.
layers
.
data
(
name
=
"y"
,
shape
=
[
1
],
dtype
=
'int64'
)
input_y
=
paddle
.
fluid
.
layers
.
data
(
name
=
"y"
,
shape
=
[
1
],
dtype
=
'int64'
)
fc_1
=
paddle
.
fluid
.
layers
.
fc
(
input
=
input_x
,
size
=
64
,
act
=
'tanh'
)
fc_1
=
paddle
.
fluid
.
layers
.
fc
(
input
=
input_x
,
size
=
64
,
act
=
'tanh'
)
fc_2
=
paddle
.
fluid
.
layers
.
fc
(
input
=
fc_1
,
size
=
256
,
act
=
'tanh'
)
prediction
=
paddle
.
fluid
.
layers
.
fc
(
input
=
[
fc_2
],
size
=
2
,
act
=
'softmax'
)
prediction
=
paddle
.
fluid
.
layers
.
fc
(
input
=
[
fc_2
],
size
=
2
,
act
=
'softmax'
)
cost
=
paddle
.
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
input_y
)
avg_cost
=
paddle
.
fluid
.
layers
.
mean
(
x
=
cost
)
...
...
@@ -51,7 +57,11 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
return
avg_cost
,
strategy
def
test_lars_optimizer
(
self
):
avg_cost
,
strategy
=
self
.
net
()
role
=
role_maker
.
PaddleCloudRoleMaker
(
is_collective
=
True
)
fleet
.
init
(
role
)
startup_prog
=
fluid
.
Program
()
train_prog
=
fluid
.
Program
()
avg_cost
,
strategy
=
self
.
net
(
train_prog
,
startup_prog
)
optimizer
=
paddle
.
optimizer
.
Momentum
(
learning_rate
=
0.01
,
momentum
=
0.9
)
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
strategy
)
optimizer
.
minimize
(
avg_cost
)
...
...
@@ -60,7 +70,11 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
self
.
assertIn
(
'lars_momentum'
,
ops
)
def
test_lars_not_apply_with_adam
(
self
):
avg_cost
,
strategy
=
self
.
net
()
role
=
role_maker
.
PaddleCloudRoleMaker
(
is_collective
=
True
)
fleet
.
init
(
role
)
startup_prog
=
fluid
.
Program
()
train_prog
=
fluid
.
Program
()
avg_cost
,
strategy
=
self
.
net
(
train_prog
,
startup_prog
)
optimizer
=
paddle
.
optimizer
.
Adam
(
learning_rate
=
0.01
)
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
strategy
)
optimizer
.
minimize
(
avg_cost
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录