Unverified · Commit 54003b87 · authored by JZ-LIANG, committed by GitHub

【paddle.fleet】add lamb to fleet meta optimizer (#26025)

add lamb to fleet meta optimizer
Parent 1be6bf45
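For orientation, the sketch below shows roughly how the feature added in this commit is exercised from user code. It is adapted from the new test_fleet_lamb_meta_optimizer.py further down; the network and loss (avg_cost) are left as a placeholder, and it assumes the PaddleCloud environment variables set in the tests are available.

import paddle
import paddle.fleet as fleet
import paddle.fluid.incubate.fleet.base.role_maker as role_maker

role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)

# ... build a network and a scalar loss `avg_cost` here (see the tests below) ...

strategy = paddle.fleet.DistributedStrategy()
strategy.lamb = True
strategy.lamb_configs = {
    'lamb_weight_decay': 0.01,
    'exclude_from_weight_decay': ['.b_0'],  # skip weight decay for bias parameters
}

# The LAMB meta optimizer only takes effect when the inner optimizer is Adam.
optimizer = paddle.optimizer.Adam(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
# optimizer.minimize(avg_cost)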
@@ -55,9 +55,8 @@ message LarsConfig {
 }

 message LambConfig {
-  optional float beta1 = 1 [ default = 0.001 ];
-  optional float beta2 = 2 [ default = 0.999 ];
-  optional float epsilon = 3 [ default = 0.000001 ];
+  optional float lamb_weight_decay = 1 [ default = 0.01 ];
+  repeated string exclude_from_weight_decay = 2;
 }

 message BuildStrategy {
......
@@ -627,6 +627,15 @@ class DistributedStrategy(object):
         else:
             print("WARNING: lamb should have value of bool type")

+    @property
+    def lamb_configs(self):
+        return get_msg_dict(self.strategy.lamb_configs)
+
+    @lamb_configs.setter
+    def lamb_configs(self, configs):
+        check_configs_key(self.strategy.lamb_configs, configs, "lamb_configs")
+        assign_configs_value(self.strategy.lamb_configs, configs)
+
     @property
     def elastic(self):
         return self.strategy.elastic
......
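For context (not part of the diff), a brief sketch of how these accessors behave from user code; the default values shown are an assumption that mirrors the LambConfig proto defaults above.

import paddle.fleet as fleet

strategy = fleet.DistributedStrategy()

# The getter converts the underlying LambConfig message into a plain dict; with
# nothing set it should reflect the proto defaults above, roughly
# {'lamb_weight_decay': 0.01, 'exclude_from_weight_decay': []}.
print(strategy.lamb_configs)

# The setter checks the supplied keys against the LambConfig fields before
# assigning them, so an unknown key is flagged rather than silently dropped.
strategy.lamb_configs = {
    'lamb_weight_decay': 0.01,
    'exclude_from_weight_decay': ['.b_0'],
}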
@@ -21,6 +21,7 @@ from .localsgd_optimizer import LocalSGDOptimizer
 from .lars_optimizer import LarsOptimizer
 from .async_graph_execution_optimizer import AsyncGraphExecutionOptimizer
 from .dgc_optimizer import DGCOptimizer
+from .lamb_optimizer import LambOptimizer

 __all__ = [
     'AMPOptimizer',
@@ -33,4 +34,5 @@ __all__ = [
     'LarsOptimizer',
     'AsyncGraphExecutionOptimizer',
     'DGCOptimizer',
+    'LambOptimizer',
 ]
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.optimizer import LambOptimizer as LAMB
from .meta_optimizer_base import MetaOptimizerBase
import logging

__all__ = ["LambOptimizer"]


class LambOptimizer(MetaOptimizerBase):
    def __init__(self, optimizer):
        super(LambOptimizer, self).__init__(optimizer)
        self.inner_opt = optimizer
        self.lamb_opt = None
        # we do not allow meta optimizer to be inner optimizer currently
        self.meta_optimizers_white_list = []

    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                        user_defined_strategy):
        super(LambOptimizer, self)._set_basic_info(
            loss, role_maker, user_defined_optimizer, user_defined_strategy)

        opt = self.inner_opt
        if not isinstance(opt, AdamOptimizer):
            return

        configs = self.user_defined_strategy.lamb_configs
        if len(configs['exclude_from_weight_decay']) == 0:
            _exclude_from_weight_decay_fn = None
        else:

            def exclude_fn(param):
                exclude_list = configs['exclude_from_weight_decay']
                for name in exclude_list:
                    if param.name.endswith(name):
                        return True
                return False

            _exclude_from_weight_decay_fn = exclude_fn

        self.lamb_opt = LAMB(
            learning_rate=opt._learning_rate,
            lamb_weight_decay=configs['lamb_weight_decay'],
            beta1=opt._beta1,
            beta2=opt._beta2,
            epsilon=opt._epsilon,
            parameter_list=opt._parameter_list,
            regularization=opt.regularization,
            grad_clip=opt._grad_clip,
            exclude_from_weight_decay_fn=_exclude_from_weight_decay_fn,
            name=opt._name)

    def _can_apply(self):
        if self.user_defined_strategy.lamb:
            if not isinstance(self.inner_opt, AdamOptimizer):
                logging.warn(
                    "lamb need the inner optimizer to be AdamOptimizer optimizer but got {}.".
                    format(self.inner_opt.type))
                return False
            return True
        return False

    def _disable_strategy(self, dist_strategy):
        dist_strategy.lamb = False
        dist_strategy.lamb_configs = {
            'lamb_weight_decay': 0.01,
            'exclude_from_weight_decay': [],
        }

    def backward(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None,
                 callbacks=None):
        return self.lamb_opt.backward(loss, startup_program, parameter_list,
                                      no_grad_set, callbacks)

    def minimize_impl(self,
                      loss,
                      startup_program=None,
                      parameter_list=None,
                      no_grad_set=None):
        optimize_ops, params_grads = \
            self.lamb_opt.minimize(loss, startup_program,
                                   parameter_list, no_grad_set)
        return optimize_ops, params_grads
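One detail worth calling out: the exclude list is matched against parameter-name suffixes. Below is a standalone sketch of that rule, not part of the commit; the parameter names are hypothetical, in the fluid fc_0.w_0 / fc_0.b_0 naming style.

def is_excluded(param_name, exclude_list):
    # Mirrors exclude_fn above: a parameter skips LAMB weight decay when its
    # name ends with any entry of exclude_from_weight_decay.
    return any(param_name.endswith(suffix) for suffix in exclude_list)

print(is_excluded('fc_0.b_0', ['.b_0']))  # True  -> no weight decay on the bias
print(is_excluded('fc_0.w_0', ['.b_0']))  # False -> weight decay still applied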
@@ -40,6 +40,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor)
@@ -386,6 +387,7 @@ if(WITH_DISTRIBUTE)
     if(NOT WIN32)
         py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS})
         py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS})
+        py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS})
     endif(NOT WIN32)
 endif(NOT APPLE)
 if(WITH_DGC)
......
@@ -14,6 +14,7 @@

 import unittest
 import paddle
+from paddle import fluid
 import os
 import paddle.fleet as fleet
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@@ -25,31 +26,40 @@ class TestFleetDGCOptimizer(unittest.TestCase):
         os.environ[
             "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"

-    def net(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
-
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
-
-        strategy = paddle.fleet.DistributedStrategy()
-        strategy.dgc = True
-        strategy.dgc_configs = {
-            "rampup_begin_step": 128,
-            "rampup_step": 100,
-            "sparsity": [0.996, 0.999]
-        }
-        return avg_cost, strategy
+    def net(self, main_prog, startup_prog):
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.unique_name.guard():
+                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+                fleet.init(role)
+                input_x = paddle.fluid.layers.data(
+                    name="x", shape=[32], dtype='float32')
+                input_y = paddle.fluid.layers.data(
+                    name="y", shape=[1], dtype='int64')
+
+                fc_1 = paddle.fluid.layers.fc(input=input_x,
+                                              size=64,
+                                              act='tanh')
+                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
+                prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                    size=2,
+                                                    act='softmax')
+                cost = paddle.fluid.layers.cross_entropy(
+                    input=prediction, label=input_y)
+                avg_cost = paddle.fluid.layers.mean(x=cost)
+
+                strategy = paddle.fleet.DistributedStrategy()
+                strategy.dgc = True
+                strategy.dgc_configs = {
+                    "rampup_begin_step": 128,
+                    "rampup_step": 100,
+                    "sparsity": [0.996, 0.999]
+                }
+        return avg_cost, strategy

     def test_dgc_optimizer(self):
-        avg_cost, strategy = self.net()
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
         optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
@@ -59,7 +69,9 @@ class TestFleetDGCOptimizer(unittest.TestCase):
         self.assertIn('dgc_momentum', ops)

     def test_dgc_not_apply_with_adam(self):
-        avg_cost, strategy = self.net()
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
         optimizer = paddle.optimizer.Adam(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
@@ -72,7 +84,9 @@ class TestFleetDGCOptimizer(unittest.TestCase):
         os.environ["PADDLE_TRAINER_ID"] = "0"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"

-        avg_cost, strategy = self.net()
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
         optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
......
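The refactor above (mirrored in the LARS test further down) moves network construction into a caller-supplied Program pair. A minimal sketch of that isolation pattern, independent of the tests:

import paddle.fluid as fluid

main_prog = fluid.Program()
startup_prog = fluid.Program()

# Each test case builds its layers into its own programs, and unique_name.guard
# resets name generation, so ops and parameter names do not leak across cases.
with fluid.program_guard(main_prog, startup_prog):
    with fluid.unique_name.guard():
        x = fluid.layers.data(name="x", shape=[32], dtype="float32")
        out = fluid.layers.fc(input=x, size=2, act="softmax")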
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
from paddle import fluid
import os
import paddle.fleet as fleet
import paddle.fluid.incubate.fleet.base.role_maker as role_maker


class TestFleetLambMetaOptimizer(unittest.TestCase):
    def setUp(self):
        os.environ["POD_IP"] = "127.0.0.1"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
        os.environ["PADDLE_TRAINERS_NUM"] = "2"
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
            "127.0.0.1:36001,127.0.0.2:36001"

    def net(self, main_prog, startup_prog):
        with fluid.program_guard(main_prog, startup_prog):
            with fluid.unique_name.guard():
                input_x = paddle.fluid.layers.data(
                    name="x", shape=[32], dtype='float32')
                input_y = paddle.fluid.layers.data(
                    name="y", shape=[1], dtype='int64')

                fc_1 = paddle.fluid.layers.fc(input=input_x,
                                              size=64,
                                              act='tanh')
                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
                prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                    size=2,
                                                    act='softmax')
                cost = paddle.fluid.layers.cross_entropy(
                    input=prediction, label=input_y)
                avg_cost = paddle.fluid.layers.mean(x=cost)

                strategy = paddle.fleet.DistributedStrategy()
                strategy.lamb = True
                strategy.lamb_configs = {
                    'lamb_weight_decay': 0.01,
                    'exclude_from_weight_decay': [],
                }
        return avg_cost, strategy

    def test_lamb_optimizer(self):
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        startup_prog = fluid.Program()
        train_prog = fluid.Program()
        avg_cost, strategy = self.net(train_prog, startup_prog)
        optimizer = paddle.optimizer.Adam(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        ops = [op.type for op in avg_cost.block.ops]
        self.assertIn('lamb', ops)

    def test_lamb_not_apply_with_momentum(self):
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        startup_prog = fluid.Program()
        train_prog = fluid.Program()
        avg_cost, strategy = self.net(train_prog, startup_prog)
        optimizer = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        ops = [op.type for op in avg_cost.block.ops]
        self.assertNotIn('lamb', ops)

    def test_lamb_exclude_fn(self):
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        startup_prog = fluid.Program()
        train_prog = fluid.Program()
        avg_cost, strategy = self.net(train_prog, startup_prog)
        optimizer = paddle.optimizer.Adam(learning_rate=0.01)
        strategy.lamb_configs = {
            'lamb_weight_decay': 0.01,
            'exclude_from_weight_decay': ['.b_0'],
        }
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        ops_with_bias = [
            op for op in avg_cost.block.ops
            if op.type == 'lamb' and op.attr('op_role_var')[0].endswith('.b_0')
        ]
        for op in ops_with_bias:
            self.assertEqual(op.attr('weight_decay'), 0)


if __name__ == "__main__":
    unittest.main()
@@ -14,6 +14,7 @@

 import unittest
 import paddle
+from paddle import fluid
 import os
 import paddle.fleet as fleet
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@@ -27,31 +28,40 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
             "127.0.0.1:36001,127.0.0.2:36001"

-    def net(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
-
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
-
-        strategy = paddle.fleet.DistributedStrategy()
-        strategy.lars = True
-        strategy.lars_configs = {
-            "lars_coeff": 0.001,
-            "lars_weight_decay": 0.0005,
-        }
-        return avg_cost, strategy
+    def net(self, main_prog, startup_prog):
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.unique_name.guard():
+                input_x = paddle.fluid.layers.data(
+                    name="x", shape=[32], dtype='float32')
+                input_y = paddle.fluid.layers.data(
+                    name="y", shape=[1], dtype='int64')
+
+                fc_1 = paddle.fluid.layers.fc(input=input_x,
+                                              size=64,
+                                              act='tanh')
+                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
+                prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                    size=2,
+                                                    act='softmax')
+                cost = paddle.fluid.layers.cross_entropy(
+                    input=prediction, label=input_y)
+                avg_cost = paddle.fluid.layers.mean(x=cost)
+
+                strategy = paddle.fleet.DistributedStrategy()
+                strategy.lars = True
+                strategy.lars_configs = {
+                    "lars_coeff": 0.001,
+                    "lars_weight_decay": 0.0005,
+                }
+        return avg_cost, strategy

     def test_lars_optimizer(self):
-        avg_cost, strategy = self.net()
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
         optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
@@ -60,7 +70,11 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
         self.assertIn('lars_momentum', ops)

     def test_lars_not_apply_with_adam(self):
-        avg_cost, strategy = self.net()
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
         optimizer = paddle.optimizer.Adam(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
......