Commit 0e1e098c authored by WangXi

Add lars lamb, test=develop

Parent 8dd3d4b6
...
@@ -46,7 +46,7 @@ class AMPOptimizer(MetaOptimizerBase):
         custom_white_list = set(config['custom_white_list'])
         custom_black_list = set(config['custom_black_list'])
         custom_black_varnames = set(config['custom_black_varnames'])
-        self.amp_lists = mixed_precision.AutoMixedPrecisionLists(
+        amp_lists = mixed_precision.AutoMixedPrecisionLists(
             custom_white_list, custom_black_list, custom_black_varnames)
         self.wrapped_opt = mixed_precision.decorate(
...
@@ -98,6 +98,10 @@ class LambOptimizer(MetaOptimizerBase):
     def apply_gradients(self, params_grads):
         return self.lamb_opt.apply_gradients(params_grads=params_grads)

+    def apply_optimize(self, loss, startup_program, params_grads):
+        return self.lamb_opt.apply_optimize(
+            loss, startup_program=startup_program, params_grads=params_grads)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
...
@@ -85,6 +85,10 @@ class LarsOptimizer(MetaOptimizerBase):
     def apply_gradients(self, params_grads):
         return self.lars_opt.apply_gradients(params_grads=params_grads)

+    def apply_optimize(self, loss, startup_program, params_grads):
+        return self.lars_opt.apply_optimize(
+            loss, startup_program=startup_program, params_grads=params_grads)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
...
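Note: the two hunks above add the same forwarding method to the lamb and lars meta optimizers. The point of apply_optimize is delegation: an outer meta optimizer (recompute in the tests below) splits minimize into backward plus apply_optimize on the optimizer it wraps, so forwarding apply_optimize to the wrapped lamb/lars instance keeps the lamb / lars_momentum update ops in the generated program. A minimal sketch of that pattern, with hypothetical class names (InnerOptimizer and RecomputeLikeWrapper are stand-ins, not the actual Paddle classes):

    class InnerOptimizer(object):
        """Hypothetical stand-in for the wrapped optimizer (e.g. Lamb/Momentum)."""

        def backward(self, loss, startup_program=None):
            # pretend gradients were computed; return (param, grad) name pairs
            return [("fc_0.w_0", "fc_0.w_0@GRAD")]

        def apply_optimize(self, loss, startup_program, params_grads):
            # where the real optimizer would emit its update ops
            # ('lamb' / 'lars_momentum'); here we just report what it touches
            return ["update %s" % param for param, _ in params_grads]


    class RecomputeLikeWrapper(object):
        """Hypothetical outer meta optimizer driving the inner one."""

        def __init__(self, inner_opt):
            self.inner_opt = inner_opt

        def apply_optimize(self, loss, startup_program, params_grads):
            # same forwarding as the hunks above: keep the inner optimizer's ops
            return self.inner_opt.apply_optimize(
                loss, startup_program=startup_program, params_grads=params_grads)

        def minimize(self, loss, startup_program=None):
            params_grads = self.inner_opt.backward(
                loss, startup_program=startup_program)
            return self.apply_optimize(loss, startup_program, params_grads)


    print(RecomputeLikeWrapper(InnerOptimizer()).minimize(loss=None))
    # -> ['update fc_0.w_0']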
...
@@ -19,10 +19,8 @@ import os
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker

-paddle.enable_static()
-
-class TestFleetCombineOptimizer(unittest.TestCase):
+class TestFleetMetaOptimizer(unittest.TestCase):
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "1"
         os.environ[
@@ -50,19 +48,21 @@ class TestFleetCombineOptimizer(unittest.TestCase):
                 avg_cost = paddle.fluid.layers.mean(x=cost)

                 strategy = paddle.distributed.fleet.DistributedStrategy()
-                strategy.dgc = True
-                strategy.dgc_configs = {
-                    "rampup_begin_step": 128,
-                    "rampup_step": 100,
-                    "sparsity": [0.996, 0.999]
-                }
         return avg_cost, strategy

-    def optimizer(self, loss, strategy, train_prog, startup_prog):
+    def optimizer(self,
+                  loss,
+                  strategy,
+                  train_prog,
+                  startup_prog,
+                  name='momentum'):
         with fluid.program_guard(train_prog, startup_prog):
             with fluid.unique_name.guard():
-                optimizer = paddle.fluid.optimizer.Momentum(
-                    learning_rate=0.01, momentum=0.9)
+                if name == 'momentum':
+                    optimizer = paddle.fluid.optimizer.Momentum(
+                        learning_rate=0.01, momentum=0.9)
+                elif name == 'adam':
+                    optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
                 optimizer = fleet.distributed_optimizer(
                     optimizer, strategy=strategy)
                 optimizer.minimize(loss)
@@ -70,53 +70,41 @@ class TestFleetCombineOptimizer(unittest.TestCase):
     def set_strategy(self, strategy, name):
         if name == 'amp':
             strategy.amp = True
+            strategy.amp_configs = {
+                "init_loss_scaling": 32768,
+                "decr_every_n_nan_or_inf": 2,
+                "incr_every_n_steps": 1000,
+                "incr_ratio": 2.0,
+                "use_dynamic_loss_scaling": True,
+                "decr_ratio": 0.5,
+                "custom_white_list": ['softmax'],
+                "custom_black_list": ['tanh'],
+            }
         elif name == 'dgc':
             strategy.dgc = True
+            strategy.dgc_configs = {
+                "rampup_begin_step": 128,
+                "rampup_step": 100,
+                "sparsity": [0.996, 0.999]
+            }
         elif name == 'recompute':
             strategy.recompute = True
             strategy.recompute_configs = {
                 "checkpoints": ["fc_0.tmp_2", "fc_1.tmp_2"]
             }
+        elif name == 'lars':
+            strategy.lars = True
+            strategy.lars_configs = {
+                "lars_coeff": 0.001,
+                "lars_weight_decay": 0.0005,
+                "epsilon": 0,
+                "exclude_from_weight_decay": ["batch_norm", ".b"],
+            }
+        elif name == 'lamb':
+            strategy.lamb = True
+            strategy.lamb_configs = {
+                'lamb_weight_decay': 0.01,
+                'exclude_from_weight_decay': [],
+            }
+        else:
+            raise NotImplementedError()
-
-    def test_dgc_recompute_optimizer(self):
-        train_prog = fluid.Program()
-        startup_prog = fluid.Program()
-        avg_cost, strategy = self.net(train_prog, startup_prog)
-
-        self.set_strategy(strategy, 'dgc')
-        self.set_strategy(strategy, 'recompute')
-
-        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
-
-        ops = [op.type for op in avg_cost.block.ops]
-        outs = [
-            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
-        ]
-        self.assertIn('dgc', ops)
-        self.assertIn('dgc_momentum', ops)
-        self.assertIn('subprog', ''.join(outs))
-
-    def test_amp_recompute_optimizer(self):
-        train_prog = fluid.Program()
-        startup_prog = fluid.Program()
-        avg_cost, strategy = self.net(train_prog, startup_prog)
-
-        self.set_strategy(strategy, 'amp')
-        self.set_strategy(strategy, 'recompute')
-
-        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
-
-        ops = [op.type for op in avg_cost.block.ops]
-        outs = [
-            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
-        ]
-        print(train_prog)
-        self.assertIn('cast', ops)
-        self.assertIn('check_finite_and_unscale', ops)
-        self.assertIn('subprog', ''.join(outs))
-
-
-if __name__ == "__main__":
-    unittest.main()
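Note: putting the strategy flags above together at user level, a minimal sketch, assuming a single-trainer collective setup like the tests use (the network and the checkpoint name are illustrative, not taken from the commit): recompute combined with lamb on a small fc model, with Adam as the inner optimizer as in test_recompute_lamb_optimizer.

    import os
    import paddle
    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker

    paddle.enable_static()
    # single-trainer collective environment, as in the tests
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
    fleet.init(role_maker.PaddleCloudRoleMaker(is_collective=True))

    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_1], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.recompute = True
    strategy.recompute_configs = {"checkpoints": ["fc_0.tmp_2"]}  # names depend on the net
    strategy.lamb = True
    strategy.lamb_configs = {'lamb_weight_decay': 0.01, 'exclude_from_weight_decay': []}

    # lamb wraps an inner Adam optimizer, as in test_recompute_lamb_optimizer
    optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)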
...
@@ -16,12 +16,14 @@ from __future__ import print_function

 import unittest
+import paddle
 import paddle.fluid.framework as framework
 import paddle.fluid.optimizer as optimizer
 import paddle.fluid.regularizer as regularizer
 import paddle.fluid.clip as clip
 import paddle.compat as cpt
 from paddle.fluid.backward import append_backward

+paddle.enable_static()
+

 class TestDGCMomentumOptimizer(unittest.TestCase):
@@ -86,13 +88,17 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
         block.append_op(
             type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         # params_grads = append_backward(mean_out)
-        params_grads = dgc_momentum_optimizer.backward(mean_out)
+        params_grads = dgc_momentum_optimizer.backward(
+            mean_out, startup_program=init_program)
+
+        with framework.program_guard(program, init_program):
+            opts = dgc_momentum_optimizer.apply_gradients(params_grads)
+
         accumulator_count = 1 if name == "momentum" else 2
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(
             len(dgc_momentum_optimizer.get_accumulators()), accumulator_count)
-        with framework.program_guard(program, init_program):
-            opts = dgc_momentum_optimizer.apply_gradients(params_grads)
+
         self.assertEqual(len(opts), 2)
         sgd_op = opts[-1]
         self.assertEqual([op.type for op in opts], ["scale", name])
@@ -108,8 +114,11 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
         self.assertTrue(mul_x.name in velocity_acc)

         # Check init_program
+        # dgc not apply include: lr, dgc(count, nranks, begin step), (u,)
+        # dgc apply include: lr, dgc(count, nranks, begin_step), (u,v,k,encode,gather)
+        init_ops_count = 5 if name == "momentum" else 9
         init_ops = init_program.global_block().ops
-        self.assertEqual(len(init_ops), 1)
+        self.assertEqual(len(init_ops), init_ops_count)
         self.assertEqual(init_ops[0].type, "fill_constant")
         self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
...
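Note: the added comments and init_ops_count above encode the expectation that when DGC actually applies, the optimizer seeds extra accumulators (u, v, k, encode, gather) in the startup program, hence 9 init ops instead of 5. A hypothetical one-line helper (an assumption for debugging, not part of the test) to inspect that directly:

    # List the op types the optimizer added to the startup/init program, e.g. to
    # see the extra fill_constant ops created for the DGC accumulators.
    def dump_init_op_types(init_program):
        return [op.type for op in init_program.global_block().ops]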
...
@@ -16,53 +16,42 @@ import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
 import unittest
 import paddle
+import paddle.fluid as fluid
 import os
+from fleet_meta_optimizer_base import TestFleetMetaOptimizer

 paddle.enable_static()


-class TestFleetAMPOptimizer(unittest.TestCase):
-    def setUp(self):
-        os.environ["PADDLE_TRAINER_ID"] = "0"
-        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
-
+class TestFleetAMPOptimizer(TestFleetMetaOptimizer):
     def test_amp_optimizer(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
-
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
-
-        strategy = paddle.distributed.fleet.DistributedStrategy()
-        strategy.amp = True
-        strategy.amp_configs = {
-            "init_loss_scaling": 32768,
-            "decr_every_n_nan_or_inf": 2,
-            "incr_every_n_steps": 1000,
-            "incr_ratio": 2.0,
-            "use_dynamic_loss_scaling": True,
-            "decr_ratio": 0.5,
-            "custom_white_list": ['softmax'],
-            "custom_black_list": ['tanh'],
-        }
-
-        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'amp')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        self.assertIn('cast', ops)
+        self.assertIn('check_finite_and_unscale', ops)
+
+    def test_amp_recompute_optimizer(self):
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'amp')
+        self.set_strategy(strategy, 'recompute')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)

         strategy = fleet._final_strategy()

         ops = [op.type for op in avg_cost.block.ops]
+        outs = [
+            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
+        ]
         self.assertIn('cast', ops)
         self.assertIn('check_finite_and_unscale', ops)
+        self.assertIn('subprog', ''.join(outs))


 if __name__ == "__main__":
     unittest.main()
...
@@ -18,66 +18,27 @@ from paddle import fluid
 import os
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
+from fleet_meta_optimizer_base import TestFleetMetaOptimizer

 paddle.enable_static()


-class TestFleetDGCOptimizer(unittest.TestCase):
-    def setUp(self):
-        os.environ["PADDLE_TRAINER_ID"] = "1"
-        os.environ[
-            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.2:36002"
-
-    def net(self, main_prog, startup_prog):
-        with fluid.program_guard(main_prog, startup_prog):
-            with fluid.unique_name.guard():
-                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-                fleet.init(role)
-                input_x = paddle.fluid.layers.data(
-                    name="x", shape=[32], dtype='float32')
-                input_y = paddle.fluid.layers.data(
-                    name="y", shape=[1], dtype='int64')
-
-                fc_1 = paddle.fluid.layers.fc(input=input_x,
-                                              size=64,
-                                              act='tanh')
-                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
-                prediction = paddle.fluid.layers.fc(input=[fc_2],
-                                                    size=2,
-                                                    act='softmax')
-                cost = paddle.fluid.layers.cross_entropy(
-                    input=prediction, label=input_y)
-                avg_cost = paddle.fluid.layers.mean(x=cost)
-
-                strategy = paddle.distributed.fleet.DistributedStrategy()
-                strategy.dgc = True
-                strategy.dgc_configs = {
-                    "rampup_begin_step": 128,
-                    "rampup_step": 100,
-                    "sparsity": [0.996, 0.999]
-                }
-        return avg_cost, strategy
-
+class TestFleetDGCOptimizer(TestFleetMetaOptimizer):
     def test_dgc_optimizer(self):
-        startup_prog = fluid.Program()
-        train_prog = fluid.Program()
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
-        optimizer = paddle.fluid.optimizer.Momentum(
-            learning_rate=0.01, momentum=0.9)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
+        self.set_strategy(strategy, 'dgc')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)

         ops = [op.type for op in avg_cost.block.ops]
         self.assertIn('dgc', ops)
         self.assertIn('dgc_momentum', ops)

     def test_dgc_not_apply_with_adam(self):
-        startup_prog = fluid.Program()
-        train_prog = fluid.Program()
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
-        optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
+        self.set_strategy(strategy, 'dgc')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog, 'adam')

         ops = [op.type for op in avg_cost.block.ops]
         self.assertNotIn('dgc', ops)
@@ -87,18 +48,31 @@ class TestFleetDGCOptimizer(unittest.TestCase):
         os.environ["PADDLE_TRAINER_ID"] = "0"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"

-        startup_prog = fluid.Program()
-        train_prog = fluid.Program()
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
-        optimizer = paddle.fluid.optimizer.Momentum(
-            learning_rate=0.01, momentum=0.9)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
+        self.set_strategy(strategy, 'dgc')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)

         ops = [op.type for op in avg_cost.block.ops]
         self.assertNotIn('dgc', ops)
         self.assertNotIn('dgc_momentum', ops)

+    def test_dgc_recompute_optimizer(self):
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'dgc')
+        self.set_strategy(strategy, 'recompute')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        outs = [
+            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
+        ]
+        self.assertIn('dgc', ops)
+        self.assertIn('dgc_momentum', ops)
+        self.assertIn('subprog', ''.join(outs))
+

 if __name__ == "__main__":
     unittest.main()
...
@@ -14,40 +14,55 @@
 import unittest
 import paddle
+import paddle.fluid as fluid
 import os
+from fleet_meta_optimizer_base import TestFleetMetaOptimizer

+paddle.enable_static()

-class TestFleetRecomputeMetaOptimizer(unittest.TestCase):
-    def setUp(self):
-        os.environ["POD_IP"] = "127.0.0.1"
-        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
-        os.environ["PADDLE_TRAINERS_NUM"] = "2"
-        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
-            "127.0.0.1:36001,127.0.0.2:36001"
-
+class TestFleetRecomputeMetaOptimizer(TestFleetMetaOptimizer):
     def test_recompute_optimizer(self):
-        import paddle.distributed.fleet as fleet
-        import paddle.distributed.fleet.base.role_maker as role_maker
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
-
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
-
-        strategy = paddle.distributed.fleet.DistributedStrategy()
-        strategy.recompute = True
-        strategy.recompute_configs = {"checkpoints": ["fc_1.tmp_0"]}
-
-        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'recompute')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+
+        outs = [
+            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
+        ]
+
+        self.assertIn('subprog', ''.join(outs))
+
+    def test_recompute_lars_optimizer(self):
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'recompute')
+        self.set_strategy(strategy, 'lars')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        outs = [
+            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
+        ]
+        self.assertIn('lars_momentum', ops)
+        self.assertIn('subprog', ''.join(outs))
+
+    def test_recompute_lamb_optimizer(self):
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'recompute')
+        self.set_strategy(strategy, 'lamb')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog, 'adam')
+
+        ops = [op.type for op in avg_cost.block.ops]
+        outs = [
+            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
+        ]
+        self.assertIn('lamb', ops)
+        self.assertIn('subprog', ''.join(outs))


 if __name__ == "__main__":
...