Commit 0e1e098c authored by W WangXi

Add lars lamb, test=develop

Parent 8dd3d4b6
......@@ -46,7 +46,7 @@ class AMPOptimizer(MetaOptimizerBase):
custom_white_list = set(config['custom_white_list'])
custom_black_list = set(config['custom_black_list'])
custom_black_varnames = set(config['custom_black_varnames'])
self.amp_lists = mixed_precision.AutoMixedPrecisionLists(
amp_lists = mixed_precision.AutoMixedPrecisionLists(
custom_white_list, custom_black_list, custom_black_varnames)
self.wrapped_opt = mixed_precision.decorate(
......
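Review note: the one-line fix above drops a stray `self.` — the AMP lists are consumed only by `mixed_precision.decorate`, so storing them as an instance attribute just leaked state onto the meta-optimizer. A minimal sketch of the intended flow, assuming the fluid-era AMP API (the config values are illustrative, mirroring the test configs further down):

```python
# Minimal sketch, assuming paddle.fluid.contrib.mixed_precision is available;
# the config values are illustrative.
import paddle.fluid as fluid
from paddle.fluid.contrib import mixed_precision

config = {
    'custom_white_list': ['softmax'],
    'custom_black_list': ['tanh'],
    'custom_black_varnames': [],
}

# A local variable, not self.amp_lists: it is only handed to decorate().
amp_lists = mixed_precision.AutoMixedPrecisionLists(
    set(config['custom_white_list']),
    set(config['custom_black_list']),
    set(config['custom_black_varnames']))

sgd = fluid.optimizer.SGD(learning_rate=0.01)
wrapped_opt = mixed_precision.decorate(sgd, amp_lists=amp_lists)
```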
......@@ -98,6 +98,10 @@ class LambOptimizer(MetaOptimizerBase):
def apply_gradients(self, params_grads):
return self.lamb_opt.apply_gradients(params_grads=params_grads)
def apply_optimize(self, loss, startup_program, params_grads):
return self.lamb_opt.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads)
def minimize_impl(self,
loss,
startup_program=None,
......
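The `apply_optimize` forwarder added to LambOptimizer here (and to LarsOptimizer in the next hunk) matters when meta-optimizers are stacked: without it, a wrapper such as recompute would fall through to a base-class `apply_optimize` and bypass the inner optimizer. A minimal sketch of the delegation pattern, with an illustrative class name rather than the actual fleet classes:

```python
# Illustrative delegation pattern; DelegatingOptimizer is a made-up name.
class DelegatingOptimizer:
    def __init__(self, inner_opt):
        self.inner_opt = inner_opt

    def apply_gradients(self, params_grads):
        return self.inner_opt.apply_gradients(params_grads=params_grads)

    def apply_optimize(self, loss, startup_program, params_grads):
        # Forward, so a meta-optimizer stacked on top (e.g. recompute)
        # reaches the wrapped optimizer instead of a generic default.
        return self.inner_opt.apply_optimize(
            loss, startup_program=startup_program, params_grads=params_grads)
```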
......@@ -85,6 +85,10 @@ class LarsOptimizer(MetaOptimizerBase):
def apply_gradients(self, params_grads):
return self.lars_opt.apply_gradients(params_grads=params_grads)
def apply_optimize(self, loss, startup_program, params_grads):
return self.lars_opt.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads)
def minimize_impl(self,
loss,
startup_program=None,
......
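For context, the `lars_momentum` op that the new tests assert on scales each layer's learning rate by a trust ratio built from the weight and gradient norms; the knobs in `strategy.lars_configs` below map onto it directly. A rough NumPy sketch of the local learning rate, illustrative rather than the op's exact kernel:

```python
import numpy as np

# Rough sketch: local_lr = lr * lars_coeff * ||w||
#                          / (||grad|| + lars_weight_decay * ||w|| + epsilon)
def lars_local_lr(lr, w, grad,
                  lars_coeff=0.001, lars_weight_decay=0.0005, epsilon=0.0):
    w_norm = np.linalg.norm(w)
    g_norm = np.linalg.norm(grad)
    return lr * lars_coeff * w_norm / (
        g_norm + lars_weight_decay * w_norm + epsilon)

print(lars_local_lr(0.01, np.ones(64), np.full(64, 0.1)))
```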
......@@ -19,10 +19,8 @@ import os
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
paddle.enable_static()
class TestFleetCombineOptimizer(unittest.TestCase):
class TestFleetMetaOptimizer(unittest.TestCase):
def setUp(self):
os.environ["PADDLE_TRAINER_ID"] = "1"
os.environ[
......@@ -50,19 +48,21 @@ class TestFleetCombineOptimizer(unittest.TestCase):
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.dgc = True
strategy.dgc_configs = {
"rampup_begin_step": 128,
"rampup_step": 100,
"sparsity": [0.996, 0.999]
}
return avg_cost, strategy
def optimizer(self, loss, strategy, train_prog, startup_prog):
def optimizer(self,
loss,
strategy,
train_prog,
startup_prog,
name='momentum'):
with fluid.program_guard(train_prog, startup_prog):
with fluid.unique_name.guard():
if name == 'momentum':
optimizer = paddle.fluid.optimizer.Momentum(
learning_rate=0.01, momentum=0.9)
elif name == 'adam':
optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(
optimizer, strategy=strategy)
optimizer.minimize(loss)
......@@ -70,53 +70,41 @@ class TestFleetCombineOptimizer(unittest.TestCase):
def set_strategy(self, strategy, name):
if name == 'amp':
strategy.amp = True
strategy.amp_configs = {
"init_loss_scaling": 32768,
"decr_every_n_nan_or_inf": 2,
"incr_every_n_steps": 1000,
"incr_ratio": 2.0,
"use_dynamic_loss_scaling": True,
"decr_ratio": 0.5,
"custom_white_list": ['softmax'],
"custom_black_list": ['tanh'],
}
elif name == 'dgc':
strategy.dgc = True
strategy.dgc_configs = {
"rampup_begin_step": 128,
"rampup_step": 100,
"sparsity": [0.996, 0.999]
}
elif name == 'recompute':
strategy.recompute = True
strategy.recompute_configs = {
"checkpoints": ["fc_0.tmp_2", "fc_1.tmp_2"]
}
def test_dgc_recompute_optimizer(self):
train_prog = fluid.Program()
startup_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'dgc')
self.set_strategy(strategy, 'recompute')
self.optimizer(avg_cost, strategy, train_prog, startup_prog)
ops = [op.type for op in avg_cost.block.ops]
outs = [
op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
]
self.assertIn('dgc', ops)
self.assertIn('dgc_momentum', ops)
self.assertIn('subprog', ''.join(outs))
def test_amp_recompute_optimizer(self):
train_prog = fluid.Program()
startup_prog = fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'amp')
self.set_strategy(strategy, 'recompute')
self.optimizer(avg_cost, strategy, train_prog, startup_prog)
ops = [op.type for op in avg_cost.block.ops]
outs = [
op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
]
print(train_prog)
self.assertIn('cast', ops)
self.assertIn('check_finite_and_unscale', ops)
self.assertIn('subprog', ''.join(outs))
if __name__ == "__main__":
unittest.main()
elif name == 'lars':
strategy.lars = True
strategy.lars_configs = {
"lars_coeff": 0.001,
"lars_weight_decay": 0.0005,
"epsilon": 0,
"exclude_from_weight_decay": ["batch_norm", ".b"],
}
elif name == 'lamb':
strategy.lamb = True
strategy.lamb_configs = {
'lamb_weight_decay': 0.01,
'exclude_from_weight_decay': [],
}
else:
raise NotImplementedError()
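The `amp_configs` registered in `set_strategy` above parameterize dynamic loss scaling. A rough pure-Python sketch of that policy (illustrative; in Paddle the real logic lives in AMP ops such as `check_finite_and_unscale`, which the tests assert on):

```python
# Illustrative dynamic loss scaling, keyed to the amp_configs above.
def update_loss_scale(scale, found_inf, state,
                      incr_every_n_steps=1000, decr_every_n_nan_or_inf=2,
                      incr_ratio=2.0, decr_ratio=0.5):
    if found_inf:
        state['bad'] += 1
        state['good'] = 0
        if state['bad'] >= decr_every_n_nan_or_inf:
            scale *= decr_ratio      # shrink after repeated overflows
            state['bad'] = 0
    else:
        state['good'] += 1
        state['bad'] = 0
        if state['good'] >= incr_every_n_steps:
            scale *= incr_ratio      # grow after a long clean streak
            state['good'] = 0
    return scale

state = {'good': 0, 'bad': 0}
scale = update_loss_scale(32768.0, found_inf=True, state=state)
```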
......@@ -16,12 +16,14 @@ from __future__ import print_function
import unittest
import paddle
import paddle.fluid.framework as framework
import paddle.fluid.optimizer as optimizer
import paddle.fluid.regularizer as regularizer
import paddle.fluid.clip as clip
import paddle.compat as cpt
from paddle.fluid.backward import append_backward
paddle.enable_static()
class TestDGCMomentumOptimizer(unittest.TestCase):
......@@ -86,13 +88,17 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
# params_grads = append_backward(mean_out)
params_grads = dgc_momentum_optimizer.backward(mean_out)
params_grads = dgc_momentum_optimizer.backward(
mean_out, startup_program=init_program)
with framework.program_guard(program, init_program):
opts = dgc_momentum_optimizer.apply_gradients(params_grads)
accumulator_count = 1 if name == "momentum" else 2
self.assertEqual(len(params_grads), 1)
self.assertEqual(
len(dgc_momentum_optimizer.get_accumulators()), accumulator_count)
with framework.program_guard(program, init_program):
opts = dgc_momentum_optimizer.apply_gradients(params_grads)
self.assertEqual(len(opts), 2)
sgd_op = opts[-1]
self.assertEqual([op.type for op in opts], ["scale", name])
......@@ -108,8 +114,11 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
self.assertTrue(mul_x.name in velocity_acc)
# Check init_program
# dgc not applied: lr, dgc(count, nranks, begin_step), (u,)
# dgc applied: lr, dgc(count, nranks, begin_step), (u, v, k, encode, gather)
init_ops_count = 5 if name == "momentum" else 9
init_ops = init_program.global_block().ops
self.assertEqual(len(init_ops), 1)
self.assertEqual(len(init_ops), init_ops_count)
self.assertEqual(init_ops[0].type, "fill_constant")
self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
......
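The new `startup_program=init_program` argument lets DGC register its initialization ops in the startup program, which is why the hard-coded op count of 1 becomes 5 or 9. A trivial restatement of the counting from the diff comment:

```python
# Restates the counting in the diff comment above; purely illustrative.
def expected_init_ops(name):
    # momentum (dgc not applied): lr + dgc(count, nranks, begin_step) + (u,)
    # dgc_momentum (applied):     lr + dgc(count, nranks, begin_step)
    #                             + (u, v, k, encode, gather)
    return 5 if name == "momentum" else 9

assert expected_init_ops("momentum") == 5
assert expected_init_ops("dgc_momentum") == 9
```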
......@@ -16,53 +16,42 @@ import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
import unittest
import paddle
import paddle.fluid as fluid
import os
from fleet_meta_optimizer_base import TestFleetMetaOptimizer
paddle.enable_static()
class TestFleetAMPOptimizer(unittest.TestCase):
def setUp(self):
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
class TestFleetAMPOptimizer(TestFleetMetaOptimizer):
def test_amp_optimizer(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'amp')
self.optimizer(avg_cost, strategy, train_prog, startup_prog)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.amp = True
strategy.amp_configs = {
"init_loss_scaling": 32768,
"decr_every_n_nan_or_inf": 2,
"incr_every_n_steps": 1000,
"incr_ratio": 2.0,
"use_dynamic_loss_scaling": True,
"decr_ratio": 0.5,
"custom_white_list": ['softmax'],
"custom_black_list": ['tanh'],
}
ops = [op.type for op in avg_cost.block.ops]
self.assertIn('cast', ops)
self.assertIn('check_finite_and_unscale', ops)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
def test_amp_recompute_optimizer(self):
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'amp')
self.set_strategy(strategy, 'recompute')
self.optimizer(avg_cost, strategy, train_prog, startup_prog)
strategy = fleet._final_strategy()
ops = [op.type for op in avg_cost.block.ops]
outs = [
op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
]
self.assertIn('cast', ops)
self.assertIn('check_finite_and_unscale', ops)
self.assertIn('subprog', ''.join(outs))
if __name__ == "__main__":
unittest.main()
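Several of these tests share the same "did recompute run?" heuristic: the recompute pass clones checkpointed segments, and the cloned ops' output variable names carry a `subprog` marker. Pulled out as a hypothetical helper for illustration:

```python
# Hypothetical helper (not in the diff) capturing the tests' recompute check.
def recompute_applied(program):
    outs = [
        op.output('Out')[0]
        for block in program.blocks
        for op in block.ops if op.type == 'mul'
    ]
    return 'subprog' in ''.join(outs)
```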
......@@ -18,66 +18,27 @@ from paddle import fluid
import os
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
from fleet_meta_optimizer_base import TestFleetMetaOptimizer
paddle.enable_static()
class TestFleetDGCOptimizer(unittest.TestCase):
def setUp(self):
os.environ["PADDLE_TRAINER_ID"] = "1"
os.environ[
"PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
def net(self, main_prog, startup_prog):
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(
name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x,
size=64,
act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2],
size=2,
act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.dgc = True
strategy.dgc_configs = {
"rampup_begin_step": 128,
"rampup_step": 100,
"sparsity": [0.996, 0.999]
}
return avg_cost, strategy
class TestFleetDGCOptimizer(TestFleetMetaOptimizer):
def test_dgc_optimizer(self):
startup_prog = fluid.Program()
train_prog = fluid.Program()
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.fluid.optimizer.Momentum(
learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.set_strategy(strategy, 'dgc')
self.optimizer(avg_cost, strategy, train_prog, startup_prog)
ops = [op.type for op in avg_cost.block.ops]
self.assertIn('dgc', ops)
self.assertIn('dgc_momentum', ops)
def test_dgc_not_apply_with_adam(self):
startup_prog = fluid.Program()
train_prog = fluid.Program()
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.set_strategy(strategy, 'dgc')
self.optimizer(avg_cost, strategy, train_prog, startup_prog, 'adam')
ops = [op.type for op in avg_cost.block.ops]
self.assertNotIn('dgc', ops)
......@@ -87,18 +48,31 @@ class TestFleetDGCOptimizer(unittest.TestCase):
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
startup_prog = fluid.Program()
train_prog = fluid.Program()
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.fluid.optimizer.Momentum(
learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.set_strategy(strategy, 'dgc')
self.optimizer(avg_cost, strategy, train_prog, startup_prog)
ops = [op.type for op in avg_cost.block.ops]
self.assertNotIn('dgc', ops)
self.assertNotIn('dgc_momentum', ops)
def test_dgc_recompute_optimizer(self):
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'dgc')
self.set_strategy(strategy, 'recompute')
self.optimizer(avg_cost, strategy, train_prog, startup_prog)
ops = [op.type for op in avg_cost.block.ops]
outs = [
op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
]
self.assertIn('dgc', ops)
self.assertIn('dgc_momentum', ops)
self.assertIn('subprog', ''.join(outs))
if __name__ == "__main__":
unittest.main()
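The `dgc_configs` used in these tests describe a sparsity warm-up: no gradient compression before `rampup_begin_step`, then sparsity steps through the given schedule over `rampup_step` steps. A rough sketch; the exact ramp interpolation is an assumption, not taken from the diff:

```python
# Rough sketch of DGC's sparsity warm-up; interpolation details assumed.
def dgc_sparsity(step, rampup_begin_step=128, rampup_step=100,
                 sparsity=(0.996, 0.999)):
    if step < rampup_begin_step:
        return 0.0                           # no compression yet
    progress = min((step - rampup_begin_step) / rampup_step, 1.0)
    idx = min(int(progress * len(sparsity)), len(sparsity) - 1)
    return sparsity[idx]

assert dgc_sparsity(0) == 0.0
assert dgc_sparsity(300) == 0.999
```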
......@@ -14,40 +14,55 @@
import unittest
import paddle
import paddle.fluid as fluid
import os
from fleet_meta_optimizer_base import TestFleetMetaOptimizer
paddle.enable_static()
class TestFleetRecomputeMetaOptimizer(unittest.TestCase):
def setUp(self):
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
"127.0.0.1:36001,127.0.0.2:36001"
class TestFleetRecomputeMetaOptimizer(TestFleetMetaOptimizer):
def test_recompute_optimizer(self):
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.recompute = True
strategy.recompute_configs = {"checkpoints": ["fc_1.tmp_0"]}
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'recompute')
self.optimizer(avg_cost, strategy, train_prog, startup_prog)
outs = [
op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
]
self.assertIn('subprog', ''.join(outs))
def test_recompute_lars_optimizer(self):
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'recompute')
self.set_strategy(strategy, 'lars')
self.optimizer(avg_cost, strategy, train_prog, startup_prog)
ops = [op.type for op in avg_cost.block.ops]
outs = [
op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
]
self.assertIn('lars_momentum', ops)
self.assertIn('subprog', ''.join(outs))
def test_recompute_lamb_optimizer(self):
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'recompute')
self.set_strategy(strategy, 'lamb')
self.optimizer(avg_cost, strategy, train_prog, startup_prog, 'adam')
ops = [op.type for op in avg_cost.block.ops]
outs = [
op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
]
self.assertIn('lamb', ops)
self.assertIn('subprog', ''.join(outs))
if __name__ == "__main__":
......
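For reference, the `lamb` op asserted in `test_recompute_lamb_optimizer` applies a layer-wise trust ratio on top of Adam-style moments; `lamb_weight_decay` matches the `strategy.lamb_configs` set by the base class. A rough NumPy sketch, illustrative rather than the op's exact kernel:

```python
import numpy as np

# Rough LAMB step given bias-corrected moments m_hat / v_hat.
def lamb_update(w, m_hat, v_hat, lr, lamb_weight_decay=0.01, eps=1e-6):
    adam_step = m_hat / (np.sqrt(v_hat) + eps) + lamb_weight_decay * w
    w_norm, s_norm = np.linalg.norm(w), np.linalg.norm(adam_step)
    trust_ratio = w_norm / s_norm if w_norm > 0 and s_norm > 0 else 1.0
    return w - lr * trust_ratio * adam_step

w = np.ones(4)
print(lamb_update(w, m_hat=np.full(4, 0.1), v_hat=np.full(4, 0.01), lr=0.01))
```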