Unverified · Commit 9901f696 authored by mapingshuo and committed by GitHub

Forward recompute3 (#19913)

* add recompute based checkpoints methods for large batch training
test=develop

* add append_backward_with_forward_recomputation
test=develop

* refine optimizer
test=develop

* update backward and optimizer
test=develop

* make Variable usable
test=develop

* add recompute code

* refine optimizer
test=develop

* refine addup _append_backward_ops_with_checkpoints_
1) for recompute part, just cache the grad_op_desc without appending to block
2) before appending grad_op_desc to backward part, addup_repetitive_vars, remove unused branch
test=develop

* make method private

* add recompute strategy into DistributedStrategy
test=develop

* checkpoint version3
test=develop

* remove some print information
test=develop

* remove unused sumop
test=develop

* try to fix recompute with graph building modules

* add input names to vars should be held

* add memory debug tool

* backup backward

* Fix bugs

* add backward desc for op not in any segments

* add exception info for sub_block

test=develop

* modify code style

test=develop

* modify code style

test=develop

* remove print functions

test=develop

* add API spec

test=develop
test=document_preview

* make Recompute a child class of Optimizer

test=develop
test=document_preview

* add API spec

test=develop
test=document_preview

* modify API spec

test=develop
test=document_preview

* add document for Recompute

test=develop
test=document_preview

* change API doc of Recompute

test=develop
test=document_preview

* code cleaning

test=develop
test=document_preview

* modify API spec

* fix bugs when segments hold no element

* add testcase for Recompute Optimizer

test=develop
test=document_preview

* add test for apply_gradient, and code cleaning

test=develop
test=document_preview

* add test case for load function

* enable CI

test=develop
test=document

* add test case

test=develop
test=document_preview

* add sample code for 4 function of recompute optimizer

test=develop
test=document_preview
Parent d7251a8e
......@@ -1012,7 +1012,15 @@ paddle.fluid.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss',
paddle.fluid.optimizer.LookaheadOptimizer ('paddle.fluid.optimizer.LookaheadOptimizer', ('document', 'c291cadfa7452c7bf58b9e2f900a3511'))
paddle.fluid.optimizer.LookaheadOptimizer.__init__ (ArgSpec(args=['self', 'inner_optimizer', 'alpha', 'k'], varargs=None, keywords=None, defaults=(0.5, 5)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.LookaheadOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '52488008103886c793843a3828bacd5e'))
paddle.fluid.optimizer.RecomputeOptimizer ('paddle.fluid.optimizer.RecomputeOptimizer', ('document', '05769ba1182270f808f85488a50c8caa'))
paddle.fluid.optimizer.RecomputeOptimizer.__init__ (ArgSpec(args=['self', 'optimizer'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.RecomputeOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '7838e157ec5ff4f835f814adf3a2b9cc'))
paddle.fluid.optimizer.RecomputeOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'ec8dfa14fcd958d7c196f3d1a0ce6fa7'))
paddle.fluid.optimizer.RecomputeOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks', 'checkpoints'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'a26b3dbb0f63ee81d847d92e9fb942dc'))
paddle.fluid.optimizer.RecomputeOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.RecomputeOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '7b2b8ae72011bc4decb67e97623f2c56'))
paddle.fluid.optimizer.RecomputeOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks', 'checkpoints'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '52488008103886c793843a3828bacd5e'))
paddle.fluid.backward.gradients (ArgSpec(args=['targets', 'inputs', 'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b'))
paddle.fluid.regularizer.L1DecayRegularizer ('paddle.fluid.regularizer.L1DecayRegularizer', ('document', '34603757e70974d2fcc730643b382925'))
paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
......
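The spec change above adds a `checkpoints` argument to paddle.fluid.backward.append_backward. As a rough illustration of what that enables (a minimal sketch assuming a simple single-block fluid program; the layer sizes and variable names are made up for the example), the backward pass can be asked to recompute segments directly:

import paddle.fluid as fluid
from paddle.fluid.backward import append_backward

# Illustrative network; sizes and names are arbitrary.
x = fluid.layers.data(name="x", shape=[32], dtype="float32")
y = fluid.layers.data(name="y", shape=[1], dtype="int64")
fc_1 = fluid.layers.fc(input=x, size=128)
pred = fluid.layers.fc(input=[fc_1], size=2, act="softmax")
loss = fluid.layers.reduce_mean(
    fluid.layers.cross_entropy(input=pred, label=y))

# With checkpoints set, the generated backward ops recompute each segment's
# forward ops instead of keeping every intermediate activation in memory.
params_grads = append_backward(loss, checkpoints=[fc_1, pred])

In practice most users would go through RecomputeOptimizer (added below), which forwards its checkpoints to this call.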
This diff is collapsed.
......@@ -105,6 +105,8 @@ class DistributedStrategy(fluid.BuildStrategy):
self.mode = "nccl2" # or collective
self.collective_mode = None # local_sgd or grad_allreduce
self.nccl_comm_num = 1
self.forward_recompute = False
self.recompute_checkpoints = []
self.exec_strategy = fluid.ExecutionStrategy()
......@@ -150,6 +152,11 @@ class CollectiveOptimizer(DistributedOptimizer):
def __init__(self, optimizer, strategy=DistributedStrategy()):
super(CollectiveOptimizer, self).__init__(optimizer, strategy)
if strategy.forward_recompute:
self.forward_recompute = True
self.recompute_checkpoints = strategy.recompute_checkpoints
else:
self.forward_recompute = False
self.print_config = False
def backward(self,
......@@ -347,6 +354,13 @@ class CollectiveOptimizer(DistributedOptimizer):
self._check_collective_mode(main_program, self._optimizer,
self._strategy)
if self.forward_recompute:
assert (isinstance(self.recompute_checkpoints, list) and
len(self.recompute_checkpoints) > 0)
self._optimizer = \
fluid.optimizer.RecomputeOptimizer(self._optimizer)
self._optimizer._set_checkpoints(self.recompute_checkpoints)
optimize_ops, param_grads = self._optimizer.minimize(
loss,
startup_program=startup_program,
......
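The two new DistributedStrategy fields above are what turn recompute on for collective training: when forward_recompute is set, CollectiveOptimizer.minimize wraps the inner optimizer in a RecomputeOptimizer and hands it the checkpoints. A minimal sketch of that flow follows; the import path is an assumption about where these classes live in this release, and the script is meant to run under a distributed (nccl2/collective) launcher, so treat it as an outline rather than a drop-in program.

import paddle.fluid as fluid
# Assumed module path for the classes patched in this diff.
from paddle.fluid.incubate.fleet.collective import DistributedStrategy, CollectiveOptimizer

x = fluid.layers.data(name="x", shape=[32], dtype="float32")
y = fluid.layers.data(name="y", shape=[1], dtype="int64")
fc_1 = fluid.layers.fc(input=x, size=128)
pred = fluid.layers.fc(input=[fc_1], size=2, act="softmax")
loss = fluid.layers.reduce_mean(
    fluid.layers.cross_entropy(input=pred, label=y))

strategy = DistributedStrategy()
strategy.forward_recompute = True              # new field added by this commit
strategy.recompute_checkpoints = [fc_1, pred]  # must be a non-empty list

sgd = fluid.optimizer.SGD(learning_rate=0.01)
# minimize() wraps sgd in a RecomputeOptimizer when forward_recompute is set.
dist_optimizer = CollectiveOptimizer(sgd, strategy=strategy)
dist_optimizer.minimize(loss)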
......@@ -36,6 +36,7 @@ from paddle.fluid import core
from paddle.fluid.layers import tensor
from functools import reduce
from .wrapped_decorator import signature_safe_contextmanager
from .. import compat as cpt
__all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
......@@ -43,7 +44,8 @@ __all__ = [
'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'LarsMomentum',
'LarsMomentumOptimizer', 'DGCMomentumOptimizer', 'LambOptimizer',
'ExponentialMovingAverage', 'PipelineOptimizer', 'LookaheadOptimizer'
'ExponentialMovingAverage', 'PipelineOptimizer', 'LookaheadOptimizer',
'RecomputeOptimizer'
]
......@@ -2977,6 +2979,298 @@ class PipelineOptimizer(object):
}
class RecomputeOptimizer(Optimizer):
"""
Recompute Optimizer Wrapper
Normally, a training step contains three sub-steps: first, run forward
Operators to calculate the loss; second, run backward Operators to
calculate the gradients of the parameters; third, apply the optimization
method to update the parameter values.
In the forward computation, every variable that the backward computation
needs is kept in memory, which occupies a great amount of memory when the
network becomes very deep.
Recompute splits the network into k segments. In each segment, the forward
Operators are recomputed before the backward operators run, which is very
helpful for saving memory.
The Variables that separate a network into segments are called checkpoints,
and users should set them manually. The usage is very simple:
Args:
optimizer (Optimizer): The optimizer that is applied to parameters.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
def gen_data():
return {"x": np.random.random(size=(32, 32)).astype('float32'),
"y": np.random.randint(2, size=(32, 1)).astype('int64')}
def mlp(input_x, input_y, hid_dim=128, label_dim=2):
fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
sum_cost = fluid.layers.reduce_mean(cost)
return sum_cost, fc_1, prediction
input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
cost, fc_1, pred = mlp(input_x, input_y)
sgd = fluid.optimizer.Adam(learning_rate=0.01)
sgd = fluid.optimizer.RecomputeOptimizer(sgd)
sgd._set_checkpoints([fc_1, pred])
sgd.minimize(cost)
print("Finished optimize")
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
step = 10
for i in range(step):
cost_val = exe.run(feed=gen_data(),
program=fluid.default_main_program(),
fetch_list=[cost.name])
print("step=%d cost=%f" % (i, cost_val[0]))
"""
def __init__(self, optimizer):
self._optimizer = optimizer
self._checkpoints = None
def _set_checkpoints(self, checkpoints):
self._checkpoints = checkpoints
def load(self, stat_dict):
"""
The load function is not supported by the Recompute Optimizer for now.
:return: None
Args:
stat_dict: the dict loaded by the load_persistable method
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.compat as cpt
def mlp(input_x, input_y, hid_dim=128, label_dim=2):
fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
sum_cost = fluid.layers.reduce_mean(cost)
return sum_cost, fc_1, prediction
input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
cost, fc_1, pred = mlp(input_x, input_y)
print("Finished FF")
sgd = fluid.optimizer.Adam(learning_rate=0.01)
sgd = fluid.optimizer.RecomputeOptimizer(sgd)
sgd._set_checkpoints([fc_1, pred])
try:
stat_dict = {}
sgd.load(stat_dict)
except NotImplementedError as e:
print(cpt.get_exception_message(e))
"""
raise NotImplementedError(
"load function is not supported by Recompute Optimizer for now")
def apply_gradients(self, params_grads):
"""
Call the apply_gradients function of self._optimizer.
Args:
params_grads (list): list of (param, grad) pairs to do optimization.
Returns:
list: A list of operators appended to the current program.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.framework as framework
def mlp(input_x, input_y, hid_dim=128, label_dim=2):
fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
sum_cost = fluid.layers.reduce_mean(cost)
return sum_cost, fc_1, prediction
input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
cost, fc_1, pred = mlp(input_x, input_y)
print("Finished FF")
sgd = fluid.optimizer.Adam(learning_rate=0.01)
sgd = fluid.optimizer.RecomputeOptimizer(sgd)
params_grads = sgd.backward(
cost,
startup_program=None,
parameter_list=None,
no_grad_set=None,
checkpoints=[fc_1, pred])
program = cost.block.program
with framework.program_guard(program, None):
optimize_ops = sgd.apply_gradients(params_grads)
print("Finished apply gradients")
"""
return self._optimizer.apply_gradients(params_grads=params_grads)
def backward(self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None,
callbacks=None,
checkpoints=None):
"""
Call append_backward with checkpoints.
Args:
loss (Variable): loss variable to run optimizations.
startup_program (Program): startup_program for initializing parameters
in `parameter_list`.
parameter_list (list): list of Variables to update.
no_grad_set (set|None): set of Variables that should be ignored.
callbacks (list|None): list of callables to run when appending backward
operator for one parameter.
checkpoints (list): list of Variables as checkpoints
Examples:
.. code-block:: python
import paddle.fluid as fluid
def mlp(input_x, input_y, hid_dim=128, label_dim=2):
fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
sum_cost = fluid.layers.reduce_mean(cost)
return sum_cost, fc_1, prediction
input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
cost, fc_1, pred = mlp(input_x, input_y)
print("Finished FF")
sgd = fluid.optimizer.Adam(learning_rate=0.01)
sgd = fluid.optimizer.RecomputeOptimizer(sgd)
params_grads = sgd.backward(
cost,
startup_program=None,
parameter_list=None,
no_grad_set=None,
checkpoints=[fc_1, pred])
print("Finished backward")
"""
if framework.in_dygraph_mode():
raise NotImplementedError(
"DyGraph current does not support recompute")
self._dtype = loss.dtype
program = loss.block.program
with program_guard(program, startup_program):
params_grads = append_backward(
loss,
parameter_list,
no_grad_set,
checkpoints=self._checkpoints)
return params_grads
def apply_optimize(self, loss, startup_program, params_grads):
"""
Call the apply_optimize function of self._optimizer.
Args:
loss (Variable): loss variable to run optimizations.
startup_program (Program): startup_program for initializing parameters
in `parameter_list`.
params_grads (list): list of (param, grad) pairs to do optimization.
Examples:
.. code-block:: python
import paddle.fluid as fluid
def mlp(input_x, input_y, hid_dim=128, label_dim=2):
fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
sum_cost = fluid.layers.reduce_mean(cost)
return sum_cost, fc_1, prediction
input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
cost, fc_1, pred = mlp(input_x, input_y)
print("Finished FF")
sgd = fluid.optimizer.Adam(learning_rate=0.01)
sgd = fluid.optimizer.RecomputeOptimizer(sgd)
params_grads = sgd.backward(
cost,
startup_program=None,
parameter_list=None,
no_grad_set=None,
checkpoints=[fc_1, pred])
optimize_ops = sgd.apply_optimize(
cost, startup_program=None, params_grads=params_grads)
print("Finished apply_optimize")
"""
return self._optimizer.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads)
def minimize(self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None,
grad_clip=None):
assert (isinstance(loss, Variable)), "The loss should be an Variable."
assert (self._checkpoints is not None
), "You should call _set_checkpoints first"
if framework.in_dygraph_mode():
raise NotImplementedError(
"DyGraph current does not support recompute")
params_grads = self.backward(
loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set,
checkpoints=self._checkpoints)
if grad_clip:
# TODO(guru4elephant): should add grad_clip for static graph
pass
optimize_ops = self.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads)
return optimize_ops, params_grads
class LookaheadOptimizer(object):
"""
This implements the Lookahead optimizer of the
......
......@@ -18,6 +18,7 @@ import unittest
import paddle.fluid.framework as framework
import paddle.fluid.optimizer as optimizer
import paddle.compat as cpt
from paddle.fluid.backward import append_backward
......@@ -571,5 +572,154 @@ class TestLookaheadOptimizer(unittest.TestCase):
self.assertEqual([op.type for op in opts], ["scale", "sgd"])
class TestRecomputeOptimizer(unittest.TestCase):
def net(self):
program = framework.Program()
block = program.global_block()
mul_x = block.create_parameter(
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
b1 = block.create_parameter(
dtype="float32", shape=[5, 8], lod_level=0, name="b1")
b1_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="b1_out")
b2 = block.create_parameter(
dtype="float32", shape=[5, 8], lod_level=0, name="b2")
b2_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="b2_out")
mean_out = block.create_var(
dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op(
type="mul",
inputs={"X": mul_x,
"Y": mul_y},
outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1})
block.append_op(
type="elementwise_add",
inputs={"X": mul_out,
"Y": b1},
outputs={"Out": b1_out})
block.append_op(
type="elementwise_add",
inputs={"X": b1_out,
"Y": b2},
outputs={"Out": b2_out})
block.append_op(
type="mean", inputs={"X": b2_out}, outputs={"Out": mean_out})
return mul_out, b1_out, b2_out, mean_out
def test_no_checkpoint(self):
mul_out, b1_out, b2_out, mean_out = self.net()
self.assertEqual(len(mean_out.block.ops), 4)
self.assertEqual([op.type for op in mean_out.block.ops],
["mul", "elementwise_add", "elementwise_add", "mean"])
sgd_optimizer = optimizer.SGD(learning_rate=1.0)
recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer)
recompute_optimizer._set_checkpoints([])
opts, params_grads = recompute_optimizer.minimize(mean_out)
self.assertEqual(len(mean_out.block.ops), 12)
self.assertEqual([op.type for op in mean_out.block.ops], [
"mul", "elementwise_add", "elementwise_add", "mean",
"fill_constant", "mean_grad", "elementwise_add_grad",
"elementwise_add_grad", "mul_grad", "sgd", "sgd", "sgd"
])
def test_one_checkpoint(self):
mul_out, b1_out, b2_out, mean_out = self.net()
self.assertEqual(len(mean_out.block.ops), 4)
self.assertEqual([op.type for op in mean_out.block.ops],
["mul", "elementwise_add", "elementwise_add", "mean"])
sgd_optimizer = optimizer.SGD(learning_rate=1.0)
recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer)
recompute_optimizer._set_checkpoints([b1_out])
opts, params_grads = recompute_optimizer.minimize(mean_out)
self.assertEqual(len(mean_out.block.ops), 13)
self.assertEqual([op.type for op in mean_out.block.ops], [
"mul", "elementwise_add", "elementwise_add", "mean",
"fill_constant", "mean_grad", "elementwise_add_grad", "mul",
"elementwise_add_grad", "mul_grad", "sgd", "sgd", "sgd"
])
def test_multi_checkpoint(self):
mul_out, b1_out, b2_out, mean_out = self.net()
self.assertEqual(len(mean_out.block.ops), 4)
self.assertEqual([op.type for op in mean_out.block.ops],
["mul", "elementwise_add", "elementwise_add", "mean"])
sgd_optimizer = optimizer.SGD(learning_rate=1.0)
recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer)
recompute_optimizer._set_checkpoints([mul_out, b2_out])
opts, params_grads = recompute_optimizer.minimize(mean_out)
self.assertEqual(len(mean_out.block.ops), 13)
self.assertEqual([op.type for op in mean_out.block.ops], [
"mul", "elementwise_add", "elementwise_add", "mean",
"fill_constant", "mean_grad", "elementwise_add",
"elementwise_add_grad", "elementwise_add_grad", "mul_grad", "sgd",
"sgd", "sgd"
])
def test_adjacent_checkpoint(self):
mul_out, b1_out, b2_out, mean_out = self.net()
self.assertEqual(len(mean_out.block.ops), 4)
self.assertEqual([op.type for op in mean_out.block.ops],
["mul", "elementwise_add", "elementwise_add", "mean"])
sgd_optimizer = optimizer.SGD(learning_rate=1.0)
recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer)
recompute_optimizer._set_checkpoints([mul_out, b1_out])
opts, params_grads = recompute_optimizer.minimize(mean_out)
self.assertEqual(len(mean_out.block.ops), 12)
self.assertEqual([op.type for op in mean_out.block.ops], [
"mul", "elementwise_add", "elementwise_add", "mean",
"fill_constant", "mean_grad", "elementwise_add_grad",
"elementwise_add_grad", "mul_grad", "sgd", "sgd", "sgd"
])
def test_apply_gradients(self):
mul_out, b1_out, b2_out, mean_out = self.net()
sgd_optimizer = optimizer.SGD(learning_rate=1.0)
recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer)
recompute_optimizer._set_checkpoints([b1_out])
# apply backward
params_grads = recompute_optimizer.backward(
mean_out,
startup_program=None,
parameter_list=None,
no_grad_set=None,
checkpoints=[b1_out])
# apply gradient
program = mean_out.block.program
with framework.program_guard(program, None):
optimize_ops = recompute_optimizer.apply_gradients(params_grads)
self.assertEqual(len(mean_out.block.ops), 13)
self.assertEqual([op.type for op in mean_out.block.ops], [
"mul", "elementwise_add", "elementwise_add", "mean",
"fill_constant", "mean_grad", "elementwise_add_grad", "mul",
"elementwise_add_grad", "mul_grad", "sgd", "sgd", "sgd"
])
def test_load(self):
mul_out, b1_out, b2_out, mean_out = self.net()
sgd_optimizer = optimizer.SGD(learning_rate=1.0)
recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer)
recompute_optimizer._set_checkpoints([b1_out])
try:
stat_dict = {}
recompute_optimizer.load(stat_dict)
except NotImplementedError as e:
self.assertEqual(
"load function is not supported by Recompute Optimizer for now",
cpt.get_exception_message(e))
if __name__ == '__main__':
unittest.main()
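As a quick layer-level sanity check complementing the block-level unit tests above, the sketch below wraps Adam in a RecomputeOptimizer, as in the docstring examples, and prints the op types of the main program; the recomputed forward ops appear again among the *_grad ops, mirroring the patterns asserted in TestRecomputeOptimizer. The network and sizes are illustrative, and the exact op ordering can vary by version.

import paddle.fluid as fluid

x = fluid.layers.data(name="x", shape=[32], dtype="float32")
y = fluid.layers.data(name="y", shape=[1], dtype="int64")
fc_1 = fluid.layers.fc(input=x, size=128)
pred = fluid.layers.fc(input=[fc_1], size=2, act="softmax")
cost = fluid.layers.reduce_mean(
    fluid.layers.cross_entropy(input=pred, label=y))

opt = fluid.optimizer.Adam(learning_rate=0.01)
opt = fluid.optimizer.RecomputeOptimizer(opt)
opt._set_checkpoints([fc_1, pred])
opt.minimize(cost)

# Forward ops that get recomputed show up again between the *_grad ops.
print([op.type for op in fluid.default_main_program().global_block().ops])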