From d62360fe5f3cd5f8549ec16924696cb5d70cf919 Mon Sep 17 00:00:00 2001
From: mapingshuo
Date: Thu, 26 Sep 2019 09:59:21 +0800
Subject: [PATCH] fix doc of apply_optimize (#19965)

* fix doc of apply_optimize

test=document_fix
test=document_preview

* modify doc of backward

test=develop
test=document_fix

* modify document hash

test=develop
test=document_preview
---
 paddle/fluid/API.spec            |  2 +-
 python/paddle/fluid/backward.py  | 42 +++++++++++++++++++-------------
 python/paddle/fluid/optimizer.py |  5 ++--
 3 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 47b3319cade..62cb9d39187 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -1025,7 +1025,7 @@ paddle.fluid.optimizer.LookaheadOptimizer.minimize (ArgSpec(args=['self', 'loss'
 paddle.fluid.optimizer.RecomputeOptimizer ('paddle.fluid.optimizer.RecomputeOptimizer', ('document', '05769ba1182270f808f85488a50c8caa'))
 paddle.fluid.optimizer.RecomputeOptimizer.__init__ (ArgSpec(args=['self', 'optimizer'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.RecomputeOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '7838e157ec5ff4f835f814adf3a2b9cc'))
-paddle.fluid.optimizer.RecomputeOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'ec8dfa14fcd958d7c196f3d1a0ce6fa7'))
+paddle.fluid.optimizer.RecomputeOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '89c5348bfd78ad21f90b1da4af4b3cd1'))
 paddle.fluid.optimizer.RecomputeOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks', 'checkpoints'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'a26b3dbb0f63ee81d847d92e9fb942dc'))
 paddle.fluid.optimizer.RecomputeOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.RecomputeOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '7b2b8ae72011bc4decb67e97623f2c56'))
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 07d7c9d19df..07d69fadb95 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -554,8 +554,6 @@ def serialize_op_decs(op_desc):
 
 def _append_backward_ops_with_checkpoints_(
         block, ops, target_block, no_grad_dict, grad_to_var, checkpoints):
-
-    checkpoints_name = [x.name for x in checkpoints]
     """
     Create grad ops with forward ops, and insert them into given block
@@ -569,25 +567,27 @@
         checkpoints: variables that a user defined as checkpoint for forward recomputation
 
     Algorithms:
-        1) go through all forward ops and induct all checkpoint vars
-            a. input variables can be deduced from forward program
-            b. input variables are checkpoints
-            c. variables that are used across segments will be held in memory
-        2) find ops between checkpoints, i.e. recompute_segments
+        1) find ops between checkpoints, i.e. recompute_segments
+        2) go through all forward ops and collect all variables that will be held in memory
+            a. variables that are used across segments will be held in memory
+            b. output of dropout op will be held in memory
+            c. input variables will be held in memory
         3) go through each recompute_segments, add backward ops with forward recomputation
             a. add ops in current recompute_segment as forward recomputation ops
             b. rename all non-checkpoint variables in recomputation ops
-            c. add sum_op to merge gradient if needed
-            d. add backward ops of current recomputation ops
+            c. add backward ops of current recomputation ops
+            d. add sum op for repetitive_outputs
         4) remove no grad branch as it is in _remove_no_grad_branch_
         5) Note1: all appended ops' OpRole are Backward
-        6) Note2: variables that are used across segments will be held in memory
-        7) Note3: all variables with new name should be returned so that _append_backward_vars_ can be called
-        8) Note4: current forward recomputation backpropagation does not handle programs with subblock
+        6) Note2: all variables with new names should be returned so that _append_backward_vars_ can be called
+        7) Note3: current forward recomputation backpropagation does not handle programs with subblock
     """
+
+    checkpoints_name = [x.name for x in checkpoints]
     local_block = block.program._create_block()
     buffer_block = block.program._create_block()
+    # 1) find ops between checkpoints, i.e. recompute_segments
     program_stat = ProgramStats(block, ops)
     program_stat.build_stats()
     segments = []
@@ -622,11 +622,16 @@
         recompute_segments = [[0, segments[0][0]]] + segments
     else:
         recompute_segments = segments
+
+    # 2) go through all forward ops and collect all variables that will be held in memory
     vars_should_be_hold = []
+    # a. variables that are used across segments will be held in memory
     for segment in recompute_segments:
         vars_should_be_hold.extend(
             program_stat.get_out_of_subgraph_vars(segment[0], segment[1]))
+    # b. output of dropout op will be held in memory
     vars_should_be_hold.extend(program_stat.get_reserved_vars())
+    # c. input variables will be held in memory
     vars_should_be_hold.extend(program_stat.get_input_nodes())
     vars_should_be_hold = list(set(vars_should_be_hold))
@@ -634,6 +639,7 @@
     grad_should_be_hold = [x + "@GRAD" for x in vars_should_be_hold]
     vars_should_be_hold.extend(grad_should_be_hold)
 
+    # 3) go through each recompute_segments, add backward ops with forward recomputation
     grad_op_descs = []
     var_name_dict = {}
@@ -641,6 +647,8 @@
     max_calculated_op_position = len(ops)
     if recompute_segments == []:
+        # if there is no recompute segment, add backward ops just as the
+        # _append_backward_ops_ function does
         gap_ops = ops[0:max_calculated_op_position]
         for op in reversed(gap_ops):
             if op.has_attr("sub_block"):
@@ -686,30 +694,30 @@
                 continue
             if name not in var_name_dict:
                 var_name_dict[name] = name + var_suffix
+        # 3.a. add ops in current recompute_segment as forward recomputation ops
         buffer_descs = _add_needed_descs_to_block(ff_ops, buffer_block, block,
                                                   vars_in_memory)
         added_descs = _add_descs_to_block(ff_ops, local_block)
 
-        # rename variable names in added_descs
+        # 3.b. rename all non-checkpoint variables in recomputation ops
         for key in var_name_dict:
             _rename_arg_(buffer_descs, key, var_name_dict[key])
 
         # added_descs should be in grad_op_descs because it is backward op desc
         grad_op_descs.extend(buffer_descs)
 
-        #for op_desc in reversed(buffer_descs):
+        # 3.c. add backward ops of current recomputation ops
         for op_desc in reversed(added_descs):
-
             grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
                 op_desc, cpt.to_text(no_grad_dict[block.idx]), [])
-
             for key in var_name_dict:
                 _rename_arg_(grad_op_desc, key, var_name_dict[key])
-
             grad_op_descs.extend(grad_op_desc)
             grad_to_var.update(op_grad_to_var)
 
+    # 3.d. add sum op for repetitive_outputs
     grad_op_descs = _addup_repetitive_outputs_(grad_op_descs)
+    # 4) remove no grad branch as it is in _remove_no_grad_branch_
     grad_op_descs = _remove_no_grad_branch_(grad_op_descs,
                                             no_grad_dict[block.idx])
     added_descs = _add_descs_to_block(grad_op_descs, target_block)
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index a3690de36e2..5cb76e51294 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -3291,6 +3291,7 @@ class RecomputeOptimizer(Optimizer):
 
         Examples:
             .. code-block:: python
+
                 import paddle.fluid as fluid
 
                 def mlp(input_x, input_y, hid_dim=128, label_dim=2):
@@ -3298,8 +3299,7 @@ class RecomputeOptimizer(Optimizer):
                     prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
                     cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
                     sum_cost = fluid.layers.reduce_mean(cost)
-                    return sum_cost, fc_1, prediction
-
+                    return sum_cost, fc_1, prediction
 
                 input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
                 input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
@@ -3319,6 +3319,7 @@ class RecomputeOptimizer(Optimizer):
                     cost, startup_program=None, params_grads=params_grads)
                 print("Finished apply_optimize")
+
         """
 
         return self._optimizer.apply_optimize(
--
GitLab
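
For readers tracing the algorithm comments documented in this patch, the sketch below walks through step 1) (finding recompute segments between checkpoints) and rule 2.a) (holding variables that are used across segments) on a toy op list. It is a minimal illustration under assumed data structures: the (name, inputs, outputs) op tuples and the function names find_recompute_segments and vars_held_in_memory are hypothetical and are not part of Paddle's ProgramStats API.

    # Minimal sketch of recompute segmentation; NOT Paddle's implementation.
    def find_recompute_segments(ops, checkpoints):
        # Step 1: op ranges [start, end) delimited by checkpoint outputs.
        boundaries = [0]
        for idx, (_, _, outputs) in enumerate(ops):
            if any(out in checkpoints for out in outputs):
                boundaries.append(idx + 1)
        return [[s, e] for s, e in zip(boundaries, boundaries[1:]) if e > s]

    def vars_held_in_memory(ops, segments):
        # Step 2, rule (a) only: a variable produced inside a segment but
        # consumed after that segment ends is used across segments, so it
        # must stay in memory. Rules (b) dropout outputs and (c) program
        # inputs are omitted to keep the sketch short.
        held = set()
        for start, end in segments:
            produced = {o for _, _, outs in ops[start:end] for o in outs}
            consumed_later = {i for _, ins, _ in ops[end:] for i in ins}
            held |= produced & consumed_later
        return held

    # Toy forward program: x -> fc_0 -> fc_1 -> softmax, with a checkpoint
    # on fc_1's output.
    ops = [
        ("fc",      ["x"],        ["fc_0.tmp"]),
        ("fc",      ["fc_0.tmp"], ["fc_1.tmp"]),
        ("softmax", ["fc_1.tmp"], ["prediction"]),
    ]
    print(find_recompute_segments(ops, ["fc_1.tmp"]))  # [[0, 2]]
    print(vars_held_in_memory(ops, [[0, 2]]))          # {'fc_1.tmp'}

In the real code, ProgramStats.build_stats() plays roughly the role of the produced/consumed bookkeeping above, while get_out_of_subgraph_vars, get_reserved_vars and get_input_nodes supply rules a, b and c respectively.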