From 7d8d45991a7a744c72b7bce25b830177ea461d0d Mon Sep 17 00:00:00 2001 From: liym27 <33742067+liym27@users.noreply.github.com> Date: Sat, 4 Jan 2020 13:04:54 +0800 Subject: [PATCH] control flow: support optimizer called (#21851) * append optimize op in the grad block of current block if current block is in control flow. test=develop * add conditional grad op when optimizer used in control flow. test=develop * add comment and modify typo. test=develop * fix append_backward to support control flow. test=develop * add test. test=develop * fix copy_var_to_parent_block and conditional_block_grad. test=develop * fix bug: revert to append conditional_block_grad vars to sub grad block. test=develop * fix bug: revert to assign var to parent block even if var already is in parent block * fix bug: consider outputs is empty. test=develop * move _rename_grad_ out. test=develop * modify code according to reviews from Huihuang. test=develop * modify code according to reviews from Jinle. test=develop --- python/paddle/fluid/backward.py | 216 +++++++++++++---- python/paddle/fluid/framework.py | 8 + python/paddle/fluid/layers/control_flow.py | 91 ++++++- python/paddle/fluid/optimizer.py | 28 ++- .../paddle/fluid/tests/unittests/test_case.py | 48 ++++ .../test_optimizer_in_control_flow.py | 229 ++++++++++++++++++ 6 files changed, 558 insertions(+), 62 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 55c2520ed5d..317c4956435 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1009,6 +1009,17 @@ def _get_stop_gradients_(program): return no_grad_dict +def _get_son_parent_block_idx_dict(program, current_block_idx): + + son_parent_block_idx_dict = collections.OrderedDict() + while current_block_idx >= 0: + parent_block_idx = program.block(current_block_idx).parent_idx + son_parent_block_idx_dict[current_block_idx] = parent_block_idx + current_block_idx = parent_block_idx + + return son_parent_block_idx_dict + + def append_backward(loss, parameter_list=None, no_grad_set=None, @@ -1101,67 +1112,113 @@ def append_backward(loss, isinstance(callbacks, list) program = loss.block.program - program._appending_grad_times += 1 + root_block = program.block(0) + current_block_idx = program.current_block_idx + current_block = program.block(current_block_idx) + + is_in_control_flow = current_block_idx != 0 + + # Double grad is not supported in sub-block (control flow) + if not is_in_control_flow: + # _appending_grad_times used for double grad + program._appending_grad_times += 1 if no_grad_set is None: no_grad_set = set() no_grad_set = copy.copy(no_grad_set) no_grad_dict = _get_stop_gradients_(program) + # no_grad_set only contains vars in block 0 + # Todo(liym27): support vars in sub block no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set))) - root_block = program.block(0) + # Currently it is only to support the optimizer.minimize + # in a switch branch, which can append_backward in a sub_block. + # Note: while_loop is in control flow, but it makes no sense to call optimizer in while. + # Todo: report error when it is in while_loop + if is_in_control_flow: + # create grad block if in switch control flow. 
+ target_grad_block = program._create_block( + parent_idx=current_block.parent_idx) + target_grad_block._set_forward_block_idx(current_block_idx) + # after _create_block, program.current_block changes + else: + target_grad_block = root_block + + son_parent_block_idx_dict = _get_son_parent_block_idx_dict( + program, current_block_idx) + + block_fwd_op_num_dict = {} # block_id: fwd_op_num + for idx in son_parent_block_idx_dict: + block_fwd_op_num_dict[idx] = program.block(idx).desc.op_size() - fwd_op_num = root_block.desc.op_size() - current_block_idx = program.current_block_idx grad_to_var = dict() op_desc = _create_loss_op_desc_(loss) - root_block.desc.append_op().copy_from(op_desc) + target_grad_block.desc.append_op().copy_from(op_desc) - block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) - op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set) - no_grad_vars = _find_no_grad_vars(root_block, op_path, [loss], - block_no_grad_set) - block_no_grad_set.update(no_grad_vars) - no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set))) + for block_idx in son_parent_block_idx_dict: + block = program.block(block_idx) - input_grad_names_set = None - # For double backward, input_grad_names is used for filter - # some non-used gradients op. - if program._appending_grad_times > 1: - input_grad_names_set = set([_append_grad_suffix_(loss.name)]) - - - if checkpoints != None and \ - isinstance(checkpoints, list) and \ - len(checkpoints) > 0: - program_stat, checkpoint_names, \ - vars_should_be_hold, \ - recompute_segments = \ - _append_backward_ops_with_checkpoints_( - root_block, - op_path, - root_block, - no_grad_dict, - grad_to_var, - checkpoints) - else: - _append_backward_ops_( - root_block, - op_path, - root_block, - no_grad_dict, - grad_to_var, - callbacks, - input_grad_names_set=input_grad_names_set) + block_no_grad_set = set( + map(_strip_grad_suffix_, no_grad_dict[block_idx])) + op_path = _find_op_path_(block, [loss], [], block_no_grad_set) - # Because calc_gradient may be called multiple times, + no_grad_vars = _find_no_grad_vars(block, op_path, [loss], + block_no_grad_set) + + block_no_grad_set.update(no_grad_vars) + no_grad_dict[block_idx].update( + list(map(_append_grad_suffix_, block_no_grad_set))) + + input_grad_names_set = None + # For double backward, input_grad_names is used for filtering + # some non-used gradients op(s). + + # Todo(liym27): need a better design. + # not support double grad in control flow sub-block now. + if not is_in_control_flow: + if program._appending_grad_times > 1: + input_grad_names_set = set([_append_grad_suffix_(loss.name)]) + + # Todo: support _append_backward_ops_with_checkpoints_ in + # sub-block (control flow) + if checkpoints != None and \ + isinstance(checkpoints, list) and \ + len(checkpoints) > 0: + program_stat, checkpoint_names, \ + vars_should_be_hold, \ + recompute_segments = \ + _append_backward_ops_with_checkpoints_( + root_block, + op_path, + root_block, + no_grad_dict, + grad_to_var, + checkpoints) + else: + _append_backward_ops_( + block, # the block where forward ops are in + op_path, + target_grad_block, + no_grad_dict, + grad_to_var, + callbacks, + input_grad_names_set=input_grad_names_set) + + grad_info_map = dict() + + # if in control flow, target_grad_block is a created new block which only contains grad ops, + # so fwd_op_num is set to 0. 
+ fwd_op_num = block_fwd_op_num_dict[ + current_block_idx] if not is_in_control_flow else 0 + + # Because append_backward may be called multiple times, # we need rename the internal gradient variables so that they have # different names. - _rename_grad_(root_block, fwd_op_num, grad_to_var, {}) + _rename_grad_(target_grad_block, fwd_op_num, grad_to_var, {}) - grad_info_map = dict() - _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map) + _append_backward_vars_(target_grad_block, fwd_op_num, grad_to_var, + grad_info_map) program.current_block_idx = current_block_idx program._sync_with_cpp() @@ -1186,6 +1243,7 @@ def append_backward(loss, parameters = [param.name for param in params if param.trainable] params_and_grads = [] + op_role_var_attr_name = core.op_proto_and_checker_maker.kOpRoleVarAttrName() for param in parameters: if cpt.to_text(param) not in grad_info_map: continue @@ -1197,16 +1255,20 @@ def append_backward(loss, # Get the param var from the global block param_var = program.global_block().var(param) grad_var = grad_block.var(grad_info[0]) - if loss.block.has_var(grad_info[0]): - params_and_grads.append((param_var, grad_var)) + if not is_in_control_flow: + if loss.block.has_var(grad_info[0]): + params_and_grads.append((param_var, grad_var)) + else: + params_and_grads.append((param_var, None)) else: - params_and_grads.append((param_var, None)) + params_and_grads.append((param_var, grad_var)) - op_role_var_attr_name = core.op_proto_and_checker_maker.kOpRoleVarAttrName() for p, g in params_and_grads: if g is None: continue - for op in reversed(program.global_block().ops): + ops = grad_block.ops if is_in_control_flow else program.global_block( + ).ops + for op in reversed(ops): assert isinstance(op, framework.Operator) if g.name in op.output_arg_names: g.op = op @@ -1228,12 +1290,62 @@ def _as_list(x): return list(x) if isinstance(x, collections.Sequence) else [x] +def _is_ancestor_block(ancestor_block, block): + prog = block.program + ancestor_idx = ancestor_block.idx + parent_idx = block.parent_idx + + while parent_idx != -1: + if parent_idx == ancestor_idx: + return True + parent_idx = prog.block(parent_idx).parent_idx + + return False + + +def _get_output_names(cur_block, targets): + """ + In `cur_block`, get output names those linked to targets. + NOTE: + 1. `targets` can be in `cur_block`; + Usually, `targets` is in `cur_block`. However, considering control flow, + 2. `targets` may be in sub-block but `cur_block` is an ancestor of `targets[0].block`; + 3. `targets` may be in the block which is ancestor of `cur_block`. 
+ """ + + block = targets[0].block if targets else cur_block + prog = cur_block.program + if _is_ancestor_block(block, cur_block): + return set() + + current_output_names = set([out.name for out in targets]) + + # if `cur_block` is an ancestor of `targets[0].block`, run while loop + while block.idx != cur_block.idx: + assert block.parent_idx != -1 + parent_block = prog.block(block.parent_idx) + + parent_block_output_names = set() + for op in reversed(block.ops): + if _some_in_set_(op.desc.output_arg_names(), current_output_names): + for name in op.desc.input_arg_names(): + current_output_names.add(name) + if not block.desc.find_var(cpt.to_bytes(name)) \ + and parent_block.desc.find_var(cpt.to_bytes(name)): + parent_block_output_names.add(name) + + block = parent_block + current_output_names = parent_block_output_names + + return current_output_names + + def _find_no_grad_vars(block, op_path, targets, no_grad_set): """ Find the vars which is not used in the program, and - those var belong to no_grad_var. + those vars belong to no_grad_var. """ - output_names = set([out.name for out in targets]) + output_names = _get_output_names(block, targets) no_grad_var = [] for i, op in reversed(list(enumerate(op_path))): # If the op has sub_block, it is too complicated to find the correct no_grad_var. @@ -1253,7 +1365,7 @@ def _find_op_path_(block, outputs, inputs, no_grad_set): no_grad_set will also be changed """ input_names = set([inp.name for inp in inputs]) - output_names = set([out.name for out in outputs]) + output_names = _get_output_names(block, outputs) relevant_op_flags = [True] * len(block.ops) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index ebe78218ec2..e365a584e08 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2231,6 +2231,14 @@ class Block(object): """ self.desc._set_forward_block_idx(idx) + @property + def backward_block_idx(self): + cur_block_idx = self.idx + for block in self.program.blocks: + if block.forward_block_idx == cur_block_idx: + return block.idx + return -1 + @property def idx(self): return self.desc.id diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 72a515e5707..14cd8b200f7 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -28,6 +28,8 @@ import warnings import six from functools import reduce, partial from ..data_feeder import convert_dtype, check_type_and_dtype +from ... 
import compat as cpt +from ..backward import _infer_var_data_type_shape_ __all__ = [ 'While', 'Switch', 'increment', 'array_write', 'create_array', 'less_than', @@ -1799,6 +1801,9 @@ class ConditionalBlock(object): intermediate.add(out_var_name) input_set = set([ipt.name for ipt in self.inputs]) + # Todo(liym27) Here assume that all params are in recursive parent block + # but when minimize() called in control flow, some params may be in + # conditional grad block param_list = [ parent_block._var_recursive(each_name) for each_name in params ] @@ -1811,7 +1816,7 @@ class ConditionalBlock(object): step_scope = parent_block.create_var( type=core.VarDesc.VarType.STEP_SCOPES) - parent_block.append_op( + conditional_block_op = parent_block.append_op( type='conditional_block', inputs={ 'Cond': self.inputs, @@ -1824,6 +1829,90 @@ class ConditionalBlock(object): 'is_scalar_condition': self.is_scalar_condition }) + if self.need_append_conditional_block_grad(inside_block): + self.append_conditional_block_grad(parent_block, inside_block, + conditional_block_op) + + def need_append_conditional_block_grad(self, inside_block): + grad_sub_block_idx = inside_block.backward_block_idx + + return grad_sub_block_idx != -1 + + def append_conditional_block_grad(self, parent_block, inside_block, + conditional_block_op): + ''' + Append op `conditional_block_grad` manually. + When `optimizer.minimize/append_backward` is called in Paddle control flow, + grad ops will be appended before appending op `conditional_block` so that + op `conditional_block_grad` can't be appended when calling + `optimizer.minimize/append_backward`. After appending op `conditional_block`, + `conditional_block_grad` is appended manually. + + Args: + parent_block (Block): The block that `conditional_block_op` blongs to. + inside_block (Block): The sub block of `conditional_block_op`. + conditional_block_op (Operator): The forward op conditional_block. 
+ ''' + + grad_sub_block_idx = inside_block.backward_block_idx + grad_sub_block = self.helper.main_program.block(grad_sub_block_idx) + + intermediate = set() + params = set() + + for each_op in grad_sub_block.ops: + assert isinstance(each_op, Operator) + for iname in each_op.input_names: + for in_var_name in each_op.input(iname): + if in_var_name not in intermediate: + params.add(in_var_name) + + for oname in each_op.output_names: + for out_var_name in each_op.output(oname): + intermediate.add(out_var_name) + + param_list = [] + for inner_input_name in params: + inner_var = parent_block._find_var_recursive(inner_input_name) + if inner_var: + param_list.append(cpt.to_text(inner_var.name)) + + grad_op_desc, op_grad_to_var = core.get_grad_op_desc( + conditional_block_op.desc, + cpt.to_text(set()), [grad_sub_block.desc]) + + # append op_desc in grad_op_descs to target_block + op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() + backward = core.op_proto_and_checker_maker.OpRole.Backward + new_op_desc = parent_block.desc.append_op() + new_op_desc.copy_from(grad_op_desc[0]) + new_op_desc._set_attr(op_role_attr_name, backward) + # set input and output manually + new_op_desc.set_input('Input', param_list) + new_op_desc.set_output('Input@GRAD', + [param + "@GRAD" for param in param_list]) + + new_vars = set() + for grad_var_name in new_op_desc.output_arg_names(): + if grad_sub_block.desc.has_var_recursive( + cpt.to_bytes(grad_var_name) + ) or grad_var_name == core.empty_var_name(): + continue + grad_sub_block.desc.var(cpt.to_bytes(grad_var_name)) + new_vars.add(grad_var_name) + if grad_var_name not in op_grad_to_var: + continue + + # infer_shape and infer_type + new_op_desc.infer_var_type(grad_sub_block.desc) + new_op_desc.infer_shape(grad_sub_block.desc) + + for arg in new_op_desc.output_arg_names(): + if arg in new_vars: + _infer_var_data_type_shape_(arg, grad_sub_block) + + self.helper.main_program._sync_with_cpp() + def copy_var_to_parent_block(var, layer_helper): if var is None: diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 4f1a092c010..953398ecf1a 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -422,11 +422,22 @@ class Optimizer(object): # for parameters and extend _finish_update method to add custom ops. # Allways called under program_guard use global block as loss block + # But if current block is in control flow, append optimize op in the + # grad block of current block + global_block = framework.default_main_program().global_block() - start = len(global_block.ops) + target_block = global_block + current_block = framework.default_main_program().current_block() + if current_block.idx != global_block.idx: + assert current_block.backward_block_idx != -1, \ + "current block is not global_block, but it doesn't have backward block." 
+ target_block = framework.default_main_program().blocks[ + current_block.backward_block_idx] + + start = len(target_block.ops) self.helper = LayerHelper(self.__class__.__name__) self._create_accumulators( - global_block, + target_block, [p[0] for p in parameters_and_grads if p[0].trainable]) self._create_global_learning_rate() @@ -438,7 +449,7 @@ class Optimizer(object): with param_and_grad[0].block.program._optimized_guard( param_and_grad): if param_and_grad[0].trainable is True: - optimize_op = self._append_optimize_op(global_block, + optimize_op = self._append_optimize_op(target_block, param_and_grad) optimize_ops.append(optimize_op) else: @@ -448,16 +459,16 @@ class Optimizer(object): with param_and_grad[0].block.program._optimized_guard( param_and_grad), name_scope("optimizer"): if param_and_grad[0].trainable is True: - optimize_op = self._append_optimize_op(global_block, + optimize_op = self._append_optimize_op(target_block, param_and_grad) optimize_ops.append(optimize_op) # Get custom finish ops for subclasses # FIXME: Need to fix this once we figure out how to handle dependencies - self._finish_update(global_block, parameters_and_grads) + self._finish_update(target_block, parameters_and_grads) - end = len(global_block.ops) - return global_block._slice_ops(start, end) + end = len(target_block.ops) + return target_block._slice_ops(start, end) def _process_distribute_lookuptable(self, param_grads): """ @@ -1904,7 +1915,6 @@ class AdamaxOptimizer(Optimizer): """Update Beta1 Power accumulator """ assert isinstance(block, framework.Block) - main_block = block.program.global_block() for param, grad in parameters_and_grads: if grad is None or param.trainable is False: continue @@ -1912,7 +1922,7 @@ class AdamaxOptimizer(Optimizer): [param, grad]), name_scope('adamx'): beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, param) - main_block.append_op( + block.append_op( type="scale", inputs={"X": beta1_pow_acc}, outputs={"Out": beta1_pow_acc}, diff --git a/python/paddle/fluid/tests/unittests/test_case.py b/python/paddle/fluid/tests/unittests/test_case.py index fa73a5ec62f..722d5ef0862 100644 --- a/python/paddle/fluid/tests/unittests/test_case.py +++ b/python/paddle/fluid/tests/unittests/test_case.py @@ -22,6 +22,7 @@ import paddle.fluid.core as core import paddle.fluid.layers as layers from paddle.fluid.framework import Program, program_guard from functools import partial +import paddle.fluid.optimizer as optimizer class TestAPICase(unittest.TestCase): @@ -223,5 +224,52 @@ class TestAPICase_Error(unittest.TestCase): self.assertRaises(TypeError, type_error_default) +# when optimizer in case +class TestMutiTask(unittest.TestCase): + def test_optimizer_in_case(self): + BATCH_SIZE = 1 + INPUT_SIZE = 784 + EPOCH_NUM = 2 + + x = fluid.data( + name='x', shape=[BATCH_SIZE, INPUT_SIZE], dtype='float32') + y = fluid.data( + name='y', shape=[BATCH_SIZE, INPUT_SIZE], dtype='float32') + + switch_id = fluid.data(name='switch_id', shape=[1], dtype='int32') + + one = layers.fill_constant(shape=[1], dtype='int32', value=1) + adam = optimizer.Adam(learning_rate=0.001) + adagrad = optimizer.Adagrad(learning_rate=0.001) + + def fn_1(): + sum = layers.elementwise_mul(x, y) + loss = layers.mean(sum, name="f_1_loss") + adam.minimize(loss) + + def fn_2(): + sum = layers.elementwise_mul(x, y) + loss = layers.mean(sum, name="f_2_loss") + adagrad.minimize(loss) + + layers.case(pred_fn_pairs=[(switch_id == one, fn_1)], default=fn_2) + + exe = fluid.Executor(fluid.CPUPlace()) + 
exe.run(fluid.default_startup_program()) + + for epoch in range(EPOCH_NUM): + np.random.seed(epoch) + feed_image = np.random.random( + size=[BATCH_SIZE, INPUT_SIZE]).astype('float32') + main_program = fluid.default_main_program() + out = exe.run(main_program, + feed={ + 'x': feed_image, + 'y': feed_image, + 'switch_id': np.array([epoch]).astype('int32') + }, + fetch_list=[]) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py new file mode 100644 index 00000000000..63579ee80ac --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py @@ -0,0 +1,229 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest + +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.optimizer as optimizer +from paddle.fluid.framework import Program, program_guard +import paddle.fluid.core as core + +BATCH_SIZE = 1 +INPUT_SIZE = 784 +CLASS_NUM = 10 +FC_SIZE = 40 +EPOCH_NUM = 5 +LR = 0.001 +SEED = 2020 + + +def static(train_data, + loss_in_switch=True, + use_cuda=False, + use_parallel_exe=False): + startup_program = Program() + main_program = Program() + startup_program.random_seed = SEED + main_program.random_seed = SEED + + with program_guard(main_program, startup_program): + + def double_fc_net(image): + hidden = layers.fc( + image, + size=FC_SIZE, + act='relu', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.99)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.5)), + name="hidden") + + prediction = layers.fc( + hidden, + size=CLASS_NUM, + act='softmax', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.2)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.8)), + name="prediction") + return hidden, prediction + + def fn_1(opt, avg_loss=None, pred=None, label=None): + if avg_loss is None: + loss = layers.cross_entropy(input=pred, label=label) + avg_loss = layers.mean(loss, name='mean_cross_entropy_loss') + opt.minimize(avg_loss) + return avg_loss + + def fn_2(opt, avg_loss=None, pred=None, label=None): + if avg_loss is None: + loss = layers.softmax_with_cross_entropy( + logits=pred, label=label) + avg_loss = layers.mean(loss, name='mean_softmax_loss') + opt.minimize(avg_loss) + return avg_loss + + image = fluid.data('image', [BATCH_SIZE, INPUT_SIZE], 'float32') + label = fluid.data('label', [BATCH_SIZE, 1], 'int64') + hidden, prediction = double_fc_net(image) + + adam = optimizer.Adam(learning_rate=LR) + sgd = optimizer.SGD(learning_rate=LR) + + id = fluid.data('id', [1], 'int32') + two = layers.fill_constant([1], 'int32', 2) + mod_two = layers.elementwise_mod(id, two) == 0 + + if loss_in_switch: + avg_loss = layers.case([( + mod_two, lambda: 
fn_1(adam, None, prediction, label))], + lambda: fn_2(sgd, None, prediction, label)) + else: + loss_1 = layers.cross_entropy(input=prediction, label=label) + avg_loss_1 = layers.mean(loss_1) + loss_2 = layers.softmax_with_cross_entropy( + logits=prediction, label=label) + avg_loss_2 = layers.mean(loss_2) + avg_loss = layers.case([(mod_two, lambda: fn_1(adam, avg_loss_1))], + lambda: fn_2(sgd, avg_loss_2)) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + for epoch in range(EPOCH_NUM): + feed_image, feed_label = train_data[epoch] + fetch_list = [hidden, prediction, avg_loss] + feed = { + 'image': feed_image, + 'label': feed_label, + 'id': np.array([epoch]).astype('int32') + } + out = exe.run(main_program, feed=feed, fetch_list=fetch_list) + out_hidden, out_pred, loss = out + + return out_hidden, out_pred, loss + + +class DygraphLayer(fluid.dygraph.Layer): + def __init__(self): + super(DygraphLayer, self).__init__() + self.fc_1 = fluid.dygraph.nn.Linear( + INPUT_SIZE, + FC_SIZE, + act='relu', + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.99)), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.5)), ) + + self.fc_2 = fluid.dygraph.nn.Linear( + FC_SIZE, + CLASS_NUM, + act='softmax', + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=1.2)), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.8))) + + def forward(self, inputs): + hidden = self.fc_1(inputs) + prediction = self.fc_2(hidden) + return hidden, prediction + + +def dynamic(train_data, use_cuda=False, use_parallel_exe=False): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + with fluid.dygraph.guard(place): + fluid.default_startup_program().random_seed = SEED + fluid.default_main_program().random_seed = SEED + dy_layer = DygraphLayer() + adam = fluid.optimizer.Adam( + learning_rate=LR, parameter_list=dy_layer.parameters()) + sgd = fluid.optimizer.SGD(learning_rate=LR, + parameter_list=dy_layer.parameters()) + + for epoch in range(EPOCH_NUM): + image_data, label = train_data[epoch] + var_input = fluid.dygraph.to_variable(image_data) + var_label = fluid.dygraph.to_variable(label) + hidden, prediction = dy_layer(var_input) + + if epoch % 2 == 0: + cross_entropy_loss = layers.cross_entropy(prediction, var_label) + loss = layers.mean(cross_entropy_loss) + loss.backward() + adam.minimize(loss) + else: + softmax_loss = layers.softmax_with_cross_entropy(prediction, + var_label) + loss = layers.mean(softmax_loss) + loss.backward() + sgd.minimize(loss) + + dy_layer.clear_gradients() + return hidden.numpy(), prediction.numpy(), loss.numpy() + + +class TestMultiTask(unittest.TestCase): + ''' + Compare results of static graph and dynamic graph. + Todo(liym27): add parallel GPU train. 
+ ''' + + def random_input(self, + seed, + image_shape=[BATCH_SIZE, INPUT_SIZE], + label_shape=[BATCH_SIZE, 1]): + np.random.seed(seed) + image_np = np.random.random(size=image_shape).astype('float32') + np.random.seed(seed) + label_np = np.random.randint( + low=0, high=CLASS_NUM - 1, size=label_shape).astype('int64') + return image_np, label_np + + def init_train_data(self): + self.train_data = [] + for epoch in range(EPOCH_NUM): + self.train_data.append(self.random_input(epoch)) + + def test_optimzier_in_switch(self): + self.init_train_data() + use_cuda = core.is_compiled_with_cuda() + hidden_2, pre_2, loss_2 = dynamic(self.train_data, use_cuda) + for loss_in_switch in [True, False]: + hidden_1, pre_1, loss_1 = static(self.train_data, loss_in_switch, + use_cuda) + self.assertTrue( + np.allclose(hidden_1, hidden_2), + msg='static hidden is {}\ndynamic hidden is {}'.format( + hidden_1, hidden_2)) + self.assertTrue( + np.allclose(pre_1, pre_2), + msg='static prediction is {}\ndynamic prediction is {}'.format( + pre_1, pre_2)) + self.assertTrue( + np.allclose(loss_1, loss_2), + msg='static loss is {}\ndynamic loss is {}'.format(loss_1, + loss_2)) + + +if __name__ == '__main__': + unittest.main() -- GitLab