From dca075839bc5b580a3cb3d7faa9e784e5206bc38 Mon Sep 17 00:00:00 2001 From: zhongpu <2013000149@qq.com> Date: Fri, 27 Dec 2019 16:32:37 +0800 Subject: [PATCH] remove params in Tracer object (in dygraph) (#20815) * remove params in Tracer object, test=develop * Repair failed optest, test=develop * remove build_once & name_scope (Conv2D) test=develop * fix unittest test=develop * Conv2DTranspose * Conv3D & Conv3DTranspose test=develop * Pool2D & BatchNorm * Embedding * LayerNorm * GRUUnit & NCE * PRelu * BilinearTensorProduct * GroupNorm & SpectralNorm * TreeConv test=develop * fix LayerNorm in transformer unnittest test=develop * disable LayerNorm or BatchNorm in multicard test=develop * refine Layer.create_parameter api test=develop * refine LayerNorm, remove begin_norm_axis param, add normed shape check test=develop * LayerNorm bug fix test=develop * fix optest,test=develop * fix optest, test=develop * fix optest for pass parameter_list when constructing an Optimizer class instance, test=develop * polish code for better code style, test=develop * fix se_resnext optest, test=develop * polish code for better code style, test=develop Co-authored-by: songyouwei --- python/paddle/fluid/dygraph/base.py | 4 +- python/paddle/fluid/dygraph/checkpoint.py | 6 +- python/paddle/fluid/dygraph/layers.py | 9 +- .../fluid/dygraph/learning_rate_scheduler.py | 16 ++- python/paddle/fluid/dygraph/tracer.py | 8 -- python/paddle/fluid/framework.py | 2 - python/paddle/fluid/optimizer.py | 97 +++++++++++++-- .../tests/unittests/parallel_dygraph_mnist.py | 29 ++--- .../unittests/parallel_dygraph_se_resnext.py | 65 ++++++----- .../unittests/test_dygraph_multi_forward.py | 33 +++--- .../unittests/test_imperative_auto_prune.py | 14 ++- .../unittests/test_imperative_debug_string.py | 4 +- .../tests/unittests/test_imperative_deepcf.py | 54 +++++---- .../tests/unittests/test_imperative_gan.py | 52 +++++---- .../tests/unittests/test_imperative_gnn.py | 6 +- ..._imperative_lod_tensor_to_selected_rows.py | 4 +- .../tests/unittests/test_imperative_mnist.py | 31 ++--- .../test_imperative_mnist_sorted_gradient.py | 7 +- .../test_imperative_ocr_attention_model.py | 110 ++++++++---------- .../unittests/test_imperative_optimizer.py | 84 +++++++++++-- .../test_imperative_partitial_backward.py | 3 +- .../unittests/test_imperative_ptb_rnn.py | 15 +-- ...test_imperative_ptb_rnn_sorted_gradient.py | 5 +- .../test_imperative_reinforcement.py | 3 +- .../tests/unittests/test_imperative_resnet.py | 45 ++++--- .../test_imperative_resnet_sorted_gradient.py | 25 ++-- .../unittests/test_imperative_save_load.py | 33 ++++-- .../unittests/test_imperative_se_resnext.py | 83 +++++++------ .../test_imperative_selected_rows.py | 36 +++--- ..._imperative_selected_rows_to_lod_tensor.py | 4 +- ..._imperative_transformer_sorted_gradient.py | 106 ++++++----------- .../fluid/tests/unittests/test_layers.py | 6 +- 32 files changed, 590 insertions(+), 409 deletions(-) diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 2a67a5ddd3f..af0e79929f8 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -154,14 +154,14 @@ def guard(place=None): yield -def _print_debug_msg(limit=5, is_test=False): +def _print_debug_msg(parameter_list, limit=5, is_test=False): if not core._is_dygraph_debug_enabled(): logging.warn( 'Debug mode is not enabled. 
Please set FLAGS_dygraph_debug=1 to enable debug' ) return unique_name_size = len(framework.unique_name.generator.ids) - tracer_var_size = len(framework._dygraph_tracer()._vars) + tracer_var_size = len(parameter_list) alive_cpp_var_size = len(core.VarBase._alive_vars()) if not is_test: logging.warn( diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index 08a62454925..1772620c386 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -53,7 +53,8 @@ def save_dygraph(state_dict, model_path): state_dict = emb.state_dict() fluid.save_dygraph( state_dict, "paddle_dy") - adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000) ) + adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000), + parameter_list = emb.parameters() ) state_dict = adam.state_dict() fluid.save_dygraph( state_dict, "paddle_dy") @@ -96,7 +97,8 @@ def load_dygraph(model_path): state_dict = emb.state_dict() fluid.save_dygraph( state_dict, "paddle_dy") - adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000) ) + adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000), + parameter_list = emb.parameters() ) state_dict = adam.state_dict() fluid.save_dygraph( state_dict, "padle_dy") diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 4d5f9e3ba9f..6461a7ba8d0 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -145,9 +145,13 @@ class Layer(core.Layer): list of :ref:`api_guide_Variable_en` : a list of Parameters. """ ret = [p for p in self._parameters.values()] + parameters_set = set(ret) if include_sublayers: for l in self._sub_layers.values(): for p in l.parameters(include_sublayers): + if p in parameters_set: + continue + parameters_set.add(p) ret.append(p) return ret @@ -261,11 +265,6 @@ class Layer(core.Layer): value.set_value(self._loaddict_holder[value.name]) - if name in params: - # remove unused param in tracer - if framework._dygraph_tracer_ is not None: - framework._dygraph_tracer_._vars.pop(params[name].name, - None) params[name] = value elif isinstance(value, core.Layer): layers = self.__dict__.get('_sub_layers', None) diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index bc7c694cee7..45e8959ef82 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -104,8 +104,10 @@ class PiecewiseDecay(LearningRateDecay): boundaries = [10000, 20000] values = [1.0, 0.5, 0.1] with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding( [10, 10] ) optimizer = fluid.optimizer.SGD( - learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0) ) + learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0), + parameter_list = emb.parameters() ) """ def __init__(self, boundaries, values, begin, step=1, dtype='float32'): @@ -323,12 +325,14 @@ class InverseTimeDecay(LearningRateDecay): import paddle.fluid as fluid base_lr = 0.1 with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.dygraph.InverseTimeDecay( learning_rate=base_lr, decay_steps=10000, decay_rate=0.5, - staircase=True)) + staircase=True), + parameter_list = emb.parameters()) """ @@ -404,9 +408,11 @@ class PolynomialDecay(LearningRateDecay): total_step = 5000 
end_lr = 0 with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding( [10, 10]) optimizer = fluid.optimizer.SGD( learning_rate = fluid.dygraph.PolynomialDecay( - start_lr, total_step, end_lr, power=1.0) ) + start_lr, total_step, end_lr, power=1.0), + parameter_list = emb.parameters()) """ @@ -536,10 +542,12 @@ class NoamDecay(LearningRateDecay): warmup_steps = 100 learning_rate = 0.01 with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) optimizer = fluid.optimizer.SGD( learning_rate = fluid.dygraph.NoamDecay( 1/(warmup_steps *(learning_rate ** 2)), - warmup_steps) ) + warmup_steps), + parameter_list = emb.parameters()) """ def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'): diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index 799f9423a1d..a753e465c63 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -31,16 +31,8 @@ class Tracer(core.Tracer): def __init__(self): super(Tracer, self).__init__() - self._vars = defaultdict() self._train_mode = True - def trace_var(self, name, var): - self._vars[name] = var - - def all_parameters(self): - return list((item for name, item in six.iteritems(self._vars) - if isinstance(item, framework.Parameter))) - def trace_op(self, type, inputs, outputs, attrs, stop_gradient=False): self.trace(type, inputs, outputs, attrs, framework._current_expected_place(), self._train_mode and diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index aabb288d45a..ebe78218ec2 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -4676,8 +4676,6 @@ class ParamBase(core.VarBase): # self.block = default_main_program().global_block() - _dygraph_tracer().trace_var(name, self) - def __str__(self): return self.to_string(True) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index fdec2d808d1..487dc677d12 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -60,7 +60,12 @@ class Optimizer(object): """ @imperative_base.no_grad - def __init__(self, learning_rate, regularization=None, name=None): + def __init__(self, + learning_rate, + parameter_list=None, + regularization=None, + name=None): + self._parameter_list = None if framework.in_dygraph_mode(): if not isinstance(learning_rate, float) and \ not isinstance(learning_rate, LearningRateDecay): @@ -71,6 +76,12 @@ class Optimizer(object): self._name = unique_name.generate(name) else: self._name = unique_name.generate(self.__class__.__name__) + if parameter_list is not None: + self._parameter_list = parameter_list + else: + raise AttributeError( + "parameter_list argument given to the Optimizer should not be None in dygraph mode." 
+ ) else: if not isinstance(learning_rate, float) and \ not isinstance(learning_rate, framework.Variable): @@ -154,7 +165,8 @@ class Optimizer(object): state_dict = emb.state_dict() fluid.save_dygraph( state_dict, "paddle_dy") - adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000) ) + adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000), + parameter_list = emb.parameters() ) state_dict = adam.state_dict() fluid.save_dygraph( state_dict, "padle_dy") @@ -530,13 +542,8 @@ class Optimizer(object): self._dtype = loss.dtype if framework.in_dygraph_mode(): - if parameter_list is not None: - parameters = parameter_list - else: - parameters = framework._dygraph_tracer().all_parameters() - params_grads = [] - for param in parameters: + for param in self._parameter_list: if not param.trainable: continue if param._grad_ivar() is not None: @@ -705,6 +712,9 @@ class SGDOptimizer(Optimizer): Parameters: learning_rate (float|Variable): The learning rate used to update parameters. \ Can be a float value or a Variable with one float value as data element. + parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. \ Optional, default is None. name (str, optional): This parameter is used by developers to print debugging information. \ @@ -740,10 +750,15 @@ class SGDOptimizer(Optimizer): """ - def __init__(self, learning_rate, regularization=None, name=None): + def __init__(self, + learning_rate, + parameter_list=None, + regularization=None, + name=None): assert learning_rate is not None super(SGDOptimizer, self).__init__( learning_rate=learning_rate, + parameter_list=parameter_list, regularization=regularization, name=name) self.type = "sgd" @@ -801,6 +816,9 @@ class MomentumOptimizer(Optimizer): learning_rate (float|Variable): The learning rate used to update parameters. \ Can be a float value or a Variable with one float value as data element. momentum (float): Momentum factor + parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. use_nesterov (bool, optional): Enables Nesterov momentum, default is false. regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. \ Optional, default is None. @@ -841,6 +859,7 @@ class MomentumOptimizer(Optimizer): def __init__(self, learning_rate, momentum, + parameter_list=None, use_nesterov=False, regularization=None, name=None): @@ -848,6 +867,7 @@ class MomentumOptimizer(Optimizer): assert momentum is not None super(MomentumOptimizer, self).__init__( learning_rate=learning_rate, + parameter_list=parameter_list, regularization=regularization, name=name) self.type = "momentum" @@ -921,6 +941,9 @@ class DGCMomentumOptimizer(Optimizer): sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity). \ Default is [0.999]. For example, if the sparsity is [0.99, 0.999], \ the top [1%, 0.1%] important element will be transmitted. + parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. 
\ + The default value is None in static mode, at this time all parameters will be updated. use_nesterov (bool): Enables Nesterov momentum. True means use Nesterov. Default is False. local_grad_clip_norm (float, optional): Local gradient clip norm value. Optional, default is None, represent no need clip. num_trainers (int, optional): The number of training nodes. Optional, default is None. @@ -950,6 +973,7 @@ class DGCMomentumOptimizer(Optimizer): rampup_begin_step, rampup_step=1, sparsity=[0.999], + parameter_list=None, use_nesterov=False, local_grad_clip_norm=None, num_trainers=None, @@ -959,6 +983,7 @@ class DGCMomentumOptimizer(Optimizer): assert momentum is not None super(DGCMomentumOptimizer, self).__init__( learning_rate=learning_rate, + parameter_list=parameter_list, regularization=regularization, name=name) self.type = "dgc_momentum" @@ -1286,6 +1311,9 @@ class LarsMomentumOptimizer(Optimizer): momentum (float): momentum factor lars_coeff (float): Defines how much we trust the layer to change its weights. lars_weight_decay (float): Weight decay coefficient for decaying using LARS. + parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. Optional, default is None. name (str, optional): This parameter is used by developers to print debugging information. \ @@ -1318,12 +1346,14 @@ class LarsMomentumOptimizer(Optimizer): momentum, lars_coeff=0.001, lars_weight_decay=0.0005, + parameter_list=None, regularization=None, name=None): assert learning_rate is not None assert momentum is not None super(LarsMomentumOptimizer, self).__init__( learning_rate=learning_rate, + parameter_list=parameter_list, regularization=regularization, name=name) self.type = "lars_momentum" @@ -1391,6 +1421,9 @@ class AdagradOptimizer(Optimizer): It can be a float value or a ``Variable`` with a float type. epsilon (float, optional): A small float value for numerical stability. The default value is 1e-06. + parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None. name (str, optional): Normally there is no need for user to set this property. @@ -1423,6 +1456,7 @@ class AdagradOptimizer(Optimizer): def __init__(self, learning_rate, epsilon=1.0e-6, + parameter_list=None, regularization=None, name=None, initial_accumulator_value=0.0): @@ -1430,6 +1464,7 @@ class AdagradOptimizer(Optimizer): assert epsilon is not None super(AdagradOptimizer, self).__init__( learning_rate=learning_rate, + parameter_list=parameter_list, regularization=regularization, name=name) self.type = "adagrad" @@ -1510,6 +1545,9 @@ class AdamOptimizer(Optimizer): The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. The default value is 1e-08. + parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. 
regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None. name (str, optional): Normally there is no need for user to set this property. @@ -1619,6 +1657,7 @@ class AdamOptimizer(Optimizer): beta1=0.9, beta2=0.999, epsilon=1e-8, + parameter_list=None, regularization=None, name=None, lazy_mode=False): @@ -1628,6 +1667,7 @@ class AdamOptimizer(Optimizer): assert epsilon is not None super(AdamOptimizer, self).__init__( learning_rate=learning_rate, + parameter_list=parameter_list, regularization=regularization, name=name) self.type = "adam" @@ -1747,6 +1787,9 @@ class AdamaxOptimizer(Optimizer): The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. The default value is 1e-08. + parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None. name (str, optional): Normally there is no need for user to set this property. @@ -1792,6 +1835,7 @@ class AdamaxOptimizer(Optimizer): beta1=0.9, beta2=0.999, epsilon=1e-8, + parameter_list=None, regularization=None, name=None): assert learning_rate is not None @@ -1800,6 +1844,7 @@ class AdamaxOptimizer(Optimizer): assert epsilon is not None super(AdamaxOptimizer, self).__init__( learning_rate=learning_rate, + parameter_list=parameter_list, regularization=regularization, name=name) self.type = "adamax" @@ -1909,6 +1954,9 @@ class DpsgdOptimizer(Optimizer): clip (float): clipping threshold batch_size (float): batch size. sigma (float): for gaussian noise. + parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. Notes: Currently, DpsgdOptimizer doesn't support sparse parameter optimization. """ @@ -1917,12 +1965,14 @@ class DpsgdOptimizer(Optimizer): learning_rate=0.001, clip=0.9, batch_size=0.999, - sigma=1e-8): + sigma=1e-8, + parameter_list=None): assert learning_rate is not None assert clip is not None assert batch_size is not None assert sigma is not None - super(DpsgdOptimizer, self).__init__(learning_rate=learning_rate) + super(DpsgdOptimizer, self).__init__( + learning_rate=learning_rate, parameter_list=parameter_list) self.type = "dpsgd" self._clip = clip self._batch_size = batch_size @@ -1976,6 +2026,9 @@ class DecayedAdagradOptimizer(Optimizer): decay (float, optional): The decay rate. The default value is 0.95. epsilon (float, optional): A small float value for numerical stability. The default value is 1e-06. + parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None. name (str, optional): Normally there is no need for user to set this property. 
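The optimizer changes in this file all follow one usage pattern: in dygraph mode the optimizer no longer collects parameters from the global Tracer, so the caller passes them explicitly through ``parameter_list``. Below is a minimal sketch of that pattern, assuming only the fluid dygraph API already shown in this patch (the layer, its sizes, and the input data are illustrative, not part of the change itself):

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.dygraph import Linear, to_variable

    with fluid.dygraph.guard():
        fc = Linear(4, 2)                             # any dygraph Layer
        sgd = fluid.optimizer.SGD(
            learning_rate=1e-3,
            parameter_list=fc.parameters())           # required in dygraph mode after this patch
        x = to_variable(np.random.rand(3, 4).astype('float32'))
        loss = fluid.layers.reduce_mean(fc(x))
        loss.backward()
        sgd.minimize(loss)                            # updates only fc.parameters()
        fc.clear_gradients()

In static-graph mode ``parameter_list`` stays optional and defaults to None, in which case all parameters of the program are updated, matching the docstring text added to each optimizer above.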
@@ -2002,6 +2055,7 @@ class DecayedAdagradOptimizer(Optimizer): learning_rate, decay=0.95, epsilon=1.0e-6, + parameter_list=None, regularization=None, name=None): assert learning_rate is not None @@ -2010,6 +2064,7 @@ class DecayedAdagradOptimizer(Optimizer): super(DecayedAdagradOptimizer, self).__init__( learning_rate=learning_rate, + parameter_list=parameter_list, regularization=regularization, name=name) self.type = "decayed_adagrad" @@ -2066,6 +2121,9 @@ class AdadeltaOptimizer(Optimizer): learning_rate (float|Variable): global learning rate. epsilon (float): a small float number for numeric stability. Default 1.0e-6. rho (float): a floating point value indicating the decay rate. Default 0.95. + parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. regularization (WeightDecayRegularizer, optional): A Regularizer, such as fluid.regularizer.L2DecayRegularizer. Default None, meaning that there is no regularization. @@ -2097,6 +2155,7 @@ class AdadeltaOptimizer(Optimizer): learning_rate, epsilon=1.0e-6, rho=0.95, + parameter_list=None, regularization=None, name=None): if learning_rate is None: @@ -2107,6 +2166,7 @@ class AdadeltaOptimizer(Optimizer): raise ValueError("rho is not set.") super(AdadeltaOptimizer, self).__init__( learning_rate=learning_rate, + parameter_list=parameter_list, regularization=regularization, name=name) self.type = "adadelta" @@ -2210,6 +2270,9 @@ class RMSPropOptimizer(Optimizer): the gradient; if False, by the uncentered second moment. Setting this to True may help with training, but is slightly more expensive in terms of computation and memory. Defaults to False. + parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. \ Optional, default is None. name (str, optional): This parameter is used by developers to print debugging information. \ @@ -2258,10 +2321,12 @@ class RMSPropOptimizer(Optimizer): epsilon=1.0e-6, momentum=0.0, centered=False, + parameter_list=None, regularization=None, name=None): super(RMSPropOptimizer, self).__init__( learning_rate=learning_rate, + parameter_list=parameter_list, regularization=regularization, name=name) if learning_rate is None: @@ -2370,6 +2435,9 @@ class FtrlOptimizer(Optimizer): l1 (float): L1 regularization strength, default is 0.0. l2 (float): L2 regularization strength, default is 0.0. lr_power (float): Learning Rate Power, default is -0.5. + parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. \ Optional, default is None. name (str, optional): This parameter is used by developers to print debugging information. 
\ @@ -2418,10 +2486,12 @@ class FtrlOptimizer(Optimizer): l1=0.0, l2=0.0, lr_power=-0.5, + parameter_list=None, regularization=None, name=None): super(FtrlOptimizer, self).__init__( learning_rate=learning_rate, + parameter_list=parameter_list, regularization=regularization, name=name) if learning_rate is None: @@ -2504,6 +2574,9 @@ class LambOptimizer(AdamOptimizer): beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. Default 0.999. epsilon (float, optional): A small float value for numerical stability. Default 1e-6. + parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. regularization (Regularizer|None): A Regularizer, such as fluid.regularizer.L1DecayRegularizer. Default None. exclude_from_weight_decay_fn (function|None): Exclude a parameter from weight @@ -2540,6 +2613,7 @@ class LambOptimizer(AdamOptimizer): beta1=0.9, beta2=0.999, epsilon=1e-6, + parameter_list=None, regularization=None, exclude_from_weight_decay_fn=None, name=None): @@ -2550,6 +2624,7 @@ class LambOptimizer(AdamOptimizer): assert epsilon is not None super(LambOptimizer, self).__init__( learning_rate=learning_rate, + parameter_list=parameter_list, regularization=regularization, beta1=beta1, beta2=beta2, diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py index 830cd3097dd..aff13f0b555 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py @@ -26,7 +26,7 @@ import paddle.fluid as fluid import paddle.fluid.dygraph as dygraph from paddle.fluid import core from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear from paddle.fluid.dygraph.base import to_variable from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase @@ -79,8 +79,8 @@ class SimpleImgConvPool(fluid.dygraph.Layer): class MNIST(fluid.dygraph.Layer): - def __init__(self, name_scope): - super(MNIST, self).__init__(name_scope) + def __init__(self): + super(MNIST, self).__init__() self._simple_img_conv_pool_1 = SimpleImgConvPool( 1, 20, 5, 2, 2, act="relu") @@ -88,19 +88,21 @@ class MNIST(fluid.dygraph.Layer): self._simple_img_conv_pool_2 = SimpleImgConvPool( 20, 50, 5, 2, 2, act="relu") - pool_2_shape = 50 * 4 * 4 + self.pool_2_shape = 50 * 4 * 4 SIZE = 10 - scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 - self._fc = FC(self.full_name(), - 10, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale)), - act="softmax") + scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5 + self._fc = Linear( + self.pool_2_shape, + 10, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale)), + act="softmax") def forward(self, inputs, label): x = self._simple_img_conv_pool_1(inputs) x = self._simple_img_conv_pool_2(x) + x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape]) cost = self._fc(x) loss = fluid.layers.cross_entropy(cost, label) avg_loss = fluid.layers.mean(loss) @@ -109,10 +111,11 @@ class MNIST(fluid.dygraph.Layer): class TestMnist(TestParallelDyGraphRunnerBase): def get_model(self): - model = MNIST("mnist") + model = MNIST() train_reader = paddle.batch( 
paddle.dataset.mnist.train(), batch_size=2, drop_last=True) - opt = fluid.optimizer.Adam(learning_rate=1e-3) + opt = fluid.optimizer.Adam( + learning_rate=1e-3, parameter_list=model.parameters()) return model, train_reader, opt def run_one_loop(self, model, opt, data): diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py index 525855ceaa1..4f35b02194f 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py @@ -27,7 +27,7 @@ import paddle.fluid as fluid import paddle.fluid.dygraph as dygraph from paddle.fluid import core from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm from paddle.fluid.dygraph.base import to_variable from paddle.fluid.layer_helper import LayerHelper import math @@ -54,7 +54,7 @@ train_parameters = { } -def optimizer_setting(params): +def optimizer_setting(params, parameter_list=None): ls = params["learning_strategy"] if "total_images" not in params: total_images = 6149 @@ -66,11 +66,19 @@ def optimizer_setting(params): bd = [step * e for e in ls["epochs"]] lr = params["lr"] num_epochs = params["num_epochs"] - optimizer = fluid.optimizer.Momentum( - learning_rate=fluid.layers.cosine_decay( - learning_rate=lr, step_each_epoch=step, epochs=num_epochs), - momentum=momentum_rate, - regularization=fluid.regularizer.L2Decay(l2_decay)) + if fluid.in_dygraph_mode(): + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.cosine_decay( + learning_rate=lr, step_each_epoch=step, epochs=num_epochs), + momentum=momentum_rate, + regularization=fluid.regularizer.L2Decay(l2_decay), + parameter_list=parameter_list) + else: + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.cosine_decay( + learning_rate=lr, step_each_epoch=step, epochs=num_epochs), + momentum=momentum_rate, + regularization=fluid.regularizer.L2Decay(l2_decay)) return optimizer @@ -107,27 +115,29 @@ class ConvBNLayer(fluid.dygraph.Layer): class SqueezeExcitation(fluid.dygraph.Layer): - def __init__(self, name_scope, num_channels, reduction_ratio): + def __init__(self, num_channels, reduction_ratio): - super(SqueezeExcitation, self).__init__(name_scope) + super(SqueezeExcitation, self).__init__() + self._num_channels = num_channels self._pool = Pool2D(pool_size=0, pool_type='avg', global_pooling=True) stdv = 1.0 / math.sqrt(num_channels * 1.0) - self._squeeze = FC( - self.full_name(), - size=num_channels // reduction_ratio, + self._squeeze = Linear( + num_channels, + num_channels // reduction_ratio, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform(-stdv, stdv)), act='relu') stdv = 1.0 / math.sqrt(num_channels / 16.0 * 1.0) - self._excitation = FC( - self.full_name(), - size=num_channels, + self._excitation = Linear( + num_channels // reduction_ratio, + num_channels, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform(-stdv, stdv)), act='sigmoid') def forward(self, input): y = self._pool(input) + y = fluid.layers.reshape(y, shape=[-1, self._num_channels]) y = self._squeeze(y) y = self._excitation(y) y = fluid.layers.elementwise_mul(x=input, y=y, axis=0) @@ -163,9 +173,7 @@ class BottleneckBlock(fluid.dygraph.Layer): act=None) self.scale = SqueezeExcitation( - self.full_name(), - num_channels=num_filters * 2, - reduction_ratio=reduction_ratio) 
+ num_channels=num_filters * 2, reduction_ratio=reduction_ratio) if not shortcut: self.short = ConvBNLayer( @@ -194,8 +202,8 @@ class BottleneckBlock(fluid.dygraph.Layer): class SeResNeXt(fluid.dygraph.Layer): - def __init__(self, name_scope, layers=50, class_dim=102): - super(SeResNeXt, self).__init__(name_scope) + def __init__(self, layers=50, class_dim=102): + super(SeResNeXt, self).__init__() self.layers = layers supported_layers = [50, 101, 152] @@ -276,10 +284,13 @@ class SeResNeXt(fluid.dygraph.Layer): pool_size=7, pool_type='avg', global_pooling=True) stdv = 1.0 / math.sqrt(2048 * 1.0) - self.out = FC(self.full_name(), - size=class_dim, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv))) + self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 2 * 1 * 1 + + self.out = Linear( + self.pool2d_avg_output, + class_dim, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) def forward(self, inputs): if self.layers == 50 or self.layers == 101: @@ -294,18 +305,20 @@ class SeResNeXt(fluid.dygraph.Layer): for bottleneck_block in self.bottleneck_block_list: y = bottleneck_block(y) y = self.pool2d_avg(y) + y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output]) y = self.out(y) return y class TestSeResNeXt(TestParallelDyGraphRunnerBase): def get_model(self): - model = SeResNeXt("se-resnext") + model = SeResNeXt() train_reader = paddle.batch( paddle.dataset.flowers.test(use_xmap=False), batch_size=train_parameters["batch_size"], drop_last=True) - optimizer = optimizer_setting(train_parameters) + optimizer = optimizer_setting( + train_parameters, parameter_list=model.parameters()) return model, train_reader, optimizer def run_one_loop(self, model, opt, data): diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py index ef7ff153ba2..ae4355ec412 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py @@ -23,7 +23,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope @@ -75,8 +75,8 @@ class SimpleImgConvPool(fluid.dygraph.Layer): class MNIST(fluid.dygraph.Layer): - def __init__(self, name_scope): - super(MNIST, self).__init__(name_scope) + def __init__(self): + super(MNIST, self).__init__() self._simple_img_conv_pool_1 = SimpleImgConvPool( 1, 20, 5, 2, 2, act="relu") @@ -84,19 +84,21 @@ class MNIST(fluid.dygraph.Layer): self._simple_img_conv_pool_2 = SimpleImgConvPool( 20, 50, 5, 2, 2, act="relu") - pool_2_shape = 50 * 4 * 4 - SIZE = 10 - scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 - self._fc = FC(self.full_name(), - 10, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale)), - act="softmax") + self.pool_2_shape = 50 * 4 * 4 + SIZE = 100 #10 + scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5 + self._fc = Linear( + self.pool_2_shape, + SIZE, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale)), + act="softmax") def forward(self, inputs): x = self._simple_img_conv_pool_1(inputs) x = 
self._simple_img_conv_pool_2(x) + x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape]) x = self._fc(x) return x @@ -109,8 +111,9 @@ class TestDygraphMultiForward(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - mnist = MNIST("mnist") - sgd = SGDOptimizer(learning_rate=1e-3) + mnist = MNIST() + sgd = SGDOptimizer( + learning_rate=1e-3, parameter_list=mnist.parameters()) train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=128, drop_last=True) @@ -145,7 +148,7 @@ class TestDygraphMultiForward(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) - mnist = MNIST("mnist") + mnist = MNIST() sgd = SGDOptimizer(learning_rate=1e-3) train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=128, drop_last=True) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index aed9520ce8d..a3a5ce883a3 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -258,7 +258,9 @@ class TestImperativeAutoPrune(unittest.TestCase): fc2_origin = fc2._w.numpy() fc2._w.stop_gradient = True out2.backward() - optimizer = fluid.optimizer.SGD(learning_rate=0.003) + optimizer = fluid.optimizer.SGD( + learning_rate=0.003, + parameter_list=(fc.parameters() + fc2.parameters())) optimizer.minimize(out2) self.assertTrue(np.array_equal(fc2_origin, fc2._w.numpy())) self.assertFalse(np.array_equal(fc_origin, fc._w.numpy())) @@ -279,7 +281,9 @@ class TestImperativeAutoPrune(unittest.TestCase): fc2_origin = fc2._w.numpy() out2.stop_gradient = True out2.backward() - optimizer = fluid.optimizer.SGD(learning_rate=0.003) + optimizer = fluid.optimizer.SGD( + learning_rate=0.003, + parameter_list=(fc.parameters() + fc2.parameters())) optimizer.minimize(out2) self.assertTrue(np.array_equal(fc2_origin, fc2._w.numpy())) self.assertTrue(np.array_equal(fc_origin, fc._w.numpy())) @@ -320,7 +324,8 @@ class TestImperativeAutoPrune(unittest.TestCase): place = fluid.CPUPlace() with fluid.dygraph.guard(place): model = MyLayer("mylayer", vocab_size, size) - optimizer = fluid.optimizer.AdamOptimizer(0.001) + optimizer = fluid.optimizer.AdamOptimizer( + 0.001, parameter_list=model.parameters()) grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001) indices = fluid.dygraph.to_variable(indices) @@ -338,7 +343,8 @@ class TestImperativeAutoPrune(unittest.TestCase): with fluid.dygraph.guard(place): model = MyLayer2("mylayer", vocab_size, size) - optimizer = fluid.optimizer.AdamOptimizer(0.001) + optimizer = fluid.optimizer.AdamOptimizer( + 0.001, parameter_list=model.parameters()) grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001) indices = fluid.dygraph.to_variable(indices) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py b/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py index 1b201fc7f15..dbd5296e5f1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py @@ -58,7 +58,7 @@ class TestDygraphDebugString(unittest.TestCase): out.backward() mlp.clear_gradients() unique_name_tmp, trace_var_tmp, alive_var_tmp = fluid.dygraph.base._print_debug_msg( - is_test=True) + mlp.parameters(), is_test=True) if i > 0: 
self.assertGreaterEqual(unique_name, unique_name_tmp) self.assertGreaterEqual(trace_var, trace_var_tmp) @@ -68,7 +68,7 @@ class TestDygraphDebugString(unittest.TestCase): trace_var = trace_var_tmp alive_var = alive_var_tmp try: - fluid.dygraph.base._print_debug_msg() + fluid.dygraph.base._print_debug_msg(mlp.parameters()) except Exception as e: raise RuntimeError( "No Exception is accepted in _print_debug_msg, but we got: {}". diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index 0cbf5562dab..f76c3bd9580 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core from test_imperative_base import new_program_scope from paddle.fluid.dygraph.base import to_variable +from paddle.fluid.dygraph import Linear # Can use Amusic dataset as the DeepCF describes. DATA_PATH = os.environ.get('DATA_PATH', '') @@ -33,10 +34,10 @@ NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1)) class DMF(fluid.Layer): - def __init__(self, name_scope): - super(DMF, self).__init__(name_scope) - self._user_latent = fluid.FC(self.full_name(), 256) - self._item_latent = fluid.FC(self.full_name(), 256) + def __init__(self): + super(DMF, self).__init__() + self._user_latent = Linear(1000, 256) + self._item_latent = Linear(100, 256) self._user_layers = [] self._item_layers = [] @@ -45,11 +46,17 @@ class DMF(fluid.Layer): self._user_layers.append( self.add_sublayer( 'user_layer_%d' % i, - fluid.FC(self.full_name(), self._hid_sizes[i], act='relu'))) + Linear( + 256 if i == 0 else self._hid_sizes[i - 1], + self._hid_sizes[i], + act='relu'))) self._item_layers.append( self.add_sublayer( 'item_layer_%d' % i, - fluid.FC(self.full_name(), self._hid_sizes[i], act='relu'))) + Linear( + 256 if i == 0 else self._hid_sizes[i - 1], + self._hid_sizes[i], + act='relu'))) def forward(self, users, items): users = self._user_latent(users) @@ -62,17 +69,20 @@ class DMF(fluid.Layer): class MLP(fluid.Layer): - def __init__(self, name_scope): - super(MLP, self).__init__(name_scope) - self._user_latent = fluid.FC(self.full_name(), 256) - self._item_latent = fluid.FC(self.full_name(), 256) + def __init__(self): + super(MLP, self).__init__() + self._user_latent = Linear(1000, 256) + self._item_latent = Linear(100, 256) self._match_layers = [] self._hid_sizes = [128, 64] for i in range(len(self._hid_sizes)): self._match_layers.append( self.add_sublayer( 'match_layer_%d' % i, - fluid.FC(self.full_name(), self._hid_sizes[i], act='relu'))) + Linear( + 256 * 2 if i == 0 else self._hid_sizes[i - 1], + self._hid_sizes[i], + act='relu'))) def forward(self, users, items): users = self._user_latent(users) @@ -85,8 +95,8 @@ class MLP(fluid.Layer): class DeepCF(fluid.Layer): - def __init__(self, name_scope, num_users, num_items, matrix): - super(DeepCF, self).__init__(name_scope) + def __init__(self, num_users, num_items, matrix): + super(DeepCF, self).__init__() self._num_users = num_users self._num_items = num_items self._rating_matrix = self.create_parameter( @@ -97,9 +107,9 @@ class DeepCF(fluid.Layer): default_initializer=fluid.initializer.NumpyArrayInitializer(matrix)) self._rating_matrix.stop_gradient = True - self._mlp = MLP(self.full_name()) - self._dmf = DMF(self.full_name()) - self._match_fc = fluid.FC(self.full_name(), 1, act='sigmoid') + self._mlp = MLP() + self._dmf = DMF() + self._match_fc = 
Linear(128, 1, act='sigmoid') def forward(self, users, items): # users_emb = self._user_emb(users) @@ -208,7 +218,7 @@ class TestDygraphDeepCF(unittest.TestCase): items = fluid.layers.data('items', [1], dtype='int32') labels = fluid.layers.data('labels', [1], dtype='float32') - deepcf = DeepCF('deepcf', num_users, num_items, matrix) + deepcf = DeepCF(num_users, num_items, matrix) prediction = deepcf(users, items) loss = fluid.layers.reduce_sum( fluid.layers.log_loss(prediction, labels)) @@ -237,8 +247,9 @@ class TestDygraphDeepCF(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - deepcf = DeepCF('deepcf', num_users, num_items, matrix) - adam = fluid.optimizer.AdamOptimizer(0.01) + deepcf = DeepCF(num_users, num_items, matrix) + adam = fluid.optimizer.AdamOptimizer( + 0.01, parameter_list=deepcf.parameters()) for e in range(NUM_EPOCHES): sys.stderr.write('epoch %d\n' % e) for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): @@ -261,8 +272,9 @@ class TestDygraphDeepCF(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - deepcf2 = DeepCF('deepcf', num_users, num_items, matrix) - adam2 = fluid.optimizer.AdamOptimizer(0.01) + deepcf2 = DeepCF(num_users, num_items, matrix) + adam2 = fluid.optimizer.AdamOptimizer( + 0.01, parameter_list=deepcf2.parameters()) backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy.sort_sum_gradient = True for e in range(NUM_EPOCHES): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index 7e8cebab44e..b7ebd23a0b7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -22,33 +22,35 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid import Conv2D, Pool2D, FC +from paddle.fluid import Conv2D, Pool2D, Linear from test_imperative_base import new_program_scope from paddle.fluid.dygraph.base import to_variable class Discriminator(fluid.Layer): - def __init__(self, name_scope): - super(Discriminator, self).__init__(name_scope) - self._fc1 = FC(self.full_name(), size=32, act='elu') - self._fc2 = FC(self.full_name(), size=1) + def __init__(self): + super(Discriminator, self).__init__() + self._fc1 = Linear(1, 32, act='elu') + self._fc2 = Linear(32, 1) def forward(self, inputs): x = self._fc1(inputs) - return self._fc2(x) + x = self._fc2(x) + return x class Generator(fluid.Layer): - def __init__(self, name_scope): - super(Generator, self).__init__(name_scope) - self._fc1 = FC(self.full_name(), size=64, act='elu') - self._fc2 = FC(self.full_name(), size=64, act='elu') - self._fc3 = FC(self.full_name(), size=1) + def __init__(self): + super(Generator, self).__init__() + self._fc1 = Linear(2, 64, act='elu') + self._fc2 = Linear(64, 64, act='elu') + self._fc3 = Linear(64, 1) def forward(self, inputs): x = self._fc1(inputs) x = self._fc2(x) - return self._fc3(x) + x = self._fc3(x) + return x class TestDygraphGAN(unittest.TestCase): @@ -65,8 +67,8 @@ class TestDygraphGAN(unittest.TestCase): scope = fluid.core.Scope() with new_program_scope( main=discriminate_p, startup=startup, scope=scope): - discriminator = Discriminator("d") - generator = Generator("g") + discriminator = Discriminator() + generator = Generator() img = fluid.layers.data( name="img", shape=[2, 1], 
append_batch_size=False) @@ -93,8 +95,8 @@ class TestDygraphGAN(unittest.TestCase): sgd.minimize(d_loss) with new_program_scope(main=generate_p, startup=startup, scope=scope): - discriminator = Discriminator("d") - generator = Generator("g") + discriminator = Discriminator() + generator = Generator() noise = fluid.layers.data( name="noise", shape=[2, 2], append_batch_size=False) @@ -134,9 +136,12 @@ class TestDygraphGAN(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - discriminator = Discriminator("d") - generator = Generator("g") - sgd = SGDOptimizer(learning_rate=1e-3) + discriminator = Discriminator() + generator = Generator() + sgd = SGDOptimizer( + learning_rate=1e-3, + parameter_list=( + discriminator.parameters() + generator.parameters())) d_real = discriminator(to_variable(np.ones([2, 1], np.float32))) d_loss_real = fluid.layers.reduce_mean( @@ -177,9 +182,12 @@ class TestDygraphGAN(unittest.TestCase): backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy.sort_sum_gradient = True - discriminator2 = Discriminator("d") - generator2 = Generator("g") - sgd2 = SGDOptimizer(learning_rate=1e-3) + discriminator2 = Discriminator() + generator2 = Generator() + sgd2 = SGDOptimizer( + learning_rate=1e-3, + parameter_list=( + discriminator2.parameters() + generator2.parameters())) d_real2 = discriminator2(to_variable(np.ones([2, 1], np.float32))) d_loss_real2 = fluid.layers.reduce_mean( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py index 3ac301a8f69..01f3c027746 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -131,7 +131,8 @@ class TestDygraphGNN(unittest.TestCase): to_variable(labels)) loss = fluid.layers.reduce_sum(loss) loss.backward() - adam = AdamOptimizer(learning_rate=1e-3) + adam = AdamOptimizer( + learning_rate=1e-3, parameter_list=model.parameters()) adam.minimize(loss) model.clear_gradients() @@ -156,7 +157,8 @@ class TestDygraphGNN(unittest.TestCase): logits2, to_variable(labels2)) loss2 = fluid.layers.reduce_sum(loss2) loss2.backward() - adam2 = AdamOptimizer(learning_rate=1e-3) + adam2 = AdamOptimizer( + learning_rate=1e-3, parameter_list=model2.parameters()) adam2.minimize(loss2) model2.clear_gradients() loss2_value = loss2.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index fc0e3d190ef..9bd6f039d91 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -105,7 +105,9 @@ class TestDygraphSimpleNet(unittest.TestCase): is_sparse=is_sparse, dtype=dtype) - sgd = SGDOptimizer(learning_rate=1e-3) + sgd = SGDOptimizer( + learning_rate=1e-3, + parameter_list=simple_net.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index ed2e14346bb..e9e0f348baf 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -23,7 +23,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core from 
paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope from utils import DyGraphProgramDescTracerTestHelper, is_equal_program @@ -77,8 +77,8 @@ class SimpleImgConvPool(fluid.dygraph.Layer): class MNIST(fluid.dygraph.Layer): - def __init__(self, name_scope): - super(MNIST, self).__init__(name_scope) + def __init__(self): + super(MNIST, self).__init__() self._simple_img_conv_pool_1 = SimpleImgConvPool( 1, 20, 5, 2, 2, act="relu") @@ -86,19 +86,21 @@ class MNIST(fluid.dygraph.Layer): self._simple_img_conv_pool_2 = SimpleImgConvPool( 20, 50, 5, 2, 2, act="relu") - pool_2_shape = 50 * 4 * 4 + self.pool_2_shape = 50 * 4 * 4 SIZE = 10 - scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 - self._fc = FC(self.full_name(), - 10, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale)), - act="softmax") + scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5 + self._fc = Linear( + self.pool_2_shape, + 10, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale)), + act="softmax") def forward(self, inputs): x = self._simple_img_conv_pool_1(inputs) x = self._simple_img_conv_pool_2(x) + x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape]) x = self._fc(x) return x @@ -125,8 +127,9 @@ class TestImperativeMnist(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - mnist = MNIST("mnist") - sgd = SGDOptimizer(learning_rate=1e-3) + mnist = MNIST() + sgd = SGDOptimizer( + learning_rate=1e-3, parameter_list=mnist.parameters()) batch_py_reader = fluid.io.PyReader(capacity=1) batch_py_reader.decorate_sample_list_generator( @@ -189,7 +192,7 @@ class TestImperativeMnist(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) - mnist = MNIST("mnist") + mnist = MNIST() sgd = SGDOptimizer(learning_rate=1e-3) train_reader = paddle.batch( paddle.dataset.mnist.train(), diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py index 0f5eb52e22a..4ce0ca350dd 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py @@ -39,8 +39,9 @@ class TestImperativeMnistSortGradient(unittest.TestCase): backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy.sort_sum_gradient = True - mnist2 = MNIST("mnist") - sgd2 = SGDOptimizer(learning_rate=1e-3) + mnist2 = MNIST() + sgd2 = SGDOptimizer( + learning_rate=1e-3, parameter_list=mnist2.parameters()) train_reader2 = paddle.batch( paddle.dataset.mnist.train(), batch_size=128, drop_last=True) @@ -85,7 +86,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) - mnist = MNIST("mnist") + mnist = MNIST() sgd = SGDOptimizer(learning_rate=1e-3) train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=128, drop_last=True) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 
cc98b13b759..a9dba62a56c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -18,7 +18,7 @@ import numpy as np import six import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm, Embedding, GRUUnit +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm, Embedding, GRUUnit from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope @@ -27,6 +27,8 @@ class Config(object): ''' config for training ''' + # encoder rnn hidden_size + encoder_size = 200 # decoder size for decoder stage decoder_size = 128 # size for word embedding @@ -118,8 +120,8 @@ class ConvBNPool(fluid.dygraph.Layer): class OCRConv(fluid.dygraph.Layer): - def __init__(self, name_scope, is_test=False, use_cudnn=True): - super(OCRConv, self).__init__(name_scope) + def __init__(self, is_test=False, use_cudnn=True): + super(OCRConv, self).__init__() self.conv_bn_pool_1 = ConvBNPool( 2, [16, 16], [1, 16], is_test=is_test, use_cudnn=use_cudnn) self.conv_bn_pool_2 = ConvBNPool( @@ -143,7 +145,6 @@ class OCRConv(fluid.dygraph.Layer): class DynamicGRU(fluid.dygraph.Layer): def __init__(self, - scope_name, size, param_attr=None, bias_attr=None, @@ -152,7 +153,7 @@ class DynamicGRU(fluid.dygraph.Layer): candidate_activation='tanh', h_0=None, origin_mode=False): - super(DynamicGRU, self).__init__(scope_name) + super(DynamicGRU, self).__init__() self.gru_unit = GRUUnit( size * 3, @@ -164,6 +165,7 @@ class DynamicGRU(fluid.dygraph.Layer): self.h_0 = h_0 self.is_reverse = is_reverse + self.size = size def forward(self, inputs): hidden = self.h_0 @@ -188,11 +190,10 @@ class DynamicGRU(fluid.dygraph.Layer): class EncoderNet(fluid.dygraph.Layer): def __init__(self, - scope_name, - rnn_hidden_size=200, + rnn_hidden_size=Config.encoder_size, is_test=False, use_cudnn=True): - super(EncoderNet, self).__init__(scope_name) + super(EncoderNet, self).__init__() self.rnn_hidden_size = rnn_hidden_size para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02)) @@ -207,28 +208,19 @@ class EncoderNet(fluid.dygraph.Layer): shape=[Config.batch_size, rnn_hidden_size], dtype='float32', value=0) - self.ocr_convs = OCRConv( - self.full_name(), is_test=is_test, use_cudnn=use_cudnn) - - self.fc_1_layer = FC(self.full_name(), - rnn_hidden_size * 3, - param_attr=para_attr, - bias_attr=False, - num_flatten_dims=2) - self.fc_2_layer = FC(self.full_name(), - rnn_hidden_size * 3, - param_attr=para_attr, - bias_attr=False, - num_flatten_dims=2) + self.ocr_convs = OCRConv(is_test=is_test, use_cudnn=use_cudnn) + + self.fc_1_layer = Linear( + 768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) + self.fc_2_layer = Linear( + 768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) self.gru_forward_layer = DynamicGRU( - self.full_name(), size=rnn_hidden_size, h_0=h_0, param_attr=para_attr, bias_attr=bias_attr, candidate_activation='relu') self.gru_backward_layer = DynamicGRU( - self.full_name(), size=rnn_hidden_size, h_0=h_0, param_attr=para_attr, @@ -236,10 +228,8 @@ class EncoderNet(fluid.dygraph.Layer): candidate_activation='relu', is_reverse=True) - self.encoded_proj_fc = FC(self.full_name(), - Config.decoder_size, - bias_attr=False, - num_flatten_dims=2) + self.encoded_proj_fc = Linear( + rnn_hidden_size * 2, Config.decoder_size, bias_attr=False) def forward(self, inputs): conv_features = 
self.ocr_convs(inputs) @@ -272,18 +262,12 @@ class EncoderNet(fluid.dygraph.Layer): class SimpleAttention(fluid.dygraph.Layer): - def __init__(self, scope_name, decoder_size): - super(SimpleAttention, self).__init__(scope_name) - - self.fc_1 = FC(self.full_name(), - decoder_size, - act=None, - bias_attr=False) - self.fc_2 = FC(self.full_name(), - 1, - num_flatten_dims=2, - act=None, - bias_attr=False) + def __init__(self, decoder_size): + super(SimpleAttention, self).__init__() + + self.fc_1 = Linear( + decoder_size, decoder_size, act=None, bias_attr=False) + self.fc_2 = Linear(decoder_size, 1, act=None, bias_attr=False) def forward(self, encoder_vec, encoder_proj, decoder_state): @@ -311,22 +295,18 @@ class SimpleAttention(fluid.dygraph.Layer): class GRUDecoderWithAttention(fluid.dygraph.Layer): - def __init__(self, scope_name, decoder_size, num_classes): - super(GRUDecoderWithAttention, self).__init__(scope_name) - self.simple_attention = SimpleAttention(self.full_name(), decoder_size) - - self.fc_1_layer = FC(self.full_name(), - size=decoder_size * 3, - bias_attr=False) - self.fc_2_layer = FC(self.full_name(), - size=decoder_size * 3, - bias_attr=False) + def __init__(self, decoder_size, num_classes): + super(GRUDecoderWithAttention, self).__init__() + self.simple_attention = SimpleAttention(decoder_size) + + self.fc_1_layer = Linear( + Config.encoder_size * 2, decoder_size * 3, bias_attr=False) + self.fc_2_layer = Linear( + decoder_size, decoder_size * 3, bias_attr=False) self.gru_unit = GRUUnit( size=decoder_size * 3, param_attr=None, bias_attr=None) - self.out_layer = FC(self.full_name(), - size=num_classes + 2, - bias_attr=None, - act='softmax') + self.out_layer = Linear( + decoder_size, num_classes + 2, bias_attr=None, act='softmax') self.decoder_size = decoder_size @@ -357,17 +337,18 @@ class GRUDecoderWithAttention(fluid.dygraph.Layer): class OCRAttention(fluid.dygraph.Layer): - def __init__(self, scope_name): - super(OCRAttention, self).__init__(scope_name) - self.encoder_net = EncoderNet(self.full_name()) - self.fc = FC(self.full_name(), - size=Config.decoder_size, - bias_attr=False, - act='relu') + def __init__(self): + super(OCRAttention, self).__init__() + self.encoder_net = EncoderNet() + self.fc = Linear( + Config.encoder_size, + Config.decoder_size, + bias_attr=False, + act='relu') self.embedding = Embedding( [Config.num_classes + 2, Config.word_vector_dim], dtype='float32') self.gru_decoder_with_attention = GRUDecoderWithAttention( - self.full_name(), Config.decoder_size, Config.num_classes) + Config.decoder_size, Config.num_classes) def forward(self, inputs, label_in): gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs) @@ -425,14 +406,15 @@ class TestDygraphOCRAttention(unittest.TestCase): fluid.default_main_program().random_seed = seed backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy.sort_sum_gradient = True - ocr_attention = OCRAttention("ocr_attention") + ocr_attention = OCRAttention() if Config.learning_rate_decay == "piecewise_decay": learning_rate = fluid.layers.piecewise_decay( [50000], [Config.LR, Config.LR * 0.01]) else: learning_rate = Config.LR - optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer = fluid.optimizer.SGD( + learning_rate=0.001, parameter_list=ocr_attention.parameters()) dy_param_init_value = {} for param in ocr_attention.parameters(): dy_param_init_value[param.name] = param.numpy() @@ -478,7 +460,7 @@ class TestDygraphOCRAttention(unittest.TestCase): # print("static start") exe = 
fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) - ocr_attention = OCRAttention("ocr_attention") + ocr_attention = OCRAttention() if Config.learning_rate_decay == "piecewise_decay": learning_rate = fluid.layers.piecewise_decay( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 1d232ba7f98..217f57fdc82 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -23,17 +23,17 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.optimizer import SGDOptimizer, Adam -from paddle.fluid.dygraph.nn import FC +from paddle.fluid.dygraph import Linear from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope class MLP(fluid.Layer): - def __init__(self, name_scope, param_attr=None, bias_attr=None): - super(MLP, self).__init__(name_scope) + def __init__(self, param_attr=None, bias_attr=None): + super(MLP, self).__init__() - self._fc1 = FC(self.full_name(), 10) - self._fc2 = FC(self.full_name(), 10) + self._fc1 = Linear(784, 10) + self._fc2 = Linear(10, 10) def forward(self, inputs): y = self._fc1(inputs) @@ -45,13 +45,16 @@ class TestImperativeOptimizerBase(unittest.TestCase): def setUp(self): self.batch_num = 20 + def get_optimizer_dygraph(self, parameter_list): + raise NotImplementedError() + def get_optimizer(self): raise NotImplementedError() def reader_decorator(self, reader): def _reader_imple(): for item in reader(): - image = np.array(item[0]).reshape(1, 28, 28) + image = np.array(item[0]).reshape(1, 784) label = np.array(item[1]).astype('int64').reshape(1) yield image, label @@ -65,8 +68,9 @@ class TestImperativeOptimizerBase(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - mlp = MLP('mlp') - optimizer = self.get_optimizer() + mlp = MLP() + optimizer = self.get_optimizer_dygraph( + parameter_list=mlp.parameters()) batch_py_reader = fluid.io.PyReader(capacity=1) batch_py_reader.decorate_sample_list_generator( @@ -85,6 +89,7 @@ class TestImperativeOptimizerBase(unittest.TestCase): label = data[1] label.stop_gradient = True + img = fluid.layers.reshape(img, shape=[batch_size, -1]) cost = mlp(img) avg_loss = fluid.layers.reduce_mean(cost) dy_out = avg_loss.numpy() @@ -107,7 +112,7 @@ class TestImperativeOptimizerBase(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) - mlp = MLP('mlp') + mlp = MLP() optimizer = self.get_optimizer() train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=128, drop_last=True) @@ -115,6 +120,7 @@ class TestImperativeOptimizerBase(unittest.TestCase): img = fluid.layers.data( name='pixel', shape=[1, 28, 28], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') + img = fluid.layers.reshape(img, shape=[batch_size, -1]) cost = mlp(img) avg_loss = fluid.layers.reduce_mean(cost) optimizer.minimize(avg_loss) @@ -162,6 +168,15 @@ class TestImperativeOptimizerBase(unittest.TestCase): class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + bd = [3, 6, 9] + optimizer = SGDOptimizer( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, + values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]), + 
parameter_list=parameter_list) + return optimizer + def get_optimizer(self): bd = [3, 6, 9] optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay( @@ -173,6 +188,16 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + def get_optimizer(self): optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay( learning_rate=0.1, @@ -186,6 +211,16 @@ class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.exponential_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + def get_optimizer(self): optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay( learning_rate=0.1, @@ -199,6 +234,16 @@ class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = Adam( + learning_rate=fluid.layers.inverse_time_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + def get_optimizer(self): optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay( learning_rate=0.1, @@ -212,6 +257,13 @@ class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.polynomial_decay( + learning_rate=0.1, decay_steps=5, cycle=self.cycle), + parameter_list=parameter_list) + return optimizer + def get_optimizer(self): optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay( learning_rate=0.1, decay_steps=5, cycle=self.cycle)) @@ -227,6 +279,13 @@ class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.cosine_decay( + learning_rate=0.1, step_each_epoch=10000, epochs=120), + parameter_list=parameter_list) + return optimizer + def get_optimizer(self): optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay( learning_rate=0.1, step_each_epoch=10000, epochs=120)) @@ -237,6 +296,13 @@ class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.noam_decay( + d_model=512, warmup_steps=8000), + parameter_list=parameter_list) + return optimizer + def get_optimizer(self): optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay( d_model=512, warmup_steps=8000)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py b/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py index 
890e088f841..ed721503a14 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py @@ -38,7 +38,8 @@ class TestImperativePartitialBackward(unittest.TestCase): for param in fc2.parameters(): self.assertIsNone(param._grad_ivar()) - optimizer = fluid.optimizer.AdamOptimizer() + optimizer = fluid.optimizer.AdamOptimizer(parameter_list=( + fc1.parameters() + fc2.parameters())) _, params_grads = optimizer.minimize(loss) self.assertListEqual( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index d656eaebf47..9a84c048f67 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -30,13 +30,12 @@ from utils import DyGraphProgramDescTracerTestHelper, is_equal_program class SimpleLSTMRNN(fluid.Layer): def __init__(self, - name_scope, hidden_size, num_steps, num_layers=2, init_scale=0.1, dropout=None): - super(SimpleLSTMRNN, self).__init__(name_scope) + super(SimpleLSTMRNN, self).__init__() self._hidden_size = hidden_size self._num_layers = num_layers self._init_scale = init_scale @@ -45,8 +44,9 @@ class SimpleLSTMRNN(fluid.Layer): self._num_steps = num_steps self.cell_array = [] self.hidden_array = [] + self._create_parameter() - def _build_once(self, input_embedding, init_hidden=None, init_cell=None): + def _create_parameter(self): self.weight_1_arr = [] self.weight_2_arr = [] self.bias_arr = [] @@ -135,7 +135,6 @@ class SimpleLSTMRNN(fluid.Layer): class PtbModel(fluid.Layer): def __init__(self, - name_scope, hidden_size, vocab_size, num_layers=2, @@ -143,7 +142,7 @@ class PtbModel(fluid.Layer): init_scale=0.1, is_sparse=False, dropout=None): - super(PtbModel, self).__init__(name_scope) + super(PtbModel, self).__init__() self.hidden_size = hidden_size self.vocab_size = vocab_size self.init_scale = init_scale @@ -151,7 +150,6 @@ class PtbModel(fluid.Layer): self.num_steps = num_steps self.dropout = dropout self.simple_lstm_rnn = SimpleLSTMRNN( - self.full_name(), hidden_size, num_steps, num_layers=num_layers, @@ -231,7 +229,6 @@ class TestDygraphPtbRnn(unittest.TestCase): fluid.default_main_program().random_seed = seed # TODO: marsyang1993 Change seed to ptb_model = PtbModel( - "ptb_model", hidden_size=hidden_size, vocab_size=vocab_size, num_layers=num_layers, @@ -239,7 +236,8 @@ class TestDygraphPtbRnn(unittest.TestCase): init_scale=init_scale, is_sparse=is_sparse) - sgd = SGDOptimizer(learning_rate=1e-3) + sgd = SGDOptimizer( + learning_rate=1e-3, parameter_list=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None @@ -298,7 +296,6 @@ class TestDygraphPtbRnn(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed ptb_model = PtbModel( - "ptb_model", hidden_size=hidden_size, vocab_size=vocab_size, num_layers=num_layers, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py index 9e90f0f12a0..8e85fe5dfef 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py @@ -49,7 +49,6 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): backward_strategy.sort_sum_gradient = True # TODO: 
marsyang1993 Change seed to ptb_model = PtbModel( - "ptb_model", hidden_size=hidden_size, vocab_size=vocab_size, num_layers=num_layers, @@ -57,7 +56,8 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): init_scale=init_scale, is_sparse=is_sparse) - sgd = SGDOptimizer(learning_rate=1e-3) + sgd = SGDOptimizer( + learning_rate=1e-3, parameter_list=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None @@ -97,7 +97,6 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed ptb_model = PtbModel( - "ptb_model", hidden_size=hidden_size, vocab_size=vocab_size, num_layers=num_layers, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py index 36f6daeb37f..983fe23f448 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py @@ -86,7 +86,8 @@ class TestImperativeMnist(unittest.TestCase): loss_probs = fluid.layers.elementwise_mul(dy_reward, loss_probs) loss = fluid.layers.reduce_sum(loss_probs) - sgd = SGDOptimizer(learning_rate=1e-3) + sgd = SGDOptimizer( + learning_rate=1e-3, parameter_list=policy.parameters()) dy_param_init_value = {} diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 684bec6ee83..b8c242d9208 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -21,7 +21,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid import Conv2D, Pool2D, BatchNorm, FC +from paddle.fluid import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope from utils import DyGraphProgramDescTracerTestHelper, is_equal_program @@ -44,7 +44,7 @@ train_parameters = { } -def optimizer_setting(params): +def optimizer_setting(params, parameter_list=None): ls = params["learning_strategy"] if ls["name"] == "piecewise_decay": if "total_images" not in params: @@ -58,14 +58,18 @@ def optimizer_setting(params): base_lr = params["lr"] lr = [] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] - optimizer = fluid.optimizer.SGD(learning_rate=0.01) + if fluid.in_dygraph_mode(): + optimizer = fluid.optimizer.SGD(learning_rate=0.01, + parameter_list=parameter_list) + else: + optimizer = fluid.optimizer.SGD(learning_rate=0.01) # TODO(minqiyang): Add learning rate scheduler support to dygraph mode # optimizer = fluid.optimizer.Momentum( - # learning_rate=params["lr"], - # learning_rate=fluid.layers.piecewise_decay( - # boundaries=bd, values=lr), - # momentum=0.9, - # regularization=fluid.regularizer.L2Decay(1e-4)) + # learning_rate=params["lr"], + # learning_rate=fluid.layers.piecewise_decay( + # boundaries=bd, values=lr), + # momentum=0.9, + # regularization=fluid.regularizer.L2Decay(1e-4)) return optimizer @@ -147,8 +151,8 @@ class BottleneckBlock(fluid.Layer): class ResNet(fluid.Layer): - def __init__(self, name_scope, layers=50, class_dim=102): - super(ResNet, self).__init__(name_scope) + def __init__(self, layers=50, class_dim=102): + super(ResNet, self).__init__() self.layers = layers supported_layers = [50, 101, 152] @@ -187,14 +191,17 @@ 
class ResNet(fluid.Layer): self.pool2d_avg = Pool2D( pool_size=7, pool_type='avg', global_pooling=True) + self.pool2d_avg_output = num_filters[-1] * 4 * 1 * 1 + import math stdv = 1.0 / math.sqrt(2048 * 1.0) - self.out = FC(self.full_name(), - size=class_dim, - act='softmax', - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv))) + self.out = Linear( + self.pool2d_avg_output, + class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) def forward(self, inputs): y = self.conv(inputs) @@ -202,6 +209,7 @@ class ResNet(fluid.Layer): for bottleneck_block in self.bottleneck_block_list: y = bottleneck_block(y) y = self.pool2d_avg(y) + y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output]) y = self.out(y) return y @@ -228,8 +236,9 @@ class TestDygraphResnet(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - resnet = ResNet("resnet") - optimizer = optimizer_setting(train_parameters) + resnet = ResNet() + optimizer = optimizer_setting( + train_parameters, parameter_list=resnet.parameters()) np.random.seed(seed) import random random.seed = seed @@ -315,7 +324,7 @@ class TestDygraphResnet(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) - resnet = ResNet("resnet") + resnet = ResNet() optimizer = optimizer_setting(train_parameters) np.random.seed(seed) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py index 44e147e317c..8cbd08ea3e2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py @@ -40,7 +40,7 @@ train_parameters = { } -def optimizer_setting(params): +def optimizer_setting(params, parameter_list=None): ls = params["learning_strategy"] if ls["name"] == "piecewise_decay": if "total_images" not in params: @@ -54,14 +54,18 @@ def optimizer_setting(params): base_lr = params["lr"] lr = [] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] - optimizer = fluid.optimizer.SGD(learning_rate=0.01) + if fluid.in_dygraph_mode(): + optimizer = fluid.optimizer.SGD(learning_rate=0.01, + parameter_list=parameter_list) + else: + optimizer = fluid.optimizer.SGD(learning_rate=0.01) # TODO(minqiyang): Add learning rate scheduler support to dygraph mode # optimizer = fluid.optimizer.Momentum( - # learning_rate=params["lr"], - # learning_rate=fluid.layers.piecewise_decay( - # boundaries=bd, values=lr), - # momentum=0.9, - # regularization=fluid.regularizer.L2Decay(1e-4)) + # learning_rate=params["lr"], + # learning_rate=fluid.layers.piecewise_decay( + # boundaries=bd, values=lr), + # momentum=0.9, + # regularization=fluid.regularizer.L2Decay(1e-4)) return optimizer @@ -77,8 +81,9 @@ class TestDygraphResnetSortGradient(unittest.TestCase): fluid.default_main_program().random_seed = seed backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy.sort_sum_gradient = True - resnet = ResNet("resnet") - optimizer = optimizer_setting(train_parameters) + resnet = ResNet() + optimizer = optimizer_setting( + train_parameters, parameter_list=resnet.parameters()) np.random.seed(seed) import random random.seed = seed @@ -138,7 +143,7 @@ class TestDygraphResnetSortGradient(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace( ) 
if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) - resnet = ResNet("resnet") + resnet = ResNet() optimizer = optimizer_setting(train_parameters) np.random.seed(seed) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index 565c60cfe8a..3566a37e97f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -233,8 +233,10 @@ class TestDygraphPtbRnn(unittest.TestCase): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - adam = Adam(learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr_arr)) + adam = Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + parameter_list=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None @@ -314,8 +316,10 @@ class TestDygraphPtbRnn(unittest.TestCase): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - adam = Adam(learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr_arr)) + adam = Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + parameter_list=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None @@ -418,8 +422,10 @@ class TestDygraphPtbRnn(unittest.TestCase): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - adam = Adam(learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr_arr)) + adam = Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + parameter_list=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None @@ -521,8 +527,10 @@ class TestDygraphPtbRnn(unittest.TestCase): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - adam = Adam(learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr_arr)) + adam = Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + parameter_list=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None @@ -633,7 +641,8 @@ class TestDygraphPtbRnn(unittest.TestCase): learning_rate=fluid.layers.piecewise_decay( boundaries=bd, values=lr_arr), beta1=0.8, - beta2=0.6) + beta2=0.6, + parameter_list=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None @@ -724,7 +733,8 @@ class TestDygraphPtbRnn(unittest.TestCase): learning_rate=fluid.layers.piecewise_decay( boundaries=bd, values=lr_arr), beta1=0.8, - beta2=0.6) + beta2=0.6, + parameter_list=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None @@ -816,7 +826,8 @@ class TestDygraphPtbRnn(unittest.TestCase): learning_rate=fluid.layers.piecewise_decay( boundaries=bd, values=lr_arr), beta1=0.8, - beta2=0.6) + beta2=0.6, + parameter_list=ptb_model.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index 8b1ba4643a2..e0cc89962d2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -21,7 +21,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core from 
paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope @@ -42,7 +42,7 @@ train_parameters = { } -def optimizer_setting(params): +def optimizer_setting(params, parameter_list=None): ls = params["learning_strategy"] if ls["name"] == "piecewise_decay": if "total_images" not in params: @@ -56,7 +56,11 @@ def optimizer_setting(params): #bd = [step * e for e in ls["epochs"]] #base_lr = params["lr"] #lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] - optimizer = fluid.optimizer.SGD(learning_rate=0.01) + if fluid.in_dygraph_mode(): + optimizer = fluid.optimizer.SGD(learning_rate=0.01, + parameter_list=parameter_list) + else: + optimizer = fluid.optimizer.SGD(learning_rate=0.01) return optimizer @@ -91,25 +95,27 @@ class ConvBNLayer(fluid.dygraph.Layer): class SqueezeExcitation(fluid.dygraph.Layer): - def __init__(self, name_scope, num_channels, reduction_ratio): + def __init__(self, num_channels, reduction_ratio): - super(SqueezeExcitation, self).__init__(name_scope) + super(SqueezeExcitation, self).__init__() + self._num_channels = num_channels self._pool = Pool2D(pool_size=0, pool_type='avg', global_pooling=True) - self._squeeze = FC( - self.full_name(), - size=num_channels // reduction_ratio, + self._squeeze = Linear( + num_channels, + num_channels // reduction_ratio, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.05)), act='relu') - self._excitation = FC( - self.full_name(), - size=num_channels, + self._excitation = Linear( + num_channels // reduction_ratio, + num_channels, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.05)), act='sigmoid') def forward(self, input): y = self._pool(input) + y = fluid.layers.reshape(y, shape=[-1, self._num_channels]) y = self._squeeze(y) y = self._excitation(y) y = fluid.layers.elementwise_mul(x=input, y=y, axis=0) @@ -141,9 +147,7 @@ class BottleneckBlock(fluid.dygraph.Layer): act='relu') self.scale = SqueezeExcitation( - self.full_name(), - num_channels=num_filters * 4, - reduction_ratio=reduction_ratio) + num_channels=num_filters * 4, reduction_ratio=reduction_ratio) if not shortcut: self.short = ConvBNLayer( @@ -175,8 +179,8 @@ class BottleneckBlock(fluid.dygraph.Layer): class SeResNeXt(fluid.dygraph.Layer): - def __init__(self, name_scope, layers=50, class_dim=102): - super(SeResNeXt, self).__init__(name_scope) + def __init__(self, layers=50, class_dim=102): + super(SeResNeXt, self).__init__() self.layers = layers supported_layers = [50, 101, 152] @@ -203,7 +207,7 @@ class SeResNeXt(fluid.dygraph.Layer): num_filters = [128, 256, 512, 1024] self.conv0 = ConvBNLayer( num_channels=3, - num_filters=3, + num_filters=64, filter_size=7, stride=2, act='relu') @@ -216,27 +220,29 @@ class SeResNeXt(fluid.dygraph.Layer): num_filters = [128, 256, 512, 1024] self.conv0 = ConvBNLayer( num_channels=3, - num_filters=3, - filter_size=7, + num_filters=64, + filter_size=3, stride=2, act='relu') self.conv1 = ConvBNLayer( - num_channels=3, - num_filters=3, - filter_size=7, + num_channels=64, + num_filters=64, + filter_size=3, stride=2, act='relu') self.conv2 = ConvBNLayer( - num_channels=7, - num_filters=3, - filter_size=7, - stride=2, + num_channels=64, + num_filters=128, + filter_size=3, + stride=1, act='relu') self.pool = Pool2D( pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') 
self.bottleneck_block_list = [] num_channels = 64 + if layers == 152: + num_channels = 128 for block in range(len(depth)): shortcut = False for i in range(depth[block]): @@ -258,11 +264,14 @@ class SeResNeXt(fluid.dygraph.Layer): import math stdv = 1.0 / math.sqrt(2048 * 1.0) - self.out = FC(self.full_name(), - size=class_dim, - act='softmax', - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv))) + self.pool2d_avg_output = num_filters[-1] * 4 * 1 * 1 + + self.out = Linear( + self.pool2d_avg_output, + class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) def forward(self, inputs): if self.layers == 50 or self.layers == 101: @@ -270,14 +279,15 @@ class SeResNeXt(fluid.dygraph.Layer): y = self.pool(y) elif self.layers == 152: y = self.conv0(inputs) - y = self.conv1(inputs) - y = self.conv2(inputs) + y = self.conv1(y) + y = self.conv2(y) y = self.pool(y) for bottleneck_block in self.bottleneck_block_list: y = bottleneck_block(y) y = self.pool2d_avg(y) y = fluid.layers.dropout(y, dropout_prob=0.2) + y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output]) y = self.out(y) return y @@ -302,8 +312,9 @@ class TestImperativeResneXt(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - se_resnext = SeResNeXt("se_resnext") - optimizer = optimizer_setting(train_parameters) + se_resnext = SeResNeXt() + optimizer = optimizer_setting( + train_parameters, parameter_list=se_resnext.parameters()) np.random.seed(seed) import random random.seed = seed @@ -364,7 +375,7 @@ class TestImperativeResneXt(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) - se_resnext = SeResNeXt("se_resnext") + se_resnext = SeResNeXt() optimizer = optimizer_setting(train_parameters) np.random.seed(seed) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py index c9e0093f643..6a9c20a53d2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py @@ -49,23 +49,27 @@ class TestSimpleNet(unittest.TestCase): with fluid.dygraph.guard(place): backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy.sort_sum_gradient = sort_sum_gradient - adam = SGDOptimizer(learning_rate=0.001) # grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') input = to_variable(input_word) simplenet = SimpleNet(20, 32, dtype) + adam = SGDOptimizer( + learning_rate=0.001, + parameter_list=simplenet.parameters()) input_emb, emb = simplenet(input) try: emb._w.gradient() except ValueError as e: - pass + assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str( + e) try: input_emb.gradient() except ValueError as e: - pass + assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str( + e) input_emb.backward(backward_strategy) adam.minimize(input_emb) # grad_clip=grad_clip @@ -75,13 +79,11 @@ class TestSimpleNet(unittest.TestCase): try: 
emb._w.gradient() except ValueError as e: - pass + assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str( + e) input_emb.clear_gradient() - try: - input_emb.gradient() - except ValueError as e: - pass + input_emb.gradient() def test_selectedrows_gradient2(self): places = [fluid.CPUPlace()] @@ -93,7 +95,6 @@ class TestSimpleNet(unittest.TestCase): with fluid.dygraph.guard(place): backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy.sort_sum_gradient = sort_sum_gradient - adam = SGDOptimizer(learning_rate=0.001) grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm( 5.0) @@ -101,16 +102,21 @@ class TestSimpleNet(unittest.TestCase): input = to_variable(input_word) simplenet = SimpleNet(20, 32, "float32") + adam = SGDOptimizer( + learning_rate=0.001, + parameter_list=simplenet.parameters()) input_emb, emb = simplenet(input) try: emb._w.gradient() except ValueError as e: - pass + assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str( + e) try: input_emb.gradient() except ValueError as e: - pass + assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str( + e) input_emb.backward(backward_strategy) adam.minimize(input_emb, grad_clip=grad_clip) @@ -120,13 +126,11 @@ class TestSimpleNet(unittest.TestCase): try: emb._w.gradient() except ValueError as e: - pass + assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str( + e) input_emb.clear_gradient() - try: - input_emb.gradient() - except ValueError as e: - pass + input_emb.gradient() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py index 4471573142f..2e8b545b602 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py @@ -114,7 +114,9 @@ class TestDygraphSimpleNet(unittest.TestCase): is_sparse=is_sparse, dtype=dtype) - sgd = SGDOptimizer(learning_rate=1e-3) + sgd = SGDOptimizer( + learning_rate=1e-3, + parameter_list=simple_net.parameters()) dy_param_updated = dict() dy_param_init = dict() dy_loss = None diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index 8c725c0fa28..9db1665dce0 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import paddle.fluid as fluid -from paddle.fluid import Embedding, LayerNorm, FC, Layer +from paddle.fluid import Embedding, LayerNorm, Linear, Layer from paddle.fluid.dygraph import to_variable, guard from paddle.fluid.dygraph.jit import TracedLayer from 
test_imperative_base import new_program_scope @@ -378,15 +378,10 @@ class PrePostProcessLayer(Layer): class PositionwiseFeedForwardLayer(Layer): - def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate): - super(PositionwiseFeedForwardLayer, self).__init__(name_scope) - self._i2h = FC(name_scope=self.full_name(), - size=d_inner_hid, - num_flatten_dims=2, - act="relu") - self._h2o = FC(name_scope=self.full_name(), - size=d_hid, - num_flatten_dims=2) + def __init__(self, d_inner_hid, d_hid, dropout_rate): + super(PositionwiseFeedForwardLayer, self).__init__() + self._i2h = Linear(d_hid, d_inner_hid, act="relu") + self._h2o = Linear(d_inner_hid, d_hid) self._dropout_rate = dropout_rate def forward(self, x): @@ -403,7 +398,6 @@ class PositionwiseFeedForwardLayer(Layer): class MultiHeadAttentionLayer(Layer): def __init__(self, - name_scope, d_key, d_value, d_model, @@ -412,28 +406,16 @@ class MultiHeadAttentionLayer(Layer): cache=None, gather_idx=None, static_kv=False): - super(MultiHeadAttentionLayer, self).__init__(name_scope) + super(MultiHeadAttentionLayer, self).__init__() self._n_head = n_head self._d_key = d_key self._d_value = d_value self._d_model = d_model self._dropout_rate = dropout_rate - self._q_fc = FC(name_scope=self.full_name(), - size=d_key * n_head, - bias_attr=False, - num_flatten_dims=2) - self._k_fc = FC(name_scope=self.full_name(), - size=d_key * n_head, - bias_attr=False, - num_flatten_dims=2) - self._v_fc = FC(name_scope=self.full_name(), - size=d_value * n_head, - bias_attr=False, - num_flatten_dims=2) - self._proj_fc = FC(name_scope=self.full_name(), - size=self._d_model, - bias_attr=False, - num_flatten_dims=2) + self._q_fc = Linear(self._d_model, d_key * n_head, bias_attr=False) + self._k_fc = Linear(self._d_model, d_key * n_head, bias_attr=False) + self._v_fc = Linear(self._d_model, d_value * n_head, bias_attr=False) + self._proj_fc = Linear(d_value * n_head, self._d_model, bias_attr=False) def forward(self, queries, keys, values, attn_bias): # compute q ,k ,v @@ -490,7 +472,6 @@ class MultiHeadAttentionLayer(Layer): class EncoderSubLayer(Layer): def __init__(self, - name_scope, n_head, d_key, d_value, @@ -502,7 +483,7 @@ class EncoderSubLayer(Layer): preprocess_cmd="n", postprocess_cmd="da"): - super(EncoderSubLayer, self).__init__(name_scope) + super(EncoderSubLayer, self).__init__() self._preprocess_cmd = preprocess_cmd self._postprocess_cmd = postprocess_cmd self._prepostprocess_dropout = prepostprocess_dropout @@ -510,14 +491,13 @@ class EncoderSubLayer(Layer): self._preprocess_layer = PrePostProcessLayer(d_model, self._preprocess_cmd, 3) self._multihead_attention_layer = MultiHeadAttentionLayer( - self.full_name(), d_key, d_value, d_model, n_head, - attention_dropout) + d_key, d_value, d_model, n_head, attention_dropout) self._postprocess_layer = PrePostProcessLayer( d_model, self._postprocess_cmd, None) self._preprocess_layer2 = PrePostProcessLayer(d_model, self._preprocess_cmd, 3) self._positionwise_feed_forward = PositionwiseFeedForwardLayer( - self.full_name(), d_inner_hid, d_model, relu_dropout) + d_inner_hid, d_model, relu_dropout) self._postprocess_layer2 = PrePostProcessLayer( d_model, self._postprocess_cmd, None) @@ -540,7 +520,6 @@ class EncoderSubLayer(Layer): class EncoderLayer(Layer): def __init__(self, - name_scope, n_layer, n_head, d_key, @@ -553,7 +532,7 @@ class EncoderLayer(Layer): preprocess_cmd="n", postprocess_cmd="da"): - super(EncoderLayer, self).__init__(name_scope) + super(EncoderLayer, self).__init__() self._preprocess_cmd = 
preprocess_cmd self._encoder_sublayers = list() self._prepostprocess_dropout = prepostprocess_dropout @@ -564,10 +543,10 @@ class EncoderLayer(Layer): self._encoder_sublayers.append( self.add_sublayer( 'esl_%d' % i, - EncoderSubLayer( - self.full_name(), n_head, d_key, d_value, d_model, - d_inner_hid, prepostprocess_dropout, attention_dropout, - relu_dropout, preprocess_cmd, postprocess_cmd))) + EncoderSubLayer(n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, + preprocess_cmd, postprocess_cmd))) def forward(self, enc_input, attn_bias): for i in range(self._n_layer): @@ -580,7 +559,6 @@ class EncoderLayer(Layer): class PrepareEncoderDecoderLayer(Layer): def __init__(self, - name_scope, src_vocab_size, src_emb_dim, src_max_len, @@ -588,7 +566,7 @@ class PrepareEncoderDecoderLayer(Layer): is_sparse=False, word_emb_param_name=None, pos_enc_param_name=None): - super(PrepareEncoderDecoderLayer, self).__init__(name_scope) + super(PrepareEncoderDecoderLayer, self).__init__() self._src_max_len = src_max_len self._src_emb_dim = src_emb_dim self._src_vocab_size = src_vocab_size @@ -634,7 +612,6 @@ class PrepareEncoderDecoderLayer(Layer): class WrapEncoderLayer(Layer): def __init__(self, - name_cope, src_vocab_size, max_length, n_layer, @@ -653,10 +630,9 @@ class WrapEncoderLayer(Layer): """ The wrapper assembles together all needed layers for the encoder. """ - super(WrapEncoderLayer, self).__init__(name_cope) + super(WrapEncoderLayer, self).__init__() self._prepare_encoder_layer = PrepareEncoderDecoderLayer( - self.full_name(), src_vocab_size, d_model, max_length, @@ -664,10 +640,10 @@ class WrapEncoderLayer(Layer): is_sparse=is_sparse, word_emb_param_name=word_emb_param_names[0], pos_enc_param_name=pos_enc_param_names[0]) - self._encoder = EncoderLayer( - self.full_name(), n_layer, n_head, d_key, d_value, d_model, - d_inner_hid, prepostprocess_dropout, attention_dropout, - relu_dropout, preprocess_cmd, postprocess_cmd) + self._encoder = EncoderLayer(n_layer, n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, + preprocess_cmd, postprocess_cmd) def forward(self, enc_inputs): src_word, src_pos, src_slf_attn_bias = enc_inputs @@ -678,7 +654,6 @@ class WrapEncoderLayer(Layer): class DecoderSubLayer(Layer): def __init__(self, - name_scope, n_head, d_key, d_value, @@ -691,14 +666,13 @@ class DecoderSubLayer(Layer): postprocess_cmd, cache=None, gather_idx=None): - super(DecoderSubLayer, self).__init__(name_scope) + super(DecoderSubLayer, self).__init__() self._postprocess_cmd = postprocess_cmd self._preprocess_cmd = preprocess_cmd self._prepostprcess_dropout = prepostprocess_dropout self._pre_process_layer = PrePostProcessLayer(d_model, preprocess_cmd, 3) self._multihead_attention_layer = MultiHeadAttentionLayer( - self.full_name(), d_key, d_value, d_model, @@ -711,7 +685,6 @@ class DecoderSubLayer(Layer): self._pre_process_layer2 = PrePostProcessLayer(d_model, preprocess_cmd, 3) self._multihead_attention_layer2 = MultiHeadAttentionLayer( - self.full_name(), d_key, d_value, d_model, @@ -725,7 +698,7 @@ class DecoderSubLayer(Layer): self._pre_process_layer3 = PrePostProcessLayer(d_model, preprocess_cmd, 3) self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer( - self.full_name(), d_inner_hid, d_model, relu_dropout) + d_inner_hid, d_model, relu_dropout) self._post_process_layer3 = PrePostProcessLayer(d_model, postprocess_cmd, None) @@ -757,7 +730,6 @@ class DecoderSubLayer(Layer): 
class DecoderLayer(Layer): def __init__(self, - name_scope, n_layer, n_head, d_key, @@ -771,7 +743,7 @@ class DecoderLayer(Layer): postprocess_cmd, caches=None, gather_idx=None): - super(DecoderLayer, self).__init__(name_scope) + super(DecoderLayer, self).__init__() self._pre_process_layer = PrePostProcessLayer(d_model, preprocess_cmd, 3) self._decoder_sub_layers = list() @@ -783,7 +755,6 @@ class DecoderLayer(Layer): self.add_sublayer( 'dsl_%d' % i, DecoderSubLayer( - self.full_name(), n_head, d_key, d_value, @@ -812,7 +783,6 @@ class DecoderLayer(Layer): class WrapDecoderLayer(Layer): def __init__(self, - name_scope, trg_vocab_size, max_length, n_layer, @@ -833,10 +803,9 @@ class WrapDecoderLayer(Layer): """ The wrapper assembles together all needed layers for the encoder. """ - super(WrapDecoderLayer, self).__init__(name_scope) + super(WrapDecoderLayer, self).__init__() self._prepare_decoder_layer = PrepareEncoderDecoderLayer( - self.full_name(), trg_vocab_size, d_model, max_length, @@ -845,7 +814,6 @@ class WrapDecoderLayer(Layer): word_emb_param_name=word_emb_param_names[1], pos_enc_param_name=pos_enc_param_names[1]) self._decoder_layer = DecoderLayer( - self.full_name(), n_layer, n_head, d_key, @@ -861,9 +829,7 @@ class WrapDecoderLayer(Layer): gather_idx=gather_idx) self._weight_sharing = weight_sharing if not weight_sharing: - self._fc = FC(self.full_name(), - size=trg_vocab_size, - bias_attr=False) + self._fc = Linear(d_model, trg_vocab_size, bias_attr=False) def forward(self, dec_inputs=None, enc_output=None): trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs @@ -891,7 +857,6 @@ class WrapDecoderLayer(Layer): class TransFormer(Layer): def __init__(self, - name_scope, src_vocab_size, trg_vocab_size, max_length, @@ -911,7 +876,7 @@ class TransFormer(Layer): use_py_reader=False, is_test=False, is_sparse=False): - super(TransFormer, self).__init__(name_scope) + super(TransFormer, self).__init__() self._label_smooth_eps = label_smooth_eps self._trg_vocab_size = trg_vocab_size if weight_sharing: @@ -919,7 +884,6 @@ class TransFormer(Layer): "Vocabularies in source and target should be same for weight sharing." 
) self._wrap_encoder_layer = WrapEncoderLayer( - self.full_name(), src_vocab_size, max_length, n_layer, @@ -936,7 +900,6 @@ class TransFormer(Layer): weight_sharing, is_sparse=is_sparse) self._wrap_decoder_layer = WrapDecoderLayer( - self.full_name(), trg_vocab_size, max_length, n_layer, @@ -991,7 +954,6 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy.sort_sum_gradient = True transformer = TransFormer( - 'transformer', ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1, @@ -1020,9 +982,12 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): learning_rate=learning_rate, beta1=TrainTaskConfig.beta1, beta2=TrainTaskConfig.beta2, - epsilon=TrainTaskConfig.eps) + epsilon=TrainTaskConfig.eps, + parameter_list=transformer.parameters()) else: - optimizer = fluid.optimizer.SGD(learning_rate=0.003) + optimizer = fluid.optimizer.SGD( + learning_rate=0.003, + parameter_list=transformer.parameters()) dy_param_init = dict() dy_param_updated = dict() @@ -1073,7 +1038,6 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed transformer = TransFormer( - 'transformer', ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 9476e7ce4d7..51f0a1fd40b 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -197,7 +197,8 @@ class TestLayer(LayerTest): fc1_bias_init = fc1.bias.detach() loss1.backward() - optimizer1 = fluid.optimizer.SGD(learning_rate=0.1) + optimizer1 = fluid.optimizer.SGD(learning_rate=0.1, + parameter_list=fc1.parameters()) optimizer1.minimize(loss1) fc1_weight_updated = fc1.weight.detach() @@ -224,7 +225,8 @@ class TestLayer(LayerTest): out2 = fc2(base.to_variable(inp)) loss2 = fluid.layers.reduce_mean(out2) loss2.backward() - optimizer2 = fluid.optimizer.SGD(learning_rate=0.1) + optimizer2 = fluid.optimizer.SGD(learning_rate=0.1, + parameter_list=fc2.parameters()) optimizer2.minimize(loss2) self.assertTrue( -- GitLab
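Reviewer note: the migration pattern that all of the updated tests above follow can be summarized in one short dygraph sketch. This is a minimal, illustrative example assuming the Paddle 1.6-era fluid.dygraph API touched by this patch; the SimpleMLP class, the layer sizes, and the random input are hypothetical and not taken from the diff itself.

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear
from paddle.fluid.dygraph.base import to_variable

class SimpleMLP(fluid.dygraph.Layer):
    # Layers are now constructed without a name_scope argument, and Linear
    # (which takes explicit input/output sizes) replaces the removed FC.
    def __init__(self):
        super(SimpleMLP, self).__init__()
        self._fc1 = Linear(784, 10)
        self._fc2 = Linear(10, 10)

    def forward(self, x):
        y = self._fc1(x)
        return self._fc2(y)

with fluid.dygraph.guard():
    model = SimpleMLP()
    # With parameters no longer tracked by the tracer, the optimizer must be
    # told which parameters to update via parameter_list.
    sgd = fluid.optimizer.SGD(learning_rate=1e-3,
                              parameter_list=model.parameters())
    x = to_variable(np.random.random((4, 784)).astype('float32'))
    loss = fluid.layers.reduce_mean(model(x))
    loss.backward()
    sgd.minimize(loss)
    model.clear_gradients()

After this change, dygraph optimizers are expected to receive parameter_list explicitly, which is why every updated test in this patch now passes model.parameters() when the optimizer is constructed.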