Commit dca07583 authored by zhongpu, committed by hong

remove params in Tracer object (in dygraph) (#20815)

* remove params in Tracer object, test=develop

* Repair failed optest, test=develop

* remove build_once & name_scope (Conv2D)
test=develop

* fix unittest
test=develop

* Conv2DTranspose

* Conv3D & Conv3DTranspose
test=develop

* Pool2D & BatchNorm

* Embedding

* LayerNorm

* GRUUnit & NCE

* PRelu

* BilinearTensorProduct

* GroupNorm & SpectralNorm

* TreeConv
test=develop

* fix LayerNorm in transformer unittest
test=develop

* disable LayerNorm or BatchNorm in multicard
test=develop

* refine Layer.create_parameter api
test=develop

* refine LayerNorm, remove begin_norm_axis param, add normed shape check
test=develop

* LayerNorm bug fix
test=develop

* fix optest,test=develop

* fix optest, test=develop

* fix optest to pass parameter_list when constructing an Optimizer class instance, test=develop

* polish code for better code style, test=develop

* fix se_resnext optest, test=develop

* polish code for better code style, test=develop

Co-authored-by: songyouwei <youwei0314@gmail.com>
Parent c3e19549
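The core of the change: in dygraph mode an optimizer no longer discovers trainable parameters through the global Tracer and must be handed them explicitly. A minimal sketch of the new calling convention, using the Embedding layer that appears in the docstring examples below (shapes are illustrative):

import paddle.fluid as fluid

with fluid.dygraph.guard():
    emb = fluid.dygraph.Embedding([10, 10])  # any dygraph Layer works here
    # parameter_list is now required when constructing an optimizer in dygraph mode
    sgd = fluid.optimizer.SGD(learning_rate=1e-3,
                              parameter_list=emb.parameters())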
@@ -154,14 +154,14 @@ def guard(place=None):
         yield
 
-def _print_debug_msg(limit=5, is_test=False):
+def _print_debug_msg(parameter_list, limit=5, is_test=False):
     if not core._is_dygraph_debug_enabled():
         logging.warn(
             'Debug mode is not enabled. Please set FLAGS_dygraph_debug=1 to enable debug'
         )
         return
 
     unique_name_size = len(framework.unique_name.generator.ids)
-    tracer_var_size = len(framework._dygraph_tracer()._vars)
+    tracer_var_size = len(parameter_list)
     alive_cpp_var_size = len(core.VarBase._alive_vars())
     if not is_test:
         logging.warn(
......
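With the hunk above, _print_debug_msg no longer reads the Tracer's variable map; callers pass the parameters in. A usage sketch matching the updated tests (mlp stands for any dygraph Layer already built under fluid.dygraph.guard(), and FLAGS_dygraph_debug=1 must be set for anything to print):

# returns (unique_name_size, tracer_var_size, alive_cpp_var_size) when is_test=True
fluid.dygraph.base._print_debug_msg(mlp.parameters(), is_test=True)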
@@ -53,7 +53,8 @@ def save_dygraph(state_dict, model_path):
             state_dict = emb.state_dict()
             fluid.save_dygraph( state_dict, "paddle_dy")
 
-            adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000) )
+            adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000),
+                                         parameter_list = emb.parameters() )
             state_dict = adam.state_dict()
             fluid.save_dygraph( state_dict, "paddle_dy")
@@ -96,7 +97,8 @@ def load_dygraph(model_path):
             state_dict = emb.state_dict()
             fluid.save_dygraph( state_dict, "paddle_dy")
 
-            adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000) )
+            adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000),
+                                         parameter_list = emb.parameters() )
             state_dict = adam.state_dict()
             fluid.save_dygraph( state_dict, "padle_dy")
......
@@ -145,9 +145,13 @@ class Layer(core.Layer):
             list of :ref:`api_guide_Variable_en` : a list of Parameters.
         """
         ret = [p for p in self._parameters.values()]
+        parameters_set = set(ret)
         if include_sublayers:
             for l in self._sub_layers.values():
                 for p in l.parameters(include_sublayers):
+                    if p in parameters_set:
+                        continue
+                    parameters_set.add(p)
                     ret.append(p)
         return ret
@@ -261,11 +265,6 @@ class Layer(core.Layer):
                 value.set_value(self._loaddict_holder[value.name])
 
-            if name in params:
-                # remove unused param in tracer
-                if framework._dygraph_tracer_ is not None:
-                    framework._dygraph_tracer_._vars.pop(params[name].name,
-                                                         None)
             params[name] = value
         elif isinstance(value, core.Layer):
             layers = self.__dict__.get('_sub_layers', None)
......
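Besides dropping the Tracer cleanup in __setattr__, the parameters() hunk above starts deduplicating parameters that are shared between sublayers. A self-contained sketch of that dedup logic (collect_params and the toy objects are illustrative, not names from the patch):

def collect_params(own_params, sublayer_param_lists):
    ret = list(own_params)
    seen = set(ret)  # parameters collected so far
    for sub_params in sublayer_param_lists:
        for p in sub_params:
            if p in seen:  # a parameter shared by two sublayers is kept only once
                continue
            seen.add(p)
            ret.append(p)
    return ret

shared = object()
assert len(collect_params([], [[shared], [shared]])) == 1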
@@ -104,8 +104,10 @@ class PiecewiseDecay(LearningRateDecay):
           boundaries = [10000, 20000]
           values = [1.0, 0.5, 0.1]
           with fluid.dygraph.guard():
+              emb = fluid.dygraph.Embedding( [10, 10] )
               optimizer = fluid.optimizer.SGD(
-                  learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0) )
+                  learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0),
+                  parameter_list = emb.parameters() )
     """
 
     def __init__(self, boundaries, values, begin, step=1, dtype='float32'):
@@ -323,12 +325,14 @@ class InverseTimeDecay(LearningRateDecay):
           import paddle.fluid as fluid
           base_lr = 0.1
           with fluid.dygraph.guard():
+              emb = fluid.dygraph.Embedding([10, 10])
               sgd_optimizer = fluid.optimizer.SGD(
                   learning_rate=fluid.dygraph.InverseTimeDecay(
                       learning_rate=base_lr,
                       decay_steps=10000,
                       decay_rate=0.5,
-                      staircase=True))
+                      staircase=True),
+                  parameter_list = emb.parameters())
     """
@@ -404,9 +408,11 @@ class PolynomialDecay(LearningRateDecay):
           total_step = 5000
           end_lr = 0
           with fluid.dygraph.guard():
+              emb = fluid.dygraph.Embedding( [10, 10])
               optimizer = fluid.optimizer.SGD(
                   learning_rate = fluid.dygraph.PolynomialDecay(
-                      start_lr, total_step, end_lr, power=1.0) )
+                      start_lr, total_step, end_lr, power=1.0),
+                  parameter_list = emb.parameters())
     """
@@ -536,10 +542,12 @@ class NoamDecay(LearningRateDecay):
           warmup_steps = 100
           learning_rate = 0.01
           with fluid.dygraph.guard():
+              emb = fluid.dygraph.Embedding([10, 10])
               optimizer = fluid.optimizer.SGD(
                   learning_rate = fluid.dygraph.NoamDecay(
                       1/(warmup_steps *(learning_rate ** 2)),
-                      warmup_steps) )
+                      warmup_steps),
+                  parameter_list = emb.parameters())
     """
 
     def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'):
......
@@ -31,16 +31,8 @@ class Tracer(core.Tracer):
     def __init__(self):
         super(Tracer, self).__init__()
 
-        self._vars = defaultdict()
         self._train_mode = True
 
-    def trace_var(self, name, var):
-        self._vars[name] = var
-
-    def all_parameters(self):
-        return list((item for name, item in six.iteritems(self._vars)
-                     if isinstance(item, framework.Parameter)))
-
     def trace_op(self, type, inputs, outputs, attrs, stop_gradient=False):
         self.trace(type, inputs, outputs, attrs,
                    framework._current_expected_place(), self._train_mode and
......
@@ -4676,8 +4676,6 @@ class ParamBase(core.VarBase):
         # self.block = default_main_program().global_block()
 
-        _dygraph_tracer().trace_var(name, self)
-
     def __str__(self):
         return self.to_string(True)
......
@@ -60,7 +60,12 @@ class Optimizer(object):
     """
 
     @imperative_base.no_grad
-    def __init__(self, learning_rate, regularization=None, name=None):
+    def __init__(self,
+                 learning_rate,
+                 parameter_list=None,
+                 regularization=None,
+                 name=None):
+        self._parameter_list = None
         if framework.in_dygraph_mode():
             if not isinstance(learning_rate, float) and \
                     not isinstance(learning_rate, LearningRateDecay):
@@ -71,6 +76,12 @@ class Optimizer(object):
                 self._name = unique_name.generate(name)
             else:
                 self._name = unique_name.generate(self.__class__.__name__)
+            if parameter_list is not None:
+                self._parameter_list = parameter_list
+            else:
+                raise AttributeError(
+                    "parameter_list argument given to the Optimizer should not be None in dygraph mode."
+                )
         else:
             if not isinstance(learning_rate, float) and \
                     not isinstance(learning_rate, framework.Variable):
@@ -154,7 +165,8 @@ class Optimizer(object):
                 state_dict = emb.state_dict()
                 fluid.save_dygraph( state_dict, "paddle_dy")
 
-                adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000) )
+                adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000),
+                                             parameter_list = emb.parameters() )
                 state_dict = adam.state_dict()
                 fluid.save_dygraph( state_dict, "padle_dy")
@@ -530,13 +542,8 @@ class Optimizer(object):
         self._dtype = loss.dtype
         if framework.in_dygraph_mode():
-            if parameter_list is not None:
-                parameters = parameter_list
-            else:
-                parameters = framework._dygraph_tracer().all_parameters()
-
             params_grads = []
-            for param in parameters:
+            for param in self._parameter_list:
                 if not param.trainable:
                     continue
                 if param._grad_ivar() is not None:
@@ -705,6 +712,9 @@ class SGDOptimizer(Optimizer):
     Parameters:
         learning_rate (float|Variable): The learning rate used to update parameters. \
             Can be a float value or a Variable with one float value as data element.
+        parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
         regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. \
             Optional, default is None.
         name (str, optional): This parameter is used by developers to print debugging information. \
@@ -740,10 +750,15 @@ class SGDOptimizer(Optimizer):
     """
 
-    def __init__(self, learning_rate, regularization=None, name=None):
+    def __init__(self,
+                 learning_rate,
+                 parameter_list=None,
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         super(SGDOptimizer, self).__init__(
             learning_rate=learning_rate,
+            parameter_list=parameter_list,
             regularization=regularization,
             name=name)
         self.type = "sgd"
@@ -801,6 +816,9 @@ class MomentumOptimizer(Optimizer):
         learning_rate (float|Variable): The learning rate used to update parameters. \
             Can be a float value or a Variable with one float value as data element.
         momentum (float): Momentum factor
+        parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
         use_nesterov (bool, optional): Enables Nesterov momentum, default is false.
         regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. \
             Optional, default is None.
@@ -841,6 +859,7 @@ class MomentumOptimizer(Optimizer):
     def __init__(self,
                  learning_rate,
                  momentum,
+                 parameter_list=None,
                  use_nesterov=False,
                  regularization=None,
                  name=None):
@@ -848,6 +867,7 @@ class MomentumOptimizer(Optimizer):
         assert momentum is not None
         super(MomentumOptimizer, self).__init__(
             learning_rate=learning_rate,
+            parameter_list=parameter_list,
             regularization=regularization,
             name=name)
         self.type = "momentum"
@@ -921,6 +941,9 @@ class DGCMomentumOptimizer(Optimizer):
         sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity). \
             Default is [0.999]. For example, if the sparsity is [0.99, 0.999], \
             the top [1%, 0.1%] important element will be transmitted.
+        parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
         use_nesterov (bool): Enables Nesterov momentum. True means use Nesterov. Default is False.
         local_grad_clip_norm (float, optional): Local gradient clip norm value. Optional, default is None, represent no need clip.
         num_trainers (int, optional): The number of training nodes. Optional, default is None.
@@ -950,6 +973,7 @@ class DGCMomentumOptimizer(Optimizer):
                  rampup_begin_step,
                  rampup_step=1,
                  sparsity=[0.999],
+                 parameter_list=None,
                  use_nesterov=False,
                  local_grad_clip_norm=None,
                  num_trainers=None,
@@ -959,6 +983,7 @@ class DGCMomentumOptimizer(Optimizer):
         assert momentum is not None
         super(DGCMomentumOptimizer, self).__init__(
             learning_rate=learning_rate,
+            parameter_list=parameter_list,
             regularization=regularization,
             name=name)
         self.type = "dgc_momentum"
@@ -1286,6 +1311,9 @@ class LarsMomentumOptimizer(Optimizer):
         momentum (float): momentum factor
         lars_coeff (float): Defines how much we trust the layer to change its weights.
         lars_weight_decay (float): Weight decay coefficient for decaying using LARS.
+        parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
         regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`.
             Optional, default is None.
         name (str, optional): This parameter is used by developers to print debugging information. \
@@ -1318,12 +1346,14 @@ class LarsMomentumOptimizer(Optimizer):
                  momentum,
                  lars_coeff=0.001,
                  lars_weight_decay=0.0005,
+                 parameter_list=None,
                  regularization=None,
                  name=None):
         assert learning_rate is not None
         assert momentum is not None
         super(LarsMomentumOptimizer, self).__init__(
             learning_rate=learning_rate,
+            parameter_list=parameter_list,
             regularization=regularization,
             name=name)
         self.type = "lars_momentum"
@@ -1391,6 +1421,9 @@ class AdagradOptimizer(Optimizer):
             It can be a float value or a ``Variable`` with a float type.
         epsilon (float, optional): A small float value for numerical stability.
             The default value is 1e-06.
+        parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as
             :ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None.
         name (str, optional): Normally there is no need for user to set this property.
@@ -1423,6 +1456,7 @@ class AdagradOptimizer(Optimizer):
     def __init__(self,
                  learning_rate,
                  epsilon=1.0e-6,
+                 parameter_list=None,
                  regularization=None,
                  name=None,
                  initial_accumulator_value=0.0):
@@ -1430,6 +1464,7 @@ class AdagradOptimizer(Optimizer):
         assert epsilon is not None
         super(AdagradOptimizer, self).__init__(
             learning_rate=learning_rate,
+            parameter_list=parameter_list,
             regularization=regularization,
             name=name)
         self.type = "adagrad"
@@ -1510,6 +1545,9 @@ class AdamOptimizer(Optimizer):
             The default value is 0.999.
         epsilon (float, optional): A small float value for numerical stability.
             The default value is 1e-08.
+        parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as
             :ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None.
         name (str, optional): Normally there is no need for user to set this property.
@@ -1619,6 +1657,7 @@ class AdamOptimizer(Optimizer):
                  beta1=0.9,
                  beta2=0.999,
                  epsilon=1e-8,
+                 parameter_list=None,
                  regularization=None,
                  name=None,
                  lazy_mode=False):
@@ -1628,6 +1667,7 @@ class AdamOptimizer(Optimizer):
         assert epsilon is not None
         super(AdamOptimizer, self).__init__(
             learning_rate=learning_rate,
+            parameter_list=parameter_list,
             regularization=regularization,
             name=name)
         self.type = "adam"
@@ -1747,6 +1787,9 @@ class AdamaxOptimizer(Optimizer):
             The default value is 0.999.
         epsilon (float, optional): A small float value for numerical stability.
             The default value is 1e-08.
+        parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as
             :ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None.
         name (str, optional): Normally there is no need for user to set this property.
@@ -1792,6 +1835,7 @@ class AdamaxOptimizer(Optimizer):
                  beta1=0.9,
                  beta2=0.999,
                  epsilon=1e-8,
+                 parameter_list=None,
                  regularization=None,
                  name=None):
         assert learning_rate is not None
@@ -1800,6 +1844,7 @@ class AdamaxOptimizer(Optimizer):
         assert epsilon is not None
         super(AdamaxOptimizer, self).__init__(
             learning_rate=learning_rate,
+            parameter_list=parameter_list,
             regularization=regularization,
             name=name)
         self.type = "adamax"
@@ -1909,6 +1954,9 @@ class DpsgdOptimizer(Optimizer):
         clip (float): clipping threshold
         batch_size (float): batch size.
         sigma (float): for gaussian noise.
+        parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
     Notes:
        Currently, DpsgdOptimizer doesn't support sparse parameter optimization.
     """
@@ -1917,12 +1965,14 @@ class DpsgdOptimizer(Optimizer):
                  learning_rate=0.001,
                  clip=0.9,
                  batch_size=0.999,
-                 sigma=1e-8):
+                 sigma=1e-8,
+                 parameter_list=None):
         assert learning_rate is not None
         assert clip is not None
         assert batch_size is not None
         assert sigma is not None
-        super(DpsgdOptimizer, self).__init__(learning_rate=learning_rate)
+        super(DpsgdOptimizer, self).__init__(
+            learning_rate=learning_rate, parameter_list=parameter_list)
         self.type = "dpsgd"
         self._clip = clip
         self._batch_size = batch_size
@@ -1976,6 +2026,9 @@ class DecayedAdagradOptimizer(Optimizer):
         decay (float, optional): The decay rate. The default value is 0.95.
         epsilon (float, optional): A small float value for numerical stability.
             The default value is 1e-06.
+        parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as
             :ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None.
         name (str, optional): Normally there is no need for user to set this property.
@@ -2002,6 +2055,7 @@ class DecayedAdagradOptimizer(Optimizer):
                  learning_rate,
                  decay=0.95,
                  epsilon=1.0e-6,
+                 parameter_list=None,
                  regularization=None,
                  name=None):
         assert learning_rate is not None
@@ -2010,6 +2064,7 @@ class DecayedAdagradOptimizer(Optimizer):
         super(DecayedAdagradOptimizer, self).__init__(
             learning_rate=learning_rate,
+            parameter_list=parameter_list,
             regularization=regularization,
             name=name)
         self.type = "decayed_adagrad"
@@ -2066,6 +2121,9 @@ class AdadeltaOptimizer(Optimizer):
         learning_rate (float|Variable): global learning rate.
         epsilon (float): a small float number for numeric stability. Default 1.0e-6.
         rho (float): a floating point value indicating the decay rate. Default 0.95.
+        parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): A Regularizer, such as
             fluid.regularizer.L2DecayRegularizer. Default None, meaning that there is no
             regularization.
@@ -2097,6 +2155,7 @@ class AdadeltaOptimizer(Optimizer):
                  learning_rate,
                  epsilon=1.0e-6,
                  rho=0.95,
+                 parameter_list=None,
                  regularization=None,
                  name=None):
         if learning_rate is None:
@@ -2107,6 +2166,7 @@ class AdadeltaOptimizer(Optimizer):
             raise ValueError("rho is not set.")
         super(AdadeltaOptimizer, self).__init__(
             learning_rate=learning_rate,
+            parameter_list=parameter_list,
             regularization=regularization,
             name=name)
         self.type = "adadelta"
@@ -2210,6 +2270,9 @@ class RMSPropOptimizer(Optimizer):
             the gradient; if False, by the uncentered second moment. Setting this to
             True may help with training, but is slightly more expensive in terms of
             computation and memory. Defaults to False.
+        parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
         regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. \
             Optional, default is None.
         name (str, optional): This parameter is used by developers to print debugging information. \
@@ -2258,10 +2321,12 @@ class RMSPropOptimizer(Optimizer):
                  epsilon=1.0e-6,
                  momentum=0.0,
                  centered=False,
+                 parameter_list=None,
                  regularization=None,
                  name=None):
         super(RMSPropOptimizer, self).__init__(
             learning_rate=learning_rate,
+            parameter_list=parameter_list,
             regularization=regularization,
             name=name)
         if learning_rate is None:
@@ -2370,6 +2435,9 @@ class FtrlOptimizer(Optimizer):
         l1 (float): L1 regularization strength, default is 0.0.
         l2 (float): L2 regularization strength, default is 0.0.
         lr_power (float): Learning Rate Power, default is -0.5.
+        parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
         regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. \
             Optional, default is None.
         name (str, optional): This parameter is used by developers to print debugging information. \
@@ -2418,10 +2486,12 @@ class FtrlOptimizer(Optimizer):
                  l1=0.0,
                  l2=0.0,
                  lr_power=-0.5,
+                 parameter_list=None,
                  regularization=None,
                  name=None):
         super(FtrlOptimizer, self).__init__(
             learning_rate=learning_rate,
+            parameter_list=parameter_list,
             regularization=regularization,
             name=name)
         if learning_rate is None:
@@ -2504,6 +2574,9 @@ class LambOptimizer(AdamOptimizer):
         beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
             Default 0.999.
         epsilon (float, optional): A small float value for numerical stability. Default 1e-6.
+        parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
         regularization (Regularizer|None): A Regularizer, such as
             fluid.regularizer.L1DecayRegularizer. Default None.
         exclude_from_weight_decay_fn (function|None): Exclude a parameter from weight
@@ -2540,6 +2613,7 @@ class LambOptimizer(AdamOptimizer):
                  beta1=0.9,
                  beta2=0.999,
                  epsilon=1e-6,
+                 parameter_list=None,
                  regularization=None,
                  exclude_from_weight_decay_fn=None,
                  name=None):
@@ -2550,6 +2624,7 @@ class LambOptimizer(AdamOptimizer):
         assert epsilon is not None
         super(LambOptimizer, self).__init__(
             learning_rate=learning_rate,
+            parameter_list=parameter_list,
             regularization=regularization,
             beta1=beta1,
             beta2=beta2,
......
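Every optimizer subclass above forwards parameter_list unchanged to Optimizer.__init__, which now raises when the list is missing in dygraph mode. A sketch of the resulting behavior (the except branch relies on the AttributeError added in the hunk at -60,7 above):

import paddle.fluid as fluid

with fluid.dygraph.guard():
    try:
        fluid.optimizer.SGD(learning_rate=1e-3)  # no parameter_list in dygraph mode
    except AttributeError as e:
        print(e)  # "parameter_list argument given to the Optimizer should not be None..."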
@@ -26,7 +26,7 @@ import paddle.fluid as fluid
 import paddle.fluid.dygraph as dygraph
 from paddle.fluid import core
 from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
 from paddle.fluid.dygraph.base import to_variable
 from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
@@ -79,8 +79,8 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
 class MNIST(fluid.dygraph.Layer):
-    def __init__(self, name_scope):
-        super(MNIST, self).__init__(name_scope)
+    def __init__(self):
+        super(MNIST, self).__init__()
 
         self._simple_img_conv_pool_1 = SimpleImgConvPool(
             1, 20, 5, 2, 2, act="relu")
@@ -88,19 +88,21 @@ class MNIST(fluid.dygraph.Layer):
         self._simple_img_conv_pool_2 = SimpleImgConvPool(
             20, 50, 5, 2, 2, act="relu")
 
-        pool_2_shape = 50 * 4 * 4
+        self.pool_2_shape = 50 * 4 * 4
         SIZE = 10
-        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
-        self._fc = FC(self.full_name(),
-                      10,
-                      param_attr=fluid.param_attr.ParamAttr(
-                          initializer=fluid.initializer.NormalInitializer(
-                              loc=0.0, scale=scale)),
-                      act="softmax")
+        scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
+        self._fc = Linear(
+            self.pool_2_shape,
+            10,
+            param_attr=fluid.param_attr.ParamAttr(
+                initializer=fluid.initializer.NormalInitializer(
+                    loc=0.0, scale=scale)),
+            act="softmax")
 
     def forward(self, inputs, label):
         x = self._simple_img_conv_pool_1(inputs)
         x = self._simple_img_conv_pool_2(x)
+        x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape])
         cost = self._fc(x)
         loss = fluid.layers.cross_entropy(cost, label)
         avg_loss = fluid.layers.mean(loss)
@@ -109,10 +111,11 @@ class MNIST(fluid.dygraph.Layer):
 class TestMnist(TestParallelDyGraphRunnerBase):
     def get_model(self):
-        model = MNIST("mnist")
+        model = MNIST()
         train_reader = paddle.batch(
             paddle.dataset.mnist.train(), batch_size=2, drop_last=True)
-        opt = fluid.optimizer.Adam(learning_rate=1e-3)
+        opt = fluid.optimizer.Adam(
+            learning_rate=1e-3, parameter_list=model.parameters())
         return model, train_reader, opt
 
     def run_one_loop(self, model, opt, data):
......
@@ -27,7 +27,7 @@ import paddle.fluid as fluid
 import paddle.fluid.dygraph as dygraph
 from paddle.fluid import core
 from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm
 from paddle.fluid.dygraph.base import to_variable
 from paddle.fluid.layer_helper import LayerHelper
 import math
@@ -54,7 +54,7 @@ train_parameters = {
 }
 
-def optimizer_setting(params):
+def optimizer_setting(params, parameter_list=None):
     ls = params["learning_strategy"]
     if "total_images" not in params:
         total_images = 6149
@@ -66,11 +66,19 @@ def optimizer_setting(params):
         bd = [step * e for e in ls["epochs"]]
         lr = params["lr"]
         num_epochs = params["num_epochs"]
-        optimizer = fluid.optimizer.Momentum(
-            learning_rate=fluid.layers.cosine_decay(
-                learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
-            momentum=momentum_rate,
-            regularization=fluid.regularizer.L2Decay(l2_decay))
+        if fluid.in_dygraph_mode():
+            optimizer = fluid.optimizer.Momentum(
+                learning_rate=fluid.layers.cosine_decay(
+                    learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
+                momentum=momentum_rate,
+                regularization=fluid.regularizer.L2Decay(l2_decay),
+                parameter_list=parameter_list)
+        else:
+            optimizer = fluid.optimizer.Momentum(
+                learning_rate=fluid.layers.cosine_decay(
+                    learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
+                momentum=momentum_rate,
+                regularization=fluid.regularizer.L2Decay(l2_decay))
 
     return optimizer
@@ -107,27 +115,29 @@ class ConvBNLayer(fluid.dygraph.Layer):
 class SqueezeExcitation(fluid.dygraph.Layer):
-    def __init__(self, name_scope, num_channels, reduction_ratio):
-        super(SqueezeExcitation, self).__init__(name_scope)
+    def __init__(self, num_channels, reduction_ratio):
+        super(SqueezeExcitation, self).__init__()
+        self._num_channels = num_channels
         self._pool = Pool2D(pool_size=0, pool_type='avg', global_pooling=True)
         stdv = 1.0 / math.sqrt(num_channels * 1.0)
-        self._squeeze = FC(
-            self.full_name(),
-            size=num_channels // reduction_ratio,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(-stdv, stdv)),
-            act='relu')
+        self._squeeze = Linear(
+            num_channels,
+            num_channels // reduction_ratio,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(-stdv, stdv)),
+            act='relu')
         stdv = 1.0 / math.sqrt(num_channels / 16.0 * 1.0)
-        self._excitation = FC(
-            self.full_name(),
-            size=num_channels,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(-stdv, stdv)),
-            act='sigmoid')
+        self._excitation = Linear(
+            num_channels // reduction_ratio,
+            num_channels,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(-stdv, stdv)),
+            act='sigmoid')
 
     def forward(self, input):
         y = self._pool(input)
+        y = fluid.layers.reshape(y, shape=[-1, self._num_channels])
         y = self._squeeze(y)
         y = self._excitation(y)
         y = fluid.layers.elementwise_mul(x=input, y=y, axis=0)
@@ -163,9 +173,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
             act=None)
 
         self.scale = SqueezeExcitation(
-            self.full_name(),
-            num_channels=num_filters * 2,
-            reduction_ratio=reduction_ratio)
+            num_channels=num_filters * 2, reduction_ratio=reduction_ratio)
 
         if not shortcut:
             self.short = ConvBNLayer(
@@ -194,8 +202,8 @@ class BottleneckBlock(fluid.dygraph.Layer):
 class SeResNeXt(fluid.dygraph.Layer):
-    def __init__(self, name_scope, layers=50, class_dim=102):
-        super(SeResNeXt, self).__init__(name_scope)
+    def __init__(self, layers=50, class_dim=102):
+        super(SeResNeXt, self).__init__()
 
         self.layers = layers
         supported_layers = [50, 101, 152]
@@ -276,10 +284,13 @@ class SeResNeXt(fluid.dygraph.Layer):
             pool_size=7, pool_type='avg', global_pooling=True)
         stdv = 1.0 / math.sqrt(2048 * 1.0)
-        self.out = FC(self.full_name(),
-                      size=class_dim,
-                      param_attr=fluid.param_attr.ParamAttr(
-                          initializer=fluid.initializer.Uniform(-stdv, stdv)))
+        self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 2 * 1 * 1
+
+        self.out = Linear(
+            self.pool2d_avg_output,
+            class_dim,
+            param_attr=fluid.param_attr.ParamAttr(
+                initializer=fluid.initializer.Uniform(-stdv, stdv)))
 
     def forward(self, inputs):
         if self.layers == 50 or self.layers == 101:
@@ -294,18 +305,20 @@ class SeResNeXt(fluid.dygraph.Layer):
         for bottleneck_block in self.bottleneck_block_list:
             y = bottleneck_block(y)
         y = self.pool2d_avg(y)
+        y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output])
         y = self.out(y)
         return y
 
 class TestSeResNeXt(TestParallelDyGraphRunnerBase):
     def get_model(self):
-        model = SeResNeXt("se-resnext")
+        model = SeResNeXt()
         train_reader = paddle.batch(
             paddle.dataset.flowers.test(use_xmap=False),
             batch_size=train_parameters["batch_size"],
             drop_last=True)
-        optimizer = optimizer_setting(train_parameters)
+        optimizer = optimizer_setting(
+            train_parameters, parameter_list=model.parameters())
 
         return model, train_reader, optimizer
 
     def run_one_loop(self, model, opt, data):
......
@@ -23,7 +23,7 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
 from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
@@ -75,8 +75,8 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
 class MNIST(fluid.dygraph.Layer):
-    def __init__(self, name_scope):
-        super(MNIST, self).__init__(name_scope)
+    def __init__(self):
+        super(MNIST, self).__init__()
 
         self._simple_img_conv_pool_1 = SimpleImgConvPool(
             1, 20, 5, 2, 2, act="relu")
@@ -84,19 +84,21 @@ class MNIST(fluid.dygraph.Layer):
         self._simple_img_conv_pool_2 = SimpleImgConvPool(
             20, 50, 5, 2, 2, act="relu")
 
-        pool_2_shape = 50 * 4 * 4
-        SIZE = 10
-        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
-        self._fc = FC(self.full_name(),
-                      10,
-                      param_attr=fluid.param_attr.ParamAttr(
-                          initializer=fluid.initializer.NormalInitializer(
-                              loc=0.0, scale=scale)),
-                      act="softmax")
+        self.pool_2_shape = 50 * 4 * 4
+        SIZE = 100  #10
+        scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
+        self._fc = Linear(
+            self.pool_2_shape,
+            SIZE,
+            param_attr=fluid.param_attr.ParamAttr(
+                initializer=fluid.initializer.NormalInitializer(
+                    loc=0.0, scale=scale)),
+            act="softmax")
 
     def forward(self, inputs):
         x = self._simple_img_conv_pool_1(inputs)
         x = self._simple_img_conv_pool_2(x)
+        x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape])
         x = self._fc(x)
         return x
@@ -109,8 +111,9 @@ class TestDygraphMultiForward(unittest.TestCase):
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
 
-            mnist = MNIST("mnist")
-            sgd = SGDOptimizer(learning_rate=1e-3)
+            mnist = MNIST()
+            sgd = SGDOptimizer(
+                learning_rate=1e-3, parameter_list=mnist.parameters())
             train_reader = paddle.batch(
                 paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
@@ -145,7 +148,7 @@ class TestDygraphMultiForward(unittest.TestCase):
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
 
-            mnist = MNIST("mnist")
+            mnist = MNIST()
             sgd = SGDOptimizer(learning_rate=1e-3)
             train_reader = paddle.batch(
                 paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
......
@@ -258,7 +258,9 @@ class TestImperativeAutoPrune(unittest.TestCase):
             fc2_origin = fc2._w.numpy()
             fc2._w.stop_gradient = True
             out2.backward()
-            optimizer = fluid.optimizer.SGD(learning_rate=0.003)
+            optimizer = fluid.optimizer.SGD(
+                learning_rate=0.003,
+                parameter_list=(fc.parameters() + fc2.parameters()))
             optimizer.minimize(out2)
             self.assertTrue(np.array_equal(fc2_origin, fc2._w.numpy()))
             self.assertFalse(np.array_equal(fc_origin, fc._w.numpy()))
@@ -279,7 +281,9 @@ class TestImperativeAutoPrune(unittest.TestCase):
             fc2_origin = fc2._w.numpy()
             out2.stop_gradient = True
             out2.backward()
-            optimizer = fluid.optimizer.SGD(learning_rate=0.003)
+            optimizer = fluid.optimizer.SGD(
+                learning_rate=0.003,
+                parameter_list=(fc.parameters() + fc2.parameters()))
             optimizer.minimize(out2)
             self.assertTrue(np.array_equal(fc2_origin, fc2._w.numpy()))
             self.assertTrue(np.array_equal(fc_origin, fc._w.numpy()))
@@ -320,7 +324,8 @@ class TestImperativeAutoPrune(unittest.TestCase):
         place = fluid.CPUPlace()
         with fluid.dygraph.guard(place):
             model = MyLayer("mylayer", vocab_size, size)
-            optimizer = fluid.optimizer.AdamOptimizer(0.001)
+            optimizer = fluid.optimizer.AdamOptimizer(
+                0.001, parameter_list=model.parameters())
             grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
             indices = fluid.dygraph.to_variable(indices)
@@ -338,7 +343,8 @@ class TestImperativeAutoPrune(unittest.TestCase):
         with fluid.dygraph.guard(place):
             model = MyLayer2("mylayer", vocab_size, size)
-            optimizer = fluid.optimizer.AdamOptimizer(0.001)
+            optimizer = fluid.optimizer.AdamOptimizer(
+                0.001, parameter_list=model.parameters())
             grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
             indices = fluid.dygraph.to_variable(indices)
......
@@ -58,7 +58,7 @@ class TestDygraphDebugString(unittest.TestCase):
                 out.backward()
                 mlp.clear_gradients()
                 unique_name_tmp, trace_var_tmp, alive_var_tmp = fluid.dygraph.base._print_debug_msg(
-                    is_test=True)
+                    mlp.parameters(), is_test=True)
                 if i > 0:
                     self.assertGreaterEqual(unique_name, unique_name_tmp)
                     self.assertGreaterEqual(trace_var, trace_var_tmp)
@@ -68,7 +68,7 @@ class TestDygraphDebugString(unittest.TestCase):
                     trace_var = trace_var_tmp
                     alive_var = alive_var_tmp
                 try:
-                    fluid.dygraph.base._print_debug_msg()
+                    fluid.dygraph.base._print_debug_msg(mlp.parameters())
                 except Exception as e:
                     raise RuntimeError(
                         "No Exception is accepted in _print_debug_msg, but we got: {}".
......
@@ -23,6 +23,7 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 from test_imperative_base import new_program_scope
 from paddle.fluid.dygraph.base import to_variable
+from paddle.fluid.dygraph import Linear
 
 # Can use Amusic dataset as the DeepCF describes.
 DATA_PATH = os.environ.get('DATA_PATH', '')
@@ -33,10 +34,10 @@ NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1))
 class DMF(fluid.Layer):
-    def __init__(self, name_scope):
-        super(DMF, self).__init__(name_scope)
-        self._user_latent = fluid.FC(self.full_name(), 256)
-        self._item_latent = fluid.FC(self.full_name(), 256)
+    def __init__(self):
+        super(DMF, self).__init__()
+        self._user_latent = Linear(1000, 256)
+        self._item_latent = Linear(100, 256)
 
         self._user_layers = []
         self._item_layers = []
@@ -45,11 +46,17 @@ class DMF(fluid.Layer):
             self._user_layers.append(
                 self.add_sublayer(
                     'user_layer_%d' % i,
-                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
+                    Linear(
+                        256 if i == 0 else self._hid_sizes[i - 1],
+                        self._hid_sizes[i],
+                        act='relu')))
             self._item_layers.append(
                 self.add_sublayer(
                     'item_layer_%d' % i,
-                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
+                    Linear(
+                        256 if i == 0 else self._hid_sizes[i - 1],
+                        self._hid_sizes[i],
+                        act='relu')))
 
     def forward(self, users, items):
         users = self._user_latent(users)
@@ -62,17 +69,20 @@ class DMF(fluid.Layer):
 class MLP(fluid.Layer):
-    def __init__(self, name_scope):
-        super(MLP, self).__init__(name_scope)
-        self._user_latent = fluid.FC(self.full_name(), 256)
-        self._item_latent = fluid.FC(self.full_name(), 256)
+    def __init__(self):
+        super(MLP, self).__init__()
+        self._user_latent = Linear(1000, 256)
+        self._item_latent = Linear(100, 256)
         self._match_layers = []
         self._hid_sizes = [128, 64]
         for i in range(len(self._hid_sizes)):
             self._match_layers.append(
                 self.add_sublayer(
                     'match_layer_%d' % i,
-                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
+                    Linear(
+                        256 * 2 if i == 0 else self._hid_sizes[i - 1],
+                        self._hid_sizes[i],
+                        act='relu')))
 
     def forward(self, users, items):
         users = self._user_latent(users)
@@ -85,8 +95,8 @@ class MLP(fluid.Layer):
 class DeepCF(fluid.Layer):
-    def __init__(self, name_scope, num_users, num_items, matrix):
-        super(DeepCF, self).__init__(name_scope)
+    def __init__(self, num_users, num_items, matrix):
+        super(DeepCF, self).__init__()
         self._num_users = num_users
         self._num_items = num_items
         self._rating_matrix = self.create_parameter(
@@ -97,9 +107,9 @@ class DeepCF(fluid.Layer):
             default_initializer=fluid.initializer.NumpyArrayInitializer(matrix))
         self._rating_matrix.stop_gradient = True
 
-        self._mlp = MLP(self.full_name())
-        self._dmf = DMF(self.full_name())
-        self._match_fc = fluid.FC(self.full_name(), 1, act='sigmoid')
+        self._mlp = MLP()
+        self._dmf = DMF()
+        self._match_fc = Linear(128, 1, act='sigmoid')
 
     def forward(self, users, items):
         # users_emb = self._user_emb(users)
@@ -208,7 +218,7 @@ class TestDygraphDeepCF(unittest.TestCase):
             items = fluid.layers.data('items', [1], dtype='int32')
             labels = fluid.layers.data('labels', [1], dtype='float32')
-            deepcf = DeepCF('deepcf', num_users, num_items, matrix)
+            deepcf = DeepCF(num_users, num_items, matrix)
             prediction = deepcf(users, items)
             loss = fluid.layers.reduce_sum(
                 fluid.layers.log_loss(prediction, labels))
@@ -237,8 +247,9 @@ class TestDygraphDeepCF(unittest.TestCase):
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
 
-            deepcf = DeepCF('deepcf', num_users, num_items, matrix)
-            adam = fluid.optimizer.AdamOptimizer(0.01)
+            deepcf = DeepCF(num_users, num_items, matrix)
+            adam = fluid.optimizer.AdamOptimizer(
+                0.01, parameter_list=deepcf.parameters())
 
             for e in range(NUM_EPOCHES):
                 sys.stderr.write('epoch %d\n' % e)
                 for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
@@ -261,8 +272,9 @@ class TestDygraphDeepCF(unittest.TestCase):
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
 
-            deepcf2 = DeepCF('deepcf', num_users, num_items, matrix)
-            adam2 = fluid.optimizer.AdamOptimizer(0.01)
+            deepcf2 = DeepCF(num_users, num_items, matrix)
+            adam2 = fluid.optimizer.AdamOptimizer(
+                0.01, parameter_list=deepcf2.parameters())
 
             backward_strategy = fluid.dygraph.BackwardStrategy()
             backward_strategy.sort_sum_gradient = True
             for e in range(NUM_EPOCHES):
......
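The pattern this file establishes repeats throughout the diff: sublayers lose the name_scope constructor argument, FC is replaced by Linear with an explicit input dimension, and optimizers now receive the model's parameters via parameter_list. A minimal sketch of the new dygraph idiom, using a hypothetical MyNet with toy sizes:

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.dygraph import Linear, to_variable

    class MyNet(fluid.dygraph.Layer):
        def __init__(self):
            super(MyNet, self).__init__()          # no name_scope argument anymore
            self._fc = Linear(16, 4, act='relu')   # input size is now explicit

        def forward(self, x):
            return self._fc(x)

    with fluid.dygraph.guard():
        net = MyNet()
        adam = fluid.optimizer.AdamOptimizer(
            0.01, parameter_list=net.parameters())  # parameters passed explicitly
        out = net(to_variable(np.random.rand(2, 16).astype('float32')))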
...@@ -22,33 +22,35 @@ import paddle ...@@ -22,33 +22,35 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid import Conv2D, Pool2D, FC from paddle.fluid import Conv2D, Pool2D, Linear
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.base import to_variable
class Discriminator(fluid.Layer): class Discriminator(fluid.Layer):
def __init__(self, name_scope): def __init__(self):
super(Discriminator, self).__init__(name_scope) super(Discriminator, self).__init__()
self._fc1 = FC(self.full_name(), size=32, act='elu') self._fc1 = Linear(1, 32, act='elu')
self._fc2 = FC(self.full_name(), size=1) self._fc2 = Linear(32, 1)
def forward(self, inputs): def forward(self, inputs):
x = self._fc1(inputs) x = self._fc1(inputs)
return self._fc2(x) x = self._fc2(x)
return x
class Generator(fluid.Layer): class Generator(fluid.Layer):
def __init__(self, name_scope): def __init__(self):
super(Generator, self).__init__(name_scope) super(Generator, self).__init__()
self._fc1 = FC(self.full_name(), size=64, act='elu') self._fc1 = Linear(2, 64, act='elu')
self._fc2 = FC(self.full_name(), size=64, act='elu') self._fc2 = Linear(64, 64, act='elu')
self._fc3 = FC(self.full_name(), size=1) self._fc3 = Linear(64, 1)
def forward(self, inputs): def forward(self, inputs):
x = self._fc1(inputs) x = self._fc1(inputs)
x = self._fc2(x) x = self._fc2(x)
return self._fc3(x) x = self._fc3(x)
return x
class TestDygraphGAN(unittest.TestCase): class TestDygraphGAN(unittest.TestCase):
...@@ -65,8 +67,8 @@ class TestDygraphGAN(unittest.TestCase): ...@@ -65,8 +67,8 @@ class TestDygraphGAN(unittest.TestCase):
scope = fluid.core.Scope() scope = fluid.core.Scope()
with new_program_scope( with new_program_scope(
main=discriminate_p, startup=startup, scope=scope): main=discriminate_p, startup=startup, scope=scope):
discriminator = Discriminator("d") discriminator = Discriminator()
generator = Generator("g") generator = Generator()
img = fluid.layers.data( img = fluid.layers.data(
name="img", shape=[2, 1], append_batch_size=False) name="img", shape=[2, 1], append_batch_size=False)
...@@ -93,8 +95,8 @@ class TestDygraphGAN(unittest.TestCase): ...@@ -93,8 +95,8 @@ class TestDygraphGAN(unittest.TestCase):
sgd.minimize(d_loss) sgd.minimize(d_loss)
with new_program_scope(main=generate_p, startup=startup, scope=scope): with new_program_scope(main=generate_p, startup=startup, scope=scope):
discriminator = Discriminator("d") discriminator = Discriminator()
generator = Generator("g") generator = Generator()
noise = fluid.layers.data( noise = fluid.layers.data(
name="noise", shape=[2, 2], append_batch_size=False) name="noise", shape=[2, 2], append_batch_size=False)
...@@ -134,9 +136,12 @@ class TestDygraphGAN(unittest.TestCase): ...@@ -134,9 +136,12 @@ class TestDygraphGAN(unittest.TestCase):
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
discriminator = Discriminator("d") discriminator = Discriminator()
generator = Generator("g") generator = Generator()
sgd = SGDOptimizer(learning_rate=1e-3) sgd = SGDOptimizer(
learning_rate=1e-3,
parameter_list=(
discriminator.parameters() + generator.parameters()))
d_real = discriminator(to_variable(np.ones([2, 1], np.float32))) d_real = discriminator(to_variable(np.ones([2, 1], np.float32)))
d_loss_real = fluid.layers.reduce_mean( d_loss_real = fluid.layers.reduce_mean(
...@@ -177,9 +182,12 @@ class TestDygraphGAN(unittest.TestCase): ...@@ -177,9 +182,12 @@ class TestDygraphGAN(unittest.TestCase):
backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True backward_strategy.sort_sum_gradient = True
discriminator2 = Discriminator("d") discriminator2 = Discriminator()
generator2 = Generator("g") generator2 = Generator()
sgd2 = SGDOptimizer(learning_rate=1e-3) sgd2 = SGDOptimizer(
learning_rate=1e-3,
parameter_list=(
discriminator2.parameters() + generator2.parameters()))
d_real2 = discriminator2(to_variable(np.ones([2, 1], np.float32))) d_real2 = discriminator2(to_variable(np.ones([2, 1], np.float32)))
d_loss_real2 = fluid.layers.reduce_mean( d_loss_real2 = fluid.layers.reduce_mean(
......
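Because the Tracer no longer owns a global parameter list, an optimizer that updates two networks at once (as the GAN tests above do) must be handed both parameter lists; Layer.parameters() returns a plain Python list, so concatenation is enough. A sketch with stand-in layers:

    import paddle.fluid as fluid
    from paddle.fluid.dygraph import Linear
    from paddle.fluid.optimizer import SGDOptimizer

    with fluid.dygraph.guard():
        d = Linear(1, 32)   # stand-in for the Discriminator above
        g = Linear(2, 64)   # stand-in for the Generator above
        sgd = SGDOptimizer(
            learning_rate=1e-3,
            parameter_list=d.parameters() + g.parameters())  # plain list concat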
...@@ -131,7 +131,8 @@ class TestDygraphGNN(unittest.TestCase): ...@@ -131,7 +131,8 @@ class TestDygraphGNN(unittest.TestCase):
to_variable(labels)) to_variable(labels))
loss = fluid.layers.reduce_sum(loss) loss = fluid.layers.reduce_sum(loss)
loss.backward() loss.backward()
adam = AdamOptimizer(learning_rate=1e-3) adam = AdamOptimizer(
learning_rate=1e-3, parameter_list=model.parameters())
adam.minimize(loss) adam.minimize(loss)
model.clear_gradients() model.clear_gradients()
...@@ -156,7 +157,8 @@ class TestDygraphGNN(unittest.TestCase): ...@@ -156,7 +157,8 @@ class TestDygraphGNN(unittest.TestCase):
logits2, to_variable(labels2)) logits2, to_variable(labels2))
loss2 = fluid.layers.reduce_sum(loss2) loss2 = fluid.layers.reduce_sum(loss2)
loss2.backward() loss2.backward()
adam2 = AdamOptimizer(learning_rate=1e-3) adam2 = AdamOptimizer(
learning_rate=1e-3, parameter_list=model2.parameters())
adam2.minimize(loss2) adam2.minimize(loss2)
model2.clear_gradients() model2.clear_gradients()
loss2_value = loss2.numpy() loss2_value = loss2.numpy()
......
...@@ -105,7 +105,9 @@ class TestDygraphSimpleNet(unittest.TestCase): ...@@ -105,7 +105,9 @@ class TestDygraphSimpleNet(unittest.TestCase):
is_sparse=is_sparse, is_sparse=is_sparse,
dtype=dtype) dtype=dtype)
sgd = SGDOptimizer(learning_rate=1e-3) sgd = SGDOptimizer(
learning_rate=1e-3,
parameter_list=simple_net.parameters())
dy_param_updated = dict() dy_param_updated = dict()
dy_param_init = dict() dy_param_init = dict()
dy_loss = None dy_loss = None
......
...@@ -23,7 +23,7 @@ import paddle ...@@ -23,7 +23,7 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
from utils import DyGraphProgramDescTracerTestHelper, is_equal_program from utils import DyGraphProgramDescTracerTestHelper, is_equal_program
...@@ -77,8 +77,8 @@ class SimpleImgConvPool(fluid.dygraph.Layer): ...@@ -77,8 +77,8 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
class MNIST(fluid.dygraph.Layer): class MNIST(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self):
super(MNIST, self).__init__(name_scope) super(MNIST, self).__init__()
self._simple_img_conv_pool_1 = SimpleImgConvPool( self._simple_img_conv_pool_1 = SimpleImgConvPool(
1, 20, 5, 2, 2, act="relu") 1, 20, 5, 2, 2, act="relu")
...@@ -86,19 +86,21 @@ class MNIST(fluid.dygraph.Layer): ...@@ -86,19 +86,21 @@ class MNIST(fluid.dygraph.Layer):
self._simple_img_conv_pool_2 = SimpleImgConvPool( self._simple_img_conv_pool_2 = SimpleImgConvPool(
20, 50, 5, 2, 2, act="relu") 20, 50, 5, 2, 2, act="relu")
pool_2_shape = 50 * 4 * 4 self.pool_2_shape = 50 * 4 * 4
SIZE = 10 SIZE = 10
scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
self._fc = FC(self.full_name(), self._fc = Linear(
10, self.pool_2_shape,
param_attr=fluid.param_attr.ParamAttr( 10,
initializer=fluid.initializer.NormalInitializer( param_attr=fluid.param_attr.ParamAttr(
loc=0.0, scale=scale)), initializer=fluid.initializer.NormalInitializer(
act="softmax") loc=0.0, scale=scale)),
act="softmax")
def forward(self, inputs): def forward(self, inputs):
x = self._simple_img_conv_pool_1(inputs) x = self._simple_img_conv_pool_1(inputs)
x = self._simple_img_conv_pool_2(x) x = self._simple_img_conv_pool_2(x)
x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape])
x = self._fc(x) x = self._fc(x)
return x return x
...@@ -125,8 +127,9 @@ class TestImperativeMnist(unittest.TestCase): ...@@ -125,8 +127,9 @@ class TestImperativeMnist(unittest.TestCase):
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
mnist = MNIST("mnist") mnist = MNIST()
sgd = SGDOptimizer(learning_rate=1e-3) sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=mnist.parameters())
batch_py_reader = fluid.io.PyReader(capacity=1) batch_py_reader = fluid.io.PyReader(capacity=1)
batch_py_reader.decorate_sample_list_generator( batch_py_reader.decorate_sample_list_generator(
...@@ -189,7 +192,7 @@ class TestImperativeMnist(unittest.TestCase): ...@@ -189,7 +192,7 @@ class TestImperativeMnist(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace( exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mnist = MNIST("mnist") mnist = MNIST()
sgd = SGDOptimizer(learning_rate=1e-3) sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.dataset.mnist.train(), paddle.dataset.mnist.train(),
......
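Linear has no num_flatten_dims, so conv features must be flattened explicitly before the classifier, which is why the new MNIST.forward gains a reshape on self.pool_2_shape. A minimal sketch with the same 50 * 4 * 4 feature size:

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        fc = fluid.dygraph.Linear(50 * 4 * 4, 10, act='softmax')
        x = fluid.dygraph.to_variable(
            np.random.rand(8, 50, 4, 4).astype('float32'))   # pooled conv features
        x = fluid.layers.reshape(x, shape=[-1, 50 * 4 * 4])  # flatten by hand
        y = fc(x)  # FC flattened internally; Linear expects the flat rank-2 input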
...@@ -39,8 +39,9 @@ class TestImperativeMnistSortGradient(unittest.TestCase): ...@@ -39,8 +39,9 @@ class TestImperativeMnistSortGradient(unittest.TestCase):
backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True backward_strategy.sort_sum_gradient = True
mnist2 = MNIST("mnist") mnist2 = MNIST()
sgd2 = SGDOptimizer(learning_rate=1e-3) sgd2 = SGDOptimizer(
learning_rate=1e-3, parameter_list=mnist2.parameters())
train_reader2 = paddle.batch( train_reader2 = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True) paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
...@@ -85,7 +86,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase): ...@@ -85,7 +86,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace( exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mnist = MNIST("mnist") mnist = MNIST()
sgd = SGDOptimizer(learning_rate=1e-3) sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True) paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
......
...@@ -18,7 +18,7 @@ import numpy as np ...@@ -18,7 +18,7 @@ import numpy as np
import six import six
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm, Embedding, GRUUnit from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm, Embedding, GRUUnit
from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
...@@ -27,6 +27,8 @@ class Config(object): ...@@ -27,6 +27,8 @@ class Config(object):
''' '''
config for training config for training
''' '''
# encoder rnn hidden_size
encoder_size = 200
# decoder size for decoder stage # decoder size for decoder stage
decoder_size = 128 decoder_size = 128
# size for word embedding # size for word embedding
...@@ -118,8 +120,8 @@ class ConvBNPool(fluid.dygraph.Layer): ...@@ -118,8 +120,8 @@ class ConvBNPool(fluid.dygraph.Layer):
class OCRConv(fluid.dygraph.Layer): class OCRConv(fluid.dygraph.Layer):
def __init__(self, name_scope, is_test=False, use_cudnn=True): def __init__(self, is_test=False, use_cudnn=True):
super(OCRConv, self).__init__(name_scope) super(OCRConv, self).__init__()
self.conv_bn_pool_1 = ConvBNPool( self.conv_bn_pool_1 = ConvBNPool(
2, [16, 16], [1, 16], is_test=is_test, use_cudnn=use_cudnn) 2, [16, 16], [1, 16], is_test=is_test, use_cudnn=use_cudnn)
self.conv_bn_pool_2 = ConvBNPool( self.conv_bn_pool_2 = ConvBNPool(
...@@ -143,7 +145,6 @@ class OCRConv(fluid.dygraph.Layer): ...@@ -143,7 +145,6 @@ class OCRConv(fluid.dygraph.Layer):
class DynamicGRU(fluid.dygraph.Layer): class DynamicGRU(fluid.dygraph.Layer):
def __init__(self, def __init__(self,
scope_name,
size, size,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
...@@ -152,7 +153,7 @@ class DynamicGRU(fluid.dygraph.Layer): ...@@ -152,7 +153,7 @@ class DynamicGRU(fluid.dygraph.Layer):
candidate_activation='tanh', candidate_activation='tanh',
h_0=None, h_0=None,
origin_mode=False): origin_mode=False):
super(DynamicGRU, self).__init__(scope_name) super(DynamicGRU, self).__init__()
self.gru_unit = GRUUnit( self.gru_unit = GRUUnit(
size * 3, size * 3,
...@@ -164,6 +165,7 @@ class DynamicGRU(fluid.dygraph.Layer): ...@@ -164,6 +165,7 @@ class DynamicGRU(fluid.dygraph.Layer):
self.h_0 = h_0 self.h_0 = h_0
self.is_reverse = is_reverse self.is_reverse = is_reverse
self.size = size
def forward(self, inputs): def forward(self, inputs):
hidden = self.h_0 hidden = self.h_0
...@@ -188,11 +190,10 @@ class DynamicGRU(fluid.dygraph.Layer): ...@@ -188,11 +190,10 @@ class DynamicGRU(fluid.dygraph.Layer):
class EncoderNet(fluid.dygraph.Layer): class EncoderNet(fluid.dygraph.Layer):
def __init__(self, def __init__(self,
scope_name, rnn_hidden_size=Config.encoder_size,
rnn_hidden_size=200,
is_test=False, is_test=False,
use_cudnn=True): use_cudnn=True):
super(EncoderNet, self).__init__(scope_name) super(EncoderNet, self).__init__()
self.rnn_hidden_size = rnn_hidden_size self.rnn_hidden_size = rnn_hidden_size
para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0,
0.02)) 0.02))
...@@ -207,28 +208,19 @@ class EncoderNet(fluid.dygraph.Layer): ...@@ -207,28 +208,19 @@ class EncoderNet(fluid.dygraph.Layer):
shape=[Config.batch_size, rnn_hidden_size], shape=[Config.batch_size, rnn_hidden_size],
dtype='float32', dtype='float32',
value=0) value=0)
self.ocr_convs = OCRConv( self.ocr_convs = OCRConv(is_test=is_test, use_cudnn=use_cudnn)
self.full_name(), is_test=is_test, use_cudnn=use_cudnn)
self.fc_1_layer = Linear(
self.fc_1_layer = FC(self.full_name(), 768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False)
rnn_hidden_size * 3, self.fc_2_layer = Linear(
param_attr=para_attr, 768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False)
bias_attr=False,
num_flatten_dims=2)
self.fc_2_layer = FC(self.full_name(),
rnn_hidden_size * 3,
param_attr=para_attr,
bias_attr=False,
num_flatten_dims=2)
self.gru_forward_layer = DynamicGRU( self.gru_forward_layer = DynamicGRU(
self.full_name(),
size=rnn_hidden_size, size=rnn_hidden_size,
h_0=h_0, h_0=h_0,
param_attr=para_attr, param_attr=para_attr,
bias_attr=bias_attr, bias_attr=bias_attr,
candidate_activation='relu') candidate_activation='relu')
self.gru_backward_layer = DynamicGRU( self.gru_backward_layer = DynamicGRU(
self.full_name(),
size=rnn_hidden_size, size=rnn_hidden_size,
h_0=h_0, h_0=h_0,
param_attr=para_attr, param_attr=para_attr,
...@@ -236,10 +228,8 @@ class EncoderNet(fluid.dygraph.Layer): ...@@ -236,10 +228,8 @@ class EncoderNet(fluid.dygraph.Layer):
candidate_activation='relu', candidate_activation='relu',
is_reverse=True) is_reverse=True)
self.encoded_proj_fc = FC(self.full_name(), self.encoded_proj_fc = Linear(
Config.decoder_size, rnn_hidden_size * 2, Config.decoder_size, bias_attr=False)
bias_attr=False,
num_flatten_dims=2)
def forward(self, inputs): def forward(self, inputs):
conv_features = self.ocr_convs(inputs) conv_features = self.ocr_convs(inputs)
...@@ -272,18 +262,12 @@ class EncoderNet(fluid.dygraph.Layer): ...@@ -272,18 +262,12 @@ class EncoderNet(fluid.dygraph.Layer):
class SimpleAttention(fluid.dygraph.Layer): class SimpleAttention(fluid.dygraph.Layer):
def __init__(self, scope_name, decoder_size): def __init__(self, decoder_size):
super(SimpleAttention, self).__init__(scope_name) super(SimpleAttention, self).__init__()
self.fc_1 = FC(self.full_name(), self.fc_1 = Linear(
decoder_size, decoder_size, decoder_size, act=None, bias_attr=False)
act=None, self.fc_2 = Linear(decoder_size, 1, act=None, bias_attr=False)
bias_attr=False)
self.fc_2 = FC(self.full_name(),
1,
num_flatten_dims=2,
act=None,
bias_attr=False)
def forward(self, encoder_vec, encoder_proj, decoder_state): def forward(self, encoder_vec, encoder_proj, decoder_state):
...@@ -311,22 +295,18 @@ class SimpleAttention(fluid.dygraph.Layer): ...@@ -311,22 +295,18 @@ class SimpleAttention(fluid.dygraph.Layer):
class GRUDecoderWithAttention(fluid.dygraph.Layer): class GRUDecoderWithAttention(fluid.dygraph.Layer):
def __init__(self, scope_name, decoder_size, num_classes): def __init__(self, decoder_size, num_classes):
super(GRUDecoderWithAttention, self).__init__(scope_name) super(GRUDecoderWithAttention, self).__init__()
self.simple_attention = SimpleAttention(self.full_name(), decoder_size) self.simple_attention = SimpleAttention(decoder_size)
self.fc_1_layer = FC(self.full_name(), self.fc_1_layer = Linear(
size=decoder_size * 3, Config.encoder_size * 2, decoder_size * 3, bias_attr=False)
bias_attr=False) self.fc_2_layer = Linear(
self.fc_2_layer = FC(self.full_name(), decoder_size, decoder_size * 3, bias_attr=False)
size=decoder_size * 3,
bias_attr=False)
self.gru_unit = GRUUnit( self.gru_unit = GRUUnit(
size=decoder_size * 3, param_attr=None, bias_attr=None) size=decoder_size * 3, param_attr=None, bias_attr=None)
self.out_layer = FC(self.full_name(), self.out_layer = Linear(
size=num_classes + 2, decoder_size, num_classes + 2, bias_attr=None, act='softmax')
bias_attr=None,
act='softmax')
self.decoder_size = decoder_size self.decoder_size = decoder_size
...@@ -357,17 +337,18 @@ class GRUDecoderWithAttention(fluid.dygraph.Layer): ...@@ -357,17 +337,18 @@ class GRUDecoderWithAttention(fluid.dygraph.Layer):
class OCRAttention(fluid.dygraph.Layer): class OCRAttention(fluid.dygraph.Layer):
def __init__(self, scope_name): def __init__(self):
super(OCRAttention, self).__init__(scope_name) super(OCRAttention, self).__init__()
self.encoder_net = EncoderNet(self.full_name()) self.encoder_net = EncoderNet()
self.fc = FC(self.full_name(), self.fc = Linear(
size=Config.decoder_size, Config.encoder_size,
bias_attr=False, Config.decoder_size,
act='relu') bias_attr=False,
act='relu')
self.embedding = Embedding( self.embedding = Embedding(
[Config.num_classes + 2, Config.word_vector_dim], dtype='float32') [Config.num_classes + 2, Config.word_vector_dim], dtype='float32')
self.gru_decoder_with_attention = GRUDecoderWithAttention( self.gru_decoder_with_attention = GRUDecoderWithAttention(
self.full_name(), Config.decoder_size, Config.num_classes) Config.decoder_size, Config.num_classes)
def forward(self, inputs, label_in): def forward(self, inputs, label_in):
gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs) gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs)
...@@ -425,14 +406,15 @@ class TestDygraphOCRAttention(unittest.TestCase): ...@@ -425,14 +406,15 @@ class TestDygraphOCRAttention(unittest.TestCase):
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True backward_strategy.sort_sum_gradient = True
ocr_attention = OCRAttention("ocr_attention") ocr_attention = OCRAttention()
if Config.learning_rate_decay == "piecewise_decay": if Config.learning_rate_decay == "piecewise_decay":
learning_rate = fluid.layers.piecewise_decay( learning_rate = fluid.layers.piecewise_decay(
[50000], [Config.LR, Config.LR * 0.01]) [50000], [Config.LR, Config.LR * 0.01])
else: else:
learning_rate = Config.LR learning_rate = Config.LR
optimizer = fluid.optimizer.SGD(learning_rate=0.001) optimizer = fluid.optimizer.SGD(
learning_rate=0.001, parameter_list=ocr_attention.parameters())
dy_param_init_value = {} dy_param_init_value = {}
for param in ocr_attention.parameters(): for param in ocr_attention.parameters():
dy_param_init_value[param.name] = param.numpy() dy_param_init_value[param.name] = param.numpy()
...@@ -478,7 +460,7 @@ class TestDygraphOCRAttention(unittest.TestCase): ...@@ -478,7 +460,7 @@ class TestDygraphOCRAttention(unittest.TestCase):
# print("static start") # print("static start")
exe = fluid.Executor(fluid.CPUPlace( exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
ocr_attention = OCRAttention("ocr_attention") ocr_attention = OCRAttention()
if Config.learning_rate_decay == "piecewise_decay": if Config.learning_rate_decay == "piecewise_decay":
learning_rate = fluid.layers.piecewise_decay( learning_rate = fluid.layers.piecewise_decay(
......
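The OCR encoder drops FC's num_flatten_dims=2 without adding a reshape: Linear multiplies along the last axis, so rank-3 sequence tensors pass through with only the feature dimension changing. A toy-shaped sketch of that assumption:

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        proj = fluid.dygraph.Linear(768, 600, bias_attr=False)
        seq = fluid.dygraph.to_variable(
            np.random.rand(4, 10, 768).astype('float32'))  # [batch, time, feature]
        out = proj(seq)  # applied along the last axis -> shape [4, 10, 600]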
...@@ -23,17 +23,17 @@ import paddle ...@@ -23,17 +23,17 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer, Adam from paddle.fluid.optimizer import SGDOptimizer, Adam
from paddle.fluid.dygraph.nn import FC from paddle.fluid.dygraph import Linear
from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
class MLP(fluid.Layer): class MLP(fluid.Layer):
def __init__(self, name_scope, param_attr=None, bias_attr=None): def __init__(self, param_attr=None, bias_attr=None):
super(MLP, self).__init__(name_scope) super(MLP, self).__init__()
self._fc1 = FC(self.full_name(), 10) self._fc1 = Linear(784, 10)
self._fc2 = FC(self.full_name(), 10) self._fc2 = Linear(10, 10)
def forward(self, inputs): def forward(self, inputs):
y = self._fc1(inputs) y = self._fc1(inputs)
...@@ -45,13 +45,16 @@ class TestImperativeOptimizerBase(unittest.TestCase): ...@@ -45,13 +45,16 @@ class TestImperativeOptimizerBase(unittest.TestCase):
def setUp(self): def setUp(self):
self.batch_num = 20 self.batch_num = 20
def get_optimizer_dygraph(self, parameter_list):
raise NotImplementedError()
def get_optimizer(self): def get_optimizer(self):
raise NotImplementedError() raise NotImplementedError()
def reader_decorator(self, reader): def reader_decorator(self, reader):
def _reader_imple(): def _reader_imple():
for item in reader(): for item in reader():
image = np.array(item[0]).reshape(1, 28, 28) image = np.array(item[0]).reshape(1, 784)
label = np.array(item[1]).astype('int64').reshape(1) label = np.array(item[1]).astype('int64').reshape(1)
yield image, label yield image, label
...@@ -65,8 +68,9 @@ class TestImperativeOptimizerBase(unittest.TestCase): ...@@ -65,8 +68,9 @@ class TestImperativeOptimizerBase(unittest.TestCase):
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
mlp = MLP('mlp') mlp = MLP()
optimizer = self.get_optimizer() optimizer = self.get_optimizer_dygraph(
parameter_list=mlp.parameters())
batch_py_reader = fluid.io.PyReader(capacity=1) batch_py_reader = fluid.io.PyReader(capacity=1)
batch_py_reader.decorate_sample_list_generator( batch_py_reader.decorate_sample_list_generator(
...@@ -85,6 +89,7 @@ class TestImperativeOptimizerBase(unittest.TestCase): ...@@ -85,6 +89,7 @@ class TestImperativeOptimizerBase(unittest.TestCase):
label = data[1] label = data[1]
label.stop_gradient = True label.stop_gradient = True
img = fluid.layers.reshape(img, shape=[batch_size, -1])
cost = mlp(img) cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost) avg_loss = fluid.layers.reduce_mean(cost)
dy_out = avg_loss.numpy() dy_out = avg_loss.numpy()
...@@ -107,7 +112,7 @@ class TestImperativeOptimizerBase(unittest.TestCase): ...@@ -107,7 +112,7 @@ class TestImperativeOptimizerBase(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace( exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mlp = MLP('mlp') mlp = MLP()
optimizer = self.get_optimizer() optimizer = self.get_optimizer()
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True) paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
...@@ -115,6 +120,7 @@ class TestImperativeOptimizerBase(unittest.TestCase): ...@@ -115,6 +120,7 @@ class TestImperativeOptimizerBase(unittest.TestCase):
img = fluid.layers.data( img = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype='float32') name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
img = fluid.layers.reshape(img, shape=[batch_size, -1])
cost = mlp(img) cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost) avg_loss = fluid.layers.reduce_mean(cost)
optimizer.minimize(avg_loss) optimizer.minimize(avg_loss)
...@@ -162,6 +168,15 @@ class TestImperativeOptimizerBase(unittest.TestCase): ...@@ -162,6 +168,15 @@ class TestImperativeOptimizerBase(unittest.TestCase):
class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
bd = [3, 6, 9]
optimizer = SGDOptimizer(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd,
values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self): def get_optimizer(self):
bd = [3, 6, 9] bd = [3, 6, 9]
optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay( optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
...@@ -173,6 +188,16 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): ...@@ -173,6 +188,16 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.natural_exp_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay( optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
learning_rate=0.1, learning_rate=0.1,
...@@ -186,6 +211,16 @@ class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): ...@@ -186,6 +211,16 @@ class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay( optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
learning_rate=0.1, learning_rate=0.1,
...@@ -199,6 +234,16 @@ class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): ...@@ -199,6 +234,16 @@ class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = Adam(
learning_rate=fluid.layers.inverse_time_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay( optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay(
learning_rate=0.1, learning_rate=0.1,
...@@ -212,6 +257,13 @@ class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): ...@@ -212,6 +257,13 @@ class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.polynomial_decay(
learning_rate=0.1, decay_steps=5, cycle=self.cycle),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay( optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
learning_rate=0.1, decay_steps=5, cycle=self.cycle)) learning_rate=0.1, decay_steps=5, cycle=self.cycle))
...@@ -227,6 +279,13 @@ class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): ...@@ -227,6 +279,13 @@ class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.cosine_decay(
learning_rate=0.1, step_each_epoch=10000, epochs=120),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay( optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
learning_rate=0.1, step_each_epoch=10000, epochs=120)) learning_rate=0.1, step_each_epoch=10000, epochs=120))
...@@ -237,6 +296,13 @@ class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): ...@@ -237,6 +296,13 @@ class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase): class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.noam_decay(
d_model=512, warmup_steps=8000),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self): def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay( optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
d_model=512, warmup_steps=8000)) d_model=512, warmup_steps=8000))
......
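Each optimizer test above now builds its optimizer twice: get_optimizer_dygraph receives parameter_list, while get_optimizer keeps the old signature for the static-graph branch. A sketch of that split collapsed into one hypothetical helper:

    import paddle.fluid as fluid
    from paddle.fluid.optimizer import SGDOptimizer

    def make_sgd(parameter_list=None):  # hypothetical helper, not in the diff
        if fluid.in_dygraph_mode():
            # dygraph: the optimizer must be told which parameters to update
            return SGDOptimizer(learning_rate=1e-3, parameter_list=parameter_list)
        # static graph: parameters are still collected from the program
        return SGDOptimizer(learning_rate=1e-3)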
...@@ -38,7 +38,8 @@ class TestImperativePartitialBackward(unittest.TestCase): ...@@ -38,7 +38,8 @@ class TestImperativePartitialBackward(unittest.TestCase):
for param in fc2.parameters(): for param in fc2.parameters():
self.assertIsNone(param._grad_ivar()) self.assertIsNone(param._grad_ivar())
optimizer = fluid.optimizer.AdamOptimizer() optimizer = fluid.optimizer.AdamOptimizer(parameter_list=(
fc1.parameters() + fc2.parameters()))
_, params_grads = optimizer.minimize(loss) _, params_grads = optimizer.minimize(loss)
self.assertListEqual( self.assertListEqual(
......
...@@ -30,13 +30,12 @@ from utils import DyGraphProgramDescTracerTestHelper, is_equal_program ...@@ -30,13 +30,12 @@ from utils import DyGraphProgramDescTracerTestHelper, is_equal_program
class SimpleLSTMRNN(fluid.Layer): class SimpleLSTMRNN(fluid.Layer):
def __init__(self, def __init__(self,
name_scope,
hidden_size, hidden_size,
num_steps, num_steps,
num_layers=2, num_layers=2,
init_scale=0.1, init_scale=0.1,
dropout=None): dropout=None):
super(SimpleLSTMRNN, self).__init__(name_scope) super(SimpleLSTMRNN, self).__init__()
self._hidden_size = hidden_size self._hidden_size = hidden_size
self._num_layers = num_layers self._num_layers = num_layers
self._init_scale = init_scale self._init_scale = init_scale
...@@ -45,8 +44,9 @@ class SimpleLSTMRNN(fluid.Layer): ...@@ -45,8 +44,9 @@ class SimpleLSTMRNN(fluid.Layer):
self._num_steps = num_steps self._num_steps = num_steps
self.cell_array = [] self.cell_array = []
self.hidden_array = [] self.hidden_array = []
self._create_parameter()
def _build_once(self, input_embedding, init_hidden=None, init_cell=None): def _create_parameter(self):
self.weight_1_arr = [] self.weight_1_arr = []
self.weight_2_arr = [] self.weight_2_arr = []
self.bias_arr = [] self.bias_arr = []
...@@ -135,7 +135,6 @@ class SimpleLSTMRNN(fluid.Layer): ...@@ -135,7 +135,6 @@ class SimpleLSTMRNN(fluid.Layer):
class PtbModel(fluid.Layer): class PtbModel(fluid.Layer):
def __init__(self, def __init__(self,
name_scope,
hidden_size, hidden_size,
vocab_size, vocab_size,
num_layers=2, num_layers=2,
...@@ -143,7 +142,7 @@ class PtbModel(fluid.Layer): ...@@ -143,7 +142,7 @@ class PtbModel(fluid.Layer):
init_scale=0.1, init_scale=0.1,
is_sparse=False, is_sparse=False,
dropout=None): dropout=None):
super(PtbModel, self).__init__(name_scope) super(PtbModel, self).__init__()
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.init_scale = init_scale self.init_scale = init_scale
...@@ -151,7 +150,6 @@ class PtbModel(fluid.Layer): ...@@ -151,7 +150,6 @@ class PtbModel(fluid.Layer):
self.num_steps = num_steps self.num_steps = num_steps
self.dropout = dropout self.dropout = dropout
self.simple_lstm_rnn = SimpleLSTMRNN( self.simple_lstm_rnn = SimpleLSTMRNN(
self.full_name(),
hidden_size, hidden_size,
num_steps, num_steps,
num_layers=num_layers, num_layers=num_layers,
...@@ -231,7 +229,6 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -231,7 +229,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
# TODO: marsyang1993 Change seed to # TODO: marsyang1993 Change seed to
ptb_model = PtbModel( ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size, hidden_size=hidden_size,
vocab_size=vocab_size, vocab_size=vocab_size,
num_layers=num_layers, num_layers=num_layers,
...@@ -239,7 +236,8 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -239,7 +236,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
init_scale=init_scale, init_scale=init_scale,
is_sparse=is_sparse) is_sparse=is_sparse)
sgd = SGDOptimizer(learning_rate=1e-3) sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=ptb_model.parameters())
dy_param_updated = dict() dy_param_updated = dict()
dy_param_init = dict() dy_param_init = dict()
dy_loss = None dy_loss = None
...@@ -298,7 +296,6 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -298,7 +296,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
ptb_model = PtbModel( ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size, hidden_size=hidden_size,
vocab_size=vocab_size, vocab_size=vocab_size,
num_layers=num_layers, num_layers=num_layers,
......
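With _build_once removed, SimpleLSTMRNN creates its weights eagerly in __init__ through Layer.create_parameter, so they exist before the first forward pass. A minimal sketch of the pattern (hypothetical Cell; keyword arguments used to stay agnostic to the refined create_parameter signature):

    import paddle.fluid as fluid

    class Cell(fluid.dygraph.Layer):  # hypothetical, mirrors the pattern above
        def __init__(self, hidden_size):
            super(Cell, self).__init__()
            # parameters are created here, not lazily in _build_once
            self.weight = self.create_parameter(
                shape=[hidden_size, hidden_size], dtype='float32')
            self.bias = self.create_parameter(
                shape=[hidden_size], dtype='float32', is_bias=True)

    with fluid.dygraph.guard():
        cell = Cell(8)
        print(len(cell.parameters()))  # 2: both exist at construction time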
...@@ -49,7 +49,6 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): ...@@ -49,7 +49,6 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
backward_strategy.sort_sum_gradient = True backward_strategy.sort_sum_gradient = True
# TODO: marsyang1993 Change seed to # TODO: marsyang1993 Change seed to
ptb_model = PtbModel( ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size, hidden_size=hidden_size,
vocab_size=vocab_size, vocab_size=vocab_size,
num_layers=num_layers, num_layers=num_layers,
...@@ -57,7 +56,8 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): ...@@ -57,7 +56,8 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
init_scale=init_scale, init_scale=init_scale,
is_sparse=is_sparse) is_sparse=is_sparse)
sgd = SGDOptimizer(learning_rate=1e-3) sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=ptb_model.parameters())
dy_param_updated = dict() dy_param_updated = dict()
dy_param_init = dict() dy_param_init = dict()
dy_loss = None dy_loss = None
...@@ -97,7 +97,6 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): ...@@ -97,7 +97,6 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
ptb_model = PtbModel( ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size, hidden_size=hidden_size,
vocab_size=vocab_size, vocab_size=vocab_size,
num_layers=num_layers, num_layers=num_layers,
......
...@@ -86,7 +86,8 @@ class TestImperativeMnist(unittest.TestCase): ...@@ -86,7 +86,8 @@ class TestImperativeMnist(unittest.TestCase):
loss_probs = fluid.layers.elementwise_mul(dy_reward, loss_probs) loss_probs = fluid.layers.elementwise_mul(dy_reward, loss_probs)
loss = fluid.layers.reduce_sum(loss_probs) loss = fluid.layers.reduce_sum(loss_probs)
sgd = SGDOptimizer(learning_rate=1e-3) sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=policy.parameters())
dy_param_init_value = {} dy_param_init_value = {}
......
...@@ -21,7 +21,7 @@ import paddle ...@@ -21,7 +21,7 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid import Conv2D, Pool2D, BatchNorm, FC from paddle.fluid import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
from utils import DyGraphProgramDescTracerTestHelper, is_equal_program from utils import DyGraphProgramDescTracerTestHelper, is_equal_program
...@@ -44,7 +44,7 @@ train_parameters = { ...@@ -44,7 +44,7 @@ train_parameters = {
} }
def optimizer_setting(params): def optimizer_setting(params, parameter_list=None):
ls = params["learning_strategy"] ls = params["learning_strategy"]
if ls["name"] == "piecewise_decay": if ls["name"] == "piecewise_decay":
if "total_images" not in params: if "total_images" not in params:
...@@ -58,14 +58,18 @@ def optimizer_setting(params): ...@@ -58,14 +58,18 @@ def optimizer_setting(params):
base_lr = params["lr"] base_lr = params["lr"]
lr = [] lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.SGD(learning_rate=0.01) if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.SGD(learning_rate=0.01,
parameter_list=parameter_list)
else:
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
# TODO(minqiyang): Add learning rate scheduler support to dygraph mode # TODO(minqiyang): Add learning rate scheduler support to dygraph mode
# optimizer = fluid.optimizer.Momentum( # optimizer = fluid.optimizer.Momentum(
# learning_rate=params["lr"], # learning_rate=params["lr"],
# learning_rate=fluid.layers.piecewise_decay( # learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr), # boundaries=bd, values=lr),
# momentum=0.9, # momentum=0.9,
# regularization=fluid.regularizer.L2Decay(1e-4)) # regularization=fluid.regularizer.L2Decay(1e-4))
return optimizer return optimizer
...@@ -147,8 +151,8 @@ class BottleneckBlock(fluid.Layer): ...@@ -147,8 +151,8 @@ class BottleneckBlock(fluid.Layer):
class ResNet(fluid.Layer): class ResNet(fluid.Layer):
def __init__(self, name_scope, layers=50, class_dim=102): def __init__(self, layers=50, class_dim=102):
super(ResNet, self).__init__(name_scope) super(ResNet, self).__init__()
self.layers = layers self.layers = layers
supported_layers = [50, 101, 152] supported_layers = [50, 101, 152]
...@@ -187,14 +191,17 @@ class ResNet(fluid.Layer): ...@@ -187,14 +191,17 @@ class ResNet(fluid.Layer):
self.pool2d_avg = Pool2D( self.pool2d_avg = Pool2D(
pool_size=7, pool_type='avg', global_pooling=True) pool_size=7, pool_type='avg', global_pooling=True)
self.pool2d_avg_output = num_filters[-1] * 4 * 1 * 1
import math import math
stdv = 1.0 / math.sqrt(2048 * 1.0) stdv = 1.0 / math.sqrt(2048 * 1.0)
self.out = FC(self.full_name(), self.out = Linear(
size=class_dim, self.pool2d_avg_output,
act='softmax', class_dim,
param_attr=fluid.param_attr.ParamAttr( act='softmax',
initializer=fluid.initializer.Uniform(-stdv, stdv))) param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
def forward(self, inputs): def forward(self, inputs):
y = self.conv(inputs) y = self.conv(inputs)
...@@ -202,6 +209,7 @@ class ResNet(fluid.Layer): ...@@ -202,6 +209,7 @@ class ResNet(fluid.Layer):
for bottleneck_block in self.bottleneck_block_list: for bottleneck_block in self.bottleneck_block_list:
y = bottleneck_block(y) y = bottleneck_block(y)
y = self.pool2d_avg(y) y = self.pool2d_avg(y)
y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output])
y = self.out(y) y = self.out(y)
return y return y
...@@ -228,8 +236,9 @@ class TestDygraphResnet(unittest.TestCase): ...@@ -228,8 +236,9 @@ class TestDygraphResnet(unittest.TestCase):
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
resnet = ResNet("resnet") resnet = ResNet()
optimizer = optimizer_setting(train_parameters) optimizer = optimizer_setting(
train_parameters, parameter_list=resnet.parameters())
np.random.seed(seed) np.random.seed(seed)
import random import random
random.seed = seed random.seed = seed
...@@ -315,7 +324,7 @@ class TestDygraphResnet(unittest.TestCase): ...@@ -315,7 +324,7 @@ class TestDygraphResnet(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace( exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
resnet = ResNet("resnet") resnet = ResNet()
optimizer = optimizer_setting(train_parameters) optimizer = optimizer_setting(train_parameters)
np.random.seed(seed) np.random.seed(seed)
......
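The ResNet head shows that ParamAttr and initializers attach to Linear exactly as they did to FC; the only new piece is the explicit input size. A sketch of the classifier construction alone, with the sizes used above:

    import math
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        stdv = 1.0 / math.sqrt(2048 * 1.0)
        head = fluid.dygraph.Linear(
            2048, 102, act='softmax',               # 2048 = num_filters[-1] * 4
            param_attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.Uniform(-stdv, stdv)))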
...@@ -40,7 +40,7 @@ train_parameters = { ...@@ -40,7 +40,7 @@ train_parameters = {
} }
def optimizer_setting(params): def optimizer_setting(params, parameter_list=None):
ls = params["learning_strategy"] ls = params["learning_strategy"]
if ls["name"] == "piecewise_decay": if ls["name"] == "piecewise_decay":
if "total_images" not in params: if "total_images" not in params:
...@@ -54,14 +54,18 @@ def optimizer_setting(params): ...@@ -54,14 +54,18 @@ def optimizer_setting(params):
base_lr = params["lr"] base_lr = params["lr"]
lr = [] lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.SGD(learning_rate=0.01) if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.SGD(learning_rate=0.01,
parameter_list=parameter_list)
else:
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
# TODO(minqiyang): Add learning rate scheduler support to dygraph mode # TODO(minqiyang): Add learning rate scheduler support to dygraph mode
# optimizer = fluid.optimizer.Momentum( # optimizer = fluid.optimizer.Momentum(
# learning_rate=params["lr"], # learning_rate=params["lr"],
# learning_rate=fluid.layers.piecewise_decay( # learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr), # boundaries=bd, values=lr),
# momentum=0.9, # momentum=0.9,
# regularization=fluid.regularizer.L2Decay(1e-4)) # regularization=fluid.regularizer.L2Decay(1e-4))
return optimizer return optimizer
...@@ -77,8 +81,9 @@ class TestDygraphResnetSortGradient(unittest.TestCase): ...@@ -77,8 +81,9 @@ class TestDygraphResnetSortGradient(unittest.TestCase):
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True backward_strategy.sort_sum_gradient = True
resnet = ResNet("resnet") resnet = ResNet()
optimizer = optimizer_setting(train_parameters) optimizer = optimizer_setting(
train_parameters, parameter_list=resnet.parameters())
np.random.seed(seed) np.random.seed(seed)
import random import random
random.seed = seed random.seed = seed
...@@ -138,7 +143,7 @@ class TestDygraphResnetSortGradient(unittest.TestCase): ...@@ -138,7 +143,7 @@ class TestDygraphResnetSortGradient(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace( exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
resnet = ResNet("resnet") resnet = ResNet()
optimizer = optimizer_setting(train_parameters) optimizer = optimizer_setting(train_parameters)
np.random.seed(seed) np.random.seed(seed)
......
...@@ -233,8 +233,10 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -233,8 +233,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda( place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0) ) else fluid.CUDAPlace(0)
adam = Adam(learning_rate=fluid.layers.piecewise_decay( adam = Adam(
boundaries=bd, values=lr_arr)) learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
parameter_list=ptb_model.parameters())
dy_param_updated = dict() dy_param_updated = dict()
dy_param_init = dict() dy_param_init = dict()
dy_loss = None dy_loss = None
...@@ -314,8 +316,10 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -314,8 +316,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda( place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0) ) else fluid.CUDAPlace(0)
adam = Adam(learning_rate=fluid.layers.piecewise_decay( adam = Adam(
boundaries=bd, values=lr_arr)) learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
parameter_list=ptb_model.parameters())
dy_param_updated = dict() dy_param_updated = dict()
dy_param_init = dict() dy_param_init = dict()
dy_loss = None dy_loss = None
...@@ -418,8 +422,10 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -418,8 +422,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda( place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0) ) else fluid.CUDAPlace(0)
adam = Adam(learning_rate=fluid.layers.piecewise_decay( adam = Adam(
boundaries=bd, values=lr_arr)) learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
parameter_list=ptb_model.parameters())
dy_param_updated = dict() dy_param_updated = dict()
dy_param_init = dict() dy_param_init = dict()
dy_loss = None dy_loss = None
...@@ -521,8 +527,10 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -521,8 +527,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda( place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0) ) else fluid.CUDAPlace(0)
adam = Adam(learning_rate=fluid.layers.piecewise_decay( adam = Adam(
boundaries=bd, values=lr_arr)) learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
parameter_list=ptb_model.parameters())
dy_param_updated = dict() dy_param_updated = dict()
dy_param_init = dict() dy_param_init = dict()
dy_loss = None dy_loss = None
...@@ -633,7 +641,8 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -633,7 +641,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
learning_rate=fluid.layers.piecewise_decay( learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr), boundaries=bd, values=lr_arr),
beta1=0.8, beta1=0.8,
beta2=0.6) beta2=0.6,
parameter_list=ptb_model.parameters())
dy_param_updated = dict() dy_param_updated = dict()
dy_param_init = dict() dy_param_init = dict()
dy_loss = None dy_loss = None
...@@ -724,7 +733,8 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -724,7 +733,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
learning_rate=fluid.layers.piecewise_decay( learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr), boundaries=bd, values=lr_arr),
beta1=0.8, beta1=0.8,
beta2=0.6) beta2=0.6,
parameter_list=ptb_model.parameters())
dy_param_updated = dict() dy_param_updated = dict()
dy_param_init = dict() dy_param_init = dict()
dy_loss = None dy_loss = None
...@@ -816,7 +826,8 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -816,7 +826,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
learning_rate=fluid.layers.piecewise_decay( learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr), boundaries=bd, values=lr_arr),
beta1=0.8, beta1=0.8,
beta2=0.6) beta2=0.6,
parameter_list=ptb_model.parameters())
dy_param_updated = dict() dy_param_updated = dict()
dy_param_init = dict() dy_param_init = dict()
dy_loss = None dy_loss = None
......
...@@ -21,7 +21,7 @@ import paddle ...@@ -21,7 +21,7 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
...@@ -42,7 +42,7 @@ train_parameters = { ...@@ -42,7 +42,7 @@ train_parameters = {
} }
def optimizer_setting(params): def optimizer_setting(params, parameter_list=None):
ls = params["learning_strategy"] ls = params["learning_strategy"]
if ls["name"] == "piecewise_decay": if ls["name"] == "piecewise_decay":
if "total_images" not in params: if "total_images" not in params:
...@@ -56,7 +56,11 @@ def optimizer_setting(params): ...@@ -56,7 +56,11 @@ def optimizer_setting(params):
#bd = [step * e for e in ls["epochs"]] #bd = [step * e for e in ls["epochs"]]
#base_lr = params["lr"] #base_lr = params["lr"]
#lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] #lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.SGD(learning_rate=0.01) if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.SGD(learning_rate=0.01,
parameter_list=parameter_list)
else:
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
return optimizer return optimizer
...@@ -91,25 +95,27 @@ class ConvBNLayer(fluid.dygraph.Layer): ...@@ -91,25 +95,27 @@ class ConvBNLayer(fluid.dygraph.Layer):
class SqueezeExcitation(fluid.dygraph.Layer): class SqueezeExcitation(fluid.dygraph.Layer):
def __init__(self, name_scope, num_channels, reduction_ratio): def __init__(self, num_channels, reduction_ratio):
super(SqueezeExcitation, self).__init__(name_scope) super(SqueezeExcitation, self).__init__()
self._num_channels = num_channels
self._pool = Pool2D(pool_size=0, pool_type='avg', global_pooling=True) self._pool = Pool2D(pool_size=0, pool_type='avg', global_pooling=True)
self._squeeze = FC( self._squeeze = Linear(
self.full_name(), num_channels,
size=num_channels // reduction_ratio, num_channels // reduction_ratio,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.05)), initializer=fluid.initializer.Constant(value=0.05)),
act='relu') act='relu')
self._excitation = FC( self._excitation = Linear(
self.full_name(), num_channels // reduction_ratio,
size=num_channels, num_channels,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.05)), initializer=fluid.initializer.Constant(value=0.05)),
act='sigmoid') act='sigmoid')
def forward(self, input): def forward(self, input):
y = self._pool(input) y = self._pool(input)
y = fluid.layers.reshape(y, shape=[-1, self._num_channels])
y = self._squeeze(y) y = self._squeeze(y)
y = self._excitation(y) y = self._excitation(y)
y = fluid.layers.elementwise_mul(x=input, y=y, axis=0) y = fluid.layers.elementwise_mul(x=input, y=y, axis=0)
...@@ -141,9 +147,7 @@ class BottleneckBlock(fluid.dygraph.Layer): ...@@ -141,9 +147,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
act='relu') act='relu')
self.scale = SqueezeExcitation( self.scale = SqueezeExcitation(
self.full_name(), num_channels=num_filters * 4, reduction_ratio=reduction_ratio)
num_channels=num_filters * 4,
reduction_ratio=reduction_ratio)
if not shortcut: if not shortcut:
self.short = ConvBNLayer( self.short = ConvBNLayer(
...@@ -175,8 +179,8 @@ class BottleneckBlock(fluid.dygraph.Layer): ...@@ -175,8 +179,8 @@ class BottleneckBlock(fluid.dygraph.Layer):
class SeResNeXt(fluid.dygraph.Layer): class SeResNeXt(fluid.dygraph.Layer):
def __init__(self, name_scope, layers=50, class_dim=102): def __init__(self, layers=50, class_dim=102):
super(SeResNeXt, self).__init__(name_scope) super(SeResNeXt, self).__init__()
self.layers = layers self.layers = layers
supported_layers = [50, 101, 152] supported_layers = [50, 101, 152]
...@@ -203,7 +207,7 @@ class SeResNeXt(fluid.dygraph.Layer): ...@@ -203,7 +207,7 @@ class SeResNeXt(fluid.dygraph.Layer):
num_filters = [128, 256, 512, 1024] num_filters = [128, 256, 512, 1024]
self.conv0 = ConvBNLayer( self.conv0 = ConvBNLayer(
num_channels=3, num_channels=3,
num_filters=3, num_filters=64,
filter_size=7, filter_size=7,
stride=2, stride=2,
act='relu') act='relu')
...@@ -216,27 +220,29 @@ class SeResNeXt(fluid.dygraph.Layer): ...@@ -216,27 +220,29 @@ class SeResNeXt(fluid.dygraph.Layer):
num_filters = [128, 256, 512, 1024] num_filters = [128, 256, 512, 1024]
self.conv0 = ConvBNLayer( self.conv0 = ConvBNLayer(
num_channels=3, num_channels=3,
num_filters=3, num_filters=64,
filter_size=7, filter_size=3,
stride=2, stride=2,
act='relu') act='relu')
self.conv1 = ConvBNLayer( self.conv1 = ConvBNLayer(
num_channels=3, num_channels=64,
num_filters=3, num_filters=64,
filter_size=7, filter_size=3,
stride=2, stride=2,
act='relu') act='relu')
self.conv2 = ConvBNLayer( self.conv2 = ConvBNLayer(
num_channels=7, num_channels=64,
num_filters=3, num_filters=128,
filter_size=7, filter_size=3,
stride=2, stride=1,
act='relu') act='relu')
self.pool = Pool2D( self.pool = Pool2D(
pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
self.bottleneck_block_list = [] self.bottleneck_block_list = []
num_channels = 64 num_channels = 64
if layers == 152:
num_channels = 128
for block in range(len(depth)): for block in range(len(depth)):
shortcut = False shortcut = False
for i in range(depth[block]): for i in range(depth[block]):
...@@ -258,11 +264,14 @@ class SeResNeXt(fluid.dygraph.Layer): ...@@ -258,11 +264,14 @@ class SeResNeXt(fluid.dygraph.Layer):
import math import math
stdv = 1.0 / math.sqrt(2048 * 1.0) stdv = 1.0 / math.sqrt(2048 * 1.0)
self.out = FC(self.full_name(), self.pool2d_avg_output = num_filters[-1] * 4 * 1 * 1
size=class_dim,
act='softmax', self.out = Linear(
param_attr=fluid.param_attr.ParamAttr( self.pool2d_avg_output,
initializer=fluid.initializer.Uniform(-stdv, stdv))) class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
def forward(self, inputs): def forward(self, inputs):
if self.layers == 50 or self.layers == 101: if self.layers == 50 or self.layers == 101:
...@@ -270,14 +279,15 @@ class SeResNeXt(fluid.dygraph.Layer): ...@@ -270,14 +279,15 @@ class SeResNeXt(fluid.dygraph.Layer):
y = self.pool(y) y = self.pool(y)
elif self.layers == 152: elif self.layers == 152:
y = self.conv0(inputs) y = self.conv0(inputs)
y = self.conv1(inputs) y = self.conv1(y)
y = self.conv2(inputs) y = self.conv2(y)
y = self.pool(y) y = self.pool(y)
for bottleneck_block in self.bottleneck_block_list: for bottleneck_block in self.bottleneck_block_list:
y = bottleneck_block(y) y = bottleneck_block(y)
y = self.pool2d_avg(y) y = self.pool2d_avg(y)
y = fluid.layers.dropout(y, dropout_prob=0.2) y = fluid.layers.dropout(y, dropout_prob=0.2)
y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output])
y = self.out(y) y = self.out(y)
return y return y
...@@ -302,8 +312,9 @@ class TestImperativeResneXt(unittest.TestCase): ...@@ -302,8 +312,9 @@ class TestImperativeResneXt(unittest.TestCase):
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
se_resnext = SeResNeXt("se_resnext") se_resnext = SeResNeXt()
optimizer = optimizer_setting(train_parameters) optimizer = optimizer_setting(
train_parameters, parameter_list=se_resnext.parameters())
np.random.seed(seed) np.random.seed(seed)
import random import random
random.seed = seed random.seed = seed
...@@ -364,7 +375,7 @@ class TestImperativeResneXt(unittest.TestCase): ...@@ -364,7 +375,7 @@ class TestImperativeResneXt(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace( exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
se_resnext = SeResNeXt("se_resnext") se_resnext = SeResNeXt()
optimizer = optimizer_setting(train_parameters) optimizer = optimizer_setting(train_parameters)
np.random.seed(seed) np.random.seed(seed)
......
...@@ -49,23 +49,27 @@ class TestSimpleNet(unittest.TestCase): ...@@ -49,23 +49,27 @@ class TestSimpleNet(unittest.TestCase):
with fluid.dygraph.guard(place): with fluid.dygraph.guard(place):
backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient backward_strategy.sort_sum_gradient = sort_sum_gradient
adam = SGDOptimizer(learning_rate=0.001)
# grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0) # grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64') input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = to_variable(input_word) input = to_variable(input_word)
simplenet = SimpleNet(20, 32, dtype) simplenet = SimpleNet(20, 32, dtype)
adam = SGDOptimizer(
learning_rate=0.001,
parameter_list=simplenet.parameters())
input_emb, emb = simplenet(input) input_emb, emb = simplenet(input)
try: try:
emb._w.gradient() emb._w.gradient()
except ValueError as e: except ValueError as e:
pass assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
e)
try: try:
input_emb.gradient() input_emb.gradient()
except ValueError as e: except ValueError as e:
pass assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
e)
input_emb.backward(backward_strategy) input_emb.backward(backward_strategy)
adam.minimize(input_emb) # grad_clip=grad_clip adam.minimize(input_emb) # grad_clip=grad_clip
...@@ -75,13 +79,11 @@ class TestSimpleNet(unittest.TestCase): ...@@ -75,13 +79,11 @@ class TestSimpleNet(unittest.TestCase):
try: try:
emb._w.gradient() emb._w.gradient()
except ValueError as e: except ValueError as e:
pass assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
e)
input_emb.clear_gradient() input_emb.clear_gradient()
try: input_emb.gradient()
input_emb.gradient()
except ValueError as e:
pass
def test_selectedrows_gradient2(self): def test_selectedrows_gradient2(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
...@@ -93,7 +95,6 @@ class TestSimpleNet(unittest.TestCase): ...@@ -93,7 +95,6 @@ class TestSimpleNet(unittest.TestCase):
with fluid.dygraph.guard(place): with fluid.dygraph.guard(place):
backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient backward_strategy.sort_sum_gradient = sort_sum_gradient
adam = SGDOptimizer(learning_rate=0.001)
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm( grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
5.0) 5.0)
...@@ -101,16 +102,21 @@ class TestSimpleNet(unittest.TestCase): ...@@ -101,16 +102,21 @@ class TestSimpleNet(unittest.TestCase):
input = to_variable(input_word) input = to_variable(input_word)
simplenet = SimpleNet(20, 32, "float32") simplenet = SimpleNet(20, 32, "float32")
adam = SGDOptimizer(
learning_rate=0.001,
parameter_list=simplenet.parameters())
input_emb, emb = simplenet(input) input_emb, emb = simplenet(input)
try: try:
emb._w.gradient() emb._w.gradient()
except ValueError as e: except ValueError as e:
pass assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
e)
try: try:
input_emb.gradient() input_emb.gradient()
except ValueError as e: except ValueError as e:
pass assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
e)
input_emb.backward(backward_strategy) input_emb.backward(backward_strategy)
adam.minimize(input_emb, grad_clip=grad_clip) adam.minimize(input_emb, grad_clip=grad_clip)
...@@ -120,13 +126,11 @@ class TestSimpleNet(unittest.TestCase): ...@@ -120,13 +126,11 @@ class TestSimpleNet(unittest.TestCase):
try: try:
emb._w.gradient() emb._w.gradient()
except ValueError as e: except ValueError as e:
pass assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
e)
input_emb.clear_gradient() input_emb.clear_gradient()
try: input_emb.gradient()
input_emb.gradient()
except ValueError as e:
pass
if __name__ == '__main__': if __name__ == '__main__':
......
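The `except ValueError: pass` blocks above silently swallowed any failure; the test now asserts on the actual error text, so an unrelated exception no longer passes. A sketch of the check, assuming the 1.6-era dygraph API (sizes are illustrative):

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, to_variable

with fluid.dygraph.guard():
    emb = Embedding(size=[10, 4])
    out = emb(to_variable(np.array([[1, 2]], dtype='int64')))
    try:
        out.gradient()  # backward() has not run, so no grad exists yet
        raise AssertionError("expected ValueError")
    except ValueError as e:
        assert "has no grad" in str(e)
```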
...@@ -114,7 +114,9 @@ class TestDygraphSimpleNet(unittest.TestCase): ...@@ -114,7 +114,9 @@ class TestDygraphSimpleNet(unittest.TestCase):
is_sparse=is_sparse, is_sparse=is_sparse,
dtype=dtype) dtype=dtype)
sgd = SGDOptimizer(learning_rate=1e-3) sgd = SGDOptimizer(
learning_rate=1e-3,
parameter_list=simple_net.parameters())
dy_param_updated = dict() dy_param_updated = dict()
dy_param_init = dict() dy_param_init = dict()
dy_loss = None dy_loss = None
......
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import Embedding, LayerNorm, FC, Layer from paddle.fluid import Embedding, LayerNorm, Linear, Layer
from paddle.fluid.dygraph import to_variable, guard from paddle.fluid.dygraph import to_variable, guard
from paddle.fluid.dygraph.jit import TracedLayer from paddle.fluid.dygraph.jit import TracedLayer
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
...@@ -378,15 +378,10 @@ class PrePostProcessLayer(Layer): ...@@ -378,15 +378,10 @@ class PrePostProcessLayer(Layer):
class PositionwiseFeedForwardLayer(Layer): class PositionwiseFeedForwardLayer(Layer):
def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate): def __init__(self, d_inner_hid, d_hid, dropout_rate):
super(PositionwiseFeedForwardLayer, self).__init__(name_scope) super(PositionwiseFeedForwardLayer, self).__init__()
self._i2h = FC(name_scope=self.full_name(), self._i2h = Linear(d_hid, d_inner_hid, act="relu")
size=d_inner_hid, self._h2o = Linear(d_inner_hid, d_hid)
num_flatten_dims=2,
act="relu")
self._h2o = FC(name_scope=self.full_name(),
size=d_hid,
num_flatten_dims=2)
self._dropout_rate = dropout_rate self._dropout_rate = dropout_rate
def forward(self, x): def forward(self, x):
...@@ -403,7 +398,6 @@ class PositionwiseFeedForwardLayer(Layer): ...@@ -403,7 +398,6 @@ class PositionwiseFeedForwardLayer(Layer):
class MultiHeadAttentionLayer(Layer): class MultiHeadAttentionLayer(Layer):
def __init__(self, def __init__(self,
name_scope,
d_key, d_key,
d_value, d_value,
d_model, d_model,
...@@ -412,28 +406,16 @@ class MultiHeadAttentionLayer(Layer): ...@@ -412,28 +406,16 @@ class MultiHeadAttentionLayer(Layer):
cache=None, cache=None,
gather_idx=None, gather_idx=None,
static_kv=False): static_kv=False):
super(MultiHeadAttentionLayer, self).__init__(name_scope) super(MultiHeadAttentionLayer, self).__init__()
self._n_head = n_head self._n_head = n_head
self._d_key = d_key self._d_key = d_key
self._d_value = d_value self._d_value = d_value
self._d_model = d_model self._d_model = d_model
self._dropout_rate = dropout_rate self._dropout_rate = dropout_rate
self._q_fc = FC(name_scope=self.full_name(), self._q_fc = Linear(self._d_model, d_key * n_head, bias_attr=False)
size=d_key * n_head, self._k_fc = Linear(self._d_model, d_key * n_head, bias_attr=False)
bias_attr=False, self._v_fc = Linear(self._d_model, d_value * n_head, bias_attr=False)
num_flatten_dims=2) self._proj_fc = Linear(d_value * n_head, self._d_model, bias_attr=False)
self._k_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._v_fc = FC(name_scope=self.full_name(),
size=d_value * n_head,
bias_attr=False,
num_flatten_dims=2)
self._proj_fc = FC(name_scope=self.full_name(),
size=self._d_model,
bias_attr=False,
num_flatten_dims=2)
def forward(self, queries, keys, values, attn_bias): def forward(self, queries, keys, values, attn_bias):
# compute q ,k ,v # compute q ,k ,v
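The `num_flatten_dims=2` arguments disappear along with `FC` because `Linear` multiplies along the last axis and keeps all leading dimensions, which is exactly what the `[batch, seq_len, d_model]` projections here need. A minimal sketch, assuming the 1.6-era dygraph API (sizes are illustrative):

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, to_variable

with fluid.dygraph.guard():
    d_model, d_key, n_head = 16, 4, 2
    q_fc = Linear(d_model, d_key * n_head, bias_attr=False)
    queries = to_variable(np.random.rand(2, 5, d_model).astype('float32'))
    q = q_fc(queries)  # acts on the last axis: shape [2, 5, d_key * n_head]
```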
...@@ -490,7 +472,6 @@ class MultiHeadAttentionLayer(Layer): ...@@ -490,7 +472,6 @@ class MultiHeadAttentionLayer(Layer):
class EncoderSubLayer(Layer): class EncoderSubLayer(Layer):
def __init__(self, def __init__(self,
name_scope,
n_head, n_head,
d_key, d_key,
d_value, d_value,
...@@ -502,7 +483,7 @@ class EncoderSubLayer(Layer): ...@@ -502,7 +483,7 @@ class EncoderSubLayer(Layer):
preprocess_cmd="n", preprocess_cmd="n",
postprocess_cmd="da"): postprocess_cmd="da"):
super(EncoderSubLayer, self).__init__(name_scope) super(EncoderSubLayer, self).__init__()
self._preprocess_cmd = preprocess_cmd self._preprocess_cmd = preprocess_cmd
self._postprocess_cmd = postprocess_cmd self._postprocess_cmd = postprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout self._prepostprocess_dropout = prepostprocess_dropout
...@@ -510,14 +491,13 @@ class EncoderSubLayer(Layer): ...@@ -510,14 +491,13 @@ class EncoderSubLayer(Layer):
self._preprocess_layer = PrePostProcessLayer(d_model, self._preprocess_layer = PrePostProcessLayer(d_model,
self._preprocess_cmd, 3) self._preprocess_cmd, 3)
self._multihead_attention_layer = MultiHeadAttentionLayer( self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(), d_key, d_value, d_model, n_head, d_key, d_value, d_model, n_head, attention_dropout)
attention_dropout)
self._postprocess_layer = PrePostProcessLayer( self._postprocess_layer = PrePostProcessLayer(
d_model, self._postprocess_cmd, None) d_model, self._postprocess_cmd, None)
self._preprocess_layer2 = PrePostProcessLayer(d_model, self._preprocess_layer2 = PrePostProcessLayer(d_model,
self._preprocess_cmd, 3) self._preprocess_cmd, 3)
self._positionwise_feed_forward = PositionwiseFeedForwardLayer( self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout) d_inner_hid, d_model, relu_dropout)
self._postprocess_layer2 = PrePostProcessLayer( self._postprocess_layer2 = PrePostProcessLayer(
d_model, self._postprocess_cmd, None) d_model, self._postprocess_cmd, None)
...@@ -540,7 +520,6 @@ class EncoderSubLayer(Layer): ...@@ -540,7 +520,6 @@ class EncoderSubLayer(Layer):
class EncoderLayer(Layer): class EncoderLayer(Layer):
def __init__(self, def __init__(self,
name_scope,
n_layer, n_layer,
n_head, n_head,
d_key, d_key,
...@@ -553,7 +532,7 @@ class EncoderLayer(Layer): ...@@ -553,7 +532,7 @@ class EncoderLayer(Layer):
preprocess_cmd="n", preprocess_cmd="n",
postprocess_cmd="da"): postprocess_cmd="da"):
super(EncoderLayer, self).__init__(name_scope) super(EncoderLayer, self).__init__()
self._preprocess_cmd = preprocess_cmd self._preprocess_cmd = preprocess_cmd
self._encoder_sublayers = list() self._encoder_sublayers = list()
self._prepostprocess_dropout = prepostprocess_dropout self._prepostprocess_dropout = prepostprocess_dropout
...@@ -564,10 +543,10 @@ class EncoderLayer(Layer): ...@@ -564,10 +543,10 @@ class EncoderLayer(Layer):
self._encoder_sublayers.append( self._encoder_sublayers.append(
self.add_sublayer( self.add_sublayer(
'esl_%d' % i, 'esl_%d' % i,
EncoderSubLayer( EncoderSubLayer(n_head, d_key, d_value, d_model,
self.full_name(), n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
d_inner_hid, prepostprocess_dropout, attention_dropout, attention_dropout, relu_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd))) preprocess_cmd, postprocess_cmd)))
def forward(self, enc_input, attn_bias): def forward(self, enc_input, attn_bias):
for i in range(self._n_layer): for i in range(self._n_layer):
...@@ -580,7 +559,6 @@ class EncoderLayer(Layer): ...@@ -580,7 +559,6 @@ class EncoderLayer(Layer):
class PrepareEncoderDecoderLayer(Layer): class PrepareEncoderDecoderLayer(Layer):
def __init__(self, def __init__(self,
name_scope,
src_vocab_size, src_vocab_size,
src_emb_dim, src_emb_dim,
src_max_len, src_max_len,
...@@ -588,7 +566,7 @@ class PrepareEncoderDecoderLayer(Layer): ...@@ -588,7 +566,7 @@ class PrepareEncoderDecoderLayer(Layer):
is_sparse=False, is_sparse=False,
word_emb_param_name=None, word_emb_param_name=None,
pos_enc_param_name=None): pos_enc_param_name=None):
super(PrepareEncoderDecoderLayer, self).__init__(name_scope) super(PrepareEncoderDecoderLayer, self).__init__()
self._src_max_len = src_max_len self._src_max_len = src_max_len
self._src_emb_dim = src_emb_dim self._src_emb_dim = src_emb_dim
self._src_vocab_size = src_vocab_size self._src_vocab_size = src_vocab_size
...@@ -634,7 +612,6 @@ class PrepareEncoderDecoderLayer(Layer): ...@@ -634,7 +612,6 @@ class PrepareEncoderDecoderLayer(Layer):
class WrapEncoderLayer(Layer): class WrapEncoderLayer(Layer):
def __init__(self, def __init__(self,
name_cope,
src_vocab_size, src_vocab_size,
max_length, max_length,
n_layer, n_layer,
...@@ -653,10 +630,9 @@ class WrapEncoderLayer(Layer): ...@@ -653,10 +630,9 @@ class WrapEncoderLayer(Layer):
""" """
The wrapper assembles together all needed layers for the encoder. The wrapper assembles together all needed layers for the encoder.
""" """
super(WrapEncoderLayer, self).__init__(name_cope) super(WrapEncoderLayer, self).__init__()
self._prepare_encoder_layer = PrepareEncoderDecoderLayer( self._prepare_encoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
src_vocab_size, src_vocab_size,
d_model, d_model,
max_length, max_length,
...@@ -664,10 +640,10 @@ class WrapEncoderLayer(Layer): ...@@ -664,10 +640,10 @@ class WrapEncoderLayer(Layer):
is_sparse=is_sparse, is_sparse=is_sparse,
word_emb_param_name=word_emb_param_names[0], word_emb_param_name=word_emb_param_names[0],
pos_enc_param_name=pos_enc_param_names[0]) pos_enc_param_name=pos_enc_param_names[0])
self._encoder = EncoderLayer( self._encoder = EncoderLayer(n_layer, n_head, d_key, d_value, d_model,
self.full_name(), n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
d_inner_hid, prepostprocess_dropout, attention_dropout, attention_dropout, relu_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd) preprocess_cmd, postprocess_cmd)
def forward(self, enc_inputs): def forward(self, enc_inputs):
src_word, src_pos, src_slf_attn_bias = enc_inputs src_word, src_pos, src_slf_attn_bias = enc_inputs
...@@ -678,7 +654,6 @@ class WrapEncoderLayer(Layer): ...@@ -678,7 +654,6 @@ class WrapEncoderLayer(Layer):
class DecoderSubLayer(Layer): class DecoderSubLayer(Layer):
def __init__(self, def __init__(self,
name_scope,
n_head, n_head,
d_key, d_key,
d_value, d_value,
...@@ -691,14 +666,13 @@ class DecoderSubLayer(Layer): ...@@ -691,14 +666,13 @@ class DecoderSubLayer(Layer):
postprocess_cmd, postprocess_cmd,
cache=None, cache=None,
gather_idx=None): gather_idx=None):
super(DecoderSubLayer, self).__init__(name_scope) super(DecoderSubLayer, self).__init__()
self._postprocess_cmd = postprocess_cmd self._postprocess_cmd = postprocess_cmd
self._preprocess_cmd = preprocess_cmd self._preprocess_cmd = preprocess_cmd
self._prepostprcess_dropout = prepostprocess_dropout self._prepostprcess_dropout = prepostprocess_dropout
self._pre_process_layer = PrePostProcessLayer(d_model, preprocess_cmd, self._pre_process_layer = PrePostProcessLayer(d_model, preprocess_cmd,
3) 3)
self._multihead_attention_layer = MultiHeadAttentionLayer( self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(),
d_key, d_key,
d_value, d_value,
d_model, d_model,
...@@ -711,7 +685,6 @@ class DecoderSubLayer(Layer): ...@@ -711,7 +685,6 @@ class DecoderSubLayer(Layer):
self._pre_process_layer2 = PrePostProcessLayer(d_model, preprocess_cmd, self._pre_process_layer2 = PrePostProcessLayer(d_model, preprocess_cmd,
3) 3)
self._multihead_attention_layer2 = MultiHeadAttentionLayer( self._multihead_attention_layer2 = MultiHeadAttentionLayer(
self.full_name(),
d_key, d_key,
d_value, d_value,
d_model, d_model,
...@@ -725,7 +698,7 @@ class DecoderSubLayer(Layer): ...@@ -725,7 +698,7 @@ class DecoderSubLayer(Layer):
self._pre_process_layer3 = PrePostProcessLayer(d_model, preprocess_cmd, self._pre_process_layer3 = PrePostProcessLayer(d_model, preprocess_cmd,
3) 3)
self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer( self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout) d_inner_hid, d_model, relu_dropout)
self._post_process_layer3 = PrePostProcessLayer(d_model, self._post_process_layer3 = PrePostProcessLayer(d_model,
postprocess_cmd, None) postprocess_cmd, None)
...@@ -757,7 +730,6 @@ class DecoderSubLayer(Layer): ...@@ -757,7 +730,6 @@ class DecoderSubLayer(Layer):
class DecoderLayer(Layer): class DecoderLayer(Layer):
def __init__(self, def __init__(self,
name_scope,
n_layer, n_layer,
n_head, n_head,
d_key, d_key,
...@@ -771,7 +743,7 @@ class DecoderLayer(Layer): ...@@ -771,7 +743,7 @@ class DecoderLayer(Layer):
postprocess_cmd, postprocess_cmd,
caches=None, caches=None,
gather_idx=None): gather_idx=None):
super(DecoderLayer, self).__init__(name_scope) super(DecoderLayer, self).__init__()
self._pre_process_layer = PrePostProcessLayer(d_model, preprocess_cmd, self._pre_process_layer = PrePostProcessLayer(d_model, preprocess_cmd,
3) 3)
self._decoder_sub_layers = list() self._decoder_sub_layers = list()
...@@ -783,7 +755,6 @@ class DecoderLayer(Layer): ...@@ -783,7 +755,6 @@ class DecoderLayer(Layer):
self.add_sublayer( self.add_sublayer(
'dsl_%d' % i, 'dsl_%d' % i,
DecoderSubLayer( DecoderSubLayer(
self.full_name(),
n_head, n_head,
d_key, d_key,
d_value, d_value,
...@@ -812,7 +783,6 @@ class DecoderLayer(Layer): ...@@ -812,7 +783,6 @@ class DecoderLayer(Layer):
class WrapDecoderLayer(Layer): class WrapDecoderLayer(Layer):
def __init__(self, def __init__(self,
name_scope,
trg_vocab_size, trg_vocab_size,
max_length, max_length,
n_layer, n_layer,
...@@ -833,10 +803,9 @@ class WrapDecoderLayer(Layer): ...@@ -833,10 +803,9 @@ class WrapDecoderLayer(Layer):
""" """
The wrapper assembles together all needed layers for the encoder. The wrapper assembles together all needed layers for the encoder.
""" """
super(WrapDecoderLayer, self).__init__(name_scope) super(WrapDecoderLayer, self).__init__()
self._prepare_decoder_layer = PrepareEncoderDecoderLayer( self._prepare_decoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
trg_vocab_size, trg_vocab_size,
d_model, d_model,
max_length, max_length,
...@@ -845,7 +814,6 @@ class WrapDecoderLayer(Layer): ...@@ -845,7 +814,6 @@ class WrapDecoderLayer(Layer):
word_emb_param_name=word_emb_param_names[1], word_emb_param_name=word_emb_param_names[1],
pos_enc_param_name=pos_enc_param_names[1]) pos_enc_param_name=pos_enc_param_names[1])
self._decoder_layer = DecoderLayer( self._decoder_layer = DecoderLayer(
self.full_name(),
n_layer, n_layer,
n_head, n_head,
d_key, d_key,
...@@ -861,9 +829,7 @@ class WrapDecoderLayer(Layer): ...@@ -861,9 +829,7 @@ class WrapDecoderLayer(Layer):
gather_idx=gather_idx) gather_idx=gather_idx)
self._weight_sharing = weight_sharing self._weight_sharing = weight_sharing
if not weight_sharing: if not weight_sharing:
self._fc = FC(self.full_name(), self._fc = Linear(d_model, trg_vocab_size, bias_attr=False)
size=trg_vocab_size,
bias_attr=False)
def forward(self, dec_inputs=None, enc_output=None): def forward(self, dec_inputs=None, enc_output=None):
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
...@@ -891,7 +857,6 @@ class WrapDecoderLayer(Layer): ...@@ -891,7 +857,6 @@ class WrapDecoderLayer(Layer):
class TransFormer(Layer): class TransFormer(Layer):
def __init__(self, def __init__(self,
name_scope,
src_vocab_size, src_vocab_size,
trg_vocab_size, trg_vocab_size,
max_length, max_length,
...@@ -911,7 +876,7 @@ class TransFormer(Layer): ...@@ -911,7 +876,7 @@ class TransFormer(Layer):
use_py_reader=False, use_py_reader=False,
is_test=False, is_test=False,
is_sparse=False): is_sparse=False):
super(TransFormer, self).__init__(name_scope) super(TransFormer, self).__init__()
self._label_smooth_eps = label_smooth_eps self._label_smooth_eps = label_smooth_eps
self._trg_vocab_size = trg_vocab_size self._trg_vocab_size = trg_vocab_size
if weight_sharing: if weight_sharing:
...@@ -919,7 +884,6 @@ class TransFormer(Layer): ...@@ -919,7 +884,6 @@ class TransFormer(Layer):
"Vocabularies in source and target should be same for weight sharing." "Vocabularies in source and target should be same for weight sharing."
) )
self._wrap_encoder_layer = WrapEncoderLayer( self._wrap_encoder_layer = WrapEncoderLayer(
self.full_name(),
src_vocab_size, src_vocab_size,
max_length, max_length,
n_layer, n_layer,
...@@ -936,7 +900,6 @@ class TransFormer(Layer): ...@@ -936,7 +900,6 @@ class TransFormer(Layer):
weight_sharing, weight_sharing,
is_sparse=is_sparse) is_sparse=is_sparse)
self._wrap_decoder_layer = WrapDecoderLayer( self._wrap_decoder_layer = WrapDecoderLayer(
self.full_name(),
trg_vocab_size, trg_vocab_size,
max_length, max_length,
n_layer, n_layer,
...@@ -991,7 +954,6 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): ...@@ -991,7 +954,6 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True backward_strategy.sort_sum_gradient = True
transformer = TransFormer( transformer = TransFormer(
'transformer',
ModelHyperParams.src_vocab_size, ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size, ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1, ModelHyperParams.max_length + 1,
...@@ -1020,9 +982,12 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): ...@@ -1020,9 +982,12 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
learning_rate=learning_rate, learning_rate=learning_rate,
beta1=TrainTaskConfig.beta1, beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2, beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps) epsilon=TrainTaskConfig.eps,
parameter_list=transformer.parameters())
else: else:
optimizer = fluid.optimizer.SGD(learning_rate=0.003) optimizer = fluid.optimizer.SGD(
learning_rate=0.003,
parameter_list=transformer.parameters())
dy_param_init = dict() dy_param_init = dict()
dy_param_updated = dict() dy_param_updated = dict()
...@@ -1073,7 +1038,6 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): ...@@ -1073,7 +1038,6 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
transformer = TransFormer( transformer = TransFormer(
'transformer',
ModelHyperParams.src_vocab_size, ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size, ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1, ModelHyperParams.max_length + 1,
......
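Every `Layer` subclass in this file drops the `name_scope` constructor argument; names are now generated automatically, so `super().__init__()` takes nothing and sublayers are built from plain sizes. A minimal sketch of the new constructor shape, assuming the 1.6-era dygraph API:

```python
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear

class TinyBlock(fluid.dygraph.Layer):
    def __init__(self, d_in, d_out):
        super(TinyBlock, self).__init__()  # no name_scope argument anymore
        self._fc = Linear(d_in, d_out, act='relu')

    def forward(self, x):
        return self._fc(x)
```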
...@@ -197,7 +197,8 @@ class TestLayer(LayerTest): ...@@ -197,7 +197,8 @@ class TestLayer(LayerTest):
fc1_bias_init = fc1.bias.detach() fc1_bias_init = fc1.bias.detach()
loss1.backward() loss1.backward()
optimizer1 = fluid.optimizer.SGD(learning_rate=0.1) optimizer1 = fluid.optimizer.SGD(learning_rate=0.1,
parameter_list=fc1.parameters())
optimizer1.minimize(loss1) optimizer1.minimize(loss1)
fc1_weight_updated = fc1.weight.detach() fc1_weight_updated = fc1.weight.detach()
...@@ -224,7 +225,8 @@ class TestLayer(LayerTest): ...@@ -224,7 +225,8 @@ class TestLayer(LayerTest):
out2 = fc2(base.to_variable(inp)) out2 = fc2(base.to_variable(inp))
loss2 = fluid.layers.reduce_mean(out2) loss2 = fluid.layers.reduce_mean(out2)
loss2.backward() loss2.backward()
optimizer2 = fluid.optimizer.SGD(learning_rate=0.1) optimizer2 = fluid.optimizer.SGD(learning_rate=0.1,
parameter_list=fc2.parameters())
optimizer2.minimize(loss2) optimizer2.minimize(loss2)
self.assertTrue( self.assertTrue(
......