Commit dca07583 authored by zhongpu, committed by hong

remove params in Tracer object (in dygraph) (#20815)

* remove params in Tracer object, test=develop

* Repair failed optest, test=develop

* remove build_once & name_scope (Conv2D)
test=develop

* fix unittest
test=develop

* Conv2DTranspose

* Conv3D & Conv3DTranspose
test=develop

* Pool2D & BatchNorm

* Embedding

* LayerNorm

* GRUUnit & NCE

* PRelu

* BilinearTensorProduct

* GroupNorm & SpectralNorm

* TreeConv
test=develop

* fix LayerNorm in transformer unittest
test=develop

* disable LayerNorm or BatchNorm in multicard
test=develop

* refine Layer.create_parameter api
test=develop

* refine LayerNorm, remove begin_norm_axis param, add normed shape check
test=develop

* LayerNorm bug fix
test=develop

* fix optest,test=develop

* fix optest, test=develop

* fix optest for passing parameter_list when constructing an Optimizer class instance, test=develop

* polish code for better code style, test=develop

* fix se_resnext optest, test=develop

* polish code for better code style, test=develop
Co-authored-by: songyouwei <youwei0314@gmail.com>
Parent c3e19549
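
The change set below requires dygraph users to hand each optimizer the parameters it should update, typically via Layer.parameters(). A minimal sketch of the new pattern, mirroring the docstring examples updated in this diff (the emb/adam names are illustrative):

import paddle.fluid as fluid

with fluid.dygraph.guard():
    emb = fluid.dygraph.Embedding([10, 10])
    # parameter_list is now mandatory in dygraph mode, because the Tracer
    # no longer keeps a record of every created parameter.
    adam = fluid.optimizer.Adam(
        learning_rate=fluid.layers.noam_decay(100, 10000),
        parameter_list=emb.parameters())
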
......@@ -154,14 +154,14 @@ def guard(place=None):
yield
def _print_debug_msg(limit=5, is_test=False):
def _print_debug_msg(parameter_list, limit=5, is_test=False):
if not core._is_dygraph_debug_enabled():
logging.warn(
'Debug mode is not enabled. Please set FLAGS_dygraph_debug=1 to enable debug'
)
return
unique_name_size = len(framework.unique_name.generator.ids)
tracer_var_size = len(framework._dygraph_tracer()._vars)
tracer_var_size = len(parameter_list)
alive_cpp_var_size = len(core.VarBase._alive_vars())
if not is_test:
logging.warn(
......
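
Because the Tracer no longer stores traced variables, the private debugging helper _print_debug_msg above now receives the parameter list explicitly. A hedged sketch of a call after this change (output is only produced when FLAGS_dygraph_debug=1; names are illustrative):

import paddle.fluid as fluid

with fluid.dygraph.guard():
    emb = fluid.dygraph.Embedding([10, 10])
    # When dygraph debugging is enabled, is_test=True makes the helper return
    # (unique_name_size, tracer_var_size, alive_cpp_var_size) instead of
    # logging them, matching the updated unit test later in this diff.
    sizes = fluid.dygraph.base._print_debug_msg(emb.parameters(), is_test=True)
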
......@@ -53,7 +53,8 @@ def save_dygraph(state_dict, model_path):
state_dict = emb.state_dict()
fluid.save_dygraph( state_dict, "paddle_dy")
adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000) )
adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000),
parameter_list = emb.parameters() )
state_dict = adam.state_dict()
fluid.save_dygraph( state_dict, "paddle_dy")
......@@ -96,7 +97,8 @@ def load_dygraph(model_path):
state_dict = emb.state_dict()
fluid.save_dygraph( state_dict, "paddle_dy")
adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000) )
adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000),
parameter_list = emb.parameters() )
state_dict = adam.state_dict()
fluid.save_dygraph( state_dict, "padle_dy")
......
......@@ -145,9 +145,13 @@ class Layer(core.Layer):
list of :ref:`api_guide_Variable_en` : a list of Parameters.
"""
ret = [p for p in self._parameters.values()]
parameters_set = set(ret)
if include_sublayers:
for l in self._sub_layers.values():
for p in l.parameters(include_sublayers):
if p in parameters_set:
continue
parameters_set.add(p)
ret.append(p)
return ret
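
The parameters_set guard added above deduplicates the result when the same parameter is reachable through more than one attribute. A small hedged sketch of the case it covers (class and variable names are illustrative):

import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear

class TwoHeads(fluid.dygraph.Layer):
    def __init__(self):
        super(TwoHeads, self).__init__()
        shared = Linear(4, 4)
        # The same Linear object is registered under two attribute names,
        # so its weights are reachable twice through _sub_layers.
        self.head_a = shared
        self.head_b = shared

    def forward(self, x):
        return self.head_a(x) + self.head_b(x)

with fluid.dygraph.guard():
    model = TwoHeads()
    names = [p.name for p in model.parameters()]
    # Each shared Parameter is now listed exactly once.
    assert len(names) == len(set(names))
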
......@@ -261,11 +265,6 @@ class Layer(core.Layer):
value.set_value(self._loaddict_holder[value.name])
if name in params:
# remove unused param in tracer
if framework._dygraph_tracer_ is not None:
framework._dygraph_tracer_._vars.pop(params[name].name,
None)
params[name] = value
elif isinstance(value, core.Layer):
layers = self.__dict__.get('_sub_layers', None)
......
......@@ -104,8 +104,10 @@ class PiecewiseDecay(LearningRateDecay):
boundaries = [10000, 20000]
values = [1.0, 0.5, 0.1]
with fluid.dygraph.guard():
emb = fluid.dygraph.Embedding( [10, 10] )
optimizer = fluid.optimizer.SGD(
learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0) )
learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0),
parameter_list = emb.parameters() )
"""
def __init__(self, boundaries, values, begin, step=1, dtype='float32'):
......@@ -323,12 +325,14 @@ class InverseTimeDecay(LearningRateDecay):
import paddle.fluid as fluid
base_lr = 0.1
with fluid.dygraph.guard():
emb = fluid.dygraph.Embedding([10, 10])
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.dygraph.InverseTimeDecay(
learning_rate=base_lr,
decay_steps=10000,
decay_rate=0.5,
staircase=True))
staircase=True),
parameter_list = emb.parameters())
"""
......@@ -404,9 +408,11 @@ class PolynomialDecay(LearningRateDecay):
total_step = 5000
end_lr = 0
with fluid.dygraph.guard():
emb = fluid.dygraph.Embedding( [10, 10])
optimizer = fluid.optimizer.SGD(
learning_rate = fluid.dygraph.PolynomialDecay(
start_lr, total_step, end_lr, power=1.0) )
start_lr, total_step, end_lr, power=1.0),
parameter_list = emb.parameters())
"""
......@@ -536,10 +542,12 @@ class NoamDecay(LearningRateDecay):
warmup_steps = 100
learning_rate = 0.01
with fluid.dygraph.guard():
emb = fluid.dygraph.Embedding([10, 10])
optimizer = fluid.optimizer.SGD(
learning_rate = fluid.dygraph.NoamDecay(
1/(warmup_steps *(learning_rate ** 2)),
warmup_steps) )
warmup_steps),
parameter_list = emb.parameters())
"""
def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'):
......
......@@ -31,16 +31,8 @@ class Tracer(core.Tracer):
def __init__(self):
super(Tracer, self).__init__()
self._vars = defaultdict()
self._train_mode = True
def trace_var(self, name, var):
self._vars[name] = var
def all_parameters(self):
return list((item for name, item in six.iteritems(self._vars)
if isinstance(item, framework.Parameter)))
def trace_op(self, type, inputs, outputs, attrs, stop_gradient=False):
self.trace(type, inputs, outputs, attrs,
framework._current_expected_place(), self._train_mode and
......
......@@ -4676,8 +4676,6 @@ class ParamBase(core.VarBase):
# self.block = default_main_program().global_block()
_dygraph_tracer().trace_var(name, self)
def __str__(self):
return self.to_string(True)
......
......@@ -60,7 +60,12 @@ class Optimizer(object):
"""
@imperative_base.no_grad
def __init__(self, learning_rate, regularization=None, name=None):
def __init__(self,
learning_rate,
parameter_list=None,
regularization=None,
name=None):
self._parameter_list = None
if framework.in_dygraph_mode():
if not isinstance(learning_rate, float) and \
not isinstance(learning_rate, LearningRateDecay):
......@@ -71,6 +76,12 @@ class Optimizer(object):
self._name = unique_name.generate(name)
else:
self._name = unique_name.generate(self.__class__.__name__)
if parameter_list is not None:
self._parameter_list = parameter_list
else:
raise AttributeError(
"parameter_list argument given to the Optimizer should not be None in dygraph mode."
)
else:
if not isinstance(learning_rate, float) and \
not isinstance(learning_rate, framework.Variable):
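
With the Tracer fallback gone, the constructor above fails fast in dygraph mode when no parameter_list is supplied. A hedged sketch of the resulting behaviour:

import paddle.fluid as fluid

with fluid.dygraph.guard():
    try:
        # No parameter_list and no global Tracer registry to fall back on,
        # so construction raises immediately.
        sgd = fluid.optimizer.SGD(learning_rate=0.01)
    except AttributeError as exc:
        print(exc)  # parameter_list ... should not be None in dygraph mode.
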
......@@ -154,7 +165,8 @@ class Optimizer(object):
state_dict = emb.state_dict()
fluid.save_dygraph( state_dict, "paddle_dy")
adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000) )
adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000),
parameter_list = emb.parameters() )
state_dict = adam.state_dict()
fluid.save_dygraph( state_dict, "padle_dy")
......@@ -530,13 +542,8 @@ class Optimizer(object):
self._dtype = loss.dtype
if framework.in_dygraph_mode():
if parameter_list is not None:
parameters = parameter_list
else:
parameters = framework._dygraph_tracer().all_parameters()
params_grads = []
for param in parameters:
for param in self._parameter_list:
if not param.trainable:
continue
if param._grad_ivar() is not None:
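
Since the list is captured at construction time, backward() and minimize() now iterate self._parameter_list directly. A hedged end-to-end sketch in dygraph mode (names are illustrative):

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, to_variable

with fluid.dygraph.guard():
    net = Linear(4, 1)
    sgd = fluid.optimizer.SGD(learning_rate=0.1,
                              parameter_list=net.parameters())
    x = to_variable(np.ones((2, 4), dtype='float32'))
    loss = fluid.layers.reduce_mean(net(x))
    loss.backward()
    # minimize() walks the stored parameter_list rather than querying the
    # (removed) Tracer parameter registry.
    sgd.minimize(loss)
    net.clear_gradients()
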
......@@ -705,6 +712,9 @@ class SGDOptimizer(Optimizer):
Parameters:
learning_rate (float|Variable): The learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element.
parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. \
Optional, default is None.
name (str, optional): This parameter is used by developers to print debugging information. \
......@@ -740,10 +750,15 @@ class SGDOptimizer(Optimizer):
"""
def __init__(self, learning_rate, regularization=None, name=None):
def __init__(self,
learning_rate,
parameter_list=None,
regularization=None,
name=None):
assert learning_rate is not None
super(SGDOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
name=name)
self.type = "sgd"
......@@ -801,6 +816,9 @@ class MomentumOptimizer(Optimizer):
learning_rate (float|Variable): The learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element.
momentum (float): Momentum factor
parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
use_nesterov (bool, optional): Enables Nesterov momentum, default is false.
regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. \
Optional, default is None.
......@@ -841,6 +859,7 @@ class MomentumOptimizer(Optimizer):
def __init__(self,
learning_rate,
momentum,
parameter_list=None,
use_nesterov=False,
regularization=None,
name=None):
......@@ -848,6 +867,7 @@ class MomentumOptimizer(Optimizer):
assert momentum is not None
super(MomentumOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
name=name)
self.type = "momentum"
......@@ -921,6 +941,9 @@ class DGCMomentumOptimizer(Optimizer):
sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity). \
Default is [0.999]. For example, if the sparsity is [0.99, 0.999], \
the top [1%, 0.1%] important element will be transmitted.
parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
use_nesterov (bool): Enables Nesterov momentum. True means use Nesterov. Default is False.
local_grad_clip_norm (float, optional): Local gradient clip norm value. Optional, default is None, represent no need clip.
num_trainers (int, optional): The number of training nodes. Optional, default is None.
......@@ -950,6 +973,7 @@ class DGCMomentumOptimizer(Optimizer):
rampup_begin_step,
rampup_step=1,
sparsity=[0.999],
parameter_list=None,
use_nesterov=False,
local_grad_clip_norm=None,
num_trainers=None,
......@@ -959,6 +983,7 @@ class DGCMomentumOptimizer(Optimizer):
assert momentum is not None
super(DGCMomentumOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
name=name)
self.type = "dgc_momentum"
......@@ -1286,6 +1311,9 @@ class LarsMomentumOptimizer(Optimizer):
momentum (float): momentum factor
lars_coeff (float): Defines how much we trust the layer to change its weights.
lars_weight_decay (float): Weight decay coefficient for decaying using LARS.
parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`.
Optional, default is None.
name (str, optional): This parameter is used by developers to print debugging information. \
......@@ -1318,12 +1346,14 @@ class LarsMomentumOptimizer(Optimizer):
momentum,
lars_coeff=0.001,
lars_weight_decay=0.0005,
parameter_list=None,
regularization=None,
name=None):
assert learning_rate is not None
assert momentum is not None
super(LarsMomentumOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
name=name)
self.type = "lars_momentum"
......@@ -1391,6 +1421,9 @@ class AdagradOptimizer(Optimizer):
It can be a float value or a ``Variable`` with a float type.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-06.
parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as
:ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None.
name (str, optional): Normally there is no need for user to set this property.
......@@ -1423,6 +1456,7 @@ class AdagradOptimizer(Optimizer):
def __init__(self,
learning_rate,
epsilon=1.0e-6,
parameter_list=None,
regularization=None,
name=None,
initial_accumulator_value=0.0):
......@@ -1430,6 +1464,7 @@ class AdagradOptimizer(Optimizer):
assert epsilon is not None
super(AdagradOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
name=name)
self.type = "adagrad"
......@@ -1510,6 +1545,9 @@ class AdamOptimizer(Optimizer):
The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as
:ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None.
name (str, optional): Normally there is no need for user to set this property.
......@@ -1619,6 +1657,7 @@ class AdamOptimizer(Optimizer):
beta1=0.9,
beta2=0.999,
epsilon=1e-8,
parameter_list=None,
regularization=None,
name=None,
lazy_mode=False):
......@@ -1628,6 +1667,7 @@ class AdamOptimizer(Optimizer):
assert epsilon is not None
super(AdamOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
name=name)
self.type = "adam"
......@@ -1747,6 +1787,9 @@ class AdamaxOptimizer(Optimizer):
The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as
:ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None.
name (str, optional): Normally there is no need for user to set this property.
......@@ -1792,6 +1835,7 @@ class AdamaxOptimizer(Optimizer):
beta1=0.9,
beta2=0.999,
epsilon=1e-8,
parameter_list=None,
regularization=None,
name=None):
assert learning_rate is not None
......@@ -1800,6 +1844,7 @@ class AdamaxOptimizer(Optimizer):
assert epsilon is not None
super(AdamaxOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
name=name)
self.type = "adamax"
......@@ -1909,6 +1954,9 @@ class DpsgdOptimizer(Optimizer):
clip (float): clipping threshold
batch_size (float): batch size.
sigma (float): for gaussian noise.
parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
Notes:
Currently, DpsgdOptimizer doesn't support sparse parameter optimization.
"""
......@@ -1917,12 +1965,14 @@ class DpsgdOptimizer(Optimizer):
learning_rate=0.001,
clip=0.9,
batch_size=0.999,
sigma=1e-8):
sigma=1e-8,
parameter_list=None):
assert learning_rate is not None
assert clip is not None
assert batch_size is not None
assert sigma is not None
super(DpsgdOptimizer, self).__init__(learning_rate=learning_rate)
super(DpsgdOptimizer, self).__init__(
learning_rate=learning_rate, parameter_list=parameter_list)
self.type = "dpsgd"
self._clip = clip
self._batch_size = batch_size
......@@ -1976,6 +2026,9 @@ class DecayedAdagradOptimizer(Optimizer):
decay (float, optional): The decay rate. The default value is 0.95.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-06.
parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
regularization (WeightDecayRegularizer, optional): A ``Regularizer``, such as
:ref:`api_fluid_regularizer_L2DecayRegularizer`. The default value is None.
name (str, optional): Normally there is no need for user to set this property.
......@@ -2002,6 +2055,7 @@ class DecayedAdagradOptimizer(Optimizer):
learning_rate,
decay=0.95,
epsilon=1.0e-6,
parameter_list=None,
regularization=None,
name=None):
assert learning_rate is not None
......@@ -2010,6 +2064,7 @@ class DecayedAdagradOptimizer(Optimizer):
super(DecayedAdagradOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
name=name)
self.type = "decayed_adagrad"
......@@ -2066,6 +2121,9 @@ class AdadeltaOptimizer(Optimizer):
learning_rate (float|Variable): global learning rate.
epsilon (float): a small float number for numeric stability. Default 1.0e-6.
rho (float): a floating point value indicating the decay rate. Default 0.95.
parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
regularization (WeightDecayRegularizer, optional): A Regularizer, such as
fluid.regularizer.L2DecayRegularizer. Default None, meaning that there is no
regularization.
......@@ -2097,6 +2155,7 @@ class AdadeltaOptimizer(Optimizer):
learning_rate,
epsilon=1.0e-6,
rho=0.95,
parameter_list=None,
regularization=None,
name=None):
if learning_rate is None:
......@@ -2107,6 +2166,7 @@ class AdadeltaOptimizer(Optimizer):
raise ValueError("rho is not set.")
super(AdadeltaOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
name=name)
self.type = "adadelta"
......@@ -2210,6 +2270,9 @@ class RMSPropOptimizer(Optimizer):
the gradient; if False, by the uncentered second moment. Setting this to
True may help with training, but is slightly more expensive in terms of
computation and memory. Defaults to False.
parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. \
Optional, default is None.
name (str, optional): This parameter is used by developers to print debugging information. \
......@@ -2258,10 +2321,12 @@ class RMSPropOptimizer(Optimizer):
epsilon=1.0e-6,
momentum=0.0,
centered=False,
parameter_list=None,
regularization=None,
name=None):
super(RMSPropOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
name=name)
if learning_rate is None:
......@@ -2370,6 +2435,9 @@ class FtrlOptimizer(Optimizer):
l1 (float): L1 regularization strength, default is 0.0.
l2 (float): L2 regularization strength, default is 0.0.
lr_power (float): Learning Rate Power, default is -0.5.
parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
regularization: A Regularizer, such as :ref:`api_fluid_regularizer_L2DecayRegularizer`. \
Optional, default is None.
name (str, optional): This parameter is used by developers to print debugging information. \
......@@ -2418,10 +2486,12 @@ class FtrlOptimizer(Optimizer):
l1=0.0,
l2=0.0,
lr_power=-0.5,
parameter_list=None,
regularization=None,
name=None):
super(FtrlOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
name=name)
if learning_rate is None:
......@@ -2504,6 +2574,9 @@ class LambOptimizer(AdamOptimizer):
beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
Default 0.999.
epsilon (float, optional): A small float value for numerical stability. Default 1e-6.
parameter_list (list, optional): List of ``Variable`` names to update to minimize ``loss``. \
This parameter is required in dygraph mode. \
The default value is None in static mode, at this time all parameters will be updated.
regularization (Regularizer|None): A Regularizer, such as
fluid.regularizer.L1DecayRegularizer. Default None.
exclude_from_weight_decay_fn (function|None): Exclude a parameter from weight
......@@ -2540,6 +2613,7 @@ class LambOptimizer(AdamOptimizer):
beta1=0.9,
beta2=0.999,
epsilon=1e-6,
parameter_list=None,
regularization=None,
exclude_from_weight_decay_fn=None,
name=None):
......@@ -2550,6 +2624,7 @@ class LambOptimizer(AdamOptimizer):
assert epsilon is not None
super(LambOptimizer, self).__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
beta1=beta1,
beta2=beta2,
......
......@@ -26,7 +26,7 @@ import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
from paddle.fluid.dygraph.base import to_variable
from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
......@@ -79,8 +79,8 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
class MNIST(fluid.dygraph.Layer):
def __init__(self, name_scope):
super(MNIST, self).__init__(name_scope)
def __init__(self):
super(MNIST, self).__init__()
self._simple_img_conv_pool_1 = SimpleImgConvPool(
1, 20, 5, 2, 2, act="relu")
......@@ -88,19 +88,21 @@ class MNIST(fluid.dygraph.Layer):
self._simple_img_conv_pool_2 = SimpleImgConvPool(
20, 50, 5, 2, 2, act="relu")
pool_2_shape = 50 * 4 * 4
self.pool_2_shape = 50 * 4 * 4
SIZE = 10
scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
self._fc = FC(self.full_name(),
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
self._fc = Linear(
self.pool_2_shape,
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
def forward(self, inputs, label):
x = self._simple_img_conv_pool_1(inputs)
x = self._simple_img_conv_pool_2(x)
x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape])
cost = self._fc(x)
loss = fluid.layers.cross_entropy(cost, label)
avg_loss = fluid.layers.mean(loss)
......@@ -109,10 +111,11 @@ class MNIST(fluid.dygraph.Layer):
class TestMnist(TestParallelDyGraphRunnerBase):
def get_model(self):
model = MNIST("mnist")
model = MNIST()
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=2, drop_last=True)
opt = fluid.optimizer.Adam(learning_rate=1e-3)
opt = fluid.optimizer.Adam(
learning_rate=1e-3, parameter_list=model.parameters())
return model, train_reader, opt
def run_one_loop(self, model, opt, data):
......
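
The test above, like the ones that follow, replaces the removed FC layer with Linear, which takes explicit input and output sizes and no name scope, so callers flatten the input to the known feature size first. A hedged sketch of the migration pattern (shapes follow the MNIST model above):

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, to_variable

with fluid.dygraph.guard():
    # old: FC(self.full_name(), 10, act="softmax")   -- input dim inferred
    # new: Linear(in_dim, out_dim, act="softmax")    -- input dim explicit
    fc = Linear(50 * 4 * 4, 10, act="softmax")
    feat = to_variable(np.random.random((2, 50, 4, 4)).astype('float32'))
    feat = fluid.layers.reshape(feat, shape=[-1, 50 * 4 * 4])
    out = fc(feat)  # shape [2, 10]
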
......@@ -27,7 +27,7 @@ import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.layer_helper import LayerHelper
import math
......@@ -54,7 +54,7 @@ train_parameters = {
}
def optimizer_setting(params):
def optimizer_setting(params, parameter_list=None):
ls = params["learning_strategy"]
if "total_images" not in params:
total_images = 6149
......@@ -66,11 +66,19 @@ def optimizer_setting(params):
bd = [step * e for e in ls["epochs"]]
lr = params["lr"]
num_epochs = params["num_epochs"]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.cosine_decay(
learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
momentum=momentum_rate,
regularization=fluid.regularizer.L2Decay(l2_decay))
if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.cosine_decay(
learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
momentum=momentum_rate,
regularization=fluid.regularizer.L2Decay(l2_decay),
parameter_list=parameter_list)
else:
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.cosine_decay(
learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
momentum=momentum_rate,
regularization=fluid.regularizer.L2Decay(l2_decay))
return optimizer
......@@ -107,27 +115,29 @@ class ConvBNLayer(fluid.dygraph.Layer):
class SqueezeExcitation(fluid.dygraph.Layer):
def __init__(self, name_scope, num_channels, reduction_ratio):
def __init__(self, num_channels, reduction_ratio):
super(SqueezeExcitation, self).__init__(name_scope)
super(SqueezeExcitation, self).__init__()
self._num_channels = num_channels
self._pool = Pool2D(pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(num_channels * 1.0)
self._squeeze = FC(
self.full_name(),
size=num_channels // reduction_ratio,
self._squeeze = Linear(
num_channels,
num_channels // reduction_ratio,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)),
act='relu')
stdv = 1.0 / math.sqrt(num_channels / 16.0 * 1.0)
self._excitation = FC(
self.full_name(),
size=num_channels,
self._excitation = Linear(
num_channels // reduction_ratio,
num_channels,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)),
act='sigmoid')
def forward(self, input):
y = self._pool(input)
y = fluid.layers.reshape(y, shape=[-1, self._num_channels])
y = self._squeeze(y)
y = self._excitation(y)
y = fluid.layers.elementwise_mul(x=input, y=y, axis=0)
......@@ -163,9 +173,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
act=None)
self.scale = SqueezeExcitation(
self.full_name(),
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio)
num_channels=num_filters * 2, reduction_ratio=reduction_ratio)
if not shortcut:
self.short = ConvBNLayer(
......@@ -194,8 +202,8 @@ class BottleneckBlock(fluid.dygraph.Layer):
class SeResNeXt(fluid.dygraph.Layer):
def __init__(self, name_scope, layers=50, class_dim=102):
super(SeResNeXt, self).__init__(name_scope)
def __init__(self, layers=50, class_dim=102):
super(SeResNeXt, self).__init__()
self.layers = layers
supported_layers = [50, 101, 152]
......@@ -276,10 +284,13 @@ class SeResNeXt(fluid.dygraph.Layer):
pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(2048 * 1.0)
self.out = FC(self.full_name(),
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 2 * 1 * 1
self.out = Linear(
self.pool2d_avg_output,
class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
def forward(self, inputs):
if self.layers == 50 or self.layers == 101:
......@@ -294,18 +305,20 @@ class SeResNeXt(fluid.dygraph.Layer):
for bottleneck_block in self.bottleneck_block_list:
y = bottleneck_block(y)
y = self.pool2d_avg(y)
y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output])
y = self.out(y)
return y
class TestSeResNeXt(TestParallelDyGraphRunnerBase):
def get_model(self):
model = SeResNeXt("se-resnext")
model = SeResNeXt()
train_reader = paddle.batch(
paddle.dataset.flowers.test(use_xmap=False),
batch_size=train_parameters["batch_size"],
drop_last=True)
optimizer = optimizer_setting(train_parameters)
optimizer = optimizer_setting(
train_parameters, parameter_list=model.parameters())
return model, train_reader, optimizer
def run_one_loop(self, model, opt, data):
......
......@@ -23,7 +23,7 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
......@@ -75,8 +75,8 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
class MNIST(fluid.dygraph.Layer):
def __init__(self, name_scope):
super(MNIST, self).__init__(name_scope)
def __init__(self):
super(MNIST, self).__init__()
self._simple_img_conv_pool_1 = SimpleImgConvPool(
1, 20, 5, 2, 2, act="relu")
......@@ -84,19 +84,21 @@ class MNIST(fluid.dygraph.Layer):
self._simple_img_conv_pool_2 = SimpleImgConvPool(
20, 50, 5, 2, 2, act="relu")
pool_2_shape = 50 * 4 * 4
SIZE = 10
scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
self._fc = FC(self.full_name(),
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
self.pool_2_shape = 50 * 4 * 4
SIZE = 100 #10
scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
self._fc = Linear(
self.pool_2_shape,
SIZE,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
def forward(self, inputs):
x = self._simple_img_conv_pool_1(inputs)
x = self._simple_img_conv_pool_2(x)
x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape])
x = self._fc(x)
return x
......@@ -109,8 +111,9 @@ class TestDygraphMultiForward(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
mnist = MNIST("mnist")
sgd = SGDOptimizer(learning_rate=1e-3)
mnist = MNIST()
sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=mnist.parameters())
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
......@@ -145,7 +148,7 @@ class TestDygraphMultiForward(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mnist = MNIST("mnist")
mnist = MNIST()
sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
......
......@@ -258,7 +258,9 @@ class TestImperativeAutoPrune(unittest.TestCase):
fc2_origin = fc2._w.numpy()
fc2._w.stop_gradient = True
out2.backward()
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
optimizer = fluid.optimizer.SGD(
learning_rate=0.003,
parameter_list=(fc.parameters() + fc2.parameters()))
optimizer.minimize(out2)
self.assertTrue(np.array_equal(fc2_origin, fc2._w.numpy()))
self.assertFalse(np.array_equal(fc_origin, fc._w.numpy()))
......@@ -279,7 +281,9 @@ class TestImperativeAutoPrune(unittest.TestCase):
fc2_origin = fc2._w.numpy()
out2.stop_gradient = True
out2.backward()
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
optimizer = fluid.optimizer.SGD(
learning_rate=0.003,
parameter_list=(fc.parameters() + fc2.parameters()))
optimizer.minimize(out2)
self.assertTrue(np.array_equal(fc2_origin, fc2._w.numpy()))
self.assertTrue(np.array_equal(fc_origin, fc._w.numpy()))
......@@ -320,7 +324,8 @@ class TestImperativeAutoPrune(unittest.TestCase):
place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
model = MyLayer("mylayer", vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer(0.001)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters())
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
indices = fluid.dygraph.to_variable(indices)
......@@ -338,7 +343,8 @@ class TestImperativeAutoPrune(unittest.TestCase):
with fluid.dygraph.guard(place):
model = MyLayer2("mylayer", vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer(0.001)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters())
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
indices = fluid.dygraph.to_variable(indices)
......
......@@ -58,7 +58,7 @@ class TestDygraphDebugString(unittest.TestCase):
out.backward()
mlp.clear_gradients()
unique_name_tmp, trace_var_tmp, alive_var_tmp = fluid.dygraph.base._print_debug_msg(
is_test=True)
mlp.parameters(), is_test=True)
if i > 0:
self.assertGreaterEqual(unique_name, unique_name_tmp)
self.assertGreaterEqual(trace_var, trace_var_tmp)
......@@ -68,7 +68,7 @@ class TestDygraphDebugString(unittest.TestCase):
trace_var = trace_var_tmp
alive_var = alive_var_tmp
try:
fluid.dygraph.base._print_debug_msg()
fluid.dygraph.base._print_debug_msg(mlp.parameters())
except Exception as e:
raise RuntimeError(
"No Exception is accepted in _print_debug_msg, but we got: {}".
......
......@@ -23,6 +23,7 @@ import paddle.fluid as fluid
import paddle.fluid.core as core
from test_imperative_base import new_program_scope
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph import Linear
# Can use Amusic dataset as the DeepCF describes.
DATA_PATH = os.environ.get('DATA_PATH', '')
......@@ -33,10 +34,10 @@ NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1))
class DMF(fluid.Layer):
def __init__(self, name_scope):
super(DMF, self).__init__(name_scope)
self._user_latent = fluid.FC(self.full_name(), 256)
self._item_latent = fluid.FC(self.full_name(), 256)
def __init__(self):
super(DMF, self).__init__()
self._user_latent = Linear(1000, 256)
self._item_latent = Linear(100, 256)
self._user_layers = []
self._item_layers = []
......@@ -45,11 +46,17 @@ class DMF(fluid.Layer):
self._user_layers.append(
self.add_sublayer(
'user_layer_%d' % i,
fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
Linear(
256 if i == 0 else self._hid_sizes[i - 1],
self._hid_sizes[i],
act='relu')))
self._item_layers.append(
self.add_sublayer(
'item_layer_%d' % i,
fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
Linear(
256 if i == 0 else self._hid_sizes[i - 1],
self._hid_sizes[i],
act='relu')))
def forward(self, users, items):
users = self._user_latent(users)
......@@ -62,17 +69,20 @@ class DMF(fluid.Layer):
class MLP(fluid.Layer):
def __init__(self, name_scope):
super(MLP, self).__init__(name_scope)
self._user_latent = fluid.FC(self.full_name(), 256)
self._item_latent = fluid.FC(self.full_name(), 256)
def __init__(self):
super(MLP, self).__init__()
self._user_latent = Linear(1000, 256)
self._item_latent = Linear(100, 256)
self._match_layers = []
self._hid_sizes = [128, 64]
for i in range(len(self._hid_sizes)):
self._match_layers.append(
self.add_sublayer(
'match_layer_%d' % i,
fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
Linear(
256 * 2 if i == 0 else self._hid_sizes[i - 1],
self._hid_sizes[i],
act='relu')))
def forward(self, users, items):
users = self._user_latent(users)
......@@ -85,8 +95,8 @@ class MLP(fluid.Layer):
class DeepCF(fluid.Layer):
def __init__(self, name_scope, num_users, num_items, matrix):
super(DeepCF, self).__init__(name_scope)
def __init__(self, num_users, num_items, matrix):
super(DeepCF, self).__init__()
self._num_users = num_users
self._num_items = num_items
self._rating_matrix = self.create_parameter(
......@@ -97,9 +107,9 @@ class DeepCF(fluid.Layer):
default_initializer=fluid.initializer.NumpyArrayInitializer(matrix))
self._rating_matrix.stop_gradient = True
self._mlp = MLP(self.full_name())
self._dmf = DMF(self.full_name())
self._match_fc = fluid.FC(self.full_name(), 1, act='sigmoid')
self._mlp = MLP()
self._dmf = DMF()
self._match_fc = Linear(128, 1, act='sigmoid')
def forward(self, users, items):
# users_emb = self._user_emb(users)
......@@ -208,7 +218,7 @@ class TestDygraphDeepCF(unittest.TestCase):
items = fluid.layers.data('items', [1], dtype='int32')
labels = fluid.layers.data('labels', [1], dtype='float32')
deepcf = DeepCF('deepcf', num_users, num_items, matrix)
deepcf = DeepCF(num_users, num_items, matrix)
prediction = deepcf(users, items)
loss = fluid.layers.reduce_sum(
fluid.layers.log_loss(prediction, labels))
......@@ -237,8 +247,9 @@ class TestDygraphDeepCF(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
deepcf = DeepCF('deepcf', num_users, num_items, matrix)
adam = fluid.optimizer.AdamOptimizer(0.01)
deepcf = DeepCF(num_users, num_items, matrix)
adam = fluid.optimizer.AdamOptimizer(
0.01, parameter_list=deepcf.parameters())
for e in range(NUM_EPOCHES):
sys.stderr.write('epoch %d\n' % e)
for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
......@@ -261,8 +272,9 @@ class TestDygraphDeepCF(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
deepcf2 = DeepCF('deepcf', num_users, num_items, matrix)
adam2 = fluid.optimizer.AdamOptimizer(0.01)
deepcf2 = DeepCF(num_users, num_items, matrix)
adam2 = fluid.optimizer.AdamOptimizer(
0.01, parameter_list=deepcf2.parameters())
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
for e in range(NUM_EPOCHES):
......
......@@ -22,33 +22,35 @@ import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid import Conv2D, Pool2D, FC
from paddle.fluid import Conv2D, Pool2D, Linear
from test_imperative_base import new_program_scope
from paddle.fluid.dygraph.base import to_variable
class Discriminator(fluid.Layer):
def __init__(self, name_scope):
super(Discriminator, self).__init__(name_scope)
self._fc1 = FC(self.full_name(), size=32, act='elu')
self._fc2 = FC(self.full_name(), size=1)
def __init__(self):
super(Discriminator, self).__init__()
self._fc1 = Linear(1, 32, act='elu')
self._fc2 = Linear(32, 1)
def forward(self, inputs):
x = self._fc1(inputs)
return self._fc2(x)
x = self._fc2(x)
return x
class Generator(fluid.Layer):
def __init__(self, name_scope):
super(Generator, self).__init__(name_scope)
self._fc1 = FC(self.full_name(), size=64, act='elu')
self._fc2 = FC(self.full_name(), size=64, act='elu')
self._fc3 = FC(self.full_name(), size=1)
def __init__(self):
super(Generator, self).__init__()
self._fc1 = Linear(2, 64, act='elu')
self._fc2 = Linear(64, 64, act='elu')
self._fc3 = Linear(64, 1)
def forward(self, inputs):
x = self._fc1(inputs)
x = self._fc2(x)
return self._fc3(x)
x = self._fc3(x)
return x
class TestDygraphGAN(unittest.TestCase):
......@@ -65,8 +67,8 @@ class TestDygraphGAN(unittest.TestCase):
scope = fluid.core.Scope()
with new_program_scope(
main=discriminate_p, startup=startup, scope=scope):
discriminator = Discriminator("d")
generator = Generator("g")
discriminator = Discriminator()
generator = Generator()
img = fluid.layers.data(
name="img", shape=[2, 1], append_batch_size=False)
......@@ -93,8 +95,8 @@ class TestDygraphGAN(unittest.TestCase):
sgd.minimize(d_loss)
with new_program_scope(main=generate_p, startup=startup, scope=scope):
discriminator = Discriminator("d")
generator = Generator("g")
discriminator = Discriminator()
generator = Generator()
noise = fluid.layers.data(
name="noise", shape=[2, 2], append_batch_size=False)
......@@ -134,9 +136,12 @@ class TestDygraphGAN(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
discriminator = Discriminator("d")
generator = Generator("g")
sgd = SGDOptimizer(learning_rate=1e-3)
discriminator = Discriminator()
generator = Generator()
sgd = SGDOptimizer(
learning_rate=1e-3,
parameter_list=(
discriminator.parameters() + generator.parameters()))
d_real = discriminator(to_variable(np.ones([2, 1], np.float32)))
d_loss_real = fluid.layers.reduce_mean(
......@@ -177,9 +182,12 @@ class TestDygraphGAN(unittest.TestCase):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
discriminator2 = Discriminator("d")
generator2 = Generator("g")
sgd2 = SGDOptimizer(learning_rate=1e-3)
discriminator2 = Discriminator()
generator2 = Generator()
sgd2 = SGDOptimizer(
learning_rate=1e-3,
parameter_list=(
discriminator2.parameters() + generator2.parameters()))
d_real2 = discriminator2(to_variable(np.ones([2, 1], np.float32)))
d_loss_real2 = fluid.layers.reduce_mean(
......
......@@ -131,7 +131,8 @@ class TestDygraphGNN(unittest.TestCase):
to_variable(labels))
loss = fluid.layers.reduce_sum(loss)
loss.backward()
adam = AdamOptimizer(learning_rate=1e-3)
adam = AdamOptimizer(
learning_rate=1e-3, parameter_list=model.parameters())
adam.minimize(loss)
model.clear_gradients()
......@@ -156,7 +157,8 @@ class TestDygraphGNN(unittest.TestCase):
logits2, to_variable(labels2))
loss2 = fluid.layers.reduce_sum(loss2)
loss2.backward()
adam2 = AdamOptimizer(learning_rate=1e-3)
adam2 = AdamOptimizer(
learning_rate=1e-3, parameter_list=model2.parameters())
adam2.minimize(loss2)
model2.clear_gradients()
loss2_value = loss2.numpy()
......
......@@ -105,7 +105,9 @@ class TestDygraphSimpleNet(unittest.TestCase):
is_sparse=is_sparse,
dtype=dtype)
sgd = SGDOptimizer(learning_rate=1e-3)
sgd = SGDOptimizer(
learning_rate=1e-3,
parameter_list=simple_net.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
......
......@@ -23,7 +23,7 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
from utils import DyGraphProgramDescTracerTestHelper, is_equal_program
......@@ -77,8 +77,8 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
class MNIST(fluid.dygraph.Layer):
def __init__(self, name_scope):
super(MNIST, self).__init__(name_scope)
def __init__(self):
super(MNIST, self).__init__()
self._simple_img_conv_pool_1 = SimpleImgConvPool(
1, 20, 5, 2, 2, act="relu")
......@@ -86,19 +86,21 @@ class MNIST(fluid.dygraph.Layer):
self._simple_img_conv_pool_2 = SimpleImgConvPool(
20, 50, 5, 2, 2, act="relu")
pool_2_shape = 50 * 4 * 4
self.pool_2_shape = 50 * 4 * 4
SIZE = 10
scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
self._fc = FC(self.full_name(),
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
self._fc = Linear(
self.pool_2_shape,
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
def forward(self, inputs):
x = self._simple_img_conv_pool_1(inputs)
x = self._simple_img_conv_pool_2(x)
x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape])
x = self._fc(x)
return x
......@@ -125,8 +127,9 @@ class TestImperativeMnist(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
mnist = MNIST("mnist")
sgd = SGDOptimizer(learning_rate=1e-3)
mnist = MNIST()
sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=mnist.parameters())
batch_py_reader = fluid.io.PyReader(capacity=1)
batch_py_reader.decorate_sample_list_generator(
......@@ -189,7 +192,7 @@ class TestImperativeMnist(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mnist = MNIST("mnist")
mnist = MNIST()
sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch(
paddle.dataset.mnist.train(),
......
......@@ -39,8 +39,9 @@ class TestImperativeMnistSortGradient(unittest.TestCase):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
mnist2 = MNIST("mnist")
sgd2 = SGDOptimizer(learning_rate=1e-3)
mnist2 = MNIST()
sgd2 = SGDOptimizer(
learning_rate=1e-3, parameter_list=mnist2.parameters())
train_reader2 = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
......@@ -85,7 +86,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mnist = MNIST("mnist")
mnist = MNIST()
sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
......
......@@ -18,7 +18,7 @@ import numpy as np
import six
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm, Embedding, GRUUnit
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm, Embedding, GRUUnit
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
......@@ -27,6 +27,8 @@ class Config(object):
'''
config for training
'''
# encoder rnn hidden_size
encoder_size = 200
# decoder size for decoder stage
decoder_size = 128
# size for word embedding
......@@ -118,8 +120,8 @@ class ConvBNPool(fluid.dygraph.Layer):
class OCRConv(fluid.dygraph.Layer):
def __init__(self, name_scope, is_test=False, use_cudnn=True):
super(OCRConv, self).__init__(name_scope)
def __init__(self, is_test=False, use_cudnn=True):
super(OCRConv, self).__init__()
self.conv_bn_pool_1 = ConvBNPool(
2, [16, 16], [1, 16], is_test=is_test, use_cudnn=use_cudnn)
self.conv_bn_pool_2 = ConvBNPool(
......@@ -143,7 +145,6 @@ class OCRConv(fluid.dygraph.Layer):
class DynamicGRU(fluid.dygraph.Layer):
def __init__(self,
scope_name,
size,
param_attr=None,
bias_attr=None,
......@@ -152,7 +153,7 @@ class DynamicGRU(fluid.dygraph.Layer):
candidate_activation='tanh',
h_0=None,
origin_mode=False):
super(DynamicGRU, self).__init__(scope_name)
super(DynamicGRU, self).__init__()
self.gru_unit = GRUUnit(
size * 3,
......@@ -164,6 +165,7 @@ class DynamicGRU(fluid.dygraph.Layer):
self.h_0 = h_0
self.is_reverse = is_reverse
self.size = size
def forward(self, inputs):
hidden = self.h_0
......@@ -188,11 +190,10 @@ class DynamicGRU(fluid.dygraph.Layer):
class EncoderNet(fluid.dygraph.Layer):
def __init__(self,
scope_name,
rnn_hidden_size=200,
rnn_hidden_size=Config.encoder_size,
is_test=False,
use_cudnn=True):
super(EncoderNet, self).__init__(scope_name)
super(EncoderNet, self).__init__()
self.rnn_hidden_size = rnn_hidden_size
para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0,
0.02))
......@@ -207,28 +208,19 @@ class EncoderNet(fluid.dygraph.Layer):
shape=[Config.batch_size, rnn_hidden_size],
dtype='float32',
value=0)
self.ocr_convs = OCRConv(
self.full_name(), is_test=is_test, use_cudnn=use_cudnn)
self.fc_1_layer = FC(self.full_name(),
rnn_hidden_size * 3,
param_attr=para_attr,
bias_attr=False,
num_flatten_dims=2)
self.fc_2_layer = FC(self.full_name(),
rnn_hidden_size * 3,
param_attr=para_attr,
bias_attr=False,
num_flatten_dims=2)
self.ocr_convs = OCRConv(is_test=is_test, use_cudnn=use_cudnn)
self.fc_1_layer = Linear(
768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False)
self.fc_2_layer = Linear(
768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False)
self.gru_forward_layer = DynamicGRU(
self.full_name(),
size=rnn_hidden_size,
h_0=h_0,
param_attr=para_attr,
bias_attr=bias_attr,
candidate_activation='relu')
self.gru_backward_layer = DynamicGRU(
self.full_name(),
size=rnn_hidden_size,
h_0=h_0,
param_attr=para_attr,
......@@ -236,10 +228,8 @@ class EncoderNet(fluid.dygraph.Layer):
candidate_activation='relu',
is_reverse=True)
self.encoded_proj_fc = FC(self.full_name(),
Config.decoder_size,
bias_attr=False,
num_flatten_dims=2)
self.encoded_proj_fc = Linear(
rnn_hidden_size * 2, Config.decoder_size, bias_attr=False)
def forward(self, inputs):
conv_features = self.ocr_convs(inputs)
......@@ -272,18 +262,12 @@ class EncoderNet(fluid.dygraph.Layer):
class SimpleAttention(fluid.dygraph.Layer):
def __init__(self, scope_name, decoder_size):
super(SimpleAttention, self).__init__(scope_name)
self.fc_1 = FC(self.full_name(),
decoder_size,
act=None,
bias_attr=False)
self.fc_2 = FC(self.full_name(),
1,
num_flatten_dims=2,
act=None,
bias_attr=False)
def __init__(self, decoder_size):
super(SimpleAttention, self).__init__()
self.fc_1 = Linear(
decoder_size, decoder_size, act=None, bias_attr=False)
self.fc_2 = Linear(decoder_size, 1, act=None, bias_attr=False)
def forward(self, encoder_vec, encoder_proj, decoder_state):
......@@ -311,22 +295,18 @@ class SimpleAttention(fluid.dygraph.Layer):
class GRUDecoderWithAttention(fluid.dygraph.Layer):
def __init__(self, scope_name, decoder_size, num_classes):
super(GRUDecoderWithAttention, self).__init__(scope_name)
self.simple_attention = SimpleAttention(self.full_name(), decoder_size)
self.fc_1_layer = FC(self.full_name(),
size=decoder_size * 3,
bias_attr=False)
self.fc_2_layer = FC(self.full_name(),
size=decoder_size * 3,
bias_attr=False)
def __init__(self, decoder_size, num_classes):
super(GRUDecoderWithAttention, self).__init__()
self.simple_attention = SimpleAttention(decoder_size)
self.fc_1_layer = Linear(
Config.encoder_size * 2, decoder_size * 3, bias_attr=False)
self.fc_2_layer = Linear(
decoder_size, decoder_size * 3, bias_attr=False)
self.gru_unit = GRUUnit(
size=decoder_size * 3, param_attr=None, bias_attr=None)
self.out_layer = FC(self.full_name(),
size=num_classes + 2,
bias_attr=None,
act='softmax')
self.out_layer = Linear(
decoder_size, num_classes + 2, bias_attr=None, act='softmax')
self.decoder_size = decoder_size
......@@ -357,17 +337,18 @@ class GRUDecoderWithAttention(fluid.dygraph.Layer):
class OCRAttention(fluid.dygraph.Layer):
def __init__(self, scope_name):
super(OCRAttention, self).__init__(scope_name)
self.encoder_net = EncoderNet(self.full_name())
self.fc = FC(self.full_name(),
size=Config.decoder_size,
bias_attr=False,
act='relu')
def __init__(self):
super(OCRAttention, self).__init__()
self.encoder_net = EncoderNet()
self.fc = Linear(
Config.encoder_size,
Config.decoder_size,
bias_attr=False,
act='relu')
self.embedding = Embedding(
[Config.num_classes + 2, Config.word_vector_dim], dtype='float32')
self.gru_decoder_with_attention = GRUDecoderWithAttention(
self.full_name(), Config.decoder_size, Config.num_classes)
Config.decoder_size, Config.num_classes)
def forward(self, inputs, label_in):
gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs)
......@@ -425,14 +406,15 @@ class TestDygraphOCRAttention(unittest.TestCase):
fluid.default_main_program().random_seed = seed
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
ocr_attention = OCRAttention("ocr_attention")
ocr_attention = OCRAttention()
if Config.learning_rate_decay == "piecewise_decay":
learning_rate = fluid.layers.piecewise_decay(
[50000], [Config.LR, Config.LR * 0.01])
else:
learning_rate = Config.LR
optimizer = fluid.optimizer.SGD(learning_rate=0.001)
optimizer = fluid.optimizer.SGD(
learning_rate=0.001, parameter_list=ocr_attention.parameters())
dy_param_init_value = {}
for param in ocr_attention.parameters():
dy_param_init_value[param.name] = param.numpy()
......@@ -478,7 +460,7 @@ class TestDygraphOCRAttention(unittest.TestCase):
# print("static start")
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
ocr_attention = OCRAttention("ocr_attention")
ocr_attention = OCRAttention()
if Config.learning_rate_decay == "piecewise_decay":
learning_rate = fluid.layers.piecewise_decay(
......
......@@ -23,17 +23,17 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer, Adam
from paddle.fluid.dygraph.nn import FC
from paddle.fluid.dygraph import Linear
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
class MLP(fluid.Layer):
def __init__(self, name_scope, param_attr=None, bias_attr=None):
super(MLP, self).__init__(name_scope)
def __init__(self, param_attr=None, bias_attr=None):
super(MLP, self).__init__()
self._fc1 = FC(self.full_name(), 10)
self._fc2 = FC(self.full_name(), 10)
self._fc1 = Linear(784, 10)
self._fc2 = Linear(10, 10)
def forward(self, inputs):
y = self._fc1(inputs)
......@@ -45,13 +45,16 @@ class TestImperativeOptimizerBase(unittest.TestCase):
def setUp(self):
self.batch_num = 20
def get_optimizer_dygraph(self, parameter_list):
raise NotImplementedError()
def get_optimizer(self):
raise NotImplementedError()
def reader_decorator(self, reader):
def _reader_imple():
for item in reader():
image = np.array(item[0]).reshape(1, 28, 28)
image = np.array(item[0]).reshape(1, 784)
label = np.array(item[1]).astype('int64').reshape(1)
yield image, label
......@@ -65,8 +68,9 @@ class TestImperativeOptimizerBase(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
mlp = MLP('mlp')
optimizer = self.get_optimizer()
mlp = MLP()
optimizer = self.get_optimizer_dygraph(
parameter_list=mlp.parameters())
batch_py_reader = fluid.io.PyReader(capacity=1)
batch_py_reader.decorate_sample_list_generator(
......@@ -85,6 +89,7 @@ class TestImperativeOptimizerBase(unittest.TestCase):
label = data[1]
label.stop_gradient = True
img = fluid.layers.reshape(img, shape=[batch_size, -1])
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
dy_out = avg_loss.numpy()
......@@ -107,7 +112,7 @@ class TestImperativeOptimizerBase(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mlp = MLP('mlp')
mlp = MLP()
optimizer = self.get_optimizer()
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
......@@ -115,6 +120,7 @@ class TestImperativeOptimizerBase(unittest.TestCase):
img = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
img = fluid.layers.reshape(img, shape=[batch_size, -1])
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
optimizer.minimize(avg_loss)
......@@ -162,6 +168,15 @@ class TestImperativeOptimizerBase(unittest.TestCase):
class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
bd = [3, 6, 9]
optimizer = SGDOptimizer(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd,
values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
bd = [3, 6, 9]
optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
......@@ -173,6 +188,16 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.natural_exp_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
learning_rate=0.1,
......@@ -186,6 +211,16 @@ class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
learning_rate=0.1,
......@@ -199,6 +234,16 @@ class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = Adam(
learning_rate=fluid.layers.inverse_time_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay(
learning_rate=0.1,
......@@ -212,6 +257,13 @@ class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.polynomial_decay(
learning_rate=0.1, decay_steps=5, cycle=self.cycle),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
learning_rate=0.1, decay_steps=5, cycle=self.cycle))
......@@ -227,6 +279,13 @@ class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.cosine_decay(
learning_rate=0.1, step_each_epoch=10000, epochs=120),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
learning_rate=0.1, step_each_epoch=10000, epochs=120))
......@@ -237,6 +296,13 @@ class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.noam_decay(
d_model=512, warmup_steps=8000),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
d_model=512, warmup_steps=8000))
......
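All of the optimizer-test hunks above apply the same migration: in dygraph mode the optimizer no longer discovers parameters through the tracer, so each get_optimizer_dygraph variant is handed the model's parameter_list explicitly. A minimal sketch of that usage, with an illustrative Linear layer and a plain float learning rate standing in for the test models and decay schedules:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.optimizer import SGDOptimizer

with fluid.dygraph.guard():
    model = fluid.dygraph.Linear(784, 10)   # any dygraph Layer works here; sizes are illustrative
    sgd = SGDOptimizer(
        learning_rate=0.01,
        parameter_list=model.parameters())  # now passed explicitly in dygraph
    x = fluid.dygraph.to_variable(np.random.rand(4, 784).astype('float32'))
    loss = fluid.layers.reduce_mean(model(x))
    loss.backward()
    sgd.minimize(loss)
    model.clear_gradients()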
......@@ -38,7 +38,8 @@ class TestImperativePartitialBackward(unittest.TestCase):
for param in fc2.parameters():
self.assertIsNone(param._grad_ivar())
optimizer = fluid.optimizer.AdamOptimizer()
optimizer = fluid.optimizer.AdamOptimizer(parameter_list=(
fc1.parameters() + fc2.parameters()))
_, params_grads = optimizer.minimize(loss)
self.assertListEqual(
......
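When a single optimizer has to update several independent layers, the parameters() lists are simply concatenated, as in the AdamOptimizer call above. A short sketch with illustrative layer sizes:

import paddle.fluid as fluid

with fluid.dygraph.guard():
    fc1 = fluid.dygraph.Linear(4, 8)
    fc2 = fluid.dygraph.Linear(8, 2)
    # parameters() returns a plain Python list, so lists from several
    # layers can be concatenated and handed to one optimizer
    adam = fluid.optimizer.AdamOptimizer(
        learning_rate=0.001,
        parameter_list=fc1.parameters() + fc2.parameters())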
......@@ -30,13 +30,12 @@ from utils import DyGraphProgramDescTracerTestHelper, is_equal_program
class SimpleLSTMRNN(fluid.Layer):
def __init__(self,
name_scope,
hidden_size,
num_steps,
num_layers=2,
init_scale=0.1,
dropout=None):
super(SimpleLSTMRNN, self).__init__(name_scope)
super(SimpleLSTMRNN, self).__init__()
self._hidden_size = hidden_size
self._num_layers = num_layers
self._init_scale = init_scale
......@@ -45,8 +44,9 @@ class SimpleLSTMRNN(fluid.Layer):
self._num_steps = num_steps
self.cell_array = []
self.hidden_array = []
self._create_parameter()
def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
def _create_parameter(self):
self.weight_1_arr = []
self.weight_2_arr = []
self.bias_arr = []
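With _build_once gone, SimpleLSTMRNN creates its weights eagerly from __init__ via create_parameter. A hedged sketch of the same pattern on a made-up layer, using the shape/dtype keywords this commit settles on:

import numpy as np
import paddle.fluid as fluid

class TinyCell(fluid.Layer):  # illustrative layer, not part of this diff
    def __init__(self, hidden_size):
        super(TinyCell, self).__init__()
        # parameters are created eagerly in __init__ now that _build_once is removed
        self.weight = self.create_parameter(
            shape=[hidden_size, hidden_size], dtype='float32')
        self.bias = self.create_parameter(
            shape=[hidden_size], dtype='float32', is_bias=True)

    def forward(self, x):
        return fluid.layers.elementwise_add(
            fluid.layers.matmul(x, self.weight), self.bias)

with fluid.dygraph.guard():
    cell = TinyCell(16)
    out = cell(fluid.dygraph.to_variable(np.ones([4, 16]).astype('float32')))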
......@@ -135,7 +135,6 @@ class SimpleLSTMRNN(fluid.Layer):
class PtbModel(fluid.Layer):
def __init__(self,
name_scope,
hidden_size,
vocab_size,
num_layers=2,
......@@ -143,7 +142,7 @@ class PtbModel(fluid.Layer):
init_scale=0.1,
is_sparse=False,
dropout=None):
super(PtbModel, self).__init__(name_scope)
super(PtbModel, self).__init__()
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.init_scale = init_scale
......@@ -151,7 +150,6 @@ class PtbModel(fluid.Layer):
self.num_steps = num_steps
self.dropout = dropout
self.simple_lstm_rnn = SimpleLSTMRNN(
self.full_name(),
hidden_size,
num_steps,
num_layers=num_layers,
......@@ -231,7 +229,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
fluid.default_main_program().random_seed = seed
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
......@@ -239,7 +236,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
init_scale=init_scale,
is_sparse=is_sparse)
sgd = SGDOptimizer(learning_rate=1e-3)
sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
......@@ -298,7 +296,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
......
......@@ -49,7 +49,6 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
backward_strategy.sort_sum_gradient = True
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
......@@ -57,7 +56,8 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
init_scale=init_scale,
is_sparse=is_sparse)
sgd = SGDOptimizer(learning_rate=1e-3)
sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
......@@ -97,7 +97,6 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
......
......@@ -86,7 +86,8 @@ class TestImperativeMnist(unittest.TestCase):
loss_probs = fluid.layers.elementwise_mul(dy_reward, loss_probs)
loss = fluid.layers.reduce_sum(loss_probs)
sgd = SGDOptimizer(learning_rate=1e-3)
sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=policy.parameters())
dy_param_init_value = {}
......
......@@ -21,7 +21,7 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid import Conv2D, Pool2D, BatchNorm, FC
from paddle.fluid import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
from utils import DyGraphProgramDescTracerTestHelper, is_equal_program
......@@ -44,7 +44,7 @@ train_parameters = {
}
def optimizer_setting(params):
def optimizer_setting(params, parameter_list=None):
ls = params["learning_strategy"]
if ls["name"] == "piecewise_decay":
if "total_images" not in params:
......@@ -58,14 +58,18 @@ def optimizer_setting(params):
base_lr = params["lr"]
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.SGD(learning_rate=0.01,
parameter_list=parameter_list)
else:
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
# TODO(minqiyang): Add learning rate scheduler support to dygraph mode
# optimizer = fluid.optimizer.Momentum(
# learning_rate=params["lr"],
# learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
# momentum=0.9,
# regularization=fluid.regularizer.L2Decay(1e-4))
# learning_rate=params["lr"],
# learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
# momentum=0.9,
# regularization=fluid.regularizer.L2Decay(1e-4))
return optimizer
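optimizer_setting keeps working for both execution modes by passing parameter_list only under fluid.in_dygraph_mode(); in static graph the optimizer still collects parameters from the program. The same shape, reduced to its core:

import paddle.fluid as fluid

def build_optimizer(parameter_list=None):  # hypothetical helper mirroring optimizer_setting
    if fluid.in_dygraph_mode():
        # dygraph: parameters must be handed over explicitly
        return fluid.optimizer.SGD(learning_rate=0.01,
                                   parameter_list=parameter_list)
    # static graph: parameters are picked up from the default program
    return fluid.optimizer.SGD(learning_rate=0.01)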
......@@ -147,8 +151,8 @@ class BottleneckBlock(fluid.Layer):
class ResNet(fluid.Layer):
def __init__(self, name_scope, layers=50, class_dim=102):
super(ResNet, self).__init__(name_scope)
def __init__(self, layers=50, class_dim=102):
super(ResNet, self).__init__()
self.layers = layers
supported_layers = [50, 101, 152]
......@@ -187,14 +191,17 @@ class ResNet(fluid.Layer):
self.pool2d_avg = Pool2D(
pool_size=7, pool_type='avg', global_pooling=True)
self.pool2d_avg_output = num_filters[-1] * 4 * 1 * 1
import math
stdv = 1.0 / math.sqrt(2048 * 1.0)
self.out = FC(self.full_name(),
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
self.out = Linear(
self.pool2d_avg_output,
class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
def forward(self, inputs):
y = self.conv(inputs)
......@@ -202,6 +209,7 @@ class ResNet(fluid.Layer):
for bottleneck_block in self.bottleneck_block_list:
y = bottleneck_block(y)
y = self.pool2d_avg(y)
y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output])
y = self.out(y)
return y
......@@ -228,8 +236,9 @@ class TestDygraphResnet(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
resnet = ResNet("resnet")
optimizer = optimizer_setting(train_parameters)
resnet = ResNet()
optimizer = optimizer_setting(
train_parameters, parameter_list=resnet.parameters())
np.random.seed(seed)
import random
random.seed = seed
......@@ -315,7 +324,7 @@ class TestDygraphResnet(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
resnet = ResNet("resnet")
resnet = ResNet()
optimizer = optimizer_setting(train_parameters)
np.random.seed(seed)
......
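Replacing FC with Linear forces two things visible in the ResNet hunks: Linear takes an explicit input dimension, and it does not flatten its input, so the 4-D pooled feature map is reshaped to [-1, channels] before the final layer. A small sketch of that tail end of the network, with illustrative shapes:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Pool2D, Linear

with fluid.dygraph.guard():
    pool = Pool2D(pool_size=7, pool_type='avg', global_pooling=True)
    feat_dim = 2048                      # num_filters[-1] * 4 in the ResNet above
    out = Linear(feat_dim, 102, act='softmax')
    x = fluid.dygraph.to_variable(
        np.random.rand(2, feat_dim, 7, 7).astype('float32'))
    y = pool(x)                                         # [2, 2048, 1, 1]
    y = fluid.layers.reshape(y, shape=[-1, feat_dim])   # Linear does not flatten
    y = out(y)                                          # [2, 102]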
......@@ -40,7 +40,7 @@ train_parameters = {
}
def optimizer_setting(params):
def optimizer_setting(params, parameter_list=None):
ls = params["learning_strategy"]
if ls["name"] == "piecewise_decay":
if "total_images" not in params:
......@@ -54,14 +54,18 @@ def optimizer_setting(params):
base_lr = params["lr"]
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.SGD(learning_rate=0.01,
parameter_list=parameter_list)
else:
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
# TODO(minqiyang): Add learning rate scheduler support to dygraph mode
# optimizer = fluid.optimizer.Momentum(
# learning_rate=params["lr"],
# learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
# momentum=0.9,
# regularization=fluid.regularizer.L2Decay(1e-4))
# learning_rate=params["lr"],
# learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
# momentum=0.9,
# regularization=fluid.regularizer.L2Decay(1e-4))
return optimizer
......@@ -77,8 +81,9 @@ class TestDygraphResnetSortGradient(unittest.TestCase):
fluid.default_main_program().random_seed = seed
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
resnet = ResNet("resnet")
optimizer = optimizer_setting(train_parameters)
resnet = ResNet()
optimizer = optimizer_setting(
train_parameters, parameter_list=resnet.parameters())
np.random.seed(seed)
import random
random.seed = seed
......@@ -138,7 +143,7 @@ class TestDygraphResnetSortGradient(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
resnet = ResNet("resnet")
resnet = ResNet()
optimizer = optimizer_setting(train_parameters)
np.random.seed(seed)
......
......@@ -233,8 +233,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
adam = Adam(learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr))
adam = Adam(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
parameter_list=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
......@@ -314,8 +316,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
adam = Adam(learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr))
adam = Adam(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
parameter_list=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
......@@ -418,8 +422,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
adam = Adam(learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr))
adam = Adam(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
parameter_list=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
......@@ -521,8 +527,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
adam = Adam(learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr))
adam = Adam(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
parameter_list=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
......@@ -633,7 +641,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
beta1=0.8,
beta2=0.6)
beta2=0.6,
parameter_list=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
......@@ -724,7 +733,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
beta1=0.8,
beta2=0.6)
beta2=0.6,
parameter_list=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
......@@ -816,7 +826,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
beta1=0.8,
beta2=0.6)
beta2=0.6,
parameter_list=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
......
......@@ -21,7 +21,7 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
......@@ -42,7 +42,7 @@ train_parameters = {
}
def optimizer_setting(params):
def optimizer_setting(params, parameter_list=None):
ls = params["learning_strategy"]
if ls["name"] == "piecewise_decay":
if "total_images" not in params:
......@@ -56,7 +56,11 @@ def optimizer_setting(params):
#bd = [step * e for e in ls["epochs"]]
#base_lr = params["lr"]
#lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.SGD(learning_rate=0.01,
parameter_list=parameter_list)
else:
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
return optimizer
......@@ -91,25 +95,27 @@ class ConvBNLayer(fluid.dygraph.Layer):
class SqueezeExcitation(fluid.dygraph.Layer):
def __init__(self, name_scope, num_channels, reduction_ratio):
def __init__(self, num_channels, reduction_ratio):
super(SqueezeExcitation, self).__init__(name_scope)
super(SqueezeExcitation, self).__init__()
self._num_channels = num_channels
self._pool = Pool2D(pool_size=0, pool_type='avg', global_pooling=True)
self._squeeze = FC(
self.full_name(),
size=num_channels // reduction_ratio,
self._squeeze = Linear(
num_channels,
num_channels // reduction_ratio,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.05)),
act='relu')
self._excitation = FC(
self.full_name(),
size=num_channels,
self._excitation = Linear(
num_channels // reduction_ratio,
num_channels,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.05)),
act='sigmoid')
def forward(self, input):
y = self._pool(input)
y = fluid.layers.reshape(y, shape=[-1, self._num_channels])
y = self._squeeze(y)
y = self._excitation(y)
y = fluid.layers.elementwise_mul(x=input, y=y, axis=0)
......@@ -141,9 +147,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
act='relu')
self.scale = SqueezeExcitation(
self.full_name(),
num_channels=num_filters * 4,
reduction_ratio=reduction_ratio)
num_channels=num_filters * 4, reduction_ratio=reduction_ratio)
if not shortcut:
self.short = ConvBNLayer(
......@@ -175,8 +179,8 @@ class BottleneckBlock(fluid.dygraph.Layer):
class SeResNeXt(fluid.dygraph.Layer):
def __init__(self, name_scope, layers=50, class_dim=102):
super(SeResNeXt, self).__init__(name_scope)
def __init__(self, layers=50, class_dim=102):
super(SeResNeXt, self).__init__()
self.layers = layers
supported_layers = [50, 101, 152]
......@@ -203,7 +207,7 @@ class SeResNeXt(fluid.dygraph.Layer):
num_filters = [128, 256, 512, 1024]
self.conv0 = ConvBNLayer(
num_channels=3,
num_filters=3,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
......@@ -216,27 +220,29 @@ class SeResNeXt(fluid.dygraph.Layer):
num_filters = [128, 256, 512, 1024]
self.conv0 = ConvBNLayer(
num_channels=3,
num_filters=3,
filter_size=7,
num_filters=64,
filter_size=3,
stride=2,
act='relu')
self.conv1 = ConvBNLayer(
num_channels=3,
num_filters=3,
filter_size=7,
num_channels=64,
num_filters=64,
filter_size=3,
stride=2,
act='relu')
self.conv2 = ConvBNLayer(
num_channels=7,
num_filters=3,
filter_size=7,
stride=2,
num_channels=64,
num_filters=128,
filter_size=3,
stride=1,
act='relu')
self.pool = Pool2D(
pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
self.bottleneck_block_list = []
num_channels = 64
if layers == 152:
num_channels = 128
for block in range(len(depth)):
shortcut = False
for i in range(depth[block]):
......@@ -258,11 +264,14 @@ class SeResNeXt(fluid.dygraph.Layer):
import math
stdv = 1.0 / math.sqrt(2048 * 1.0)
self.out = FC(self.full_name(),
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
self.pool2d_avg_output = num_filters[-1] * 4 * 1 * 1
self.out = Linear(
self.pool2d_avg_output,
class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
def forward(self, inputs):
if self.layers == 50 or self.layers == 101:
......@@ -270,14 +279,15 @@ class SeResNeXt(fluid.dygraph.Layer):
y = self.pool(y)
elif self.layers == 152:
y = self.conv0(inputs)
y = self.conv1(inputs)
y = self.conv2(inputs)
y = self.conv1(y)
y = self.conv2(y)
y = self.pool(y)
for bottleneck_block in self.bottleneck_block_list:
y = bottleneck_block(y)
y = self.pool2d_avg(y)
y = fluid.layers.dropout(y, dropout_prob=0.2)
y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output])
y = self.out(y)
return y
......@@ -302,8 +312,9 @@ class TestImperativeResneXt(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
se_resnext = SeResNeXt("se_resnext")
optimizer = optimizer_setting(train_parameters)
se_resnext = SeResNeXt()
optimizer = optimizer_setting(
train_parameters, parameter_list=se_resnext.parameters())
np.random.seed(seed)
import random
random.seed = seed
......@@ -364,7 +375,7 @@ class TestImperativeResneXt(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
se_resnext = SeResNeXt("se_resnext")
se_resnext = SeResNeXt()
optimizer = optimizer_setting(train_parameters)
np.random.seed(seed)
......
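The corrected 152-layer stem above also illustrates the channel-chaining rule for stacked convolutions: each layer's num_channels must equal the previous layer's num_filters, and each call must consume the previous output rather than the raw input. A reduced sketch (padding and activation are illustrative):

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Conv2D

with fluid.dygraph.guard():
    conv0 = Conv2D(num_channels=3, num_filters=64, filter_size=3, stride=2, padding=1, act='relu')
    conv1 = Conv2D(num_channels=64, num_filters=64, filter_size=3, stride=2, padding=1, act='relu')
    conv2 = Conv2D(num_channels=64, num_filters=128, filter_size=3, stride=1, padding=1, act='relu')
    x = fluid.dygraph.to_variable(np.random.rand(1, 3, 224, 224).astype('float32'))
    y = conv2(conv1(conv0(x)))  # each call consumes the previous output, not the raw input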
......@@ -49,23 +49,27 @@ class TestSimpleNet(unittest.TestCase):
with fluid.dygraph.guard(place):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient
adam = SGDOptimizer(learning_rate=0.001)
# grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = to_variable(input_word)
simplenet = SimpleNet(20, 32, dtype)
adam = SGDOptimizer(
learning_rate=0.001,
parameter_list=simplenet.parameters())
input_emb, emb = simplenet(input)
try:
emb._w.gradient()
except ValueError as e:
pass
assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
e)
try:
input_emb.gradient()
except ValueError as e:
pass
assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
e)
input_emb.backward(backward_strategy)
adam.minimize(input_emb) # grad_clip=grad_clip
......@@ -75,13 +79,11 @@ class TestSimpleNet(unittest.TestCase):
try:
emb._w.gradient()
except ValueError as e:
pass
assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
e)
input_emb.clear_gradient()
try:
input_emb.gradient()
except ValueError as e:
pass
input_emb.gradient()
def test_selectedrows_gradient2(self):
places = [fluid.CPUPlace()]
......@@ -93,7 +95,6 @@ class TestSimpleNet(unittest.TestCase):
with fluid.dygraph.guard(place):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient
adam = SGDOptimizer(learning_rate=0.001)
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
5.0)
......@@ -101,16 +102,21 @@ class TestSimpleNet(unittest.TestCase):
input = to_variable(input_word)
simplenet = SimpleNet(20, 32, "float32")
adam = SGDOptimizer(
learning_rate=0.001,
parameter_list=simplenet.parameters())
input_emb, emb = simplenet(input)
try:
emb._w.gradient()
except ValueError as e:
pass
assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
e)
try:
input_emb.gradient()
except ValueError as e:
pass
assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
e)
input_emb.backward(backward_strategy)
adam.minimize(input_emb, grad_clip=grad_clip)
......@@ -120,13 +126,11 @@ class TestSimpleNet(unittest.TestCase):
try:
emb._w.gradient()
except ValueError as e:
pass
assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
e)
input_emb.clear_gradient()
try:
input_emb.gradient()
except ValueError as e:
pass
input_emb.gradient()
if __name__ == '__main__':
......
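The test now asserts the exact "has no grad" message instead of swallowing the ValueError. The underlying contract, in a minimal sketch: a gradient is only retained for variables whose stop_gradient flag is False before backward runs.

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    x = fluid.dygraph.to_variable(np.ones([2, 2]).astype('float32'))
    x.stop_gradient = False   # without this, x.gradient() raises the ValueError asserted above
    y = fluid.layers.reduce_sum(fluid.layers.scale(x, scale=2.0))
    y.backward()
    print(x.gradient())       # [[2. 2.], [2. 2.]]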
......@@ -114,7 +114,9 @@ class TestDygraphSimpleNet(unittest.TestCase):
is_sparse=is_sparse,
dtype=dtype)
sgd = SGDOptimizer(learning_rate=1e-3)
sgd = SGDOptimizer(
learning_rate=1e-3,
parameter_list=simple_net.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
......
......@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
import paddle.fluid as fluid
from paddle.fluid import Embedding, LayerNorm, FC, Layer
from paddle.fluid import Embedding, LayerNorm, Linear, Layer
from paddle.fluid.dygraph import to_variable, guard
from paddle.fluid.dygraph.jit import TracedLayer
from test_imperative_base import new_program_scope
......@@ -378,15 +378,10 @@ class PrePostProcessLayer(Layer):
class PositionwiseFeedForwardLayer(Layer):
def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate):
super(PositionwiseFeedForwardLayer, self).__init__(name_scope)
self._i2h = FC(name_scope=self.full_name(),
size=d_inner_hid,
num_flatten_dims=2,
act="relu")
self._h2o = FC(name_scope=self.full_name(),
size=d_hid,
num_flatten_dims=2)
def __init__(self, d_inner_hid, d_hid, dropout_rate):
super(PositionwiseFeedForwardLayer, self).__init__()
self._i2h = Linear(d_hid, d_inner_hid, act="relu")
self._h2o = Linear(d_inner_hid, d_hid)
self._dropout_rate = dropout_rate
def forward(self, x):
......@@ -403,7 +398,6 @@ class PositionwiseFeedForwardLayer(Layer):
class MultiHeadAttentionLayer(Layer):
def __init__(self,
name_scope,
d_key,
d_value,
d_model,
......@@ -412,28 +406,16 @@ class MultiHeadAttentionLayer(Layer):
cache=None,
gather_idx=None,
static_kv=False):
super(MultiHeadAttentionLayer, self).__init__(name_scope)
super(MultiHeadAttentionLayer, self).__init__()
self._n_head = n_head
self._d_key = d_key
self._d_value = d_value
self._d_model = d_model
self._dropout_rate = dropout_rate
self._q_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._k_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._v_fc = FC(name_scope=self.full_name(),
size=d_value * n_head,
bias_attr=False,
num_flatten_dims=2)
self._proj_fc = FC(name_scope=self.full_name(),
size=self._d_model,
bias_attr=False,
num_flatten_dims=2)
self._q_fc = Linear(self._d_model, d_key * n_head, bias_attr=False)
self._k_fc = Linear(self._d_model, d_key * n_head, bias_attr=False)
self._v_fc = Linear(self._d_model, d_value * n_head, bias_attr=False)
self._proj_fc = Linear(d_value * n_head, self._d_model, bias_attr=False)
def forward(self, queries, keys, values, attn_bias):
# compute q ,k ,v
......@@ -490,7 +472,6 @@ class MultiHeadAttentionLayer(Layer):
class EncoderSubLayer(Layer):
def __init__(self,
name_scope,
n_head,
d_key,
d_value,
......@@ -502,7 +483,7 @@ class EncoderSubLayer(Layer):
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderSubLayer, self).__init__(name_scope)
super(EncoderSubLayer, self).__init__()
self._preprocess_cmd = preprocess_cmd
self._postprocess_cmd = postprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
......@@ -510,14 +491,13 @@ class EncoderSubLayer(Layer):
self._preprocess_layer = PrePostProcessLayer(d_model,
self._preprocess_cmd, 3)
self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(), d_key, d_value, d_model, n_head,
attention_dropout)
d_key, d_value, d_model, n_head, attention_dropout)
self._postprocess_layer = PrePostProcessLayer(
d_model, self._postprocess_cmd, None)
self._preprocess_layer2 = PrePostProcessLayer(d_model,
self._preprocess_cmd, 3)
self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout)
d_inner_hid, d_model, relu_dropout)
self._postprocess_layer2 = PrePostProcessLayer(
d_model, self._postprocess_cmd, None)
......@@ -540,7 +520,6 @@ class EncoderSubLayer(Layer):
class EncoderLayer(Layer):
def __init__(self,
name_scope,
n_layer,
n_head,
d_key,
......@@ -553,7 +532,7 @@ class EncoderLayer(Layer):
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderLayer, self).__init__(name_scope)
super(EncoderLayer, self).__init__()
self._preprocess_cmd = preprocess_cmd
self._encoder_sublayers = list()
self._prepostprocess_dropout = prepostprocess_dropout
......@@ -564,10 +543,10 @@ class EncoderLayer(Layer):
self._encoder_sublayers.append(
self.add_sublayer(
'esl_%d' % i,
EncoderSubLayer(
self.full_name(), n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)))
EncoderSubLayer(n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout,
preprocess_cmd, postprocess_cmd)))
def forward(self, enc_input, attn_bias):
for i in range(self._n_layer):
......@@ -580,7 +559,6 @@ class EncoderLayer(Layer):
class PrepareEncoderDecoderLayer(Layer):
def __init__(self,
name_scope,
src_vocab_size,
src_emb_dim,
src_max_len,
......@@ -588,7 +566,7 @@ class PrepareEncoderDecoderLayer(Layer):
is_sparse=False,
word_emb_param_name=None,
pos_enc_param_name=None):
super(PrepareEncoderDecoderLayer, self).__init__(name_scope)
super(PrepareEncoderDecoderLayer, self).__init__()
self._src_max_len = src_max_len
self._src_emb_dim = src_emb_dim
self._src_vocab_size = src_vocab_size
......@@ -634,7 +612,6 @@ class PrepareEncoderDecoderLayer(Layer):
class WrapEncoderLayer(Layer):
def __init__(self,
name_cope,
src_vocab_size,
max_length,
n_layer,
......@@ -653,10 +630,9 @@ class WrapEncoderLayer(Layer):
"""
The wrapper assembles together all needed layers for the encoder.
"""
super(WrapEncoderLayer, self).__init__(name_cope)
super(WrapEncoderLayer, self).__init__()
self._prepare_encoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
src_vocab_size,
d_model,
max_length,
......@@ -664,10 +640,10 @@ class WrapEncoderLayer(Layer):
is_sparse=is_sparse,
word_emb_param_name=word_emb_param_names[0],
pos_enc_param_name=pos_enc_param_names[0])
self._encoder = EncoderLayer(
self.full_name(), n_layer, n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)
self._encoder = EncoderLayer(n_layer, n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout,
preprocess_cmd, postprocess_cmd)
def forward(self, enc_inputs):
src_word, src_pos, src_slf_attn_bias = enc_inputs
......@@ -678,7 +654,6 @@ class WrapEncoderLayer(Layer):
class DecoderSubLayer(Layer):
def __init__(self,
name_scope,
n_head,
d_key,
d_value,
......@@ -691,14 +666,13 @@ class DecoderSubLayer(Layer):
postprocess_cmd,
cache=None,
gather_idx=None):
super(DecoderSubLayer, self).__init__(name_scope)
super(DecoderSubLayer, self).__init__()
self._postprocess_cmd = postprocess_cmd
self._preprocess_cmd = preprocess_cmd
self._prepostprcess_dropout = prepostprocess_dropout
self._pre_process_layer = PrePostProcessLayer(d_model, preprocess_cmd,
3)
self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(),
d_key,
d_value,
d_model,
......@@ -711,7 +685,6 @@ class DecoderSubLayer(Layer):
self._pre_process_layer2 = PrePostProcessLayer(d_model, preprocess_cmd,
3)
self._multihead_attention_layer2 = MultiHeadAttentionLayer(
self.full_name(),
d_key,
d_value,
d_model,
......@@ -725,7 +698,7 @@ class DecoderSubLayer(Layer):
self._pre_process_layer3 = PrePostProcessLayer(d_model, preprocess_cmd,
3)
self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout)
d_inner_hid, d_model, relu_dropout)
self._post_process_layer3 = PrePostProcessLayer(d_model,
postprocess_cmd, None)
......@@ -757,7 +730,6 @@ class DecoderSubLayer(Layer):
class DecoderLayer(Layer):
def __init__(self,
name_scope,
n_layer,
n_head,
d_key,
......@@ -771,7 +743,7 @@ class DecoderLayer(Layer):
postprocess_cmd,
caches=None,
gather_idx=None):
super(DecoderLayer, self).__init__(name_scope)
super(DecoderLayer, self).__init__()
self._pre_process_layer = PrePostProcessLayer(d_model, preprocess_cmd,
3)
self._decoder_sub_layers = list()
......@@ -783,7 +755,6 @@ class DecoderLayer(Layer):
self.add_sublayer(
'dsl_%d' % i,
DecoderSubLayer(
self.full_name(),
n_head,
d_key,
d_value,
......@@ -812,7 +783,6 @@ class DecoderLayer(Layer):
class WrapDecoderLayer(Layer):
def __init__(self,
name_scope,
trg_vocab_size,
max_length,
n_layer,
......@@ -833,10 +803,9 @@ class WrapDecoderLayer(Layer):
"""
The wrapper assembles together all needed layers for the encoder.
"""
super(WrapDecoderLayer, self).__init__(name_scope)
super(WrapDecoderLayer, self).__init__()
self._prepare_decoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
trg_vocab_size,
d_model,
max_length,
......@@ -845,7 +814,6 @@ class WrapDecoderLayer(Layer):
word_emb_param_name=word_emb_param_names[1],
pos_enc_param_name=pos_enc_param_names[1])
self._decoder_layer = DecoderLayer(
self.full_name(),
n_layer,
n_head,
d_key,
......@@ -861,9 +829,7 @@ class WrapDecoderLayer(Layer):
gather_idx=gather_idx)
self._weight_sharing = weight_sharing
if not weight_sharing:
self._fc = FC(self.full_name(),
size=trg_vocab_size,
bias_attr=False)
self._fc = Linear(d_model, trg_vocab_size, bias_attr=False)
def forward(self, dec_inputs=None, enc_output=None):
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
......@@ -891,7 +857,6 @@ class WrapDecoderLayer(Layer):
class TransFormer(Layer):
def __init__(self,
name_scope,
src_vocab_size,
trg_vocab_size,
max_length,
......@@ -911,7 +876,7 @@ class TransFormer(Layer):
use_py_reader=False,
is_test=False,
is_sparse=False):
super(TransFormer, self).__init__(name_scope)
super(TransFormer, self).__init__()
self._label_smooth_eps = label_smooth_eps
self._trg_vocab_size = trg_vocab_size
if weight_sharing:
......@@ -919,7 +884,6 @@ class TransFormer(Layer):
"Vocabularies in source and target should be same for weight sharing."
)
self._wrap_encoder_layer = WrapEncoderLayer(
self.full_name(),
src_vocab_size,
max_length,
n_layer,
......@@ -936,7 +900,6 @@ class TransFormer(Layer):
weight_sharing,
is_sparse=is_sparse)
self._wrap_decoder_layer = WrapDecoderLayer(
self.full_name(),
trg_vocab_size,
max_length,
n_layer,
......@@ -991,7 +954,6 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
transformer = TransFormer(
'transformer',
ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1,
......@@ -1020,9 +982,12 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
learning_rate=learning_rate,
beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps)
epsilon=TrainTaskConfig.eps,
parameter_list=transformer.parameters())
else:
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
optimizer = fluid.optimizer.SGD(
learning_rate=0.003,
parameter_list=transformer.parameters())
dy_param_init = dict()
dy_param_updated = dict()
......@@ -1073,7 +1038,6 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
transformer = TransFormer(
'transformer',
ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1,
......
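In the transformer layers, FC(..., num_flatten_dims=2) becomes Linear(in_dim, out_dim), which operates on the last dimension (as the migrated layers above rely on), so rank-3 [batch, seq_len, feature] tensors pass through with only the feature size changed. A short sketch with the d_model/d_inner_hid shapes used above:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear

with fluid.dygraph.guard():
    d_model, d_inner_hid = 512, 2048
    i2h = Linear(d_model, d_inner_hid, act='relu')  # replaces FC(size=d_inner_hid, num_flatten_dims=2)
    h2o = Linear(d_inner_hid, d_model)
    x = fluid.dygraph.to_variable(
        np.random.rand(2, 6, d_model).astype('float32'))  # [batch, seq_len, d_model]
    y = h2o(i2h(x))  # Linear matmuls over the last dim, so the result is [2, 6, 512]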
......@@ -197,7 +197,8 @@ class TestLayer(LayerTest):
fc1_bias_init = fc1.bias.detach()
loss1.backward()
optimizer1 = fluid.optimizer.SGD(learning_rate=0.1)
optimizer1 = fluid.optimizer.SGD(learning_rate=0.1,
parameter_list=fc1.parameters())
optimizer1.minimize(loss1)
fc1_weight_updated = fc1.weight.detach()
......@@ -224,7 +225,8 @@ class TestLayer(LayerTest):
out2 = fc2(base.to_variable(inp))
loss2 = fluid.layers.reduce_mean(out2)
loss2.backward()
optimizer2 = fluid.optimizer.SGD(learning_rate=0.1)
optimizer2 = fluid.optimizer.SGD(learning_rate=0.1,
parameter_list=fc2.parameters())
optimizer2.minimize(loss2)
self.assertTrue(
......