Unverified commit 94240e2e, authored by S sneaxiy, committed by GitHub


[Cherry-pick Release/2.4] Fix multi_tensor adam and momentum bug when the parameter is list of dict (#47372)

* reformat file by black

* fix multi_tensor adam/momentum bug
Parent commit: b143e008
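
For context, the bug fixed here appears when paddle.optimizer.Adam or paddle.optimizer.Momentum is created with use_multi_tensor=True and parameters passed as a list of dicts (parameter groups), which is the pattern the updated tests below exercise. The following usage sketch mirrors those tests; the model, shapes, and hyperparameter values are illustrative only and not part of the commit:

import paddle

# Multi-tensor Adam driven by parameter groups (a list of dicts) instead of
# a flat parameter list -- the configuration the fixed code path handles.
model = paddle.nn.Linear(5, 5)
params = list(model.parameters())
half = len(params) // 2

opt = paddle.optimizer.Adam(
    parameters=[
        {'params': params[:half], 'weight_decay': 0.001, 'beta1': 0.1, 'beta2': 0.99},
        {'params': params[half:], 'weight_decay': 0.001, 'beta1': 0.1, 'beta2': 0.99},
    ],
    use_multi_tensor=True,  # the multi-tensor path that previously mishandled groups
)

out = model(paddle.rand([2, 5]))
out.backward()
opt.step()
opt.clear_grad()
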
......@@ -25,10 +25,8 @@ from paddle.fluid.framework import _test_eager_guard
class TestAdamOp1(OpTest):
def setUp(self):
'''Test Adam Op with supplied attributes
'''
'''Test Adam Op with supplied attributes'''
self.op_type = "adam"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
......@@ -50,20 +48,19 @@ class TestAdamOp1(OpTest):
'Moment2': moment2,
'LearningRate': np.array([learning_rate]).astype("float32"),
'Beta1Pow': np.array([beta1_pow]).astype("float32"),
'Beta2Pow': np.array([beta2_pow]).astype("float32")
'Beta2Pow': np.array([beta2_pow]).astype("float32"),
}
self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, self.attrs)
param_out, moment1_out, moment2_out = adam_step(self.inputs, self.attrs)
self.outputs = {
'Moment1Out': moment1_out,
'Moment2Out': moment2_out,
'ParamOut': param_out,
'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
}
def test_check_output(self):
......@@ -71,13 +68,11 @@ class TestAdamOp1(OpTest):
class TestAdamOp2(OpTest):
def set_shape(self):
self.shape = (102, 105)
def setUp(self):
'''Test Adam Op with supplied attributes
'''
'''Test Adam Op with supplied attributes'''
self.op_type = "adam"
self.set_shape()
param = np.random.uniform(-1, 1, self.shape).astype("float32")
......@@ -100,20 +95,19 @@ class TestAdamOp2(OpTest):
'Moment2': moment2,
'LearningRate': np.array([learning_rate]).astype("float32"),
'Beta1Pow': np.array([beta1_pow]).astype("float32"),
'Beta2Pow': np.array([beta2_pow]).astype("float32")
'Beta2Pow': np.array([beta2_pow]).astype("float32"),
}
attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, attributes)
param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)
self.outputs = {
'Moment1Out': moment1_out,
'Moment2Out': moment2_out,
'ParamOut': param_out,
'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
}
def test_check_output(self):
......@@ -121,16 +115,13 @@ class TestAdamOp2(OpTest):
class TestAdamOnlyTailOp(TestAdamOp2):
def set_shape(self):
self.shape = (3)
self.shape = 3
class TestAdamOpMultipleSteps(OpTest):
def setUp(self):
'''Test Adam Operator with supplied attributes
'''
'''Test Adam Operator with supplied attributes'''
self.op_type = "adam"
self.num_steps = 10
......@@ -154,19 +145,20 @@ class TestAdamOpMultipleSteps(OpTest):
'Moment2': moment2,
'LearningRate': np.array([learning_rate]).astype("float32"),
'Beta1Pow': np.array([self.beta1_pow]).astype("float32"),
'Beta2Pow': np.array([self.beta2_pow]).astype("float32")
'Beta2Pow': np.array([self.beta2_pow]).astype("float32"),
}
self.attrs = {
'epsilon': epsilon,
'beta1': self.beta1,
'beta2': self.beta2
'beta2': self.beta2,
}
def test_check_output(self):
for _ in range(self.num_steps):
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, self.attrs)
param_out, moment1_out, moment2_out = adam_step(
self.inputs, self.attrs
)
beta1_pow_out = self.inputs['Beta1Pow'] * self.beta1
beta2_pow_out = self.inputs['Beta2Pow'] * self.beta2
......@@ -175,7 +167,7 @@ class TestAdamOpMultipleSteps(OpTest):
'Moment2Out': moment2_out,
'ParamOut': param_out,
'Beta1PowOut': beta1_pow_out,
'Beta2PowOut': beta2_pow_out
'Beta2PowOut': beta2_pow_out,
}
# Verify output for this step
......@@ -191,8 +183,9 @@ class TestAdamOpMultipleSteps(OpTest):
self.inputs['Beta2Pow'] = beta2_pow_out
# Randomize gradient for next step
self.inputs['Grad'] = np.random.uniform(
-1, 1, (102, 105)).astype("float32")
self.inputs['Grad'] = np.random.uniform(-1, 1, (102, 105)).astype(
"float32"
)
def test_api_eager_dygraph(self):
with _test_eager_guard():
......@@ -272,8 +265,9 @@ def adamw_step(inputs, attributes):
return param_out, moment1_out, moment2_out
def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
lazy_mode):
def adam_step_sparse(
inputs, attributes, height, rows, row_numel, np_grad, lazy_mode
):
'''
Simulate one step of the adam optimizer
:param inputs: dict of inputs
......@@ -298,13 +292,16 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
param_out = np.zeros(shape=[height, row_numel])
def update_row(row_id, update_value):
moment1_out[row_id] = beta1 * moment1[row_id] + (1 -
beta1) * update_value
moment2_out[row_id] = beta2 * moment2[row_id] + (
1 - beta2) * np.square(update_value)
moment1_out[row_id] = (
beta1 * moment1[row_id] + (1 - beta1) * update_value
)
moment2_out[row_id] = beta2 * moment2[row_id] + (1 - beta2) * np.square(
update_value
)
lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
param_out[row_id] = param[row_id] - lr_t * (
moment1_out[row_id] / (np.sqrt(moment2_out[row_id]) + epsilon))
moment1_out[row_id] / (np.sqrt(moment2_out[row_id]) + epsilon)
)
if lazy_mode:
for idx, row_id in enumerate(rows):
......@@ -320,7 +317,6 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
class TestSparseAdamOp(unittest.TestCase):
def setup(self, scope, place, lazy_mode):
beta1 = 0.78
beta2 = 0.836
......@@ -339,14 +335,14 @@ class TestSparseAdamOp(unittest.TestCase):
"Moment2": np.full((height, row_numel), 5.0).astype("float32"),
'Beta1Pow': beta1_pow,
'Beta2Pow': beta2_pow,
"LearningRate": np.full((1), 2.0).astype("float32")
"LearningRate": np.full((1), 2.0).astype("float32"),
}
self.init_output = np.full((height, row_numel), 0.0).astype("float32")
self.attrs = {
'epsilon': epsilon,
'beta1': beta1,
'beta2': beta2,
'min_row_size_to_use_multithread': 2
'min_row_size_to_use_multithread': 2,
}
grad_selected_rows = scope.var('Grad').get_selected_rows()
......@@ -361,15 +357,21 @@ class TestSparseAdamOp(unittest.TestCase):
self.sparse_inputs = ["Grad"]
param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
height, rows, row_numel,
np_array, lazy_mode)
param_out, mom1, mom2 = adam_step_sparse(
self.dense_inputs,
self.attrs,
height,
rows,
row_numel,
np_array,
lazy_mode,
)
self.outputs = {
"ParamOut": param_out,
"Moment1Out": mom1,
"Moment2Out": mom2,
'Beta1PowOut': beta1_pow * beta1,
'Beta2PowOut': beta2_pow * beta2
'Beta2PowOut': beta2_pow * beta2,
}
def check_with_place(self, place, lazy_mode):
......@@ -414,10 +416,8 @@ class TestSparseAdamOp(unittest.TestCase):
class TestAdamOpBetaVariable(OpTest):
def setUp(self):
'''Test Adam Op with beta as Variable
'''
'''Test Adam Op with beta as Variable'''
self.op_type = "adam"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
......@@ -446,15 +446,14 @@ class TestAdamOpBetaVariable(OpTest):
attributes = {'epsilon': epsilon}
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, attributes)
param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)
self.outputs = {
'Moment1Out': moment1_out,
'Moment2Out': moment2_out,
'ParamOut': param_out,
'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
}
def test_check_output(self):
......@@ -462,10 +461,8 @@ class TestAdamOpBetaVariable(OpTest):
class TestAdamOpBetaEpsilonVariable(OpTest):
def setUp(self):
'''Test Adam Op with beta/epsilon as Variable
'''
'''Test Adam Op with beta/epsilon as Variable'''
self.op_type = "adam"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
......@@ -495,15 +492,14 @@ class TestAdamOpBetaEpsilonVariable(OpTest):
attributes = {'epsilon': epsilon}
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, attributes)
param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)
self.outputs = {
'Moment1Out': moment1_out,
'Moment2Out': moment2_out,
'ParamOut': param_out,
'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
}
def test_check_output(self):
......@@ -511,10 +507,8 @@ class TestAdamOpBetaEpsilonVariable(OpTest):
class TestAdamOpWithGlobalBetaPow(OpTest):
def setUp(self):
'''Test Adam Op with global_beta_pow
'''
'''Test Adam Op with global_beta_pow'''
self.op_type = "adam"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
......@@ -544,8 +538,7 @@ class TestAdamOpWithGlobalBetaPow(OpTest):
attributes = {'epsilon': epsilon}
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, attributes)
param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)
self.attrs = {'use_global_beta_pow': True}
......@@ -555,7 +548,7 @@ class TestAdamOpWithGlobalBetaPow(OpTest):
'Moment2Out': moment2_out,
'ParamOut': param_out,
'Beta1PowOut': np.array([]),
'Beta2PowOut': np.array([])
'Beta2PowOut': np.array([]),
}
def test_check_output(self):
......@@ -563,10 +556,8 @@ class TestAdamOpWithGlobalBetaPow(OpTest):
class TestAdamOpWithSkipUpdate(OpTest):
def setUp(self):
'''Test Adam Op with global_beta_pow
'''
'''Test Adam Op with global_beta_pow'''
self.op_type = "adam"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
......@@ -613,7 +604,6 @@ class TestAdamOpWithSkipUpdate(OpTest):
class TestAdamOpV2(unittest.TestCase):
def test_adam_op(self):
place = fluid.CPUPlace()
shape = [2, 3, 8, 8]
......@@ -626,20 +616,20 @@ class TestAdamOpV2(unittest.TestCase):
conv = fluid.layers.conv2d(data, 8, 3)
loss = fluid.layers.reduce_mean(conv)
beta1 = fluid.layers.create_global_var(shape=[1],
value=0.85,
dtype='float32',
persistable=True)
beta2 = fluid.layers.create_global_var(shape=[1],
value=0.95,
dtype='float32',
persistable=True)
beta1 = fluid.layers.create_global_var(
shape=[1], value=0.85, dtype='float32', persistable=True
)
beta2 = fluid.layers.create_global_var(
shape=[1], value=0.95, dtype='float32', persistable=True
)
betas = [beta1, beta2]
opt = paddle.optimizer.Adam(learning_rate=1e-5,
beta1=beta1,
beta2=beta2,
weight_decay=0.01,
epsilon=1e-8)
opt = paddle.optimizer.Adam(
learning_rate=1e-5,
beta1=beta1,
beta2=beta2,
weight_decay=0.01,
epsilon=1e-8,
)
opt.minimize(loss)
exe.run(startup)
......@@ -653,8 +643,9 @@ class TestAdamOpV2(unittest.TestCase):
a = fluid.dygraph.to_variable(value)
linear = fluid.Linear(13, 5, dtype="float32")
adam = paddle.optimizer.Adam(learning_rate=0.01,
parameters=linear.parameters())
adam = paddle.optimizer.Adam(
learning_rate=0.01, parameters=linear.parameters()
)
out = linear(a)
out.backward()
adam.step()
......@@ -670,26 +661,29 @@ class TestAdamOpV2(unittest.TestCase):
state_dict = adam.state_dict()
adam.set_state_dict(state_dict)
#learning_rate is LRScheduler
# learning_rate is LRScheduler
learning_rate = paddle.optimizer.lr.CosineAnnealingDecay(
learning_rate=0.1, T_max=10)
learning_rate=0.1, T_max=10
)
adam = paddle.optimizer.Adam(
learning_rate=learning_rate,
weight_decay=fluid.regularizer.L2Decay(0.001),
parameters=emb.parameters())
parameters=emb.parameters(),
)
lr = adam.get_lr()
state_dict = adam.state_dict()
adam.set_state_dict(state_dict)
#leanrning_rate is Tensor
# leanrning_rate is Tensor
with self.assertRaises(TypeError):
learning_rate = np.array([0.01]).astype("float32")
learning_rate = paddle.to_tensor(learning_rate)
adam = paddle.optimizer.Adam(learning_rate=learning_rate,
parameters=emb.parameters())
adam = paddle.optimizer.Adam(
learning_rate=learning_rate, parameters=emb.parameters()
)
params = adam.get_opti_var_name_list()
assert (params is not None)
assert params is not None
paddle.enable_static()
def test_adam_with_grad_clip(self):
......@@ -698,9 +692,9 @@ class TestAdamOpV2(unittest.TestCase):
a = fluid.dygraph.to_variable(value)
linear = fluid.Linear(13, 5, dtype="float32")
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
adam = paddle.optimizer.Adam(0.1,
parameters=linear.parameters(),
grad_clip=clip)
adam = paddle.optimizer.Adam(
0.1, parameters=linear.parameters(), grad_clip=clip
)
out = linear(a)
out.backward()
adam.step()
......@@ -715,11 +709,11 @@ class TestAdamOpV2(unittest.TestCase):
lr = 0.01
adam.set_lr(lr)
cur_lr = adam.get_lr()
assert (lr == cur_lr)
assert lr == cur_lr
with self.assertRaises(TypeError):
lr_var = paddle.fluid.layers.create_global_var(shape=[1],
value=lr,
dtype='float32')
lr_var = paddle.fluid.layers.create_global_var(
shape=[1], value=lr, dtype='float32'
)
adam.set_lr(lr_var)
paddle.enable_static()
......@@ -727,17 +721,17 @@ class TestAdamOpV2(unittest.TestCase):
paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
with self.assertRaises(ValueError):
adam = paddle.optimizer.Adam(0.1,
beta1=-1,
parameters=linear.parameters())
adam = paddle.optimizer.Adam(
0.1, beta1=-1, parameters=linear.parameters()
)
with self.assertRaises(ValueError):
adam = paddle.optimizer.Adam(0.1,
beta2=-1,
parameters=linear.parameters())
adam = paddle.optimizer.Adam(
0.1, beta2=-1, parameters=linear.parameters()
)
with self.assertRaises(ValueError):
adam = paddle.optimizer.Adam(0.1,
epsilon=-1,
parameters=linear.parameters())
adam = paddle.optimizer.Adam(
0.1, epsilon=-1, parameters=linear.parameters()
)
paddle.enable_static()
def test_adam_op_with_sparse_input_and_weight_decay(self):
......@@ -746,9 +740,9 @@ class TestAdamOpV2(unittest.TestCase):
x_data = np.arange(0, 10).reshape((10, 1)).astype(np.int64)
x = paddle.to_tensor(x_data, stop_gradient=False)
emb = paddle.nn.Embedding(10, 10, sparse=True)
adam = paddle.optimizer.Adam(0.001,
parameters=emb.parameters(),
weight_decay=0.01)
adam = paddle.optimizer.Adam(
0.001, parameters=emb.parameters(), weight_decay=0.01
)
with self.assertRaises(RuntimeError):
out = emb(x)
......@@ -766,13 +760,14 @@ class TestAdamOpV2(unittest.TestCase):
class TestAdamOptimizer(unittest.TestCase):
def _test(self,
place,
use_tensor=True,
use_fluid_api=True,
use_global_beta_pow=False,
flatten_param_grads=False):
def _test(
self,
place,
use_tensor=True,
use_fluid_api=True,
use_global_beta_pow=False,
flatten_param_grads=False,
):
paddle.enable_static()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
......@@ -786,29 +781,30 @@ class TestAdamOptimizer(unittest.TestCase):
weight_attr1 = paddle.ParamAttr(
name="weight1",
initializer=fluid.initializer.Constant(value=1.0),
trainable=True)
trainable=True,
)
weight_attr2 = paddle.ParamAttr(
name="weight2",
initializer=fluid.initializer.Constant(value=2.0),
trainable=True)
trainable=True,
)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
with paddle.static.program_guard(main_prog, startup_prog):
with paddle.utils.unique_name.guard():
a = paddle.static.data(name="a", shape=[2, 2], dtype='float32')
b = paddle.static.data(name="b", shape=[2, 2], dtype='float32')
label = paddle.static.data(name="label",
shape=[2, 1],
dtype='int64')
label = paddle.static.data(
name="label", shape=[2, 1], dtype='int64'
)
sum = paddle.add(a, b)
z = paddle.pow(sum, 2.0)
fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1)
prediction = fluid.layers.fc(input=fc_1,
size=2,
param_attr=weight_attr2,
act='softmax')
prediction = fluid.layers.fc(
input=fc_1, size=2, param_attr=weight_attr2, act='softmax'
)
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
......@@ -821,19 +817,22 @@ class TestAdamOptimizer(unittest.TestCase):
value=float(beta1_init),
dtype='float32',
persistable=True,
name="beta1")
name="beta1",
)
beta2 = fluid.layers.create_global_var(
shape=[1],
value=float(beta2_init),
dtype='float32',
persistable=True,
name="beta2")
name="beta2",
)
epsilon = fluid.layers.create_global_var(
shape=[1],
value=float(epsilon_init),
dtype='float32',
persistable=True,
name="epsilon")
name="epsilon",
)
if use_fluid_api:
adam = fluid.optimizer.Adam(
learning_rate=0.01,
......@@ -843,13 +842,16 @@ class TestAdamOptimizer(unittest.TestCase):
use_global_beta_pow=use_global_beta_pow,
flatten_param_grads=flatten_param_grads,
align_size=256,
grad_clip=clip)
grad_clip=clip,
)
else:
adam = paddle.optimizer.Adam(learning_rate=0.01,
beta1=beta1,
beta2=beta2,
epsilon=epsilon,
grad_clip=clip)
adam = paddle.optimizer.Adam(
learning_rate=0.01,
beta1=beta1,
beta2=beta2,
epsilon=epsilon,
grad_clip=clip,
)
else:
if use_fluid_api:
adam = fluid.optimizer.Adam(
......@@ -860,13 +862,16 @@ class TestAdamOptimizer(unittest.TestCase):
use_global_beta_pow=use_global_beta_pow,
flatten_param_grads=flatten_param_grads,
align_size=256,
grad_clip=clip)
grad_clip=clip,
)
else:
adam = fluid.optimizer.Adam(learning_rate=0.01,
beta1=beta1_init,
beta2=beta2_init,
epsilon=epsilon_init,
grad_clip=clip)
adam = fluid.optimizer.Adam(
learning_rate=0.01,
beta1=beta1_init,
beta2=beta2_init,
epsilon=epsilon_init,
grad_clip=clip,
)
adam.minimize(loss)
......@@ -877,15 +882,16 @@ class TestAdamOptimizer(unittest.TestCase):
print("Start run on {}".format(place))
for epoch in range(10):
pred_res, loss_res = exe.run(main_prog,
feed={
"a": a_np,
"b": b_np,
"label": label_np
},
fetch_list=[prediction, loss])
print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
epoch, pred_res[0], loss_res))
pred_res, loss_res = exe.run(
main_prog,
feed={"a": a_np, "b": b_np, "label": label_np},
fetch_list=[prediction, loss],
)
print(
"Epoch {} | Prediction[0]: {}, Loss: {}".format(
epoch, pred_res[0], loss_res
)
)
paddle.disable_static()
return pred_res, loss_res
......@@ -897,10 +903,13 @@ class TestAdamOptimizer(unittest.TestCase):
for use_fluid_api in [True, False]:
for use_global_beta_pow in [True, False]:
for flatten_param_grads in [True, False]:
pred, loss = self._test(place, use_tensor,
use_fluid_api,
use_global_beta_pow,
flatten_param_grads)
pred, loss = self._test(
place,
use_tensor,
use_fluid_api,
use_global_beta_pow,
flatten_param_grads,
)
preds.append(pred)
losses.append(loss)
for pred in preds:
......@@ -922,21 +931,22 @@ class TestAdamOptimizer(unittest.TestCase):
name="weight1",
initializer=fluid.initializer.Constant(value=1.0),
regularizer=fluid.regularizer.L1DecayRegularizer(
regularization_coeff=0.1),
trainable=True)
regularization_coeff=0.1
),
trainable=True,
)
with fluid.program_guard(main):
x = fluid.data(name='x', shape=[None, 13], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1,
act=None,
param_attr=weight_attr)
y_predict = fluid.layers.fc(
input=x, size=1, act=None, param_attr=weight_attr
)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = paddle.mean(cost)
adam = fluid.optimizer.AdamOptimizer(0.01,
flatten_param_grads=True,
align_size=256)
adam = fluid.optimizer.AdamOptimizer(
0.01, flatten_param_grads=True, align_size=256
)
adam.minimize(avg_cost)
paddle.disable_static()
......@@ -959,13 +969,16 @@ class TestAdamOptimizer(unittest.TestCase):
adam = fluid.optimizer.Adam(use_global_beta_pow=True)
adam.minimize(loss)
self.assertRaises(Exception, adam._get_global_accumulator, 'tmp')
adam._add_global_accumulator('tmp',
type=core.VarDesc.VarType.LOD_TENSOR)
adam._add_global_accumulator(
'tmp', type=core.VarDesc.VarType.LOD_TENSOR
)
adam._get_global_accumulator('tmp')
self.assertRaises(Exception,
adam._add_global_accumulator,
adam._beta1_pow_acc_str,
type=core.VarDesc.VarType.LOD_TENSOR)
self.assertRaises(
Exception,
adam._add_global_accumulator,
adam._beta1_pow_acc_str,
type=core.VarDesc.VarType.LOD_TENSOR,
)
paddle.disable_static()
def test_adam_save_load(self):
......@@ -976,12 +989,14 @@ class TestAdamOptimizer(unittest.TestCase):
state_dict = linear.state_dict()
fluid.save_dygraph(state_dict, "paddle_dy")
scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01,
warmup_steps=100,
verbose=True)
adam = paddle.fluid.optimizer.Adam(learning_rate=scheduler,
parameter_list=linear.parameters(),
use_global_beta_pow=True)
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True
)
adam = paddle.fluid.optimizer.Adam(
learning_rate=scheduler,
parameter_list=linear.parameters(),
use_global_beta_pow=True,
)
adam.minimize(b)
state_dict = adam.state_dict()
fluid.save_dygraph(state_dict, "paddle_dy")
......@@ -1002,13 +1017,14 @@ class TestAdamOptimizer(unittest.TestCase):
state_dict = linear.state_dict()
fluid.save_dygraph(state_dict, "paddle_dy")
scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01,
warmup_steps=100,
verbose=True)
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True
)
adam = paddle.fluid.optimizer.Adam(
learning_rate=scheduler,
parameter_list=linear.parameters(),
use_global_beta_pow=True)
use_global_beta_pow=True,
)
adam.minimize(b)
return adam
......@@ -1023,14 +1039,14 @@ class TestAdamOptimizer(unittest.TestCase):
self.assertRaises(AssertionError, adam2.set_state_dict, opt_state_dict)
adam3 = get_opt('float32', [10, 10]) # shape not match
opt_state_dict['beta1_pow_acc_0'] = np.array([0.9, 0.9],
dtype='float32')
opt_state_dict['beta1_pow_acc_0'] = np.array(
[0.9, 0.9], dtype='float32'
)
self.assertRaises(AssertionError, adam3.set_state_dict, opt_state_dict)
paddle.enable_static()
class TestAdamOpV2Group(TestAdamOpV2):
def test_adam_op(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
......@@ -1038,16 +1054,19 @@ class TestAdamOpV2Group(TestAdamOpV2):
linear_1 = paddle.nn.Linear(13, 5)
linear_2 = paddle.nn.Linear(5, 3)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate=0.01,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'beta1': 0.1,
'beta2': 0.99
}],
weight_decay=0.1)
adam = paddle.optimizer.Adam(
learning_rate=0.01,
parameters=[
{'params': linear_1.parameters()},
{
'params': linear_2.parameters(),
'weight_decay': 0.001,
'beta1': 0.1,
'beta2': 0.99,
},
],
weight_decay=0.1,
)
out = linear_1(a)
out = linear_2(out)
out.backward()
......@@ -1056,13 +1075,14 @@ class TestAdamOpV2Group(TestAdamOpV2):
class TestMultiTensorAdam(unittest.TestCase):
def _adam_optimize_dygraph(self,
place,
use_param_attr=False,
use_param_group=False,
use_amp=False,
use_multi_tensor=False):
def _adam_optimize_dygraph(
self,
place,
use_param_attr=False,
use_param_group=False,
use_amp=False,
use_multi_tensor=False,
):
paddle.disable_static()
paddle.seed(10)
paddle.set_device(place)
......@@ -1072,29 +1092,40 @@ class TestMultiTensorAdam(unittest.TestCase):
weight_attr = paddle.ParamAttr(
learning_rate=0.5,
regularizer=paddle.regularizer.L2Decay(1.0),
trainable=True)
trainable=True,
)
if use_param_attr:
model = paddle.nn.Linear(5, 5, weight_attr)
else:
model = paddle.nn.Linear(5, 5)
if not use_param_group:
optimizer = paddle.optimizer.Adam(parameters=model.parameters(),
use_multi_tensor=use_multi_tensor,
multi_precision=use_amp)
optimizer = paddle.optimizer.Adam(
parameters=model.parameters(),
use_multi_tensor=use_multi_tensor,
multi_precision=use_amp,
)
else:
optimizer = paddle.optimizer.Adam(parameters=[{
'params':
model.parameters(),
'weight_decay':
0.001,
'beta1':
0.1,
'beta2':
0.99
}],
use_multi_tensor=use_multi_tensor,
multi_precision=use_amp)
parameters = list(model.parameters())
param_num = len(parameters)
optimizer = paddle.optimizer.Adam(
parameters=[
{
'params': parameters[: int(param_num / 2)],
'weight_decay': 0.001,
'beta1': 0.1,
'beta2': 0.99,
},
{
'params': parameters[int(param_num / 2) :],
'weight_decay': 0.001,
'beta1': 0.1,
'beta2': 0.99,
},
],
use_multi_tensor=use_multi_tensor,
multi_precision=use_amp,
)
for idx in range(2):
if place == 'gpu' and use_amp == True:
......@@ -1118,10 +1149,9 @@ class TestMultiTensorAdam(unittest.TestCase):
return output, model.parameters()
def _adam_optimize_static(self,
place,
use_amp=False,
use_multi_tensor=False):
def _adam_optimize_static(
self, place, use_amp=False, use_multi_tensor=False
):
paddle.enable_static()
paddle.seed(10)
np.random.seed(10)
......@@ -1130,24 +1160,26 @@ class TestMultiTensorAdam(unittest.TestCase):
exe = paddle.static.Executor(place=place)
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.optimizer.Adam(multi_precision=use_amp,
use_multi_tensor=use_multi_tensor)
optimizer = paddle.optimizer.Adam(
multi_precision=use_amp, use_multi_tensor=use_multi_tensor
)
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False)
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(shape=[2, 2],
name='X',
dtype='float16')
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(shape=[2, 2],
name='X',
dtype='float32')
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
......@@ -1159,9 +1191,9 @@ class TestMultiTensorAdam(unittest.TestCase):
x = np.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
loss_data, = exe.run(train_program,
feed={"X": x},
fetch_list=[loss.name])
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
......@@ -1174,49 +1206,59 @@ class TestMultiTensorAdam(unittest.TestCase):
def _check_with_place_amp(self, place, use_amp):
# test dygraph mode
output_dygraph1, params_dygraph1 = self._adam_optimize_dygraph(
place=place, use_amp=use_amp, use_multi_tensor=True)
place=place, use_amp=use_amp, use_multi_tensor=True
)
output_dygraph2, params_dygraph2 = self._adam_optimize_dygraph(
place=place, use_amp=use_amp, use_multi_tensor=False)
place=place, use_amp=use_amp, use_multi_tensor=False
)
np.testing.assert_allclose(output_dygraph1, output_dygraph2, rtol=1e-05)
for idx in range(len(params_dygraph1)):
np.testing.assert_allclose(params_dygraph1[idx],
params_dygraph2[idx],
rtol=1e-05)
np.testing.assert_allclose(
params_dygraph1[idx], params_dygraph2[idx], rtol=1e-05
)
# test static mode
output_static1 = self._adam_optimize_static(place=place,
use_amp=use_amp,
use_multi_tensor=True)
output_static2 = self._adam_optimize_static(place=place,
use_amp=use_amp,
use_multi_tensor=False)
output_static1 = self._adam_optimize_static(
place=place, use_amp=use_amp, use_multi_tensor=True
)
output_static2 = self._adam_optimize_static(
place=place, use_amp=use_amp, use_multi_tensor=False
)
for idx in range(len(output_static1)):
np.testing.assert_allclose(output_static1[idx],
output_static2[idx],
rtol=1e-05)
np.testing.assert_allclose(
output_static1[idx], output_static2[idx], rtol=1e-05
)
def _check_with_param_arrt(self, place, use_amp):
output1, params1 = self._adam_optimize_dygraph(place=place,
use_amp=use_amp,
use_param_attr=True,
use_multi_tensor=True)
output2, params2 = self._adam_optimize_dygraph(place=place,
use_amp=use_amp,
use_param_attr=True,
use_multi_tensor=False)
output1, params1 = self._adam_optimize_dygraph(
place=place,
use_amp=use_amp,
use_param_attr=True,
use_multi_tensor=True,
)
output2, params2 = self._adam_optimize_dygraph(
place=place,
use_amp=use_amp,
use_param_attr=True,
use_multi_tensor=False,
)
np.testing.assert_allclose(output1, output2, rtol=1e-05)
for idx in range(len(params1)):
np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05)
def _check_with_param_group(self, place, use_amp):
output1, params1 = self._adam_optimize_dygraph(place=place,
use_amp=use_amp,
use_param_group=True,
use_multi_tensor=True)
output2, params2 = self._adam_optimize_dygraph(place=place,
use_amp=use_amp,
use_param_group=True,
use_multi_tensor=False)
output1, params1 = self._adam_optimize_dygraph(
place=place,
use_amp=use_amp,
use_param_group=True,
use_multi_tensor=True,
)
output2, params2 = self._adam_optimize_dygraph(
place=place,
use_amp=use_amp,
use_param_group=True,
use_multi_tensor=False,
)
np.testing.assert_allclose(output1, output2, rtol=1e-05)
for idx in range(len(params1)):
......
......@@ -25,14 +25,16 @@ import numpy
from paddle.fluid.framework import _test_eager_guard
def calculate_momentum_by_numpy(param,
grad,
mu,
velocity,
use_nesterov,
learning_rate,
regularization_method=None,
regularization_coeff=1.0):
def calculate_momentum_by_numpy(
param,
grad,
mu,
velocity,
use_nesterov,
learning_rate,
regularization_method=None,
regularization_coeff=1.0,
):
if regularization_method == "l2_decay":
grad = grad + regularization_coeff * param
......@@ -44,8 +46,9 @@ def calculate_momentum_by_numpy(param,
else:
velocity_out = mu * velocity + grad
if use_nesterov:
param_out = param - grad * learning_rate - \
velocity_out * mu * learning_rate
param_out = (
param - grad * learning_rate - velocity_out * mu * learning_rate
)
else:
param_out = param - learning_rate * velocity_out
......@@ -53,7 +56,6 @@ def calculate_momentum_by_numpy(param,
class TestMomentumOp1(OpTest):
def setUp(self):
self.op_type = "momentum"
self.dtype = np.float32
......@@ -70,7 +72,7 @@ class TestMomentumOp1(OpTest):
'Param': param,
'Grad': grad,
'Velocity': velocity,
'LearningRate': learning_rate
'LearningRate': learning_rate,
}
self.attrs = {'mu': mu}
......@@ -81,7 +83,8 @@ class TestMomentumOp1(OpTest):
mu=mu,
velocity=velocity,
use_nesterov=use_nesterov,
learning_rate=learning_rate)
learning_rate=learning_rate,
)
self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
......@@ -93,7 +96,6 @@ class TestMomentumOp1(OpTest):
class TestMomentumOpFp16(TestMomentumOp1):
def init_dtype(self):
self.dtype = np.float16
......@@ -102,8 +104,7 @@ class TestMomentumOpFp16(TestMomentumOp1):
class TestMomentumOp2(OpTest):
'''Test Momentum with default values for attributes
'''
'''Test Momentum with default values for attributes'''
def setUp(self):
self.op_type = "momentum"
......@@ -119,7 +120,7 @@ class TestMomentumOp2(OpTest):
'Param': param,
'Grad': grad,
'Velocity': velocity,
'LearningRate': learning_rate
'LearningRate': learning_rate,
}
self.attrs = {'mu': mu, 'use_nesterov': use_nesterov}
......@@ -130,7 +131,8 @@ class TestMomentumOp2(OpTest):
mu=mu,
velocity=velocity,
use_nesterov=use_nesterov,
learning_rate=learning_rate)
learning_rate=learning_rate,
)
self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
......@@ -138,10 +140,10 @@ class TestMomentumOp2(OpTest):
self.check_output()
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
@unittest.skipIf(
not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestLarsMomentumOpWithMP(OpTest):
def setUp(self):
self.config()
self.op_type = "lars_momentum"
......@@ -168,11 +170,16 @@ class TestLarsMomentumOpWithMP(OpTest):
fp32_grad = grad.astype("float32")
pnorm = np.sqrt(np.square(master_param).sum())
gnorm = np.sqrt(np.square(fp32_grad).sum())
local_lr = learning_rate * lars_coeff * pnorm / (
gnorm + lars_weight_decay * pnorm)
local_lr = (
learning_rate
* lars_coeff
* pnorm
/ (gnorm + lars_weight_decay * pnorm)
)
fp32_grad = fp32_grad * rescale_grad
velocity_out = mu * velocity + local_lr * (
fp32_grad + lars_weight_decay * master_param)
fp32_grad + lars_weight_decay * master_param
)
p_new = master_param - velocity_out
param_out = p_new.astype("float16")
master_param_out = p_new
......@@ -185,7 +192,8 @@ class TestLarsMomentumOpWithMP(OpTest):
param_outs.append(("SubParam_out_" + str(i), param_out))
master_params.append(("SubMasterParam_" + str(i), master_param))
master_param_outs.append(
("SubMasterParamOut_" + str(i), master_param_out))
("SubMasterParamOut_" + str(i), master_param_out)
)
self.inputs = {
'Param': params,
......@@ -200,13 +208,13 @@ class TestLarsMomentumOpWithMP(OpTest):
'lars_coeff': lars_coeff,
'lars_weight_decay': [lars_weight_decay],
'multi_precision': True,
'rescale_grad': rescale_grad
'rescale_grad': rescale_grad,
}
self.outputs = {
'ParamOut': param_outs,
'VelocityOut': velocity_outs,
'MasterParamOut': master_param_outs
'MasterParamOut': master_param_outs,
}
def test_check_output(self):
......@@ -221,7 +229,6 @@ class TestLarsMomentumOpWithMP(OpTest):
class TestLarsMomentumOp(OpTest):
def setUp(self):
self.config()
self.op_type = "lars_momentum"
......@@ -242,10 +249,15 @@ class TestLarsMomentumOp(OpTest):
learning_rate = np.array([0.001]).astype("float32")
pnorm = np.sqrt(np.square(param).sum())
gnorm = np.sqrt(np.square(grad).sum())
local_lr = learning_rate * lars_coeff * pnorm / (
gnorm + lars_weight_decay * param)
local_lr = (
learning_rate
* lars_coeff
* pnorm
/ (gnorm + lars_weight_decay * param)
)
velocity_out = mu * velocity + local_lr * (
grad + lars_weight_decay * param)
grad + lars_weight_decay * param
)
param_out = param - velocity_out
params.append(("SubParam_" + str(i), param))
......@@ -259,13 +271,13 @@ class TestLarsMomentumOp(OpTest):
'Param': params,
'Grad': grads,
'Velocity': velocitys,
'LearningRate': learning_rates
'LearningRate': learning_rates,
}
self.attrs = {
'mu': mu,
'lars_coeff': lars_coeff,
'lars_weight_decay': [lars_weight_decay]
'lars_weight_decay': [lars_weight_decay],
}
self.outputs = {'ParamOut': param_outs, 'VelocityOut': velocity_outs}
......@@ -278,7 +290,6 @@ class TestLarsMomentumOp(OpTest):
class TestSparseMomentumOp(unittest.TestCase):
def setUp(self):
self.use_nesterov = False
self.regularization_method = ""
......@@ -317,8 +328,9 @@ class TestSparseMomentumOp(unittest.TestCase):
velocity_np_array = np.ones((height, row_numel)).astype("float32")
velocity.set(velocity_np_array, place)
velocity_out = scope.var('VelocityOut').get_tensor()
velocity_out_np_array = np.full((height, row_numel),
0.0).astype("float32")
velocity_out_np_array = np.full((height, row_numel), 0.0).astype(
"float32"
)
velocity_out.set(velocity_out_np_array, place)
# create and initialize LearningRate Variable
......@@ -327,17 +339,19 @@ class TestSparseMomentumOp(unittest.TestCase):
lr.set(lr_array, place)
# create and run operator
op = Operator("momentum",
Param='Param',
Grad='Grad',
Velocity='Velocity',
ParamOut='ParamOut',
VelocityOut='VelocityOut',
LearningRate='LearningRate',
mu=mu,
use_nesterov=use_nesterov,
regularization_method=regularization_method,
regularization_coeff=regularization_coeff)
op = Operator(
"momentum",
Param='Param',
Grad='Grad',
Velocity='Velocity',
ParamOut='ParamOut',
VelocityOut='VelocityOut',
LearningRate='LearningRate',
mu=mu,
use_nesterov=use_nesterov,
regularization_method=regularization_method,
regularization_coeff=regularization_coeff,
)
op.run(scope, place)
# get and compare result
......@@ -360,7 +374,8 @@ class TestSparseMomentumOp(unittest.TestCase):
use_nesterov=use_nesterov,
learning_rate=lr_array,
regularization_method=regularization_method,
regularization_coeff=regularization_coeff)
regularization_coeff=regularization_coeff,
)
self.assertTrue((_velocity_out == velocity_out_np_array).all())
self.assertTrue((_param_out == param_out_np_array).all())
......@@ -377,13 +392,11 @@ class TestSparseMomentumOp(unittest.TestCase):
class TestSparseMomentumOp2(TestSparseMomentumOp):
def init_kernel(self):
self.use_nesterov = True
class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
def setUp(self):
self.init_args()
self.regularization_method = ""
......@@ -427,8 +440,9 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
velocity_np_array = np.ones((height, row_numel)).astype("float32")
velocity.set(velocity_np_array, place)
velocity_out = scope.var('VelocityOut').get_tensor()
velocity_out_np_array = np.full((height, row_numel),
0.0).astype("float32")
velocity_out_np_array = np.full((height, row_numel), 0.0).astype(
"float32"
)
velocity_out.set(velocity_out_np_array, place)
# create and initialize LearningRate Variable
......@@ -437,21 +451,23 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
lr.set(lr_array, place)
# create and run operator
op = Operator("momentum",
Param='Param',
Grad='Grad',
Velocity='Velocity',
MasterParam='MasterParam',
ParamOut='ParamOut',
VelocityOut='VelocityOut',
MasterParamOut='MasterParamOut',
LearningRate='LearningRate',
mu=mu,
use_nesterov=use_nesterov,
regularization_method=regularization_method,
regularization_coeff=regularization_coeff,
multi_precision=True,
rescale_grad=1.0)
op = Operator(
"momentum",
Param='Param',
Grad='Grad',
Velocity='Velocity',
MasterParam='MasterParam',
ParamOut='ParamOut',
VelocityOut='VelocityOut',
MasterParamOut='MasterParamOut',
LearningRate='LearningRate',
mu=mu,
use_nesterov=use_nesterov,
regularization_method=regularization_method,
regularization_coeff=regularization_coeff,
multi_precision=True,
rescale_grad=1.0,
)
op.run(scope, place)
# get and compare result
......@@ -472,7 +488,8 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
use_nesterov=use_nesterov,
learning_rate=lr_array,
regularization_method=regularization_method,
regularization_coeff=regularization_coeff)
regularization_coeff=regularization_coeff,
)
self.assertTrue((_velocity_out == velocity_out_np_array).all())
self.assertTrue((_param_out == param_out_np_array).all())
......@@ -486,23 +503,22 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
class TestSparseMomentumOpWithMultiPrecision2(
TestSparseMomentumOpWithMultiPrecision):
TestSparseMomentumOpWithMultiPrecision
):
def init_args(self):
self.use_nesterov = True
class TestMomentumV2(unittest.TestCase):
def test_momentum_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Momentum(learning_rate=0.01,
momentum=0.9,
parameters=linear.parameters())
adam = paddle.optimizer.Momentum(
learning_rate=0.01, momentum=0.9, parameters=linear.parameters()
)
out = linear(a)
out.backward()
adam.step()
......@@ -519,13 +535,15 @@ class TestMomentumV2(unittest.TestCase):
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = paddle.mean(cost)
rms_optimizer = paddle.optimizer.Momentum(learning_rate=0.1,
momentum=0.9)
rms_optimizer = paddle.optimizer.Momentum(
learning_rate=0.1, momentum=0.9
)
rms_optimizer.minimize(avg_cost)
fetch_list = [avg_cost]
train_reader = paddle.batch(paddle.dataset.uci_housing.train(),
batch_size=1)
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1
)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
......@@ -533,9 +551,9 @@ class TestMomentumV2(unittest.TestCase):
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
def test_raise_error(self):
self.assertRaises(ValueError,
paddle.optimizer.Momentum,
learning_rate=None)
self.assertRaises(
ValueError, paddle.optimizer.Momentum, learning_rate=None
)
self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None)
def test_api_eager_dygraph(self):
......@@ -545,7 +563,6 @@ class TestMomentumV2(unittest.TestCase):
class TestMomentumOpWithDecay(OpTest):
def setUp(self):
self.op_type = "momentum"
self.dtype = np.float32
......@@ -567,14 +584,14 @@ class TestMomentumOpWithDecay(OpTest):
'Param': param,
'Grad': grad,
'Velocity': velocity,
'LearningRate': learning_rate
'LearningRate': learning_rate,
}
self.attrs = {
'mu': mu,
'use_nesterov': use_nesterov,
'regularization_method': regularization_method,
'regularization_coeff': regularization_coeff
'regularization_coeff': regularization_coeff,
}
grad = grad + regularization_coeff * param
......@@ -585,7 +602,8 @@ class TestMomentumOpWithDecay(OpTest):
mu=mu,
velocity=velocity,
use_nesterov=use_nesterov,
learning_rate=learning_rate)
learning_rate=learning_rate,
)
self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
......@@ -598,7 +616,6 @@ class TestMomentumOpWithDecay(OpTest):
class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay):
def init_config(self):
self.dtype = np.float16
......@@ -608,13 +625,11 @@ class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay):
class TestMomentumOpWithDecay2(TestMomentumOpWithDecay):
def init_config(self):
self.use_nesterov = False
class TestSparseMomentumOpWithDecay(TestSparseMomentumOp):
def setUp(self):
self.use_nesterov = False
self.regularization_method = 'l2_decay'
......@@ -622,13 +637,11 @@ class TestSparseMomentumOpWithDecay(TestSparseMomentumOp):
class TestSparseMomentumOpWithDecay2(TestSparseMomentumOpWithDecay):
def init_kernel(self):
self.use_nesterov = True
class TestMomentumOpWithDecayAPI(unittest.TestCase):
def _test_momentum_dygraph_common(self, regularization):
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
......@@ -641,13 +654,16 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase):
learning_rate=0.01,
momentum=0.9,
parameter_list=linear.parameters(),
regularization=regularization)
regularization=regularization,
)
momentum.minimize(loss)
def test_momentum_dygraph_1(self):
self._test_momentum_dygraph_common(
regularization=paddle.fluid.regularizer.L2Decay(
regularization_coeff=0.1))
regularization_coeff=0.1
)
)
def test_momentum_static(self):
paddle.enable_static()
......@@ -661,12 +677,14 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase):
avg_cost = paddle.mean(cost)
momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum(
learning_rate=0.1, momentum=0.9)
learning_rate=0.1, momentum=0.9
)
momentum_optimizer.minimize(avg_cost)
fetch_list = [avg_cost]
train_reader = paddle.batch(paddle.dataset.uci_housing.train(),
batch_size=1)
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1
)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
......@@ -675,23 +693,23 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase):
class TestFusedMomentumWithDecayAPI(unittest.TestCase):
def get_program(self, weight_attr, bias_attr=False):
main_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(main_program=main_program,
startup_program=startup_program):
with paddle.static.program_guard(
main_program=main_program, startup_program=startup_program
):
x = paddle.static.data(name='x', shape=[10, 10])
linear = paddle.nn.Linear(10,
10,
weight_attr=weight_attr,
bias_attr=bias_attr)
linear = paddle.nn.Linear(
10, 10, weight_attr=weight_attr, bias_attr=bias_attr
)
out = linear(x)
loss = paddle.mean(out)
optimizer = paddle.optimizer.Momentum(
learning_rate=0.01,
momentum=0.9,
weight_decay=paddle.regularizer.L2Decay(0.5))
weight_decay=paddle.regularizer.L2Decay(0.5),
)
optimizer.minimize(loss)
return main_program
......@@ -700,7 +718,8 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase):
weight_attr = paddle.ParamAttr(
name="weight",
initializer=paddle.nn.initializer.Constant(value=0.5),
regularizer=paddle.regularizer.L2Decay(0.1))
regularizer=paddle.regularizer.L2Decay(0.1),
)
program = self.get_program(weight_attr, bias_attr=False)
ops = program.global_block().ops
......@@ -715,11 +734,13 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase):
weight_attr = paddle.ParamAttr(
name="weight",
initializer=paddle.nn.initializer.Constant(value=0.5),
regularizer=paddle.regularizer.L1Decay(0.1))
regularizer=paddle.regularizer.L1Decay(0.1),
)
bias_attr = paddle.ParamAttr(
name="bias",
initializer=paddle.nn.initializer.Constant(value=0.),
regularizer=None)
initializer=paddle.nn.initializer.Constant(value=0.0),
regularizer=None,
)
program = self.get_program(weight_attr, bias_attr)
ops = program.global_block().ops
......@@ -734,8 +755,9 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase):
self.assertEqual(ops[-1].attr('regularization_coeff'), 0)
if 'bias' in ops[-2].input('Param'):
self.assertEqual(ops[-2].attr('regularization_method'), 'l2_decay')
self.assertEqual(ops[-2].attr('regularization_coeff'),
np.float32(0.5))
self.assertEqual(
ops[-2].attr('regularization_coeff'), np.float32(0.5)
)
def test_param_has_no_regularizer(self):
paddle.enable_static()
......@@ -749,11 +771,11 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase):
class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
def __update_params(self, momentum, linear):
for i in range(10):
inp = paddle.full(shape=[2, 2], fill_value=i,
dtype='float32').astype("float32")
inp = paddle.full(
shape=[2, 2], fill_value=i, dtype='float32'
).astype("float32")
inp = paddle.to_tensor(inp)
out = linear(inp)
loss = paddle.mean(out)
......@@ -768,32 +790,39 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
2,
2,
weight_attr=paddle.nn.initializer.Constant(value=2.0),
bias_attr=paddle.nn.initializer.Constant(value=2.0))
bias_attr=paddle.nn.initializer.Constant(value=2.0),
)
momentum_old = paddle.fluid.optimizer.Momentum(
learning_rate=0.01,
momentum=0.9,
parameter_list=linear_old.parameters(),
regularization=paddle.fluid.regularizer.L2Decay(
regularization_coeff=0.1))
regularization_coeff=0.1
),
)
self.__update_params(momentum=momentum_old, linear=linear_old)
linear_new = paddle.nn.Linear(
2,
2,
weight_attr=paddle.nn.initializer.Constant(value=2.0),
bias_attr=paddle.nn.initializer.Constant(value=2.0))
bias_attr=paddle.nn.initializer.Constant(value=2.0),
)
momentum_new = paddle.fluid.contrib.optimizer.Momentum(
learning_rate=0.01,
momentum=0.9,
parameter_list=linear_new.parameters(),
regularization=paddle.fluid.regularizer.L2Decay(
regularization_coeff=0.1))
regularization_coeff=0.1
),
)
self.__update_params(momentum=momentum_new, linear=linear_new)
self.assertEqual(
(linear_old.weight.numpy() == linear_new.weight.numpy()).all(),
True,
'the param weight updated by two Momentum optimizers should equal')
'the param weight updated by two Momentum optimizers should equal',
)
def test_vs(self, place=fluid.CPUPlace()):
places = [fluid.CPUPlace()]
......@@ -805,7 +834,6 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
class TestMomentumV2Group(TestMomentumV2):
def test_momentum_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
......@@ -813,22 +841,20 @@ class TestMomentumV2Group(TestMomentumV2):
linear_1 = paddle.nn.Linear(13, 5)
linear_2 = paddle.nn.Linear(5, 3)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Momentum(learning_rate=0.01,
parameters=[{
'params':
linear_1.parameters()
}, {
'params':
linear_2.parameters(),
'weight_decay':
0.001,
'learning_rate':
0.1,
'momentum':
0.99
}],
weight_decay=0.1,
momentum=0.9)
adam = paddle.optimizer.Momentum(
learning_rate=0.01,
parameters=[
{'params': linear_1.parameters()},
{
'params': linear_2.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1,
'momentum': 0.99,
},
],
weight_decay=0.1,
momentum=0.9,
)
out = linear_1(a)
out = linear_2(out)
out.backward()
......@@ -837,13 +863,14 @@ class TestMomentumV2Group(TestMomentumV2):
class TestMultiTensorMomentumDygraph(unittest.TestCase):
def _momentum_optimize_dygraph(self,
place,
use_param_attr=False,
use_param_group=False,
use_amp=False,
use_multi_tensor=False):
def _momentum_optimize_dygraph(
self,
place,
use_param_attr=False,
use_param_group=False,
use_amp=False,
use_multi_tensor=False,
):
paddle.disable_static()
paddle.seed(10)
paddle.set_device(place)
......@@ -851,7 +878,8 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
weight_attr = paddle.ParamAttr(
learning_rate=0.5,
regularizer=paddle.regularizer.L2Decay(1.0),
trainable=True)
trainable=True,
)
if use_param_attr:
model = paddle.nn.Linear(5, 5, weight_attr)
else:
......@@ -860,17 +888,29 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
optimizer = paddle.optimizer.Momentum(
parameters=model.parameters(),
use_multi_tensor=use_multi_tensor,
multi_precision=use_amp)
multi_precision=use_amp,
)
else:
parameters = list(model.parameters())
n = len(parameters)
optimizer = paddle.optimizer.Momentum(
parameters=[{
'params': model.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1,
'momentum': 0.99
}],
parameters=[
{
'params': parameters[: int(n / 2)],
'weight_decay': 0.001,
'learning_rate': 0.1,
'momentum': 0.99,
},
{
'params': parameters[int(n / 2) :],
'weight_decay': 0.001,
'learning_rate': 0.1,
'momentum': 0.99,
},
],
use_multi_tensor=use_multi_tensor,
multi_precision=use_amp)
multi_precision=use_amp,
)
for idx in range(5):
if place == 'gpu' and use_amp == True:
model = paddle.amp.decorate(models=model, level='O2')
......@@ -900,9 +940,11 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
def _check_with_place_amp(self, place, use_amp):
output1, params1 = self._momentum_optimize_dygraph(
place=place, use_amp=use_amp, use_multi_tensor=True)
place=place, use_amp=use_amp, use_multi_tensor=True
)
output2, params2 = self._momentum_optimize_dygraph(
place=place, use_amp=use_amp, use_multi_tensor=False)
place=place, use_amp=use_amp, use_multi_tensor=False
)
np.testing.assert_allclose(output1, output2, rtol=1e-05)
for idx in range(len(params1)):
......@@ -913,12 +955,14 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
place=place,
use_amp=use_amp,
use_param_attr=True,
use_multi_tensor=True)
use_multi_tensor=True,
)
output2, params2 = self._momentum_optimize_dygraph(
place=place,
use_amp=use_amp,
use_param_attr=True,
use_multi_tensor=False)
use_multi_tensor=False,
)
np.testing.assert_allclose(output1, output2, rtol=1e-05)
for idx in range(len(params1)):
np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05)
......@@ -928,12 +972,14 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
place=place,
use_amp=use_amp,
use_param_group=True,
use_multi_tensor=True)
use_multi_tensor=True,
)
output2, params2 = self._momentum_optimize_dygraph(
place=place,
use_amp=use_amp,
use_param_group=True,
use_multi_tensor=False)
use_multi_tensor=False,
)
np.testing.assert_allclose(output1, output2, rtol=1e-05)
for idx in range(len(params1)):
np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05)
......@@ -952,11 +998,9 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
class TestMultiTensorMomentumStatic(unittest.TestCase):
def _momentum_optimize_static(self,
place,
use_amp=False,
use_multi_tensor=False):
def _momentum_optimize_static(
self, place, use_amp=False, use_multi_tensor=False
):
paddle.enable_static()
paddle.seed(10)
np.random.seed(10)
......@@ -965,24 +1009,26 @@ class TestMultiTensorMomentumStatic(unittest.TestCase):
exe = paddle.static.Executor(place=place)
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.optimizer.Momentum(multi_precision=use_amp,
use_multi_tensor=use_multi_tensor)
optimizer = paddle.optimizer.Momentum(
multi_precision=use_amp, use_multi_tensor=use_multi_tensor
)
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False)
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(shape=[2, 2],
name='X',
dtype='float16')
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(shape=[2, 2],
name='X',
dtype='float32')
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
......@@ -994,9 +1040,9 @@ class TestMultiTensorMomentumStatic(unittest.TestCase):
x = numpy.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
loss_data, = exe.run(train_program,
feed={"X": x},
fetch_list=[loss.name])
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
......@@ -1007,12 +1053,12 @@ class TestMultiTensorMomentumStatic(unittest.TestCase):
return places
def _check_with_place_amp(self, place, use_amp):
output1 = self._momentum_optimize_static(place=place,
use_amp=use_amp,
use_multi_tensor=True)
output2 = self._momentum_optimize_static(place=place,
use_amp=use_amp,
use_multi_tensor=False)
output1 = self._momentum_optimize_static(
place=place, use_amp=use_amp, use_multi_tensor=True
)
output2 = self._momentum_optimize_static(
place=place, use_amp=use_amp, use_multi_tensor=False
)
for idx in range(len(output1)):
np.testing.assert_allclose(output1[idx], output2[idx], rtol=1e-05)
......
......@@ -163,18 +163,20 @@ class Adam(Optimizer):
_beta1_pow_acc_str = "beta1_pow_acc"
_beta2_pow_acc_str = "beta2_pow_acc"
def __init__(self,
learning_rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1e-8,
parameters=None,
weight_decay=None,
grad_clip=None,
lazy_mode=False,
multi_precision=False,
use_multi_tensor=False,
name=None):
def __init__(
self,
learning_rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1e-8,
parameters=None,
weight_decay=None,
grad_clip=None,
lazy_mode=False,
multi_precision=False,
use_multi_tensor=False,
name=None,
):
assert learning_rate is not None
assert beta1 is not None
assert beta2 is not None
......@@ -182,20 +184,25 @@ class Adam(Optimizer):
if not isinstance(beta1, Variable):
if not 0 <= beta1 < 1:
raise ValueError(
"Invaild value of beta1, expect beta1 in [0,1).")
"Invaild value of beta1, expect beta1 in [0,1)."
)
if not isinstance(beta2, Variable):
if not 0 <= beta2 < 1:
raise ValueError(
"Invaild value of beta2, expect beta2 in [0,1).")
"Invaild value of beta2, expect beta2 in [0,1)."
)
if not isinstance(epsilon, Variable):
if not 0 <= epsilon:
raise ValueError(
"Invaild value of epsilon, expect epsilon >= 0.")
super(Adam, self).__init__(learning_rate=learning_rate,
parameters=parameters,
weight_decay=weight_decay,
grad_clip=grad_clip,
name=name)
"Invaild value of epsilon, expect epsilon >= 0."
)
super(Adam, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
weight_decay=weight_decay,
grad_clip=grad_clip,
name=name,
)
self.type = "adam"
self._beta1 = beta1
self._beta2 = beta2
......@@ -212,21 +219,13 @@ class Adam(Optimizer):
self._use_multi_tensor = use_multi_tensor
if self._use_multi_tensor:
self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
self._moment1_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
self._moment2_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
self._beta1_pow_acc_dict = {
'FP32_LODTensor': [],
'FP16_LODTensor': []
}
self._beta2_pow_acc_dict = {
'FP32_LODTensor': [],
'FP16_LODTensor': []
}
self._master_weight_dict = {
'FP32_LODTensor': None,
'FP16_LODTensor': []
}
self._param_dict = self._create_multi_tensor_dict()
self._moment1_dict = self._create_multi_tensor_dict()
self._moment2_dict = self._create_multi_tensor_dict()
self._beta1_pow_acc_dict = self._create_multi_tensor_dict()
self._beta2_pow_acc_dict = self._create_multi_tensor_dict()
self._master_weight_dict = self._create_multi_tensor_dict()
self._master_weight_dict['FP32_LODTensor'] = None
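
The lines above replace the flat per-dtype lists with self._create_multi_tensor_dict(), whose body is not shown in this hunk. A hedged sketch of the general idea follows: keep the multi-tensor bookkeeping per parameter group, so a parameters list of dicts no longer shares one bucket across groups. The helper name and the exact container shape below are assumptions made for illustration, not the actual Paddle implementation.

def create_multi_tensor_dict_sketch(num_param_groups):
    # One tensor list per parameter group for each dtype bucket (assumed shape).
    return {
        'FP32_LODTensor': [[] for _ in range(num_param_groups)],
        'FP16_LODTensor': [[] for _ in range(num_param_groups)],
    }

state = create_multi_tensor_dict_sketch(num_param_groups=2)
state['FP32_LODTensor'][0].append('linear_1.weight')  # parameters of group 0
state['FP32_LODTensor'][1].append('linear_2.weight')  # parameters of group 1
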
def _create_master_weight(self, param):
if param.name in self._master_weights:
......@@ -236,19 +235,23 @@ class Adam(Optimizer):
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = layers.create_global_var(name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True)
var = layers.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32
})
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
......@@ -262,20 +265,30 @@ class Adam(Optimizer):
"""
if self._name is not None:
name = self._name + "_" + name
find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
target_param = self._master_weights[
param.name] if find_master else param
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (name not in self._accumulators
or target_name not in self._accumulators[name]):
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name))
name, target_name
)
)
return self._accumulators[name][target_name]
def _add_moments_pows(self, p):
acc_dtype = p.dtype
if acc_dtype == core.VarDesc.VarType.FP16 or acc_dtype == core.VarDesc.VarType.BF16:
if (
acc_dtype == core.VarDesc.VarType.FP16
or acc_dtype == core.VarDesc.VarType.BF16
):
acc_dtype = core.VarDesc.VarType.FP32
self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype)
self._add_accumulator(self._moment2_acc_str, p, dtype=acc_dtype)
......@@ -283,18 +296,24 @@ class Adam(Optimizer):
name=self._beta1_pow_acc_str,
param=p,
dtype=acc_dtype,
fill_value=0.9 if isinstance(self._beta1, Variable) \
else self._beta1,
fill_value=0.9
if isinstance(self._beta1, Variable)
else self._beta1,
shape=[1],
type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
type=core.VarDesc.VarType.LOD_TENSOR,
device='cpu',
)
self._add_accumulator(
name=self._beta2_pow_acc_str,
param=p,
dtype=acc_dtype,
fill_value=0.999 if isinstance(self._beta2, Variable) \
else self._beta2,
fill_value=0.999
if isinstance(self._beta2, Variable)
else self._beta2,
shape=[1],
type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
type=core.VarDesc.VarType.LOD_TENSOR,
device='cpu',
)
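# A hedged NumPy sketch of one textbook Adam update, showing why the Beta1Pow/Beta2Pow
# accumulators added above are tracked: they hold beta1**t and beta2**t for bias
# correction. Names are illustrative; the exact kernel formula may scale epsilon slightly
# differently.
import numpy as np

lr, beta1, beta2, epsilon = 0.001, 0.9, 0.999, 1e-8
param, grad = np.zeros(3), np.ones(3)
moment1, moment2 = np.zeros(3), np.zeros(3)
beta1_pow, beta2_pow = beta1, beta2           # beta**t after the first step

moment1 = beta1 * moment1 + (1 - beta1) * grad
moment2 = beta2 * moment2 + (1 - beta2) * grad * grad
lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
param = param - lr_t * moment1 / (np.sqrt(moment2) + epsilon)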
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
......@@ -307,7 +326,10 @@ class Adam(Optimizer):
master_p = self._create_master_weight(p)
self._add_moments_pows(master_p)
continue
if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
if (
p.dtype == core.VarDesc.VarType.FP16
and not self._multi_precision
):
warnings.warn(
"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Adam optimizer."
......@@ -319,50 +341,105 @@ class Adam(Optimizer):
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
moment1 = self._get_accumulator(self._moment1_acc_str,
param_and_grad[0])
moment2 = self._get_accumulator(self._moment2_acc_str,
param_and_grad[0])
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param_and_grad[0])
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
param_and_grad[0])
find_master = self._multi_precision and param_and_grad[
0].dtype == core.VarDesc.VarType.FP16
master_weight = (self._master_weights[param_and_grad[0].name]
if find_master else None)
moment1 = self._get_accumulator(
self._moment1_acc_str, param_and_grad[0]
)
moment2 = self._get_accumulator(
self._moment2_acc_str, param_and_grad[0]
)
beta1_pow_acc = self._get_accumulator(
self._beta1_pow_acc_str, param_and_grad[0]
)
beta2_pow_acc = self._get_accumulator(
self._beta2_pow_acc_str, param_and_grad[0]
)
find_master = (
self._multi_precision
and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
lr = self._create_param_lr(param_and_grad)
# create the adam optimize op
if framework.in_dygraph_mode():
found_inf = self._get_auxiliary_var('found_inf')
_beta1 = self._beta1 if not isinstance(
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
_beta1 = (
self._beta1
if not isinstance(self._beta1, Variable)
else self._beta1.numpy().item(0)
)
_beta2 = (
self._beta2
if not isinstance(self._beta2, Variable)
else self._beta2.numpy().item(0)
)
_, _, _, _, _, _ = _C_ops.adam_(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, master_weight, found_inf, _beta1,
_beta2, self._epsilon, self._lazy_mode, 1000, find_master,
False)
param_and_grad[0],
param_and_grad[1],
lr,
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
found_inf,
_beta1,
_beta2,
self._epsilon,
self._lazy_mode,
1000,
find_master,
False,
)
return None
if framework._in_legacy_dygraph():
_beta1 = self._beta1 if not isinstance(
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
_beta1 = (
self._beta1
if not isinstance(self._beta1, Variable)
else self._beta1.numpy().item(0)
)
_beta2 = (
self._beta2
if not isinstance(self._beta2, Variable)
else self._beta2.numpy().item(0)
)
_, _, _, _, _, _ = _legacy_C_ops.adam(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0],
moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight,
'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode,
'min_row_size_to_use_multithread', 1000, 'beta1', _beta1,
'beta2', _beta2, 'multi_precision', find_master)
param_and_grad[0],
param_and_grad[1],
lr,
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
param_and_grad[0],
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
'epsilon',
self._epsilon,
'lazy_mode',
self._lazy_mode,
'min_row_size_to_use_multithread',
1000,
'beta1',
_beta1,
'beta2',
_beta2,
'multi_precision',
find_master,
)
return None
......@@ -373,7 +450,7 @@ class Adam(Optimizer):
"Moment1": [moment1],
"Moment2": [moment2],
"Beta1Pow": [beta1_pow_acc],
"Beta2Pow": [beta2_pow_acc]
"Beta2Pow": [beta2_pow_acc],
}
outputs = {
"ParamOut": [param_and_grad[0]],
......@@ -385,7 +462,7 @@ class Adam(Optimizer):
attrs = {
"lazy_mode": self._lazy_mode,
"min_row_size_to_use_multithread": 1000,
"multi_precision": find_master
"multi_precision": find_master,
}
if isinstance(self._beta1, Variable):
......@@ -405,11 +482,13 @@ class Adam(Optimizer):
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
adam_op = block.append_op(type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True)
adam_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return adam_op
......@@ -426,7 +505,7 @@ class Adam(Optimizer):
.. code-block:: python
import paddle
a = paddle.rand([2,13], dtype="float32")
linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
......@@ -445,27 +524,34 @@ class Adam(Optimizer):
if param._grad_ivar() is not None:
grad_var = param._grad_ivar()
if in_dygraph_mode():
if hasattr(grad_var, "is_selected_rows"
) and grad_var.is_selected_rows(
) and self.regularization is not None:
if (
hasattr(grad_var, "is_selected_rows")
and grad_var.is_selected_rows()
and self.regularization is not None
):
raise RuntimeError(
"Adam don't support weight_decay with sparse parameters, please set it to None."
)
else:
if hasattr(
grad_var, "_is_sparse") and grad_var._is_sparse(
) and self.regularization is not None:
if (
hasattr(grad_var, "_is_sparse")
and grad_var._is_sparse()
and self.regularization is not None
):
raise RuntimeError(
"Adam don't support weight_decay with sparse parameters, please set it to None."
)
params_grads.append((param, grad_var))
optimize_ops = self._apply_optimize(loss=None,
startup_program=None,
params_grads=params_grads)
optimize_ops = self._apply_optimize(
loss=None,
startup_program=None,
params_grads=params_grads,
param_group_idx=0,
)
else:
# optimize parameters in groups
for param_group in self._param_groups:
for idx, param_group in enumerate(self._param_groups):
params_grads = defaultdict(lambda: list())
for param in param_group['params']:
if param.stop_gradient:
......@@ -474,13 +560,16 @@ class Adam(Optimizer):
grad_var = param._grad_ivar()
params_grads['params'].append((param, grad_var))
params_grads.update(
{k: v
for k, v in param_group.items() if k != 'params'})
self._apply_optimize(loss=None,
startup_program=None,
params_grads=params_grads)
{k: v for k, v in param_group.items() if k != 'params'}
)
self._apply_optimize(
loss=None,
startup_program=None,
params_grads=params_grads,
param_group_idx=idx,
)
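# A hedged usage sketch of the case this change targets: parameters given as a list of
# dicts together with use_multi_tensor=True, so each group is routed to its own slot
# (param_group_idx) in the multi-tensor dicts rather than all groups sharing one flat list.
# Assumes Paddle 2.4 dygraph mode.
import paddle

linear_1 = paddle.nn.Linear(10, 10)
linear_2 = paddle.nn.Linear(10, 3)
adam = paddle.optimizer.Adam(
    learning_rate=0.1,
    parameters=[
        {'params': linear_1.parameters()},
        {'params': linear_2.parameters(), 'learning_rate': 0.01},
    ],
    use_multi_tensor=True,
)
loss = paddle.mean(linear_2(linear_1(paddle.rand([4, 10]))))
loss.backward()
adam.step()
adam.clear_grad()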
def _multi_tensor_init(self, target_block, parameters):
def _multi_tensor_init(self, target_block, parameters, param_group_idx):
"""
All parameters used in the optimizer's calculations (such as parameters, master_weight, and velocity_acc for momentum) are grouped into Python lists by data type (float16, float32).
This function will be overridden in the corresponding optimizer file.
......@@ -492,26 +581,49 @@ class Adam(Optimizer):
for param in parameters:
moment1 = self._get_accumulator(self._moment1_acc_str, param)
moment2 = self._get_accumulator(self._moment2_acc_str, param)
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param)
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
param)
beta1_pow_acc = self._get_accumulator(
self._beta1_pow_acc_str, param
)
beta2_pow_acc = self._get_accumulator(
self._beta2_pow_acc_str, param
)
if param.dtype == paddle.float32:
self._param_dict['FP32_LODTensor'].append(param)
self._moment1_dict['FP32_LODTensor'].append(moment1)
self._moment2_dict['FP32_LODTensor'].append(moment2)
self._beta1_pow_acc_dict['FP32_LODTensor'].append(beta1_pow_acc)
self._beta2_pow_acc_dict['FP32_LODTensor'].append(beta2_pow_acc)
self._param_dict['FP32_LODTensor'][param_group_idx].append(
param
)
self._moment1_dict['FP32_LODTensor'][param_group_idx].append(
moment1
)
self._moment2_dict['FP32_LODTensor'][param_group_idx].append(
moment2
)
self._beta1_pow_acc_dict['FP32_LODTensor'][
param_group_idx
].append(beta1_pow_acc)
self._beta2_pow_acc_dict['FP32_LODTensor'][
param_group_idx
].append(beta2_pow_acc)
elif param.dtype == paddle.float16:
self._param_dict['FP16_LODTensor'].append(param)
self._moment1_dict['FP16_LODTensor'].append(moment1)
self._moment2_dict['FP16_LODTensor'].append(moment2)
self._beta1_pow_acc_dict['FP16_LODTensor'].append(beta1_pow_acc)
self._beta2_pow_acc_dict['FP16_LODTensor'].append(beta2_pow_acc)
self._param_dict['FP16_LODTensor'][param_group_idx].append(
param
)
self._moment1_dict['FP16_LODTensor'][param_group_idx].append(
moment1
)
self._moment2_dict['FP16_LODTensor'][param_group_idx].append(
moment2
)
self._beta1_pow_acc_dict['FP16_LODTensor'][
param_group_idx
].append(beta1_pow_acc)
self._beta2_pow_acc_dict['FP16_LODTensor'][
param_group_idx
].append(beta2_pow_acc)
if self._multi_precision:
self._master_weight_dict['FP16_LODTensor'].append(
self._master_weights[param.name])
self._master_weight_dict['FP16_LODTensor'][
param_group_idx
].append(self._master_weights[param.name])
else:
self._master_weight_dict['FP16_LODTensor'] = None
else:
......@@ -519,9 +631,13 @@ class Adam(Optimizer):
"Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR."
)
def _append_optimize_multi_tensor_op(self, target_block,
parameters_and_grads):
"""
def _append_optimize_multi_tensor_op(
self,
target_block,
parameters_and_grads,
param_group_idx,
):
"""
For the multi-tensor path, append the merged optimize operator to the block.
"""
assert isinstance(target_block, framework.Block)
......@@ -534,15 +650,19 @@ class Adam(Optimizer):
if param_and_grad[1] is None:
continue
if param_and_grad[0].stop_gradient is False:
if param_and_grad[
0].dtype == paddle.float32 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
if (
param_and_grad[0].dtype == paddle.float32
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP32_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP32_LODTensor'].append(lr)
elif param_and_grad[
0].dtype == paddle.float16 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
elif (
param_and_grad[0].dtype == paddle.float16
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP16_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP16_LODTensor'].append(lr)
......@@ -553,97 +673,149 @@ class Adam(Optimizer):
if param_and_grad[0].stop_gradient is False:
param_grad_dict = dict()
param_grad_dict['params'] = param_and_grad
param_grad_dict.update({
k: v
for k, v in parameters_and_grads.items()
if k != 'params'
})
param_grad_dict.update(
{
k: v
for k, v in parameters_and_grads.items()
if k != 'params'
}
)
param_and_grad = self._update_param_group(param_grad_dict)
if param_and_grad[
0].dtype == paddle.float32 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
if (
param_and_grad[0].dtype == paddle.float32
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP32_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP32_LODTensor'].append(lr)
elif param_and_grad[
0].dtype == paddle.float16 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
elif (
param_and_grad[0].dtype == paddle.float16
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP16_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP16_LODTensor'].append(lr)
multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor']
for key in multi_tensor_list:
if len(self._param_dict[key]) > 0:
if len(self._param_dict[key][param_group_idx]) > 0:
find_master = self._multi_precision and key == 'FP16_LODTensor'
_beta1 = self._beta1 if not isinstance(
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
_beta1 = (
self._beta1
if not isinstance(self._beta1, Variable)
else self._beta1.numpy().item(0)
)
_beta2 = (
self._beta2
if not isinstance(self._beta2, Variable)
else self._beta2.numpy().item(0)
)
if framework._non_static_mode():
master_weight = self._master_weight_dict[key]
master_weight = (
master_weight[param_group_idx]
if master_weight is not None
else None
)
if in_dygraph_mode():
_, _, _, _, _, _ = _C_ops.merged_adam_(
self._param_dict[key], grad_dict[key], lr_dict[key],
self._moment1_dict[key], self._moment2_dict[key],
self._beta1_pow_acc_dict[key],
self._beta2_pow_acc_dict[key],
self._master_weight_dict[key], _beta1, _beta2,
self._epsilon, find_master, False)
self._param_dict[key][param_group_idx],
grad_dict[key],
lr_dict[key],
self._moment1_dict[key][param_group_idx],
self._moment2_dict[key][param_group_idx],
self._beta1_pow_acc_dict[key][param_group_idx],
self._beta2_pow_acc_dict[key][param_group_idx],
master_weight,
_beta1,
_beta2,
self._epsilon,
find_master,
False,
)
else:
_, _, _, _, _, _ = _legacy_C_ops.merged_adam(
self._param_dict[key], grad_dict[key], lr_dict[key],
self._moment1_dict[key], self._moment2_dict[key],
self._beta1_pow_acc_dict[key],
self._beta2_pow_acc_dict[key],
self._master_weight_dict[key],
self._param_dict[key], self._moment1_dict[key],
self._moment2_dict[key],
self._beta1_pow_acc_dict[key],
self._beta2_pow_acc_dict[key],
self._master_weight_dict[key], 'epsilon',
self._epsilon, 'beta1', _beta1, 'beta2', _beta2,
'multi_precision', find_master)
self._param_dict[key][param_group_idx],
grad_dict[key],
lr_dict[key],
self._moment1_dict[key][param_group_idx],
self._moment2_dict[key][param_group_idx],
self._beta1_pow_acc_dict[key][param_group_idx],
self._beta2_pow_acc_dict[key][param_group_idx],
master_weight,
self._param_dict[key][param_group_idx],
self._moment1_dict[key][param_group_idx],
self._moment2_dict[key][param_group_idx],
self._beta1_pow_acc_dict[key][param_group_idx],
self._beta2_pow_acc_dict[key][param_group_idx],
master_weight,
'epsilon',
self._epsilon,
'beta1',
_beta1,
'beta2',
_beta2,
'multi_precision',
find_master,
)
else:
inputs = {
"Param": self._param_dict[key],
"Param": self._param_dict[key][param_group_idx],
"Grad": grad_dict[key],
"LearningRate": lr_dict[key],
"Moment1": self._moment1_dict[key],
"Moment2": self._moment2_dict[key],
"Beta1Pow": self._beta1_pow_acc_dict[key],
"Beta2Pow": self._beta2_pow_acc_dict[key]
"Moment1": self._moment1_dict[key][param_group_idx],
"Moment2": self._moment2_dict[key][param_group_idx],
"Beta1Pow": self._beta1_pow_acc_dict[key][
param_group_idx
],
"Beta2Pow": self._beta2_pow_acc_dict[key][
param_group_idx
],
}
outputs = {
"ParamOut": self._param_dict[key],
"Moment1Out": self._moment1_dict[key],
"Moment2Out": self._moment2_dict[key],
"Beta1PowOut": self._beta1_pow_acc_dict[key],
"Beta2PowOut": self._beta2_pow_acc_dict[key]
"ParamOut": self._param_dict[key][param_group_idx],
"Moment1Out": self._moment1_dict[key][param_group_idx],
"Moment2Out": self._moment2_dict[key][param_group_idx],
"Beta1PowOut": self._beta1_pow_acc_dict[key][
param_group_idx
],
"Beta2PowOut": self._beta2_pow_acc_dict[key][
param_group_idx
],
}
attrs = {
"epsilon": self._epsilon,
"beta1": _beta1,
"beta2": _beta2
"beta2": _beta2,
}
if find_master:
inputs["MasterParam"] = self._master_weight_dict[key]
inputs["MasterParam"] = self._master_weight_dict[key][
param_group_idx
]
outputs["MasterParamOut"] = self._master_weight_dict[
key]
key
][param_group_idx]
attrs["multi_precision"] = find_master
target_block.append_op(type="merged_adam",
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True)
target_block.append_op(
type="merged_adam",
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return None
def _update_param_group(self, parameters):
self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
self._beta2 = parameters.get('beta2', self._default_dict['beta2'])
self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
self._lazy_mode = parameters.get('lazy_mode',
self._default_dict['lazy_mode'])
self._lazy_mode = parameters.get(
'lazy_mode', self._default_dict['lazy_mode']
)
parameters = parameters.get('params')
return parameters
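# A small illustrative sketch of what _update_param_group enables: a parameter group may
# carry its own beta1/beta2/epsilon/lazy_mode (and weight_decay), and those values replace
# the optimizer-level defaults only while that group's ops are built. Example values are
# assumed, not taken from this change.
import paddle

backbone = paddle.nn.Linear(8, 8)
head = paddle.nn.Linear(8, 2)
adam = paddle.optimizer.Adam(
    learning_rate=0.001,
    parameters=[
        {'params': backbone.parameters()},
        {'params': head.parameters(), 'beta1': 0.8, 'weight_decay': 0.01},
    ],
)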
......@@ -123,29 +123,35 @@ class Momentum(Optimizer):
"""
_velocity_acc_str = "velocity"
def __init__(self,
learning_rate=0.001,
momentum=0.9,
parameters=None,
use_nesterov=False,
weight_decay=None,
grad_clip=None,
multi_precision=False,
rescale_grad=1.0,
use_multi_tensor=False,
name=None):
def __init__(
self,
learning_rate=0.001,
momentum=0.9,
parameters=None,
use_nesterov=False,
weight_decay=None,
grad_clip=None,
multi_precision=False,
rescale_grad=1.0,
use_multi_tensor=False,
name=None,
):
if learning_rate is None:
raise ValueError("learning_rate is not set")
if momentum is None:
raise ValueError("momentum is not set")
predicate = lambda regular: isinstance(regular,
(L2DecayRegularizer, float))
predicate = lambda regular: isinstance(
regular, (L2DecayRegularizer, float)
)
if isinstance(parameters, list):
if isinstance(parameters[0], dict):
for param_group in parameters:
decay = param_group[
'weight_decay'] if 'weight_decay' in param_group else weight_decay
decay = (
param_group['weight_decay']
if 'weight_decay' in param_group
else weight_decay
)
reg_method, reg_coeff = self._update_regularization(decay)
param_group['regularization_method'] = reg_method
param_group['regularization_coeff'] = reg_coeff
......@@ -153,16 +159,20 @@ class Momentum(Optimizer):
param_group['weight_decay'] = py_regular
py_regular = None if predicate(weight_decay) else weight_decay
super(Momentum, self).__init__(learning_rate=learning_rate,
parameters=parameters,
weight_decay=py_regular,
grad_clip=grad_clip,
name=name)
super(Momentum, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
weight_decay=py_regular,
grad_clip=grad_clip,
name=name,
)
self.type = "momentum"
self._momentum = momentum
self._use_nesterov = bool(use_nesterov)
self._regularization_method, self._regularization_coeff = self._update_regularization(
weight_decay)
(
self._regularization_method,
self._regularization_coeff,
) = self._update_regularization(weight_decay)
self._multi_precision = multi_precision
self._rescale_grad = rescale_grad
self._master_weights = {}
......@@ -176,29 +186,21 @@ class Momentum(Optimizer):
}
self._use_multi_tensor = use_multi_tensor
if self._use_multi_tensor:
self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
self._velocity_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
self._master_weight_dict = {
'FP32_LODTensor': None,
'FP16_LODTensor': []
}
self._regularization_method_dict = {
'FP32_LODTensor': [],
'FP16_LODTensor': []
}
self._regularization_coeff_dict = {
'FP32_LODTensor': [],
'FP16_LODTensor': []
}
self._param_dict = self._create_multi_tensor_dict()
self._velocity_dict = self._create_multi_tensor_dict()
self._master_weight_dict = self._create_multi_tensor_dict()
self._master_weight_dict['FP32_LODTensor'] = None
self._regularization_method_dict = self._create_multi_tensor_dict()
self._regularization_coeff_dict = self._create_multi_tensor_dict()
def _update_regularization(self, weight_decay):
reg_method = ""
reg_coeff = 0.0
if (isinstance(weight_decay, L2DecayRegularizer)):
if isinstance(weight_decay, L2DecayRegularizer):
reg_method = "l2_decay"
reg_coeff = weight_decay._regularization_coeff
if (isinstance(weight_decay, float)):
if isinstance(weight_decay, float):
reg_method = "l2_decay"
reg_coeff = weight_decay
return reg_method, reg_coeff
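# A hedged sketch of the two weight_decay forms this helper normalizes: a bare float or an
# L2DecayRegularizer both become regularization_method='l2_decay' and are fused into the
# momentum op, while other decay types are applied through the regular regularization ops.
import paddle

linear = paddle.nn.Linear(6, 6)
m_float = paddle.optimizer.Momentum(
    learning_rate=0.1, momentum=0.9, parameters=linear.parameters(), weight_decay=0.01
)
m_l2 = paddle.optimizer.Momentum(
    learning_rate=0.1,
    momentum=0.9,
    parameters=linear.parameters(),
    weight_decay=paddle.regularizer.L2Decay(0.01),
)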
......@@ -211,19 +213,23 @@ class Momentum(Optimizer):
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = layers.create_global_var(name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True)
var = layers.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32
})
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
......@@ -239,15 +245,22 @@ class Momentum(Optimizer):
"""
if self._name is not None:
name = self._name + "_" + name
find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
target_param = self._master_weights[
param.name] if find_master else param
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (name not in self._accumulators
or target_name not in self._accumulators[name]):
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name))
name, target_name
)
)
return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters):
......@@ -265,7 +278,10 @@ class Momentum(Optimizer):
master_p = self._create_master_weight(p)
self._add_accumulator(self._velocity_acc_str, master_p)
continue
if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
if (
p.dtype == core.VarDesc.VarType.FP16
and not self._multi_precision
):
warnings.warn(
"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Momentum optimizer."
......@@ -273,25 +289,28 @@ class Momentum(Optimizer):
self._add_accumulator(self._velocity_acc_str, p)
def _create_regularization_of_grad(self, param, grad, regularization=None):
""" Create and add backward regularization Operators
"""Create and add backward regularization Operators
Function helper of append_regularization_ops.
"""
# If ParamAttr is set to L2Decay, we skip doing regularization here. And then we fused
# L2Decay with momentum which can refer to _append_optimize_op below.
if hasattr(param, 'regularizer') and isinstance(param.regularizer,
L2DecayRegularizer):
if hasattr(param, 'regularizer') and isinstance(
param.regularizer, L2DecayRegularizer
):
return grad
return super(Momentum, self)._create_regularization_of_grad(
param, grad, regularization)
param, grad, regularization
)
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
velocity_acc = self._get_accumulator(self._velocity_acc_str,
param_and_grad[0])
velocity_acc = self._get_accumulator(
self._velocity_acc_str, param_and_grad[0]
)
lr = self._create_param_lr(param_and_grad)
# For fusion of momentum and l2decay
......@@ -308,30 +327,56 @@ class Momentum(Optimizer):
regularization_method = ""
regularization_coeff = 0.0
find_master = self._multi_precision and param_and_grad[
0].dtype == core.VarDesc.VarType.FP16
master_weight = (self._master_weights[param_and_grad[0].name]
if find_master else None)
find_master = (
self._multi_precision
and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
if _in_legacy_dygraph():
if isinstance(param_and_grad, dict):
self._update_regularization(param_and_grad['weight_decay'])
_, _, _ = _legacy_C_ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
master_weight, param_and_grad[0], velocity_acc, master_weight,
'mu', self._momentum, 'use_nesterov', self._use_nesterov,
'regularization_method', regularization_method,
'regularization_coeff', regularization_coeff, 'multi_precision',
find_master)
param_and_grad[0],
param_and_grad[1],
velocity_acc,
lr,
master_weight,
param_and_grad[0],
velocity_acc,
master_weight,
'mu',
self._momentum,
'use_nesterov',
self._use_nesterov,
'regularization_method',
regularization_method,
'regularization_coeff',
regularization_coeff,
'multi_precision',
find_master,
)
return None
if in_dygraph_mode():
if isinstance(param_and_grad, dict):
self._update_regularization(param_and_grad['weight_decay'])
return _C_ops.momentum_(param_and_grad[0], param_and_grad[1],
velocity_acc, lr, master_weight,
self._momentum, self._use_nesterov,
regularization_method, regularization_coeff,
find_master, self._rescale_grad)
return _C_ops.momentum_(
param_and_grad[0],
param_and_grad[1],
velocity_acc,
lr,
master_weight,
self._momentum,
self._use_nesterov,
regularization_method,
regularization_coeff,
find_master,
self._rescale_grad,
)
attrs = {
"mu": self._momentum,
......@@ -339,19 +384,19 @@ class Momentum(Optimizer):
"regularization_method": regularization_method,
"regularization_coeff": regularization_coeff,
"multi_precision": find_master,
"rescale_grad": self._rescale_grad
"rescale_grad": self._rescale_grad,
}
inputs = {
"Param": [param_and_grad[0]],
"Grad": [param_and_grad[1]],
"Velocity": [velocity_acc],
"LearningRate": [lr]
"LearningRate": [lr],
}
outputs = {
"ParamOut": [param_and_grad[0]],
"VelocityOut": [velocity_acc]
"VelocityOut": [velocity_acc],
}
if find_master:
......@@ -359,15 +404,17 @@ class Momentum(Optimizer):
outputs["MasterParamOut"] = master_weight
# create the momentum optimize op
momentum_op = block.append_op(type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True)
momentum_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return momentum_op
def _multi_tensor_init(self, target_block, parameters):
def _multi_tensor_init(self, target_block, parameters, param_group_idx):
"""
All parameters used in the optimizer's calculations (such as parameters, master_weight, and velocity_acc for momentum) are grouped into Python lists by data type (float16, float32).
This function will be overridden in the corresponding optimizer file.
......@@ -385,38 +432,59 @@ class Momentum(Optimizer):
# we skip param's l2decay before, so fuse it with momentum here.
if isinstance(param.regularizer, L2DecayRegularizer):
regularization_method = "l2_decay"
regularization_coeff = param.regularizer._regularization_coeff
regularization_coeff = (
param.regularizer._regularization_coeff
)
elif param.regularizer is not None:
regularization_method = ""
regularization_coeff = 0.0
if param.dtype == paddle.float32:
self._param_dict['FP32_LODTensor'].append(param)
self._velocity_dict['FP32_LODTensor'].append(velocity_acc)
self._param_dict['FP32_LODTensor'][param_group_idx].append(
param
)
self._velocity_dict['FP32_LODTensor'][param_group_idx].append(
velocity_acc
)
# fp32 no master weight
self._regularization_method_dict['FP32_LODTensor'].append(
regularization_method)
self._regularization_coeff_dict['FP32_LODTensor'].append(
regularization_coeff)
self._regularization_method_dict['FP32_LODTensor'][
param_group_idx
].append(regularization_method)
self._regularization_coeff_dict['FP32_LODTensor'][
param_group_idx
].append(regularization_coeff)
elif param.dtype == paddle.float16:
self._param_dict['FP16_LODTensor'].append(param)
self._velocity_dict['FP16_LODTensor'].append(velocity_acc)
self._param_dict['FP16_LODTensor'][param_group_idx].append(
param
)
self._velocity_dict['FP16_LODTensor'][param_group_idx].append(
velocity_acc
)
if self._multi_precision:
self._master_weight_dict['FP16_LODTensor'].append(
self._master_weights[param.name])
self._master_weight_dict['FP16_LODTensor'][
param_group_idx
].append(self._master_weights[param.name])
else:
self._master_weight_dict['FP16_LODTensor'] = None
self._regularization_method_dict['FP16_LODTensor'].append(
regularization_method)
self._regularization_coeff_dict['FP16_LODTensor'].append(
regularization_coeff)
self._master_weight_dict['FP16_LODTensor'][
param_group_idx
] = None
self._regularization_method_dict['FP16_LODTensor'][
param_group_idx
].append(regularization_method)
self._regularization_coeff_dict['FP16_LODTensor'][
param_group_idx
].append(regularization_coeff)
else:
raise ValueError(
"Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR."
)
def _append_optimize_multi_tensor_op(self, target_block,
parameters_and_grads):
"""
def _append_optimize_multi_tensor_op(
self,
target_block,
parameters_and_grads,
param_group_idx,
):
"""
For the multi-tensor path, append the merged optimize operator to the block.
"""
assert isinstance(target_block, framework.Block)
......@@ -429,15 +497,19 @@ class Momentum(Optimizer):
if param_and_grad[1] is None:
continue
if param_and_grad[0].stop_gradient is False:
if param_and_grad[
0].dtype == paddle.float32 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
if (
param_and_grad[0].dtype == paddle.float32
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP32_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP32_LODTensor'].append(lr)
elif param_and_grad[
0].dtype == paddle.float16 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
elif (
param_and_grad[0].dtype == paddle.float16
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP16_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP16_LODTensor'].append(lr)
......@@ -448,97 +520,144 @@ class Momentum(Optimizer):
if param_and_grad[0].stop_gradient is False:
param_grad_dict = dict()
param_grad_dict['params'] = param_and_grad
param_grad_dict.update({
k: v
for k, v in parameters_and_grads.items()
if k != 'params'
})
param_grad_dict.update(
{
k: v
for k, v in parameters_and_grads.items()
if k != 'params'
}
)
param_and_grad = self._update_param_group(param_grad_dict)
if param_and_grad[
0].dtype == paddle.float32 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
if (
param_and_grad[0].dtype == paddle.float32
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP32_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP32_LODTensor'].append(lr)
elif param_and_grad[
0].dtype == paddle.float16 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
elif (
param_and_grad[0].dtype == paddle.float16
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP16_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP16_LODTensor'].append(lr)
multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor']
for key in multi_tensor_list:
if len(self._param_dict[key]) > 0:
if len(self._param_dict[key][param_group_idx]) > 0:
find_master = self._multi_precision and key == 'FP16_LODTensor'
master_weight = self._master_weight_dict[key]
master_weight = (
master_weight[param_group_idx]
if master_weight is not None
else None
)
if framework._non_static_mode():
if in_dygraph_mode():
_, _, _ = _C_ops.merged_momentum_(
self._param_dict[key], grad_dict[key],
self._velocity_dict[key], lr_dict[key],
self._master_weight_dict[key], self._momentum,
self._param_dict[key][param_group_idx],
grad_dict[key],
self._velocity_dict[key][param_group_idx],
lr_dict[key],
master_weight,
self._momentum,
self._use_nesterov,
self._regularization_method_dict[key],
self._regularization_coeff_dict[key], find_master,
self._rescale_grad)
self._regularization_method_dict[key][
param_group_idx
],
self._regularization_coeff_dict[key][
param_group_idx
],
find_master,
self._rescale_grad,
)
else:
_, _, _ = _legacy_C_ops.merged_momentum(
self._param_dict[key], grad_dict[key],
self._velocity_dict[key], lr_dict[key],
self._master_weight_dict[key],
self._param_dict[key], self._velocity_dict[key],
self._master_weight_dict[key], 'mu', self._momentum,
'use_nesterov', self._use_nesterov,
self._param_dict[key][param_group_idx],
grad_dict[key],
self._velocity_dict[key][param_group_idx],
lr_dict[key],
master_weight,
self._param_dict[key][param_group_idx],
self._velocity_dict[key][param_group_idx],
master_weight,
'mu',
self._momentum,
'use_nesterov',
self._use_nesterov,
'regularization_method',
self._regularization_method_dict[key],
self._regularization_method_dict[key][
param_group_idx
],
'regularization_coeff',
self._regularization_coeff_dict[key],
'multi_precision', find_master)
self._regularization_coeff_dict[key][
param_group_idx
],
'multi_precision',
find_master,
)
else:
inputs = {
"Param": self._param_dict[key],
"Param": self._param_dict[key][param_group_idx],
"Grad": grad_dict[key],
"Velocity": self._velocity_dict[key],
"Velocity": self._velocity_dict[key][param_group_idx],
"LearningRate": lr_dict[key],
}
outputs = {
"ParamOut": self._param_dict[key],
"VelocityOut": self._velocity_dict[key],
"ParamOut": self._param_dict[key][param_group_idx],
"VelocityOut": self._velocity_dict[key][
param_group_idx
],
}
attrs = {
"mu":
self._momentum,
"use_nesterov":
self._use_nesterov,
"regularization_method":
self._regularization_method_dict[key],
"regularization_coeff":
self._regularization_coeff_dict[key],
"mu": self._momentum,
"use_nesterov": self._use_nesterov,
"regularization_method": self._regularization_method_dict[
key
][
param_group_idx
],
"regularization_coeff": self._regularization_coeff_dict[
key
][param_group_idx],
}
if find_master:
inputs["MasterParam"] = self._master_weight_dict[key]
inputs["MasterParam"] = self._master_weight_dict[key][
param_group_idx
]
outputs["MasterParamOut"] = self._master_weight_dict[
key]
key
][param_group_idx]
attrs["multi_precision"] = find_master
target_block.append_op(type="merged_momentum",
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True)
target_block.append_op(
type="merged_momentum",
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True,
)
return None
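# A hedged NumPy sketch of the per-parameter update the merged momentum op applies (the
# fused l2_decay simply adds coeff * param to the gradient first). Names are illustrative,
# not Paddle API.
import numpy as np

lr, mu, coeff = 0.1, 0.9, 0.01
param, grad, velocity = np.ones(3), np.full(3, 0.5), np.zeros(3)

grad = grad + coeff * param            # regularization_method == 'l2_decay'
velocity = mu * velocity + grad
param = param - lr * velocity          # use_nesterov=False
# Nesterov variant: param = param - lr * (grad + mu * velocity)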
def _update_param_group(self, parameters):
self._momentum = parameters.get('momentum',
self._default_dict['momentum'])
self._use_nesterov = parameters.get('use_nesterov',
self._default_dict['use_nesterov'])
self._rescale_grad = parameters.get('rescale_grad',
self._default_dict['rescale_grad'])
self._momentum = parameters.get(
'momentum', self._default_dict['momentum']
)
self._use_nesterov = parameters.get(
'use_nesterov', self._default_dict['use_nesterov']
)
self._rescale_grad = parameters.get(
'rescale_grad', self._default_dict['rescale_grad']
)
self._regularization_method = parameters.get(
'regularization_method',
self._default_dict['regularization_method'])
'regularization_method', self._default_dict['regularization_method']
)
self._regularization_coeff = parameters.get(
'regularization_coeff', self._default_dict['regularization_coeff'])
'regularization_coeff', self._default_dict['regularization_coeff']
)
parameters = parameters.get('params')
return parameters
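# A hedged sketch of the per-group overrides read above: a parameter group may set its own
# 'momentum', 'use_nesterov', 'rescale_grad' or 'weight_decay', which apply only to that
# group. Example values are assumed.
import paddle

l1, l2 = paddle.nn.Linear(4, 4), paddle.nn.Linear(4, 2)
opt = paddle.optimizer.Momentum(
    learning_rate=0.1,
    momentum=0.9,
    parameters=[
        {'params': l1.parameters()},
        {'params': l2.parameters(), 'momentum': 0.99, 'weight_decay': 0.001},
    ],
)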
......@@ -21,13 +21,30 @@ from collections import defaultdict
import paddle
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
from paddle.fluid.framework import (
Program,
Variable,
name_scope,
default_main_program,
default_startup_program,
device_guard,
)
from ..fluid import framework
from ..fluid import layers
from ..fluid import unique_name
from ..fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
from ..fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops
from ..fluid.backward import (
append_backward,
_some_in_set_,
_append_grad_suffix_,
_get_no_grad_set_name,
)
from ..fluid.clip import (
GradientClipBase,
GradientClipByNorm,
error_clip_callback,
append_gradient_clip_ops,
)
from ..fluid.framework import program_guard, Parameter
from ..fluid.initializer import Constant
from ..fluid.layer_helper import LayerHelper
......@@ -42,24 +59,36 @@ from .. import compat as cpt
from .lr import LRScheduler
import copy
from paddle import _C_ops, _legacy_C_ops
from paddle.fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check, _current_expected_place, in_dygraph_mode
from paddle.fluid.framework import (
_in_legacy_dygraph,
_in_eager_without_dygraph_check,
_current_expected_place,
in_dygraph_mode,
)
__all__ = []
@framework.static_only
def append_backward_new(loss_list,
parameter_list=None,
no_grad_set=None,
callbacks=None,
checkpoints=None,
distop_context=None):
def append_backward_new(
loss_list,
parameter_list=None,
no_grad_set=None,
callbacks=None,
checkpoints=None,
distop_context=None,
):
from paddle.incubate.autograd.primx import orig2prim, Transform
program = default_main_program()
assert program.num_blocks == 1, "The append_backward_new interface is designed to process only one block."
assert (
program.num_blocks == 1
), "The append_backward_new interface is designed to process only one block."
block = program.current_block()
for el in loss_list:
assert el.block == block, f'variable in loss_list should be in current block of main program'
assert (
el.block == block
), f'variable in loss_list should be in current block of main program'
orig2prim(block)
ad = Transform(block)
......@@ -163,12 +192,14 @@ class Optimizer(object):
"""
@imperative_base.no_grad
def __init__(self,
learning_rate,
parameters=None,
weight_decay=None,
grad_clip=None,
name=None):
def __init__(
self,
learning_rate,
parameters=None,
weight_decay=None,
grad_clip=None,
name=None,
):
if parameters is not None:
# paddle.Tensor is also iterable, so here we don't check whether
......@@ -177,13 +208,16 @@ class Optimizer(object):
if isinstance(parameters, (paddle.Tensor, core.eager.Tensor)):
raise TypeError(
"`parameters` argument given to the optimizer should be "
"an iterable of paddle Tensors, but got argument type is `{}`."
.format(type(parameters)))
"an iterable of paddle Tensors, but got argument type is `{}`.".format(
type(parameters)
)
)
if isinstance(parameters, dict):
raise TypeError(
"`parameters` argument should not get dict type, "
"if parameter groups is needed, please set `parameters`"
" as list of dict")
" as list of dict"
)
self._parameter_list = list(parameters)
else:
self._parameter_list = None
......@@ -197,18 +231,22 @@ class Optimizer(object):
if weight_decay is not None:
if not isinstance(self._parameter_list[0], dict):
for param in self._parameter_list:
if hasattr(param, 'regularizer'
) and param.regularizer is not None:
if (
hasattr(param, 'regularizer')
and param.regularizer is not None
):
logging.info(
"If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. "
"The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
% weight_decay.__str__())
% weight_decay.__str__()
)
break
if not isinstance(learning_rate, (float, LRScheduler)):
raise TypeError(
"learning rate should be float or LRScheduler, got %s here" %
type(learning_rate))
"learning rate should be float or LRScheduler, got %s here"
% type(learning_rate)
)
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
raise TypeError(
......@@ -216,6 +254,7 @@ class Optimizer(object):
)
if isinstance(weight_decay, float):
from ..fluid.regularizer import L2Decay
self.regularization = L2Decay(weight_decay)
else:
self.regularization = weight_decay
......@@ -227,8 +266,9 @@ class Optimizer(object):
if self._parameter_list:
if isinstance(self._parameter_list[0], dict):
for param_group in self._parameter_list:
assert 'params' in param_group, \
'params should be set in parameters if parameter groups are optimized in different options'
assert (
'params' in param_group
), 'params should be set in parameters if parameter groups are optimized in different options'
self._dtype = self._parameter_list[0]['params'][0].dtype
else:
self._dtype = self._parameter_list[0].dtype
......@@ -248,7 +288,7 @@ class Optimizer(object):
self.clear_gradients = self.clear_grad
self._default_dict = {
'weight_decay': self.regularization,
'grad_clip': self._grad_clip
'grad_clip': self._grad_clip,
}
self._param_groups = []
......@@ -261,13 +301,20 @@ class Optimizer(object):
# NOTE: Multi Tensor: Pass in all parameters and gradients to the op kernel of the Optimizer at one time for updating for dygraph mode.
# Optimizer support list: [ paddle.optimizer.Momentum, paddle.optimizer.Adam].
self._use_multi_tensor = None
self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
self._param_dict = self._create_multi_tensor_dict()
self._auxiliary_vars = {}
def _set_auxiliary_var(self, key, val):
self._auxiliary_vars[key] = val
def _create_multi_tensor_dict(self):
n = len(self._param_groups) if self._param_groups is not None else 1
return {
'FP32_LODTensor': [[] for _ in range(n)],
'FP16_LODTensor': [[] for _ in range(n)],
}
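# A minimal illustration (assuming two parameter groups) of the structure built by
# _create_multi_tensor_dict: each dtype key maps to one inner list per group, so entries
# are always addressed as some_dict[key][param_group_idx].
example = {
    'FP32_LODTensor': [[], []],   # slot for group 0, slot for group 1
    'FP16_LODTensor': [[], []],
}
example['FP32_LODTensor'][1].append('param_of_group_1')  # illustrative only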
def _get_auxiliary_var(self, key):
return self._auxiliary_vars.get(key, None)
......@@ -277,12 +324,12 @@ class Optimizer(object):
Get state dict information from the optimizer. It contains all the tensors used by the optimizer. For the Adam optimizer, it contains beta1, beta2, momentum, etc. If an LRScheduler has been used, global_step will be included in the state dict.
If the optimizer has never been called (via its minimize function), the state_dict is empty.
Args:
Args:
None
Returns:
state_dict(dict) : dict contains all the Tensor used by optimizer
Examples:
.. code-block:: python
......@@ -311,11 +358,11 @@ class Optimizer(object):
'''
Load the optimizer state dict. For the Adam optimizer, it contains beta1, beta2, momentum, etc. If an LRScheduler has been used, global_step will be changed.
Args:
Args:
state_dict(dict) : Dict contains all the Tensor needed by optimizer
Return:
None
Examples:
.. code-block:: python
......@@ -326,7 +373,7 @@ class Optimizer(object):
layer_state_dict = emb.state_dict()
paddle.save(layer_state_dict, "emb.pdparams")
scheduler = paddle.optimizer.lr.NoamDecay(
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam(
learning_rate=scheduler,
......@@ -353,8 +400,9 @@ class Optimizer(object):
self._accumulators_holder = state_dict
for k, v in self._accumulators.items():
for para_name, var_tmp in v.items():
assert var_tmp.name in state_dict, \
"optimizer Tensor {} not found".format( var_tmp.name )
assert (
var_tmp.name in state_dict
), "optimizer Tensor {} not found".format(var_tmp.name)
var = var_tmp.value()
tensor = var.get_tensor()
model_np = np.array(tensor)
......@@ -368,16 +416,23 @@ class Optimizer(object):
elif isinstance(load_para, np.ndarray):
load_para_np = load_para
else:
raise RuntimeError("State dict type {} not supprt".format(
str(type(load_para))))
assert model_np.shape == load_para_np.shape, \
"Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format(
model_np.name, model_np.shape, load_para_np.shape)
raise RuntimeError(
"State dict type {} not supprt".format(
str(type(load_para))
)
)
assert (
model_np.shape == load_para_np.shape
), "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format(
model_np.name, model_np.shape, load_para_np.shape
)
assert model_np.dtype == load_para_np.dtype, \
"Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format(
model_np.name, model_np.dtype, load_para_np.dtype)
assert (
model_np.dtype == load_para_np.dtype
), "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format(
model_np.name, model_np.dtype, load_para_np.dtype
)
tensor.set(load_para_np, framework._current_expected_place())
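# A hedged end-to-end sketch of the save/load round trip these checks guard (every
# optimizer tensor in the loaded dict must match the saved shape and dtype).
import paddle

emb = paddle.nn.Embedding(10, 10)
adam = paddle.optimizer.Adam(learning_rate=0.001, parameters=emb.parameters())
paddle.save(adam.state_dict(), "adam.pdopt")
adam.set_state_dict(paddle.load("adam.pdopt"))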
......@@ -386,51 +441,63 @@ class Optimizer(object):
def _create_global_learning_rate(self):
# lr var can't be float16, for pure fp16 training, should extra handle the dtype for lr
_lr_dtype = paddle.get_default_dtype(
) if self._dtype is None else self._dtype
_lr_dtype = paddle.float32 if (
paddle.get_default_dtype() != "float16"
and _lr_dtype == paddle.float16) else _lr_dtype
_lr_dtype = (
paddle.get_default_dtype() if self._dtype is None else self._dtype
)
_lr_dtype = (
paddle.float32
if (
paddle.get_default_dtype() != "float16"
and _lr_dtype == paddle.float16
)
else _lr_dtype
)
if isinstance(self._learning_rate, LRScheduler):
lr_var = self._global_learning_rate()
# only create global lr_var once
if not isinstance(lr_var, framework.Variable):
lr_name = unique_name.generate('learning_rate')
self._learning_rate._var_name = lr_name
lr_var = self.helper.create_global_variable(name=lr_name,
shape=[1],
persistable=True,
stop_gradient=True,
dtype=_lr_dtype)
lr_var = self.helper.create_global_variable(
name=lr_name,
shape=[1],
persistable=True,
stop_gradient=True,
dtype=_lr_dtype,
)
main_prog = framework.default_main_program()
main_prog.lr_sheduler = self._learning_rate
main_prog.lr_var = lr_var
self._learning_rate_map[
framework.default_main_program()] = lr_var
framework.default_main_program()
] = lr_var
lr_value = float(self._learning_rate())
self.helper.set_variable_initializer(
lr_var, initializer=Constant(value=lr_value))
lr_var, initializer=Constant(value=lr_value)
)
elif isinstance(self._learning_rate, float):
# only create global lr_var once
lr = self._global_learning_rate()
if isinstance(lr, framework.Variable):
return
else:
self._learning_rate_map[framework.default_main_program(
)] = layers.create_global_var(
self._learning_rate_map[
framework.default_main_program()
] = layers.create_global_var(
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(self._learning_rate),
dtype=_lr_dtype,
persistable=True)
persistable=True,
)
@framework.dygraph_only
def set_lr(self, value):
"""
:api_attr: imperative
Set the value of the learning rate manually in the optimizer. If the optimizer uses an LRScheduler,
this API cannot be invoked, because it would lead to a conflict.
......@@ -439,7 +506,7 @@ class Optimizer(object):
Returns:
None
Examples:
.. code-block:: python
......@@ -465,7 +532,8 @@ class Optimizer(object):
if not isinstance(value, (int, float)):
raise TypeError(
"The type of 'value' in optimizer.set_lr must be float, but received %s."
% (type(value)))
% (type(value))
)
if isinstance(self._learning_rate, LRScheduler):
raise RuntimeError(
"optimizer's learning rate can't be LRScheduler when invoke this API, because this will lead to conflict."
......@@ -475,27 +543,40 @@ class Optimizer(object):
if current_lr is not None:
if in_dygraph_mode():
place = _current_expected_place()
_C_ops.full_(current_lr, list(current_lr.shape), float(value),
current_lr.dtype, place)
_C_ops.full_(
current_lr,
list(current_lr.shape),
float(value),
current_lr.dtype,
place,
)
elif _in_legacy_dygraph():
_legacy_C_ops.fill_constant(current_lr, 'value', float(value),
'dtype', current_lr.dtype, 'shape',
list(current_lr.shape))
_legacy_C_ops.fill_constant(
current_lr,
'value',
float(value),
'dtype',
current_lr.dtype,
'shape',
list(current_lr.shape),
)
else:
global_block = framework.default_main_program().global_block()
global_block.append_op(type='fill_constant',
outputs={'Out': [current_lr]},
attrs={
'dtype': current_lr.dtype,
'shape': list(current_lr.shape),
'value': float(value)
},
stop_gradient=True)
global_block.append_op(
type='fill_constant',
outputs={'Out': [current_lr]},
attrs={
'dtype': current_lr.dtype,
'shape': list(current_lr.shape),
'value': float(value),
},
stop_gradient=True,
)
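# A hedged sketch of manual learning-rate control in dygraph; set_lr raises if an
# LRScheduler is attached, as enforced above.
import paddle

linear = paddle.nn.Linear(10, 10)
adam = paddle.optimizer.Adam(learning_rate=0.1, parameters=linear.parameters())
for lr in (0.2, 0.3, 0.4):
    adam.set_lr(lr)
    print(adam.get_lr())   # expected to track the value just set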
def get_lr(self):
"""
Get current learning rate of optimizer.
Get current learning rate of optimizer.
If 'LRScheduler' is not used, the return value is always the same.
If 'LRScheduler' is used, the return value is the current scheduled learning rate.
......@@ -565,8 +646,7 @@ class Optimizer(object):
return self._learning_rate_map.get(program, None)
def _append_optimize_op(self, block, param_and_grad):
""" append optimize operator to block and return all the added optimize_op
"""
"""append optimize operator to block and return all the added optimize_op"""
raise NotImplementedError(
"Class \"Optimizer\" connot be used directly as an optimizer, please use its subclasses such as \"Adam\""
)
......@@ -583,8 +663,8 @@ class Optimizer(object):
return self._global_learning_rate()
else:
with default_main_program()._lr_schedule_guard(
is_with_opt=True), framework.name_scope(
'scale_with_param_lr'):
is_with_opt=True
), framework.name_scope('scale_with_param_lr'):
return self._global_learning_rate() * param_lr
else:
return self._global_learning_rate()
......@@ -611,14 +691,16 @@ class Optimizer(object):
"""
pass
def _add_accumulator(self,
name,
param,
dtype=None,
fill_value=0.0,
shape=None,
type=None,
device=None):
def _add_accumulator(
self,
name,
param,
dtype=None,
fill_value=0.0,
shape=None,
type=None,
device=None,
):
"""Utility function to add an accumulator for a parameter
Args:
......@@ -630,13 +712,17 @@ class Optimizer(object):
"""
if self._name is not None:
name = self._name + "_" + name
if (name in self._accumulators
and param.name in self._accumulators[name]):
if (
name in self._accumulators
and param.name in self._accumulators[name]
):
if framework._non_static_mode():
return self._accumulators[name][param.name]
raise Exception(
"Accumulator {} already exists for parameter {}".format(
name, param.name))
name, param.name
)
)
if shape == None:
shape = param.shape
assert isinstance(self.helper, LayerHelper)
......@@ -650,20 +736,25 @@ class Optimizer(object):
persistable=True,
dtype=dtype or param.dtype,
type=core.VarDesc.VarType.LOD_TENSOR
if framework._in_eager_without_dygraph_check() else
(param.type if type is None else type),
if framework._in_eager_without_dygraph_check()
else (param.type if type is None else type),
shape=shape,
belong_to_optimizer=True)
belong_to_optimizer=True,
)
if device is None:
device = self._get_device_for_param(param.name)
with device_guard(device):
self.helper.set_variable_initializer(
var, initializer=Constant(value=float(fill_value)))
var, initializer=Constant(value=float(fill_value))
)
if framework._non_static_mode():
if len(self._accumulators_holder) > 0:
assert var_name in self._accumulators_holder, \
"Optimizer set error, {} should in state dict".format( var_name )
assert (
var_name in self._accumulators_holder
), "Optimizer set error, {} should in state dict".format(
var_name
)
var.set_value(self._accumulators_holder[var_name])
self._accumulators[name][param.name] = var
......@@ -681,11 +772,15 @@ class Optimizer(object):
"""
if self._name is not None:
name = self._name + "_" + name
if (name not in self._accumulators
or param.name not in self._accumulators[name]):
if (
name not in self._accumulators
or param.name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, param.name))
name, param.name
)
)
return self._accumulators[name][param.name]
def _update_param_device_map(self, parameters_and_grads, target_block):
......@@ -693,13 +788,15 @@ class Optimizer(object):
if param_and_grad[0].stop_gradient is False:
param_name = param_and_grad[0].name
ops = target_block.ops
device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName(
device_attr_name = (
core.op_proto_and_checker_maker.kOpDeviceAttrName()
)
for op in ops:
input_arg_names = op.input_arg_names
if param_name in input_arg_names:
self._param_device_map[param_name] = op.attr(
device_attr_name)
device_attr_name
)
break
def _get_device_for_param(self, param_name):
......@@ -708,7 +805,9 @@ class Optimizer(object):
device = self._param_device_map[param_name]
return device
def _create_optimization_pass(self, parameters_and_grads):
def _create_optimization_pass(
self, parameters_and_grads, param_group_idx=0
):
"""Add optimization operators to update gradients to tensors.
Args:
......@@ -736,10 +835,12 @@ class Optimizer(object):
target_block = global_block
current_block = framework.default_main_program().current_block()
if current_block.idx != global_block.idx:
assert current_block.backward_block_idx != -1, \
"current block is not global_block, but it doesn't have backward block."
assert (
current_block.backward_block_idx != -1
), "current block is not global_block, but it doesn't have backward block."
target_block = framework.default_main_program().blocks[
current_block.backward_block_idx]
current_block.backward_block_idx
]
start = len(target_block.ops)
self.helper = LayerHelper(self.__class__.__name__)
......@@ -748,57 +849,91 @@ class Optimizer(object):
# NOTE: Multi Tensor support [ Momentum, Adam ] for dygraph mode
if self._use_multi_tensor and self.__class__.__name__ in [
'Momentum', 'Adam'
'Momentum',
'Adam',
]:
if len(self._param_dict['FP32_LODTensor']) == 0 and len(
self._param_dict['FP16_LODTensor']) == 0:
if (
len(self._param_dict['FP32_LODTensor'][param_group_idx]) == 0
and len(self._param_dict['FP16_LODTensor'][param_group_idx])
== 0
):
if isinstance(parameters_and_grads, list):
self._multi_tensor_init(target_block, [
p[0]
for p in parameters_and_grads if not p[0].stop_gradient
])
assert param_group_idx == 0
self._multi_tensor_init(
target_block,
[
p[0]
for p in parameters_and_grads
if not p[0].stop_gradient
],
param_group_idx,
)
else:
self._update_param_group(parameters_and_grads)
self._multi_tensor_init(target_block, [
p[0] for p in parameters_and_grads['params']
if not p[0].stop_gradient
])
self._multi_tensor_init(
target_block,
[
p[0]
for p in parameters_and_grads['params']
if not p[0].stop_gradient
],
param_group_idx,
)
if framework._non_static_mode():
self._append_optimize_multi_tensor_op(target_block,
parameters_and_grads)
self._append_optimize_multi_tensor_op(
target_block,
parameters_and_grads,
param_group_idx=param_group_idx,
)
else:
self._update_param_device_map(parameters_and_grads,
target_block)
self._update_param_device_map(
parameters_and_grads, target_block
)
# NOTE: Multi Tensor requires all parameters to be in the same device and program.
# param_grad_list = [p_0,g_0,p_1,g_1,....]
param_grad_list = []
for param_and_grad in parameters_and_grads:
if not param_and_grad[0].stop_gradient and param_and_grad[
1] is not None:
if (
not param_and_grad[0].stop_gradient
and param_and_grad[1] is not None
):
param_grad_list.append(param_and_grad[0])
param_grad_list.append(param_and_grad[1])
with param_grad_list[0].block.program._optimized_guard(
param_grad_list), name_scope("optimizer"):
param_grad_list
), name_scope("optimizer"):
device = self._get_device_for_param(param_grad_list[0].name)
with device_guard(device):
self._append_optimize_multi_tensor_op(
target_block, parameters_and_grads)
target_block,
parameters_and_grads,
param_group_idx=param_group_idx,
)
else:
if not framework._non_static_mode():
params_grads_device_map = parameters_and_grads[
'params'] if isinstance(parameters_and_grads,
dict) else parameters_and_grads
self._update_param_device_map(params_grads_device_map,
target_block)
params_grads_device_map = (
parameters_and_grads['params']
if isinstance(parameters_and_grads, dict)
else parameters_and_grads
)
self._update_param_device_map(
params_grads_device_map, target_block
)
if isinstance(parameters_and_grads, list):
self._create_accumulators(target_block, [
p[0] for p in parameters_and_grads if not p[0].stop_gradient
])
self._create_accumulators(
target_block,
[
p[0]
for p in parameters_and_grads
if not p[0].stop_gradient
],
)
else:
params_acc_dict = parameters_and_grads.copy()
params_acc_dict['params'] = [
p[0] for p in params_acc_dict['params']
p[0]
for p in params_acc_dict['params']
if not p[0].stop_gradient
]
self._create_accumulators(target_block, params_acc_dict)
......@@ -809,8 +944,9 @@ class Optimizer(object):
if param_and_grad[1] is None:
continue
if param_and_grad[0].stop_gradient is False:
self._append_optimize_op(target_block,
param_and_grad)
self._append_optimize_op(
target_block, param_and_grad
)
else:
for param_and_grad in parameters_and_grads['params']:
if param_and_grad[1] is None:
......@@ -818,25 +954,31 @@ class Optimizer(object):
if param_and_grad[0].stop_gradient is False:
param_grad_dict = dict()
param_grad_dict['params'] = param_and_grad
param_grad_dict.update({
k: v
for k, v in parameters_and_grads.items()
if k != 'params'
})
self._append_optimize_op(target_block,
param_grad_dict)
param_grad_dict.update(
{
k: v
for k, v in parameters_and_grads.items()
if k != 'params'
}
)
self._append_optimize_op(
target_block, param_grad_dict
)
else:
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
continue
with param_and_grad[0].block.program._optimized_guard(
param_and_grad), name_scope("optimizer"):
param_and_grad
), name_scope("optimizer"):
if param_and_grad[0].stop_gradient is False:
device = self._get_device_for_param(
param_and_grad[0].name)
param_and_grad[0].name
)
with device_guard(device):
optimize_op = self._append_optimize_op(
target_block, param_and_grad)
target_block, param_and_grad
)
# Get custom finish ops for subclasses
# FIXME: Need to fix this once we figure out how to handle dependencies
......@@ -848,12 +990,14 @@ class Optimizer(object):
def _append_dgc_ops(self, param_and_grad):
pass
def backward(self,
loss,
startup_program=None,
parameters=None,
no_grad_set=None,
callbacks=None):
def backward(
self,
loss,
startup_program=None,
parameters=None,
no_grad_set=None,
callbacks=None,
):
"""
The first part of ``minimize``, do auto-diff to append backward operations for
the current program.
......@@ -884,7 +1028,7 @@ class Optimizer(object):
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01,
adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters())
out = linear(a)
out.backward()
......@@ -902,8 +1046,7 @@ class Optimizer(object):
self._dtype = loss.dtype
if framework._non_static_mode():
parameter_list = parameters if parameters \
else self._parameter_list
parameter_list = parameters if parameters else self._parameter_list
params_grads = []
for param in parameter_list:
......@@ -917,23 +1060,26 @@ class Optimizer(object):
if callbacks is None:
callbacks = [error_clip_callback]
else:
assert (isinstance(callbacks, list))
assert isinstance(callbacks, list)
program = loss.block.program
assert len(loss.shape) == 1 and loss.shape[0] == 1, \
"The loss.shape should be (1L,), but the current loss.shape is {}. " \
assert len(loss.shape) == 1 and loss.shape[0] == 1, (
"The loss.shape should be (1L,), but the current loss.shape is {}. "
"Maybe that you should call paddle.mean to process the current loss.".format(
loss.shape)
parameter_list = parameters if parameters \
else self._parameter_list
loss.shape
)
)
parameter_list = parameters if parameters else self._parameter_list
with program_guard(program, startup_program):
from paddle.incubate.autograd.utils import prim_enabled
if prim_enabled():
params_grads = append_backward_new([loss], parameter_list,
act_no_grad_set,
callbacks)
params_grads = append_backward_new(
[loss], parameter_list, act_no_grad_set, callbacks
)
else:
params_grads = append_backward(loss, parameter_list,
act_no_grad_set, callbacks)
params_grads = append_backward(
loss, parameter_list, act_no_grad_set, callbacks
)
# Note: since we can't use all_reduce_op now,
# dgc_op should be the last op of one grad.
self._append_dgc_ops(params_grads)
......@@ -978,13 +1124,16 @@ class Optimizer(object):
params_grads = append_gradient_clip_ops(params_grads)
# Add regularization if any
params_grads = self.append_regularization_ops(params_grads,
self.regularization)
params_grads = self.append_regularization_ops(
params_grads, self.regularization
)
optimize_ops = self._create_optimization_pass(params_grads)
return optimize_ops
def _apply_optimize(self, loss, startup_program, params_grads):
def _apply_optimize(
self, loss, startup_program, params_grads, param_group_idx=0
):
"""
Second part of `minimize`, appending optimization operators for
given `params_grads` pairs.
......@@ -997,38 +1146,49 @@ class Optimizer(object):
list: A list of operators appended to the current program.
"""
if framework._non_static_mode():
with program_guard(framework.default_main_program(),
framework.default_startup_program()):
with program_guard(
framework.default_main_program(),
framework.default_startup_program(),
):
if isinstance(params_grads, list):
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
params_grads = self.append_regularization_ops(
params_grads, self.regularization)
params_grads, self.regularization
)
else:
grad_clip = params_grads['grad_clip']
if grad_clip is not None:
params_grads['params'] = grad_clip(
params_grads['params'])
params_grads['params']
)
params_grads['params'] = self.append_regularization_ops(
params_grads['params'], self.regularization)
optimize_ops = self._create_optimization_pass(params_grads)
params_grads['params'], self.regularization
)
optimize_ops = self._create_optimization_pass(
params_grads, param_group_idx=param_group_idx
)
else:
assert param_group_idx == 0
program = loss.block.program
with program_guard(program, startup_program):
optimize_ops = self.apply_gradients(params_grads)
return optimize_ops
def _create_regularization_of_grad(self, param, grad, regularization=None):
""" Create and add backward regularization Operators
"""Create and add backward regularization Operators
Function helper of append_regularization_ops.
"""
# If no gradient or no regularization is specified, then we don't need to do anything
if grad is None or (
(not hasattr(param, 'regularizer') or
(hasattr(param, 'regularizer') and param.regularizer is None))
and regularization is None):
(
not hasattr(param, 'regularizer')
or (hasattr(param, 'regularizer') and param.regularizer is None)
)
and regularization is None
):
return grad
regularization_term = None
if hasattr(param, 'regularizer') and param.regularizer is not None:
......@@ -1057,7 +1217,8 @@ class Optimizer(object):
dtype=param.dtype,
shape=param.shape,
lod_level=param.lod_level,
type=core.VarDesc.VarType.LOD_TENSOR)
type=core.VarDesc.VarType.LOD_TENSOR,
)
inputs = {"X": [grad, regularization_term]}
outputs = {"Out": [new_grad]}
......@@ -1065,9 +1226,9 @@ class Optimizer(object):
return new_grad
def append_regularization_ops(self,
parameters_and_grads,
regularization=None):
def append_regularization_ops(
self, parameters_and_grads, regularization=None
):
r"""Create and add backward regularization Operators
Creates and adds backward regularization operators in the BlockDesc.
......@@ -1092,21 +1253,28 @@ class Optimizer(object):
if framework._non_static_mode():
for param, grad in parameters_and_grads:
new_grad = self._create_regularization_of_grad(
param, grad, regularization)
param, grad, regularization
)
params_and_grads.append((param, new_grad))
else:
repeate_regularizer = False
with framework.name_scope('regularization'):
for param, grad in parameters_and_grads:
if not repeate_regularizer and param.regularizer is not None and regularization is not None:
if (
not repeate_regularizer
and param.regularizer is not None
and regularization is not None
):
repeate_regularizer = True
logging.info(
"If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. "
"The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
% regularization.__str__())
% regularization.__str__()
)
with param.block.program._optimized_guard([param, grad]):
new_grad = self._create_regularization_of_grad(
param, grad, regularization)
param, grad, regularization
)
params_and_grads.append((param, new_grad))
return params_and_grads
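As a numeric illustration of the path above, assuming the common L2Decay case where the regularization term is ``coeff * param`` (a plain-numpy sketch, not the operator implementation):
import numpy as np

coeff = 0.01
param = np.random.uniform(-1, 1, (3, 4)).astype("float32")
grad = np.random.uniform(-1, 1, (3, 4)).astype("float32")

# _create_regularization_of_grad effectively returns grad + coeff * param
# here; a parameter whose own ``regularizer`` attribute is set keeps that
# one, and the optimizer-level regularization is skipped for it.
regularization_term = coeff * param
new_grad = grad + regularization_term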
......@@ -1114,7 +1282,8 @@ class Optimizer(object):
no_grad_set = _get_no_grad_set_name(no_grad_set)
parameters = loss.block.program.global_block().all_parameters()
param_no_trainable = set(
[param.name for param in parameters if param.stop_gradient is True])
[param.name for param in parameters if param.stop_gradient is True]
)
# If the parameter is no trainable, it should not have a gradient.
no_grad_set.update(param_no_trainable)
......@@ -1128,13 +1297,13 @@ class Optimizer(object):
If not, the new gradient will accumulate on the previous gradient.
There are two methods to clear grad: set_to_zero or delete grad.
Args:
set_to_zero (bool, optional): If set grads to zero or not, default is True.
Returns:
None
Examples:
.. code-block:: python
......@@ -1145,7 +1314,7 @@ class Optimizer(object):
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01,
adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters())
out = linear(a)
out.backward()
......@@ -1155,7 +1324,8 @@ class Optimizer(object):
"""
param_list = []
if self._parameter_list is None or not isinstance(
self._parameter_list[0], dict):
self._parameter_list[0], dict
):
for p in self._parameter_list:
if not p.stop_gradient:
param_list.append(p)
......@@ -1172,11 +1342,9 @@ class Optimizer(object):
core.clear_gradients(param_list, set_to_zero)
@imperative_base.no_grad
def minimize(self,
loss,
startup_program=None,
parameters=None,
no_grad_set=None):
def minimize(
self, loss, startup_program=None, parameters=None, no_grad_set=None
):
"""
Add operations to minimize ``loss`` by updating ``parameters``.
......@@ -1195,13 +1363,13 @@ class Optimizer(object):
tuple: tuple (optimize_ops, params_grads), A list of operators appended
by minimize and a list of (param, grad) tensor pairs, param is
``Parameter``, grad is the gradient value corresponding to the parameter.
In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
indicate program pruning. If so, the program will be pruned by ``feed`` and
In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
indicate program pruning. If so, the program will be pruned by ``feed`` and
``fetch_list`` before run, see details in ``Executor``.
Examples:
.. code-block:: python
import paddle
linear = paddle.nn.Linear(10, 10)
input = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
......@@ -1221,17 +1389,18 @@ class Optimizer(object):
"""
assert isinstance(loss, Variable), "The loss should be a Tensor."
parameter_list = parameters if parameters \
else self._parameter_list
parameter_list = parameters if parameters else self._parameter_list
params_grads = self.backward(loss,
startup_program=startup_program,
parameters=parameter_list,
no_grad_set=no_grad_set)
params_grads = self.backward(
loss,
startup_program=startup_program,
parameters=parameter_list,
no_grad_set=no_grad_set,
)
optimize_ops = self._apply_optimize(loss,
startup_program=startup_program,
params_grads=params_grads)
optimize_ops = self._apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads
)
return optimize_ops, params_grads
......@@ -1240,7 +1409,7 @@ class Optimizer(object):
def step(self):
"""
Execute the optimizer and update parameters once.
Returns:
None
......@@ -1254,7 +1423,7 @@ class Optimizer(object):
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01,
adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters())
out = linear(a)
out.backward()
......@@ -1271,13 +1440,16 @@ class Optimizer(object):
grad_var = param._grad_ivar()
params_grads.append((param, grad_var))
self._apply_optimize(loss=None,
startup_program=None,
params_grads=params_grads)
self._apply_optimize(
loss=None,
startup_program=None,
params_grads=params_grads,
param_group_idx=0,
)
else:
# optimize parameters in groups
for param_group in self._param_groups:
for idx, param_group in enumerate(self._param_groups):
params_grads = defaultdict(lambda: list())
for param in param_group['params']:
if param.stop_gradient:
......@@ -1286,11 +1458,14 @@ class Optimizer(object):
grad_var = param._grad_ivar()
params_grads['params'].append((param, grad_var))
params_grads.update(
{k: v
for k, v in param_group.items() if k != 'params'})
self._apply_optimize(loss=None,
startup_program=None,
params_grads=params_grads)
{k: v for k, v in param_group.items() if k != 'params'}
)
self._apply_optimize(
loss=None,
startup_program=None,
params_grads=params_grads,
param_group_idx=idx,
)
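For context, a minimal dygraph sketch of the parameter-group path exercised above (the list-of-dict case this change targets); the ``use_multi_tensor`` flag is assumed to be accepted by ``paddle.optimizer.Adam`` as in release/2.4.
import paddle

linear_1 = paddle.nn.Linear(10, 10)
linear_2 = paddle.nn.Linear(10, 10)
adam = paddle.optimizer.Adam(
    learning_rate=0.01,
    parameters=[
        {'params': linear_1.parameters()},
        {'params': linear_2.parameters(),
         'weight_decay': 0.001,
         'learning_rate': 0.1},
    ],
    use_multi_tensor=True,
)
out = linear_2(linear_1(paddle.uniform([4, 10], min=-0.1, max=0.1)))
out.backward()
adam.step()        # each group is dispatched with its own param_group_idx
adam.clear_grad()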
def _add_param_group(self, param_group):
"""
......@@ -1306,7 +1481,8 @@ class Optimizer(object):
elif isinstance(params, set):
raise TypeError(
"optimizer parameters should be in ordered collections,"
"but received set, please use list instead.")
"but received set, please use list instead."
)
else:
param_group['params'] = list(params)
......@@ -1320,18 +1496,21 @@ class Optimizer(object):
if not param_set.isdisjoint(set(param_group['params'])):
raise ValueError(
"some parameters appear in more than one parameter group")
"some parameters appear in more than one parameter group"
)
for param in param_group['params']:
weight_decay = param_group['weight_decay']
if isinstance(weight_decay, float):
from ..fluid.regularizer import L2Decay
regularization = L2Decay(weight_decay)
else:
regularization = weight_decay
param.regularizer = regularization
param.optimize_attr['learning_rate'] = param_group.get(
'learning_rate', 1.)
'learning_rate', 1.0
)
self._param_groups.append(param_group)
......@@ -1345,7 +1524,7 @@ class Optimizer(object):
pass
@framework.dygraph_only
def _multi_tensor_init(self, target_block, parameters):
def _multi_tensor_init(self, target_block, parameters, param_group_idx):
"""
All parameters used for the optimizer update (such as: parameters, master_weight, velocity_acc for momentum) are grouped into python lists by data type (float16, float32).
This function will be overridden in the corresponding optimizer file.
......@@ -1357,9 +1536,10 @@ class Optimizer(object):
pass
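A rough sketch of what an override is expected to do with the new ``param_group_idx`` argument, assuming the ``_param_dict`` layout used earlier in this file and that the per-group inner lists already exist; a concrete optimizer would also create its accumulators here.
import paddle

def _multi_tensor_init_sketch(self, target_block, parameters, param_group_idx):
    # Bucket the trainable parameters of this group by dtype so the fused
    # kernel can later be launched once per bucket.
    for param in parameters:
        if param.dtype == paddle.float32:
            self._param_dict['FP32_LODTensor'][param_group_idx].append(param)
        elif param.dtype == paddle.float16:
            self._param_dict['FP16_LODTensor'][param_group_idx].append(param)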
@framework.dygraph_only
def _append_optimize_multi_tensor_op(self, target_block,
parameters_and_grads):
"""
def _append_optimize_multi_tensor_op(
self, target_block, parameters_and_grads, param_group_idx
):
"""
For Multi Tensor, append optimize merged_operator to block.
"""
pass
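For orientation, a rough outline (not the real fused-kernel call) of how an override would consume the per-group buckets together with ``param_group_idx``; names ending in ``_sketch`` are hypothetical.
import paddle

def _append_optimize_multi_tensor_op_sketch(
    self, target_block, parameters_and_grads, param_group_idx
):
    # Gather this group's gradients into the same FP32/FP16 buckets that
    # _multi_tensor_init built; a real optimizer would then launch its
    # merged momentum/adam kernel once per non-empty bucket, paired with
    # self._param_dict[key][param_group_idx].
    if isinstance(parameters_and_grads, dict):
        parameters_and_grads = parameters_and_grads['params']
    grad_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
    for param, grad in parameters_and_grads:
        if param.stop_gradient or grad is None:
            continue
        if param.dtype == paddle.float32:
            grad_dict['FP32_LODTensor'].append(grad)
        else:
            grad_dict['FP16_LODTensor'].append(grad)
    return grad_dict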