Unverified · Commit 94240e2e · Authored by sneaxiy, committed by GitHub


[Cherry-pick Release/2.4] Fix multi_tensor adam and momentum bug when the parameter is list of dict (#47372)

* reformat file by black

* fix multi_tensor adam/momentum bug when the parameters are passed as a list of dict (see the usage sketch below)
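
For context, the configuration this fix targets looks roughly like the sketch below. This is a minimal illustration mirroring the updated TestMultiTensorAdam / TestMultiTensorMomentumDygraph tests in this diff; the model, shapes, and hyper-parameter values are illustrative, not taken from the commit.

import paddle

# Sketch only: parameter groups (a list of dicts) combined with
# use_multi_tensor=True, the combination this commit fixes.
model = paddle.nn.Linear(5, 5)
params = list(model.parameters())
half = len(params) // 2

opt = paddle.optimizer.Adam(
    parameters=[
        {'params': params[:half], 'weight_decay': 0.001, 'beta1': 0.1, 'beta2': 0.99},
        {'params': params[half:], 'weight_decay': 0.001, 'beta1': 0.1, 'beta2': 0.99},
    ],
    use_multi_tensor=True,
)

x = paddle.rand([2, 5])
loss = paddle.mean(model(x))
loss.backward()
opt.step()        # before this commit, the multi-tensor path mishandled parameter groups like these
opt.clear_grad()

The same list-of-dict grouping applies to paddle.optimizer.Momentum (with 'momentum' and 'learning_rate' group keys). The new tests in this diff run this configuration with use_multi_tensor=True and use_multi_tensor=False and assert that the outputs and parameters match.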
Parent commit: b143e008
......@@ -25,10 +25,8 @@ from paddle.fluid.framework import _test_eager_guard
class TestAdamOp1(OpTest):
def setUp(self):
'''Test Adam Op with supplied attributes
'''
'''Test Adam Op with supplied attributes'''
self.op_type = "adam"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
......@@ -50,20 +48,19 @@ class TestAdamOp1(OpTest):
'Moment2': moment2,
'LearningRate': np.array([learning_rate]).astype("float32"),
'Beta1Pow': np.array([beta1_pow]).astype("float32"),
'Beta2Pow': np.array([beta2_pow]).astype("float32")
'Beta2Pow': np.array([beta2_pow]).astype("float32"),
}
self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, self.attrs)
param_out, moment1_out, moment2_out = adam_step(self.inputs, self.attrs)
self.outputs = {
'Moment1Out': moment1_out,
'Moment2Out': moment2_out,
'ParamOut': param_out,
'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
}
def test_check_output(self):
......@@ -71,13 +68,11 @@ class TestAdamOp1(OpTest):
class TestAdamOp2(OpTest):
def set_shape(self):
self.shape = (102, 105)
def setUp(self):
'''Test Adam Op with supplied attributes
'''
'''Test Adam Op with supplied attributes'''
self.op_type = "adam"
self.set_shape()
param = np.random.uniform(-1, 1, self.shape).astype("float32")
......@@ -100,20 +95,19 @@ class TestAdamOp2(OpTest):
'Moment2': moment2,
'LearningRate': np.array([learning_rate]).astype("float32"),
'Beta1Pow': np.array([beta1_pow]).astype("float32"),
'Beta2Pow': np.array([beta2_pow]).astype("float32")
'Beta2Pow': np.array([beta2_pow]).astype("float32"),
}
attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, attributes)
param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)
self.outputs = {
'Moment1Out': moment1_out,
'Moment2Out': moment2_out,
'ParamOut': param_out,
'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
}
def test_check_output(self):
......@@ -121,16 +115,13 @@ class TestAdamOp2(OpTest):
class TestAdamOnlyTailOp(TestAdamOp2):
def set_shape(self):
self.shape = (3)
self.shape = 3
class TestAdamOpMultipleSteps(OpTest):
def setUp(self):
'''Test Adam Operator with supplied attributes
'''
'''Test Adam Operator with supplied attributes'''
self.op_type = "adam"
self.num_steps = 10
......@@ -154,19 +145,20 @@ class TestAdamOpMultipleSteps(OpTest):
'Moment2': moment2,
'LearningRate': np.array([learning_rate]).astype("float32"),
'Beta1Pow': np.array([self.beta1_pow]).astype("float32"),
'Beta2Pow': np.array([self.beta2_pow]).astype("float32")
'Beta2Pow': np.array([self.beta2_pow]).astype("float32"),
}
self.attrs = {
'epsilon': epsilon,
'beta1': self.beta1,
'beta2': self.beta2
'beta2': self.beta2,
}
def test_check_output(self):
for _ in range(self.num_steps):
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, self.attrs)
param_out, moment1_out, moment2_out = adam_step(
self.inputs, self.attrs
)
beta1_pow_out = self.inputs['Beta1Pow'] * self.beta1
beta2_pow_out = self.inputs['Beta2Pow'] * self.beta2
......@@ -175,7 +167,7 @@ class TestAdamOpMultipleSteps(OpTest):
'Moment2Out': moment2_out,
'ParamOut': param_out,
'Beta1PowOut': beta1_pow_out,
'Beta2PowOut': beta2_pow_out
'Beta2PowOut': beta2_pow_out,
}
# Verify output for this step
......@@ -191,8 +183,9 @@ class TestAdamOpMultipleSteps(OpTest):
self.inputs['Beta2Pow'] = beta2_pow_out
# Randomize gradient for next step
self.inputs['Grad'] = np.random.uniform(
-1, 1, (102, 105)).astype("float32")
self.inputs['Grad'] = np.random.uniform(-1, 1, (102, 105)).astype(
"float32"
)
def test_api_eager_dygraph(self):
with _test_eager_guard():
......@@ -272,8 +265,9 @@ def adamw_step(inputs, attributes):
return param_out, moment1_out, moment2_out
def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
lazy_mode):
def adam_step_sparse(
inputs, attributes, height, rows, row_numel, np_grad, lazy_mode
):
'''
Simulate one step of the adam optimizer
:param inputs: dict of inputs
......@@ -298,13 +292,16 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
param_out = np.zeros(shape=[height, row_numel])
def update_row(row_id, update_value):
moment1_out[row_id] = beta1 * moment1[row_id] + (1 -
beta1) * update_value
moment2_out[row_id] = beta2 * moment2[row_id] + (
1 - beta2) * np.square(update_value)
moment1_out[row_id] = (
beta1 * moment1[row_id] + (1 - beta1) * update_value
)
moment2_out[row_id] = beta2 * moment2[row_id] + (1 - beta2) * np.square(
update_value
)
lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
param_out[row_id] = param[row_id] - lr_t * (
moment1_out[row_id] / (np.sqrt(moment2_out[row_id]) + epsilon))
moment1_out[row_id] / (np.sqrt(moment2_out[row_id]) + epsilon)
)
if lazy_mode:
for idx, row_id in enumerate(rows):
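
As a reference for the numpy helpers above (adam_step and the per-row update in adam_step_sparse), the update they compute is the standard Adam step. In LaTeX, with our own notation (g is the gradient of one row or parameter, t the step count, so beta1_pow = \beta_1^t and beta2_pow = \beta_2^t):

\begin{aligned}
m_1 &\leftarrow \beta_1 m_1 + (1-\beta_1)\,g, \\
m_2 &\leftarrow \beta_2 m_2 + (1-\beta_2)\,g^2, \\
\eta_t &= \eta\,\frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}}, \\
\theta &\leftarrow \theta - \eta_t\,\frac{m_1}{\sqrt{m_2}+\epsilon}.
\end{aligned}

In lazy_mode only the rows listed in rows are updated; the non-lazy branch is not shown in this hunk, but presumably applies the same formula to every row, using a zero gradient for rows absent from the sparse input.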
......@@ -320,7 +317,6 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
class TestSparseAdamOp(unittest.TestCase):
def setup(self, scope, place, lazy_mode):
beta1 = 0.78
beta2 = 0.836
......@@ -339,14 +335,14 @@ class TestSparseAdamOp(unittest.TestCase):
"Moment2": np.full((height, row_numel), 5.0).astype("float32"),
'Beta1Pow': beta1_pow,
'Beta2Pow': beta2_pow,
"LearningRate": np.full((1), 2.0).astype("float32")
"LearningRate": np.full((1), 2.0).astype("float32"),
}
self.init_output = np.full((height, row_numel), 0.0).astype("float32")
self.attrs = {
'epsilon': epsilon,
'beta1': beta1,
'beta2': beta2,
'min_row_size_to_use_multithread': 2
'min_row_size_to_use_multithread': 2,
}
grad_selected_rows = scope.var('Grad').get_selected_rows()
......@@ -361,15 +357,21 @@ class TestSparseAdamOp(unittest.TestCase):
self.sparse_inputs = ["Grad"]
param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
height, rows, row_numel,
np_array, lazy_mode)
param_out, mom1, mom2 = adam_step_sparse(
self.dense_inputs,
self.attrs,
height,
rows,
row_numel,
np_array,
lazy_mode,
)
self.outputs = {
"ParamOut": param_out,
"Moment1Out": mom1,
"Moment2Out": mom2,
'Beta1PowOut': beta1_pow * beta1,
'Beta2PowOut': beta2_pow * beta2
'Beta2PowOut': beta2_pow * beta2,
}
def check_with_place(self, place, lazy_mode):
......@@ -414,10 +416,8 @@ class TestSparseAdamOp(unittest.TestCase):
class TestAdamOpBetaVariable(OpTest):
def setUp(self):
'''Test Adam Op with beta as Variable
'''
'''Test Adam Op with beta as Variable'''
self.op_type = "adam"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
......@@ -446,15 +446,14 @@ class TestAdamOpBetaVariable(OpTest):
attributes = {'epsilon': epsilon}
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, attributes)
param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)
self.outputs = {
'Moment1Out': moment1_out,
'Moment2Out': moment2_out,
'ParamOut': param_out,
'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
}
def test_check_output(self):
......@@ -462,10 +461,8 @@ class TestAdamOpBetaVariable(OpTest):
class TestAdamOpBetaEpsilonVariable(OpTest):
def setUp(self):
'''Test Adam Op with beta/epsilon as Variable
'''
'''Test Adam Op with beta/epsilon as Variable'''
self.op_type = "adam"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
......@@ -495,15 +492,14 @@ class TestAdamOpBetaEpsilonVariable(OpTest):
attributes = {'epsilon': epsilon}
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, attributes)
param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)
self.outputs = {
'Moment1Out': moment1_out,
'Moment2Out': moment2_out,
'ParamOut': param_out,
'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
}
def test_check_output(self):
......@@ -511,10 +507,8 @@ class TestAdamOpBetaEpsilonVariable(OpTest):
class TestAdamOpWithGlobalBetaPow(OpTest):
def setUp(self):
'''Test Adam Op with global_beta_pow
'''
'''Test Adam Op with global_beta_pow'''
self.op_type = "adam"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
......@@ -544,8 +538,7 @@ class TestAdamOpWithGlobalBetaPow(OpTest):
attributes = {'epsilon': epsilon}
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, attributes)
param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)
self.attrs = {'use_global_beta_pow': True}
......@@ -555,7 +548,7 @@ class TestAdamOpWithGlobalBetaPow(OpTest):
'Moment2Out': moment2_out,
'ParamOut': param_out,
'Beta1PowOut': np.array([]),
'Beta2PowOut': np.array([])
'Beta2PowOut': np.array([]),
}
def test_check_output(self):
......@@ -563,10 +556,8 @@ class TestAdamOpWithGlobalBetaPow(OpTest):
class TestAdamOpWithSkipUpdate(OpTest):
def setUp(self):
'''Test Adam Op with global_beta_pow
'''
'''Test Adam Op with global_beta_pow'''
self.op_type = "adam"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
......@@ -613,7 +604,6 @@ class TestAdamOpWithSkipUpdate(OpTest):
class TestAdamOpV2(unittest.TestCase):
def test_adam_op(self):
place = fluid.CPUPlace()
shape = [2, 3, 8, 8]
......@@ -626,20 +616,20 @@ class TestAdamOpV2(unittest.TestCase):
conv = fluid.layers.conv2d(data, 8, 3)
loss = fluid.layers.reduce_mean(conv)
beta1 = fluid.layers.create_global_var(shape=[1],
value=0.85,
dtype='float32',
persistable=True)
beta2 = fluid.layers.create_global_var(shape=[1],
value=0.95,
dtype='float32',
persistable=True)
beta1 = fluid.layers.create_global_var(
shape=[1], value=0.85, dtype='float32', persistable=True
)
beta2 = fluid.layers.create_global_var(
shape=[1], value=0.95, dtype='float32', persistable=True
)
betas = [beta1, beta2]
opt = paddle.optimizer.Adam(learning_rate=1e-5,
opt = paddle.optimizer.Adam(
learning_rate=1e-5,
beta1=beta1,
beta2=beta2,
weight_decay=0.01,
epsilon=1e-8)
epsilon=1e-8,
)
opt.minimize(loss)
exe.run(startup)
......@@ -653,8 +643,9 @@ class TestAdamOpV2(unittest.TestCase):
a = fluid.dygraph.to_variable(value)
linear = fluid.Linear(13, 5, dtype="float32")
adam = paddle.optimizer.Adam(learning_rate=0.01,
parameters=linear.parameters())
adam = paddle.optimizer.Adam(
learning_rate=0.01, parameters=linear.parameters()
)
out = linear(a)
out.backward()
adam.step()
......@@ -670,26 +661,29 @@ class TestAdamOpV2(unittest.TestCase):
state_dict = adam.state_dict()
adam.set_state_dict(state_dict)
#learning_rate is LRScheduler
# learning_rate is LRScheduler
learning_rate = paddle.optimizer.lr.CosineAnnealingDecay(
learning_rate=0.1, T_max=10)
learning_rate=0.1, T_max=10
)
adam = paddle.optimizer.Adam(
learning_rate=learning_rate,
weight_decay=fluid.regularizer.L2Decay(0.001),
parameters=emb.parameters())
parameters=emb.parameters(),
)
lr = adam.get_lr()
state_dict = adam.state_dict()
adam.set_state_dict(state_dict)
#leanrning_rate is Tensor
# leanrning_rate is Tensor
with self.assertRaises(TypeError):
learning_rate = np.array([0.01]).astype("float32")
learning_rate = paddle.to_tensor(learning_rate)
adam = paddle.optimizer.Adam(learning_rate=learning_rate,
parameters=emb.parameters())
adam = paddle.optimizer.Adam(
learning_rate=learning_rate, parameters=emb.parameters()
)
params = adam.get_opti_var_name_list()
assert (params is not None)
assert params is not None
paddle.enable_static()
def test_adam_with_grad_clip(self):
......@@ -698,9 +692,9 @@ class TestAdamOpV2(unittest.TestCase):
a = fluid.dygraph.to_variable(value)
linear = fluid.Linear(13, 5, dtype="float32")
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
adam = paddle.optimizer.Adam(0.1,
parameters=linear.parameters(),
grad_clip=clip)
adam = paddle.optimizer.Adam(
0.1, parameters=linear.parameters(), grad_clip=clip
)
out = linear(a)
out.backward()
adam.step()
......@@ -715,11 +709,11 @@ class TestAdamOpV2(unittest.TestCase):
lr = 0.01
adam.set_lr(lr)
cur_lr = adam.get_lr()
assert (lr == cur_lr)
assert lr == cur_lr
with self.assertRaises(TypeError):
lr_var = paddle.fluid.layers.create_global_var(shape=[1],
value=lr,
dtype='float32')
lr_var = paddle.fluid.layers.create_global_var(
shape=[1], value=lr, dtype='float32'
)
adam.set_lr(lr_var)
paddle.enable_static()
......@@ -727,17 +721,17 @@ class TestAdamOpV2(unittest.TestCase):
paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
with self.assertRaises(ValueError):
adam = paddle.optimizer.Adam(0.1,
beta1=-1,
parameters=linear.parameters())
adam = paddle.optimizer.Adam(
0.1, beta1=-1, parameters=linear.parameters()
)
with self.assertRaises(ValueError):
adam = paddle.optimizer.Adam(0.1,
beta2=-1,
parameters=linear.parameters())
adam = paddle.optimizer.Adam(
0.1, beta2=-1, parameters=linear.parameters()
)
with self.assertRaises(ValueError):
adam = paddle.optimizer.Adam(0.1,
epsilon=-1,
parameters=linear.parameters())
adam = paddle.optimizer.Adam(
0.1, epsilon=-1, parameters=linear.parameters()
)
paddle.enable_static()
def test_adam_op_with_sparse_input_and_weight_decay(self):
......@@ -746,9 +740,9 @@ class TestAdamOpV2(unittest.TestCase):
x_data = np.arange(0, 10).reshape((10, 1)).astype(np.int64)
x = paddle.to_tensor(x_data, stop_gradient=False)
emb = paddle.nn.Embedding(10, 10, sparse=True)
adam = paddle.optimizer.Adam(0.001,
parameters=emb.parameters(),
weight_decay=0.01)
adam = paddle.optimizer.Adam(
0.001, parameters=emb.parameters(), weight_decay=0.01
)
with self.assertRaises(RuntimeError):
out = emb(x)
......@@ -766,13 +760,14 @@ class TestAdamOpV2(unittest.TestCase):
class TestAdamOptimizer(unittest.TestCase):
def _test(self,
def _test(
self,
place,
use_tensor=True,
use_fluid_api=True,
use_global_beta_pow=False,
flatten_param_grads=False):
flatten_param_grads=False,
):
paddle.enable_static()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
......@@ -786,29 +781,30 @@ class TestAdamOptimizer(unittest.TestCase):
weight_attr1 = paddle.ParamAttr(
name="weight1",
initializer=fluid.initializer.Constant(value=1.0),
trainable=True)
trainable=True,
)
weight_attr2 = paddle.ParamAttr(
name="weight2",
initializer=fluid.initializer.Constant(value=2.0),
trainable=True)
trainable=True,
)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
with paddle.static.program_guard(main_prog, startup_prog):
with paddle.utils.unique_name.guard():
a = paddle.static.data(name="a", shape=[2, 2], dtype='float32')
b = paddle.static.data(name="b", shape=[2, 2], dtype='float32')
label = paddle.static.data(name="label",
shape=[2, 1],
dtype='int64')
label = paddle.static.data(
name="label", shape=[2, 1], dtype='int64'
)
sum = paddle.add(a, b)
z = paddle.pow(sum, 2.0)
fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1)
prediction = fluid.layers.fc(input=fc_1,
size=2,
param_attr=weight_attr2,
act='softmax')
prediction = fluid.layers.fc(
input=fc_1, size=2, param_attr=weight_attr2, act='softmax'
)
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
......@@ -821,19 +817,22 @@ class TestAdamOptimizer(unittest.TestCase):
value=float(beta1_init),
dtype='float32',
persistable=True,
name="beta1")
name="beta1",
)
beta2 = fluid.layers.create_global_var(
shape=[1],
value=float(beta2_init),
dtype='float32',
persistable=True,
name="beta2")
name="beta2",
)
epsilon = fluid.layers.create_global_var(
shape=[1],
value=float(epsilon_init),
dtype='float32',
persistable=True,
name="epsilon")
name="epsilon",
)
if use_fluid_api:
adam = fluid.optimizer.Adam(
learning_rate=0.01,
......@@ -843,13 +842,16 @@ class TestAdamOptimizer(unittest.TestCase):
use_global_beta_pow=use_global_beta_pow,
flatten_param_grads=flatten_param_grads,
align_size=256,
grad_clip=clip)
grad_clip=clip,
)
else:
adam = paddle.optimizer.Adam(learning_rate=0.01,
adam = paddle.optimizer.Adam(
learning_rate=0.01,
beta1=beta1,
beta2=beta2,
epsilon=epsilon,
grad_clip=clip)
grad_clip=clip,
)
else:
if use_fluid_api:
adam = fluid.optimizer.Adam(
......@@ -860,13 +862,16 @@ class TestAdamOptimizer(unittest.TestCase):
use_global_beta_pow=use_global_beta_pow,
flatten_param_grads=flatten_param_grads,
align_size=256,
grad_clip=clip)
grad_clip=clip,
)
else:
adam = fluid.optimizer.Adam(learning_rate=0.01,
adam = fluid.optimizer.Adam(
learning_rate=0.01,
beta1=beta1_init,
beta2=beta2_init,
epsilon=epsilon_init,
grad_clip=clip)
grad_clip=clip,
)
adam.minimize(loss)
......@@ -877,15 +882,16 @@ class TestAdamOptimizer(unittest.TestCase):
print("Start run on {}".format(place))
for epoch in range(10):
pred_res, loss_res = exe.run(main_prog,
feed={
"a": a_np,
"b": b_np,
"label": label_np
},
fetch_list=[prediction, loss])
print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
epoch, pred_res[0], loss_res))
pred_res, loss_res = exe.run(
main_prog,
feed={"a": a_np, "b": b_np, "label": label_np},
fetch_list=[prediction, loss],
)
print(
"Epoch {} | Prediction[0]: {}, Loss: {}".format(
epoch, pred_res[0], loss_res
)
)
paddle.disable_static()
return pred_res, loss_res
......@@ -897,10 +903,13 @@ class TestAdamOptimizer(unittest.TestCase):
for use_fluid_api in [True, False]:
for use_global_beta_pow in [True, False]:
for flatten_param_grads in [True, False]:
pred, loss = self._test(place, use_tensor,
pred, loss = self._test(
place,
use_tensor,
use_fluid_api,
use_global_beta_pow,
flatten_param_grads)
flatten_param_grads,
)
preds.append(pred)
losses.append(loss)
for pred in preds:
......@@ -922,21 +931,22 @@ class TestAdamOptimizer(unittest.TestCase):
name="weight1",
initializer=fluid.initializer.Constant(value=1.0),
regularizer=fluid.regularizer.L1DecayRegularizer(
regularization_coeff=0.1),
trainable=True)
regularization_coeff=0.1
),
trainable=True,
)
with fluid.program_guard(main):
x = fluid.data(name='x', shape=[None, 13], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1,
act=None,
param_attr=weight_attr)
y_predict = fluid.layers.fc(
input=x, size=1, act=None, param_attr=weight_attr
)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = paddle.mean(cost)
adam = fluid.optimizer.AdamOptimizer(0.01,
flatten_param_grads=True,
align_size=256)
adam = fluid.optimizer.AdamOptimizer(
0.01, flatten_param_grads=True, align_size=256
)
adam.minimize(avg_cost)
paddle.disable_static()
......@@ -959,13 +969,16 @@ class TestAdamOptimizer(unittest.TestCase):
adam = fluid.optimizer.Adam(use_global_beta_pow=True)
adam.minimize(loss)
self.assertRaises(Exception, adam._get_global_accumulator, 'tmp')
adam._add_global_accumulator('tmp',
type=core.VarDesc.VarType.LOD_TENSOR)
adam._add_global_accumulator(
'tmp', type=core.VarDesc.VarType.LOD_TENSOR
)
adam._get_global_accumulator('tmp')
self.assertRaises(Exception,
self.assertRaises(
Exception,
adam._add_global_accumulator,
adam._beta1_pow_acc_str,
type=core.VarDesc.VarType.LOD_TENSOR)
type=core.VarDesc.VarType.LOD_TENSOR,
)
paddle.disable_static()
def test_adam_save_load(self):
......@@ -976,12 +989,14 @@ class TestAdamOptimizer(unittest.TestCase):
state_dict = linear.state_dict()
fluid.save_dygraph(state_dict, "paddle_dy")
scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01,
warmup_steps=100,
verbose=True)
adam = paddle.fluid.optimizer.Adam(learning_rate=scheduler,
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True
)
adam = paddle.fluid.optimizer.Adam(
learning_rate=scheduler,
parameter_list=linear.parameters(),
use_global_beta_pow=True)
use_global_beta_pow=True,
)
adam.minimize(b)
state_dict = adam.state_dict()
fluid.save_dygraph(state_dict, "paddle_dy")
......@@ -1002,13 +1017,14 @@ class TestAdamOptimizer(unittest.TestCase):
state_dict = linear.state_dict()
fluid.save_dygraph(state_dict, "paddle_dy")
scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01,
warmup_steps=100,
verbose=True)
scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True
)
adam = paddle.fluid.optimizer.Adam(
learning_rate=scheduler,
parameter_list=linear.parameters(),
use_global_beta_pow=True)
use_global_beta_pow=True,
)
adam.minimize(b)
return adam
......@@ -1023,14 +1039,14 @@ class TestAdamOptimizer(unittest.TestCase):
self.assertRaises(AssertionError, adam2.set_state_dict, opt_state_dict)
adam3 = get_opt('float32', [10, 10]) # shape not match
opt_state_dict['beta1_pow_acc_0'] = np.array([0.9, 0.9],
dtype='float32')
opt_state_dict['beta1_pow_acc_0'] = np.array(
[0.9, 0.9], dtype='float32'
)
self.assertRaises(AssertionError, adam3.set_state_dict, opt_state_dict)
paddle.enable_static()
class TestAdamOpV2Group(TestAdamOpV2):
def test_adam_op(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
......@@ -1038,16 +1054,19 @@ class TestAdamOpV2Group(TestAdamOpV2):
linear_1 = paddle.nn.Linear(13, 5)
linear_2 = paddle.nn.Linear(5, 3)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate=0.01,
parameters=[{
'params': linear_1.parameters()
}, {
adam = paddle.optimizer.Adam(
learning_rate=0.01,
parameters=[
{'params': linear_1.parameters()},
{
'params': linear_2.parameters(),
'weight_decay': 0.001,
'beta1': 0.1,
'beta2': 0.99
}],
weight_decay=0.1)
'beta2': 0.99,
},
],
weight_decay=0.1,
)
out = linear_1(a)
out = linear_2(out)
out.backward()
......@@ -1056,13 +1075,14 @@ class TestAdamOpV2Group(TestAdamOpV2):
class TestMultiTensorAdam(unittest.TestCase):
def _adam_optimize_dygraph(self,
def _adam_optimize_dygraph(
self,
place,
use_param_attr=False,
use_param_group=False,
use_amp=False,
use_multi_tensor=False):
use_multi_tensor=False,
):
paddle.disable_static()
paddle.seed(10)
paddle.set_device(place)
......@@ -1072,29 +1092,40 @@ class TestMultiTensorAdam(unittest.TestCase):
weight_attr = paddle.ParamAttr(
learning_rate=0.5,
regularizer=paddle.regularizer.L2Decay(1.0),
trainable=True)
trainable=True,
)
if use_param_attr:
model = paddle.nn.Linear(5, 5, weight_attr)
else:
model = paddle.nn.Linear(5, 5)
if not use_param_group:
optimizer = paddle.optimizer.Adam(parameters=model.parameters(),
optimizer = paddle.optimizer.Adam(
parameters=model.parameters(),
use_multi_tensor=use_multi_tensor,
multi_precision=use_amp)
multi_precision=use_amp,
)
else:
optimizer = paddle.optimizer.Adam(parameters=[{
'params':
model.parameters(),
'weight_decay':
0.001,
'beta1':
0.1,
'beta2':
0.99
}],
parameters = list(model.parameters())
param_num = len(parameters)
optimizer = paddle.optimizer.Adam(
parameters=[
{
'params': parameters[: int(param_num / 2)],
'weight_decay': 0.001,
'beta1': 0.1,
'beta2': 0.99,
},
{
'params': parameters[int(param_num / 2) :],
'weight_decay': 0.001,
'beta1': 0.1,
'beta2': 0.99,
},
],
use_multi_tensor=use_multi_tensor,
multi_precision=use_amp)
multi_precision=use_amp,
)
for idx in range(2):
if place == 'gpu' and use_amp == True:
......@@ -1118,10 +1149,9 @@ class TestMultiTensorAdam(unittest.TestCase):
return output, model.parameters()
def _adam_optimize_static(self,
place,
use_amp=False,
use_multi_tensor=False):
def _adam_optimize_static(
self, place, use_amp=False, use_multi_tensor=False
):
paddle.enable_static()
paddle.seed(10)
np.random.seed(10)
......@@ -1130,24 +1160,26 @@ class TestMultiTensorAdam(unittest.TestCase):
exe = paddle.static.Executor(place=place)
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.optimizer.Adam(multi_precision=use_amp,
use_multi_tensor=use_multi_tensor)
optimizer = paddle.optimizer.Adam(
multi_precision=use_amp, use_multi_tensor=use_multi_tensor
)
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False)
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(shape=[2, 2],
name='X',
dtype='float16')
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(shape=[2, 2],
name='X',
dtype='float32')
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
......@@ -1159,9 +1191,9 @@ class TestMultiTensorAdam(unittest.TestCase):
x = np.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
loss_data, = exe.run(train_program,
feed={"X": x},
fetch_list=[loss.name])
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
......@@ -1174,49 +1206,59 @@ class TestMultiTensorAdam(unittest.TestCase):
def _check_with_place_amp(self, place, use_amp):
# test dygraph mode
output_dygraph1, params_dygraph1 = self._adam_optimize_dygraph(
place=place, use_amp=use_amp, use_multi_tensor=True)
place=place, use_amp=use_amp, use_multi_tensor=True
)
output_dygraph2, params_dygraph2 = self._adam_optimize_dygraph(
place=place, use_amp=use_amp, use_multi_tensor=False)
place=place, use_amp=use_amp, use_multi_tensor=False
)
np.testing.assert_allclose(output_dygraph1, output_dygraph2, rtol=1e-05)
for idx in range(len(params_dygraph1)):
np.testing.assert_allclose(params_dygraph1[idx],
params_dygraph2[idx],
rtol=1e-05)
np.testing.assert_allclose(
params_dygraph1[idx], params_dygraph2[idx], rtol=1e-05
)
# test static mode
output_static1 = self._adam_optimize_static(place=place,
use_amp=use_amp,
use_multi_tensor=True)
output_static2 = self._adam_optimize_static(place=place,
use_amp=use_amp,
use_multi_tensor=False)
output_static1 = self._adam_optimize_static(
place=place, use_amp=use_amp, use_multi_tensor=True
)
output_static2 = self._adam_optimize_static(
place=place, use_amp=use_amp, use_multi_tensor=False
)
for idx in range(len(output_static1)):
np.testing.assert_allclose(output_static1[idx],
output_static2[idx],
rtol=1e-05)
np.testing.assert_allclose(
output_static1[idx], output_static2[idx], rtol=1e-05
)
def _check_with_param_arrt(self, place, use_amp):
output1, params1 = self._adam_optimize_dygraph(place=place,
output1, params1 = self._adam_optimize_dygraph(
place=place,
use_amp=use_amp,
use_param_attr=True,
use_multi_tensor=True)
output2, params2 = self._adam_optimize_dygraph(place=place,
use_multi_tensor=True,
)
output2, params2 = self._adam_optimize_dygraph(
place=place,
use_amp=use_amp,
use_param_attr=True,
use_multi_tensor=False)
use_multi_tensor=False,
)
np.testing.assert_allclose(output1, output2, rtol=1e-05)
for idx in range(len(params1)):
np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05)
def _check_with_param_group(self, place, use_amp):
output1, params1 = self._adam_optimize_dygraph(place=place,
output1, params1 = self._adam_optimize_dygraph(
place=place,
use_amp=use_amp,
use_param_group=True,
use_multi_tensor=True)
output2, params2 = self._adam_optimize_dygraph(place=place,
use_multi_tensor=True,
)
output2, params2 = self._adam_optimize_dygraph(
place=place,
use_amp=use_amp,
use_param_group=True,
use_multi_tensor=False)
use_multi_tensor=False,
)
np.testing.assert_allclose(output1, output2, rtol=1e-05)
for idx in range(len(params1)):
......
......@@ -25,14 +25,16 @@ import numpy
from paddle.fluid.framework import _test_eager_guard
def calculate_momentum_by_numpy(param,
def calculate_momentum_by_numpy(
param,
grad,
mu,
velocity,
use_nesterov,
learning_rate,
regularization_method=None,
regularization_coeff=1.0):
regularization_coeff=1.0,
):
if regularization_method == "l2_decay":
grad = grad + regularization_coeff * param
......@@ -44,8 +46,9 @@ def calculate_momentum_by_numpy(param,
else:
velocity_out = mu * velocity + grad
if use_nesterov:
param_out = param - grad * learning_rate - \
velocity_out * mu * learning_rate
param_out = (
param - grad * learning_rate - velocity_out * mu * learning_rate
)
else:
param_out = param - learning_rate * velocity_out
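
For reference, calculate_momentum_by_numpy above evaluates the usual (optionally Nesterov) momentum update. In LaTeX, with \mu the momentum, \eta the learning rate, and g \leftarrow g + \lambda\theta applied first when l2_decay regularization is configured:

\begin{aligned}
v &\leftarrow \mu v + g, \\
\theta &\leftarrow \theta - \eta g - \mu\eta v \quad &\text{(use\_nesterov)}, \\
\theta &\leftarrow \theta - \eta v \quad &\text{(otherwise)}.
\end{aligned}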
......@@ -53,7 +56,6 @@ def calculate_momentum_by_numpy(param,
class TestMomentumOp1(OpTest):
def setUp(self):
self.op_type = "momentum"
self.dtype = np.float32
......@@ -70,7 +72,7 @@ class TestMomentumOp1(OpTest):
'Param': param,
'Grad': grad,
'Velocity': velocity,
'LearningRate': learning_rate
'LearningRate': learning_rate,
}
self.attrs = {'mu': mu}
......@@ -81,7 +83,8 @@ class TestMomentumOp1(OpTest):
mu=mu,
velocity=velocity,
use_nesterov=use_nesterov,
learning_rate=learning_rate)
learning_rate=learning_rate,
)
self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
......@@ -93,7 +96,6 @@ class TestMomentumOp1(OpTest):
class TestMomentumOpFp16(TestMomentumOp1):
def init_dtype(self):
self.dtype = np.float16
......@@ -102,8 +104,7 @@ class TestMomentumOpFp16(TestMomentumOp1):
class TestMomentumOp2(OpTest):
'''Test Momentum with default values for attributes
'''
'''Test Momentum with default values for attributes'''
def setUp(self):
self.op_type = "momentum"
......@@ -119,7 +120,7 @@ class TestMomentumOp2(OpTest):
'Param': param,
'Grad': grad,
'Velocity': velocity,
'LearningRate': learning_rate
'LearningRate': learning_rate,
}
self.attrs = {'mu': mu, 'use_nesterov': use_nesterov}
......@@ -130,7 +131,8 @@ class TestMomentumOp2(OpTest):
mu=mu,
velocity=velocity,
use_nesterov=use_nesterov,
learning_rate=learning_rate)
learning_rate=learning_rate,
)
self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
......@@ -138,10 +140,10 @@ class TestMomentumOp2(OpTest):
self.check_output()
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
@unittest.skipIf(
not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestLarsMomentumOpWithMP(OpTest):
def setUp(self):
self.config()
self.op_type = "lars_momentum"
......@@ -168,11 +170,16 @@ class TestLarsMomentumOpWithMP(OpTest):
fp32_grad = grad.astype("float32")
pnorm = np.sqrt(np.square(master_param).sum())
gnorm = np.sqrt(np.square(fp32_grad).sum())
local_lr = learning_rate * lars_coeff * pnorm / (
gnorm + lars_weight_decay * pnorm)
local_lr = (
learning_rate
* lars_coeff
* pnorm
/ (gnorm + lars_weight_decay * pnorm)
)
fp32_grad = fp32_grad * rescale_grad
velocity_out = mu * velocity + local_lr * (
fp32_grad + lars_weight_decay * master_param)
fp32_grad + lars_weight_decay * master_param
)
p_new = master_param - velocity_out
param_out = p_new.astype("float16")
master_param_out = p_new
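
For reference, the LARS computation in the mixed-precision test above is, with \lambda the lars_coeff and \beta the lars_weight_decay:

\begin{aligned}
\eta_{\text{local}} &= \eta\,\lambda\,\frac{\lVert\theta\rVert}{\lVert g\rVert + \beta\,\lVert\theta\rVert}, \\
v &\leftarrow \mu v + \eta_{\text{local}}\,(g + \beta\theta), \\
\theta &\leftarrow \theta - v.
\end{aligned}

In this FP16 path the gradient is first rescaled by rescale_grad and the update is applied to the FP32 master copy, which is then cast back to float16 for the parameter output, as the code above shows.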
......@@ -185,7 +192,8 @@ class TestLarsMomentumOpWithMP(OpTest):
param_outs.append(("SubParam_out_" + str(i), param_out))
master_params.append(("SubMasterParam_" + str(i), master_param))
master_param_outs.append(
("SubMasterParamOut_" + str(i), master_param_out))
("SubMasterParamOut_" + str(i), master_param_out)
)
self.inputs = {
'Param': params,
......@@ -200,13 +208,13 @@ class TestLarsMomentumOpWithMP(OpTest):
'lars_coeff': lars_coeff,
'lars_weight_decay': [lars_weight_decay],
'multi_precision': True,
'rescale_grad': rescale_grad
'rescale_grad': rescale_grad,
}
self.outputs = {
'ParamOut': param_outs,
'VelocityOut': velocity_outs,
'MasterParamOut': master_param_outs
'MasterParamOut': master_param_outs,
}
def test_check_output(self):
......@@ -221,7 +229,6 @@ class TestLarsMomentumOpWithMP(OpTest):
class TestLarsMomentumOp(OpTest):
def setUp(self):
self.config()
self.op_type = "lars_momentum"
......@@ -242,10 +249,15 @@ class TestLarsMomentumOp(OpTest):
learning_rate = np.array([0.001]).astype("float32")
pnorm = np.sqrt(np.square(param).sum())
gnorm = np.sqrt(np.square(grad).sum())
local_lr = learning_rate * lars_coeff * pnorm / (
gnorm + lars_weight_decay * param)
local_lr = (
learning_rate
* lars_coeff
* pnorm
/ (gnorm + lars_weight_decay * param)
)
velocity_out = mu * velocity + local_lr * (
grad + lars_weight_decay * param)
grad + lars_weight_decay * param
)
param_out = param - velocity_out
params.append(("SubParam_" + str(i), param))
......@@ -259,13 +271,13 @@ class TestLarsMomentumOp(OpTest):
'Param': params,
'Grad': grads,
'Velocity': velocitys,
'LearningRate': learning_rates
'LearningRate': learning_rates,
}
self.attrs = {
'mu': mu,
'lars_coeff': lars_coeff,
'lars_weight_decay': [lars_weight_decay]
'lars_weight_decay': [lars_weight_decay],
}
self.outputs = {'ParamOut': param_outs, 'VelocityOut': velocity_outs}
......@@ -278,7 +290,6 @@ class TestLarsMomentumOp(OpTest):
class TestSparseMomentumOp(unittest.TestCase):
def setUp(self):
self.use_nesterov = False
self.regularization_method = ""
......@@ -317,8 +328,9 @@ class TestSparseMomentumOp(unittest.TestCase):
velocity_np_array = np.ones((height, row_numel)).astype("float32")
velocity.set(velocity_np_array, place)
velocity_out = scope.var('VelocityOut').get_tensor()
velocity_out_np_array = np.full((height, row_numel),
0.0).astype("float32")
velocity_out_np_array = np.full((height, row_numel), 0.0).astype(
"float32"
)
velocity_out.set(velocity_out_np_array, place)
# create and initialize LearningRate Variable
......@@ -327,7 +339,8 @@ class TestSparseMomentumOp(unittest.TestCase):
lr.set(lr_array, place)
# create and run operator
op = Operator("momentum",
op = Operator(
"momentum",
Param='Param',
Grad='Grad',
Velocity='Velocity',
......@@ -337,7 +350,8 @@ class TestSparseMomentumOp(unittest.TestCase):
mu=mu,
use_nesterov=use_nesterov,
regularization_method=regularization_method,
regularization_coeff=regularization_coeff)
regularization_coeff=regularization_coeff,
)
op.run(scope, place)
# get and compare result
......@@ -360,7 +374,8 @@ class TestSparseMomentumOp(unittest.TestCase):
use_nesterov=use_nesterov,
learning_rate=lr_array,
regularization_method=regularization_method,
regularization_coeff=regularization_coeff)
regularization_coeff=regularization_coeff,
)
self.assertTrue((_velocity_out == velocity_out_np_array).all())
self.assertTrue((_param_out == param_out_np_array).all())
......@@ -377,13 +392,11 @@ class TestSparseMomentumOp(unittest.TestCase):
class TestSparseMomentumOp2(TestSparseMomentumOp):
def init_kernel(self):
self.use_nesterov = True
class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
def setUp(self):
self.init_args()
self.regularization_method = ""
......@@ -427,8 +440,9 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
velocity_np_array = np.ones((height, row_numel)).astype("float32")
velocity.set(velocity_np_array, place)
velocity_out = scope.var('VelocityOut').get_tensor()
velocity_out_np_array = np.full((height, row_numel),
0.0).astype("float32")
velocity_out_np_array = np.full((height, row_numel), 0.0).astype(
"float32"
)
velocity_out.set(velocity_out_np_array, place)
# create and initialize LearningRate Variable
......@@ -437,7 +451,8 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
lr.set(lr_array, place)
# create and run operator
op = Operator("momentum",
op = Operator(
"momentum",
Param='Param',
Grad='Grad',
Velocity='Velocity',
......@@ -451,7 +466,8 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
regularization_method=regularization_method,
regularization_coeff=regularization_coeff,
multi_precision=True,
rescale_grad=1.0)
rescale_grad=1.0,
)
op.run(scope, place)
# get and compare result
......@@ -472,7 +488,8 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
use_nesterov=use_nesterov,
learning_rate=lr_array,
regularization_method=regularization_method,
regularization_coeff=regularization_coeff)
regularization_coeff=regularization_coeff,
)
self.assertTrue((_velocity_out == velocity_out_np_array).all())
self.assertTrue((_param_out == param_out_np_array).all())
......@@ -486,23 +503,22 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
class TestSparseMomentumOpWithMultiPrecision2(
TestSparseMomentumOpWithMultiPrecision):
TestSparseMomentumOpWithMultiPrecision
):
def init_args(self):
self.use_nesterov = True
class TestMomentumV2(unittest.TestCase):
def test_momentum_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Momentum(learning_rate=0.01,
momentum=0.9,
parameters=linear.parameters())
adam = paddle.optimizer.Momentum(
learning_rate=0.01, momentum=0.9, parameters=linear.parameters()
)
out = linear(a)
out.backward()
adam.step()
......@@ -519,13 +535,15 @@ class TestMomentumV2(unittest.TestCase):
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = paddle.mean(cost)
rms_optimizer = paddle.optimizer.Momentum(learning_rate=0.1,
momentum=0.9)
rms_optimizer = paddle.optimizer.Momentum(
learning_rate=0.1, momentum=0.9
)
rms_optimizer.minimize(avg_cost)
fetch_list = [avg_cost]
train_reader = paddle.batch(paddle.dataset.uci_housing.train(),
batch_size=1)
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1
)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
......@@ -533,9 +551,9 @@ class TestMomentumV2(unittest.TestCase):
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
def test_raise_error(self):
self.assertRaises(ValueError,
paddle.optimizer.Momentum,
learning_rate=None)
self.assertRaises(
ValueError, paddle.optimizer.Momentum, learning_rate=None
)
self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None)
def test_api_eager_dygraph(self):
......@@ -545,7 +563,6 @@ class TestMomentumV2(unittest.TestCase):
class TestMomentumOpWithDecay(OpTest):
def setUp(self):
self.op_type = "momentum"
self.dtype = np.float32
......@@ -567,14 +584,14 @@ class TestMomentumOpWithDecay(OpTest):
'Param': param,
'Grad': grad,
'Velocity': velocity,
'LearningRate': learning_rate
'LearningRate': learning_rate,
}
self.attrs = {
'mu': mu,
'use_nesterov': use_nesterov,
'regularization_method': regularization_method,
'regularization_coeff': regularization_coeff
'regularization_coeff': regularization_coeff,
}
grad = grad + regularization_coeff * param
......@@ -585,7 +602,8 @@ class TestMomentumOpWithDecay(OpTest):
mu=mu,
velocity=velocity,
use_nesterov=use_nesterov,
learning_rate=learning_rate)
learning_rate=learning_rate,
)
self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
......@@ -598,7 +616,6 @@ class TestMomentumOpWithDecay(OpTest):
class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay):
def init_config(self):
self.dtype = np.float16
......@@ -608,13 +625,11 @@ class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay):
class TestMomentumOpWithDecay2(TestMomentumOpWithDecay):
def init_config(self):
self.use_nesterov = False
class TestSparseMomentumOpWithDecay(TestSparseMomentumOp):
def setUp(self):
self.use_nesterov = False
self.regularization_method = 'l2_decay'
......@@ -622,13 +637,11 @@ class TestSparseMomentumOpWithDecay(TestSparseMomentumOp):
class TestSparseMomentumOpWithDecay2(TestSparseMomentumOpWithDecay):
def init_kernel(self):
self.use_nesterov = True
class TestMomentumOpWithDecayAPI(unittest.TestCase):
def _test_momentum_dygraph_common(self, regularization):
paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
......@@ -641,13 +654,16 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase):
learning_rate=0.01,
momentum=0.9,
parameter_list=linear.parameters(),
regularization=regularization)
regularization=regularization,
)
momentum.minimize(loss)
def test_momentum_dygraph_1(self):
self._test_momentum_dygraph_common(
regularization=paddle.fluid.regularizer.L2Decay(
regularization_coeff=0.1))
regularization_coeff=0.1
)
)
def test_momentum_static(self):
paddle.enable_static()
......@@ -661,12 +677,14 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase):
avg_cost = paddle.mean(cost)
momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum(
learning_rate=0.1, momentum=0.9)
learning_rate=0.1, momentum=0.9
)
momentum_optimizer.minimize(avg_cost)
fetch_list = [avg_cost]
train_reader = paddle.batch(paddle.dataset.uci_housing.train(),
batch_size=1)
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1
)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
......@@ -675,23 +693,23 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase):
class TestFusedMomentumWithDecayAPI(unittest.TestCase):
def get_program(self, weight_attr, bias_attr=False):
main_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(main_program=main_program,
startup_program=startup_program):
with paddle.static.program_guard(
main_program=main_program, startup_program=startup_program
):
x = paddle.static.data(name='x', shape=[10, 10])
linear = paddle.nn.Linear(10,
10,
weight_attr=weight_attr,
bias_attr=bias_attr)
linear = paddle.nn.Linear(
10, 10, weight_attr=weight_attr, bias_attr=bias_attr
)
out = linear(x)
loss = paddle.mean(out)
optimizer = paddle.optimizer.Momentum(
learning_rate=0.01,
momentum=0.9,
weight_decay=paddle.regularizer.L2Decay(0.5))
weight_decay=paddle.regularizer.L2Decay(0.5),
)
optimizer.minimize(loss)
return main_program
......@@ -700,7 +718,8 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase):
weight_attr = paddle.ParamAttr(
name="weight",
initializer=paddle.nn.initializer.Constant(value=0.5),
regularizer=paddle.regularizer.L2Decay(0.1))
regularizer=paddle.regularizer.L2Decay(0.1),
)
program = self.get_program(weight_attr, bias_attr=False)
ops = program.global_block().ops
......@@ -715,11 +734,13 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase):
weight_attr = paddle.ParamAttr(
name="weight",
initializer=paddle.nn.initializer.Constant(value=0.5),
regularizer=paddle.regularizer.L1Decay(0.1))
regularizer=paddle.regularizer.L1Decay(0.1),
)
bias_attr = paddle.ParamAttr(
name="bias",
initializer=paddle.nn.initializer.Constant(value=0.),
regularizer=None)
initializer=paddle.nn.initializer.Constant(value=0.0),
regularizer=None,
)
program = self.get_program(weight_attr, bias_attr)
ops = program.global_block().ops
......@@ -734,8 +755,9 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase):
self.assertEqual(ops[-1].attr('regularization_coeff'), 0)
if 'bias' in ops[-2].input('Param'):
self.assertEqual(ops[-2].attr('regularization_method'), 'l2_decay')
self.assertEqual(ops[-2].attr('regularization_coeff'),
np.float32(0.5))
self.assertEqual(
ops[-2].attr('regularization_coeff'), np.float32(0.5)
)
def test_param_has_no_regularizer(self):
paddle.enable_static()
......@@ -749,11 +771,11 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase):
class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
def __update_params(self, momentum, linear):
for i in range(10):
inp = paddle.full(shape=[2, 2], fill_value=i,
dtype='float32').astype("float32")
inp = paddle.full(
shape=[2, 2], fill_value=i, dtype='float32'
).astype("float32")
inp = paddle.to_tensor(inp)
out = linear(inp)
loss = paddle.mean(out)
......@@ -768,32 +790,39 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
2,
2,
weight_attr=paddle.nn.initializer.Constant(value=2.0),
bias_attr=paddle.nn.initializer.Constant(value=2.0))
bias_attr=paddle.nn.initializer.Constant(value=2.0),
)
momentum_old = paddle.fluid.optimizer.Momentum(
learning_rate=0.01,
momentum=0.9,
parameter_list=linear_old.parameters(),
regularization=paddle.fluid.regularizer.L2Decay(
regularization_coeff=0.1))
regularization_coeff=0.1
),
)
self.__update_params(momentum=momentum_old, linear=linear_old)
linear_new = paddle.nn.Linear(
2,
2,
weight_attr=paddle.nn.initializer.Constant(value=2.0),
bias_attr=paddle.nn.initializer.Constant(value=2.0))
bias_attr=paddle.nn.initializer.Constant(value=2.0),
)
momentum_new = paddle.fluid.contrib.optimizer.Momentum(
learning_rate=0.01,
momentum=0.9,
parameter_list=linear_new.parameters(),
regularization=paddle.fluid.regularizer.L2Decay(
regularization_coeff=0.1))
regularization_coeff=0.1
),
)
self.__update_params(momentum=momentum_new, linear=linear_new)
self.assertEqual(
(linear_old.weight.numpy() == linear_new.weight.numpy()).all(),
True,
'the param weight updated by two Momentum optimizers should equal')
'the param weight updated by two Momentum optimizers should equal',
)
def test_vs(self, place=fluid.CPUPlace()):
places = [fluid.CPUPlace()]
......@@ -805,7 +834,6 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
class TestMomentumV2Group(TestMomentumV2):
def test_momentum_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
......@@ -813,22 +841,20 @@ class TestMomentumV2Group(TestMomentumV2):
linear_1 = paddle.nn.Linear(13, 5)
linear_2 = paddle.nn.Linear(5, 3)
# This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Momentum(learning_rate=0.01,
parameters=[{
'params':
linear_1.parameters()
}, {
'params':
linear_2.parameters(),
'weight_decay':
0.001,
'learning_rate':
0.1,
'momentum':
0.99
}],
adam = paddle.optimizer.Momentum(
learning_rate=0.01,
parameters=[
{'params': linear_1.parameters()},
{
'params': linear_2.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1,
'momentum': 0.99,
},
],
weight_decay=0.1,
momentum=0.9)
momentum=0.9,
)
out = linear_1(a)
out = linear_2(out)
out.backward()
......@@ -837,13 +863,14 @@ class TestMomentumV2Group(TestMomentumV2):
class TestMultiTensorMomentumDygraph(unittest.TestCase):
def _momentum_optimize_dygraph(self,
def _momentum_optimize_dygraph(
self,
place,
use_param_attr=False,
use_param_group=False,
use_amp=False,
use_multi_tensor=False):
use_multi_tensor=False,
):
paddle.disable_static()
paddle.seed(10)
paddle.set_device(place)
......@@ -851,7 +878,8 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
weight_attr = paddle.ParamAttr(
learning_rate=0.5,
regularizer=paddle.regularizer.L2Decay(1.0),
trainable=True)
trainable=True,
)
if use_param_attr:
model = paddle.nn.Linear(5, 5, weight_attr)
else:
......@@ -860,17 +888,29 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
optimizer = paddle.optimizer.Momentum(
parameters=model.parameters(),
use_multi_tensor=use_multi_tensor,
multi_precision=use_amp)
multi_precision=use_amp,
)
else:
parameters = list(model.parameters())
n = len(parameters)
optimizer = paddle.optimizer.Momentum(
parameters=[{
'params': model.parameters(),
parameters=[
{
'params': parameters[: int(n / 2)],
'weight_decay': 0.001,
'learning_rate': 0.1,
'momentum': 0.99,
},
{
'params': parameters[int(n / 2) :],
'weight_decay': 0.001,
'learning_rate': 0.1,
'momentum': 0.99
}],
'momentum': 0.99,
},
],
use_multi_tensor=use_multi_tensor,
multi_precision=use_amp)
multi_precision=use_amp,
)
for idx in range(5):
if place == 'gpu' and use_amp == True:
model = paddle.amp.decorate(models=model, level='O2')
......@@ -900,9 +940,11 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
def _check_with_place_amp(self, place, use_amp):
output1, params1 = self._momentum_optimize_dygraph(
place=place, use_amp=use_amp, use_multi_tensor=True)
place=place, use_amp=use_amp, use_multi_tensor=True
)
output2, params2 = self._momentum_optimize_dygraph(
place=place, use_amp=use_amp, use_multi_tensor=False)
place=place, use_amp=use_amp, use_multi_tensor=False
)
np.testing.assert_allclose(output1, output2, rtol=1e-05)
for idx in range(len(params1)):
......@@ -913,12 +955,14 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
place=place,
use_amp=use_amp,
use_param_attr=True,
use_multi_tensor=True)
use_multi_tensor=True,
)
output2, params2 = self._momentum_optimize_dygraph(
place=place,
use_amp=use_amp,
use_param_attr=True,
use_multi_tensor=False)
use_multi_tensor=False,
)
np.testing.assert_allclose(output1, output2, rtol=1e-05)
for idx in range(len(params1)):
np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05)
......@@ -928,12 +972,14 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
place=place,
use_amp=use_amp,
use_param_group=True,
use_multi_tensor=True)
use_multi_tensor=True,
)
output2, params2 = self._momentum_optimize_dygraph(
place=place,
use_amp=use_amp,
use_param_group=True,
use_multi_tensor=False)
use_multi_tensor=False,
)
np.testing.assert_allclose(output1, output2, rtol=1e-05)
for idx in range(len(params1)):
np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05)
......@@ -952,11 +998,9 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
class TestMultiTensorMomentumStatic(unittest.TestCase):
def _momentum_optimize_static(self,
place,
use_amp=False,
use_multi_tensor=False):
def _momentum_optimize_static(
self, place, use_amp=False, use_multi_tensor=False
):
paddle.enable_static()
paddle.seed(10)
np.random.seed(10)
......@@ -965,24 +1009,26 @@ class TestMultiTensorMomentumStatic(unittest.TestCase):
exe = paddle.static.Executor(place=place)
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
optimizer = paddle.optimizer.Momentum(multi_precision=use_amp,
use_multi_tensor=use_multi_tensor)
optimizer = paddle.optimizer.Momentum(
multi_precision=use_amp, use_multi_tensor=use_multi_tensor
)
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
init_loss_scaling=128.0,
use_dynamic_loss_scaling=True,
use_pure_fp16=True,
use_fp16_guard=False)
use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program):
if use_amp:
data = paddle.static.data(shape=[2, 2],
name='X',
dtype='float16')
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float16'
)
else:
data = paddle.static.data(shape=[2, 2],
name='X',
dtype='float32')
data = paddle.static.data(
shape=[2, 2], name='X', dtype='float32'
)
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
optimizer.minimize(loss)
......@@ -994,9 +1040,9 @@ class TestMultiTensorMomentumStatic(unittest.TestCase):
x = numpy.random.random(size=(2, 2)).astype('float32')
out = []
for idx in range(5):
loss_data, = exe.run(train_program,
feed={"X": x},
fetch_list=[loss.name])
(loss_data,) = exe.run(
train_program, feed={"X": x}, fetch_list=[loss.name]
)
out.append(loss_data)
return out
......@@ -1007,12 +1053,12 @@ class TestMultiTensorMomentumStatic(unittest.TestCase):
return places
def _check_with_place_amp(self, place, use_amp):
output1 = self._momentum_optimize_static(place=place,
use_amp=use_amp,
use_multi_tensor=True)
output2 = self._momentum_optimize_static(place=place,
use_amp=use_amp,
use_multi_tensor=False)
output1 = self._momentum_optimize_static(
place=place, use_amp=use_amp, use_multi_tensor=True
)
output2 = self._momentum_optimize_static(
place=place, use_amp=use_amp, use_multi_tensor=False
)
for idx in range(len(output1)):
np.testing.assert_allclose(output1[idx], output2[idx], rtol=1e-05)
......
......@@ -163,7 +163,8 @@ class Adam(Optimizer):
_beta1_pow_acc_str = "beta1_pow_acc"
_beta2_pow_acc_str = "beta2_pow_acc"
def __init__(self,
def __init__(
self,
learning_rate=0.001,
beta1=0.9,
beta2=0.999,
......@@ -174,7 +175,8 @@ class Adam(Optimizer):
lazy_mode=False,
multi_precision=False,
use_multi_tensor=False,
name=None):
name=None,
):
assert learning_rate is not None
assert beta1 is not None
assert beta2 is not None
......@@ -182,20 +184,25 @@ class Adam(Optimizer):
if not isinstance(beta1, Variable):
if not 0 <= beta1 < 1:
raise ValueError(
"Invaild value of beta1, expect beta1 in [0,1).")
"Invaild value of beta1, expect beta1 in [0,1)."
)
if not isinstance(beta2, Variable):
if not 0 <= beta2 < 1:
raise ValueError(
"Invaild value of beta2, expect beta2 in [0,1).")
"Invaild value of beta2, expect beta2 in [0,1)."
)
if not isinstance(epsilon, Variable):
if not 0 <= epsilon:
raise ValueError(
"Invaild value of epsilon, expect epsilon >= 0.")
super(Adam, self).__init__(learning_rate=learning_rate,
"Invaild value of epsilon, expect epsilon >= 0."
)
super(Adam, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
weight_decay=weight_decay,
grad_clip=grad_clip,
name=name)
name=name,
)
self.type = "adam"
self._beta1 = beta1
self._beta2 = beta2
......@@ -212,21 +219,13 @@ class Adam(Optimizer):
self._use_multi_tensor = use_multi_tensor
if self._use_multi_tensor:
self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
self._moment1_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
self._moment2_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
self._beta1_pow_acc_dict = {
'FP32_LODTensor': [],
'FP16_LODTensor': []
}
self._beta2_pow_acc_dict = {
'FP32_LODTensor': [],
'FP16_LODTensor': []
}
self._master_weight_dict = {
'FP32_LODTensor': None,
'FP16_LODTensor': []
}
self._param_dict = self._create_multi_tensor_dict()
self._moment1_dict = self._create_multi_tensor_dict()
self._moment2_dict = self._create_multi_tensor_dict()
self._beta1_pow_acc_dict = self._create_multi_tensor_dict()
self._beta2_pow_acc_dict = self._create_multi_tensor_dict()
self._master_weight_dict = self._create_multi_tensor_dict()
self._master_weight_dict['FP32_LODTensor'] = None
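
This block is the heart of the fix: the per-dtype accumulator lists that used to be shared across all parameters are replaced by self._create_multi_tensor_dict(), so each parameter group defined by a list-of-dict parameters argument gets its own FP32/FP16 buckets. The helper's body is not part of this hunk; a hypothetical sketch of the shape it is expected to return (an assumption, not copied from the commit) is:

    def _create_multi_tensor_dict(self):
        # Hypothetical sketch of the helper referenced above: one bucket of
        # FP32/FP16 tensor lists per parameter group, so multi-tensor state
        # is no longer shared when `parameters` is a list of dicts.
        num_groups = len(self._param_groups) if self._param_groups else 1
        return {
            'FP32_LODTensor': [[] for _ in range(num_groups)],
            'FP16_LODTensor': [[] for _ in range(num_groups)],
        }

The last line of the hunk keeps the original behaviour of the master-weight bookkeeping by resetting its 'FP32_LODTensor' entry to None, matching the old hard-coded dict.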
def _create_master_weight(self, param):
if param.name in self._master_weights:
......@@ -236,19 +235,23 @@ class Adam(Optimizer):
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = layers.create_global_var(name=var_name,
var = layers.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True)
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(type="cast",
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32
})
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
......@@ -262,20 +265,30 @@ class Adam(Optimizer):
"""
if self._name is not None:
name = self._name + "_" + name
find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
target_param = self._master_weights[
param.name] if find_master else param
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (name not in self._accumulators
or target_name not in self._accumulators[name]):
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name))
name, target_name
)
)
return self._accumulators[name][target_name]
def _add_moments_pows(self, p):
acc_dtype = p.dtype
if acc_dtype == core.VarDesc.VarType.FP16 or acc_dtype == core.VarDesc.VarType.BF16:
if (
acc_dtype == core.VarDesc.VarType.FP16
or acc_dtype == core.VarDesc.VarType.BF16
):
acc_dtype = core.VarDesc.VarType.FP32
self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype)
self._add_accumulator(self._moment2_acc_str, p, dtype=acc_dtype)
......@@ -283,18 +296,24 @@ class Adam(Optimizer):
name=self._beta1_pow_acc_str,
param=p,
dtype=acc_dtype,
fill_value=0.9 if isinstance(self._beta1, Variable) \
fill_value=0.9
if isinstance(self._beta1, Variable)
else self._beta1,
shape=[1],
type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
type=core.VarDesc.VarType.LOD_TENSOR,
device='cpu',
)
self._add_accumulator(
name=self._beta2_pow_acc_str,
param=p,
dtype=acc_dtype,
fill_value=0.999 if isinstance(self._beta2, Variable) \
fill_value=0.999
if isinstance(self._beta2, Variable)
else self._beta2,
shape=[1],
type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
type=core.VarDesc.VarType.LOD_TENSOR,
device='cpu',
)
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
......@@ -307,7 +326,10 @@ class Adam(Optimizer):
master_p = self._create_master_weight(p)
self._add_moments_pows(master_p)
continue
if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
if (
p.dtype == core.VarDesc.VarType.FP16
and not self._multi_precision
):
warnings.warn(
"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Adam optimizer."
......@@ -319,50 +341,105 @@ class Adam(Optimizer):
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
moment1 = self._get_accumulator(self._moment1_acc_str,
param_and_grad[0])
moment2 = self._get_accumulator(self._moment2_acc_str,
param_and_grad[0])
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param_and_grad[0])
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
param_and_grad[0])
find_master = self._multi_precision and param_and_grad[
0].dtype == core.VarDesc.VarType.FP16
master_weight = (self._master_weights[param_and_grad[0].name]
if find_master else None)
moment1 = self._get_accumulator(
self._moment1_acc_str, param_and_grad[0]
)
moment2 = self._get_accumulator(
self._moment2_acc_str, param_and_grad[0]
)
beta1_pow_acc = self._get_accumulator(
self._beta1_pow_acc_str, param_and_grad[0]
)
beta2_pow_acc = self._get_accumulator(
self._beta2_pow_acc_str, param_and_grad[0]
)
find_master = (
self._multi_precision
and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
lr = self._create_param_lr(param_and_grad)
# create the adam optimize op
if framework.in_dygraph_mode():
found_inf = self._get_auxiliary_var('found_inf')
_beta1 = self._beta1 if not isinstance(
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
_beta1 = (
self._beta1
if not isinstance(self._beta1, Variable)
else self._beta1.numpy().item(0)
)
_beta2 = (
self._beta2
if not isinstance(self._beta2, Variable)
else self._beta2.numpy().item(0)
)
_, _, _, _, _, _ = _C_ops.adam_(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, master_weight, found_inf, _beta1,
_beta2, self._epsilon, self._lazy_mode, 1000, find_master,
False)
param_and_grad[0],
param_and_grad[1],
lr,
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
found_inf,
_beta1,
_beta2,
self._epsilon,
self._lazy_mode,
1000,
find_master,
False,
)
return None
if framework._in_legacy_dygraph():
_beta1 = self._beta1 if not isinstance(
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
_beta1 = (
self._beta1
if not isinstance(self._beta1, Variable)
else self._beta1.numpy().item(0)
)
_beta2 = (
self._beta2
if not isinstance(self._beta2, Variable)
else self._beta2.numpy().item(0)
)
_, _, _, _, _, _ = _legacy_C_ops.adam(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0],
moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight,
'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode,
'min_row_size_to_use_multithread', 1000, 'beta1', _beta1,
'beta2', _beta2, 'multi_precision', find_master)
param_and_grad[0],
param_and_grad[1],
lr,
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
param_and_grad[0],
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
'epsilon',
self._epsilon,
'lazy_mode',
self._lazy_mode,
'min_row_size_to_use_multithread',
1000,
'beta1',
_beta1,
'beta2',
_beta2,
'multi_precision',
find_master,
)
return None
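# A rough sketch of the update that the adam_/adam kernels above apply (textbook Adam;
# Paddle's exact epsilon placement and the lazy/sparse path may differ slightly):
#
#   moment1 = beta1 * moment1 + (1 - beta1) * grad
#   moment2 = beta2 * moment2 + (1 - beta2) * grad * grad
#   lr_t    = lr * sqrt(1 - beta2_pow) / (1 - beta1_pow)   # beta*_pow accumulate beta**t
#   param   = param - lr_t * moment1 / (sqrt(moment2) + epsilon)
#
# beta1_pow_acc and beta2_pow_acc are the [1]-shaped accumulators created by
# _add_moments_pows; the kernel also multiplies them by beta1 and beta2 each step.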
......@@ -373,7 +450,7 @@ class Adam(Optimizer):
"Moment1": [moment1],
"Moment2": [moment2],
"Beta1Pow": [beta1_pow_acc],
"Beta2Pow": [beta2_pow_acc]
"Beta2Pow": [beta2_pow_acc],
}
outputs = {
"ParamOut": [param_and_grad[0]],
......@@ -385,7 +462,7 @@ class Adam(Optimizer):
attrs = {
"lazy_mode": self._lazy_mode,
"min_row_size_to_use_multithread": 1000,
"multi_precision": find_master
"multi_precision": find_master,
}
if isinstance(self._beta1, Variable):
......@@ -405,11 +482,13 @@ class Adam(Optimizer):
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
adam_op = block.append_op(type=self.type,
adam_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True)
stop_gradient=True,
)
return adam_op
......@@ -445,27 +524,34 @@ class Adam(Optimizer):
if param._grad_ivar() is not None:
grad_var = param._grad_ivar()
if in_dygraph_mode():
if hasattr(grad_var, "is_selected_rows"
) and grad_var.is_selected_rows(
) and self.regularization is not None:
if (
hasattr(grad_var, "is_selected_rows")
and grad_var.is_selected_rows()
and self.regularization is not None
):
raise RuntimeError(
"Adam don't support weight_decay with sparse parameters, please set it to None."
)
else:
if hasattr(
grad_var, "_is_sparse") and grad_var._is_sparse(
) and self.regularization is not None:
if (
hasattr(grad_var, "_is_sparse")
and grad_var._is_sparse()
and self.regularization is not None
):
raise RuntimeError(
"Adam don't support weight_decay with sparse parameters, please set it to None."
)
params_grads.append((param, grad_var))
optimize_ops = self._apply_optimize(loss=None,
optimize_ops = self._apply_optimize(
loss=None,
startup_program=None,
params_grads=params_grads)
params_grads=params_grads,
param_group_idx=0,
)
else:
# optimize parameters in groups
for param_group in self._param_groups:
for idx, param_group in enumerate(self._param_groups):
params_grads = defaultdict(lambda: list())
for param in param_group['params']:
if param.stop_gradient:
......@@ -474,13 +560,16 @@ class Adam(Optimizer):
grad_var = param._grad_ivar()
params_grads['params'].append((param, grad_var))
params_grads.update(
{k: v
for k, v in param_group.items() if k != 'params'})
self._apply_optimize(loss=None,
{k: v for k, v in param_group.items() if k != 'params'}
)
self._apply_optimize(
loss=None,
startup_program=None,
params_grads=params_grads)
params_grads=params_grads,
param_group_idx=idx,
)
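# Usage sketch for the parameter-group branch above (hypothetical snippet, assuming
# the public paddle.optimizer.Adam API of release 2.4): passing `parameters` as a
# list of dicts together with use_multi_tensor=True is exactly the configuration
# whose per-group (param_group_idx) bookkeeping this change fixes.
#
#   import paddle
#   net_a, net_b = paddle.nn.Linear(8, 8), paddle.nn.Linear(8, 8)
#   opt = paddle.optimizer.Adam(
#       learning_rate=0.1,
#       parameters=[
#           {'params': net_a.parameters()},
#           {'params': net_b.parameters(), 'learning_rate': 0.01},
#       ],
#       use_multi_tensor=True,
#   )
#   loss = paddle.mean(net_b(net_a(paddle.rand([4, 8]))))
#   loss.backward()
#   opt.step()
#   opt.clear_grad()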
def _multi_tensor_init(self, target_block, parameters):
def _multi_tensor_init(self, target_block, parameters, param_group_idx):
"""
All tensors used by the optimizer update (parameters, master weights, and accumulators such as velocity_acc for momentum) are grouped into Python lists by data type (float16, float32).
This function will be overridden in the corresponding optimizer file.
......@@ -492,26 +581,49 @@ class Adam(Optimizer):
for param in parameters:
moment1 = self._get_accumulator(self._moment1_acc_str, param)
moment2 = self._get_accumulator(self._moment2_acc_str, param)
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param)
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
param)
beta1_pow_acc = self._get_accumulator(
self._beta1_pow_acc_str, param
)
beta2_pow_acc = self._get_accumulator(
self._beta2_pow_acc_str, param
)
if param.dtype == paddle.float32:
self._param_dict['FP32_LODTensor'].append(param)
self._moment1_dict['FP32_LODTensor'].append(moment1)
self._moment2_dict['FP32_LODTensor'].append(moment2)
self._beta1_pow_acc_dict['FP32_LODTensor'].append(beta1_pow_acc)
self._beta2_pow_acc_dict['FP32_LODTensor'].append(beta2_pow_acc)
self._param_dict['FP32_LODTensor'][param_group_idx].append(
param
)
self._moment1_dict['FP32_LODTensor'][param_group_idx].append(
moment1
)
self._moment2_dict['FP32_LODTensor'][param_group_idx].append(
moment2
)
self._beta1_pow_acc_dict['FP32_LODTensor'][
param_group_idx
].append(beta1_pow_acc)
self._beta2_pow_acc_dict['FP32_LODTensor'][
param_group_idx
].append(beta2_pow_acc)
elif param.dtype == paddle.float16:
self._param_dict['FP16_LODTensor'].append(param)
self._moment1_dict['FP16_LODTensor'].append(moment1)
self._moment2_dict['FP16_LODTensor'].append(moment2)
self._beta1_pow_acc_dict['FP16_LODTensor'].append(beta1_pow_acc)
self._beta2_pow_acc_dict['FP16_LODTensor'].append(beta2_pow_acc)
self._param_dict['FP16_LODTensor'][param_group_idx].append(
param
)
self._moment1_dict['FP16_LODTensor'][param_group_idx].append(
moment1
)
self._moment2_dict['FP16_LODTensor'][param_group_idx].append(
moment2
)
self._beta1_pow_acc_dict['FP16_LODTensor'][
param_group_idx
].append(beta1_pow_acc)
self._beta2_pow_acc_dict['FP16_LODTensor'][
param_group_idx
].append(beta2_pow_acc)
if self._multi_precision:
self._master_weight_dict['FP16_LODTensor'].append(
self._master_weights[param.name])
self._master_weight_dict['FP16_LODTensor'][
param_group_idx
].append(self._master_weights[param.name])
else:
self._master_weight_dict['FP16_LODTensor'] = None
else:
......@@ -519,8 +631,12 @@ class Adam(Optimizer):
"Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR."
)
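# A minimal sketch of what _multi_tensor_init leaves behind for one parameter group,
# assuming two FP32 parameters and one FP16 parameter with multi_precision=True
# (tensor names are illustrative only):
#
#   self._param_dict['FP32_LODTensor'][param_group_idx]         -> [w0, w1]
#   self._moment1_dict['FP32_LODTensor'][param_group_idx]       -> [m1_w0, m1_w1]
#   self._moment2_dict['FP32_LODTensor'][param_group_idx]       -> [m2_w0, m2_w1]
#   self._beta1_pow_acc_dict['FP32_LODTensor'][param_group_idx] -> [b1p_w0, b1p_w1]
#   self._param_dict['FP16_LODTensor'][param_group_idx]         -> [h0]
#   self._master_weight_dict['FP16_LODTensor'][param_group_idx] -> [h0_fp32_master]
#
# Each cache is keyed first by dtype bucket and then indexed by the parameter-group
# position, so tensors from different groups no longer share one flat list.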
def _append_optimize_multi_tensor_op(self, target_block,
parameters_and_grads):
def _append_optimize_multi_tensor_op(
self,
target_block,
parameters_and_grads,
param_group_idx,
):
"""
For multi-tensor mode, append the merged optimize operator to the block.
"""
......@@ -534,15 +650,19 @@ class Adam(Optimizer):
if param_and_grad[1] is None:
continue
if param_and_grad[0].stop_gradient is False:
if param_and_grad[
0].dtype == paddle.float32 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
if (
param_and_grad[0].dtype == paddle.float32
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP32_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP32_LODTensor'].append(lr)
elif param_and_grad[
0].dtype == paddle.float16 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
elif (
param_and_grad[0].dtype == paddle.float16
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP16_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP16_LODTensor'].append(lr)
......@@ -553,97 +673,149 @@ class Adam(Optimizer):
if param_and_grad[0].stop_gradient is False:
param_grad_dict = dict()
param_grad_dict['params'] = param_and_grad
param_grad_dict.update({
param_grad_dict.update(
{
k: v
for k, v in parameters_and_grads.items()
if k != 'params'
})
}
)
param_and_grad = self._update_param_group(param_grad_dict)
if param_and_grad[
0].dtype == paddle.float32 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
if (
param_and_grad[0].dtype == paddle.float32
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP32_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP32_LODTensor'].append(lr)
elif param_and_grad[
0].dtype == paddle.float16 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
elif (
param_and_grad[0].dtype == paddle.float16
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP16_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP16_LODTensor'].append(lr)
multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor']
for key in multi_tensor_list:
if len(self._param_dict[key]) > 0:
if len(self._param_dict[key][param_group_idx]) > 0:
find_master = self._multi_precision and key == 'FP16_LODTensor'
_beta1 = self._beta1 if not isinstance(
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
_beta1 = (
self._beta1
if not isinstance(self._beta1, Variable)
else self._beta1.numpy().item(0)
)
_beta2 = (
self._beta2
if not isinstance(self._beta2, Variable)
else self._beta2.numpy().item(0)
)
if framework._non_static_mode():
master_weight = self._master_weight_dict[key]
master_weight = (
master_weight[param_group_idx]
if master_weight is not None
else None
)
if in_dygraph_mode():
_, _, _, _, _, _ = _C_ops.merged_adam_(
self._param_dict[key], grad_dict[key], lr_dict[key],
self._moment1_dict[key], self._moment2_dict[key],
self._beta1_pow_acc_dict[key],
self._beta2_pow_acc_dict[key],
self._master_weight_dict[key], _beta1, _beta2,
self._epsilon, find_master, False)
self._param_dict[key][param_group_idx],
grad_dict[key],
lr_dict[key],
self._moment1_dict[key][param_group_idx],
self._moment2_dict[key][param_group_idx],
self._beta1_pow_acc_dict[key][param_group_idx],
self._beta2_pow_acc_dict[key][param_group_idx],
master_weight,
_beta1,
_beta2,
self._epsilon,
find_master,
False,
)
else:
_, _, _, _, _, _ = _legacy_C_ops.merged_adam(
self._param_dict[key], grad_dict[key], lr_dict[key],
self._moment1_dict[key], self._moment2_dict[key],
self._beta1_pow_acc_dict[key],
self._beta2_pow_acc_dict[key],
self._master_weight_dict[key],
self._param_dict[key], self._moment1_dict[key],
self._moment2_dict[key],
self._beta1_pow_acc_dict[key],
self._beta2_pow_acc_dict[key],
self._master_weight_dict[key], 'epsilon',
self._epsilon, 'beta1', _beta1, 'beta2', _beta2,
'multi_precision', find_master)
self._param_dict[key][param_group_idx],
grad_dict[key],
lr_dict[key],
self._moment1_dict[key][param_group_idx],
self._moment2_dict[key][param_group_idx],
self._beta1_pow_acc_dict[key][param_group_idx],
self._beta2_pow_acc_dict[key][param_group_idx],
master_weight,
self._param_dict[key][param_group_idx],
self._moment1_dict[key][param_group_idx],
self._moment2_dict[key][param_group_idx],
self._beta1_pow_acc_dict[key][param_group_idx],
self._beta2_pow_acc_dict[key][param_group_idx],
master_weight,
'epsilon',
self._epsilon,
'beta1',
_beta1,
'beta2',
_beta2,
'multi_precision',
find_master,
)
else:
inputs = {
"Param": self._param_dict[key],
"Param": self._param_dict[key][param_group_idx],
"Grad": grad_dict[key],
"LearningRate": lr_dict[key],
"Moment1": self._moment1_dict[key],
"Moment2": self._moment2_dict[key],
"Beta1Pow": self._beta1_pow_acc_dict[key],
"Beta2Pow": self._beta2_pow_acc_dict[key]
"Moment1": self._moment1_dict[key][param_group_idx],
"Moment2": self._moment2_dict[key][param_group_idx],
"Beta1Pow": self._beta1_pow_acc_dict[key][
param_group_idx
],
"Beta2Pow": self._beta2_pow_acc_dict[key][
param_group_idx
],
}
outputs = {
"ParamOut": self._param_dict[key],
"Moment1Out": self._moment1_dict[key],
"Moment2Out": self._moment2_dict[key],
"Beta1PowOut": self._beta1_pow_acc_dict[key],
"Beta2PowOut": self._beta2_pow_acc_dict[key]
"ParamOut": self._param_dict[key][param_group_idx],
"Moment1Out": self._moment1_dict[key][param_group_idx],
"Moment2Out": self._moment2_dict[key][param_group_idx],
"Beta1PowOut": self._beta1_pow_acc_dict[key][
param_group_idx
],
"Beta2PowOut": self._beta2_pow_acc_dict[key][
param_group_idx
],
}
attrs = {
"epsilon": self._epsilon,
"beta1": _beta1,
"beta2": _beta2
"beta2": _beta2,
}
if find_master:
inputs["MasterParam"] = self._master_weight_dict[key]
inputs["MasterParam"] = self._master_weight_dict[key][
param_group_idx
]
outputs["MasterParamOut"] = self._master_weight_dict[
key]
key
][param_group_idx]
attrs["multi_precision"] = find_master
target_block.append_op(type="merged_adam",
target_block.append_op(
type="merged_adam",
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True)
stop_gradient=True,
)
return None
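# Rough mental model of the dygraph branch above (one fused kernel per dtype bucket,
# lists aligned element-wise, tensors updated in place):
#
#   for key in ('FP32_LODTensor', 'FP16_LODTensor'):
#       if len(self._param_dict[key][param_group_idx]) > 0:
#           merged_adam(self._param_dict[key][param_group_idx],   # params of this group
#                       grad_dict[key], lr_dict[key], ...)        # rebuilt every step
#
# Indexing every cached dict with [key][param_group_idx] keeps tensors that belong to
# different parameter groups (for example, groups with different learning rates) out
# of the same fused call.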
def _update_param_group(self, parameters):
self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
self._beta2 = parameters.get('beta2', self._default_dict['beta2'])
self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
self._lazy_mode = parameters.get('lazy_mode',
self._default_dict['lazy_mode'])
self._lazy_mode = parameters.get(
'lazy_mode', self._default_dict['lazy_mode']
)
parameters = parameters.get('params')
return parameters
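# A sketch of the dict this method receives when parameter groups are used; the keys
# other than 'params' are optional per-group overrides and the values below are
# illustrative:
#
#   {
#       'params': [(param0, grad0), (param1, grad1)],
#       'beta1': 0.85,          # beta2 / epsilon / lazy_mode fall back to the defaults
#       'weight_decay': None,
#       'grad_clip': None,
#   }
#
# The overrides are written onto self._beta1 / self._beta2 / self._epsilon /
# self._lazy_mode for the duration of this group's update, and the bare 'params'
# list is returned to the caller.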
......@@ -123,7 +123,8 @@ class Momentum(Optimizer):
"""
_velocity_acc_str = "velocity"
def __init__(self,
def __init__(
self,
learning_rate=0.001,
momentum=0.9,
parameters=None,
......@@ -133,19 +134,24 @@ class Momentum(Optimizer):
multi_precision=False,
rescale_grad=1.0,
use_multi_tensor=False,
name=None):
name=None,
):
if learning_rate is None:
raise ValueError("learning_rate is not set")
if momentum is None:
raise ValueError("momentum is not set")
predicate = lambda regular: isinstance(regular,
(L2DecayRegularizer, float))
predicate = lambda regular: isinstance(
regular, (L2DecayRegularizer, float)
)
if isinstance(parameters, list):
if isinstance(parameters[0], dict):
for param_group in parameters:
decay = param_group[
'weight_decay'] if 'weight_decay' in param_group else weight_decay
decay = (
param_group['weight_decay']
if 'weight_decay' in param_group
else weight_decay
)
reg_method, reg_coeff = self._update_regularization(decay)
param_group['regularization_method'] = reg_method
param_group['regularization_coeff'] = reg_coeff
......@@ -153,16 +159,20 @@ class Momentum(Optimizer):
param_group['weight_decay'] = py_regular
py_regular = None if predicate(weight_decay) else weight_decay
super(Momentum, self).__init__(learning_rate=learning_rate,
super(Momentum, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
weight_decay=py_regular,
grad_clip=grad_clip,
name=name)
name=name,
)
self.type = "momentum"
self._momentum = momentum
self._use_nesterov = bool(use_nesterov)
self._regularization_method, self._regularization_coeff = self._update_regularization(
weight_decay)
(
self._regularization_method,
self._regularization_coeff,
) = self._update_regularization(weight_decay)
self._multi_precision = multi_precision
self._rescale_grad = rescale_grad
self._master_weights = {}
......@@ -176,29 +186,21 @@ class Momentum(Optimizer):
}
self._use_multi_tensor = use_multi_tensor
if self._use_multi_tensor:
self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
self._velocity_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
self._master_weight_dict = {
'FP32_LODTensor': None,
'FP16_LODTensor': []
}
self._regularization_method_dict = {
'FP32_LODTensor': [],
'FP16_LODTensor': []
}
self._regularization_coeff_dict = {
'FP32_LODTensor': [],
'FP16_LODTensor': []
}
self._param_dict = self._create_multi_tensor_dict()
self._velocity_dict = self._create_multi_tensor_dict()
self._master_weight_dict = self._create_multi_tensor_dict()
self._master_weight_dict['FP32_LODTensor'] = None
self._regularization_method_dict = self._create_multi_tensor_dict()
self._regularization_coeff_dict = self._create_multi_tensor_dict()
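# Usage sketch for the per-group weight_decay handling in __init__ above
# (hypothetical snippet, assuming the public paddle.optimizer.Momentum API of
# release 2.4): each group may carry its own 'weight_decay', which is turned into a
# per-group regularization_method / regularization_coeff pair.
#
#   import paddle
#   net_a, net_b = paddle.nn.Linear(8, 8), paddle.nn.Linear(8, 8)
#   opt = paddle.optimizer.Momentum(
#       learning_rate=0.1,
#       momentum=0.9,
#       parameters=[
#           {'params': net_a.parameters(), 'weight_decay': 0.001},
#           {'params': net_b.parameters()},   # falls back to the constructor default
#       ],
#       weight_decay=0.01,
#       use_multi_tensor=True,
#   )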
def _update_regularization(self, weight_decay):
reg_method = ""
reg_coeff = 0.0
if (isinstance(weight_decay, L2DecayRegularizer)):
if isinstance(weight_decay, L2DecayRegularizer):
reg_method = "l2_decay"
reg_coeff = weight_decay._regularization_coeff
if (isinstance(weight_decay, float)):
if isinstance(weight_decay, float):
reg_method = "l2_decay"
reg_coeff = weight_decay
return reg_method, reg_coeff
......@@ -211,19 +213,23 @@ class Momentum(Optimizer):
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = layers.create_global_var(name=var_name,
var = layers.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True)
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(type="cast",
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32
})
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
......@@ -239,15 +245,22 @@ class Momentum(Optimizer):
"""
if self._name is not None:
name = self._name + "_" + name
find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
target_param = self._master_weights[
param.name] if find_master else param
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (name not in self._accumulators
or target_name not in self._accumulators[name]):
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name))
name, target_name
)
)
return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters):
......@@ -265,7 +278,10 @@ class Momentum(Optimizer):
master_p = self._create_master_weight(p)
self._add_accumulator(self._velocity_acc_str, master_p)
continue
if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
if (
p.dtype == core.VarDesc.VarType.FP16
and not self._multi_precision
):
warnings.warn(
"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Momentum optimizer."
......@@ -273,25 +289,28 @@ class Momentum(Optimizer):
self._add_accumulator(self._velocity_acc_str, p)
def _create_regularization_of_grad(self, param, grad, regularization=None):
""" Create and add backward regularization Operators
"""Create and add backward regularization Operators
Function helper of append_regularization_ops.
"""
# If ParamAttr is set to L2Decay, we skip doing regularization here. And then we fused
# L2Decay with momentum which can refer to _append_optimize_op below.
if hasattr(param, 'regularizer') and isinstance(param.regularizer,
L2DecayRegularizer):
if hasattr(param, 'regularizer') and isinstance(
param.regularizer, L2DecayRegularizer
):
return grad
return super(Momentum, self)._create_regularization_of_grad(
param, grad, regularization)
param, grad, regularization
)
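# When the parameter's regularizer is L2Decay, the gradient is returned untouched
# above and the decay is applied inside the momentum kernel instead. A rough sketch
# of the fused update (plain SGD with momentum and L2 decay; the Nesterov variant is
# omitted):
#
#   g        = grad + regularization_coeff * param   # fused L2 decay
#   velocity = mu * velocity + g
#   param    = param - lr * velocity
#
# Fusing the decay into the kernel avoids materialising a separately scaled gradient
# tensor for every parameter.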
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
velocity_acc = self._get_accumulator(self._velocity_acc_str,
param_and_grad[0])
velocity_acc = self._get_accumulator(
self._velocity_acc_str, param_and_grad[0]
)
lr = self._create_param_lr(param_and_grad)
# For fusion of momentum and l2decay
......@@ -308,30 +327,56 @@ class Momentum(Optimizer):
regularization_method = ""
regularization_coeff = 0.0
find_master = self._multi_precision and param_and_grad[
0].dtype == core.VarDesc.VarType.FP16
master_weight = (self._master_weights[param_and_grad[0].name]
if find_master else None)
find_master = (
self._multi_precision
and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
if _in_legacy_dygraph():
if isinstance(param_and_grad, dict):
self._update_regularization(param_and_grad['weight_decay'])
_, _, _ = _legacy_C_ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
master_weight, param_and_grad[0], velocity_acc, master_weight,
'mu', self._momentum, 'use_nesterov', self._use_nesterov,
'regularization_method', regularization_method,
'regularization_coeff', regularization_coeff, 'multi_precision',
find_master)
param_and_grad[0],
param_and_grad[1],
velocity_acc,
lr,
master_weight,
param_and_grad[0],
velocity_acc,
master_weight,
'mu',
self._momentum,
'use_nesterov',
self._use_nesterov,
'regularization_method',
regularization_method,
'regularization_coeff',
regularization_coeff,
'multi_precision',
find_master,
)
return None
if in_dygraph_mode():
if isinstance(param_and_grad, dict):
self._update_regularization(param_and_grad['weight_decay'])
return _C_ops.momentum_(param_and_grad[0], param_and_grad[1],
velocity_acc, lr, master_weight,
self._momentum, self._use_nesterov,
regularization_method, regularization_coeff,
find_master, self._rescale_grad)
return _C_ops.momentum_(
param_and_grad[0],
param_and_grad[1],
velocity_acc,
lr,
master_weight,
self._momentum,
self._use_nesterov,
regularization_method,
regularization_coeff,
find_master,
self._rescale_grad,
)
attrs = {
"mu": self._momentum,
......@@ -339,19 +384,19 @@ class Momentum(Optimizer):
"regularization_method": regularization_method,
"regularization_coeff": regularization_coeff,
"multi_precision": find_master,
"rescale_grad": self._rescale_grad
"rescale_grad": self._rescale_grad,
}
inputs = {
"Param": [param_and_grad[0]],
"Grad": [param_and_grad[1]],
"Velocity": [velocity_acc],
"LearningRate": [lr]
"LearningRate": [lr],
}
outputs = {
"ParamOut": [param_and_grad[0]],
"VelocityOut": [velocity_acc]
"VelocityOut": [velocity_acc],
}
if find_master:
......@@ -359,15 +404,17 @@ class Momentum(Optimizer):
outputs["MasterParamOut"] = master_weight
# create the momentum optimize op
momentum_op = block.append_op(type=self.type,
momentum_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True)
stop_gradient=True,
)
return momentum_op
def _multi_tensor_init(self, target_block, parameters):
def _multi_tensor_init(self, target_block, parameters, param_group_idx):
"""
All tensors used by the optimizer update (parameters, master weights, and accumulators such as velocity_acc for momentum) are grouped into Python lists by data type (float16, float32).
This function will be overridden in the corresponding optimizer file.
......@@ -385,37 +432,58 @@ class Momentum(Optimizer):
# we skip param's l2decay before, so fuse it with momentum here.
if isinstance(param.regularizer, L2DecayRegularizer):
regularization_method = "l2_decay"
regularization_coeff = param.regularizer._regularization_coeff
regularization_coeff = (
param.regularizer._regularization_coeff
)
elif param.regularizer is not None:
regularization_method = ""
regularization_coeff = 0.0
if param.dtype == paddle.float32:
self._param_dict['FP32_LODTensor'].append(param)
self._velocity_dict['FP32_LODTensor'].append(velocity_acc)
self._param_dict['FP32_LODTensor'][param_group_idx].append(
param
)
self._velocity_dict['FP32_LODTensor'][param_group_idx].append(
velocity_acc
)
# fp32 no master weight
self._regularization_method_dict['FP32_LODTensor'].append(
regularization_method)
self._regularization_coeff_dict['FP32_LODTensor'].append(
regularization_coeff)
self._regularization_method_dict['FP32_LODTensor'][
param_group_idx
].append(regularization_method)
self._regularization_coeff_dict['FP32_LODTensor'][
param_group_idx
].append(regularization_coeff)
elif param.dtype == paddle.float16:
self._param_dict['FP16_LODTensor'].append(param)
self._velocity_dict['FP16_LODTensor'].append(velocity_acc)
self._param_dict['FP16_LODTensor'][param_group_idx].append(
param
)
self._velocity_dict['FP16_LODTensor'][param_group_idx].append(
velocity_acc
)
if self._multi_precision:
self._master_weight_dict['FP16_LODTensor'].append(
self._master_weights[param.name])
self._master_weight_dict['FP16_LODTensor'][
param_group_idx
].append(self._master_weights[param.name])
else:
self._master_weight_dict['FP16_LODTensor'] = None
self._regularization_method_dict['FP16_LODTensor'].append(
regularization_method)
self._regularization_coeff_dict['FP16_LODTensor'].append(
regularization_coeff)
self._master_weight_dict['FP16_LODTensor'][
param_group_idx
] = None
self._regularization_method_dict['FP16_LODTensor'][
param_group_idx
].append(regularization_method)
self._regularization_coeff_dict['FP16_LODTensor'][
param_group_idx
].append(regularization_coeff)
else:
raise ValueError(
"Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR."
)
def _append_optimize_multi_tensor_op(self, target_block,
parameters_and_grads):
def _append_optimize_multi_tensor_op(
self,
target_block,
parameters_and_grads,
param_group_idx,
):
"""
For multi-tensor mode, append the merged optimize operator to the block.
"""
......@@ -429,15 +497,19 @@ class Momentum(Optimizer):
if param_and_grad[1] is None:
continue
if param_and_grad[0].stop_gradient is False:
if param_and_grad[
0].dtype == paddle.float32 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
if (
param_and_grad[0].dtype == paddle.float32
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP32_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP32_LODTensor'].append(lr)
elif param_and_grad[
0].dtype == paddle.float16 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
elif (
param_and_grad[0].dtype == paddle.float16
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP16_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP16_LODTensor'].append(lr)
......@@ -448,97 +520,144 @@ class Momentum(Optimizer):
if param_and_grad[0].stop_gradient is False:
param_grad_dict = dict()
param_grad_dict['params'] = param_and_grad
param_grad_dict.update({
param_grad_dict.update(
{
k: v
for k, v in parameters_and_grads.items()
if k != 'params'
})
}
)
param_and_grad = self._update_param_group(param_grad_dict)
if param_and_grad[
0].dtype == paddle.float32 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
if (
param_and_grad[0].dtype == paddle.float32
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP32_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP32_LODTensor'].append(lr)
elif param_and_grad[
0].dtype == paddle.float16 and param_and_grad[
1].type == core.VarDesc.VarType.LOD_TENSOR:
elif (
param_and_grad[0].dtype == paddle.float16
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP16_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad)
lr_dict['FP16_LODTensor'].append(lr)
multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor']
for key in multi_tensor_list:
if len(self._param_dict[key]) > 0:
if len(self._param_dict[key][param_group_idx]) > 0:
find_master = self._multi_precision and key == 'FP16_LODTensor'
master_weight = self._master_weight_dict[key]
master_weight = (
master_weight[param_group_idx]
if master_weight is not None
else None
)
if framework._non_static_mode():
if in_dygraph_mode():
_, _, _ = _C_ops.merged_momentum_(
self._param_dict[key], grad_dict[key],
self._velocity_dict[key], lr_dict[key],
self._master_weight_dict[key], self._momentum,
self._param_dict[key][param_group_idx],
grad_dict[key],
self._velocity_dict[key][param_group_idx],
lr_dict[key],
master_weight,
self._momentum,
self._use_nesterov,
self._regularization_method_dict[key],
self._regularization_coeff_dict[key], find_master,
self._rescale_grad)
self._regularization_method_dict[key][
param_group_idx
],
self._regularization_coeff_dict[key][
param_group_idx
],
find_master,
self._rescale_grad,
)
else:
_, _, _ = _legacy_C_ops.merged_momentum(
self._param_dict[key], grad_dict[key],
self._velocity_dict[key], lr_dict[key],
self._master_weight_dict[key],
self._param_dict[key], self._velocity_dict[key],
self._master_weight_dict[key], 'mu', self._momentum,
'use_nesterov', self._use_nesterov,
self._param_dict[key][param_group_idx],
grad_dict[key],
self._velocity_dict[key][param_group_idx],
lr_dict[key],
master_weight,
self._param_dict[key][param_group_idx],
self._velocity_dict[key][param_group_idx],
master_weight,
'mu',
self._momentum,
'use_nesterov',
self._use_nesterov,
'regularization_method',
self._regularization_method_dict[key],
self._regularization_method_dict[key][
param_group_idx
],
'regularization_coeff',
self._regularization_coeff_dict[key],
'multi_precision', find_master)
self._regularization_coeff_dict[key][
param_group_idx
],
'multi_precision',
find_master,
)
else:
inputs = {
"Param": self._param_dict[key],
"Param": self._param_dict[key][param_group_idx],
"Grad": grad_dict[key],
"Velocity": self._velocity_dict[key],
"Velocity": self._velocity_dict[key][param_group_idx],
"LearningRate": lr_dict[key],
}
outputs = {
"ParamOut": self._param_dict[key],
"VelocityOut": self._velocity_dict[key],
"ParamOut": self._param_dict[key][param_group_idx],
"VelocityOut": self._velocity_dict[key][
param_group_idx
],
}
attrs = {
"mu":
self._momentum,
"use_nesterov":
self._use_nesterov,
"regularization_method":
self._regularization_method_dict[key],
"regularization_coeff":
self._regularization_coeff_dict[key],
"mu": self._momentum,
"use_nesterov": self._use_nesterov,
"regularization_method": self._regularization_method_dict[
key
][
param_group_idx
],
"regularization_coeff": self._regularization_coeff_dict[
key
][param_group_idx],
}
if find_master:
inputs["MasterParam"] = self._master_weight_dict[key]
inputs["MasterParam"] = self._master_weight_dict[key][
param_group_idx
]
outputs["MasterParamOut"] = self._master_weight_dict[
key]
key
][param_group_idx]
attrs["multi_precision"] = find_master
target_block.append_op(type="merged_momentum",
target_block.append_op(
type="merged_momentum",
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True)
stop_gradient=True,
)
return None
def _update_param_group(self, parameters):
self._momentum = parameters.get('momentum',
self._default_dict['momentum'])
self._use_nesterov = parameters.get('use_nesterov',
self._default_dict['use_nesterov'])
self._rescale_grad = parameters.get('rescale_grad',
self._default_dict['rescale_grad'])
self._momentum = parameters.get(
'momentum', self._default_dict['momentum']
)
self._use_nesterov = parameters.get(
'use_nesterov', self._default_dict['use_nesterov']
)
self._rescale_grad = parameters.get(
'rescale_grad', self._default_dict['rescale_grad']
)
self._regularization_method = parameters.get(
'regularization_method',
self._default_dict['regularization_method'])
'regularization_method', self._default_dict['regularization_method']
)
self._regularization_coeff = parameters.get(
'regularization_coeff', self._default_dict['regularization_coeff'])
'regularization_coeff', self._default_dict['regularization_coeff']
)
parameters = parameters.get('params')
return parameters
......@@ -21,13 +21,30 @@ from collections import defaultdict
import paddle
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
from paddle.fluid.framework import (
Program,
Variable,
name_scope,
default_main_program,
default_startup_program,
device_guard,
)
from ..fluid import framework
from ..fluid import layers
from ..fluid import unique_name
from ..fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
from ..fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops
from ..fluid.backward import (
append_backward,
_some_in_set_,
_append_grad_suffix_,
_get_no_grad_set_name,
)
from ..fluid.clip import (
GradientClipBase,
GradientClipByNorm,
error_clip_callback,
append_gradient_clip_ops,
)
from ..fluid.framework import program_guard, Parameter
from ..fluid.initializer import Constant
from ..fluid.layer_helper import LayerHelper
......@@ -42,24 +59,36 @@ from .. import compat as cpt
from .lr import LRScheduler
import copy
from paddle import _C_ops, _legacy_C_ops
from paddle.fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check, _current_expected_place, in_dygraph_mode
from paddle.fluid.framework import (
_in_legacy_dygraph,
_in_eager_without_dygraph_check,
_current_expected_place,
in_dygraph_mode,
)
__all__ = []
@framework.static_only
def append_backward_new(loss_list,
def append_backward_new(
loss_list,
parameter_list=None,
no_grad_set=None,
callbacks=None,
checkpoints=None,
distop_context=None):
distop_context=None,
):
from paddle.incubate.autograd.primx import orig2prim, Transform
program = default_main_program()
assert program.num_blocks == 1, "The append_backward_new interface is designed to process only one block."
assert (
program.num_blocks == 1
), "The append_backward_new interface is designed to process only one block."
block = program.current_block()
for el in loss_list:
assert el.block == block, f'variable in loss_list should be in current block of main program'
assert (
el.block == block
), f'variable in loss_list should be in current block of main program'
orig2prim(block)
ad = Transform(block)
......@@ -163,12 +192,14 @@ class Optimizer(object):
"""
@imperative_base.no_grad
def __init__(self,
def __init__(
self,
learning_rate,
parameters=None,
weight_decay=None,
grad_clip=None,
name=None):
name=None,
):
if parameters is not None:
# paddle.Tensor is also iterable, so here we don't check whether
......@@ -177,13 +208,16 @@ class Optimizer(object):
if isinstance(parameters, (paddle.Tensor, core.eager.Tensor)):
raise TypeError(
"`parameters` argument given to the optimizer should be "
"an iterable of paddle Tensors, but got argument type is `{}`."
.format(type(parameters)))
"an iterable of paddle Tensors, but got argument type is `{}`.".format(
type(parameters)
)
)
if isinstance(parameters, dict):
raise TypeError(
"`parameters` argument should not get dict type, "
"if parameter groups is needed, please set `parameters`"
" as list of dict")
" as list of dict"
)
self._parameter_list = list(parameters)
else:
self._parameter_list = None
......@@ -197,18 +231,22 @@ class Optimizer(object):
if weight_decay is not None:
if not isinstance(self._parameter_list[0], dict):
for param in self._parameter_list:
if hasattr(param, 'regularizer'
) and param.regularizer is not None:
if (
hasattr(param, 'regularizer')
and param.regularizer is not None
):
logging.info(
"If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. "
"The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
% weight_decay.__str__())
% weight_decay.__str__()
)
break
if not isinstance(learning_rate, (float, LRScheduler)):
raise TypeError(
"learning rate should be float or LRScheduler, got %s here" %
type(learning_rate))
"learning rate should be float or LRScheduler, got %s here"
% type(learning_rate)
)
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
raise TypeError(
......@@ -216,6 +254,7 @@ class Optimizer(object):
)
if isinstance(weight_decay, float):
from ..fluid.regularizer import L2Decay
self.regularization = L2Decay(weight_decay)
else:
self.regularization = weight_decay
......@@ -227,8 +266,9 @@ class Optimizer(object):
if self._parameter_list:
if isinstance(self._parameter_list[0], dict):
for param_group in self._parameter_list:
assert 'params' in param_group, \
'params should be set in parameters if parameter groups are optimized in different options'
assert (
'params' in param_group
), 'params should be set in parameters if parameter groups are optimized in different options'
self._dtype = self._parameter_list[0]['params'][0].dtype
else:
self._dtype = self._parameter_list[0].dtype
......@@ -248,7 +288,7 @@ class Optimizer(object):
self.clear_gradients = self.clear_grad
self._default_dict = {
'weight_decay': self.regularization,
'grad_clip': self._grad_clip
'grad_clip': self._grad_clip,
}
self._param_groups = []
......@@ -261,13 +301,20 @@ class Optimizer(object):
# NOTE: Multi Tensor: Pass in all parameters and gradients to the op kernel of the Optimizer at one time for updating for dygraph mode.
# Optimizer support list: [ paddle.optimizer.Momentum, paddle.optimizer.Adam].
self._use_multi_tensor = None
self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
self._param_dict = self._create_multi_tensor_dict()
self._auxiliary_vars = {}
def _set_auxiliary_var(self, key, val):
self._auxiliary_vars[key] = val
def _create_multi_tensor_dict(self):
n = len(self._param_groups) if self._param_groups is not None else 1
return {
'FP32_LODTensor': [[] for _ in range(n)],
'FP16_LODTensor': [[] for _ in range(n)],
}
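# A minimal sketch of the structure this helper returns, assuming two parameter
# groups; each dtype bucket holds one inner list per group, indexed later by
# param_group_idx:
#
#   d = self._create_multi_tensor_dict()
#   # d == {'FP32_LODTensor': [[], []], 'FP16_LODTensor': [[], []]}
#   d['FP32_LODTensor'][0].append(w_in_group_0)
#   d['FP32_LODTensor'][1].append(w_in_group_1)
#
# Previously the value of each key was a single flat list, so parameters from all
# groups were mixed together when the fused multi-tensor kernels ran.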
def _get_auxiliary_var(self, key):
return self._auxiliary_vars.get(key, None)
......@@ -353,8 +400,9 @@ class Optimizer(object):
self._accumulators_holder = state_dict
for k, v in self._accumulators.items():
for para_name, var_tmp in v.items():
assert var_tmp.name in state_dict, \
"optimizer Tensor {} not found".format( var_tmp.name )
assert (
var_tmp.name in state_dict
), "optimizer Tensor {} not found".format(var_tmp.name)
var = var_tmp.value()
tensor = var.get_tensor()
model_np = np.array(tensor)
......@@ -368,16 +416,23 @@ class Optimizer(object):
elif isinstance(load_para, np.ndarray):
load_para_np = load_para
else:
raise RuntimeError("State dict type {} not supprt".format(
str(type(load_para))))
raise RuntimeError(
"State dict type {} not supprt".format(
str(type(load_para))
)
)
assert model_np.shape == load_para_np.shape, \
"Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format(
model_np.name, model_np.shape, load_para_np.shape)
assert (
model_np.shape == load_para_np.shape
), "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format(
model_np.name, model_np.shape, load_para_np.shape
)
assert model_np.dtype == load_para_np.dtype, \
"Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format(
model_np.name, model_np.dtype, load_para_np.dtype)
assert (
model_np.dtype == load_para_np.dtype
), "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format(
model_np.name, model_np.dtype, load_para_np.dtype
)
tensor.set(load_para_np, framework._current_expected_place())
......@@ -386,45 +441,57 @@ class Optimizer(object):
def _create_global_learning_rate(self):
# lr var can't be float16, for pure fp16 training, should extra handle the dtype for lr
_lr_dtype = paddle.get_default_dtype(
) if self._dtype is None else self._dtype
_lr_dtype = paddle.float32 if (
_lr_dtype = (
paddle.get_default_dtype() if self._dtype is None else self._dtype
)
_lr_dtype = (
paddle.float32
if (
paddle.get_default_dtype() != "float16"
and _lr_dtype == paddle.float16) else _lr_dtype
and _lr_dtype == paddle.float16
)
else _lr_dtype
)
if isinstance(self._learning_rate, LRScheduler):
lr_var = self._global_learning_rate()
# only create global lr_var once
if not isinstance(lr_var, framework.Variable):
lr_name = unique_name.generate('learning_rate')
self._learning_rate._var_name = lr_name
lr_var = self.helper.create_global_variable(name=lr_name,
lr_var = self.helper.create_global_variable(
name=lr_name,
shape=[1],
persistable=True,
stop_gradient=True,
dtype=_lr_dtype)
dtype=_lr_dtype,
)
main_prog = framework.default_main_program()
main_prog.lr_sheduler = self._learning_rate
main_prog.lr_var = lr_var
self._learning_rate_map[
framework.default_main_program()] = lr_var
framework.default_main_program()
] = lr_var
lr_value = float(self._learning_rate())
self.helper.set_variable_initializer(
lr_var, initializer=Constant(value=lr_value))
lr_var, initializer=Constant(value=lr_value)
)
elif isinstance(self._learning_rate, float):
# only create global lr_var once
lr = self._global_learning_rate()
if isinstance(lr, framework.Variable):
return
else:
self._learning_rate_map[framework.default_main_program(
)] = layers.create_global_var(
self._learning_rate_map[
framework.default_main_program()
] = layers.create_global_var(
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(self._learning_rate),
dtype=_lr_dtype,
persistable=True)
persistable=True,
)
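# Sketch of the _lr_dtype decision above (illustrative): the global learning-rate
# variable stays in float32 even for float16 models, unless the default dtype itself
# is float16 (pure fp16 programs).
#
#   self._dtype       default dtype    _lr_dtype
#   None              'float32'        float32
#   paddle.float16    'float32'        float32   (promoted for numerical safety)
#   paddle.float16    'float16'        float16
#   paddle.float32    any              float32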
@framework.dygraph_only
def set_lr(self, value):
......@@ -465,7 +532,8 @@ class Optimizer(object):
if not isinstance(value, (int, float)):
raise TypeError(
"The type of 'value' in optimizer.set_lr must be float, but received %s."
% (type(value)))
% (type(value))
)
if isinstance(self._learning_rate, LRScheduler):
raise RuntimeError(
"optimizer's learning rate can't be LRScheduler when invoke this API, because this will lead to conflict."
......@@ -475,23 +543,36 @@ class Optimizer(object):
if current_lr is not None:
if in_dygraph_mode():
place = _current_expected_place()
_C_ops.full_(current_lr, list(current_lr.shape), float(value),
current_lr.dtype, place)
_C_ops.full_(
current_lr,
list(current_lr.shape),
float(value),
current_lr.dtype,
place,
)
elif _in_legacy_dygraph():
_legacy_C_ops.fill_constant(current_lr, 'value', float(value),
'dtype', current_lr.dtype, 'shape',
list(current_lr.shape))
_legacy_C_ops.fill_constant(
current_lr,
'value',
float(value),
'dtype',
current_lr.dtype,
'shape',
list(current_lr.shape),
)
else:
global_block = framework.default_main_program().global_block()
global_block.append_op(type='fill_constant',
global_block.append_op(
type='fill_constant',
outputs={'Out': [current_lr]},
attrs={
'dtype': current_lr.dtype,
'shape': list(current_lr.shape),
'value': float(value)
'value': float(value),
},
stop_gradient=True)
stop_gradient=True,
)
def get_lr(self):
"""
......@@ -565,8 +646,7 @@ class Optimizer(object):
return self._learning_rate_map.get(program, None)
def _append_optimize_op(self, block, param_and_grad):
""" append optimize operator to block and return all the added optimize_op
"""
"""append optimize operator to block and return all the added optimize_op"""
raise NotImplementedError(
"Class \"Optimizer\" connot be used directly as an optimizer, please use its subclasses such as \"Adam\""
)
......@@ -583,8 +663,8 @@ class Optimizer(object):
return self._global_learning_rate()
else:
with default_main_program()._lr_schedule_guard(
is_with_opt=True), framework.name_scope(
'scale_with_param_lr'):
is_with_opt=True
), framework.name_scope('scale_with_param_lr'):
return self._global_learning_rate() * param_lr
else:
return self._global_learning_rate()
......@@ -611,14 +691,16 @@ class Optimizer(object):
"""
pass
def _add_accumulator(self,
def _add_accumulator(
self,
name,
param,
dtype=None,
fill_value=0.0,
shape=None,
type=None,
device=None):
device=None,
):
"""Utility function to add an accumulator for a parameter
Args:
......@@ -630,13 +712,17 @@ class Optimizer(object):
"""
if self._name is not None:
name = self._name + "_" + name
if (name in self._accumulators
and param.name in self._accumulators[name]):
if (
name in self._accumulators
and param.name in self._accumulators[name]
):
if framework._non_static_mode():
return self._accumulators[name][param.name]
raise Exception(
"Accumulator {} already exists for parameter {}".format(
name, param.name))
name, param.name
)
)
if shape == None:
shape = param.shape
assert isinstance(self.helper, LayerHelper)
......@@ -650,20 +736,25 @@ class Optimizer(object):
persistable=True,
dtype=dtype or param.dtype,
type=core.VarDesc.VarType.LOD_TENSOR
if framework._in_eager_without_dygraph_check() else
(param.type if type is None else type),
if framework._in_eager_without_dygraph_check()
else (param.type if type is None else type),
shape=shape,
belong_to_optimizer=True)
belong_to_optimizer=True,
)
if device is None:
device = self._get_device_for_param(param.name)
with device_guard(device):
self.helper.set_variable_initializer(
var, initializer=Constant(value=float(fill_value)))
var, initializer=Constant(value=float(fill_value))
)
if framework._non_static_mode():
if len(self._accumulators_holder) > 0:
assert var_name in self._accumulators_holder, \
"Optimizer set error, {} should in state dict".format( var_name )
assert (
var_name in self._accumulators_holder
), "Optimizer set error, {} should in state dict".format(
var_name
)
var.set_value(self._accumulators_holder[var_name])
self._accumulators[name][param.name] = var
......@@ -681,11 +772,15 @@ class Optimizer(object):
"""
if self._name is not None:
name = self._name + "_" + name
if (name not in self._accumulators
or param.name not in self._accumulators[name]):
if (
name not in self._accumulators
or param.name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, param.name))
name, param.name
)
)
return self._accumulators[name][param.name]
def _update_param_device_map(self, parameters_and_grads, target_block):
......@@ -693,13 +788,15 @@ class Optimizer(object):
if param_and_grad[0].stop_gradient is False:
param_name = param_and_grad[0].name
ops = target_block.ops
device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName(
device_attr_name = (
core.op_proto_and_checker_maker.kOpDeviceAttrName()
)
for op in ops:
input_arg_names = op.input_arg_names
if param_name in input_arg_names:
self._param_device_map[param_name] = op.attr(
device_attr_name)
device_attr_name
)
break
def _get_device_for_param(self, param_name):
......@@ -708,7 +805,9 @@ class Optimizer(object):
device = self._param_device_map[param_name]
return device
def _create_optimization_pass(self, parameters_and_grads):
def _create_optimization_pass(
self, parameters_and_grads, param_group_idx=0
):
"""Add optimization operators to update gradients to tensors.
Args:
......@@ -736,10 +835,12 @@ class Optimizer(object):
target_block = global_block
current_block = framework.default_main_program().current_block()
if current_block.idx != global_block.idx:
assert current_block.backward_block_idx != -1, \
"current block is not global_block, but it doesn't have backward block."
assert (
current_block.backward_block_idx != -1
), "current block is not global_block, but it doesn't have backward block."
target_block = framework.default_main_program().blocks[
current_block.backward_block_idx]
current_block.backward_block_idx
]
start = len(target_block.ops)
self.helper = LayerHelper(self.__class__.__name__)
......@@ -748,57 +849,91 @@ class Optimizer(object):
# NOTE: Multi Tensor support [ Momentum, Adam ] for dygraph mode
if self._use_multi_tensor and self.__class__.__name__ in [
'Momentum', 'Adam'
'Momentum',
'Adam',
]:
if len(self._param_dict['FP32_LODTensor']) == 0 and len(
self._param_dict['FP16_LODTensor']) == 0:
if (
len(self._param_dict['FP32_LODTensor'][param_group_idx]) == 0
and len(self._param_dict['FP16_LODTensor'][param_group_idx])
== 0
):
if isinstance(parameters_and_grads, list):
self._multi_tensor_init(target_block, [
assert param_group_idx == 0
self._multi_tensor_init(
target_block,
[
p[0]
for p in parameters_and_grads if not p[0].stop_gradient
])
for p in parameters_and_grads
if not p[0].stop_gradient
],
param_group_idx,
)
else:
self._update_param_group(parameters_and_grads)
self._multi_tensor_init(target_block, [
p[0] for p in parameters_and_grads['params']
self._multi_tensor_init(
target_block,
[
p[0]
for p in parameters_and_grads['params']
if not p[0].stop_gradient
])
],
param_group_idx,
)
if framework._non_static_mode():
self._append_optimize_multi_tensor_op(target_block,
parameters_and_grads)
self._append_optimize_multi_tensor_op(
target_block,
parameters_and_grads,
param_group_idx=param_group_idx,
)
else:
self._update_param_device_map(parameters_and_grads,
target_block)
self._update_param_device_map(
parameters_and_grads, target_block
)
# NOTE: Multi Tensor requires all parameters to be in the same device and program.
# param_grad_list = [p_0,g_0,p_1,g_1,....]
param_grad_list = []
for param_and_grad in parameters_and_grads:
if not param_and_grad[0].stop_gradient and param_and_grad[
1] is not None:
if (
not param_and_grad[0].stop_gradient
and param_and_grad[1] is not None
):
param_grad_list.append(param_and_grad[0])
param_grad_list.append(param_and_grad[1])
with param_grad_list[0].block.program._optimized_guard(
param_grad_list), name_scope("optimizer"):
param_grad_list
), name_scope("optimizer"):
device = self._get_device_for_param(param_grad_list[0].name)
with device_guard(device):
self._append_optimize_multi_tensor_op(
target_block, parameters_and_grads)
target_block,
parameters_and_grads,
param_group_idx=param_group_idx,
)
else:
if not framework._non_static_mode():
params_grads_device_map = parameters_and_grads[
'params'] if isinstance(parameters_and_grads,
dict) else parameters_and_grads
self._update_param_device_map(params_grads_device_map,
target_block)
params_grads_device_map = (
parameters_and_grads['params']
if isinstance(parameters_and_grads, dict)
else parameters_and_grads
)
self._update_param_device_map(
params_grads_device_map, target_block
)
if isinstance(parameters_and_grads, list):
self._create_accumulators(target_block, [
p[0] for p in parameters_and_grads if not p[0].stop_gradient
])
self._create_accumulators(
target_block,
[
p[0]
for p in parameters_and_grads
if not p[0].stop_gradient
],
)
else:
params_acc_dict = parameters_and_grads.copy()
params_acc_dict['params'] = [
p[0] for p in params_acc_dict['params']
p[0]
for p in params_acc_dict['params']
if not p[0].stop_gradient
]
self._create_accumulators(target_block, params_acc_dict)
......@@ -809,8 +944,9 @@ class Optimizer(object):
if param_and_grad[1] is None:
continue
if param_and_grad[0].stop_gradient is False:
self._append_optimize_op(target_block,
param_and_grad)
self._append_optimize_op(
target_block, param_and_grad
)
else:
for param_and_grad in parameters_and_grads['params']:
if param_and_grad[1] is None:
......@@ -818,25 +954,31 @@ class Optimizer(object):
if param_and_grad[0].stop_gradient is False:
param_grad_dict = dict()
param_grad_dict['params'] = param_and_grad
param_grad_dict.update({
param_grad_dict.update(
{
k: v
for k, v in parameters_and_grads.items()
if k != 'params'
})
self._append_optimize_op(target_block,
param_grad_dict)
}
)
self._append_optimize_op(
target_block, param_grad_dict
)
else:
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
continue
with param_and_grad[0].block.program._optimized_guard(
param_and_grad), name_scope("optimizer"):
param_and_grad
), name_scope("optimizer"):
if param_and_grad[0].stop_gradient is False:
device = self._get_device_for_param(
param_and_grad[0].name)
param_and_grad[0].name
)
with device_guard(device):
optimize_op = self._append_optimize_op(
target_block, param_and_grad)
target_block, param_and_grad
)
# Get custom finish ops for subclasses
# FIXME: Need to fix this once we figure out how to handle dependencies
......@@ -848,12 +990,14 @@ class Optimizer(object):
def _append_dgc_ops(self, param_and_grad):
pass
def backward(self,
def backward(
self,
loss,
startup_program=None,
parameters=None,
no_grad_set=None,
callbacks=None):
callbacks=None,
):
"""
The first part of ``minimize``, do auto-diff to append backward operations for
the current program.
......@@ -902,8 +1046,7 @@ class Optimizer(object):
self._dtype = loss.dtype
if framework._non_static_mode():
parameter_list = parameters if parameters \
else self._parameter_list
parameter_list = parameters if parameters else self._parameter_list
params_grads = []
for param in parameter_list:
......@@ -917,23 +1060,26 @@ class Optimizer(object):
if callbacks is None:
callbacks = [error_clip_callback]
else:
assert (isinstance(callbacks, list))
assert isinstance(callbacks, list)
program = loss.block.program
assert len(loss.shape) == 1 and loss.shape[0] == 1, \
"The loss.shape should be (1L,), but the current loss.shape is {}. " \
assert len(loss.shape) == 1 and loss.shape[0] == 1, (
"The loss.shape should be (1L,), but the current loss.shape is {}. "
"Maybe that you should call paddle.mean to process the current loss.".format(
loss.shape)
parameter_list = parameters if parameters \
else self._parameter_list
loss.shape
)
)
parameter_list = parameters if parameters else self._parameter_list
with program_guard(program, startup_program):
from paddle.incubate.autograd.utils import prim_enabled
if prim_enabled():
params_grads = append_backward_new([loss], parameter_list,
act_no_grad_set,
callbacks)
params_grads = append_backward_new(
[loss], parameter_list, act_no_grad_set, callbacks
)
else:
params_grads = append_backward(loss, parameter_list,
act_no_grad_set, callbacks)
params_grads = append_backward(
loss, parameter_list, act_no_grad_set, callbacks
)
# Note: since we can't use all_reduce_op now,
# dgc_op should be the last op of one grad.
self._append_dgc_ops(params_grads)
......@@ -978,13 +1124,16 @@ class Optimizer(object):
params_grads = append_gradient_clip_ops(params_grads)
# Add regularization if any
params_grads = self.append_regularization_ops(params_grads,
self.regularization)
params_grads = self.append_regularization_ops(
params_grads, self.regularization
)
optimize_ops = self._create_optimization_pass(params_grads)
return optimize_ops
def _apply_optimize(self, loss, startup_program, params_grads):
def _apply_optimize(
self, loss, startup_program, params_grads, param_group_idx=0
):
"""
Second part of `minimize`, appending optimization operators for
given `params_grads` pairs.
......@@ -997,38 +1146,49 @@ class Optimizer(object):
list: A list of operators appended to the current program.
"""
if framework._non_static_mode():
with program_guard(framework.default_main_program(),
framework.default_startup_program()):
with program_guard(
framework.default_main_program(),
framework.default_startup_program(),
):
if isinstance(params_grads, list):
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
params_grads = self.append_regularization_ops(
params_grads, self.regularization)
params_grads, self.regularization
)
else:
grad_clip = params_grads['grad_clip']
if grad_clip is not None:
params_grads['params'] = grad_clip(
params_grads['params'])
params_grads['params']
)
params_grads['params'] = self.append_regularization_ops(
params_grads['params'], self.regularization)
optimize_ops = self._create_optimization_pass(params_grads)
params_grads['params'], self.regularization
)
optimize_ops = self._create_optimization_pass(
params_grads, param_group_idx=param_group_idx
)
else:
assert param_group_idx == 0
program = loss.block.program
with program_guard(program, startup_program):
optimize_ops = self.apply_gradients(params_grads)
return optimize_ops
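# Rough call flow tying the pieces together (sketch of the dygraph path with
# parameter groups, using the names defined in this file):
#
#   opt.step()
#     -> self._apply_optimize(loss=None, startup_program=None,
#                             params_grads=group_params_grads,
#                             param_group_idx=idx)              # one call per group
#          -> self._create_optimization_pass(params_grads, param_group_idx=idx)
#               -> self._multi_tensor_init(..., param_group_idx=idx)   # first step only
#               -> self._append_optimize_multi_tensor_op(..., param_group_idx=idx)
#
# Threading param_group_idx through this chain is what lets the multi-tensor
# Adam/Momentum paths keep each parameter group's cached tensors separate.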
def _create_regularization_of_grad(self, param, grad, regularization=None):
""" Create and add backward regularization Operators
"""Create and add backward regularization Operators
Function helper of append_regularization_ops.
"""
# If no gradient or no regularization is specified, then we don't need to do anything
if grad is None or (
(not hasattr(param, 'regularizer') or
(hasattr(param, 'regularizer') and param.regularizer is None))
and regularization is None):
(
not hasattr(param, 'regularizer')
or (hasattr(param, 'regularizer') and param.regularizer is None)
)
and regularization is None
):
return grad
regularization_term = None
if hasattr(param, 'regularizer') and param.regularizer is not None:
......@@ -1057,7 +1217,8 @@ class Optimizer(object):
dtype=param.dtype,
shape=param.shape,
lod_level=param.lod_level,
type=core.VarDesc.VarType.LOD_TENSOR)
type=core.VarDesc.VarType.LOD_TENSOR,
)
inputs = {"X": [grad, regularization_term]}
outputs = {"Out": [new_grad]}
......@@ -1065,9 +1226,9 @@ class Optimizer(object):
return new_grad
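# --- Illustrative aside (not part of the diff) ---------------------------------
# Numerically, appending an L2Decay regularizer to a gradient amounts to
# new_grad = grad + coeff * param; the method above builds the same sum out of
# Paddle ops. Hedged numpy sketch (helper name is illustrative):
import numpy as np

def _l2_decay_grad(param, grad, coeff=1e-4):
    return grad + coeff * param

param = np.array([1.0, -2.0, 0.5], dtype=np.float32)
grad = np.full(3, 0.1, dtype=np.float32)
print(_l2_decay_grad(param, grad))  # approximately [0.1001, 0.0998, 0.10005]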
def append_regularization_ops(self,
parameters_and_grads,
regularization=None):
def append_regularization_ops(
self, parameters_and_grads, regularization=None
):
r"""Create and add backward regularization Operators
Creates and adds backward regularization operators in the BlockDesc.
......@@ -1092,21 +1253,28 @@ class Optimizer(object):
if framework._non_static_mode():
for param, grad in parameters_and_grads:
new_grad = self._create_regularization_of_grad(
param, grad, regularization)
param, grad, regularization
)
params_and_grads.append((param, new_grad))
else:
repeate_regularizer = False
with framework.name_scope('regularization'):
for param, grad in parameters_and_grads:
if not repeate_regularizer and param.regularizer is not None and regularization is not None:
if (
not repeate_regularizer
and param.regularizer is not None
and regularization is not None
):
repeate_regularizer = True
logging.info(
"If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. "
"The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
% regularization.__str__())
% regularization.__str__()
)
with param.block.program._optimized_guard([param, grad]):
new_grad = self._create_regularization_of_grad(
param, grad, regularization)
param, grad, regularization
)
params_and_grads.append((param, new_grad))
return params_and_grads
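# --- Illustrative aside (not part of the diff) ---------------------------------
# Usage sketch of the precedence rule warned about in the logging.info call
# above: a regularizer attached through ParamAttr wins, and the optimizer-level
# weight_decay is only applied to the remaining parameters. Values are
# illustrative.
import paddle

linear = paddle.nn.Linear(
    4, 4,
    weight_attr=paddle.ParamAttr(
        regularizer=paddle.regularizer.L2Decay(1e-3)  # per-parameter regularizer
    ),
)
opt = paddle.optimizer.Momentum(
    learning_rate=0.1,
    momentum=0.9,
    parameters=linear.parameters(),
    weight_decay=paddle.regularizer.L2Decay(1e-4),  # not applied to linear.weight
)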
......@@ -1114,7 +1282,8 @@ class Optimizer(object):
no_grad_set = _get_no_grad_set_name(no_grad_set)
parameters = loss.block.program.global_block().all_parameters()
param_no_trainable = set(
[param.name for param in parameters if param.stop_gradient is True])
[param.name for param in parameters if param.stop_gradient is True]
)
# If the parameter is not trainable, it should not have a gradient.
no_grad_set.update(param_no_trainable)
......@@ -1155,7 +1324,8 @@ class Optimizer(object):
"""
param_list = []
if self._parameter_list is None or not isinstance(
self._parameter_list[0], dict):
self._parameter_list[0], dict
):
for p in self._parameter_list:
if not p.stop_gradient:
param_list.append(p)
......@@ -1172,11 +1342,9 @@ class Optimizer(object):
core.clear_gradients(param_list, set_to_zero)
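# --- Illustrative aside (not part of the diff) ---------------------------------
# Typical dygraph use of the code above: clear_grad() resets the gradients of
# all trainable parameters after each step (assuming the public
# clear_grad(set_to_zero=True) signature of this release).
import paddle

model = paddle.nn.Linear(8, 1)
opt = paddle.optimizer.Adam(learning_rate=1e-3, parameters=model.parameters())
loss = paddle.mean(model(paddle.randn([2, 8])))
loss.backward()
opt.step()
opt.clear_grad()  # set_to_zero=True by default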
@imperative_base.no_grad
def minimize(self,
loss,
startup_program=None,
parameters=None,
no_grad_set=None):
def minimize(
self, loss, startup_program=None, parameters=None, no_grad_set=None
):
"""
Add operations to minimize ``loss`` by updating ``parameters``.
......@@ -1221,17 +1389,18 @@ class Optimizer(object):
"""
assert isinstance(loss, Variable), "The loss should be a Tensor."
parameter_list = parameters if parameters \
else self._parameter_list
parameter_list = parameters if parameters else self._parameter_list
params_grads = self.backward(loss,
params_grads = self.backward(
loss,
startup_program=startup_program,
parameters=parameter_list,
no_grad_set=no_grad_set)
no_grad_set=no_grad_set,
)
optimize_ops = self._apply_optimize(loss,
startup_program=startup_program,
params_grads=params_grads)
optimize_ops = self._apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads
)
return optimize_ops, params_grads
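# --- Illustrative aside (not part of the diff) ---------------------------------
# minimize() bundles backward() and _apply_optimize() and returns
# (optimize_ops, params_grads); in dygraph the gradients must already exist,
# so loss.backward() still runs first (this mirrors the public docs example).
import paddle

linear = paddle.nn.Linear(10, 10)
inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
loss = paddle.mean(linear(inp))
adam = paddle.optimizer.Adam(learning_rate=0.1, parameters=linear.parameters())
loss.backward()
adam.minimize(loss)
adam.clear_grad()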
......@@ -1271,13 +1440,16 @@ class Optimizer(object):
grad_var = param._grad_ivar()
params_grads.append((param, grad_var))
self._apply_optimize(loss=None,
self._apply_optimize(
loss=None,
startup_program=None,
params_grads=params_grads)
params_grads=params_grads,
param_group_idx=0,
)
else:
# optimize parameters in groups
for param_group in self._param_groups:
for idx, param_group in enumerate(self._param_groups):
params_grads = defaultdict(lambda: list())
for param in param_group['params']:
if param.stop_gradient:
......@@ -1286,11 +1458,14 @@ class Optimizer(object):
grad_var = param._grad_ivar()
params_grads['params'].append((param, grad_var))
params_grads.update(
{k: v
for k, v in param_group.items() if k != 'params'})
self._apply_optimize(loss=None,
{k: v for k, v in param_group.items() if k != 'params'}
)
self._apply_optimize(
loss=None,
startup_program=None,
params_grads=params_grads)
params_grads=params_grads,
param_group_idx=idx,
)
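# --- Illustrative aside (not part of the diff) ---------------------------------
# The configuration this cherry-pick fixes: parameters passed as a list of
# dicts combined with the fused multi-tensor kernels. With the enumerate()
# above, each group reaches _apply_optimize() with its own param_group_idx, so
# the cached buckets of group 0 and group 1 stay separate. use_multi_tensor is
# assumed to be the flag shipped in this release; the values are illustrative.
import paddle

linear_1 = paddle.nn.Linear(8, 8)
linear_2 = paddle.nn.Linear(8, 1)
opt = paddle.optimizer.Momentum(
    learning_rate=0.1,
    momentum=0.9,
    parameters=[
        {'params': linear_1.parameters()},
        {'params': linear_2.parameters(), 'learning_rate': 0.01},
    ],
    use_multi_tensor=True,
)
out = linear_2(paddle.nn.functional.relu(linear_1(paddle.randn([2, 8]))))
loss = paddle.mean(out)
loss.backward()
opt.step()       # iterates self._param_groups together with their indices, as shown above
opt.clear_grad()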
def _add_param_group(self, param_group):
"""
......@@ -1306,7 +1481,8 @@ class Optimizer(object):
elif isinstance(params, set):
raise TypeError(
"optimizer parameters should be in ordered collections,"
"but received set, please use list instead.")
"but received set, please use list instead."
)
else:
param_group['params'] = list(params)
......@@ -1320,18 +1496,21 @@ class Optimizer(object):
if not param_set.isdisjoint(set(param_group['params'])):
raise ValueError(
"some parameters appear in more than one parameter group")
"some parameters appear in more than one parameter group"
)
for param in param_group['params']:
weight_decay = param_group['weight_decay']
if isinstance(weight_decay, float):
from ..fluid.regularizer import L2Decay
regularization = L2Decay(weight_decay)
else:
regularization = weight_decay
param.regularizer = regularization
param.optimize_attr['learning_rate'] = param_group.get(
'learning_rate', 1.)
'learning_rate', 1.0
)
self._param_groups.append(param_group)
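# --- Illustrative aside (not part of the diff) ---------------------------------
# What the loop above does with a float weight_decay: the value is wrapped into
# an L2Decay regularizer and attached to every parameter of the group, and the
# group's learning_rate (default 1.0) is stored in param.optimize_attr.
# Standalone sketch with hypothetical helpers:
def _normalize_group(param_group):
    wd = param_group.get('weight_decay')
    regularization = ('L2Decay', wd) if isinstance(wd, float) else wd
    lr_scale = param_group.get('learning_rate', 1.0)
    return regularization, lr_scale

print(_normalize_group({'params': [], 'weight_decay': 0.01}))  # (('L2Decay', 0.01), 1.0)
print(_normalize_group({'params': []}))                        # (None, 1.0)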
......@@ -1345,7 +1524,7 @@ class Optimizer(object):
pass
@framework.dygraph_only
def _multi_tensor_init(self, target_block, parameters):
def _multi_tensor_init(self, target_block, parameters, param_group_idx):
"""
All tensors used in the optimizer update (such as parameters, master_weight, and velocity_acc for momentum) are grouped into python lists by data type (float16, float32).
This function will be overridden in the corresponding optimizer file.
......@@ -1357,8 +1536,9 @@ class Optimizer(object):
pass
@framework.dygraph_only
def _append_optimize_multi_tensor_op(self, target_block,
parameters_and_grads):
def _append_optimize_multi_tensor_op(
self, target_block, parameters_and_grads, param_group_idx
):
"""
For the multi-tensor path, append the merged optimize operator to the block.
"""