Unverified commit 94240e2e authored by sneaxiy, committed by GitHub


[Cherry-pick Release/2.4] Fix multi_tensor adam and momentum bug when the parameter is list of dict (#47372)

* reformat file by black

* fix multi_tensor adam/momentum bug
Parent: b143e008
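
For context, here is a minimal sketch (not part of the diff) of the case this commit fixes: paddle.optimizer.Adam driven through its multi_tensor path while parameters is passed as a list of dicts, mirroring the updated TestMultiTensorAdam test below. The model, the 50/50 parameter split, and the per-group hyperparameters are illustrative assumptions, not code from the patch.

# Hypothetical repro sketch for the list-of-dict parameter-groups case.
import numpy as np
import paddle

paddle.disable_static()
model = paddle.nn.Linear(5, 5)
params = list(model.parameters())
half = len(params) // 2

opt = paddle.optimizer.Adam(
    parameters=[
        {'params': params[:half], 'beta1': 0.1, 'beta2': 0.99},
        {'params': params[half:], 'weight_decay': 0.001},
    ],
    use_multi_tensor=True,  # the code path exercised by the fixed tests
)

out = model(paddle.to_tensor(np.random.rand(2, 5).astype('float32')))
out.mean().backward()
opt.step()  # previously mishandled when parameters were given as a list of dicts
opt.clear_grad()
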
@@ -25,10 +25,8 @@ from paddle.fluid.framework import _test_eager_guard

class TestAdamOp1(OpTest):
    def setUp(self):
        '''Test Adam Op with supplied attributes'''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")

@@ -50,20 +48,19 @@ class TestAdamOp1(OpTest):

            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
        }

        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}

        param_out, moment1_out, moment2_out = adam_step(self.inputs, self.attrs)

        self.outputs = {
            'Moment1Out': moment1_out,
            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
            'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
        }

    def test_check_output(self):

@@ -71,13 +68,11 @@ class TestAdamOp1(OpTest):

class TestAdamOp2(OpTest):
    def set_shape(self):
        self.shape = (102, 105)

    def setUp(self):
        '''Test Adam Op with supplied attributes'''
        self.op_type = "adam"
        self.set_shape()
        param = np.random.uniform(-1, 1, self.shape).astype("float32")

@@ -100,20 +95,19 @@ class TestAdamOp2(OpTest):

            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
        }

        attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}

        param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)

        self.outputs = {
            'Moment1Out': moment1_out,
            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
            'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
        }

    def test_check_output(self):

@@ -121,16 +115,13 @@ class TestAdamOp2(OpTest):

class TestAdamOnlyTailOp(TestAdamOp2):
    def set_shape(self):
        self.shape = 3


class TestAdamOpMultipleSteps(OpTest):
    def setUp(self):
        '''Test Adam Operator with supplied attributes'''
        self.op_type = "adam"
        self.num_steps = 10

@@ -154,19 +145,20 @@ class TestAdamOpMultipleSteps(OpTest):

            'Moment2': moment2,
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pow': np.array([self.beta1_pow]).astype("float32"),
            'Beta2Pow': np.array([self.beta2_pow]).astype("float32"),
        }

        self.attrs = {
            'epsilon': epsilon,
            'beta1': self.beta1,
            'beta2': self.beta2,
        }

    def test_check_output(self):
        for _ in range(self.num_steps):
            param_out, moment1_out, moment2_out = adam_step(
                self.inputs, self.attrs
            )

            beta1_pow_out = self.inputs['Beta1Pow'] * self.beta1
            beta2_pow_out = self.inputs['Beta2Pow'] * self.beta2

@@ -175,7 +167,7 @@ class TestAdamOpMultipleSteps(OpTest):

                'Moment2Out': moment2_out,
                'ParamOut': param_out,
                'Beta1PowOut': beta1_pow_out,
                'Beta2PowOut': beta2_pow_out,
            }

            # Verify output for this step

@@ -191,8 +183,9 @@ class TestAdamOpMultipleSteps(OpTest):

            self.inputs['Beta2Pow'] = beta2_pow_out

            # Randomize gradient for next step
            self.inputs['Grad'] = np.random.uniform(-1, 1, (102, 105)).astype(
                "float32"
            )

    def test_api_eager_dygraph(self):
        with _test_eager_guard():
@@ -272,8 +265,9 @@ def adamw_step(inputs, attributes):

    return param_out, moment1_out, moment2_out


def adam_step_sparse(
    inputs, attributes, height, rows, row_numel, np_grad, lazy_mode
):
    '''
    Simulate one step of the adam optimizer
    :param inputs: dict of inputs

@@ -298,13 +292,16 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,

    param_out = np.zeros(shape=[height, row_numel])

    def update_row(row_id, update_value):
        moment1_out[row_id] = (
            beta1 * moment1[row_id] + (1 - beta1) * update_value
        )
        moment2_out[row_id] = beta2 * moment2[row_id] + (1 - beta2) * np.square(
            update_value
        )
        lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
        param_out[row_id] = param[row_id] - lr_t * (
            moment1_out[row_id] / (np.sqrt(moment2_out[row_id]) + epsilon)
        )

    if lazy_mode:
        for idx, row_id in enumerate(rows):

@@ -320,7 +317,6 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,

class TestSparseAdamOp(unittest.TestCase):
    def setup(self, scope, place, lazy_mode):
        beta1 = 0.78
        beta2 = 0.836

@@ -339,14 +335,14 @@ class TestSparseAdamOp(unittest.TestCase):

            "Moment2": np.full((height, row_numel), 5.0).astype("float32"),
            'Beta1Pow': beta1_pow,
            'Beta2Pow': beta2_pow,
            "LearningRate": np.full((1), 2.0).astype("float32"),
        }
        self.init_output = np.full((height, row_numel), 0.0).astype("float32")
        self.attrs = {
            'epsilon': epsilon,
            'beta1': beta1,
            'beta2': beta2,
            'min_row_size_to_use_multithread': 2,
        }

        grad_selected_rows = scope.var('Grad').get_selected_rows()

@@ -361,15 +357,21 @@ class TestSparseAdamOp(unittest.TestCase):

        self.sparse_inputs = ["Grad"]

        param_out, mom1, mom2 = adam_step_sparse(
            self.dense_inputs,
            self.attrs,
            height,
            rows,
            row_numel,
            np_array,
            lazy_mode,
        )
        self.outputs = {
            "ParamOut": param_out,
            "Moment1Out": mom1,
            "Moment2Out": mom2,
            'Beta1PowOut': beta1_pow * beta1,
            'Beta2PowOut': beta2_pow * beta2,
        }

    def check_with_place(self, place, lazy_mode):
@@ -414,10 +416,8 @@ class TestSparseAdamOp(unittest.TestCase):

class TestAdamOpBetaVariable(OpTest):
    def setUp(self):
        '''Test Adam Op with beta as Variable'''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")

@@ -446,15 +446,14 @@ class TestAdamOpBetaVariable(OpTest):

        attributes = {'epsilon': epsilon}

        param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)

        self.outputs = {
            'Moment1Out': moment1_out,
            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
            'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
        }

    def test_check_output(self):

@@ -462,10 +461,8 @@ class TestAdamOpBetaVariable(OpTest):

class TestAdamOpBetaEpsilonVariable(OpTest):
    def setUp(self):
        '''Test Adam Op with beta/epsilon as Variable'''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")

@@ -495,15 +492,14 @@ class TestAdamOpBetaEpsilonVariable(OpTest):

        attributes = {'epsilon': epsilon}

        param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)

        self.outputs = {
            'Moment1Out': moment1_out,
            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
            'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
        }

    def test_check_output(self):

@@ -511,10 +507,8 @@ class TestAdamOpBetaEpsilonVariable(OpTest):

class TestAdamOpWithGlobalBetaPow(OpTest):
    def setUp(self):
        '''Test Adam Op with global_beta_pow'''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")

@@ -544,8 +538,7 @@ class TestAdamOpWithGlobalBetaPow(OpTest):

        attributes = {'epsilon': epsilon}

        param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes)

        self.attrs = {'use_global_beta_pow': True}

@@ -555,7 +548,7 @@ class TestAdamOpWithGlobalBetaPow(OpTest):

            'Moment2Out': moment2_out,
            'ParamOut': param_out,
            'Beta1PowOut': np.array([]),
            'Beta2PowOut': np.array([]),
        }

    def test_check_output(self):

@@ -563,10 +556,8 @@ class TestAdamOpWithGlobalBetaPow(OpTest):

class TestAdamOpWithSkipUpdate(OpTest):
    def setUp(self):
        '''Test Adam Op with global_beta_pow'''
        self.op_type = "adam"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
@@ -613,7 +604,6 @@ class TestAdamOpWithSkipUpdate(OpTest):

class TestAdamOpV2(unittest.TestCase):
    def test_adam_op(self):
        place = fluid.CPUPlace()
        shape = [2, 3, 8, 8]

@@ -626,20 +616,20 @@ class TestAdamOpV2(unittest.TestCase):

                conv = fluid.layers.conv2d(data, 8, 3)
                loss = fluid.layers.reduce_mean(conv)

                beta1 = fluid.layers.create_global_var(
                    shape=[1], value=0.85, dtype='float32', persistable=True
                )
                beta2 = fluid.layers.create_global_var(
                    shape=[1], value=0.95, dtype='float32', persistable=True
                )
                betas = [beta1, beta2]
                opt = paddle.optimizer.Adam(
                    learning_rate=1e-5,
                    beta1=beta1,
                    beta2=beta2,
                    weight_decay=0.01,
                    epsilon=1e-8,
                )
                opt.minimize(loss)

        exe.run(startup)

@@ -653,8 +643,9 @@ class TestAdamOpV2(unittest.TestCase):

        a = fluid.dygraph.to_variable(value)
        linear = fluid.Linear(13, 5, dtype="float32")
        adam = paddle.optimizer.Adam(
            learning_rate=0.01, parameters=linear.parameters()
        )
        out = linear(a)
        out.backward()
        adam.step()

@@ -670,26 +661,29 @@ class TestAdamOpV2(unittest.TestCase):

        state_dict = adam.state_dict()
        adam.set_state_dict(state_dict)

        # learning_rate is LRScheduler
        learning_rate = paddle.optimizer.lr.CosineAnnealingDecay(
            learning_rate=0.1, T_max=10
        )
        adam = paddle.optimizer.Adam(
            learning_rate=learning_rate,
            weight_decay=fluid.regularizer.L2Decay(0.001),
            parameters=emb.parameters(),
        )
        lr = adam.get_lr()
        state_dict = adam.state_dict()
        adam.set_state_dict(state_dict)

        # leanrning_rate is Tensor
        with self.assertRaises(TypeError):
            learning_rate = np.array([0.01]).astype("float32")
            learning_rate = paddle.to_tensor(learning_rate)
            adam = paddle.optimizer.Adam(
                learning_rate=learning_rate, parameters=emb.parameters()
            )

        params = adam.get_opti_var_name_list()
        assert params is not None
        paddle.enable_static()

    def test_adam_with_grad_clip(self):

@@ -698,9 +692,9 @@ class TestAdamOpV2(unittest.TestCase):

        a = fluid.dygraph.to_variable(value)
        linear = fluid.Linear(13, 5, dtype="float32")
        clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
        adam = paddle.optimizer.Adam(
            0.1, parameters=linear.parameters(), grad_clip=clip
        )
        out = linear(a)
        out.backward()
        adam.step()

@@ -715,11 +709,11 @@ class TestAdamOpV2(unittest.TestCase):

        lr = 0.01
        adam.set_lr(lr)
        cur_lr = adam.get_lr()
        assert lr == cur_lr
        with self.assertRaises(TypeError):
            lr_var = paddle.fluid.layers.create_global_var(
                shape=[1], value=lr, dtype='float32'
            )
            adam.set_lr(lr_var)
        paddle.enable_static()

@@ -727,17 +721,17 @@ class TestAdamOpV2(unittest.TestCase):

        paddle.disable_static()
        linear = paddle.nn.Linear(10, 10)
        with self.assertRaises(ValueError):
            adam = paddle.optimizer.Adam(
                0.1, beta1=-1, parameters=linear.parameters()
            )
        with self.assertRaises(ValueError):
            adam = paddle.optimizer.Adam(
                0.1, beta2=-1, parameters=linear.parameters()
            )
        with self.assertRaises(ValueError):
            adam = paddle.optimizer.Adam(
                0.1, epsilon=-1, parameters=linear.parameters()
            )
        paddle.enable_static()

    def test_adam_op_with_sparse_input_and_weight_decay(self):

@@ -746,9 +740,9 @@ class TestAdamOpV2(unittest.TestCase):

        x_data = np.arange(0, 10).reshape((10, 1)).astype(np.int64)
        x = paddle.to_tensor(x_data, stop_gradient=False)
        emb = paddle.nn.Embedding(10, 10, sparse=True)
        adam = paddle.optimizer.Adam(
            0.001, parameters=emb.parameters(), weight_decay=0.01
        )

        with self.assertRaises(RuntimeError):
            out = emb(x)

@@ -766,13 +760,14 @@ class TestAdamOpV2(unittest.TestCase):

class TestAdamOptimizer(unittest.TestCase):
    def _test(
        self,
        place,
        use_tensor=True,
        use_fluid_api=True,
        use_global_beta_pow=False,
        flatten_param_grads=False,
    ):
        paddle.enable_static()
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()

@@ -786,29 +781,30 @@ class TestAdamOptimizer(unittest.TestCase):

        weight_attr1 = paddle.ParamAttr(
            name="weight1",
            initializer=fluid.initializer.Constant(value=1.0),
            trainable=True,
        )
        weight_attr2 = paddle.ParamAttr(
            name="weight2",
            initializer=fluid.initializer.Constant(value=2.0),
            trainable=True,
        )
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)

        with paddle.static.program_guard(main_prog, startup_prog):
            with paddle.utils.unique_name.guard():
                a = paddle.static.data(name="a", shape=[2, 2], dtype='float32')
                b = paddle.static.data(name="b", shape=[2, 2], dtype='float32')
                label = paddle.static.data(
                    name="label", shape=[2, 1], dtype='int64'
                )

                sum = paddle.add(a, b)
                z = paddle.pow(sum, 2.0)
                fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1)
                prediction = fluid.layers.fc(
                    input=fc_1, size=2, param_attr=weight_attr2, act='softmax'
                )

                cost = fluid.layers.cross_entropy(input=prediction, label=label)
                loss = fluid.layers.reduce_mean(cost)

@@ -821,19 +817,22 @@ class TestAdamOptimizer(unittest.TestCase):

                        value=float(beta1_init),
                        dtype='float32',
                        persistable=True,
                        name="beta1",
                    )
                    beta2 = fluid.layers.create_global_var(
                        shape=[1],
                        value=float(beta2_init),
                        dtype='float32',
                        persistable=True,
                        name="beta2",
                    )
                    epsilon = fluid.layers.create_global_var(
                        shape=[1],
                        value=float(epsilon_init),
                        dtype='float32',
                        persistable=True,
                        name="epsilon",
                    )
                    if use_fluid_api:
                        adam = fluid.optimizer.Adam(
                            learning_rate=0.01,

@@ -843,13 +842,16 @@ class TestAdamOptimizer(unittest.TestCase):

                            use_global_beta_pow=use_global_beta_pow,
                            flatten_param_grads=flatten_param_grads,
                            align_size=256,
                            grad_clip=clip,
                        )
                    else:
                        adam = paddle.optimizer.Adam(
                            learning_rate=0.01,
                            beta1=beta1,
                            beta2=beta2,
                            epsilon=epsilon,
                            grad_clip=clip,
                        )
                else:
                    if use_fluid_api:
                        adam = fluid.optimizer.Adam(

@@ -860,13 +862,16 @@ class TestAdamOptimizer(unittest.TestCase):

                            use_global_beta_pow=use_global_beta_pow,
                            flatten_param_grads=flatten_param_grads,
                            align_size=256,
                            grad_clip=clip,
                        )
                    else:
                        adam = fluid.optimizer.Adam(
                            learning_rate=0.01,
                            beta1=beta1_init,
                            beta2=beta2_init,
                            epsilon=epsilon_init,
                            grad_clip=clip,
                        )

                adam.minimize(loss)

@@ -877,15 +882,16 @@ class TestAdamOptimizer(unittest.TestCase):

        print("Start run on {}".format(place))
        for epoch in range(10):
            pred_res, loss_res = exe.run(
                main_prog,
                feed={"a": a_np, "b": b_np, "label": label_np},
                fetch_list=[prediction, loss],
            )
            print(
                "Epoch {} | Prediction[0]: {}, Loss: {}".format(
                    epoch, pred_res[0], loss_res
                )
            )

        paddle.disable_static()
        return pred_res, loss_res

@@ -897,10 +903,13 @@ class TestAdamOptimizer(unittest.TestCase):

            for use_fluid_api in [True, False]:
                for use_global_beta_pow in [True, False]:
                    for flatten_param_grads in [True, False]:
                        pred, loss = self._test(
                            place,
                            use_tensor,
                            use_fluid_api,
                            use_global_beta_pow,
                            flatten_param_grads,
                        )
                        preds.append(pred)
                        losses.append(loss)

        for pred in preds:
@@ -922,21 +931,22 @@ class TestAdamOptimizer(unittest.TestCase):

            name="weight1",
            initializer=fluid.initializer.Constant(value=1.0),
            regularizer=fluid.regularizer.L1DecayRegularizer(
                regularization_coeff=0.1
            ),
            trainable=True,
        )
        with fluid.program_guard(main):
            x = fluid.data(name='x', shape=[None, 13], dtype='float32')
            y = fluid.data(name='y', shape=[None, 1], dtype='float32')
            y_predict = fluid.layers.fc(
                input=x, size=1, act=None, param_attr=weight_attr
            )
            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
            avg_cost = paddle.mean(cost)

            adam = fluid.optimizer.AdamOptimizer(
                0.01, flatten_param_grads=True, align_size=256
            )
            adam.minimize(avg_cost)
            paddle.disable_static()

@@ -959,13 +969,16 @@ class TestAdamOptimizer(unittest.TestCase):

            adam = fluid.optimizer.Adam(use_global_beta_pow=True)
            adam.minimize(loss)
            self.assertRaises(Exception, adam._get_global_accumulator, 'tmp')
            adam._add_global_accumulator(
                'tmp', type=core.VarDesc.VarType.LOD_TENSOR
            )
            adam._get_global_accumulator('tmp')
            self.assertRaises(
                Exception,
                adam._add_global_accumulator,
                adam._beta1_pow_acc_str,
                type=core.VarDesc.VarType.LOD_TENSOR,
            )
        paddle.disable_static()

@@ -976,12 +989,14 @@ class TestAdamOptimizer(unittest.TestCase):

        state_dict = linear.state_dict()
        fluid.save_dygraph(state_dict, "paddle_dy")

        scheduler = paddle.optimizer.lr.NoamDecay(
            d_model=0.01, warmup_steps=100, verbose=True
        )
        adam = paddle.fluid.optimizer.Adam(
            learning_rate=scheduler,
            parameter_list=linear.parameters(),
            use_global_beta_pow=True,
        )
        adam.minimize(b)
        state_dict = adam.state_dict()
        fluid.save_dygraph(state_dict, "paddle_dy")

@@ -1002,13 +1017,14 @@ class TestAdamOptimizer(unittest.TestCase):

            state_dict = linear.state_dict()
            fluid.save_dygraph(state_dict, "paddle_dy")

            scheduler = paddle.optimizer.lr.NoamDecay(
                d_model=0.01, warmup_steps=100, verbose=True
            )
            adam = paddle.fluid.optimizer.Adam(
                learning_rate=scheduler,
                parameter_list=linear.parameters(),
                use_global_beta_pow=True,
            )
            adam.minimize(b)
            return adam

@@ -1023,14 +1039,14 @@ class TestAdamOptimizer(unittest.TestCase):

        self.assertRaises(AssertionError, adam2.set_state_dict, opt_state_dict)

        adam3 = get_opt('float32', [10, 10])  # shape not match
        opt_state_dict['beta1_pow_acc_0'] = np.array(
            [0.9, 0.9], dtype='float32'
        )
        self.assertRaises(AssertionError, adam3.set_state_dict, opt_state_dict)
        paddle.enable_static()

class TestAdamOpV2Group(TestAdamOpV2):
    def test_adam_op(self):
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")

@@ -1038,16 +1054,19 @@ class TestAdamOpV2Group(TestAdamOpV2):

        linear_1 = paddle.nn.Linear(13, 5)
        linear_2 = paddle.nn.Linear(5, 3)
        # This can be any optimizer supported by dygraph.
        adam = paddle.optimizer.Adam(
            learning_rate=0.01,
            parameters=[
                {'params': linear_1.parameters()},
                {
                    'params': linear_2.parameters(),
                    'weight_decay': 0.001,
                    'beta1': 0.1,
                    'beta2': 0.99,
                },
            ],
            weight_decay=0.1,
        )
        out = linear_1(a)
        out = linear_2(out)
        out.backward()

@@ -1056,13 +1075,14 @@ class TestAdamOpV2Group(TestAdamOpV2):

class TestMultiTensorAdam(unittest.TestCase):
    def _adam_optimize_dygraph(
        self,
        place,
        use_param_attr=False,
        use_param_group=False,
        use_amp=False,
        use_multi_tensor=False,
    ):
        paddle.disable_static()
        paddle.seed(10)
        paddle.set_device(place)

@@ -1072,29 +1092,40 @@ class TestMultiTensorAdam(unittest.TestCase):

        weight_attr = paddle.ParamAttr(
            learning_rate=0.5,
            regularizer=paddle.regularizer.L2Decay(1.0),
            trainable=True,
        )
        if use_param_attr:
            model = paddle.nn.Linear(5, 5, weight_attr)
        else:
            model = paddle.nn.Linear(5, 5)

        if not use_param_group:
            optimizer = paddle.optimizer.Adam(
                parameters=model.parameters(),
                use_multi_tensor=use_multi_tensor,
                multi_precision=use_amp,
            )
        else:
            parameters = list(model.parameters())
            param_num = len(parameters)
            optimizer = paddle.optimizer.Adam(
                parameters=[
                    {
                        'params': parameters[: int(param_num / 2)],
                        'weight_decay': 0.001,
                        'beta1': 0.1,
                        'beta2': 0.99,
                    },
                    {
                        'params': parameters[int(param_num / 2) :],
                        'weight_decay': 0.001,
                        'beta1': 0.1,
                        'beta2': 0.99,
                    },
                ],
                use_multi_tensor=use_multi_tensor,
                multi_precision=use_amp,
            )

        for idx in range(2):
            if place == 'gpu' and use_amp == True:

@@ -1118,10 +1149,9 @@ class TestMultiTensorAdam(unittest.TestCase):

        return output, model.parameters()

    def _adam_optimize_static(
        self, place, use_amp=False, use_multi_tensor=False
    ):
        paddle.enable_static()
        paddle.seed(10)
        np.random.seed(10)

@@ -1130,24 +1160,26 @@ class TestMultiTensorAdam(unittest.TestCase):

        exe = paddle.static.Executor(place=place)
        train_program = paddle.static.Program()
        startup_program = paddle.static.Program()
        optimizer = paddle.optimizer.Adam(
            multi_precision=use_amp, use_multi_tensor=use_multi_tensor
        )
        if use_amp:
            optimizer = paddle.static.amp.decorate(
                optimizer,
                init_loss_scaling=128.0,
                use_dynamic_loss_scaling=True,
                use_pure_fp16=True,
                use_fp16_guard=False,
            )
        with paddle.static.program_guard(train_program, startup_program):
            if use_amp:
                data = paddle.static.data(
                    shape=[2, 2], name='X', dtype='float16'
                )
            else:
                data = paddle.static.data(
                    shape=[2, 2], name='X', dtype='float32'
                )
            hidden = paddle.static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer.minimize(loss)

@@ -1159,9 +1191,9 @@ class TestMultiTensorAdam(unittest.TestCase):

        x = np.random.random(size=(2, 2)).astype('float32')
        out = []
        for idx in range(5):
            (loss_data,) = exe.run(
                train_program, feed={"X": x}, fetch_list=[loss.name]
            )
            out.append(loss_data)
        return out
@@ -1174,49 +1206,59 @@ class TestMultiTensorAdam(unittest.TestCase):

    def _check_with_place_amp(self, place, use_amp):
        # test dygraph mode
        output_dygraph1, params_dygraph1 = self._adam_optimize_dygraph(
            place=place, use_amp=use_amp, use_multi_tensor=True
        )
        output_dygraph2, params_dygraph2 = self._adam_optimize_dygraph(
            place=place, use_amp=use_amp, use_multi_tensor=False
        )
        np.testing.assert_allclose(output_dygraph1, output_dygraph2, rtol=1e-05)
        for idx in range(len(params_dygraph1)):
            np.testing.assert_allclose(
                params_dygraph1[idx], params_dygraph2[idx], rtol=1e-05
            )
        # test static mode
        output_static1 = self._adam_optimize_static(
            place=place, use_amp=use_amp, use_multi_tensor=True
        )
        output_static2 = self._adam_optimize_static(
            place=place, use_amp=use_amp, use_multi_tensor=False
        )
        for idx in range(len(output_static1)):
            np.testing.assert_allclose(
                output_static1[idx], output_static2[idx], rtol=1e-05
            )

    def _check_with_param_arrt(self, place, use_amp):
        output1, params1 = self._adam_optimize_dygraph(
            place=place,
            use_amp=use_amp,
            use_param_attr=True,
            use_multi_tensor=True,
        )
        output2, params2 = self._adam_optimize_dygraph(
            place=place,
            use_amp=use_amp,
            use_param_attr=True,
            use_multi_tensor=False,
        )

        np.testing.assert_allclose(output1, output2, rtol=1e-05)
        for idx in range(len(params1)):
            np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05)

    def _check_with_param_group(self, place, use_amp):
        output1, params1 = self._adam_optimize_dygraph(
            place=place,
            use_amp=use_amp,
            use_param_group=True,
            use_multi_tensor=True,
        )
        output2, params2 = self._adam_optimize_dygraph(
            place=place,
            use_amp=use_amp,
            use_param_group=True,
            use_multi_tensor=False,
        )

        np.testing.assert_allclose(output1, output2, rtol=1e-05)
        for idx in range(len(params1)):
......
@@ -25,14 +25,16 @@ import numpy

from paddle.fluid.framework import _test_eager_guard


def calculate_momentum_by_numpy(
    param,
    grad,
    mu,
    velocity,
    use_nesterov,
    learning_rate,
    regularization_method=None,
    regularization_coeff=1.0,
):
    if regularization_method == "l2_decay":
        grad = grad + regularization_coeff * param

@@ -44,8 +46,9 @@ def calculate_momentum_by_numpy(param,

    else:
        velocity_out = mu * velocity + grad
        if use_nesterov:
            param_out = (
                param - grad * learning_rate - velocity_out * mu * learning_rate
            )
        else:
            param_out = param - learning_rate * velocity_out

@@ -53,7 +56,6 @@ def calculate_momentum_by_numpy(param,

class TestMomentumOp1(OpTest):
    def setUp(self):
        self.op_type = "momentum"
        self.dtype = np.float32

@@ -70,7 +72,7 @@ class TestMomentumOp1(OpTest):

            'Param': param,
            'Grad': grad,
            'Velocity': velocity,
            'LearningRate': learning_rate,
        }

        self.attrs = {'mu': mu}

@@ -81,7 +83,8 @@ class TestMomentumOp1(OpTest):

            mu=mu,
            velocity=velocity,
            use_nesterov=use_nesterov,
            learning_rate=learning_rate,
        )

        self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}

@@ -93,7 +96,6 @@ class TestMomentumOp1(OpTest):

class TestMomentumOpFp16(TestMomentumOp1):
    def init_dtype(self):
        self.dtype = np.float16

@@ -102,8 +104,7 @@ class TestMomentumOpFp16(TestMomentumOp1):

class TestMomentumOp2(OpTest):
    '''Test Momentum with default values for attributes'''

    def setUp(self):
        self.op_type = "momentum"

@@ -119,7 +120,7 @@ class TestMomentumOp2(OpTest):

            'Param': param,
            'Grad': grad,
            'Velocity': velocity,
            'LearningRate': learning_rate,
        }

        self.attrs = {'mu': mu, 'use_nesterov': use_nesterov}

@@ -130,7 +131,8 @@ class TestMomentumOp2(OpTest):

            mu=mu,
            velocity=velocity,
            use_nesterov=use_nesterov,
            learning_rate=learning_rate,
        )

        self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
@@ -138,10 +140,10 @@ class TestMomentumOp2(OpTest):

        self.check_output()


@unittest.skipIf(
    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
)
class TestLarsMomentumOpWithMP(OpTest):
    def setUp(self):
        self.config()
        self.op_type = "lars_momentum"

@@ -168,11 +170,16 @@ class TestLarsMomentumOpWithMP(OpTest):

            fp32_grad = grad.astype("float32")
            pnorm = np.sqrt(np.square(master_param).sum())
            gnorm = np.sqrt(np.square(fp32_grad).sum())
            local_lr = (
                learning_rate
                * lars_coeff
                * pnorm
                / (gnorm + lars_weight_decay * pnorm)
            )
            fp32_grad = fp32_grad * rescale_grad
            velocity_out = mu * velocity + local_lr * (
                fp32_grad + lars_weight_decay * master_param
            )
            p_new = master_param - velocity_out
            param_out = p_new.astype("float16")
            master_param_out = p_new

@@ -185,7 +192,8 @@ class TestLarsMomentumOpWithMP(OpTest):

            param_outs.append(("SubParam_out_" + str(i), param_out))
            master_params.append(("SubMasterParam_" + str(i), master_param))
            master_param_outs.append(
                ("SubMasterParamOut_" + str(i), master_param_out)
            )

        self.inputs = {
            'Param': params,

@@ -200,13 +208,13 @@ class TestLarsMomentumOpWithMP(OpTest):

            'lars_coeff': lars_coeff,
            'lars_weight_decay': [lars_weight_decay],
            'multi_precision': True,
            'rescale_grad': rescale_grad,
        }

        self.outputs = {
            'ParamOut': param_outs,
            'VelocityOut': velocity_outs,
            'MasterParamOut': master_param_outs,
        }

    def test_check_output(self):

@@ -221,7 +229,6 @@ class TestLarsMomentumOpWithMP(OpTest):

class TestLarsMomentumOp(OpTest):
    def setUp(self):
        self.config()
        self.op_type = "lars_momentum"

@@ -242,10 +249,15 @@ class TestLarsMomentumOp(OpTest):

            learning_rate = np.array([0.001]).astype("float32")
            pnorm = np.sqrt(np.square(param).sum())
            gnorm = np.sqrt(np.square(grad).sum())
            local_lr = (
                learning_rate
                * lars_coeff
                * pnorm
                / (gnorm + lars_weight_decay * param)
            )
            velocity_out = mu * velocity + local_lr * (
                grad + lars_weight_decay * param
            )
            param_out = param - velocity_out

            params.append(("SubParam_" + str(i), param))

@@ -259,13 +271,13 @@ class TestLarsMomentumOp(OpTest):

            'Param': params,
            'Grad': grads,
            'Velocity': velocitys,
            'LearningRate': learning_rates,
        }

        self.attrs = {
            'mu': mu,
            'lars_coeff': lars_coeff,
            'lars_weight_decay': [lars_weight_decay],
        }
        self.outputs = {'ParamOut': param_outs, 'VelocityOut': velocity_outs}
@@ -278,7 +290,6 @@ class TestLarsMomentumOp(OpTest):

class TestSparseMomentumOp(unittest.TestCase):
    def setUp(self):
        self.use_nesterov = False
        self.regularization_method = ""

@@ -317,8 +328,9 @@ class TestSparseMomentumOp(unittest.TestCase):

        velocity_np_array = np.ones((height, row_numel)).astype("float32")
        velocity.set(velocity_np_array, place)
        velocity_out = scope.var('VelocityOut').get_tensor()
        velocity_out_np_array = np.full((height, row_numel), 0.0).astype(
            "float32"
        )
        velocity_out.set(velocity_out_np_array, place)

        # create and initialize LearningRate Variable

@@ -327,17 +339,19 @@ class TestSparseMomentumOp(unittest.TestCase):

        lr.set(lr_array, place)

        # create and run operator
        op = Operator(
            "momentum",
            Param='Param',
            Grad='Grad',
            Velocity='Velocity',
            ParamOut='ParamOut',
            VelocityOut='VelocityOut',
            LearningRate='LearningRate',
            mu=mu,
            use_nesterov=use_nesterov,
            regularization_method=regularization_method,
            regularization_coeff=regularization_coeff,
        )
        op.run(scope, place)

        # get and compare result

@@ -360,7 +374,8 @@ class TestSparseMomentumOp(unittest.TestCase):

            use_nesterov=use_nesterov,
            learning_rate=lr_array,
            regularization_method=regularization_method,
            regularization_coeff=regularization_coeff,
        )

        self.assertTrue((_velocity_out == velocity_out_np_array).all())
        self.assertTrue((_param_out == param_out_np_array).all())
...@@ -377,13 +392,11 @@ class TestSparseMomentumOp(unittest.TestCase): ...@@ -377,13 +392,11 @@ class TestSparseMomentumOp(unittest.TestCase):
class TestSparseMomentumOp2(TestSparseMomentumOp): class TestSparseMomentumOp2(TestSparseMomentumOp):
def init_kernel(self): def init_kernel(self):
self.use_nesterov = True self.use_nesterov = True
class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase): class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
def setUp(self): def setUp(self):
self.init_args() self.init_args()
self.regularization_method = "" self.regularization_method = ""
...@@ -427,8 +440,9 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase): ...@@ -427,8 +440,9 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
velocity_np_array = np.ones((height, row_numel)).astype("float32") velocity_np_array = np.ones((height, row_numel)).astype("float32")
velocity.set(velocity_np_array, place) velocity.set(velocity_np_array, place)
velocity_out = scope.var('VelocityOut').get_tensor() velocity_out = scope.var('VelocityOut').get_tensor()
velocity_out_np_array = np.full((height, row_numel), velocity_out_np_array = np.full((height, row_numel), 0.0).astype(
0.0).astype("float32") "float32"
)
velocity_out.set(velocity_out_np_array, place) velocity_out.set(velocity_out_np_array, place)
# create and initialize LearningRate Variable # create and initialize LearningRate Variable
...@@ -437,21 +451,23 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase): ...@@ -437,21 +451,23 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
lr.set(lr_array, place) lr.set(lr_array, place)
# create and run operator # create and run operator
op = Operator("momentum", op = Operator(
Param='Param', "momentum",
Grad='Grad', Param='Param',
Velocity='Velocity', Grad='Grad',
MasterParam='MasterParam', Velocity='Velocity',
ParamOut='ParamOut', MasterParam='MasterParam',
VelocityOut='VelocityOut', ParamOut='ParamOut',
MasterParamOut='MasterParamOut', VelocityOut='VelocityOut',
LearningRate='LearningRate', MasterParamOut='MasterParamOut',
mu=mu, LearningRate='LearningRate',
use_nesterov=use_nesterov, mu=mu,
regularization_method=regularization_method, use_nesterov=use_nesterov,
regularization_coeff=regularization_coeff, regularization_method=regularization_method,
multi_precision=True, regularization_coeff=regularization_coeff,
rescale_grad=1.0) multi_precision=True,
rescale_grad=1.0,
)
op.run(scope, place) op.run(scope, place)
# get and compare result # get and compare result
...@@ -472,7 +488,8 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase): ...@@ -472,7 +488,8 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
use_nesterov=use_nesterov, use_nesterov=use_nesterov,
learning_rate=lr_array, learning_rate=lr_array,
regularization_method=regularization_method, regularization_method=regularization_method,
regularization_coeff=regularization_coeff) regularization_coeff=regularization_coeff,
)
self.assertTrue((_velocity_out == velocity_out_np_array).all()) self.assertTrue((_velocity_out == velocity_out_np_array).all())
self.assertTrue((_param_out == param_out_np_array).all()) self.assertTrue((_param_out == param_out_np_array).all())
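multi_precision=True keeps an FP32 master copy (the MasterParam/MasterParamOut pair wired into the operator above) next to the FP16 parameter: the momentum arithmetic runs on the master copy and the FP16 tensor is refreshed from it. A small numpy illustration of that idea (hypothetical values, not the operator itself):

import numpy as np

mu, lr = 0.9, 0.01
param_fp16 = np.random.uniform(-1, 1, (10, 2)).astype("float16")
master_fp32 = param_fp16.astype("float32")      # master copy kept by the op
grad = np.random.uniform(-1, 1, (10, 2)).astype("float32")
velocity = np.zeros_like(master_fp32)

velocity = mu * velocity + grad                 # update runs in FP32
master_fp32 = master_fp32 - lr * velocity
param_fp16 = master_fp32.astype("float16")      # value the model actually sees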
...@@ -486,23 +503,22 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase): ...@@ -486,23 +503,22 @@ class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
class TestSparseMomentumOpWithMultiPrecision2( class TestSparseMomentumOpWithMultiPrecision2(
TestSparseMomentumOpWithMultiPrecision): TestSparseMomentumOpWithMultiPrecision
):
def init_args(self): def init_args(self):
self.use_nesterov = True self.use_nesterov = True
class TestMomentumV2(unittest.TestCase): class TestMomentumV2(unittest.TestCase):
def test_momentum_dygraph(self): def test_momentum_dygraph(self):
paddle.disable_static() paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32") value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value) a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5) linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph. # This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Momentum(learning_rate=0.01, adam = paddle.optimizer.Momentum(
momentum=0.9, learning_rate=0.01, momentum=0.9, parameters=linear.parameters()
parameters=linear.parameters()) )
out = linear(a) out = linear(a)
out.backward() out.backward()
adam.step() adam.step()
...@@ -519,13 +535,15 @@ class TestMomentumV2(unittest.TestCase): ...@@ -519,13 +535,15 @@ class TestMomentumV2(unittest.TestCase):
cost = fluid.layers.square_error_cost(input=y_predict, label=y) cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = paddle.mean(cost) avg_cost = paddle.mean(cost)
rms_optimizer = paddle.optimizer.Momentum(learning_rate=0.1, rms_optimizer = paddle.optimizer.Momentum(
momentum=0.9) learning_rate=0.1, momentum=0.9
)
rms_optimizer.minimize(avg_cost) rms_optimizer.minimize(avg_cost)
fetch_list = [avg_cost] fetch_list = [avg_cost]
train_reader = paddle.batch(paddle.dataset.uci_housing.train(), train_reader = paddle.batch(
batch_size=1) paddle.dataset.uci_housing.train(), batch_size=1
)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
...@@ -533,9 +551,9 @@ class TestMomentumV2(unittest.TestCase): ...@@ -533,9 +551,9 @@ class TestMomentumV2(unittest.TestCase):
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
def test_raise_error(self): def test_raise_error(self):
self.assertRaises(ValueError, self.assertRaises(
paddle.optimizer.Momentum, ValueError, paddle.optimizer.Momentum, learning_rate=None
learning_rate=None) )
self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None) self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None)
def test_api_eager_dygraph(self): def test_api_eager_dygraph(self):
...@@ -545,7 +563,6 @@ class TestMomentumV2(unittest.TestCase): ...@@ -545,7 +563,6 @@ class TestMomentumV2(unittest.TestCase):
class TestMomentumOpWithDecay(OpTest): class TestMomentumOpWithDecay(OpTest):
def setUp(self): def setUp(self):
self.op_type = "momentum" self.op_type = "momentum"
self.dtype = np.float32 self.dtype = np.float32
...@@ -567,14 +584,14 @@ class TestMomentumOpWithDecay(OpTest): ...@@ -567,14 +584,14 @@ class TestMomentumOpWithDecay(OpTest):
'Param': param, 'Param': param,
'Grad': grad, 'Grad': grad,
'Velocity': velocity, 'Velocity': velocity,
'LearningRate': learning_rate 'LearningRate': learning_rate,
} }
self.attrs = { self.attrs = {
'mu': mu, 'mu': mu,
'use_nesterov': use_nesterov, 'use_nesterov': use_nesterov,
'regularization_method': regularization_method, 'regularization_method': regularization_method,
'regularization_coeff': regularization_coeff 'regularization_coeff': regularization_coeff,
} }
grad = grad + regularization_coeff * param grad = grad + regularization_coeff * param
...@@ -585,7 +602,8 @@ class TestMomentumOpWithDecay(OpTest): ...@@ -585,7 +602,8 @@ class TestMomentumOpWithDecay(OpTest):
mu=mu, mu=mu,
velocity=velocity, velocity=velocity,
use_nesterov=use_nesterov, use_nesterov=use_nesterov,
learning_rate=learning_rate) learning_rate=learning_rate,
)
self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
...@@ -598,7 +616,6 @@ class TestMomentumOpWithDecay(OpTest): ...@@ -598,7 +616,6 @@ class TestMomentumOpWithDecay(OpTest):
class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay): class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay):
def init_config(self): def init_config(self):
self.dtype = np.float16 self.dtype = np.float16
...@@ -608,13 +625,11 @@ class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay): ...@@ -608,13 +625,11 @@ class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay):
class TestMomentumOpWithDecay2(TestMomentumOpWithDecay): class TestMomentumOpWithDecay2(TestMomentumOpWithDecay):
def init_config(self): def init_config(self):
self.use_nesterov = False self.use_nesterov = False
class TestSparseMomentumOpWithDecay(TestSparseMomentumOp): class TestSparseMomentumOpWithDecay(TestSparseMomentumOp):
def setUp(self): def setUp(self):
self.use_nesterov = False self.use_nesterov = False
self.regularization_method = 'l2_decay' self.regularization_method = 'l2_decay'
...@@ -622,13 +637,11 @@ class TestSparseMomentumOpWithDecay(TestSparseMomentumOp): ...@@ -622,13 +637,11 @@ class TestSparseMomentumOpWithDecay(TestSparseMomentumOp):
class TestSparseMomentumOpWithDecay2(TestSparseMomentumOpWithDecay): class TestSparseMomentumOpWithDecay2(TestSparseMomentumOpWithDecay):
def init_kernel(self): def init_kernel(self):
self.use_nesterov = True self.use_nesterov = True
class TestMomentumOpWithDecayAPI(unittest.TestCase): class TestMomentumOpWithDecayAPI(unittest.TestCase):
def _test_momentum_dygraph_common(self, regularization): def _test_momentum_dygraph_common(self, regularization):
paddle.disable_static() paddle.disable_static()
inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
...@@ -641,13 +654,16 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase): ...@@ -641,13 +654,16 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase):
learning_rate=0.01, learning_rate=0.01,
momentum=0.9, momentum=0.9,
parameter_list=linear.parameters(), parameter_list=linear.parameters(),
regularization=regularization) regularization=regularization,
)
momentum.minimize(loss) momentum.minimize(loss)
def test_momentum_dygraph_1(self): def test_momentum_dygraph_1(self):
self._test_momentum_dygraph_common( self._test_momentum_dygraph_common(
regularization=paddle.fluid.regularizer.L2Decay( regularization=paddle.fluid.regularizer.L2Decay(
regularization_coeff=0.1)) regularization_coeff=0.1
)
)
def test_momentum_static(self): def test_momentum_static(self):
paddle.enable_static() paddle.enable_static()
...@@ -661,12 +677,14 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase): ...@@ -661,12 +677,14 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase):
avg_cost = paddle.mean(cost) avg_cost = paddle.mean(cost)
momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum( momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum(
learning_rate=0.1, momentum=0.9) learning_rate=0.1, momentum=0.9
)
momentum_optimizer.minimize(avg_cost) momentum_optimizer.minimize(avg_cost)
fetch_list = [avg_cost] fetch_list = [avg_cost]
train_reader = paddle.batch(paddle.dataset.uci_housing.train(), train_reader = paddle.batch(
batch_size=1) paddle.dataset.uci_housing.train(), batch_size=1
)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
...@@ -675,23 +693,23 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase): ...@@ -675,23 +693,23 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase):
class TestFusedMomentumWithDecayAPI(unittest.TestCase): class TestFusedMomentumWithDecayAPI(unittest.TestCase):
def get_program(self, weight_attr, bias_attr=False): def get_program(self, weight_attr, bias_attr=False):
main_program = paddle.static.Program() main_program = paddle.static.Program()
startup_program = paddle.static.Program() startup_program = paddle.static.Program()
with paddle.static.program_guard(main_program=main_program, with paddle.static.program_guard(
startup_program=startup_program): main_program=main_program, startup_program=startup_program
):
x = paddle.static.data(name='x', shape=[10, 10]) x = paddle.static.data(name='x', shape=[10, 10])
linear = paddle.nn.Linear(10, linear = paddle.nn.Linear(
10, 10, 10, weight_attr=weight_attr, bias_attr=bias_attr
weight_attr=weight_attr, )
bias_attr=bias_attr)
out = linear(x) out = linear(x)
loss = paddle.mean(out) loss = paddle.mean(out)
optimizer = paddle.optimizer.Momentum( optimizer = paddle.optimizer.Momentum(
learning_rate=0.01, learning_rate=0.01,
momentum=0.9, momentum=0.9,
weight_decay=paddle.regularizer.L2Decay(0.5)) weight_decay=paddle.regularizer.L2Decay(0.5),
)
optimizer.minimize(loss) optimizer.minimize(loss)
return main_program return main_program
...@@ -700,7 +718,8 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase): ...@@ -700,7 +718,8 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase):
weight_attr = paddle.ParamAttr( weight_attr = paddle.ParamAttr(
name="weight", name="weight",
initializer=paddle.nn.initializer.Constant(value=0.5), initializer=paddle.nn.initializer.Constant(value=0.5),
regularizer=paddle.regularizer.L2Decay(0.1)) regularizer=paddle.regularizer.L2Decay(0.1),
)
program = self.get_program(weight_attr, bias_attr=False) program = self.get_program(weight_attr, bias_attr=False)
ops = program.global_block().ops ops = program.global_block().ops
...@@ -715,11 +734,13 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase): ...@@ -715,11 +734,13 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase):
weight_attr = paddle.ParamAttr( weight_attr = paddle.ParamAttr(
name="weight", name="weight",
initializer=paddle.nn.initializer.Constant(value=0.5), initializer=paddle.nn.initializer.Constant(value=0.5),
regularizer=paddle.regularizer.L1Decay(0.1)) regularizer=paddle.regularizer.L1Decay(0.1),
)
bias_attr = paddle.ParamAttr( bias_attr = paddle.ParamAttr(
name="bias", name="bias",
initializer=paddle.nn.initializer.Constant(value=0.), initializer=paddle.nn.initializer.Constant(value=0.0),
regularizer=None) regularizer=None,
)
program = self.get_program(weight_attr, bias_attr) program = self.get_program(weight_attr, bias_attr)
ops = program.global_block().ops ops = program.global_block().ops
...@@ -734,8 +755,9 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase): ...@@ -734,8 +755,9 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase):
self.assertEqual(ops[-1].attr('regularization_coeff'), 0) self.assertEqual(ops[-1].attr('regularization_coeff'), 0)
if 'bias' in ops[-2].input('Param'): if 'bias' in ops[-2].input('Param'):
self.assertEqual(ops[-2].attr('regularization_method'), 'l2_decay') self.assertEqual(ops[-2].attr('regularization_method'), 'l2_decay')
self.assertEqual(ops[-2].attr('regularization_coeff'), self.assertEqual(
np.float32(0.5)) ops[-2].attr('regularization_coeff'), np.float32(0.5)
)
def test_param_has_no_regularizer(self): def test_param_has_no_regularizer(self):
paddle.enable_static() paddle.enable_static()
...@@ -749,11 +771,11 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase): ...@@ -749,11 +771,11 @@ class TestFusedMomentumWithDecayAPI(unittest.TestCase):
class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase): class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
def __update_params(self, momentum, linear): def __update_params(self, momentum, linear):
for i in range(10): for i in range(10):
inp = paddle.full(shape=[2, 2], fill_value=i, inp = paddle.full(
dtype='float32').astype("float32") shape=[2, 2], fill_value=i, dtype='float32'
).astype("float32")
inp = paddle.to_tensor(inp) inp = paddle.to_tensor(inp)
out = linear(inp) out = linear(inp)
loss = paddle.mean(out) loss = paddle.mean(out)
...@@ -768,32 +790,39 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase): ...@@ -768,32 +790,39 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
2, 2,
2, 2,
weight_attr=paddle.nn.initializer.Constant(value=2.0), weight_attr=paddle.nn.initializer.Constant(value=2.0),
bias_attr=paddle.nn.initializer.Constant(value=2.0)) bias_attr=paddle.nn.initializer.Constant(value=2.0),
)
momentum_old = paddle.fluid.optimizer.Momentum( momentum_old = paddle.fluid.optimizer.Momentum(
learning_rate=0.01, learning_rate=0.01,
momentum=0.9, momentum=0.9,
parameter_list=linear_old.parameters(), parameter_list=linear_old.parameters(),
regularization=paddle.fluid.regularizer.L2Decay( regularization=paddle.fluid.regularizer.L2Decay(
regularization_coeff=0.1)) regularization_coeff=0.1
),
)
self.__update_params(momentum=momentum_old, linear=linear_old) self.__update_params(momentum=momentum_old, linear=linear_old)
linear_new = paddle.nn.Linear( linear_new = paddle.nn.Linear(
2, 2,
2, 2,
weight_attr=paddle.nn.initializer.Constant(value=2.0), weight_attr=paddle.nn.initializer.Constant(value=2.0),
bias_attr=paddle.nn.initializer.Constant(value=2.0)) bias_attr=paddle.nn.initializer.Constant(value=2.0),
)
momentum_new = paddle.fluid.contrib.optimizer.Momentum( momentum_new = paddle.fluid.contrib.optimizer.Momentum(
learning_rate=0.01, learning_rate=0.01,
momentum=0.9, momentum=0.9,
parameter_list=linear_new.parameters(), parameter_list=linear_new.parameters(),
regularization=paddle.fluid.regularizer.L2Decay( regularization=paddle.fluid.regularizer.L2Decay(
regularization_coeff=0.1)) regularization_coeff=0.1
),
)
self.__update_params(momentum=momentum_new, linear=linear_new) self.__update_params(momentum=momentum_new, linear=linear_new)
self.assertEqual( self.assertEqual(
(linear_old.weight.numpy() == linear_new.weight.numpy()).all(), (linear_old.weight.numpy() == linear_new.weight.numpy()).all(),
True, True,
'the param weight updated by two Momentum optimizers should be equal') 'the param weight updated by two Momentum optimizers should be equal',
)
def test_vs(self, place=fluid.CPUPlace()): def test_vs(self, place=fluid.CPUPlace()):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
...@@ -805,7 +834,6 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase): ...@@ -805,7 +834,6 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
class TestMomentumV2Group(TestMomentumV2): class TestMomentumV2Group(TestMomentumV2):
def test_momentum_dygraph(self): def test_momentum_dygraph(self):
paddle.disable_static() paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32") value = np.arange(26).reshape(2, 13).astype("float32")
...@@ -813,22 +841,20 @@ class TestMomentumV2Group(TestMomentumV2): ...@@ -813,22 +841,20 @@ class TestMomentumV2Group(TestMomentumV2):
linear_1 = paddle.nn.Linear(13, 5) linear_1 = paddle.nn.Linear(13, 5)
linear_2 = paddle.nn.Linear(5, 3) linear_2 = paddle.nn.Linear(5, 3)
# This can be any optimizer supported by dygraph. # This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Momentum(learning_rate=0.01, adam = paddle.optimizer.Momentum(
parameters=[{ learning_rate=0.01,
'params': parameters=[
linear_1.parameters() {'params': linear_1.parameters()},
}, { {
'params': 'params': linear_2.parameters(),
linear_2.parameters(), 'weight_decay': 0.001,
'weight_decay': 'learning_rate': 0.1,
0.001, 'momentum': 0.99,
'learning_rate': },
0.1, ],
'momentum': weight_decay=0.1,
0.99 momentum=0.9,
}], )
weight_decay=0.1,
momentum=0.9)
out = linear_1(a) out = linear_1(a)
out = linear_2(out) out = linear_2(out)
out.backward() out.backward()
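Each dict in the parameters list above forms one parameter group: keys such as weight_decay, learning_rate and momentum override the optimizer-level defaults for that group only, while a group that omits them (like the first one) falls back to the constructor arguments. A minimal sketch of the same pattern outside the test harness (layer names are illustrative):

import paddle

layer_a = paddle.nn.Linear(4, 4)
layer_b = paddle.nn.Linear(4, 2)
opt = paddle.optimizer.Momentum(
    learning_rate=0.01,        # default for groups that do not override it
    momentum=0.9,
    weight_decay=0.1,
    parameters=[
        {'params': layer_a.parameters()},                 # inherits the defaults
        {'params': layer_b.parameters(),                  # per-group overrides
         'learning_rate': 0.1, 'momentum': 0.99, 'weight_decay': 0.001},
    ],
)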
...@@ -837,13 +863,14 @@ class TestMomentumV2Group(TestMomentumV2): ...@@ -837,13 +863,14 @@ class TestMomentumV2Group(TestMomentumV2):
class TestMultiTensorMomentumDygraph(unittest.TestCase): class TestMultiTensorMomentumDygraph(unittest.TestCase):
def _momentum_optimize_dygraph(
def _momentum_optimize_dygraph(self, self,
place, place,
use_param_attr=False, use_param_attr=False,
use_param_group=False, use_param_group=False,
use_amp=False, use_amp=False,
use_multi_tensor=False): use_multi_tensor=False,
):
paddle.disable_static() paddle.disable_static()
paddle.seed(10) paddle.seed(10)
paddle.set_device(place) paddle.set_device(place)
...@@ -851,7 +878,8 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase): ...@@ -851,7 +878,8 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
weight_attr = paddle.ParamAttr( weight_attr = paddle.ParamAttr(
learning_rate=0.5, learning_rate=0.5,
regularizer=paddle.regularizer.L2Decay(1.0), regularizer=paddle.regularizer.L2Decay(1.0),
trainable=True) trainable=True,
)
if use_param_attr: if use_param_attr:
model = paddle.nn.Linear(5, 5, weight_attr) model = paddle.nn.Linear(5, 5, weight_attr)
else: else:
...@@ -860,17 +888,29 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase): ...@@ -860,17 +888,29 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
optimizer = paddle.optimizer.Momentum( optimizer = paddle.optimizer.Momentum(
parameters=model.parameters(), parameters=model.parameters(),
use_multi_tensor=use_multi_tensor, use_multi_tensor=use_multi_tensor,
multi_precision=use_amp) multi_precision=use_amp,
)
else: else:
parameters = list(model.parameters())
n = len(parameters)
optimizer = paddle.optimizer.Momentum( optimizer = paddle.optimizer.Momentum(
parameters=[{ parameters=[
'params': model.parameters(), {
'weight_decay': 0.001, 'params': parameters[: int(n / 2)],
'learning_rate': 0.1, 'weight_decay': 0.001,
'momentum': 0.99 'learning_rate': 0.1,
}], 'momentum': 0.99,
},
{
'params': parameters[int(n / 2) :],
'weight_decay': 0.001,
'learning_rate': 0.1,
'momentum': 0.99,
},
],
use_multi_tensor=use_multi_tensor, use_multi_tensor=use_multi_tensor,
multi_precision=use_amp) multi_precision=use_amp,
)
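# Splitting model.parameters() into two dicts makes `parameters` a list of
# dict, i.e. two parameter groups. That is exactly the multi-tensor
# configuration this change is meant to handle; the surrounding test then
# compares the result against the use_multi_tensor=False run.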
for idx in range(5): for idx in range(5):
if place == 'gpu' and use_amp == True: if place == 'gpu' and use_amp == True:
model = paddle.amp.decorate(models=model, level='O2') model = paddle.amp.decorate(models=model, level='O2')
...@@ -900,9 +940,11 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase): ...@@ -900,9 +940,11 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
def _check_with_place_amp(self, place, use_amp): def _check_with_place_amp(self, place, use_amp):
output1, params1 = self._momentum_optimize_dygraph( output1, params1 = self._momentum_optimize_dygraph(
place=place, use_amp=use_amp, use_multi_tensor=True) place=place, use_amp=use_amp, use_multi_tensor=True
)
output2, params2 = self._momentum_optimize_dygraph( output2, params2 = self._momentum_optimize_dygraph(
place=place, use_amp=use_amp, use_multi_tensor=False) place=place, use_amp=use_amp, use_multi_tensor=False
)
np.testing.assert_allclose(output1, output2, rtol=1e-05) np.testing.assert_allclose(output1, output2, rtol=1e-05)
for idx in range(len(params1)): for idx in range(len(params1)):
...@@ -913,12 +955,14 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase): ...@@ -913,12 +955,14 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
place=place, place=place,
use_amp=use_amp, use_amp=use_amp,
use_param_attr=True, use_param_attr=True,
use_multi_tensor=True) use_multi_tensor=True,
)
output2, params2 = self._momentum_optimize_dygraph( output2, params2 = self._momentum_optimize_dygraph(
place=place, place=place,
use_amp=use_amp, use_amp=use_amp,
use_param_attr=True, use_param_attr=True,
use_multi_tensor=False) use_multi_tensor=False,
)
np.testing.assert_allclose(output1, output2, rtol=1e-05) np.testing.assert_allclose(output1, output2, rtol=1e-05)
for idx in range(len(params1)): for idx in range(len(params1)):
np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05) np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05)
...@@ -928,12 +972,14 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase): ...@@ -928,12 +972,14 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
place=place, place=place,
use_amp=use_amp, use_amp=use_amp,
use_param_group=True, use_param_group=True,
use_multi_tensor=True) use_multi_tensor=True,
)
output2, params2 = self._momentum_optimize_dygraph( output2, params2 = self._momentum_optimize_dygraph(
place=place, place=place,
use_amp=use_amp, use_amp=use_amp,
use_param_group=True, use_param_group=True,
use_multi_tensor=False) use_multi_tensor=False,
)
np.testing.assert_allclose(output1, output2, rtol=1e-05) np.testing.assert_allclose(output1, output2, rtol=1e-05)
for idx in range(len(params1)): for idx in range(len(params1)):
np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05) np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05)
...@@ -952,11 +998,9 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase): ...@@ -952,11 +998,9 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
class TestMultiTensorMomentumStatic(unittest.TestCase): class TestMultiTensorMomentumStatic(unittest.TestCase):
def _momentum_optimize_static(
def _momentum_optimize_static(self, self, place, use_amp=False, use_multi_tensor=False
place, ):
use_amp=False,
use_multi_tensor=False):
paddle.enable_static() paddle.enable_static()
paddle.seed(10) paddle.seed(10)
np.random.seed(10) np.random.seed(10)
...@@ -965,24 +1009,26 @@ class TestMultiTensorMomentumStatic(unittest.TestCase): ...@@ -965,24 +1009,26 @@ class TestMultiTensorMomentumStatic(unittest.TestCase):
exe = paddle.static.Executor(place=place) exe = paddle.static.Executor(place=place)
train_program = paddle.static.Program() train_program = paddle.static.Program()
startup_program = paddle.static.Program() startup_program = paddle.static.Program()
optimizer = paddle.optimizer.Momentum(multi_precision=use_amp, optimizer = paddle.optimizer.Momentum(
use_multi_tensor=use_multi_tensor) multi_precision=use_amp, use_multi_tensor=use_multi_tensor
)
if use_amp: if use_amp:
optimizer = paddle.static.amp.decorate( optimizer = paddle.static.amp.decorate(
optimizer, optimizer,
init_loss_scaling=128.0, init_loss_scaling=128.0,
use_dynamic_loss_scaling=True, use_dynamic_loss_scaling=True,
use_pure_fp16=True, use_pure_fp16=True,
use_fp16_guard=False) use_fp16_guard=False,
)
with paddle.static.program_guard(train_program, startup_program): with paddle.static.program_guard(train_program, startup_program):
if use_amp: if use_amp:
data = paddle.static.data(shape=[2, 2], data = paddle.static.data(
name='X', shape=[2, 2], name='X', dtype='float16'
dtype='float16') )
else: else:
data = paddle.static.data(shape=[2, 2], data = paddle.static.data(
name='X', shape=[2, 2], name='X', dtype='float32'
dtype='float32') )
hidden = paddle.static.nn.fc(x=data, size=10) hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden) loss = paddle.mean(hidden)
optimizer.minimize(loss) optimizer.minimize(loss)
...@@ -994,9 +1040,9 @@ class TestMultiTensorMomentumStatic(unittest.TestCase): ...@@ -994,9 +1040,9 @@ class TestMultiTensorMomentumStatic(unittest.TestCase):
x = numpy.random.random(size=(2, 2)).astype('float32') x = numpy.random.random(size=(2, 2)).astype('float32')
out = [] out = []
for idx in range(5): for idx in range(5):
loss_data, = exe.run(train_program, (loss_data,) = exe.run(
feed={"X": x}, train_program, feed={"X": x}, fetch_list=[loss.name]
fetch_list=[loss.name]) )
out.append(loss_data) out.append(loss_data)
return out return out
...@@ -1007,12 +1053,12 @@ class TestMultiTensorMomentumStatic(unittest.TestCase): ...@@ -1007,12 +1053,12 @@ class TestMultiTensorMomentumStatic(unittest.TestCase):
return places return places
def _check_with_place_amp(self, place, use_amp): def _check_with_place_amp(self, place, use_amp):
output1 = self._momentum_optimize_static(place=place, output1 = self._momentum_optimize_static(
use_amp=use_amp, place=place, use_amp=use_amp, use_multi_tensor=True
use_multi_tensor=True) )
output2 = self._momentum_optimize_static(place=place, output2 = self._momentum_optimize_static(
use_amp=use_amp, place=place, use_amp=use_amp, use_multi_tensor=False
use_multi_tensor=False) )
for idx in range(len(output1)): for idx in range(len(output1)):
np.testing.assert_allclose(output1[idx], output2[idx], rtol=1e-05) np.testing.assert_allclose(output1[idx], output2[idx], rtol=1e-05)
......
...@@ -163,18 +163,20 @@ class Adam(Optimizer): ...@@ -163,18 +163,20 @@ class Adam(Optimizer):
_beta1_pow_acc_str = "beta1_pow_acc" _beta1_pow_acc_str = "beta1_pow_acc"
_beta2_pow_acc_str = "beta2_pow_acc" _beta2_pow_acc_str = "beta2_pow_acc"
def __init__(self, def __init__(
learning_rate=0.001, self,
beta1=0.9, learning_rate=0.001,
beta2=0.999, beta1=0.9,
epsilon=1e-8, beta2=0.999,
parameters=None, epsilon=1e-8,
weight_decay=None, parameters=None,
grad_clip=None, weight_decay=None,
lazy_mode=False, grad_clip=None,
multi_precision=False, lazy_mode=False,
use_multi_tensor=False, multi_precision=False,
name=None): use_multi_tensor=False,
name=None,
):
assert learning_rate is not None assert learning_rate is not None
assert beta1 is not None assert beta1 is not None
assert beta2 is not None assert beta2 is not None
...@@ -182,20 +184,25 @@ class Adam(Optimizer): ...@@ -182,20 +184,25 @@ class Adam(Optimizer):
if not isinstance(beta1, Variable): if not isinstance(beta1, Variable):
if not 0 <= beta1 < 1: if not 0 <= beta1 < 1:
raise ValueError( raise ValueError(
"Invaild value of beta1, expect beta1 in [0,1).") "Invaild value of beta1, expect beta1 in [0,1)."
)
if not isinstance(beta2, Variable): if not isinstance(beta2, Variable):
if not 0 <= beta2 < 1: if not 0 <= beta2 < 1:
raise ValueError( raise ValueError(
"Invaild value of beta2, expect beta2 in [0,1).") "Invaild value of beta2, expect beta2 in [0,1)."
)
if not isinstance(epsilon, Variable): if not isinstance(epsilon, Variable):
if not 0 <= epsilon: if not 0 <= epsilon:
raise ValueError( raise ValueError(
"Invaild value of epsilon, expect epsilon >= 0.") "Invaild value of epsilon, expect epsilon >= 0."
super(Adam, self).__init__(learning_rate=learning_rate, )
parameters=parameters, super(Adam, self).__init__(
weight_decay=weight_decay, learning_rate=learning_rate,
grad_clip=grad_clip, parameters=parameters,
name=name) weight_decay=weight_decay,
grad_clip=grad_clip,
name=name,
)
self.type = "adam" self.type = "adam"
self._beta1 = beta1 self._beta1 = beta1
self._beta2 = beta2 self._beta2 = beta2
...@@ -212,21 +219,13 @@ class Adam(Optimizer): ...@@ -212,21 +219,13 @@ class Adam(Optimizer):
self._use_multi_tensor = use_multi_tensor self._use_multi_tensor = use_multi_tensor
if self._use_multi_tensor: if self._use_multi_tensor:
self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} self._param_dict = self._create_multi_tensor_dict()
self._moment1_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} self._moment1_dict = self._create_multi_tensor_dict()
self._moment2_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} self._moment2_dict = self._create_multi_tensor_dict()
self._beta1_pow_acc_dict = { self._beta1_pow_acc_dict = self._create_multi_tensor_dict()
'FP32_LODTensor': [], self._beta2_pow_acc_dict = self._create_multi_tensor_dict()
'FP16_LODTensor': [] self._master_weight_dict = self._create_multi_tensor_dict()
} self._master_weight_dict['FP32_LODTensor'] = None
self._beta2_pow_acc_dict = {
'FP32_LODTensor': [],
'FP16_LODTensor': []
}
self._master_weight_dict = {
'FP32_LODTensor': None,
'FP16_LODTensor': []
}
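self._create_multi_tensor_dict() comes from the base Optimizer and is not part of this diff. The indexing used later (for example self._param_dict['FP32_LODTensor'][param_group_idx].append(...)) implies it returns, per dtype key, a container of per-group lists; FP32 parameters never need a master copy, which is why the 'FP32_LODTensor' slot of _master_weight_dict is reset to None as a whole. A plausible sketch of the shape, stated purely as an assumption:

from collections import defaultdict

def _create_multi_tensor_dict():
    # Assumed layout: dtype key -> {param_group_idx: [tensors...]}.
    return {
        'FP32_LODTensor': defaultdict(list),
        'FP16_LODTensor': defaultdict(list),
    }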
def _create_master_weight(self, param): def _create_master_weight(self, param):
if param.name in self._master_weights: if param.name in self._master_weights:
...@@ -236,19 +235,23 @@ class Adam(Optimizer): ...@@ -236,19 +235,23 @@ class Adam(Optimizer):
var_name = param.name + "_fp32_master" var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name) var_name = unique_name.generate(var_name)
var = layers.create_global_var(name=var_name, var = layers.create_global_var(
shape=param.shape, name=var_name,
value=0, shape=param.shape,
dtype='float32', value=0,
persistable=True) dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block() block = self.helper.startup_program.global_block()
block.append_op(type="cast", block.append_op(
inputs={"X": [param]}, type="cast",
outputs={"Out": [var]}, inputs={"X": [param]},
attrs={ outputs={"Out": [var]},
"in_dtype": param.dtype, attrs={
"out_dtype": core.VarDesc.VarType.FP32 "in_dtype": param.dtype,
}) "out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var self._master_weights[param.name] = var
return var return var
...@@ -262,20 +265,30 @@ class Adam(Optimizer): ...@@ -262,20 +265,30 @@ class Adam(Optimizer):
""" """
if self._name is not None: if self._name is not None:
name = self._name + "_" + name name = self._name + "_" + name
find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 find_master = (
target_param = self._master_weights[ self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
param.name] if find_master else param )
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name target_name = target_param.name
if (name not in self._accumulators if (
or target_name not in self._accumulators[name]): name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception( raise Exception(
"Accumulator {} does not exist for parameter {}".format( "Accumulator {} does not exist for parameter {}".format(
name, target_name)) name, target_name
)
)
return self._accumulators[name][target_name] return self._accumulators[name][target_name]
def _add_moments_pows(self, p): def _add_moments_pows(self, p):
acc_dtype = p.dtype acc_dtype = p.dtype
if acc_dtype == core.VarDesc.VarType.FP16 or acc_dtype == core.VarDesc.VarType.BF16: if (
acc_dtype == core.VarDesc.VarType.FP16
or acc_dtype == core.VarDesc.VarType.BF16
):
acc_dtype = core.VarDesc.VarType.FP32 acc_dtype = core.VarDesc.VarType.FP32
self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype) self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype)
self._add_accumulator(self._moment2_acc_str, p, dtype=acc_dtype) self._add_accumulator(self._moment2_acc_str, p, dtype=acc_dtype)
...@@ -283,18 +296,24 @@ class Adam(Optimizer): ...@@ -283,18 +296,24 @@ class Adam(Optimizer):
name=self._beta1_pow_acc_str, name=self._beta1_pow_acc_str,
param=p, param=p,
dtype=acc_dtype, dtype=acc_dtype,
fill_value=0.9 if isinstance(self._beta1, Variable) \ fill_value=0.9
else self._beta1, if isinstance(self._beta1, Variable)
else self._beta1,
shape=[1], shape=[1],
type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') type=core.VarDesc.VarType.LOD_TENSOR,
device='cpu',
)
self._add_accumulator( self._add_accumulator(
name=self._beta2_pow_acc_str, name=self._beta2_pow_acc_str,
param=p, param=p,
dtype=acc_dtype, dtype=acc_dtype,
fill_value=0.999 if isinstance(self._beta2, Variable) \ fill_value=0.999
else self._beta2, if isinstance(self._beta2, Variable)
else self._beta2,
shape=[1], shape=[1],
type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') type=core.VarDesc.VarType.LOD_TENSOR,
device='cpu',
)
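The accumulators added above (moment1, moment2 and the beta-power scalars) feed the standard Adam update. A compact numpy sketch of one step, for reference:

import numpy as np

def adam_step_numpy(param, grad, lr, m1, m2, beta1_pow, beta2_pow,
                    beta1=0.9, beta2=0.999, epsilon=1e-8):
    m1 = beta1 * m1 + (1 - beta1) * grad
    m2 = beta2 * m2 + (1 - beta2) * np.square(grad)
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)   # bias correction
    param = param - lr_t * m1 / (np.sqrt(m2) + epsilon)
    # the beta-power accumulators advance after the parameter update
    return param, m1, m2, beta1_pow * beta1, beta2_pow * beta2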
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
...@@ -307,7 +326,10 @@ class Adam(Optimizer): ...@@ -307,7 +326,10 @@ class Adam(Optimizer):
master_p = self._create_master_weight(p) master_p = self._create_master_weight(p)
self._add_moments_pows(master_p) self._add_moments_pows(master_p)
continue continue
if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision: if (
p.dtype == core.VarDesc.VarType.FP16
and not self._multi_precision
):
warnings.warn( warnings.warn(
"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Adam optimizer." "Consider using multi_precision=True option of the Adam optimizer."
...@@ -319,50 +341,105 @@ class Adam(Optimizer): ...@@ -319,50 +341,105 @@ class Adam(Optimizer):
if isinstance(param_and_grad, dict): if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad) param_and_grad = self._update_param_group(param_and_grad)
moment1 = self._get_accumulator(self._moment1_acc_str, moment1 = self._get_accumulator(
param_and_grad[0]) self._moment1_acc_str, param_and_grad[0]
moment2 = self._get_accumulator(self._moment2_acc_str, )
param_and_grad[0]) moment2 = self._get_accumulator(
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, self._moment2_acc_str, param_and_grad[0]
param_and_grad[0]) )
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, beta1_pow_acc = self._get_accumulator(
param_and_grad[0]) self._beta1_pow_acc_str, param_and_grad[0]
find_master = self._multi_precision and param_and_grad[ )
0].dtype == core.VarDesc.VarType.FP16 beta2_pow_acc = self._get_accumulator(
master_weight = (self._master_weights[param_and_grad[0].name] self._beta2_pow_acc_str, param_and_grad[0]
if find_master else None) )
find_master = (
self._multi_precision
and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
)
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
lr = self._create_param_lr(param_and_grad) lr = self._create_param_lr(param_and_grad)
# create the adam optimize op # create the adam optimize op
if framework.in_dygraph_mode(): if framework.in_dygraph_mode():
found_inf = self._get_auxiliary_var('found_inf') found_inf = self._get_auxiliary_var('found_inf')
_beta1 = self._beta1 if not isinstance( _beta1 = (
self._beta1, Variable) else self._beta1.numpy().item(0) self._beta1
_beta2 = self._beta2 if not isinstance( if not isinstance(self._beta1, Variable)
self._beta2, Variable) else self._beta2.numpy().item(0) else self._beta1.numpy().item(0)
)
_beta2 = (
self._beta2
if not isinstance(self._beta2, Variable)
else self._beta2.numpy().item(0)
)
_, _, _, _, _, _ = _C_ops.adam_( _, _, _, _, _, _ = _C_ops.adam_(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2, param_and_grad[0],
beta1_pow_acc, beta2_pow_acc, master_weight, found_inf, _beta1, param_and_grad[1],
_beta2, self._epsilon, self._lazy_mode, 1000, find_master, lr,
False) moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
found_inf,
_beta1,
_beta2,
self._epsilon,
self._lazy_mode,
1000,
find_master,
False,
)
return None return None
if framework._in_legacy_dygraph(): if framework._in_legacy_dygraph():
_beta1 = self._beta1 if not isinstance( _beta1 = (
self._beta1, Variable) else self._beta1.numpy().item(0) self._beta1
_beta2 = self._beta2 if not isinstance( if not isinstance(self._beta1, Variable)
self._beta2, Variable) else self._beta2.numpy().item(0) else self._beta1.numpy().item(0)
)
_beta2 = (
self._beta2
if not isinstance(self._beta2, Variable)
else self._beta2.numpy().item(0)
)
_, _, _, _, _, _ = _legacy_C_ops.adam( _, _, _, _, _, _ = _legacy_C_ops.adam(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2, param_and_grad[0],
beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0], param_and_grad[1],
moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, lr,
'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, moment1,
'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, moment2,
'beta2', _beta2, 'multi_precision', find_master) beta1_pow_acc,
beta2_pow_acc,
master_weight,
param_and_grad[0],
moment1,
moment2,
beta1_pow_acc,
beta2_pow_acc,
master_weight,
'epsilon',
self._epsilon,
'lazy_mode',
self._lazy_mode,
'min_row_size_to_use_multithread',
1000,
'beta1',
_beta1,
'beta2',
_beta2,
'multi_precision',
find_master,
)
return None return None
...@@ -373,7 +450,7 @@ class Adam(Optimizer): ...@@ -373,7 +450,7 @@ class Adam(Optimizer):
"Moment1": [moment1], "Moment1": [moment1],
"Moment2": [moment2], "Moment2": [moment2],
"Beta1Pow": [beta1_pow_acc], "Beta1Pow": [beta1_pow_acc],
"Beta2Pow": [beta2_pow_acc] "Beta2Pow": [beta2_pow_acc],
} }
outputs = { outputs = {
"ParamOut": [param_and_grad[0]], "ParamOut": [param_and_grad[0]],
...@@ -385,7 +462,7 @@ class Adam(Optimizer): ...@@ -385,7 +462,7 @@ class Adam(Optimizer):
attrs = { attrs = {
"lazy_mode": self._lazy_mode, "lazy_mode": self._lazy_mode,
"min_row_size_to_use_multithread": 1000, "min_row_size_to_use_multithread": 1000,
"multi_precision": find_master "multi_precision": find_master,
} }
if isinstance(self._beta1, Variable): if isinstance(self._beta1, Variable):
...@@ -405,11 +482,13 @@ class Adam(Optimizer): ...@@ -405,11 +482,13 @@ class Adam(Optimizer):
inputs["MasterParam"] = master_weight inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight outputs["MasterParamOut"] = master_weight
adam_op = block.append_op(type=self.type, adam_op = block.append_op(
inputs=inputs, type=self.type,
outputs=outputs, inputs=inputs,
attrs=attrs, outputs=outputs,
stop_gradient=True) attrs=attrs,
stop_gradient=True,
)
return adam_op return adam_op
...@@ -426,7 +505,7 @@ class Adam(Optimizer): ...@@ -426,7 +505,7 @@ class Adam(Optimizer):
.. code-block:: python .. code-block:: python
import paddle import paddle
a = paddle.rand([2,13], dtype="float32") a = paddle.rand([2,13], dtype="float32")
linear = paddle.nn.Linear(13, 5) linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph. # This can be any optimizer supported by dygraph.
...@@ -445,27 +524,34 @@ class Adam(Optimizer): ...@@ -445,27 +524,34 @@ class Adam(Optimizer):
if param._grad_ivar() is not None: if param._grad_ivar() is not None:
grad_var = param._grad_ivar() grad_var = param._grad_ivar()
if in_dygraph_mode(): if in_dygraph_mode():
if hasattr(grad_var, "is_selected_rows" if (
) and grad_var.is_selected_rows( hasattr(grad_var, "is_selected_rows")
) and self.regularization is not None: and grad_var.is_selected_rows()
and self.regularization is not None
):
raise RuntimeError( raise RuntimeError(
"Adam don't support weight_decay with sparse parameters, please set it to None." "Adam don't support weight_decay with sparse parameters, please set it to None."
) )
else: else:
if hasattr( if (
grad_var, "_is_sparse") and grad_var._is_sparse( hasattr(grad_var, "_is_sparse")
) and self.regularization is not None: and grad_var._is_sparse()
and self.regularization is not None
):
raise RuntimeError( raise RuntimeError(
"Adam don't support weight_decay with sparse parameters, please set it to None." "Adam don't support weight_decay with sparse parameters, please set it to None."
) )
params_grads.append((param, grad_var)) params_grads.append((param, grad_var))
optimize_ops = self._apply_optimize(loss=None, optimize_ops = self._apply_optimize(
startup_program=None, loss=None,
params_grads=params_grads) startup_program=None,
params_grads=params_grads,
param_group_idx=0,
)
else: else:
# optimize parameters in groups # optimize parameters in groups
for param_group in self._param_groups: for idx, param_group in enumerate(self._param_groups):
params_grads = defaultdict(lambda: list()) params_grads = defaultdict(lambda: list())
for param in param_group['params']: for param in param_group['params']:
if param.stop_gradient: if param.stop_gradient:
...@@ -474,13 +560,16 @@ class Adam(Optimizer): ...@@ -474,13 +560,16 @@ class Adam(Optimizer):
grad_var = param._grad_ivar() grad_var = param._grad_ivar()
params_grads['params'].append((param, grad_var)) params_grads['params'].append((param, grad_var))
params_grads.update( params_grads.update(
{k: v {k: v for k, v in param_group.items() if k != 'params'}
for k, v in param_group.items() if k != 'params'}) )
self._apply_optimize(loss=None, self._apply_optimize(
startup_program=None, loss=None,
params_grads=params_grads) startup_program=None,
params_grads=params_grads,
param_group_idx=idx,
)
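The essential change in step() is that _apply_optimize now receives param_group_idx, so the multi-tensor bookkeeping is filled per group instead of every group appending into one shared list. Conceptually the difference is:

# Before (conceptually): all groups shared one flat bucket per dtype.
param_dict_old = {'FP32_LODTensor': [], 'FP16_LODTensor': []}

# After (conceptually): one bucket per (dtype, parameter-group index).
param_dict_new = {'FP32_LODTensor': {0: [], 1: []},
                  'FP16_LODTensor': {0: [], 1: []}}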
def _multi_tensor_init(self, target_block, parameters): def _multi_tensor_init(self, target_block, parameters, param_group_idx):
""" """
All tensors used in the optimizer calculation (such as the parameters, master_weight, and the per-parameter accumulators) are grouped into python lists by data type (float16, float32). All tensors used in the optimizer calculation (such as the parameters, master_weight, and the per-parameter accumulators) are grouped into python lists by data type (float16, float32).
This function will be overridden in the corresponding optimizer file. This function will be overridden in the corresponding optimizer file.
...@@ -492,26 +581,49 @@ class Adam(Optimizer): ...@@ -492,26 +581,49 @@ class Adam(Optimizer):
for param in parameters: for param in parameters:
moment1 = self._get_accumulator(self._moment1_acc_str, param) moment1 = self._get_accumulator(self._moment1_acc_str, param)
moment2 = self._get_accumulator(self._moment2_acc_str, param) moment2 = self._get_accumulator(self._moment2_acc_str, param)
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, beta1_pow_acc = self._get_accumulator(
param) self._beta1_pow_acc_str, param
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, )
param) beta2_pow_acc = self._get_accumulator(
self._beta2_pow_acc_str, param
)
if param.dtype == paddle.float32: if param.dtype == paddle.float32:
self._param_dict['FP32_LODTensor'].append(param) self._param_dict['FP32_LODTensor'][param_group_idx].append(
self._moment1_dict['FP32_LODTensor'].append(moment1) param
self._moment2_dict['FP32_LODTensor'].append(moment2) )
self._beta1_pow_acc_dict['FP32_LODTensor'].append(beta1_pow_acc) self._moment1_dict['FP32_LODTensor'][param_group_idx].append(
self._beta2_pow_acc_dict['FP32_LODTensor'].append(beta2_pow_acc) moment1
)
self._moment2_dict['FP32_LODTensor'][param_group_idx].append(
moment2
)
self._beta1_pow_acc_dict['FP32_LODTensor'][
param_group_idx
].append(beta1_pow_acc)
self._beta2_pow_acc_dict['FP32_LODTensor'][
param_group_idx
].append(beta2_pow_acc)
elif param.dtype == paddle.float16: elif param.dtype == paddle.float16:
self._param_dict['FP16_LODTensor'].append(param) self._param_dict['FP16_LODTensor'][param_group_idx].append(
self._moment1_dict['FP16_LODTensor'].append(moment1) param
self._moment2_dict['FP16_LODTensor'].append(moment2) )
self._beta1_pow_acc_dict['FP16_LODTensor'].append(beta1_pow_acc) self._moment1_dict['FP16_LODTensor'][param_group_idx].append(
self._beta2_pow_acc_dict['FP16_LODTensor'].append(beta2_pow_acc) moment1
)
self._moment2_dict['FP16_LODTensor'][param_group_idx].append(
moment2
)
self._beta1_pow_acc_dict['FP16_LODTensor'][
param_group_idx
].append(beta1_pow_acc)
self._beta2_pow_acc_dict['FP16_LODTensor'][
param_group_idx
].append(beta2_pow_acc)
if self._multi_precision: if self._multi_precision:
self._master_weight_dict['FP16_LODTensor'].append( self._master_weight_dict['FP16_LODTensor'][
self._master_weights[param.name]) param_group_idx
].append(self._master_weights[param.name])
else: else:
self._master_weight_dict['FP16_LODTensor'] = None self._master_weight_dict['FP16_LODTensor'] = None
else: else:
...@@ -519,9 +631,13 @@ class Adam(Optimizer): ...@@ -519,9 +631,13 @@ class Adam(Optimizer):
"Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR." "Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR."
) )
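The per-group lists built here are consumed by _append_optimize_multi_tensor_op below, which hands them to merged_adam. Conceptually, that fused op applies the ordinary Adam step to every aligned tuple in the lists in a single call; a reference loop (with step_fn standing in for a single-tensor Adam step such as the numpy sketch earlier) would look like:

def merged_adam_reference(params, grads, lrs, m1s, m2s, b1pows, b2pows, step_fn):
    # One dtype/group bucket at a time; the real op fuses this loop on device.
    for i in range(len(params)):
        params[i], m1s[i], m2s[i], b1pows[i], b2pows[i] = step_fn(
            params[i], grads[i], lrs[i], m1s[i], m2s[i], b1pows[i], b2pows[i]
        )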
def _append_optimize_multi_tensor_op(self, target_block, def _append_optimize_multi_tensor_op(
parameters_and_grads): self,
""" target_block,
parameters_and_grads,
param_group_idx,
):
"""
For Multi Tensor, append optimize merged_operator to block. For Multi Tensor, append optimize merged_operator to block.
""" """
assert isinstance(target_block, framework.Block) assert isinstance(target_block, framework.Block)
...@@ -534,15 +650,19 @@ class Adam(Optimizer): ...@@ -534,15 +650,19 @@ class Adam(Optimizer):
if param_and_grad[1] is None: if param_and_grad[1] is None:
continue continue
if param_and_grad[0].stop_gradient is False: if param_and_grad[0].stop_gradient is False:
if param_and_grad[ if (
0].dtype == paddle.float32 and param_and_grad[ param_and_grad[0].dtype == paddle.float32
1].type == core.VarDesc.VarType.LOD_TENSOR: and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP32_LODTensor'].append(param_and_grad[1]) grad_dict['FP32_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad) lr = self._create_param_lr(param_and_grad)
lr_dict['FP32_LODTensor'].append(lr) lr_dict['FP32_LODTensor'].append(lr)
elif param_and_grad[ elif (
0].dtype == paddle.float16 and param_and_grad[ param_and_grad[0].dtype == paddle.float16
1].type == core.VarDesc.VarType.LOD_TENSOR: and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP16_LODTensor'].append(param_and_grad[1]) grad_dict['FP16_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad) lr = self._create_param_lr(param_and_grad)
lr_dict['FP16_LODTensor'].append(lr) lr_dict['FP16_LODTensor'].append(lr)
...@@ -553,97 +673,149 @@ class Adam(Optimizer): ...@@ -553,97 +673,149 @@ class Adam(Optimizer):
if param_and_grad[0].stop_gradient is False: if param_and_grad[0].stop_gradient is False:
param_grad_dict = dict() param_grad_dict = dict()
param_grad_dict['params'] = param_and_grad param_grad_dict['params'] = param_and_grad
param_grad_dict.update({ param_grad_dict.update(
k: v {
for k, v in parameters_and_grads.items() k: v
if k != 'params' for k, v in parameters_and_grads.items()
}) if k != 'params'
}
)
param_and_grad = self._update_param_group(param_grad_dict) param_and_grad = self._update_param_group(param_grad_dict)
if param_and_grad[ if (
0].dtype == paddle.float32 and param_and_grad[ param_and_grad[0].dtype == paddle.float32
1].type == core.VarDesc.VarType.LOD_TENSOR: and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP32_LODTensor'].append(param_and_grad[1]) grad_dict['FP32_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad) lr = self._create_param_lr(param_and_grad)
lr_dict['FP32_LODTensor'].append(lr) lr_dict['FP32_LODTensor'].append(lr)
elif param_and_grad[ elif (
0].dtype == paddle.float16 and param_and_grad[ param_and_grad[0].dtype == paddle.float16
1].type == core.VarDesc.VarType.LOD_TENSOR: and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP16_LODTensor'].append(param_and_grad[1]) grad_dict['FP16_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad) lr = self._create_param_lr(param_and_grad)
lr_dict['FP16_LODTensor'].append(lr) lr_dict['FP16_LODTensor'].append(lr)
multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor'] multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor']
for key in multi_tensor_list: for key in multi_tensor_list:
if len(self._param_dict[key]) > 0: if len(self._param_dict[key][param_group_idx]) > 0:
find_master = self._multi_precision and key == 'FP16_LODTensor' find_master = self._multi_precision and key == 'FP16_LODTensor'
_beta1 = self._beta1 if not isinstance( _beta1 = (
self._beta1, Variable) else self._beta1.numpy().item(0) self._beta1
_beta2 = self._beta2 if not isinstance( if not isinstance(self._beta1, Variable)
self._beta2, Variable) else self._beta2.numpy().item(0) else self._beta1.numpy().item(0)
)
_beta2 = (
self._beta2
if not isinstance(self._beta2, Variable)
else self._beta2.numpy().item(0)
)
if framework._non_static_mode(): if framework._non_static_mode():
master_weight = self._master_weight_dict[key]
master_weight = (
master_weight[param_group_idx]
if master_weight is not None
else None
)
if in_dygraph_mode(): if in_dygraph_mode():
_, _, _, _, _, _ = _C_ops.merged_adam_( _, _, _, _, _, _ = _C_ops.merged_adam_(
self._param_dict[key], grad_dict[key], lr_dict[key], self._param_dict[key][param_group_idx],
self._moment1_dict[key], self._moment2_dict[key], grad_dict[key],
self._beta1_pow_acc_dict[key], lr_dict[key],
self._beta2_pow_acc_dict[key], self._moment1_dict[key][param_group_idx],
self._master_weight_dict[key], _beta1, _beta2, self._moment2_dict[key][param_group_idx],
self._epsilon, find_master, False) self._beta1_pow_acc_dict[key][param_group_idx],
self._beta2_pow_acc_dict[key][param_group_idx],
master_weight,
_beta1,
_beta2,
self._epsilon,
find_master,
False,
)
else: else:
_, _, _, _, _, _ = _legacy_C_ops.merged_adam( _, _, _, _, _, _ = _legacy_C_ops.merged_adam(
self._param_dict[key], grad_dict[key], lr_dict[key], self._param_dict[key][param_group_idx],
self._moment1_dict[key], self._moment2_dict[key], grad_dict[key],
self._beta1_pow_acc_dict[key], lr_dict[key],
self._beta2_pow_acc_dict[key], self._moment1_dict[key][param_group_idx],
self._master_weight_dict[key], self._moment2_dict[key][param_group_idx],
self._param_dict[key], self._moment1_dict[key], self._beta1_pow_acc_dict[key][param_group_idx],
self._moment2_dict[key], self._beta2_pow_acc_dict[key][param_group_idx],
self._beta1_pow_acc_dict[key], master_weight,
self._beta2_pow_acc_dict[key], self._param_dict[key][param_group_idx],
self._master_weight_dict[key], 'epsilon', self._moment1_dict[key][param_group_idx],
self._epsilon, 'beta1', _beta1, 'beta2', _beta2, self._moment2_dict[key][param_group_idx],
'multi_precision', find_master) self._beta1_pow_acc_dict[key][param_group_idx],
self._beta2_pow_acc_dict[key][param_group_idx],
master_weight,
'epsilon',
self._epsilon,
'beta1',
_beta1,
'beta2',
_beta2,
'multi_precision',
find_master,
)
else: else:
inputs = { inputs = {
"Param": self._param_dict[key], "Param": self._param_dict[key][param_group_idx],
"Grad": grad_dict[key], "Grad": grad_dict[key],
"LearningRate": lr_dict[key], "LearningRate": lr_dict[key],
"Moment1": self._moment1_dict[key], "Moment1": self._moment1_dict[key][param_group_idx],
"Moment2": self._moment2_dict[key], "Moment2": self._moment2_dict[key][param_group_idx],
"Beta1Pow": self._beta1_pow_acc_dict[key], "Beta1Pow": self._beta1_pow_acc_dict[key][
"Beta2Pow": self._beta2_pow_acc_dict[key] param_group_idx
],
"Beta2Pow": self._beta2_pow_acc_dict[key][
param_group_idx
],
} }
outputs = { outputs = {
"ParamOut": self._param_dict[key], "ParamOut": self._param_dict[key][param_group_idx],
"Moment1Out": self._moment1_dict[key], "Moment1Out": self._moment1_dict[key][param_group_idx],
"Moment2Out": self._moment2_dict[key], "Moment2Out": self._moment2_dict[key][param_group_idx],
"Beta1PowOut": self._beta1_pow_acc_dict[key], "Beta1PowOut": self._beta1_pow_acc_dict[key][
"Beta2PowOut": self._beta2_pow_acc_dict[key] param_group_idx
],
"Beta2PowOut": self._beta2_pow_acc_dict[key][
param_group_idx
],
} }
attrs = { attrs = {
"epsilon": self._epsilon, "epsilon": self._epsilon,
"beta1": _beta1, "beta1": _beta1,
"beta2": _beta2 "beta2": _beta2,
} }
if find_master: if find_master:
inputs["MasterParam"] = self._master_weight_dict[key] inputs["MasterParam"] = self._master_weight_dict[key][
param_group_idx
]
outputs["MasterParamOut"] = self._master_weight_dict[ outputs["MasterParamOut"] = self._master_weight_dict[
key] key
][param_group_idx]
attrs["multi_precision"] = find_master attrs["multi_precision"] = find_master
target_block.append_op(type="merged_adam", target_block.append_op(
inputs=inputs, type="merged_adam",
outputs=outputs, inputs=inputs,
attrs=attrs, outputs=outputs,
stop_gradient=True) attrs=attrs,
stop_gradient=True,
)
return None return None
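With the per-group indexing above, the list-of-dict case now goes through the multi-tensor path. A minimal dygraph sketch of that configuration (layer sizes are arbitrary):

import paddle

paddle.disable_static()
linear1 = paddle.nn.Linear(13, 5)
linear2 = paddle.nn.Linear(5, 3)
opt = paddle.optimizer.Adam(
    learning_rate=0.01,
    parameters=[{'params': linear1.parameters()},
                {'params': linear2.parameters(), 'learning_rate': 0.1}],
    use_multi_tensor=True,   # exercises merged_adam with per-group buckets
)
x = paddle.randn([2, 13])
loss = paddle.mean(linear2(linear1(x)))
loss.backward()
opt.step()
opt.clear_grad()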
def _update_param_group(self, parameters): def _update_param_group(self, parameters):
self._beta1 = parameters.get('beta1', self._default_dict['beta1']) self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
self._beta2 = parameters.get('beta2', self._default_dict['beta2']) self._beta2 = parameters.get('beta2', self._default_dict['beta2'])
self._epsilon = parameters.get('epsilon', self._default_dict['epsilon']) self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
self._lazy_mode = parameters.get('lazy_mode', self._lazy_mode = parameters.get(
self._default_dict['lazy_mode']) 'lazy_mode', self._default_dict['lazy_mode']
)
parameters = parameters.get('params') parameters = parameters.get('params')
return parameters return parameters
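_update_param_group swaps in the hyperparameters stored on the current group (beta1, beta2, epsilon, lazy_mode) before ops are emitted and then returns only the 'params' entry to its caller. A small illustration of the dict it receives in the per-group path (placeholder values):

group = {
    'params': ('param_tensor', 'grad_tensor'),   # the pair being processed
    'beta1': 0.8,                                # overrides self._beta1 for this group
    'lazy_mode': False,
}
# After _update_param_group(group), self._beta1 == 0.8 for this group and the
# caller continues with group['params'].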
...@@ -123,29 +123,35 @@ class Momentum(Optimizer): ...@@ -123,29 +123,35 @@ class Momentum(Optimizer):
""" """
_velocity_acc_str = "velocity" _velocity_acc_str = "velocity"
def __init__(self, def __init__(
learning_rate=0.001, self,
momentum=0.9, learning_rate=0.001,
parameters=None, momentum=0.9,
use_nesterov=False, parameters=None,
weight_decay=None, use_nesterov=False,
grad_clip=None, weight_decay=None,
multi_precision=False, grad_clip=None,
rescale_grad=1.0, multi_precision=False,
use_multi_tensor=False, rescale_grad=1.0,
name=None): use_multi_tensor=False,
name=None,
):
if learning_rate is None: if learning_rate is None:
raise ValueError("learning_rate is not set") raise ValueError("learning_rate is not set")
if momentum is None: if momentum is None:
raise ValueError("momentum is not set") raise ValueError("momentum is not set")
predicate = lambda regular: isinstance(regular, predicate = lambda regular: isinstance(
(L2DecayRegularizer, float)) regular, (L2DecayRegularizer, float)
)
if isinstance(parameters, list): if isinstance(parameters, list):
if isinstance(parameters[0], dict): if isinstance(parameters[0], dict):
for param_group in parameters: for param_group in parameters:
decay = param_group[ decay = (
'weight_decay'] if 'weight_decay' in param_group else weight_decay param_group['weight_decay']
if 'weight_decay' in param_group
else weight_decay
)
reg_method, reg_coeff = self._update_regularization(decay) reg_method, reg_coeff = self._update_regularization(decay)
param_group['regularization_method'] = reg_method param_group['regularization_method'] = reg_method
param_group['regularization_coeff'] = reg_coeff param_group['regularization_coeff'] = reg_coeff
...@@ -153,16 +159,20 @@ class Momentum(Optimizer): ...@@ -153,16 +159,20 @@ class Momentum(Optimizer):
param_group['weight_decay'] = py_regular param_group['weight_decay'] = py_regular
py_regular = None if predicate(weight_decay) else weight_decay py_regular = None if predicate(weight_decay) else weight_decay
super(Momentum, self).__init__(learning_rate=learning_rate, super(Momentum, self).__init__(
parameters=parameters, learning_rate=learning_rate,
weight_decay=py_regular, parameters=parameters,
grad_clip=grad_clip, weight_decay=py_regular,
name=name) grad_clip=grad_clip,
name=name,
)
self.type = "momentum" self.type = "momentum"
self._momentum = momentum self._momentum = momentum
self._use_nesterov = bool(use_nesterov) self._use_nesterov = bool(use_nesterov)
self._regularization_method, self._regularization_coeff = self._update_regularization( (
weight_decay) self._regularization_method,
self._regularization_coeff,
) = self._update_regularization(weight_decay)
self._multi_precision = multi_precision self._multi_precision = multi_precision
self._rescale_grad = rescale_grad self._rescale_grad = rescale_grad
self._master_weights = {} self._master_weights = {}
...@@ -176,29 +186,21 @@ class Momentum(Optimizer): ...@@ -176,29 +186,21 @@ class Momentum(Optimizer):
} }
self._use_multi_tensor = use_multi_tensor self._use_multi_tensor = use_multi_tensor
if self._use_multi_tensor: if self._use_multi_tensor:
self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} self._param_dict = self._create_multi_tensor_dict()
self._velocity_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} self._velocity_dict = self._create_multi_tensor_dict()
self._master_weight_dict = { self._master_weight_dict = self._create_multi_tensor_dict()
'FP32_LODTensor': None, self._master_weight_dict['FP32_LODTensor'] = None
'FP16_LODTensor': [] self._regularization_method_dict = self._create_multi_tensor_dict()
} self._regularization_coeff_dict = self._create_multi_tensor_dict()
self._regularization_method_dict = {
'FP32_LODTensor': [],
'FP16_LODTensor': []
}
self._regularization_coeff_dict = {
'FP32_LODTensor': [],
'FP16_LODTensor': []
}
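Momentum mirrors the Adam change: its multi-tensor containers, including the per-group regularization method and coefficient, are now created through _create_multi_tensor_dict() and indexed by parameter group. The previously problematic combination therefore looks like this (a minimal dygraph sketch):

import paddle

paddle.disable_static()
model = paddle.nn.Sequential(paddle.nn.Linear(5, 5), paddle.nn.Linear(5, 5))
params = list(model.parameters())
opt = paddle.optimizer.Momentum(
    momentum=0.9,
    parameters=[{'params': params[:2], 'weight_decay': 0.001},
                {'params': params[2:], 'learning_rate': 0.1}],
    use_multi_tensor=True,
)
out = model(paddle.randn([4, 5]))
out.mean().backward()
opt.step()
opt.clear_grad()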
def _update_regularization(self, weight_decay): def _update_regularization(self, weight_decay):
reg_method = "" reg_method = ""
reg_coeff = 0.0 reg_coeff = 0.0
if (isinstance(weight_decay, L2DecayRegularizer)): if isinstance(weight_decay, L2DecayRegularizer):
reg_method = "l2_decay" reg_method = "l2_decay"
reg_coeff = weight_decay._regularization_coeff reg_coeff = weight_decay._regularization_coeff
if (isinstance(weight_decay, float)): if isinstance(weight_decay, float):
reg_method = "l2_decay" reg_method = "l2_decay"
reg_coeff = weight_decay reg_coeff = weight_decay
return reg_method, reg_coeff return reg_method, reg_coeff
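_update_regularization normalizes the accepted weight_decay forms into the (regularization_method, regularization_coeff) pair the fused momentum op expects:

# Mapping performed above (method, coeff):
#   paddle.regularizer.L2Decay(0.1)    -> ("l2_decay", 0.1)
#   0.01 (a bare float coefficient)    -> ("l2_decay", 0.01)
#   None, or a non-L2 regularizer      -> ("", 0.0), i.e. no fused decay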
...@@ -211,19 +213,23 @@ class Momentum(Optimizer): ...@@ -211,19 +213,23 @@ class Momentum(Optimizer):
var_name = param.name + "_fp32_master" var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name) var_name = unique_name.generate(var_name)
var = layers.create_global_var(name=var_name, var = layers.create_global_var(
shape=param.shape, name=var_name,
value=0, shape=param.shape,
dtype='float32', value=0,
persistable=True) dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block() block = self.helper.startup_program.global_block()
block.append_op(type="cast", block.append_op(
inputs={"X": [param]}, type="cast",
outputs={"Out": [var]}, inputs={"X": [param]},
attrs={ outputs={"Out": [var]},
"in_dtype": param.dtype, attrs={
"out_dtype": core.VarDesc.VarType.FP32 "in_dtype": param.dtype,
}) "out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var self._master_weights[param.name] = var
return var return var
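To make the intent of the master weight concrete, here is a framework-free NumPy sketch (variable names are illustrative): the optimizer keeps an fp32 shadow copy of each fp16 parameter, applies updates to the shadow, and casts back, so repeated small updates are not lost to fp16 rounding.

    import numpy as np

    param_fp16 = np.random.uniform(-1, 1, (4,)).astype(np.float16)
    # fp32 master copy, which _create_master_weight builds via a cast op.
    master = param_fp16.astype(np.float32)

    lr, grad = 1e-4, np.ones(4, dtype=np.float32)
    for _ in range(100):
        master -= lr * grad                     # update happens in fp32
        param_fp16 = master.astype(np.float16)  # cast back for the fp16 forward pass

    print(param_fp16)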
...@@ -239,15 +245,22 @@ class Momentum(Optimizer): ...@@ -239,15 +245,22 @@ class Momentum(Optimizer):
""" """
if self._name is not None: if self._name is not None:
name = self._name + "_" + name name = self._name + "_" + name
find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 find_master = (
target_param = self._master_weights[ self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
param.name] if find_master else param )
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name target_name = target_param.name
if (name not in self._accumulators if (
or target_name not in self._accumulators[name]): name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception( raise Exception(
"Accumulator {} does not exist for parameter {}".format( "Accumulator {} does not exist for parameter {}".format(
name, target_name)) name, target_name
)
)
return self._accumulators[name][target_name] return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
...@@ -265,7 +278,10 @@ class Momentum(Optimizer): ...@@ -265,7 +278,10 @@ class Momentum(Optimizer):
master_p = self._create_master_weight(p) master_p = self._create_master_weight(p)
self._add_accumulator(self._velocity_acc_str, master_p) self._add_accumulator(self._velocity_acc_str, master_p)
continue continue
if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision: if (
p.dtype == core.VarDesc.VarType.FP16
and not self._multi_precision
):
warnings.warn( warnings.warn(
"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Momentum optimizer." "Consider using multi_precision=True option of the Momentum optimizer."
...@@ -273,25 +289,28 @@ class Momentum(Optimizer): ...@@ -273,25 +289,28 @@ class Momentum(Optimizer):
self._add_accumulator(self._velocity_acc_str, p) self._add_accumulator(self._velocity_acc_str, p)
def _create_regularization_of_grad(self, param, grad, regularization=None): def _create_regularization_of_grad(self, param, grad, regularization=None):
""" Create and add backward regularization Operators """Create and add backward regularization Operators
Function helper of append_regularization_ops. Function helper of append_regularization_ops.
""" """
# If ParamAttr is set to L2Decay, we skip doing regularization here. And then we fused # If ParamAttr is set to L2Decay, we skip doing regularization here. And then we fused
# L2Decay with momentum which can refer to _append_optimize_op below. # L2Decay with momentum which can refer to _append_optimize_op below.
if hasattr(param, 'regularizer') and isinstance(param.regularizer, if hasattr(param, 'regularizer') and isinstance(
L2DecayRegularizer): param.regularizer, L2DecayRegularizer
):
return grad return grad
return super(Momentum, self)._create_regularization_of_grad( return super(Momentum, self)._create_regularization_of_grad(
param, grad, regularization) param, grad, regularization
)
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
if isinstance(param_and_grad, dict): if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad) param_and_grad = self._update_param_group(param_and_grad)
velocity_acc = self._get_accumulator(self._velocity_acc_str, velocity_acc = self._get_accumulator(
param_and_grad[0]) self._velocity_acc_str, param_and_grad[0]
)
lr = self._create_param_lr(param_and_grad) lr = self._create_param_lr(param_and_grad)
# For fusion of momentum and l2decay # For fusion of momentum and l2decay
...@@ -308,30 +327,56 @@ class Momentum(Optimizer): ...@@ -308,30 +327,56 @@ class Momentum(Optimizer):
regularization_method = "" regularization_method = ""
regularization_coeff = 0.0 regularization_coeff = 0.0
find_master = self._multi_precision and param_and_grad[ find_master = (
0].dtype == core.VarDesc.VarType.FP16 self._multi_precision
master_weight = (self._master_weights[param_and_grad[0].name] and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
if find_master else None) )
master_weight = (
self._master_weights[param_and_grad[0].name]
if find_master
else None
)
if _in_legacy_dygraph(): if _in_legacy_dygraph():
if isinstance(param_and_grad, dict): if isinstance(param_and_grad, dict):
self._update_regularization(param_and_grad['weight_decay']) self._update_regularization(param_and_grad['weight_decay'])
_, _, _ = _legacy_C_ops.momentum( _, _, _ = _legacy_C_ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr, param_and_grad[0],
master_weight, param_and_grad[0], velocity_acc, master_weight, param_and_grad[1],
'mu', self._momentum, 'use_nesterov', self._use_nesterov, velocity_acc,
'regularization_method', regularization_method, lr,
'regularization_coeff', regularization_coeff, 'multi_precision', master_weight,
find_master) param_and_grad[0],
velocity_acc,
master_weight,
'mu',
self._momentum,
'use_nesterov',
self._use_nesterov,
'regularization_method',
regularization_method,
'regularization_coeff',
regularization_coeff,
'multi_precision',
find_master,
)
return None return None
if in_dygraph_mode(): if in_dygraph_mode():
if isinstance(param_and_grad, dict): if isinstance(param_and_grad, dict):
self._update_regularization(param_and_grad['weight_decay']) self._update_regularization(param_and_grad['weight_decay'])
return _C_ops.momentum_(param_and_grad[0], param_and_grad[1], return _C_ops.momentum_(
velocity_acc, lr, master_weight, param_and_grad[0],
self._momentum, self._use_nesterov, param_and_grad[1],
regularization_method, regularization_coeff, velocity_acc,
find_master, self._rescale_grad) lr,
master_weight,
self._momentum,
self._use_nesterov,
regularization_method,
regularization_coeff,
find_master,
self._rescale_grad,
)
attrs = { attrs = {
"mu": self._momentum, "mu": self._momentum,
...@@ -339,19 +384,19 @@ class Momentum(Optimizer): ...@@ -339,19 +384,19 @@ class Momentum(Optimizer):
"regularization_method": regularization_method, "regularization_method": regularization_method,
"regularization_coeff": regularization_coeff, "regularization_coeff": regularization_coeff,
"multi_precision": find_master, "multi_precision": find_master,
"rescale_grad": self._rescale_grad "rescale_grad": self._rescale_grad,
} }
inputs = { inputs = {
"Param": [param_and_grad[0]], "Param": [param_and_grad[0]],
"Grad": [param_and_grad[1]], "Grad": [param_and_grad[1]],
"Velocity": [velocity_acc], "Velocity": [velocity_acc],
"LearningRate": [lr] "LearningRate": [lr],
} }
outputs = { outputs = {
"ParamOut": [param_and_grad[0]], "ParamOut": [param_and_grad[0]],
"VelocityOut": [velocity_acc] "VelocityOut": [velocity_acc],
} }
if find_master: if find_master:
...@@ -359,15 +404,17 @@ class Momentum(Optimizer): ...@@ -359,15 +404,17 @@ class Momentum(Optimizer):
outputs["MasterParamOut"] = master_weight outputs["MasterParamOut"] = master_weight
# create the momentum optimize op # create the momentum optimize op
momentum_op = block.append_op(type=self.type, momentum_op = block.append_op(
inputs=inputs, type=self.type,
outputs=outputs, inputs=inputs,
attrs=attrs, outputs=outputs,
stop_gradient=True) attrs=attrs,
stop_gradient=True,
)
return momentum_op return momentum_op
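As a reading aid, the update the appended momentum op is documented to compute (with the fused L2 decay and the optional Nesterov correction) can be sketched in plain NumPy; this is only an illustration of the formula, not the actual kernel.

    import numpy as np

    def momentum_step(param, grad, velocity, lr, mu,
                      regularization_coeff=0.0, use_nesterov=False):
        # L2 decay is fused into the gradient here instead of being applied as
        # a separate regularization op (see _create_regularization_of_grad).
        grad = grad + regularization_coeff * param
        velocity = mu * velocity + grad
        if use_nesterov:
            param = param - lr * (grad + mu * velocity)
        else:
            param = param - lr * velocity
        return param, velocity

    p, g, v = np.zeros(3), np.ones(3), np.zeros(3)
    p, v = momentum_step(p, g, v, lr=0.1, mu=0.9, regularization_coeff=1e-4)
    print(p, v)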
def _multi_tensor_init(self, target_block, parameters): def _multi_tensor_init(self, target_block, parameters, param_group_idx):
""" """
All parameters used for optimizer calculations (such as the parameters, master_weight, and velocity_acc for momentum) are grouped into Python lists by data type (float16, float32). All parameters used for optimizer calculations (such as the parameters, master_weight, and velocity_acc for momentum) are grouped into Python lists by data type (float16, float32).
This function will be overridden in the corresponding optimizer file. This function will be overridden in the corresponding optimizer file.
...@@ -385,38 +432,59 @@ class Momentum(Optimizer): ...@@ -385,38 +432,59 @@ class Momentum(Optimizer):
# we skip param's l2decay before, so fuse it with momentum here. # we skip param's l2decay before, so fuse it with momentum here.
if isinstance(param.regularizer, L2DecayRegularizer): if isinstance(param.regularizer, L2DecayRegularizer):
regularization_method = "l2_decay" regularization_method = "l2_decay"
regularization_coeff = param.regularizer._regularization_coeff regularization_coeff = (
param.regularizer._regularization_coeff
)
elif param.regularizer is not None: elif param.regularizer is not None:
regularization_method = "" regularization_method = ""
regularization_coeff = 0.0 regularization_coeff = 0.0
if param.dtype == paddle.float32: if param.dtype == paddle.float32:
self._param_dict['FP32_LODTensor'].append(param) self._param_dict['FP32_LODTensor'][param_group_idx].append(
self._velocity_dict['FP32_LODTensor'].append(velocity_acc) param
)
self._velocity_dict['FP32_LODTensor'][param_group_idx].append(
velocity_acc
)
# fp32 no master weight # fp32 no master weight
self._regularization_method_dict['FP32_LODTensor'].append( self._regularization_method_dict['FP32_LODTensor'][
regularization_method) param_group_idx
self._regularization_coeff_dict['FP32_LODTensor'].append( ].append(regularization_method)
regularization_coeff) self._regularization_coeff_dict['FP32_LODTensor'][
param_group_idx
].append(regularization_coeff)
elif param.dtype == paddle.float16: elif param.dtype == paddle.float16:
self._param_dict['FP16_LODTensor'].append(param) self._param_dict['FP16_LODTensor'][param_group_idx].append(
self._velocity_dict['FP16_LODTensor'].append(velocity_acc) param
)
self._velocity_dict['FP16_LODTensor'][param_group_idx].append(
velocity_acc
)
if self._multi_precision: if self._multi_precision:
self._master_weight_dict['FP16_LODTensor'].append( self._master_weight_dict['FP16_LODTensor'][
self._master_weights[param.name]) param_group_idx
].append(self._master_weights[param.name])
else: else:
self._master_weight_dict['FP16_LODTensor'] = None self._master_weight_dict['FP16_LODTensor'][
self._regularization_method_dict['FP16_LODTensor'].append( param_group_idx
regularization_method) ] = None
self._regularization_coeff_dict['FP16_LODTensor'].append( self._regularization_method_dict['FP16_LODTensor'][
regularization_coeff) param_group_idx
].append(regularization_method)
self._regularization_coeff_dict['FP16_LODTensor'][
param_group_idx
].append(regularization_coeff)
else: else:
raise ValueError( raise ValueError(
"Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR." "Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR."
) )
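The essential change in this hunk is that every multi-tensor cache is now indexed by parameter group before being indexed by dtype. A small, self-contained illustration of the two layouts (group count and parameter names are made up):

    # Before the fix: one flat list per dtype, so parameters coming from
    # different param groups (list-of-dict `parameters`) were merged into a
    # single bucket.
    flat = {'FP32_LODTensor': [], 'FP16_LODTensor': []}

    # After the fix: one list per (dtype, param_group_idx), mirroring
    # _create_multi_tensor_dict with two parameter groups.
    n_groups = 2
    per_group = {
        'FP32_LODTensor': [[] for _ in range(n_groups)],
        'FP16_LODTensor': [[] for _ in range(n_groups)],
    }
    per_group['FP32_LODTensor'][0].append('linear_0.w_0')  # group 0 params
    per_group['FP32_LODTensor'][1].append('linear_1.w_0')  # group 1 params
    print(per_group['FP32_LODTensor'])  # [['linear_0.w_0'], ['linear_1.w_0']]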
def _append_optimize_multi_tensor_op(self, target_block, def _append_optimize_multi_tensor_op(
parameters_and_grads): self,
""" target_block,
parameters_and_grads,
param_group_idx,
):
"""
For Multi Tensor, append the merged optimize operator to the block. For Multi Tensor, append the merged optimize operator to the block.
""" """
assert isinstance(target_block, framework.Block) assert isinstance(target_block, framework.Block)
...@@ -429,15 +497,19 @@ class Momentum(Optimizer): ...@@ -429,15 +497,19 @@ class Momentum(Optimizer):
if param_and_grad[1] is None: if param_and_grad[1] is None:
continue continue
if param_and_grad[0].stop_gradient is False: if param_and_grad[0].stop_gradient is False:
if param_and_grad[ if (
0].dtype == paddle.float32 and param_and_grad[ param_and_grad[0].dtype == paddle.float32
1].type == core.VarDesc.VarType.LOD_TENSOR: and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP32_LODTensor'].append(param_and_grad[1]) grad_dict['FP32_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad) lr = self._create_param_lr(param_and_grad)
lr_dict['FP32_LODTensor'].append(lr) lr_dict['FP32_LODTensor'].append(lr)
elif param_and_grad[ elif (
0].dtype == paddle.float16 and param_and_grad[ param_and_grad[0].dtype == paddle.float16
1].type == core.VarDesc.VarType.LOD_TENSOR: and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP16_LODTensor'].append(param_and_grad[1]) grad_dict['FP16_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad) lr = self._create_param_lr(param_and_grad)
lr_dict['FP16_LODTensor'].append(lr) lr_dict['FP16_LODTensor'].append(lr)
...@@ -448,97 +520,144 @@ class Momentum(Optimizer): ...@@ -448,97 +520,144 @@ class Momentum(Optimizer):
if param_and_grad[0].stop_gradient is False: if param_and_grad[0].stop_gradient is False:
param_grad_dict = dict() param_grad_dict = dict()
param_grad_dict['params'] = param_and_grad param_grad_dict['params'] = param_and_grad
param_grad_dict.update({ param_grad_dict.update(
k: v {
for k, v in parameters_and_grads.items() k: v
if k != 'params' for k, v in parameters_and_grads.items()
}) if k != 'params'
}
)
param_and_grad = self._update_param_group(param_grad_dict) param_and_grad = self._update_param_group(param_grad_dict)
if param_and_grad[ if (
0].dtype == paddle.float32 and param_and_grad[ param_and_grad[0].dtype == paddle.float32
1].type == core.VarDesc.VarType.LOD_TENSOR: and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP32_LODTensor'].append(param_and_grad[1]) grad_dict['FP32_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad) lr = self._create_param_lr(param_and_grad)
lr_dict['FP32_LODTensor'].append(lr) lr_dict['FP32_LODTensor'].append(lr)
elif param_and_grad[ elif (
0].dtype == paddle.float16 and param_and_grad[ param_and_grad[0].dtype == paddle.float16
1].type == core.VarDesc.VarType.LOD_TENSOR: and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
grad_dict['FP16_LODTensor'].append(param_and_grad[1]) grad_dict['FP16_LODTensor'].append(param_and_grad[1])
lr = self._create_param_lr(param_and_grad) lr = self._create_param_lr(param_and_grad)
lr_dict['FP16_LODTensor'].append(lr) lr_dict['FP16_LODTensor'].append(lr)
multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor'] multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor']
for key in multi_tensor_list: for key in multi_tensor_list:
if len(self._param_dict[key]) > 0: if len(self._param_dict[key][param_group_idx]) > 0:
find_master = self._multi_precision and key == 'FP16_LODTensor' find_master = self._multi_precision and key == 'FP16_LODTensor'
master_weight = self._master_weight_dict[key]
master_weight = (
master_weight[param_group_idx]
if master_weight is not None
else None
)
if framework._non_static_mode(): if framework._non_static_mode():
if in_dygraph_mode(): if in_dygraph_mode():
_, _, _ = _C_ops.merged_momentum_( _, _, _ = _C_ops.merged_momentum_(
self._param_dict[key], grad_dict[key], self._param_dict[key][param_group_idx],
self._velocity_dict[key], lr_dict[key], grad_dict[key],
self._master_weight_dict[key], self._momentum, self._velocity_dict[key][param_group_idx],
lr_dict[key],
master_weight,
self._momentum,
self._use_nesterov, self._use_nesterov,
self._regularization_method_dict[key], self._regularization_method_dict[key][
self._regularization_coeff_dict[key], find_master, param_group_idx
self._rescale_grad) ],
self._regularization_coeff_dict[key][
param_group_idx
],
find_master,
self._rescale_grad,
)
else: else:
_, _, _ = _legacy_C_ops.merged_momentum( _, _, _ = _legacy_C_ops.merged_momentum(
self._param_dict[key], grad_dict[key], self._param_dict[key][param_group_idx],
self._velocity_dict[key], lr_dict[key], grad_dict[key],
self._master_weight_dict[key], self._velocity_dict[key][param_group_idx],
self._param_dict[key], self._velocity_dict[key], lr_dict[key],
self._master_weight_dict[key], 'mu', self._momentum, master_weight,
'use_nesterov', self._use_nesterov, self._param_dict[key][param_group_idx],
self._velocity_dict[key][param_group_idx],
master_weight,
'mu',
self._momentum,
'use_nesterov',
self._use_nesterov,
'regularization_method', 'regularization_method',
self._regularization_method_dict[key], self._regularization_method_dict[key][
param_group_idx
],
'regularization_coeff', 'regularization_coeff',
self._regularization_coeff_dict[key], self._regularization_coeff_dict[key][
'multi_precision', find_master) param_group_idx
],
'multi_precision',
find_master,
)
else: else:
inputs = { inputs = {
"Param": self._param_dict[key], "Param": self._param_dict[key][param_group_idx],
"Grad": grad_dict[key], "Grad": grad_dict[key],
"Velocity": self._velocity_dict[key], "Velocity": self._velocity_dict[key][param_group_idx],
"LearningRate": lr_dict[key], "LearningRate": lr_dict[key],
} }
outputs = { outputs = {
"ParamOut": self._param_dict[key], "ParamOut": self._param_dict[key][param_group_idx],
"VelocityOut": self._velocity_dict[key], "VelocityOut": self._velocity_dict[key][
param_group_idx
],
} }
attrs = { attrs = {
"mu": "mu": self._momentum,
self._momentum, "use_nesterov": self._use_nesterov,
"use_nesterov": "regularization_method": self._regularization_method_dict[
self._use_nesterov, key
"regularization_method": ][
self._regularization_method_dict[key], param_group_idx
"regularization_coeff": ],
self._regularization_coeff_dict[key], "regularization_coeff": self._regularization_coeff_dict[
key
][param_group_idx],
} }
if find_master: if find_master:
inputs["MasterParam"] = self._master_weight_dict[key] inputs["MasterParam"] = self._master_weight_dict[key][
param_group_idx
]
outputs["MasterParamOut"] = self._master_weight_dict[ outputs["MasterParamOut"] = self._master_weight_dict[
key] key
][param_group_idx]
attrs["multi_precision"] = find_master attrs["multi_precision"] = find_master
target_block.append_op(type="merged_momentum", target_block.append_op(
inputs=inputs, type="merged_momentum",
outputs=outputs, inputs=inputs,
attrs=attrs, outputs=outputs,
stop_gradient=True) attrs=attrs,
stop_gradient=True,
)
return None return None
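For context, the configuration this patch targets is a multi-tensor optimizer driven by a list of parameter dicts. A hedged usage sketch in dygraph mode (layer sizes and hyper-parameters are arbitrary):

    import paddle

    linear_1 = paddle.nn.Linear(10, 10)
    linear_2 = paddle.nn.Linear(10, 10)

    # Two parameter groups with per-group options, plus use_multi_tensor=True:
    # the combination that needs one tensor bucket per group.
    opt = paddle.optimizer.Momentum(
        learning_rate=0.1,
        momentum=0.9,
        parameters=[
            {'params': linear_1.parameters()},
            {'params': linear_2.parameters(), 'weight_decay': 0.001, 'learning_rate': 0.1},
        ],
        weight_decay=0.01,
        use_multi_tensor=True,
    )

    out = linear_2(linear_1(paddle.rand([4, 10], dtype="float32")))
    out.backward()
    opt.step()
    opt.clear_grad()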
def _update_param_group(self, parameters): def _update_param_group(self, parameters):
self._momentum = parameters.get('momentum', self._momentum = parameters.get(
self._default_dict['momentum']) 'momentum', self._default_dict['momentum']
self._use_nesterov = parameters.get('use_nesterov', )
self._default_dict['use_nesterov']) self._use_nesterov = parameters.get(
self._rescale_grad = parameters.get('rescale_grad', 'use_nesterov', self._default_dict['use_nesterov']
self._default_dict['rescale_grad']) )
self._rescale_grad = parameters.get(
'rescale_grad', self._default_dict['rescale_grad']
)
self._regularization_method = parameters.get( self._regularization_method = parameters.get(
'regularization_method', 'regularization_method', self._default_dict['regularization_method']
self._default_dict['regularization_method']) )
self._regularization_coeff = parameters.get( self._regularization_coeff = parameters.get(
'regularization_coeff', self._default_dict['regularization_coeff']) 'regularization_coeff', self._default_dict['regularization_coeff']
)
parameters = parameters.get('params') parameters = parameters.get('params')
return parameters return parameters
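A minimal sketch of the fallback behaviour `_update_param_group` implements: per-group keys win, and anything missing comes from the optimizer-wide defaults (the helper name and dicts below are illustrative):

    default_dict = {'momentum': 0.9, 'use_nesterov': False, 'rescale_grad': 1.0}

    def resolve_group_options(param_group, defaults):
        # Mirror the .get(key, default) pattern used above: each hyper-parameter
        # is taken from the group if present, otherwise from the defaults.
        return {k: param_group.get(k, v) for k, v in defaults.items()}

    group = {'params': ['w0', 'b0'], 'momentum': 0.8}
    print(resolve_group_options(group, default_dict))
    # {'momentum': 0.8, 'use_nesterov': False, 'rescale_grad': 1.0}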
...@@ -21,13 +21,30 @@ from collections import defaultdict ...@@ -21,13 +21,30 @@ from collections import defaultdict
import paddle import paddle
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard from paddle.fluid.framework import (
Program,
Variable,
name_scope,
default_main_program,
default_startup_program,
device_guard,
)
from ..fluid import framework from ..fluid import framework
from ..fluid import layers from ..fluid import layers
from ..fluid import unique_name from ..fluid import unique_name
from ..fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name from ..fluid.backward import (
from ..fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops append_backward,
_some_in_set_,
_append_grad_suffix_,
_get_no_grad_set_name,
)
from ..fluid.clip import (
GradientClipBase,
GradientClipByNorm,
error_clip_callback,
append_gradient_clip_ops,
)
from ..fluid.framework import program_guard, Parameter from ..fluid.framework import program_guard, Parameter
from ..fluid.initializer import Constant from ..fluid.initializer import Constant
from ..fluid.layer_helper import LayerHelper from ..fluid.layer_helper import LayerHelper
...@@ -42,24 +59,36 @@ from .. import compat as cpt ...@@ -42,24 +59,36 @@ from .. import compat as cpt
from .lr import LRScheduler from .lr import LRScheduler
import copy import copy
from paddle import _C_ops, _legacy_C_ops from paddle import _C_ops, _legacy_C_ops
from paddle.fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check, _current_expected_place, in_dygraph_mode from paddle.fluid.framework import (
_in_legacy_dygraph,
_in_eager_without_dygraph_check,
_current_expected_place,
in_dygraph_mode,
)
__all__ = [] __all__ = []
@framework.static_only @framework.static_only
def append_backward_new(loss_list, def append_backward_new(
parameter_list=None, loss_list,
no_grad_set=None, parameter_list=None,
callbacks=None, no_grad_set=None,
checkpoints=None, callbacks=None,
distop_context=None): checkpoints=None,
distop_context=None,
):
from paddle.incubate.autograd.primx import orig2prim, Transform from paddle.incubate.autograd.primx import orig2prim, Transform
program = default_main_program() program = default_main_program()
assert program.num_blocks == 1, "The append_backward_new interface is designed to process only one block." assert (
program.num_blocks == 1
), "The append_backward_new interface is designed to process only one block."
block = program.current_block() block = program.current_block()
for el in loss_list: for el in loss_list:
assert el.block == block, f'variable in loss_list should be in current block of main program' assert (
el.block == block
), f'variable in loss_list should be in current block of main program'
orig2prim(block) orig2prim(block)
ad = Transform(block) ad = Transform(block)
...@@ -163,12 +192,14 @@ class Optimizer(object): ...@@ -163,12 +192,14 @@ class Optimizer(object):
""" """
@imperative_base.no_grad @imperative_base.no_grad
def __init__(self, def __init__(
learning_rate, self,
parameters=None, learning_rate,
weight_decay=None, parameters=None,
grad_clip=None, weight_decay=None,
name=None): grad_clip=None,
name=None,
):
if parameters is not None: if parameters is not None:
# paddle.Tensor is also iterable, so here we don't check whether # paddle.Tensor is also iterable, so here we don't check whether
...@@ -177,13 +208,16 @@ class Optimizer(object): ...@@ -177,13 +208,16 @@ class Optimizer(object):
if isinstance(parameters, (paddle.Tensor, core.eager.Tensor)): if isinstance(parameters, (paddle.Tensor, core.eager.Tensor)):
raise TypeError( raise TypeError(
"`parameters` argument given to the optimizer should be " "`parameters` argument given to the optimizer should be "
"an iterable of paddle Tensors, but got argument type is `{}`." "an iterable of paddle Tensors, but got argument type is `{}`.".format(
.format(type(parameters))) type(parameters)
)
)
if isinstance(parameters, dict): if isinstance(parameters, dict):
raise TypeError( raise TypeError(
"`parameters` argument should not get dict type, " "`parameters` argument should not get dict type, "
"if parameter groups is needed, please set `parameters`" "if parameter groups is needed, please set `parameters`"
" as list of dict") " as list of dict"
)
self._parameter_list = list(parameters) self._parameter_list = list(parameters)
else: else:
self._parameter_list = None self._parameter_list = None
...@@ -197,18 +231,22 @@ class Optimizer(object): ...@@ -197,18 +231,22 @@ class Optimizer(object):
if weight_decay is not None: if weight_decay is not None:
if not isinstance(self._parameter_list[0], dict): if not isinstance(self._parameter_list[0], dict):
for param in self._parameter_list: for param in self._parameter_list:
if hasattr(param, 'regularizer' if (
) and param.regularizer is not None: hasattr(param, 'regularizer')
and param.regularizer is not None
):
logging.info( logging.info(
"If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. " "If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. "
"The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
% weight_decay.__str__()) % weight_decay.__str__()
)
break break
if not isinstance(learning_rate, (float, LRScheduler)): if not isinstance(learning_rate, (float, LRScheduler)):
raise TypeError( raise TypeError(
"learning rate should be float or LRScheduler, got %s here" % "learning rate should be float or LRScheduler, got %s here"
type(learning_rate)) % type(learning_rate)
)
if grad_clip is not None: if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase): if not isinstance(grad_clip, GradientClipBase):
raise TypeError( raise TypeError(
...@@ -216,6 +254,7 @@ class Optimizer(object): ...@@ -216,6 +254,7 @@ class Optimizer(object):
) )
if isinstance(weight_decay, float): if isinstance(weight_decay, float):
from ..fluid.regularizer import L2Decay from ..fluid.regularizer import L2Decay
self.regularization = L2Decay(weight_decay) self.regularization = L2Decay(weight_decay)
else: else:
self.regularization = weight_decay self.regularization = weight_decay
...@@ -227,8 +266,9 @@ class Optimizer(object): ...@@ -227,8 +266,9 @@ class Optimizer(object):
if self._parameter_list: if self._parameter_list:
if isinstance(self._parameter_list[0], dict): if isinstance(self._parameter_list[0], dict):
for param_group in self._parameter_list: for param_group in self._parameter_list:
assert 'params' in param_group, \ assert (
'params should be set in parameters if parameter groups are optimized in different options' 'params' in param_group
), 'params should be set in parameters if parameter groups are optimized in different options'
self._dtype = self._parameter_list[0]['params'][0].dtype self._dtype = self._parameter_list[0]['params'][0].dtype
else: else:
self._dtype = self._parameter_list[0].dtype self._dtype = self._parameter_list[0].dtype
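In other words, the optimizer dtype is taken from the first parameter, looking inside the first group when parameter groups are used. A small illustration with stand-in objects rather than Paddle tensors (the helper name is hypothetical):

    from types import SimpleNamespace

    def infer_optimizer_dtype(parameter_list):
        # Mirrors the branch above: a list of dicts means parameter groups, so
        # peek at the first tensor of the first group; otherwise use the first
        # tensor directly.
        first = parameter_list[0]
        if isinstance(first, dict):
            return first['params'][0].dtype
        return first.dtype

    plain = [SimpleNamespace(dtype='float32')]
    grouped = [{'params': [SimpleNamespace(dtype='float16')]}]
    print(infer_optimizer_dtype(plain))    # float32
    print(infer_optimizer_dtype(grouped))  # float16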
...@@ -248,7 +288,7 @@ class Optimizer(object): ...@@ -248,7 +288,7 @@ class Optimizer(object):
self.clear_gradients = self.clear_grad self.clear_gradients = self.clear_grad
self._default_dict = { self._default_dict = {
'weight_decay': self.regularization, 'weight_decay': self.regularization,
'grad_clip': self._grad_clip 'grad_clip': self._grad_clip,
} }
self._param_groups = [] self._param_groups = []
...@@ -261,13 +301,20 @@ class Optimizer(object): ...@@ -261,13 +301,20 @@ class Optimizer(object):
# NOTE: Multi Tensor: pass all parameters and gradients to the optimizer's op kernel at one time for updating in dygraph mode. # NOTE: Multi Tensor: pass all parameters and gradients to the optimizer's op kernel at one time for updating in dygraph mode.
# Optimizer support list: [ paddle.optimizer.Momentum, paddle.optimizer.Adam]. # Optimizer support list: [ paddle.optimizer.Momentum, paddle.optimizer.Adam].
self._use_multi_tensor = None self._use_multi_tensor = None
self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []}
self._param_dict = self._create_multi_tensor_dict()
self._auxiliary_vars = {} self._auxiliary_vars = {}
def _set_auxiliary_var(self, key, val): def _set_auxiliary_var(self, key, val):
self._auxiliary_vars[key] = val self._auxiliary_vars[key] = val
def _create_multi_tensor_dict(self):
n = len(self._param_groups) if self._param_groups is not None else 1
return {
'FP32_LODTensor': [[] for _ in range(n)],
'FP16_LODTensor': [[] for _ in range(n)],
}
def _get_auxiliary_var(self, key): def _get_auxiliary_var(self, key):
return self._auxiliary_vars.get(key, None) return self._auxiliary_vars.get(key, None)
...@@ -277,12 +324,12 @@ class Optimizer(object): ...@@ -277,12 +324,12 @@ class Optimizer(object):
Get state dict information from the optimizer. It contains all the tensors used by the optimizer. For the Adam optimizer, it contains beta1, beta2, momentum, etc. If an LRScheduler has been used, global_step will be included in the state dict. Get state dict information from the optimizer. It contains all the tensors used by the optimizer. For the Adam optimizer, it contains beta1, beta2, momentum, etc. If an LRScheduler has been used, global_step will be included in the state dict.
If the optimizer has never been called (via the minimize function), the state_dict is empty. If the optimizer has never been called (via the minimize function), the state_dict is empty.
Args: Args:
None None
Returns: Returns:
state_dict(dict) : dict contains all the Tensor used by optimizer state_dict(dict) : dict contains all the Tensor used by optimizer
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -311,11 +358,11 @@ class Optimizer(object): ...@@ -311,11 +358,11 @@ class Optimizer(object):
''' '''
Load the optimizer state dict. For the Adam optimizer, it contains beta1, beta2, momentum, etc. If an LRScheduler has been used, global_step will be changed. Load the optimizer state dict. For the Adam optimizer, it contains beta1, beta2, momentum, etc. If an LRScheduler has been used, global_step will be changed.
Args: Args:
state_dict(dict) : Dict contains all the Tensor needed by optimizer state_dict(dict) : Dict contains all the Tensor needed by optimizer
Return: Return:
None None
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -326,7 +373,7 @@ class Optimizer(object): ...@@ -326,7 +373,7 @@ class Optimizer(object):
layer_state_dict = emb.state_dict() layer_state_dict = emb.state_dict()
paddle.save(layer_state_dict, "emb.pdparams") paddle.save(layer_state_dict, "emb.pdparams")
scheduler = paddle.optimizer.lr.NoamDecay( scheduler = paddle.optimizer.lr.NoamDecay(
d_model=0.01, warmup_steps=100, verbose=True) d_model=0.01, warmup_steps=100, verbose=True)
adam = paddle.optimizer.Adam( adam = paddle.optimizer.Adam(
learning_rate=scheduler, learning_rate=scheduler,
...@@ -353,8 +400,9 @@ class Optimizer(object): ...@@ -353,8 +400,9 @@ class Optimizer(object):
self._accumulators_holder = state_dict self._accumulators_holder = state_dict
for k, v in self._accumulators.items(): for k, v in self._accumulators.items():
for para_name, var_tmp in v.items(): for para_name, var_tmp in v.items():
assert var_tmp.name in state_dict, \ assert (
"optimizer Tensor {} not found".format( var_tmp.name ) var_tmp.name in state_dict
), "optimizer Tensor {} not found".format(var_tmp.name)
var = var_tmp.value() var = var_tmp.value()
tensor = var.get_tensor() tensor = var.get_tensor()
model_np = np.array(tensor) model_np = np.array(tensor)
...@@ -368,16 +416,23 @@ class Optimizer(object): ...@@ -368,16 +416,23 @@ class Optimizer(object):
elif isinstance(load_para, np.ndarray): elif isinstance(load_para, np.ndarray):
load_para_np = load_para load_para_np = load_para
else: else:
raise RuntimeError("State dict type {} not supprt".format( raise RuntimeError(
str(type(load_para)))) "State dict type {} not supprt".format(
str(type(load_para))
assert model_np.shape == load_para_np.shape, \ )
"Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( )
model_np.name, model_np.shape, load_para_np.shape)
assert (
model_np.shape == load_para_np.shape
), "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format(
model_np.name, model_np.shape, load_para_np.shape
)
assert model_np.dtype == load_para_np.dtype, \ assert (
"Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( model_np.dtype == load_para_np.dtype
model_np.name, model_np.dtype, load_para_np.dtype) ), "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format(
model_np.name, model_np.dtype, load_para_np.dtype
)
tensor.set(load_para_np, framework._current_expected_place()) tensor.set(load_para_np, framework._current_expected_place())
...@@ -386,51 +441,63 @@ class Optimizer(object): ...@@ -386,51 +441,63 @@ class Optimizer(object):
def _create_global_learning_rate(self): def _create_global_learning_rate(self):
# The lr var can't be float16; for pure fp16 training, the lr dtype needs extra handling. # The lr var can't be float16; for pure fp16 training, the lr dtype needs extra handling.
_lr_dtype = paddle.get_default_dtype( _lr_dtype = (
) if self._dtype is None else self._dtype paddle.get_default_dtype() if self._dtype is None else self._dtype
_lr_dtype = paddle.float32 if ( )
paddle.get_default_dtype() != "float16" _lr_dtype = (
and _lr_dtype == paddle.float16) else _lr_dtype paddle.float32
if (
paddle.get_default_dtype() != "float16"
and _lr_dtype == paddle.float16
)
else _lr_dtype
)
if isinstance(self._learning_rate, LRScheduler): if isinstance(self._learning_rate, LRScheduler):
lr_var = self._global_learning_rate() lr_var = self._global_learning_rate()
# only create global lr_var once # only create global lr_var once
if not isinstance(lr_var, framework.Variable): if not isinstance(lr_var, framework.Variable):
lr_name = unique_name.generate('learning_rate') lr_name = unique_name.generate('learning_rate')
self._learning_rate._var_name = lr_name self._learning_rate._var_name = lr_name
lr_var = self.helper.create_global_variable(name=lr_name, lr_var = self.helper.create_global_variable(
shape=[1], name=lr_name,
persistable=True, shape=[1],
stop_gradient=True, persistable=True,
dtype=_lr_dtype) stop_gradient=True,
dtype=_lr_dtype,
)
main_prog = framework.default_main_program() main_prog = framework.default_main_program()
main_prog.lr_sheduler = self._learning_rate main_prog.lr_sheduler = self._learning_rate
main_prog.lr_var = lr_var main_prog.lr_var = lr_var
self._learning_rate_map[ self._learning_rate_map[
framework.default_main_program()] = lr_var framework.default_main_program()
] = lr_var
lr_value = float(self._learning_rate()) lr_value = float(self._learning_rate())
self.helper.set_variable_initializer( self.helper.set_variable_initializer(
lr_var, initializer=Constant(value=lr_value)) lr_var, initializer=Constant(value=lr_value)
)
elif isinstance(self._learning_rate, float): elif isinstance(self._learning_rate, float):
# only create global lr_var once # only create global lr_var once
lr = self._global_learning_rate() lr = self._global_learning_rate()
if isinstance(lr, framework.Variable): if isinstance(lr, framework.Variable):
return return
else: else:
self._learning_rate_map[framework.default_main_program( self._learning_rate_map[
)] = layers.create_global_var( framework.default_main_program()
] = layers.create_global_var(
name=unique_name.generate("learning_rate"), name=unique_name.generate("learning_rate"),
shape=[1], shape=[1],
value=float(self._learning_rate), value=float(self._learning_rate),
dtype=_lr_dtype, dtype=_lr_dtype,
persistable=True) persistable=True,
)
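A small sketch of the dtype fallback implemented above, using plain strings instead of Paddle dtypes (the helper name is illustrative): the learning-rate variable is promoted back to float32 unless the global default dtype is itself float16.

    def resolve_lr_dtype(model_dtype, default_dtype='float32'):
        # The lr variable cannot live in float16 for pure fp16 training, so it
        # is promoted to float32 unless the global default dtype is fp16.
        lr_dtype = default_dtype if model_dtype is None else model_dtype
        if default_dtype != 'float16' and lr_dtype == 'float16':
            lr_dtype = 'float32'
        return lr_dtype

    print(resolve_lr_dtype('float16'))             # float32
    print(resolve_lr_dtype('float16', 'float16'))  # float16
    print(resolve_lr_dtype(None))                  # float32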
@framework.dygraph_only @framework.dygraph_only
def set_lr(self, value): def set_lr(self, value):
""" """
:api_attr: imperative :api_attr: imperative
Set the value of the learning rate manually in the optimizer. If the optimizer uses an LRScheduler, Set the value of the learning rate manually in the optimizer. If the optimizer uses an LRScheduler,
this API cannot be invoked, because it would lead to a conflict. this API cannot be invoked, because it would lead to a conflict.
...@@ -439,7 +506,7 @@ class Optimizer(object): ...@@ -439,7 +506,7 @@ class Optimizer(object):
Returns: Returns:
None None
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -465,7 +532,8 @@ class Optimizer(object): ...@@ -465,7 +532,8 @@ class Optimizer(object):
if not isinstance(value, (int, float)): if not isinstance(value, (int, float)):
raise TypeError( raise TypeError(
"The type of 'value' in optimizer.set_lr must be float, but received %s." "The type of 'value' in optimizer.set_lr must be float, but received %s."
% (type(value))) % (type(value))
)
if isinstance(self._learning_rate, LRScheduler): if isinstance(self._learning_rate, LRScheduler):
raise RuntimeError( raise RuntimeError(
"optimizer's learning rate can't be LRScheduler when invoke this API, because this will lead to conflict." "optimizer's learning rate can't be LRScheduler when invoke this API, because this will lead to conflict."
...@@ -475,27 +543,40 @@ class Optimizer(object): ...@@ -475,27 +543,40 @@ class Optimizer(object):
if current_lr is not None: if current_lr is not None:
if in_dygraph_mode(): if in_dygraph_mode():
place = _current_expected_place() place = _current_expected_place()
_C_ops.full_(current_lr, list(current_lr.shape), float(value), _C_ops.full_(
current_lr.dtype, place) current_lr,
list(current_lr.shape),
float(value),
current_lr.dtype,
place,
)
elif _in_legacy_dygraph(): elif _in_legacy_dygraph():
_legacy_C_ops.fill_constant(current_lr, 'value', float(value), _legacy_C_ops.fill_constant(
'dtype', current_lr.dtype, 'shape', current_lr,
list(current_lr.shape)) 'value',
float(value),
'dtype',
current_lr.dtype,
'shape',
list(current_lr.shape),
)
else: else:
global_block = framework.default_main_program().global_block() global_block = framework.default_main_program().global_block()
global_block.append_op(type='fill_constant', global_block.append_op(
outputs={'Out': [current_lr]}, type='fill_constant',
attrs={ outputs={'Out': [current_lr]},
'dtype': current_lr.dtype, attrs={
'shape': list(current_lr.shape), 'dtype': current_lr.dtype,
'value': float(value) 'shape': list(current_lr.shape),
}, 'value': float(value),
stop_gradient=True) },
stop_gradient=True,
)
def get_lr(self): def get_lr(self):
""" """
Get the current learning rate of the optimizer. Get the current learning rate of the optimizer.
If 'LRScheduler' is not used, the return value is always the same. If 'LRScheduler' is not used, the return value is always the same.
If 'LRScheduler' is used, the return value is the current scheduled learning rate. If 'LRScheduler' is used, the return value is the current scheduled learning rate.
...@@ -565,8 +646,7 @@ class Optimizer(object): ...@@ -565,8 +646,7 @@ class Optimizer(object):
return self._learning_rate_map.get(program, None) return self._learning_rate_map.get(program, None)
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
""" append optimize operator to block and return all the added optimize_op """append optimize operator to block and return all the added optimize_op"""
"""
raise NotImplementedError( raise NotImplementedError(
"Class \"Optimizer\" connot be used directly as an optimizer, please use its subclasses such as \"Adam\"" "Class \"Optimizer\" connot be used directly as an optimizer, please use its subclasses such as \"Adam\""
) )
...@@ -583,8 +663,8 @@ class Optimizer(object): ...@@ -583,8 +663,8 @@ class Optimizer(object):
return self._global_learning_rate() return self._global_learning_rate()
else: else:
with default_main_program()._lr_schedule_guard( with default_main_program()._lr_schedule_guard(
is_with_opt=True), framework.name_scope( is_with_opt=True
'scale_with_param_lr'): ), framework.name_scope('scale_with_param_lr'):
return self._global_learning_rate() * param_lr return self._global_learning_rate() * param_lr
else: else:
return self._global_learning_rate() return self._global_learning_rate()
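The effective rate consumed by `_append_optimize_op` is the global learning rate scaled by the parameter's own `learning_rate` attribute (settable through ParamAttr). As a plain-number sketch (the helper name is illustrative):

    def effective_param_lr(global_lr, param_lr=1.0):
        # param_lr defaults to 1.0, in which case the global rate is used
        # as-is; otherwise the two are multiplied, as _create_param_lr does
        # with a scale under a name_scope in static-graph mode.
        return global_lr if param_lr == 1.0 else global_lr * param_lr

    print(effective_param_lr(0.1))       # 0.1
    print(effective_param_lr(0.1, 2.0))  # 0.2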
...@@ -611,14 +691,16 @@ class Optimizer(object): ...@@ -611,14 +691,16 @@ class Optimizer(object):
""" """
pass pass
def _add_accumulator(self, def _add_accumulator(
name, self,
param, name,
dtype=None, param,
fill_value=0.0, dtype=None,
shape=None, fill_value=0.0,
type=None, shape=None,
device=None): type=None,
device=None,
):
"""Utility function to add an accumulator for a parameter """Utility function to add an accumulator for a parameter
Args: Args:
...@@ -630,13 +712,17 @@ class Optimizer(object): ...@@ -630,13 +712,17 @@ class Optimizer(object):
""" """
if self._name is not None: if self._name is not None:
name = self._name + "_" + name name = self._name + "_" + name
if (name in self._accumulators if (
and param.name in self._accumulators[name]): name in self._accumulators
and param.name in self._accumulators[name]
):
if framework._non_static_mode(): if framework._non_static_mode():
return self._accumulators[name][param.name] return self._accumulators[name][param.name]
raise Exception( raise Exception(
"Accumulator {} already exists for parameter {}".format( "Accumulator {} already exists for parameter {}".format(
name, param.name)) name, param.name
)
)
if shape == None: if shape == None:
shape = param.shape shape = param.shape
assert isinstance(self.helper, LayerHelper) assert isinstance(self.helper, LayerHelper)
...@@ -650,20 +736,25 @@ class Optimizer(object): ...@@ -650,20 +736,25 @@ class Optimizer(object):
persistable=True, persistable=True,
dtype=dtype or param.dtype, dtype=dtype or param.dtype,
type=core.VarDesc.VarType.LOD_TENSOR type=core.VarDesc.VarType.LOD_TENSOR
if framework._in_eager_without_dygraph_check() else if framework._in_eager_without_dygraph_check()
(param.type if type is None else type), else (param.type if type is None else type),
shape=shape, shape=shape,
belong_to_optimizer=True) belong_to_optimizer=True,
)
if device is None: if device is None:
device = self._get_device_for_param(param.name) device = self._get_device_for_param(param.name)
with device_guard(device): with device_guard(device):
self.helper.set_variable_initializer( self.helper.set_variable_initializer(
var, initializer=Constant(value=float(fill_value))) var, initializer=Constant(value=float(fill_value))
)
if framework._non_static_mode(): if framework._non_static_mode():
if len(self._accumulators_holder) > 0: if len(self._accumulators_holder) > 0:
assert var_name in self._accumulators_holder, \ assert (
"Optimizer set error, {} should in state dict".format( var_name ) var_name in self._accumulators_holder
), "Optimizer set error, {} should in state dict".format(
var_name
)
var.set_value(self._accumulators_holder[var_name]) var.set_value(self._accumulators_holder[var_name])
self._accumulators[name][param.name] = var self._accumulators[name][param.name] = var
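The accumulator registry is a two-level mapping from accumulator name to parameter name to state variable. A framework-free sketch of the add/get pair with the same duplicate/missing checks (names and values are illustrative):

    from collections import defaultdict

    accumulators = defaultdict(dict)  # name -> {param_name: state}

    def add_accumulator(name, param_name, fill_value=0.0):
        if param_name in accumulators[name]:
            raise Exception(
                "Accumulator {} already exists for parameter {}".format(name, param_name))
        accumulators[name][param_name] = fill_value

    def get_accumulator(name, param_name):
        if param_name not in accumulators[name]:
            raise Exception(
                "Accumulator {} does not exist for parameter {}".format(name, param_name))
        return accumulators[name][param_name]

    add_accumulator('velocity', 'linear_0.w_0')
    print(get_accumulator('velocity', 'linear_0.w_0'))  # 0.0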
...@@ -681,11 +772,15 @@ class Optimizer(object): ...@@ -681,11 +772,15 @@ class Optimizer(object):
""" """
if self._name is not None: if self._name is not None:
name = self._name + "_" + name name = self._name + "_" + name
if (name not in self._accumulators if (
or param.name not in self._accumulators[name]): name not in self._accumulators
or param.name not in self._accumulators[name]
):
raise Exception( raise Exception(
"Accumulator {} does not exist for parameter {}".format( "Accumulator {} does not exist for parameter {}".format(
name, param.name)) name, param.name
)
)
return self._accumulators[name][param.name] return self._accumulators[name][param.name]
def _update_param_device_map(self, parameters_and_grads, target_block): def _update_param_device_map(self, parameters_and_grads, target_block):
...@@ -693,13 +788,15 @@ class Optimizer(object): ...@@ -693,13 +788,15 @@ class Optimizer(object):
if param_and_grad[0].stop_gradient is False: if param_and_grad[0].stop_gradient is False:
param_name = param_and_grad[0].name param_name = param_and_grad[0].name
ops = target_block.ops ops = target_block.ops
device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName( device_attr_name = (
core.op_proto_and_checker_maker.kOpDeviceAttrName()
) )
for op in ops: for op in ops:
input_arg_names = op.input_arg_names input_arg_names = op.input_arg_names
if param_name in input_arg_names: if param_name in input_arg_names:
self._param_device_map[param_name] = op.attr( self._param_device_map[param_name] = op.attr(
device_attr_name) device_attr_name
)
break break
def _get_device_for_param(self, param_name): def _get_device_for_param(self, param_name):
...@@ -708,7 +805,9 @@ class Optimizer(object): ...@@ -708,7 +805,9 @@ class Optimizer(object):
device = self._param_device_map[param_name] device = self._param_device_map[param_name]
return device return device
def _create_optimization_pass(self, parameters_and_grads): def _create_optimization_pass(
self, parameters_and_grads, param_group_idx=0
):
"""Add optimization operators to update gradients to tensors. """Add optimization operators to update gradients to tensors.
Args: Args:
...@@ -736,10 +835,12 @@ class Optimizer(object): ...@@ -736,10 +835,12 @@ class Optimizer(object):
target_block = global_block target_block = global_block
current_block = framework.default_main_program().current_block() current_block = framework.default_main_program().current_block()
if current_block.idx != global_block.idx: if current_block.idx != global_block.idx:
assert current_block.backward_block_idx != -1, \ assert (
"current block is not global_block, but it doesn't have backward block." current_block.backward_block_idx != -1
), "current block is not global_block, but it doesn't have backward block."
target_block = framework.default_main_program().blocks[ target_block = framework.default_main_program().blocks[
current_block.backward_block_idx] current_block.backward_block_idx
]
start = len(target_block.ops) start = len(target_block.ops)
self.helper = LayerHelper(self.__class__.__name__) self.helper = LayerHelper(self.__class__.__name__)
...@@ -748,57 +849,91 @@ class Optimizer(object): ...@@ -748,57 +849,91 @@ class Optimizer(object):
# NOTE: Multi Tensor support [ Momentum, Adam ] for dygraph mode # NOTE: Multi Tensor support [ Momentum, Adam ] for dygraph mode
if self._use_multi_tensor and self.__class__.__name__ in [ if self._use_multi_tensor and self.__class__.__name__ in [
'Momentum', 'Adam' 'Momentum',
'Adam',
]: ]:
if len(self._param_dict['FP32_LODTensor']) == 0 and len( if (
self._param_dict['FP16_LODTensor']) == 0: len(self._param_dict['FP32_LODTensor'][param_group_idx]) == 0
and len(self._param_dict['FP16_LODTensor'][param_group_idx])
== 0
):
if isinstance(parameters_and_grads, list): if isinstance(parameters_and_grads, list):
self._multi_tensor_init(target_block, [ assert param_group_idx == 0
p[0] self._multi_tensor_init(
for p in parameters_and_grads if not p[0].stop_gradient target_block,
]) [
p[0]
for p in parameters_and_grads
if not p[0].stop_gradient
],
param_group_idx,
)
else: else:
self._update_param_group(parameters_and_grads) self._update_param_group(parameters_and_grads)
self._multi_tensor_init(target_block, [ self._multi_tensor_init(
p[0] for p in parameters_and_grads['params'] target_block,
if not p[0].stop_gradient [
]) p[0]
for p in parameters_and_grads['params']
if not p[0].stop_gradient
],
param_group_idx,
)
if framework._non_static_mode(): if framework._non_static_mode():
self._append_optimize_multi_tensor_op(target_block, self._append_optimize_multi_tensor_op(
parameters_and_grads) target_block,
parameters_and_grads,
param_group_idx=param_group_idx,
)
else: else:
self._update_param_device_map(parameters_and_grads, self._update_param_device_map(
target_block) parameters_and_grads, target_block
)
# NOTE: Multi Tensor requires all parameters to be in the same device and program. # NOTE: Multi Tensor requires all parameters to be in the same device and program.
# param_grad_list = [p_0,g_0,p_1,g_1,....] # param_grad_list = [p_0,g_0,p_1,g_1,....]
param_grad_list = [] param_grad_list = []
for param_and_grad in parameters_and_grads: for param_and_grad in parameters_and_grads:
if not param_and_grad[0].stop_gradient and param_and_grad[ if (
1] is not None: not param_and_grad[0].stop_gradient
and param_and_grad[1] is not None
):
param_grad_list.append(param_and_grad[0]) param_grad_list.append(param_and_grad[0])
param_grad_list.append(param_and_grad[1]) param_grad_list.append(param_and_grad[1])
with param_grad_list[0].block.program._optimized_guard( with param_grad_list[0].block.program._optimized_guard(
param_grad_list), name_scope("optimizer"): param_grad_list
), name_scope("optimizer"):
device = self._get_device_for_param(param_grad_list[0].name) device = self._get_device_for_param(param_grad_list[0].name)
with device_guard(device): with device_guard(device):
self._append_optimize_multi_tensor_op( self._append_optimize_multi_tensor_op(
target_block, parameters_and_grads) target_block,
parameters_and_grads,
param_group_idx=param_group_idx,
)
else: else:
if not framework._non_static_mode(): if not framework._non_static_mode():
params_grads_device_map = parameters_and_grads[ params_grads_device_map = (
'params'] if isinstance(parameters_and_grads, parameters_and_grads['params']
dict) else parameters_and_grads if isinstance(parameters_and_grads, dict)
self._update_param_device_map(params_grads_device_map, else parameters_and_grads
target_block) )
self._update_param_device_map(
params_grads_device_map, target_block
)
if isinstance(parameters_and_grads, list): if isinstance(parameters_and_grads, list):
self._create_accumulators(target_block, [ self._create_accumulators(
p[0] for p in parameters_and_grads if not p[0].stop_gradient target_block,
]) [
p[0]
for p in parameters_and_grads
if not p[0].stop_gradient
],
)
else: else:
params_acc_dict = parameters_and_grads.copy() params_acc_dict = parameters_and_grads.copy()
params_acc_dict['params'] = [ params_acc_dict['params'] = [
p[0] for p in params_acc_dict['params'] p[0]
for p in params_acc_dict['params']
if not p[0].stop_gradient if not p[0].stop_gradient
] ]
self._create_accumulators(target_block, params_acc_dict) self._create_accumulators(target_block, params_acc_dict)
...@@ -809,8 +944,9 @@ class Optimizer(object): ...@@ -809,8 +944,9 @@ class Optimizer(object):
if param_and_grad[1] is None: if param_and_grad[1] is None:
continue continue
if param_and_grad[0].stop_gradient is False: if param_and_grad[0].stop_gradient is False:
self._append_optimize_op(target_block, self._append_optimize_op(
param_and_grad) target_block, param_and_grad
)
else: else:
for param_and_grad in parameters_and_grads['params']: for param_and_grad in parameters_and_grads['params']:
if param_and_grad[1] is None: if param_and_grad[1] is None:
...@@ -818,25 +954,31 @@ class Optimizer(object): ...@@ -818,25 +954,31 @@ class Optimizer(object):
if param_and_grad[0].stop_gradient is False: if param_and_grad[0].stop_gradient is False:
param_grad_dict = dict() param_grad_dict = dict()
param_grad_dict['params'] = param_and_grad param_grad_dict['params'] = param_and_grad
param_grad_dict.update({ param_grad_dict.update(
k: v {
for k, v in parameters_and_grads.items() k: v
if k != 'params' for k, v in parameters_and_grads.items()
}) if k != 'params'
self._append_optimize_op(target_block, }
param_grad_dict) )
self._append_optimize_op(
target_block, param_grad_dict
)
else: else:
for param_and_grad in parameters_and_grads: for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None: if param_and_grad[1] is None:
continue continue
with param_and_grad[0].block.program._optimized_guard( with param_and_grad[0].block.program._optimized_guard(
param_and_grad), name_scope("optimizer"): param_and_grad
), name_scope("optimizer"):
if param_and_grad[0].stop_gradient is False: if param_and_grad[0].stop_gradient is False:
device = self._get_device_for_param( device = self._get_device_for_param(
param_and_grad[0].name) param_and_grad[0].name
)
with device_guard(device): with device_guard(device):
optimize_op = self._append_optimize_op( optimize_op = self._append_optimize_op(
target_block, param_and_grad) target_block, param_and_grad
)
# Get custom finish ops for subclasses # Get custom finish ops for subclasses
# FIXME: Need to fix this once we figure out how to handle dependencies # FIXME: Need to fix this once we figure out how to handle dependencies
...@@ -848,12 +990,14 @@ class Optimizer(object): ...@@ -848,12 +990,14 @@ class Optimizer(object):
def _append_dgc_ops(self, param_and_grad): def _append_dgc_ops(self, param_and_grad):
pass pass
def backward(self, def backward(
loss, self,
startup_program=None, loss,
parameters=None, startup_program=None,
no_grad_set=None, parameters=None,
callbacks=None): no_grad_set=None,
callbacks=None,
):
""" """
The first part of ``minimize``; it does auto-diff to append backward operations to The first part of ``minimize``; it does auto-diff to append backward operations to
the current program. the current program.
...@@ -884,7 +1028,7 @@ class Optimizer(object): ...@@ -884,7 +1028,7 @@ class Optimizer(object):
a = paddle.to_tensor(value) a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5) linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph. # This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01, adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters()) parameters = linear.parameters())
out = linear(a) out = linear(a)
out.backward() out.backward()
...@@ -902,8 +1046,7 @@ class Optimizer(object): ...@@ -902,8 +1046,7 @@ class Optimizer(object):
self._dtype = loss.dtype self._dtype = loss.dtype
if framework._non_static_mode(): if framework._non_static_mode():
parameter_list = parameters if parameters \ parameter_list = parameters if parameters else self._parameter_list
else self._parameter_list
params_grads = [] params_grads = []
for param in parameter_list: for param in parameter_list:
...@@ -917,23 +1060,26 @@ class Optimizer(object): ...@@ -917,23 +1060,26 @@ class Optimizer(object):
if callbacks is None: if callbacks is None:
callbacks = [error_clip_callback] callbacks = [error_clip_callback]
else: else:
assert (isinstance(callbacks, list)) assert isinstance(callbacks, list)
program = loss.block.program program = loss.block.program
assert len(loss.shape) == 1 and loss.shape[0] == 1, \ assert len(loss.shape) == 1 and loss.shape[0] == 1, (
"The loss.shape should be (1L,), but the current loss.shape is {}. " \ "The loss.shape should be (1L,), but the current loss.shape is {}. "
"Maybe that you should call paddle.mean to process the current loss.".format( "Maybe that you should call paddle.mean to process the current loss.".format(
loss.shape) loss.shape
parameter_list = parameters if parameters \ )
else self._parameter_list )
parameter_list = parameters if parameters else self._parameter_list
with program_guard(program, startup_program): with program_guard(program, startup_program):
from paddle.incubate.autograd.utils import prim_enabled from paddle.incubate.autograd.utils import prim_enabled
if prim_enabled(): if prim_enabled():
params_grads = append_backward_new([loss], parameter_list, params_grads = append_backward_new(
act_no_grad_set, [loss], parameter_list, act_no_grad_set, callbacks
callbacks) )
else: else:
params_grads = append_backward(loss, parameter_list, params_grads = append_backward(
act_no_grad_set, callbacks) loss, parameter_list, act_no_grad_set, callbacks
)
# Note: since we can't use all_reduce_op now, # Note: since we can't use all_reduce_op now,
# dgc_op should be the last op of one grad. # dgc_op should be the last op of one grad.
self._append_dgc_ops(params_grads) self._append_dgc_ops(params_grads)
...@@ -978,13 +1124,16 @@ class Optimizer(object): ...@@ -978,13 +1124,16 @@ class Optimizer(object):
params_grads = append_gradient_clip_ops(params_grads) params_grads = append_gradient_clip_ops(params_grads)
# Add regularization if any # Add regularization if any
params_grads = self.append_regularization_ops(params_grads, params_grads = self.append_regularization_ops(
self.regularization) params_grads, self.regularization
)
optimize_ops = self._create_optimization_pass(params_grads) optimize_ops = self._create_optimization_pass(params_grads)
return optimize_ops return optimize_ops
def _apply_optimize(self, loss, startup_program, params_grads): def _apply_optimize(
self, loss, startup_program, params_grads, param_group_idx=0
):
""" """
Second part of `minimize`, appending optimization operators for Second part of `minimize`, appending optimization operators for
given `params_grads` pairs. given `params_grads` pairs.
...@@ -997,38 +1146,49 @@ class Optimizer(object): ...@@ -997,38 +1146,49 @@ class Optimizer(object):
list: A list of operators appended to the current program. list: A list of operators appended to the current program.
""" """
if framework._non_static_mode(): if framework._non_static_mode():
with program_guard(framework.default_main_program(), with program_guard(
framework.default_startup_program()): framework.default_main_program(),
framework.default_startup_program(),
):
if isinstance(params_grads, list): if isinstance(params_grads, list):
if self._grad_clip is not None: if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads) params_grads = self._grad_clip(params_grads)
params_grads = self.append_regularization_ops( params_grads = self.append_regularization_ops(
params_grads, self.regularization) params_grads, self.regularization
)
else: else:
grad_clip = params_grads['grad_clip'] grad_clip = params_grads['grad_clip']
if grad_clip is not None: if grad_clip is not None:
params_grads['params'] = grad_clip( params_grads['params'] = grad_clip(
params_grads['params']) params_grads['params']
)
params_grads['params'] = self.append_regularization_ops( params_grads['params'] = self.append_regularization_ops(
params_grads['params'], self.regularization) params_grads['params'], self.regularization
optimize_ops = self._create_optimization_pass(params_grads) )
optimize_ops = self._create_optimization_pass(
params_grads, param_group_idx=param_group_idx
)
else: else:
assert param_group_idx == 0
program = loss.block.program program = loss.block.program
with program_guard(program, startup_program): with program_guard(program, startup_program):
optimize_ops = self.apply_gradients(params_grads) optimize_ops = self.apply_gradients(params_grads)
return optimize_ops return optimize_ops
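The branches above accept two shapes of ``params_grads``: a flat list of ``(param, grad)`` pairs, or one group dict built by ``step()`` when the parameters were given as a list of dict. An illustrative view with hypothetical, string-valued stand-ins for the real ``(Parameter, gradient)`` pairs:

    # Stand-in names only; the real entries are (Parameter, gradient Tensor) pairs.
    flat_case = [('linear_0.w_0', 'linear_0.w_0@GRAD')]

    grouped_case = {                   # one parameter group forwarded by step()
        'params': [('linear_0.w_0', 'linear_0.w_0@GRAD')],
        'learning_rate': 0.1,          # per-group options copied alongside 'params'
        'grad_clip': None,             # read by the dict branch above
    }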
def _create_regularization_of_grad(self, param, grad, regularization=None): def _create_regularization_of_grad(self, param, grad, regularization=None):
""" Create and add backward regularization Operators """Create and add backward regularization Operators
Function helper of append_regularization_ops. Function helper of append_regularization_ops.
""" """
# If no gradient or no regularization is specified, then we don't need to do anything # If no gradient or no regularization is specified, then we don't need to do anything
if grad is None or ( if grad is None or (
(not hasattr(param, 'regularizer') or (
(hasattr(param, 'regularizer') and param.regularizer is None)) not hasattr(param, 'regularizer')
and regularization is None): or (hasattr(param, 'regularizer') and param.regularizer is None)
)
and regularization is None
):
return grad return grad
regularization_term = None regularization_term = None
if hasattr(param, 'regularizer') and param.regularizer is not None: if hasattr(param, 'regularizer') and param.regularizer is not None:
...@@ -1057,7 +1217,8 @@ class Optimizer(object): ...@@ -1057,7 +1217,8 @@ class Optimizer(object):
dtype=param.dtype, dtype=param.dtype,
shape=param.shape, shape=param.shape,
lod_level=param.lod_level, lod_level=param.lod_level,
type=core.VarDesc.VarType.LOD_TENSOR) type=core.VarDesc.VarType.LOD_TENSOR,
)
inputs = {"X": [grad, regularization_term]} inputs = {"X": [grad, regularization_term]}
outputs = {"Out": [new_grad]} outputs = {"Out": [new_grad]}
...@@ -1065,9 +1226,9 @@ class Optimizer(object): ...@@ -1065,9 +1226,9 @@ class Optimizer(object):
return new_grad return new_grad
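For reference, the op fed by these inputs/outputs adds the gradient and the regularization term elementwise; with ``L2Decay(coeff)`` that term is ``coeff * param``. A minimal numpy check of the arithmetic:

    import numpy as np

    coeff = 0.01
    param = np.array([1.0, -2.0, 3.0], dtype=np.float32)
    grad = np.array([0.1, 0.2, 0.3], dtype=np.float32)

    regularization_term = coeff * param      # what L2Decay contributes
    new_grad = grad + regularization_term    # what the appended op produces
    print(new_grad)                          # [0.11 0.18 0.33]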
def append_regularization_ops(self, def append_regularization_ops(
parameters_and_grads, self, parameters_and_grads, regularization=None
regularization=None): ):
r"""Create and add backward regularization Operators r"""Create and add backward regularization Operators
Creates and adds backward regularization operators in the BlockDesc. Creates and adds backward regularization operators in the BlockDesc.
...@@ -1092,21 +1253,28 @@ class Optimizer(object): ...@@ -1092,21 +1253,28 @@ class Optimizer(object):
if framework._non_static_mode(): if framework._non_static_mode():
for param, grad in parameters_and_grads: for param, grad in parameters_and_grads:
new_grad = self._create_regularization_of_grad( new_grad = self._create_regularization_of_grad(
param, grad, regularization) param, grad, regularization
)
params_and_grads.append((param, new_grad)) params_and_grads.append((param, new_grad))
else: else:
repeate_regularizer = False repeate_regularizer = False
with framework.name_scope('regularization'): with framework.name_scope('regularization'):
for param, grad in parameters_and_grads: for param, grad in parameters_and_grads:
if not repeate_regularizer and param.regularizer is not None and regularization is not None: if (
not repeate_regularizer
and param.regularizer is not None
and regularization is not None
):
repeate_regularizer = True repeate_regularizer = True
logging.info( logging.info(
"If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. " "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. "
"The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
% regularization.__str__()) % regularization.__str__()
)
with param.block.program._optimized_guard([param, grad]): with param.block.program._optimized_guard([param, grad]):
new_grad = self._create_regularization_of_grad( new_grad = self._create_regularization_of_grad(
param, grad, regularization) param, grad, regularization
)
params_and_grads.append((param, new_grad)) params_and_grads.append((param, new_grad))
return params_and_grads return params_and_grads
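A hedged sketch of the precedence rule the warning above describes: a regularizer attached to a parameter through ``ParamAttr`` wins, and the optimizer-level ``weight_decay`` is only applied to parameters that have none of their own.

    import paddle
    from paddle.regularizer import L1Decay, L2Decay

    linear = paddle.nn.Linear(
        10, 10,
        weight_attr=paddle.ParamAttr(regularizer=L1Decay(1e-4)),  # per-parameter regularizer wins
        bias_attr=paddle.ParamAttr(regularizer=None),             # falls back to weight_decay below
    )
    sgd = paddle.optimizer.SGD(
        learning_rate=0.1,
        parameters=linear.parameters(),
        weight_decay=L2Decay(1e-4),  # applied only to parameters without their own regularizer
    )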
...@@ -1114,7 +1282,8 @@ class Optimizer(object): ...@@ -1114,7 +1282,8 @@ class Optimizer(object):
no_grad_set = _get_no_grad_set_name(no_grad_set) no_grad_set = _get_no_grad_set_name(no_grad_set)
parameters = loss.block.program.global_block().all_parameters() parameters = loss.block.program.global_block().all_parameters()
param_no_trainable = set( param_no_trainable = set(
[param.name for param in parameters if param.stop_gradient is True]) [param.name for param in parameters if param.stop_gradient is True]
)
# If the parameter is no trainable, it should not have a gradient. # If the parameter is no trainable, it should not have a gradient.
no_grad_set.update(param_no_trainable) no_grad_set.update(param_no_trainable)
...@@ -1128,13 +1297,13 @@ class Optimizer(object): ...@@ -1128,13 +1297,13 @@ class Optimizer(object):
If not, new gradient will accumulate on previous gradient. If not, new gradient will accumulate on previous gradient.
There are two methods to clear grad: set_to_zero or delete grad. There are two methods to clear grad: set_to_zero or delete grad.
Args: Args:
set_to_zero (bool, optional): Whether to set grads to zero (True) or delete them (False). Default is True. set_to_zero (bool, optional): Whether to set grads to zero (True) or delete them (False). Default is True.
Returns: Returns:
None None
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1145,7 +1314,7 @@ class Optimizer(object): ...@@ -1145,7 +1314,7 @@ class Optimizer(object):
a = paddle.to_tensor(value) a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5) linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph. # This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01, adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters()) parameters = linear.parameters())
out = linear(a) out = linear(a)
out.backward() out.backward()
...@@ -1155,7 +1324,8 @@ class Optimizer(object): ...@@ -1155,7 +1324,8 @@ class Optimizer(object):
""" """
param_list = [] param_list = []
if self._parameter_list is None or not isinstance( if self._parameter_list is None or not isinstance(
self._parameter_list[0], dict): self._parameter_list[0], dict
):
for p in self._parameter_list: for p in self._parameter_list:
if not p.stop_gradient: if not p.stop_gradient:
param_list.append(p) param_list.append(p)
...@@ -1172,11 +1342,9 @@ class Optimizer(object): ...@@ -1172,11 +1342,9 @@ class Optimizer(object):
core.clear_gradients(param_list, set_to_zero) core.clear_gradients(param_list, set_to_zero)
@imperative_base.no_grad @imperative_base.no_grad
def minimize(self, def minimize(
loss, self, loss, startup_program=None, parameters=None, no_grad_set=None
startup_program=None, ):
parameters=None,
no_grad_set=None):
""" """
Add operations to minimize ``loss`` by updating ``parameters``. Add operations to minimize ``loss`` by updating ``parameters``.
...@@ -1195,13 +1363,13 @@ class Optimizer(object): ...@@ -1195,13 +1363,13 @@ class Optimizer(object):
tuple: tuple (optimize_ops, params_grads), A list of operators appended tuple: tuple (optimize_ops, params_grads), A list of operators appended
by minimize and a list of (param, grad) tensor pairs, param is by minimize and a list of (param, grad) tensor pairs, param is
``Parameter``, grad is the gradient value corresponding to the parameter. ``Parameter``, grad is the gradient value corresponding to the parameter.
In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
indicate program pruning. If so, the program will be pruned by ``feed`` and indicate program pruning. If so, the program will be pruned by ``feed`` and
``fetch_list`` before run, see details in ``Executor``. ``fetch_list`` before run, see details in ``Executor``.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
linear = paddle.nn.Linear(10, 10) linear = paddle.nn.Linear(10, 10)
input = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1) input = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
...@@ -1221,17 +1389,18 @@ class Optimizer(object): ...@@ -1221,17 +1389,18 @@ class Optimizer(object):
""" """
assert isinstance(loss, Variable), "The loss should be a Tensor." assert isinstance(loss, Variable), "The loss should be a Tensor."
parameter_list = parameters if parameters \ parameter_list = parameters if parameters else self._parameter_list
else self._parameter_list
params_grads = self.backward(loss, params_grads = self.backward(
startup_program=startup_program, loss,
parameters=parameter_list, startup_program=startup_program,
no_grad_set=no_grad_set) parameters=parameter_list,
no_grad_set=no_grad_set,
)
optimize_ops = self._apply_optimize(loss, optimize_ops = self._apply_optimize(
startup_program=startup_program, loss, startup_program=startup_program, params_grads=params_grads
params_grads=params_grads) )
return optimize_ops, params_grads return optimize_ops, params_grads
...@@ -1240,7 +1409,7 @@ class Optimizer(object): ...@@ -1240,7 +1409,7 @@ class Optimizer(object):
def step(self): def step(self):
""" """
Execute the optimizer and update parameters once. Execute the optimizer and update parameters once.
Returns: Returns:
None None
...@@ -1254,7 +1423,7 @@ class Optimizer(object): ...@@ -1254,7 +1423,7 @@ class Optimizer(object):
a = paddle.to_tensor(value) a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5) linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph. # This can be any optimizer supported by dygraph.
adam = paddle.optimizer.Adam(learning_rate = 0.01, adam = paddle.optimizer.Adam(learning_rate = 0.01,
parameters = linear.parameters()) parameters = linear.parameters())
out = linear(a) out = linear(a)
out.backward() out.backward()
...@@ -1271,13 +1440,16 @@ class Optimizer(object): ...@@ -1271,13 +1440,16 @@ class Optimizer(object):
grad_var = param._grad_ivar() grad_var = param._grad_ivar()
params_grads.append((param, grad_var)) params_grads.append((param, grad_var))
self._apply_optimize(loss=None, self._apply_optimize(
startup_program=None, loss=None,
params_grads=params_grads) startup_program=None,
params_grads=params_grads,
param_group_idx=0,
)
else: else:
# optimize parameters in groups # optimize parameters in groups
for param_group in self._param_groups: for idx, param_group in enumerate(self._param_groups):
params_grads = defaultdict(lambda: list()) params_grads = defaultdict(lambda: list())
for param in param_group['params']: for param in param_group['params']:
if param.stop_gradient: if param.stop_gradient:
...@@ -1286,11 +1458,14 @@ class Optimizer(object): ...@@ -1286,11 +1458,14 @@ class Optimizer(object):
grad_var = param._grad_ivar() grad_var = param._grad_ivar()
params_grads['params'].append((param, grad_var)) params_grads['params'].append((param, grad_var))
params_grads.update( params_grads.update(
{k: v {k: v for k, v in param_group.items() if k != 'params'}
for k, v in param_group.items() if k != 'params'}) )
self._apply_optimize(loss=None, self._apply_optimize(
startup_program=None, loss=None,
params_grads=params_grads) startup_program=None,
params_grads=params_grads,
param_group_idx=idx,
)
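This loop is where the fix in this PR surfaces: one ``_apply_optimize`` call per group, each carrying its ``param_group_idx``. A hedged end-to-end sketch of the case being repaired, assuming the ``use_multi_tensor`` flag is available on ``Momentum`` in this release:

    import paddle

    linear_1 = paddle.nn.Linear(10, 10)
    linear_2 = paddle.nn.Linear(10, 1)
    momentum = paddle.optimizer.Momentum(
        learning_rate=0.1,
        momentum=0.9,
        parameters=[
            {'params': linear_1.parameters()},
            {'params': linear_2.parameters(), 'learning_rate': 0.01},
        ],
        use_multi_tensor=True,  # the fused path whose per-group state this PR fixes
    )
    out = linear_2(linear_1(paddle.uniform([4, 10])))
    loss = paddle.mean(out)
    loss.backward()
    momentum.step()        # one _apply_optimize call per group, now with param_group_idx
    momentum.clear_grad()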
def _add_param_group(self, param_group): def _add_param_group(self, param_group):
""" """
...@@ -1306,7 +1481,8 @@ class Optimizer(object): ...@@ -1306,7 +1481,8 @@ class Optimizer(object):
elif isinstance(params, set): elif isinstance(params, set):
raise TypeError( raise TypeError(
"optimizer parameters should be in ordered collections," "optimizer parameters should be in ordered collections,"
"but received set, please use list instead.") "but received set, please use list instead."
)
else: else:
param_group['params'] = list(params) param_group['params'] = list(params)
...@@ -1320,18 +1496,21 @@ class Optimizer(object): ...@@ -1320,18 +1496,21 @@ class Optimizer(object):
if not param_set.isdisjoint(set(param_group['params'])): if not param_set.isdisjoint(set(param_group['params'])):
raise ValueError( raise ValueError(
"some parameters appear in more than one parameter group") "some parameters appear in more than one parameter group"
)
for param in param_group['params']: for param in param_group['params']:
weight_decay = param_group['weight_decay'] weight_decay = param_group['weight_decay']
if isinstance(weight_decay, float): if isinstance(weight_decay, float):
from ..fluid.regularizer import L2Decay from ..fluid.regularizer import L2Decay
regularization = L2Decay(weight_decay) regularization = L2Decay(weight_decay)
else: else:
regularization = weight_decay regularization = weight_decay
param.regularizer = regularization param.regularizer = regularization
param.optimize_attr['learning_rate'] = param_group.get( param.optimize_attr['learning_rate'] = param_group.get(
'learning_rate', 1.) 'learning_rate', 1.0
)
self._param_groups.append(param_group) self._param_groups.append(param_group)
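A hedged usage sketch of what this method records per group: a float ``weight_decay`` is wrapped in ``L2Decay``, and the group's ``learning_rate`` is kept as a scale on the optimizer's base learning rate.

    import paddle

    linear_1 = paddle.nn.Linear(10, 10)
    linear_2 = paddle.nn.Linear(10, 10)
    adam = paddle.optimizer.Adam(
        learning_rate=0.1,
        parameters=[
            {'params': linear_1.parameters()},      # inherits weight_decay=0.01 below
            {
                'params': linear_2.parameters(),
                'weight_decay': 0.001,              # float -> wrapped in L2Decay(0.001)
                'learning_rate': 0.1,               # scales the base 0.1 to an effective 0.01
            },
        ],
        weight_decay=0.01,
    )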
...@@ -1345,7 +1524,7 @@ class Optimizer(object): ...@@ -1345,7 +1524,7 @@ class Optimizer(object):
pass pass
@framework.dygraph_only @framework.dygraph_only
def _multi_tensor_init(self, target_block, parameters): def _multi_tensor_init(self, target_block, parameters, param_group_idx):
""" """
All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32). All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32).
This function will be overridden in the corresponding optimizer file. This function will be overridden in the corresponding optimizer file.
...@@ -1357,9 +1536,10 @@ class Optimizer(object): ...@@ -1357,9 +1536,10 @@ class Optimizer(object):
pass pass
@framework.dygraph_only @framework.dygraph_only
def _append_optimize_multi_tensor_op(self, target_block, def _append_optimize_multi_tensor_op(
parameters_and_grads): self, target_block, parameters_and_grads, param_group_idx
""" ):
"""
For Multi Tensor, append optimize merged_operator to block. For Multi Tensor, append optimize merged_operator to block.
""" """
pass pass
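Both hooks now also receive ``param_group_idx``. A hypothetical, self-contained illustration of why (the names below are made up, not Adam's or Momentum's real attributes): keeping one float16/float32 bucket per group index stops a second parameter group from overwriting the lists built for the first.

    from collections import defaultdict

    # One bucket of dtype-grouped parameter names per parameter-group index.
    buckets = defaultdict(lambda: {'float32': [], 'float16': []})

    def fake_multi_tensor_init(param_group_idx, parameters):
        bucket = buckets[param_group_idx]
        for name, dtype in parameters:
            bucket[dtype].append(name)
        return bucket

    fake_multi_tensor_init(0, [('linear_0.w_0', 'float32')])
    fake_multi_tensor_init(1, [('linear_1.w_0', 'float16')])
    print(buckets[0], buckets[1])   # the two groups no longer share one bucket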