diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 97c50adf4a7829d7303bb6bc158c7674d752d01b..e9d48d8562927d9c77ec59366e927912492f00dc 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -108,8 +108,12 @@ class Optimizer(object):
         self.regularization = regularization
         self._grad_clip = grad_clip
         self._learning_rate = learning_rate
-        # the learning rate type should be inferenced from loss
+
         self._dtype = None
+        # Infer the dtype from parameters
+        if self._parameter_list:
+            self._dtype = self._parameter_list[0].dtype
+
         # each program should have a independent learning rate
         # program -> Variable(learning_rate)
         self._learning_rate_map = dict()
@@ -768,7 +772,10 @@ class Optimizer(object):
         else:
             act_no_grad_set = self._get_no_grad_set(loss, no_grad_set)
 
-        self._dtype = loss.dtype
+        # Infer dtype from loss if it is still None
+        if self._dtype is None:
+            self._dtype = loss.dtype
+
         if framework.in_dygraph_mode():
             parameter_list = parameter_list if parameter_list \
                 else self._parameter_list
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
index 91d705223316360b8c05954259724a5f7d246440..369a5bdae046f26d6cc0e20205701965e9918421 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -23,7 +23,8 @@ import paddle.fluid.core as core
 import paddle.compat as cpt
 import numpy as np
 from paddle.fluid.backward import append_backward
-from paddle.fluid.framework import Program, program_guard
+from paddle.fluid.framework import Program, program_guard, convert_np_dtype_to_dtype_
+import paddle
 
 
 class TestOptimizer(unittest.TestCase):
@@ -1042,5 +1043,37 @@ class TestGradientMergeOptimizer(unittest.TestCase):
                          ['sgd', 'sgd'])
 
 
+class TestOptimizerDtype(unittest.TestCase):
+    '''
+    The dtype of the optimizer should be inferred from the parameters, and the
+    learning rate is created with the same dtype.
+    '''
+
+    def check_with_dtype(self, dtype):
+        class MyLayer(paddle.nn.Layer):
+            def __init__(self, dtype):
+                super(MyLayer, self).__init__()
+                self._w = self.create_parameter([2, 3], dtype=dtype)
+                self._b = self.create_parameter([2, 3], dtype=dtype)
+
+            def forward(self, x):
+                return x * self._w + self._b
+
+        with paddle.fluid.dygraph.guard():
+            model = MyLayer(dtype)
+            x = paddle.rand([10, 2, 3], dtype=dtype)
+            loss = model(x)
+            adam = paddle.optimizer.Adam(parameters=model.parameters())
+            loss.backward()
+            adam.step()
+            self.assertEqual(adam._dtype, convert_np_dtype_to_dtype_(dtype))
+
+    def test_float64(self):
+        self.check_with_dtype('float64')
+
+    def test_float32(self):
+        self.check_with_dtype('float32')
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index 910c9b185dbaab017ea900676c372af2e344d561..c51c00f4a716dbe908b28fac29e6ad8064d33076 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -270,7 +270,6 @@ class Adam(Optimizer):
                 adam.step()
                 adam.clear_grad()
         """
-        self._dtype = None
         params_grads = []
         for param in self._parameter_list:
             if not param.trainable:
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index 2aa7fa115ec2efdcfe3765bca5916696e830a84a..5f742820178ceed84a403dd04eb13343269e00a9 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -210,7 +210,6 @@ class AdamW(Adam):
     @framework.dygraph_only
     @imperative_base.no_grad
     def step(self):
-        self._dtype = None
         params_grads = []
         for param in self._parameter_list:
             if not param.trainable:
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 1cfc0b66e7b671429b4dbbb17a3e586931a6a6e7..212dad7c77cb4f8ecd88d79187bc4a1c8b40a478 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -132,8 +132,12 @@ class Optimizer(object):
         self.regularization = weight_decay
         self._grad_clip = grad_clip
         self._learning_rate = learning_rate
-        # the learning rate type should be inferenced from loss
+
         self._dtype = None
+        # Infer the dtype from parameters
+        if self._parameter_list:
+            self._dtype = self._parameter_list[0].dtype
+
         # each program should have a independent learning rate
         # program -> tensor(learning_rate)
         self._learning_rate_map = dict()
@@ -675,7 +679,10 @@ class Optimizer(object):
         else:
             act_no_grad_set = self._get_no_grad_set(loss, no_grad_set)
 
-        self._dtype = loss.dtype
+        # Infer dtype from loss if it is still None
+        if self._dtype is None:
+            self._dtype = loss.dtype
+
         if framework.in_dygraph_mode():
             parameter_list = parameters if parameters \
                 else self._parameter_list
@@ -885,6 +892,7 @@ class Optimizer(object):
 
         return optimize_ops, params_grads
 
+    @imperative_base.no_grad
     @framework.dygraph_only
     def step(self):
         """
@@ -910,7 +918,6 @@ class Optimizer(object):
                 adam.step()
                 adam.clear_grad()
         """
-        self._dtype = None
         params_grads = []
         for param in self._parameter_list:
             if not param.trainable:
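
Note: a minimal dygraph sketch of the behavior exercised by the new TestOptimizerDtype test, assuming the public paddle 2.x API already used above (paddle.set_default_dtype, paddle.nn.Linear and paddle.optimizer.Adam are standard calls; _dtype is the private attribute this patch now fills from the parameters at construction time instead of resetting it to None in step()).

import paddle

paddle.disable_static()              # dygraph mode, as in the new unit test
paddle.set_default_dtype('float64')  # parameters below are created as float64

linear = paddle.nn.Linear(2, 2)
adam = paddle.optimizer.Adam(parameters=linear.parameters())

# With this patch the optimizer dtype follows the parameters' dtype as soon as
# the optimizer is built, so its learning-rate variable uses the same dtype.
print(adam._dtype)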