From a4b9daf97c9ea0091009f81442b6c6e07f09e2ca Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Mon, 28 Dec 2020 10:28:23 +0800
Subject: [PATCH] fix optimizer dtype (#29917)

---
 python/paddle/fluid/optimizer.py                 | 11 ++++--
 .../fluid/tests/unittests/test_optimizer.py      | 35 ++++++++++++++++++-
 python/paddle/optimizer/adam.py                  |  1 -
 python/paddle/optimizer/adamw.py                 |  1 -
 python/paddle/optimizer/optimizer.py             | 13 +++++--
 5 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 97c50adf4a..e9d48d8562 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -108,8 +108,12 @@ class Optimizer(object):
         self.regularization = regularization
         self._grad_clip = grad_clip
         self._learning_rate = learning_rate
-        # the learning rate type should be inferenced from loss
+
         self._dtype = None
+        # Infer the dtype from parameters
+        if self._parameter_list:
+            self._dtype = self._parameter_list[0].dtype
+
         # each program should have a independent learning rate
         # program -> Variable(learning_rate)
         self._learning_rate_map = dict()
@@ -768,7 +772,10 @@ class Optimizer(object):
         else:
             act_no_grad_set = self._get_no_grad_set(loss, no_grad_set)

-        self._dtype = loss.dtype
+        # Infer dtype by loss if None
+        if self._dtype is None:
+            self._dtype = loss.dtype
+
         if framework.in_dygraph_mode():
             parameter_list = parameter_list if parameter_list \
                 else self._parameter_list
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
index 91d7052233..369a5bdae0 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -23,7 +23,8 @@ import paddle.fluid.core as core
 import paddle.compat as cpt
 import numpy as np
 from paddle.fluid.backward import append_backward
-from paddle.fluid.framework import Program, program_guard
+from paddle.fluid.framework import Program, program_guard, convert_np_dtype_to_dtype_
+import paddle


 class TestOptimizer(unittest.TestCase):
@@ -1042,5 +1043,37 @@ class TestGradientMergeOptimizer(unittest.TestCase):
                          ['sgd', 'sgd'])


+class TestOptimizerDtype(unittest.TestCase):
+    '''
+    The dtype of optimizer should be inferred by parameters, and the learning rate
+    is created with the same dtype.
+    '''
+
+    def check_with_dtype(self, dtype):
+        class MyLayer(paddle.nn.Layer):
+            def __init__(self, dtype):
+                super(MyLayer, self).__init__()
+                self._w = self.create_parameter([2, 3], dtype=dtype)
+                self._b = self.create_parameter([2, 3], dtype=dtype)
+
+            def forward(self, x):
+                return x * self._w + self._b
+
+        with paddle.fluid.dygraph.guard():
+            model = MyLayer(dtype)
+            x = paddle.rand([10, 2, 3], dtype=dtype)
+            loss = model(x)
+            adam = paddle.optimizer.Adam(parameters=model.parameters())
+            loss.backward()
+            adam.step()
+            self.assertEqual(adam._dtype, convert_np_dtype_to_dtype_(dtype))
+
+    def test_float64(self):
+        self.check_with_dtype('float64')
+
+    def test_float32(self):
+        self.check_with_dtype('float32')
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index 910c9b185d..c51c00f4a7 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -270,7 +270,6 @@ class Adam(Optimizer):
                 adam.step()
                 adam.clear_grad()
         """
-        self._dtype = None
         params_grads = []
         for param in self._parameter_list:
             if not param.trainable:
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index 2aa7fa115e..5f74282017 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -210,7 +210,6 @@ class AdamW(Adam):
     @framework.dygraph_only
     @imperative_base.no_grad
     def step(self):
-        self._dtype = None
         params_grads = []
         for param in self._parameter_list:
             if not param.trainable:
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 1cfc0b66e7..212dad7c77 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -132,8 +132,12 @@ class Optimizer(object):
         self.regularization = weight_decay
         self._grad_clip = grad_clip
         self._learning_rate = learning_rate
-        # the learning rate type should be inferenced from loss
+
         self._dtype = None
+        # Infer the dtype from parameters
+        if self._parameter_list:
+            self._dtype = self._parameter_list[0].dtype
+
         # each program should have a independent learning rate
         # program -> tensor(learning_rate)
         self._learning_rate_map = dict()
@@ -675,7 +679,10 @@ class Optimizer(object):
         else:
             act_no_grad_set = self._get_no_grad_set(loss, no_grad_set)

-        self._dtype = loss.dtype
+        # Infer dtype by loss if None
+        if self._dtype is None:
+            self._dtype = loss.dtype
+
         if framework.in_dygraph_mode():
             parameter_list = parameters if parameters \
                 else self._parameter_list
@@ -885,6 +892,7 @@ class Optimizer(object):

         return optimize_ops, params_grads

+    @imperative_base.no_grad
     @framework.dygraph_only
     def step(self):
         """
@@ -910,7 +918,6 @@ class Optimizer(object):
                 adam.step()
                 adam.clear_grad()
         """
-        self._dtype = None
         params_grads = []
         for param in self._parameter_list:
             if not param.trainable:
--
GitLab
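Illustration only, not part of the patch: a minimal, self-contained sketch of the dtype-resolution order the change introduces. The optimizer dtype is taken from the parameter list when one is available, and backward() only falls back to loss.dtype when the dtype is still unset. The names FakeParam and infer_optimizer_dtype are hypothetical and exist only for this sketch.

    from collections import namedtuple

    # Stand-in for a framework parameter; only the dtype attribute matters here.
    FakeParam = namedtuple("FakeParam", ["dtype"])

    def infer_optimizer_dtype(parameter_list, loss_dtype):
        # Mirrors Optimizer.__init__ after the patch: prefer the first parameter's dtype.
        if parameter_list:
            return parameter_list[0].dtype
        # Mirrors Optimizer.backward(): fall back to the loss dtype only if still unset.
        return loss_dtype

    # Parameters win over the loss dtype; with no parameters, the loss dtype is used.
    assert infer_optimizer_dtype([FakeParam("float64")], "float32") == "float64"
    assert infer_optimizer_dtype([], "float32") == "float32"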