From f80cee11e866ed629eceeba51ad7da00c6c5a16a Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Thu, 23 Jun 2022 14:11:14 +0800
Subject: [PATCH] add float_only for layer_to (#43760)

---
 python/paddle/fluid/dygraph/amp/auto_cast.py  |   4 +-
 python/paddle/fluid/dygraph/layers.py         | 109 ++++++++++--------
 .../test_imperative_auto_mixed_precision.py   |   8 ++
 3 files changed, 70 insertions(+), 51 deletions(-)

diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py
index f441a35ca0f..12ddf1a0f8e 100644
--- a/python/paddle/fluid/dygraph/amp/auto_cast.py
+++ b/python/paddle/fluid/dygraph/amp/auto_cast.py
@@ -173,7 +173,9 @@ def pure_fp16_initialize(models):
                             paddle.nn.BatchNorm2D, paddle.nn.BatchNorm3D,
                             paddle.nn.LayerNorm, paddle.nn.SyncBatchNorm)):
                 continue
-            layer._to_impl(dtype='float16', include_sublayers=False)
+            layer._to_impl(dtype='float16',
+                           include_sublayers=False,
+                           floating_only=True)
     return models


diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 490c6a1ca76..3a1def85c6e 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -1576,7 +1576,8 @@ class Layer(object):
         return self._to_impl(device=device,
                              dtype=dtype,
                              blocking=blocking,
-                             include_sublayers=True)
+                             include_sublayers=True,
+                             floating_only=False)
 
     def _apply(self, func, device, dtype, blocking, include_sublayers=True):
         if include_sublayers:
@@ -1599,11 +1600,62 @@ class Layer(object):
 
         self._dtype = dtype
 
+    def _transform(self, t, device, dtype, blocking):
+        if device is None:
+            device = t.place
+        if dtype is None:
+            dtype = t.dtype
+
+        if type(dtype) is not VarDesc.VarType:
+            dtype = convert_np_dtype_to_dtype_(dtype)
+
+        # 1. gpu place need to determine whether the memory is sufficient for allocation:
+        if t.place.is_gpu_place():
+            # for gpu, minimum memory allocation unit is 256 bytes.
+            size_dtype = core.size_of_dtype(dtype)
+            # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will compute ‘t’ occupied memory space.
+            # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
+            waiting_alloc_memory = (
+                (np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
+            gpu_memory_available = core.gpu_memory_available()
+            if gpu_memory_available < waiting_alloc_memory:
+                # Copy param / Tensor to cpu
+                t_used = t._copy_to(paddle.CPUPlace(),
+                                    blocking)  # k-v type will error
+                # Release mem of t
+                t.value().get_tensor()._clear()
+            else:
+                t_used = t
+        else:
+            t_used = t
+
+        # 2. cast param / Tensor to dtype
+        if dtype is not None and dtype != t_used.dtype:
+            with paddle.fluid.framework._dygraph_place_guard(
+                    place=t_used.place):
+                t_casted = t_used.cast(dtype=dtype)
+        else:
+            t_casted = t_used
+
+        # 3. Copy casted cpu param / Tensor to device
+        if device is not None and not t_casted.place._equals(device):
+            new_t = t_casted._copy_to(device, blocking)
+        else:
+            new_t = t_casted
+
+        # 4. share Tensor to origin param / Tensor
+        dst_tensor = t.value().get_tensor()
+        src_tensor = new_t.value().get_tensor()
+        dst_tensor._share_data_with(src_tensor)
+
+        return t
+
     def _to_impl(self,
                  device=None,
                  dtype=None,
                  blocking=None,
-                 include_sublayers=True):
+                 include_sublayers=True,
+                 floating_only=False):
         '''
         Cast the parameters and buffers of Layer by the give device, dtype and blocking.
 
@@ -1619,6 +1671,8 @@ class Layer(object):
 
             include_sublayers(bool|True, optional): If True, deal with self and all sublayers parameters and buffers, if not only deal with self parameters and buffers. Default: True.
 
+            floating_only(bool|False, optional): If True, only cast all floating point parameters and buffers of Layer by the give device, dtype and blocking.
+
         Returns:
             self
 
@@ -1646,54 +1700,9 @@ class Layer(object):
                 bool), "blocking value error, must be the True, False or None"
 
         def transform(t, device, dtype, blocking):
-            if device is None:
-                device = t.place
-            if dtype is None:
-                dtype = t.dtype
-
-            if type(dtype) is not VarDesc.VarType:
-                dtype = convert_np_dtype_to_dtype_(dtype)
-
-            # 1. gpu place need to determine whether the memory is sufficient for allocation:
-            if t.place.is_gpu_place():
-                # for gpu, minimum memory allocation unit is 256 bytes.
-                size_dtype = core.size_of_dtype(dtype)
-                # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will comput ‘t’ occupied memory space.
-                # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
-                waiting_alloc_memory = (
-                    (np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
-                gpu_memory_available = core.gpu_memory_available()
-                if gpu_memory_available < waiting_alloc_memory:
-                    # Copy param / Tensor to cpu
-                    t_used = t._copy_to(paddle.CPUPlace(),
-                                        blocking)  # k-v type will error
-                    # Release mem of t
-                    t.value().get_tensor()._clear()
-                else:
-                    t_used = t
-            else:
-                t_used = t
-
-            # 2. cast param / Tensor to dtype
-            if dtype is not None and dtype != t_used.dtype:
-                with paddle.fluid.framework._dygraph_place_guard(
-                        place=t_used.place):
-                    t_casted = t_used.cast(dtype=dtype)
-            else:
-                t_casted = t_used
-
-            # 3. Copy casted cpu param / Tensor to device
-            if device is not None and not t_casted.place._equals(device):
-                new_t = t_casted._copy_to(device, blocking)
-            else:
-                new_t = t_casted
-
-            # 4. share Tensor to origin param / Tensor
-            dst_tensor = t.value().get_tensor()
-            src_tensor = new_t.value().get_tensor()
-            dst_tensor._share_data_with(src_tensor)
-
-            return t
+            if floating_only and (not paddle.is_floating_point(t)):
+                return t
+            return self._transform(t, device, dtype, blocking)
 
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", category=UserWarning)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
index 0b2df9885ab..6a5ddd3157b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
@@ -707,6 +707,14 @@ class TestAmpDecorator(unittest.TestCase):
         for param in model.parameters():
             self.assertEqual((param.dtype == paddle.float32), True)
 
+    def test_floating_only(self):
+        model = paddle.nn.Linear(2, 4)
+        buffer = paddle.to_tensor(np.array([5]).astype("int32"))
+        model.register_buffer("buffer_name", buffer, persistable=True)
+        model = paddle.amp.decorate(models=model, level='O2')
+        self.assertEqual((model._buffers["buffer_name"].dtype == paddle.int32),
+                         True)
+
 
 class TestStateDictHookForAMP(unittest.TestCase):
 
-- 
GitLab
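
Usage sketch. The new test above pins down the user-visible behavior of this patch: under paddle.amp.decorate(..., level='O2'), floating-point parameters of layers such as Linear are cast to float16 (norm layers are skipped), while non-floating-point buffers keep their dtype, because pure_fp16_initialize now calls _to_impl(..., floating_only=True). The snippet below is a minimal illustration of that behavior, not part of the patch; the variable names are illustrative, and it assumes a CUDA-enabled Paddle build that contains this change.

    import numpy as np
    import paddle

    # A layer with a float parameter and a non-floating-point buffer.
    model = paddle.nn.Linear(2, 4)
    step = paddle.to_tensor(np.array([5]).astype("int32"))
    model.register_buffer("step", step, persistable=True)

    # O2 decoration routes through layer._to_impl(dtype='float16',
    # include_sublayers=False, floating_only=True), so only floating-point
    # tensors are cast.
    model = paddle.amp.decorate(models=model, level='O2')

    print(model.weight.dtype)            # expected: paddle.float16
    print(model._buffers["step"].dtype)  # expected: still paddle.int32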