未验证 提交 f80cee11 编写于 作者: Z zhangbo9674 提交者: GitHub

add float_only for layer_to (#43760)

上级 178b2440
......@@ -173,7 +173,9 @@ def pure_fp16_initialize(models):
paddle.nn.BatchNorm2D, paddle.nn.BatchNorm3D,
paddle.nn.LayerNorm, paddle.nn.SyncBatchNorm)):
continue
layer._to_impl(dtype='float16', include_sublayers=False)
layer._to_impl(dtype='float16',
include_sublayers=False,
floating_only=True)
return models
......
......@@ -1576,7 +1576,8 @@ class Layer(object):
return self._to_impl(device=device,
dtype=dtype,
blocking=blocking,
include_sublayers=True)
include_sublayers=True,
floating_only=False)
def _apply(self, func, device, dtype, blocking, include_sublayers=True):
if include_sublayers:
......@@ -1599,11 +1600,62 @@ class Layer(object):
self._dtype = dtype
def _transform(self, t, device, dtype, blocking):
    """Cast/move one parameter or buffer ``t`` to ``device``/``dtype``.

    The converted data is shared back into ``t``'s underlying tensor, so
    callers keep holding the same Python object. Returns ``t``.
    """
    # Fall back to the tensor's current place / dtype when unspecified.
    target_device = t.place if device is None else device
    target_dtype = t.dtype if dtype is None else dtype
    if type(target_dtype) is not VarDesc.VarType:
        target_dtype = convert_np_dtype_to_dtype_(target_dtype)

    # 1. For a GPU-resident tensor, check whether device memory can hold
    #    the converted copy; if not, stage the data on CPU first.
    staged = t
    if t.place.is_gpu_place():
        # Paddle GPU allocations are rounded up to 256-byte units; the
        # 1.2 factor leaves headroom to avoid OOM when memory is just
        # barely enough.
        elem_size = core.size_of_dtype(target_dtype)
        needed = ((np.prod(t.shape) * elem_size) / 256 + 1) * 256 * 1.2
        if core.gpu_memory_available() < needed:
            # Copy param / Tensor to cpu (k-v type will error).
            staged = t._copy_to(paddle.CPUPlace(), blocking)
            # Release the original GPU storage before casting.
            t.value().get_tensor()._clear()

    # 2. Cast to the requested dtype on whatever place ``staged`` occupies.
    if target_dtype is not None and target_dtype != staged.dtype:
        with paddle.fluid.framework._dygraph_place_guard(place=staged.place):
            casted = staged.cast(dtype=target_dtype)
    else:
        casted = staged

    # 3. Move the (possibly CPU-staged) result onto the target device.
    if target_device is not None and not casted.place._equals(target_device):
        placed = casted._copy_to(target_device, blocking)
    else:
        placed = casted

    # 4. Share the converted storage back into the original param / Tensor.
    t.value().get_tensor()._share_data_with(placed.value().get_tensor())
    return t
def _to_impl(self,
device=None,
dtype=None,
blocking=None,
include_sublayers=True):
include_sublayers=True,
floating_only=False):
'''
Cast the parameters and buffers of Layer by the give device, dtype and blocking.
......@@ -1619,6 +1671,8 @@ class Layer(object):
include_sublayers(bool|True, optional): If True, deal with self and all sublayers parameters and buffers, if not only deal with self parameters and buffers. Default: True.
floating_only(bool|False, optional): If True, only cast all floating point parameters and buffers of Layer by the give device, dtype and blocking.
Returns:
self
......@@ -1646,54 +1700,9 @@ class Layer(object):
bool), "blocking value error, must be the True, False or None"
def transform(t, device, dtype, blocking):
    # In floating_only mode, non-floating params/buffers (e.g. int32
    # counters) pass through untouched; everything else is delegated
    # to the shared Layer._transform implementation.
    if floating_only and not paddle.is_floating_point(t):
        return t
    return self._transform(t, device, dtype, blocking)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=UserWarning)
......
......@@ -707,6 +707,14 @@ class TestAmpDecorator(unittest.TestCase):
for param in model.parameters():
self.assertEqual((param.dtype == paddle.float32), True)
def test_floating_only(self):
    # O2 decoration casts floating-point data to fp16 but must leave a
    # registered non-floating (int32) buffer with its original dtype.
    linear = paddle.nn.Linear(2, 4)
    int_buffer = paddle.to_tensor(np.array([5]).astype("int32"))
    linear.register_buffer("buffer_name", int_buffer, persistable=True)
    decorated = paddle.amp.decorate(models=linear, level='O2')
    self.assertEqual(
        (decorated._buffers["buffer_name"].dtype == paddle.int32), True)
class TestStateDictHookForAMP(unittest.TestCase):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册