From f80cee11e866ed629eceeba51ad7da00c6c5a16a Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Thu, 23 Jun 2022 14:11:14 +0800
Subject: [PATCH] add float_only for layer_to (#43760)

---
 python/paddle/fluid/dygraph/amp/auto_cast.py  |   4 +-
 python/paddle/fluid/dygraph/layers.py         | 109 ++++++++++--------
 .../test_imperative_auto_mixed_precision.py   |   8 ++
 3 files changed, 70 insertions(+), 51 deletions(-)

diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py
index f441a35ca0f..12ddf1a0f8e 100644
--- a/python/paddle/fluid/dygraph/amp/auto_cast.py
+++ b/python/paddle/fluid/dygraph/amp/auto_cast.py
@@ -173,7 +173,9 @@ def pure_fp16_initialize(models):
                             paddle.nn.BatchNorm2D, paddle.nn.BatchNorm3D,
                             paddle.nn.LayerNorm, paddle.nn.SyncBatchNorm)):
                 continue
-            layer._to_impl(dtype='float16', include_sublayers=False)
+            layer._to_impl(dtype='float16',
+                           include_sublayers=False,
+                           floating_only=True)
     return models


diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 490c6a1ca76..3a1def85c6e 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -1576,7 +1576,8 @@ class Layer(object):
         return self._to_impl(device=device,
                              dtype=dtype,
                              blocking=blocking,
-                             include_sublayers=True)
+                             include_sublayers=True,
+                             floating_only=False)
 
     def _apply(self, func, device, dtype, blocking, include_sublayers=True):
         if include_sublayers:
@@ -1599,11 +1600,62 @@ class Layer(object):
 
         self._dtype = dtype
 
+    def _transform(self, t, device, dtype, blocking):
+        if device is None:
+            device = t.place
+        if dtype is None:
+            dtype = t.dtype
+
+        if type(dtype) is not VarDesc.VarType:
+            dtype = convert_np_dtype_to_dtype_(dtype)
+
+        # 1. gpu place need to determine whether the memory is sufficient for allocation:
+        if t.place.is_gpu_place():
+            # for gpu, minimum memory allocation unit is 256 bytes.
+            size_dtype = core.size_of_dtype(dtype)
+            # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will compute ‘t’ occupied memory space.
+            # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
+            waiting_alloc_memory = (
+                (np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
+            gpu_memory_available = core.gpu_memory_available()
+            if gpu_memory_available < waiting_alloc_memory:
+                # Copy param / Tensor to cpu
+                t_used = t._copy_to(paddle.CPUPlace(),
+                                    blocking)  # k-v type will error
+                # Release mem of t
+                t.value().get_tensor()._clear()
+            else:
+                t_used = t
+        else:
+            t_used = t
+
+        # 2. cast param / Tensor to dtype
+        if dtype is not None and dtype != t_used.dtype:
+            with paddle.fluid.framework._dygraph_place_guard(
+                    place=t_used.place):
+                t_casted = t_used.cast(dtype=dtype)
+        else:
+            t_casted = t_used
+
+        # 3. Copy casted cpu param / Tensor to device
+        if device is not None and not t_casted.place._equals(device):
+            new_t = t_casted._copy_to(device, blocking)
+        else:
+            new_t = t_casted
+
+        # 4. share Tensor to origin param / Tensor
+        dst_tensor = t.value().get_tensor()
+        src_tensor = new_t.value().get_tensor()
+        dst_tensor._share_data_with(src_tensor)
+
+        return t
+
     def _to_impl(self,
                  device=None,
                  dtype=None,
                  blocking=None,
-                 include_sublayers=True):
+                 include_sublayers=True,
+                 floating_only=False):
         '''
         Cast the parameters and buffers of Layer by the give device, dtype and blocking.
 
@@ -1619,6 +1671,8 @@ class Layer(object):
 
             include_sublayers(bool|True, optional): If True, deal with self and all sublayers parameters and buffers, if not only deal with self parameters and buffers. Default: True.
 
+            floating_only(bool|False, optional): If True, only cast all floating point parameters and buffers of Layer by the give device, dtype and blocking.
+
         Returns:
             self
 
@@ -1646,54 +1700,9 @@ class Layer(object):
                 bool), "blocking value error, must be the True, False or None"
 
         def transform(t, device, dtype, blocking):
-            if device is None:
-                device = t.place
-            if dtype is None:
-                dtype = t.dtype
-
-            if type(dtype) is not VarDesc.VarType:
-                dtype = convert_np_dtype_to_dtype_(dtype)
-
-            # 1. gpu place need to determine whether the memory is sufficient for allocation:
-            if t.place.is_gpu_place():
-                # for gpu, minimum memory allocation unit is 256 bytes.
-                size_dtype = core.size_of_dtype(dtype)
-                # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will comput ‘t’ occupied memory space.
-                # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
-                waiting_alloc_memory = (
-                    (np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
-                gpu_memory_available = core.gpu_memory_available()
-                if gpu_memory_available < waiting_alloc_memory:
-                    # Copy param / Tensor to cpu
-                    t_used = t._copy_to(paddle.CPUPlace(),
-                                        blocking)  # k-v type will error
-                    # Release mem of t
-                    t.value().get_tensor()._clear()
-                else:
-                    t_used = t
-            else:
-                t_used = t
-
-            # 2. cast param / Tensor to dtype
-            if dtype is not None and dtype != t_used.dtype:
-                with paddle.fluid.framework._dygraph_place_guard(
-                        place=t_used.place):
-                    t_casted = t_used.cast(dtype=dtype)
-            else:
-                t_casted = t_used
-
-            # 3. Copy casted cpu param / Tensor to device
-            if device is not None and not t_casted.place._equals(device):
-                new_t = t_casted._copy_to(device, blocking)
-            else:
-                new_t = t_casted
-
-            # 4. share Tensor to origin param / Tensor
-            dst_tensor = t.value().get_tensor()
-            src_tensor = new_t.value().get_tensor()
-            dst_tensor._share_data_with(src_tensor)
-
-            return t
+            if floating_only and (not paddle.is_floating_point(t)):
+                return t
+            return self._transform(t, device, dtype, blocking)
 
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", category=UserWarning)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
index 0b2df9885ab..6a5ddd3157b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
@@ -707,6 +707,14 @@ class TestAmpDecorator(unittest.TestCase):
         for param in model.parameters():
             self.assertEqual((param.dtype == paddle.float32), True)
 
+    def test_floating_only(self):
+        model = paddle.nn.Linear(2, 4)
+        buffer = paddle.to_tensor(np.array([5]).astype("int32"))
+        model.register_buffer("buffer_name", buffer, persistable=True)
+        model = paddle.amp.decorate(models=model, level='O2')
+        self.assertEqual((model._buffers["buffer_name"].dtype == paddle.int32),
+                         True)
+
 
 class TestStateDictHookForAMP(unittest.TestCase):
 
-- 
GitLab
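
Usage sketch. The new test above pins down the user-visible behavior of this patch: under paddle.amp.decorate(..., level='O2'), floating-point parameters of layers such as Linear are cast to float16 (norm layers are skipped), while non-floating-point buffers keep their dtype, because pure_fp16_initialize now calls _to_impl(..., floating_only=True). The snippet below is a minimal illustration of that behavior, not part of the patch; the variable names are illustrative, and it assumes a CUDA-enabled Paddle build that contains this change.

    import numpy as np
    import paddle

    # A layer with a float parameter and a non-floating-point buffer.
    model = paddle.nn.Linear(2, 4)
    step = paddle.to_tensor(np.array([5]).astype("int32"))
    model.register_buffer("step", step, persistable=True)

    # O2 decoration routes through layer._to_impl(dtype='float16',
    # include_sublayers=False, floating_only=True), so only floating-point
    # tensors are cast.
    model = paddle.amp.decorate(models=model, level='O2')

    print(model.weight.dtype)            # expected: paddle.float16
    print(model._buffers["step"].dtype)  # expected: still paddle.int32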