未验证 提交 f80cee11 编写于 作者: Z zhangbo9674 提交者: GitHub

add float_only for layer_to (#43760)

上级 178b2440
......@@ -173,7 +173,9 @@ def pure_fp16_initialize(models):
paddle.nn.BatchNorm2D, paddle.nn.BatchNorm3D,
paddle.nn.LayerNorm, paddle.nn.SyncBatchNorm)):
continue
layer._to_impl(dtype='float16', include_sublayers=False)
layer._to_impl(dtype='float16',
include_sublayers=False,
floating_only=True)
return models
......
......@@ -1576,7 +1576,8 @@ class Layer(object):
return self._to_impl(device=device,
dtype=dtype,
blocking=blocking,
include_sublayers=True)
include_sublayers=True,
floating_only=False)
def _apply(self, func, device, dtype, blocking, include_sublayers=True):
if include_sublayers:
......@@ -1599,11 +1600,62 @@ class Layer(object):
self._dtype = dtype
def _transform(self, t, device, dtype, blocking):
    """Cast/move one parameter or buffer ``t`` to ``device``/``dtype``.

    The converted data is shared back into ``t``'s underlying tensor, so
    callers keep holding the same Python object. Returns ``t``.
    """
    # Fall back to the tensor's current place / dtype when unspecified.
    target_device = t.place if device is None else device
    target_dtype = t.dtype if dtype is None else dtype
    if type(target_dtype) is not VarDesc.VarType:
        target_dtype = convert_np_dtype_to_dtype_(target_dtype)

    # 1. For a GPU-resident tensor, check whether device memory can hold
    #    the converted copy; if not, stage the data on CPU first.
    staged = t
    if t.place.is_gpu_place():
        # Paddle GPU allocations are rounded up to 256-byte units; the
        # 1.2 factor leaves headroom to avoid OOM when memory is just
        # barely enough.
        elem_size = core.size_of_dtype(target_dtype)
        needed = ((np.prod(t.shape) * elem_size) / 256 + 1) * 256 * 1.2
        if core.gpu_memory_available() < needed:
            # Copy param / Tensor to cpu (k-v type will error).
            staged = t._copy_to(paddle.CPUPlace(), blocking)
            # Release the original GPU storage before casting.
            t.value().get_tensor()._clear()

    # 2. Cast to the requested dtype on whatever place ``staged`` occupies.
    if target_dtype is not None and target_dtype != staged.dtype:
        with paddle.fluid.framework._dygraph_place_guard(place=staged.place):
            casted = staged.cast(dtype=target_dtype)
    else:
        casted = staged

    # 3. Move the (possibly CPU-staged) result onto the target device.
    if target_device is not None and not casted.place._equals(target_device):
        placed = casted._copy_to(target_device, blocking)
    else:
        placed = casted

    # 4. Share the converted storage back into the original param / Tensor.
    t.value().get_tensor()._share_data_with(placed.value().get_tensor())
    return t
def _to_impl(self,
device=None,
dtype=None,
blocking=None,
include_sublayers=True):
include_sublayers=True,
floating_only=False):
'''
Cast the parameters and buffers of Layer by the give device, dtype and blocking.
......@@ -1619,6 +1671,8 @@ class Layer(object):
include_sublayers(bool|True, optional): If True, deal with self and all sublayers parameters and buffers, if not only deal with self parameters and buffers. Default: True.
floating_only(bool|False, optional): If True, only cast all floating point parameters and buffers of Layer by the give device, dtype and blocking.
Returns:
self
......@@ -1646,54 +1700,9 @@ class Layer(object):
bool), "blocking value error, must be the True, False or None"
def transform(t, device, dtype, blocking):
    # In floating_only mode, non-floating params/buffers (e.g. int32
    # counters) pass through untouched; everything else is delegated
    # to the shared Layer._transform implementation.
    if floating_only and not paddle.is_floating_point(t):
        return t
    return self._transform(t, device, dtype, blocking)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=UserWarning)
......
......@@ -707,6 +707,14 @@ class TestAmpDecorator(unittest.TestCase):
for param in model.parameters():
self.assertEqual((param.dtype == paddle.float32), True)
def test_floating_only(self):
    # O2 decoration casts floating-point data to fp16 but must leave a
    # registered non-floating (int32) buffer with its original dtype.
    linear = paddle.nn.Linear(2, 4)
    int_buffer = paddle.to_tensor(np.array([5]).astype("int32"))
    linear.register_buffer("buffer_name", int_buffer, persistable=True)
    decorated = paddle.amp.decorate(models=linear, level='O2')
    self.assertEqual(
        (decorated._buffers["buffer_name"].dtype == paddle.int32), True)
class TestStateDictHookForAMP(unittest.TestCase):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册