未验证 提交 0cf3e8f9 编写于 作者: Z zhangbo9674 提交者: GitHub

[cherry pick] Refine param conversion logic in layer.to (#38068)

优化layer.to实现逻辑,相关pr:
Remove additional warnning in layer.to ( #36700)
Refine param conversion logic in layer.to ( #36862)
Fix Layer.to() of device bug ( #37156)
上级 e37a9f13
......@@ -527,8 +527,13 @@ PYBIND11_MODULE(core_noavx, m) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("cudnn_version", &platform::CudnnVersion);
m.def("gpu_memory_available", []() {
size_t available = 0;
size_t total = 0;
paddle::platform::GpuMemoryUsage(&available, &total);
return available;
});
#endif
#ifdef PADDLE_WITH_NCCL
m.def("nccl_version", &GetNCCLVersion);
#endif
......
......@@ -119,10 +119,7 @@ def _in_amp_guard():
@dygraph_only
def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
if not enable_pure_fp16:
return models, optimizers
def pure_fp16_initialize(models):
for idx in range(len(models)):
for layer in models[idx].sublayers(include_self=True):
layer._casted_by_pure_fp16 = True
......@@ -132,43 +129,7 @@ def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
paddle.nn.BatchNorm, paddle.nn.LayerNorm)):
continue
layer.to(dtype='float16')
for idx_opt in range(len(optimizers)):
# update _param_groups
if getattr(optimizers[idx_opt], '_param_groups', None) and isinstance(
optimizers[idx_opt]._param_groups[0], dict):
for param_group in optimizers[idx_opt]._param_groups:
for i, param in enumerate(param_group['params']):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(
include_self=True):
if id(param) in layer._parameters_transform_map:
param_group['params'][
i] = layer._parameters_transform_map[id(
param)][0]
for param_group in optimizers[idx_opt]._parameter_list:
params = param_group['params']
for i, param in enumerate(params):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(
include_self=True):
if id(param) in layer._parameters_transform_map:
params[i] = layer._parameters_transform_map[id(
param)][0]
# update _parameter_list
else:
for i, param in enumerate(optimizers[idx_opt]._parameter_list):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(include_self=True):
if id(param) in layer._parameters_transform_map:
optimizers[idx_opt]._parameter_list[
i] = layer._parameters_transform_map[id(param)][
0]
if hasattr(optimizers[idx_opt], '_param_groups'):
optimizers[idx_opt]._param_groups[
i] = layer._parameters_transform_map[id(
param)][0]
return models, optimizers
return models
def check_models(models):
......@@ -397,8 +358,7 @@ def amp_decorate(models,
"optimizers must be either a single optimizer or a list of optimizers."
)
models, optimizers = pure_fp16_initialize(
enable_pure_fp16=True, models=models, optimizers=optimizers)
models = pure_fp16_initialize(models=models)
# supprot master_weight
for idx_opt in range(len(optimizers)):
......
......@@ -121,9 +121,6 @@ class Layer(core.Layer):
self._forward_pre_hooks = collections.OrderedDict()
self._forward_post_hooks = collections.OrderedDict()
self._parameters_transform_map = {}
self._buffers_transform_map = {}
self._casted_by_pure_fp16 = False
self._state_dict_hooks = collections.OrderedDict()
......@@ -1473,24 +1470,14 @@ class Layer(core.Layer):
if param is not None:
with no_grad():
param_applied = func(param, device, dtype, blocking)
assert param.is_leaf
param_applied.stop_gradient = param.stop_gradient
self._parameters[key] = param_applied
if param.grad is not None:
with no_grad():
grad_applied = func(param._grad_ivar(), device, dtype,
blocking)
grad_applied.stop_gradient = param._grad_ivar(
).stop_gradient
self._parameters[key]._set_grad_ivar(grad_applied)
self._parameters_transform_map[id(param)] = [param_applied, key]
for key, buf in self._buffers.items():
self._buffers[key] = func(buf, device, dtype, blocking)
self._buffers_transform_map[id(buf)] = [self._buffers[key], key]
def to(self, device=None, dtype=None, blocking=None):
'''
......@@ -1568,24 +1555,54 @@ class Layer(core.Layer):
if dtype is None:
dtype = t.dtype
new_t = t._copy_to(device, blocking)
if isinstance(t, framework.ParamBase):
if dtype is not None and dtype != t.dtype:
framework._dygraph_tracer().trace_op(
type='cast',
inputs={'X': new_t},
outputs={'Out': new_t},
attrs={
'in_dtype': t.dtype,
'out_dtype': convert_np_dtype_to_dtype_(dtype)
})
if type(dtype) is str:
dtype = convert_np_dtype_to_dtype_(dtype)
# 1. gpu place need to determine whether the memory is sufficient for allocation:
if t.place.is_gpu_place():
# for gpu, minimum memory allocation unit is 256 bytes.
size_dtype = core.size_of_dtype(dtype)
# Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will comput ‘t’ occupied memory space.
# Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
waiting_alloc_memory = (
(np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
gpu_memory_available = core.gpu_memory_available()
if gpu_memory_available < waiting_alloc_memory:
# Copy param / Tensor to cpu
t_used = t._copy_to(paddle.CPUPlace(),
blocking) # k-v type will error
# Release mem of t
t.value().get_tensor()._clear()
else:
t_used = t
else:
t_used = t
# 2. cast param / Tensor to dtype
if dtype is not None and dtype != t_used.dtype:
with paddle.fluid.framework._dygraph_place_guard(
place=t_used.place):
t_casted = t_used.cast(dtype=dtype)
else:
t_casted = t_used
# 3. Copy casted cpu param / Tensor to device
if device is not None and not t_casted.place._equals(device):
new_t = t_casted._copy_to(device, blocking)
else:
if dtype is not None and dtype != t.dtype:
new_t = new_t.cast(dtype=dtype)
new_t = t_casted
# 4. share Tensor to origin param / Tensor
dst_tensor = t.value().get_tensor()
src_tensor = new_t.value().get_tensor()
dst_tensor._share_data_with(src_tensor)
return t
return new_t
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=UserWarning)
self._apply(transform, device, dtype, blocking)
self._apply(transform, device, dtype, blocking)
self._dtype = dtype
# [aliases] Compatible with old method names
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册