未验证 提交 0cf3e8f9 编写于 作者: Z zhangbo9674 提交者: GitHub

[cherry pick] Refine param conversion logic in layer.to (#38068)

优化layer.to实现逻辑,相关pr:
Remove additional warning in layer.to ( #36700)
Refine param conversion logic in layer.to ( #36862)
Fix Layer.to() of device bug ( #37156)
上级 e37a9f13
...@@ -527,8 +527,13 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -527,8 +527,13 @@ PYBIND11_MODULE(core_noavx, m) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("cudnn_version", &platform::CudnnVersion); m.def("cudnn_version", &platform::CudnnVersion);
m.def("gpu_memory_available", []() {
size_t available = 0;
size_t total = 0;
paddle::platform::GpuMemoryUsage(&available, &total);
return available;
});
#endif #endif
#ifdef PADDLE_WITH_NCCL #ifdef PADDLE_WITH_NCCL
m.def("nccl_version", &GetNCCLVersion); m.def("nccl_version", &GetNCCLVersion);
#endif #endif
......
...@@ -119,10 +119,7 @@ def _in_amp_guard(): ...@@ -119,10 +119,7 @@ def _in_amp_guard():
@dygraph_only @dygraph_only
def pure_fp16_initialize(enable_pure_fp16, models, optimizers): def pure_fp16_initialize(models):
if not enable_pure_fp16:
return models, optimizers
for idx in range(len(models)): for idx in range(len(models)):
for layer in models[idx].sublayers(include_self=True): for layer in models[idx].sublayers(include_self=True):
layer._casted_by_pure_fp16 = True layer._casted_by_pure_fp16 = True
...@@ -132,43 +129,7 @@ def pure_fp16_initialize(enable_pure_fp16, models, optimizers): ...@@ -132,43 +129,7 @@ def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
paddle.nn.BatchNorm, paddle.nn.LayerNorm)): paddle.nn.BatchNorm, paddle.nn.LayerNorm)):
continue continue
layer.to(dtype='float16') layer.to(dtype='float16')
return models
for idx_opt in range(len(optimizers)):
# update _param_groups
if getattr(optimizers[idx_opt], '_param_groups', None) and isinstance(
optimizers[idx_opt]._param_groups[0], dict):
for param_group in optimizers[idx_opt]._param_groups:
for i, param in enumerate(param_group['params']):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(
include_self=True):
if id(param) in layer._parameters_transform_map:
param_group['params'][
i] = layer._parameters_transform_map[id(
param)][0]
for param_group in optimizers[idx_opt]._parameter_list:
params = param_group['params']
for i, param in enumerate(params):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(
include_self=True):
if id(param) in layer._parameters_transform_map:
params[i] = layer._parameters_transform_map[id(
param)][0]
# update _parameter_list
else:
for i, param in enumerate(optimizers[idx_opt]._parameter_list):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(include_self=True):
if id(param) in layer._parameters_transform_map:
optimizers[idx_opt]._parameter_list[
i] = layer._parameters_transform_map[id(param)][
0]
if hasattr(optimizers[idx_opt], '_param_groups'):
optimizers[idx_opt]._param_groups[
i] = layer._parameters_transform_map[id(
param)][0]
return models, optimizers
def check_models(models): def check_models(models):
...@@ -397,8 +358,7 @@ def amp_decorate(models, ...@@ -397,8 +358,7 @@ def amp_decorate(models,
"optimizers must be either a single optimizer or a list of optimizers." "optimizers must be either a single optimizer or a list of optimizers."
) )
models, optimizers = pure_fp16_initialize( models = pure_fp16_initialize(models=models)
enable_pure_fp16=True, models=models, optimizers=optimizers)
# supprot master_weight # supprot master_weight
for idx_opt in range(len(optimizers)): for idx_opt in range(len(optimizers)):
......
...@@ -121,9 +121,6 @@ class Layer(core.Layer): ...@@ -121,9 +121,6 @@ class Layer(core.Layer):
self._forward_pre_hooks = collections.OrderedDict() self._forward_pre_hooks = collections.OrderedDict()
self._forward_post_hooks = collections.OrderedDict() self._forward_post_hooks = collections.OrderedDict()
self._parameters_transform_map = {}
self._buffers_transform_map = {}
self._casted_by_pure_fp16 = False self._casted_by_pure_fp16 = False
self._state_dict_hooks = collections.OrderedDict() self._state_dict_hooks = collections.OrderedDict()
...@@ -1473,24 +1470,14 @@ class Layer(core.Layer): ...@@ -1473,24 +1470,14 @@ class Layer(core.Layer):
if param is not None: if param is not None:
with no_grad(): with no_grad():
param_applied = func(param, device, dtype, blocking) param_applied = func(param, device, dtype, blocking)
assert param.is_leaf
param_applied.stop_gradient = param.stop_gradient
self._parameters[key] = param_applied
if param.grad is not None: if param.grad is not None:
with no_grad(): with no_grad():
grad_applied = func(param._grad_ivar(), device, dtype, grad_applied = func(param._grad_ivar(), device, dtype,
blocking) blocking)
grad_applied.stop_gradient = param._grad_ivar(
).stop_gradient
self._parameters[key]._set_grad_ivar(grad_applied)
self._parameters_transform_map[id(param)] = [param_applied, key]
for key, buf in self._buffers.items(): for key, buf in self._buffers.items():
self._buffers[key] = func(buf, device, dtype, blocking) self._buffers[key] = func(buf, device, dtype, blocking)
self._buffers_transform_map[id(buf)] = [self._buffers[key], key]
def to(self, device=None, dtype=None, blocking=None): def to(self, device=None, dtype=None, blocking=None):
''' '''
...@@ -1568,24 +1555,54 @@ class Layer(core.Layer): ...@@ -1568,24 +1555,54 @@ class Layer(core.Layer):
if dtype is None: if dtype is None:
dtype = t.dtype dtype = t.dtype
new_t = t._copy_to(device, blocking) if type(dtype) is str:
if isinstance(t, framework.ParamBase): dtype = convert_np_dtype_to_dtype_(dtype)
if dtype is not None and dtype != t.dtype:
framework._dygraph_tracer().trace_op( # 1. gpu place need to determine whether the memory is sufficient for allocation:
type='cast', if t.place.is_gpu_place():
inputs={'X': new_t}, # for gpu, minimum memory allocation unit is 256 bytes.
outputs={'Out': new_t}, size_dtype = core.size_of_dtype(dtype)
attrs={ # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will comput ‘t’ occupied memory space.
'in_dtype': t.dtype, # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
'out_dtype': convert_np_dtype_to_dtype_(dtype) waiting_alloc_memory = (
}) (np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
gpu_memory_available = core.gpu_memory_available()
if gpu_memory_available < waiting_alloc_memory:
# Copy param / Tensor to cpu
t_used = t._copy_to(paddle.CPUPlace(),
blocking) # k-v type will error
# Release mem of t
t.value().get_tensor()._clear()
else:
t_used = t
else:
t_used = t
# 2. cast param / Tensor to dtype
if dtype is not None and dtype != t_used.dtype:
with paddle.fluid.framework._dygraph_place_guard(
place=t_used.place):
t_casted = t_used.cast(dtype=dtype)
else:
t_casted = t_used
# 3. Copy casted cpu param / Tensor to device
if device is not None and not t_casted.place._equals(device):
new_t = t_casted._copy_to(device, blocking)
else: else:
if dtype is not None and dtype != t.dtype: new_t = t_casted
new_t = new_t.cast(dtype=dtype)
# 4. share Tensor to origin param / Tensor
dst_tensor = t.value().get_tensor()
src_tensor = new_t.value().get_tensor()
dst_tensor._share_data_with(src_tensor)
return t
return new_t with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=UserWarning)
self._apply(transform, device, dtype, blocking)
self._apply(transform, device, dtype, blocking)
self._dtype = dtype self._dtype = dtype
# [aliases] Compatible with old method names # [aliases] Compatible with old method names
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册