未验证 提交 993ec76a 编写于 作者: Z zhangbo9674 提交者: GitHub

Refine param conversion logic in layer.to (#36862)

* refine layer to

* delete comment

* refine logic

* refine code

* refine pure_fp16_init

* refine comment
上级 aac00f6a
......@@ -546,8 +546,13 @@ PYBIND11_MODULE(core_noavx, m) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("cudnn_version", &platform::CudnnVersion);
m.def("gpu_memory_available", []() {
size_t available = 0;
size_t total = 0;
paddle::platform::GpuMemoryUsage(&available, &total);
return available;
});
#endif
#ifdef PADDLE_WITH_NCCL
m.def("nccl_version", &GetNCCLVersion);
#endif
......
......@@ -119,10 +119,7 @@ def _in_amp_guard():
@dygraph_only
def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
if not enable_pure_fp16:
return models, optimizers
def pure_fp16_initialize(models):
for idx in range(len(models)):
for layer in models[idx].sublayers(include_self=True):
layer._casted_by_pure_fp16 = True
......@@ -132,43 +129,7 @@ def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
paddle.nn.BatchNorm, paddle.nn.LayerNorm)):
continue
layer.to(dtype='float16')
for idx_opt in range(len(optimizers)):
# update _param_groups
if getattr(optimizers[idx_opt], '_param_groups', None) and isinstance(
optimizers[idx_opt]._param_groups[0], dict):
for param_group in optimizers[idx_opt]._param_groups:
for i, param in enumerate(param_group['params']):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(
include_self=True):
if id(param) in layer._parameters_transform_map:
param_group['params'][
i] = layer._parameters_transform_map[id(
param)][0]
for param_group in optimizers[idx_opt]._parameter_list:
params = param_group['params']
for i, param in enumerate(params):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(
include_self=True):
if id(param) in layer._parameters_transform_map:
params[i] = layer._parameters_transform_map[id(
param)][0]
# update _parameter_list
else:
for i, param in enumerate(optimizers[idx_opt]._parameter_list):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(include_self=True):
if id(param) in layer._parameters_transform_map:
optimizers[idx_opt]._parameter_list[
i] = layer._parameters_transform_map[id(param)][
0]
if hasattr(optimizers[idx_opt], '_param_groups'):
optimizers[idx_opt]._param_groups[
i] = layer._parameters_transform_map[id(
param)][0]
return models, optimizers
return models
def check_models(models):
......@@ -401,8 +362,7 @@ def amp_decorate(models,
"optimizers must be either a single optimizer or a list of optimizers."
)
models, optimizers = pure_fp16_initialize(
enable_pure_fp16=True, models=models, optimizers=optimizers)
models = pure_fp16_initialize(models=models)
# support master_weight
for idx_opt in range(len(optimizers)):
......
......@@ -121,9 +121,6 @@ class Layer(core.Layer):
self._forward_pre_hooks = collections.OrderedDict()
self._forward_post_hooks = collections.OrderedDict()
self._parameters_transform_map = {}
self._buffers_transform_map = {}
self._casted_by_pure_fp16 = False
self._state_dict_hooks = collections.OrderedDict()
......@@ -1473,29 +1470,14 @@ class Layer(core.Layer):
if param is not None:
with no_grad():
param_applied = func(param, device, dtype, blocking)
assert param.is_leaf
param_applied.stop_gradient = param.stop_gradient
if hasattr(param_applied, 'is_distributed'):
param_applied.is_distributed = param.is_distributed
self._parameters[key] = param_applied
if param.grad is not None:
with no_grad():
grad_applied = func(param._grad_ivar(), device, dtype,
blocking)
grad_applied.stop_gradient = param._grad_ivar(
).stop_gradient
if hasattr(param._grad_ivar(), 'is_distributed'):
grad_applied.is_distributed = param._grad_ivar(
).is_distributed
self._parameters[key]._set_grad_ivar(grad_applied)
self._parameters_transform_map[id(param)] = [param_applied, key]
for key, buf in self._buffers.items():
self._buffers[key] = func(buf, device, dtype, blocking)
self._buffers_transform_map[id(buf)] = [self._buffers[key], key]
def to(self, device=None, dtype=None, blocking=None):
'''
......@@ -1574,22 +1556,59 @@ class Layer(core.Layer):
if dtype is None:
dtype = t.dtype
new_t = t._copy_to(device, blocking)
if isinstance(t, framework.ParamBase):
if dtype is not None and dtype != t.dtype:
# 1. For a GPU place, first determine whether the available memory is sufficient for the allocation:
if t.place.is_gpu_place():
gpu_memory_available = core.gpu_memory_available()
# for gpu, minimum memory allocation unit is 256 bytes.
if type(dtype) is str:
size_dtype = core.size_of_dtype(
convert_np_dtype_to_dtype_(dtype))
else:
size_dtype = core.size_of_dtype(dtype)
# Note(zhangbo): Paddle's minimum GPU memory allocation unit is 256 bytes; waiting_alloc_memory computes the memory space occupied by 't'.
# The coefficient 1.2 is used to avoid an OOM that may occur in the critical state where the memory is only just enough.
waiting_alloc_memory = (
(t.numel().numpy()[0] * size_dtype) / 256 + 1) * 256 * 1.2
if gpu_memory_available < waiting_alloc_memory:
# Copy param / Tensor to cpu
t_used = t._copy_to(paddle.CPUPlace(),
blocking) # k-v type will error
# Release mem of t
t.value().get_tensor()._clear()
else:
t_used = t
else:
t_used = t
# 2. cast param / Tensor to dtype
if dtype is not None and dtype != t_used.dtype:
if isinstance(t_used, framework.ParamBase):
from paddle.fluid.layer_helper import LayerHelper
helper = LayerHelper("cast", **locals())
t_casted = helper.create_variable_for_type_inference(
dtype=dtype)
framework._dygraph_tracer().trace_op(
type='cast',
inputs={'X': new_t},
outputs={'Out': new_t},
inputs={'X': t_used},
outputs={'Out': t_casted},
attrs={
'in_dtype': t.dtype,
'in_dtype': t_used.dtype,
'out_dtype': convert_np_dtype_to_dtype_(dtype)
})
else:
if dtype is not None and dtype != t.dtype:
new_t = new_t.cast(dtype=dtype)
t_casted = t_used.cast(dtype=dtype)
else:
t_casted = t_used
# 3. Copy casted cpu param / Tensor to device
new_t = t_casted._copy_to(device, blocking)
# 4. share Tensor to origin param / Tensor
dst_tensor = t.value().get_tensor()
src_tensor = new_t.value().get_tensor()
dst_tensor._share_data_with(src_tensor)
return new_t
return t
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=UserWarning)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册