diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 7a0930ddde010caf9e43d5d596bdd49ead4e7908..050a25c47838a1df21ccf285292b2f5124a3b83b 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -546,8 +546,13 @@ PYBIND11_MODULE(core_noavx, m) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("cudnn_version", &platform::CudnnVersion);
+  m.def("gpu_memory_available", []() {
+    size_t available = 0;
+    size_t total = 0;
+    paddle::platform::GpuMemoryUsage(&available, &total);
+    return available;
+  });
 #endif
-
 #ifdef PADDLE_WITH_NCCL
   m.def("nccl_version", &GetNCCLVersion);
 #endif
 
diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py
index ddde3e66c56dc7e2c0b5a3763b2ca91ed2af936c..006287752839dde7c598a16a2230b50e4f03bbb8 100644
--- a/python/paddle/fluid/dygraph/amp/auto_cast.py
+++ b/python/paddle/fluid/dygraph/amp/auto_cast.py
@@ -119,10 +119,7 @@ def _in_amp_guard():
 
 
 @dygraph_only
-def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
-    if not enable_pure_fp16:
-        return models, optimizers
-
+def pure_fp16_initialize(models):
     for idx in range(len(models)):
         for layer in models[idx].sublayers(include_self=True):
             layer._casted_by_pure_fp16 = True
@@ -132,43 +129,7 @@ def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
                               paddle.nn.BatchNorm, paddle.nn.LayerNorm)):
                 continue
             layer.to(dtype='float16')
-
-    for idx_opt in range(len(optimizers)):
-        # update _param_groups
-        if getattr(optimizers[idx_opt], '_param_groups', None) and isinstance(
-                optimizers[idx_opt]._param_groups[0], dict):
-            for param_group in optimizers[idx_opt]._param_groups:
-                for i, param in enumerate(param_group['params']):
-                    for idx_model in range(len(models)):
-                        for layer in models[idx_model].sublayers(
-                                include_self=True):
-                            if id(param) in layer._parameters_transform_map:
-                                param_group['params'][
-                                    i] = layer._parameters_transform_map[id(
-                                        param)][0]
-            for param_group in optimizers[idx_opt]._parameter_list:
-                params = param_group['params']
-                for i, param in enumerate(params):
-                    for idx_model in range(len(models)):
-                        for layer in models[idx_model].sublayers(
-                                include_self=True):
-                            if id(param) in layer._parameters_transform_map:
-                                params[i] = layer._parameters_transform_map[id(
-                                    param)][0]
-        # update _parameter_list
-        else:
-            for i, param in enumerate(optimizers[idx_opt]._parameter_list):
-                for idx_model in range(len(models)):
-                    for layer in models[idx_model].sublayers(include_self=True):
-                        if id(param) in layer._parameters_transform_map:
-                            optimizers[idx_opt]._parameter_list[
-                                i] = layer._parameters_transform_map[id(param)][
-                                    0]
-                            if hasattr(optimizers[idx_opt], '_param_groups'):
-                                optimizers[idx_opt]._param_groups[
-                                    i] = layer._parameters_transform_map[id(
-                                        param)][0]
-    return models, optimizers
+    return models
 
 
 def check_models(models):
@@ -401,8 +362,7 @@ def amp_decorate(models,
             "optimizers must be either a single optimizer or a list of optimizers."
         )
-    models, optimizers = pure_fp16_initialize(
-        enable_pure_fp16=True, models=models, optimizers=optimizers)
+    models = pure_fp16_initialize(models=models)
 
     # supprot master_weight
     for idx_opt in range(len(optimizers)):
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 6120cc7c6adaeae95446731bdfc4ccdcdfe858bb..8bf8300c8a26344dc33e2677ab5128c04679c722 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -121,9 +121,6 @@ class Layer(core.Layer):
         self._forward_pre_hooks = collections.OrderedDict()
         self._forward_post_hooks = collections.OrderedDict()
 
-        self._parameters_transform_map = {}
-        self._buffers_transform_map = {}
-
         self._casted_by_pure_fp16 = False
 
         self._state_dict_hooks = collections.OrderedDict()
@@ -1473,29 +1470,14 @@ class Layer(core.Layer):
             if param is not None:
                 with no_grad():
                     param_applied = func(param, device, dtype, blocking)
-                    assert param.is_leaf
-                    param_applied.stop_gradient = param.stop_gradient
-                    if hasattr(param_applied, 'is_distributed'):
-                        param_applied.is_distributed = param.is_distributed
-                    self._parameters[key] = param_applied
 
                 if param.grad is not None:
                     with no_grad():
                         grad_applied = func(param._grad_ivar(), device, dtype,
                                             blocking)
 
-                        grad_applied.stop_gradient = param._grad_ivar(
-                        ).stop_gradient
-                        if hasattr(param._grad_ivar(), 'is_distributed'):
-                            grad_applied.is_distributed = param._grad_ivar(
-                            ).is_distributed
-                        self._parameters[key]._set_grad_ivar(grad_applied)
-
-                self._parameters_transform_map[id(param)] = [param_applied, key]
-
         for key, buf in self._buffers.items():
             self._buffers[key] = func(buf, device, dtype, blocking)
-            self._buffers_transform_map[id(buf)] = [self._buffers[key], key]
 
     def to(self, device=None, dtype=None, blocking=None):
         '''
@@ -1574,22 +1556,59 @@ class Layer(core.Layer):
             if dtype is None:
                 dtype = t.dtype
 
-            new_t = t._copy_to(device, blocking)
-            if isinstance(t, framework.ParamBase):
-                if dtype is not None and dtype != t.dtype:
+            # 1. gpu place needs to determine whether the memory is sufficient for allocation:
+            if t.place.is_gpu_place():
+                gpu_memory_available = core.gpu_memory_available()
+                # for gpu, minimum memory allocation unit is 256 bytes.
+                if type(dtype) is str:
+                    size_dtype = core.size_of_dtype(
+                        convert_np_dtype_to_dtype_(dtype))
+                else:
+                    size_dtype = core.size_of_dtype(dtype)
+                # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will compute the memory space occupied by 't'.
+                # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
+                waiting_alloc_memory = (
+                    (t.numel().numpy()[0] * size_dtype) / 256 + 1) * 256 * 1.2
+                if gpu_memory_available < waiting_alloc_memory:
+                    # Copy param / Tensor to cpu
+                    t_used = t._copy_to(paddle.CPUPlace(),
+                                        blocking)  # k-v type will error
+                    # Release mem of t
+                    t.value().get_tensor()._clear()
+                else:
+                    t_used = t
+            else:
+                t_used = t
+
+            # 2. cast param / Tensor to dtype
+            if dtype is not None and dtype != t_used.dtype:
+                if isinstance(t_used, framework.ParamBase):
+                    from paddle.fluid.layer_helper import LayerHelper
+                    helper = LayerHelper("cast", **locals())
+                    t_casted = helper.create_variable_for_type_inference(
+                        dtype=dtype)
                     framework._dygraph_tracer().trace_op(
                         type='cast',
-                        inputs={'X': new_t},
-                        outputs={'Out': new_t},
+                        inputs={'X': t_used},
+                        outputs={'Out': t_casted},
                         attrs={
-                            'in_dtype': t.dtype,
+                            'in_dtype': t_used.dtype,
                             'out_dtype': convert_np_dtype_to_dtype_(dtype)
                         })
+                else:
+                    t_casted = t_used.cast(dtype=dtype)
             else:
-                if dtype is not None and dtype != t.dtype:
-                    new_t = new_t.cast(dtype=dtype)
+                t_casted = t_used
+
+            # 3. Copy casted cpu param / Tensor to device
+            new_t = t_casted._copy_to(device, blocking)
+
+            # 4. share the Tensor with the original param / Tensor
+            dst_tensor = t.value().get_tensor()
+            src_tensor = new_t.value().get_tensor()
+            dst_tensor._share_data_with(src_tensor)
 
-            return new_t
+            return t
 
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", category=UserWarning)
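
For reference, a minimal sketch (not part of the patch) of querying the new gpu_memory_available binding from Python. The binding simply forwards paddle::platform::GpuMemoryUsage and returns the available byte count on the current GPU; it is only registered on CUDA/HIP builds, so the call is guarded below.

import paddle
from paddle.fluid import core

if paddle.is_compiled_with_cuda():
    # Free bytes reported by the binding added in pybind.cc above.
    free_bytes = core.gpu_memory_available()
    print("free GPU memory: %.1f MiB" % (free_bytes / (1 << 20)))
else:
    print("gpu_memory_available is only exposed on CUDA/HIP builds")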
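
The headroom check in transform() is plain arithmetic: the raw byte size of t is padded by one 256-byte allocation chunk (Paddle's minimum GPU allocation unit) and then scaled by a 1.2 safety factor before being compared with the available GPU memory. A self-contained sketch of that estimate, with the helper name estimate_alloc_bytes being illustrative only:

def estimate_alloc_bytes(numel, dtype_size, chunk=256, safety=1.2):
    # Same formula as waiting_alloc_memory above:
    # (raw_bytes / 256 + 1) * 256 * 1.2 == (raw_bytes + 256) * 1.2
    return ((numel * dtype_size) / chunk + 1) * chunk * safety

# A float32 tensor with 1,000,000 elements (~4 MB raw):
print(estimate_alloc_bytes(10**6, 4))  # ~4800307.2 bytes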
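
Because transform() now shares the converted storage back into the original tensor with _share_data_with and returns t, Layer.to preserves the identity of every ParamBase, which is why the _parameters_transform_map bookkeeping and the optimizer re-wiring in pure_fp16_initialize could be removed. A hedged usage sketch of that behaviour; the Linear/SGD combination is illustrative only:

import paddle

model = paddle.nn.Linear(4, 4)
opt = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())
ids_before = [id(p) for p in model.parameters()]

# Cast in place: the parameters' storage is replaced, not the Python objects,
# so the optimizer keeps referencing valid parameters.
model.to(dtype='float16')

ids_after = [id(p) for p in model.parameters()]
assert ids_before == ids_after
print([p.dtype for p in model.parameters()])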