# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. #Taken and modified for fairscale from: # https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/misc/param_bucket.py #Commit: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e import os import time import numpy as np import paddle import paddle.fluid as fluid from paddle.fluid import core from ..meta_parallel.sharding.sharding_utils import Type, device_guard class InternalStorage: """ This is a basic class, which is responsible for consolidating the basic storage tensor. """ # Support integration parameter tensor def __init__(self, size, dtype, device, convert_cpu=False): self._params = [] self._param_ids = [] self._fill = 0 self._device = device self._dtype = dtype # The actual flat tensor size = [size] if isinstance(size, int) else size if convert_cpu: value = np.zeros( size, dtype=np.float16) if Type.fp16.value == dtype else np.zeros( size, dtype=np.float32) self.buffer = core.VarBase(value=value, place=core.CPUPlace()) else: self.buffer = paddle.zeros(size, dtype=dtype) def to(self, device, dtype=None, keep_alignment=True): """ Move the underlying buffer """ assert self.buffer is not None, "Cannot move a collapsed bucket, please rebuild it" assert (dtype == Type.fp32.value or Type.fp16.value), "Conversion type is not supported now" dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() .split(":")[1]) if self._device != device: tmp_buffer = self.buffer.cuda( dev_id) if device == "gpu" else self.buffer.cpu() for param in self._params: param.clear_gradient(False) param._gradient_set_empty(False) self.buffer.value().get_tensor()._clear() self.buffer = tmp_buffer self._device = device if dtype is not None: self.buffer = self.buffer.cast(dtype=dtype) self._dtype = dtype class ParamStorage(InternalStorage): """ This is a basic class to simplify the handling of parameter InternalStorages. """ def __init__(self, size, dtype, device): super().__init__(size, dtype, device, convert_cpu=True) self.param2align = None def to(self, device, dtype=None, keep_alignment=True): """ Move the underlying buffer """ super().to(device, dtype) if keep_alignment: self._array_params() @fluid.dygraph.no_grad def add_rank_params(self, trainable_params, param2align, convert_gpu=True): """ Add new parameters to the InternalStorage. Params becomes a view of this InternalStorage buffer. """ assert all([ id(param) not in self._param_ids for param in trainable_params ]), "The same param cannot be checked in twice" assert self.buffer is not None self.param2align = param2align cpu_param_shape = list() for param in trainable_params: p_shape = self._add_param_as_view(param, param2align[param.name], convert_gpu) cpu_param_shape.append(p_shape) if convert_gpu: # buffer convert from cpu to cuda dev_id = int(paddle.get_device().split(":")[1]) self.buffer = self.buffer.cuda(dev_id) self._fill = 0 for idx, param in enumerate(trainable_params): self._convert_buffer(param, cpu_param_shape[idx], param2align[param.name]) self._params.append(param) self._param_ids.append(id(param)) @fluid.dygraph.no_grad def _add_param_as_view(self, param, align, convert_gpu=True): assert ( param.dtype == self.buffer.dtype ), "Different types for the InternalStorage and the param, cannot proceed: {} - {}".format( param.dtype, self.buffer.dtype) var_end = self._fill + np.prod(param.shape) offset = var_end + align assert offset <= np.prod(self.buffer.shape) p_shape = param.shape origin_state = param.stop_gradient param.stop_gradient = True param.flatten_() param.stop_gradient = origin_state # Copy the current param value dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() .split(":")[1]) with device_guard(dev_id, "cpu"): tmp_var = core.VarBase(tensor=self.buffer._slice(self._fill, var_end)) if convert_gpu: param_cpu = param.cpu() param.value().get_tensor()._clear() tmp_var.set_value(param_cpu) else: tmp_var.set_value(param) self._fill = offset return p_shape @fluid.dygraph.no_grad def _convert_buffer(self, param, p_shape, align): var_end = self._fill + np.prod(p_shape) offset = var_end + align assert offset <= np.prod(self.buffer.shape) # Convert the param value tmp_tensor = self.buffer._slice(self._fill, var_end) param.value().get_tensor()._share_data_with(tmp_tensor) param.value().get_tensor()._set_dims(p_shape) self._fill = offset @fluid.dygraph.no_grad def _array_params(self): """ Given the parameters which have been registered previously, rebuild the whole InternalStorage. """ assert len(self._params) > 0 assert self.param2align is not None self._fill = 0 for p in self._params: self._convert_buffer(p, p.shape, self.param2align[p.name]) # modify class GradStorage(InternalStorage): """ This is a basic class to simplify the handling of gradient InternalStorages """ def __init__(self, size, dtype, device, destination, parm2align, convert_cpu=False): if isinstance(size, np.int64): size = size.tolist() super().__init__(size, dtype, device, convert_cpu) self._max_size = size self._release = False self.params_checked_in = 0 self.destination = destination self._parm2align = parm2align self.sent = False def reset_checked_in(self): """ Reset the counter of the parameter grads which have been checked in """ self.params_checked_in = 0 self.sent = False @property def all_checked_in(self): """ Judge all the expected gradient check-in happened """ return len(self._params) == self.params_checked_in def can_add_grad_view(self, param, align): """ Is there enough InternalStorage to add this parameter gradient, and whether this param have already checked in. """ return self._fill + np.prod( param.shape) + align <= self._max_size and id( param) not in self._param_ids def to(self, device, dtype=None, keep_alignment=True): """ Move the underlying buffer """ if self._release: self.rebuild() super().to(device, dtype) if keep_alignment: self._array_grads() @fluid.dygraph.no_grad def add_grad(self, param, align): """ Add a new parameter gradient to the InternalStorage. Param.grad becomes a view of this InternalStorage buffer. """ assert id( param ) not in self._param_ids, "The same gradients cannot be checked in twice" self._add_grad_as_view(param, align) self._params.append(param) self._param_ids.append(id(param)) @fluid.dygraph.no_grad def manumal_relase(self): """ Release the buffer from InternalStorage. The InternalStorage will need to be rebuilt before use. """ if not self._release: for p in self._params: if p.grad is not None: p.clear_gradient(False) p._gradient_set_empty(False) self.buffer = None self._fill = 0 self.params_checked_in = 0 self._release = True @fluid.dygraph.no_grad def rebuild(self): """ Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage. """ if self._release: self.buffer = paddle.zeros([self._max_size], dtype=self._dtype) for p in self._params: self._add_grad_as_view(p, self._parm2align[p.name]) self._release = False @fluid.dygraph.no_grad def _array_grads(self): """ Given the parameters gradients which have been registered previously, rebuild the whole InternalStorage. """ if len(self._params) > 0: self._fill = 0 for p in self._params: self._add_grad_as_view(p, self._parm2align[p.name]) @fluid.dygraph.no_grad def _add_grad_as_view(self, param, align): assert np.prod( self.buffer.shape ) > 0, "Cannot add a gradient to a released InternalStorage, please rebuild" assert param.dtype == self.buffer.dtype grad_end = self._fill + np.prod(param.shape) offset = grad_end + align assert offset <= np.prod(self.buffer.shape) # Copy the current grad value to InternalStorage dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() .split(":")[1]) if self._device == "cpu": with device_guard(dev_id, self._device): tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end)) param._copy_gradient_from(tmp_var) tmp_var.value().get_tensor()._clear() elif self._device == "gpu": tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end)) param._copy_gradient_from(tmp_var) tmp_var.value().get_tensor()._clear() self._fill = offset