Unverified commit 2bbdc47a authored by W wanghuancoder, committed by GitHub

delete old dygraph sharding (#49334)

* delete old dygraph sharding
Parent 2ca3d3f7
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The file has been adapted from fairscale file:
# https://github.com/facebookresearch/fairscale/blob/main/fairscale/optim/oss.py
# Git commit hash: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e
# We retain the following license from the original files:
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import logging
from collections import OrderedDict
import numpy as np
import paddle
import paddle.distributed as dist
from paddle.distributed.collective import _get_global_group, new_group
from paddle.fluid.clip import ClipGradByGlobalNorm
from paddle.framework import core
from paddle.optimizer import Optimizer
from ...meta_parallel.sharding.sharding_utils import (
ShardingClipGrad,
Type,
device_guard,
)
from ...utils.internal_storage import GradStorage, ParamStorage
# CUDA alignment 256 bytes, cpu alignment 4096 bytes
alignment = {"gpu": 256, "cpu": 4096}
align = {
Type.fp16.value: 2,
Type.fp32.value: 4,
}
class ShardingOptimizerStage2(Optimizer):
"""
A wrapper for Sharding Stage2 Optimizer in Dygraph.
.. warning: ShardingOptimizer encapsulates the optimization strategy and integrates it into the optimizer.
.. ZeRO: https://arxiv.org/pdf/1910.02054.pdf
"""
# TODO (Baibaifan)
# Feature Notes:
# 1. Unify the memory of parameters and parameters.grad into InternalStorage.
# 2. Support the segmentation of optimizer parameters and partial updates of parameters.
# 3. Dynamically adjust training parameters and models.
# 4. Support the offload function.
# 5. Support the establishment of independent communication groups.
# 6. Broadcast_fp16 is not supported yet.
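# A minimal usage sketch (an illustration only, mirroring the unit test shipped
# in this commit; it assumes fleet.init(is_collective=True) has been called and
# that `group` was built with paddle.distributed.new_group):
#
#     optimizer = paddle.optimizer.AdamW(parameters=model.parameters())
#     oss_optimizer = ShardingOptimizerStage2(
#         params=model.parameters(), optim=optimizer, group=group
#     )
#     loss.backward()
#     oss_optimizer.step()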
def __init__(
self,
params,
optim,
group=None,
offload=False,
device="gpu",
pertrain_sync_models=True,
**kw
):
super().__init__(optim._learning_rate, params, kw)
# Segmentation information
self._dtype_rank_params = (
OrderedDict()
) # {dtype:[param1,param2]} device, rank, params
self._param2rank = {}
self.__segment_params = []
self._rank_buffer_size = {} # {dtype: {rank: numel+alignment}}
self._param2align = {} # {param.name: align}
# Default information
self._optim_defaults = kw
self._optim = optim
assert hasattr(
self._optim, "_master_weights"
), "Must use optimizer with _master_weights attribute"
self._local_params = params
self._default_device = device
self._pfp16 = (
len(
list(
filter(
lambda x: x.trainable and x.dtype == Type.fp16.value,
self._local_params,
)
)
)
> 0
)
self.group = (
new_group(_get_global_group().ranks) if group is None else group
)
self.world_size = self.group.nranks
self.rank = self.group.rank
self._global_root_rank = self.group.ranks[0]
# Synchronize model parameters across all ranks
if pertrain_sync_models:
self._sync_params_and_buffers()
self.param_storages = {} # {dtype: {rank: InternalStorage}}
if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm):
logging.warning(
"While using ClipGradByGlobalNorm in ShardingOptimizer, the grad clip of original optimizer will be changed."
)
self._optim._grad_clip = ShardingClipGrad(
self._optim._grad_clip, paddle.get_device(), self.group
)
if self._optim._parameter_list and isinstance(
self._optim._parameter_list[0], dict
):
for item in self._optim._param_groups:
if "grad_clip" in item.keys():
item["grad_clip"] = ShardingClipGrad(
self._optim._grad_clip,
paddle.get_device(),
self.group,
)
if offload:
assert (
self._pfp16
), "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16"
self.offload = offload # Using for offload
self.offload_device = "cpu"
self.offload_buffer_size = 0
self.offload_param2align = {}
self.offload_params = None
self.offload_grads = None
self._master_params = {}
# Update optimizer parameters, and adjust parameter storage and usage according to rank.
self._update_opt_status()
@paddle.autograd.no_grad()
def _sync_params_and_buffers(self):
"""
Sync all model states for all ranks
"""
for p in self._local_params:
dist.broadcast(
p, src=self._global_root_rank, group=self.group, sync_op=True
)
# Multi stream operation will be supported later
dist.wait(tensor=p, group=self.group, use_calc_stream=True)
def _generate_master_params(self, trainable_params):
if self.offload:
for param in trainable_params:
if param.name not in self._master_params.keys():
self._master_params[param.name] = core.VarBase(
name=param.name,
value=param.cast(dtype=Type.fp32.value).numpy(),
place=core.CPUPlace(),
stop_gradient=param.stop_gradient,
)
else:
for param in trainable_params:
if param.dtype == Type.fp16.value:
self._optim._master_weights[param.name] = paddle.cast(
param, Type.fp32.value
)
def _update_opt_status(self):
"""Update optimizer status and parameter storage information, and special functions to be developed."""
# func 1
self._integration_params()
# func 2 TODO
# Segment helpers
def _segment_params(self):
"""
Divide all optimizer parameters evenly across ranks.
"""
if len(self.__segment_params) == 0:
self.__segment_params, param_lists = [
[] for _ in range(self.world_size)
], [[] for _ in range(self.world_size)]
sizes = [0] * self.world_size
for param in self._local_params:
# Add this param to rank with smallest size.
rank = sizes.index(min(sizes))
param_lists[rank].append(param)
# Accumulate the real numels (only trainable params are counted)
sizes[rank] += np.prod(param.shape) if param.trainable else 0
for rank, params in enumerate(param_lists):
self.__segment_params[rank].extend(params)
return self.__segment_params
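# Illustrative note (not part of the original code): with world_size = 2 and
# trainable parameter numels [4, 3, 2, 1] arriving in that order, the greedy
# split above assigns each param to the rank with the smallest running total,
# giving rank 0 -> [4, 1] and rank 1 -> [3, 2] (totals 5 and 5).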
@property
def local_params(self):
return self._local_params
@property
def param2rank(self):
"""Map the params to the rank which owns them"""
if len(self._param2rank) == 0:
for rank, params in enumerate(self._segment_params()):
for param in params:
self._param2rank[param.name] = rank
return self._param2rank
@property
def dtype_rank_params(self):
"""
Divide the parameters into groups according to rank and dtype.
"""
if len(self._dtype_rank_params) == 0:
# Group the parameters by dtype and assign them to their owning rank
for param in self._local_params:
if param.dtype not in self._dtype_rank_params.keys():
self._dtype_rank_params[param.dtype] = [
[] for _ in range(self.world_size)
]
self._dtype_rank_params[param.dtype][
self.param2rank[param.name]
].append(param)
# Sort per rank params by size
for dtype in self._dtype_rank_params.keys():
for rank_params in self._dtype_rank_params[dtype]:
rank_params.sort(key=lambda x: np.prod(x.shape))
return self._dtype_rank_params
@property
def rank_buffer_size(self):
"""
Count the memory size of the parameters assigned to each rank for each dtype.
"""
# CUDA alignment 256 bytes
if len(self._rank_buffer_size) == 0:
for dtype in self.dtype_rank_params.keys():
if dtype not in self._rank_buffer_size.keys():
self._rank_buffer_size[dtype] = {}
for dst_rank, per_rank_params in enumerate(
self.dtype_rank_params[dtype]
):
if dst_rank not in self._rank_buffer_size[dtype].keys():
self._rank_buffer_size[dtype][dst_rank] = 0
for param in per_rank_params:
if not param.trainable:
continue
size = np.prod(param.shape) * align[dtype]
remaining = size % alignment[self._default_device]
ali = (
0
if remaining == 0
else alignment[self._default_device] - remaining
)
align_ = ali // align[dtype]
self._rank_buffer_size[dtype][dst_rank] += (
np.prod(param.shape) + align_
)
self._param2align[param.name] = align_
return self._rank_buffer_size
def _integration_params(self):
"""
Integrate the parameters into contiguous memory according to rank, and support updating the training parameters.
"""
for dtype, per_rank_params in self.dtype_rank_params.items():
if dtype not in self.param_storages.keys():
self.param_storages[dtype] = {}
for dst_rank, params in enumerate(per_rank_params):
if len(params) > 0:
# Merge all the trainable params in a single InternalStorage
trainable_params = list(
filter(lambda x: x.trainable, params)
)
if self._pfp16 and dst_rank == self.rank:
self._generate_master_params(trainable_params)
if trainable_params:
param_storage = ParamStorage(
size=self.rank_buffer_size[dtype][dst_rank],
dtype=dtype,
device=self._default_device,
)
param_storage.add_rank_params(
trainable_params, self._param2align
)
self.param_storages[dtype][dst_rank] = param_storage
# Clear the InternalStorage keys which are not in use anymore
dtype_in_use = list(self.dtype_rank_params.keys())
dtype_to_pop = list(
filter(lambda x: x not in dtype_in_use, self.param_storages.keys())
)
for d in dtype_to_pop:
self.param_storages.pop(d)
if self.offload:
self._optim._master_weights = self._master_params
cpu_master_params = [p for p in self._master_params.values()]
for param in cpu_master_params:
size = np.prod(param.shape) * align[Type.fp32.value]
remaining = size % alignment[self.offload_device]
ali = (
0
if remaining == 0
else alignment[self.offload_device] - remaining
)
align_ = ali // align[Type.fp32.value]
self.offload_buffer_size += np.prod(param.shape) + align_
self.offload_param2align[param.name] = align_
if cpu_master_params:
with device_guard(self.rank, self.offload_device):
self.offload_params = ParamStorage(
size=self.offload_buffer_size,
dtype=Type.fp32.value,
device=self.offload_device,
)
self.offload_params.add_rank_params(
cpu_master_params, self.offload_param2align, False
)
self.offload_params.buffer.stop_gradient = False
self.offload_grads = GradStorage(
size=self.offload_buffer_size,
dtype=Type.fp32.value,
device=self.offload_device,
destination=self.rank,
parm2align=self.offload_param2align,
convert_cpu=True,
)
for p in cpu_master_params:
self.offload_grads.add_grad(
p, self.offload_param2align[p.name]
)
self._optim._master_weights[
self.offload_params.buffer.name
] = self.offload_params.buffer
def _offload_acc_grad(self, param_name, grad_fp32_cpu):
"""accumulate grads with offload strategy"""
with device_guard(self.rank, self.offload_device):
if param_name in self._master_params.keys():
if self._master_params[param_name].grad is None:
self._master_params[param_name]._copy_gradient_from(
grad_fp32_cpu
)
else:
self._master_params[param_name].grad.add_(grad_fp32_cpu)
self.offload_params.buffer._copy_gradient_from(
self.offload_grads.buffer
)
def _offload_scale_grad(self, scale_size):
"""scale grads with offload strategy"""
with device_guard(self.rank, self.offload_device):
self.offload_grads.buffer.scale_(scale=scale_size)
def _offload_clear_grad(self):
"""clear grads with offload strategy"""
with device_guard(self.rank, self.offload_device):
self.offload_grads.buffer.zero_()
def step(self):
"""
A wrapper for Optimizer's step function to finish the update operation of the optimizer.
"""
if self.offload:
params_list = [self.offload_params.buffer]
# TODO(Baibaifan): Offload will support param_groups later
if not isinstance(self._optim._param_groups[0], dict):
self._optim._parameter_list = params_list
self._optim._param_groups = params_list
# Run the optimizer step of the current rank
if self.offload:
with device_guard(device=self.offload_device):
self._optim.step()
dev_id = int(paddle.get_device().split(":")[1])
for param in self._local_params:
if param.name in self._master_params.keys():
param.set_value(
self._master_params[param.name]
.cuda(dev_id)
.cast(dtype=param.dtype)
)
else:
self._optim.step()
# Synchronize all the updated shards in between the ranks
self._broadcast_params()
def minimize(self):
raise RuntimeError(
"optimizer.minimize() not support now, please use optimizer.step()"
)
def set_state_dict(self, state_dict):
self._optim.set_state_dict(state_dict)
def state_dict(self):
return self._optim.state_dict()
def _clear_cache(self):
self.__segment_params.clear()
self._dtype_rank_params.clear()
self._param2rank.clear()
@paddle.autograd.no_grad()
def _broadcast_params(self):
"""Broadcast the parameters of the current rank to each rank"""
assert self._default_device == "gpu", "Only supported gpu"
# Exchange all the shards with the other ranks
for dtype_per_rank in self.param_storages.values():
for dst_rank, internal_storage in dtype_per_rank.items():
dist.broadcast(
tensor=internal_storage.buffer,
src=self.group.ranks[dst_rank],
group=self.group,
sync_op=True,
)
# Multi stream operation will be supported later
dist.wait(
tensor=internal_storage.buffer,
group=self.group,
use_calc_stream=True,
)
...@@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .sharding_utils import Type, device_guard
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from enum import Enum
from types import MethodType
import numpy as np
import paddle
from paddle import _legacy_C_ops
from paddle.fluid import core, layers
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid.dygraph import to_variable
from paddle.fluid.framework import dygraph_only
class Taskflow:
"""
Task flow: a one-way linked list used for task acquisition.
"""
def __init__(self, task, callback):
self.task = task
self.callback = callback
class Type(Enum):
"""
Type of trainable parameters
"""
fp16 = paddle.float16
bf16 = paddle.bfloat16
fp32 = paddle.float32
class ShardingClipGrad:
def __init__(self, clip, device, group):
self._clip = clip
self._device = device
self._group = group
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
sum_square_fp32, sum_square_fp16 = [], []
unslice_params_fp32, unslice_params_fp16 = [], []
for p, g in params_grads:
p_slice = True  # used to mark sliced parameters in sharding stage3
if g is None or getattr(p, 'need_clip', True) is False:
continue
if hasattr(p, "unslice"):
p_slice = False
merge_grad = g
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.get_tensor_from_selected_rows(
layers.merge_selected_rows(g)
)
square = paddle.square(merge_grad)
sum_square = paddle.sum(square)
if p.dtype == paddle.float16:
if p_slice:
sum_square_fp16.append(sum_square)
else:
unslice_params_fp16.append(sum_square)
elif p.dtype == paddle.float32:
if p_slice:
sum_square_fp32.append(sum_square)
else:
unslice_params_fp32.append(sum_square)
# global norm of non-distributed FP16 params_and_grads
if len(sum_square_fp16) == 0:
global_norm_fp16 = paddle.to_tensor([0.0], dtype=paddle.float32)
else:
global_norm_fp16 = layers.concat(sum_square_fp16)
global_norm_fp16 = paddle.sum(global_norm_fp16)
global_norm_fp16 = paddle.cast(
global_norm_fp16, dtype=paddle.float32
)
# global norm of non-distributed FP16 params_and_grads for unsliced parameters
if len(unslice_params_fp16) == 0:
global_unslice_fp16 = paddle.to_tensor([0.0], dtype=paddle.float32)
else:
global_unslice_fp16 = layers.concat(unslice_params_fp16)
global_unslice_fp16 = paddle.sum(global_unslice_fp16)
global_unslice_fp16 = paddle.cast(
global_unslice_fp16, dtype=paddle.float32
)
# global norm of non-distributed FP32 params_and_grads
global_norm_fp32 = (
layers.concat(sum_square_fp32)
if len(sum_square_fp32) != 0
else paddle.to_tensor([0.0], dtype=paddle.float32)
)
global_norm_fp32 = paddle.sum(global_norm_fp32)
# global norm of non-distributed FP32 params_and_grads for unsliced parameters
global_unslice_fp32 = (
layers.concat(unslice_params_fp32)
if len(unslice_params_fp32) != 0
else paddle.to_tensor([0.0], dtype=paddle.float32)
)
global_unslice_fp32 = paddle.sum(global_unslice_fp32)
global_unslice_var = global_unslice_fp16 + global_unslice_fp32
global_norm_var = (
global_norm_fp16
+ global_norm_fp32
+ 1.0 / self._group.nranks * global_unslice_var
)
# add all reduce to get global norm of distributed params_and_grads
dev_id = int(self._device.split(":")[1])
with device_guard(dev_id, "gpu"):
paddle.distributed.all_reduce(global_norm_var, group=self._group)
global_norm_var = paddle.sqrt(global_norm_var)
max_global_norm = layers.fill_constant(
shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
)
clip_var = paddle.divide(
x=max_global_norm,
y=paddle.maximum(x=global_norm_var, y=max_global_norm),
)
clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
for p, g in params_grads:
if getattr(p, 'need_clip', True) is False or g is None:
continue
origin_state = g.stop_gradient
g.stop_gradient = True
if p.dtype == paddle.float16:
g.scale_(clip_var_fp16)
else:
g.scale_(clip_var)
g.stop_gradient = origin_state
p._reset_grad_inplace_version(True)
return params_grads
def __getattr__(self, item):
return getattr(self._clip, item)
def __call__(self, params_grads):
return self._dygraph_clip(params_grads)
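# A worked example of the clipping factor computed above (illustrative only,
# assuming clip_norm = 1.0): if the all-reduced sum of squared gradients is 4.0,
# then global_norm_var = sqrt(4.0) = 2.0 and clip_var = 1.0 / max(2.0, 1.0) = 0.5,
# so every gradient is scaled by 0.5; when the global norm is already <= clip_norm
# the factor is 1.0 and the gradients are left unchanged.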
@contextlib.contextmanager
def device_guard(dev_id=0, device="cpu"):
origin_device = paddle.device.get_device()
if device == "cpu":
paddle.set_device(device)
elif device == "gpu":
paddle.set_device("gpu:{}".format(dev_id))
try:
yield
finally:
paddle.set_device(origin_device)
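# Example use of device_guard (a sketch; the gpu device id depends on the runtime):
#
#     with device_guard(0, "cpu"):
#         cpu_buffer = paddle.zeros([10])  # created on the CPU
#     # the previously active device is restored on exit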
@dygraph_only
def ShardingScaler(scaler):
def unscale_method(self, optimizer):
if not self._enable:
return
param_grads = []
param_grads_fp16 = []
param_grads_fp32 = []
if hasattr(optimizer, "update_slice"):
optimizer.update_slice()
optimizer.update_scaler = True
if getattr(optimizer._optim, '_param_groups', None) and isinstance(
optimizer._optim._param_groups[0], dict
):
for group in optimizer._optim._param_groups:
for param in group['params']:
if param._grad_ivar() is not None:
param_grads.append(param._grad_ivar())
if param._grad_ivar().dtype in [
core.VarDesc.VarType.FP16,
paddle.float16,
]:
param_grads_fp16.append(param._grad_ivar())
else:
param_grads_fp32.append(param._grad_ivar())
else:
for param in optimizer._optim._parameter_list:
if param.grad is not None:
param_grads.append(param.grad)
if param.grad.dtype in [
core.VarDesc.VarType.FP16,
paddle.float16,
]:
param_grads_fp16.append(param.grad)
else:
param_grads_fp32.append(param.grad)
temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_))
temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_))
device = "cpu" if optimizer.offload else "gpu"
dev_id = (
0 if device == "cpu" else int(paddle.get_device().split(":")[1])
)
with device_guard(dev_id, device):
if len(param_grads_fp16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp16,
self._scale,
param_grads_fp16,
temp_found_inf_fp16,
)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
self._scale,
param_grads_fp32,
temp_found_inf_fp32,
)
self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
paddle.distributed.all_reduce(
is_found_inf,
op=paddle.distributed.ReduceOp.MAX,
group=optimizer.group,
)
self._found_inf = is_found_inf.numpy()[0]
scaler._unscale = MethodType(unscale_method, scaler)
return scaler
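# A minimal sketch of how ShardingScaler is meant to be used (mirrors the
# offload unit test included in this commit):
#
#     scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
#     scaler = ShardingScaler(scaler)
#     scaler.scale(avg_loss).backward()
#     scaler.step(oss_optimizer)
#     scaler.update()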
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The file has been adapted from fairscale file:
# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/misc/param_bucket.py
# Git commit hash: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e
# We retain the following license from the original files:
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import paddle
from paddle import framework
# (TODO: GhostScreaming) It will be removed later.
from paddle.fluid import core
from ..meta_parallel.sharding.sharding_utils import Type, device_guard
class InternalStorage:
"""
This is a basic class, which is responsible for consolidating the basic storage tensor.
"""
# Supports consolidating parameter tensors
def __init__(self, size, dtype, device, convert_cpu=False):
self._params = []
self._param_ids = []
self._fill = 0
self._device = device
self._dtype = dtype
# The actual flat tensor
size = [size] if isinstance(size, int) else size
if convert_cpu:
value = (
np.zeros(size, dtype=np.float16)
if Type.fp16.value == dtype
else np.zeros(size, dtype=np.float32)
)
self.buffer = core.VarBase(value=value, place=core.CPUPlace())
else:
self.buffer = paddle.zeros(size, dtype=dtype)
def to(self, device, dtype=None, keep_alignment=True):
"""
Move the underlying buffer
"""
assert (
self.buffer is not None
), "Cannot move a collapsed bucket, please rebuild it"
assert dtype is None or dtype in (
Type.fp32.value,
Type.fp16.value,
), "Conversion type is not supported now"
dev_id = (
0
if paddle.get_device() == "cpu"
else int(paddle.get_device().split(":")[1])
)
if self._device != device:
tmp_buffer = (
self.buffer.cuda(dev_id)
if device == "gpu"
else self.buffer.cpu()
)
for param in self._params:
param.clear_gradient(False)
param._gradient_set_empty(False)
self.buffer.value().get_tensor()._clear()
self.buffer = tmp_buffer
self._device = device
if dtype is not None:
self.buffer = self.buffer.cast(dtype=dtype)
self._dtype = dtype
class ParamStorage(InternalStorage):
"""
This is a basic class to simplify the handling of parameter InternalStorages.
"""
def __init__(self, size, dtype, device):
super().__init__(size, dtype, device, convert_cpu=True)
self.param2align = None
def to(self, device, dtype=None, keep_alignment=True):
"""
Move the underlying buffer
"""
super().to(device, dtype)
if keep_alignment:
self._array_params()
@framework.no_grad()
def add_rank_params(self, trainable_params, param2align, convert_gpu=True):
"""
Add new parameters to the InternalStorage. Each param becomes a view of this InternalStorage buffer.
"""
assert all(
[id(param) not in self._param_ids for param in trainable_params]
), "The same param cannot be checked in twice"
assert self.buffer is not None
self.param2align = param2align
cpu_param_shape = list()
for param in trainable_params:
p_shape = self._add_param_as_view(
param, param2align[param.name], convert_gpu
)
cpu_param_shape.append(p_shape)
if convert_gpu:
# convert the buffer from cpu to cuda
dev_id = int(paddle.get_device().split(":")[1])
self.buffer = self.buffer.cuda(dev_id)
self._fill = 0
for idx, param in enumerate(trainable_params):
self._convert_buffer(
param, cpu_param_shape[idx], param2align[param.name]
)
self._params.append(param)
self._param_ids.append(id(param))
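# Illustrative layout (a sketch): two fp32 params with shapes [2, 3] and [4] and
# zero extra alignment occupy buffer slices [0, 6) and [6, 10); _convert_buffer
# then re-points each param at its slice, so updating the flat buffer updates
# the params in place.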
@framework.no_grad()
def _add_param_as_view(self, param, align, convert_gpu=True):
assert (
param.dtype == self.buffer.dtype
), "Different types for the InternalStorage and the param, cannot proceed: {} - {}".format(
param.dtype, self.buffer.dtype
)
var_end = self._fill + np.prod(param.shape)
offset = var_end + align
assert offset <= np.prod(self.buffer.shape)
p_shape = param.shape
origin_state = param.stop_gradient
param.stop_gradient = True
param.flatten_()
param.stop_gradient = origin_state
# Copy the current param value
dev_id = (
0
if paddle.get_device() == "cpu"
else int(paddle.get_device().split(":")[1])
)
with device_guard(dev_id, "cpu"):
tmp_var = core.VarBase(
tensor=self.buffer._slice(self._fill, var_end)
)
if convert_gpu:
param_cpu = param.cpu()
param.value().get_tensor()._clear()
tmp_var.set_value(param_cpu)
else:
tmp_var.set_value(param)
self._fill = offset
return p_shape
@framework.no_grad()
def _convert_buffer(self, param, p_shape, align):
var_end = self._fill + np.prod(p_shape)
offset = var_end + align
assert offset <= np.prod(self.buffer.shape)
# Convert the param value
tmp_tensor = self.buffer._slice(self._fill, var_end)
param.value().get_tensor()._share_data_with(tmp_tensor)
param.value().get_tensor()._set_dims(p_shape)
self._fill = offset
@framework.no_grad()
def _array_params(self):
"""
Given the parameters which have been registered previously, rebuild the whole InternalStorage.
"""
assert len(self._params) > 0
assert self.param2align is not None
self._fill = 0
for p in self._params:
self._convert_buffer(p, p.shape, self.param2align[p.name])
class GradStorage(InternalStorage):
"""
This is a basic class to simplify the handling of gradient InternalStorages
"""
def __init__(
self, size, dtype, device, destination, parm2align, convert_cpu=False
):
if isinstance(size, np.int64):
size = size.tolist()
super().__init__(size, dtype, device, convert_cpu)
self._max_size = size
self._release = False
self.params_checked_in = 0
self.destination = destination
self._parm2align = parm2align
self.sent = False
def reset_checked_in(self):
"""Reset the counter of the parameter grads which have been checked in"""
self.params_checked_in = 0
self.sent = False
@property
def all_checked_in(self):
"""Judge all the expected gradient check-in happened"""
return len(self._params) == self.params_checked_in
def can_add_grad_view(self, param, align):
"""Is there enough InternalStorage to add this parameter gradient, and whether this param have already checked in."""
return (
self._fill + np.prod(param.shape) + align <= self._max_size
and id(param) not in self._param_ids
)
def to(self, device, dtype=None, keep_alignment=True):
"""
Move the underlying buffer
"""
if self._release:
self.rebuild()
super().to(device, dtype)
if keep_alignment:
self._array_grads()
@framework.no_grad()
def add_grad(self, param, align):
"""
Add a new parameter gradient to the InternalStorage. Param.grad becomes a view of this InternalStorage buffer.
"""
assert (
id(param) not in self._param_ids
), "The same gradients cannot be checked in twice"
self._add_grad_as_view(param, align)
self._params.append(param)
self._param_ids.append(id(param))
@framework.no_grad()
def manumal_relase(self):
"""
Release the buffer from InternalStorage. The InternalStorage will need to be rebuilt before use.
"""
if not self._release:
for p in self._params:
if p.grad is not None:
p.clear_gradient(False)
p._gradient_set_empty(False)
self.buffer = None
self._fill = 0
self.params_checked_in = 0
self._release = True
@framework.no_grad()
def rebuild(self):
"""
Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage.
"""
if self._release:
self.buffer = paddle.zeros([self._max_size], dtype=self._dtype)
for p in self._params:
self._add_grad_as_view(p, self._parm2align[p.name])
self._release = False
@framework.no_grad()
def _array_grads(self):
"""
Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage.
"""
if len(self._params) > 0:
self._fill = 0
for p in self._params:
self._add_grad_as_view(p, self._parm2align[p.name])
@framework.no_grad()
def _add_grad_as_view(self, param, align):
assert (
np.prod(self.buffer.shape) > 0
), "Cannot add a gradient to a released InternalStorage, please rebuild"
assert param.dtype == self.buffer.dtype
grad_end = self._fill + np.prod(param.shape)
offset = grad_end + align
assert offset <= np.prod(self.buffer.shape)
# Copy the current grad value to InternalStorage
dev_id = (
0
if paddle.get_device() == "cpu"
else int(paddle.get_device().split(":")[1])
)
if self._device == "cpu":
with device_guard(dev_id, self._device):
tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end))
param._copy_gradient_from(tmp_var)
tmp_var.value().get_tensor()._clear()
elif self._device == "gpu":
tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end))
param._copy_gradient_from(tmp_var)
tmp_var.value().get_tensor()._clear()
self._fill = offset
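# A minimal GradStorage usage sketch (mirrors the unit test included in this
# commit; a per-parameter alignment of 0 is assumed):
#
#     grad_storage = GradStorage(
#         10000, dtype=paddle.float32, device="gpu", destination=0,
#         parm2align=param2align,
#     )
#     for p in model.parameters():
#         if grad_storage.can_add_grad_view(p, param2align[p.name]):
#             grad_storage.add_grad(p, param2align[p.name])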
...@@ -16,13 +16,6 @@ import logging
import os
import paddle
# Old version
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import (
ShardingOptimizerStage2,
)
# New version
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
GroupShardedOptimizerStage2,
)
...@@ -35,17 +28,7 @@ from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import (
GroupShardedScaler,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import (
ShardingStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import (
ShardingStage3,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import (
ShardingScaler,
)
from paddle.distributed.utils.log_utils import get_logger
from paddle.fluid.framework import in_dygraph_mode
from paddle.optimizer import Optimizer
logger_ = get_logger(logging.WARNING)
...@@ -148,7 +131,6 @@ def group_sharded_parallel(
logger_.info("*" * 30)
logger_.info("Sharded level os uses sharded level os_g achieved now.")
logger_.info("*" * 30)
if in_dygraph_mode():
optimizer = GroupShardedOptimizerStage2(
params=optimizer._parameter_list,
optim=optimizer,
...@@ -166,24 +148,7 @@ def group_sharded_parallel(
dp_group=dp_group,
device=device,
)
else:
optimizer = ShardingOptimizerStage2(
params=model.parameters(),
optim=optimizer,
group=group,
offload=offload,
device=device,
)
model = ShardingStage2(
model,
optimizer,
group=group,
sync_buffers=sync_buffers,
buffer_max_size=buffer_max_size,
device=device,
)
elif level == 'p_g_os':
if in_dygraph_mode():
model = GroupShardedStage3(
model,
optimizer=optimizer,
...@@ -195,24 +160,10 @@ def group_sharded_parallel(
dp_group=dp_group,
device=device,
)
else:
model = ShardingStage3(
model,
optimizer=optimizer,
group=group,
sync_buffers=sync_buffers,
segment_size=segment_size,
offload=offload,
sync_comm=sync_comm,
device=device,
)
else:
raise ValueError("Please enter the correct level.")
if isinstance(scaler, paddle.amp.GradScaler):
if in_dygraph_mode():
scaler = GroupShardedScaler(scaler)
else:
scaler = ShardingScaler(scaler)
logger_.info("*" * 30)
logger_.info(
"If there is a communication hang using group sharded, please check whether the communication operations of each process are unified."
...@@ -275,9 +226,9 @@ def save_group_sharded_model(model, output, optimizer=None):
), "Saving directory ({}) should be a directory, not a file".format(output)
os.makedirs(output, exist_ok=True)
output_model = os.path.join(output, "model.pdmodel")
if isinstance(model, (ShardingStage2, GroupShardedStage2)):
if isinstance(model, GroupShardedStage2):
paddle.save(model._layer.state_dict(), output_model)
elif isinstance(model, (ShardingStage3, GroupShardedStage3)):
elif isinstance(model, GroupShardedStage3):
convert2cpu = True if model._offload else False
model.get_all_parameters(convert2cpu=convert2cpu)
paddle.save(model._layer.state_dict(), output_model)
...
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed import fleet
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import (
ShardingOptimizerStage2,
)
from paddle.distributed.fleet.utils.internal_storage import GradStorage
from paddle.nn import Linear
base_lr = 0.1
momentum_rate = 0.9
l2_decay = 1e-4
epoch = 100
batch_size = 32
class_dim = 102
class MLP(fluid.Layer):
def __init__(self, param_attr=None, bias_attr=None):
super().__init__()
self._linear1 = Linear(10, 10)
self._linear2 = Linear(10, 10)
def forward(self, inputs):
y = self._linear1(inputs)
y = self._linear2(y)
return y
def reader_decorator():
def __reader__():
for _ in range(100):
img = np.random.rand(10).astype('float32')
label = np.ones(1).astype('int64')
yield img, label
return __reader__
def optimizer_setting(parameter_list=None):
optimizer = paddle.optimizer.Momentum(
learning_rate=base_lr,
momentum=momentum_rate,
weight_decay=paddle.regularizer.L2Decay(l2_decay),
parameters=parameter_list,
)
return optimizer
def train_mlp():
fleet.init(is_collective=True)
group = paddle.distributed.new_group([0, 1])
mlp = MLP()
optimizer = optimizer_setting(parameter_list=mlp.parameters())
oss_optimizer = ShardingOptimizerStage2(
params=mlp.parameters(), optim=optimizer, group=group
)
# cover grad_storage code
trainable_param2align = dict()
for p in mlp.parameters():
trainable_param2align[p.name] = 0
grad_storage = GradStorage(
10000,
dtype=paddle.float32,
device="gpu",
destination=0,
parm2align=trainable_param2align,
)
for p in mlp.parameters():
grad_storage.can_add_grad_view(p, trainable_param2align[p.name])
grad_storage.add_grad(p, trainable_param2align[p.name])
grad_storage.manumal_relase()
grad_storage.rebuild()
grad_storage.reset_checked_in()
train_reader = paddle.batch(
reader_decorator(), batch_size=batch_size, drop_last=True
)
train_loader = paddle.io.DataLoader.from_generator(
capacity=32,
use_double_buffer=True,
iterable=True,
return_list=True,
use_multiprocess=True,
)
train_loader.set_sample_list_generator(train_reader)
for eop in range(epoch):
mlp.train()
for batch_id, data in enumerate(train_loader()):
img, label = data
label.stop_gradient = True
img.stop_gradient = True
out = mlp(img)
loss = paddle.nn.functional.cross_entropy(input=out, label=label)
avg_loss = paddle.mean(x=loss)
acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)
dy_out = avg_loss.numpy()
avg_loss.backward()
oss_optimizer.step()
# oss_optimizer clear cache
oss_optimizer._clear_cache()
# check optimizer.minimize() error
try:
oss_optimizer.minimize()
except:
print(
"====== Find sharding_stage2_optimizer.minimize() error ======"
)
return
if __name__ == '__main__':
train_mlp()
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed import fleet
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import (
ShardingOptimizerStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import (
ShardingStage2,
)
from paddle.nn import Linear
seed = 2022
epoch = 2
linear_size = 1000
strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
"dp_degree": 2,
"mp_degree": 1,
"pp_degree": 1,
"sharding_degree": 1,
}
np.random.seed(seed)
paddle.seed(seed)
class MLP(fluid.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
self._linear1 = Linear(linear_size, linear_size)
self._linear2 = Linear(linear_size, linear_size)
self._linear3 = Linear(linear_size, 10)
def forward(self, inputs):
y = self._linear1(inputs)
y = self._linear2(y)
y = self._linear3(y)
return y
def reader_decorator(linear_size=1000):
def __reader__():
for _ in range(100):
img = np.random.rand(linear_size).astype('float32')
label = np.ones(1).astype('int64')
yield img, label
return __reader__
def optimizer_setting(model, use_pure_fp16, opt_group=False):
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
optimizer = paddle.optimizer.AdamW(
parameters=[{"params": model.parameters()}]
if opt_group
else model.parameters(),
learning_rate=0.001,
weight_decay=0.00001,
grad_clip=clip,
multi_precision=use_pure_fp16,
)
return optimizer
def train_mlp(
model,
sharding_stage,
batch_size=100,
use_pure_fp16=False,
accumulate_grad=False,
opt_group=False,
save_model=False,
):
if sharding_stage == "dp":
hcg = fleet.get_hybrid_communicate_group()
group = hcg.get_check_parallel_group()
else:
group = paddle.distributed.new_group([0, 1])
if opt_group:
optimizer = optimizer_setting(
model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group
)
else:
optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)
if sharding_stage == 2:
optimizer = ShardingOptimizerStage2(
params=model.parameters(), optim=optimizer, group=group
)
model = ShardingStage2(
model, optimizer, group=group, buffer_max_size=2**21
)
else:
optimizer = fleet.distributed_optimizer(optimizer)
model = fleet.distributed_model(model)
train_reader = paddle.batch(
reader_decorator(), batch_size=batch_size, drop_last=True
)
train_loader = paddle.io.DataLoader.from_generator(
capacity=32,
use_double_buffer=True,
iterable=True,
return_list=True,
use_multiprocess=True,
)
train_loader.set_sample_list_generator(train_reader)
if sharding_stage == 2:
model.to(device="gpu")
for eop in range(epoch):
model.train()
for batch_id, data in enumerate(train_loader()):
img, label = data
label.stop_gradient = True
img.stop_gradient = True
out = model(img)
loss = paddle.nn.functional.cross_entropy(input=out, label=label)
avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
if batch_size == 20:
avg_loss = avg_loss / 5
avg_loss.backward()
if not accumulate_grad:
optimizer.step()
optimizer.clear_grad()
if accumulate_grad:
optimizer.step()
optimizer.clear_grad()
if save_model:
return model, optimizer
return model.parameters()
def test_dp_stage2():
mlp = MLP()
state_dict = mlp.state_dict()
mlp1 = MLP()
mlp2 = MLP()
mlp3 = MLP()
mlp4 = MLP()
mlp5 = MLP()
mlp6 = MLP()
mlp1.set_state_dict(state_dict)
mlp2.set_state_dict(state_dict)
mlp3.set_state_dict(state_dict)
mlp4.set_state_dict(state_dict)
mlp5.set_state_dict(state_dict)
mlp6.set_state_dict(state_dict)
# DP VS stage2
dp_params = train_mlp(
mlp1, sharding_stage="dp", use_pure_fp16=False, opt_group=False
)
stage2_params = train_mlp(
mlp2, sharding_stage=2, use_pure_fp16=False, opt_group=False
)
for i in range(len(dp_params)):
np.testing.assert_allclose(
dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6
)
# stage2 accumulate grad
stage2_params = train_mlp(mlp3, sharding_stage=2, accumulate_grad=True)
stage2_accumulate_grad = train_mlp(
mlp4, sharding_stage=2, batch_size=20, accumulate_grad=True
)
for i in range(len(stage2_params)):
np.testing.assert_allclose(
stage2_params[i].numpy(),
stage2_accumulate_grad[i].numpy(),
rtol=1e-5,
atol=1e-5,
)
# stage2 param list VS param group
stage2_params = train_mlp(
mlp5, sharding_stage=2, use_pure_fp16=False, opt_group=True
)
for i in range(len(dp_params)):
np.testing.assert_allclose(
dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6
)
# save/load model
output_dir = tempfile.mkdtemp()
model_file = os.path.join(output_dir, "model.pdmodel")
optimizer_file = os.path.join(output_dir, "model.pdopt")
model_stage2, optimizer_stage2 = train_mlp(
mlp6,
sharding_stage=2,
use_pure_fp16=False,
opt_group=False,
save_model=True,
)
paddle.save(model_stage2.state_dict(), model_file)
paddle.save(optimizer_stage2.state_dict(), optimizer_file)
m_state_dict = paddle.load(model_file)
opt_state_dict = paddle.load(optimizer_file)
model_stage2.set_state_dict(m_state_dict)
optimizer_stage2.set_state_dict(opt_state_dict)
shutil.rmtree(output_dir)
return
if __name__ == '__main__':
fleet.init(is_collective=True, strategy=strategy)
test_dp_stage2()
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from dygraph_sharding_stage2 import MLP, optimizer_setting, reader_decorator
import paddle
from paddle.distributed import fleet
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import (
ShardingOptimizerStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import (
ShardingStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import (
ShardingScaler,
)
seed = 2021
epoch = 2
batch_size = 32
linear_size = 1000
strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
"dp_degree": 2,
"mp_degree": 1,
"pp_degree": 1,
"sharding_degree": 1,
}
np.random.seed(seed)
paddle.seed(seed)
def train_mlp(model, offload=False):
optimizer = optimizer_setting(model=model, use_pure_fp16=True)
model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
scaler = ShardingScaler(scaler)
optimizer = ShardingOptimizerStage2(
params=model.parameters(), optim=optimizer, offload=offload
)
model = ShardingStage2(model, optimizer, buffer_max_size=2**21)
train_reader = paddle.batch(
reader_decorator(linear_size), batch_size=batch_size, drop_last=True
)
train_loader = paddle.io.DataLoader.from_generator(
capacity=32,
use_double_buffer=True,
iterable=True,
return_list=True,
use_multiprocess=True,
)
train_loader.set_sample_list_generator(train_reader)
for eop in range(epoch):
model.train()
for batch_id, data in enumerate(train_loader()):
img, label = data
label.stop_gradient = True
img.stop_gradient = True
with paddle.amp.auto_cast(True, level='O2'):
out = model(img)
loss = paddle.nn.functional.cross_entropy(
input=out, label=label
)
avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
scaler.scale(avg_loss).backward()
scaler.step(optimizer)
scaler.update()
optimizer.clear_grad()
for dtype in optimizer.param_storages:
for dst_rank, param_storage in optimizer.param_storages[dtype].items():
param_storage.to(device="gpu", dtype=dtype)
return model.parameters()
def test_sharding_stage2_offload():
mlp = MLP(linear_size)
mlp_offload = MLP(linear_size)
mlp_offload.set_state_dict(mlp.state_dict())
mlp_params = train_mlp(mlp, offload=False)
mlp_offload_params = train_mlp(mlp_offload, offload=True)
for i in range(len(mlp_params)):
np.testing.assert_allclose(
mlp_params[i].numpy(),
mlp_offload_params[i].numpy(),
rtol=5e-3,
atol=5e-3,
)
return
if __name__ == '__main__':
fleet.init(is_collective=True, strategy=strategy)
test_sharding_stage2_offload()
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed import fleet
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import (
ShardingOptimizerStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import (
ShardingStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import (
ShardingStage3,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import (
ShardingScaler,
)
from paddle.nn import Linear
epoch = 10
paddle.seed(2021)
np.random.seed(2021)
base_lr = 0.1
momentum_rate = 0.9
l2_decay = 1e-4
class MLP(fluid.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
self._linear1 = Linear(linear_size, linear_size)
self._linear2 = Linear(linear_size, linear_size)
self._linear3 = Linear(linear_size, 10)
def forward(self, inputs):
y = self._linear1(inputs)
y = self._linear2(y)
y = self._linear3(y)
return y
def reader_decorator(linear_size=1000):
def __reader__():
for _ in range(100):
img = np.random.rand(linear_size).astype('float32')
label = np.ones(1).astype('int64')
yield img, label
return __reader__
def optimizer_setting(model, use_pure_fp16, opt_group=False):
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
optimizer = paddle.optimizer.Momentum(
parameters=[{"params": list(model.parameters())}]
if opt_group
else list(model.parameters()),
learning_rate=0.001,
weight_decay=0.00001,
grad_clip=clip,
multi_precision=use_pure_fp16,
)
return optimizer
def train_mlp(
model,
sharding_stage,
use_pure_fp16=False,
accumulate_grad=False,
batch_size=100,
opt_group=False,
sync_comm=False,
test_minimize=False,
save_model=False,
):
group = paddle.distributed.new_group([0, 1])
if opt_group:
optimizer = optimizer_setting(
model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group
)
else:
optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)
if use_pure_fp16:
model = paddle.amp.decorate(
models=model, level='O2', save_dtype='float32'
)
scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
scaler = ShardingScaler(scaler)
if sharding_stage == 2:
optimizer = ShardingOptimizerStage2(
params=model.parameters(), optim=optimizer, group=group
)
model = ShardingStage2(
model, optimizer, group=group, buffer_max_size=2**21
)
elif sharding_stage == 3:
model = ShardingStage3(
model, optimizer=optimizer, group=group, sync_comm=sync_comm
)
# check optimizer.minimize() error
if test_minimize:
try:
optimizer.minimize()
except:
print(
"====== Find sharding_stage3_optimizer.minimize() error ======"
)
return
train_reader = paddle.batch(
reader_decorator(), batch_size=batch_size, drop_last=True
)
train_loader = paddle.io.DataLoader.from_generator(
capacity=32,
use_double_buffer=True,
iterable=True,
return_list=True,
use_multiprocess=True,
)
train_loader.set_sample_list_generator(train_reader)
for eop in range(epoch):
model.train()
for batch_id, data in enumerate(train_loader()):
img, label = data
label.stop_gradient = True
img.stop_gradient = True
with paddle.amp.auto_cast(True, level='O2'):
out = model(img)
loss = paddle.nn.functional.cross_entropy(
input=out, label=label
)
avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
if batch_size == 20:
avg_loss = avg_loss / 5
if not use_pure_fp16:
avg_loss.backward()
else:
scaler.scale(avg_loss).backward()
if not accumulate_grad:
if not use_pure_fp16:
optimizer.step()
else:
scaler.step(optimizer)
scaler.update()
optimizer.clear_grad()
if accumulate_grad:
if not use_pure_fp16:
optimizer.step()
else:
scaler.step(optimizer)
scaler.update()
optimizer.clear_grad()
if sharding_stage == 3:
model.get_all_parameters()
if save_model:
return model, optimizer
return model.parameters()
def test_stage2_stage3():
mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6, mlp7, mlp8, mlp9, mlp10 = (
MLP(),
MLP(),
MLP(),
MLP(),
MLP(),
MLP(),
MLP(),
MLP(),
MLP(),
MLP(),
MLP(),
)
state_dict = mlp.state_dict()
mlp1.set_state_dict(state_dict)
mlp2.set_state_dict(state_dict)
mlp3.set_state_dict(state_dict)
mlp4.set_state_dict(state_dict)
mlp5.set_state_dict(state_dict)
mlp6.set_state_dict(state_dict)
mlp7.set_state_dict(state_dict)
mlp8.set_state_dict(state_dict)
mlp9.set_state_dict(state_dict)
mlp10.set_state_dict(state_dict)
# fp32
stage2_params = train_mlp(
mlp1, sharding_stage=2, use_pure_fp16=False, opt_group=False
)
stage3_params = train_mlp(
mlp2, sharding_stage=3, use_pure_fp16=False, opt_group=False
)
for i in range(len(stage2_params)):
np.testing.assert_allclose(
stage2_params[i].numpy(),
stage3_params[i].numpy(),
rtol=1e-6,
atol=1e-6,
)
# fp32 accumulate grad
stage3_params = train_mlp(
mlp3,
sharding_stage=3,
use_pure_fp16=False,
accumulate_grad=True,
opt_group=True,
)
stage3_params_add = train_mlp(
mlp4,
sharding_stage=3,
use_pure_fp16=False,
accumulate_grad=True,
batch_size=20,
opt_group=True,
)
for i in range(len(stage3_params)):
np.testing.assert_allclose(
stage3_params[i].numpy(),
stage3_params_add[i].numpy(),
rtol=1e-6,
atol=1e-4,
)
# fp16
stage2_params = train_mlp(
mlp5, sharding_stage=2, use_pure_fp16=True, opt_group=False
)
stage3_params = train_mlp(
mlp6, sharding_stage=3, use_pure_fp16=True, opt_group=False
)
for i in range(len(stage2_params)):
np.testing.assert_allclose(
stage2_params[i].numpy(),
stage3_params[i].numpy(),
rtol=1e-4,
atol=1e-3,
)
# fp16 sync_comm
stage3_params = train_mlp(
mlp7, sharding_stage=3, use_pure_fp16=True, opt_group=False
)
stage3_params_re = train_mlp(
mlp8,
sharding_stage=3,
use_pure_fp16=True,
opt_group=False,
sync_comm=True,
)
for i in range(len(stage3_params)):
np.testing.assert_allclose(
stage3_params[i].numpy(), stage3_params_re[i].numpy(), rtol=1e-6
)
# save/load model
output_dir = tempfile.mkdtemp()
model_file = os.path.join(output_dir, "model.pdmodel")
optimizer_file = os.path.join(output_dir, "model.pdopt")
model_stage3, optimizer_stage3 = train_mlp(
mlp9,
sharding_stage=3,
use_pure_fp16=False,
opt_group=False,
save_model=True,
)
paddle.save(model_stage3.state_dict(), model_file)
paddle.save(optimizer_stage3.state_dict(), optimizer_file)
m_state_dict = paddle.load(model_file)
opt_state_dict = paddle.load(optimizer_file)
model_stage3.set_state_dict(m_state_dict)
optimizer_stage3.set_state_dict(opt_state_dict)
shutil.rmtree(output_dir)
# check optimizer.minimize() error
train_mlp(
mlp10,
sharding_stage=3,
use_pure_fp16=False,
opt_group=False,
test_minimize=True,
)
if __name__ == '__main__':
fleet.init(is_collective=True)
test_stage2_stage3()
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed import fleet
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import (
ShardingStage3,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import (
ShardingScaler,
)
from paddle.nn import Linear
epoch = 10
paddle.seed(2022)
np.random.seed(2022)
base_lr = 0.1
momentum_rate = 0.9
l2_decay = 1e-4
class MLP(fluid.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
self._linear1 = Linear(linear_size, linear_size)
self._linear2 = Linear(linear_size, linear_size)
self._linear3 = Linear(linear_size, 10)
def forward(self, inputs):
y = self._linear1(inputs)
y = self._linear2(y)
y = self._linear3(y)
return y
def reader_decorator(linear_size=1000):
def __reader__():
for _ in range(100):
img = np.random.rand(linear_size).astype('float32')
label = np.ones(1).astype('int64')
yield img, label
return __reader__
def optimizer_setting(model, use_pure_fp16, opt_group=False):
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
optimizer = paddle.optimizer.AdamW(
parameters=[{"params": model.parameters()}]
if opt_group
else model.parameters(),
learning_rate=0.001,
weight_decay=0.00001,
grad_clip=clip,
multi_precision=use_pure_fp16,
)
return optimizer
def train_mlp(
model,
use_pure_fp16=False,
accumulate_grad=False,
offload=False,
batch_size=100,
convert2cpu=False,
):
group = paddle.distributed.new_group([0, 1])
optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)
if use_pure_fp16:
model = paddle.amp.decorate(
models=model, level='O2', save_dtype='float32'
)
scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
scaler = ShardingScaler(scaler)
model = ShardingStage3(
model, optimizer=optimizer, group=group, offload=offload
)
train_reader = paddle.batch(
reader_decorator(), batch_size=batch_size, drop_last=True
)
train_loader = paddle.io.DataLoader.from_generator(
capacity=32,
use_double_buffer=True,
iterable=True,
return_list=True,
use_multiprocess=True,
)
train_loader.set_sample_list_generator(train_reader)
for eop in range(epoch):
model.train()
for batch_id, data in enumerate(train_loader()):
img, label = data
label.stop_gradient = True
img.stop_gradient = True
with paddle.amp.auto_cast(True, level='O2'):
out = model(img)
loss = paddle.nn.functional.cross_entropy(
input=out, label=label
)
avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
if accumulate_grad:
avg_loss = avg_loss / 5
if not use_pure_fp16:
avg_loss.backward()
else:
scaler.scale(avg_loss).backward()
if not accumulate_grad:
if not use_pure_fp16:
optimizer.step()
else:
scaler.step(optimizer)
scaler.update()
optimizer.clear_grad()
if accumulate_grad:
if not use_pure_fp16:
optimizer.step()
else:
scaler.step(optimizer)
scaler.update()
optimizer.clear_grad()
if not convert2cpu:
model.get_all_parameters()
else:
model.get_all_parameters(convert2cpu)
return model.parameters()
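# Train identical MLPs with and without CPU offload (fp32, pure fp16, and fp32
# with gradient accumulation) and check that the resulting parameters match.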
def test_stage3_offload():
mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6 = (
MLP(),
MLP(),
MLP(),
MLP(),
MLP(),
MLP(),
MLP(),
)
state_dict = mlp.state_dict()
mlp1.set_state_dict(state_dict)
mlp2.set_state_dict(state_dict)
mlp3.set_state_dict(state_dict)
mlp4.set_state_dict(state_dict)
mlp5.set_state_dict(state_dict)
mlp6.set_state_dict(state_dict)
# fp32 offload
stage3_params = train_mlp(mlp1, use_pure_fp16=False)
stage3_params_offload = train_mlp(mlp2, use_pure_fp16=False, offload=True)
for i in range(len(stage3_params)):
np.testing.assert_allclose(
stage3_params[i].numpy(),
stage3_params_offload[i].numpy(),
rtol=1e-6,
atol=1e-8,
)
# fp16 offload
stage3_params = train_mlp(mlp3, use_pure_fp16=True)
stage3_params_offload = train_mlp(mlp4, use_pure_fp16=True, offload=True)
for i in range(len(stage3_params)):
np.testing.assert_allclose(
stage3_params[i].numpy(),
stage3_params_offload[i].numpy(),
rtol=1e-2,
atol=1e-2,
)
# fp32 accumulate grad offload
stage3_params = train_mlp(
mlp5, use_pure_fp16=False, batch_size=20, accumulate_grad=True
)
stage3_params_offload = train_mlp(
mlp6,
use_pure_fp16=False,
accumulate_grad=True,
offload=True,
batch_size=20,
convert2cpu=True,
)
for i in range(len(stage3_params)):
np.testing.assert_allclose(
stage3_params[i].numpy(),
stage3_params_offload[i].numpy(),
rtol=1e-6,
atol=1e-8,
)
return
if __name__ == '__main__':
fleet.init(is_collective=True)
test_stage3_offload()
# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import tempfile
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed import fleet
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import (
ShardingOptimizerStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import (
ShardingStage2,
)
from paddle.nn import Linear
seed = 2022
epoch = 2
linear_size = 1000
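# Pure 16-way data-parallel hybrid configuration; stage 2 sharding is applied
# manually below via ShardingOptimizerStage2 / ShardingStage2.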
strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
"dp_degree": 16,
"mp_degree": 1,
"pp_degree": 1,
"sharding_degree": 1,
}
np.random.seed(seed)
paddle.seed(seed)
class MLP(fluid.Layer):
def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
super().__init__()
self._linear1 = Linear(linear_size, linear_size)
self._linear2 = Linear(linear_size, linear_size)
self._linear3 = Linear(linear_size, linear_size)
self._linear4 = Linear(linear_size, linear_size)
self._linear5 = Linear(linear_size, 10)
def forward(self, inputs):
y = self._linear1(inputs)
y = self._linear2(y)
y = self._linear3(y)
y = self._linear4(y)
y = self._linear5(y)
return y
def reader_decorator(linear_size=1000):
def __reader__():
for _ in range(100):
img = np.random.rand(linear_size).astype('float32')
label = np.ones(1).astype('int64')
yield img, label
return __reader__
def optimizer_setting(model, use_pure_fp16, opt_group=False):
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
optimizer = paddle.optimizer.AdamW(
parameters=[{"params": model.parameters()}]
if opt_group
else model.parameters(),
learning_rate=0.001,
weight_decay=0.00001,
grad_clip=clip,
multi_precision=use_pure_fp16,
)
return optimizer
def train_mlp(
model,
sharding_stage,
batch_size=100,
use_pure_fp16=False,
accumulate_grad=False,
opt_group=False,
save_model=False,
):
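    # The "dp" baseline is wrapped by fleet's distributed_model/optimizer below,
    # while stage 2 runs build an explicit 16-rank group for sharding.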
if sharding_stage == "dp":
hcg = fleet.get_hybrid_communicate_group()
group = hcg.get_check_parallel_group()
else:
group = paddle.distributed.new_group(
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
)
if opt_group:
optimizer = optimizer_setting(
model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group
)
else:
optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)
if sharding_stage == 2:
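        # ShardingOptimizerStage2 partitions the optimizer state across the
        # group, and ShardingStage2 shards and synchronizes the matching
        # gradient slices.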
optimizer = ShardingOptimizerStage2(
params=model.parameters(), optim=optimizer, group=group
)
model = ShardingStage2(
model, optimizer, group=group, buffer_max_size=2**21
)
else:
optimizer = fleet.distributed_optimizer(optimizer)
model = fleet.distributed_model(model)
train_reader = paddle.batch(
reader_decorator(), batch_size=batch_size, drop_last=True
)
train_loader = paddle.io.DataLoader.from_generator(
capacity=32,
use_double_buffer=True,
iterable=True,
return_list=True,
use_multiprocess=True,
)
train_loader.set_sample_list_generator(train_reader)
if sharding_stage == 2:
model.to(device="gpu")
for eop in range(epoch):
model.train()
for batch_id, data in enumerate(train_loader()):
img, label = data
label.stop_gradient = True
img.stop_gradient = True
out = model(img)
loss = paddle.nn.functional.cross_entropy(input=out, label=label)
avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
if batch_size == 20:
avg_loss = avg_loss / 5
avg_loss.backward()
if not accumulate_grad:
optimizer.step()
optimizer.clear_grad()
if accumulate_grad:
optimizer.step()
optimizer.clear_grad()
if save_model:
return model, optimizer
return model.parameters()
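# Check stage 2 against plain data parallelism, gradient accumulation
# consistency, parameter-group optimizers, and state_dict save/load round-trip.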
def test_dp_stage2():
mlp = MLP()
state_dict = mlp.state_dict()
mlp1 = MLP()
mlp2 = MLP()
mlp3 = MLP()
mlp4 = MLP()
mlp5 = MLP()
mlp6 = MLP()
mlp1.set_state_dict(state_dict)
mlp2.set_state_dict(state_dict)
mlp3.set_state_dict(state_dict)
mlp4.set_state_dict(state_dict)
mlp5.set_state_dict(state_dict)
mlp6.set_state_dict(state_dict)
# DP VS stage2
dp_params = train_mlp(
mlp1, sharding_stage="dp", use_pure_fp16=False, opt_group=False
)
stage2_params = train_mlp(
mlp2, sharding_stage=2, use_pure_fp16=False, opt_group=False
)
for i in range(len(dp_params)):
np.testing.assert_allclose(
dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6, atol=5e-4
)
# stage2 accumulate grad
stage2_params = train_mlp(mlp3, sharding_stage=2, accumulate_grad=True)
stage2_accumulate_grad = train_mlp(
mlp4, sharding_stage=2, batch_size=20, accumulate_grad=True
)
for i in range(len(stage2_params)):
np.testing.assert_allclose(
stage2_params[i].numpy(),
stage2_accumulate_grad[i].numpy(),
rtol=1e-5,
atol=1e-5,
)
# stage2 param list VS param group
stage2_params = train_mlp(
mlp5, sharding_stage=2, use_pure_fp16=False, opt_group=True
)
for i in range(len(dp_params)):
np.testing.assert_allclose(
dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6, atol=5e-4
)
# save/load model
output_dir = tempfile.mkdtemp()
try:
model_file = os.path.join(output_dir, "model.pdmodel")
optimizer_file = os.path.join(output_dir, "model.pdopt")
model_stage2, optimizer_stage2 = train_mlp(
mlp6,
sharding_stage=2,
use_pure_fp16=False,
opt_group=False,
save_model=True,
)
paddle.save(model_stage2.state_dict(), model_file)
paddle.save(optimizer_stage2.state_dict(), optimizer_file)
m_state_dict = paddle.load(model_file)
opt_state_dict = paddle.load(optimizer_file)
model_stage2.set_state_dict(m_state_dict)
optimizer_stage2.set_state_dict(opt_state_dict)
except Exception as e:
shutil.rmtree(output_dir)
raise e
else:
shutil.rmtree(output_dir)
if __name__ == '__main__':
fleet.init(is_collective=True, strategy=strategy)
test_dp_stage2()
@@ -25,13 +25,6 @@ class TestDYgrapShardingDP(TestDistBase):
         self._trainers = 16
         self._init_env()
 
-    def test_hybrid_sharding_stage2(self):
-        self.check_with_place(
-            "mn_dygraph_sharding_stage2.py",
-            backend="nccl",
-            need_envs=os.environ,
-        )
-
     def test_hybrid_sharding_stage3(self):
         self.check_with_place(
             "mn_dygraph_group_sharded_stage3.py",
...