Unverified commit 5d14afd2 authored by digger yu, committed by GitHub

fix typo deepspeed/runtime (#3663)

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Parent 460bec46
@@ -800,9 +800,9 @@ class Init(InsertPostInitMethodToModuleSubClasses):
             f'"nvme_path" in DeepSpeed Config cannot be None if remote device is {OffloadDeviceEnum.nvme}'
     def _post_init_method(self, module):
-        #see_memory_usage(f"Before converting parmas in {module.__class__.__name__}", force=False)
+        #see_memory_usage(f"Before converting params in {module.__class__.__name__}", force=False)
         print_rank_0(f'Converting Params in {module.__class__.__name__}', force=False)
-        see_memory_usage(f"Before converting and partitioning parmas in {module.__class__.__name__}", force=False)
+        see_memory_usage(f"Before converting and partitioning params in {module.__class__.__name__}", force=False)
         global param_count
         for name, param in module.named_parameters(recurse=False):
@@ -825,7 +825,7 @@ class Init(InsertPostInitMethodToModuleSubClasses):
                 param.partition()
         see_memory_usage(
-            f"Param count {param_count}. After converting and partitioning parmas in {module.__class__.__name__}",
+            f"Param count {param_count}. After converting and partitioning params in {module.__class__.__name__}",
             force=False)
     def _convert_to_deepspeed_param(self, param):
@@ -1404,7 +1404,7 @@ class Init(InsertPostInitMethodToModuleSubClasses):
         partition_size = param.ds_tensor.ds_numel
         start = self.get_partition_rank() * partition_size
         end = start + partition_size
-        #print_rank_0("REduce scatter was executed for praam {param.ds_id}")
+        #print_rank_0("REduce scatter was executed for param {param.ds_id}")
         if start < param.ds_numel and end > param.ds_numel:
             elements = param.ds_numel - start
             param.grad.view(-1).narrow(0, start, elements).copy_(reduced_partition.narrow(0, 0, elements))
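For context on the hunk above: in ZeRO-3 each rank owns a fixed-size slice of the flattened parameter, and the last slice may be padded past the true element count, which is why the boundary case `start < ds_numel < end` copies only the remaining elements back into the gradient. Below is a minimal standalone sketch of that offset arithmetic with made-up sizes; the interior-partition branch is an assumption about code the hunk does not show, and none of this is the DeepSpeed implementation itself.

import torch

# Hypothetical sizes: a parameter with 10 elements split across 4 ranks.
ds_numel = 10                      # true number of elements in the param
world_size = 4
partition_size = (ds_numel + world_size - 1) // world_size  # 3, padded partitioning

grad = torch.zeros(ds_numel)       # stands in for the flat view of param.grad

for rank in range(world_size):
    # Each rank's reduced partition, padded to partition_size.
    reduced_partition = torch.full((partition_size,), float(rank))
    start = rank * partition_size
    end = start + partition_size
    if start < ds_numel and end > ds_numel:
        # Boundary partition (the case shown in the diff): copy only the
        # elements that actually exist in the parameter.
        elements = ds_numel - start
        grad.narrow(0, start, elements).copy_(reduced_partition.narrow(0, 0, elements))
    elif start < ds_numel:
        # Interior partition (assumed branch): copy the whole slice.
        grad.narrow(0, start, partition_size).copy_(reduced_partition)
    # A rank whose slice starts past ds_numel holds only padding and copies nothing.

print(grad)  # tensor([0., 0., 0., 1., 1., 1., 2., 2., 2., 3.])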
@@ -892,7 +892,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
             else:
                 self.fp32_partitioned_groups_flat[i].grad = gradient_buffer.narrow(0, 0, num_elements)
-            # Initialize the optimizer states with the flattended fp32 partition.
+            # Initialize the optimizer states with the flattened fp32 partition.
             if not is_adagrad:
                 self._optimizer_step(i)
@@ -906,7 +906,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
                 f'[End] Initialize optimizer states {i} / {num_subgroups} subgroups, num_elems: {num_elements}, swappable opt/param:{swappable_optimizer_subgroup}/{swappable_param_subgroup}',
                 force=False)
-        # Initialize the optimizer states with the flattended fp32 partition.
+        # Initialize the optimizer states with the flattened fp32 partition.
         if is_adagrad:
            self.optimizer = torch.optim.Adagrad(self.fp32_partitioned_groups_flat, **self.optimizer.defaults)
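The Stage 3 hunks above touch the step where optimizer state is materialized: each subgroup's flat fp32 partition temporarily gets a gradient that is a narrow view of a reusable buffer, and one optimizer step is taken so the optimizer allocates its per-parameter state against that flat tensor. A rough standalone sketch of the pattern follows, using plain torch.optim.Adam and toy sizes instead of DeepSpeed's actual subgroup, swapping, and offload bookkeeping.

import torch

# Toy "subgroups": two flat fp32 partitions of different sizes.
fp32_partitioned_groups_flat = [torch.zeros(8, requires_grad=True),
                                torch.zeros(5, requires_grad=True)]

# One reusable gradient buffer sized for the largest subgroup.
largest = max(p.numel() for p in fp32_partitioned_groups_flat)
gradient_buffer = torch.zeros(largest)

optimizer = torch.optim.Adam(fp32_partitioned_groups_flat, lr=1e-3)

for flat_partition in fp32_partitioned_groups_flat:
    num_elements = flat_partition.numel()
    # Point .grad at a zero-filled slice of the shared buffer ...
    flat_partition.grad = gradient_buffer.narrow(0, 0, num_elements)
    # ... and take one step so the optimizer lazily allocates its state
    # (exp_avg / exp_avg_sq for Adam) for this flat partition.
    optimizer.step()
    flat_partition.grad = None

for p in fp32_partitioned_groups_flat:
    print(sorted(optimizer.state[p].keys()))  # ['exp_avg', 'exp_avg_sq', 'step']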
@@ -611,7 +611,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
            self.single_partition_of_fp32_groups[i].grad = get_accelerator().pin_memory(
                single_grad_partition) if self.cpu_offload else single_grad_partition
-        # Initialize the optimizer states with the flattended fp32 partition.
+        # Initialize the optimizer states with the flattened fp32 partition.
         # State initialization for the Adagrad optimizer occurs at construction as opposed to other optimizers
         # which do lazy initialization of the state at the first call to step.
         if isinstance(self.optimizer, torch.optim.Adagrad):
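The Adagrad special-casing seen in both the Stage 3 and Stage 1/2 hunks follows from the behavior described in the restored comment: torch.optim.Adagrad allocates its per-parameter state at construction, whereas optimizers such as Adam create their state lazily on the first step() that sees a gradient. A quick standalone check of that difference (plain PyTorch on a toy tensor, not DeepSpeed code):

import torch

param = torch.zeros(4, requires_grad=True)

# Adagrad populates per-parameter state ('sum', 'step') at construction time ...
adagrad = torch.optim.Adagrad([param], lr=0.1)
print(sorted(adagrad.state[param].keys()))   # ['step', 'sum']

# ... while Adam creates its state lazily, on the first step() that sees a grad.
adam = torch.optim.Adam([param], lr=0.1)
print(len(adam.state))                       # 0 -> no state yet

param.grad = torch.ones_like(param)
adam.step()
print(sorted(adam.state[param].keys()))      # ['exp_avg', 'exp_avg_sq', 'step']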