Unverified · Commit 5127b2fa authored by Stas Bekman, committed by GitHub

improve debug (#1215)

Parent 497b741f
@@ -18,7 +18,7 @@ from .offload_constants import *
 from ..utils import see_memory_usage
 from deepspeed.utils import log_dist, init_distributed
-from deepspeed.utils.debug import debug_param2name_id_shape, debug_module2name, debug_param2name, debug_param2name_id_shape_status, printflock, log_rank_file
+from deepspeed.utils.debug import debug_param2name_id_shape, debug_param2name_id_shape_device, debug_module2name, debug_param2name, debug_param2name_id_shape_status, printflock, log_rank_file
 from ..swap_tensor.partitioned_param_swapper import AsyncPartitionedParameterSwapper, PartitionedParamStatus
 from ..config import DeepSpeedConfig
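
The only functional change in this hunk is importing the additional helper debug_param2name_id_shape_device, which the hunks below use in place of hand-rolled ds_id/device/shape formatting. As a rough illustration of what such a helper does, here is a minimal sketch, assuming the parameter carries the ds_id (and optionally ds_name) attributes that ZeRO Stage 3 attaches; the real implementation lives in deepspeed/utils/debug.py and may format things differently:

import torch

def debug_param2name_id_shape_device(param: torch.nn.Parameter) -> str:
    # Hypothetical sketch: format name, ds_id, shape, and device for debug logs.
    # Assumes ZeRO Stage 3 has attached ds_id (and optionally ds_name) to the parameter.
    name = getattr(param, "ds_name", "unnamed")
    return (f"name={name} id={param.ds_id} "
            f"shape={tuple(param.shape)} device={param.device}")
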
@@ -568,7 +568,7 @@ class Init(InsertPostInitMethodToModuleSubClasses):
         def partition(param_list=None, hierarchy=0, has_been_updated=False):
             cls = param
             print_rank_0(
-                f"{'--'*hierarchy}----Partitioning param with id {cls.ds_id} dev {cls.device} shape {cls.shape}"
+                f"{'--'*hierarchy}----Partitioning param {debug_param2name_id_shape_device(cls)}"
             )
             if param_list is None:
                 param_list = [cls]
@@ -589,7 +589,8 @@ class Init(InsertPostInitMethodToModuleSubClasses):
                                 accumulate=False):
             cls = param
             print_rank_0(
-                f"{'--'*hierarchy}----Partitioning param gradient with id {cls.ds_id}")
+                f"{'--'*hierarchy}----Partitioning param gradient with id {debug_param2name_id_shape_device(cls)}"
+            )
             if param_list is None:
                 param_list = [cls]
             if isinstance(partition_buffers, torch.Tensor):
@@ -2182,7 +2182,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
         params_to_reduce = [param for i, param, param_id in self.params_in_ipg_bucket]
         #print(f"Params in ipg bucket {self.params_in_ipg_bucket}")
-        #print(f"Reducing {[(param.ds_id, param.grad) for param in params_to_reduce]}")
+        #print(f"Reducing {[(debug_param2name_id_shape(param), param.grad) for param in params_to_reduce]}")
         #exit(0)
         if self.contiguous_gradients:
             reduction_list = [self.ipg_buffer[self.ipg_index]]
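
This final hunk only touches a commented-out diagnostic, but it shows the intended calling pattern: hand the parameter itself to a debug helper instead of formatting ds_id and grad inline at every call site. A hypothetical, self-contained usage example follows; the helper body is again an assumption, mirroring the sketch above without the device field:

import torch

def debug_param2name_id_shape(param: torch.nn.Parameter) -> str:
    # Hypothetical sketch: like debug_param2name_id_shape_device, minus the device.
    name = getattr(param, "ds_name", "unnamed")
    return f"name={name} id={param.ds_id} shape={tuple(param.shape)}"

# Stand-in for a ZeRO-3 partitioned parameter with its debug attributes attached.
p = torch.nn.Parameter(torch.zeros(4, 8))
p.ds_id = 0
p.ds_name = "linear.weight"

params_to_reduce = [p]
print(f"Reducing {[(debug_param2name_id_shape(q), q.grad) for q in params_to_reduce]}")
# Prints: Reducing [('name=linear.weight id=0 shape=(4, 8)', None)]
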