Unverified · Commit 5127b2fa authored by Stas Bekman, committed by GitHub

improve debug (#1215)

Parent 497b741f
@@ -18,7 +18,7 @@ from .offload_constants import *
 from ..utils import see_memory_usage
 from deepspeed.utils import log_dist, init_distributed
-from deepspeed.utils.debug import debug_param2name_id_shape, debug_module2name, debug_param2name, debug_param2name_id_shape_status, printflock, log_rank_file
+from deepspeed.utils.debug import debug_param2name_id_shape, debug_param2name_id_shape_device, debug_module2name, debug_param2name, debug_param2name_id_shape_status, printflock, log_rank_file
 from ..swap_tensor.partitioned_param_swapper import AsyncPartitionedParameterSwapper, PartitionedParamStatus
 from ..config import DeepSpeedConfig
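
The only functional change in this hunk is importing the additional helper debug_param2name_id_shape_device, which the hunks below use in place of hand-rolled ds_id/device/shape formatting. As a rough illustration of what such a helper does, here is a minimal sketch, assuming the parameter carries the ds_id (and optionally ds_name) attributes that ZeRO Stage 3 attaches; the real implementation lives in deepspeed/utils/debug.py and may format things differently:

import torch

def debug_param2name_id_shape_device(param: torch.nn.Parameter) -> str:
    # Hypothetical sketch: format name, ds_id, shape, and device for debug logs.
    # Assumes ZeRO Stage 3 has attached ds_id (and optionally ds_name) to the parameter.
    name = getattr(param, "ds_name", "unnamed")
    return (f"name={name} id={param.ds_id} "
            f"shape={tuple(param.shape)} device={param.device}")
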
@@ -568,7 +568,7 @@ class Init(InsertPostInitMethodToModuleSubClasses):
         def partition(param_list=None, hierarchy=0, has_been_updated=False):
             cls = param
             print_rank_0(
-                f"{'--'*hierarchy}----Partitioning param with id {cls.ds_id} dev {cls.device} shape {cls.shape}"
+                f"{'--'*hierarchy}----Partitioning param {debug_param2name_id_shape_device(cls)}"
             )
             if param_list is None:
                 param_list = [cls]
@@ -589,7 +589,8 @@ class Init(InsertPostInitMethodToModuleSubClasses):
                                 accumulate=False):
             cls = param
             print_rank_0(
-                f"{'--'*hierarchy}----Partitioning param gradient with id {cls.ds_id}")
+                f"{'--'*hierarchy}----Partitioning param gradient with id {debug_param2name_id_shape_device(cls)}"
+            )
             if param_list is None:
                 param_list = [cls]
             if isinstance(partition_buffers, torch.Tensor):
@@ -2182,7 +2182,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
         params_to_reduce = [param for i, param, param_id in self.params_in_ipg_bucket]
         #print(f"Params in ipg bucket {self.params_in_ipg_bucket}")
-        #print(f"Reducing {[(param.ds_id, param.grad) for param in params_to_reduce]}")
+        #print(f"Reducing {[(debug_param2name_id_shape(param), param.grad) for param in params_to_reduce]}")
         #exit(0)
         if self.contiguous_gradients:
             reduction_list = [self.ipg_buffer[self.ipg_index]]
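
This final hunk only touches a commented-out diagnostic, but it shows the intended calling pattern: hand the parameter itself to a debug helper instead of formatting ds_id and grad inline at every call site. A hypothetical, self-contained usage example follows; the helper body is again an assumption, mirroring the sketch above without the device field:

import torch

def debug_param2name_id_shape(param: torch.nn.Parameter) -> str:
    # Hypothetical sketch: like debug_param2name_id_shape_device, minus the device.
    name = getattr(param, "ds_name", "unnamed")
    return f"name={name} id={param.ds_id} shape={tuple(param.shape)}"

# Stand-in for a ZeRO-3 partitioned parameter with its debug attributes attached.
p = torch.nn.Parameter(torch.zeros(4, 8))
p.ds_id = 0
p.ds_name = "linear.weight"

params_to_reduce = [p]
print(f"Reducing {[(debug_param2name_id_shape(q), q.grad) for q in params_to_reduce]}")
# Prints: Reducing [('name=linear.weight id=0 shape=(4, 8)', None)]
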