Unverified commit 261d6370, authored by Olatunji Ruwase and committed by GitHub

Make fp32 default communication data type (#2970)

* Make fp32 default communication data type

* Fix asserts
Parent a6317eb5
@@ -797,10 +797,9 @@ class DeepSpeedEngine(Module):
         res = self._config.communication_data_type
         if res is not None:
             return res
-        elif self.fp16_enabled() or self.zero_optimization_stage():
+        if self.fp16_enabled():
             return torch.float16
         elif self.bfloat16_enabled():
             return torch.bfloat16
         return torch.float32
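For reference, the "if res is not None" branch above honors an explicit communication_data_type entry in the DeepSpeed config, so the dtype can still be pinned regardless of the fp16/bf16 regime. Below is a minimal sketch of such an override, not part of this commit; the toy model, batch size, and optimizer settings are invented for illustration, and the accepted value spellings should be checked against DeepSpeed's config schema.

import torch
import deepspeed

# Hypothetical toy model so the snippet is self-contained.
model = torch.nn.Linear(16, 4)

ds_config = {
    "train_batch_size": 8,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
    "zero_optimization": {"stage": 2},
    # Explicit override, picked up by the "res is not None" branch.
    # Without this key, and with fp16/bf16 disabled, the new default is fp32.
    "communication_data_type": "fp32",
}

# Typically launched with the deepspeed launcher so distributed init succeeds.
engine, _, _, _ = deepspeed.initialize(model=model,
                                       model_parameters=model.parameters(),
                                       config=ds_config)
print(engine.communication_data_type)  # expected: torch.float32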
@@ -208,9 +208,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
         self.reduce_bucket_size = int(reduce_bucket_size)
         if self.reduce_scatter:
-            assert self.communication_data_type in (
-                torch.float16, torch.bfloat16, torch.float32
-            ), f"ZeRO-3 supports only float16 or bfloat16 communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'"
+            valid_reduce_scatter_dtypes = (torch.float16, torch.bfloat16, torch.float32)
+            assert self.communication_data_type in valid_reduce_scatter_dtypes, f"ZeRO-3 supports {valid_reduce_scatter_dtypes} communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'"
             assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-3 with reduce scatter enabled"
             assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-3 with reduce scatter enabled"
@@ -156,6 +156,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
         # ZeRO stage 1 (False) or 2 (True)
         self.partition_gradients = partition_grads
+        self.zero_stage_string = "ZeRO-2" if partition_grads else "ZeRO-1"
         self.timers = timers
@@ -218,16 +219,16 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
         self.fp16_master_weights_and_gradients = fp16_master_weights_and_gradients
         if self.fp16_master_weights_and_gradients:
-            assert self.cpu_offload and type(self.optimizer) in [
-                DeepSpeedCPUAdam
-            ], f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32. Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}. Either disable fp16_master_weights_and_gradients or enable ZeRO-2 Offload with DeepSpeedCPUAdam"
+            assert self.cpu_offload and type(self.optimizer) in [DeepSpeedCPUAdam], \
+                f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32. " \
+                f"Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}. " \
+                f"Either disable fp16_master_weights_and_gradients or enable {self.zero_stage_string} Offload with DeepSpeedCPUAdam."
         if self.reduce_scatter:
-            assert self.communication_data_type in (
-                torch.float16, torch.bfloat16
-            ), f"ZeRO-2 supports only float16 or bfloat16 communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'"
-            assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-2 with reduce scatter enabled"
-            assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-2 with reduce scatter enabled"
+            valid_reduce_scatter_dtypes = (torch.float16, torch.bfloat16, torch.float32)
+            assert self.communication_data_type in valid_reduce_scatter_dtypes, f"{self.zero_stage_string} supports {valid_reduce_scatter_dtypes} communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'"
+            assert self.gradient_predivide_factor == 1.0, f"gradient_predivide_factor != 1.0 is not yet supported with {self.zero_stage_string} with reduce scatter enabled"
+            assert self.postscale_gradients, f"pre-scale gradients is not yet supported with {self.zero_stage_string} with reduce scatter enabled"
         # param flattened by groups
         self.bit16_groups = []
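The relaxed asserts above only widen what the ZeRO optimizers accept; the communication dtype itself governs how gradients are cast for the reduce-scatter collective. The standalone sketch below is not DeepSpeed's internal code; the function name and the postscale averaging placement are illustrative, and it assumes an initialized process group and a gradient length divisible by the world size. It shows why the choice matters: gradients are cast to the communication dtype before the collective and cast back afterwards, so fp16 communication can lose precision that fp32 communication preserves.

import torch
import torch.distributed as dist

def reduce_scatter_grads(flat_grad: torch.Tensor, comm_dtype: torch.dtype) -> torch.Tensor:
    # Cast to the communication dtype, sum-reduce and scatter one shard per
    # rank, then cast back to the parameter dtype and average (postscale).
    world_size = dist.get_world_size()
    shard = torch.empty(flat_grad.numel() // world_size,
                        dtype=comm_dtype, device=flat_grad.device)
    dist.reduce_scatter_tensor(shard, flat_grad.to(comm_dtype))
    return shard.to(flat_grad.dtype) / world_size

With comm_dtype=torch.float32, the new default when neither fp16 nor bf16 is enabled, the cast is lossless, which is exactly the configuration the ZeRO-2/ZeRO-3 asserts now permit with reduce scatter enabled.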