],f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32. Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}. Either disable fp16_master_weights_and_gradients or enable ZeRO-2 Offload with DeepSpeedCPUAdam"
f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32."\
f"Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}." \
f"Either disable fp16_master_weights_and_gradients or enable {self.zero_stage_string} Offload with DeepSpeedCPUAdam."
if self.reduce_scatter:
    # Reduce-scatter constrains the communication path: only a fixed set of
    # dtypes is supported, and gradient pre-/post-scaling must stay at the
    # defaults (predivide factor 1.0, post-scaling enabled).
    # NOTE(review): this tuple's definition was not visible in this chunk; it
    # is restored here to match the assertion below — confirm against the
    # file's original definition.
    valid_reduce_scatter_dtypes = (torch.float16, torch.bfloat16, torch.float32)
    assert self.communication_data_type in valid_reduce_scatter_dtypes, \
        f"{self.zero_stage_string} supports {valid_reduce_scatter_dtypes} communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'"
    # The two messages below interpolate {self.zero_stage_string}; the original
    # lines were missing the f prefix, so the placeholder rendered literally.
    assert self.gradient_predivide_factor == 1.0, \
        f"gradient_predivide_factor != 1.0 is not yet supported with {self.zero_stage_string} with reduce scatter enabled"
    assert self.postscale_gradients, \
        f"pre-scale gradients is not yet supported with {self.zero_stage_string} with reduce scatter enabled"