diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 538cc04759b06b234526695042f01749780d88f0..0e45aa6ddab138452f354c1d45d867fd6d37ab39 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -653,11 +653,10 @@ def replace_transformer_layer(orig_layer_impl, new_bias = torch.empty((weight_shape[0]), device=child.weight.device, dtype=child.weight.dtype) - if z_inference: with deepspeed.zero.GatheredParameters(child.bias, modifier_rank=0): new_bias.data.copy_(child.bias.data) - else: + elif child.bias: new_bias.data.copy_(child.bias.data) return LinearAllreduce(data, child.bias if child.bias is None else \ torch.nn.parameter.Parameter(new_bias.to(torch.cuda.current_device())), mp_group) @@ -704,7 +703,9 @@ def replace_transformer_layer(orig_layer_impl, child.weight.shape[1] // mp_size), device=child.weight.device, dtype=child.weight.dtype) - data = mp_replace.copy(new_weight, child.weight.ds_tensor.data) + data = mp_replace.copy(new_weight, + child.weight.ds_tensor.data if hasattr(child.weight, 'ds_tensor') else \ + child.weight.data) new_embedding = nn.Embedding(child.weight.shape[0], child.weight.shape[1] // mp_size) new_embedding.weight.data.copy_(data)