Unverified · Commit 6f77da1b authored by Kevin Ko, committed by GitHub

Add `scale_attn_by_inverse_layer_idx` feature (#2486)

* Add scale_attn_by_inverse_layer_idx feature

* Fix layer_id bug

* Fix scaling value
Co-authored-by: Connor Holmes <connorholmes@microsoft.com>
Co-authored-by: Reza Yazdani <44502768+RezaYazdaniAminabadi@users.noreply.github.com>
Parent d2d1b4c3
@@ -406,6 +406,9 @@ def replace_transformer_layer(orig_layer_impl,
        quantizer = GroupQuantizer(q_int8=quantize)
        if inference:
            scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx if hasattr(
                config,
                'scale_attn_by_inverse_layer_idx') else False
            if moe:
                ep_world_size = dist.get_world_size()
                local_ep_size = 1 if num_experts < ep_world_size else num_experts // ep_world_size
@@ -422,7 +425,8 @@ def replace_transformer_layer(orig_layer_impl,
                    q_int8=quantize,
                    moe_experts=local_ep_size,
                    global_experts=num_experts,
                    mlp_type=moe_type)
                    mlp_type=moe_type,
                    scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx)
            else:
                rotary_dim = config.rotary_dim if hasattr(config, 'rotary_dim') else child.attention.rotary_ndims \
                    if hasattr(child, 'attention') and hasattr(child.attention, 'rotary_ndims') else -1
@@ -454,7 +458,8 @@ def replace_transformer_layer(orig_layer_impl,
                    mlp_act_func_type=policy.mlp_act_func_type,
                    training_mp_size=training_mp_size,
                    bigscience_bloom=bigscience_bloom,
                    max_out_tokens=max_out_tokens)
                    max_out_tokens=max_out_tokens,
                    scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx)
            if quantize and quantize_settings is not None:
                (quantization_scales,
......
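For context, a minimal usage sketch (not part of this commit): as the hunks above show, replace_transformer_layer reads the flag off the injected model's HF config, so enabling it there is enough for kernel injection to pick it up. The GPT-2 model choice and init_inference arguments below are illustrative.

import torch
import deepspeed
from transformers import GPT2Config, GPT2LMHeadModel

# scale_attn_by_inverse_layer_idx is an existing GPT2Config field.
config = GPT2Config(scale_attn_by_inverse_layer_idx=True)
model = GPT2LMHeadModel(config)

# init_inference triggers replace_transformer_layer, which forwards the
# flag into the inference config as shown in the hunks above.
engine = deepspeed.init_inference(model,
                                  mp_size=1,
                                  dtype=torch.half,
                                  replace_with_kernel_inject=True)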
@@ -227,6 +227,10 @@ class DeepSpeedAttention(nn.Module):
        self.norm_factor = math.sqrt(
            math.sqrt(self.config.hidden_size // self.config.heads))
        if self.config.scale_attn_by_inverse_layer_idx is True:
            self.norm_factor *= math.sqrt(self.config.layer_id + 1)
            # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/gpt2/modeling_gpt2.py#L191
        self.score_context_func = inference_cuda_module.softmax_context_fp32 if (not config.fp16) else \
            inference_cuda_module.softmax_context_fp16
        self.linear_func = inference_cuda_module.linear_layer_fp16 if config.fp16 else \
......
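Why a square root: q and k are each divided by norm_factor before the score matmul, so multiplying norm_factor by sqrt(layer_id + 1) divides the raw QK^T scores by (layer_id + 1), matching the HF GPT-2 behavior linked in the comment. A standalone check of the identity (the d_head and layer_id values are arbitrary):

import math
import torch

d_head, layer_id = 64, 5
q = torch.randn(4, d_head)
k = torch.randn(4, d_head)

base = math.sqrt(math.sqrt(d_head))      # norm_factor before this change
nf = base * math.sqrt(layer_id + 1)      # norm_factor with the flag enabled

# q and k are each pre-scaled by 1/norm_factor on the kernel path.
scaled = (q / nf) @ (k / nf).t()

# HF GPT-2 equivalent: scores / sqrt(d_head), then / (layer_idx + 1).
reference = (q @ k.t()) / math.sqrt(d_head) / (layer_id + 1)
assert torch.allclose(scaled, reference, atol=1e-5)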
@@ -64,7 +64,8 @@ class DeepSpeedInferenceConfig(TransformerConfig):
                 mlp_act_func_type=ActivationFuncType.GELU,
                 training_mp_size=1,
                 bigscience_bloom=False,
                 max_out_tokens=1024):
                 max_out_tokens=1024,
                 scale_attn_by_inverse_layer_idx=False):
        super(DeepSpeedInferenceConfig,
              self).__init__(
                  hidden_size,
@@ -92,6 +93,7 @@ class DeepSpeedInferenceConfig(TransformerConfig):
        self.training_mp_size = training_mp_size
        self.bigscience_bloom = bigscience_bloom
        self.max_out_tokens = max_out_tokens
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx

    @classmethod
    def from_dict(cls, json_object):
......
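A hedged construction sketch for the extended config; the import path and the heads/fp16 keyword names are assumptions drawn from the surrounding class, not shown in this diff:

from deepspeed.ops.transformer import DeepSpeedInferenceConfig

# New keyword defaults to False, so existing callers are unaffected.
cfg = DeepSpeedInferenceConfig(hidden_size=768,
                               heads=12,
                               fp16=True,
                               scale_attn_by_inverse_layer_idx=True)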
@@ -277,8 +277,6 @@ class DeepSpeedSelfAttentionFunction(Function):
            ) else 0
            sliced_alibi = alibi[offset:batch_heads + offset, :, :]
            #
            attn_key_value = score_context_func(
                qkv_out,
                ((1 - input_mask).to(qkv_out.dtype) *
@@ -436,6 +434,10 @@ class DeepSpeedSelfAttention(nn.Module):
            math.sqrt(self.config.hidden_size // self.config.heads))
        self.qkv_merging = qkv_merging
        if self.config.scale_attn_by_inverse_layer_idx is True:
            self.norm_factor *= math.sqrt(self.config.layer_id + 1)
            # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/gpt2/modeling_gpt2.py#L191
        global inference_cuda_module
        if inference_cuda_module is None:
            builder = op_builder.InferenceBuilder()
......
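The practical effect of the layer_id-dependent factor set above: because the scores absorb norm_factor twice (once via q, once via k), the effective divisor on raw QK^T grows linearly with depth, so deeper layers get progressively stronger damping. A quick sketch of the progression under these assumptions:

import math

d_head = 64
for layer_id in range(4):
    nf = math.sqrt(math.sqrt(d_head)) * math.sqrt(layer_id + 1)
    # effective divisor on raw QK^T scores is nf * nf:
    print(layer_id, nf * nf)   # ≈ 8.0, 16.0, 24.0, 32.0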
@@ -68,7 +68,8 @@ class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig):
                 noisy_gate_policy=None,
                 drop_tokens=True,
                 use_rts=False,
                 mlp_type='standard'):
                 mlp_type='standard',
                 scale_attn_by_inverse_layer_idx=False):
        super(DeepSpeedMoEInferenceConfig,
              self).__init__(
                  hidden_size,
@@ -97,6 +98,7 @@ class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig):
        self.use_rts = use_rts
        self.global_experts = global_experts
        self.mlp_type = mlp_type
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx

    @classmethod
    def from_dict(cls, json_object):
......