未验证 提交 31ddaae2 编写于 作者: W WangXi 提交者: GitHub

fused_attention fused_feedforward api support Model Tensor Parallel (#42985)

上级 360b8383
......@@ -20,156 +20,11 @@ import paddle
import paddle.fluid as fluid
from test_dist_base import TestDistRunnerBase, runtime_main
import paddle.distributed.fleet as fleet
import paddle.incubate.nn.functional as incubate_f
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
from paddle.fluid.dygraph.layers import Layer
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid import core
from paddle.nn.initializer import Constant
from paddle.incubate.nn import FusedMultiHeadAttention
paddle.enable_static()
def _set_var_distributed(var):
if var is None:
return
var.is_distributed = True
# NOTE: use current_block and find_var_recursive to support while_loop
startup_block = paddle.static.default_startup_program().current_block()
main_block = paddle.static.default_main_program().current_block()
startup_block._find_var_recursive(var.name).is_distributed = True
main_block._find_var_recursive(var.name).is_distributed = True
class ParallelFusedMultiHeadAttention(Layer):
def __init__(self,
embed_dim,
num_heads,
dropout_rate=0.5,
attn_dropout_rate=0.5,
kdim=None,
vdim=None,
normalize_before=False,
need_weights=False,
qkv_weight_attr=None,
qkv_bias_attr=None,
linear_weight_attr=None,
linear_bias_attr=None,
pre_ln_scale_attr=None,
pre_ln_bias_attr=None,
ln_scale_attr=None,
ln_bias_attr=None,
epsilon=1e-5,
nranks=1,
ring_id=-1,
name=None):
super(ParallelFusedMultiHeadAttention, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
"but received {}".format(embed_dim))
assert num_heads > 0, ("Expected nhead to be greater than 0, "
"but received {}".format(num_heads))
self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype()
self._epsilon = epsilon
self._ring_id = ring_id
self.embed_dim = embed_dim
self.num_heads = num_heads
self.head_dim = embed_dim // num_heads
self.kdim = kdim
self.vdim = vdim
self.need_weights = need_weights
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
assert need_weights == False, "Only support need_weight is False now."
# tensor model parallel
assert num_heads % nranks == 0
num_heads = num_heads // nranks
self.qkv_weight = self.create_parameter(
shape=[3, num_heads, self.head_dim, embed_dim],
attr=qkv_weight_attr,
dtype=self._dtype,
is_bias=False)
self.qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim],
attr=qkv_bias_attr,
dtype=self._dtype,
is_bias=True)
self.linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr,
dtype=self._dtype,
is_bias=False)
self.linear_bias = self.create_parameter(shape=[embed_dim],
attr=linear_bias_attr,
dtype=self._dtype,
is_bias=True)
# tensor model parallel
if nranks > 1:
assert ring_id != -1
# column parallel
_set_var_distributed(self.qkv_weight)
_set_var_distributed(self.qkv_bias)
# row parallel
_set_var_distributed(self.linear_weight)
if normalize_before:
self.pre_ln_scale = self.create_parameter(
attr=pre_ln_scale_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr,
shape=[embed_dim],
is_bias=True)
self.ln_scale = None
self.ln_bias = None
else:
self.pre_ln_scale = None
self.pre_ln_bias = None
self.ln_scale = self.create_parameter(
attr=ln_scale_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
self.ln_bias = self.create_parameter(attr=ln_bias_attr,
shape=[embed_dim],
is_bias=True)
self.dropout_rate = dropout_rate
self.attn_dropout_rate = attn_dropout_rate
self.name = name
def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
out = incubate_f.fused_multi_head_attention(
x=query,
qkv_weight=self.qkv_weight,
linear_weight=self.linear_weight,
pre_layer_norm=self.normalize_before,
pre_ln_scale=self.pre_ln_scale,
pre_ln_bias=self.pre_ln_bias,
ln_scale=self.ln_scale,
ln_bias=self.ln_bias,
pre_ln_epsilon=self._epsilon,
qkv_bias=self.qkv_bias,
linear_bias=self.linear_bias,
attn_mask=attn_mask,
dropout_rate=self.dropout_rate,
attn_dropout_rate=self.attn_dropout_rate,
ln_epsilon=self._epsilon,
training=self.training,
ring_id=self._ring_id,
name=self.name)
return out
def get_param_attr(weight, bias):
weight_attr = paddle.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(weight))
......@@ -208,7 +63,7 @@ def create_model(data, rank):
qkv_w_attr, qkv_b_attr = get_param_attr(col_qkv_w, col_qkv_b)
linear_w_attr, linear_b_attr = get_param_attr(row_linear_w, linear_b)
attn = ParallelFusedMultiHeadAttention(hidden,
attn = FusedMultiHeadAttention(hidden,
n_head,
dropout_rate=0.0,
attn_dropout_rate=0.0,
......@@ -229,7 +84,7 @@ def create_model(data, rank):
qkv_w_attr, qkv_b_attr = get_param_attr(qkv_w, qkv_b)
linear_w_attr, linear_b_attr = get_param_attr(linear_w, linear_b)
attn = ParallelFusedMultiHeadAttention(hidden,
attn = FusedMultiHeadAttention(hidden,
n_head,
dropout_rate=0.0,
attn_dropout_rate=0.0,
......
......@@ -20,11 +20,7 @@ import paddle
import paddle.fluid as fluid
from test_dist_base import TestDistRunnerBase, runtime_main
import paddle.distributed.fleet as fleet
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
from paddle.fluid.dygraph.layers import Layer
from paddle.fluid.layer_helper import LayerHelper
from paddle.nn.initializer import Constant
from paddle.incubate.nn import FusedFeedForward
paddle.enable_static()
......@@ -34,239 +30,6 @@ IN_SIZE = 2 * MODEL_PARALLEL_SIZE
OUT_SIZE = 2 * MODEL_PARALLEL_SIZE
def fused_feedforward(x,
linear1_weight,
linear2_weight,
linear1_bias=None,
linear2_bias=None,
ln1_scale=None,
ln1_bias=None,
ln2_scale=None,
ln2_bias=None,
dropout1_rate=0.5,
dropout2_rate=0.5,
activation="relu",
ln1_epsilon=1e-5,
ln2_epsilon=1e-5,
pre_layer_norm=False,
training=True,
mode='upscale_in_train',
ring_id=-1,
name=None):
seed = None
if mode not in ('downscale_in_infer', 'upscale_in_train'):
raise ValueError(
"mode argument should be 'downscale_in_infer' or 'upscale_in_train'"
)
mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer
helper = LayerHelper("fused_feedforward")
dtype = x.dtype
check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
'fused_feedforward')
check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'],
'fused_feedforward')
out = helper.create_variable_for_type_inference(x.dtype)
dropout1_mask = helper.create_variable_for_type_inference(
'uint8', stop_gradient=True)
dropout2_mask = helper.create_variable_for_type_inference(
'uint8', stop_gradient=True)
ln1_mean = helper.create_variable_for_type_inference(x.dtype,
stop_gradient=True)
ln1_variance = helper.create_variable_for_type_inference(x.dtype,
stop_gradient=True)
ln2_mean = helper.create_variable_for_type_inference(x.dtype,
stop_gradient=True)
ln2_variance = helper.create_variable_for_type_inference(x.dtype,
stop_gradient=True)
linear1_out = helper.create_variable_for_type_inference(x.dtype,
stop_gradient=True)
ln1_out = helper.create_variable_for_type_inference(x.dtype,
stop_gradient=True)
dropout1_out = helper.create_variable_for_type_inference(x.dtype,
stop_gradient=True)
dropout2_out = helper.create_variable_for_type_inference(x.dtype,
stop_gradient=True)
if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
seed = helper.main_program.random_seed
helper.append_op(type='fused_feedforward',
inputs={
'X': x,
'Linear1Weight': linear1_weight,
'Linear1Bias': linear1_bias,
'Linear2Weight': linear2_weight,
'Linear2Bias': linear2_bias,
'Ln1Scale': ln1_scale,
'Ln1Bias': ln1_bias,
'Ln2Scale': ln2_scale,
'Ln2Bias': ln2_bias,
},
outputs={
'Out': out,
'Dropout1Mask': dropout1_mask,
'Dropout2Mask': dropout2_mask,
'Ln1Mean': ln1_mean,
'Ln1Variance': ln1_variance,
'Ln2Mean': ln2_mean,
'Ln2Variance': ln2_variance,
'Linear1Out': linear1_out,
'Ln1Out': ln1_out,
'Dropout1Out': dropout1_out,
'Dropout2Out': dropout2_out,
},
attrs={
'dropout1_rate': dropout1_rate,
'dropout2_rate': dropout2_rate,
'act_method': activation,
'pre_layer_norm': pre_layer_norm,
'ln1_epsilon': ln1_epsilon,
'ln2_epsilon': ln2_epsilon,
'dropout1_is_test': not training,
'dropout2_is_test': not training,
'dropout1_fix_seed': seed is not None,
'dropout2_fix_seed': seed is not None,
'dropout1_seed': seed if seed is not None else 0,
'dropout2_seed': seed if seed is not None else 0,
'dropout1_implementation': mode,
'dropout2_implementation': mode,
'ring_id': ring_id,
})
return out
def _set_var_distributed(var):
if var is None:
return
var.is_distributed = True
# NOTE: use current_block and find_var_recursive to support while_loop
startup_block = paddle.static.default_startup_program().current_block()
main_block = paddle.static.default_main_program().current_block()
startup_block._find_var_recursive(var.name).is_distributed = True
main_block._find_var_recursive(var.name).is_distributed = True
class ParallelFusedFeedForward(Layer):
def __init__(self,
d_model,
dim_feedforward,
dropout_rate=0.1,
epsilon=1e-05,
activation="relu",
act_dropout_rate=None,
normalize_before=False,
linear1_weight_attr=None,
linear1_bias_attr=None,
linear2_weight_attr=None,
linear2_bias_attr=None,
ln1_scale_attr=None,
ln1_bias_attr=None,
ln2_scale_attr=None,
ln2_bias_attr=None,
nranks=1,
ring_id=-1,
name=None):
super(ParallelFusedFeedForward, self).__init__()
assert d_model > 0, (
"Expected d_model to be greater than 0, but received {}".format(
d_model))
assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, but received {}".
format(dim_feedforward))
self._dtype = self._helper.get_default_dtype()
self._d_model = d_model
assert dim_feedforward % nranks == 0
dim_feedforward = dim_feedforward // nranks
self._dim_feedforward = dim_feedforward
self._dropout_rate = dropout_rate
self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
self._act_method = activation
self._normalize_before = normalize_before
self._epsilon = epsilon
self._ring_id = ring_id
self._linear1_weight = self.create_parameter(
shape=[d_model, dim_feedforward],
attr=linear1_weight_attr,
dtype=self._dtype,
is_bias=False)
self._linear1_bias = self.create_parameter(shape=[dim_feedforward],
attr=linear1_bias_attr,
dtype=self._dtype,
is_bias=True)
self._linear2_weight = self.create_parameter(
shape=[dim_feedforward, d_model],
attr=linear2_weight_attr,
dtype=self._dtype,
is_bias=False)
self._linear2_bias = self.create_parameter(shape=[d_model],
attr=linear2_bias_attr,
dtype=self._dtype,
is_bias=True)
if nranks > 1:
assert ring_id != -1
# column parallel
_set_var_distributed(self._linear1_weight)
_set_var_distributed(self._linear1_bias)
_set_var_distributed(self._linear2_weight)
if normalize_before:
self._ln1_scale = self.create_parameter(
shape=[d_model],
attr=ln1_scale_attr,
is_bias=False,
default_initializer=Constant(1.0))
self._ln1_bias = self.create_parameter(shape=[d_model],
attr=ln1_bias_attr,
is_bias=True)
self._ln2_scale = None
self._ln2_bias = None
else:
self._ln1_bias = None
self._ln2_bias = None
self._ln2_scale = self.create_parameter(
shape=[d_model],
attr=ln2_scale_attr,
is_bias=False,
default_initializer=Constant(1.0))
self._ln2_bias = self.create_parameter(shape=[d_model],
attr=ln2_bias_attr,
is_bias=True)
self.name = name
def forward(self, src, cache=None):
out = fused_feedforward(src,
self._linear1_weight,
self._linear2_weight,
self._linear1_bias,
self._linear2_bias,
self._ln1_scale,
self._ln1_bias,
self._ln2_scale,
self._ln2_bias,
dropout1_rate=self._act_dropout_rate,
dropout2_rate=self._dropout_rate,
activation=self._act_method,
ln1_epsilon=self._epsilon,
ln2_epsilon=self._epsilon,
pre_layer_norm=self._normalize_before,
training=self.training,
ring_id=self._ring_id,
name=self.name)
return out
def get_param_attr(weight, bias):
weight_attr = paddle.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(weight))
......@@ -295,7 +58,7 @@ def create_model(data, rank):
w0_attr, b0_attr = get_param_attr(col_w0, col_b0)
w1_attr, b1_attr = get_param_attr(row_w1, b1)
ffn = ParallelFusedFeedForward(IN_SIZE,
ffn = FusedFeedForward(IN_SIZE,
OUT_SIZE,
dropout_rate=0.0,
activation='gelu',
......@@ -315,7 +78,7 @@ def create_model(data, rank):
w0_attr, b0_attr = get_param_attr(w0, b0)
w1_attr, b1_attr = get_param_attr(w1, b1)
ffn = ParallelFusedFeedForward(IN_SIZE,
ffn = FusedFeedForward(IN_SIZE,
OUT_SIZE,
dropout_rate=0.0,
activation='gelu',
......
......@@ -83,7 +83,7 @@ def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias,
if ln_bias is None:
has_bias = False
if (pre_layer_norm):
if pre_layer_norm:
ln_out = layer_norm(query, True, has_bias, ln_scale, ln_bias)
num_head = qkv_weight.shape[1]
......@@ -97,7 +97,7 @@ def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias,
if qkv_bias is not None:
qkv_bias = qkv_bias.reshape(qkv_bias.shape[0] * qkv_bias.shape[1] *
qkv_bias.shape[2])
if (pre_layer_norm):
if pre_layer_norm:
ln_out = ln_out.reshape(batch_size * seq_len, embed_dim)
qkv = fc(ln_out, qkv_weight)
if qkv_bias is not None:
......@@ -239,12 +239,12 @@ class TestFusedAttentionAPI(unittest.TestCase):
attn_mask_tensor = paddle.to_tensor(self.attn_mask)
else:
attn_mask_tensor = None
fused_attn = FusedMultiHeadAttention(self.embed_dim, self.num_heads,
self.dropout_prob,
self.attn_dropout_prob, self.kdim,
self.vdim, self.pre_layer_norm,
self.need_weight, self.weight_attr,
self.bias_attr)
fused_attn = FusedMultiHeadAttention(
self.embed_dim, self.num_heads, self.dropout_prob,
self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm,
self.need_weight, self.weight_attr, self.bias_attr,
self.weight_attr, self.bias_attr, self.weight_attr, self.bias_attr,
self.weight_attr, self.bias_attr)
if self.bias_attr is not False:
qkv_bias = np.random.random(
fused_attn.qkv_bias.shape).astype('float32')
......@@ -260,13 +260,19 @@ class TestFusedAttentionAPI(unittest.TestCase):
if self.bias_attr is not False:
fused_attn_qkv_bias = fused_attn.qkv_bias.numpy()
fused_attn_linear_bias = fused_attn.linear_bias.numpy()
if self.pre_layer_norm:
fused_attn_pre_ln_bias = fused_attn.pre_ln_bias.numpy()
fused_attn_ln_bias = None
else:
fused_attn_pre_ln_bias = None
fused_attn_ln_bias = fused_attn.ln_bias.numpy()
ref_out = compute_reference(
self.pre_layer_norm, self.query, self.attn_mask,
fused_attn.pre_ln_scale.numpy(), fused_attn_pre_ln_bias,
fused_attn.ln_scale.numpy(), fused_attn_ln_bias,
fused_attn.pre_ln_scale.numpy() if self.pre_layer_norm else None,
fused_attn_pre_ln_bias,
fused_attn.ln_scale.numpy() if not self.pre_layer_norm else None,
fused_attn_ln_bias,
fused_attn.qkv_weight.numpy(), fused_attn_qkv_bias,
fused_attn.linear_weight.numpy(), fused_attn_linear_bias)
np.testing.assert_allclose(ref_out,
......@@ -275,12 +281,12 @@ class TestFusedAttentionAPI(unittest.TestCase):
atol=self.atol)
def run_static(self):
fused_attn = FusedMultiHeadAttention(self.embed_dim, self.num_heads,
self.dropout_prob,
self.attn_dropout_prob, self.kdim,
self.vdim, self.pre_layer_norm,
self.need_weight, self.weight_attr,
self.bias_attr)
fused_attn = FusedMultiHeadAttention(
self.embed_dim, self.num_heads, self.dropout_prob,
self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm,
self.need_weight, self.weight_attr, self.bias_attr,
self.weight_attr, self.bias_attr, self.weight_attr, self.bias_attr,
self.weight_attr, self.bias_attr)
x = paddle.static.data(
name='X',
......@@ -304,11 +310,27 @@ class TestFusedAttentionAPI(unittest.TestCase):
qkv_bias = None
linear_bias = None
ln_scale = None
ln_2_scale = None
ln_bias = None
ln_2_bias = None
if self.has_attn_mask:
if self.bias_attr is False:
out, qkv_weight, out_linear_weight, ln_scale, ln_2_scale = exe.run(
if self.pre_layer_norm:
out, qkv_weight, out_linear_weight, ln_scale = exe.run(
paddle.static.default_main_program(),
feed={
"X": self.query,
"SrcMask": self.attn_mask
},
fetch_list=[
final_out,
fused_attn.qkv_weight,
fused_attn.linear_weight,
fused_attn.pre_ln_scale,
])
else:
out, qkv_weight, out_linear_weight, ln_2_scale = exe.run(
paddle.static.default_main_program(),
feed={
"X": self.query,
......@@ -316,45 +338,89 @@ class TestFusedAttentionAPI(unittest.TestCase):
},
fetch_list=[
final_out, fused_attn.qkv_weight,
fused_attn.linear_weight, fused_attn.pre_ln_scale,
fused_attn.ln_scale
fused_attn.linear_weight, fused_attn.ln_scale
])
else:
out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run(
if self.pre_layer_norm:
out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias = exe.run(
paddle.static.default_main_program(),
feed={
"X": self.query,
"SrcMask": self.attn_mask
},
fetch_list=[
final_out, fused_attn.qkv_weight, fused_attn.qkv_bias,
fused_attn.linear_weight, fused_attn.linear_bias,
fused_attn.pre_ln_scale, fused_attn.pre_ln_bias,
fused_attn.ln_scale, fused_attn.ln_bias
final_out,
fused_attn.qkv_weight,
fused_attn.qkv_bias,
fused_attn.linear_weight,
fused_attn.linear_bias,
fused_attn.pre_ln_scale,
fused_attn.pre_ln_bias,
])
else:
out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_2_scale, ln_2_bias = exe.run(
paddle.static.default_main_program(),
feed={
"X": self.query,
"SrcMask": self.attn_mask
},
fetch_list=[
final_out, fused_attn.qkv_weight,
fused_attn.qkv_bias, fused_attn.linear_weight,
fused_attn.linear_bias, fused_attn.ln_scale,
fused_attn.ln_bias
])
else:
if self.bias_attr is False:
out, qkv_weight, out_linear_weight, ln_scale, ln_2_scale = exe.run(
if self.pre_layer_norm:
out, qkv_weight, out_linear_weight, ln_scale = exe.run(
paddle.static.default_main_program(),
feed={
"X": self.query,
},
fetch_list=[
final_out,
fused_attn.qkv_weight,
fused_attn.linear_weight,
fused_attn.pre_ln_scale,
])
else:
out, qkv_weight, out_linear_weight, ln_2_scale = exe.run(
paddle.static.default_main_program(),
feed={
"X": self.query,
},
fetch_list=[
final_out, fused_attn.qkv_weight,
fused_attn.linear_weight, fused_attn.pre_ln_scale,
fused_attn.ln_scale
fused_attn.linear_weight, fused_attn.ln_scale
])
else:
out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run(
if self.pre_layer_norm:
out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias = exe.run(
paddle.static.default_main_program(),
feed={
"X": self.query,
},
fetch_list=[
final_out, fused_attn.qkv_weight, fused_attn.qkv_bias,
fused_attn.linear_weight, fused_attn.linear_bias,
fused_attn.pre_ln_scale, fused_attn.pre_ln_bias,
fused_attn.ln_scale, fused_attn.ln_bias
final_out,
fused_attn.qkv_weight,
fused_attn.qkv_bias,
fused_attn.linear_weight,
fused_attn.linear_bias,
fused_attn.pre_ln_scale,
fused_attn.pre_ln_bias,
])
else:
out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_2_scale, ln_2_bias = exe.run(
paddle.static.default_main_program(),
feed={
"X": self.query,
},
fetch_list=[
final_out, fused_attn.qkv_weight,
fused_attn.qkv_bias, fused_attn.linear_weight,
fused_attn.linear_bias, fused_attn.ln_scale,
fused_attn.ln_bias
])
return out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias
......
......@@ -45,6 +45,7 @@ def fused_feedforward(x,
pre_layer_norm=False,
training=True,
mode='upscale_in_train',
ring_id=-1,
name=None):
r"""
This is a fusion operator to compute feed forward layer in transformer model architecture.
......@@ -88,6 +89,7 @@ def fused_feedforward(x,
- train: out = input * mask
- inference: out = input * (1.0 - p)
ring_id (int, optional): For distributed forward in tensor model parallel, only support NCCL. Default is -1, means not using tensor parallel.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
......@@ -132,7 +134,8 @@ def fused_feedforward(x,
"dropout1_fix_seed", seed is not None, "dropout2_fix_seed", seed
is not None, "dropout1_seed", seed if seed is not None else 0,
"dropout2_seed", seed if seed is not None else 0,
'dropout1_implementation', mode, 'dropout2_implementation', mode)
'dropout1_implementation', mode, 'dropout2_implementation', mode,
'ring_id', ring_id)
return out
helper = LayerHelper("fused_feedforward")
......@@ -206,7 +209,8 @@ def fused_feedforward(x,
'dropout1_seed': seed if seed is not None else 0,
'dropout2_seed': seed if seed is not None else 0,
'dropout1_implementation': mode,
'dropout2_implementation': mode
'dropout2_implementation': mode,
'ring_id': ring_id,
})
return out
......
......@@ -158,15 +158,39 @@ class FusedMultiHeadAttention(Layer):
(True) or post_layer_norm architecture (False). Default False.
need_weights (bool, optional): Indicate whether to return the attention
weights. Now, only False is supported. Default False.
weight_attr(ParamAttr, optional): To specify the weight parameter property.
Default: None, which means the default weight parameter property is used.
See usage for details in :code:`ParamAttr`.
bias_attr (ParamAttr|bool, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
If it is set to False, this layer will not have trainable bias parameter.
See usage for details in :code:`ParamAttr`.
qkv_weight_attr(ParamAttr, optional): To specify the weight parameter property
for QKV projection computation. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
qkv_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for QKV projection computation. The `False` value means the corresponding layer
would not have trainable bias parameter. Default: None, which means the
default bias parameter property is used. See usage for details in :code:`ParamAttr`.
linear_weight_attr(ParamAttr, optional): To specify the weight parameter property
for linear projection computation. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
linear_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for linear projection computation. The `False` value means the corresponding layer would
not have trainable bias parameter. Default: None, which means the default bias
parameter property is used. See usage for details in :code:`ParamAttr`.
pre_ln_scale_attr(ParamAttr, optional): To specify the weight parameter property
for pre_layer_norm computation. Otherwise, all layers both use it as
`attr` to create parameters. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
pre_ln_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for pre_layer_norm computation. The `False` value means the corresponding layer would
not have trainable bias parameter. Default: None, which means the default bias
parameter property is used. See usage for details in :code:`ParamAttr`.
ln_scale_attr(ParamAttr, optional): To specify the weight parameter property
for post_layer_norm computation. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
ln_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for post_layer_norm computation. The `False` value means the corresponding layer would
not have trainable bias parameter. Default: None, which means the default bias
parameter property is used. See usage for details in :code:`ParamAttr`.
epsilon (float, optional): The small value added to the variance to prevent
division by zero. Default: 1e-05.
nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using tensor parallel.
ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using tensor parallel.
Examples:
......@@ -191,9 +215,17 @@ class FusedMultiHeadAttention(Layer):
vdim=None,
normalize_before=False,
need_weights=False,
weight_attr=None,
bias_attr=None,
qkv_weight_attr=None,
qkv_bias_attr=None,
linear_weight_attr=None,
linear_bias_attr=None,
pre_ln_scale_attr=None,
pre_ln_bias_attr=None,
ln_scale_attr=None,
ln_bias_attr=None,
epsilon=1e-5,
nranks=1,
ring_id=-1,
name=None):
super(FusedMultiHeadAttention, self).__init__()
......@@ -204,9 +236,8 @@ class FusedMultiHeadAttention(Layer):
self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype()
self._weight_attr = weight_attr
self._bias_attr = bias_attr
self._epsilon = epsilon
self._ring_id = ring_id
self.embed_dim = embed_dim
self.num_heads = num_heads
......@@ -215,39 +246,59 @@ class FusedMultiHeadAttention(Layer):
self.vdim = vdim
self.need_weights = need_weights
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
assert need_weights == False, "Only support need_weight is False now."
assert need_weights is False, "Only support need_weight is False now."
# tensor model parallel
assert num_heads % nranks == 0
num_heads = num_heads // nranks
self.qkv_weight = self.create_parameter(
shape=[3, num_heads, self.head_dim, embed_dim],
attr=self._weight_attr,
attr=qkv_weight_attr,
dtype=self._dtype,
is_bias=False)
self.qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim],
attr=self._bias_attr,
attr=qkv_bias_attr,
dtype=self._dtype,
is_bias=True)
self.linear_weight = self.create_parameter(shape=[embed_dim, embed_dim],
attr=self._weight_attr,
self.linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr,
dtype=self._dtype,
is_bias=False)
self.linear_bias = self.create_parameter(shape=[embed_dim],
attr=self._bias_attr,
attr=linear_bias_attr,
dtype=self._dtype,
is_bias=True)
# tensor model parallel
if nranks > 1:
assert ring_id != -1
# column parallel
_set_var_distributed(self.qkv_weight)
_set_var_distributed(self.qkv_bias)
# row parallel
_set_var_distributed(self.linear_weight)
if normalize_before:
self.pre_ln_scale = self.create_parameter(
attr=self._weight_attr,
attr=pre_ln_scale_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
self.pre_ln_bias = self.create_parameter(attr=self._bias_attr,
self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr,
shape=[embed_dim],
is_bias=True)
self.ln_scale = None
self.ln_bias = None
else:
self.pre_ln_scale = None
self.pre_ln_bias = None
self.ln_scale = self.create_parameter(
attr=self._weight_attr,
attr=ln_scale_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
self.ln_bias = self.create_parameter(attr=self._bias_attr,
self.ln_bias = self.create_parameter(attr=ln_bias_attr,
shape=[embed_dim],
is_bias=True)
......@@ -294,8 +345,6 @@ class FusedMultiHeadAttention(Layer):
# Support bool or int mask
attn_mask = _convert_attention_mask(attn_mask, query.dtype)
assert cache == None, "Only support cache is None now."
out = incubate_f.fused_multi_head_attention(
x=query,
qkv_weight=self.qkv_weight,
......@@ -308,11 +357,13 @@ class FusedMultiHeadAttention(Layer):
pre_ln_epsilon=self._epsilon,
qkv_bias=self.qkv_bias,
linear_bias=self.linear_bias,
cache_kv=cache,
attn_mask=attn_mask,
dropout_rate=self.dropout_rate,
attn_dropout_rate=self.attn_dropout_rate,
ln_epsilon=self._epsilon,
training=self.training,
ring_id=self._ring_id,
name=self.name)
return out
......@@ -338,14 +389,38 @@ class FusedFeedForward(Layer):
If None, use the value of `dropout_rate`. Default None
normalize_before (bool, optional): Indicate whether to put layer normalization
into, preprocessing or postprocessing. Default False
weight_attr (ParamAttr, optional): The attribute for the learnable weight of this layer.
The default value is None and the weight will be initialized to zero. For detailed
information, please refer to paddle.ParamAttr.
bias_attr (ParamAttr|bool, optional): The attribute for the learnable bias of thi layer.
If it is set to False, no bias will be added to the output. If it is set to None or one
kind of ParamAttr, a bias parameter will be created according to ParamAttr. For detailed
information, please refer to paddle.ParamAttr. The default value is None and the bias
will be initialized to zero.
linear1_weight_attr(ParamAttr, optional): To specify the weight parameter property
for FFN first linear. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
linear1_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for FFN first linear. The `False` value means the corresponding layer would
not have trainable bias parameter. Default: None, which means the default bias
parameter property is used. See usage for details in :code:`ParamAttr`.
linear2_weight_attr(ParamAttr, optional): To specify the weight parameter property
for FFN second linear. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
linear2_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for FFN second linear. The `False` value means the corresponding layer would
not have trainable bias parameter. Default: None, which means the default bias
parameter property is used. See usage for details in :code:`ParamAttr`.
ln1_scale_attr(ParamAttr, optional): To specify the weight parameter property
for FFN pre_layer_norm. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
ln1_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for FFN pre_layer_norm. The `False` value means the corresponding layer would
not have trainable bias parameter. Default: None, which means the default bias
parameter property is used. See usage for details in :code:`ParamAttr`.
ln2_scale_attr(ParamAttr, optional): To specify the weight parameter property
for FFN post_layer_norm. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
ln2_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for FFN layer_norm. The `False` value means the corresponding layer would
not have trainable bias parameter. Default: None, which means the default bias
parameter property is used. See usage for details in :code:`ParamAttr`.
nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using tensor parallel.
ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using tensor parallel.
name (str, optional): The default value is None. Normally there is no need for user to set
this property. For more information, please refer to :ref:`api_guide_Name`.
Examples:
.. code-block:: python
......@@ -369,8 +444,16 @@ class FusedFeedForward(Layer):
activation="relu",
act_dropout_rate=None,
normalize_before=False,
weight_attr=None,
bias_attr=None,
linear1_weight_attr=None,
linear1_bias_attr=None,
linear2_weight_attr=None,
linear2_bias_attr=None,
ln1_scale_attr=None,
ln1_bias_attr=None,
ln2_scale_attr=None,
ln2_bias_attr=None,
nranks=1,
ring_id=-1,
name=None):
super(FusedFeedForward, self).__init__()
......@@ -383,51 +466,68 @@ class FusedFeedForward(Layer):
self._dtype = self._helper.get_default_dtype()
self._d_model = d_model
assert dim_feedforward % nranks == 0
dim_feedforward = dim_feedforward // nranks
self._dim_feedforward = dim_feedforward
self._dropout_rate = dropout_rate
self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
self._act_method = activation
self._normalize_before = normalize_before
self._epsilon = epsilon
self._ring_id = ring_id
self._linear1_weight = self.create_parameter(
shape=[d_model, dim_feedforward],
attr=weight_attr,
attr=linear1_weight_attr,
dtype=self._dtype,
is_bias=False)
self._linear1_bias = self.create_parameter(shape=[dim_feedforward],
attr=bias_attr,
attr=linear1_bias_attr,
dtype=self._dtype,
is_bias=True)
self._linear2_weight = self.create_parameter(
shape=[dim_feedforward, d_model],
attr=weight_attr,
attr=linear2_weight_attr,
dtype=self._dtype,
is_bias=False)
self._linear2_bias = self.create_parameter(shape=[d_model],
attr=bias_attr,
attr=linear2_bias_attr,
dtype=self._dtype,
is_bias=True)
if nranks > 1:
assert ring_id != -1
# column parallel
_set_var_distributed(self._linear1_weight)
_set_var_distributed(self._linear1_bias)
_set_var_distributed(self._linear2_weight)
if normalize_before:
self._ln1_scale = self.create_parameter(
shape=[d_model],
attr=None,
attr=ln1_scale_attr,
is_bias=False,
default_initializer=Constant(1.0))
self._ln1_bias = self.create_parameter(shape=[d_model],
attr=None,
attr=ln1_bias_attr,
is_bias=True)
self._ln2_scale = None
self._ln2_bias = None
else:
self._ln1_scale = None
self._ln1_bias = None
self._ln2_scale = self.create_parameter(
shape=[d_model],
attr=None,
attr=ln2_scale_attr,
is_bias=False,
default_initializer=Constant(1.0))
self._ln2_bias = self.create_parameter(shape=[d_model],
attr=None,
attr=ln2_bias_attr,
is_bias=True)
self.name = name
def forward(self, src, cache=None):
......@@ -448,6 +548,7 @@ class FusedFeedForward(Layer):
ln2_epsilon=self._epsilon,
pre_layer_norm=self._normalize_before,
training=self.training,
ring_id=self._ring_id,
name=self.name)
return out
......@@ -553,8 +654,14 @@ class FusedTransformerEncoderLayer(Layer):
dropout_rate=dropout_rate,
attn_dropout_rate=attn_dropout_rate,
normalize_before=self.normalize_before,
weight_attr=weight_attrs[0],
bias_attr=bias_attrs[0])
qkv_weight_attr=weight_attrs[0],
qkv_bias_attr=bias_attrs[0],
linear_weight_attr=weight_attrs[0],
linear_bias_attr=bias_attrs[0],
pre_ln_scale_attr=weight_attrs[0],
pre_ln_bias_attr=bias_attrs[0],
ln_scale_attr=weight_attrs[0],
ln_bias_attr=bias_attrs[0])
self.ffn = FusedFeedForward(d_model,
dim_feedforward,
......@@ -562,8 +669,10 @@ class FusedTransformerEncoderLayer(Layer):
activation=activation,
act_dropout_rate=act_dropout_rate,
normalize_before=self.normalize_before,
weight_attr=weight_attrs[1],
bias_attr=bias_attrs[1])
linear1_weight_attr=weight_attrs[1],
linear1_bias_attr=bias_attrs[1],
linear2_weight_attr=weight_attrs[1],
linear2_bias_attr=bias_attrs[1])
def forward(self, src, src_mask=None, cache=None):
"""
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册