未验证 提交 19b87aec 编写于 作者: W WangXi 提交者: GitHub

[cherry-pick 2.3] Cherry parallel fused transformer api (#43505)

* Rename dropout is test (#43098)

* replace dropout_is_test with is_test.
* improve atol on a100.

* fused_attention fused_feedforward api support Model Tensor Parallel (#42985)

* fix is_test bug in fused_feedforward. (#43508)
Co-authored-by: NLi Min <11663212+limin2021@users.noreply.github.com>
上级 1a660c8a
......@@ -190,7 +190,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
// the same as QKOut's shape.
ctx->SetOutputDim("AttnDropoutOut",
{x_dim[0], y_dim[1], x_dim[1], out_seq_len});
if (ctx->Attrs().Get<bool>("attn_dropout_is_test") == false) {
if (ctx->Attrs().Get<bool>("is_test") == false) {
ctx->SetOutputDim("AttnDropoutMaskOut",
{x_dim[0], y_dim[1], x_dim[1], out_seq_len});
}
......@@ -202,7 +202,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("FMHAOut", {x_dim[0], x_dim[1], y_dim[1], y_dim[2]});
ctx->SetOutputDim("OutLinearOut", ctx->GetInputDim("X"));
if (ctx->Attrs().Get<bool>("dropout_is_test") == false) {
if (ctx->Attrs().Get<bool>("is_test") == false) {
ctx->SetOutputDim("DropoutMaskOut", ctx->GetInputDim("X"));
}
......@@ -297,7 +297,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
platform::errors::InvalidArgument(
"'attn_dropout_rate' must be between 0.0 and 1.0."));
});
AddAttr<bool>("attn_dropout_is_test",
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
......@@ -341,11 +341,6 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
platform::errors::InvalidArgument(
"'dropout_rate' must be between 0.0 and 1.0."));
});
AddAttr<bool>("dropout_is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddAttr<bool>("dropout_fix_seed",
"A flag indicating whether to use a fixed seed to generate "
"random mask. NOTE: DO NOT set this flag to true in "
......@@ -414,10 +409,9 @@ class FusedAttentionGradOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_EQ(
ctx->Attrs().Get<bool>("attn_dropout_is_test"), false,
PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_test"), false,
platform::errors::InvalidArgument(
"GradOp is only callable when attn_dropout_is_test is false"));
"GradOp is only callable when is_test is false"));
if (ctx->Attrs().Get<bool>("pre_layer_norm") == false) {
OP_INOUT_CHECK(ctx->HasInput("Ln2Mean"), "Input", "Ln2Mean",
......
......@@ -108,7 +108,7 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
const float ln_epsilon = ctx.Attr<float>("ln_epsilon");
float attn_dropout_rate = ctx.Attr<float>("attn_dropout_rate");
bool is_test_1 = ctx.Attr<bool>("attn_dropout_is_test");
bool is_test_1 = ctx.Attr<bool>("is_test");
auto &dropout_implementation_1 =
ctx.Attr<std::string>("attn_dropout_implementation");
bool is_upscale_in_train_1 =
......@@ -279,7 +279,7 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
const float ln2epsilon = ctx.Attr<float>("ln_epsilon");
float attn_dropout_prob = ctx.Attr<float>("attn_dropout_rate");
bool is_test_1 = ctx.Attr<bool>("attn_dropout_is_test");
bool is_test_1 = ctx.Attr<bool>("is_test");
auto &dropout_implementation_1 =
ctx.Attr<std::string>("attn_dropout_implementation");
bool is_upscale_in_train_1 =
......
......@@ -82,7 +82,7 @@ struct DropoutParam {
auto& dropout_implementation =
context.Attr<std::string>(pre_fix + "implementation");
is_upscale_in_train = (dropout_implementation == "upscale_in_train");
is_test = context.Attr<bool>(pre_fix + "is_test");
is_test = context.Attr<bool>("is_test");
fix_seed = context.Attr<bool>(pre_fix + "fix_seed");
std::string str_seed = "Dropout";
......
......@@ -61,14 +61,14 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel {
tmp_dim_x[dim_x.size() - 1] =
dim_Linear1Weight[dim_Linear1Weight.size() - 1];
context->SetOutputDim("Out", dim_x);
if (context->Attrs().Get<bool>("dropout1_is_test") == false) {
if (context->Attrs().Get<bool>("is_test") == false) {
context->SetOutputDim("Dropout1Mask", tmp_dim_x);
}
context->SetOutputDim("Dropout1Out", tmp_dim_x);
context->SetOutputDim("Linear1Out", tmp_dim_x);
context->SetOutputDim("Dropout2Out", dim_x);
if (context->Attrs().Get<bool>("dropout2_is_test") == false) {
if (context->Attrs().Get<bool>("is_test") == false) {
context->SetOutputDim("Dropout2Mask", dim_x);
}
framework::DDim mean_dim =
......@@ -185,9 +185,7 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker {
"dropout2_implementation can only be downgrade_in_infer or "
"upscale_in_train"));
});
AddAttr<bool>("dropout1_is_test", "the is_test of first dropout")
.SetDefault(false);
AddAttr<bool>("dropout2_is_test", "the is_test of second dropout")
AddAttr<bool>("is_test", "the is_test attribute of dropout")
.SetDefault(false);
AddAttr<bool>("dropout1_fix_seed", "the is_test of first dropout")
.SetDefault(false);
......@@ -218,10 +216,7 @@ class FusedFeedForwardOpGrad : public framework::OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("dropout1_is_test"), false,
platform::errors::InvalidArgument(
"GradOp is only callable when is_test is false"));
PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("dropout2_is_test"), false,
PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_test"), false,
platform::errors::InvalidArgument(
"GradOp is only callable when is_test is false"));
bool pre_layer_norm = ctx->Attrs().Get<bool>("pre_layer_norm");
......
......@@ -221,7 +221,7 @@ class FusedMultiTransformerOpOpMaker
"'dropout_rate' must be between 0.0 and 1.0."));
});
AddAttr<bool>("dropout_is_test",
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
......
......@@ -20,154 +20,11 @@ import paddle
import paddle.fluid as fluid
from test_dist_base import TestDistRunnerBase, runtime_main
import paddle.distributed.fleet as fleet
import paddle.incubate.nn.functional as incubate_f
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
from paddle.fluid.dygraph.layers import Layer
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid import core
from paddle.nn.initializer import Constant
from paddle.incubate.nn import FusedMultiHeadAttention
paddle.enable_static()
def _set_var_distributed(var):
if var is None:
return
var.is_distributed = True
# NOTE: use current_block and find_var_recursive to support while_loop
startup_block = paddle.static.default_startup_program().current_block()
main_block = paddle.static.default_main_program().current_block()
startup_block._find_var_recursive(var.name).is_distributed = True
main_block._find_var_recursive(var.name).is_distributed = True
class ParallelFusedMultiHeadAttention(Layer):
def __init__(self,
embed_dim,
num_heads,
dropout_rate=0.5,
attn_dropout_rate=0.5,
kdim=None,
vdim=None,
normalize_before=False,
need_weights=False,
qkv_weight_attr=None,
qkv_bias_attr=None,
linear_weight_attr=None,
linear_bias_attr=None,
pre_ln_scale_attr=None,
pre_ln_bias_attr=None,
ln_scale_attr=None,
ln_bias_attr=None,
epsilon=1e-5,
nranks=1,
ring_id=-1,
name=None):
super(ParallelFusedMultiHeadAttention, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
"but recieved {}".format(embed_dim))
assert num_heads > 0, ("Expected nhead to be greater than 0, "
"but recieved {}".format(num_heads))
self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype()
self._epsilon = epsilon
self._ring_id = ring_id
self.embed_dim = embed_dim
self.num_heads = num_heads
self.head_dim = embed_dim // num_heads
self.kdim = kdim
self.vdim = vdim
self.need_weights = need_weights
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
assert need_weights == False, "Only support need_weight is False now."
# tensor model parallel
assert num_heads % nranks == 0
num_heads = num_heads // nranks
self.qkv_weight = self.create_parameter(
shape=[3, num_heads, self.head_dim, embed_dim],
attr=qkv_weight_attr,
dtype=self._dtype,
is_bias=False)
self.qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim],
attr=qkv_bias_attr,
dtype=self._dtype,
is_bias=True)
self.linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr,
dtype=self._dtype,
is_bias=False)
self.linear_bias = self.create_parameter(
shape=[embed_dim],
attr=linear_bias_attr,
dtype=self._dtype,
is_bias=True)
# tensor model parallel
if nranks > 1:
assert ring_id != -1
# column parallel
_set_var_distributed(self.qkv_weight)
_set_var_distributed(self.qkv_bias)
# row parallel
_set_var_distributed(self.linear_weight)
if normalize_before:
self.pre_ln_scale = self.create_parameter(
attr=pre_ln_scale_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
self.pre_ln_bias = self.create_parameter(
attr=pre_ln_bias_attr, shape=[embed_dim], is_bias=True)
self.ln_scale = None
self.ln_bias = None
else:
self.pre_ln_scale = None
self.pre_ln_bias = None
self.ln_scale = self.create_parameter(
attr=ln_scale_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
self.ln_bias = self.create_parameter(
attr=ln_bias_attr, shape=[embed_dim], is_bias=True)
self.dropout_rate = dropout_rate
self.attn_dropout_rate = attn_dropout_rate
self.name = name
def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
out = incubate_f.fused_multi_head_attention(
x=query,
qkv_weight=self.qkv_weight,
linear_weight=self.linear_weight,
pre_layer_norm=self.normalize_before,
pre_ln_scale=self.pre_ln_scale,
pre_ln_bias=self.pre_ln_bias,
ln_scale=self.ln_scale,
ln_bias=self.ln_bias,
pre_ln_epsilon=self._epsilon,
qkv_bias=self.qkv_bias,
linear_bias=self.linear_bias,
attn_mask=attn_mask,
dropout_rate=self.dropout_rate,
attn_dropout_rate=self.attn_dropout_rate,
ln_epsilon=self._epsilon,
training=self.training,
ring_id=self._ring_id,
name=self.name)
return out
def get_param_attr(weight, bias):
weight_attr = paddle.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(weight))
......@@ -206,7 +63,7 @@ def create_model(data, rank):
qkv_w_attr, qkv_b_attr = get_param_attr(col_qkv_w, col_qkv_b)
linear_w_attr, linear_b_attr = get_param_attr(row_linear_w, linear_b)
attn = ParallelFusedMultiHeadAttention(
attn = FusedMultiHeadAttention(
hidden,
n_head,
dropout_rate=0.0,
......@@ -228,7 +85,7 @@ def create_model(data, rank):
qkv_w_attr, qkv_b_attr = get_param_attr(qkv_w, qkv_b)
linear_w_attr, linear_b_attr = get_param_attr(linear_w, linear_b)
attn = ParallelFusedMultiHeadAttention(
attn = FusedMultiHeadAttention(
hidden,
n_head,
dropout_rate=0.0,
......
......@@ -20,11 +20,7 @@ import paddle
import paddle.fluid as fluid
from test_dist_base import TestDistRunnerBase, runtime_main
import paddle.distributed.fleet as fleet
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
from paddle.fluid.dygraph.layers import Layer
from paddle.fluid.layer_helper import LayerHelper
from paddle.nn.initializer import Constant
from paddle.incubate.nn import FusedFeedForward
paddle.enable_static()
......@@ -34,239 +30,6 @@ IN_SIZE = 2 * MODEL_PARALLEL_SIZE
OUT_SIZE = 2 * MODEL_PARALLEL_SIZE
def fused_feedforward(x,
linear1_weight,
linear2_weight,
linear1_bias=None,
linear2_bias=None,
ln1_scale=None,
ln1_bias=None,
ln2_scale=None,
ln2_bias=None,
dropout1_rate=0.5,
dropout2_rate=0.5,
activation="relu",
ln1_epsilon=1e-5,
ln2_epsilon=1e-5,
pre_layer_norm=False,
training=True,
mode='upscale_in_train',
ring_id=-1,
name=None):
seed = None
if mode not in ('downscale_in_infer', 'upscale_in_train'):
raise ValueError(
"mode argument should be 'downscale_in_infer' or 'upscale_in_train'")
mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer
helper = LayerHelper("fused_feedforward")
dtype = x.dtype
check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
'fused_feedforward')
check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'],
'fused_feedforward')
out = helper.create_variable_for_type_inference(x.dtype)
dropout1_mask = helper.create_variable_for_type_inference(
'uint8', stop_gradient=True)
dropout2_mask = helper.create_variable_for_type_inference(
'uint8', stop_gradient=True)
ln1_mean = helper.create_variable_for_type_inference(
x.dtype, stop_gradient=True)
ln1_variance = helper.create_variable_for_type_inference(
x.dtype, stop_gradient=True)
ln2_mean = helper.create_variable_for_type_inference(
x.dtype, stop_gradient=True)
ln2_variance = helper.create_variable_for_type_inference(
x.dtype, stop_gradient=True)
linear1_out = helper.create_variable_for_type_inference(
x.dtype, stop_gradient=True)
ln1_out = helper.create_variable_for_type_inference(
x.dtype, stop_gradient=True)
dropout1_out = helper.create_variable_for_type_inference(
x.dtype, stop_gradient=True)
dropout2_out = helper.create_variable_for_type_inference(
x.dtype, stop_gradient=True)
if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
seed = helper.main_program.random_seed
helper.append_op(
type='fused_feedforward',
inputs={
'X': x,
'Linear1Weight': linear1_weight,
'Linear1Bias': linear1_bias,
'Linear2Weight': linear2_weight,
'Linear2Bias': linear2_bias,
'Ln1Scale': ln1_scale,
'Ln1Bias': ln1_bias,
'Ln2Scale': ln2_scale,
'Ln2Bias': ln2_bias,
},
outputs={
'Out': out,
'Dropout1Mask': dropout1_mask,
'Dropout2Mask': dropout2_mask,
'Ln1Mean': ln1_mean,
'Ln1Variance': ln1_variance,
'Ln2Mean': ln2_mean,
'Ln2Variance': ln2_variance,
'Linear1Out': linear1_out,
'Ln1Out': ln1_out,
'Dropout1Out': dropout1_out,
'Dropout2Out': dropout2_out,
},
attrs={
'dropout1_rate': dropout1_rate,
'dropout2_rate': dropout2_rate,
'act_method': activation,
'pre_layer_norm': pre_layer_norm,
'ln1_epsilon': ln1_epsilon,
'ln2_epsilon': ln2_epsilon,
'dropout1_is_test': not training,
'dropout2_is_test': not training,
'dropout1_fix_seed': seed is not None,
'dropout2_fix_seed': seed is not None,
'dropout1_seed': seed if seed is not None else 0,
'dropout2_seed': seed if seed is not None else 0,
'dropout1_implementation': mode,
'dropout2_implementation': mode,
'ring_id': ring_id,
})
return out
def _set_var_distributed(var):
if var is None:
return
var.is_distributed = True
# NOTE: use current_block and find_var_recursive to support while_loop
startup_block = paddle.static.default_startup_program().current_block()
main_block = paddle.static.default_main_program().current_block()
startup_block._find_var_recursive(var.name).is_distributed = True
main_block._find_var_recursive(var.name).is_distributed = True
class ParallelFusedFeedForward(Layer):
def __init__(self,
d_model,
dim_feedforward,
dropout_rate=0.1,
epsilon=1e-05,
activation="relu",
act_dropout_rate=None,
normalize_before=False,
linear1_weight_attr=None,
linear1_bias_attr=None,
linear2_weight_attr=None,
linear2_bias_attr=None,
ln1_scale_attr=None,
ln1_bias_attr=None,
ln2_scale_attr=None,
ln2_bias_attr=None,
nranks=1,
ring_id=-1,
name=None):
super(ParallelFusedFeedForward, self).__init__()
assert d_model > 0, (
"Expected d_model to be greater than 0, but recieved {}".format(
d_model))
assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, but recieved {}".
format(dim_feedforward))
self._dtype = self._helper.get_default_dtype()
self._d_model = d_model
assert dim_feedforward % nranks == 0
dim_feedforward = dim_feedforward // nranks
self._dim_feedforward = dim_feedforward
self._dropout_rate = dropout_rate
self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
self._act_method = activation
self._normalize_before = normalize_before
self._epsilon = epsilon
self._ring_id = ring_id
self._linear1_weight = self.create_parameter(
shape=[d_model, dim_feedforward],
attr=linear1_weight_attr,
dtype=self._dtype,
is_bias=False)
self._linear1_bias = self.create_parameter(
shape=[dim_feedforward],
attr=linear1_bias_attr,
dtype=self._dtype,
is_bias=True)
self._linear2_weight = self.create_parameter(
shape=[dim_feedforward, d_model],
attr=linear2_weight_attr,
dtype=self._dtype,
is_bias=False)
self._linear2_bias = self.create_parameter(
shape=[d_model],
attr=linear2_bias_attr,
dtype=self._dtype,
is_bias=True)
if nranks > 1:
assert ring_id != -1
# column parallel
_set_var_distributed(self._linear1_weight)
_set_var_distributed(self._linear1_bias)
_set_var_distributed(self._linear2_weight)
if normalize_before:
self._ln1_scale = self.create_parameter(
shape=[d_model],
attr=ln1_scale_attr,
is_bias=False,
default_initializer=Constant(1.0))
self._ln1_bias = self.create_parameter(
shape=[d_model], attr=ln1_bias_attr, is_bias=True)
self._ln2_scale = None
self._ln2_bias = None
else:
self._ln1_bias = None
self._ln2_bias = None
self._ln2_scale = self.create_parameter(
shape=[d_model],
attr=ln2_scale_attr,
is_bias=False,
default_initializer=Constant(1.0))
self._ln2_bias = self.create_parameter(
shape=[d_model], attr=ln2_bias_attr, is_bias=True)
self.name = name
def forward(self, src, cache=None):
out = fused_feedforward(
src,
self._linear1_weight,
self._linear2_weight,
self._linear1_bias,
self._linear2_bias,
self._ln1_scale,
self._ln1_bias,
self._ln2_scale,
self._ln2_bias,
dropout1_rate=self._act_dropout_rate,
dropout2_rate=self._dropout_rate,
activation=self._act_method,
ln1_epsilon=self._epsilon,
ln2_epsilon=self._epsilon,
pre_layer_norm=self._normalize_before,
training=self.training,
ring_id=self._ring_id,
name=self.name)
return out
def get_param_attr(weight, bias):
weight_attr = paddle.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(weight))
......@@ -295,7 +58,7 @@ def create_model(data, rank):
w0_attr, b0_attr = get_param_attr(col_w0, col_b0)
w1_attr, b1_attr = get_param_attr(row_w1, b1)
ffn = ParallelFusedFeedForward(
ffn = FusedFeedForward(
IN_SIZE,
OUT_SIZE,
dropout_rate=0.0,
......@@ -316,7 +79,7 @@ def create_model(data, rank):
w0_attr, b0_attr = get_param_attr(w0, b0)
w1_attr, b1_attr = get_param_attr(w1, b1)
ffn = ParallelFusedFeedForward(
ffn = FusedFeedForward(
IN_SIZE,
OUT_SIZE,
dropout_rate=0.0,
......
......@@ -35,6 +35,18 @@ class TestFusedAttentionOp(OpTest):
def setUp(self):
self.config()
self.generate_input_data()
self.rtol = 1e-5
# FIXME(limin29): Because there is a problem with the test precision
# on A100, atol is temporarily set to 1e-2, and it will be
# changed back after the precision problem is solved.
self.atol = 1e-2
# make sure local development precision
if "V100" in paddle.device.cuda.get_device_name():
self.atol = 1e-4
if self.x_type is np.float16:
self.atol = 1e-1
paddle.set_default_dtype(self.x_type)
self.__class__.op_type = "fused_attention"
# use autograd to check grad in this unittest.
......@@ -273,9 +285,9 @@ class TestFusedAttentionOp(OpTest):
final_out_ref, x_grad_ref = self.GetBaselineOut()
final_out, x_grad = self.GetFusedAttentionOut()
np.testing.assert_allclose(
final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4)
final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol)
np.testing.assert_allclose(
x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-4)
x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol)
class TestFusedAttentionOpBiasIsNone(TestFusedAttentionOp):
......@@ -306,9 +318,9 @@ class TestFusedAttentionOpFp16(TestFusedAttentionOp):
final_out_ref, x_grad_ref = self.GetBaselineOut()
final_out, x_grad = self.GetFusedAttentionOut()
np.testing.assert_allclose(
final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-1)
final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol)
np.testing.assert_allclose(
x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-1)
x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol)
class TestFusedAttentionOpCacheKV(TestFusedAttentionOp):
......@@ -324,7 +336,10 @@ class TestFusedAttentionOpCacheKV(TestFusedAttentionOp):
final_out_ref = self.GetBaselineOut()
final_out, cache_kv_out = self.GetFusedAttentionOut()
np.testing.assert_allclose(
final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4)
final_out_ref,
final_out.numpy(),
rtol=self.rtol,
atol=self.atol)
if __name__ == "__main__":
......
......@@ -83,7 +83,7 @@ def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias,
if ln_bias is None:
has_bias = False
if (pre_layer_norm):
if pre_layer_norm:
ln_out = layer_norm(query, True, has_bias, ln_scale, ln_bias)
num_head = qkv_weight.shape[1]
......@@ -96,7 +96,7 @@ def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias,
if qkv_bias is not None:
qkv_bias = qkv_bias.reshape(qkv_bias.shape[0] * qkv_bias.shape[1] *
qkv_bias.shape[2])
if (pre_layer_norm):
if pre_layer_norm:
ln_out = ln_out.reshape(batch_size * seq_len, embed_dim)
qkv = fc(ln_out, qkv_weight)
if qkv_bias is not None:
......@@ -173,6 +173,17 @@ class TestFusedAttentionAPI(unittest.TestCase):
self.config()
self.generate_input_data()
self.rtol = 1e-5
# FIXME(limin29): Because there is a problem with the test precision
# on A100, atol is temporarily set to 1e-2, and it will be
# changed back after the precision problem is solved.
self.atol = 1e-2
# make sure local development precision
if "V100" in paddle.device.cuda.get_device_name():
self.atol = 1e-4
if self.x_type is np.float16:
self.atol = 1e-1
def setAttnMask(self):
self.has_attn_mask = True
......@@ -230,7 +241,9 @@ class TestFusedAttentionAPI(unittest.TestCase):
fused_attn = FusedMultiHeadAttention(
self.embed_dim, self.num_heads, self.dropout_prob,
self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm,
self.need_weight, self.weight_attr, self.bias_attr)
self.need_weight, self.weight_attr, self.bias_attr,
self.weight_attr, self.bias_attr, self.weight_attr, self.bias_attr,
self.weight_attr, self.bias_attr)
if self.bias_attr is not False:
qkv_bias = np.random.random(fused_attn.qkv_bias.shape).astype(
'float32')
......@@ -247,22 +260,31 @@ class TestFusedAttentionAPI(unittest.TestCase):
if self.bias_attr is not False:
fused_attn_qkv_bias = fused_attn.qkv_bias.numpy()
fused_attn_linear_bias = fused_attn.linear_bias.numpy()
if self.pre_layer_norm:
fused_attn_pre_ln_bias = fused_attn.pre_ln_bias.numpy()
fused_attn_ln_bias = None
else:
fused_attn_pre_ln_bias = None
fused_attn_ln_bias = fused_attn.ln_bias.numpy()
ref_out = compute_reference(
self.pre_layer_norm, self.query, self.attn_mask,
fused_attn.pre_ln_scale.numpy(), fused_attn_pre_ln_bias,
fused_attn.ln_scale.numpy(), fused_attn_ln_bias,
fused_attn.pre_ln_scale.numpy()
if self.pre_layer_norm else None, fused_attn_pre_ln_bias,
fused_attn.ln_scale.numpy()
if not self.pre_layer_norm else None, fused_attn_ln_bias,
fused_attn.qkv_weight.numpy(), fused_attn_qkv_bias,
fused_attn.linear_weight.numpy(), fused_attn_linear_bias)
np.testing.assert_allclose(ref_out, out.numpy(), rtol=1e-5, atol=1e-4)
np.testing.assert_allclose(
ref_out, out.numpy(), rtol=self.rtol, atol=self.atol)
def run_static(self):
fused_attn = FusedMultiHeadAttention(
self.embed_dim, self.num_heads, self.dropout_prob,
self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm,
self.need_weight, self.weight_attr, self.bias_attr)
self.need_weight, self.weight_attr, self.bias_attr,
self.weight_attr, self.bias_attr, self.weight_attr, self.bias_attr,
self.weight_attr, self.bias_attr)
x = paddle.static.data(
name='X',
......@@ -286,49 +308,101 @@ class TestFusedAttentionAPI(unittest.TestCase):
qkv_bias = None
linear_bias = None
ln_scale = None
ln_2_scale = None
ln_bias = None
ln_2_bias = None
if self.has_attn_mask:
if self.bias_attr is False:
out, qkv_weight, out_linear_weight, ln_scale, ln_2_scale = exe.run(
if self.pre_layer_norm:
out, qkv_weight, out_linear_weight, ln_scale = exe.run(
paddle.static.default_main_program(),
feed={"X": self.query,
"SrcMask": self.attn_mask},
fetch_list=[
final_out,
fused_attn.qkv_weight,
fused_attn.linear_weight,
fused_attn.pre_ln_scale,
])
else:
out, qkv_weight, out_linear_weight, ln_2_scale = exe.run(
paddle.static.default_main_program(),
feed={"X": self.query,
"SrcMask": self.attn_mask},
fetch_list=[
final_out, fused_attn.qkv_weight,
fused_attn.linear_weight, fused_attn.pre_ln_scale,
fused_attn.ln_scale
fused_attn.linear_weight, fused_attn.ln_scale
])
else:
if self.pre_layer_norm:
out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias = exe.run(
paddle.static.default_main_program(),
feed={"X": self.query,
"SrcMask": self.attn_mask},
fetch_list=[
final_out,
fused_attn.qkv_weight,
fused_attn.qkv_bias,
fused_attn.linear_weight,
fused_attn.linear_bias,
fused_attn.pre_ln_scale,
fused_attn.pre_ln_bias,
])
else:
out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run(
out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_2_scale, ln_2_bias = exe.run(
paddle.static.default_main_program(),
feed={"X": self.query,
"SrcMask": self.attn_mask},
fetch_list=[
final_out, fused_attn.qkv_weight, fused_attn.qkv_bias,
fused_attn.linear_weight, fused_attn.linear_bias,
fused_attn.pre_ln_scale, fused_attn.pre_ln_bias,
fused_attn.ln_scale, fused_attn.ln_bias
final_out, fused_attn.qkv_weight,
fused_attn.qkv_bias, fused_attn.linear_weight,
fused_attn.linear_bias, fused_attn.ln_scale,
fused_attn.ln_bias
])
else:
if self.bias_attr is False:
out, qkv_weight, out_linear_weight, ln_scale, ln_2_scale = exe.run(
if self.pre_layer_norm:
out, qkv_weight, out_linear_weight, ln_scale = exe.run(
paddle.static.default_main_program(),
feed={"X": self.query, },
fetch_list=[
final_out,
fused_attn.qkv_weight,
fused_attn.linear_weight,
fused_attn.pre_ln_scale,
])
else:
out, qkv_weight, out_linear_weight, ln_2_scale = exe.run(
paddle.static.default_main_program(),
feed={"X": self.query, },
fetch_list=[
final_out, fused_attn.qkv_weight,
fused_attn.linear_weight, fused_attn.pre_ln_scale,
fused_attn.ln_scale
fused_attn.linear_weight, fused_attn.ln_scale
])
else:
out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run(
if self.pre_layer_norm:
out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias = exe.run(
paddle.static.default_main_program(),
feed={"X": self.query, },
fetch_list=[
final_out, fused_attn.qkv_weight, fused_attn.qkv_bias,
fused_attn.linear_weight, fused_attn.linear_bias,
fused_attn.pre_ln_scale, fused_attn.pre_ln_bias,
fused_attn.ln_scale, fused_attn.ln_bias
final_out,
fused_attn.qkv_weight,
fused_attn.qkv_bias,
fused_attn.linear_weight,
fused_attn.linear_bias,
fused_attn.pre_ln_scale,
fused_attn.pre_ln_bias,
])
else:
out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_2_scale, ln_2_bias = exe.run(
paddle.static.default_main_program(),
feed={"X": self.query, },
fetch_list=[
final_out, fused_attn.qkv_weight,
fused_attn.qkv_bias, fused_attn.linear_weight,
fused_attn.linear_bias, fused_attn.ln_scale,
fused_attn.ln_bias
])
return out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias
......@@ -341,7 +415,7 @@ class TestFusedAttentionAPI(unittest.TestCase):
self.attn_mask, ln_scale, ln_bias,
ln_2_scale, ln_2_bias, qkv_weight, qkv_bias,
linear_weight, linear_bias)
np.testing.assert_allclose(ref_out, out, rtol=1e-5, atol=1e-4)
np.testing.assert_allclose(ref_out, out, rtol=self.rtol, atol=self.atol)
def test_dynamic_api(self):
paddle.disable_static(place=paddle.CUDAPlace(0))
......
......@@ -39,6 +39,11 @@ class TestFusedFFNOp(OpTest):
def getDiff(self):
self.rtol = 1e-3
# FIXME(limin29): Because there is a problem with the test precision
# on A100, atol is temporarily set to 1e-2, and it will be
# changed back after the precision problem is solved.
self.atol = 1e-2
if "V100" in paddle.device.cuda.get_device_name():
self.atol = 1e-4
def getActivation(self):
......
......@@ -49,6 +49,14 @@ class TestFusedTransformerEncoderLayer(unittest.TestCase):
self.setPreLayerNorm()
self.setAttnMask()
self.rtol = 1e-3
# FIXME(limin29): Because there is a problem with the test precision
# on A100, atol is temporarily set to 1e-2, and it will be
# changed back after the precision problem is solved.
self.atol = 1e-2
if "V100" in paddle.device.cuda.get_device_name():
self.atol = 1e-4
def fused_weight(self, weight, num_head):
a = paddle.transpose(weight, perm=[1, 0])
return paddle.reshape(
......@@ -151,13 +159,13 @@ class TestFusedTransformerEncoderLayer(unittest.TestCase):
self.assertTrue(fused_encoder.fused_attn.extra_repr(), correct_attn_str)
np.testing.assert_allclose(
fused_out.numpy(), base_out.numpy(), rtol=1e-3, atol=1e-4)
fused_out.numpy(), base_out.numpy(), rtol=self.rtol, atol=self.atol)
self.assertTrue(
np.allclose(
fused_out.grad.numpy(),
base_out.grad.numpy(),
rtol=1e-3,
atol=1e-4))
rtol=self.rtol,
atol=self.atol))
class TestFusedTransformerEncoderLayerAct(TestFusedTransformerEncoderLayer):
......
......@@ -45,6 +45,7 @@ def fused_feedforward(x,
pre_layer_norm=False,
training=True,
mode='upscale_in_train',
ring_id=-1,
name=None):
r"""
This is a fusion operator to compute feed forward layer in transformer model architecture.
......@@ -88,6 +89,7 @@ def fused_feedforward(x,
- train: out = input * mask
- inference: out = input * (1.0 - p)
ring_id (int, optional): For distributed forward in tensor model parallel, only support NCCL. Default is -1, means not using tensor parallel.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
......@@ -127,12 +129,11 @@ def fused_feedforward(x,
'pre_layer_norm', pre_layer_norm, 'ln1_epsilon', ln1_epsilon,
'ln2_epsilon', ln2_epsilon, 'act_method', activation,
'dropout1_rate', dropout1_rate, 'dropout2_rate', dropout2_rate,
"dropout1_is_test", not training, "dropout2_is_test", not training,
"dropout1_fix_seed", seed is not None, "dropout2_fix_seed",
seed is not None, "dropout1_seed", seed
"is_test", not training, "dropout1_fix_seed", seed is not None,
"dropout2_fix_seed", seed is not None, "dropout1_seed", seed
if seed is not None else 0, "dropout2_seed", seed
if seed is not None else 0, 'dropout1_implementation', mode,
'dropout2_implementation', mode)
'dropout2_implementation', mode, 'ring_id', ring_id)
return out
helper = LayerHelper("fused_feedforward")
......@@ -200,14 +201,14 @@ def fused_feedforward(x,
'pre_layer_norm': pre_layer_norm,
'ln1_epsilon': ln1_epsilon,
'ln2_epsilon': ln2_epsilon,
'dropout1_is_test': not training,
'dropout2_is_test': not training,
'is_test': not training,
'dropout1_fix_seed': seed is not None,
'dropout2_fix_seed': seed is not None,
'dropout1_seed': seed if seed is not None else 0,
'dropout2_seed': seed if seed is not None else 0,
'dropout1_implementation': mode,
'dropout2_implementation': mode
'dropout2_implementation': mode,
'ring_id': ring_id,
})
return out
......@@ -368,10 +369,9 @@ def fused_multi_head_attention(x,
attn_mask, linear_weight, linear_bias, ln_scale, ln_bias,
'pre_layer_norm', pre_layer_norm, 'epsilon', pre_ln_epsilon,
'dropout_rate', dropout_rate, 'attn_dropout_rate',
attn_dropout_rate, 'ln_epsilon', ln_epsilon, 'attn_dropout_is_test',
not training, 'dropout_is_test', not training,
'attn_dropout_fix_seed', seed is not None, 'dropout_fix_seed',
seed is not None, 'attn_dropout_seed', seed
attn_dropout_rate, 'ln_epsilon', ln_epsilon, 'is_test',
not training, 'attn_dropout_fix_seed', seed is not None,
'dropout_fix_seed', seed is not None, 'attn_dropout_seed', seed
if seed is not None else 0, 'dropout_seed', seed
if seed is not None else 0, 'attn_dropout_implementation', mode,
'dropout_implementation', mode, 'ring_id', ring_id)
......@@ -417,8 +417,7 @@ def fused_multi_head_attention(x,
'ln_epsilon': ln_epsilon,
'dropout_rate': dropout_rate,
'attn_dropout_rate': attn_dropout_rate,
'attn_dropout_is_test': not training,
'dropout_is_test': not training,
'is_test': not training,
'attn_dropout_fix_seed': seed is not None,
'dropout_fix_seed': seed is not None,
'attn_dropout_seed': seed if seed is not None else 0,
......@@ -656,7 +655,7 @@ def fused_multi_transformer(x,
time_step, attn_mask, linear_weights, linear_biases, ffn_ln_scales,
ffn_ln_biases, ffn1_weights, ffn1_biases, ffn2_weights, ffn2_biases,
cache_kvs, 'pre_layer_norm', pre_layer_norm, 'epsilon', epsilon,
'dropout_rate', dropout_rate, 'dropout_is_test', not training,
'dropout_rate', dropout_rate, 'is_test', not training,
'dropout_implementation', mode, 'act_method', activation, 'ring_id',
ring_id)
if cache_kvs is not None:
......@@ -703,7 +702,7 @@ def fused_multi_transformer(x,
'pre_layer_norm': pre_layer_norm,
'epsilon': epsilon,
'dropout_rate': dropout_rate,
'dropout_is_test': not training,
'is_test': not training,
'dropout_implementation': mode,
'act_method': activation,
'ring_id': ring_id
......
......@@ -61,15 +61,39 @@ class FusedMultiHeadAttention(Layer):
(True) or post_layer_norm architecture (False). Default False.
need_weights (bool, optional): Indicate whether to return the attention
weights. Now, only False is supported. Default False.
weight_attr(ParamAttr, optional): To specify the weight parameter property.
Default: None, which means the default weight parameter property is used.
See usage for details in :code:`ParamAttr`.
bias_attr (ParamAttr|bool, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
If it is set to False, this layer will not have trainable bias parameter.
See usage for details in :code:`ParamAttr`.
qkv_weight_attr(ParamAttr, optional): To specify the weight parameter property
for QKV projection computation. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
qkv_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for QKV projection computation. The `False` value means the corresponding layer
would not have trainable bias parameter. Default: None, which means the
default bias parameter property is used. See usage for details in :code:`ParamAttr`.
linear_weight_attr(ParamAttr, optional): To specify the weight parameter property
for linear projection computation. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
linear_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for linear projection computation. The `False` value means the corresponding layer would
not have trainable bias parameter. Default: None, which means the default bias
parameter property is used. See usage for details in :code:`ParamAttr`.
pre_ln_scale_attr(ParamAttr, optional): To specify the weight parameter property
for pre_layer_norm computation. Otherwise, all layers both use it as
`attr` to create parameters. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
pre_ln_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for pre_layer_norm computation. The `False` value means the corresponding layer would
not have trainable bias parameter. Default: None, which means the default bias
parameter property is used. See usage for details in :code:`ParamAttr`.
ln_scale_attr(ParamAttr, optional): To specify the weight parameter property
for post_layer_norm computation. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
ln_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for post_layer_norm computation. The `False` value means the corresponding layer would
not have trainable bias parameter. Default: None, which means the default bias
parameter property is used. See usage for details in :code:`ParamAttr`.
epsilon (float, optional): The small value added to the variance to prevent
division by zero. Default: 1e-05.
nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using tensor parallel.
ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using tensor parallel.
Examples:
......@@ -94,22 +118,29 @@ class FusedMultiHeadAttention(Layer):
vdim=None,
normalize_before=False,
need_weights=False,
weight_attr=None,
bias_attr=None,
qkv_weight_attr=None,
qkv_bias_attr=None,
linear_weight_attr=None,
linear_bias_attr=None,
pre_ln_scale_attr=None,
pre_ln_bias_attr=None,
ln_scale_attr=None,
ln_bias_attr=None,
epsilon=1e-5,
nranks=1,
ring_id=-1,
name=None):
super(FusedMultiHeadAttention, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
"but recieved {}".format(embed_dim))
"but received {}".format(embed_dim))
assert num_heads > 0, ("Expected nhead to be greater than 0, "
"but recieved {}".format(num_heads))
"but received {}".format(num_heads))
self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype()
self._weight_attr = weight_attr
self._bias_attr = bias_attr
self._epsilon = epsilon
self._ring_id = ring_id
self.embed_dim = embed_dim
self.num_heads = num_heads
......@@ -118,41 +149,60 @@ class FusedMultiHeadAttention(Layer):
self.vdim = vdim
self.need_weights = need_weights
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
assert need_weights == False, "Only support need_weight is False now."
assert need_weights is False, "Only support need_weight is False now."
# tensor model parallel
assert num_heads % nranks == 0
num_heads = num_heads // nranks
self.qkv_weight = self.create_parameter(
shape=[3, num_heads, self.head_dim, embed_dim],
attr=self._weight_attr,
attr=qkv_weight_attr,
dtype=self._dtype,
is_bias=False)
self.qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim],
attr=self._bias_attr,
attr=qkv_bias_attr,
dtype=self._dtype,
is_bias=True)
self.linear_weight = self.create_parameter(
shape=[embed_dim, embed_dim],
attr=self._weight_attr,
shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr,
dtype=self._dtype,
is_bias=False)
self.linear_bias = self.create_parameter(
shape=[embed_dim],
attr=self._bias_attr,
attr=linear_bias_attr,
dtype=self._dtype,
is_bias=True)
# tensor model parallel
if nranks > 1:
assert ring_id != -1
# column parallel
_set_var_distributed(self.qkv_weight)
_set_var_distributed(self.qkv_bias)
# row parallel
_set_var_distributed(self.linear_weight)
if normalize_before:
self.pre_ln_scale = self.create_parameter(
attr=self._weight_attr,
attr=pre_ln_scale_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
self.pre_ln_bias = self.create_parameter(
attr=self._bias_attr, shape=[embed_dim], is_bias=True)
attr=pre_ln_bias_attr, shape=[embed_dim], is_bias=True)
self.ln_scale = None
self.ln_bias = None
else:
self.pre_ln_scale = None
self.pre_ln_bias = None
self.ln_scale = self.create_parameter(
attr=self._weight_attr,
attr=ln_scale_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
self.ln_bias = self.create_parameter(
attr=self._bias_attr, shape=[embed_dim], is_bias=True)
attr=ln_bias_attr, shape=[embed_dim], is_bias=True)
self.dropout_rate = dropout_rate
self.attn_dropout_rate = attn_dropout_rate
......@@ -197,8 +247,6 @@ class FusedMultiHeadAttention(Layer):
# Support bool or int mask
attn_mask = _convert_attention_mask(attn_mask, query.dtype)
assert cache == None, "Only support cache is None now."
out = incubate_f.fused_multi_head_attention(
x=query,
qkv_weight=self.qkv_weight,
......@@ -211,11 +259,13 @@ class FusedMultiHeadAttention(Layer):
pre_ln_epsilon=self._epsilon,
qkv_bias=self.qkv_bias,
linear_bias=self.linear_bias,
cache_kv=cache,
attn_mask=attn_mask,
dropout_rate=self.dropout_rate,
attn_dropout_rate=self.attn_dropout_rate,
ln_epsilon=self._epsilon,
training=self.training,
ring_id=self._ring_id,
name=self.name)
return out
......@@ -241,14 +291,38 @@ class FusedFeedForward(Layer):
If None, use the value of `dropout_rate`. Default None
normalize_before (bool, optional): Indicate whether to put layer normalization
into, preprocessing or postprocessing. Default False
weight_attr (ParamAttr, optional): The attribute for the learnable weight of this layer.
The default value is None and the weight will be initialized to zero. For detailed
information, please refer to paddle.ParamAttr.
bias_attr (ParamAttr|bool, optional): The attribute for the learnable bias of thi layer.
If it is set to False, no bias will be added to the output. If it is set to None or one
kind of ParamAttr, a bias parameter will be created according to ParamAttr. For detailed
information, please refer to paddle.ParamAttr. The default value is None and the bias
will be initialized to zero.
linear1_weight_attr(ParamAttr, optional): To specify the weight parameter property
for FFN first linear. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
linear1_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for FFN first linear. The `False` value means the corresponding layer would
not have trainable bias parameter. Default: None, which means the default bias
parameter property is used. See usage for details in :code:`ParamAttr`.
linear2_weight_attr(ParamAttr, optional): To specify the weight parameter property
for FFN second linear. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
linear2_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for FFN second linear. The `False` value means the corresponding layer would
not have trainable bias parameter. Default: None, which means the default bias
parameter property is used. See usage for details in :code:`ParamAttr`.
ln1_scale_attr(ParamAttr, optional): To specify the weight parameter property
for FFN pre_layer_norm. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
ln1_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for FFN pre_layer_norm. The `False` value means the corresponding layer would
not have trainable bias parameter. Default: None, which means the default bias
parameter property is used. See usage for details in :code:`ParamAttr`.
ln2_scale_attr(ParamAttr, optional): To specify the weight parameter property
for FFN post_layer_norm. Default: None, which means the default weight
parameter property is used. See usage for details in :code:`ParamAttr`.
ln2_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
for FFN layer_norm. The `False` value means the corresponding layer would
not have trainable bias parameter. Default: None, which means the default bias
parameter property is used. See usage for details in :code:`ParamAttr`.
nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using tensor parallel.
ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using tensor parallel.
name (str, optional): The default value is None. Normally there is no need for user to set
this property. For more information, please refer to :ref:`api_guide_Name`.
Examples:
.. code-block:: python
......@@ -272,62 +346,90 @@ class FusedFeedForward(Layer):
activation="relu",
act_dropout_rate=None,
normalize_before=False,
weight_attr=None,
bias_attr=None,
linear1_weight_attr=None,
linear1_bias_attr=None,
linear2_weight_attr=None,
linear2_bias_attr=None,
ln1_scale_attr=None,
ln1_bias_attr=None,
ln2_scale_attr=None,
ln2_bias_attr=None,
nranks=1,
ring_id=-1,
name=None):
super(FusedFeedForward, self).__init__()
assert d_model > 0, (
"Expected d_model to be greater than 0, but recieved {}".format(
"Expected d_model to be greater than 0, but received {}".format(
d_model))
assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, but recieved {}".
"Expected dim_feedforward to be greater than 0, but received {}".
format(dim_feedforward))
self._dtype = self._helper.get_default_dtype()
self._d_model = d_model
assert dim_feedforward % nranks == 0
dim_feedforward = dim_feedforward // nranks
self._dim_feedforward = dim_feedforward
self._dropout_rate = dropout_rate
self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
self._act_method = activation
self._normalize_before = normalize_before
self._epsilon = epsilon
self._ring_id = ring_id
self._linear1_weight = self.create_parameter(
shape=[d_model, dim_feedforward],
attr=weight_attr,
attr=linear1_weight_attr,
dtype=self._dtype,
is_bias=False)
self._linear1_bias = self.create_parameter(
shape=[dim_feedforward],
attr=bias_attr,
attr=linear1_bias_attr,
dtype=self._dtype,
is_bias=True)
self._linear2_weight = self.create_parameter(
shape=[dim_feedforward, d_model],
attr=weight_attr,
attr=linear2_weight_attr,
dtype=self._dtype,
is_bias=False)
self._linear2_bias = self.create_parameter(
shape=[d_model], attr=bias_attr, dtype=self._dtype, is_bias=True)
shape=[d_model],
attr=linear2_bias_attr,
dtype=self._dtype,
is_bias=True)
if nranks > 1:
assert ring_id != -1
# column parallel
_set_var_distributed(self._linear1_weight)
_set_var_distributed(self._linear1_bias)
_set_var_distributed(self._linear2_weight)
if normalize_before:
self._ln1_scale = self.create_parameter(
shape=[d_model],
attr=None,
attr=ln1_scale_attr,
is_bias=False,
default_initializer=Constant(1.0))
self._ln1_bias = self.create_parameter(
shape=[d_model], attr=None, is_bias=True)
shape=[d_model], attr=ln1_bias_attr, is_bias=True)
self._ln2_scale = None
self._ln2_bias = None
else:
self._ln1_scale = None
self._ln1_bias = None
self._ln2_scale = self.create_parameter(
shape=[d_model],
attr=None,
attr=ln2_scale_attr,
is_bias=False,
default_initializer=Constant(1.0))
self._ln2_bias = self.create_parameter(
shape=[d_model], attr=None, is_bias=True)
shape=[d_model], attr=ln2_bias_attr, is_bias=True)
self.name = name
def forward(self, src, cache=None):
......@@ -348,6 +450,7 @@ class FusedFeedForward(Layer):
ln2_epsilon=self._epsilon,
pre_layer_norm=self._normalize_before,
training=self.training,
ring_id=self._ring_id,
name=self.name)
return out
......@@ -434,12 +537,12 @@ class FusedTransformerEncoderLayer(Layer):
super(FusedTransformerEncoderLayer, self).__init__()
assert d_model > 0, ("Expected d_model to be greater than 0, "
"but recieved {}".format(d_model))
"but received {}".format(d_model))
assert nhead > 0, ("Expected nhead to be greater than 0, "
"but recieved {}".format(nhead))
"but received {}".format(nhead))
assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, "
"but recieved {}".format(dim_feedforward))
"but received {}".format(dim_feedforward))
attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate
act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
self.normalize_before = normalize_before
......@@ -453,8 +556,14 @@ class FusedTransformerEncoderLayer(Layer):
dropout_rate=dropout_rate,
attn_dropout_rate=attn_dropout_rate,
normalize_before=self.normalize_before,
weight_attr=weight_attrs[0],
bias_attr=bias_attrs[0])
qkv_weight_attr=weight_attrs[0],
qkv_bias_attr=bias_attrs[0],
linear_weight_attr=weight_attrs[0],
linear_bias_attr=bias_attrs[0],
pre_ln_scale_attr=weight_attrs[0],
pre_ln_bias_attr=bias_attrs[0],
ln_scale_attr=weight_attrs[0],
ln_bias_attr=bias_attrs[0])
self.ffn = FusedFeedForward(
d_model,
......@@ -463,8 +572,10 @@ class FusedTransformerEncoderLayer(Layer):
activation=activation,
act_dropout_rate=act_dropout_rate,
normalize_before=self.normalize_before,
weight_attr=weight_attrs[1],
bias_attr=bias_attrs[1])
linear1_weight_attr=weight_attrs[1],
linear1_bias_attr=bias_attrs[1],
linear2_weight_attr=weight_attrs[1],
linear2_bias_attr=bias_attrs[1])
def forward(self, src, src_mask=None, cache=None):
"""
......@@ -808,11 +919,11 @@ class FusedMultiTransformer(Layer):
super(FusedMultiTransformer, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
"but recieved {}".format(embed_dim))
"but received {}".format(embed_dim))
assert num_heads > 0, ("Expected nhead to be greater than 0, "
"but recieved {}".format(num_heads))
"but received {}".format(num_heads))
assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, but recieved {}".
"Expected dim_feedforward to be greater than 0, but received {}".
format(dim_feedforward))
self.normalize_before = normalize_before
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册