diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc
index e473f8ff0662cfc3fd7bdc5010bfa1dc08fba85f..ff57b21a1864b2f56aae4fc925c77168e5f3c01b 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cc
+++ b/paddle/fluid/operators/fused/fused_attention_op.cc
@@ -190,7 +190,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
     // the same as QKOut's shape.
     ctx->SetOutputDim("AttnDropoutOut",
                       {x_dim[0], y_dim[1], x_dim[1], out_seq_len});
-    if (ctx->Attrs().Get<bool>("attn_dropout_is_test") == false) {
+    if (ctx->Attrs().Get<bool>("is_test") == false) {
       ctx->SetOutputDim("AttnDropoutMaskOut",
                         {x_dim[0], y_dim[1], x_dim[1], out_seq_len});
     }
@@ -202,7 +202,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("FMHAOut", {x_dim[0], x_dim[1], y_dim[1], y_dim[2]});
     ctx->SetOutputDim("OutLinearOut", ctx->GetInputDim("X"));
 
-    if (ctx->Attrs().Get<bool>("dropout_is_test") == false) {
+    if (ctx->Attrs().Get<bool>("is_test") == false) {
       ctx->SetOutputDim("DropoutMaskOut", ctx->GetInputDim("X"));
     }
 
@@ -297,7 +297,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
               platform::errors::InvalidArgument(
                   "'attn_dropout_rate' must be between 0.0 and 1.0."));
         });
-    AddAttr<bool>("attn_dropout_is_test",
+    AddAttr<bool>("is_test",
                   "(bool, default false) Set to true for inference only, false "
                   "for training. Some layers may run faster when this is true.")
         .SetDefault(false);
@@ -341,11 +341,6 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
                             platform::errors::InvalidArgument(
                                 "'dropout_rate' must be between 0.0 and 1.0."));
         });
-
-    AddAttr<bool>("dropout_is_test",
-                  "(bool, default false) Set to true for inference only, false "
-                  "for training. Some layers may run faster when this is true.")
-        .SetDefault(false);
     AddAttr<bool>("dropout_fix_seed",
                   "A flag indicating whether to use a fixed seed to generate "
                   "random mask. NOTE: DO NOT set this flag to true in "
@@ -414,10 +409,9 @@ class FusedAttentionGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->Attrs().Get<bool>("attn_dropout_is_test"), false,
-        platform::errors::InvalidArgument(
-            "GradOp is only callable when attn_dropout_is_test is false"));
+    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_test"), false,
+                      platform::errors::InvalidArgument(
+                          "GradOp is only callable when is_test is false"));
 
     if (ctx->Attrs().Get<bool>("pre_layer_norm") == false) {
       OP_INOUT_CHECK(ctx->HasInput("Ln2Mean"), "Input", "Ln2Mean",
diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu
index d26577f06fe683fb1528c61b4401b9e578c90c9f..e94f3a5077da31baee429d00d3dfd518dc4965fc 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_attention_op.cu
@@ -108,7 +108,7 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
     const float ln_epsilon = ctx.Attr<float>("ln_epsilon");
 
     float attn_dropout_rate = ctx.Attr<float>("attn_dropout_rate");
-    bool is_test_1 = ctx.Attr<bool>("attn_dropout_is_test");
+    bool is_test_1 = ctx.Attr<bool>("is_test");
     auto &dropout_implementation_1 =
         ctx.Attr<std::string>("attn_dropout_implementation");
     bool is_upscale_in_train_1 =
@@ -279,7 +279,7 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
     const float ln2epsilon = ctx.Attr<float>("ln_epsilon");
 
     float attn_dropout_prob = ctx.Attr<float>("attn_dropout_rate");
-    bool is_test_1 = ctx.Attr<bool>("attn_dropout_is_test");
+    bool is_test_1 = ctx.Attr<bool>("is_test");
     auto &dropout_implementation_1 =
         ctx.Attr<std::string>("attn_dropout_implementation");
     bool is_upscale_in_train_1 =
diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h
index 0a33a60f8123dd5b2a8bbc13b30f8e79da4a247a..c352f08ec2ba7d8ca5b0ce072f7396f17b0e09c9 100644
--- a/paddle/fluid/operators/fused/fused_dropout_helper.h
+++ b/paddle/fluid/operators/fused/fused_dropout_helper.h
@@ -82,7 +82,7 @@ struct DropoutParam {
     auto& dropout_implementation =
         context.Attr<std::string>(pre_fix + "implementation");
     is_upscale_in_train = (dropout_implementation == "upscale_in_train");
-    is_test = context.Attr<bool>(pre_fix + "is_test");
+    is_test = context.Attr<bool>("is_test");
     fix_seed = context.Attr<bool>(pre_fix + "fix_seed");
 
     std::string str_seed = "Dropout";
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc
index f3f8f1742757783a082437638f67407700963eb1..8e15232acda90ea0f021b01b45d8caef06d1caf7 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cc
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc
@@ -61,14 +61,14 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel {
     tmp_dim_x[dim_x.size() - 1] =
         dim_Linear1Weight[dim_Linear1Weight.size() - 1];
     context->SetOutputDim("Out", dim_x);
-    if (context->Attrs().Get<bool>("dropout1_is_test") == false) {
+    if (context->Attrs().Get<bool>("is_test") == false) {
       context->SetOutputDim("Dropout1Mask", tmp_dim_x);
     }
     context->SetOutputDim("Dropout1Out", tmp_dim_x);
     context->SetOutputDim("Linear1Out", tmp_dim_x);
     context->SetOutputDim("Dropout2Out", dim_x);
 
-    if (context->Attrs().Get<bool>("dropout2_is_test") == false) {
+    if (context->Attrs().Get<bool>("is_test") == false) {
       context->SetOutputDim("Dropout2Mask", dim_x);
     }
     framework::DDim mean_dim =
@@ -185,9 +185,7 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker {
                   "dropout2_implementation can only be downgrade_in_infer or "
                   "upscale_in_train"));
         });
-    AddAttr<bool>("dropout1_is_test", "the is_test of first dropout")
-        .SetDefault(false);
-    AddAttr<bool>("dropout2_is_test", "the is_test of second dropout")
+    AddAttr<bool>("is_test", "the is_test attribute of dropout")
         .SetDefault(false);
     AddAttr<bool>("dropout1_fix_seed", "the is_test of first dropout")
         .SetDefault(false);
@@ -218,10 +216,7 @@ class FusedFeedForwardOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("dropout1_is_test"), false,
-                      platform::errors::InvalidArgument(
-                          "GradOp is only callable when is_test is false"));
-    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("dropout2_is_test"), false,
+    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_test"), false,
                       platform::errors::InvalidArgument(
                           "GradOp is only callable when is_test is false"));
     bool pre_layer_norm = ctx->Attrs().Get<bool>("pre_layer_norm");
diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc
index c95ca6fe0c96c45b0252e303351aa566092e37d2..98602e4edd0a2399faba2e3ec212bcf5d62d545d 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc
@@ -221,7 +221,7 @@ class FusedMultiTransformerOpOpMaker
                                 "'dropout_rate' must be between 0.0 and 1.0."));
         });
 
-    AddAttr<bool>("dropout_is_test",
+    AddAttr<bool>("is_test",
                   "(bool, default false) Set to true for inference only, false "
                   "for training. Some layers may run faster when this is true.")
         .SetDefault(false);
diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py
index b57f26776234eb65a57cc65df2ccd5a6a38a2144..163438d2d24276d4c0ce201d683526cfa2fca715 100644
--- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py
+++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py
@@ -20,154 +20,11 @@ import paddle
 import paddle.fluid as fluid
 from test_dist_base import TestDistRunnerBase, runtime_main
 import paddle.distributed.fleet as fleet
-import paddle.incubate.nn.functional as incubate_f
-
-from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
-from paddle.fluid.dygraph.layers import Layer
-from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid import core
-from paddle.nn.initializer import Constant
+from paddle.incubate.nn import FusedMultiHeadAttention
 
 paddle.enable_static()
 
 
-def _set_var_distributed(var):
-    if var is None:
-        return
-
-    var.is_distributed = True
-
-    # NOTE: use current_block and find_var_recursive to support while_loop
-    startup_block = paddle.static.default_startup_program().current_block()
-    main_block = paddle.static.default_main_program().current_block()
-    startup_block._find_var_recursive(var.name).is_distributed = True
-    main_block._find_var_recursive(var.name).is_distributed = True
-
-
-class ParallelFusedMultiHeadAttention(Layer):
-    def __init__(self,
-                 embed_dim,
-                 num_heads,
-                 dropout_rate=0.5,
-                 attn_dropout_rate=0.5,
-                 kdim=None,
-                 vdim=None,
-                 normalize_before=False,
-                 need_weights=False,
-                 qkv_weight_attr=None,
-                 qkv_bias_attr=None,
-                 linear_weight_attr=None,
-                 linear_bias_attr=None,
-                 pre_ln_scale_attr=None,
-                 pre_ln_bias_attr=None,
-                 ln_scale_attr=None,
-                 ln_bias_attr=None,
-                 epsilon=1e-5,
-                 nranks=1,
-                 ring_id=-1,
-                 name=None):
-        super(ParallelFusedMultiHeadAttention, self).__init__()
-
-        assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
-                               "but recieved {}".format(embed_dim))
-        assert num_heads > 0, ("Expected nhead to be greater than 0, "
-                               "but recieved {}".format(num_heads))
-
-        self.normalize_before = normalize_before
-        self._dtype = self._helper.get_default_dtype()
-        self._epsilon = epsilon
-        self._ring_id = ring_id
-
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.head_dim = embed_dim // num_heads
-        self.kdim = kdim
-        self.vdim = vdim
-        self.need_weights = need_weights
-        assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
-        assert need_weights == False, "Only support need_weight is False now."
-
-        # tensor model parallel
-        assert num_heads % nranks == 0
-        num_heads = num_heads // nranks
-
-        self.qkv_weight = self.create_parameter(
-            shape=[3, num_heads, self.head_dim, embed_dim],
-            attr=qkv_weight_attr,
-            dtype=self._dtype,
-            is_bias=False)
-        self.qkv_bias = self.create_parameter(
-            shape=[3, num_heads, self.head_dim],
-            attr=qkv_bias_attr,
-            dtype=self._dtype,
-            is_bias=True)
-        self.linear_weight = self.create_parameter(
-            shape=[num_heads * self.head_dim, embed_dim],
-            attr=linear_weight_attr,
-            dtype=self._dtype,
-            is_bias=False)
-        self.linear_bias = self.create_parameter(
-            shape=[embed_dim],
-            attr=linear_bias_attr,
-            dtype=self._dtype,
-            is_bias=True)
-
-        # tensor model parallel
-        if nranks > 1:
-            assert ring_id != -1
-            # column parallel
-            _set_var_distributed(self.qkv_weight)
-            _set_var_distributed(self.qkv_bias)
-            # row parallel
-            _set_var_distributed(self.linear_weight)
-
-        if normalize_before:
-            self.pre_ln_scale = self.create_parameter(
-                attr=pre_ln_scale_attr,
-                shape=[embed_dim],
-                default_initializer=Constant(value=1.0))
-            self.pre_ln_bias = self.create_parameter(
-                attr=pre_ln_bias_attr, shape=[embed_dim], is_bias=True)
-            self.ln_scale = None
-            self.ln_bias = None
-        else:
-            self.pre_ln_scale = None
-            self.pre_ln_bias = None
-            self.ln_scale = self.create_parameter(
-                attr=ln_scale_attr,
-                shape=[embed_dim],
-                default_initializer=Constant(value=1.0))
-            self.ln_bias = self.create_parameter(
-                attr=ln_bias_attr, shape=[embed_dim], is_bias=True)
-
-        self.dropout_rate = dropout_rate
-        self.attn_dropout_rate = attn_dropout_rate
-
-        self.name = name
-
-    def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
-        out = incubate_f.fused_multi_head_attention(
-            x=query,
-            qkv_weight=self.qkv_weight,
-            linear_weight=self.linear_weight,
-            pre_layer_norm=self.normalize_before,
-            pre_ln_scale=self.pre_ln_scale,
-            pre_ln_bias=self.pre_ln_bias,
-            ln_scale=self.ln_scale,
-            ln_bias=self.ln_bias,
-            pre_ln_epsilon=self._epsilon,
-            qkv_bias=self.qkv_bias,
-            linear_bias=self.linear_bias,
-            attn_mask=attn_mask,
-            dropout_rate=self.dropout_rate,
-            attn_dropout_rate=self.attn_dropout_rate,
-            ln_epsilon=self._epsilon,
-            training=self.training,
-            ring_id=self._ring_id,
-            name=self.name)
-        return out
-
-
 def get_param_attr(weight, bias):
     weight_attr = paddle.ParamAttr(
         initializer=fluid.initializer.NumpyArrayInitializer(weight))
@@ -206,7 +63,7 @@ def create_model(data, rank):
         qkv_w_attr, qkv_b_attr = get_param_attr(col_qkv_w, col_qkv_b)
         linear_w_attr, linear_b_attr = get_param_attr(row_linear_w, linear_b)
 
-        attn = ParallelFusedMultiHeadAttention(
+        attn = FusedMultiHeadAttention(
             hidden,
             n_head,
             dropout_rate=0.0,
@@ -228,7 +85,7 @@ def create_model(data, rank):
         qkv_w_attr, qkv_b_attr = get_param_attr(qkv_w, qkv_b)
         linear_w_attr, linear_b_attr = get_param_attr(linear_w, linear_b)
 
-        attn = ParallelFusedMultiHeadAttention(
+        attn = FusedMultiHeadAttention(
             hidden,
             n_head,
             dropout_rate=0.0,
diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py
index 5f467da6a6465467dbf0c64122b6933df92a4cbc..e9af3884537f92ff7ee2f0199d07d324bff4f862 100644
--- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py
+++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py
@@ -20,11 +20,7 @@ import paddle
 import paddle.fluid as fluid
 from test_dist_base import TestDistRunnerBase, runtime_main
 import paddle.distributed.fleet as fleet
-
-from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
-from paddle.fluid.dygraph.layers import Layer
-from paddle.fluid.layer_helper import LayerHelper
-from paddle.nn.initializer import Constant
+from paddle.incubate.nn import FusedFeedForward
 
 paddle.enable_static()
 
@@ -34,239 +30,6 @@ IN_SIZE = 2 * MODEL_PARALLEL_SIZE
 OUT_SIZE = 2 * MODEL_PARALLEL_SIZE
 
 
-def fused_feedforward(x,
-                      linear1_weight,
-                      linear2_weight,
-                      linear1_bias=None,
-                      linear2_bias=None,
-                      ln1_scale=None,
-                      ln1_bias=None,
-                      ln2_scale=None,
-                      ln2_bias=None,
-                      dropout1_rate=0.5,
-                      dropout2_rate=0.5,
-                      activation="relu",
-                      ln1_epsilon=1e-5,
-                      ln2_epsilon=1e-5,
-                      pre_layer_norm=False,
-                      training=True,
-                      mode='upscale_in_train',
-                      ring_id=-1,
-                      name=None):
-    seed = None
-    if mode not in ('downscale_in_infer', 'upscale_in_train'):
-        raise ValueError(
-            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'")
-    mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer
-
-    helper = LayerHelper("fused_feedforward")
-    dtype = x.dtype
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'fused_feedforward')
-    check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'],
-                'fused_feedforward')
-
-    out = helper.create_variable_for_type_inference(x.dtype)
-    dropout1_mask = helper.create_variable_for_type_inference(
-        'uint8', stop_gradient=True)
-    dropout2_mask = helper.create_variable_for_type_inference(
-        'uint8', stop_gradient=True)
-    ln1_mean = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    ln1_variance = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    ln2_mean = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    ln2_variance = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    linear1_out = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    ln1_out = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    dropout1_out = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    dropout2_out = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-
-    if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
-        seed = helper.main_program.random_seed
-
-    helper.append_op(
-        type='fused_feedforward',
-        inputs={
-            'X': x,
-            'Linear1Weight': linear1_weight,
-            'Linear1Bias': linear1_bias,
-            'Linear2Weight': linear2_weight,
-            'Linear2Bias': linear2_bias,
-            'Ln1Scale': ln1_scale,
-            'Ln1Bias': ln1_bias,
-            'Ln2Scale': ln2_scale,
-            'Ln2Bias': ln2_bias,
-        },
-        outputs={
-            'Out': out,
-            'Dropout1Mask': dropout1_mask,
-            'Dropout2Mask': dropout2_mask,
-            'Ln1Mean': ln1_mean,
-            'Ln1Variance': ln1_variance,
-            'Ln2Mean': ln2_mean,
-            'Ln2Variance': ln2_variance,
-            'Linear1Out': linear1_out,
-            'Ln1Out': ln1_out,
-            'Dropout1Out': dropout1_out,
-            'Dropout2Out': dropout2_out,
-        },
-        attrs={
-            'dropout1_rate': dropout1_rate,
-            'dropout2_rate': dropout2_rate,
-            'act_method': activation,
-            'pre_layer_norm': pre_layer_norm,
-            'ln1_epsilon': ln1_epsilon,
-            'ln2_epsilon': ln2_epsilon,
-            'dropout1_is_test': not training,
-            'dropout2_is_test': not training,
-            'dropout1_fix_seed': seed is not None,
-            'dropout2_fix_seed': seed is not None,
-            'dropout1_seed': seed if seed is not None else 0,
-            'dropout2_seed': seed if seed is not None else 0,
-            'dropout1_implementation': mode,
-            'dropout2_implementation': mode,
-            'ring_id': ring_id,
-        })
-    return out
-
-
-def _set_var_distributed(var):
-    if var is None:
-        return
-
-    var.is_distributed = True
-
-    # NOTE: use current_block and find_var_recursive to support while_loop
-    startup_block = paddle.static.default_startup_program().current_block()
-    main_block = paddle.static.default_main_program().current_block()
-    startup_block._find_var_recursive(var.name).is_distributed = True
-    main_block._find_var_recursive(var.name).is_distributed = True
-
-
-class ParallelFusedFeedForward(Layer):
-    def __init__(self,
-                 d_model,
-                 dim_feedforward,
-                 dropout_rate=0.1,
-                 epsilon=1e-05,
-                 activation="relu",
-                 act_dropout_rate=None,
-                 normalize_before=False,
-                 linear1_weight_attr=None,
-                 linear1_bias_attr=None,
-                 linear2_weight_attr=None,
-                 linear2_bias_attr=None,
-                 ln1_scale_attr=None,
-                 ln1_bias_attr=None,
-                 ln2_scale_attr=None,
-                 ln2_bias_attr=None,
-                 nranks=1,
-                 ring_id=-1,
-                 name=None):
-        super(ParallelFusedFeedForward, self).__init__()
-        assert d_model > 0, (
-            "Expected d_model to be greater than 0, but recieved {}".format(
-                d_model))
-        assert dim_feedforward > 0, (
-            "Expected dim_feedforward to be greater than 0, but recieved {}".
-            format(dim_feedforward))
-
-        self._dtype = self._helper.get_default_dtype()
-        self._d_model = d_model
-
-        assert dim_feedforward % nranks == 0
-        dim_feedforward = dim_feedforward // nranks
-        self._dim_feedforward = dim_feedforward
-        self._dropout_rate = dropout_rate
-        self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
-        self._act_method = activation
-        self._normalize_before = normalize_before
-        self._epsilon = epsilon
-        self._ring_id = ring_id
-
-        self._linear1_weight = self.create_parameter(
-            shape=[d_model, dim_feedforward],
-            attr=linear1_weight_attr,
-            dtype=self._dtype,
-            is_bias=False)
-        self._linear1_bias = self.create_parameter(
-            shape=[dim_feedforward],
-            attr=linear1_bias_attr,
-            dtype=self._dtype,
-            is_bias=True)
-
-        self._linear2_weight = self.create_parameter(
-            shape=[dim_feedforward, d_model],
-            attr=linear2_weight_attr,
-            dtype=self._dtype,
-            is_bias=False)
-
-        self._linear2_bias = self.create_parameter(
-            shape=[d_model],
-            attr=linear2_bias_attr,
-            dtype=self._dtype,
-            is_bias=True)
-
-        if nranks > 1:
-            assert ring_id != -1
-            # column parallel
-            _set_var_distributed(self._linear1_weight)
-            _set_var_distributed(self._linear1_bias)
-            _set_var_distributed(self._linear2_weight)
-
-        if normalize_before:
-            self._ln1_scale = self.create_parameter(
-                shape=[d_model],
-                attr=ln1_scale_attr,
-                is_bias=False,
-                default_initializer=Constant(1.0))
-            self._ln1_bias = self.create_parameter(
-                shape=[d_model], attr=ln1_bias_attr, is_bias=True)
-            self._ln2_scale = None
-            self._ln2_bias = None
-        else:
-            self._ln1_bias = None
-            self._ln2_bias = None
-            self._ln2_scale = self.create_parameter(
-                shape=[d_model],
-                attr=ln2_scale_attr,
-                is_bias=False,
-                default_initializer=Constant(1.0))
-            self._ln2_bias = self.create_parameter(
-                shape=[d_model], attr=ln2_bias_attr, is_bias=True)
-
-        self.name = name
-
-    def forward(self, src, cache=None):
-        out = fused_feedforward(
-            src,
-            self._linear1_weight,
-            self._linear2_weight,
-            self._linear1_bias,
-            self._linear2_bias,
-            self._ln1_scale,
-            self._ln1_bias,
-            self._ln2_scale,
-            self._ln2_bias,
-            dropout1_rate=self._act_dropout_rate,
-            dropout2_rate=self._dropout_rate,
-            activation=self._act_method,
-            ln1_epsilon=self._epsilon,
-            ln2_epsilon=self._epsilon,
-            pre_layer_norm=self._normalize_before,
-            training=self.training,
-            ring_id=self._ring_id,
-            name=self.name)
-        return out
-
-
 def get_param_attr(weight, bias):
     weight_attr = paddle.ParamAttr(
         initializer=fluid.initializer.NumpyArrayInitializer(weight))
@@ -295,7 +58,7 @@ def create_model(data, rank):
         w0_attr, b0_attr = get_param_attr(col_w0, col_b0)
         w1_attr, b1_attr = get_param_attr(row_w1, b1)
 
-        ffn = ParallelFusedFeedForward(
+        ffn = FusedFeedForward(
             IN_SIZE,
             OUT_SIZE,
             dropout_rate=0.0,
@@ -316,7 +79,7 @@ def create_model(data, rank):
         w0_attr, b0_attr = get_param_attr(w0, b0)
         w1_attr, b1_attr = get_param_attr(w1, b1)
 
-        ffn = ParallelFusedFeedForward(
+        ffn = FusedFeedForward(
             IN_SIZE,
             OUT_SIZE,
             dropout_rate=0.0,
diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
index a3ae2a20dba23ef39510e962b148d40364f85e72..255388a53bf8741a406f5b25bd861f6f800d5a99 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
@@ -35,6 +35,18 @@ class TestFusedAttentionOp(OpTest):
     def setUp(self):
         self.config()
         self.generate_input_data()
+
+        self.rtol = 1e-5
+        # FIXME(limin29): Because there is a problem with the test precision
+        #  on A100, atol is temporarily set to 1e-2, and it will be
+        #  changed back after the precision problem is solved.
+        self.atol = 1e-2
+        # make sure local development precision
+        if "V100" in paddle.device.cuda.get_device_name():
+            self.atol = 1e-4
+        if self.x_type is np.float16:
+            self.atol = 1e-1
+
         paddle.set_default_dtype(self.x_type)
         self.__class__.op_type = "fused_attention"
         # use autograd to check grad in this unittest.
@@ -273,9 +285,9 @@ class TestFusedAttentionOp(OpTest):
         final_out_ref, x_grad_ref = self.GetBaselineOut()
         final_out, x_grad = self.GetFusedAttentionOut()
         np.testing.assert_allclose(
-            final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4)
+            final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol)
         np.testing.assert_allclose(
-            x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-4)
+            x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol)
 
 
 class TestFusedAttentionOpBiasIsNone(TestFusedAttentionOp):
@@ -306,9 +318,9 @@ class TestFusedAttentionOpFp16(TestFusedAttentionOp):
         final_out_ref, x_grad_ref = self.GetBaselineOut()
         final_out, x_grad = self.GetFusedAttentionOut()
         np.testing.assert_allclose(
-            final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-1)
+            final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol)
         np.testing.assert_allclose(
-            x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-1)
+            x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol)
 
 
 class TestFusedAttentionOpCacheKV(TestFusedAttentionOp):
@@ -324,7 +336,10 @@ class TestFusedAttentionOpCacheKV(TestFusedAttentionOp):
             final_out_ref = self.GetBaselineOut()
             final_out, cache_kv_out = self.GetFusedAttentionOut()
             np.testing.assert_allclose(
-                final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4)
+                final_out_ref,
+                final_out.numpy(),
+                rtol=self.rtol,
+                atol=self.atol)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py
index bdaf32ee0726dcbcf362fe1864913126db4904f0..39e3cf968912b5c8ef3e82f4863bf98443d5b5b8 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py
@@ -83,7 +83,7 @@ def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias,
     if ln_bias is None:
         has_bias = False
 
-    if (pre_layer_norm):
+    if pre_layer_norm:
         ln_out = layer_norm(query, True, has_bias, ln_scale, ln_bias)
 
     num_head = qkv_weight.shape[1]
@@ -96,7 +96,7 @@ def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias,
     if qkv_bias is not None:
         qkv_bias = qkv_bias.reshape(qkv_bias.shape[0] * qkv_bias.shape[1] *
                                     qkv_bias.shape[2])
-    if (pre_layer_norm):
+    if pre_layer_norm:
         ln_out = ln_out.reshape(batch_size * seq_len, embed_dim)
         qkv = fc(ln_out, qkv_weight)
         if qkv_bias is not None:
@@ -173,6 +173,17 @@ class TestFusedAttentionAPI(unittest.TestCase):
         self.config()
         self.generate_input_data()
 
+        self.rtol = 1e-5
+        # FIXME(limin29): Because there is a problem with the test precision
+        #  on A100, atol is temporarily set to 1e-2, and it will be
+        #  changed back after the precision problem is solved.
+        self.atol = 1e-2
+        # make sure local development precision
+        if "V100" in paddle.device.cuda.get_device_name():
+            self.atol = 1e-4
+        if self.x_type is np.float16:
+            self.atol = 1e-1
+
     def setAttnMask(self):
         self.has_attn_mask = True
 
@@ -230,7 +241,9 @@ class TestFusedAttentionAPI(unittest.TestCase):
         fused_attn = FusedMultiHeadAttention(
             self.embed_dim, self.num_heads, self.dropout_prob,
             self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm,
-            self.need_weight, self.weight_attr, self.bias_attr)
+            self.need_weight, self.weight_attr, self.bias_attr,
+            self.weight_attr, self.bias_attr, self.weight_attr, self.bias_attr,
+            self.weight_attr, self.bias_attr)
         if self.bias_attr is not False:
             qkv_bias = np.random.random(fused_attn.qkv_bias.shape).astype(
                 'float32')
@@ -247,22 +260,31 @@ class TestFusedAttentionAPI(unittest.TestCase):
         if self.bias_attr is not False:
             fused_attn_qkv_bias = fused_attn.qkv_bias.numpy()
             fused_attn_linear_bias = fused_attn.linear_bias.numpy()
-            fused_attn_pre_ln_bias = fused_attn.pre_ln_bias.numpy()
-            fused_attn_ln_bias = fused_attn.ln_bias.numpy()
+            if self.pre_layer_norm:
+                fused_attn_pre_ln_bias = fused_attn.pre_ln_bias.numpy()
+                fused_attn_ln_bias = None
+            else:
+                fused_attn_pre_ln_bias = None
+                fused_attn_ln_bias = fused_attn.ln_bias.numpy()
 
         ref_out = compute_reference(
             self.pre_layer_norm, self.query, self.attn_mask,
-            fused_attn.pre_ln_scale.numpy(), fused_attn_pre_ln_bias,
-            fused_attn.ln_scale.numpy(), fused_attn_ln_bias,
+            fused_attn.pre_ln_scale.numpy()
+            if self.pre_layer_norm else None, fused_attn_pre_ln_bias,
+            fused_attn.ln_scale.numpy()
+            if not self.pre_layer_norm else None, fused_attn_ln_bias,
             fused_attn.qkv_weight.numpy(), fused_attn_qkv_bias,
             fused_attn.linear_weight.numpy(), fused_attn_linear_bias)
-        np.testing.assert_allclose(ref_out, out.numpy(), rtol=1e-5, atol=1e-4)
+        np.testing.assert_allclose(
+            ref_out, out.numpy(), rtol=self.rtol, atol=self.atol)
 
     def run_static(self):
         fused_attn = FusedMultiHeadAttention(
             self.embed_dim, self.num_heads, self.dropout_prob,
             self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm,
-            self.need_weight, self.weight_attr, self.bias_attr)
+            self.need_weight, self.weight_attr, self.bias_attr,
+            self.weight_attr, self.bias_attr, self.weight_attr, self.bias_attr,
+            self.weight_attr, self.bias_attr)
 
         x = paddle.static.data(
             name='X',
@@ -286,50 +308,102 @@ class TestFusedAttentionAPI(unittest.TestCase):
 
         qkv_bias = None
         linear_bias = None
+        ln_scale = None
+        ln_2_scale = None
         ln_bias = None
         ln_2_bias = None
         if self.has_attn_mask:
             if self.bias_attr is False:
-                out, qkv_weight, out_linear_weight, ln_scale, ln_2_scale = exe.run(
-                    paddle.static.default_main_program(),
-                    feed={"X": self.query,
-                          "SrcMask": self.attn_mask},
-                    fetch_list=[
-                        final_out, fused_attn.qkv_weight,
-                        fused_attn.linear_weight, fused_attn.pre_ln_scale,
-                        fused_attn.ln_scale
-                    ])
+                if self.pre_layer_norm:
+                    out, qkv_weight, out_linear_weight, ln_scale = exe.run(
+                        paddle.static.default_main_program(),
+                        feed={"X": self.query,
+                              "SrcMask": self.attn_mask},
+                        fetch_list=[
+                            final_out,
+                            fused_attn.qkv_weight,
+                            fused_attn.linear_weight,
+                            fused_attn.pre_ln_scale,
+                        ])
+                else:
+                    out, qkv_weight, out_linear_weight, ln_2_scale = exe.run(
+                        paddle.static.default_main_program(),
+                        feed={"X": self.query,
+                              "SrcMask": self.attn_mask},
+                        fetch_list=[
+                            final_out, fused_attn.qkv_weight,
+                            fused_attn.linear_weight, fused_attn.ln_scale
+                        ])
             else:
-                out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run(
-                    paddle.static.default_main_program(),
-                    feed={"X": self.query,
-                          "SrcMask": self.attn_mask},
-                    fetch_list=[
-                        final_out, fused_attn.qkv_weight, fused_attn.qkv_bias,
-                        fused_attn.linear_weight, fused_attn.linear_bias,
-                        fused_attn.pre_ln_scale, fused_attn.pre_ln_bias,
-                        fused_attn.ln_scale, fused_attn.ln_bias
-                    ])
+                if self.pre_layer_norm:
+                    out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias = exe.run(
+                        paddle.static.default_main_program(),
+                        feed={"X": self.query,
+                              "SrcMask": self.attn_mask},
+                        fetch_list=[
+                            final_out,
+                            fused_attn.qkv_weight,
+                            fused_attn.qkv_bias,
+                            fused_attn.linear_weight,
+                            fused_attn.linear_bias,
+                            fused_attn.pre_ln_scale,
+                            fused_attn.pre_ln_bias,
+                        ])
+                else:
+                    out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_2_scale, ln_2_bias = exe.run(
+                        paddle.static.default_main_program(),
+                        feed={"X": self.query,
+                              "SrcMask": self.attn_mask},
+                        fetch_list=[
+                            final_out, fused_attn.qkv_weight,
+                            fused_attn.qkv_bias, fused_attn.linear_weight,
+                            fused_attn.linear_bias, fused_attn.ln_scale,
+                            fused_attn.ln_bias
+                        ])
         else:
             if self.bias_attr is False:
-                out, qkv_weight, out_linear_weight, ln_scale, ln_2_scale = exe.run(
-                    paddle.static.default_main_program(),
-                    feed={"X": self.query, },
-                    fetch_list=[
-                        final_out, fused_attn.qkv_weight,
-                        fused_attn.linear_weight, fused_attn.pre_ln_scale,
-                        fused_attn.ln_scale
-                    ])
+                if self.pre_layer_norm:
+                    out, qkv_weight, out_linear_weight, ln_scale = exe.run(
+                        paddle.static.default_main_program(),
+                        feed={"X": self.query, },
+                        fetch_list=[
+                            final_out,
+                            fused_attn.qkv_weight,
+                            fused_attn.linear_weight,
+                            fused_attn.pre_ln_scale,
+                        ])
+                else:
+                    out, qkv_weight, out_linear_weight, ln_2_scale = exe.run(
+                        paddle.static.default_main_program(),
+                        feed={"X": self.query, },
+                        fetch_list=[
+                            final_out, fused_attn.qkv_weight,
+                            fused_attn.linear_weight, fused_attn.ln_scale
+                        ])
             else:
-                out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run(
-                    paddle.static.default_main_program(),
-                    feed={"X": self.query, },
-                    fetch_list=[
-                        final_out, fused_attn.qkv_weight, fused_attn.qkv_bias,
-                        fused_attn.linear_weight, fused_attn.linear_bias,
-                        fused_attn.pre_ln_scale, fused_attn.pre_ln_bias,
-                        fused_attn.ln_scale, fused_attn.ln_bias
-                    ])
+                if self.pre_layer_norm:
+                    out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias = exe.run(
+                        paddle.static.default_main_program(),
+                        feed={"X": self.query, },
+                        fetch_list=[
+                            final_out,
+                            fused_attn.qkv_weight,
+                            fused_attn.qkv_bias,
+                            fused_attn.linear_weight,
+                            fused_attn.linear_bias,
+                            fused_attn.pre_ln_scale,
+                            fused_attn.pre_ln_bias,
+                        ])
+                else:
+                    out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_2_scale, ln_2_bias = exe.run(
+                        paddle.static.default_main_program(),
+                        feed={"X": self.query, },
+                        fetch_list=[
+                            final_out, fused_attn.qkv_weight,
+                            fused_attn.qkv_bias, fused_attn.linear_weight,
+                            fused_attn.linear_bias, fused_attn.ln_scale,
+                            fused_attn.ln_bias
+                        ])
         return out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias
 
     def test_static_api(self):
@@ -341,7 +415,7 @@ class TestFusedAttentionAPI(unittest.TestCase):
                                     self.attn_mask, ln_scale, ln_bias,
                                     ln_2_scale, ln_2_bias, qkv_weight, qkv_bias,
                                     linear_weight, linear_bias)
-        np.testing.assert_allclose(ref_out, out, rtol=1e-5, atol=1e-4)
+        np.testing.assert_allclose(ref_out, out, rtol=self.rtol, atol=self.atol)
 
     def test_dynamic_api(self):
         paddle.disable_static(place=paddle.CUDAPlace(0))
diff --git a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py
index a533b5d87a5a9be1809f24f8107501f380afdfe7..a3b72fd6a8f8bf9e3886fe04ab2cb1e50805f4b1 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py
@@ -39,7 +39,12 @@ class TestFusedFFNOp(OpTest):
 
     def getDiff(self):
         self.rtol = 1e-3
-        self.atol = 1e-4
+        # FIXME(limin29): Because there is a problem with the test precision
+        #  on A100, atol is temporarily set to 1e-2, and it will be
+        #  changed back after the precision problem is solved.
+        self.atol = 1e-2
+        if "V100" in paddle.device.cuda.get_device_name():
+            self.atol = 1e-4
 
     def getActivation(self):
         self.act_method = "gelu"
diff --git a/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py b/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py
index 7dc86d0dea382fcda7298a7b997a322f34d57a34..843b495e85b9a758f8693946895f11f07da0857e 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py
@@ -49,6 +49,14 @@ class TestFusedTransformerEncoderLayer(unittest.TestCase):
         self.setPreLayerNorm()
         self.setAttnMask()
 
+        self.rtol = 1e-3
+        # FIXME(limin29): Because there is a problem with the test precision
+        #  on A100, atol is temporarily set to 1e-2, and it will be
+        #  changed back after the precision problem is solved.
+        self.atol = 1e-2
+        if "V100" in paddle.device.cuda.get_device_name():
+            self.atol = 1e-4
+
     def fused_weight(self, weight, num_head):
         a = paddle.transpose(weight, perm=[1, 0])
         return paddle.reshape(
@@ -151,13 +159,13 @@ class TestFusedTransformerEncoderLayer(unittest.TestCase):
         self.assertTrue(fused_encoder.fused_attn.extra_repr(), correct_attn_str)
 
         np.testing.assert_allclose(
-            fused_out.numpy(), base_out.numpy(), rtol=1e-3, atol=1e-4)
+            fused_out.numpy(), base_out.numpy(), rtol=self.rtol, atol=self.atol)
         self.assertTrue(
             np.allclose(
                 fused_out.grad.numpy(),
                 base_out.grad.numpy(),
-                rtol=1e-3,
-                atol=1e-4))
+                rtol=self.rtol,
+                atol=self.atol))
 
 
 class TestFusedTransformerEncoderLayerAct(TestFusedTransformerEncoderLayer):
diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py
index 3e263f1c6d3aef62396d8c8c39da229dee6458d3..08c7eaa73ec30a1c4bfdbdd4b709c165e9620a3f 100644
--- a/python/paddle/incubate/nn/functional/fused_transformer.py
+++ b/python/paddle/incubate/nn/functional/fused_transformer.py
@@ -45,6 +45,7 @@ def fused_feedforward(x,
                       pre_layer_norm=False,
                       training=True,
                       mode='upscale_in_train',
+                      ring_id=-1,
                       name=None):
     r"""
     This is a fusion operator to compute feed forward layer in transformer model architecture.
@@ -88,6 +89,7 @@ def fused_feedforward(x,
 
                                   - train: out = input * mask
                                   - inference: out = input * (1.0 - p)
+        ring_id (int, optional): For distributed forward in tensor model parallel, only support NCCL. Default is -1, means not using tensor parallel.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -127,12 +129,11 @@ def fused_feedforward(x,
             'pre_layer_norm', pre_layer_norm, 'ln1_epsilon', ln1_epsilon,
             'ln2_epsilon', ln2_epsilon, 'act_method', activation,
             'dropout1_rate', dropout1_rate, 'dropout2_rate', dropout2_rate,
-            "dropout1_is_test", not training, "dropout2_is_test", not training,
-            "dropout1_fix_seed", seed is not None, "dropout2_fix_seed",
-            seed is not None, "dropout1_seed", seed
+            "is_test", not training, "dropout1_fix_seed", seed is not None,
+            "dropout2_fix_seed", seed is not None, "dropout1_seed", seed
             if seed is not None else 0, "dropout2_seed", seed
             if seed is not None else 0, 'dropout1_implementation', mode,
-            'dropout2_implementation', mode)
+            'dropout2_implementation', mode, 'ring_id', ring_id)
         return out
 
     helper = LayerHelper("fused_feedforward")
@@ -200,14 +201,14 @@ def fused_feedforward(x,
             'pre_layer_norm': pre_layer_norm,
             'ln1_epsilon': ln1_epsilon,
             'ln2_epsilon': ln2_epsilon,
-            'dropout1_is_test': not training,
-            'dropout2_is_test': not training,
+            'is_test': not training,
             'dropout1_fix_seed': seed is not None,
             'dropout2_fix_seed': seed is not None,
             'dropout1_seed': seed if seed is not None else 0,
             'dropout2_seed': seed if seed is not None else 0,
             'dropout1_implementation': mode,
-            'dropout2_implementation': mode
+            'dropout2_implementation': mode,
+            'ring_id': ring_id,
         })
     return out
 
@@ -368,10 +369,9 @@ def fused_multi_head_attention(x,
             attn_mask, linear_weight, linear_bias, ln_scale, ln_bias,
             'pre_layer_norm', pre_layer_norm, 'epsilon', pre_ln_epsilon,
             'dropout_rate', dropout_rate, 'attn_dropout_rate',
-            attn_dropout_rate, 'ln_epsilon', ln_epsilon, 'attn_dropout_is_test',
-            not training, 'dropout_is_test', not training,
-            'attn_dropout_fix_seed', seed is not None, 'dropout_fix_seed',
-            seed is not None, 'attn_dropout_seed', seed
+            attn_dropout_rate, 'ln_epsilon', ln_epsilon, 'is_test',
+            not training, 'attn_dropout_fix_seed', seed is not None,
+            'dropout_fix_seed', seed is not None, 'attn_dropout_seed', seed
             if seed is not None else 0, 'dropout_seed', seed
             if seed is not None else 0, 'attn_dropout_implementation', mode,
             'dropout_implementation', mode, 'ring_id', ring_id)
@@ -417,8 +417,7 @@ def fused_multi_head_attention(x,
             'ln_epsilon': ln_epsilon,
             'dropout_rate': dropout_rate,
             'attn_dropout_rate': attn_dropout_rate,
-            'attn_dropout_is_test': not training,
-            'dropout_is_test': not training,
+            'is_test': not training,
             'attn_dropout_fix_seed': seed is not None,
             'dropout_fix_seed': seed is not None,
             'attn_dropout_seed': seed if seed is not None else 0,
@@ -656,7 +655,7 @@ def fused_multi_transformer(x,
             time_step, attn_mask, linear_weights, linear_biases, ffn_ln_scales,
             ffn_ln_biases, ffn1_weights, ffn1_biases, ffn2_weights, ffn2_biases,
             cache_kvs, 'pre_layer_norm', pre_layer_norm, 'epsilon', epsilon,
-            'dropout_rate', dropout_rate, 'dropout_is_test', not training,
+            'dropout_rate', dropout_rate, 'is_test', not training,
             'dropout_implementation', mode, 'act_method', activation, 'ring_id',
             ring_id)
         if cache_kvs is not None:
@@ -703,7 +702,7 @@ def fused_multi_transformer(x,
             'pre_layer_norm': pre_layer_norm,
             'epsilon': epsilon,
             'dropout_rate': dropout_rate,
-            'dropout_is_test': not training,
+            'is_test': not training,
             'dropout_implementation': mode,
             'act_method': activation,
             'ring_id': ring_id
diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py
index d76b990958c9450603763b038cdc189a7104c126..d5f76ef5729593b173dda1ca2aef19c6d1f46326 100644
--- a/python/paddle/incubate/nn/layer/fused_transformer.py
+++ b/python/paddle/incubate/nn/layer/fused_transformer.py
@@ -61,15 +61,39 @@ class FusedMultiHeadAttention(Layer):
             (True) or post_layer_norm architecture (False). Default False.
         need_weights (bool, optional): Indicate whether to return the attention
             weights. Now, only False is supported. Default False.
-        weight_attr(ParamAttr, optional):  To specify the weight parameter property.
-            Default: None, which means the default weight parameter property is used.
-            See usage for details in :code:`ParamAttr`.
-        bias_attr (ParamAttr|bool, optional): To specify the bias parameter property.
-            Default: None, which means the default bias parameter property is used.
-            If it is set to False, this layer will not have trainable bias parameter.
-            See usage for details in :code:`ParamAttr`.
+        qkv_weight_attr(ParamAttr, optional): To specify the weight parameter property
+            for QKV projection computation. Default: None, which means the default weight
+            parameter property is used. See usage for details in :code:`ParamAttr`.
+        qkv_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
+            for QKV projection computation. The `False` value means the corresponding layer
+            would not have trainable bias parameter. Default: None, which means the
+            default bias parameter property is used. See usage for details in :code:`ParamAttr`.
+        linear_weight_attr(ParamAttr, optional): To specify the weight parameter property
+            for linear projection computation. Default: None, which means the default weight
+            parameter property is used. See usage for details in :code:`ParamAttr`.
+        linear_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
+            for linear projection computation. The `False` value means the corresponding layer would
+            not have trainable bias parameter. Default: None, which means the default bias
+            parameter property is used. See usage for details in :code:`ParamAttr`.
+        pre_ln_scale_attr(ParamAttr, optional): To specify the weight parameter property
+            for pre_layer_norm computation. Otherwise, all layers both use it as
+            `attr` to create parameters. Default: None, which means the default weight
+            parameter property is used. See usage for details in :code:`ParamAttr`.
+        pre_ln_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
+            for pre_layer_norm computation. The `False` value means the corresponding layer would
+            not have trainable bias parameter. Default: None, which means the default bias
+            parameter property is used. See usage for details in :code:`ParamAttr`.
+        ln_scale_attr(ParamAttr, optional): To specify the weight parameter property
+            for post_layer_norm computation. Default: None, which means the default weight
+            parameter property is used. See usage for details in :code:`ParamAttr`.
+        ln_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
+            for post_layer_norm computation. The `False` value means the corresponding layer would
+            not have trainable bias parameter. Default: None, which means the default bias
+            parameter property is used. See usage for details in :code:`ParamAttr`.
         epsilon (float, optional): The small value added to the variance to prevent
             division by zero. Default: 1e-05.
+        nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using tensor parallel.
+        ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using tensor parallel.
 
     Examples:
 
@@ -94,22 +118,29 @@ class FusedMultiHeadAttention(Layer):
                  vdim=None,
                  normalize_before=False,
                  need_weights=False,
-                 weight_attr=None,
-                 bias_attr=None,
+                 qkv_weight_attr=None,
+                 qkv_bias_attr=None,
+                 linear_weight_attr=None,
+                 linear_bias_attr=None,
+                 pre_ln_scale_attr=None,
+                 pre_ln_bias_attr=None,
+                 ln_scale_attr=None,
+                 ln_bias_attr=None,
                  epsilon=1e-5,
+                 nranks=1,
+                 ring_id=-1,
                  name=None):
         super(FusedMultiHeadAttention, self).__init__()
 
         assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
-                               "but recieved {}".format(embed_dim))
+                               "but received {}".format(embed_dim))
         assert num_heads > 0, ("Expected nhead to be greater than 0, "
-                               "but recieved {}".format(num_heads))
+                               "but received {}".format(num_heads))
 
         self.normalize_before = normalize_before
         self._dtype = self._helper.get_default_dtype()
-        self._weight_attr = weight_attr
-        self._bias_attr = bias_attr
         self._epsilon = epsilon
+        self._ring_id = ring_id
 
         self.embed_dim = embed_dim
         self.num_heads = num_heads
@@ -118,41 +149,60 @@ class FusedMultiHeadAttention(Layer):
         self.vdim = vdim
         self.need_weights = need_weights
         assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
-        assert need_weights == False, "Only support need_weight is False now."
+        assert need_weights is False, "Only support need_weight is False now."
+
+        # tensor model parallel
+        assert num_heads % nranks == 0
+        num_heads = num_heads // nranks
 
         self.qkv_weight = self.create_parameter(
             shape=[3, num_heads, self.head_dim, embed_dim],
-            attr=self._weight_attr,
+            attr=qkv_weight_attr,
             dtype=self._dtype,
             is_bias=False)
         self.qkv_bias = self.create_parameter(
             shape=[3, num_heads, self.head_dim],
-            attr=self._bias_attr,
+            attr=qkv_bias_attr,
             dtype=self._dtype,
             is_bias=True)
         self.linear_weight = self.create_parameter(
-            shape=[embed_dim, embed_dim],
-            attr=self._weight_attr,
+            shape=[num_heads * self.head_dim, embed_dim],
+            attr=linear_weight_attr,
             dtype=self._dtype,
             is_bias=False)
         self.linear_bias = self.create_parameter(
             shape=[embed_dim],
-            attr=self._bias_attr,
+            attr=linear_bias_attr,
             dtype=self._dtype,
             is_bias=True)
 
-        self.pre_ln_scale = self.create_parameter(
-            attr=self._weight_attr,
-            shape=[embed_dim],
-            default_initializer=Constant(value=1.0))
-        self.pre_ln_bias = self.create_parameter(
-            attr=self._bias_attr, shape=[embed_dim], is_bias=True)
-        self.ln_scale = self.create_parameter(
-            attr=self._weight_attr,
-            shape=[embed_dim],
-            default_initializer=Constant(value=1.0))
-        self.ln_bias = self.create_parameter(
-            attr=self._bias_attr, shape=[embed_dim], is_bias=True)
+        # tensor model parallel
+        if nranks > 1:
+            assert ring_id != -1
+            # column parallel
+            _set_var_distributed(self.qkv_weight)
+            _set_var_distributed(self.qkv_bias)
+            # row parallel
+            _set_var_distributed(self.linear_weight)
+
+        if normalize_before:
+            self.pre_ln_scale = self.create_parameter(
+                attr=pre_ln_scale_attr,
+                shape=[embed_dim],
+                default_initializer=Constant(value=1.0))
+            self.pre_ln_bias = self.create_parameter(
+                attr=pre_ln_bias_attr, shape=[embed_dim], is_bias=True)
+            self.ln_scale = None
+            self.ln_bias = None
+        else:
+            self.pre_ln_scale = None
+            self.pre_ln_bias = None
+            self.ln_scale = self.create_parameter(
+                attr=ln_scale_attr,
+                shape=[embed_dim],
+                default_initializer=Constant(value=1.0))
+            self.ln_bias = self.create_parameter(
+                attr=ln_bias_attr, shape=[embed_dim], is_bias=True)
 
         self.dropout_rate = dropout_rate
         self.attn_dropout_rate = attn_dropout_rate
@@ -197,8 +247,6 @@ class FusedMultiHeadAttention(Layer):
             # Support bool or int mask
             attn_mask = _convert_attention_mask(attn_mask, query.dtype)
 
-        assert cache == None, "Only support cache is None now."
-
         out = incubate_f.fused_multi_head_attention(
             x=query,
             qkv_weight=self.qkv_weight,
@@ -211,11 +259,13 @@ class FusedMultiHeadAttention(Layer):
             pre_ln_epsilon=self._epsilon,
             qkv_bias=self.qkv_bias,
             linear_bias=self.linear_bias,
+            cache_kv=cache,
             attn_mask=attn_mask,
             dropout_rate=self.dropout_rate,
             attn_dropout_rate=self.attn_dropout_rate,
             ln_epsilon=self._epsilon,
             training=self.training,
+            ring_id=self._ring_id,
             name=self.name)
         return out
 
@@ -241,14 +291,38 @@ class FusedFeedForward(Layer):
             If None, use the value of `dropout_rate`. Default None
         normalize_before (bool, optional): Indicate whether to put layer normalization
             into, preprocessing or postprocessing. Default False
-        weight_attr (ParamAttr, optional): The attribute for the learnable weight of this layer.
-            The default value is None and the weight will be initialized to zero. For detailed
-            information, please refer to paddle.ParamAttr.
-        bias_attr (ParamAttr|bool, optional): The attribute for the learnable bias of thi layer.
-            If it is set to False, no bias will be added to the output. If it is set to None or one
-            kind of ParamAttr, a bias parameter will be created according to ParamAttr. For detailed
-            information, please refer to paddle.ParamAttr. The default value is None and the bias
-            will be initialized to zero.
+        linear1_weight_attr(ParamAttr, optional): To specify the weight parameter property
+            for FFN first linear. Default: None, which means the default weight
+            parameter property is used. See usage for details in :code:`ParamAttr`.
+        linear1_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
+            for FFN first linear. The `False` value means the corresponding layer would
+            not have trainable bias parameter. Default: None, which means the default bias
+            parameter property is used. See usage for details in :code:`ParamAttr`.
+        linear2_weight_attr(ParamAttr, optional): To specify the weight parameter property
+            for FFN second linear. Default: None, which means the default weight
+            parameter property is used. See usage for details in :code:`ParamAttr`.
+        linear2_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
+            for FFN second linear. The `False` value means the corresponding layer would
+            not have trainable bias parameter. Default: None, which means the default bias
+            parameter property is used. See usage for details in :code:`ParamAttr`.
+        ln1_scale_attr(ParamAttr, optional): To specify the weight parameter property
+            for FFN pre_layer_norm. Default: None, which means the default weight
+            parameter property is used. See usage for details in :code:`ParamAttr`.
+        ln1_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
+            for FFN pre_layer_norm. The `False` value means the corresponding layer would
+            not have trainable bias parameter. Default: None, which means the default bias
+            parameter property is used. See usage for details in :code:`ParamAttr`.
+        ln2_scale_attr(ParamAttr, optional): To specify the weight parameter property
+            for FFN post_layer_norm. Default: None, which means the default weight
+            parameter property is used. See usage for details in :code:`ParamAttr`.
+        ln2_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property
+            for FFN layer_norm. The `False` value means the corresponding layer would
+            not have trainable bias parameter. Default: None, which means the default bias
+            parameter property is used. See usage for details in :code:`ParamAttr`.
+        nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using tensor parallel.
+        ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using tensor parallel.
+        name (str, optional): The default value is None.  Normally there is no need for user to set
+            this property. For more information, please refer to :ref:`api_guide_Name`.
 
     Examples:
         .. code-block:: python
@@ -272,62 +346,90 @@ class FusedFeedForward(Layer):
                  activation="relu",
                  act_dropout_rate=None,
                  normalize_before=False,
-                 weight_attr=None,
-                 bias_attr=None,
+                 linear1_weight_attr=None,
+                 linear1_bias_attr=None,
+                 linear2_weight_attr=None,
+                 linear2_bias_attr=None,
+                 ln1_scale_attr=None,
+                 ln1_bias_attr=None,
+                 ln2_scale_attr=None,
+                 ln2_bias_attr=None,
+                 nranks=1,
+                 ring_id=-1,
                  name=None):
 
         super(FusedFeedForward, self).__init__()
         assert d_model > 0, (
-            "Expected d_model to be greater than 0, but recieved {}".format(
+            "Expected d_model to be greater than 0, but received {}".format(
                 d_model))
         assert dim_feedforward > 0, (
-            "Expected dim_feedforward to be greater than 0, but recieved {}".
+            "Expected dim_feedforward to be greater than 0, but received {}".
             format(dim_feedforward))
 
         self._dtype = self._helper.get_default_dtype()
         self._d_model = d_model
+
+        assert dim_feedforward % nranks == 0
+        dim_feedforward = dim_feedforward // nranks
         self._dim_feedforward = dim_feedforward
         self._dropout_rate = dropout_rate
         self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
         self._act_method = activation
         self._normalize_before = normalize_before
         self._epsilon = epsilon
+        self._ring_id = ring_id
 
         self._linear1_weight = self.create_parameter(
             shape=[d_model, dim_feedforward],
-            attr=weight_attr,
+            attr=linear1_weight_attr,
             dtype=self._dtype,
             is_bias=False)
         self._linear1_bias = self.create_parameter(
             shape=[dim_feedforward],
-            attr=bias_attr,
+            attr=linear1_bias_attr,
             dtype=self._dtype,
             is_bias=True)
 
         self._linear2_weight = self.create_parameter(
             shape=[dim_feedforward, d_model],
-            attr=weight_attr,
+            attr=linear2_weight_attr,
             dtype=self._dtype,
             is_bias=False)
 
         self._linear2_bias = self.create_parameter(
-            shape=[d_model], attr=bias_attr, dtype=self._dtype, is_bias=True)
-
-        self._ln1_scale = self.create_parameter(
             shape=[d_model],
-            attr=None,
-            is_bias=False,
-            default_initializer=Constant(1.0))
-        self._ln1_bias = self.create_parameter(
-            shape=[d_model], attr=None, is_bias=True)
+            attr=linear2_bias_attr,
+            dtype=self._dtype,
+            is_bias=True)
+
+        if nranks > 1:
+            assert ring_id != -1
+            # column parallel
+            _set_var_distributed(self._linear1_weight)
+            _set_var_distributed(self._linear1_bias)
+            _set_var_distributed(self._linear2_weight)
+
+        if normalize_before:
+            self._ln1_scale = self.create_parameter(
+                shape=[d_model],
+                attr=ln1_scale_attr,
+                is_bias=False,
+                default_initializer=Constant(1.0))
+            self._ln1_bias = self.create_parameter(
+                shape=[d_model], attr=ln1_bias_attr, is_bias=True)
+            self._ln2_scale = None
+            self._ln2_bias = None
+        else:
+            self._ln1_scale = None
+            self._ln1_bias = None
+            self._ln2_scale = self.create_parameter(
+                shape=[d_model],
+                attr=ln2_scale_attr,
+                is_bias=False,
+                default_initializer=Constant(1.0))
+            self._ln2_bias = self.create_parameter(
+                shape=[d_model], attr=ln2_bias_attr, is_bias=True)
 
-        self._ln2_scale = self.create_parameter(
-            shape=[d_model],
-            attr=None,
-            is_bias=False,
-            default_initializer=Constant(1.0))
-        self._ln2_bias = self.create_parameter(
-            shape=[d_model], attr=None, is_bias=True)
         self.name = name
 
     def forward(self, src, cache=None):
@@ -348,6 +450,7 @@ class FusedFeedForward(Layer):
             ln2_epsilon=self._epsilon,
             pre_layer_norm=self._normalize_before,
             training=self.training,
+            ring_id=self._ring_id,
             name=self.name)
         return out
 
@@ -434,12 +537,12 @@ class FusedTransformerEncoderLayer(Layer):
 
         super(FusedTransformerEncoderLayer, self).__init__()
         assert d_model > 0, ("Expected d_model to be greater than 0, "
-                             "but recieved {}".format(d_model))
+                             "but received {}".format(d_model))
         assert nhead > 0, ("Expected nhead to be greater than 0, "
-                           "but recieved {}".format(nhead))
+                           "but received {}".format(nhead))
         assert dim_feedforward > 0, (
             "Expected dim_feedforward to be greater than 0, "
-            "but recieved {}".format(dim_feedforward))
+            "but received {}".format(dim_feedforward))
         attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate
         act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
         self.normalize_before = normalize_before
@@ -453,8 +556,14 @@ class FusedTransformerEncoderLayer(Layer):
             dropout_rate=dropout_rate,
             attn_dropout_rate=attn_dropout_rate,
             normalize_before=self.normalize_before,
-            weight_attr=weight_attrs[0],
-            bias_attr=bias_attrs[0])
+            qkv_weight_attr=weight_attrs[0],
+            qkv_bias_attr=bias_attrs[0],
+            linear_weight_attr=weight_attrs[0],
+            linear_bias_attr=bias_attrs[0],
+            pre_ln_scale_attr=weight_attrs[0],
+            pre_ln_bias_attr=bias_attrs[0],
+            ln_scale_attr=weight_attrs[0],
+            ln_bias_attr=bias_attrs[0])
 
         self.ffn = FusedFeedForward(
             d_model,
@@ -463,8 +572,10 @@ class FusedTransformerEncoderLayer(Layer):
             activation=activation,
             act_dropout_rate=act_dropout_rate,
             normalize_before=self.normalize_before,
-            weight_attr=weight_attrs[1],
-            bias_attr=bias_attrs[1])
+            linear1_weight_attr=weight_attrs[1],
+            linear1_bias_attr=bias_attrs[1],
+            linear2_weight_attr=weight_attrs[1],
+            linear2_bias_attr=bias_attrs[1])
 
     def forward(self, src, src_mask=None, cache=None):
         """
@@ -808,11 +919,11 @@ class FusedMultiTransformer(Layer):
         super(FusedMultiTransformer, self).__init__()
 
         assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
-                               "but recieved {}".format(embed_dim))
+                               "but received {}".format(embed_dim))
         assert num_heads > 0, ("Expected nhead to be greater than 0, "
-                               "but recieved {}".format(num_heads))
+                               "but received {}".format(num_heads))
         assert dim_feedforward > 0, (
-            "Expected dim_feedforward to be greater than 0, but recieved {}".
+            "Expected dim_feedforward to be greater than 0, but received {}".
             format(dim_feedforward))
 
         self.normalize_before = normalize_before