diff --git a/model_zoo/official/nlp/bert/README.md b/model_zoo/official/nlp/bert/README.md
index 311678f9bcdaec1ee75ecde248750fd634d6570a..922a1f8054a7e4911aef51dd600112abfe79cf9c 100644
--- a/model_zoo/official/nlp/bert/README.md
+++ b/model_zoo/official/nlp/bert/README.md
@@ -161,7 +161,6 @@ For example, the schema file of cn-wiki-128 dataset for pretraining shows as fol
     ├─dataset.py                  # data preprocessing
     ├─finetune_eval_config.py     # parameter configuration for finetuning
     ├─finetune_eval_model.py      # backbone code of network
-    ├─fused_layer_norm.py         # Layernormal is optimized for Ascend
     ├─sample_process.py           # sample processing
     ├─utils.py                    # util function
     ├─pretrain_eval.py            # train and eval net
diff --git a/model_zoo/official/nlp/bert/src/bert_model.py b/model_zoo/official/nlp/bert/src/bert_model.py
index 4199c13b5a12cf561147877476496f1e9ae9e0f1..c305ddc1a2c5786cf2e431caafe2b872a13a57c0 100644
--- a/model_zoo/official/nlp/bert/src/bert_model.py
+++ b/model_zoo/official/nlp/bert/src/bert_model.py
@@ -25,7 +25,6 @@ from mindspore.ops import operations as P
 from mindspore.ops import composite as C
 from mindspore.common.tensor import Tensor
 from mindspore.common.parameter import Parameter
-from .fused_layer_norm import FusedLayerNorm
 
 
 class BertConfig:
@@ -78,8 +77,7 @@ class BertConfig:
                  input_mask_from_dataset=True,
                  token_type_ids_from_dataset=True,
                  dtype=mstype.float32,
-                 compute_type=mstype.float32,
-                 enable_fused_layernorm=False):
+                 compute_type=mstype.float32):
         self.batch_size = batch_size
         self.seq_length = seq_length
         self.vocab_size = vocab_size
@@ -98,7 +96,6 @@ class BertConfig:
         self.use_relative_positions = use_relative_positions
         self.dtype = dtype
         self.compute_type = compute_type
-        self.enable_fused_layernorm = enable_fused_layernorm
 
 
 class EmbeddingLookup(nn.Cell):
@@ -245,19 +242,14 @@ class BertOutput(nn.Cell):
                  out_channels,
                  initializer_range=0.02,
                  dropout_prob=0.1,
-                 compute_type=mstype.float32,
-                 enable_fused_layernorm=False):
+                 compute_type=mstype.float32):
         super(BertOutput, self).__init__()
         self.dense = nn.Dense(in_channels, out_channels,
                               weight_init=TruncatedNormal(initializer_range)).to_float(compute_type)
         self.dropout = nn.Dropout(1 - dropout_prob)
         self.dropout_prob = dropout_prob
         self.add = P.TensorAdd()
-        if compute_type == mstype.float16:
-            self.layernorm = FusedLayerNorm((out_channels,),
-                                            use_batch_norm=enable_fused_layernorm).to_float(compute_type)
-        else:
-            self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
+        self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
         self.cast = P.Cast()
 
     def construct(self, hidden_status, input_tensor):
@@ -615,8 +607,7 @@ class BertSelfAttention(nn.Cell):
                  initializer_range=0.02,
                  hidden_dropout_prob=0.1,
                  use_relative_positions=False,
-                 compute_type=mstype.float32,
-                 enable_fused_layernorm=False):
+                 compute_type=mstype.float32):
         super(BertSelfAttention, self).__init__()
         if hidden_size % num_attention_heads != 0:
             raise ValueError("The hidden size (%d) is not a multiple of the number "
@@ -644,8 +635,7 @@ class BertSelfAttention(nn.Cell):
                                  out_channels=hidden_size,
                                  initializer_range=initializer_range,
                                  dropout_prob=hidden_dropout_prob,
-                                 compute_type=compute_type,
-                                 enable_fused_layernorm=enable_fused_layernorm)
+                                 compute_type=compute_type)
 
         self.reshape = P.Reshape()
         self.shape = (-1, hidden_size)
@@ -687,8 +677,7 @@ class BertEncoderCell(nn.Cell):
                  hidden_dropout_prob=0.1,
                  use_relative_positions=False,
                  hidden_act="gelu",
-                 compute_type=mstype.float32,
-                 enable_fused_layernorm=False):
+                 compute_type=mstype.float32):
         super(BertEncoderCell, self).__init__()
         self.attention = BertSelfAttention(
             batch_size=batch_size,
@@ -700,8 +689,7 @@ class BertEncoderCell(nn.Cell):
             initializer_range=initializer_range,
             hidden_dropout_prob=hidden_dropout_prob,
             use_relative_positions=use_relative_positions,
-            compute_type=compute_type,
-            enable_fused_layernorm=enable_fused_layernorm)
+            compute_type=compute_type)
         self.intermediate = nn.Dense(in_channels=hidden_size,
                                      out_channels=intermediate_size,
                                      activation=hidden_act,
@@ -710,8 +698,7 @@ class BertEncoderCell(nn.Cell):
                                  out_channels=hidden_size,
                                  initializer_range=initializer_range,
                                  dropout_prob=hidden_dropout_prob,
-                                 compute_type=compute_type,
-                                 enable_fused_layernorm=enable_fused_layernorm)
+                                 compute_type=compute_type)
 
     def construct(self, hidden_states, attention_mask):
         # self-attention
@@ -758,8 +745,7 @@ class BertTransformer(nn.Cell):
                  use_relative_positions=False,
                  hidden_act="gelu",
                  compute_type=mstype.float32,
-                 return_all_encoders=False,
-                 enable_fused_layernorm=False):
+                 return_all_encoders=False):
         super(BertTransformer, self).__init__()
         self.return_all_encoders = return_all_encoders
 
@@ -776,8 +762,7 @@ class BertTransformer(nn.Cell):
                                     hidden_dropout_prob=hidden_dropout_prob,
                                     use_relative_positions=use_relative_positions,
                                     hidden_act=hidden_act,
-                                    compute_type=compute_type,
-                                    enable_fused_layernorm=enable_fused_layernorm)
+                                    compute_type=compute_type)
             layers.append(layer)
 
         self.layers = nn.CellList(layers)
@@ -904,8 +889,7 @@ class BertModel(nn.Cell):
             use_relative_positions=config.use_relative_positions,
             hidden_act=config.hidden_act,
             compute_type=config.compute_type,
-            return_all_encoders=True,
-            enable_fused_layernorm=config.enable_fused_layernorm)
+            return_all_encoders=True)
 
         self.cast = P.Cast()
         self.dtype = config.dtype
diff --git a/model_zoo/official/nlp/bert/src/fused_layer_norm.py b/model_zoo/official/nlp/bert/src/fused_layer_norm.py
deleted file mode 100644
index 2736fdbd57791e9d4a07c55ef947b042ca66e7f8..0000000000000000000000000000000000000000
--- a/model_zoo/official/nlp/bert/src/fused_layer_norm.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""fused layernorm"""
-import numpy as np
-from mindspore.ops import operations as P
-from mindspore.ops import functional as F
-from mindspore.common.parameter import Parameter
-from mindspore.common.initializer import initializer
-from mindspore.ops.primitive import constexpr
-import mindspore.common.dtype as mstype
-from mindspore.nn.cell import Cell
-
-
-
-__all__ = ['FusedLayerNorm']
-
-@constexpr
-def get_shape_for_norm(x_shape, begin_norm_axis):
-    print("input_shape: ", x_shape)
-    norm_shape = x_shape[begin_norm_axis:]
-    output_shape = (1, -1, 1, int(np.prod(norm_shape)))
-    print("output_shape: ", output_shape)
-    return output_shape
-
-class FusedLayerNorm(Cell):
-    r"""
-    Applies Layer Normalization over a mini-batch of inputs.
-
-    Layer normalization is widely used in recurrent neural networks. It applies
-    normalization over a mini-batch of inputs for each single training case as described
-    in the paper `Layer Normalization `_. Unlike batch
-    normalization, layer normalization performs exactly the same computation at training and
-    testing times. It can be described using the following formula. It is applied across all channels
-    and pixel but only one batch size.
-
-    .. math::
-        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
-
-    Args:
-        normalized_shape (Union(tuple[int], list[int]): The normalization is performed over axis
-            `begin_norm_axis ... R - 1`.
-        begin_norm_axis (int): It first normalization dimension: normalization will be performed along dimensions
-            `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
-        begin_params_axis (int): The first parameter(beta, gamma)dimension: scale and centering parameters
-            will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
-            the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
-        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
-            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
-            'he_uniform', etc. Default: 'ones'.
-        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
-            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
-            'he_uniform', etc. Default: 'zeros'.
-        use_batch_nrom (bool): Whether use batchnorm to preocess.
-
-    Inputs:
-        - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`,
-          and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`.
-
-    Outputs:
-        Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.
-
-    Examples:
-        >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
-        >>> shape1 = x.shape[1:]
-        >>> m = nn.LayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1)
-        >>> m(x)
-    """
-    def __init__(self,
-                 normalized_shape,
-                 begin_norm_axis=-1,
-                 begin_params_axis=-1,
-                 gamma_init='ones',
-                 beta_init='zeros',
-                 use_batch_norm=False):
-        super(FusedLayerNorm, self).__init__()
-        if not isinstance(normalized_shape, (tuple, list)):
-            raise TypeError("The type of 'normalized_shape' should be tuple[int] or list[int], but '{}' type is {}."
-                            .format(normalized_shape, type(normalized_shape)))
-        self.normalized_shape = normalized_shape
-        self.begin_norm_axis = begin_norm_axis
-        self.begin_params_axis = begin_params_axis
-        self.gamma = Parameter(initializer(
-            gamma_init, normalized_shape), name="gamma")
-        self.beta = Parameter(initializer(
-            beta_init, normalized_shape), name="beta")
-        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis)
-
-        self.batch_norm = P.BatchNorm(is_training=True, epsilon=1e-5)
-        self.use_batch_norm = use_batch_norm
-
-    def construct(self, input_x):
-        """Applies Layer Normalization over a mini-batch of inputs"""
-        if self.use_batch_norm and self.training:
-            ones = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 1.0)
-            zeros = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 0.0)
-            shape_x = F.shape(input_x)
-            norm_shape = get_shape_for_norm(shape_x, self.begin_norm_axis)
-            input_x = F.reshape(input_x, norm_shape)
-            output, _, _, _, _, _ = self.batch_norm(input_x, ones, zeros, None, None)
-            output = F.reshape(output, shape_x)
-            y = output * self.gamma + self.beta
-        else:
-            y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
-        return y
-
-    def extend_repr(self):
-        """Display instance object as string."""
-        s = 'normalized_shape={}, begin_norm_axis={}, begin_params_axis={}, gamma{}, beta={}'.format(
-            self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta)
-        return s
diff --git a/model_zoo/official/nlp/tinybert/README.md b/model_zoo/official/nlp/tinybert/README.md
index b5220711224a747f79130765362727d77fd12558..3309fd773dec9abb6755b49dd8f8631b8332f839 100644
--- a/model_zoo/official/nlp/tinybert/README.md
+++ b/model_zoo/official/nlp/tinybert/README.md
@@ -113,7 +113,6 @@ For example, the dataset is cn-wiki-128, the schema file for general distill pha
     ├─__init__.py
     ├─assessment_method.py        # assessment method for evaluation
     ├─dataset.py                  # data processing
-    ├─fused_layer_norm.py         # Layernormal is optimized for Ascend
     ├─gd_config.py                # parameter configuration for general distill phase
     ├─td_config.py                # parameter configuration for task distill phase
     ├─tinybert_for_gd_td.py       # backbone code of network
@@ -229,7 +228,6 @@ Parameters for bert network:
     token_type_ids_from_dataset     use the token type ids loaded from dataset or not: True | False, default is True
     dtype                           data type of input: mstype.float16 | mstype.float32, default is mstype.float32
     compute_type                    compute type in BertTransformer: mstype.float16 | mstype.float32, default is mstype.float16
-    enable_fused_layernorm          use batchnorm instead of layernorm to improve performance, default is False
 ```
 ## [Training Process](#contents)
 ### Training
diff --git a/model_zoo/official/nlp/tinybert/src/fused_layer_norm.py b/model_zoo/official/nlp/tinybert/src/fused_layer_norm.py
deleted file mode 100644
index 7b9fb56227c387bb7fded27967246faa6f070f3e..0000000000000000000000000000000000000000
--- a/model_zoo/official/nlp/tinybert/src/fused_layer_norm.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""fused layernorm"""
-import numpy as np
-from mindspore.ops import operations as P
-from mindspore.ops import functional as F
-from mindspore.common.parameter import Parameter
-from mindspore.common.initializer import initializer
-from mindspore.ops.primitive import constexpr
-import mindspore.common.dtype as mstype
-from mindspore.nn.cell import Cell
-
-
-
-__all__ = ['FusedLayerNorm']
-
-@constexpr
-def get_shape_for_norm(x_shape, begin_norm_axis):
-    print("input_shape: ", x_shape)
-    norm_shape = x_shape[begin_norm_axis:]
-    output_shape = (1, -1, 1, int(np.prod(norm_shape)))
-    print("output_shape: ", output_shape)
-    return output_shape
-
-class FusedLayerNorm(Cell):
-    r"""
-    Applies Layer Normalization over a mini-batch of inputs.
-
-    Layer normalization is widely used in recurrent neural networks. It applies
-    normalization over a mini-batch of inputs for each single training case as described
-    in the paper `Layer Normalization `_. Unlike batch
-    normalization, layer normalization performs exactly the same computation at training and
-    testing times. It can be described using the following formula. It is applied across all channels
-    and pixel but only one batch size.
-
-    .. math::
-        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
-
-    Args:
-        normalized_shape (Union(tuple[int], list[int]): The normalization is performed over axis
-            `begin_norm_axis ... R - 1`.
-        begin_norm_axis (int): It first normalization dimension: normalization will be performed along dimensions
-            `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
-        begin_params_axis (int): The first parameter(beta, gamma)dimension: scale and centering parameters
-            will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
-            the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
-        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
-            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
-            'he_uniform', etc. Default: 'ones'.
-        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
-            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
-            'he_uniform', etc. Default: 'zeros'.
-        use_batch_nrom (bool): Whether use batchnorm to preocess.
-
-    Inputs:
-        - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`,
-          and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`.
-
-    Outputs:
-        Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.
-
-    Examples:
-        >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
-        >>> shape1 = x.shape[1:]
-        >>> m = nn.LayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1)
-        >>> m(x)
-    """
-    def __init__(self,
-                 normalized_shape,
-                 begin_norm_axis=-1,
-                 begin_params_axis=-1,
-                 gamma_init='ones',
-                 beta_init='zeros',
-                 use_batch_norm=False):
-        super(FusedLayerNorm, self).__init__()
-        if not isinstance(normalized_shape, (tuple, list)):
-            raise TypeError("The type of 'normalized_shape' should be tuple[int] or list[int], but '{}' type is {}."
-                            .format(normalized_shape, type(normalized_shape)))
-        self.normalized_shape = normalized_shape
-        self.begin_norm_axis = begin_norm_axis
-        self.begin_params_axis = begin_params_axis
-        self.gamma = Parameter(initializer(
-            gamma_init, normalized_shape), name="gamma")
-        self.beta = Parameter(initializer(
-            beta_init, normalized_shape), name="beta")
-        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis)
-
-        self.batch_norm = P.BatchNorm(is_training=True, epsilon=1e-5)
-        self.use_batch_norm = use_batch_norm
-
-    def construct(self, input_x):
-        """fusedlayernorm"""
-        if self.use_batch_norm and self.training:
-            ones = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 1.0)
-            zeros = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 0.0)
-            shape_x = F.shape(input_x)
-            norm_shape = get_shape_for_norm(shape_x, self.begin_norm_axis)
-            input_x = F.reshape(input_x, norm_shape)
-            output, _, _, _, _, _ = self.batch_norm(input_x, ones, zeros, None, None)
-            output = F.reshape(output, shape_x)
-            y = output * self.gamma + self.beta
-        else:
-            y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
-        return y
-
-    def extend_repr(self):
-        """Display instance object as string."""
-        s = 'normalized_shape={}, begin_norm_axis={}, begin_params_axis={}, gamma{}, beta={}'.format(
-            self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta)
-        return s
diff --git a/model_zoo/official/nlp/tinybert/src/gd_config.py b/model_zoo/official/nlp/tinybert/src/gd_config.py
index d2dc09d8fa08b520fcd86f55f5cf7d2a4b2e1e09..57a35af69bbb4d7a834b76ef1c68b469c98ea8dd 100644
--- a/model_zoo/official/nlp/tinybert/src/gd_config.py
+++ b/model_zoo/official/nlp/tinybert/src/gd_config.py
@@ -55,8 +55,7 @@ bert_teacher_net_cfg = BertConfig(
     input_mask_from_dataset=True,
     token_type_ids_from_dataset=True,
     dtype=mstype.float32,
-    compute_type=mstype.float16,
-    enable_fused_layernorm=False
+    compute_type=mstype.float16
 )
 bert_student_net_cfg = BertConfig(
     batch_size=32,
@@ -76,6 +75,5 @@ bert_student_net_cfg = BertConfig(
     input_mask_from_dataset=True,
     token_type_ids_from_dataset=True,
     dtype=mstype.float32,
-    compute_type=mstype.float16,
-    enable_fused_layernorm=False
+    compute_type=mstype.float16
 )
diff --git a/model_zoo/official/nlp/tinybert/src/td_config.py b/model_zoo/official/nlp/tinybert/src/td_config.py
index 2a9046587e9128b993ee0f51b0ee80fc7a5422b3..54234b543e3f9f0c1901e213236d23af316a7253 100644
--- a/model_zoo/official/nlp/tinybert/src/td_config.py
+++ b/model_zoo/official/nlp/tinybert/src/td_config.py
@@ -74,8 +74,7 @@ td_teacher_net_cfg = BertConfig(
     input_mask_from_dataset=True,
     token_type_ids_from_dataset=True,
     dtype=mstype.float32,
-    compute_type=mstype.float16,
-    enable_fused_layernorm=False
+    compute_type=mstype.float16
 )
 td_student_net_cfg = BertConfig(
     batch_size=32,
@@ -95,6 +94,5 @@ td_student_net_cfg = BertConfig(
     input_mask_from_dataset=True,
     token_type_ids_from_dataset=True,
     dtype=mstype.float32,
-    compute_type=mstype.float16,
-    enable_fused_layernorm=False
+    compute_type=mstype.float16
 )
diff --git a/model_zoo/official/nlp/tinybert/src/tinybert_model.py b/model_zoo/official/nlp/tinybert/src/tinybert_model.py
index b2499e8c53dfb5731c27f2bcb89bc9031c324d12..c45208522036b03b922de85e2d34ff42e4647dfa 100644
--- a/model_zoo/official/nlp/tinybert/src/tinybert_model.py
+++ b/model_zoo/official/nlp/tinybert/src/tinybert_model.py
@@ -25,7 +25,6 @@ from mindspore.ops import composite as C
 from mindspore.common.tensor import Tensor
 from mindspore.common.parameter import Parameter
 from mindspore import context
-from .fused_layer_norm import FusedLayerNorm
 
 
 class BertConfig:
@@ -78,8 +77,7 @@ class BertConfig:
                  input_mask_from_dataset=True,
                  token_type_ids_from_dataset=True,
                  dtype=mstype.float32,
-                 compute_type=mstype.float32,
-                 enable_fused_layernorm=False):
+                 compute_type=mstype.float32):
         self.batch_size = batch_size
         self.seq_length = seq_length
         self.vocab_size = vocab_size
@@ -98,7 +96,6 @@ class BertConfig:
         self.use_relative_positions = use_relative_positions
         self.dtype = dtype
         self.compute_type = compute_type
-        self.enable_fused_layernorm = enable_fused_layernorm
 
 
 class EmbeddingLookup(nn.Cell):
@@ -244,8 +241,7 @@ class BertOutput(nn.Cell):
                  out_channels,
                  initializer_range=0.02,
                  dropout_prob=0.1,
-                 compute_type=mstype.float32,
-                 enable_fused_layernorm=False):
+                 compute_type=mstype.float32):
         super(BertOutput, self).__init__()
         self.dense = nn.Dense(in_channels, out_channels,
                               weight_init=TruncatedNormal(initializer_range)).to_float(compute_type)
@@ -256,11 +252,7 @@ class BertOutput(nn.Cell):
             self.layernorm = nn.LayerNorm((out_channels,)).to_float(mstype.float32)
             self.compute_type = compute_type
         else:
-            if compute_type == mstype.float16:
-                self.layernorm = FusedLayerNorm((out_channels,),
-                                                use_batch_norm=enable_fused_layernorm).to_float(compute_type)
-            else:
-                self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
+            self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
 
         self.cast = P.Cast()
 
@@ -602,8 +594,7 @@ class BertSelfAttention(nn.Cell):
                  initializer_range=0.02,
                  hidden_dropout_prob=0.1,
                  use_relative_positions=False,
-                 compute_type=mstype.float32,
-                 enable_fused_layernorm=False):
+                 compute_type=mstype.float32):
         super(BertSelfAttention, self).__init__()
         if hidden_size % num_attention_heads != 0:
             raise ValueError("The hidden size (%d) is not a multiple of the number "
@@ -628,8 +619,7 @@ class BertSelfAttention(nn.Cell):
                                  out_channels=hidden_size,
                                  initializer_range=initializer_range,
                                  dropout_prob=hidden_dropout_prob,
-                                 compute_type=compute_type,
-                                 enable_fused_layernorm=enable_fused_layernorm)
+                                 compute_type=compute_type)
 
         self.reshape = P.Reshape()
         self.shape = (-1, hidden_size)
@@ -672,8 +662,7 @@ class BertEncoderCell(nn.Cell):
                  hidden_dropout_prob=0.1,
                  use_relative_positions=False,
                  hidden_act="gelu",
-                 compute_type=mstype.float32,
-                 enable_fused_layernorm=False):
+                 compute_type=mstype.float32):
         super(BertEncoderCell, self).__init__()
         self.attention = BertSelfAttention(
             batch_size=batch_size,
@@ -685,8 +674,7 @@ class BertEncoderCell(nn.Cell):
             initializer_range=initializer_range,
             hidden_dropout_prob=hidden_dropout_prob,
             use_relative_positions=use_relative_positions,
-            compute_type=compute_type,
-            enable_fused_layernorm=enable_fused_layernorm)
+            compute_type=compute_type)
         self.intermediate = nn.Dense(in_channels=hidden_size,
                                      out_channels=intermediate_size,
                                      activation=hidden_act,
@@ -695,8 +683,7 @@ class BertEncoderCell(nn.Cell):
                                  out_channels=hidden_size,
                                  initializer_range=initializer_range,
                                  dropout_prob=hidden_dropout_prob,
-                                 compute_type=compute_type,
-                                 enable_fused_layernorm=enable_fused_layernorm)
+                                 compute_type=compute_type)
 
     def construct(self, hidden_states, attention_mask):
         """bert encoder cell"""
@@ -743,8 +730,7 @@ class BertTransformer(nn.Cell):
                  use_relative_positions=False,
                  hidden_act="gelu",
                  compute_type=mstype.float32,
-                 return_all_encoders=False,
-                 enable_fused_layernorm=False):
+                 return_all_encoders=False):
         super(BertTransformer, self).__init__()
         self.return_all_encoders = return_all_encoders
         layers = []
@@ -760,8 +746,7 @@ class BertTransformer(nn.Cell):
                                     hidden_dropout_prob=hidden_dropout_prob,
                                     use_relative_positions=use_relative_positions,
                                     hidden_act=hidden_act,
-                                    compute_type=compute_type,
-                                    enable_fused_layernorm=enable_fused_layernorm)
+                                    compute_type=compute_type)
             layers.append(layer)
         self.layers = nn.CellList(layers)
         self.reshape = P.Reshape()
@@ -877,8 +862,7 @@ class BertModel(nn.Cell):
             use_relative_positions=config.use_relative_positions,
             hidden_act=config.hidden_act,
             compute_type=config.compute_type,
-            return_all_encoders=True,
-            enable_fused_layernorm=config.enable_fused_layernorm)
+            return_all_encoders=True)
         self.cast = P.Cast()
         self.dtype = config.dtype
         self.cast_compute_type = SaturateCast(dst_type=config.compute_type)
@@ -981,8 +965,7 @@ class TinyBertModel(nn.Cell):
             use_relative_positions=config.use_relative_positions,
             hidden_act=config.hidden_act,
             compute_type=config.compute_type,
-            return_all_encoders=True,
-            enable_fused_layernorm=config.enable_fused_layernorm)
+            return_all_encoders=True)
         self.cast = P.Cast()
         self.dtype = config.dtype
         self.cast_compute_type = SaturateCast(dst_type=config.compute_type)
diff --git a/tests/st/networks/models/bert/bert_performance/test_bert_tdt_lossscale.py b/tests/st/networks/models/bert/bert_performance/test_bert_tdt_lossscale.py
index 2b84daed517702f58b8c0c4c393978fdc88fcac7..f44fd64074b599dbecc7913dbc9b5fbe8abb25be 100644
--- a/tests/st/networks/models/bert/bert_performance/test_bert_tdt_lossscale.py
+++ b/tests/st/networks/models/bert/bert_performance/test_bert_tdt_lossscale.py
@@ -82,8 +82,7 @@ def get_config(version='base', batch_size=1):
             input_mask_from_dataset=True,
             token_type_ids_from_dataset=True,
             dtype=mstype.float32,
-            compute_type=mstype.float16,
-            enable_fused_layernorm=False)
+            compute_type=mstype.float16)
     else:
         bert_config = BertConfig(batch_size=batch_size)
     return bert_config
diff --git a/tests/st/networks/models/bert/bert_precision/test_bert_tdt_lossscale.py b/tests/st/networks/models/bert/bert_precision/test_bert_tdt_lossscale.py
index 8511b5b3a5e9eea0d5323a6493fdae10292a3ad8..3501b9a5341a54b1967576be8bdd49b17032e5a8 100644
--- a/tests/st/networks/models/bert/bert_precision/test_bert_tdt_lossscale.py
+++ b/tests/st/networks/models/bert/bert_precision/test_bert_tdt_lossscale.py
@@ -82,8 +82,7 @@ def get_config(version='base', batch_size=1):
            input_mask_from_dataset=True,
            token_type_ids_from_dataset=True,
            dtype=mstype.float32,
-            compute_type=mstype.float16,
-            enable_fused_layernorm=False)
+            compute_type=mstype.float16)
    else:
        bert_config = BertConfig(batch_size=batch_size)
    return bert_config
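With this patch, `BertOutput` builds its normalization layer the same way for float16 and float32 compute types, via `nn.LayerNorm((out_channels,)).to_float(compute_type)`, instead of switching to `FusedLayerNorm` for float16. A minimal standalone sketch of that call pattern is below; the hidden size, batch shape, and dtype are illustrative values, not taken from the diff:

```python
import numpy as np
import mindspore.common.dtype as mstype
import mindspore.nn as nn
from mindspore import Tensor

hidden_size = 768                 # illustrative value
compute_type = mstype.float16     # the case previously routed to FusedLayerNorm

# Same construction now used unconditionally in BertOutput.__init__
layernorm = nn.LayerNorm((hidden_size,)).to_float(compute_type)

x = Tensor(np.ones((2, hidden_size)), compute_type)
print(layernorm(x).shape)         # (2, 768)
```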