diff --git a/ppocr/modeling/backbones/multiheadAttention.py b/ppocr/modeling/backbones/multiheadAttention.py
deleted file mode 100755
index 6aba81dec91a6a67172f666dc010c864920b782f..0000000000000000000000000000000000000000
--- a/ppocr/modeling/backbones/multiheadAttention.py
+++ /dev/null
@@ -1,157 +0,0 @@
-import paddle
-from paddle import nn
-import paddle.nn.functional as F
-from paddle.nn import Linear
-from paddle.nn.initializer import XavierUniform as xavier_uniform_
-from paddle.nn.initializer import Constant as constant_
-from paddle.nn.initializer import XavierNormal as xavier_normal_
-
-zeros_ = constant_(value=0.)
-ones_ = constant_(value=1.)
-
-
-class MultiheadAttentionOptim(nn.Layer):
-    r"""Allows the model to jointly attend to information
-    from different representation subspaces.
-    See reference: Attention Is All You Need
-
-    .. math::
-        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
-        \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
-
-    Args:
-        embed_dim: total dimension of the model
-        num_heads: parallel attention layers, or heads
-
-    Examples::
-
-        >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
-        >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
-    """
-
-    def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False):
-        super(MultiheadAttentionOptim, self).__init__()
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.dropout = dropout
-        self.head_dim = embed_dim // num_heads
-        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
-        self.scaling = self.head_dim ** -0.5
-
-        self.out_proj = Linear(embed_dim, embed_dim, bias_attr=bias)
-
-        self._reset_parameters()
-
-        self.conv1 = paddle.nn.Conv2D(in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
-        self.conv2 = paddle.nn.Conv2D(in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
-        self.conv3 = paddle.nn.Conv2D(in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
-
-    def _reset_parameters(self):
-
-
-        xavier_uniform_(self.out_proj.weight)
-
-
-    def forward(self, query, key, value, key_padding_mask=None, incremental_state=None,
-                need_weights=True, static_kv=False, attn_mask=None):
-        """
-        Inputs of forward function
-            query: [target length, batch size, embed dim]
-            key: [sequence length, batch size, embed dim]
-            value: [sequence length, batch size, embed dim]
-            key_padding_mask: if True, mask padding based on batch size
-            incremental_state: if provided, previous time steps are cashed
-            need_weights: output attn_output_weights
-            static_kv: key and value are static
-
-        Outputs of forward function
-            attn_output: [target length, batch size, embed dim]
-            attn_output_weights: [batch size, target length, sequence length]
-        """
-
-
-        tgt_len, bsz, embed_dim = query.shape
-        assert embed_dim == self.embed_dim
-        assert list(query.shape) == [tgt_len, bsz, embed_dim]
-        assert key.shape == value.shape
-
-        q = self._in_proj_q(query)
-        k = self._in_proj_k(key)
-        v = self._in_proj_v(value)
-        q *= self.scaling
-
-
-        q = q.reshape([tgt_len, bsz * self.num_heads, self.head_dim]).transpose([1, 0, 2])
-        k = k.reshape([-1, bsz * self.num_heads, self.head_dim]).transpose([1, 0, 2])
-        v = v.reshape([-1, bsz * self.num_heads, self.head_dim]).transpose([1, 0, 2])
-
-
-        src_len = k.shape[1]
-
-        if key_padding_mask is not None:
-            assert key_padding_mask.shape[0] == bsz
-            assert key_padding_mask.shape[1] == src_len
-
-        
-        attn_output_weights = paddle.bmm(q, k.transpose([0,2,1]))
-        assert list(attn_output_weights.shape) == [bsz * self.num_heads, tgt_len, src_len]
-
-        if attn_mask is not None:
-            attn_mask = attn_mask.unsqueeze(0)
-            attn_output_weights += attn_mask
-        if key_padding_mask is not None:
-            attn_output_weights = attn_output_weights.reshape([bsz, self.num_heads, tgt_len, src_len])
-            key = key_padding_mask.unsqueeze(1).unsqueeze(2).astype('float32')
-            
-            y = paddle.full(shape=key.shape, dtype='float32', fill_value='-inf')
-           
-            y = paddle.where(key==0.,key, y)
-
-            attn_output_weights += y
-            attn_output_weights = attn_output_weights.reshape([bsz*self.num_heads, tgt_len, src_len])
-
-        attn_output_weights = F.softmax(
-            attn_output_weights.astype('float32'), axis=-1,
-            dtype=paddle.float32 if attn_output_weights.dtype == paddle.float16 else attn_output_weights.dtype)
-        attn_output_weights = F.dropout(attn_output_weights, p=self.dropout, training=self.training)
-
-        attn_output = paddle.bmm(attn_output_weights, v)
-        assert list(attn_output.shape) == [bsz * self.num_heads, tgt_len, self.head_dim]
-        attn_output = attn_output.transpose([1, 0,2]).reshape([tgt_len, bsz, embed_dim])
-        attn_output = self.out_proj(attn_output)
-
-        if need_weights:
-            # average attention weights over heads
-            attn_output_weights = attn_output_weights.reshape([bsz, self.num_heads, tgt_len, src_len])
-            attn_output_weights = attn_output_weights.sum(axis=1) / self.num_heads
-        else:
-            attn_output_weights = None
-
-        return attn_output, attn_output_weights
-
-
-    def _in_proj_q(self, query):
-        query = query.transpose([1, 2, 0])
-        query = paddle.unsqueeze(query, axis=2)
-        res = self.conv1(query)
-        res = paddle.squeeze(res, axis=2)
-        res = res.transpose([2, 0, 1])
-        return res
-
-    def _in_proj_k(self, key):
-        
-        key = key.transpose([1, 2, 0])
-        key = paddle.unsqueeze(key, axis=2)
-        res = self.conv2(key)
-        res = paddle.squeeze(res, axis=2)
-        res = res.transpose([2, 0, 1])
-        return res
-
-    def _in_proj_v(self, value):
-        
-        value = value.transpose([1,2,0])#(1, 2, 0)
-        value = paddle.unsqueeze(value, axis=2)
-        res = self.conv3(value)
-        res = paddle.squeeze(res, axis=2)
-        res = res.transpose([2, 0, 1])
-        return res