diff --git a/configs/rec/rec_mtb_nrtr.yml b/configs/rec/rec_mtb_nrtr.yml index 635c392d705acd1fcfbf9f744a8d7167c448d74c..8639a28a931247ee34f2e3842407fd1d2e065950 100644 --- a/configs/rec/rec_mtb_nrtr.yml +++ b/configs/rec/rec_mtb_nrtr.yml @@ -46,7 +46,7 @@ Architecture: name: Transformer d_model: 512 num_encoder_layers: 6 - beam_size: 10 # When Beam size is greater than 0, it means to use beam search when evaluation. + beam_size: -1 # Beam search is used during evaluation when beam_size is greater than 0; -1 disables it. Loss: @@ -65,7 +65,7 @@ Train: name: LMDBDataSet data_dir: ./train_data/data_lmdb_release/training/ transforms: - - NRTRDecodeImage: # load image + - DecodeImage: # load image img_mode: BGR channel_first: False - NRTRLabelEncode: # Class handling label @@ -85,7 +85,7 @@ Eval: name: LMDBDataSet data_dir: ./train_data/data_lmdb_release/evaluation/ transforms: - - NRTRDecodeImage: # load image + - DecodeImage: # load image img_mode: BGR channel_first: False - NRTRLabelEncode: # Class handling label diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py index 643ec70503fb0015e9f7a448d3a6cf9f99171493..d7b47a8ac8beac684192cd8245e519fd1f600e6b 100644 --- a/ppocr/data/imaug/label_ops.py +++ b/ppocr/data/imaug/label_ops.py @@ -174,21 +174,26 @@ class NRTRLabelEncode(BaseRecLabelEncode): super(NRTRLabelEncode, self).__init__(max_text_length, character_dict_path, character_type, use_space_char) + def __call__(self, data): text = data['label'] text = self.encode(text) if text is None: return None + if len(text) >= self.max_text_len - 1: # leave room for the <s> and </s> tokens + return None data['length'] = np.array(len(text)) text.insert(0, 2) text.append(3) text = text + [0] * (self.max_text_len - len(text)) data['label'] = np.array(text) return data + def add_special_char(self, dict_character): - dict_character = ['blank','<unk>','<s>','</s>'] + dict_character + dict_character = ['blank', '<unk>', '<s>', '</s>'] + dict_character return dict_character + class CTCLabelEncode(BaseRecLabelEncode): """ Convert between text-label and text-index """ @@ -588,7 +593,7 @@ class SARLabelEncode(BaseRecLabelEncode): data['length'] = np.array(len(text)) target = [self.start_idx] + text + [self.end_idx] padded_text = [self.padding_idx for _ in range(self.max_text_len)] - + padded_text[:len(target)] = target data['label'] = np.array(padded_text) return data diff --git a/ppocr/data/imaug/rec_img_aug.py b/ppocr/data/imaug/rec_img_aug.py index 51f5855ac36c40f1808fdec4dc00c540792b15e7..2c6302386579df7024f6f8570870967b2483f283 100644 --- a/ppocr/data/imaug/rec_img_aug.py +++ b/ppocr/data/imaug/rec_img_aug.py @@ -44,12 +44,33 @@ class ClsResizeImg(object): class NRTRRecResizeImg(object): - def __init__(self, image_shape, resize_type, **kwargs): + def __init__(self, image_shape, resize_type, padding=False, **kwargs): self.image_shape = image_shape self.resize_type = resize_type + self.padding = padding def __call__(self, data): img = data['image'] + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + image_shape = self.image_shape + if self.padding: + imgC, imgH, imgW = image_shape + # TODO: change the padding value to 0 and modify the image shape + h = img.shape[0] + w = img.shape[1] + ratio = w / float(h) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = cv2.resize(img, (resized_w, imgH)) + norm_img = np.expand_dims(resized_image, -1) + norm_img = norm_img.transpose((2, 0, 1)) + resized_image = norm_img.astype(np.float32) / 128. - 1. 
+ padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + data['image'] = padding_im + return data if self.resize_type == 'PIL': image_pil = Image.fromarray(np.uint8(img)) img = image_pil.resize(self.image_shape, Image.ANTIALIAS) @@ -109,7 +130,8 @@ class SARRecResizeImg(object): def __call__(self, data): img = data['image'] - norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar(img, self.image_shape, self.width_downsample_ratio) + norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar( + img, self.image_shape, self.width_downsample_ratio) data['image'] = norm_img data['resized_shape'] = resize_shape data['pad_shape'] = pad_shape diff --git a/ppocr/data/simple_dataset.py b/ppocr/data/simple_dataset.py index e9c3394cbe930d5169ae005e7582a2902e697b7e..6a33e1342506f26ccaa4a146f3f02fadfbd741a2 100644 --- a/ppocr/data/simple_dataset.py +++ b/ppocr/data/simple_dataset.py @@ -15,7 +15,6 @@ import numpy as np import os import random from paddle.io import Dataset - from .imaug import transform, create_operators diff --git a/ppocr/modeling/backbones/rec_nrtr_mtb.py b/ppocr/modeling/backbones/rec_nrtr_mtb.py index 04b5c9bb5fdff448fbf7ad366bc39bf0e3ebfe6b..22e02a6371c3ff8b28fd88b5cfa1087309d551f8 100644 --- a/ppocr/modeling/backbones/rec_nrtr_mtb.py +++ b/ppocr/modeling/backbones/rec_nrtr_mtb.py @@ -13,6 +13,7 @@ # limitations under the License. from paddle import nn +import paddle class MTB(nn.Layer): @@ -40,7 +41,8 @@ class MTB(nn.Layer): x = self.block(images) if self.cnn_num == 2: # (b, w, h, c) - x = x.transpose([0, 3, 2, 1]) - x_shape = x.shape - x = x.reshape([x_shape[0], x_shape[1], x_shape[2] * x_shape[3]]) + x = paddle.transpose(x, [0, 3, 2, 1]) + x_shape = paddle.shape(x) + x = paddle.reshape( + x, [x_shape[0], x_shape[1], x_shape[2] * x_shape[3]]) return x diff --git a/ppocr/modeling/heads/multiheadAttention.py b/ppocr/modeling/heads/multiheadAttention.py index 651d4f577d2f5d1c11e36f90d1c7fea5fc3ab86e..900865ba1a8d80a108b3247ce1aff91c242860f2 100755 --- a/ppocr/modeling/heads/multiheadAttention.py +++ b/ppocr/modeling/heads/multiheadAttention.py @@ -71,8 +71,6 @@ class MultiheadAttention(nn.Layer): value, key_padding_mask=None, incremental_state=None, - need_weights=True, - static_kv=False, attn_mask=None): """ Inputs of forward function @@ -88,46 +86,42 @@ class MultiheadAttention(nn.Layer): attn_output: [target length, batch size, embed dim] attn_output_weights: [batch size, target length, sequence length] """ - tgt_len, bsz, embed_dim = query.shape - assert embed_dim == self.embed_dim - assert list(query.shape) == [tgt_len, bsz, embed_dim] - assert key.shape == value.shape - + q_shape = paddle.shape(query) + src_shape = paddle.shape(key) q = self._in_proj_q(query) k = self._in_proj_k(key) v = self._in_proj_v(value) q *= self.scaling - - q = q.reshape([tgt_len, bsz * self.num_heads, self.head_dim]).transpose( - [1, 0, 2]) - k = k.reshape([-1, bsz * self.num_heads, self.head_dim]).transpose( - [1, 0, 2]) - v = v.reshape([-1, bsz * self.num_heads, self.head_dim]).transpose( - [1, 0, 2]) - - src_len = k.shape[1] - + q = paddle.transpose( + paddle.reshape( + q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]), + [1, 2, 0, 3]) + k = paddle.transpose( + paddle.reshape( + k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]), + [1, 2, 0, 3]) + v = paddle.transpose( + paddle.reshape( + v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]), + [1, 2, 0, 3]) if key_padding_mask is not None: - 
assert key_padding_mask.shape[0] == bsz - assert key_padding_mask.shape[1] == src_len - - attn_output_weights = paddle.bmm(q, k.transpose([0, 2, 1])) - assert list(attn_output_weights. - shape) == [bsz * self.num_heads, tgt_len, src_len] - + assert key_padding_mask.shape[0] == q_shape[1] + assert key_padding_mask.shape[1] == src_shape[0] + attn_output_weights = paddle.matmul(q, + paddle.transpose(k, [0, 1, 3, 2])) if attn_mask is not None: - attn_mask = attn_mask.unsqueeze(0) + attn_mask = paddle.unsqueeze(paddle.unsqueeze(attn_mask, 0), 0) attn_output_weights += attn_mask if key_padding_mask is not None: - attn_output_weights = attn_output_weights.reshape( - [bsz, self.num_heads, tgt_len, src_len]) - key = key_padding_mask.unsqueeze(1).unsqueeze(2).astype('float32') - y = paddle.full(shape=key.shape, dtype='float32', fill_value='-inf') + attn_output_weights = paddle.reshape( + attn_output_weights, + [q_shape[1], self.num_heads, q_shape[0], src_shape[0]]) + key = paddle.unsqueeze(paddle.unsqueeze(key_padding_mask, 1), 2) + key = paddle.cast(key, 'float32') + y = paddle.full( + shape=paddle.shape(key), dtype='float32', fill_value='-inf') y = paddle.where(key == 0., key, y) attn_output_weights += y - attn_output_weights = attn_output_weights.reshape( - [bsz * self.num_heads, tgt_len, src_len]) - attn_output_weights = F.softmax( attn_output_weights.astype('float32'), axis=-1, @@ -136,43 +130,34 @@ class MultiheadAttention(nn.Layer): attn_output_weights = F.dropout( attn_output_weights, p=self.dropout, training=self.training) - attn_output = paddle.bmm(attn_output_weights, v) - assert list(attn_output. - shape) == [bsz * self.num_heads, tgt_len, self.head_dim] - attn_output = attn_output.transpose([1, 0, 2]).reshape( - [tgt_len, bsz, embed_dim]) + attn_output = paddle.matmul(attn_output_weights, v) + attn_output = paddle.reshape( + paddle.transpose(attn_output, [2, 0, 1, 3]), + [q_shape[0], q_shape[1], self.embed_dim]) attn_output = self.out_proj(attn_output) - if need_weights: - # average attention weights over heads - attn_output_weights = attn_output_weights.reshape( - [bsz, self.num_heads, tgt_len, src_len]) - attn_output_weights = attn_output_weights.sum( - axis=1) / self.num_heads - else: - attn_output_weights = None - return attn_output, attn_output_weights + return attn_output def _in_proj_q(self, query): - query = query.transpose([1, 2, 0]) + query = paddle.transpose(query, [1, 2, 0]) query = paddle.unsqueeze(query, axis=2) res = self.conv1(query) res = paddle.squeeze(res, axis=2) - res = res.transpose([2, 0, 1]) + res = paddle.transpose(res, [2, 0, 1]) return res def _in_proj_k(self, key): - key = key.transpose([1, 2, 0]) + key = paddle.transpose(key, [1, 2, 0]) key = paddle.unsqueeze(key, axis=2) res = self.conv2(key) res = paddle.squeeze(res, axis=2) - res = res.transpose([2, 0, 1]) + res = paddle.transpose(res, [2, 0, 1]) return res def _in_proj_v(self, value): - value = value.transpose([1, 2, 0]) #(1, 2, 0) + value = paddle.transpose(value, [1, 2, 0]) #(1, 2, 0) value = paddle.unsqueeze(value, axis=2) res = self.conv3(value) res = paddle.squeeze(res, axis=2) - res = res.transpose([2, 0, 1]) + res = paddle.transpose(res, [2, 0, 1]) return res diff --git a/ppocr/modeling/heads/rec_nrtr_head.py b/ppocr/modeling/heads/rec_nrtr_head.py index 05dba677b4109897b6a20888151e680e652d6741..38ba0c917840ea7d1e2a3c2bf0da32c2c35f2b40 100644 --- a/ppocr/modeling/heads/rec_nrtr_head.py +++ b/ppocr/modeling/heads/rec_nrtr_head.py @@ -61,12 +61,12 @@ class Transformer(nn.Layer): custom_decoder=None, 
in_channels=0, out_channels=0, - dst_vocab_size=99, scale_embedding=True): super(Transformer, self).__init__() + self.out_channels = out_channels + 1 self.embedding = Embeddings( d_model=d_model, - vocab=dst_vocab_size, + vocab=self.out_channels, padding_idx=0, scale_embedding=scale_embedding) self.positional_encoding = PositionalEncoding( @@ -96,9 +96,10 @@ class Transformer(nn.Layer): self.beam_size = beam_size self.d_model = d_model self.nhead = nhead - self.tgt_word_prj = nn.Linear(d_model, dst_vocab_size, bias_attr=False) + self.tgt_word_prj = nn.Linear( + d_model, self.out_channels, bias_attr=False) w0 = np.random.normal(0.0, d_model**-0.5, - (d_model, dst_vocab_size)).astype(np.float32) + (d_model, self.out_channels)).astype(np.float32) self.tgt_word_prj.weight.set_value(w0) self.apply(self._init_weights) @@ -156,46 +157,41 @@ class Transformer(nn.Layer): return self.forward_test(src) def forward_test(self, src): - bs = src.shape[0] + bs = paddle.shape(src)[0] if self.encoder is not None: - src = self.positional_encoding(src.transpose([1, 0, 2])) + src = self.positional_encoding(paddle.transpose(src, [1, 0, 2])) memory = self.encoder(src) else: - memory = src.squeeze(2).transpose([2, 0, 1]) + memory = paddle.transpose(paddle.squeeze(src, 2), [2, 0, 1]) dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64) + dec_prob = paddle.full((bs, 1), 1., dtype=paddle.float32) for len_dec_seq in range(1, 25): - src_enc = memory.clone() - tgt_key_padding_mask = self.generate_padding_mask(dec_seq) - dec_seq_embed = self.embedding(dec_seq).transpose([1, 0, 2]) + dec_seq_embed = paddle.transpose(self.embedding(dec_seq), [1, 0, 2]) dec_seq_embed = self.positional_encoding(dec_seq_embed) - tgt_mask = self.generate_square_subsequent_mask(dec_seq_embed.shape[ - 0]) + tgt_mask = self.generate_square_subsequent_mask( + paddle.shape(dec_seq_embed)[0]) output = self.decoder( dec_seq_embed, - src_enc, + memory, tgt_mask=tgt_mask, memory_mask=None, - tgt_key_padding_mask=tgt_key_padding_mask, + tgt_key_padding_mask=None, memory_key_padding_mask=None) - dec_output = output.transpose([1, 0, 2]) - - dec_output = dec_output[:, - -1, :] # Pick the last step: (bh * bm) * d_h - word_prob = F.log_softmax(self.tgt_word_prj(dec_output), axis=1) - word_prob = word_prob.reshape([1, bs, -1]) - preds_idx = word_prob.argmax(axis=2) - + dec_output = paddle.transpose(output, [1, 0, 2]) + dec_output = dec_output[:, -1, :] + word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1) + preds_idx = paddle.argmax(word_prob, axis=1) if paddle.equal_all( - preds_idx[-1], + preds_idx, paddle.full( - preds_idx[-1].shape, 3, dtype='int64')): + paddle.shape(preds_idx), 3, dtype='int64')): break - - preds_prob = word_prob.max(axis=2) + preds_prob = paddle.max(word_prob, axis=1) dec_seq = paddle.concat( - [dec_seq, preds_idx.reshape([-1, 1])], axis=1) - - return dec_seq + [dec_seq, paddle.reshape(preds_idx, [-1, 1])], axis=1) + dec_prob = paddle.concat( + [dec_prob, paddle.reshape(preds_prob, [-1, 1])], axis=1) + return [dec_seq, dec_prob] def forward_beam(self, images): ''' Translation work in one batch ''' @@ -211,14 +207,15 @@ class Transformer(nn.Layer): n_prev_active_inst, n_bm): ''' Collect tensor parts associated to active instances. 
''' - _, *d_hs = beamed_tensor.shape + beamed_tensor_shape = paddle.shape(beamed_tensor) n_curr_active_inst = len(curr_active_inst_idx) - new_shape = (n_curr_active_inst * n_bm, *d_hs) + new_shape = (n_curr_active_inst * n_bm, beamed_tensor_shape[1], + beamed_tensor_shape[2]) beamed_tensor = beamed_tensor.reshape([n_prev_active_inst, -1]) beamed_tensor = beamed_tensor.index_select( - paddle.to_tensor(curr_active_inst_idx), axis=0) - beamed_tensor = beamed_tensor.reshape([*new_shape]) + curr_active_inst_idx, axis=0) + beamed_tensor = beamed_tensor.reshape(new_shape) return beamed_tensor @@ -249,44 +246,26 @@ class Transformer(nn.Layer): b.get_current_state() for b in inst_dec_beams if not b.done ] dec_partial_seq = paddle.stack(dec_partial_seq) - dec_partial_seq = dec_partial_seq.reshape([-1, len_dec_seq]) return dec_partial_seq - def prepare_beam_memory_key_padding_mask( - inst_dec_beams, memory_key_padding_mask, n_bm): - keep = [] - for idx in (memory_key_padding_mask): - if not inst_dec_beams[idx].done: - keep.append(idx) - memory_key_padding_mask = memory_key_padding_mask[ - paddle.to_tensor(keep)] - len_s = memory_key_padding_mask.shape[-1] - n_inst = memory_key_padding_mask.shape[0] - memory_key_padding_mask = paddle.concat( - [memory_key_padding_mask for i in range(n_bm)], axis=1) - memory_key_padding_mask = memory_key_padding_mask.reshape( - [n_inst * n_bm, len_s]) #repeat(1, n_bm) - return memory_key_padding_mask - def predict_word(dec_seq, enc_output, n_active_inst, n_bm, memory_key_padding_mask): - tgt_key_padding_mask = self.generate_padding_mask(dec_seq) - dec_seq = self.embedding(dec_seq).transpose([1, 0, 2]) + dec_seq = paddle.transpose(self.embedding(dec_seq), [1, 0, 2]) dec_seq = self.positional_encoding(dec_seq) - tgt_mask = self.generate_square_subsequent_mask(dec_seq.shape[ - 0]) + tgt_mask = self.generate_square_subsequent_mask( + paddle.shape(dec_seq)[0]) dec_output = self.decoder( dec_seq, enc_output, tgt_mask=tgt_mask, - tgt_key_padding_mask=tgt_key_padding_mask, - memory_key_padding_mask=memory_key_padding_mask, - ).transpose([1, 0, 2]) + tgt_key_padding_mask=None, + memory_key_padding_mask=memory_key_padding_mask, ) + dec_output = paddle.transpose(dec_output, [1, 0, 2]) dec_output = dec_output[:, -1, :] # Pick the last step: (bh * bm) * d_h - word_prob = F.log_softmax(self.tgt_word_prj(dec_output), axis=1) - word_prob = word_prob.reshape([n_active_inst, n_bm, -1]) + word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1) + word_prob = paddle.reshape(word_prob, [n_active_inst, n_bm, -1]) return word_prob def collect_active_inst_idx_list(inst_beams, word_prob, @@ -302,9 +281,8 @@ class Transformer(nn.Layer): n_active_inst = len(inst_idx_to_position_map) dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq) - memory_key_padding_mask = None word_prob = predict_word(dec_seq, enc_output, n_active_inst, n_bm, - memory_key_padding_mask) + None) # Update the beam with predicted word prob information and collect incomplete instances active_inst_idx_list = collect_active_inst_idx_list( inst_dec_beams, word_prob, inst_idx_to_position_map) @@ -324,27 +302,21 @@ class Transformer(nn.Layer): with paddle.no_grad(): #-- Encode - if self.encoder is not None: src = self.positional_encoding(images.transpose([1, 0, 2])) - src_enc = self.encoder(src).transpose([1, 0, 2]) + src_enc = self.encoder(src) else: src_enc = images.squeeze(2).transpose([0, 2, 1]) - #-- Repeat data for beam search n_bm = self.beam_size - n_inst, len_s, d_h = src_enc.shape - src_enc = 
paddle.concat([src_enc for i in range(n_bm)], axis=1) - src_enc = src_enc.reshape([n_inst * n_bm, len_s, d_h]).transpose( - [1, 0, 2]) - #-- Prepare beams - inst_dec_beams = [Beam(n_bm) for _ in range(n_inst)] - - #-- Bookkeeping for active or not - active_inst_idx_list = list(range(n_inst)) + src_shape = paddle.shape(src_enc) + inst_dec_beams = [Beam(n_bm) for _ in range(1)] + active_inst_idx_list = list(range(1)) + # Repeat data for beam search + src_enc = paddle.tile(src_enc, [1, n_bm, 1]) inst_idx_to_position_map = get_inst_idx_to_tensor_position_map( active_inst_idx_list) - #-- Decode + # Decode for len_dec_seq in range(1, 25): src_enc_copy = src_enc.clone() active_inst_idx_list = beam_decode_step( @@ -358,10 +330,19 @@ class Transformer(nn.Layer): batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams, 1) result_hyp = [] - for bs_hyp in batch_hyp: - bs_hyp_pad = bs_hyp[0] + [3] * (25 - len(bs_hyp[0])) + hyp_scores = [] + for bs_hyp, score in zip(batch_hyp, batch_scores): + l = len(bs_hyp[0]) + bs_hyp_pad = bs_hyp[0] + [3] * (25 - l) result_hyp.append(bs_hyp_pad) - return paddle.to_tensor(np.array(result_hyp), dtype=paddle.int64) + score = float(score) / l + hyp_score = [score for _ in range(25)] + hyp_scores.append(hyp_score) + return [ + paddle.to_tensor( + np.array(result_hyp), dtype=paddle.int64), + paddle.to_tensor(hyp_scores) + ] def generate_square_subsequent_mask(self, sz): """Generate a square mask for the sequence. The masked positions are filled with float('-inf'). @@ -376,7 +357,7 @@ class Transformer(nn.Layer): return mask def generate_padding_mask(self, x): - padding_mask = x.equal(paddle.to_tensor(0, dtype=x.dtype)) + padding_mask = paddle.equal(x, paddle.to_tensor(0, dtype=x.dtype)) return padding_mask def _reset_parameters(self): @@ -514,17 +495,17 @@ class TransformerEncoderLayer(nn.Layer): src, src, attn_mask=src_mask, - key_padding_mask=src_key_padding_mask)[0] + key_padding_mask=src_key_padding_mask) src = src + self.dropout1(src2) src = self.norm1(src) - src = src.transpose([1, 2, 0]) + src = paddle.transpose(src, [1, 2, 0]) src = paddle.unsqueeze(src, 2) src2 = self.conv2(F.relu(self.conv1(src))) src2 = paddle.squeeze(src2, 2) - src2 = src2.transpose([2, 0, 1]) + src2 = paddle.transpose(src2, [2, 0, 1]) src = paddle.squeeze(src, 2) - src = src.transpose([2, 0, 1]) + src = paddle.transpose(src, [2, 0, 1]) src = src + self.dropout2(src2) src = self.norm2(src) @@ -598,7 +579,7 @@ class TransformerDecoderLayer(nn.Layer): tgt, tgt, attn_mask=tgt_mask, - key_padding_mask=tgt_key_padding_mask)[0] + key_padding_mask=tgt_key_padding_mask) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) tgt2 = self.multihead_attn( @@ -606,18 +587,18 @@ class TransformerDecoderLayer(nn.Layer): memory, memory, attn_mask=memory_mask, - key_padding_mask=memory_key_padding_mask)[0] + key_padding_mask=memory_key_padding_mask) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) # default - tgt = tgt.transpose([1, 2, 0]) + tgt = paddle.transpose(tgt, [1, 2, 0]) tgt = paddle.unsqueeze(tgt, 2) tgt2 = self.conv2(F.relu(self.conv1(tgt))) tgt2 = paddle.squeeze(tgt2, 2) - tgt2 = tgt2.transpose([2, 0, 1]) + tgt2 = paddle.transpose(tgt2, [2, 0, 1]) tgt = paddle.squeeze(tgt, 2) - tgt = tgt.transpose([2, 0, 1]) + tgt = paddle.transpose(tgt, [2, 0, 1]) tgt = tgt + self.dropout3(tgt2) tgt = self.norm3(tgt) @@ -656,8 +637,8 @@ class PositionalEncoding(nn.Layer): (-math.log(10000.0) / dim)) pe[:, 0::2] = paddle.sin(position * div_term) pe[:, 1::2] = paddle.cos(position * div_term) - pe 
= pe.unsqueeze(0) - pe = pe.transpose([1, 0, 2]) + pe = paddle.unsqueeze(pe, 0) + pe = paddle.transpose(pe, [1, 0, 2]) self.register_buffer('pe', pe) def forward(self, x): @@ -670,7 +651,7 @@ class PositionalEncoding(nn.Layer): Examples: >>> output = pos_encoder(x) """ - x = x + self.pe[:x.shape[0], :] + x = x + self.pe[:paddle.shape(x)[0], :] return self.dropout(x) @@ -702,7 +683,7 @@ class PositionalEncoding_2d(nn.Layer): (-math.log(10000.0) / dim)) pe[:, 0::2] = paddle.sin(position * div_term) pe[:, 1::2] = paddle.cos(position * div_term) - pe = pe.unsqueeze(0).transpose([1, 0, 2]) + pe = paddle.transpose(paddle.unsqueeze(pe, 0), [1, 0, 2]) self.register_buffer('pe', pe) self.avg_pool_1 = nn.AdaptiveAvgPool2D((1, 1)) @@ -722,22 +703,23 @@ class PositionalEncoding_2d(nn.Layer): Examples: >>> output = pos_encoder(x) """ - w_pe = self.pe[:x.shape[-1], :] + w_pe = self.pe[:paddle.shape(x)[-1], :] w1 = self.linear1(self.avg_pool_1(x).squeeze()).unsqueeze(0) w_pe = w_pe * w1 - w_pe = w_pe.transpose([1, 2, 0]) - w_pe = w_pe.unsqueeze(2) + w_pe = paddle.transpose(w_pe, [1, 2, 0]) + w_pe = paddle.unsqueeze(w_pe, 2) - h_pe = self.pe[:x.shape[-2], :] + h_pe = self.pe[:paddle.shape(x)[-2], :] w2 = self.linear2(self.avg_pool_2(x).squeeze()).unsqueeze(0) h_pe = h_pe * w2 - h_pe = h_pe.transpose([1, 2, 0]) - h_pe = h_pe.unsqueeze(3) + h_pe = paddle.transpose(h_pe, [1, 2, 0]) + h_pe = paddle.unsqueeze(h_pe, 3) x = x + w_pe + h_pe - x = x.reshape( - [x.shape[0], x.shape[1], x.shape[2] * x.shape[3]]).transpose( - [2, 0, 1]) + x = paddle.transpose( + paddle.reshape(x, + [x.shape[0], x.shape[1], x.shape[2] * x.shape[3]]), + [2, 0, 1]) return self.dropout(x) @@ -817,7 +799,7 @@ class Beam(): def sort_scores(self): "Sort the scores." return self.scores, paddle.to_tensor( - [i for i in range(self.scores.shape[0])], dtype='int32') + [i for i in range(int(self.scores.shape[0]))], dtype='int32') def get_the_best_score_and_idx(self): "Get the score of the best in the beam." 
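Note: the rewritten forward_test above now returns [dec_seq, dec_prob] instead of dec_seq alone, so downstream code gets a per-step confidence next to each token id. A minimal numpy sketch of that greedy-decode contract, for reference only (step_logits is a hypothetical stand-in for the embedding + decoder + tgt_word_prj stack; ids 2 and 3 are the <s>/</s> tokens from NRTRLabelEncode):

import numpy as np

SOS, EOS, VOCAB, MAX_LEN = 2, 3, 99, 25

def step_logits(dec_seq):
    # hypothetical stand-in for the real transformer decoder forward pass
    rng = np.random.default_rng(dec_seq.shape[1])
    return rng.standard_normal((dec_seq.shape[0], VOCAB))

def greedy_decode(bs):
    dec_seq = np.full((bs, 1), SOS, dtype=np.int64)     # every sequence starts with <s>
    dec_prob = np.full((bs, 1), 1.0, dtype=np.float32)  # matching per-step confidences
    for _ in range(1, MAX_LEN):
        logits = step_logits(dec_seq)
        e = np.exp(logits - logits.max(axis=-1, keepdims=True))
        probs = e / e.sum(axis=-1, keepdims=True)       # softmax, as in the new head
        preds_idx = probs.argmax(axis=-1)
        if (preds_idx == EOS).all():                    # stop once every sequence emits </s>
            break
        dec_seq = np.concatenate([dec_seq, preds_idx[:, None]], axis=1)
        dec_prob = np.concatenate(
            [dec_prob, probs.max(axis=-1)[:, None].astype(np.float32)], axis=1)
    return [dec_seq, dec_prob]                          # same pair forward_test returns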
diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 6ff375eb43e773f68f89f66663835ffe45da09b5..96b2169d28004d408bd567db2b3130b681fcb582 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -169,15 +169,20 @@ class NRTRLabelDecode(BaseRecLabelDecode): character_type, use_space_char) def __call__(self, preds, label=None, *args, **kwargs): - if preds.dtype == paddle.int64: - if isinstance(preds, paddle.Tensor): - preds = preds.numpy() - if preds[0][0] == 2: - preds_idx = preds[:, 1:] - else: - preds_idx = preds - text = self.decode(preds_idx) + if len(preds) == 2: + preds_id = preds[0] + preds_prob = preds[1] + if isinstance(preds_id, paddle.Tensor): + preds_id = preds_id.numpy() + if isinstance(preds_prob, paddle.Tensor): + preds_prob = preds_prob.numpy() + if preds_id[0][0] == 2: + preds_idx = preds_id[:, 1:] + preds_prob = preds_prob[:, 1:] + else: + preds_idx = preds_id + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) if label is None: return text label = self.decode(label[:, 1:]) diff --git a/tools/export_model.py b/tools/export_model.py index cae87aca129134d64711e364bf10428d69500a06..d8fe297235b2f5de6861d387cff64e8737cd30c0 100755 --- a/tools/export_model.py +++ b/tools/export_model.py @@ -60,6 +60,8 @@ def export_single_model(model, arch_config, save_path, logger): "When there is tps in the network, variable length input is not supported, and the input size needs to be the same as during training" ) infer_shape[-1] = 100 + if arch_config["algorithm"] == "NRTR": + infer_shape = [1, 32, 100] elif arch_config["model_type"] == "table": infer_shape = [3, 488, 488] model = to_static( diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index 7401a16ee662ceed1f8010adc3db0769e3efadb6..311d1e92b30167b7495bf129e2f28a87f7690572 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -13,7 +13,7 @@ # limitations under the License. import os import sys - +from PIL import Image __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(__dir__) sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) @@ -61,6 +61,13 @@ class TextRecognizer(object): "character_dict_path": args.rec_char_dict_path, "use_space_char": args.use_space_char } + elif self.rec_algorithm == 'NRTR': + postprocess_params = { + 'name': 'NRTRLabelDecode', + "character_type": args.rec_char_type, + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } self.postprocess_op = build_post_process(postprocess_params) self.predictor, self.input_tensor, self.output_tensors, self.config = \ utility.create_predictor(args, 'rec', logger) @@ -87,6 +94,30 @@ class TextRecognizer(object): def resize_norm_img(self, img, max_wh_ratio): imgC, imgH, imgW = self.rec_image_shape + if imgC == 1: + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + # h = img.shape[0] + # w = img.shape[1] + # ratio = w / float(h) + # if math.ceil(imgH * ratio) > imgW: + # resized_w = imgW + # else: + # resized_w = int(math.ceil(imgH * ratio)) + # resized_image = cv2.resize(img, (resized_w, imgH)) + # #norm_img = np.expand_dims(resized_image, -1) + # #norm_img = norm_img.transpose((2, 0, 1)) + # resized_image = resized_image.astype(np.float32) / 128. - 1. 
+ # padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + # padding_im[0, :, 0:resized_w] = resized_image + + # return padding_im + image_pil = Image.fromarray(np.uint8(img)) + img = image_pil.resize([100, 32], Image.ANTIALIAS) + img = np.array(img) + norm_img = np.expand_dims(img, -1) + norm_img = norm_img.transpose((2, 0, 1)) + return norm_img.astype(np.float32) / 128. - 1. + assert imgC == img.shape[2] max_wh_ratio = max(max_wh_ratio, imgW / imgH) imgW = int((32 * max_wh_ratio)) @@ -252,14 +283,16 @@ class TextRecognizer(object): else: self.input_tensor.copy_from_cpu(norm_img_batch) self.predictor.run() - outputs = [] for output_tensor in self.output_tensors: output = output_tensor.copy_to_cpu() outputs.append(output) if self.benchmark: self.autolog.times.stamp() - preds = outputs[0] + if len(outputs) != 1: + preds = outputs + else: + preds = outputs[0] rec_result = self.postprocess_op(preds) for rno in range(len(rec_result)): rec_res[indices[beg_img_no + rno]] = rec_result[rno]
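Note: NRTRLabelDecode and predict_rec.py above now route the [preds_id, preds_prob] pair through postprocessing. A minimal sketch of the decode convention that pair implies, for reference only (decode_nrtr and charset are illustrative names, not the library API):

import numpy as np

def decode_nrtr(preds_id, preds_prob, charset):
    if preds_id[0][0] == 2:                  # strip the leading <s> column if present
        preds_id, preds_prob = preds_id[:, 1:], preds_prob[:, 1:]
    results = []
    for ids, probs in zip(preds_id, preds_prob):
        chars, confs = [], []
        for idx, p in zip(ids, probs):
            if idx == 3:                     # </s> terminates the sequence
                break
            chars.append(charset[int(idx)])
            confs.append(float(p))
        results.append((''.join(chars), float(np.mean(confs)) if confs else 0.0))
    return results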