diff --git a/configs/rec/rec_mtb_nrtr.yml b/configs/rec/rec_mtb_nrtr.yml
index 04267500854310dc6d5df9318bb8c056c65cd5b5..dfe2cc9811120f0a5960d02a28e39ada83b98104 100644
--- a/configs/rec/rec_mtb_nrtr.yml
+++ b/configs/rec/rec_mtb_nrtr.yml
@@ -49,7 +49,7 @@ Architecture:
     
 
 Loss:
-  name: NRTRLoss
+  name: CESmoothingLoss
   smoothing: True
 
 PostProcess:
@@ -68,8 +68,8 @@ Train:
           img_mode: BGR
           channel_first: False
       - NRTRLabelEncode: # Class handling label
-      - NRTRRecResizeImg:
-          image_shape: [100, 32]
+      - GrayRecResizeImg:
+          image_shape: [100, 32] # W H
           resize_type: PIL # PIL or OpenCV
       - KeepKeys:
           keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
@@ -88,8 +88,8 @@ Eval:
           img_mode: BGR
           channel_first: False
       - NRTRLabelEncode: # Class handling label
-      - NRTRRecResizeImg:
-          image_shape: [100, 32]
+      - GrayRecResizeImg:
+          image_shape: [100, 32] # W H
           resize_type: PIL # PIL or OpenCV
       - KeepKeys:
           keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
diff --git a/configs/rec/rec_svtrnet.yml b/configs/rec/rec_svtrnet.yml
index 233d5e276577cad0144456ef7df1e20de99891f9..a700e4bd92995e8c0da2bf7623fe25e746483b1b 100644
--- a/configs/rec/rec_svtrnet.yml
+++ b/configs/rec/rec_svtrnet.yml
@@ -77,7 +77,7 @@ Metric:
 Train:
   dataset:
     name: LMDBDataSet
-    data_dir: ./train_data/data_lmdb_release/training/
+    data_dir: ./train_data/data_lmdb_release/training
     transforms:
       - DecodeImage: # load image
           img_mode: BGR
@@ -98,7 +98,7 @@ Train:
 Eval:
   dataset:
     name: LMDBDataSet
-    data_dir: ./train_data/data_lmdb_release/validation/
+    data_dir: ./train_data/data_lmdb_release/validation
     transforms:
       - DecodeImage: # load image
           img_mode: BGR
diff --git a/configs/rec/rec_vitstr.yml b/configs/rec/rec_vitstr.yml
new file mode 100644
index 0000000000000000000000000000000000000000..005db0184ae3319edffacb29a1dfd1751460a00a
--- /dev/null
+++ b/configs/rec/rec_vitstr.yml
@@ -0,0 +1,100 @@
+Global:
+  use_gpu: True
+  epoch_num: 20
+  log_smooth_window: 20
+  print_batch_step: 10
+  save_model_dir: ./output/rec/vitstr/
+  save_epoch_step: 1
+  # evaluation is run every 50 iterations after the 0th iteration
+  eval_batch_step: [0, 50]
+  cal_metric_during_train: True
+  pretrained_model:
+  checkpoints:
+  save_inference_dir:
+  use_visualdl: False
+  infer_img: doc/imgs_words_en/word_10.png
+  # for data or label process
+  character_dict_path: ppocr/utils/EN_symbol_dict.txt
+  max_text_length: 25
+  infer_mode: False
+  use_space_char: False
+  save_res_path: ./output/rec/predicts_vitstr.txt
+
+
+Optimizer:
+  name: Adadelta
+  epsilon: 0.00000001
+  rho: 0.95
+  clip_norm: 5.0
+  lr:
+    learning_rate: 1.0
+
+Architecture:
+  model_type: rec
+  algorithm: ViTSTR
+  in_channels: 1
+  Transform:
+  Backbone:
+    name: ViTSTR
+    scale: tiny
+  Neck:
+    name: SequenceEncoder
+    encoder_type: reshape
+  Head:
+    name: CTCHead
+
+Loss:
+  name: CESmoothingLoss
+  smoothing: False
+  with_all: True
+
+PostProcess:
+  name: ViTSTRLabelDecode
+
+Metric:
+  name: RecMetric
+  main_indicator: acc
+
+Train:
+  dataset:
+    name: LMDBDataSet
+    data_dir: ./train_data/data_lmdb_release/training
+    transforms:
+      - DecodeImage: # load image
+          img_mode: BGR
+          channel_first: False
+      - ViTSTRLabelEncode: # Class handling label
+      - GrayRecResizeImg:
+          image_shape: [224, 224] # W H
+          resize_type: PIL # PIL or OpenCV
+          inter_type: 'Image.BICUBIC'
+          scale: false
+      - KeepKeys:
+          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
+  loader:
+    shuffle: True
+    batch_size_per_card: 48
+    drop_last: True
+    num_workers: 2
+
+Eval:
+  dataset:
+    name: LMDBDataSet
+    data_dir: ./train_data/data_lmdb_release/validation
+    transforms:
+      - DecodeImage: # load image
+          img_mode: BGR
+          channel_first: False
+      - ViTSTRLabelEncode: # Class handling label
+      - GrayRecResizeImg:
+          image_shape: [224, 224] # W H
+          resize_type: PIL # PIL or OpenCV
+          inter_type: 'Image.BICUBIC'
+          scale: false
+      - KeepKeys:
+          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
+  loader:
+    shuffle: False
+    drop_last: False
+    batch_size_per_card: 256
+    num_workers: 2
diff --git a/doc/doc_ch/algorithm_overview.md b/doc/doc_ch/algorithm_overview.md
index 6227a21498eda7d8527e21e7f2567995251d9e47..934ac08537504fe6fa4d78c1d3635ac43a201efb 100755
--- a/doc/doc_ch/algorithm_overview.md
+++ b/doc/doc_ch/algorithm_overview.md
@@ -66,6 +66,7 @@
 - [x]  [SAR](./algorithm_rec_sar.md)
 - [x]  [SEED](./algorithm_rec_seed.md)
 - [x]  [SVTR](./algorithm_rec_svtr.md)
+- [x]  [ViTSTR](./algorithm_rec_vitstr.md)
 
 参考[DTRB](https://arxiv.org/abs/1904.01906)[3]文字识别训练和评估流程，使用MJSynth和SynthText两个文字识别数据集训练，在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估，算法效果如下：
 
@@ -84,7 +85,7 @@
 |SAR|Resnet31| 87.20% | rec_r31_sar | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) |
 |SEED|Aster_Resnet| 85.35% | rec_resnet_stn_bilstm_att | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) |
 |SVTR|SVTR-Tiny| 89.25% | rec_svtr_tiny_none_ctc_en | [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) |
-
+|ViTSTR|ViTSTR| 79.82% | rec_vitstr_none_ctc_en | [训练模型](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar) |
 
 <a name="2"></a>
 
diff --git a/doc/doc_ch/algorithm_rec_vitstr.md b/doc/doc_ch/algorithm_rec_vitstr.md
new file mode 100644
index 0000000000000000000000000000000000000000..bd83b8d9c2d9474310cc12d716e9d34467bf74a5
--- /dev/null
+++ b/doc/doc_ch/algorithm_rec_vitstr.md
@@ -0,0 +1,154 @@
+# 场景文本识别算法-ViTSTR
+
+- [1. 算法简介](#1)
+- [2. 环境配置](#2)
+- [3. 模型训练、评估、预测](#3)
+    - [3.1 训练](#3-1)
+    - [3.2 评估](#3-2)
+    - [3.3 预测](#3-3)
+- [4. 推理部署](#4)
+    - [4.1 Python推理](#4-1)
+    - [4.2 C++推理](#4-2)
+    - [4.3 Serving服务化部署](#4-3)
+    - [4.4 更多推理部署](#4-4)
+- [5. FAQ](#5)
+
+<a name="1"></a>
+## 1. 算法简介
+
+论文信息：
+> [Vision Transformer for Fast and Efficient Scene Text Recognition](https://arxiv.org/abs/2105.08582)
+> Rowel Atienza
+> ICDAR, 2021
+
+
+<a name="model"></a>
+`ViTSTR`使用MJSynth和SynthText两个文字识别数据集训练，在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估，算法复现效果如下：
+
+|模型|骨干网络|配置文件|Acc|下载链接|
+| --- | --- | --- | --- | --- |
+|ViTSTR|ViTSTR|[rec_vitstr.yml](../../configs/rec/rec_vitstr.yml)|79.82%|[训练模型](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar)|
+
+<a name="2"></a>
+## 2. 环境配置
+请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境，参考[《项目克隆》](./clone.md)克隆项目代码。
+
+
+<a name="3"></a>
+## 3. 模型训练、评估、预测
+
+<a name="3-1"></a>
+### 3.1 模型训练
+
+请参考[文本识别训练教程](./recognition.md)。PaddleOCR对代码进行了模块化，训练`ViTSTR`识别模型时需要**更换配置文件**为`ViTSTR`的[配置文件](../../configs/rec/rec_vitstr.yml)。
+
+#### 启动训练
+
+
+具体地，在完成数据准备后，便可以启动训练，训练命令如下：
+```shell
+#单卡训练（训练周期长，不建议）
+python3 tools/train.py -c configs/rec/rec_vitstr.yml
+
+#多卡训练，通过--gpus参数指定卡号
+python3 -m paddle.distributed.launch --gpus '0,1,2,3'  tools/train.py -c configs/rec/rec_vitstr.yml
+```
+
+<a name="3-2"></a>
+### 3.2 评估
+
+可下载已训练完成的[模型文件](#model)，使用如下命令进行评估：
+
+```shell
+# 注意将pretrained_model的路径设置为本地路径。
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_vitstr.yml -o Global.pretrained_model=./rec_vitstr_train/best_accuracy
+```
+
+<a name="3-3"></a>
+### 3.3 预测
+
+使用如下命令进行单张图片预测：
+```shell
+# 注意将pretrained_model的路径设置为本地路径。
+python3 tools/infer_rec.py -c configs/rec/rec_vitstr.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_vitstr_train/best_accuracy
+# 预测文件夹下所有图像时，可修改infer_img为文件夹，如 Global.infer_img='./doc/imgs_words_en/'。
+```
+
+
+<a name="4"></a>
+## 4. 推理部署
+
+<a name="4-1"></a>
+### 4.1 Python推理
+首先将训练得到的best模型转换成inference model。这里以训练完成的模型为例（[模型下载地址](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar)），可以使用如下命令进行转换：
+
+```shell
+# 注意将pretrained_model的路径设置为本地路径。
+python3 tools/export_model.py -c configs/rec/rec_vitstr.yml -o Global.pretrained_model=./rec_vitstr_train/best_accuracy Global.save_inference_dir=./inference/rec_vitstr/
+```
+**注意：**
+- 如果您是在自己的数据集上训练的模型，并且调整了字典文件，请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。
+- 如果您修改了训练时的输入大小，请修改`tools/export_model.py`文件中的对应ViTSTR的`infer_shape`。
+
+转换成功后，在目录下有三个文件：
+```
+/inference/rec_vitstr/
+    ├── inference.pdiparams         # 识别inference模型的参数文件
+    ├── inference.pdiparams.info    # 识别inference模型的参数信息，可忽略
+    └── inference.pdmodel           # 识别inference模型的program文件
+```
+
+执行如下命令进行模型推理：
+
+```shell
+python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_vitstr/' --rec_algorithm='ViTSTR' --rec_image_shape='1,224,224' --rec_char_dict_path='./ppocr/utils/EN_symbol_dict.txt'
+# 预测文件夹下所有图像时，可修改image_dir为文件夹，如 --image_dir='./doc/imgs_words_en/'。
+```
+
+![](../imgs_words_en/word_10.png)
+
+执行命令后，上面图像的预测结果（识别的文本和得分）会打印到屏幕上，示例如下：
+结果如下：
+```shell
+Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9265879392623901)
+```
+
+**注意**：
+
+- 训练上述模型采用的图像分辨率是[1，224，224]，需要通过参数`rec_image_shape`设置为您训练时的识别图像形状。
+- 在推理时需要设置参数`rec_char_dict_path`指定字典，如果您修改了字典，请修改该参数为您的字典文件。
+- 如果您修改了预处理方法，需修改`tools/infer/predict_rec.py`中ViTSTR的预处理为您的预处理方法。
+
+
+<a name="4-2"></a>
+### 4.2 C++推理部署
+
+由于C++预处理后处理还未支持ViTSTR，所以暂未支持
+
+<a name="4-3"></a>
+### 4.3 Serving服务化部署
+
+暂不支持
+
+<a name="4-4"></a>
+### 4.4 更多推理部署
+
+暂不支持
+
+<a name="5"></a>
+## 5. FAQ
+
+1. 在`ViTSTR`论文中，使用在ImageNet1k上的预训练权重进行初始化训练，我们在训练时未采用预训练权重，最终精度没有变化甚至有所提高。
+2. 我们仅仅复现了`ViTSTR`中的tiny版本，如果有需要使用small、base版本，可直接使用原开源repo中的预训练权重转为Paddle权重即可使用。
+
+## 引用
+
+```bibtex
+@inproceedings{Atienza2021ViTSTR,
+  title     = {Vision Transformer for Fast and Efficient Scene Text Recognition},
+  author    = {Rowel Atienza},
+  booktitle = {ICDAR},
+  year      = {2021},
+  url       = {https://arxiv.org/abs/2105.08582}
+}
+```
diff --git a/doc/doc_en/algorithm_overview_en.md b/doc/doc_en/algorithm_overview_en.md
index 383cbe39bbd2eb8ca85f497888920ce87cb1837e..213d95807dd14189b27051679b1791e43307d328 100755
--- a/doc/doc_en/algorithm_overview_en.md
+++ b/doc/doc_en/algorithm_overview_en.md
@@ -65,6 +65,7 @@ Supported text recognition algorithms (Click the link to get the tutorial):
 - [x]  [SAR](./algorithm_rec_sar_en.md)
 - [x]  [SEED](./algorithm_rec_seed_en.md)
 - [x]  [SVTR](./algorithm_rec_svtr_en.md)
+- [x]  [ViTSTR](./algorithm_rec_vitstr_en.md)
 
 Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow:
 
@@ -83,7 +84,7 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r
 |SAR|Resnet31| 87.20% | rec_r31_sar | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) |
 |SEED|Aster_Resnet| 85.35% | rec_resnet_stn_bilstm_att | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) |
 |SVTR|SVTR-Tiny| 89.25% | rec_svtr_tiny_none_ctc_en | [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) |
-
+|ViTSTR|ViTSTR| 79.82% | rec_vitstr_none_ctc_en | [trained model](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar) |
 
 <a name="2"></a>
 
diff --git a/doc/doc_en/algorithm_rec_vitstr_en.md b/doc/doc_en/algorithm_rec_vitstr_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..666798c3e0b6fccf21fcbbc0e09fc1fad0c8acff
--- /dev/null
+++ b/doc/doc_en/algorithm_rec_vitstr_en.md
@@ -0,0 +1,134 @@
+# ViTSTR
+
+- [1. Introduction](#1)
+- [2. Environment](#2)
+- [3. Model Training / Evaluation / Prediction](#3)
+    - [3.1 Training](#3-1)
+    - [3.2 Evaluation](#3-2)
+    - [3.3 Prediction](#3-3)
+- [4. Inference and Deployment](#4)
+    - [4.1 Python Inference](#4-1)
+    - [4.2 C++ Inference](#4-2)
+    - [4.3 Serving](#4-3)
+    - [4.4 More](#4-4)
+- [5. FAQ](#5)
+
+<a name="1"></a>
+## 1. Introduction
+
+Paper:
+> [Vision Transformer for Fast and Efficient Scene Text Recognition](https://arxiv.org/abs/2105.08582)
+> Rowel Atienza
+> ICDAR, 2021
+
+Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows:
+
+|Model|Backbone|config|Acc|Download link|
+| --- | --- | --- | --- | --- |
+|ViTSTR|ViTSTR|[rec_vitstr.yml](../../configs/rec/rec_vitstr.yml)|79.82%|[trained model](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar)|
+
+<a name="2"></a>
+## 2. Environment
+Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code.
+
+
+<a name="3"></a>
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
+
+Training:
+
+Specifically, after the data preparation is completed, the training can be started. The training command is as follows:
+
+```
+#Single GPU training (long training period, not recommended)
+python3 tools/train.py -c configs/rec/rec_vitstr.yml
+
+#Multi GPU training, specify the gpu number through the --gpus parameter
+python3 -m paddle.distributed.launch --gpus '0,1,2,3'  tools/train.py -c configs/rec/rec_vitstr.yml
+```
+
+Evaluation:
+
+```
+# GPU evaluation
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_vitstr.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+```
+
+Prediction:
+
+```
+# The configuration file used for prediction must match the training
+python3 tools/infer_rec.py -c configs/rec/rec_vitstr.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_vitstr_train/best_accuracy
+```
+
+<a name="4"></a>
+## 4. Inference and Deployment
+
+<a name="4-1"></a>
+### 4.1 Python Inference
+First, convert the model saved during ViTSTR text recognition training into an inference model ([model download link](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar)). You can use the following command to convert:
+
+```
+python3 tools/export_model.py -c configs/rec/rec_vitstr.yml -o Global.pretrained_model=./rec_vitstr_train/best_accuracy  Global.save_inference_dir=./inference/rec_vitstr
+```
+
+**Note:**
+- If you are training the model on your own dataset and have modified the dictionary file, please pay attention to modify the `character_dict_path` in the configuration file to the modified dictionary file.
+- If you modified the input size during training, please modify the `infer_shape` corresponding to ViTSTR in the `tools/export_model.py` file.
+
+After the conversion is successful, there are three files in the directory:
+```
+/inference/rec_vitstr/
+    ├── inference.pdiparams
+    ├── inference.pdiparams.info
+    └── inference.pdmodel
+```
+
+
+For ViTSTR text recognition model inference, the following commands can be executed:
+
+```
+python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_vitstr/' --rec_algorithm='ViTSTR' --rec_image_shape='1,224,224' --rec_char_dict_path='./ppocr/utils/EN_symbol_dict.txt'
+```
+
+![](../imgs_words_en/word_10.png)
+
+After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen, an example is as follows:
+The result is as follows:
+```shell
+Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9265879392623901)
+```
+
+<a name="4-2"></a>
+### 4.2 C++ Inference
+
+Not supported
+
+<a name="4-3"></a>
+### 4.3 Serving
+
+Not supported
+
+<a name="4-4"></a>
+### 4.4 More
+
+Not supported
+
+<a name="5"></a>
+## 5. FAQ
+
+1. The `ViTSTR` paper initializes training from weights pre-trained on ImageNet1k. We did not use pre-trained weights in training, and the final accuracy was unchanged or even slightly improved.
+
+## Citation
+
+```bibtex
+@inproceedings{Atienza2021ViTSTR,
+  title     = {Vision Transformer for Fast and Efficient Scene Text Recognition},
+  author    = {Rowel Atienza},
+  booktitle = {ICDAR},
+  year      = {2021},
+  url       = {https://arxiv.org/abs/2105.08582}
+}
+```
diff --git a/ppocr/data/imaug/__init__.py b/ppocr/data/imaug/__init__.py
index 548832fb0d116ba2de622bd97562b591d74501d8..2dbc92a7037c58b09753330e9c5f1b9791252ef6 100644
--- a/ppocr/data/imaug/__init__.py
+++ b/ppocr/data/imaug/__init__.py
@@ -23,7 +23,7 @@ from .random_crop_data import EastRandomCropData, RandomCropImgMask
 from .make_pse_gt import MakePseGt
 
 from .rec_img_aug import RecAug, RecConAug, RecResizeImg, ClsResizeImg, \
-    SRNRecResizeImg, NRTRRecResizeImg, SARRecResizeImg, PRENResizeImg
+    SRNRecResizeImg, GrayRecResizeImg, SARRecResizeImg, PRENResizeImg
 from .ssl_img_aug import SSLRotateResize
 from .randaugment import RandAugment
 from .copy_paste import CopyPaste
diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py
index 02a5187dad27b76d04e866de45333d79383c1347..0366e3f7854513b79350dea4ddd6b29178c7fffc 100644
--- a/ppocr/data/imaug/label_ops.py
+++ b/ppocr/data/imaug/label_ops.py
@@ -443,7 +443,9 @@ class KieLabelEncode(object):
             elif 'key_cls' in anno.keys():
                 labels.append(anno['key_cls'])
             else:
-                raise ValueError("Cannot found 'key_cls' in ann.keys(), please check your training annotation.")
+                raise ValueError(
+                    "Cannot found 'key_cls' in ann.keys(), please check your training annotation."
+                )
             edges.append(ann.get('edge', 0))
         ann_infos = dict(
             image=data['image'],
@@ -838,6 +840,37 @@ class PRENLabelEncode(BaseRecLabelEncode):
         return data
 
 
+class ViTSTRLabelEncode(BaseRecLabelEncode):  # label encoder registered for the ViTSTR config; output label has max_text_len + 2 entries
+    """ Convert between text-label and text-index """
+
+    def __init__(self,
+                 max_text_length,
+                 character_dict_path=None,
+                 use_space_char=False,
+                 **kwargs):  # extra keyword arguments are accepted but not used here
+
+        super(ViTSTRLabelEncode, self).__init__(
+            max_text_length, character_dict_path, use_space_char)
+
+    def __call__(self, data):  # reads data['label']; writes data['length'] and data['label']; returns data or None
+        text = data['label']
+        text = self.encode(text)
+        if text is None:  # encode() returned None, so None is returned for this sample
+            return None
+        if len(text) >= self.max_text_len:  # samples whose encoded length reaches max_text_len also return None
+            return None
+        data['length'] = np.array(len(text))  # length is recorded before the two extra indices are added
+        text.insert(0, 0)  # leading index 0 -- presumably '<s>'; TODO confirm dict order in BaseRecLabelEncode
+        text.append(1)  # trailing index 1 -- presumably '</s>'; TODO confirm dict order in BaseRecLabelEncode
+        text = text + [0] * (self.max_text_len + 2 - len(text))  # pad with index 0 up to max_text_len + 2 entries
+        data['label'] = np.array(text)
+        return data
+
+    def add_special_char(self, dict_character):  # returns a new list with '<s>' and '</s>' placed first
+        dict_character = ['<s>', '</s>'] + dict_character  # markers occupy the first two positions of the list
+        return dict_character
+
+
 class VQATokenLabelEncode(object):
     """
     Label encode for NLP VQA methods
diff --git a/ppocr/data/imaug/rec_img_aug.py b/ppocr/data/imaug/rec_img_aug.py
index 7483dffe5b6d9a0a2204702757fcb49762a1cc7a..0697baf436fa1f345bbd33c7e0847be0d8f1df8c 100644
--- a/ppocr/data/imaug/rec_img_aug.py
+++ b/ppocr/data/imaug/rec_img_aug.py
@@ -87,11 +87,19 @@ class ClsResizeImg(object):
         return data
 
 
-class NRTRRecResizeImg(object):
-    def __init__(self, image_shape, resize_type, padding=False, **kwargs):
+class GrayRecResizeImg(object):
+    def __init__(self,
+                 image_shape,
+                 resize_type,
+                 inter_type='Image.ANTIALIAS',
+                 scale=True,
+                 padding=False,
+                 **kwargs):
         self.image_shape = image_shape
         self.resize_type = resize_type
         self.padding = padding
+        self.inter_type = eval(inter_type)
+        self.scale = scale
 
     def __call__(self, data):
         img = data['image']
@@ -117,13 +125,16 @@ class NRTRRecResizeImg(object):
             return data
         if self.resize_type == 'PIL':
             image_pil = Image.fromarray(np.uint8(img))
-            img = image_pil.resize(self.image_shape, Image.ANTIALIAS)
+            img = image_pil.resize(self.image_shape, self.inter_type)
             img = np.array(img)
         if self.resize_type == 'OpenCV':
             img = cv2.resize(img, self.image_shape)
         norm_img = np.expand_dims(img, -1)
         norm_img = norm_img.transpose((2, 0, 1))
-        data['image'] = norm_img.astype(np.float32) / 128. - 1.
+        if self.scale:
+            data['image'] = norm_img.astype(np.float32) / 128. - 1.
+        else:
+            data['image'] = norm_img.astype(np.float32) / 255.
         return data
 
 
diff --git a/ppocr/losses/__init__.py b/ppocr/losses/__init__.py
index de8419b7c1cf6a30ab7195a1cbcbb10a5e52642d..6c4545eb21a2a2cf7ddd1b0a0f2023b56b41e196 100755
--- a/ppocr/losses/__init__.py
+++ b/ppocr/losses/__init__.py
@@ -30,7 +30,7 @@ from .det_fce_loss import FCELoss
 from .rec_ctc_loss import CTCLoss
 from .rec_att_loss import AttentionLoss
 from .rec_srn_loss import SRNLoss
-from .rec_nrtr_loss import NRTRLoss
+from .rec_ce_smooth_loss import CESmoothingLoss
 from .rec_sar_loss import SARLoss
 from .rec_aster_loss import AsterLoss
 from .rec_pren_loss import PRENLoss
@@ -60,8 +60,9 @@ def build_loss(config):
     support_dict = [
         'DBLoss', 'PSELoss', 'EASTLoss', 'SASTLoss', 'FCELoss', 'CTCLoss',
         'ClsLoss', 'AttentionLoss', 'SRNLoss', 'PGLoss', 'CombinedLoss',
-        'NRTRLoss', 'TableAttentionLoss', 'SARLoss', 'AsterLoss', 'SDMGRLoss',
-        'VQASerTokenLayoutLMLoss', 'LossFromOutput', 'PRENLoss', 'MultiLoss'
+        'CESmoothingLoss', 'TableAttentionLoss', 'SARLoss', 'AsterLoss',
+        'SDMGRLoss', 'VQASerTokenLayoutLMLoss', 'LossFromOutput', 'PRENLoss',
+        'MultiLoss'
     ]
     config = copy.deepcopy(config)
     module_name = config.pop('name')
diff --git a/ppocr/losses/rec_nrtr_loss.py b/ppocr/losses/rec_ce_smooth_loss.py
similarity index 73%
rename from ppocr/losses/rec_nrtr_loss.py
rename to ppocr/losses/rec_ce_smooth_loss.py
index 200a6d0486dbf6f76dd674eb58f641b31a70f31c..22243ed41f4ce739377a39112c640c00cb4b7792 100644
--- a/ppocr/losses/rec_nrtr_loss.py
+++ b/ppocr/losses/rec_ce_smooth_loss.py
@@ -3,16 +3,20 @@ from paddle import nn
 import paddle.nn.functional as F
 
 
-class NRTRLoss(nn.Layer):
-    def __init__(self, smoothing=True, **kwargs):
-        super(NRTRLoss, self).__init__()
+class CESmoothingLoss(nn.Layer):
+    def __init__(self, smoothing=True, with_all=False, **kwargs):
+        super(CESmoothingLoss, self).__init__()
         self.loss_func = nn.CrossEntropyLoss(reduction='mean', ignore_index=0)
         self.smoothing = smoothing
+        self.with_all = with_all
 
     def forward(self, pred, batch):
         pred = pred.reshape([-1, pred.shape[2]])
-        max_len = batch[2].max()
-        tgt = batch[1][:, 1:2 + max_len]
+        if self.with_all:
+            tgt = batch[1]
+        else:
+            max_len = batch[2].max()
+            tgt = batch[1][:, 1:2 + max_len]
         tgt = tgt.reshape([-1])
         if self.smoothing:
             eps = 0.1
diff --git a/ppocr/modeling/backbones/__init__.py b/ppocr/modeling/backbones/__init__.py
index 072d6e0f84d4126d256c26aa5baf17c9dc4e63df..a368e7481628cd836963618cd4cbfca12ba2080b 100755
--- a/ppocr/modeling/backbones/__init__.py
+++ b/ppocr/modeling/backbones/__init__.py
@@ -32,10 +32,11 @@ def build_backbone(config, model_type):
         from .rec_micronet import MicroNet
         from .rec_efficientb3_pren import EfficientNetb3_PREN
         from .rec_svtrnet import SVTRNet
+        from .rec_vitstr import ViTSTR
         support_dict = [
             'MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 'ResNetFPN', 'MTB',
             "ResNet31", "ResNet_ASTER", 'MicroNet', 'EfficientNetb3_PREN',
-            'SVTRNet'
+            'SVTRNet', 'ViTSTR'
         ]
     elif model_type == "e2e":
         from .e2e_resnet_vd_pg import ResNet
diff --git a/ppocr/modeling/backbones/rec_svtrnet.py b/ppocr/modeling/backbones/rec_svtrnet.py
index c57bf46345d6e08f23b9258358f77f2285366314..c2c07f4476929d49237c8e9a10713f881f5f556b 100644
--- a/ppocr/modeling/backbones/rec_svtrnet.py
+++ b/ppocr/modeling/backbones/rec_svtrnet.py
@@ -147,7 +147,7 @@ class Attention(nn.Layer):
                  dim,
                  num_heads=8,
                  mixer='Global',
-                 HW=[8, 25],
+                 HW=None,
                  local_k=[7, 11],
                  qkv_bias=False,
                  qk_scale=None,
@@ -210,7 +210,7 @@ class Block(nn.Layer):
                  num_heads,
                  mixer='Global',
                  local_mixer=[7, 11],
-                 HW=[8, 25],
+                 HW=None,
                  mlp_ratio=4.,
                  qkv_bias=False,
                  qk_scale=None,
@@ -274,7 +274,9 @@ class PatchEmbed(nn.Layer):
                  img_size=[32, 100],
                  in_channels=3,
                  embed_dim=768,
-                 sub_num=2):
+                 sub_num=2,
+                 patch_size=[4, 4],
+                 mode='pope'):
         super().__init__()
         num_patches = (img_size[1] // (2 ** sub_num)) * \
                       (img_size[0] // (2 ** sub_num))
@@ -282,50 +284,56 @@ class PatchEmbed(nn.Layer):
         self.num_patches = num_patches
         self.embed_dim = embed_dim
         self.norm = None
-        if sub_num == 2:
-            self.proj = nn.Sequential(
-                ConvBNLayer(
-                    in_channels=in_channels,
-                    out_channels=embed_dim // 2,
-                    kernel_size=3,
-                    stride=2,
-                    padding=1,
-                    act=nn.GELU,
-                    bias_attr=None),
-                ConvBNLayer(
-                    in_channels=embed_dim // 2,
-                    out_channels=embed_dim,
-                    kernel_size=3,
-                    stride=2,
-                    padding=1,
-                    act=nn.GELU,
-                    bias_attr=None))
-        if sub_num == 3:
-            self.proj = nn.Sequential(
-                ConvBNLayer(
-                    in_channels=in_channels,
-                    out_channels=embed_dim // 4,
-                    kernel_size=3,
-                    stride=2,
-                    padding=1,
-                    act=nn.GELU,
-                    bias_attr=None),
-                ConvBNLayer(
-                    in_channels=embed_dim // 4,
-                    out_channels=embed_dim // 2,
-                    kernel_size=3,
-                    stride=2,
-                    padding=1,
-                    act=nn.GELU,
-                    bias_attr=None),
-                ConvBNLayer(
-                    in_channels=embed_dim // 2,
-                    out_channels=embed_dim,
-                    kernel_size=3,
-                    stride=2,
-                    padding=1,
-                    act=nn.GELU,
-                    bias_attr=None))
+        if mode == 'pope':
+            if sub_num == 2:
+                self.proj = nn.Sequential(
+                    ConvBNLayer(
+                        in_channels=in_channels,
+                        out_channels=embed_dim // 2,
+                        kernel_size=3,
+                        stride=2,
+                        padding=1,
+                        act=nn.GELU,
+                        bias_attr=None),
+                    ConvBNLayer(
+                        in_channels=embed_dim // 2,
+                        out_channels=embed_dim,
+                        kernel_size=3,
+                        stride=2,
+                        padding=1,
+                        act=nn.GELU,
+                        bias_attr=None))
+            if sub_num == 3:
+                self.proj = nn.Sequential(
+                    ConvBNLayer(
+                        in_channels=in_channels,
+                        out_channels=embed_dim // 4,
+                        kernel_size=3,
+                        stride=2,
+                        padding=1,
+                        act=nn.GELU,
+                        bias_attr=None),
+                    ConvBNLayer(
+                        in_channels=embed_dim // 4,
+                        out_channels=embed_dim // 2,
+                        kernel_size=3,
+                        stride=2,
+                        padding=1,
+                        act=nn.GELU,
+                        bias_attr=None),
+                    ConvBNLayer(
+                        in_channels=embed_dim // 2,
+                        out_channels=embed_dim,
+                        kernel_size=3,
+                        stride=2,
+                        padding=1,
+                        act=nn.GELU,
+                        bias_attr=None))
+        elif mode == 'linear':
+            self.proj = nn.Conv2D(
+                1, embed_dim, kernel_size=patch_size, stride=patch_size)
+            self.num_patches = img_size[0] // patch_size[0] * img_size[
+                1] // patch_size[1]
 
     def forward(self, x):
         B, C, H, W = x.shape
diff --git a/ppocr/modeling/backbones/rec_vitstr.py b/ppocr/modeling/backbones/rec_vitstr.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5d7d5148a1120e6f97a321b4135c6780c0c5db2
--- /dev/null
+++ b/ppocr/modeling/backbones/rec_vitstr.py
@@ -0,0 +1,120 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is adapted from:
+https://github.com/roatienza/deep-text-recognition-benchmark/blob/master/modules/vitstr.py
+"""
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+from ppocr.modeling.backbones.rec_svtrnet import Block, PatchEmbed, zeros_, trunc_normal_, ones_
+
+scale_dim_heads = {'tiny': [192, 3], 'small': [384, 6], 'base': [768, 12]}
+
+
+class ViTSTR(nn.Layer):  # ViT-style backbone: patch embedding + class token + Transformer blocks
+    def __init__(self,
+                 img_size=[224, 224],
+                 in_channels=1,
+                 scale='tiny',  # key into scale_dim_heads ('tiny', 'small' or 'base')
+                 seqlen=27,  # number of leading output tokens kept by forward()
+                 patch_size=[16, 16],
+                 embed_dim=None,  # None -> scale_dim_heads[scale][0]
+                 depth=12,  # number of Block layers stacked in self.blocks
+                 num_heads=None,  # None -> scale_dim_heads[scale][1]
+                 mlp_ratio=4,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_path_rate=0.,  # upper end of the per-block drop-path ramp
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 norm_layer='nn.LayerNorm',  # given as a string; resolved with eval() below
+                 act_layer='nn.GELU',  # given as a string; resolved with eval() below
+                 epsilon=1e-6,
+                 out_channels=None,  # None -> embed_dim
+                 **kwargs):  # extra keyword arguments are accepted but unused
+        super().__init__()
+        self.seqlen = seqlen
+        embed_dim = embed_dim if embed_dim is not None else scale_dim_heads[
+            scale][0]
+        num_heads = num_heads if num_heads is not None else scale_dim_heads[
+            scale][1]
+        out_channels = out_channels if out_channels is not None else embed_dim
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            in_channels=in_channels,
+            embed_dim=embed_dim,
+            patch_size=patch_size,
+            mode='linear')  # NOTE(review): PatchEmbed's 'linear' branch builds Conv2D(1, ...), so in_channels != 1 looks unsupported - confirm
+        num_patches = self.patch_embed.num_patches
+
+        self.pos_embed = self.create_parameter(
+            shape=[1, num_patches + 1, embed_dim], default_initializer=zeros_)  # +1 position for the class token
+        self.add_parameter("pos_embed", self.pos_embed)
+        self.cls_token = self.create_parameter(
+            shape=[1, 1, embed_dim], default_initializer=zeros_)
+        self.add_parameter("cls_token", self.cls_token)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        dpr = np.linspace(0, drop_path_rate, depth)  # one drop-path rate per block, evenly spaced from 0 to drop_path_rate
+        self.blocks = nn.LayerList([
+            Block(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,  # forwarded as the original string, unlike act_layer
+                act_layer=eval(act_layer),  # NOTE(review): eval() on a config-supplied string - only safe with trusted configs
+                epsilon=epsilon,
+                prenorm=False) for i in range(depth)
+        ])
+        self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)  # NOTE(review): eval() on a config-supplied string - only safe with trusted configs
+
+        self.out_channels = out_channels
+
+        trunc_normal_(self.pos_embed)  # presumably re-initialises in place, replacing the zeros_ default - confirm
+        trunc_normal_(self.cls_token)
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):  # per-sublayer initialiser passed to self.apply()
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:  # the isinstance test repeats the outer check
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+
+    def forward_features(self, x):  # embed patches, prepend class token, add positions, run blocks, normalise
+        B = x.shape[0]
+        x = self.patch_embed(x)
+        cls_tokens = paddle.tile(self.cls_token, repeat_times=[B, 1, 1])  # repeat the class token B times along axis 0
+        x = paddle.concat((cls_tokens, x), axis=1)  # class token comes first on axis 1
+        x = x + self.pos_embed
+        x = self.pos_drop(x)
+        for blk in self.blocks:
+            x = blk(x)
+        x = self.norm(x)
+        return x
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = x[:, :self.seqlen]  # keep only the first seqlen tokens on axis 1 (class token included)
+        return x.transpose([0, 2, 1]).unsqueeze(2)  # swap axes 1 and 2, then insert a size-1 axis at index 2
diff --git a/ppocr/postprocess/__init__.py b/ppocr/postprocess/__init__.py
index f50b5f1c5f8e617066bb47636c8f4d2b171b6ecb..4f900ee1fc716ebec78feaaed07bf258d3d0df0a 100644
--- a/ppocr/postprocess/__init__.py
+++ b/ppocr/postprocess/__init__.py
@@ -27,7 +27,7 @@ from .sast_postprocess import SASTPostProcess
 from .fce_postprocess import FCEPostProcess
 from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, \
     DistillationCTCLabelDecode, TableLabelDecode, NRTRLabelDecode, SARLabelDecode, \
-    SEEDLabelDecode, PRENLabelDecode
+    SEEDLabelDecode, PRENLabelDecode, ViTSTRLabelDecode
 from .cls_postprocess import ClsPostProcess
 from .pg_postprocess import PGPostProcess
 from .vqa_token_ser_layoutlm_postprocess import VQASerTokenLayoutLMPostProcess
@@ -42,7 +42,7 @@ def build_post_process(config, global_config=None):
         'DistillationDBPostProcess', 'NRTRLabelDecode', 'SARLabelDecode',
         'SEEDLabelDecode', 'VQASerTokenLayoutLMPostProcess',
         'VQAReTokenLayoutLMPostProcess', 'PRENLabelDecode',
-        'DistillationSARLabelDecode'
+        'DistillationSARLabelDecode', 'ViTSTRLabelDecode'
     ]
 
     if config['name'] == 'PSEPostProcess':
diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py
index bf0fd890bf25949361665d212bf8e1a657054e5b..df6203fadaae9e99c1df125e7172166e5e3d8acd 100644
--- a/ppocr/postprocess/rec_postprocess.py
+++ b/ppocr/postprocess/rec_postprocess.py
@@ -188,13 +188,13 @@ class NRTRLabelDecode(BaseRecLabelDecode):
             char_list = []
             conf_list = []
             for idx in range(len(text_index[batch_idx])):
-                if text_index[batch_idx][idx] == 3:  # end
-                    break
                 try:
-                    char_list.append(self.character[int(text_index[batch_idx][
-                        idx])])
+                    char_idx = self.character[int(text_index[batch_idx][idx])]
                 except:
                     continue
+                if char_idx == '</s>':  # end
+                    break
+                char_list.append(char_idx)
                 if text_prob is not None:
                     conf_list.append(text_prob[batch_idx][idx])
                 else:
@@ -204,6 +204,32 @@ class NRTRLabelDecode(BaseRecLabelDecode):
         return result_list
 
 
+class ViTSTRLabelDecode(NRTRLabelDecode):
+    """ Decode ViTSTR predictions (and optionally labels) from index form to text """
+
+    def __init__(self, character_dict_path=None, use_space_char=False,
+                 **kwargs):  # extra keyword arguments are accepted but unused
+        super(ViTSTRLabelDecode, self).__init__(character_dict_path,
+                                                use_space_char)
+
+    def __call__(self, preds, label=None, *args, **kwargs):  # returns text, or (text, label) when label is given
+        if isinstance(preds, paddle.Tensor):
+            preds = preds[:, 1:].numpy()  # drop position 0 on axis 1, then convert to numpy
+        else:
+            preds = preds[:, 1:]  # same slice for inputs that are not paddle Tensors
+        preds_idx = preds.argmax(axis=2)  # index of the highest score on axis 2
+        preds_prob = preds.max(axis=2)  # the highest score itself, passed to decode as the probability
+        text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False)
+        if label is None:
+            return text
+        label = self.decode(label[:, 1:])  # labels also skip their first column before decoding
+        return text, label
+
+    def add_special_char(self, dict_character):
+        dict_character = ['<s>', '</s>'] + dict_character  # '<s>' and '</s>' take indices 0 and 1, ahead of the dictionary
+        return dict_character
+
+
 class AttnLabelDecode(BaseRecLabelDecode):
     """ Convert between text-label and text-index """
 
diff --git a/test_tipc/configs/rec_mtb_nrtr/rec_mtb_nrtr.yml b/test_tipc/configs/rec_mtb_nrtr/rec_mtb_nrtr.yml
index 15119bb2a9de02c19684d21ad5a1859db94895ce..3936ab58adfca2b5f900b99c84766e3c1058236e 100644
--- a/test_tipc/configs/rec_mtb_nrtr/rec_mtb_nrtr.yml
+++ b/test_tipc/configs/rec_mtb_nrtr/rec_mtb_nrtr.yml
@@ -49,7 +49,7 @@ Architecture:
     
 
 Loss:
-  name: NRTRLoss
+  name: CESmoothingLoss
   smoothing: True
 
 PostProcess:
@@ -69,7 +69,7 @@ Train:
           img_mode: BGR
           channel_first: False
       - NRTRLabelEncode: # Class handling label
-      - NRTRRecResizeImg:
+      - GrayRecResizeImg:
           image_shape: [100, 32]
           resize_type: PIL # PIL or OpenCV
       - KeepKeys:
@@ -90,7 +90,7 @@ Eval:
           img_mode: BGR
           channel_first: False
       - NRTRLabelEncode: # Class handling label
-      - NRTRRecResizeImg:
+      - GrayRecResizeImg:
           image_shape: [100, 32]
           resize_type: PIL # PIL or OpenCV
       - KeepKeys:
diff --git a/test_tipc/configs/rec_svtrnet/rec_svtrnet.yml b/test_tipc/configs/rec_svtrnet/rec_svtrnet.yml
new file mode 100644
index 0000000000000000000000000000000000000000..26facca34d20536cb19c3b1f80b0828ebc817e50
--- /dev/null
+++ b/test_tipc/configs/rec_svtrnet/rec_svtrnet.yml
@@ -0,0 +1,119 @@
+Global:
+  use_gpu: True
+  epoch_num: 20
+  log_smooth_window: 20
+  print_batch_step: 10
+  save_model_dir: ./output/rec/svtr/
+  save_epoch_step: 1
+  # evaluation is run every 2000 iterations after the 0th iteration
+  eval_batch_step: [0, 2000]
+  cal_metric_during_train: True
+  pretrained_model:
+  checkpoints:
+  save_inference_dir:
+  use_visualdl: False
+  infer_img: doc/imgs_words_en/word_10.png
+  # for data or label process
+  character_dict_path:
+  character_type: en
+  max_text_length: 25
+  infer_mode: False
+  use_space_char: False
+  save_res_path: ./output/rec/predicts_svtr_tiny.txt
+
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.99
+  epsilon: 0.00000008
+  weight_decay: 0.05
+  no_weight_decay_name: norm pos_embed
+  one_dim_param_no_weight_decay: true
+  lr:
+    name: Cosine
+    learning_rate: 0.0005
+    warmup_epoch: 2
+
+Architecture:
+  model_type: rec
+  algorithm: SVTR
+  Transform:
+    name: STN_ON
+    tps_inputsize: [32, 64]
+    tps_outputsize: [32, 100]
+    num_control_points: 20
+    tps_margins: [0.05,0.05]
+    stn_activation: none
+  Backbone:
+    name: SVTRNet
+    img_size: [32, 100]
+    out_char_num: 25
+    out_channels: 192
+    patch_merging: 'Conv'
+    embed_dim: [64, 128, 256]
+    depth: [3, 6, 3]
+    num_heads: [2, 4, 8]
+    mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global']
+    local_mixer: [[7, 11], [7, 11], [7, 11]]
+    last_stage: True
+    prenorm: false
+  Neck:
+    name: SequenceEncoder
+    encoder_type: reshape
+  Head:
+    name: CTCHead
+
+Loss:
+  name: CTCLoss
+
+PostProcess:
+  name: CTCLabelDecode
+
+Metric:
+  name: RecMetric
+  main_indicator: acc
+
+Train:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/ic15_data/
+    label_file_list: ["./train_data/ic15_data/rec_gt_train.txt"]
+    transforms:
+      - DecodeImage: # load image
+          img_mode: BGR
+          channel_first: False
+      - CTCLabelEncode: # Class handling label
+      - RecResizeImg:
+          character_dict_path:
+          image_shape: [3, 64, 256]
+          padding: False
+      - KeepKeys:
+          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
+  loader:
+    shuffle: True
+    batch_size_per_card: 512
+    drop_last: True
+    num_workers: 4
+
+Eval:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/ic15_data
+    label_file_list: ["./train_data/ic15_data/rec_gt_test.txt"]
+    transforms:
+      - DecodeImage: # load image
+          img_mode: BGR
+          channel_first: False
+      - CTCLabelEncode: # Class handling label
+      - RecResizeImg:
+          character_dict_path:
+          image_shape: [3, 64, 256]
+          padding: False
+      - KeepKeys:
+          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
+  loader:
+    shuffle: False
+    drop_last: False
+    batch_size_per_card: 256
+    num_workers: 2
diff --git a/test_tipc/configs/rec_svtrnet/train_infer_python.txt b/test_tipc/configs/rec_svtrnet/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..72526063e090b40b8926f8fdc2acc42a705841e6
--- /dev/null
+++ b/test_tipc/configs/rec_svtrnet/train_infer_python.txt
@@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:rec_svtrnet
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:True|True
+Global.auto_cast:null
+Global.epoch_num:lite_train_lite_infer=2|whole_train_whole_infer=300
+Global.save_model_dir:./output/
+Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=64
+Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./inference/rec_inference
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c test_tipc/configs/rec_svtrnet/rec_svtrnet.yml -o
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:tools/eval.py -c test_tipc/configs/rec_svtrnet/rec_svtrnet.yml -o
+null:null
+##
+===========================infer_params===========================
+Global.save_inference_dir:./output/
+Global.checkpoints:
+norm_export:tools/export_model.py -c test_tipc/configs/rec_svtrnet/rec_svtrnet.yml -o
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+##
+train_model:./inference/rec_svtrnet_train/best_accuracy
+infer_export:tools/export_model.py -c test_tipc/configs/rec_svtrnet/rec_svtrnet.yml -o
+infer_quant:False
+inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/EN_symbol_dict.txt --rec_image_shape="3,64,256" --rec_algorithm="SVTR"
+--use_gpu:True|False
+--enable_mkldnn:False
+--cpu_threads:1|6
+--rec_batch_num:1|6
+--use_tensorrt:False
+--precision:fp32|int8
+--rec_model_dir:
+--image_dir:./inference/rec_inference
+--save_log_path:./test/output/
+--benchmark:True
+null:null
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[3,64,256]}]
diff --git a/test_tipc/configs/rec_vitstr/rec_vitstr.yml b/test_tipc/configs/rec_vitstr/rec_vitstr.yml
new file mode 100644
index 0000000000000000000000000000000000000000..427bce4b5adfd6ddb51e162741d10a9ba003d001
--- /dev/null
+++ b/test_tipc/configs/rec_vitstr/rec_vitstr.yml
@@ -0,0 +1,101 @@
+Global:
+  use_gpu: True
+  epoch_num: 20
+  log_smooth_window: 20
+  print_batch_step: 10
+  save_model_dir: ./output/rec/vitstr/
+  save_epoch_step: 1
+  # evaluation is run every 2000 iterations after the 0th iteration
+  eval_batch_step: [0, 2000]
+  cal_metric_during_train: True
+  pretrained_model:
+  checkpoints:
+  save_inference_dir:
+  use_visualdl: False
+  infer_img: doc/imgs_words_en/word_10.png
+  # for data or label process
+  character_dict_path: ppocr/utils/EN_symbol_dict.txt
+  max_text_length: 25
+  infer_mode: False
+  use_space_char: False
+  save_res_path: ./output/rec/predicts_vitstr.txt
+
+
+Optimizer:
+  name: Adadelta
+  epsilon: 0.00000001
+  rho: 0.95
+  clip_norm: 5.0
+  lr:
+    learning_rate: 1.0
+
+Architecture:
+  model_type: rec
+  algorithm: ViTSTR
+  in_channels: 1
+  Transform:
+  Backbone:
+    name: ViTSTR
+  Neck:
+    name: SequenceEncoder
+    encoder_type: reshape
+  Head:
+    name: CTCHead
+
+Loss:
+  name: CESmoothingLoss
+  smoothing: False
+  with_all: True
+
+PostProcess:
+  name: ViTSTRLabelDecode
+
+Metric:
+  name: RecMetric
+  main_indicator: acc
+
+Train:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/ic15_data/
+    label_file_list: ["./train_data/ic15_data/rec_gt_train.txt"]
+    transforms:
+      - DecodeImage: # load image
+          img_mode: BGR
+          channel_first: False
+      - ViTSTRLabelEncode: # Class handling label
+      - GrayRecResizeImg:
+          image_shape: [224, 224] # W H
+          resize_type: PIL # PIL or OpenCV
+          inter_type: 'Image.BICUBIC'
+          scale: false
+      - KeepKeys:
+          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
+  loader:
+    shuffle: True
+    batch_size_per_card: 48
+    drop_last: True
+    num_workers: 2
+
+Eval:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/ic15_data
+    label_file_list: ["./train_data/ic15_data/rec_gt_test.txt"]
+    transforms:
+      - DecodeImage: # load image
+          img_mode: BGR
+          channel_first: False
+      - ViTSTRLabelEncode: # Class handling label
+      - GrayRecResizeImg:
+          image_shape: [224, 224] # W H
+          resize_type: PIL # PIL or OpenCV
+          inter_type: 'Image.BICUBIC'
+          scale: false
+      - KeepKeys:
+          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
+  loader:
+    shuffle: False
+    drop_last: False
+    batch_size_per_card: 256
+    num_workers: 2
diff --git a/test_tipc/configs/rec_vitstr/train_infer_python.txt b/test_tipc/configs/rec_vitstr/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6c7af1fb87375922350b7b54af6997a03cc91b1a
--- /dev/null
+++ b/test_tipc/configs/rec_vitstr/train_infer_python.txt
@@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:rec_vitstr
+python:python3.7
+gpu_list:0|0,1
+Global.use_gpu:True|True
+Global.auto_cast:null
+Global.epoch_num:lite_train_lite_infer=2|whole_train_whole_infer=300
+Global.save_model_dir:./output/
+Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=64
+Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./inference/rec_inference
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c test_tipc/configs/rec_vitstr/rec_vitstr.yml -o
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:tools/eval.py -c test_tipc/configs/rec_vitstr/rec_vitstr.yml -o
+null:null
+##
+===========================infer_params===========================
+Global.save_inference_dir:./output/
+Global.checkpoints:
+norm_export:tools/export_model.py -c test_tipc/configs/rec_vitstr/rec_vitstr.yml -o
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+##
+train_model:./inference/rec_vitstr_train/best_accuracy
+infer_export:tools/export_model.py -c test_tipc/configs/rec_vitstr/rec_vitstr.yml -o
+infer_quant:False
+inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/EN_symbol_dict.txt --rec_image_shape="1,224,224" --rec_algorithm="ViTSTR"
+--use_gpu:True|False
+--enable_mkldnn:False
+--cpu_threads:1|6
+--rec_batch_num:1|6
+--use_tensorrt:False
+--precision:fp32|int8
+--rec_model_dir:
+--image_dir:./inference/rec_inference
+--save_log_path:./test/output/
+--benchmark:True
+null:null
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[1,224,224]}]
diff --git a/tools/export_model.py b/tools/export_model.py
index c0cbcd361cec31c51616a7154836c234f076a86e..6e003f2ffa8755913fe76ee893677da70e2459b2 100755
--- a/tools/export_model.py
+++ b/tools/export_model.py
@@ -73,6 +73,12 @@ def export_single_model(model, arch_config, save_path, logger, quanter=None):
                 shape=[None, 3, 64, 512], dtype="float32"),
         ]
         model = to_static(model, input_spec=other_shape)
+    elif arch_config["algorithm"] == "ViTSTR":
+        other_shape = [
+            paddle.static.InputSpec(
+                shape=[None, 1, 224, 224], dtype="float32"),
+        ]
+        model = to_static(model, input_spec=other_shape)
     else:
         infer_shape = [3, -1, -1]
         if arch_config["model_type"] == "rec":
diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py
index 3664ef2caf4b888d6a3918202256c99cc54c5eb1..1945667972310cdef03daac7d2bfdb52373b950b 100755
--- a/tools/infer/predict_rec.py
+++ b/tools/infer/predict_rec.py
@@ -69,6 +69,12 @@ class TextRecognizer(object):
                 "character_dict_path": args.rec_char_dict_path,
                 "use_space_char": args.use_space_char
             }
+        elif self.rec_algorithm == 'ViTSTR':
+            postprocess_params = {
+                'name': 'ViTSTRLabelDecode',
+                "character_dict_path": args.rec_char_dict_path,
+                "use_space_char": args.use_space_char
+            }
         self.postprocess_op = build_post_process(postprocess_params)
         self.predictor, self.input_tensor, self.output_tensors, self.config = \
             utility.create_predictor(args, 'rec', logger)
@@ -96,15 +102,22 @@ class TextRecognizer(object):
 
     def resize_norm_img(self, img, max_wh_ratio):
         imgC, imgH, imgW = self.rec_image_shape
-        if self.rec_algorithm == 'NRTR':
+        if self.rec_algorithm == 'NRTR' or self.rec_algorithm == 'ViTSTR':
             img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
             # return padding_im
             image_pil = Image.fromarray(np.uint8(img))
-            img = image_pil.resize([100, 32], Image.ANTIALIAS)
+            if self.rec_algorithm == 'ViTSTR':
+                img = image_pil.resize([imgW, imgH], Image.BICUBIC)
+            else:
+                img = image_pil.resize([imgW, imgH], Image.ANTIALIAS)
             img = np.array(img)
             norm_img = np.expand_dims(img, -1)
             norm_img = norm_img.transpose((2, 0, 1))
-            return norm_img.astype(np.float32) / 128. - 1.
+            if self.rec_algorithm == 'ViTSTR':
+                norm_img = norm_img.astype(np.float32) / 255.
+            else:
+                norm_img = norm_img.astype(np.float32) / 128. - 1.
+            return norm_img
 
         assert imgC == img.shape[2]
         imgW = int((imgH * max_wh_ratio))
diff --git a/tools/program.py b/tools/program.py
index aa0d2698cf66c928f87217996c31c042e1c8aa02..745c28b87292480159ed41285f2a79c6bf5d0abe 100755
--- a/tools/program.py
+++ b/tools/program.py
@@ -307,7 +307,8 @@ def train(config,
             train_stats.update(stats)
 
             if log_writer is not None and dist.get_rank() == 0:
-                log_writer.log_metrics(metrics=train_stats.get(), prefix="TRAIN", step=global_step)
+                log_writer.log_metrics(
+                    metrics=train_stats.get(), prefix="TRAIN", step=global_step)
 
             if dist.get_rank() == 0 and (
                 (global_step > 0 and global_step % print_batch_step == 0) or
@@ -354,7 +355,8 @@ def train(config,
 
                 # logger metric
                 if log_writer is not None:
-                    log_writer.log_metrics(metrics=cur_metric, prefix="EVAL", step=global_step)
+                    log_writer.log_metrics(
+                        metrics=cur_metric, prefix="EVAL", step=global_step)
 
                 if cur_metric[main_indicator] >= best_model_dict[
                         main_indicator]:
@@ -377,11 +379,18 @@ def train(config,
                 logger.info(best_str)
                 # logger best metric
                 if log_writer is not None:
-                    log_writer.log_metrics(metrics={
-                        "best_{}".format(main_indicator): best_model_dict[main_indicator]
-                        }, prefix="EVAL", step=global_step)
-                    
-                    log_writer.log_model(is_best=True, prefix="best_accuracy", metadata=best_model_dict)
+                    log_writer.log_metrics(
+                        metrics={
+                            "best_{}".format(main_indicator):
+                            best_model_dict[main_indicator]
+                        },
+                        prefix="EVAL",
+                        step=global_step)
+
+                    log_writer.log_model(
+                        is_best=True,
+                        prefix="best_accuracy",
+                        metadata=best_model_dict)
 
             reader_start = time.time()
         if dist.get_rank() == 0:
@@ -413,7 +422,8 @@ def train(config,
                 epoch=epoch,
                 global_step=global_step)
             if log_writer is not None:
-                log_writer.log_model(is_best=False, prefix='iter_epoch_{}'.format(epoch))
+                log_writer.log_model(
+                    is_best=False, prefix='iter_epoch_{}'.format(epoch))
 
     best_str = 'best metric, {}'.format(', '.join(
         ['{}: {}'.format(k, v) for k, v in best_model_dict.items()]))
@@ -564,7 +574,8 @@ def preprocess(is_train=False):
     assert alg in [
         'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN',
         'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE',
-        'SEED', 'SDMGR', 'LayoutXLM', 'LayoutLM', 'PREN', 'FCE', 'SVTR'
+        'SEED', 'SDMGR', 'LayoutXLM', 'LayoutLM', 'PREN', 'FCE', 'SVTR',
+        'ViTSTR'
     ]
 
     if use_xpu:
@@ -585,7 +596,8 @@ def preprocess(is_train=False):
         vdl_writer_path = '{}/vdl/'.format(save_model_dir)
         log_writer = VDLLogger(save_model_dir)
         loggers.append(log_writer)
-    if ('use_wandb' in config['Global'] and config['Global']['use_wandb']) or 'wandb' in config:
+    if ('use_wandb' in config['Global'] and
+            config['Global']['use_wandb']) or 'wandb' in config:
         save_dir = config['Global']['save_model_dir']
         wandb_writer_path = "{}/wandb".format(save_dir)
         if "wandb" in config: