diff --git a/configs/rec/rec_mtb_nrtr.yml b/configs/rec/rec_mtb_nrtr.yml index 405674882c98489003a6efdb08cb3277e72ff612..4e5826adc990c30aaee1d63fd5b4523944906eee 100644 --- a/configs/rec/rec_mtb_nrtr.yml +++ b/configs/rec/rec_mtb_nrtr.yml @@ -9,7 +9,7 @@ Global: eval_batch_step: [0, 2000] cal_metric_during_train: True pretrained_model: - checkpoints: + checkpoints: save_inference_dir: use_visualdl: False infer_img: doc/imgs_words_en/word_10.png @@ -82,7 +82,7 @@ Train: Eval: dataset: name: LMDBDataSet - data_dir: ./train_data/data_lmdb_release/evaluation/ + data_dir: ./train_data/data_lmdb_release/validation/ transforms: - DecodeImage: # load image img_mode: BGR @@ -97,5 +97,5 @@ Eval: shuffle: False drop_last: False batch_size_per_card: 256 - num_workers: 1 + num_workers: 4 use_shared_memory: False diff --git a/configs/rec/rec_r45_abinet.yml b/configs/rec/rec_r45_abinet.yml new file mode 100644 index 0000000000000000000000000000000000000000..a756fead9f383dc135e3c6de337da88bb64ab801 --- /dev/null +++ b/configs/rec/rec_r45_abinet.yml @@ -0,0 +1,103 @@ +Global: + use_gpu: True + epoch_num: 10 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/r45_abinet/ + save_epoch_step: 1 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_abinet.txt + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.99 + clip_norm: 20.0 + lr: + name: Piecewise + decay_epochs: [6] + values: [0.0001, 0.00001] + regularizer: + name: 'L2' + factor: 0. 
+ +Architecture: + model_type: rec + algorithm: ABINet + in_channels: 3 + Transform: + Backbone: + name: ResNet45 + + Head: + name: ABINetHead + use_lang: True + iter_size: 3 + + +Loss: + name: CELoss + ignore_index: &ignore_index 100 # Must be greater than the number of character classes + +PostProcess: + name: ABINetLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - ABINetLabelEncode: # Class handling label + ignore_index: *ignore_index + - ABINetRecResizeImg: + image_shape: [3, 32, 128] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 96 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/validation/ + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - ABINetLabelEncode: # Class handling label + ignore_index: *ignore_index + - ABINetRecResizeImg: + image_shape: [3, 32, 128] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 4 + use_shared_memory: False diff --git a/configs/rec/rec_svtrnet.yml b/configs/rec/rec_svtrnet.yml index a700e4bd92995e8c0da2bf7623fe25e746483b1b..a3d292b6baa9d07e05b74901e668052c588a097b 100644 --- a/configs/rec/rec_svtrnet.yml +++ b/configs/rec/rec_svtrnet.yml @@ -26,7 +26,7 @@ Optimizer: name: AdamW beta1: 0.9 beta2: 0.99 - epsilon: 0.00000008 + epsilon: 8.e-8 weight_decay: 0.05 no_weight_decay_name: norm pos_embed one_dim_param_no_weight_decay: true diff --git a/configs/rec/rec_vitstr_none_ce.yml b/configs/rec/rec_vitstr_none_ce.yml index c71eba0cc2e6c814ebba2b10ecc5f5296ae7577f..b969c83a5d77b8c8ffde97ce5f914512074cf26e 100644 --- a/configs/rec/rec_vitstr_none_ce.yml +++ b/configs/rec/rec_vitstr_none_ce.yml @@ -6,7 +6,7 @@ Global: save_model_dir: ./output/rec/vitstr_none_ce/ save_epoch_step: 1 # evaluation is run every 2000 iterations after the 0th iteration# - eval_batch_step: [0, 50] + eval_batch_step: [0, 2000] cal_metric_during_train: True pretrained_model: checkpoints: @@ -23,7 +23,7 @@ Global: Optimizer: name: Adadelta - epsilon: 0.00000001 + epsilon: 1.e-8 rho: 0.95 clip_norm: 5.0 lr: @@ -45,8 +45,8 @@ Architecture: Loss: name: CELoss - smoothing: False with_all: True + ignore_index: &ignore_index 0 # Must be zero or greater than the number of character classes PostProcess: name: ViTSTRLabelDecode @@ -58,12 +58,13 @@ Metric: Train: dataset: name: LMDBDataSet - data_dir: ./train_data/data_lmdb_release/training + data_dir: ./train_data/data_lmdb_release/training/ transforms: - DecodeImage: # load image img_mode: BGR channel_first: False - ViTSTRLabelEncode: # Class handling label + ignore_index: *ignore_index - GrayRecResizeImg: image_shape: [224, 224] # W H resize_type: PIL # PIL or OpenCV @@ -80,12 +81,13 @@ Train: Eval: dataset: name: LMDBDataSet - data_dir: ./train_data/data_lmdb_release/validation + data_dir: ./train_data/data_lmdb_release/validation/ transforms: - DecodeImage: # load image img_mode: BGR channel_first: False - ViTSTRLabelEncode: # Class handling label + ignore_index: *ignore_index - GrayRecResizeImg: image_shape: [224, 224] # W H resize_type: PIL # PIL or OpenCV diff --git 
a/doc/doc_ch/algorithm_overview.md b/doc/doc_ch/algorithm_overview.md index 4196ec05af0e038d27d29fec110a8b2556542310..eb81e4cd6dae2542dd07d0e25fe543419f798c9b 100755 --- a/doc/doc_ch/algorithm_overview.md +++ b/doc/doc_ch/algorithm_overview.md @@ -67,6 +67,7 @@ - [x] [SEED](./algorithm_rec_seed.md) - [x] [SVTR](./algorithm_rec_svtr.md) - [x] [ViTSTR](./algorithm_rec_vitstr.md) +- [x] [ABINet](./algorithm_rec_abinet.md) 参考[DTRB](https://arxiv.org/abs/1904.01906)[3]文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: @@ -86,6 +87,7 @@ |SEED|Aster_Resnet| 85.35% | rec_resnet_stn_bilstm_att | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) | |SVTR|SVTR-Tiny| 89.25% | rec_svtr_tiny_none_ctc_en | [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) | |ViTSTR|ViTSTR| 79.82% | rec_vitstr_none_ce_en | [训练模型](https://paddleocr.bj.bcebos.com/rec_vitstr_none_ce_train.tar) | +|ABINet|Resnet45| 90.75% | rec_r45_abinet_en | [训练模型](https://paddleocr.bj.bcebos.com/rec_r45_abinet_train.tar) | diff --git a/doc/doc_ch/algorithm_rec_abinet.md b/doc/doc_ch/algorithm_rec_abinet.md new file mode 100644 index 0000000000000000000000000000000000000000..d20c703014bf1e1947f6a37003d80602ef5b0582 --- /dev/null +++ b/doc/doc_ch/algorithm_rec_abinet.md @@ -0,0 +1,155 @@ +# 场景文本识别算法-ABINet + +- [1. 算法简介](#1) +- [2. 环境配置](#2) +- [3. 模型训练、评估、预测](#3) + - [3.1 训练](#3-1) + - [3.2 评估](#3-2) + - [3.3 预测](#3-3) +- [4. 推理部署](#4) + - [4.1 Python推理](#4-1) + - [4.2 C++推理](#4-2) + - [4.3 Serving服务化部署](#4-3) + - [4.4 更多推理部署](#4-4) +- [5. FAQ](#5) + + +## 1. 算法简介 + +论文信息: +> [ABINet: Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition](https://openaccess.thecvf.com/content/CVPR2021/papers/Fang_Read_Like_Humans_Autonomous_Bidirectional_and_Iterative_Language_Modeling_for_CVPR_2021_paper.pdf) +> Shancheng Fang and Hongtao Xie and Yuxin Wang and Zhendong Mao and Yongdong Zhang +> CVPR, 2021 + + + +`ABINet`使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下: + +|模型|骨干网络|配置文件|Acc|下载链接| +| --- | --- | --- | --- | --- | +|ABINet|ResNet45|[rec_r45_abinet.yml](../../configs/rec/rec_r45_abinet.yml)|90.75%|[训练模型]()/[预训练模型]| + + +## 2. 环境配置 +请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。 + + + +## 3. 模型训练、评估、预测 + + +### 3.1 模型训练 + +请参考[文本识别训练教程](./recognition.md)。PaddleOCR对代码进行了模块化,训练`ABINet`识别模型时需要**更换配置文件**为`ABINet`的[配置文件](../../configs/rec/rec_r45_abinet.yml)。 + +#### 启动训练 + + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: +```shell +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_r45_abinet.yml + +#多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r45_abinet.yml +``` + + +### 3.2 评估 + +可下载已训练完成的[模型文件](#model),使用如下命令进行评估: + +```shell +# 注意将pretrained_model的路径设置为本地路径。 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r45_abinet.yml -o Global.pretrained_model=./rec_r45_abinet_train/best_accuracy +``` + + +### 3.3 预测 + +使用如下命令进行单张图片预测: +```shell +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/infer_rec.py -c configs/rec/rec_r45_abinet.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_r45_abinet_train/best_accuracy +# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/imgs_words_en/'。 +``` + + + +## 4. 
推理部署 + + +### 4.1 Python推理 +首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址]() ),可以使用如下命令进行转换: + +```shell +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/export_model.py -c configs/rec/rec_r45_abinet.yml -o Global.pretrained_model=./rec_r45_abinet_train/best_accuracy Global.save_inference_dir=./inference/rec_r45_abinet/ +``` +**注意:** +- 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。 +- 如果您修改了训练时的输入大小,请修改`tools/export_model.py`文件中的对应ABINet的`infer_shape`。 + +转换成功后,在目录下有三个文件: +``` +/inference/rec_r45_abinet/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +执行如下命令进行模型推理: + +```shell +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_r45_abinet/' --rec_algorithm='ABINet' --rec_image_shape='3,32,128' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' +# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/imgs_words_en/'。 +``` + +![](../imgs_words_en/word_10.png) + +执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下: +结果如下: +```shell +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999995231628418) +``` + +**注意**: + +- 训练上述模型采用的图像分辨率是[3,32,128],需要通过参数`rec_image_shape`设置为您训练时的识别图像形状。 +- 在推理时需要设置参数`rec_char_dict_path`指定字典,如果您修改了字典,请修改该参数为您的字典文件。 +- 如果您修改了预处理方法,需修改`tools/infer/predict_rec.py`中ABINet的预处理为您的预处理方法。 + + + +### 4.2 C++推理部署 + +由于C++预处理后处理还未支持ABINet,所以暂未支持 + + +### 4.3 Serving服务化部署 + +暂不支持 + + +### 4.4 更多推理部署 + +暂不支持 + + +## 5. FAQ + +1. MJSynth和SynthText两种数据集来自于[ABINet源repo](https://github.com/FangShancheng/ABINet) 。 +2. 我们使用ABINet作者提供的预训练模型进行finetune训练。 + +## 引用 + +```bibtex +@article{Fang2021ABINet, + title = {ABINet: Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition}, + author = {Shancheng Fang and Hongtao Xie and Yuxin Wang and Zhendong Mao and Yongdong Zhang}, + booktitle = {CVPR}, + year = {2021}, + url = {https://arxiv.org/abs/2103.06495}, + pages = {7098-7107} +} +``` diff --git a/doc/doc_ch/algorithm_rec_nrtr.md b/doc/doc_ch/algorithm_rec_nrtr.md index d3b626d0241a6c36797cba20a58f007e383c7aee..c619ac1dbc23dcf33e1405c95ab9dd075a24b347 100644 --- a/doc/doc_ch/algorithm_rec_nrtr.md +++ b/doc/doc_ch/algorithm_rec_nrtr.md @@ -12,6 +12,7 @@ - [4.3 Serving服务化部署](#4-3) - [4.4 更多推理部署](#4-4) - [5. FAQ](#5) +- [6. 发行公告](#6) ## 1. 算法简介 @@ -110,7 +111,7 @@ python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' 执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下: 结果如下: ```shell -Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9265879392623901) +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9465042352676392) ``` **注意**: @@ -140,12 +141,147 @@ Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9265879392623901) 1. `NRTR`论文中使用Beam搜索进行解码字符,但是速度较慢,这里默认未使用Beam搜索,以贪婪搜索进行解码字符。 + +## 6. 发行公告 + +1. release/2.6更新NRTR代码结构,新版NRTR可加载旧版(release/2.5及之前)模型参数,使用下面示例代码将旧版模型参数转换为新版模型参数: + +```python + + params = paddle.load('path/' + '.pdparams') # 旧版本参数 + state_dict = model.state_dict() # 新版模型参数 + new_state_dict = {} + + for k1, v1 in state_dict.items(): + + k = k1 + if 'encoder' in k and 'self_attn' in k and 'qkv' in k and 'weight' in k: + + k_para = k[:13] + 'layers.' 
+ k[13:] + q = params[k_para.replace('qkv', 'conv1')].transpose((1, 0, 2, 3)) + k = params[k_para.replace('qkv', 'conv2')].transpose((1, 0, 2, 3)) + v = params[k_para.replace('qkv', 'conv3')].transpose((1, 0, 2, 3)) + + new_state_dict[k1] = np.concatenate([q[:, :, 0, 0], k[:, :, 0, 0], v[:, :, 0, 0]], -1) + + elif 'encoder' in k and 'self_attn' in k and 'qkv' in k and 'bias' in k: + + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')] + k = params[k_para.replace('qkv', 'conv2')] + v = params[k_para.replace('qkv', 'conv3')] + + new_state_dict[k1] = np.concatenate([q, k, v], -1) + + elif 'encoder' in k and 'self_attn' in k and 'out_proj' in k: + + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + + elif 'encoder' in k and 'norm3' in k: + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para.replace('norm3', 'norm2')] + + elif 'encoder' in k and 'norm1' in k: + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + + + elif 'decoder' in k and 'self_attn' in k and 'qkv' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')].transpose((1, 0, 2, 3)) + k = params[k_para.replace('qkv', 'conv2')].transpose((1, 0, 2, 3)) + v = params[k_para.replace('qkv', 'conv3')].transpose((1, 0, 2, 3)) + new_state_dict[k1] = np.concatenate([q[:, :, 0, 0], k[:, :, 0, 0], v[:, :, 0, 0]], -1) + + elif 'decoder' in k and 'self_attn' in k and 'qkv' in k and 'bias' in k: + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')] + k = params[k_para.replace('qkv', 'conv2')] + v = params[k_para.replace('qkv', 'conv3')] + new_state_dict[k1] = np.concatenate([q, k, v], -1) + + elif 'decoder' in k and 'self_attn' in k and 'out_proj' in k: + + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + + elif 'decoder' in k and 'cross_attn' in k and 'q' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + q = params[k_para.replace('q', 'conv1')].transpose((1, 0, 2, 3)) + new_state_dict[k1] = q[:, :, 0, 0] + + elif 'decoder' in k and 'cross_attn' in k and 'q' in k and 'bias' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + q = params[k_para.replace('q', 'conv1')] + new_state_dict[k1] = q + + elif 'decoder' in k and 'cross_attn' in k and 'kv' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + k = params[k_para.replace('kv', 'conv2')].transpose((1, 0, 2, 3)) + v = params[k_para.replace('kv', 'conv3')].transpose((1, 0, 2, 3)) + new_state_dict[k1] = np.concatenate([k[:, :, 0, 0], v[:, :, 0, 0]], -1) + + elif 'decoder' in k and 'cross_attn' in k and 'kv' in k and 'bias' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + k = params[k_para.replace('kv', 'conv2')] + v = params[k_para.replace('kv', 'conv3')] + new_state_dict[k1] = np.concatenate([k, v], -1) + + elif 'decoder' in k and 'cross_attn' in k and 'out_proj' in k: + + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + new_state_dict[k1] = params[k_para] + elif 'decoder' in k and 'norm' in k: + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + elif 'mlp' in k and 'weight' in k: + k_para = k[:13] + 'layers.' 
+ k[13:] + k_para = k_para.replace('fc', 'conv') + k_para = k_para.replace('mlp.', '') + w = params[k_para].transpose((1, 0, 2, 3)) + new_state_dict[k1] = w[:, :, 0, 0] + elif 'mlp' in k and 'bias' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('fc', 'conv') + k_para = k_para.replace('mlp.', '') + w = params[k_para] + new_state_dict[k1] = w + + else: + new_state_dict[k1] = params[k1] + + if list(new_state_dict[k1].shape) != list(v1.shape): + print(k1) + + + for k, v1 in state_dict.items(): + if k not in new_state_dict.keys(): + print(1, k) + elif list(new_state_dict[k].shape) != list(v1.shape): + print(2, k) + + + + model.set_state_dict(new_state_dict) + paddle.save(model.state_dict(), 'nrtrnew_from_old_params.pdparams') + +``` + +2. 新版相比与旧版,代码结构简洁,推理速度有所提高。 + + ## 引用 ```bibtex @article{Sheng2019NRTR, title = {NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition}, - author = {Fenfen Sheng and Zhineng Chen andBo Xu}, + author = {Fenfen Sheng and Zhineng Chen and Bo Xu}, booktitle = {ICDAR}, year = {2019}, url = {http://arxiv.org/abs/1806.00926}, diff --git a/doc/doc_en/algorithm_overview_en.md b/doc/doc_en/algorithm_overview_en.md index f2284542ed8197bd12559d998607512476130d65..28aca7c0d171008156104fbcc786707538fd49ef 100755 --- a/doc/doc_en/algorithm_overview_en.md +++ b/doc/doc_en/algorithm_overview_en.md @@ -66,6 +66,7 @@ Supported text recognition algorithms (Click the link to get the tutorial): - [x] [SEED](./algorithm_rec_seed_en.md) - [x] [SVTR](./algorithm_rec_svtr_en.md) - [x] [ViTSTR](./algorithm_rec_vitstr_en.md) +- [x] [ABINet](./algorithm_rec_abinet_en.md) Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow: @@ -85,6 +86,7 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r |SEED|Aster_Resnet| 85.35% | rec_resnet_stn_bilstm_att | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) | |SVTR|SVTR-Tiny| 89.25% | rec_svtr_tiny_none_ctc_en | [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) | |ViTSTR|ViTSTR| 79.82% | rec_vitstr_none_ce_en | [trained model](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar) | +|ABINet|Resnet45| 90.75% | rec_r45_abinet_en | [trained model](https://paddleocr.bj.bcebos.com/rec_r45_abinet_train.tar) | diff --git a/doc/doc_en/algorithm_rec_abinet_en.md b/doc/doc_en/algorithm_rec_abinet_en.md new file mode 100644 index 0000000000000000000000000000000000000000..3b0f6c09a95f8d1ea15dd7dcae2dc2e29d969370 --- /dev/null +++ b/doc/doc_en/algorithm_rec_abinet_en.md @@ -0,0 +1,136 @@ +# ABINet + +- [1. Introduction](#1) +- [2. Environment](#2) +- [3. Model Training / Evaluation / Prediction](#3) + - [3.1 Training](#3-1) + - [3.2 Evaluation](#3-2) + - [3.3 Prediction](#3-3) +- [4. Inference and Deployment](#4) + - [4.1 Python Inference](#4-1) + - [4.2 C++ Inference](#4-2) + - [4.3 Serving](#4-3) + - [4.4 More](#4-4) +- [5. FAQ](#5) + + +## 1. 
Introduction + +Paper: +> [ABINet: Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition](https://openaccess.thecvf.com/content/CVPR2021/papers/Fang_Read_Like_Humans_Autonomous_Bidirectional_and_Iterative_Language_Modeling_for_CVPR_2021_paper.pdf) +> Shancheng Fang and Hongtao Xie and Yuxin Wang and Zhendong Mao and Yongdong Zhang +> CVPR, 2021 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Model|Backbone|config|Acc|Download link| +| --- | --- | --- | --- | --- | +|ABINet|ResNet45|[rec_r45_abinet.yml](../../configs/rec/rec_r45_abinet.yml)|90.75%|[trained model]()/[pretrained model]()| + + +## 2. Environment +Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code. + + + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +Training: + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +``` +#Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_r45_abinet.yml + +#Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r45_abinet.yml +``` + +Evaluation: + +``` +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r45_abinet.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +Prediction: + +``` +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_r45_abinet.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_r45_abinet_train/best_accuracy +``` + + +## 4. Inference and Deployment + + +### 4.1 Python Inference +First, the model saved during the ABINet text recognition training process is converted into an inference model. ( [Model download link]()) ), you can use the following command to convert: + +``` +python3 tools/export_model.py -c configs/rec/rec_r45_abinet.yml -o Global.pretrained_model=./rec_r45_abinet_train/best_accuracy Global.save_inference_dir=./inference/rec_r45_abinet +``` + +**Note:** +- If you are training the model on your own dataset and have modified the dictionary file, please pay attention to modify the `character_dict_path` in the configuration file to the modified dictionary file. +- If you modified the input size during training, please modify the `infer_shape` corresponding to ABINet in the `tools/export_model.py` file. 
+ +After the conversion is successful, there are three files in the directory: +``` +/inference/rec_r45_abinet/ + ├── inference.pdiparams + ├── inference.pdiparams.info + └── inference.pdmodel +``` + + +For ABINet text recognition model inference, the following commands can be executed: + +``` +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_r45_abinet/' --rec_algorithm='ABINet' --rec_image_shape='3,32,128' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' +``` + +![](../imgs_words_en/word_10.png) + +After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen, an example is as follows: +The result is as follows: +```shell +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999995231628418) +``` + + +### 4.2 C++ Inference + +Not supported + + +### 4.3 Serving + +Not supported + + +### 4.4 More + +Not supported + + +## 5. FAQ + +1. Note that the MJSynth and SynthText datasets come from [ABINet repo](https://github.com/FangShancheng/ABINet). +2. We use the pre-trained model provided by the ABINet authors for finetune training. + +## Citation + +```bibtex +@article{Fang2021ABINet, + title = {ABINet: Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition}, + author = {Shancheng Fang and Hongtao Xie and Yuxin Wang and Zhendong Mao and Yongdong Zhang}, + booktitle = {CVPR}, + year = {2021}, + url = {https://arxiv.org/abs/2103.06495}, + pages = {7098-7107} +} +``` diff --git a/doc/doc_en/algorithm_rec_nrtr_en.md b/doc/doc_en/algorithm_rec_nrtr_en.md index 40c9b91629964dfaed74bf96aaa947f3fce99446..309d7ab123065f15a599a1220ab3f39afffb9b60 100644 --- a/doc/doc_en/algorithm_rec_nrtr_en.md +++ b/doc/doc_en/algorithm_rec_nrtr_en.md @@ -12,6 +12,7 @@ - [4.3 Serving](#4-3) - [4.4 More](#4-4) - [5. FAQ](#5) +- [6. Release Note](#6) ## 1. Introduction @@ -25,7 +26,7 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval |Model|Backbone|config|Acc|Download link| | --- | --- | --- | --- | --- | -|NRTR|MTB|[rec_mtb_nrtr.yml](../../configs/rec/rec_mtb_nrtr.yml)|84.21%|[train model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar)| +|NRTR|MTB|[rec_mtb_nrtr.yml](../../configs/rec/rec_mtb_nrtr.yml)|84.21%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar)| ## 2. Environment @@ -98,7 +99,7 @@ python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen, an example is as follows: The result is as follows: ```shell -Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9265879392623901) +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9465042352676392) ``` @@ -121,12 +122,146 @@ Not supported 1. In the `NRTR` paper, Beam search is used to decode characters, but the speed is slow. Beam search is not used by default here, and greedy search is used to decode characters. + +## 6. Release Note + +1. The release/2.6 version updates the NRTR code structure. 
The new version of NRTR can load the model parameters of the old version (release/2.5 and before), and you may use the following code to convert the old version model parameters to the new version model parameters: + +```python + + params = paddle.load('path/' + '.pdparams') # the old version parameters + state_dict = model.state_dict() # the new version model parameters + new_state_dict = {} + + for k1, v1 in state_dict.items(): + + k = k1 + if 'encoder' in k and 'self_attn' in k and 'qkv' in k and 'weight' in k: + + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')].transpose((1, 0, 2, 3)) + k = params[k_para.replace('qkv', 'conv2')].transpose((1, 0, 2, 3)) + v = params[k_para.replace('qkv', 'conv3')].transpose((1, 0, 2, 3)) + + new_state_dict[k1] = np.concatenate([q[:, :, 0, 0], k[:, :, 0, 0], v[:, :, 0, 0]], -1) + + elif 'encoder' in k and 'self_attn' in k and 'qkv' in k and 'bias' in k: + + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')] + k = params[k_para.replace('qkv', 'conv2')] + v = params[k_para.replace('qkv', 'conv3')] + + new_state_dict[k1] = np.concatenate([q, k, v], -1) + + elif 'encoder' in k and 'self_attn' in k and 'out_proj' in k: + + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + + elif 'encoder' in k and 'norm3' in k: + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para.replace('norm3', 'norm2')] + + elif 'encoder' in k and 'norm1' in k: + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + + + elif 'decoder' in k and 'self_attn' in k and 'qkv' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')].transpose((1, 0, 2, 3)) + k = params[k_para.replace('qkv', 'conv2')].transpose((1, 0, 2, 3)) + v = params[k_para.replace('qkv', 'conv3')].transpose((1, 0, 2, 3)) + new_state_dict[k1] = np.concatenate([q[:, :, 0, 0], k[:, :, 0, 0], v[:, :, 0, 0]], -1) + + elif 'decoder' in k and 'self_attn' in k and 'qkv' in k and 'bias' in k: + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')] + k = params[k_para.replace('qkv', 'conv2')] + v = params[k_para.replace('qkv', 'conv3')] + new_state_dict[k1] = np.concatenate([q, k, v], -1) + + elif 'decoder' in k and 'self_attn' in k and 'out_proj' in k: + + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + + elif 'decoder' in k and 'cross_attn' in k and 'q' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + q = params[k_para.replace('q', 'conv1')].transpose((1, 0, 2, 3)) + new_state_dict[k1] = q[:, :, 0, 0] + + elif 'decoder' in k and 'cross_attn' in k and 'q' in k and 'bias' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + q = params[k_para.replace('q', 'conv1')] + new_state_dict[k1] = q + + elif 'decoder' in k and 'cross_attn' in k and 'kv' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + k = params[k_para.replace('kv', 'conv2')].transpose((1, 0, 2, 3)) + v = params[k_para.replace('kv', 'conv3')].transpose((1, 0, 2, 3)) + new_state_dict[k1] = np.concatenate([k[:, :, 0, 0], v[:, :, 0, 0]], -1) + + elif 'decoder' in k and 'cross_attn' in k and 'kv' in k and 'bias' in k: + k_para = k[:13] + 'layers.' 
+ k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + k = params[k_para.replace('kv', 'conv2')] + v = params[k_para.replace('kv', 'conv3')] + new_state_dict[k1] = np.concatenate([k, v], -1) + + elif 'decoder' in k and 'cross_attn' in k and 'out_proj' in k: + + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + new_state_dict[k1] = params[k_para] + elif 'decoder' in k and 'norm' in k: + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + elif 'mlp' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('fc', 'conv') + k_para = k_para.replace('mlp.', '') + w = params[k_para].transpose((1, 0, 2, 3)) + new_state_dict[k1] = w[:, :, 0, 0] + elif 'mlp' in k and 'bias' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('fc', 'conv') + k_para = k_para.replace('mlp.', '') + w = params[k_para] + new_state_dict[k1] = w + + else: + new_state_dict[k1] = params[k1] + + if list(new_state_dict[k1].shape) != list(v1.shape): + print(k1) + + + for k, v1 in state_dict.items(): + if k not in new_state_dict.keys(): + print(1, k) + elif list(new_state_dict[k].shape) != list(v1.shape): + print(2, k) + + + + model.set_state_dict(new_state_dict) + paddle.save(model.state_dict(), 'nrtrnew_from_old_params.pdparams') + +``` + +2. The new version has a clean code structure and improved inference speed compared with the old version. + ## Citation ```bibtex @article{Sheng2019NRTR, title = {NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition}, - author = {Fenfen Sheng and Zhineng Chen andBo Xu}, + author = {Fenfen Sheng and Zhineng Chen and Bo Xu}, booktitle = {ICDAR}, year = {2019}, url = {http://arxiv.org/abs/1806.00926}, diff --git a/doc/doc_en/algorithm_rec_vitstr_en.md b/doc/doc_en/algorithm_rec_vitstr_en.md index b6f26d39e24755e2ea9c055819eeff7d2d3dd592..a6f9e2f15df69e7949a4b9713274d9b83ff98f60 100644 --- a/doc/doc_en/algorithm_rec_vitstr_en.md +++ b/doc/doc_en/algorithm_rec_vitstr_en.md @@ -25,7 +25,7 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval |Model|Backbone|config|Acc|Download link| | --- | --- | --- | --- | --- | -|ViTSTR|ViTSTR|[rec_vitstr_none_ce.yml](../../configs/rec/rec_vitstr_none_ce.yml)|79.82%|[训练模型](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar)| +|ViTSTR|ViTSTR|[rec_vitstr_none_ce.yml](../../configs/rec/rec_vitstr_none_ce.yml)|79.82%|[trained model](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar)| ## 2. 
Environment diff --git a/ppocr/data/imaug/__init__.py b/ppocr/data/imaug/__init__.py index 88c9603fb00a315e9df22dec27ac850c277089ec..58e8a5c780558631bc1d7f9cc178907e8f163bee 100644 --- a/ppocr/data/imaug/__init__.py +++ b/ppocr/data/imaug/__init__.py @@ -24,7 +24,7 @@ from .make_pse_gt import MakePseGt from .rec_img_aug import RecAug, RecConAug, RecResizeImg, ClsResizeImg, \ - SRNRecResizeImg, GrayRecResizeImg, SARRecResizeImg, PRENResizeImg + SRNRecResizeImg, GrayRecResizeImg, SARRecResizeImg, PRENResizeImg, ABINetRecResizeImg from .ssl_img_aug import SSLRotateResize from .randaugment import RandAugment from .copy_paste import CopyPaste diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py index 0366e3f7854513b79350dea4ddd6b29178c7fffc..d96ecf639a68bd6550ce731813644dd785ef1bb8 100644 --- a/ppocr/data/imaug/label_ops.py +++ b/ppocr/data/imaug/label_ops.py @@ -157,37 +157,6 @@ class BaseRecLabelEncode(object): return text_list -class NRTRLabelEncode(BaseRecLabelEncode): - """ Convert between text-label and text-index """ - - def __init__(self, - max_text_length, - character_dict_path=None, - use_space_char=False, - **kwargs): - - super(NRTRLabelEncode, self).__init__( - max_text_length, character_dict_path, use_space_char) - - def __call__(self, data): - text = data['label'] - text = self.encode(text) - if text is None: - return None - if len(text) >= self.max_text_len - 1: - return None - data['length'] = np.array(len(text)) - text.insert(0, 2) - text.append(3) - text = text + [0] * (self.max_text_len - len(text)) - data['label'] = np.array(text) - return data - - def add_special_char(self, dict_character): - dict_character = ['blank', '', '', ''] + dict_character - return dict_character - - class CTCLabelEncode(BaseRecLabelEncode): """ Convert between text-label and text-index """ @@ -840,37 +809,6 @@ class PRENLabelEncode(BaseRecLabelEncode): return data -class ViTSTRLabelEncode(BaseRecLabelEncode): - """ Convert between text-label and text-index """ - - def __init__(self, - max_text_length, - character_dict_path=None, - use_space_char=False, - **kwargs): - - super(ViTSTRLabelEncode, self).__init__( - max_text_length, character_dict_path, use_space_char) - - def __call__(self, data): - text = data['label'] - text = self.encode(text) - if text is None: - return None - if len(text) >= self.max_text_len: - return None - data['length'] = np.array(len(text)) - text.insert(0, 0) - text.append(1) - text = text + [0] * (self.max_text_len + 2 - len(text)) - data['label'] = np.array(text) - return data - - def add_special_char(self, dict_character): - dict_character = ['', ''] + dict_character - return dict_character - - class VQATokenLabelEncode(object): """ Label encode for NLP VQA methods @@ -1077,3 +1015,99 @@ class MultiLabelEncode(BaseRecLabelEncode): data_out['label_sar'] = sar['label'] data_out['length'] = ctc['length'] return data_out + + +class NRTRLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + + super(NRTRLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len - 1: + return None + data['length'] = np.array(len(text)) + text.insert(0, 2) + text.append(3) + text = text + [0] * (self.max_text_len - len(text)) + data['label'] = np.array(text) + return 
data + + def add_special_char(self, dict_character): + dict_character = ['blank', '', '', ''] + dict_character + return dict_character + + +class ViTSTRLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + ignore_index=0, + **kwargs): + + super(ViTSTRLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + self.ignore_index = ignore_index + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len: + return None + data['length'] = np.array(len(text)) + text.insert(0, self.ignore_index) + text.append(1) + text = text + [self.ignore_index] * (self.max_text_len + 2 - len(text)) + data['label'] = np.array(text) + return data + + def add_special_char(self, dict_character): + dict_character = ['', ''] + dict_character + return dict_character + + +class ABINetLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + ignore_index=100, + **kwargs): + + super(ABINetLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + self.ignore_index = ignore_index + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len: + return None + data['length'] = np.array(len(text)) + text.append(0) + text = text + [self.ignore_index] * (self.max_text_len + 1 - len(text)) + data['label'] = np.array(text) + return data + + def add_special_char(self, dict_character): + dict_character = [''] + dict_character + return dict_character diff --git a/ppocr/data/imaug/operators.py b/ppocr/data/imaug/operators.py index 09736515e7a388e191a12826e1e9e348e2fcde86..5397d71ccb466235e64f85e1eb9365ba03d2aa17 100644 --- a/ppocr/data/imaug/operators.py +++ b/ppocr/data/imaug/operators.py @@ -67,39 +67,6 @@ class DecodeImage(object): return data -class NRTRDecodeImage(object): - """ decode image """ - - def __init__(self, img_mode='RGB', channel_first=False, **kwargs): - self.img_mode = img_mode - self.channel_first = channel_first - - def __call__(self, data): - img = data['image'] - if six.PY2: - assert type(img) is str and len( - img) > 0, "invalid input 'img' in DecodeImage" - else: - assert type(img) is bytes and len( - img) > 0, "invalid input 'img' in DecodeImage" - img = np.frombuffer(img, dtype='uint8') - - img = cv2.imdecode(img, 1) - - if img is None: - return None - if self.img_mode == 'GRAY': - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - elif self.img_mode == 'RGB': - assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) - img = img[:, :, ::-1] - img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - if self.channel_first: - img = img.transpose((2, 0, 1)) - data['image'] = img - return data - - class NormalizeImage(object): """ normalize image such as substract mean, divide std """ diff --git a/ppocr/data/imaug/rec_img_aug.py b/ppocr/data/imaug/rec_img_aug.py index d3120f595052d95a7abd7bb347c841e5628d43fb..a8b3b813b3397a35c909f0356743a3d548e1d1b6 100644 --- a/ppocr/data/imaug/rec_img_aug.py +++ b/ppocr/data/imaug/rec_img_aug.py @@ -279,6 +279,24 @@ class PRENResizeImg(object): return data +class ABINetRecResizeImg(object): + def __init__(self, + image_shape, + infer_mode=False, + character_dict_path=None, + **kwargs): + self.image_shape = 
image_shape + self.infer_mode = infer_mode + self.character_dict_path = character_dict_path + + def __call__(self, data): + img = data['image'] + norm_img, valid_ratio = resize_norm_img_abinet(img, self.image_shape) + data['image'] = norm_img + data['valid_ratio'] = valid_ratio + return data + + def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25): imgC, imgH, imgW_min, imgW_max = image_shape h = img.shape[0] @@ -397,6 +415,26 @@ def resize_norm_img_srn(img, image_shape): return np.reshape(img_black, (c, row, col)).astype(np.float32) +def resize_norm_img_abinet(img, image_shape): + imgC, imgH, imgW = image_shape + + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) + resized_w = imgW + resized_image = resized_image.astype('float32') + resized_image = resized_image / 255. + + mean = np.array([0.485, 0.456, 0.406]) + std = np.array([0.229, 0.224, 0.225]) + resized_image = ( + resized_image - mean[None, None, ...]) / std[None, None, ...] + resized_image = resized_image.transpose((2, 0, 1)) + resized_image = resized_image.astype('float32') + + valid_ratio = min(1.0, float(resized_w / imgW)) + return resized_image, valid_ratio + + def srn_other_inputs(image_shape, num_heads, max_text_length): imgC, imgH, imgW = image_shape diff --git a/ppocr/losses/rec_ce_loss.py b/ppocr/losses/rec_ce_loss.py index b837ac27bbc15007ab2fa0e02a2b5b0b447cf6fb..614384de863c15b106aef831f8e938b89dadc246 100644 --- a/ppocr/losses/rec_ce_loss.py +++ b/ppocr/losses/rec_ce_loss.py @@ -4,31 +4,63 @@ import paddle.nn.functional as F class CELoss(nn.Layer): - def __init__(self, smoothing=True, with_all=False, **kwargs): + def __init__(self, + smoothing=False, + with_all=False, + ignore_index=-1, + **kwargs): super(CELoss, self).__init__() - self.loss_func = nn.CrossEntropyLoss(reduction='mean', ignore_index=0) + if ignore_index >= 0: + self.loss_func = nn.CrossEntropyLoss( + reduction='mean', ignore_index=ignore_index) + else: + self.loss_func = nn.CrossEntropyLoss(reduction='mean') self.smoothing = smoothing self.with_all = with_all def forward(self, pred, batch): - pred = pred.reshape([-1, pred.shape[2]]) - if self.with_all: - tgt = batch[1] - else: - max_len = batch[2].max() - tgt = batch[1][:, 1:2 + max_len] - tgt = tgt.reshape([-1]) - if self.smoothing: - eps = 0.1 - n_class = pred.shape[1] - one_hot = F.one_hot(tgt, pred.shape[1]) - one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1) - log_prb = F.log_softmax(pred, axis=1) - non_pad_mask = paddle.not_equal( - tgt, paddle.zeros( - tgt.shape, dtype=tgt.dtype)) - loss = -(one_hot * log_prb).sum(axis=1) - loss = loss.masked_select(non_pad_mask).mean() + + if isinstance(pred, dict): # for ABINet + loss = {} + loss_sum = [] + for name, logits in pred.items(): + if isinstance(logits, list): + logit_num = len(logits) + all_tgt = paddle.concat([batch[1]] * logit_num, 0) + all_logits = paddle.concat(logits, 0) + flt_logtis = all_logits.reshape([-1, all_logits.shape[2]]) + flt_tgt = all_tgt.reshape([-1]) + else: + flt_logtis = logits.reshape([-1, logits.shape[2]]) + flt_tgt = batch[1].reshape([-1]) + loss[name + '_loss'] = self.loss_func(flt_logtis, flt_tgt) + loss_sum.append(loss[name + '_loss']) + loss['loss'] = sum(loss_sum) + return loss else: - loss = self.loss_func(pred, tgt) - return {'loss': loss} + if self.with_all: # for ViTSTR + tgt = batch[1] + pred = pred.reshape([-1, pred.shape[2]]) + tgt = tgt.reshape([-1]) + loss = self.loss_func(pred, tgt) + return {'loss': loss} + else: # for NRTR + max_len = 
batch[2].max() + tgt = batch[1][:, 1:2 + max_len] + pred = pred.reshape([-1, pred.shape[2]]) + tgt = tgt.reshape([-1]) + if self.smoothing: + eps = 0.1 + n_class = pred.shape[1] + one_hot = F.one_hot(tgt, pred.shape[1]) + one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / ( + n_class - 1) + log_prb = F.log_softmax(pred, axis=1) + non_pad_mask = paddle.not_equal( + tgt, paddle.zeros( + tgt.shape, dtype=tgt.dtype)) + loss = -(one_hot * log_prb).sum(axis=1) + loss = loss.masked_select(non_pad_mask).mean() + else: + loss = self.loss_func(pred, tgt) + return {'loss': loss} diff --git a/ppocr/modeling/backbones/__init__.py b/ppocr/modeling/backbones/__init__.py index a368e7481628cd836963618cd4cbfca12ba2080b..56d2c2b9dba508bcb05b6e57e4df1292bb13c299 100755 --- a/ppocr/modeling/backbones/__init__.py +++ b/ppocr/modeling/backbones/__init__.py @@ -27,7 +27,7 @@ def build_backbone(config, model_type): from .rec_resnet_fpn import ResNetFPN from .rec_mv1_enhance import MobileNetV1Enhance from .rec_nrtr_mtb import MTB - from .rec_resnet_31 import ResNet31 + from .rec_resnet import ResNet31, ResNet45 from .rec_resnet_aster import ResNet_ASTER from .rec_micronet import MicroNet from .rec_efficientb3_pren import EfficientNetb3_PREN @@ -35,29 +35,29 @@ def build_backbone(config, model_type): from .rec_vitstr import ViTSTR support_dict = [ 'MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 'ResNetFPN', 'MTB', - "ResNet31", "ResNet_ASTER", 'MicroNet', 'EfficientNetb3_PREN', - 'SVTRNet', 'ViTSTR' + 'ResNet31', 'ResNet45', 'ResNet_ASTER', 'MicroNet', + 'EfficientNetb3_PREN', 'SVTRNet', 'ViTSTR' ] - elif model_type == "e2e": + elif model_type == 'e2e': from .e2e_resnet_vd_pg import ResNet support_dict = ['ResNet'] elif model_type == 'kie': from .kie_unet_sdmgr import Kie_backbone support_dict = ['Kie_backbone'] - elif model_type == "table": + elif model_type == 'table': from .table_resnet_vd import ResNet from .table_mobilenet_v3 import MobileNetV3 - support_dict = ["ResNet", "MobileNetV3"] + support_dict = ['ResNet', 'MobileNetV3'] elif model_type == 'vqa': from .vqa_layoutlm import LayoutLMForSer, LayoutLMv2ForSer, LayoutLMv2ForRe, LayoutXLMForSer, LayoutXLMForRe support_dict = [ - "LayoutLMForSer", "LayoutLMv2ForSer", 'LayoutLMv2ForRe', - "LayoutXLMForSer", 'LayoutXLMForRe' + 'LayoutLMForSer', 'LayoutLMv2ForSer', 'LayoutLMv2ForRe', + 'LayoutXLMForSer', 'LayoutXLMForRe' ] else: raise NotImplementedError - module_name = config.pop("name") + module_name = config.pop('name') assert module_name in support_dict, Exception( "when model typs is {}, backbone only support {}".format(model_type, support_dict)) diff --git a/ppocr/modeling/backbones/rec_resnet.py b/ppocr/modeling/backbones/rec_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..ce76b642cccf147b912561f349de580a258d0c86 --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnet.py @@ -0,0 +1,280 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/layers/conv_layer.py +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/backbones/resnet31_ocr.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +from paddle.nn.initializer import KaimingNormal +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +import math + +__all__ = ["ResNet31", "ResNet45"] + + +def conv1x1(in_planes, out_planes, stride=1): + return nn.Conv2D( + in_planes, + out_planes, + kernel_size=1, + stride=stride, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + +def conv3x3(in_channel, out_channel, stride=1): + return nn.Conv2D( + in_channel, + out_channel, + kernel_size=3, + stride=stride, + padding=1, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + +class BasicBlock(nn.Layer): + expansion = 1 + + def __init__(self, in_channels, channels, stride=1, downsample=None): + super().__init__() + self.conv1 = conv1x1(in_channels, channels) + self.bn1 = nn.BatchNorm2D(channels) + self.relu = nn.ReLU() + self.conv2 = conv3x3(channels, channels, stride) + self.bn2 = nn.BatchNorm2D(channels) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + out += residual + out = self.relu(out) + + return out + + +class ResNet31(nn.Layer): + ''' + Args: + in_channels (int): Number of channels of input image tensor. + layers (list[int]): List of BasicBlock number for each stage. + channels (list[int]): List of out_channels of Conv2d layer. + out_indices (None | Sequence[int]): Indices of output stages. + last_stage_pool (bool): If True, add `MaxPool2d` layer to last stage. 
+ ''' + + def __init__(self, + in_channels=3, + layers=[1, 2, 5, 3], + channels=[64, 128, 256, 256, 512, 512, 512], + out_indices=None, + last_stage_pool=False): + super(ResNet31, self).__init__() + assert isinstance(in_channels, int) + assert isinstance(last_stage_pool, bool) + + self.out_indices = out_indices + self.last_stage_pool = last_stage_pool + + # conv 1 (Conv Conv) + self.conv1_1 = nn.Conv2D( + in_channels, channels[0], kernel_size=3, stride=1, padding=1) + self.bn1_1 = nn.BatchNorm2D(channels[0]) + self.relu1_1 = nn.ReLU() + + self.conv1_2 = nn.Conv2D( + channels[0], channels[1], kernel_size=3, stride=1, padding=1) + self.bn1_2 = nn.BatchNorm2D(channels[1]) + self.relu1_2 = nn.ReLU() + + # conv 2 (Max-pooling, Residual block, Conv) + self.pool2 = nn.MaxPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block2 = self._make_layer(channels[1], channels[2], layers[0]) + self.conv2 = nn.Conv2D( + channels[2], channels[2], kernel_size=3, stride=1, padding=1) + self.bn2 = nn.BatchNorm2D(channels[2]) + self.relu2 = nn.ReLU() + + # conv 3 (Max-pooling, Residual block, Conv) + self.pool3 = nn.MaxPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block3 = self._make_layer(channels[2], channels[3], layers[1]) + self.conv3 = nn.Conv2D( + channels[3], channels[3], kernel_size=3, stride=1, padding=1) + self.bn3 = nn.BatchNorm2D(channels[3]) + self.relu3 = nn.ReLU() + + # conv 4 (Max-pooling, Residual block, Conv) + self.pool4 = nn.MaxPool2D( + kernel_size=(2, 1), stride=(2, 1), padding=0, ceil_mode=True) + self.block4 = self._make_layer(channels[3], channels[4], layers[2]) + self.conv4 = nn.Conv2D( + channels[4], channels[4], kernel_size=3, stride=1, padding=1) + self.bn4 = nn.BatchNorm2D(channels[4]) + self.relu4 = nn.ReLU() + + # conv 5 ((Max-pooling), Residual block, Conv) + self.pool5 = None + if self.last_stage_pool: + self.pool5 = nn.MaxPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block5 = self._make_layer(channels[4], channels[5], layers[3]) + self.conv5 = nn.Conv2D( + channels[5], channels[5], kernel_size=3, stride=1, padding=1) + self.bn5 = nn.BatchNorm2D(channels[5]) + self.relu5 = nn.ReLU() + + self.out_channels = channels[-1] + + def _make_layer(self, input_channels, output_channels, blocks): + layers = [] + for _ in range(blocks): + downsample = None + if input_channels != output_channels: + downsample = nn.Sequential( + nn.Conv2D( + input_channels, + output_channels, + kernel_size=1, + stride=1, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False), + nn.BatchNorm2D(output_channels), ) + + layers.append( + BasicBlock( + input_channels, output_channels, downsample=downsample)) + input_channels = output_channels + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1_1(x) + x = self.bn1_1(x) + x = self.relu1_1(x) + + x = self.conv1_2(x) + x = self.bn1_2(x) + x = self.relu1_2(x) + + outs = [] + for i in range(4): + layer_index = i + 2 + pool_layer = getattr(self, f'pool{layer_index}') + block_layer = getattr(self, f'block{layer_index}') + conv_layer = getattr(self, f'conv{layer_index}') + bn_layer = getattr(self, f'bn{layer_index}') + relu_layer = getattr(self, f'relu{layer_index}') + + if pool_layer is not None: + x = pool_layer(x) + x = block_layer(x) + x = conv_layer(x) + x = bn_layer(x) + x = relu_layer(x) + + outs.append(x) + + if self.out_indices is not None: + return tuple([outs[i] for i in self.out_indices]) + + return x + + +class ResNet(nn.Layer): + def __init__(self, block, 
layers, in_channels=3): + self.inplanes = 32 + super(ResNet, self).__init__() + self.conv1 = nn.Conv2D( + 3, + 32, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + self.bn1 = nn.BatchNorm2D(32) + self.relu = nn.ReLU() + + self.layer1 = self._make_layer(block, 32, layers[0], stride=2) + self.layer2 = self._make_layer(block, 64, layers[1], stride=1) + self.layer3 = self._make_layer(block, 128, layers[2], stride=2) + self.layer4 = self._make_layer(block, 256, layers[3], stride=1) + self.layer5 = self._make_layer(block, 512, layers[4], stride=1) + self.out_channels = 512 + + # for m in self.modules(): + # if isinstance(m, nn.Conv2D): + # n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + # m.weight.data.normal_(0, math.sqrt(2. / n)) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + # downsample = True + downsample = nn.Sequential( + nn.Conv2D( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False), + nn.BatchNorm2D(planes * block.expansion), ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + # print(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + # print(x) + x = self.layer4(x) + x = self.layer5(x) + return x + + +def ResNet45(in_channels=3): + return ResNet(BasicBlock, [3, 4, 6, 6, 3], in_channels=in_channels) diff --git a/ppocr/modeling/heads/__init__.py b/ppocr/modeling/heads/__init__.py index 1670ea38e66baa683e6faab0ec4b12bc517f3c41..14e6aab854d8942083f1e3466f554c0876bcf403 100755 --- a/ppocr/modeling/heads/__init__.py +++ b/ppocr/modeling/heads/__init__.py @@ -33,6 +33,7 @@ def build_head(config): from .rec_aster_head import AsterHead from .rec_pren_head import PRENHead from .rec_multi_head import MultiHead + from .rec_abinet_head import ABINetHead # cls head from .cls_head import ClsHead @@ -46,7 +47,7 @@ def build_head(config): 'DBHead', 'PSEHead', 'FCEHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead', 'AttentionHead', 'SRNHead', 'PGHead', 'Transformer', 'TableAttentionHead', 'SARHead', 'AsterHead', 'SDMGRHead', 'PRENHead', - 'MultiHead' + 'MultiHead', 'ABINetHead' ] #table head diff --git a/ppocr/modeling/heads/multiheadAttention.py b/ppocr/modeling/heads/multiheadAttention.py deleted file mode 100755 index 900865ba1a8d80a108b3247ce1aff91c242860f2..0000000000000000000000000000000000000000 --- a/ppocr/modeling/heads/multiheadAttention.py +++ /dev/null @@ -1,163 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle -from paddle import nn -import paddle.nn.functional as F -from paddle.nn import Linear -from paddle.nn.initializer import XavierUniform as xavier_uniform_ -from paddle.nn.initializer import Constant as constant_ -from paddle.nn.initializer import XavierNormal as xavier_normal_ - -zeros_ = constant_(value=0.) -ones_ = constant_(value=1.) - - -class MultiheadAttention(nn.Layer): - """Allows the model to jointly attend to information - from different representation subspaces. - See reference: Attention Is All You Need - - .. math:: - \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O - \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) - - Args: - embed_dim: total dimension of the model - num_heads: parallel attention layers, or heads - - """ - - def __init__(self, - embed_dim, - num_heads, - dropout=0., - bias=True, - add_bias_kv=False, - add_zero_attn=False): - super(MultiheadAttention, self).__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - self.dropout = dropout - self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" - self.scaling = self.head_dim**-0.5 - self.out_proj = Linear(embed_dim, embed_dim, bias_attr=bias) - self._reset_parameters() - self.conv1 = paddle.nn.Conv2D( - in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1)) - self.conv2 = paddle.nn.Conv2D( - in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1)) - self.conv3 = paddle.nn.Conv2D( - in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1)) - - def _reset_parameters(self): - xavier_uniform_(self.out_proj.weight) - - def forward(self, - query, - key, - value, - key_padding_mask=None, - incremental_state=None, - attn_mask=None): - """ - Inputs of forward function - query: [target length, batch size, embed dim] - key: [sequence length, batch size, embed dim] - value: [sequence length, batch size, embed dim] - key_padding_mask: if True, mask padding based on batch size - incremental_state: if provided, previous time steps are cashed - need_weights: output attn_output_weights - static_kv: key and value are static - - Outputs of forward function - attn_output: [target length, batch size, embed dim] - attn_output_weights: [batch size, target length, sequence length] - """ - q_shape = paddle.shape(query) - src_shape = paddle.shape(key) - q = self._in_proj_q(query) - k = self._in_proj_k(key) - v = self._in_proj_v(value) - q *= self.scaling - q = paddle.transpose( - paddle.reshape( - q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]), - [1, 2, 0, 3]) - k = paddle.transpose( - paddle.reshape( - k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]), - [1, 2, 0, 3]) - v = paddle.transpose( - paddle.reshape( - v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]), - [1, 2, 0, 3]) - if key_padding_mask is not None: - assert key_padding_mask.shape[0] == q_shape[1] - assert key_padding_mask.shape[1] == src_shape[0] - attn_output_weights = paddle.matmul(q, - paddle.transpose(k, [0, 1, 3, 2])) - if attn_mask is not None: - attn_mask = paddle.unsqueeze(paddle.unsqueeze(attn_mask, 0), 0) - attn_output_weights += attn_mask - if key_padding_mask is not None: - attn_output_weights = paddle.reshape( - attn_output_weights, - [q_shape[1], self.num_heads, q_shape[0], src_shape[0]]) - key = paddle.unsqueeze(paddle.unsqueeze(key_padding_mask, 1), 2) - key = paddle.cast(key, 'float32') - y = paddle.full( - shape=paddle.shape(key), dtype='float32', 
fill_value='-inf') - y = paddle.where(key == 0., key, y) - attn_output_weights += y - attn_output_weights = F.softmax( - attn_output_weights.astype('float32'), - axis=-1, - dtype=paddle.float32 if attn_output_weights.dtype == paddle.float16 - else attn_output_weights.dtype) - attn_output_weights = F.dropout( - attn_output_weights, p=self.dropout, training=self.training) - - attn_output = paddle.matmul(attn_output_weights, v) - attn_output = paddle.reshape( - paddle.transpose(attn_output, [2, 0, 1, 3]), - [q_shape[0], q_shape[1], self.embed_dim]) - attn_output = self.out_proj(attn_output) - - return attn_output - - def _in_proj_q(self, query): - query = paddle.transpose(query, [1, 2, 0]) - query = paddle.unsqueeze(query, axis=2) - res = self.conv1(query) - res = paddle.squeeze(res, axis=2) - res = paddle.transpose(res, [2, 0, 1]) - return res - - def _in_proj_k(self, key): - key = paddle.transpose(key, [1, 2, 0]) - key = paddle.unsqueeze(key, axis=2) - res = self.conv2(key) - res = paddle.squeeze(res, axis=2) - res = paddle.transpose(res, [2, 0, 1]) - return res - - def _in_proj_v(self, value): - value = paddle.transpose(value, [1, 2, 0]) #(1, 2, 0) - value = paddle.unsqueeze(value, axis=2) - res = self.conv3(value) - res = paddle.squeeze(res, axis=2) - res = paddle.transpose(res, [2, 0, 1]) - return res diff --git a/ppocr/modeling/heads/rec_abinet_head.py b/ppocr/modeling/heads/rec_abinet_head.py new file mode 100644 index 0000000000000000000000000000000000000000..a0f60f1be1727e85380eedb7d311ce9445f88b8e --- /dev/null +++ b/ppocr/modeling/heads/rec_abinet_head.py @@ -0,0 +1,296 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/FangShancheng/ABINet/tree/main/modules +""" + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle.nn import LayerList +from ppocr.modeling.heads.rec_nrtr_head import TransformerBlock, PositionalEncoding + + +class BCNLanguage(nn.Layer): + def __init__(self, + d_model=512, + nhead=8, + num_layers=4, + dim_feedforward=2048, + dropout=0., + max_length=25, + detach=True, + num_classes=37): + super().__init__() + + self.d_model = d_model + self.detach = detach + self.max_length = max_length + 1 # additional stop token + self.proj = nn.Linear(num_classes, d_model, bias_attr=False) + self.token_encoder = PositionalEncoding( + dropout=0.1, dim=d_model, max_len=self.max_length) + self.pos_encoder = PositionalEncoding( + dropout=0, dim=d_model, max_len=self.max_length) + + self.decoder = nn.LayerList([ + TransformerBlock( + d_model=d_model, + nhead=nhead, + dim_feedforward=dim_feedforward, + attention_dropout_rate=dropout, + residual_dropout_rate=dropout, + with_self_attn=False, + with_cross_attn=True) for i in range(num_layers) + ]) + + self.cls = nn.Linear(d_model, num_classes) + + def forward(self, tokens, lengths): + """ + Args: + tokens: (B, N, C) where N is length, B is batch size and C is classes number + lengths: (B,) + """ + if self.detach: tokens = tokens.detach() + embed = self.proj(tokens) # (B, N, C) + embed = self.token_encoder(embed) # (B, N, C) + padding_mask = _get_mask(lengths, self.max_length) + zeros = paddle.zeros_like(embed) # (B, N, C) + qeury = self.pos_encoder(zeros) + for decoder_layer in self.decoder: + qeury = decoder_layer(qeury, embed, cross_mask=padding_mask) + output = qeury # (B, N, C) + + logits = self.cls(output) # (B, N, C) + + return output, logits + + +def encoder_layer(in_c, out_c, k=3, s=2, p=1): + return nn.Sequential( + nn.Conv2D(in_c, out_c, k, s, p), nn.BatchNorm2D(out_c), nn.ReLU()) + + +def decoder_layer(in_c, + out_c, + k=3, + s=1, + p=1, + mode='nearest', + scale_factor=None, + size=None): + align_corners = False if mode == 'nearest' else True + return nn.Sequential( + nn.Upsample( + size=size, + scale_factor=scale_factor, + mode=mode, + align_corners=align_corners), + nn.Conv2D(in_c, out_c, k, s, p), + nn.BatchNorm2D(out_c), + nn.ReLU()) + + +class PositionAttention(nn.Layer): + def __init__(self, + max_length, + in_channels=512, + num_channels=64, + h=8, + w=32, + mode='nearest', + **kwargs): + super().__init__() + self.max_length = max_length + self.k_encoder = nn.Sequential( + encoder_layer( + in_channels, num_channels, s=(1, 2)), + encoder_layer( + num_channels, num_channels, s=(2, 2)), + encoder_layer( + num_channels, num_channels, s=(2, 2)), + encoder_layer( + num_channels, num_channels, s=(2, 2))) + self.k_decoder = nn.Sequential( + decoder_layer( + num_channels, num_channels, scale_factor=2, mode=mode), + decoder_layer( + num_channels, num_channels, scale_factor=2, mode=mode), + decoder_layer( + num_channels, num_channels, scale_factor=2, mode=mode), + decoder_layer( + num_channels, in_channels, size=(h, w), mode=mode)) + + self.pos_encoder = PositionalEncoding( + dropout=0, dim=in_channels, max_len=max_length) + self.project = nn.Linear(in_channels, in_channels) + + def forward(self, x): + B, C, H, W = x.shape + k, v = x, x + + # calculate key vector + features = [] + for i in range(0, len(self.k_encoder)): + k = self.k_encoder[i](k) + features.append(k) + for i in range(0, len(self.k_decoder) - 1): + k = self.k_decoder[i](k) + # print(k.shape, 
features[len(self.k_decoder) - 2 - i].shape) + k = k + features[len(self.k_decoder) - 2 - i] + k = self.k_decoder[-1](k) + + # calculate query vector + # TODO q=f(q,k) + zeros = paddle.zeros( + (B, self.max_length, C), dtype=x.dtype) # (T, N, C) + q = self.pos_encoder(zeros) # (B, N, C) + q = self.project(q) # (B, N, C) + + # calculate attention + attn_scores = q @k.flatten(2) # (B, N, (H*W)) + attn_scores = attn_scores / (C**0.5) + attn_scores = F.softmax(attn_scores, axis=-1) + + v = v.flatten(2).transpose([0, 2, 1]) # (B, (H*W), C) + attn_vecs = attn_scores @v # (B, N, C) + + return attn_vecs, attn_scores.reshape([0, self.max_length, H, W]) + + +class ABINetHead(nn.Layer): + def __init__(self, + in_channels, + out_channels, + d_model=512, + nhead=8, + num_layers=3, + dim_feedforward=2048, + dropout=0.1, + max_length=25, + use_lang=False, + iter_size=1): + super().__init__() + self.max_length = max_length + 1 + self.pos_encoder = PositionalEncoding( + dropout=0.1, dim=d_model, max_len=8 * 32) + self.encoder = nn.LayerList([ + TransformerBlock( + d_model=d_model, + nhead=nhead, + dim_feedforward=dim_feedforward, + attention_dropout_rate=dropout, + residual_dropout_rate=dropout, + with_self_attn=True, + with_cross_attn=False) for i in range(num_layers) + ]) + self.decoder = PositionAttention( + max_length=max_length + 1, # additional stop token + mode='nearest', ) + self.out_channels = out_channels + self.cls = nn.Linear(d_model, self.out_channels) + self.use_lang = use_lang + if use_lang: + self.iter_size = iter_size + self.language = BCNLanguage( + d_model=d_model, + nhead=nhead, + num_layers=4, + dim_feedforward=dim_feedforward, + dropout=dropout, + max_length=max_length, + num_classes=self.out_channels) + # alignment + self.w_att_align = nn.Linear(2 * d_model, d_model) + self.cls_align = nn.Linear(d_model, self.out_channels) + + def forward(self, x, targets=None): + x = x.transpose([0, 2, 3, 1]) + _, H, W, C = x.shape + feature = x.flatten(1, 2) + feature = self.pos_encoder(feature) + for encoder_layer in self.encoder: + feature = encoder_layer(feature) + feature = feature.reshape([0, H, W, C]).transpose([0, 3, 1, 2]) + v_feature, attn_scores = self.decoder( + feature) # (B, N, C), (B, C, H, W) + vis_logits = self.cls(v_feature) # (B, N, C) + logits = vis_logits + vis_lengths = _get_length(vis_logits) + if self.use_lang: + align_logits = vis_logits + align_lengths = vis_lengths + all_l_res, all_a_res = [], [] + for i in range(self.iter_size): + tokens = F.softmax(align_logits, axis=-1) + lengths = align_lengths + lengths = paddle.clip( + lengths, 2, self.max_length) # TODO:move to langauge model + l_feature, l_logits = self.language(tokens, lengths) + + # alignment + all_l_res.append(l_logits) + fuse = paddle.concat((l_feature, v_feature), -1) + f_att = F.sigmoid(self.w_att_align(fuse)) + output = f_att * v_feature + (1 - f_att) * l_feature + align_logits = self.cls_align(output) # (B, N, C) + + align_lengths = _get_length(align_logits) + all_a_res.append(align_logits) + if self.training: + return { + 'align': all_a_res, + 'lang': all_l_res, + 'vision': vis_logits + } + else: + logits = align_logits + if self.training: + return logits + else: + return F.softmax(logits, -1) + + +def _get_length(logit): + """ Greed decoder to obtain length from logit""" + out = (logit.argmax(-1) == 0) + abn = out.any(-1) + out_int = out.cast('int32') + out = (out_int.cumsum(-1) == 1) & out + out = out.cast('int32') + out = out.argmax(-1) + out = out + 1 + out = paddle.where(abn, out, 
paddle.to_tensor(logit.shape[1])) + return out + + +def _get_mask(length, max_length): + """Generate a square mask for the sequence. The masked positions are filled with float('-inf'). + Unmasked positions are filled with float(0.0). + """ + length = length.unsqueeze(-1) + B = paddle.shape(length)[0] + grid = paddle.arange(0, max_length).unsqueeze(0).tile([B, 1]) + zero_mask = paddle.zeros([B, max_length], dtype='float32') + inf_mask = paddle.full([B, max_length], '-inf', dtype='float32') + diag_mask = paddle.diag( + paddle.full( + [max_length], '-inf', dtype=paddle.float32), + offset=0, + name=None) + mask = paddle.where(grid >= length, inf_mask, zero_mask) + mask = mask.unsqueeze(1) + diag_mask + return mask.unsqueeze(1) diff --git a/ppocr/modeling/heads/rec_nrtr_head.py b/ppocr/modeling/heads/rec_nrtr_head.py index 38ba0c917840ea7d1e2a3c2bf0da32c2c35f2b40..bf9ef56145e6edfb15bd30235b4a62588396ba96 100644 --- a/ppocr/modeling/heads/rec_nrtr_head.py +++ b/ppocr/modeling/heads/rec_nrtr_head.py @@ -14,20 +14,15 @@ import math import paddle -import copy from paddle import nn import paddle.nn.functional as F from paddle.nn import LayerList -from paddle.nn.initializer import XavierNormal as xavier_uniform_ -from paddle.nn import Dropout, Linear, LayerNorm, Conv2D +# from paddle.nn.initializer import XavierNormal as xavier_uniform_ +from paddle.nn import Dropout, Linear, LayerNorm import numpy as np -from ppocr.modeling.heads.multiheadAttention import MultiheadAttention -from paddle.nn.initializer import Constant as constant_ +from ppocr.modeling.backbones.rec_svtrnet import Mlp, zeros_, ones_ from paddle.nn.initializer import XavierNormal as xavier_normal_ -zeros_ = constant_(value=0.) -ones_ = constant_(value=1.) - class Transformer(nn.Layer): """A transformer model. User is able to modify the attributes as needed. The architechture @@ -45,7 +40,6 @@ class Transformer(nn.Layer): dropout: the dropout value (default=0.1). custom_encoder: custom encoder (default=None). custom_decoder: custom decoder (default=None). 
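# --- Illustrative note (not part of the patch): the _get_length helper defined in
# rec_abinet_head.py above performs greedy length decoding: the predicted length is the
# index of the first position whose argmax is the end-of-sequence class (index 0) plus
# one; if no position predicts EOS, the full sequence length is used. _get_mask then
# builds the matching -inf padding mask consumed by the BCN language model. A minimal
# NumPy stand-in of the length logic, with hypothetical names and dummy data:
import numpy as np

def greedy_length(logits):
    # logits: (B, N, num_classes); class 0 is treated as the end-of-sequence token
    eos = logits.argmax(-1) == 0           # (B, N) True where EOS is the top class
    has_eos = eos.any(-1)                  # (B,) whether any position predicts EOS
    first_eos = eos.argmax(-1) + 1         # index of the first EOS, counted as a length
    return np.where(has_eos, first_eos, logits.shape[1])

logits = np.zeros((2, 26, 37))
logits[:, :, 1] = 1.0                      # default prediction: an ordinary character class
logits[0, 4, 0] = 2.0                      # sample 0 predicts EOS at position 4
print(greedy_length(logits))               # -> [ 5 26 ]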
- """ def __init__(self, @@ -54,45 +48,49 @@ class Transformer(nn.Layer): num_encoder_layers=6, beam_size=0, num_decoder_layers=6, + max_len=25, dim_feedforward=1024, attention_dropout_rate=0.0, residual_dropout_rate=0.1, - custom_encoder=None, - custom_decoder=None, in_channels=0, out_channels=0, scale_embedding=True): super(Transformer, self).__init__() self.out_channels = out_channels + 1 + self.max_len = max_len self.embedding = Embeddings( d_model=d_model, vocab=self.out_channels, padding_idx=0, scale_embedding=scale_embedding) self.positional_encoding = PositionalEncoding( - dropout=residual_dropout_rate, - dim=d_model, ) - if custom_encoder is not None: - self.encoder = custom_encoder - else: - if num_encoder_layers > 0: - encoder_layer = TransformerEncoderLayer( - d_model, nhead, dim_feedforward, attention_dropout_rate, - residual_dropout_rate) - self.encoder = TransformerEncoder(encoder_layer, - num_encoder_layers) - else: - self.encoder = None - - if custom_decoder is not None: - self.decoder = custom_decoder + dropout=residual_dropout_rate, dim=d_model) + + if num_encoder_layers > 0: + self.encoder = nn.LayerList([ + TransformerBlock( + d_model, + nhead, + dim_feedforward, + attention_dropout_rate, + residual_dropout_rate, + with_self_attn=True, + with_cross_attn=False) for i in range(num_encoder_layers) + ]) else: - decoder_layer = TransformerDecoderLayer( - d_model, nhead, dim_feedforward, attention_dropout_rate, - residual_dropout_rate) - self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers) + self.encoder = None + + self.decoder = nn.LayerList([ + TransformerBlock( + d_model, + nhead, + dim_feedforward, + attention_dropout_rate, + residual_dropout_rate, + with_self_attn=True, + with_cross_attn=True) for i in range(num_decoder_layers) + ]) - self._reset_parameters() self.beam_size = beam_size self.d_model = d_model self.nhead = nhead @@ -105,7 +103,7 @@ class Transformer(nn.Layer): def _init_weights(self, m): - if isinstance(m, nn.Conv2D): + if isinstance(m, nn.Linear): xavier_normal_(m.weight) if m.bias is not None: zeros_(m.bias) @@ -113,24 +111,20 @@ class Transformer(nn.Layer): def forward_train(self, src, tgt): tgt = tgt[:, :-1] - tgt_key_padding_mask = self.generate_padding_mask(tgt) - tgt = self.embedding(tgt).transpose([1, 0, 2]) + tgt = self.embedding(tgt) tgt = self.positional_encoding(tgt) - tgt_mask = self.generate_square_subsequent_mask(tgt.shape[0]) + tgt_mask = self.generate_square_subsequent_mask(tgt.shape[1]) if self.encoder is not None: - src = self.positional_encoding(src.transpose([1, 0, 2])) - memory = self.encoder(src) + src = self.positional_encoding(src) + for encoder_layer in self.encoder: + src = encoder_layer(src) + memory = src # B N C else: - memory = src.squeeze(2).transpose([2, 0, 1]) - output = self.decoder( - tgt, - memory, - tgt_mask=tgt_mask, - memory_mask=None, - tgt_key_padding_mask=tgt_key_padding_mask, - memory_key_padding_mask=None) - output = output.transpose([1, 0, 2]) + memory = src # B N C + for decoder_layer in self.decoder: + tgt = decoder_layer(tgt, memory, self_mask=tgt_mask) + output = tgt logit = self.tgt_word_prj(output) return logit @@ -140,8 +134,8 @@ class Transformer(nn.Layer): src: the sequence to the encoder (required). tgt: the sequence to the decoder (required). Shape: - - src: :math:`(S, N, E)`. - - tgt: :math:`(T, N, E)`. + - src: :math:`(B, sN, C)`. + - tgt: :math:`(B, tN, C)`. 
Examples: >>> output = transformer_model(src, tgt) """ @@ -157,36 +151,35 @@ class Transformer(nn.Layer): return self.forward_test(src) def forward_test(self, src): + bs = paddle.shape(src)[0] if self.encoder is not None: - src = self.positional_encoding(paddle.transpose(src, [1, 0, 2])) - memory = self.encoder(src) + src = self.positional_encoding(src) + for encoder_layer in self.encoder: + src = encoder_layer(src) + memory = src # B N C else: - memory = paddle.transpose(paddle.squeeze(src, 2), [2, 0, 1]) + memory = src dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64) dec_prob = paddle.full((bs, 1), 1., dtype=paddle.float32) - for len_dec_seq in range(1, 25): - dec_seq_embed = paddle.transpose(self.embedding(dec_seq), [1, 0, 2]) + for len_dec_seq in range(1, self.max_len): + dec_seq_embed = self.embedding(dec_seq) dec_seq_embed = self.positional_encoding(dec_seq_embed) tgt_mask = self.generate_square_subsequent_mask( - paddle.shape(dec_seq_embed)[0]) - output = self.decoder( - dec_seq_embed, - memory, - tgt_mask=tgt_mask, - memory_mask=None, - tgt_key_padding_mask=None, - memory_key_padding_mask=None) - dec_output = paddle.transpose(output, [1, 0, 2]) + paddle.shape(dec_seq_embed)[1]) + tgt = dec_seq_embed + for decoder_layer in self.decoder: + tgt = decoder_layer(tgt, memory, self_mask=tgt_mask) + dec_output = tgt dec_output = dec_output[:, -1, :] - word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1) - preds_idx = paddle.argmax(word_prob, axis=1) + word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=-1) + preds_idx = paddle.argmax(word_prob, axis=-1) if paddle.equal_all( preds_idx, paddle.full( paddle.shape(preds_idx), 3, dtype='int64')): break - preds_prob = paddle.max(word_prob, axis=1) + preds_prob = paddle.max(word_prob, axis=-1) dec_seq = paddle.concat( [dec_seq, paddle.reshape(preds_idx, [-1, 1])], axis=1) dec_prob = paddle.concat( @@ -194,10 +187,10 @@ class Transformer(nn.Layer): return [dec_seq, dec_prob] def forward_beam(self, images): - ''' Translation work in one batch ''' + """ Translation work in one batch """ def get_inst_idx_to_tensor_position_map(inst_idx_list): - ''' Indicate the position of an instance in a tensor. ''' + """ Indicate the position of an instance in a tensor. """ return { inst_idx: tensor_position for tensor_position, inst_idx in enumerate(inst_idx_list) @@ -205,7 +198,7 @@ class Transformer(nn.Layer): def collect_active_part(beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm): - ''' Collect tensor parts associated to active instances. ''' + """ Collect tensor parts associated to active instances. 
""" beamed_tensor_shape = paddle.shape(beamed_tensor) n_curr_active_inst = len(curr_active_inst_idx) @@ -237,9 +230,8 @@ class Transformer(nn.Layer): return active_src_enc, active_inst_idx_to_position_map def beam_decode_step(inst_dec_beams, len_dec_seq, enc_output, - inst_idx_to_position_map, n_bm, - memory_key_padding_mask): - ''' Decode and update beam status, and then return active beam idx ''' + inst_idx_to_position_map, n_bm): + """ Decode and update beam status, and then return active beam idx """ def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq): dec_partial_seq = [ @@ -249,19 +241,15 @@ class Transformer(nn.Layer): dec_partial_seq = dec_partial_seq.reshape([-1, len_dec_seq]) return dec_partial_seq - def predict_word(dec_seq, enc_output, n_active_inst, n_bm, - memory_key_padding_mask): - dec_seq = paddle.transpose(self.embedding(dec_seq), [1, 0, 2]) + def predict_word(dec_seq, enc_output, n_active_inst, n_bm): + dec_seq = self.embedding(dec_seq) dec_seq = self.positional_encoding(dec_seq) tgt_mask = self.generate_square_subsequent_mask( - paddle.shape(dec_seq)[0]) - dec_output = self.decoder( - dec_seq, - enc_output, - tgt_mask=tgt_mask, - tgt_key_padding_mask=None, - memory_key_padding_mask=memory_key_padding_mask, ) - dec_output = paddle.transpose(dec_output, [1, 0, 2]) + paddle.shape(dec_seq)[1]) + tgt = dec_seq + for decoder_layer in self.decoder: + tgt = decoder_layer(tgt, enc_output, self_mask=tgt_mask) + dec_output = tgt dec_output = dec_output[:, -1, :] # Pick the last step: (bh * bm) * d_h word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1) @@ -281,8 +269,7 @@ class Transformer(nn.Layer): n_active_inst = len(inst_idx_to_position_map) dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq) - word_prob = predict_word(dec_seq, enc_output, n_active_inst, n_bm, - None) + word_prob = predict_word(dec_seq, enc_output, n_active_inst, n_bm) # Update the beam with predicted word prob information and collect incomplete instances active_inst_idx_list = collect_active_inst_idx_list( inst_dec_beams, word_prob, inst_idx_to_position_map) @@ -303,10 +290,10 @@ class Transformer(nn.Layer): with paddle.no_grad(): #-- Encode if self.encoder is not None: - src = self.positional_encoding(images.transpose([1, 0, 2])) + src = self.positional_encoding(images) src_enc = self.encoder(src) else: - src_enc = images.squeeze(2).transpose([0, 2, 1]) + src_enc = images n_bm = self.beam_size src_shape = paddle.shape(src_enc) @@ -317,11 +304,11 @@ class Transformer(nn.Layer): inst_idx_to_position_map = get_inst_idx_to_tensor_position_map( active_inst_idx_list) # Decode - for len_dec_seq in range(1, 25): + for len_dec_seq in range(1, self.max_len): src_enc_copy = src_enc.clone() active_inst_idx_list = beam_decode_step( inst_dec_beams, len_dec_seq, src_enc_copy, - inst_idx_to_position_map, n_bm, None) + inst_idx_to_position_map, n_bm) if not active_inst_idx_list: break # all instances have finished their path to src_enc, inst_idx_to_position_map = collate_active_info( @@ -354,261 +341,124 @@ class Transformer(nn.Layer): shape=[sz, sz], dtype='float32', fill_value='-inf'), diagonal=1) mask = mask + mask_inf - return mask - - def generate_padding_mask(self, x): - padding_mask = paddle.equal(x, paddle.to_tensor(0, dtype=x.dtype)) - return padding_mask + return mask.unsqueeze([0, 1]) - def _reset_parameters(self): - """Initiate parameters in the transformer model.""" - for p in self.parameters(): - if p.dim() > 1: - xavier_uniform_(p) +class MultiheadAttention(nn.Layer): + """Allows the model to 
jointly attend to information + from different representation subspaces. + See reference: Attention Is All You Need + .. math:: + \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O + \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) -class TransformerEncoder(nn.Layer): - """TransformerEncoder is a stack of N encoder layers Args: - encoder_layer: an instance of the TransformerEncoderLayer() class (required). - num_layers: the number of sub-encoder-layers in the encoder (required). - norm: the layer normalization component (optional). - """ + embed_dim: total dimension of the model + num_heads: parallel attention layers, or heads - def __init__(self, encoder_layer, num_layers): - super(TransformerEncoder, self).__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers + """ - def forward(self, src): - """Pass the input through the endocder layers in turn. - Args: - src: the sequnce to the encoder (required). - mask: the mask for the src sequence (optional). - src_key_padding_mask: the mask for the src keys per batch (optional). - """ - output = src + def __init__(self, embed_dim, num_heads, dropout=0., self_attn=False): + super(MultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + # self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + self.scale = self.head_dim**-0.5 + self.self_attn = self_attn + if self_attn: + self.qkv = nn.Linear(embed_dim, embed_dim * 3) + else: + self.q = nn.Linear(embed_dim, embed_dim) + self.kv = nn.Linear(embed_dim, embed_dim * 2) + self.attn_drop = nn.Dropout(dropout) + self.out_proj = nn.Linear(embed_dim, embed_dim) - for i in range(self.num_layers): - output = self.layers[i](output, - src_mask=None, - src_key_padding_mask=None) + def forward(self, query, key=None, attn_mask=None): - return output + qN = query.shape[1] + if self.self_attn: + qkv = self.qkv(query).reshape( + (0, qN, 3, self.num_heads, self.head_dim)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + else: + kN = key.shape[1] + q = self.q(query).reshape( + [0, qN, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + kv = self.kv(key).reshape( + (0, kN, 2, self.num_heads, self.head_dim)).transpose( + (2, 0, 3, 1, 4)) + k, v = kv[0], kv[1] -class TransformerDecoder(nn.Layer): - """TransformerDecoder is a stack of N decoder layers + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale - Args: - decoder_layer: an instance of the TransformerDecoderLayer() class (required). - num_layers: the number of sub-decoder-layers in the decoder (required). - norm: the layer normalization component (optional). + if attn_mask is not None: + attn += attn_mask - """ + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) - def __init__(self, decoder_layer, num_layers): - super(TransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.num_layers = num_layers + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape( + (0, qN, self.embed_dim)) + x = self.out_proj(x) - def forward(self, - tgt, - memory, - tgt_mask=None, - memory_mask=None, - tgt_key_padding_mask=None, - memory_key_padding_mask=None): - """Pass the inputs (and mask) through the decoder layer in turn. + return x - Args: - tgt: the sequence to the decoder (required). - memory: the sequnce from the last layer of the encoder (required). 
- tgt_mask: the mask for the tgt sequence (optional). - memory_mask: the mask for the memory sequence (optional). - tgt_key_padding_mask: the mask for the tgt keys per batch (optional). - memory_key_padding_mask: the mask for the memory keys per batch (optional). - """ - output = tgt - for i in range(self.num_layers): - output = self.layers[i]( - output, - memory, - tgt_mask=tgt_mask, - memory_mask=memory_mask, - tgt_key_padding_mask=tgt_key_padding_mask, - memory_key_padding_mask=memory_key_padding_mask) - - return output - - -class TransformerEncoderLayer(nn.Layer): - """TransformerEncoderLayer is made up of self-attn and feedforward network. - This standard encoder layer is based on the paper "Attention Is All You Need". - Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, - Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in - Neural Information Processing Systems, pages 6000-6010. Users may modify or implement - in a different way during application. - - Args: - d_model: the number of expected features in the input (required). - nhead: the number of heads in the multiheadattention models (required). - dim_feedforward: the dimension of the feedforward network model (default=2048). - dropout: the dropout value (default=0.1). - - """ +class TransformerBlock(nn.Layer): def __init__(self, d_model, nhead, dim_feedforward=2048, attention_dropout_rate=0.0, - residual_dropout_rate=0.1): - super(TransformerEncoderLayer, self).__init__() - self.self_attn = MultiheadAttention( - d_model, nhead, dropout=attention_dropout_rate) - - self.conv1 = Conv2D( - in_channels=d_model, - out_channels=dim_feedforward, - kernel_size=(1, 1)) - self.conv2 = Conv2D( - in_channels=dim_feedforward, - out_channels=d_model, - kernel_size=(1, 1)) - - self.norm1 = LayerNorm(d_model) - self.norm2 = LayerNorm(d_model) - self.dropout1 = Dropout(residual_dropout_rate) - self.dropout2 = Dropout(residual_dropout_rate) - - def forward(self, src, src_mask=None, src_key_padding_mask=None): - """Pass the input through the endocder layer. - Args: - src: the sequnce to the encoder layer (required). - src_mask: the mask for the src sequence (optional). - src_key_padding_mask: the mask for the src keys per batch (optional). - """ - src2 = self.self_attn( - src, - src, - src, - attn_mask=src_mask, - key_padding_mask=src_key_padding_mask) - src = src + self.dropout1(src2) - src = self.norm1(src) - - src = paddle.transpose(src, [1, 2, 0]) - src = paddle.unsqueeze(src, 2) - src2 = self.conv2(F.relu(self.conv1(src))) - src2 = paddle.squeeze(src2, 2) - src2 = paddle.transpose(src2, [2, 0, 1]) - src = paddle.squeeze(src, 2) - src = paddle.transpose(src, [2, 0, 1]) - - src = src + self.dropout2(src2) - src = self.norm2(src) - return src - - -class TransformerDecoderLayer(nn.Layer): - """TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network. - This standard decoder layer is based on the paper "Attention Is All You Need". - Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, - Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in - Neural Information Processing Systems, pages 6000-6010. Users may modify or implement - in a different way during application. - - Args: - d_model: the number of expected features in the input (required). - nhead: the number of heads in the multiheadattention models (required). 
- dim_feedforward: the dimension of the feedforward network model (default=2048). - dropout: the dropout value (default=0.1). - - """ + residual_dropout_rate=0.1, + with_self_attn=True, + with_cross_attn=False, + epsilon=1e-5): + super(TransformerBlock, self).__init__() + self.with_self_attn = with_self_attn + if with_self_attn: + self.self_attn = MultiheadAttention( + d_model, + nhead, + dropout=attention_dropout_rate, + self_attn=with_self_attn) + self.norm1 = LayerNorm(d_model, epsilon=epsilon) + self.dropout1 = Dropout(residual_dropout_rate) + self.with_cross_attn = with_cross_attn + if with_cross_attn: + self.cross_attn = MultiheadAttention( #for self_attn of encoder or cross_attn of decoder + d_model, + nhead, + dropout=attention_dropout_rate) + self.norm2 = LayerNorm(d_model, epsilon=epsilon) + self.dropout2 = Dropout(residual_dropout_rate) + + self.mlp = Mlp(in_features=d_model, + hidden_features=dim_feedforward, + act_layer=nn.ReLU, + drop=residual_dropout_rate) + + self.norm3 = LayerNorm(d_model, epsilon=epsilon) - def __init__(self, - d_model, - nhead, - dim_feedforward=2048, - attention_dropout_rate=0.0, - residual_dropout_rate=0.1): - super(TransformerDecoderLayer, self).__init__() - self.self_attn = MultiheadAttention( - d_model, nhead, dropout=attention_dropout_rate) - self.multihead_attn = MultiheadAttention( - d_model, nhead, dropout=attention_dropout_rate) - - self.conv1 = Conv2D( - in_channels=d_model, - out_channels=dim_feedforward, - kernel_size=(1, 1)) - self.conv2 = Conv2D( - in_channels=dim_feedforward, - out_channels=d_model, - kernel_size=(1, 1)) - - self.norm1 = LayerNorm(d_model) - self.norm2 = LayerNorm(d_model) - self.norm3 = LayerNorm(d_model) - self.dropout1 = Dropout(residual_dropout_rate) - self.dropout2 = Dropout(residual_dropout_rate) self.dropout3 = Dropout(residual_dropout_rate) - def forward(self, - tgt, - memory, - tgt_mask=None, - memory_mask=None, - tgt_key_padding_mask=None, - memory_key_padding_mask=None): - """Pass the inputs (and mask) through the decoder layer. + def forward(self, tgt, memory=None, self_mask=None, cross_mask=None): + if self.with_self_attn: + tgt1 = self.self_attn(tgt, attn_mask=self_mask) + tgt = self.norm1(tgt + self.dropout1(tgt1)) - Args: - tgt: the sequence to the decoder layer (required). - memory: the sequnce from the last layer of the encoder (required). - tgt_mask: the mask for the tgt sequence (optional). - memory_mask: the mask for the memory sequence (optional). - tgt_key_padding_mask: the mask for the tgt keys per batch (optional). - memory_key_padding_mask: the mask for the memory keys per batch (optional). 
- - """ - tgt2 = self.self_attn( - tgt, - tgt, - tgt, - attn_mask=tgt_mask, - key_padding_mask=tgt_key_padding_mask) - tgt = tgt + self.dropout1(tgt2) - tgt = self.norm1(tgt) - tgt2 = self.multihead_attn( - tgt, - memory, - memory, - attn_mask=memory_mask, - key_padding_mask=memory_key_padding_mask) - tgt = tgt + self.dropout2(tgt2) - tgt = self.norm2(tgt) - - # default - tgt = paddle.transpose(tgt, [1, 2, 0]) - tgt = paddle.unsqueeze(tgt, 2) - tgt2 = self.conv2(F.relu(self.conv1(tgt))) - tgt2 = paddle.squeeze(tgt2, 2) - tgt2 = paddle.transpose(tgt2, [2, 0, 1]) - tgt = paddle.squeeze(tgt, 2) - tgt = paddle.transpose(tgt, [2, 0, 1]) - - tgt = tgt + self.dropout3(tgt2) - tgt = self.norm3(tgt) + if self.with_cross_attn: + tgt2 = self.cross_attn(tgt, key=memory, attn_mask=cross_mask) + tgt = self.norm2(tgt + self.dropout2(tgt2)) + tgt = self.norm3(tgt + self.dropout3(self.mlp(tgt))) return tgt -def _get_clones(module, N): - return LayerList([copy.deepcopy(module) for i in range(N)]) - - class PositionalEncoding(nn.Layer): """Inject some information about the relative or absolute position of the tokens in the sequence. The positional encodings have the same dimension as @@ -651,8 +501,9 @@ class PositionalEncoding(nn.Layer): Examples: >>> output = pos_encoder(x) """ + x = x.transpose([1, 0, 2]) x = x + self.pe[:paddle.shape(x)[0], :] - return self.dropout(x) + return self.dropout(x).transpose([1, 0, 2]) class PositionalEncoding_2d(nn.Layer): @@ -725,7 +576,7 @@ class PositionalEncoding_2d(nn.Layer): class Embeddings(nn.Layer): - def __init__(self, d_model, vocab, padding_idx, scale_embedding): + def __init__(self, d_model, vocab, padding_idx=None, scale_embedding=True): super(Embeddings, self).__init__() self.embedding = nn.Embedding(vocab, d_model, padding_idx=padding_idx) w0 = np.random.normal(0.0, d_model**-0.5, @@ -742,7 +593,7 @@ class Embeddings(nn.Layer): class Beam(): - ''' Beam search ''' + """ Beam search """ def __init__(self, size, device=False): diff --git a/ppocr/postprocess/__init__.py b/ppocr/postprocess/__init__.py index 4f900ee1fc716ebec78feaaed07bf258d3d0df0a..2635117c84298bcf77a77cdcab553278f2df642b 100644 --- a/ppocr/postprocess/__init__.py +++ b/ppocr/postprocess/__init__.py @@ -27,7 +27,7 @@ from .sast_postprocess import SASTPostProcess from .fce_postprocess import FCEPostProcess from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, \ DistillationCTCLabelDecode, TableLabelDecode, NRTRLabelDecode, SARLabelDecode, \ - SEEDLabelDecode, PRENLabelDecode, ViTSTRLabelDecode + SEEDLabelDecode, PRENLabelDecode, ViTSTRLabelDecode, ABINetLabelDecode from .cls_postprocess import ClsPostProcess from .pg_postprocess import PGPostProcess from .vqa_token_ser_layoutlm_postprocess import VQASerTokenLayoutLMPostProcess @@ -42,7 +42,7 @@ def build_post_process(config, global_config=None): 'DistillationDBPostProcess', 'NRTRLabelDecode', 'SARLabelDecode', 'SEEDLabelDecode', 'VQASerTokenLayoutLMPostProcess', 'VQAReTokenLayoutLMPostProcess', 'PRENLabelDecode', - 'DistillationSARLabelDecode', 'ViTSTRLabelDecode' + 'DistillationSARLabelDecode', 'ViTSTRLabelDecode', 'ABINetLabelDecode' ] if config['name'] == 'PSEPostProcess': diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index df6203fadaae9e99c1df125e7172166e5e3d8acd..c77420ad19b76e203262568ba77f06ea248a1655 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -140,96 +140,6 @@ class DistillationCTCLabelDecode(CTCLabelDecode): return output 
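# --- Illustrative note (not part of the patch): the refactored rec_nrtr_head.py now
# exposes a single batch-first TransformerBlock whose behaviour is selected with
# with_self_attn / with_cross_attn, replacing the removed TransformerEncoder/Decoder
# classes. A sketch of how it is composed, mirroring the usage in ABINetHead and
# BCNLanguage above; the 8x32 spatial size matches the feature map ResNet45 produces
# for a 3x32x128 input with the strides configured above, and all tensor values here
# are dummies:
import paddle
from paddle import nn
from ppocr.modeling.heads.rec_nrtr_head import TransformerBlock

d_model = 512
encoder = nn.LayerList([
    TransformerBlock(d_model, nhead=8, dim_feedforward=2048,
                     with_self_attn=True, with_cross_attn=False)
    for _ in range(3)
])
cross_block = TransformerBlock(d_model, nhead=8, dim_feedforward=2048,
                               with_self_attn=False, with_cross_attn=True)

feat = paddle.randn([2, 8 * 32, d_model])   # (B, H*W, C) flattened visual features
for blk in encoder:
    feat = blk(feat)                         # self-attention only
query = paddle.randn([2, 26, d_model])       # (B, max_len + 1, C) positional queries
out = cross_block(query, memory=feat)        # cross-attention over the visual memory
print(out.shape)                             # [2, 26, 512]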
-class NRTRLabelDecode(BaseRecLabelDecode): - """ Convert between text-label and text-index """ - - def __init__(self, character_dict_path=None, use_space_char=True, **kwargs): - super(NRTRLabelDecode, self).__init__(character_dict_path, - use_space_char) - - def __call__(self, preds, label=None, *args, **kwargs): - - if len(preds) == 2: - preds_id = preds[0] - preds_prob = preds[1] - if isinstance(preds_id, paddle.Tensor): - preds_id = preds_id.numpy() - if isinstance(preds_prob, paddle.Tensor): - preds_prob = preds_prob.numpy() - if preds_id[0][0] == 2: - preds_idx = preds_id[:, 1:] - preds_prob = preds_prob[:, 1:] - else: - preds_idx = preds_id - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) - if label is None: - return text - label = self.decode(label[:, 1:]) - else: - if isinstance(preds, paddle.Tensor): - preds = preds.numpy() - preds_idx = preds.argmax(axis=2) - preds_prob = preds.max(axis=2) - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) - if label is None: - return text - label = self.decode(label[:, 1:]) - return text, label - - def add_special_char(self, dict_character): - dict_character = ['blank', '', '', ''] + dict_character - return dict_character - - def decode(self, text_index, text_prob=None, is_remove_duplicate=False): - """ convert text-index into text-label. """ - result_list = [] - batch_size = len(text_index) - for batch_idx in range(batch_size): - char_list = [] - conf_list = [] - for idx in range(len(text_index[batch_idx])): - try: - char_idx = self.character[int(text_index[batch_idx][idx])] - except: - continue - if char_idx == '': # end - break - char_list.append(char_idx) - if text_prob is not None: - conf_list.append(text_prob[batch_idx][idx]) - else: - conf_list.append(1) - text = ''.join(char_list) - result_list.append((text.lower(), np.mean(conf_list).tolist())) - return result_list - - -class ViTSTRLabelDecode(NRTRLabelDecode): - """ Convert between text-label and text-index """ - - def __init__(self, character_dict_path=None, use_space_char=False, - **kwargs): - super(ViTSTRLabelDecode, self).__init__(character_dict_path, - use_space_char) - - def __call__(self, preds, label=None, *args, **kwargs): - if isinstance(preds, paddle.Tensor): - preds = preds[:, 1:].numpy() - else: - preds = preds[:, 1:] - preds_idx = preds.argmax(axis=2) - preds_prob = preds.max(axis=2) - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) - if label is None: - return text - label = self.decode(label[:, 1:]) - return text, label - - def add_special_char(self, dict_character): - dict_character = ['', ''] + dict_character - return dict_character - - class AttnLabelDecode(BaseRecLabelDecode): """ Convert between text-label and text-index """ @@ -778,3 +688,122 @@ class PRENLabelDecode(BaseRecLabelDecode): return text label = self.decode(label) return text, label + + +class NRTRLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=True, **kwargs): + super(NRTRLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + + if len(preds) == 2: + preds_id = preds[0] + preds_prob = preds[1] + if isinstance(preds_id, paddle.Tensor): + preds_id = preds_id.numpy() + if isinstance(preds_prob, paddle.Tensor): + preds_prob = preds_prob.numpy() + if preds_id[0][0] == 2: + preds_idx = preds_id[:, 1:] + preds_prob = preds_prob[:, 1:] + else: + preds_idx = preds_id + text = 
self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + else: + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['blank', '', '', ''] + dict_character + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. """ + result_list = [] + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + try: + char_idx = self.character[int(text_index[batch_idx][idx])] + except: + continue + if char_idx == '': # end + break + char_list.append(char_idx) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text.lower(), np.mean(conf_list).tolist())) + return result_list + + +class ViTSTRLabelDecode(NRTRLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(ViTSTRLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, paddle.Tensor): + preds = preds[:, 1:].numpy() + else: + preds = preds[:, 1:] + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['', ''] + dict_character + return dict_character + + +class ABINetLabelDecode(NRTRLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(ABINetLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, dict): + preds = preds['align'][-1].numpy() + elif isinstance(preds, paddle.Tensor): + preds = preds.numpy() + else: + preds = preds + + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label) + return text, label + + def add_special_char(self, dict_character): + dict_character = [''] + dict_character + return dict_character diff --git a/test_tipc/configs/rec_mtb_nrtr/rec_mtb_nrtr.yml b/test_tipc/configs/rec_mtb_nrtr/rec_mtb_nrtr.yml index ba6728d8362825f66ea3ea726896faf6c3372ac3..8118d587248b7e4797e3a75c897e7b0a3d71b364 100644 --- a/test_tipc/configs/rec_mtb_nrtr/rec_mtb_nrtr.yml +++ b/test_tipc/configs/rec_mtb_nrtr/rec_mtb_nrtr.yml @@ -99,5 +99,5 @@ Eval: shuffle: False drop_last: False batch_size_per_card: 256 - num_workers: 1 + num_workers: 4 use_shared_memory: False diff --git a/test_tipc/configs/rec_r45_abinet/rec_r45_abinet.yml b/test_tipc/configs/rec_r45_abinet/rec_r45_abinet.yml new file mode 100644 index 0000000000000000000000000000000000000000..306c49d68311321e72b669ae44b730176f7764e2 --- /dev/null +++ 
b/test_tipc/configs/rec_r45_abinet/rec_r45_abinet.yml @@ -0,0 +1,105 @@ +Global: + use_gpu: True + epoch_num: 10 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/r45_abinet/ + save_epoch_step: 1 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_abinet.txt + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.99 + clip_norm: 20.0 + lr: + name: Piecewise + decay_epochs: [6] + values: [0.0001, 0.00001] + regularizer: + name: 'L2' + factor: 0. + +Architecture: + model_type: rec + algorithm: ABINet + in_channels: 3 + Transform: + Backbone: + name: ResNet45 + + Head: + name: ABINetHead + use_lang: True + iter_size: 3 + + +Loss: + name: CELoss + ignore_index: &ignore_index 100 # Must be greater than the number of character classes + +PostProcess: + name: ABINetLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ic15_data/ + label_file_list: ["./train_data/ic15_data/rec_gt_train.txt"] + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - ABINetLabelEncode: # Class handling label + ignore_index: *ignore_index + - ABINetRecResizeImg: + image_shape: [3, 32, 128] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 96 + drop_last: True + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ic15_data + label_file_list: ["./train_data/ic15_data/rec_gt_test.txt"] + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - ABINetLabelEncode: # Class handling label + ignore_index: *ignore_index + - ABINetRecResizeImg: + image_shape: [3, 32, 128] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 4 + use_shared_memory: False diff --git a/test_tipc/configs/rec_r45_abinet/train_infer_python.txt b/test_tipc/configs/rec_r45_abinet/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecab1bcbbde11fc6d14357b6715033704c2c3316 --- /dev/null +++ b/test_tipc/configs/rec_r45_abinet/train_infer_python.txt @@ -0,0 +1,53 @@ +===========================train_params=========================== +model_name:rec_abinet +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:null +Global.epoch_num:lite_train_lite_infer=2|whole_train_whole_infer=300 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=64 +Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./inference/rec_inference +null:null +## +trainer:norm_train +norm_train:tools/train.py -c test_tipc/configs/rec_r45_abinet/rec_r45_abinet.yml -o +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c test_tipc/configs/rec_r45_abinet/rec_r45_abinet.yml -o +null:null +## 
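# --- Illustrative note (not part of the patch): the rec_r45_abinet.yml above sets
# Loss.ignore_index to 100 (above the number of character classes) and passes the same
# value to ABINetLabelEncode, so padded label positions can be filled with a class id
# that the cross-entropy loss skips. A toy sketch of that behaviour; shapes and values
# are made up:
import paddle
import paddle.nn.functional as F

num_classes = 37                                   # e.g. 36 characters + EOS
logits = paddle.randn([2, 26, num_classes])        # (B, max_len + 1, num_classes)
real = paddle.randint(0, num_classes, [2, 5])      # 5 real character labels per sample
pad = paddle.full([2, 21], 100, dtype='int64')     # padding filled with ignore_index
labels = paddle.concat([real, pad], axis=1)

loss = F.cross_entropy(
    logits.reshape([-1, num_classes]),
    labels.reshape([-1]),
    ignore_index=100)                              # padded positions contribute nothing
print(loss)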
+===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.checkpoints: +norm_export:tools/export_model.py -c test_tipc/configs/rec_r45_abinet/rec_r45_abinet.yml -o +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +## +train_model:./inference/rec_r45_abinet_train/best_accuracy +infer_export:tools/export_model.py -c test_tipc/configs/rec_r45_abinet/rec_r45_abinet.yml -o +infer_quant:False +inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dict.txt --rec_image_shape="3,32,128" --rec_algorithm="ABINet" +--use_gpu:True|False +--enable_mkldnn:False +--cpu_threads:6 +--rec_batch_num:1|6 +--use_tensorrt:False +--precision:fp32 +--rec_model_dir: +--image_dir:./inference/rec_inference +--save_log_path:./test/output/ +--benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,32,128]}] diff --git a/test_tipc/configs/rec_svtrnet/rec_svtrnet.yml b/test_tipc/configs/rec_svtrnet/rec_svtrnet.yml index 26facca34d20536cb19c3b1f80b0828ebc817e50..bffd8ac09a36ea7cb0bf3e7552ee2f5d34e86416 100644 --- a/test_tipc/configs/rec_svtrnet/rec_svtrnet.yml +++ b/test_tipc/configs/rec_svtrnet/rec_svtrnet.yml @@ -26,7 +26,7 @@ Optimizer: name: AdamW beta1: 0.9 beta2: 0.99 - epsilon: 0.00000008 + epsilon: 8.e-8 weight_decay: 0.05 no_weight_decay_name: norm pos_embed one_dim_param_no_weight_decay: true diff --git a/test_tipc/configs/rec_svtrnet/train_infer_python.txt b/test_tipc/configs/rec_svtrnet/train_infer_python.txt index 60fd25ad4f910b65fdf958034c48e6ba0d36c4c3..a7e4a24063b2e248f2ab92d5efd257a2837c0a34 100644 --- a/test_tipc/configs/rec_svtrnet/train_infer_python.txt +++ b/test_tipc/configs/rec_svtrnet/train_infer_python.txt @@ -37,7 +37,7 @@ export2:null train_model:./inference/rec_svtrnet_train/best_accuracy infer_export:tools/export_model.py -c test_tipc/configs/rec_svtrnet/rec_svtrnet.yml -o infer_quant:False -inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/EN_symbol_dict.txt --rec_image_shape="3,64,256" --rec_algorithm="SVTR" +inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dict.txt --rec_image_shape="3,64,256" --rec_algorithm="SVTR" --use_gpu:True|False --enable_mkldnn:False --cpu_threads:6 @@ -50,4 +50,4 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/EN_symbo --benchmark:True null:null ===========================infer_benchmark_params========================== -random_infer_input:[{float32,[1=3,64,256]}] +random_infer_input:[{float32,[3,64,256]}] diff --git a/test_tipc/configs/rec_vitstr_none_ce/rec_vitstr_none_ce.yml b/test_tipc/configs/rec_vitstr_none_ce/rec_vitstr_none_ce.yml index 4e8151f59910d507fea77376d54755f9f85ec53f..a0aed488755f7cb6fed18a5747e9b7f62f57da86 100644 --- a/test_tipc/configs/rec_vitstr_none_ce/rec_vitstr_none_ce.yml +++ b/test_tipc/configs/rec_vitstr_none_ce/rec_vitstr_none_ce.yml @@ -23,7 +23,7 @@ Global: Optimizer: name: Adadelta - epsilon: 0.00000001 + epsilon: 1.e-8 rho: 0.95 clip_norm: 5.0 lr: @@ -46,6 +46,7 @@ Loss: name: CELoss smoothing: False with_all: True + ignore_index: &ignore_index 0 # Must be zero or greater than the number of character classes PostProcess: name: ViTSTRLabelDecode @@ -64,6 +65,7 @@ Train: img_mode: BGR channel_first: False - ViTSTRLabelEncode: # Class handling label + ignore_index: *ignore_index - GrayRecResizeImg: image_shape: [224, 224] # W H resize_type: PIL # PIL or OpenCV @@ -87,6 
+89,7 @@ Eval: img_mode: BGR channel_first: False - ViTSTRLabelEncode: # Class handling label + ignore_index: *ignore_index - GrayRecResizeImg: image_shape: [224, 224] # W H resize_type: PIL # PIL or OpenCV diff --git a/tools/export_model.py b/tools/export_model.py index 7ef2c36b127264243afc801b89d6b1055b7bb32f..e2673239b1f47596f43baad81784a73ae295b609 100755 --- a/tools/export_model.py +++ b/tools/export_model.py @@ -79,6 +79,19 @@ def export_single_model(model, arch_config, save_path, logger, quanter=None): shape=[None, 1, 224, 224], dtype="float32"), ] model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "ABINet": + other_shape = [ + paddle.static.InputSpec( + shape=[None, 3, 32, 128], dtype="float32"), + ] + # print([None, 3, 32, 128]) + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "NRTR": + other_shape = [ + paddle.static.InputSpec( + shape=[None, 1, 32, 100], dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) else: infer_shape = [3, -1, -1] if arch_config["model_type"] == "rec": @@ -90,8 +103,6 @@ def export_single_model(model, arch_config, save_path, logger, quanter=None): "When there is tps in the network, variable length input is not supported, and the input size needs to be the same as during training" ) infer_shape[-1] = 100 - if arch_config["algorithm"] == "NRTR": - infer_shape = [1, 32, 100] elif arch_config["model_type"] == "table": infer_shape = [3, 488, 488] model = to_static( diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index 1945667972310cdef03daac7d2bfdb52373b950b..a95f55596647acc0eaca9616b5630917d7ebdf3a 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -75,6 +75,12 @@ class TextRecognizer(object): "character_dict_path": args.rec_char_dict_path, "use_space_char": args.use_space_char } + elif self.rec_algorithm == 'ABINet': + postprocess_params = { + 'name': 'ABINetLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } self.postprocess_op = build_post_process(postprocess_params) self.predictor, self.input_tensor, self.output_tensors, self.config = \ utility.create_predictor(args, 'rec', logger) @@ -145,17 +151,6 @@ class TextRecognizer(object): padding_im[:, :, 0:resized_w] = resized_image return padding_im - def resize_norm_img_svtr(self, img, image_shape): - - imgC, imgH, imgW = image_shape - resized_image = cv2.resize( - img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) - resized_image = resized_image.astype('float32') - resized_image = resized_image.transpose((2, 0, 1)) / 255 - resized_image -= 0.5 - resized_image /= 0.5 - return resized_image - def resize_norm_img_srn(self, img, image_shape): imgC, imgH, imgW = image_shape @@ -263,6 +258,35 @@ class TextRecognizer(object): return padding_im, resize_shape, pad_shape, valid_ratio + def resize_norm_img_svtr(self, img, image_shape): + + imgC, imgH, imgW = image_shape + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) + resized_image = resized_image.astype('float32') + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + return resized_image + + def resize_norm_img_abinet(self, img, image_shape): + + imgC, imgH, imgW = image_shape + + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) + resized_image = resized_image.astype('float32') + resized_image = resized_image / 255. 
+ + mean = np.array([0.485, 0.456, 0.406]) + std = np.array([0.229, 0.224, 0.225]) + resized_image = ( + resized_image - mean[None, None, ...]) / std[None, None, ...] + resized_image = resized_image.transpose((2, 0, 1)) + resized_image = resized_image.astype('float32') + + return resized_image + def __call__(self, img_list): img_num = len(img_list) # Calculate the aspect ratio of all text bars @@ -313,6 +337,11 @@ class TextRecognizer(object): self.rec_image_shape) norm_img = norm_img[np.newaxis, :] norm_img_batch.append(norm_img) + elif self.rec_algorithm == "ABINet": + norm_img = self.resize_norm_img_abinet( + img_list[indices[ino]], self.rec_image_shape) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) else: norm_img = self.resize_norm_img(img_list[indices[ino]], max_wh_ratio) diff --git a/tools/program.py b/tools/program.py index 745c28b87292480159ed41285f2a79c6bf5d0abe..f61d3f9af7262d760d303baa16d769c717802562 100755 --- a/tools/program.py +++ b/tools/program.py @@ -575,7 +575,7 @@ def preprocess(is_train=False): 'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN', 'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE', 'SEED', 'SDMGR', 'LayoutXLM', 'LayoutLM', 'PREN', 'FCE', 'SVTR', - 'ViTSTR' + 'ViTSTR', 'ABINet' ] if use_xpu:
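# --- Illustrative note (not part of the patch): a standalone sketch of the ABINet
# preprocessing that predict_rec.py now applies (resize to 32x128, scale to [0, 1],
# ImageNet mean/std normalisation, HWC -> CHW). The helper name and the dummy input
# below are made up for demonstration:
import cv2
import numpy as np

def preprocess_abinet(img, image_shape=(3, 32, 128)):
    _, imgH, imgW = image_shape
    resized = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
    resized = resized.astype('float32') / 255.0
    mean = np.array([0.485, 0.456, 0.406], dtype='float32')
    std = np.array([0.229, 0.224, 0.225], dtype='float32')
    resized = (resized - mean[None, None, :]) / std[None, None, :]
    return resized.transpose((2, 0, 1)).astype('float32')    # HWC -> CHW

crop = np.random.randint(0, 255, (48, 160, 3), dtype=np.uint8)   # dummy text crop
batch = preprocess_abinet(crop)[np.newaxis, :]                    # (1, 3, 32, 128)
print(batch.shape)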