diff --git a/README.md b/README.md index 44780542cb786f142a9de9608aef4156eb878d66..24ed65ac6402882a8498b57e497e59959ce72be8 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 **近期更新** - 2020.7.9 添加支持空格的识别模型,[识别效果](#支持空格的中文OCR效果展示) +- 2020.7.9 添加数据增强、学习率衰减策略,具体参考[配置文件](./doc/doc_ch/config.md) - 2020.6.8 添加[数据集](./doc/doc_ch/datasets.md),并保持持续更新 - 2020.6.5 支持 `attetnion` 模型导出 `inference_model` - 2020.6.5 支持单独预测识别时,输出结果得分 @@ -51,6 +52,8 @@ mkdir inference && cd inference wget https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db_infer.tar && tar xf ch_det_mv3_db_infer.tar # 下载超轻量级中文OCR模型的识别模型并解压 wget https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_infer.tar && tar xf ch_rec_mv3_crnn_infer.tar +# 下载支持空格的超轻量级中文OCR模型的识别模型并解压 +wget https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance_infer.tar && tar xf ch_rec_mv3_crnn_enhance_infer.tar cd .. ``` #### (2)通用中文OCR模型下载 @@ -60,6 +63,8 @@ mkdir inference && cd inference wget https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db_infer.tar && tar xf ch_det_r50_vd_db_infer.tar # 下载通用中文OCR模型的识别模型并解压 wget https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_infer.tar && tar xf ch_rec_r34_vd_crnn_infer.tar +# 下载支持空格的通用中文OCR模型的识别模型并解压 +wget https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance_infer.tar && tar xf ch_rec_r34_vd_crnn_enhance_infer.tar cd .. 
``` @@ -85,6 +90,13 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs/11.jpg" --det_mode python3 tools/infer/predict_system.py --image_dir="./doc/imgs/11.jpg" --det_model_dir="./inference/ch_det_r50_vd_db/" --rec_model_dir="./inference/ch_rec_r34_vd_crnn/" ``` +带空格的通用中文OCR模型的体验可以按照上述步骤下载相应的模型,并且更新相关的参数,示例如下: + +``` +# 预测image_dir指定的单张图像 +python3 tools/infer/predict_system.py --image_dir="./doc/imgs_en/img_12.jpg" --det_model_dir="./inference/ch_det_r50_vd_db/" --rec_model_dir="./inference/ch_rec_r34_vd_crnn_enhance/" +``` + 更多的文本检测、识别串联推理使用方式请参考文档教程中[基于预测引擎推理](./doc/doc_ch/inference.md)。 ## 文档教程 diff --git a/README_en.md b/README_en.md index faa5b83c3ec531b1ef5e6d21a7a7444d7ef86d15..dffa7153ac70134cca39e23c58f0fcee96eb02d5 100644 --- a/README_en.md +++ b/README_en.md @@ -5,6 +5,7 @@ PaddleOCR aims to create a rich, leading, and practical OCR tools that help user **Recent updates**、 - 2020.7.9 Add recognition model to support space, [recognition result](#space Chinese OCR results) +- 2020.7.9 Add data augmentation and learning rate decay strategies; please read [config](./doc/doc_en/config_en.md) - 2020.6.8 Add [dataset](./doc/doc_en/datasets_en.md) and keep updating - 2020.6.5 Support exporting `attention` model to `inference_model` - 2020.6.5 Support separate prediction and recognition, output result score @@ -52,6 +53,9 @@ mkdir inference && cd inference wget https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db_infer.tar && tar xf ch_det_mv3_db_infer.tar # Download the recognition part of the lightweight Chinese OCR and decompress it wget https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_infer.tar && tar xf ch_rec_mv3_crnn_infer.tar +# Download the recognition part of the lightweight Chinese OCR with space support and decompress it +wget https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance_infer.tar && tar xf ch_rec_mv3_crnn_enhance_infer.tar + cd .. 
``` #### (2) Download General Chinese OCR models @@ -61,6 +65,8 @@ mkdir inference && cd inference wget https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db_infer.tar && tar xf ch_det_r50_vd_db_infer.tar # Download the recognition part of the generic Chinese OCR model and decompress it wget https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_infer.tar && tar xf ch_rec_r34_vd_crnn_infer.tar +# Download the recognition part of the generic Chinese OCR model with space support and decompress it +wget https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance_infer.tar && tar xf ch_rec_r34_vd_crnn_enhance_infer.tar cd .. ``` @@ -86,6 +92,13 @@ To run inference of the Generic Chinese OCR model, follow these steps above to d python3 tools/infer/predict_system.py --image_dir="./doc/imgs/11.jpg" --det_model_dir="./inference/ch_det_r50_vd_db/" --rec_model_dir="./inference/ch_rec_r34_vd_crnn/" ``` +To run inference of the generic Chinese OCR model with space recognition, follow the steps above to download the corresponding models and update the relevant parameters. 
Examples are as follows: + +``` +# Prediction on a single image by specifying image path to image_dir +python3 tools/infer/predict_system.py --image_dir="./doc/imgs_en/img_12.jpg" --det_model_dir="./inference/ch_det_r50_vd_db/" --rec_model_dir="./inference/ch_rec_r34_vd_crnn_enhance/" +``` + For more text detection and recognition models, please refer to the document [Inference](./doc/doc_en/inference_en.md) ## DOCUMENTATION diff --git a/configs/rec/rec_chinese_common_train.yml b/configs/rec/rec_chinese_common_train.yml index af56dca26911bfcf8bbc361c7d506cb6980618db..72728bd21391134f1fea40af032dc72064c0f257 100644 --- a/configs/rec/rec_chinese_common_train.yml +++ b/configs/rec/rec_chinese_common_train.yml @@ -14,6 +14,8 @@ Global: character_type: ch character_dict_path: ./ppocr/utils/ppocr_keys_v1.txt loss_type: ctc + distort: false + add_space: false reader_yml: ./configs/rec/rec_chinese_reader.yml pretrain_weights: checkpoints: diff --git a/configs/rec/rec_chinese_lite_train.yml b/configs/rec/rec_chinese_lite_train.yml index 9749c39955a152120afd4f295fb3e2056d07071f..adc6d69699cdc7b8b5e8de59d794c175a4ae0a51 100755 --- a/configs/rec/rec_chinese_lite_train.yml +++ b/configs/rec/rec_chinese_lite_train.yml @@ -15,6 +15,7 @@ Global: character_dict_path: ./ppocr/utils/ppocr_keys_v1.txt loss_type: ctc distort: false + add_space: false reader_yml: ./configs/rec/rec_chinese_reader.yml pretrain_weights: checkpoints: diff --git a/doc/doc_ch/config.md b/doc/doc_ch/config.md index ae16263e5272641f95d5e8842da08ac65d7a0b12..66ab23423cb8a219d9322108df22c9ad267a67aa 100644 --- a/doc/doc_ch/config.md +++ b/doc/doc_ch/config.md @@ -30,6 +30,8 @@ | character_type | 设置字符类型 | ch | en/ch, en时将使用默认dict,ch时使用自定义dict| | character_dict_path | 设置字典路径 | ./ppocr/utils/ic15_dict.txt | \ | | loss_type | 设置 loss 类型 | ctc | 支持两种loss: ctc / attention | +| distort | 设置是否使用数据增强 | false | 
设置为true时,将在训练时随机进行扰动,支持的扰动操作可阅读[img_tools.py](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/ppocr/data/rec/img_tools.py) | +| add_space | 设置是否识别空格 | false | 仅在"ch"模式下支持空格 | | reader_yml | 设置reader配置文件 | ./configs/rec/rec_icdar15_reader.yml | \ | | pretrain_weights | 加载预训练模型路径 | ./pretrain_models/CRNN/best_accuracy | \ | | checkpoints | 加载模型参数路径 | None | 用于中断后加载参数继续训练 | diff --git a/doc/doc_ch/recognition.md b/doc/doc_ch/recognition.md index b5dc484fd18102cdd01512a72a2dce92ae945e04..44270308676f002f16119e0b98ecf2b8568f97bc 100644 --- a/doc/doc_ch/recognition.md +++ b/doc/doc_ch/recognition.md @@ -94,7 +94,7 @@ word_dict.txt 每行有一个单字,将字符与数字索引映射在一起, `ppocr/utils/ic15_dict.txt` 是一个包含36个字符的英文字典, 您可以按需使用。 -如需自定义dic文件,请修改 `configs/rec/rec_icdar15_train.yml` 中的 `character_dict_path` 字段, 并将 `character_type` 设置为 `ch`。 +如需自定义dic文件,请在 `configs/rec/rec_icdar15_train.yml` 中添加 `character_dict_path` 字段, 并将 `character_type` 设置为 `ch`。 ### 启动训练 @@ -157,12 +157,26 @@ Global: character_type: ch # 添加自定义字典,如修改字典请将路径指向新字典 character_dict_path: ./ppocr/utils/ppocr_keys_v1.txt + # 训练时添加数据增强 + distort: true + # 识别空格 + add_space: true ... # 修改reader类型 reader_yml: ./configs/rec/rec_chinese_reader.yml ... ... + +Optimizer: + ... 
+ # 添加学习率衰减策略 + decay: + function: cosine_decay + # 每个 epoch 包含 iter 数 + step_each_epoch: 20 + # 总共训练epoch数 + total_epoch: 1000 ``` **注意,预测/评估时的配置文件请务必与训练一致。** diff --git a/doc/doc_en/config_en.md b/doc/doc_en/config_en.md index b9ad03947c545a4760331f835a9cc85be6ff67a7..aab1b9a33ad86d1f4465971badf43905f18c03f8 100644 --- a/doc/doc_en/config_en.md +++ b/doc/doc_en/config_en.md @@ -30,6 +30,8 @@ Take `rec_chinese_lite_train.yml` as an example | character_type | Set character type | ch | en/ch, the default dict will be used for en, and the custom dict will be used for ch| | character_dict_path | Set dictionary path | ./ppocr/utils/ic15_dict.txt | \ | | loss_type | Set loss type | ctc | Supports two types of loss: ctc / attention | +| distort | Set whether to use data augmentation | false | When set to true, random distortions are applied during training; for the supported operations, read [img_tools.py](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/ppocr/data/rec/img_tools.py) | +| add_space | Set whether to recognize spaces | false | Only supported in "ch" mode | | reader_yml | Set the reader configuration file | ./configs/rec/rec_icdar15_reader.yml | \ | | pretrain_weights | Load pre-trained model path | ./pretrain_models/CRNN/best_accuracy | \ | | checkpoints | Load saved model path | None | Used to load saved parameters to continue training after interruption | diff --git a/doc/doc_en/recognition_en.md b/doc/doc_en/recognition_en.md index 9a862c7a67c6d2277bc6472b304534788e06921d..ec301c83a6fabc0e19540710557e4997cc4dde7c 100644 --- a/doc/doc_en/recognition_en.md +++ b/doc/doc_en/recognition_en.md @@ -158,9 +158,23 @@ Global: ... # Modify reader type reader_yml: ./configs/rec/rec_chinese_reader.yml + # Whether to use data augmentation + distort: true + # Whether to recognize spaces + add_space: true ... ... + +Optimizer: + ... 
+ # Add learning rate decay strategy + decay: + function: cosine_decay + # Number of iterations per epoch + step_each_epoch: 20 + # Total number of training epochs + total_epoch: 1000 ``` **Note that the configuration file for prediction/evaluation must be consistent with the training.** diff --git a/doc/imgs_en/img_12.jpg b/doc/imgs_en/img_12.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b0d289538a5dd63fdf391e02466df2bbe5ea76c1 Binary files /dev/null and b/doc/imgs_en/img_12.jpg differ diff --git a/ppocr/data/rec/dataset_traversal.py b/ppocr/data/rec/dataset_traversal.py index f54c1d307ee2545c69a2664b149b69e50b4f8490..510a028451302a92ebc179792ecbcb1ff8649807 100755 --- a/ppocr/data/rec/dataset_traversal.py +++ b/ppocr/data/rec/dataset_traversal.py @@ -58,6 +58,7 @@ class LMDBReader(object): else: self.batch_size = params['test_batch_size_per_card'] self.drop_last = False + self.use_distort = False self.infer_img = params['infer_img'] def load_hierarchical_lmdb_dataset(self): @@ -206,6 +207,7 @@ class SimpleReader(object): else: self.batch_size = params['test_batch_size_per_card'] self.drop_last = False + self.use_distort = False def __call__(self, process_id): if self.mode != 'train': diff --git a/ppocr/data/rec/img_tools.py b/ppocr/data/rec/img_tools.py index a8fa988ef6c938e92c32ebe14f50d9108b26e01e..76c4315a887db11f6d543666d08b0ed60ebce698 100755 --- a/ppocr/data/rec/img_tools.py +++ b/ppocr/data/rec/img_tools.py @@ -136,6 +136,9 @@ def jitter(img): def add_gasuss_noise(image, mean=0, var=0.1): + """ + Gaussian noise + """ noise = np.random.normal(mean, var**0.5, image.shape) out = image + 0.5 * noise @@ -152,9 +155,8 @@ def get_crop(image): top_min = 1 top_max = 8 top_crop = int(random.randint(top_min, top_max)) - + top_crop = min(top_crop, h - 1) crop_img = image.copy() - ratio = random.randint(0, 1) if ratio: crop_img = crop_img[top_crop:h, :, :] @@ -249,13 +251,13 @@ def get_warpR(config): dst2 = r.dot(p2) dst3 = r.dot(p3) dst4 = r.dot(p4) - 
list_dst = [dst1, dst2, dst3, dst4] + list_dst = np.array([dst1, dst2, dst3, dst4]) org = np.array([[0, 0], [w, 0], [0, h], [w, h]], np.float32) dst = np.zeros((4, 2), np.float32) # Project onto the image plane - for i in range(4): - dst[i, 0] = list_dst[i][0] * z / (z - list_dst[i][2]) + pcenter[0] - dst[i, 1] = list_dst[i][1] * z / (z - list_dst[i][2]) + pcenter[1] + dst[:, 0] = list_dst[:, 0] * z / (z - list_dst[:, 2]) + pcenter[0] + dst[:, 1] = list_dst[:, 1] * z / (z - list_dst[:, 2]) + pcenter[1] + warpR = cv2.getPerspectiveTransform(org, dst) dst1, dst2, dst3, dst4 = dst diff --git a/ppocr/utils/character.py b/ppocr/utils/character.py index 3cbc31a49b991cab7f2f8d8c56db4e0d611fbf55..26b1ba77884faec115d7c8b8754ebc2ed10411d3 100755 --- a/ppocr/utils/character.py +++ b/ppocr/utils/character.py @@ -30,12 +30,17 @@ class CharacterOps(object): dict_character = list(self.character_str) elif self.character_type == "ch": character_dict_path = config['character_dict_path'] + add_space = False + if 'add_space' in config: + add_space = config['add_space'] self.character_str = "" with open(character_dict_path, "rb") as fin: lines = fin.readlines() for line in lines: line = line.decode('utf-8').strip("\n").strip("\r\n") self.character_str += line + if add_space: + self.character_str += " " dict_character = list(self.character_str) elif self.character_type == "en_sensitive": # same with ASTER setting (use 94 char). 
diff --git a/ppocr/utils/ppocr_keys_v1.txt b/ppocr/utils/ppocr_keys_v1.txt index 3ca70d0a2ef1e1c5f690a6ca58bbeb3bc2acc855..84b885d8352226e49b1d5d791b8f43a663e246aa 100644 --- a/ppocr/utils/ppocr_keys_v1.txt +++ b/ppocr/utils/ppocr_keys_v1.txt @@ -6620,5 +6620,4 @@ j 緖 續 紹 -懮 - +懮 \ No newline at end of file diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index 9761ddbad9123372d706db158aba8008956f30e9..3be013ed33d109aeb42ea676b9edee5fc46f1c93 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -39,7 +39,8 @@ class TextRecognizer(object): self.rec_algorithm = args.rec_algorithm char_ops_params = { "character_type": args.rec_char_type, - "character_dict_path": args.rec_char_dict_path + "character_dict_path": args.rec_char_dict_path, + "add_space": args.rec_add_space } if self.rec_algorithm != "RARE": char_ops_params['loss_type'] = 'ctc' diff --git a/tools/infer/utility.py b/tools/infer/utility.py index 0cc0971359767962f6d501e210c7fef8d24ecbcd..bbd130d1a699238fe16fce4db283a6374060a3c4 100755 --- a/tools/infer/utility.py +++ b/tools/infer/utility.py @@ -63,6 +63,7 @@ def parse_args(): "--rec_char_dict_path", type=str, default="./ppocr/utils/ppocr_keys_v1.txt") + parser.add_argument("--rec_add_space", type=bool, default=True) return parser.parse_args()