未验证 提交 631fd9fd 编写于 作者: X xiaoting 提交者: GitHub

Merge branch 'dygraph' into dygraph_doc

include LICENSE.txt include LICENSE.txt
include README.md include README.md
recursive-include ppocr/utils *.txt utility.py character.py check.py recursive-include ppocr/utils *.txt utility.py logging.py
recursive-include ppocr/data/det *.py recursive-include ppocr/data/ *.py
recursive-include ppocr/postprocess *.py recursive-include ppocr/postprocess *.py
recursive-include ppocr/postprocess/lanms *.* recursive-include tools/infer *.py
recursive-include tools/infer *.py \ No newline at end of file
...@@ -8,7 +8,6 @@ Global: ...@@ -8,7 +8,6 @@ Global:
# evaluation is run every 5000 iterations after the 4000th iteration # evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: [0, 1000] eval_batch_step: [0, 1000]
# if pretrained_model is saved in static mode, load_static_weights must set to True # if pretrained_model is saved in static mode, load_static_weights must set to True
load_static_weights: True
cal_metric_during_train: True cal_metric_during_train: True
pretrained_model: pretrained_model:
checkpoints: checkpoints:
use_gpu: true
epoch_num: 1200
log_smooth_window: 20
print_batch_step: 2
save_model_dir: ./output/ch_db_mv3/
save_epoch_step: 1200
# evaluation is run every 2000 iterations after the 3000th iteration
eval_batch_step: [3000, 2000]
# if pretrained_model is saved in static mode, load_static_weights must be set to True
load_static_weights: True
cal_metric_during_train: False
pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
checkpoints: #./output/det_db_0.001_DiceLoss_256_pp_config_2.0b_4gpu/best_accuracy
use_visualdl: False
infer_img: doc/imgs_en/img_10.jpg
save_res_path: ./output/det_db/predicts_db.txt
model_type: det
algorithm: DB
name: MobileNetV3
scale: 0.5
model_name: large
disable_se: True
name: DBFPN
out_channels: 96
name: DBHead
k: 50
name: DBLoss
balance_loss: true
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
name: Adam
beta1: 0.9
beta2: 0.999
name: Cosine
learning_rate: 0.001
warmup_epoch: 2
name: 'L2'
factor: 0
name: DBPostProcess
thresh: 0.3
box_thresh: 0.6
max_candidates: 1000
unclip_ratio: 1.5
name: DetMetric
main_indicator: hmean
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
- ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
ratio_list: [1.0]
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- IaaAugment:
- { 'type': Fliplr, 'args': { 'p': 0.5 } }
- { 'type': Affine, 'args': { 'rotate': [-10, 10] } }
- { 'type': Resize, 'args': { 'size': [0.5, 3] } }
- EastRandomCropData:
size: [960, 960]
max_tries: 50
keep_ratio: true
- MakeBorderMap:
shrink_ratio: 0.4
thresh_min: 0.3
thresh_max: 0.7
- MakeShrinkMap:
shrink_ratio: 0.4
min_text_size: 8
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list
shuffle: True
drop_last: False
batch_size_per_card: 8
num_workers: 4
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
- ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- DetResizeForTest:
# image_shape: [736, 1280]
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
keep_keys: ['image', 'shape', 'polys', 'ignore_tags']
shuffle: False
drop_last: False
batch_size_per_card: 1 # must be 1
num_workers: 2
use_gpu: true
epoch_num: 1200
log_smooth_window: 20
print_batch_step: 2
save_model_dir: ./output/ch_db_res18/
save_epoch_step: 1200
# evaluation is run every 2000 iterations after the 3000th iteration
eval_batch_step: [3000, 2000]
# if pretrained_model is saved in static mode, load_static_weights must be set to True
load_static_weights: True
cal_metric_during_train: False
pretrained_model: ./pretrain_models/ResNet18_vd_pretrained
checkpoints: #./output/det_db_0.001_DiceLoss_256_pp_config_2.0b_4gpu/best_accuracy
use_visualdl: False
infer_img: doc/imgs_en/img_10.jpg
save_res_path: ./output/det_db/predicts_db.txt
model_type: det
algorithm: DB
name: ResNet
layers: 18
disable_se: True
name: DBFPN
out_channels: 256
name: DBHead
k: 50
name: DBLoss
balance_loss: true
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
name: Adam
beta1: 0.9
beta2: 0.999
name: Cosine
learning_rate: 0.001
warmup_epoch: 2
name: 'L2'
factor: 0
name: DBPostProcess
thresh: 0.3
box_thresh: 0.6
max_candidates: 1000
unclip_ratio: 1.5
name: DetMetric
main_indicator: hmean
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
- ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
ratio_list: [1.0]
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- IaaAugment:
- { 'type': Fliplr, 'args': { 'p': 0.5 } }
- { 'type': Affine, 'args': { 'rotate': [-10, 10] } }
- { 'type': Resize, 'args': { 'size': [0.5, 3] } }
- EastRandomCropData:
size: [960, 960]
max_tries: 50
keep_ratio: true
- MakeBorderMap:
shrink_ratio: 0.4
thresh_min: 0.3
thresh_max: 0.7
- MakeShrinkMap:
shrink_ratio: 0.4
min_text_size: 8
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list
shuffle: True
drop_last: False
batch_size_per_card: 8
num_workers: 4
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
- ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- DetResizeForTest:
# image_shape: [736, 1280]
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
keep_keys: ['image', 'shape', 'polys', 'ignore_tags']
shuffle: False
drop_last: False
batch_size_per_card: 1 # must be 1
num_workers: 2
...@@ -45,9 +45,7 @@ Optimizer: ...@@ -45,9 +45,7 @@ Optimizer:
beta1: 0.9 beta1: 0.9
beta2: 0.999 beta2: 0.999
lr: lr:
# name: Cosine
learning_rate: 0.001 learning_rate: 0.001
# warmup_epoch: 0
regularizer: regularizer:
name: 'L2' name: 'L2'
factor: 0 factor: 0
use_gpu: true
epoch_num: 10000
log_smooth_window: 20
print_batch_step: 2
save_model_dir: ./output/east_mv3/
save_epoch_step: 1000
# evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: [4000, 5000]
# if pretrained_model is saved in static mode, load_static_weights must be set to True
load_static_weights: True
cal_metric_during_train: False
pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
use_visualdl: False
save_res_path: ./output/det_east/predicts_east.txt
model_type: det
algorithm: EAST
name: MobileNetV3
scale: 0.5
model_name: large
model_name: small
name: EASTHead
model_name: small
name: EASTLoss
name: Adam
beta1: 0.9
beta2: 0.999
# name: Cosine
learning_rate: 0.001
# warmup_epoch: 0
name: 'L2'
factor: 0
name: EASTPostProcess
score_thresh: 0.8
cover_thresh: 0.1
nms_thresh: 0.2
name: DetMetric
main_indicator: hmean
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
- ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
ratio_list: [1.0]
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- EASTProcessTrain:
image_shape: [512, 512]
background_ratio: 0.125
min_crop_side_ratio: 0.1
min_text_size: 10
- KeepKeys:
keep_keys: ['image', 'score_map', 'geo_map', 'training_mask'] # dataloader will return list in this order
shuffle: True
drop_last: False
batch_size_per_card: 16
num_workers: 8
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
- ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- DetResizeForTest:
limit_side_len: 2400
limit_type: max
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
keep_keys: ['image', 'shape', 'polys', 'ignore_tags']
shuffle: False
drop_last: False
batch_size_per_card: 1 # must be 1
num_workers: 2
\ No newline at end of file
use_gpu: true
epoch_num: 1200
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/det_rc/det_r50_vd/
save_epoch_step: 1200
# evaluation is run every 4000 iterations after the 5000th iteration
eval_batch_step: [5000,4000]
# if pretrained_model is saved in static mode, load_static_weights must be set to True
load_static_weights: True
cal_metric_during_train: False
pretrained_model: ./pretrain_models/ResNet50_vd_ssld_pretrained
use_visualdl: False
infer_img: doc/imgs_en/img_10.jpg
save_res_path: ./output/det_db/predicts_db.txt
model_type: det
algorithm: DB
name: ResNet
layers: 50
name: DBFPN
out_channels: 256
name: DBHead
k: 50
name: DBLoss
balance_loss: true
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
name: Adam
beta1: 0.9
beta2: 0.999
learning_rate: 0.001
name: 'L2'
factor: 0
name: DBPostProcess
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5
name: DetMetric
main_indicator: hmean
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
- ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
ratio_list: [0.5]
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- IaaAugment:
- { 'type': Fliplr, 'args': { 'p': 0.5 } }
- { 'type': Affine, 'args': { 'rotate': [-10, 10] } }
- { 'type': Resize, 'args': { 'size': [0.5, 3] } }
- EastRandomCropData:
size: [640, 640]
max_tries: 50
keep_ratio: true
- MakeBorderMap:
shrink_ratio: 0.4
thresh_min: 0.3
thresh_max: 0.7
- MakeShrinkMap:
shrink_ratio: 0.4
min_text_size: 8
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list
shuffle: True
drop_last: False
batch_size_per_card: 16
num_workers: 8
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
- ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- DetResizeForTest:
image_shape: [736, 1280]
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
keep_keys: ['image', 'shape', 'polys', 'ignore_tags']
shuffle: False
drop_last: False
batch_size_per_card: 1 # must be 1
num_workers: 8
\ No newline at end of file
use_gpu: true
epoch_num: 10000
log_smooth_window: 20
print_batch_step: 2
save_model_dir: ./output/east_r50_vd/
save_epoch_step: 1000
# evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: [4000, 5000]
# if pretrained_model is saved in static mode, load_static_weights must be set to True
load_static_weights: True
cal_metric_during_train: False
pretrained_model: ./pretrain_models/ResNet50_vd_pretrained/
use_visualdl: False
save_res_path: ./output/det_east/predicts_east.txt
model_type: det
algorithm: EAST
name: ResNet
layers: 50
model_name: large
name: EASTHead
model_name: large
name: EASTLoss
name: Adam
beta1: 0.9
beta2: 0.999
# name: Cosine
learning_rate: 0.001
# warmup_epoch: 0
name: 'L2'
factor: 0
name: EASTPostProcess
score_thresh: 0.8
cover_thresh: 0.1
nms_thresh: 0.2
name: DetMetric
main_indicator: hmean
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
- ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
ratio_list: [1.0]
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- EASTProcessTrain:
image_shape: [512, 512]
background_ratio: 0.125
min_crop_side_ratio: 0.1
min_text_size: 10
- KeepKeys:
keep_keys: ['image', 'score_map', 'geo_map', 'training_mask'] # dataloader will return list in this order
shuffle: True
drop_last: False
batch_size_per_card: 8
num_workers: 8
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
- ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- DetResizeForTest:
limit_side_len: 2400
limit_type: max
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
keep_keys: ['image', 'shape', 'polys', 'ignore_tags']
shuffle: False
drop_last: False
batch_size_per_card: 1 # must be 1
num_workers: 2
\ No newline at end of file
use_gpu: true
epoch_num: 5000
log_smooth_window: 20
print_batch_step: 2
save_model_dir: ./output/sast_r50_vd_ic15/
save_epoch_step: 1000
# evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: [4000, 5000]
# if pretrained_model is saved in static mode, load_static_weights must be set to True
load_static_weights: True
cal_metric_during_train: False
pretrained_model: ./pretrain_models/ResNet50_vd_ssld_pretrained/
use_visualdl: False
save_res_path: ./output/sast_r50_vd_ic15/predicts_sast.txt
model_type: det
algorithm: SAST
name: ResNet_SAST
layers: 50
with_cab: True
name: SASTHead
name: SASTLoss
name: Adam
beta1: 0.9
beta2: 0.999
# name: Cosine
learning_rate: 0.001
# warmup_epoch: 0
name: 'L2'
factor: 0
name: SASTPostProcess
score_thresh: 0.5
sample_pts_num: 2
nms_thresh: 0.2
expand_scale: 1.0
shrink_ratio_of_width: 0.3
name: DetMetric
main_indicator: hmean
name: SimpleDataSet
data_dir: ./train_data/
label_file_path: [./train_data/art_latin_icdar_14pt/train_no_tt_test/train_label_json.txt, ./train_data/total_text_icdar_14pt/train_label_json.txt]
data_ratio_list: [0.5, 0.5]
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- SASTProcessTrain:
image_shape: [512, 512]
min_crop_side_ratio: 0.3
min_crop_size: 24
min_text_size: 4
max_text_size: 512
- KeepKeys:
keep_keys: ['image', 'score_map', 'border_map', 'training_mask', 'tvo_map', 'tco_map'] # dataloader will return list in this order
shuffle: True
drop_last: False
batch_size_per_card: 4
num_workers: 4
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
- ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- DetResizeForTest:
resize_long: 1536
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
keep_keys: ['image', 'shape', 'polys', 'ignore_tags']
shuffle: False
drop_last: False
batch_size_per_card: 1 # must be 1
num_workers: 2
\ No newline at end of file
use_gpu: true
epoch_num: 5000
log_smooth_window: 20
print_batch_step: 2
save_model_dir: ./output/sast_r50_vd_tt/
save_epoch_step: 1000
# evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: [4000, 5000]
# if pretrained_model is saved in static mode, load_static_weights must be set to True
load_static_weights: True
cal_metric_during_train: False
pretrained_model: ./pretrain_models/ResNet50_vd_ssld_pretrained/
use_visualdl: False
save_res_path: ./output/sast_r50_vd_tt/predicts_sast.txt
model_type: det
algorithm: SAST
name: ResNet_SAST
layers: 50
with_cab: True
name: SASTHead
name: SASTLoss
name: Adam
beta1: 0.9
beta2: 0.999
# name: Cosine
learning_rate: 0.001
# warmup_epoch: 0
name: 'L2'
factor: 0
name: SASTPostProcess
score_thresh: 0.5
sample_pts_num: 6
nms_thresh: 0.2
expand_scale: 1.2
shrink_ratio_of_width: 0.2
name: DetMetric
main_indicator: hmean
name: SimpleDataSet
label_file_list: [./train_data/icdar2013/train_label_json.txt, ./train_data/icdar2015/train_label_json.txt, ./train_data/icdar17_mlt_latin/train_label_json.txt, ./train_data/coco_text_icdar_4pts/train_label_json.txt]
ratio_list: [0.1, 0.45, 0.3, 0.15]
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- SASTProcessTrain:
image_shape: [512, 512]
min_crop_side_ratio: 0.3
min_crop_size: 24
min_text_size: 4
max_text_size: 512
- KeepKeys:
keep_keys: ['image', 'score_map', 'border_map', 'training_mask', 'tvo_map', 'tco_map'] # dataloader will return list in this order
shuffle: True
drop_last: False
batch_size_per_card: 4
num_workers: 4
name: SimpleDataSet
data_dir: ./train_data/
- ./train_data/total_text_icdar_14pt/test_label_json.txt
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- DetResizeForTest:
resize_long: 768
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
keep_keys: ['image', 'shape', 'polys', 'ignore_tags']
shuffle: False
drop_last: False
batch_size_per_card: 1 # must be 1
num_workers: 2
\ No newline at end of file
...@@ -3,7 +3,7 @@ Global: ...@@ -3,7 +3,7 @@ Global:
epoch_num: 500 epoch_num: 500
log_smooth_window: 20 log_smooth_window: 20
print_batch_step: 10 print_batch_step: 10
save_model_dir: ./output/rec_chinese_common_v1.1 save_model_dir: ./output/rec_chinese_common_v2.0
save_epoch_step: 3 save_epoch_step: 3
# evaluation is run every 5000 iterations after the 4000th iteration # evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: [0, 2000] eval_batch_step: [0, 2000]
...@@ -3,7 +3,7 @@ Global: ...@@ -3,7 +3,7 @@ Global:
epoch_num: 500 epoch_num: 500
log_smooth_window: 20 log_smooth_window: 20
print_batch_step: 10 print_batch_step: 10
save_model_dir: ./output/rec_chinese_lite_v1.1 save_model_dir: ./output/rec_chinese_lite_v2.0
save_epoch_step: 3 save_epoch_step: 3
# evaluation is run every 5000 iterations after the 4000th iteration # evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: [0, 2000] eval_batch_step: [0, 2000]
...@@ -19,7 +19,7 @@ Global: ...@@ -19,7 +19,7 @@ Global:
character_type: ch character_type: ch
max_text_length: 25 max_text_length: 25
infer_mode: False infer_mode: False
use_space_char: False use_space_char: True
Optimizer: Optimizer:
Global: Global:
use_gpu: true use_gpu: True
epoch_num: 500 epoch_num: 500
log_smooth_window: 20 log_smooth_window: 20
print_batch_step: 10 print_batch_step: 10
...@@ -15,7 +15,7 @@ Global: ...@@ -15,7 +15,7 @@ Global:
use_visualdl: False use_visualdl: False
infer_img: infer_img:
# for data or label process # for data or label process
character_dict_path: ppocr/utils/dict/ic15_dict.txt character_dict_path: ppocr/utils/dict/en_dict.txt
character_type: ch character_type: ch
max_text_length: 25 max_text_length: 25
infer_mode: False infer_mode: False
Global: Global:
use_gpu: true use_gpu: True
epoch_num: 500 epoch_num: 500
log_smooth_window: 20 log_smooth_window: 20
print_batch_step: 10 print_batch_step: 10
...@@ -9,9 +9,9 @@ Global: ...@@ -9,9 +9,9 @@ Global:
eval_batch_step: [0, 2000] eval_batch_step: [0, 2000]
# if pretrained_model is saved in static mode, load_static_weights must set to True # if pretrained_model is saved in static mode, load_static_weights must set to True
cal_metric_during_train: True cal_metric_during_train: True
pretrained_model: pretrained_model:
checkpoints: checkpoints:
save_inference_dir: save_inference_dir:
use_visualdl: False use_visualdl: False
infer_img: infer_img:
# for data or label process # for data or label process
...@@ -19,7 +19,7 @@ Global: ...@@ -19,7 +19,7 @@ Global:
character_type: french character_type: french
max_text_length: 25 max_text_length: 25
infer_mode: False infer_mode: False
use_space_char: True use_space_char: False
Optimizer: Optimizer:
Global: Global:
use_gpu: true use_gpu: True
epoch_num: 500 epoch_num: 500
log_smooth_window: 20 log_smooth_window: 20
print_batch_step: 10 print_batch_step: 10
...@@ -19,7 +19,7 @@ Global: ...@@ -19,7 +19,7 @@ Global:
character_type: german character_type: german
max_text_length: 25 max_text_length: 25
infer_mode: False infer_mode: False
use_space_char: True use_space_char: False
Optimizer: Optimizer:
Global: Global:
use_gpu: true use_gpu: True
epoch_num: 500 epoch_num: 500
log_smooth_window: 20 log_smooth_window: 20
print_batch_step: 10 print_batch_step: 10
...@@ -19,7 +19,7 @@ Global: ...@@ -19,7 +19,7 @@ Global:
character_type: japan character_type: japan
max_text_length: 25 max_text_length: 25
infer_mode: False infer_mode: False
use_space_char: True use_space_char: False
Optimizer: Optimizer:
Global: Global:
use_gpu: true use_gpu: True
epoch_num: 500 epoch_num: 500
log_smooth_window: 20 log_smooth_window: 20
print_batch_step: 10 print_batch_step: 10
...@@ -19,7 +19,7 @@ Global: ...@@ -19,7 +19,7 @@ Global:
character_type: korean character_type: korean
max_text_length: 25 max_text_length: 25
infer_mode: False infer_mode: False
use_space_char: True use_space_char: False
Optimizer: Optimizer:
...@@ -5,7 +5,7 @@ Global: ...@@ -5,7 +5,7 @@ Global:
print_batch_step: 10 print_batch_step: 10
save_model_dir: ./output/rec/mv3_none_bilstm_ctc/ save_model_dir: ./output/rec/mv3_none_bilstm_ctc/
save_epoch_step: 3 save_epoch_step: 3
# evaluation is run every 5000 iterations after the 4000th iteration # evaluation is run every 2000 iterations
eval_batch_step: [0, 2000] eval_batch_step: [0, 2000]
# if pretrained_model is saved in static mode, load_static_weights must set to True # if pretrained_model is saved in static mode, load_static_weights must set to True
cal_metric_during_train: True cal_metric_during_train: True
...@@ -13,7 +13,7 @@ Global: ...@@ -13,7 +13,7 @@ Global:
checkpoints: checkpoints:
save_inference_dir: save_inference_dir:
use_visualdl: False use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg infer_img: doc/imgs_words_en/word_10.png
# for data or label process # for data or label process
character_dict_path: character_dict_path:
character_type: en character_type: en
...@@ -21,7 +21,6 @@ Global: ...@@ -21,7 +21,6 @@ Global:
infer_mode: False infer_mode: False
use_space_char: False use_space_char: False
Optimizer: Optimizer:
name: Adam name: Adam
beta1: 0.9 beta1: 0.9
use_gpu: True
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/mv3_none_none_ctc/
save_epoch_step: 3
# evaluation is run every 2000 iterations
eval_batch_step: [0, 2000]
# if pretrained_model is saved in static mode, load_static_weights must be set to True
cal_metric_during_train: True
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_type: en
max_text_length: 25
infer_mode: False
use_space_char: False
name: Adam
beta1: 0.9
beta2: 0.999
learning_rate: 0.0005
name: 'L2'
factor: 0
model_type: rec
algorithm: Rosetta
name: MobileNetV3
scale: 0.5
model_name: large
name: SequenceEncoder
encoder_type: reshape
name: CTCHead
fc_decay: 0.0004
name: CTCLoss
name: CTCLabelDecode
name: RecMetric
main_indicator: acc
name: LMDBDateSet
data_dir: ./train_data/data_lmdb_release/training/
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 32, 100]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
shuffle: False
batch_size_per_card: 256
drop_last: True
num_workers: 8
name: LMDBDateSet
data_dir: ./train_data/data_lmdb_release/validation/
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 32, 100]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 8
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/mv3_tps_bilstm_ctc/
save_epoch_step: 3
# evaluation is run every 2000 iterations
eval_batch_step: [0, 2000]
# if pretrained_model is saved in static mode, load_static_weights must be set to True
cal_metric_during_train: True
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_type: en
max_text_length: 25
infer_mode: False
use_space_char: False
name: Adam
beta1: 0.9
beta2: 0.999
learning_rate: 0.0005
name: 'L2'
factor: 0
model_type: rec
algorithm: STARNet
name: TPS
num_fiducial: 20
loc_lr: 0.1
model_name: small
name: MobileNetV3
scale: 0.5
model_name: large
name: SequenceEncoder
encoder_type: rnn
hidden_size: 96
name: CTCHead
fc_decay: 0.0004
name: CTCLoss
name: CTCLabelDecode
name: RecMetric
main_indicator: acc
name: LMDBDateSet
data_dir: ./train_data/data_lmdb_release/training/
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 32, 100]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
shuffle: False
batch_size_per_card: 256
drop_last: True
num_workers: 8
name: LMDBDateSet
data_dir: ./train_data/data_lmdb_release/validation/
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 32, 100]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 4
...@@ -5,7 +5,7 @@ Global: ...@@ -5,7 +5,7 @@ Global:
print_batch_step: 10 print_batch_step: 10
save_model_dir: ./output/rec/r34_vd_none_bilstm_ctc/ save_model_dir: ./output/rec/r34_vd_none_bilstm_ctc/
save_epoch_step: 3 save_epoch_step: 3
# evaluation is run every 5000 iterations after the 4000th iteration # evaluation is run every 2000 iterations
eval_batch_step: [0, 2000] eval_batch_step: [0, 2000]
# if pretrained_model is saved in static mode, load_static_weights must set to True # if pretrained_model is saved in static mode, load_static_weights must set to True
cal_metric_during_train: True cal_metric_during_train: True
...@@ -13,7 +13,7 @@ Global: ...@@ -13,7 +13,7 @@ Global:
checkpoints: checkpoints:
save_inference_dir: save_inference_dir:
use_visualdl: False use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg infer_img: doc/imgs_words_en/word_10.png
# for data or label process # for data or label process
character_dict_path: character_dict_path:
character_type: en character_type: en
...@@ -21,7 +21,6 @@ Global: ...@@ -21,7 +21,6 @@ Global:
infer_mode: False infer_mode: False
use_space_char: False use_space_char: False
Optimizer: Optimizer:
name: Adam name: Adam
beta1: 0.9 beta1: 0.9
...@@ -71,7 +70,7 @@ Train: ...@@ -71,7 +70,7 @@ Train:
- KeepKeys: - KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader: loader:
shuffle: False shuffle: True
batch_size_per_card: 256 batch_size_per_card: 256
drop_last: True drop_last: True
num_workers: 8 num_workers: 8
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/r34_vd_none_none_ctc/
save_epoch_step: 3
# evaluation is run every 2000 iterations
eval_batch_step: [0, 2000]
# if pretrained_model is saved in static mode, load_static_weights must be set to True
cal_metric_during_train: True
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_type: en
max_text_length: 25
infer_mode: False
use_space_char: False
name: Adam
beta1: 0.9
beta2: 0.999
learning_rate: 0.0005
name: 'L2'
factor: 0
model_type: rec
algorithm: Rosetta
name: ResNet
layers: 34
name: SequenceEncoder
encoder_type: reshape
name: CTCHead
fc_decay: 0.0004
name: CTCLoss
name: CTCLabelDecode
name: RecMetric
main_indicator: acc
name: LMDBDateSet
data_dir: ./train_data/data_lmdb_release/training/
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 32, 100]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
shuffle: True
batch_size_per_card: 256
drop_last: True
num_workers: 8
name: LMDBDateSet
data_dir: ./train_data/data_lmdb_release/validation/
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 32, 100]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 4
...@@ -5,7 +5,7 @@ Global: ...@@ -5,7 +5,7 @@ Global:
print_batch_step: 10 print_batch_step: 10
save_model_dir: ./output/rec/r34_vd_tps_bilstm_ctc/ save_model_dir: ./output/rec/r34_vd_tps_bilstm_ctc/
save_epoch_step: 3 save_epoch_step: 3
# evaluation is run every 5000 iterations after the 4000th iteration # evaluation is run every 2000 iterations
eval_batch_step: [0, 2000] eval_batch_step: [0, 2000]
# if pretrained_model is saved in static mode, load_static_weights must set to True # if pretrained_model is saved in static mode, load_static_weights must set to True
cal_metric_during_train: True cal_metric_during_train: True
...@@ -13,7 +13,7 @@ Global: ...@@ -13,7 +13,7 @@ Global:
checkpoints: checkpoints:
save_inference_dir: save_inference_dir:
use_visualdl: False use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg infer_img: doc/imgs_words_en/word_10.png
# for data or label process # for data or label process
character_dict_path: character_dict_path:
character_type: en character_type: en
...@@ -21,7 +21,6 @@ Global: ...@@ -21,7 +21,6 @@ Global:
infer_mode: False infer_mode: False
use_space_char: False use_space_char: False
Optimizer: Optimizer:
name: Adam name: Adam
beta1: 0.9 beta1: 0.9
...@@ -34,7 +33,7 @@ Optimizer: ...@@ -34,7 +33,7 @@ Optimizer:
Architecture: Architecture:
model_type: rec model_type: rec
algorithm: CRNN algorithm: STARNet
Transform: Transform:
name: TPS name: TPS
num_fiducial: 20 num_fiducial: 20
...@@ -81,7 +81,8 @@ cv::Mat Classifier::Run(cv::Mat &img) { ...@@ -81,7 +81,8 @@ cv::Mat Classifier::Run(cv::Mat &img) {
void Classifier::LoadModel(const std::string &model_dir) { void Classifier::LoadModel(const std::string &model_dir) {
AnalysisConfig config; AnalysisConfig config;
config.SetModel(model_dir + "/model", model_dir + "/params"); config.SetModel(model_dir + "/inference.pdmodel",
model_dir + "/inference.pdiparams");
if (this->use_gpu_) { if (this->use_gpu_) {
config.EnableUseGpu(this->gpu_mem_, this->gpu_id_); config.EnableUseGpu(this->gpu_mem_, this->gpu_id_);
...@@ -18,7 +18,8 @@ namespace PaddleOCR { ...@@ -18,7 +18,8 @@ namespace PaddleOCR {
void DBDetector::LoadModel(const std::string &model_dir) { void DBDetector::LoadModel(const std::string &model_dir) {
AnalysisConfig config; AnalysisConfig config;
config.SetModel(model_dir + "/model", model_dir + "/params"); config.SetModel(model_dir + "/inference.pdmodel",
model_dir + "/inference.pdiparams");
if (this->use_gpu_) { if (this->use_gpu_) {
config.EnableUseGpu(this->gpu_mem_, this->gpu_id_); config.EnableUseGpu(this->gpu_mem_, this->gpu_id_);
...@@ -103,7 +103,8 @@ void CRNNRecognizer::Run(std::vector<std::vector<std::vector<int>>> boxes, ...@@ -103,7 +103,8 @@ void CRNNRecognizer::Run(std::vector<std::vector<std::vector<int>>> boxes,
void CRNNRecognizer::LoadModel(const std::string &model_dir) { void CRNNRecognizer::LoadModel(const std::string &model_dir) {
AnalysisConfig config; AnalysisConfig config;
config.SetModel(model_dir + "/model", model_dir + "/params"); config.SetModel(model_dir + "/inference.pdmodel",
model_dir + "/inference.pdiparams");
if (this->use_gpu_) { if (this->use_gpu_) {
config.EnableUseGpu(this->gpu_mem_, this->gpu_id_); config.EnableUseGpu(this->gpu_mem_, this->gpu_id_);
English | [简体中文](README_cn.md)
## Introduction
Many users hope to package the PaddleOCR service into a Docker image so that it can be quickly released and used in a Docker or k8s environment.
This page provides some standardized code to achieve this goal. You can quickly publish the PaddleOCR project as a callable RESTful API service through the following steps. (At present, only deployment based on the HubServing mode is implemented; the author plans to add deployment based on the PaddleServing mode in the future.)
## 1. Prerequisites
You need to install the following basic components first:
a. Docker
b. Graphics driver and CUDA 10.0+(GPU)
c. NVIDIA Container Toolkit(GPU,Docker 19.03+ can skip this)
d. cuDNN 7.6+(GPU)
## 2. Build Image
a. Go to the Dockerfile directory (note: the CPU and GPU versions are kept separately; the following takes the CPU version as an example, for the GPU version replace the keyword accordingly)
cd deploy/docker/hubserving/cpu
b. Build image
docker build -t paddleocr:cpu .
## 3. Start container
a. CPU version
sudo docker run -dp 8868:8868 --name paddle_ocr paddleocr:cpu
b. GPU version (based on NVIDIA Container Toolkit)
sudo nvidia-docker run -dp 8868:8868 --name paddle_ocr paddleocr:gpu
c. GPU version (Docker 19.03+)
sudo docker run -dp 8868:8868 --gpus all --name paddle_ocr paddleocr:gpu
d. Check service status(If you can see the following statement then it means completed:Successfully installed ocr_system && Running on
docker logs -f paddle_ocr
## 4. Test
a. Calculate the Base64 encoding of the picture to be recognized (if you just test, you can use a free online tool, like:https://freeonlinetools24.com/base64-image/)
b. Post a service request(sample request in sample_request.txt)
curl -H "Content-Type:application/json" -X POST --data "{\"images\": [\"Input image Base64 encode(need to delete the code 'data:image/jpg;base64,')\"]}" http://localhost:8868/predict/ocr_system
c. Get response (if the call is successful, the following result will be returned)
[English](README.md) | 简体中文
## Docker化部署服务
本文将提供一些标准化的代码来实现这样的目标。大家通过如下步骤可以把PaddleOCR项目快速发布成可调用的Restful API服务。(目前暂时先实现了基于HubServing模式的部署,后续作者计划增加PaddleServing模式的部署)
## 1.实施前提准备
a. Docker环境
b. 显卡驱动和CUDA 10.0+(GPU)
c. NVIDIA Container Toolkit(GPU,Docker 19.03以上版本可以跳过此步)
d. cuDNN 7.6+(GPU)
## 2.制作镜像
cd deploy/docker/hubserving/cpu
docker build -t paddleocr:cpu .
## 3.启动Docker容器
a. CPU 版本
sudo docker run -dp 8868:8868 --name paddle_ocr paddleocr:cpu
b. GPU 版本 (通过NVIDIA Container Toolkit)
sudo nvidia-docker run -dp 8868:8868 --name paddle_ocr paddleocr:gpu
c. GPU 版本 (Docker 19.03以上版本,可以直接用如下命令)
sudo docker run -dp 8868:8868 --gpus all --name paddle_ocr paddleocr:gpu
d. 检查服务运行情况(出现:Successfully installed ocr_system和Running on 等信息,表示运行成功)
docker logs -f paddle_ocr
## 4.测试服务
a. 计算待识别图片的Base64编码(如果只是测试一下效果,可以通过免费的在线工具实现,如:http://tool.chinaz.com/tools/imgtobase/)
b. 发送服务请求(可参见sample_request.txt中的值)
curl -H "Content-Type:application/json" -X POST --data "{\"images\": [\"填入图片Base64编码(需要删除'data:image/jpg;base64,')\"]}" http://localhost:8868/predict/ocr_system
c. 返回结果(如果调用成功,会返回如下结果)
# Version: 1.0.0
FROM hub.baidubce.com/paddlepaddle/paddle:latest-gpu-cuda10.0-cudnn7-dev
# PaddleOCR is based on Python3.7
RUN pip3.7 install --upgrade pip -i https://mirror.baidu.com/pypi/simple
RUN python3.7 -m pip install paddlepaddle==2.0.0rc0 -i https://mirror.baidu.com/pypi/simple
RUN pip3.7 install paddlehub --upgrade -i https://mirror.baidu.com/pypi/simple
RUN git clone https://github.com/PaddlePaddle/PaddleOCR.git /PaddleOCR
RUN pip3.7 install -r requirements.txt -i https://mirror.baidu.com/pypi/simple
RUN mkdir -p /PaddleOCR/inference/
# Download the OCR detection model (light version). To use the normal version instead, change ch_ppocr_mobile_v1.1_det_infer to ch_ppocr_server_v1.1_det_infer, and remember to change det_model_dir in deploy/hubserving/ocr_system/params.py.
ADD {link} /PaddleOCR/inference/
RUN tar xf /PaddleOCR/inference/{file} -C /PaddleOCR/inference/
# Download direction classifier(light version). If you want to change normal version, you can change ch_ppocr_mobile_v1.1_cls_infer to ch_ppocr_mobile_v1.1_cls_infer, also remember change cls_model_dir in deploy/hubserving/ocr_system/params.py)
ADD {link} /PaddleOCR/inference/
RUN tar xf /PaddleOCR/inference/{file}.tar -C /PaddleOCR/inference/
# Download the OCR recognition model (light version). To use the normal version instead, change ch_ppocr_mobile_v1.1_rec_infer to ch_ppocr_server_v1.1_rec_infer, and remember to change rec_model_dir in deploy/hubserving/ocr_system/params.py.
ADD {link} /PaddleOCR/inference/
RUN tar xf /PaddleOCR/inference/{file}.tar -C /PaddleOCR/inference/
CMD ["/bin/bash","-c","hub install deploy/hubserving/ocr_system/ && hub serving start -m ocr_system"]
\ No newline at end of file
# Version: 1.0.0
FROM hub.baidubce.com/paddlepaddle/paddle:latest-gpu-cuda10.0-cudnn7-dev
# PaddleOCR is based on Python3.7
RUN pip3.7 install --upgrade pip -i https://mirror.baidu.com/pypi/simple
RUN python3.7 -m pip install paddlepaddle-gpu==2.0.0rc0 -i https://mirror.baidu.com/pypi/simple
RUN pip3.7 install paddlehub --upgrade -i https://mirror.baidu.com/pypi/simple
RUN git clone https://github.com/PaddlePaddle/PaddleOCR.git /PaddleOCR
RUN pip3.7 install -r requirements.txt -i https://mirror.baidu.com/pypi/simple
RUN mkdir -p /PaddleOCR/inference/
# Download the OCR detection model (light version). To use the normal version instead, change ch_ppocr_mobile_v1.1_det_infer to ch_ppocr_server_v1.1_det_infer, and remember to change det_model_dir in deploy/hubserving/ocr_system/params.py.
ADD {link} /PaddleOCR/inference/
RUN tar xf /PaddleOCR/inference/{file}.tar -C /PaddleOCR/inference/
# Download direction classifier(light version). If you want to change normal version, you can change ch_ppocr_mobile_v1.1_cls_infer to ch_ppocr_mobile_v1.1_cls_infer, also remember change cls_model_dir in deploy/hubserving/ocr_system/params.py)
ADD {link} /PaddleOCR/inference/
RUN tar xf /PaddleOCR/inference/{file} -C /PaddleOCR/inference/
# Download the OCR recognition model (light version). To use the normal version instead, change ch_ppocr_mobile_v1.1_rec_infer to ch_ppocr_server_v1.1_rec_infer, and remember to change rec_model_dir in deploy/hubserving/ocr_system/params.py.
ADD {link} /PaddleOCR/inference/
RUN tar xf /PaddleOCR/inference/{file}.tar -C /PaddleOCR/inference/
CMD ["/bin/bash","-c","hub install deploy/hubserving/ocr_system/ && hub serving start -m ocr_system"]
\ No newline at end of file
"modules_info": {
"ocr_cls": {
"init_args": {
"version": "1.0.0",
"use_gpu": true
"predict_args": {
"port": 8866,
"use_multiprocess": false,
"workers": 2
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
sys.path.insert(0, ".")
from paddlehub.common.logger import logger
from paddlehub.module.module import moduleinfo, runnable, serving
import cv2
import paddlehub as hub
from tools.infer.utility import base64_to_cv2
from tools.infer.predict_cls import TextClassifier
summary="ocr recognition service",
class OCRCls(hub.Module):
def _initialize(self, use_gpu=False, enable_mkldnn=False):
initialize with the necessary elements
from ocr_cls.params import read_params
cfg = read_params()
cfg.use_gpu = use_gpu
if use_gpu:
_places = os.environ["CUDA_VISIBLE_DEVICES"]
print("use gpu: ", use_gpu)
print("CUDA_VISIBLE_DEVICES: ", _places)
cfg.gpu_mem = 8000
raise RuntimeError(
"Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES via export CUDA_VISIBLE_DEVICES=cuda_device_id."
cfg.ir_optim = True
cfg.enable_mkldnn = enable_mkldnn
self.text_classifier = TextClassifier(cfg)
def read_images(self, paths=[]):
images = []
for img_path in paths:
assert os.path.isfile(
img_path), "The {} isn't a valid file.".format(img_path)
img = cv2.imread(img_path)
if img is None:
logger.info("error in loading image:{}".format(img_path))
return images
def predict(self, images=[], paths=[]):
Get the text angle in the predicted images.
images (list(numpy.ndarray)): images data, shape of each is [H, W, C]. If images not paths
paths (list[str]): The paths of images. If paths not images
res (list): The result of text detection box and save path of images.
if images != [] and isinstance(images, list) and paths == []:
predicted_data = images
elif images == [] and isinstance(paths, list) and paths != []:
predicted_data = self.read_images(paths)
raise TypeError("The input data is inconsistent with expectations.")
assert predicted_data != [], "There is not any image to be predicted. Please check the input data."
img_list = []
for img in predicted_data:
if img is None:
rec_res_final = []
img_list, cls_res, predict_time = self.text_classifier(img_list)
for dno in range(len(cls_res)):
angle, score = cls_res[dno]
'angle': angle,
'confidence': float(score),
except Exception as e:
return [[]]
return [rec_res_final]
def serving_method(self, images, **kwargs):
Run as a service.
images_decode = [base64_to_cv2(image) for image in images]
results = self.predict(images_decode, **kwargs)
return results
if __name__ == '__main__':
ocr = OCRCls()
image_path = [
res = ocr.predict(paths=image_path)
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
class Config(object):
def read_params():
cfg = Config()
#params for text classifier
cfg.cls_model_dir = "./inference/ch_ppocr_mobile_v1.1_cls_infer/"
cfg.cls_image_shape = "3, 48, 192"
cfg.label_list = ['0', '180']
cfg.cls_batch_num = 30
cfg.cls_thresh = 0.9
cfg.use_zero_copy_run = False
cfg.use_pdserving = False
return cfg
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
} }
} }
}, },
"port": 8866, "port": 8865,
"use_multiprocess": false, "use_multiprocess": false,
"workers": 2 "workers": 2
} }
...@@ -3,20 +3,14 @@ from __future__ import absolute_import ...@@ -3,20 +3,14 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import argparse
import ast
import copy
import math
import os import os
import time import sys
sys.path.insert(0, ".")
from paddle.fluid.core import AnalysisConfig, create_paddle_predictor, PaddleTensor
from paddlehub.common.logger import logger from paddlehub.common.logger import logger
from paddlehub.module.module import moduleinfo, runnable, serving from paddlehub.module.module import moduleinfo, runnable, serving
from PIL import Image
import cv2 import cv2
import numpy as np import numpy as np
import paddle.fluid as fluid
import paddlehub as hub import paddlehub as hub
from tools.infer.utility import base64_to_cv2 from tools.infer.utility import base64_to_cv2
...@@ -67,9 +61,7 @@ class OCRDet(hub.Module): ...@@ -67,9 +61,7 @@ class OCRDet(hub.Module):
images.append(img) images.append(img)
return images return images
def predict(self, def predict(self, images=[], paths=[]):
""" """
Get the text box in the predicted images. Get the text box in the predicted images.
Args: Args:
...@@ -87,7 +79,7 @@ class OCRDet(hub.Module): ...@@ -87,7 +79,7 @@ class OCRDet(hub.Module):
raise TypeError("The input data is inconsistent with expectations.") raise TypeError("The input data is inconsistent with expectations.")
assert predicted_data != [], "There is not any image to be predicted. Please check the input data." assert predicted_data != [], "There is not any image to be predicted. Please check the input data."
all_results = [] all_results = []
for img in predicted_data: for img in predicted_data:
if img is None: if img is None:
...@@ -99,11 +91,9 @@ class OCRDet(hub.Module): ...@@ -99,11 +91,9 @@ class OCRDet(hub.Module):
rec_res_final = [] rec_res_final = []
for dno in range(len(dt_boxes)): for dno in range(len(dt_boxes)):
rec_res_final.append( rec_res_final.append({
{ 'text_region': dt_boxes[dno].astype(np.int).tolist()
'text_region': dt_boxes[dno].astype(np.int).tolist() })
all_results.append(rec_res_final) all_results.append(rec_res_final)
return all_results return all_results
...@@ -116,7 +106,7 @@ class OCRDet(hub.Module): ...@@ -116,7 +106,7 @@ class OCRDet(hub.Module):
results = self.predict(images_decode, **kwargs) results = self.predict(images_decode, **kwargs)
return results return results
if __name__ == '__main__': if __name__ == '__main__':
ocr = OCRDet() ocr = OCRDet()
image_path = [ image_path = [
...@@ -124,4 +114,4 @@ if __name__ == '__main__': ...@@ -124,4 +114,4 @@ if __name__ == '__main__':
'./doc/imgs/12.jpg', './doc/imgs/12.jpg',
] ]
res = ocr.predict(paths=image_path) res = ocr.predict(paths=image_path)
print(res) print(res)
\ No newline at end of file
...@@ -10,16 +10,17 @@ class Config(object): ...@@ -10,16 +10,17 @@ class Config(object):
def read_params(): def read_params():
cfg = Config() cfg = Config()
#params for text detector #params for text detector
cfg.det_algorithm = "DB" cfg.det_algorithm = "DB"
cfg.det_model_dir = "./inference/ch_det_mv3_db/" cfg.det_model_dir = "./inference/ch_ppocr_mobile_v1.1_det_infer/"
cfg.det_max_side_len = 960 cfg.det_limit_side_len = 960
cfg.det_limit_type = 'max'
#DB parmas #DB parmas
cfg.det_db_thresh =0.3 cfg.det_db_thresh = 0.3
cfg.det_db_box_thresh =0.5 cfg.det_db_box_thresh = 0.5
cfg.det_db_unclip_ratio =2.0 cfg.det_db_unclip_ratio = 2.0
# #EAST parmas # #EAST parmas
# cfg.det_east_score_thresh = 0.8 # cfg.det_east_score_thresh = 0.8
...@@ -37,5 +38,6 @@ def read_params(): ...@@ -37,5 +38,6 @@ def read_params():
# cfg.use_space_char = True # cfg.use_space_char = True
cfg.use_zero_copy_run = False cfg.use_zero_copy_run = False
cfg.use_pdserving = False
return cfg return cfg
...@@ -3,20 +3,13 @@ from __future__ import absolute_import ...@@ -3,20 +3,13 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import argparse
import ast
import copy
import math
import os import os
import time import sys
sys.path.insert(0, ".")
from paddle.fluid.core import AnalysisConfig, create_paddle_predictor, PaddleTensor
from paddlehub.common.logger import logger from paddlehub.common.logger import logger
from paddlehub.module.module import moduleinfo, runnable, serving from paddlehub.module.module import moduleinfo, runnable, serving
from PIL import Image
import cv2 import cv2
import numpy as np
import paddle.fluid as fluid
import paddlehub as hub import paddlehub as hub
from tools.infer.utility import base64_to_cv2 from tools.infer.utility import base64_to_cv2
...@@ -67,9 +60,7 @@ class OCRRec(hub.Module): ...@@ -67,9 +60,7 @@ class OCRRec(hub.Module):
images.append(img) images.append(img)
return images return images
def predict(self, def predict(self, images=[], paths=[]):
""" """
Get the text box in the predicted images. Get the text box in the predicted images.
Args: Args:
...@@ -87,31 +78,28 @@ class OCRRec(hub.Module): ...@@ -87,31 +78,28 @@ class OCRRec(hub.Module):
raise TypeError("The input data is inconsistent with expectations.") raise TypeError("The input data is inconsistent with expectations.")
assert predicted_data != [], "There is not any image to be predicted. Please check the input data." assert predicted_data != [], "There is not any image to be predicted. Please check the input data."
img_list = [] img_list = []
for img in predicted_data: for img in predicted_data:
if img is None: if img is None:
continue continue
img_list.append(img) img_list.append(img)
rec_res_final = [] rec_res_final = []
try: try:
rec_res, predict_time = self.text_recognizer(img_list) rec_res, predict_time = self.text_recognizer(img_list)
for dno in range(len(rec_res)): for dno in range(len(rec_res)):
text, score = rec_res[dno] text, score = rec_res[dno]
rec_res_final.append( rec_res_final.append({
{ 'text': text,
'text': text, 'confidence': float(score),
'confidence': float(score), })
except Exception as e: except Exception as e:
print(e) print(e)
return [[]] return [[]]
return [rec_res_final] return [rec_res_final]
@serving @serving
def serving_method(self, images, **kwargs): def serving_method(self, images, **kwargs):
""" """
...@@ -121,7 +109,7 @@ class OCRRec(hub.Module): ...@@ -121,7 +109,7 @@ class OCRRec(hub.Module):
results = self.predict(images_decode, **kwargs) results = self.predict(images_decode, **kwargs)
return results return results
if __name__ == '__main__': if __name__ == '__main__':
ocr = OCRRec() ocr = OCRRec()
image_path = [ image_path = [
...@@ -130,4 +118,4 @@ if __name__ == '__main__': ...@@ -130,4 +118,4 @@ if __name__ == '__main__':
'./doc/imgs_words/ch/word_3.jpg', './doc/imgs_words/ch/word_3.jpg',
] ]
res = ocr.predict(paths=image_path) res = ocr.predict(paths=image_path)
print(res) print(res)
\ No newline at end of file
...@@ -10,25 +10,10 @@ class Config(object): ...@@ -10,25 +10,10 @@ class Config(object):
def read_params(): def read_params():
cfg = Config() cfg = Config()
# #params for text detector
# cfg.det_algorithm = "DB"
# cfg.det_model_dir = "./inference/ch_det_mv3_db/"
# cfg.det_max_side_len = 960
# #DB parmas
# cfg.det_db_thresh =0.3
# cfg.det_db_box_thresh =0.5
# cfg.det_db_unclip_ratio =2.0
# #EAST parmas
# cfg.det_east_score_thresh = 0.8
# cfg.det_east_cover_thresh = 0.1
# cfg.det_east_nms_thresh = 0.2
#params for text recognizer #params for text recognizer
cfg.rec_algorithm = "CRNN" cfg.rec_algorithm = "CRNN"
cfg.rec_model_dir = "./inference/ch_rec_mv3_crnn/" cfg.rec_model_dir = "./inference/ch_ppocr_mobile_v1.1_rec_infer/"
cfg.rec_image_shape = "3, 32, 320" cfg.rec_image_shape = "3, 32, 320"
cfg.rec_char_type = 'ch' cfg.rec_char_type = 'ch'
...@@ -39,5 +24,6 @@ def read_params(): ...@@ -39,5 +24,6 @@ def read_params():
cfg.use_space_char = True cfg.use_space_char = True
cfg.use_zero_copy_run = False cfg.use_zero_copy_run = False
cfg.use_pdserving = False
return cfg return cfg
...@@ -3,20 +3,16 @@ from __future__ import absolute_import ...@@ -3,20 +3,16 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import argparse
import ast
import copy
import math
import os import os
import sys
sys.path.insert(0, ".")
import time import time
from paddle.fluid.core import AnalysisConfig, create_paddle_predictor, PaddleTensor
from paddlehub.common.logger import logger from paddlehub.common.logger import logger
from paddlehub.module.module import moduleinfo, runnable, serving from paddlehub.module.module import moduleinfo, runnable, serving
from PIL import Image
import cv2 import cv2
import numpy as np import numpy as np
import paddle.fluid as fluid
import paddlehub as hub import paddlehub as hub
from tools.infer.utility import base64_to_cv2 from tools.infer.utility import base64_to_cv2
...@@ -52,7 +48,7 @@ class OCRSystem(hub.Module): ...@@ -52,7 +48,7 @@ class OCRSystem(hub.Module):
) )
cfg.ir_optim = True cfg.ir_optim = True
cfg.enable_mkldnn = enable_mkldnn cfg.enable_mkldnn = enable_mkldnn
self.text_sys = TextSystem(cfg) self.text_sys = TextSystem(cfg)
def read_images(self, paths=[]): def read_images(self, paths=[]):
...@@ -67,9 +63,7 @@ class OCRSystem(hub.Module): ...@@ -67,9 +63,7 @@ class OCRSystem(hub.Module):
images.append(img) images.append(img)
return images return images
def predict(self, def predict(self, images=[], paths=[]):
""" """
Get the chinese texts in the predicted images. Get the chinese texts in the predicted images.
Args: Args:
...@@ -104,13 +98,11 @@ class OCRSystem(hub.Module): ...@@ -104,13 +98,11 @@ class OCRSystem(hub.Module):
for dno in range(dt_num): for dno in range(dt_num):
text, score = rec_res[dno] text, score = rec_res[dno]
rec_res_final.append( rec_res_final.append({
{ 'text': text,
'text': text, 'confidence': float(score),
'confidence': float(score), 'text_region': dt_boxes[dno].astype(np.int).tolist()
'text_region': dt_boxes[dno].astype(np.int).tolist() })
all_results.append(rec_res_final) all_results.append(rec_res_final)
return all_results return all_results
...@@ -123,7 +115,7 @@ class OCRSystem(hub.Module): ...@@ -123,7 +115,7 @@ class OCRSystem(hub.Module):
results = self.predict(images_decode, **kwargs) results = self.predict(images_decode, **kwargs)
return results return results
if __name__ == '__main__': if __name__ == '__main__':
ocr = OCRSystem() ocr = OCRSystem()
image_path = [ image_path = [
...@@ -131,4 +123,4 @@ if __name__ == '__main__': ...@@ -131,4 +123,4 @@ if __name__ == '__main__':
'./doc/imgs/12.jpg', './doc/imgs/12.jpg',
] ]
res = ocr.predict(paths=image_path) res = ocr.predict(paths=image_path)
print(res) print(res)
\ No newline at end of file
...@@ -10,16 +10,17 @@ class Config(object): ...@@ -10,16 +10,17 @@ class Config(object):
def read_params(): def read_params():
cfg = Config() cfg = Config()
#params for text detector #params for text detector
cfg.det_algorithm = "DB" cfg.det_algorithm = "DB"
cfg.det_model_dir = "./inference/ch_det_mv3_db/" cfg.det_model_dir = "./inference/ch_ppocr_mobile_v1.1_det_infer/"
cfg.det_max_side_len = 960 cfg.det_limit_side_len = 960
cfg.det_limit_type = 'max'
#DB parmas #DB parmas
cfg.det_db_thresh =0.3 cfg.det_db_thresh = 0.3
cfg.det_db_box_thresh =0.5 cfg.det_db_box_thresh = 0.5
cfg.det_db_unclip_ratio =2.0 cfg.det_db_unclip_ratio = 2.0
#EAST parmas #EAST parmas
cfg.det_east_score_thresh = 0.8 cfg.det_east_score_thresh = 0.8
...@@ -28,7 +29,7 @@ def read_params(): ...@@ -28,7 +29,7 @@ def read_params():
#params for text recognizer #params for text recognizer
cfg.rec_algorithm = "CRNN" cfg.rec_algorithm = "CRNN"
cfg.rec_model_dir = "./inference/ch_rec_mv3_crnn/" cfg.rec_model_dir = "./inference/ch_ppocr_mobile_v1.1_rec_infer/"
cfg.rec_image_shape = "3, 32, 320" cfg.rec_image_shape = "3, 32, 320"
cfg.rec_char_type = 'ch' cfg.rec_char_type = 'ch'
...@@ -38,6 +39,15 @@ def read_params(): ...@@ -38,6 +39,15 @@ def read_params():
cfg.rec_char_dict_path = "./ppocr/utils/ppocr_keys_v1.txt" cfg.rec_char_dict_path = "./ppocr/utils/ppocr_keys_v1.txt"
cfg.use_space_char = True cfg.use_space_char = True
#params for text classifier
cfg.use_angle_cls = True
cfg.cls_model_dir = "./inference/ch_ppocr_mobile_v1.1_cls_infer/"
cfg.cls_image_shape = "3, 48, 192"
cfg.label_list = ['0', '180']
cfg.cls_batch_num = 30
cfg.cls_thresh = 0.9
cfg.use_zero_copy_run = False cfg.use_zero_copy_run = False
cfg.use_pdserving = False
return cfg return cfg
[English](readme_en.md) | 简体中文
- 基于PaddleHub Serving的部署:代码路径为"`./deploy/hubserving`",按照本教程使用;
- 基于PaddleServing的部署:代码路径为"`./deploy/pdserving`",使用方法参考[文档](../../deploy/pdserving/readme.md)
# 基于PaddleHub Serving的服务部署
└─ ocr_cls 分类模块服务包
└─ ocr_det 检测模块服务包
└─ ocr_rec 识别模块服务包
└─ ocr_system 检测+识别串联服务包
└─ __init__.py 空文件,必选
└─ config.json 配置文件,可选,使用配置启动服务时作为参数传入
└─ module.py 主模块,必选,包含服务的完整逻辑
└─ params.py 参数文件,必选,包含模型路径、前后处理参数等参数
## 快速启动服务
### 1. 准备环境
# 安装paddlehub
pip3 install paddlehub --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple
### 2. 下载推理模型
**模型路径可在`params.py`中查看和修改。** 更多模型可以从PaddleOCR提供的[模型库](../../doc/doc_ch/models_list.md)下载,也可以替换成自己训练转换好的模型。
### 3. 安装服务模块
* 在Linux环境下,安装示例如下:
# 安装检测服务模块:
hub install deploy/hubserving/ocr_det/
# 或,安装分类服务模块:
hub install deploy/hubserving/ocr_cls/
# 或,安装识别服务模块:
hub install deploy/hubserving/ocr_rec/
# 或,安装检测+识别串联服务模块:
hub install deploy/hubserving/ocr_system/
* 在Windows环境下(文件夹的分隔符为`\`),安装示例如下:
# 安装检测服务模块:
hub install deploy\hubserving\ocr_det\
# 或,安装分类服务模块:
hub install deploy\hubserving\ocr_cls\
# 或,安装识别服务模块:
hub install deploy\hubserving\ocr_rec\
# 或,安装检测+识别串联服务模块:
hub install deploy\hubserving\ocr_system\
### 4. 启动服务
#### 方式1. 命令行命令启动(仅支持CPU)
$ hub serving start --modules [Module1==Version1, Module2==Version2, ...] \
--port XXXX \
--use_multiprocess \
--workers \
|--modules/-m|PaddleHub Serving预安装模型,以多个Module==Version键值对的形式列出<br>*`当不指定Version时,默认选择最新版本`*|
如启动串联服务: ```hub serving start -m ocr_system```
#### 方式2. 配置文件启动(支持CPU、GPU)
```hub serving start -c config.json```
"modules_info": {
"ocr_system": {
"init_args": {
"version": "1.0.0",
"use_gpu": true
"predict_args": {
"port": 8868,
"use_multiprocess": false,
"workers": 2
- `init_args`中的可配参数与`module.py`中的`_initialize`函数接口一致。其中,**当`use_gpu`为`true`时,表示使用GPU启动服务**。
- `predict_args`中的可配参数与`module.py`中的`predict`函数接口一致。
- 使用配置文件启动服务时,其他参数会被忽略。
- 如果使用GPU预测(即,`use_gpu`置为`true`),则需要在启动服务之前,设置CUDA_VISIBLE_DEVICES环境变量,如:```export CUDA_VISIBLE_DEVICES=0```,否则不用设置。
- **`use_gpu`不可与`use_multiprocess`同时为`true`**。
如,使用GPU 3号卡启动串联服务:
hub serving start -c deploy/hubserving/ocr_system/config.json
## 发送预测请求
```python tools/test_hubserving.py server_url image_path```
- **server_url**:服务地址,格式为
- **image_path**:测试图像路径,可以是单张图片路径,也可以是图像集合目录路径
```python tools/test_hubserving.py ./doc/imgs/```
## 返回结果格式说明
|confidence|float| 文本识别置信度或文本角度分类置信度|
| 字段名/模块名 | ocr_det | ocr_cls | ocr_rec | ocr_system |
| ---- | ---- | ---- | ---- | ---- |
|angle| | ✔ | | ✔ |
|text| | |✔|✔|
|confidence| |✔ |✔|✔|
|text_region| ✔| | |✔ |
**说明:** 如果需要增加、删除、修改返回字段,可在相应模块的`module.py`文件中进行修改,完整流程参考下一节自定义修改服务模块。
## 自定义修改服务模块
- 1、 停止服务
```hub serving stop --port/-p XXXX```
- 2、 到相应的`module.py`和`params.py`等文件中根据实际需求修改代码。
例如,如果需要替换部署服务所用模型,则需要到`params.py`中修改模型路径参数`det_model_dir`和`rec_model_dir`,如果需要关闭文本方向分类器,则将参数`use_angle_cls`置为`False`,当然,同时可能还需要修改其他相关参数,请根据实际情况修改调试。 **强烈建议修改后先直接运行`module.py`调试,能正确运行预测后再启动服务测试。**
- 3、 卸载旧服务包
```hub uninstall ocr_system```
- 4、 安装修改后的新服务包
```hub install deploy/hubserving/ocr_system/```
- 5、重新启动服务
```hub serving start -m ocr_system```
English | [简体中文](readme.md)
PaddleOCR provides 2 service deployment methods:
- Based on **PaddleHub Serving**: Code path is "`./deploy/hubserving`". Please follow this tutorial.
- Based on **PaddleServing**: Code path is "`./deploy/pdserving`". Please refer to the [tutorial](../../deploy/pdserving/readme.md) for usage.
# Service deployment based on PaddleHub Serving
The hubserving service deployment directory includes four service packages: detection, angle classification, recognition, and two-stage series connection. Please select the corresponding service package to install and start the service according to your needs. The directory is as follows:
└─ ocr_det detection module service package
└─ ocr_cls angle class module service package
└─ ocr_rec recognition module service package
└─ ocr_system two-stage series connection service package
Each service package contains 4 files. Taking the 2-stage series connection service package as an example, the directory is as follows:
└─ __init__.py Empty file, required
└─ config.json Configuration file, optional, passed in as a parameter when using configuration to start the service
└─ module.py Main module file, required, contains the complete logic of the service
└─ params.py Parameter file, required, including parameters such as model path, pre- and post-processing parameters
## Quick start service
The following steps take the 2-stage series service as an example. If only the detection service or recognition service is needed, replace the corresponding file path.
### 1. Prepare the environment
# Install paddlehub
pip3 install paddlehub --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple
### 2. Download inference model
Before installing the service module, you need to prepare the inference model and put it in the correct path. By default, the ultra lightweight model of v1.1 is used, and the default model path is:
detection model: ./inference/ch_ppocr_mobile_v1.1_det_infer/
recognition model: ./inference/ch_ppocr_mobile_v1.1_rec_infer/
text direction classifier: ./inference/ch_ppocr_mobile_v1.1_cls_infer/
**The model path can be found and modified in `params.py`.** More models provided by PaddleOCR can be obtained from the [model library](../../doc/doc_en/models_list_en.md). You can also use models trained by yourself.
### 3. Install Service Module
PaddleOCR provides 4 kinds of service modules; install the required modules according to your needs.
* On Linux platform, the examples are as follows.
# Install the detection service module:
hub install deploy/hubserving/ocr_det/
# Or, install the angle class service module:
hub install deploy/hubserving/ocr_cls/
# Or, install the recognition service module:
hub install deploy/hubserving/ocr_rec/
# Or, install the 2-stage series service module:
hub install deploy/hubserving/ocr_system/
* On Windows platform, the examples are as follows.
# Install the detection service module:
hub install deploy\hubserving\ocr_det\
# Or, install the angle class service module:
hub install deploy\hubserving\ocr_cls\
# Or, install the recognition service module:
hub install deploy\hubserving\ocr_rec\
# Or, install the 2-stage series service module:
hub install deploy\hubserving\ocr_system\
### 4. Start service
#### Way 1. Start with command line parameters (CPU only)
**start command:**
$ hub serving start --modules [Module1==Version1, Module2==Version2, ...] \
--port XXXX \
--use_multiprocess \
--workers \
|--modules/-m|PaddleHub Serving pre-installed model, listed in the form of multiple Module==Version key-value pairs<br>*`When Version is not specified, the latest version is selected by default`*|
|--port/-p|Service port, default is 8866|
|--use_multiprocess|Enable concurrent mode, the default is single-process mode, this mode is recommended for multi-core CPU machines<br>*`Windows operating system only supports single-process mode`*|
|--workers|The number of concurrent tasks specified in concurrent mode, the default is `2*cpu_count-1`, where `cpu_count` is the number of CPU cores|
For example, start the 2-stage series service:
hub serving start -m ocr_system
This completes the deployment of a service API, using the default port number 8866.
#### Way 2. Start with configuration file(CPU、GPU)
**start command:**
hub serving start --config/-c config.json
Wherein, the format of `config.json` is as follows:
"modules_info": {
"ocr_system": {
"init_args": {
"version": "1.0.0",
"use_gpu": true
"predict_args": {
"port": 8868,
"use_multiprocess": false,
"workers": 2
- The configurable parameters in `init_args` are consistent with the `_initialize` function interface in `module.py`. Among them, **when `use_gpu` is `true`, it means that the GPU is used to start the service**.
- The configurable parameters in `predict_args` are consistent with the `predict` function interface in `module.py`.
- When using the configuration file to start the service, other parameters will be ignored.
- If you use GPU prediction (that is, `use_gpu` is set to `true`), you need to set the environment variable CUDA_VISIBLE_DEVICES before starting the service, such as: ```export CUDA_VISIBLE_DEVICES=0```, otherwise you do not need to set it.
- **`use_gpu` and `use_multiprocess` cannot be `true` at the same time.**
For example, use GPU card No. 3 to start the 2-stage series service:
hub serving start -c deploy/hubserving/ocr_system/config.json
## Send prediction requests
After the service starts, you can use the following command to send a prediction request to obtain the prediction result:
python tools/test_hubserving.py server_url image_path
Two parameters need to be passed to the script:
- **server_url**:service address,format of which is
For example, if the detection, recognition and 2-stage serial services are started with provided configuration files, the respective `server_url` would be:
- **image_path**:Test image path, can be a single image path or an image directory path
python tools/test_hubserving.py ./doc/imgs/
## Returned result format
The returned result is a list. Each item in the list is a dict. The dict may contain three fields. The information is as follows:
|field name|data type|description|
|text|str|text content|
|confidence|float|text recognition confidence or text angle classification confidence|
|text_region|list|text location coordinates|
The fields returned by different modules are different. For example, the results returned by the text recognition service module do not contain `text_region`. The details are as follows:
| field name/module name | ocr_det | ocr_cls | ocr_rec | ocr_system |
| ---- | ---- | ---- | ---- | ---- |
|angle| | ✔ | | ✔ |
|text| | |✔|✔|
|confidence| |✔ |✔|✔|
|text_region| ✔| | |✔ |
**Note:** If you need to add, delete or modify the returned fields, you can modify the file `module.py` of the corresponding module. For the complete process, refer to the user-defined modification service module in the next section.
## User defined service module modification
If you need to modify the service logic, the following steps are generally required (take the modification of `ocr_system` for example):
- 1. Stop service
hub serving stop --port/-p XXXX
- 2. Modify the code in the corresponding files, like `module.py` and `params.py`, according to the actual needs.
For example, if you need to replace the model used by the deployed service, you need to modify model path parameters `det_model_dir` and `rec_model_dir` in `params.py`. If you want to turn off the text direction classifier, set the parameter `use_angle_cls` to `False`. Of course, other related parameters may need to be modified at the same time. Please modify and debug according to the actual situation. It is suggested to run `module.py` directly for debugging after modification before starting the service test.
- 3. Uninstall old service module
hub uninstall ocr_system
- 4. Install modified service module
hub install deploy/hubserving/ocr_system/
- 5. Restart service
hub serving start -m ocr_system
# 添加新算法
* 数据加载和处理
* 网络
* 后处理
* 损失函数
* 指标评估
* 优化器
## 数据加载和处理
数据加载和处理由不同的模块(module)组成,其完成了图片的读取、数据增强和label的制作。这一部分在[ppocr/data](../../ppocr/data)下。 各个文件及文件夹作用说明如下:
├── imaug # 图片的读取、数据增强和label制作相关的文件
│ ├── label_ops.py # 对label进行变换的modules
│ ├── operators.py # 对image进行变换的modules
│ ├──.....
├── __init__.py
├── lmdb_dataset.py # 读取lmdb的数据集的dataset
└── simple_dataset.py # 读取以`image_path\tgt`形式保存的数据集的dataset
1.[ppocr/data/imaug](../../ppocr/data/imaug) 文件夹下新建文件,如my_module.py。
2. 在 my_module.py 文件内添加相关代码,示例代码如下:
class MyModule:
def __init__(self, *args, **kwargs):
# your init code
def __call__(self, data):
img = data['image']
label = data['label']
# your process code
data['image'] = img
data['label'] = label
return data
3.[ppocr/data/imaug/\__init\__.py](../../ppocr/data/imaug/__init__.py) 文件内导入添加的模块。
# angle class data process
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- MyModule:
args1: args1
args2: args2
- KeepKeys:
keep_keys: [ 'image', 'label' ] # dataloader will return list in this order
## 网络
网络部分完成了网络的组网操作,PaddleOCR将网络划分为四部分,这一部分在[ppocr/modeling](../../ppocr/modeling)下。 进入网络的数据将按照顺序(transforms->backbones->necks->heads)依次通过这四个部分。
├── architectures # 网络的组网代码
├── transforms # 网络的图像变换模块
├── backbones # 网络的特征提取模块
├── necks # 网络的特征增强模块
└── heads # 网络的输出模块
1.[ppocr/modeling/backbones](../../ppocr/modeling/backbones) 文件夹下新建文件,如my_backbone.py。
2. 在 my_backbone.py 文件内添加相关代码,示例代码如下:
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
class MyBackbone(nn.Layer):
def __init__(self, *args, **kwargs):
super(MyBackbone, self).__init__()
# your init code
self.conv = nn.xxxx
def forward(self, inputs):
# your network forward
y = self.conv(inputs)
return y
model_type: rec
algorithm: CRNN
name: MyTransform
args1: args1
args2: args2
name: MyBackbone
args1: args1
name: MyNeck
args1: args1
name: MyHead
args1: args1
## 后处理
1.[ppocr/postprocess](../../ppocr/postprocess) 文件夹下新建文件,如 my_postprocess.py。
2. 在 my_postprocess.py 文件内添加相关代码,示例代码如下:
import paddle
class MyPostProcess:
def __init__(self, *args, **kwargs):
# your init code
def __call__(self, preds, label=None, *args, **kwargs):
if isinstance(preds, paddle.Tensor):
preds = preds.numpy()
# you preds decode code
preds = self.decode_preds(preds)
if label is None:
return preds
# you label decode code
label = self.decode_label(label)
return preds, label
def decode_preds(self, preds):
# you preds decode code
def decode_label(self, preds):
# you label decode code
name: MyPostProcess
args1: args1
args2: args2
## 损失函数
1.[ppocr/losses](../../ppocr/losses) 文件夹下新建文件,如 my_loss.py。
2. 在 my_loss.py 文件内添加相关代码,示例代码如下:
import paddle
from paddle import nn
class MyLoss(nn.Layer):
def __init__(self, **kwargs):
super(MyLoss, self).__init__()
# you init code
def __call__(self, predicts, batch):
label = batch[1]
# your loss code
loss = self.loss(input=predicts, label=label)
return {'loss': loss}
name: MyLoss
args1: args1
args2: args2
## 指标评估
指标评估用于计算网络在当前batch上的性能。这一部分在[ppocr/metrics](../../ppocr/metrics)下。 PaddleOCR内置了检测,分类和识别等算法相关的指标评估模块,对于没有内置的模块可通过如下步骤添加:
1.[ppocr/metrics](../../ppocr/metrics) 文件夹下新建文件,如my_metric.py。
2. 在 my_metric.py 文件内添加相关代码,示例代码如下:
class MyMetric(object):
def __init__(self, main_indicator='acc', **kwargs):
# main_indicator is used for select best model
self.main_indicator = main_indicator
def __call__(self, preds, batch, *args, **kwargs):
# preds is out of postprocess
# batch is out of dataloader
labels = batch[1]
cur_correct_num = 0
cur_all_num = 0
# you metric code
self.correct_num += cur_correct_num
self.all_num += cur_all_num
return {'acc': cur_correct_num / cur_all_num, }
def get_metric(self):
return metrics {
'acc': 0,
'norm_edit_dis': 0,
acc = self.correct_num / self.all_num
return {'acc': acc}
def reset(self):
# reset metric
self.correct_num = 0
self.all_num = 0
name: MyMetric
main_indicator: acc
## 优化器
优化器用于训练网络。优化器内部还包含了网络正则化和学习率衰减模块。 这一部分在[ppocr/optimizer](../../ppocr/optimizer)下。 PaddleOCR内置了`Momentum`,`Adam`和`RMSProp`等常用的优化器模块,`Linear`,`Cosine`,`Step`和`Piecewise`等常用的学习率衰减模块,以及`L1Decay`和`L2Decay`等常用的正则化模块。
1.[ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py) 文件内创建自己的优化器,示例代码如下:
from paddle import optimizer as optim
class MyOptim(object):
def __init__(self, learning_rate=0.001, *args, **kwargs):
self.learning_rate = learning_rate
def __call__(self, parameters):
# It is recommended to wrap the built-in optimizer of paddle
opt = optim.XXX(
return opt
name: MyOptim
args1: args1
args2: args2
name: Cosine
learning_rate: 0.001
name: 'L2'
factor: 0
\ No newline at end of file
...@@ -41,8 +41,8 @@ PaddleOCR基于动态图开源的文本识别算法列表: ...@@ -41,8 +41,8 @@ PaddleOCR基于动态图开源的文本识别算法列表:
- [x] CRNN([paper](https://arxiv.org/abs/1507.05717))(ppocr推荐) - [x] CRNN([paper](https://arxiv.org/abs/1507.05717))(ppocr推荐)
- [x] Rosetta([paper](https://arxiv.org/abs/1910.05085)) - [x] Rosetta([paper](https://arxiv.org/abs/1910.05085))
- [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html)) - [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html))
- [ ] RARE([paper](https://arxiv.org/abs/1603.03915v1)) - [ ] RARE([paper](https://arxiv.org/abs/1603.03915v1)) coming soon
- [ ] SRN([paper](https://arxiv.org/abs/2003.12294)) - [ ] SRN([paper](https://arxiv.org/abs/2003.12294)) coming soon
参考[DTRB](https://arxiv.org/abs/1904.01906)文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: 参考[DTRB](https://arxiv.org/abs/1904.01906)文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下:
...@@ -55,4 +55,5 @@ PaddleOCR基于动态图开源的文本识别算法列表: ...@@ -55,4 +55,5 @@ PaddleOCR基于动态图开源的文本识别算法列表:
|STAR-Net|Resnet34_vd|83.93%|rec_r34_vd_tps_bilstm_ctc|[下载链接](link)| |STAR-Net|Resnet34_vd|83.93%|rec_r34_vd_tps_bilstm_ctc|[下载链接](link)|
|STAR-Net|MobileNetV3|81.56%|rec_mv3_tps_bilstm_ctc|[下载链接](link)| |STAR-Net|MobileNetV3|81.56%|rec_mv3_tps_bilstm_ctc|[下载链接](link)|
PaddleOCR文本识别算法的训练和使用请参考文档教程中[模型训练/评估中的文本识别部分](./recognition.md) PaddleOCR文本识别算法的训练和使用请参考文档教程中[模型训练/评估中的文本识别部分](./recognition.md)
# 可选参数列表 ## 可选参数列表
以下列表可以通过`--help`查看 以下列表可以通过`--help`查看
...@@ -8,65 +8,115 @@ ...@@ -8,65 +8,115 @@
| -o | ALL | 设置配置文件里的参数内容 | None | 使用-o配置相较于-c选择的配置文件具有更高的优先级。例如:`-o Global.use_gpu=false` | | -o | ALL | 设置配置文件里的参数内容 | None | 使用-o配置相较于-c选择的配置文件具有更高的优先级。例如:`-o Global.use_gpu=false` |
## 配置文件 Global 参数介绍 ## 配置文件参数介绍
`rec_chinese_lite_train_v1.1.yml ` 为例 `rec_chinese_lite_train_v1.1.yml ` 为例
### Global
| 字段 | 用途 | 默认值 | 备注 | | 字段 | 用途 | 默认值 | 备注 |
| :----------------------: | :---------------------: | :--------------: | :--------------------: | | :----------------------: | :---------------------: | :--------------: | :--------------------: |
| algorithm | 设置算法 | 与配置文件同步 | 选择模型,支持模型请参考[简介](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/README.md) | | use_gpu | 设置代码是否在gpu运行 | true | \ |
| use_gpu | 设置代码运行场所 | true | \ | | epoch_num | 最大训练epoch数 | 500 | \ |
| epoch_num | 最大训练epoch数 | 3000 | \ |
| log_smooth_window | 滑动窗口大小 | 20 | \ | | log_smooth_window | 滑动窗口大小 | 20 | \ |
| print_batch_step | 设置打印log间隔 | 10 | \ | | print_batch_step | 设置打印log间隔 | 10 | \ |
| save_model_dir | 设置模型保存路径 | output/{算法名称} | \ | | save_model_dir | 设置模型保存路径 | output/{算法名称} | \ |
| save_epoch_step | 设置模型保存间隔 | 3 | \ | | save_epoch_step | 设置模型保存间隔 | 3 | \ |
| eval_batch_step | 设置模型评估间隔 | 2000 或 [1000, 2000] | 2000 表示每2000次迭代评估一次,[1000, 2000]表示从1000次迭代开始,每2000次评估一次 | | eval_batch_step | 设置模型评估间隔 | 2000 或 [1000, 2000] | 2000 表示每2000次迭代评估一次,[1000, 2000]表示从1000次迭代开始,每2000次评估一次 |
|train_batch_size_per_card | 设置训练时单卡batch size | 256 | \ | | cal_metric_during_train | 设置是否在训练过程中评估指标,此时评估的是模型在当前batch下的指标 | true | \ |
| test_batch_size_per_card | 设置评估时单卡batch size | 256 | \ | | load_static_weights | 设置预训练模型是否是静态图模式保存(目前仅检测算法需要) | true | \ |
| image_shape | 设置输入图片尺寸 | [3, 32, 100] | \ | | pretrained_model | 设置加载预训练模型路径 | ./pretrain_models/CRNN/best_accuracy | \ |
| checkpoints | 加载模型参数路径 | None | 用于中断后加载参数继续训练 |
| use_visualdl | 设置是否启用visualdl进行可视化log展示 | False | [教程地址](https://www.paddlepaddle.org.cn/paddle/visualdl) |
| infer_img | 设置预测图像路径或文件夹路径 | ./infer_img | \|
| character_dict_path | 设置字典路径 | ./ppocr/utils/ppocr_keys_v1.txt | \ |
| max_text_length | 设置文本最大长度 | 25 | \ | | max_text_length | 设置文本最大长度 | 25 | \ |
| character_type | 设置字符类型 | ch | en/ch, en时将使用默认dict,ch时使用自定义dict| | character_type | 设置字符类型 | ch | en/ch, en时将使用默认dict,ch时使用自定义dict|
| character_dict_path | 设置字典路径 | ./ppocr/utils/ic15_dict.txt | \ | | use_space_char | 设置是否识别空格 | True | 仅在 character_type=ch 时支持空格 |
| loss_type | 设置 loss 类型 | ctc | 支持两种loss: ctc / attention |
| distort | 设置是否使用数据增强 | false | 设置为true时,将在训练时随机进行扰动,支持的扰动操作可阅读[img_tools.py](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/ppocr/data/rec/img_tools.py) |
| use_space_char | 设置是否识别空格 | false | 仅在 character_type=ch 时支持空格 |
| label_list | 设置方向分类器支持的角度 | ['0','180'] | 仅在方向分类器中生效 | | label_list | 设置方向分类器支持的角度 | ['0','180'] | 仅在方向分类器中生效 |
| average_window | ModelAverage优化器中的窗口长度计算比例 | 0.15 | 目前仅应用与SRN | | save_res_path | 设置检测模型的结果保存地址 | ./output/det_db/predicts_db.txt | 仅在检测模型中生效 |
| max_average_window | 平均值计算窗口长度的最大值 | 15625 | 推荐设置为一轮训练中mini-batchs的数目|
| min_average_window | 平均值计算窗口长度的最小值 | 10000 | \ |
| reader_yml | 设置reader配置文件 | ./configs/rec/rec_icdar15_reader.yml | \ |
| pretrain_weights | 加载预训练模型路径 | ./pretrain_models/CRNN/best_accuracy | \ |
| checkpoints | 加载模型参数路径 | None | 用于中断后加载参数继续训练 |
| save_inference_dir | inference model 保存路径 | None | 用于保存inference model |
## 配置文件 Reader 系列参数介绍 ### Optimizer ([ppocr/optimizer](../../ppocr/optimizer))
`rec_chinese_reader.yml` 为例 | 字段 | 用途 | 默认值 | 备注 |
| :---------------------: | :---------------------: | :--------------: | :--------------------: |
| name | 优化器类名 | Adam | 目前支持`Momentum`,`Adam`,`RMSProp`, 见[ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py) |
| beta1 | 设置一阶矩估计的指数衰减率 | 0.9 | \ |
| beta2 | 设置二阶矩估计的指数衰减率 | 0.999 | \ |
| **lr** | 设置学习率decay方式 | - | \ |
| name | 学习率decay类名 | Cosine | 目前支持`Linear`,`Cosine`,`Step`,`Piecewise`, 见[ppocr/optimizer/learning_rate.py](../../ppocr/optimizer/learning_rate.py) |
| learning_rate | 基础学习率 | 0.001 | \ |
| **regularizer** | 设置网络正则化方式 | - | \ |
| name | 正则化类名 | L2 | 目前支持`L1`,`L2`, 见[ppocr/optimizer/regularizer.py](../../ppocr/optimizer/regularizer.py) |
| factor | 正则化系数 | 0.00004 | \ |
| 字段 | 用途 | 默认值 | 备注 |
| :----------------------: | :---------------------: | :--------------: | :--------------------: |
| reader_function | 选择数据读取方式 | ppocr.data.rec.dataset_traversal,SimpleReader | 支持SimpleReader / LMDBReader 两种数据读取方式 |
| num_workers | 设置数据读取线程数 | 8 | \ |
| img_set_dir | 数据集路径 | ./train_data | \ |
| label_file_path | 数据标签路径 | ./train_data/rec_gt_train.txt| \ |
| infer_img | 预测图像文件夹路径 | ./infer_img | \|
## 配置文件 Optimizer 系列参数介绍 ### Architecture ([ppocr/modeling](../../ppocr/modeling))
| 字段 | 用途 | 默认值 | 备注 |
| :---------------------: | :---------------------: | :--------------: | :--------------------: |
| model_type | 网络类型 | rec | 目前支持`rec`,`det`,`cls` |
| algorithm | 模型名称 | CRNN | 支持列表见[algorithm_overview](./algorithm_overview.md) |
| **Transform** | 设置变换方式 | - | 目前仅rec类型的算法支持, 具体见[ppocr/modeling/transforms](../../ppocr/modeling/transforms) |
| name | 变换方式类名 | TPS | 目前支持`TPS` |
| num_fiducial | TPS控制点数 | 20 | 上下边各十个 |
| loc_lr | 定位网络学习率 | 0.1 | \ |
| model_name | 定位网络大小 | small | 目前支持`small`,`large` |
| **Backbone** | 设置网络backbone类名 | - | 具体见[ppocr/modeling/backbones](../../ppocr/modeling/backbones) |
| name | backbone类名 | ResNet | 目前支持`MobileNetV3`,`ResNet` |
| layers | resnet层数 | 34 | 支持18,34,50,101,152,200 |
| model_name | MobileNetV3 网络大小 | small | 支持`small`,`large` |
| **Neck** | 设置网络neck | - | 具体见[ppocr/modeling/necks](../../ppocr/modeling/necks) |
| name | neck类名 | SequenceEncoder | 目前支持`SequenceEncoder`,`DBFPN` |
| encoder_type | SequenceEncoder编码器类型 | rnn | 支持`reshape`,`fc`,`rnn` |
| hidden_size | rnn内部单元数 | 48 | \ |
| out_channels | DBFPN输出通道数 | 256 | \ |
| **Head** | 设置网络Head | - | 具体见[ppocr/modeling/heads](../../ppocr/modeling/heads) |
| name | head类名 | CTCHead | 目前支持`CTCHead`,`DBHead`,`ClsHead` |
| fc_decay | CTCHead正则化系数 | 0.0004 | \ |
| k | DBHead二值化系数 | 50 | \ |
| class_dim | ClsHead输出分类数 | 2 | \ |
`rec_icdar15_train.yml` 为例 ### Loss ([ppocr/losses](../../ppocr/losses))
| 字段 | 用途 | 默认值 | 备注 |
| :---------------------: | :---------------------: | :--------------: | :--------------------: |
| name | 网络loss类名 | CTCLoss | 目前支持`CTCLoss`,`DBLoss`,`ClsLoss` |
| balance_loss | DBLoss中是否对正负样本数量进行均衡(使用OHEM) | True | \ |
| ohem_ratio | DBLoss中的OHEM的负正样本比例 | 3 | \ |
| main_loss_type | DBLoss中shrink_map所采用的loss | DiceLoss | 支持`DiceLoss`,`BCELoss` |
| alpha | DBLoss中shrink_map_loss的系数 | 5 | \ |
| beta | DBLoss中threshold_map_loss的系数 | 10 | \ |
### PostProcess ([ppocr/postprocess](../../ppocr/postprocess))
| 字段 | 用途 | 默认值 | 备注 |
| :---------------------: | :---------------------: | :--------------: | :--------------------: |
| name | 后处理类名 | CTCLabelDecode | 目前支持`CTCLabelDecode`,`AttnLabelDecode`,`DBPostProcess`,`ClsPostProcess` |
| thresh | DBPostProcess中分割图进行二值化的阈值 | 0.3 | \ |
| box_thresh | DBPostProcess中对输出框进行过滤的阈值,低于此阈值的框不会输出 | 0.7 | \ |
| max_candidates | DBPostProcess中输出的最大文本框数量 | 1000 | |
| unclip_ratio | DBPostProcess中对文本框进行放大的比例 | 2.0 | \ |
### Metric ([ppocr/metrics](../../ppocr/metrics))
| 字段 | 用途 | 默认值 | 备注 |
| :---------------------: | :---------------------: | :--------------: | :--------------------: |
| name | 指标评估方法名称 | RecMetric | 目前支持`DetMetric`,`RecMetric`,`ClsMetric` |
| main_indicator | 主要指标,用于选取最优模型 | acc | 对于检测方法为hmean,识别和分类方法为acc |
### Dataset ([ppocr/data](../../ppocr/data))
| 字段 | 用途 | 默认值 | 备注 | | 字段 | 用途 | 默认值 | 备注 |
| :---------------------: | :---------------------: | :--------------: | :--------------------: | | :---------------------: | :---------------------: | :--------------: | :--------------------: |
| function | 选择优化器 | pocr.optimizer,AdamDecay | 目前只支持Adam方式 | | **dataset** | 每次迭代返回一个样本 | - | - |
| base_lr | 设置初始学习率 | 0.0005 | \ | | name | dataset类名 | SimpleDataSet | 目前支持`SimpleDataSet``LMDBDateSet` |
| beta1 | 设置一阶矩估计的指数衰减率 | 0.9 | \ | | data_dir | 数据集图片存放路径 | ./train_data | \ |
| beta2 | 设置二阶矩估计的指数衰减率 | 0.999 | \ | | label_file_list | 数据标签路径 | ["./train_data/train_list.txt"] | dataset为LMDBDateSet时不需要此参数 |
| decay | 是否使用decay | \ | \ | | ratio_list | 数据集的比例 | [1.0] | 若label_file_list中有两个train_list,且ratio_list为[0.4,0.6],则从train_list1中采样40%,从train_list2中采样60%组合整个dataset |
| function(decay) | 设置decay方式 | - | 目前支持cosine_decay, cosine_decay_warmup与piecewise_decay | | transforms | 对图片和标签进行变换的方法列表 | [DecodeImage,CTCLabelEncode,RecResizeImg,KeepKeys] | 见[ppocr/data/imaug](../../ppocr/data/imaug) |
| step_each_epoch | 每个epoch包含多少次迭代, cosine_decay/cosine_decay_warmup时有效 | 20 | 计算方式:total_image_num / (batch_size_per_card * card_size) | | **loader** | dataloader相关 | - | |
| total_epoch | 总共迭代多少个epoch, cosine_decay/cosine_decay_warmup时有效 | 1000 | 与Global.epoch_num 一致 | | shuffle | 每个epoch是否将数据集顺序打乱 | True | \ |
| warmup_minibatch | 线性warmup的迭代次数, cosine_decay_warmup时有效 | 1000 | \ | | batch_size_per_card | 训练时单卡batch size | 256 | \ |
| boundaries | 学习率下降时的迭代次数间隔, piecewise_decay时有效 | - | 参数为列表形式 | | drop_last | 是否丢弃因数据集样本数不能被 batch_size 整除而产生的最后一个不完整的mini-batch | True | \ |
| decay_rate | 学习率衰减系数, piecewise_decay时有效 | - | \ | | num_workers | 用于加载数据的子进程个数,若为0即为不开启子进程,在主进程中进行数据加载 | 8 | \ |
\ No newline at end of file
...@@ -142,9 +142,8 @@ word_dict.txt 每行有一个单字,将字符与数字索引映射在一起, ...@@ -142,9 +142,8 @@ word_dict.txt 每行有一个单字,将字符与数字索引映射在一起,
<a name="支持空格"></a> <a name="支持空格"></a>
- 添加空格类别 - 添加空格类别
如果希望支持识别"空格"类别, 请将yml文件中的 `use_space_char` 字段设置为 `true` 如果希望支持识别"空格"类别, 请将yml文件中的 `use_space_char` 字段设置为 `True`
**注意:`use_space_char` 仅在 `character_type=ch` 时生效**
<a name="启动训练"></a> <a name="启动训练"></a>
### 启动训练 ### 启动训练
...@@ -167,10 +166,9 @@ tar -xf rec_mv3_none_bilstm_ctc.tar && rm -rf rec_mv3_none_bilstm_ctc.tar ...@@ -167,10 +166,9 @@ tar -xf rec_mv3_none_bilstm_ctc.tar && rm -rf rec_mv3_none_bilstm_ctc.tar
*如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false* *如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false*
``` ```
# GPU训练 支持单卡,多卡训练,通过CUDA_VISIBLE_DEVICES指定卡号 # GPU训练 支持单卡,多卡训练,通过--gpus参数指定卡号
# 训练icdar15英文数据 并将训练日志保存为 train_rec.log # 训练icdar15英文数据 并将训练日志保存为 train_rec.log
python3 tools/train.py -c configs/rec/rec_icdar15_train.yml 2>&1 | tee train_rec.log python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_icdar15_train.yml
``` ```
<a name="数据增强"></a> <a name="数据增强"></a>
- 数据增强 - 数据增强
...@@ -195,8 +193,8 @@ PaddleOCR支持训练和评估交替进行, 可以在 `configs/rec/rec_icdar15_t ...@@ -195,8 +193,8 @@ PaddleOCR支持训练和评估交替进行, 可以在 `configs/rec/rec_icdar15_t
| 配置文件 | 算法名称 | backbone | trans | seq | pred | | 配置文件 | 算法名称 | backbone | trans | seq | pred |
| :--------: | :-------: | :-------: | :-------: | :-----: | :-----: | | :--------: | :-------: | :-------: | :-------: | :-----: | :-----: |
| [rec_chinese_lite_train_v1.1.yml](../../configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml) | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc |
| [rec_chinese_common_train_v1.1.yml](../../configs/rec/ch_ppocr_v1.1/rec_chinese_common_train_v1.1.yml) | CRNN | ResNet34_vd | None | BiLSTM | ctc | | [rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml) | CRNN | ResNet34_vd | None | BiLSTM | ctc |
| rec_chinese_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | | rec_chinese_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc |
| rec_chinese_common_train.yml | CRNN | ResNet34_vd | None | BiLSTM | ctc | | rec_chinese_common_train.yml | CRNN | ResNet34_vd | None | BiLSTM | ctc |
| rec_icdar15_train.yml | CRNN | Mobilenet_v3 large 0.5 | None | BiLSTM | ctc | | rec_icdar15_train.yml | CRNN | Mobilenet_v3 large 0.5 | None | BiLSTM | ctc |
...@@ -210,39 +208,69 @@ PaddleOCR支持训练和评估交替进行, 可以在 `configs/rec/rec_icdar15_t ...@@ -210,39 +208,69 @@ PaddleOCR支持训练和评估交替进行, 可以在 `configs/rec/rec_icdar15_t
| rec_r34_vd_tps_bilstm_ctc.yml | STARNet | Resnet34_vd | tps | BiLSTM | ctc | | rec_r34_vd_tps_bilstm_ctc.yml | STARNet | Resnet34_vd | tps | BiLSTM | ctc |
| rec_r50fpn_vd_none_srn.yml | SRN | Resnet50_fpn_vd | None | rnn | srn | | rec_r50fpn_vd_none_srn.yml | SRN | Resnet50_fpn_vd | None | rnn | srn |
训练中文数据,推荐使用[rec_chinese_lite_train_v1.1.yml](../../configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml),如您希望尝试其他算法在中文数据集上的效果,请参考下列说明修改配置文件: 训练中文数据,推荐使用[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml),如您希望尝试其他算法在中文数据集上的效果,请参考下列说明修改配置文件:
`rec_mv3_none_none_ctc.yml` 为例: `rec_chinese_lite_train_v2.0.yml` 为例:
``` ```
Global: Global:
... ...
# 修改 image_shape 以适应长文本 # 添加自定义字典,如修改字典请将路径指向新字典
image_shape: [3, 32, 320] character_dict_path: ppocr/utils/ppocr_keys_v1.txt
# 修改字符类型 # 修改字符类型
character_type: ch character_type: ch
# 添加自定义字典,如修改字典请将路径指向新字典
character_dict_path: ./ppocr/utils/ppocr_keys_v1.txt
# 训练时添加数据增强
distort: true
# 识别空格
use_space_char: true
# 修改reader类型
reader_yml: ./configs/rec/rec_chinese_reader.yml
... ...
# 识别空格
use_space_char: True
Optimizer: Optimizer:
... ...
# 添加学习率衰减策略 # 添加学习率衰减策略
decay: lr:
function: cosine_decay name: Cosine
# 每个 epoch 包含 iter 数 learning_rate: 0.001
step_each_epoch: 20 ...
# 总共训练epoch数
total_epoch: 1000 ...
# 数据集格式,支持LMDBDateSet以及SimpleDataSet
name: SimpleDataSet
# 数据集路径
data_dir: ./train_data/
# 训练集标签文件
label_file_list: ["./train_data/train_list.txt"]
- RecResizeImg:
# 修改 image_shape 以适应长文本
image_shape: [3, 32, 320]
# 单卡训练的batch_size
batch_size_per_card: 256
# 数据集格式,支持LMDBDateSet以及SimpleDataSet
name: SimpleDataSet
# 数据集路径
data_dir: ./train_data
# 验证集标签文件
label_file_list: ["./train_data/val_list.txt"]
- RecResizeImg:
# 修改 image_shape 以适应长文本
image_shape: [3, 32, 320]
# 单卡验证的batch_size
batch_size_per_card: 256
``` ```
**注意,预测/评估时的配置文件请务必与训练一致。** **注意,预测/评估时的配置文件请务必与训练一致。**
...@@ -270,39 +298,41 @@ Global: ...@@ -270,39 +298,41 @@ Global:
... ...
# 添加自定义字典,如修改字典请将路径指向新字典 # 添加自定义字典,如修改字典请将路径指向新字典
character_dict_path: ./ppocr/utils/dict/french_dict.txt character_dict_path: ./ppocr/utils/dict/french_dict.txt
# 训练时添加数据增强
distort: true
# 识别空格
use_space_char: true
# 修改reader类型
reader_yml: ./configs/rec/multi_languages/rec_french_reader.yml
同时需要修改数据读取文件 `rec_french_reader.yml`
... ...
# 修改训练数据存放的目录名 # 识别空格
img_set_dir: ./train_data use_space_char: True
# 修改 label 文件名称
label_file_path: ./train_data/french_train.txt
... ...
# 数据集格式,支持LMDBDateSet以及SimpleDataSet
name: SimpleDataSet
# 数据集路径
data_dir: ./train_data/
# 训练集标签文件
label_file_list: ["./train_data/french_train.txt"]
# 数据集格式,支持LMDBDateSet以及SimpleDataSet
name: SimpleDataSet
# 数据集路径
data_dir: ./train_data
# 验证集标签文件
label_file_list: ["./train_data/french_val.txt"]
``` ```
<a name="评估"></a> <a name="评估"></a>
### 评估 ### 评估
评估数据集可以通过 `configs/rec/rec_icdar15_reader.yml` 修改EvalReader中的 `label_file_path` 设置。 评估数据集可以通过 `configs/rec/rec_icdar15_train.yml` 修改Eval中的 `label_file_list` 设置。
*注意* 评估时必须确保配置文件中 infer_img 字段为空 *注意* 评估时必须确保配置文件中 infer_img 字段为空
``` ```
# GPU 评估, Global.checkpoints 为待测权重 # GPU 评估, Global.checkpoints 为待测权重
python3 tools/eval.py -c configs/rec/rec_icdar15_train.yml -o Global.checkpoints={path/to/weights}/best_accuracy python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_icdar15_train.yml -o Global.checkpoints={path/to/weights}/best_accuracy
``` ```
<a name="预测"></a> <a name="预测"></a>
...@@ -332,12 +362,12 @@ infer_img: doc/imgs_words/en/word_1.png ...@@ -332,12 +362,12 @@ infer_img: doc/imgs_words/en/word_1.png
word : joint word : joint
``` ```
预测使用的配置文件必须与训练一致,如您通过 `python3 tools/train.py -c configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml` 完成了中文模型的训练, 预测使用的配置文件必须与训练一致,如您通过 `python3 tools/train.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml` 完成了中文模型的训练,
您可以使用如下命令进行中文模型预测。 您可以使用如下命令进行中文模型预测。
``` ```
# 预测中文结果 # 预测中文结果
python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml -o Global.checkpoints={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/ch/word_1.jpg python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.checkpoints={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/ch/word_1.jpg
``` ```
预测图片: 预测图片:
...@@ -261,6 +261,61 @@ im_show.save('result.jpg') ...@@ -261,6 +261,61 @@ im_show.save('result.jpg')
paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_dir} --rec_model_dir {your_rec_model_dir} --rec_char_dict_path {your_rec_char_dict_path} --cls_model_dir {your_cls_model_dir} --use_angle_cls true --cls true paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_dir} --rec_model_dir {your_rec_model_dir} --rec_char_dict_path {your_rec_char_dict_path} --cls_model_dir {your_cls_model_dir} --use_angle_cls true --cls true
``` ```
### 使用网络图片或者numpy数组作为输入
1. 网络图片
from paddleocr import PaddleOCR, draw_ocr
# Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换
# 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。
ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory
img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg'
result = ocr.ocr(img_path, cls=True)
for line in result:
# 显示结果
from PIL import Image
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
scores = [line[1][1] for line in result]
im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/simfang.ttf')
im_show = Image.fromarray(im_show)
paddleocr --image_dir http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg --use_angle_cls=true
2. numpy数组
import cv2
from paddleocr import PaddleOCR, draw_ocr
# Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换
# 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。
ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs/11.jpg'
img = cv2.imread(img_path)
# img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY), 如果你自己训练的模型支持灰度图,可以将这句话的注释取消
result = ocr.ocr(img, cls=True)
for line in result:
# 显示结果
from PIL import Image
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
scores = [line[1][1] for line in result]
im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/simfang.ttf')
im_show = Image.fromarray(im_show)
## 参数说明 ## 参数说明
| 字段 | 说明 | 默认值 | | 字段 | 说明 | 默认值 |
...@@ -285,6 +340,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_ ...@@ -285,6 +340,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_
| max_text_length | 识别算法能识别的最大文字长度 | 25 | | max_text_length | 识别算法能识别的最大文字长度 | 25 |
| rec_char_dict_path | 识别模型字典路径,当rec_model_dir使用方式2传参时需要修改为自己的字典路径 | ./ppocr/utils/ppocr_keys_v1.txt | | rec_char_dict_path | 识别模型字典路径,当rec_model_dir使用方式2传参时需要修改为自己的字典路径 | ./ppocr/utils/ppocr_keys_v1.txt |
| use_space_char | 是否识别空格 | TRUE | | use_space_char | 是否识别空格 | TRUE |
| drop_score | 对输出按照分数(来自于识别模型)进行过滤,低于此分数的不返回 | 0.5 |
| use_angle_cls | 是否加载分类模型 | FALSE | | use_angle_cls | 是否加载分类模型 | FALSE |
| cls_model_dir | 分类模型所在文件夹。传参方式有两种,1. None: 自动下载内置模型到 `~/.paddleocr/cls`;2.自己转换好的inference模型路径,模型路径下必须包含model和params文件 | None | | cls_model_dir | 分类模型所在文件夹。传参方式有两种,1. None: 自动下载内置模型到 `~/.paddleocr/cls`;2.自己转换好的inference模型路径,模型路径下必须包含model和params文件 | None |
| cls_image_shape | 分类算法的输入图片尺寸 | "3, 48, 192" | | cls_image_shape | 分类算法的输入图片尺寸 | "3, 48, 192" |
...@@ -295,4 +351,4 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_ ...@@ -295,4 +351,4 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_
| lang | 模型语言类型,目前支持 中文(ch)和英文(en) | ch | | lang | 模型语言类型,目前支持 中文(ch)和英文(en) | ch |
| det | 前向时是否启动检测 | TRUE | | det | 前向时是否启动检测 | TRUE |
| rec | 前向时是否启动识别 | TRUE | | rec | 前向时是否启动识别 | TRUE |
| cls | 前向时是否启动分类 | FALSE | | cls | 前向时是否启动分类 (命令行模式下使用use_angle_cls控制前向是否启动分类) | FALSE |
# Add new algorithm
PaddleOCR decomposes an algorithm into the following parts, and modularizes each part to make it more convenient to develop new algorithms.
* Data loading and processing
* Network
* Post-processing
* Loss
* Metric
* Optimizer
The following will introduce each part separately, and introduce how to add the modules required for the new algorithm.
## Data loading and processing
Data loading and processing are composed of different modules, which complete the image reading, data augment and label production. This part is under [ppocr/data](../../ppocr/data). The explanation of each file and folder are as follows:
├── imaug # Scripts for image reading, data augment and label production
│ ├── label_ops.py # Modules that transform the label
│ ├── operators.py # Modules that transform the image
│ ├──.....
├── __init__.py
├── lmdb_dataset.py # The dataset that reads the lmdb
└── simple_dataset.py # Read the dataset saved in the form of `image_path\tgt`
PaddleOCR has a large number of built-in image operation related modules. For modules that are not built-in, you can add them through the following steps:
1. Create a new file under the [ppocr/data/imaug](../../ppocr/data/imaug) folder, such as my_module.py.
2. Add code in the my_module.py file, the sample code is as follows:
class MyModule:
def __init__(self, *args, **kwargs):
# your init code
def __call__(self, data):
img = data['image']
label = data['label']
# your process code
data['image'] = img
data['label'] = label
return data
3. Import the added module in the [ppocr/data/imaug/\__init\__.py](../../ppocr/data/imaug/__init__.py) file.
All different modules of data processing are executed by sequence, combined and executed in the form of a list in the config file. Such as:
# angle class data process
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- MyModule:
args1: args1
args2: args2
- KeepKeys:
keep_keys: [ 'image', 'label' ] # dataloader will return list in this order
## Network
The network part completes the construction of the network, and PaddleOCR divides the network into four parts, which are under [ppocr/modeling](../../ppocr/modeling). The data entering the network will pass through these four parts in sequence (transforms->backbones->necks->heads).
├── architectures # Code for building network
├── transforms # Image Transformation Module
├── backbones # Feature extraction module
├── necks # Feature enhancement module
└── heads # Output module
PaddleOCR has built-in commonly used modules related to algorithms such as DB, EAST, SAST, CRNN and Attention. For modules that are not built in, you can add them through the following steps. The four parts are added in the same way; take backbones as an example:
1. Create a new file under the [ppocr/modeling/backbones](../../ppocr/modeling/backbones) folder, such as my_backbone.py.
2. Add code in the my_backbone.py file, the sample code is as follows:
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
class MyBackbone(nn.Layer):
def __init__(self, *args, **kwargs):
super(MyBackbone, self).__init__()
# your init code
self.conv = nn.xxxx
def forward(self, inputs):
# your network forward
y = self.conv(inputs)
return y
3. Import the added module in the [ppocr/modeling/backbones/\__init\__.py](../../ppocr/modeling/backbones/__init__.py) file.
After adding the four-part modules of the network, you only need to configure them in the configuration file to use, such as:
model_type: rec
algorithm: CRNN
name: MyTransform
args1: args1
args2: args2
name: MyBackbone
args1: args1
name: MyNeck
args1: args1
name: MyHead
args1: args1
## Post-processing
Post-processing realizes decoding network output to obtain text box or recognized text. This part is under [ppocr/postprocess](../../ppocr/postprocess).
PaddleOCR has built-in post-processing modules related to algorithms such as DB, EAST, SAST, CRNN and Attention. For components that are not built-in, they can be added through the following steps:
1. Create a new file under the [ppocr/postprocess](../../ppocr/postprocess) folder, such as my_postprocess.py.
2. Add code in the my_postprocess.py file, the sample code is as follows:
import paddle
class MyPostProcess:
def __init__(self, *args, **kwargs):
# your init code
def __call__(self, preds, label=None, *args, **kwargs):
if isinstance(preds, paddle.Tensor):
preds = preds.numpy()
# you preds decode code
preds = self.decode_preds(preds)
if label is None:
return preds
# you label decode code
label = self.decode_label(label)
return preds, label
def decode_preds(self, preds):
# you preds decode code
def decode_label(self, preds):
# you label decode code
3. Import the added module in the [ppocr/postprocess/\__init\__.py](../../ppocr/postprocess/__init__.py) file.
After the post-processing module is added, you only need to configure it in the configuration file to use, such as:
name: MyPostProcess
args1: args1
args2: args2
## Loss
The loss function is used to calculate the distance between the network output and the label. This part is under [ppocr/losses](../../ppocr/losses).
PaddleOCR has built-in loss function modules related to algorithms such as DB, EAST, SAST, CRNN and Attention. For modules that are not built in, you can add them through the following steps:
1. Create a new file in the [ppocr/losses](../../ppocr/losses) folder, such as my_loss.py.
2. Add code in the my_loss.py file, the sample code is as follows:
import paddle
from paddle import nn
class MyLoss(nn.Layer):
def __init__(self, **kwargs):
super(MyLoss, self).__init__()
# you init code
def __call__(self, predicts, batch):
label = batch[1]
# your loss code
loss = self.loss(input=predicts, label=label)
return {'loss': loss}
3. Import the added module in the [ppocr/losses/\__init\__.py](../../ppocr/losses/__init__.py) file.
After the loss function module is added, you only need to configure it in the configuration file to use it, such as:
name: MyLoss
args1: args1
args2: args2
## Metric
Metric is used to calculate the performance of the network on the current batch. This part is under [ppocr/metrics](../../ppocr/metrics). PaddleOCR has built-in evaluation modules related to algorithms such as detection, classification and recognition. For modules that are not built in, you can add them through the following steps:
1. Create a new file under the [ppocr/metrics](../../ppocr/metrics) folder, such as my_metric.py.
2. Add code in the my_metric.py file, the sample code is as follows:
class MyMetric(object):
    """Template for a custom metric that accumulates accuracy over batches.

    `__call__` scores a single batch and adds its counts to the running
    totals; `get_metric` reports the accuracy over everything accumulated
    since the last `reset`.
    """

    def __init__(self, main_indicator='acc', **kwargs):
        # main_indicator is used to select the best model
        self.main_indicator = main_indicator
        # Create the running counters up front; without this the first
        # `+=` in __call__ raises AttributeError.
        self.reset()

    def __call__(self, preds, batch, *args, **kwargs):
        """Score one batch and return its metrics as a dict.

        preds is the output of the post-process, batch is the output of
        the dataloader.
        """
        labels = batch[1]
        # Placeholder counts: replace them in "your metric code" below,
        # otherwise the division at the end is 0 / 0.
        cur_correct_num = 0
        cur_all_num = 0
        # your metric code
        self.correct_num += cur_correct_num
        self.all_num += cur_all_num
        return {'acc': cur_correct_num / cur_all_num, }

    def get_metric(self):
        """Return the accumulated metrics.

        A metric may report several values, e.g.
            {
                'acc': 0,
                'norm_edit_dis': 0,
            }
        This template only reports 'acc'.
        """
        acc = self.correct_num / self.all_num
        return {'acc': acc}

    def reset(self):
        """Reset the running counters to zero."""
        self.correct_num = 0
        self.all_num = 0
3. Import the added module in the [ppocr/metrics/\__init\__.py](../../ppocr/metrics/__init__.py) file.
After the metric module is added, you only need to configure it in the configuration file to use it, such as:
name: MyMetric
main_indicator: acc
## Optimizer
The optimizer is used to train the network. The optimizer also contains network regularization and learning rate decay modules. This part is under [ppocr/optimizer](../../ppocr/optimizer). PaddleOCR has built-in
commonly used optimizer modules such as `Momentum`, `Adam` and `RMSProp`, common learning rate decay modules such as `Linear`, `Cosine`, `Step` and `Piecewise`, and common regularization modules such as `L1Decay` and `L2Decay`.
Modules without built-in can be added through the following steps, take `optimizer` as an example:
1. Create your own optimizer in the [ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py) file, the sample code is as follows:
from paddle import optimizer as optim
class MyOptim(object):
    """Template for a custom optimizer builder.

    Stores the learning rate at construction time and, when called with the
    network parameters, builds and returns the wrapped optimizer.
    """

    def __init__(self, learning_rate=0.001, *args, **kwargs):
        self.learning_rate = learning_rate

    def __call__(self, parameters):
        # It is recommended to wrap the built-in optimizer of paddle.
        # Replace XXX with the optimizer class you want to wrap.
        opt = optim.XXX(
            learning_rate=self.learning_rate,
            parameters=parameters)
        return opt
After the optimizer module is added, you only need to configure it in the configuration file to use, such as:
name: MyOptim
args1: args1
args2: args2
name: Cosine
learning_rate: 0.001
name: 'L2'
factor: 0
\ No newline at end of file
...@@ -42,8 +42,8 @@ PaddleOCR open-source text recognition algorithms list: ...@@ -42,8 +42,8 @@ PaddleOCR open-source text recognition algorithms list:
- [x] CRNN([paper](https://arxiv.org/abs/1507.05717)) - [x] CRNN([paper](https://arxiv.org/abs/1507.05717))
- [x] Rosetta([paper](https://arxiv.org/abs/1910.05085)) - [x] Rosetta([paper](https://arxiv.org/abs/1910.05085))
- [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html)) - [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html))
- [ ] RARE([paper](https://arxiv.org/abs/1603.03915v1)) - [ ] RARE([paper](https://arxiv.org/abs/1603.03915v1)) coming soon
- [ ] SRN([paper](https://arxiv.org/abs/2003.12294))(Baidu Self-Research) - [ ] SRN([paper](https://arxiv.org/abs/2003.12294))(Baidu Self-Research) coming soon
Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow: Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow:
...@@ -56,4 +56,5 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r ...@@ -56,4 +56,5 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r
|STAR-Net|Resnet34_vd|83.93%|rec_r34_vd_tps_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_r34_vd_tps_bilstm_ctc.tar)| |STAR-Net|Resnet34_vd|83.93%|rec_r34_vd_tps_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_r34_vd_tps_bilstm_ctc.tar)|
|STAR-Net|MobileNetV3|81.56%|rec_mv3_tps_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_mv3_tps_bilstm_ctc.tar)| |STAR-Net|MobileNetV3|81.56%|rec_mv3_tps_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/rec_mv3_tps_bilstm_ctc.tar)|
Please refer to the document for training guide and use of PaddleOCR text recognition algorithms [Text recognition model training/evaluation/prediction](./doc/doc_en/recognition_en.md) Please refer to the document for training guide and use of PaddleOCR text recognition algorithms [Text recognition model training/evaluation/prediction](./doc/doc_en/recognition_en.md)
# OPTIONAL PARAMETERS LIST ## Optional parameter list
The following list can be viewed via `--help` The following list can be viewed through `--help`
| FLAG | Supported script | Use | Defaults | Note | | FLAG | Supported script | Use | Defaults | Note |
| :----------------------: | :------------: | :---------------: | :--------------: | :-----------------: | | :----------------------: | :------------: | :---------------: | :--------------: | :-----------------: |
| -c | ALL | Specify configuration file to use | None | **Please refer to the parameter introduction for configuration file usage** | | -c | ALL | Specify configuration file to use | None | **Please refer to the parameter introduction for configuration file usage** |
| -o | ALL | set configuration options | None | Configuration using -o has higher priority than the configuration file selected with -c. E.g: `-o Global.use_gpu=false` | | -o | ALL | set configuration options | None | Configuration using -o has higher priority than the configuration file selected with -c. E.g: -o Global.use_gpu=false |
Take `rec_chinese_lite_train_v1.1.yml` as an example Take rec_chinese_lite_train_v1.1.yml as an example
### Global
| Parameter | Use | Default | Note | | Parameter | Use | Defaults | Note |
| :----------------------: | :---------------------: | :--------------: | :--------------------: | | :----------------------: | :---------------------: | :--------------: | :--------------------: |
| algorithm | Select algorithm to use | Synchronize with configuration file | For selecting model, please refer to the supported model [list](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/README_en.md) | | use_gpu | Set using GPU or not | true | \ |
| use_gpu | Set using GPU or not | true | \ | | epoch_num | Maximum training epoch number | 500 | \ |
| epoch_num | Maximum training epoch number | 3000 | \ |
| log_smooth_window | Sliding window size | 20 | \ | | log_smooth_window | Sliding window size | 20 | \ |
| print_batch_step | Set print log interval | 10 | \ | | print_batch_step | Set print log interval | 10 | \ |
| save_model_dir | Set model save path | output/{model_name} | \ | | save_model_dir | Set model save path | output/{algorithm_name} | \ |
| save_epoch_step | Set model save interval | 3 | \ | | save_epoch_step | Set model save interval | 3 | \ |
| eval_batch_step | Set the model evaluation interval |2000 or [1000, 2000] |running evaluation every 2000 iters or evaluation is run every 2000 iterations after the 1000th iteration | | eval_batch_step | Set the model evaluation interval | 2000 or [1000, 2000] | running evaluation every 2000 iters or evaluation is run every 2000 iterations after the 1000th iteration |
|train_batch_size_per_card | Set the batch size during training | 256 | \ | | cal_metric_during_train | Set whether to evaluate the metric during the training process. At this time, the metric of the model under the current batch is evaluated | true | \ |
| test_batch_size_per_card | Set the batch size during testing | 256 | \ | | load_static_weights | Set whether the pre-training model is saved in static graph mode (currently only required by the detection algorithm) | true | \ |
| image_shape | Set input image size | [3, 32, 100] | \ | | pretrained_model | Set the path of the pre-trained model | ./pretrain_models/CRNN/best_accuracy | \ |
| max_text_length | Set the maximum text length | 25 | \ | | checkpoints | set model parameter path | None | Used to load parameters after interruption to continue training|
| character_type | Set character type | ch | en/ch, the default dict will be used for en, and the custom dict will be used for ch| | use_visualdl | Set whether to enable visualdl for visual log display | False | [Tutorial](https://www.paddlepaddle.org.cn/paddle/visualdl) |
| character_dict_path | Set dictionary path | ./ppocr/utils/ic15_dict.txt | \ | | infer_img | Set inference image path or folder path | ./infer_img | \|
| loss_type | Set loss type | ctc | Supports two types of loss: ctc / attention | | character_dict_path | Set dictionary path | ./ppocr/utils/ppocr_keys_v1.txt | \ |
| distort | Set use distort | false | Support distort type ,read [img_tools.py](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/ppocr/data/rec/img_tools.py) | | max_text_length | Set the maximum length of text | 25 | \ |
| use_space_char | Whether to recognize space | false | Only support in character_type=ch mode | | character_type | Set character type | ch | en/ch, the default dict will be used for en, and the custom dict will be used for ch |
label_list | Set the angle supported by the direction classifier | ['0','180'] | Only valid in the direction classifier | | use_space_char | Set whether to recognize spaces | True | Only support in character_type=ch mode |
| reader_yml | Set the reader configuration file | ./configs/rec/rec_icdar15_reader.yml | \ | | label_list | Set the angle supported by the direction classifier | ['0','180'] | Only valid in angle classifier model |
| pretrain_weights | Load pre-trained model path | ./pretrain_models/CRNN/best_accuracy | \ | | save_res_path | Set the save address of the test model results | ./output/det_db/predicts_db.txt | Only valid in the text detection model |
| checkpoints | Load saved model path | None | Used to load saved parameters to continue training after interruption |
| save_inference_dir | path to save model for inference | None | Use to save inference model | ### Optimizer ([ppocr/optimizer](../../ppocr/optimizer))
| :---------------------: | :---------------------: | :--------------: | :--------------------: |
Take `rec_chinese_reader.yml` as an example: | name | Optimizer class name | Adam | Currently supports`Momentum`,`Adam`,`RMSProp`, see [ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py) |
| beta1 | Set the exponential decay rate for the 1st moment estimates | 0.9 | \ |
| Parameter | Use | Default | Note | | beta2 | Set the exponential decay rate for the 2nd moment estimates | 0.999 | \ |
| :----------------------: | :---------------------: | :--------------: | :--------------------: | | **lr** | Set the learning rate decay method | - | \ |
| reader_function | Select data reading method | ppocr.data.rec.dataset_traversal,SimpleReader | Support two data reading methods: SimpleReader / LMDBReader | | name | Learning rate decay class name | Cosine | Currently supports`Linear`,`Cosine`,`Step`,`Piecewise`, see[ppocr/optimizer/learning_rate.py](../../ppocr/optimizer/learning_rate.py) |
| num_workers | Set the number of data reading threads | 8 | \ | | learning_rate | Set the base learning rate | 0.001 | \ |
| img_set_dir | Image folder path | ./train_data | \ | | **regularizer** | Set network regularization method | - | \ |
| label_file_path | Groundtruth file path | ./train_data/rec_gt_train.txt| \ | | name | Regularizer class name | L2 | Currently support`L1`,`L2`, see[ppocr/optimizer/regularizer.py](../../ppocr/optimizer/regularizer.py) |
| infer_img | Result folder path | ./infer_img | \| | factor | Regularization coefficient | 0.00004 | \ |
### Architecture ([ppocr/modeling](../../ppocr/modeling))
In ppocr, the network is divided into four stages: Transform, Backbone, Neck and Head
| Parameter | Use | Defaults | Note |
| :---------------------: | :---------------------: | :--------------: | :--------------------: |
| model_type | Network Type | rec | Currently support`rec`,`det`,`cls` |
| algorithm | Model name | CRNN | See [algorithm_overview](./algorithm_overview.md) for the support list |
| **Transform** | Set the transformation method | - | Currently only recognition algorithms are supported, see [ppocr/modeling/transform](../../ppocr/modeling/transform) for details |
| name | Transformation class name | TPS | Currently supports `TPS` |
| num_fiducial | Number of TPS control points | 20 | Ten on the top and bottom |
| loc_lr | Localization network learning rate | 0.1 | \ |
| model_name | Localization network size | small | Currently support`small`,`large` |
| **Backbone** | Set the network backbone class name | - | see [ppocr/modeling/backbones](../../ppocr/modeling/backbones) |
| name | backbone class name | ResNet | Currently support`MobileNetV3`,`ResNet` |
| layers | resnet layers | 34 | Currently support18,34,50,101,152,200 |
| model_name | MobileNetV3 network size | small | Currently support`small`,`large` |
| **Neck** | Set network neck | - | see[ppocr/modeling/necks](../../ppocr/modeling/necks) |
| name | neck class name | SequenceEncoder | Currently support`SequenceEncoder`,`DBFPN` |
| encoder_type | SequenceEncoder encoder type | rnn | Currently support`reshape`,`fc`,`rnn` |
| hidden_size | rnn number of internal units | 48 | \ |
| out_channels | Number of DBFPN output channels | 256 | \ |
| **Head** | Set the network head | - | see[ppocr/modeling/heads](../../ppocr/modeling/heads) |
| name | head class name | CTCHead | Currently support`CTCHead`,`DBHead`,`ClsHead` |
| fc_decay | CTCHead regularization coefficient | 0.0004 | \ |
| k | DBHead binarization coefficient | 50 | \ |
| class_dim | ClsHead output category number | 2 | \ |
### Loss ([ppocr/losses](../../ppocr/losses))
| Parameter | Use | Defaults | Note |
| :---------------------: | :---------------------: | :--------------: | :--------------------: |
| name | loss class name | CTCLoss | Currently support`CTCLoss`,`DBLoss`,`ClsLoss` |
| balance_loss | Whether to balance the number of positive and negative samples in DBLoss (using OHEM) | True | \ |
| ohem_ratio | The negative and positive sample ratio of OHEM in DBLoss | 3 | \ |
| main_loss_type | The loss used by shrink_map in DBLoss | DiceLoss | Currently support`DiceLoss`,`BCELoss` |
| alpha | The coefficient of shrink_map_loss in DBLoss | 5 | \ |
| beta | The coefficient of threshold_map_loss in DBLoss | 10 | \ |
## INTRODUCTION TO OPTIMIZER PARAMETERS OF CONFIGURATION FILE ### PostProcess ([ppocr/postprocess](../../ppocr/postprocess))
Take `rec_icdar15_train.yml` as an example: | Parameter | Use | Defaults | Note |
| :---------------------: | :---------------------: | :--------------: | :--------------------: |
| name | Post-processing class name | CTCLabelDecode | Currently support`CTCLabelDecode`,`AttnLabelDecode`,`DBPostProcess`,`ClsPostProcess` |
| thresh | The threshold for binarization of the segmentation map in DBPostProcess | 0.3 | \ |
| box_thresh | The threshold for filtering output boxes in DBPostProcess. Boxes below this threshold will not be output | 0.7 | \ |
| max_candidates | The maximum number of text boxes output in DBPostProcess | 1000 | |
| unclip_ratio | The unclip ratio of the text box in DBPostProcess | 2.0 | \ |
### Metric ([ppocr/metrics](../../ppocr/metrics))
| Parameter | Use | Defaults | Note |
| :---------------------: | :---------------------: | :--------------: | :--------------------: |
| name | Metric method name | RecMetric | Currently support`DetMetric`,`RecMetric`,`ClsMetric` |
| main_indicator | Main indicators, used to select the best model | acc | For the detection method is hmean, the recognition and classification method is acc |
| Parameter | Use | Default | None | ### Dataset ([ppocr/data](../../ppocr/data))
| Parameter | Use | Defaults | Note |
| :---------------------: | :---------------------: | :--------------: | :--------------------: | | :---------------------: | :---------------------: | :--------------: | :--------------------: |
| function | Select Optimizer function | pocr.optimizer,AdamDecay | Only support Adam | | **dataset** | Return one sample per iteration | - | - |
| base_lr | Set the base lr | 0.0005 | \ | | name | dataset class name | SimpleDataSet | Currently support`SimpleDataSet`,`LMDBDateSet` |
| beta1 | Set the exponential decay rate for the 1st moment estimates | 0.9 | \ | | data_dir | Image folder path | ./train_data | \ |
| beta2 | Set the exponential decay rate for the 2nd moment estimates | 0.999 | \ | | label_file_list | Groundtruth file path | ["./train_data/train_list.txt"] | This parameter is not required when dataset is LMDBDateSet |
| decay | Whether to use decay | \ | \ | | ratio_list | Ratio of data set | [1.0] | If there are two train_lists in label_file_list and ratio_list is [0.4,0.6], 40% will be sampled from train_list1, and 60% will be sampled from train_list2 to combine the entire dataset |
| function(decay) | Set the decay function | cosine_decay | Support cosine_decay, cosine_decay_warmup and piecewise_decay | | transforms | List of methods to transform images and labels | [DecodeImage,CTCLabelEncode,RecResizeImg,KeepKeys] | see[ppocr/data/imaug](../../ppocr/data/imaug) |
| step_each_epoch | The number of steps in an epoch. Used in cosine_decay/cosine_decay_warmup | 20 | Calculation: total_image_num / (batch_size_per_card * card_size) | | **loader** | dataloader related | - | |
| total_epoch | The number of epochs. Used in cosine_decay/cosine_decay_warmup | 1000 | Consistent with Global.epoch_num | | shuffle | Whether to shuffle the order of the data set at each epoch | True | \ |
| warmup_minibatch | Number of steps for linear warmup. Used in cosine_decay_warmup | 1000 | \ | | batch_size_per_card | Single card batch size during training | 256 | \ |
| boundaries | The step intervals to reduce learning rate. Used in piecewise_decay | - | The format is list | | drop_last | Whether to discard the last incomplete mini-batch because the number of samples in the data set cannot be divisible by batch_size | True | \ |
| decay_rate | Learning rate decay rate. Used in piecewise_decay | - | \ | | num_workers | The number of sub-processes used to load data, if it is 0, the sub-process is not started, and the data is loaded in the main process | 8 | \ |
\ No newline at end of file
...@@ -135,7 +135,7 @@ If you need to customize dic file, please add character_dict_path field in confi ...@@ -135,7 +135,7 @@ If you need to customize dic file, please add character_dict_path field in confi
<a name="Add_space_category"></a> <a name="Add_space_category"></a>
- Add space category - Add space category
If you want to support the recognition of the `space` category, please set the `use_space_char` field in the yml file to `true`. If you want to support the recognition of the `space` category, please set the `use_space_char` field in the yml file to `True`.
**Note: use_space_char only takes effect when character_type=ch** **Note: use_space_char only takes effect when character_type=ch**
...@@ -158,10 +158,9 @@ tar -xf rec_mv3_none_bilstm_ctc.tar && rm -rf rec_mv3_none_bilstm_ctc.tar ...@@ -158,10 +158,9 @@ tar -xf rec_mv3_none_bilstm_ctc.tar && rm -rf rec_mv3_none_bilstm_ctc.tar
Start training: Start training:
``` ```
# GPU training Support single card and multi-card training, specify the card number through CUDA_VISIBLE_DEVICES # GPU training Support single card and multi-card training, specify the card number through --gpus
# Training icdar15 English data and saving the log as train_rec.log # Training icdar15 English data and saving the log as train_rec.log
python3 tools/train.py -c configs/rec/rec_icdar15_train.yml 2>&1 | tee train_rec.log python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_icdar15_train.yml
``` ```
<a name="Data_Augmentation"></a> <a name="Data_Augmentation"></a>
- Data Augmentation - Data Augmentation
...@@ -184,8 +183,8 @@ If the evaluation set is large, the test will be time-consuming. It is recommend ...@@ -184,8 +183,8 @@ If the evaluation set is large, the test will be time-consuming. It is recommend
| Configuration file | Algorithm | backbone | trans | seq | pred | | Configuration file | Algorithm | backbone | trans | seq | pred |
| :--------: | :-------: | :-------: | :-------: | :-----: | :-----: | | :--------: | :-------: | :-------: | :-------: | :-----: | :-----: |
| [rec_chinese_lite_train_v1.1.yml](../../configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml) | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc |
| [rec_chinese_common_train_v1.1.yml](../../configs/rec/ch_ppocr_v1.1/rec_chinese_common_train_v1.1.yml) | CRNN | ResNet34_vd | None | BiLSTM | ctc | | [rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml) | CRNN | ResNet34_vd | None | BiLSTM | ctc |
| rec_chinese_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | | rec_chinese_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc |
| rec_chinese_common_train.yml | CRNN | ResNet34_vd | None | BiLSTM | ctc | | rec_chinese_common_train.yml | CRNN | ResNet34_vd | None | BiLSTM | ctc |
| rec_icdar15_train.yml | CRNN | Mobilenet_v3 large 0.5 | None | BiLSTM | ctc | | rec_icdar15_train.yml | CRNN | Mobilenet_v3 large 0.5 | None | BiLSTM | ctc |
...@@ -199,39 +198,69 @@ If the evaluation set is large, the test will be time-consuming. It is recommend ...@@ -199,39 +198,69 @@ If the evaluation set is large, the test will be time-consuming. It is recommend
| rec_r34_vd_tps_bilstm_ctc.yml | STARNet | Resnet34_vd | tps | BiLSTM | ctc | | rec_r34_vd_tps_bilstm_ctc.yml | STARNet | Resnet34_vd | tps | BiLSTM | ctc |
For training Chinese data, it is recommended to use For training Chinese data, it is recommended to use
训练中文数据,推荐使用[rec_chinese_lite_train_v1.1.yml](../../configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml). If you want to try the result of other algorithms on the Chinese data set, please refer to the following instructions to modify the configuration file: [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml). If you want to try the result of other algorithms on the Chinese data set, please refer to the following instructions to modify the configuration file:
co co
Take `rec_mv3_none_none_ctc.yml` as an example: Take `rec_chinese_lite_train_v2.0.yml` as an example:
``` ```
Global: Global:
... ...
# Modify image_shape to fit long text # Add a custom dictionary, such as modify the dictionary, please point the path to the new dictionary
image_shape: [3, 32, 320] character_dict_path: ppocr/utils/ppocr_keys_v1.txt
# Modify character type # Modify character type
character_type: ch character_type: ch
# Add a custom dictionary, such as modify the dictionary, please point the path to the new dictionary
character_dict_path: ./ppocr/utils/ppocr_keys_v1.txt
... ...
# Modify reader type
reader_yml: ./configs/rec/rec_chinese_reader.yml
# Whether to use data augmentation
distort: true
# Whether to recognize spaces # Whether to recognize spaces
use_space_char: true use_space_char: True
Optimizer: Optimizer:
... ...
# Add learning rate decay strategy # Add learning rate decay strategy
decay: lr:
function: cosine_decay name: Cosine
# Each epoch contains iter number learning_rate: 0.001
step_each_epoch: 20 ...
# Total epoch number
total_epoch: 1000 ...
# Type of dataset,we support LMDBDateSet and SimpleDataSet
name: SimpleDataSet
# Path of dataset
data_dir: ./train_data/
# Path of train list
label_file_list: ["./train_data/train_list.txt"]
- RecResizeImg:
# Modify image_shape to fit long text
image_shape: [3, 32, 320]
# Train batch_size for Single card
batch_size_per_card: 256
# Type of dataset,we support LMDBDateSet and SimpleDataSet
name: SimpleDataSet
# Path of dataset
data_dir: ./train_data
# Path of eval list
label_file_list: ["./train_data/val_list.txt"]
- RecResizeImg:
# Modify image_shape to fit long text
image_shape: [3, 32, 320]
# Eval batch_size for Single card
batch_size_per_card: 256
``` ```
**Note that the configuration file for prediction/evaluation must be consistent with the training.** **Note that the configuration file for prediction/evaluation must be consistent with the training.**
...@@ -257,18 +286,33 @@ Take `rec_french_lite_train` as an example: ...@@ -257,18 +286,33 @@ Take `rec_french_lite_train` as an example:
``` ```
Global: Global:
... ...
# Add a custom dictionary, if you modify the dictionary # Add a custom dictionary, such as modify the dictionary, please point the path to the new dictionary
# please point the path to the new dictionary
character_dict_path: ./ppocr/utils/dict/french_dict.txt character_dict_path: ./ppocr/utils/dict/french_dict.txt
# Add data augmentation during training
distort: true
# Identify spaces
use_space_char: true
# Modify reader type
reader_yml: ./configs/rec/multi_languages/rec_french_reader.yml
... ...
# Whether to recognize spaces
use_space_char: True
... ...
# Type of dataset,we support LMDBDateSet and SimpleDataSet
name: SimpleDataSet
# Path of dataset
data_dir: ./train_data/
# Path of train list
label_file_list: ["./train_data/french_train.txt"]
# Type of dataset,we support LMDBDateSet and SimpleDataSet
name: SimpleDataSet
# Path of dataset
data_dir: ./train_data
# Path of eval list
label_file_list: ["./train_data/french_val.txt"]
``` ```
<a name="EVALUATION"></a> <a name="EVALUATION"></a>
...@@ -277,9 +321,8 @@ Global: ...@@ -277,9 +321,8 @@ Global:
The evaluation data set can be modified via `configs/rec/rec_icdar15_reader.yml` setting of `label_file_path` in EvalReader. The evaluation data set can be modified via `configs/rec/rec_icdar15_reader.yml` setting of `label_file_path` in EvalReader.
``` ```
# GPU evaluation, Global.checkpoints is the weight to be tested # GPU evaluation, Global.checkpoints is the weight to be tested
python3 tools/eval.py -c configs/rec/rec_icdar15_reader.yml -o Global.checkpoints={path/to/weights}/best_accuracy python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_icdar15_reader.yml -o Global.checkpoints={path/to/weights}/best_accuracy
``` ```
<a name="PREDICTION"></a> <a name="PREDICTION"></a>
...@@ -294,7 +337,7 @@ The default prediction picture is stored in `infer_img`, and the weight is speci ...@@ -294,7 +337,7 @@ The default prediction picture is stored in `infer_img`, and the weight is speci
``` ```
# Predict English results # Predict English results
python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml -o Global.checkpoints={path/to/weights}/best_accuracy TestReader.infer_img=doc/imgs_words/en/word_1.jpg python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.checkpoints={path/to/weights}/best_accuracy TestReader.infer_img=doc/imgs_words/en/word_1.jpg
``` ```
Input image: Input image:
...@@ -309,11 +352,11 @@ infer_img: doc/imgs_words/en/word_1.png ...@@ -309,11 +352,11 @@ infer_img: doc/imgs_words/en/word_1.png
word : joint word : joint
``` ```
The configuration file used for prediction must be consistent with the training. For example, you completed the training of the Chinese model with `python3 tools/train.py -c configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml`, you can use the following command to predict the Chinese model: The configuration file used for prediction must be consistent with the training. For example, you completed the training of the Chinese model with `python3 tools/train.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml`, you can use the following command to predict the Chinese model:
``` ```
# Predict Chinese results # Predict Chinese results
python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml -o Global.checkpoints={path/to/weights}/best_accuracy TestReader.infer_img=doc/imgs_words/ch/word_1.jpg python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.checkpoints={path/to/weights}/best_accuracy TestReader.infer_img=doc/imgs_words/ch/word_1.jpg
``` ```
Input image: Input image:
...@@ -271,6 +271,59 @@ im_show.save('result.jpg') ...@@ -271,6 +271,59 @@ im_show.save('result.jpg')
paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_dir} --rec_model_dir {your_rec_model_dir} --rec_char_dict_path {your_rec_char_dict_path} --cls_model_dir {your_cls_model_dir} --use_angle_cls true --cls true paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_dir} --rec_model_dir {your_rec_model_dir} --rec_char_dict_path {your_rec_char_dict_path} --cls_model_dir {your_cls_model_dir} --use_angle_cls true --cls true
``` ```
### Use web images or numpy array as input
1. Web image
Use by code
from io import BytesIO
from urllib.request import urlopen

from paddleocr import PaddleOCR, draw_ocr

ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg'
result = ocr.ocr(img_path, cls=True)
for line in result:
    print(line)

# show result
from PIL import Image

# PIL cannot open a URL directly, so download the image bytes first.
with urlopen(img_path) as response:
    image = Image.open(BytesIO(response.read())).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
scores = [line[1][1] for line in result]
im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/simfang.ttf')
im_show = Image.fromarray(im_show)
Use by command line
paddleocr --image_dir http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg --use_angle_cls=true
2. Numpy array
Support numpy array as input only when used by code
import cv2
from paddleocr import PaddleOCR, draw_ocr

ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs/11.jpg'
img = cv2.imread(img_path)
# img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # uncomment this line if your own training model supports grayscale images
# Pass the decoded array, not the path, so the numpy-array input is what gets recognized.
result = ocr.ocr(img, cls=True)
for line in result:
    print(line)

# show result
from PIL import Image

image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
scores = [line[1][1] for line in result]
im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/simfang.ttf')
im_show = Image.fromarray(im_show)
## Parameter Description ## Parameter Description
| Parameter | Description | Default value | | Parameter | Description | Default value |
...@@ -295,6 +348,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_ ...@@ -295,6 +348,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_
| max_text_length | The maximum text length that the recognition algorithm can recognize | 25 | | max_text_length | The maximum text length that the recognition algorithm can recognize | 25 |
| rec_char_dict_path | the alphabet path which needs to be modified to your own path when `rec_model_Name` use mode 2 | ./ppocr/utils/ppocr_keys_v1.txt | | rec_char_dict_path | the alphabet path which needs to be modified to your own path when `rec_model_Name` use mode 2 | ./ppocr/utils/ppocr_keys_v1.txt |
| use_space_char | Whether to recognize spaces | TRUE | | use_space_char | Whether to recognize spaces | TRUE |
| drop_score | Filter the output by score (from the recognition model), and those below this score will not be returned | 0.5 |
| use_angle_cls | Whether to load classification model | FALSE | | use_angle_cls | Whether to load classification model | FALSE |
| cls_model_dir | the classification inference model folder. There are two ways to transfer parameters, 1. None: Automatically download the built-in model to `~/.paddleocr/cls`; 2. The path of the inference model converted by yourself, the model and params files must be included in the model path | None | | cls_model_dir | the classification inference model folder. There are two ways to transfer parameters, 1. None: Automatically download the built-in model to `~/.paddleocr/cls`; 2. The path of the inference model converted by yourself, the model and params files must be included in the model path | None |
| cls_image_shape | image shape of classification algorithm | "3,48,192" | | cls_image_shape | image shape of classification algorithm | "3,48,192" |
...@@ -305,4 +359,4 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_ ...@@ -305,4 +359,4 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_
| lang | The support language, now only Chinese(ch)、English(en)、French(french)、German(german)、Korean(korean)、Japanese(japan) are supported | ch | | lang | The support language, now only Chinese(ch)、English(en)、French(french)、German(german)、Korean(korean)、Japanese(japan) are supported | ch |
| det | Enable detection when `ppocr.ocr` func exec | TRUE | | det | Enable detection when `ppocr.ocr` func exec | TRUE |
| rec | Enable recognition when `ppocr.ocr` func exec | TRUE | | rec | Enable recognition when `ppocr.ocr` func exec | TRUE |
| cls | Enable classification when `ppocr.ocr` func exec | FALSE | | cls | Enable classification when `ppocr.ocr` func exec (Use use_angle_cls in command line mode to control whether to start classification in the forward direction) | FALSE |
...@@ -26,6 +26,9 @@ from .randaugment import RandAugment ...@@ -26,6 +26,9 @@ from .randaugment import RandAugment
from .operators import * from .operators import *
from .label_ops import * from .label_ops import *
from .east_process import *
from .sast_process import *
def transform(data, ops=None): def transform(data, ops=None):
""" transform """ """ transform """
...@@ -52,6 +52,7 @@ class DetLabelEncode(object): ...@@ -52,6 +52,7 @@ class DetLabelEncode(object):
txt_tags.append(True) txt_tags.append(True)
else: else:
txt_tags.append(False) txt_tags.append(False)
boxes = self.expand_points_num(boxes)
boxes = np.array(boxes, dtype=np.float32) boxes = np.array(boxes, dtype=np.float32)
txt_tags = np.array(txt_tags, dtype=np.bool) txt_tags = np.array(txt_tags, dtype=np.bool)
...@@ -70,6 +71,17 @@ class DetLabelEncode(object): ...@@ -70,6 +71,17 @@ class DetLabelEncode(object):
rect[3] = pts[np.argmax(diff)] rect[3] = pts[np.argmax(diff)]
return rect return rect
def expand_points_num(self, boxes):
    """Pad every box so that all boxes have the same number of points.

    Boxes with fewer points than the longest box are extended by
    repeating their last point, so the result is rectangular.

    Args:
        boxes: A list of boxes, each box being a non-empty list of points.

    Returns:
        A new list of boxes, each with as many points as the longest
        input box. The input lists are not modified.
    """
    # `default=0` keeps the empty-input case returning an empty list.
    max_points_num = max((len(box) for box in boxes), default=0)
    # Collect the padded boxes; without this the result is always empty.
    return [
        box + [box[-1]] * (max_points_num - len(box))
        for box in boxes
    ]
class BaseRecLabelEncode(object): class BaseRecLabelEncode(object):
""" Convert between text-label and text-index """ """ Convert between text-label and text-index """
...@@ -83,7 +95,7 @@ class BaseRecLabelEncode(object): ...@@ -83,7 +95,7 @@ class BaseRecLabelEncode(object):
'ch', 'en', 'en_sensitive', 'french', 'german', 'japan', 'korean' 'ch', 'en', 'en_sensitive', 'french', 'german', 'japan', 'korean'
] ]
assert character_type in support_character_type, "Only {} are supported now but get {}".format( assert character_type in support_character_type, "Only {} are supported now but get {}".format(
support_character_type, self.character_str) support_character_type, character_type)
self.max_text_len = max_text_length self.max_text_len = max_text_length
if character_type == "en": if character_type == "en":
...@@ -27,14 +27,13 @@ class SimpleDataSet(Dataset): ...@@ -27,14 +27,13 @@ class SimpleDataSet(Dataset):
global_config = config['Global'] global_config = config['Global']
dataset_config = config[mode]['dataset'] dataset_config = config[mode]['dataset']
loader_config = config[mode]['loader'] loader_config = config[mode]['loader']
batch_size = loader_config['batch_size_per_card']
self.delimiter = dataset_config.get('delimiter', '\t') self.delimiter = dataset_config.get('delimiter', '\t')
label_file_list = dataset_config.pop('label_file_list') label_file_list = dataset_config.pop('label_file_list')
data_source_num = len(label_file_list) data_source_num = len(label_file_list)
ratio_list = dataset_config.get("ratio_list", [1.0]) ratio_list = dataset_config.get("ratio_list", [1.0])
if isinstance(ratio_list, (float, int)): if isinstance(ratio_list, (float, int)):
ratio_list = [float(ratio_list)] * len(data_source_num) ratio_list = [float(ratio_list)] * int(data_source_num)
assert len( assert len(
ratio_list ratio_list
...@@ -76,6 +75,8 @@ class SimpleDataSet(Dataset): ...@@ -76,6 +75,8 @@ class SimpleDataSet(Dataset):
label = substr[1] label = substr[1]
img_path = os.path.join(self.data_dir, file_name) img_path = os.path.join(self.data_dir, file_name)
data = {'img_path': img_path, 'label': label} data = {'img_path': img_path, 'label': label}
if not os.path.exists(img_path):
raise Exception("{} does not exist!".format(img_path))
with open(data['img_path'], 'rb') as f: with open(data['img_path'], 'rb') as f:
img = f.read() img = f.read()
data['image'] = img data['image'] = img
...@@ -18,6 +18,8 @@ import copy ...@@ -18,6 +18,8 @@ import copy
def build_loss(config): def build_loss(config):
# det loss # det loss
from .det_db_loss import DBLoss from .det_db_loss import DBLoss
from .det_east_loss import EASTLoss
from .det_sast_loss import SASTLoss
# rec loss # rec loss
from .rec_ctc_loss import CTCLoss from .rec_ctc_loss import CTCLoss
...@@ -25,7 +27,7 @@ def build_loss(config): ...@@ -25,7 +27,7 @@ def build_loss(config):
# cls loss # cls loss
from .cls_loss import ClsLoss from .cls_loss import ClsLoss
support_dict = ['DBLoss', 'CTCLoss', 'ClsLoss'] support_dict = ['DBLoss', 'EASTLoss', 'SASTLoss', 'CTCLoss', 'ClsLoss']
config = copy.deepcopy(config) config = copy.deepcopy(config)
module_name = config.pop('name') module_name = config.pop('name')
...@@ -16,7 +16,7 @@ from __future__ import division ...@@ -16,7 +16,7 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
from paddle import nn from paddle import nn
from ppocr.modeling.transform import build_transform from ppocr.modeling.transforms import build_transform
from ppocr.modeling.backbones import build_backbone from ppocr.modeling.backbones import build_backbone
from ppocr.modeling.necks import build_neck from ppocr.modeling.necks import build_neck
from ppocr.modeling.heads import build_head from ppocr.modeling.heads import build_head
...@@ -19,6 +19,7 @@ def build_backbone(config, model_type): ...@@ -19,6 +19,7 @@ def build_backbone(config, model_type):
if model_type == 'det': if model_type == 'det':
from .det_mobilenet_v3 import MobileNetV3 from .det_mobilenet_v3 import MobileNetV3
from .det_resnet_vd import ResNet from .det_resnet_vd import ResNet
from .det_resnet_vd_sast import ResNet_SAST
support_dict = ['MobileNetV3', 'ResNet', 'ResNet_SAST'] support_dict = ['MobileNetV3', 'ResNet', 'ResNet_SAST']
elif model_type == 'rec' or model_type == 'cls': elif model_type == 'rec' or model_type == 'cls':
from .rec_mobilenet_v3 import MobileNetV3 from .rec_mobilenet_v3 import MobileNetV3
...@@ -27,7 +27,7 @@ class BaseRecLabelDecode(object): ...@@ -27,7 +27,7 @@ class BaseRecLabelDecode(object):
'ch', 'en', 'en_sensitive', 'french', 'german', 'japan', 'korean' 'ch', 'en', 'en_sensitive', 'french', 'german', 'japan', 'korean'
] ]
assert character_type in support_character_type, "Only {} are supported now but get {}".format( assert character_type in support_character_type, "Only {} are supported now but get {}".format(
support_character_type, self.character_str) support_character_type, character_type)
if character_type == "en": if character_type == "en":
self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
想要评论请 注册