提交 aad3093a 编写于 作者: W WenmuZhou

dygraph first commit

上级 10f7e519
TrainReader:
reader_function: ppocr.data.det.dataset_traversal,TrainReader
process_function: ppocr.data.det.db_process,DBProcessTrain
num_workers: 8
img_set_dir: ./train_data/icdar2015/text_localization/
label_file_path: ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
EvalReader:
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
process_function: ppocr.data.det.db_process,DBProcessTest
img_set_dir: ./train_data/icdar2015/text_localization/
label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
test_image_shape: [736, 1280]
TestReader:
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
process_function: ppocr.data.det.db_process,DBProcessTest
infer_img:
img_set_dir: ./train_data/icdar2015/text_localization/
label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
test_image_shape: [736, 1280]
do_eval: True
TrainReader:
reader_function: ppocr.data.det.dataset_traversal,TrainReader
process_function: ppocr.data.det.east_process,EASTProcessTrain
num_workers: 8
img_set_dir: ./train_data/icdar2015/text_localization/
label_file_path: ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
background_ratio: 0.125
min_crop_side_ratio: 0.1
min_text_size: 10
EvalReader:
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
process_function: ppocr.data.det.east_process,EASTProcessTest
img_set_dir: ./train_data/icdar2015/text_localization/
label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
TestReader:
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
process_function: ppocr.data.det.east_process,EASTProcessTest
infer_img:
img_set_dir: ./train_data/icdar2015/text_localization/
label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
do_eval: True
Global:
algorithm: DB
use_gpu: true
epoch_num: 1200
log_smooth_window: 20
print_batch_step: 2
save_model_dir: ./output/det_db/
save_epoch_step: 200
save_model_dir: ./output/20201010/
save_epoch_step: 1200
# evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: [4000, 5000]
train_batch_size_per_card: 16
test_batch_size_per_card: 16
image_shape: [3, 640, 640]
reader_yml: ./configs/det/det_db_icdar15_reader.yml
pretrain_weights: ./pretrain_models/MobileNetV3_large_x0_5_pretrained/
checkpoints:
save_res_path: ./output/det_db/predicts_db.txt
eval_batch_step: 8
# if pretrained_model is saved in static mode, load_static_weights must set to True
load_static_weights: True
cal_metric_during_train: False
pretrained_model: /home/zhoujun20/pretrain_models/MobileNetV3_large_x0_5_pretrained
checkpoints: #./output/det_db_0.001_DiceLoss_256_pp_config_2.0b_4gpu/best_accuracy
save_inference_dir:
use_visualdl: True
infer_img: doc/imgs_en/img_10.jpg
save_res_path: ./output/det_db/predicts_db.txt
Architecture:
function: ppocr.modeling.architectures.det_model,DetModel
Optimizer:
name: Adam
beta1: 0.9
beta2: 0.999
learning_rate:
# name: Cosine
lr: 0.001
# warmup_epoch: 0
regularizer:
name: 'L2'
factor: 0
Backbone:
function: ppocr.modeling.backbones.det_mobilenet_v3,MobileNetV3
Architecture:
type: det
algorithm: DB
Transform:
Backbone:
name: MobileNetV3
scale: 0.5
model_name: large
Head:
function: ppocr.modeling.heads.det_db_head,DBHead
model_name: large
Neck:
name: FPN
out_channels: 256
Head:
name: DBHead
k: 50
inner_channels: 96
out_channels: 2
Loss:
function: ppocr.modeling.losses.det_db_loss,DBLoss
name: DBLoss
balance_loss: true
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
PostProcess:
function: ppocr.postprocess.db_postprocess,DBPostProcess
name: DBPostProcess
thresh: 0.3
box_thresh: 0.7
box_thresh: 0.6
max_candidates: 1000
unclip_ratio: 2.0
unclip_ratio: 1.5
Metric:
name: DetMetric
main_indicator: hmean
TRAIN:
dataset:
name: SimpleDataSet
data_dir: /home/zhoujun20/detection/
file_list:
- /home/zhoujun20/detection/train_icdar2015_label.txt # dataset1
ratio_list: [1.0]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- IaaAugment:
augmenter_args:
- { 'type': Fliplr, 'args': { 'p': 0.5 } }
- { 'type': Affine, 'args': { 'rotate': [ -10,10 ] } }
- { 'type': Resize,'args': { 'size': [ 0.5,3 ] } }
- EastRandomCropData:
size: [ 640,640 ]
max_tries: 50
keep_ratio: true
- MakeBorderMap:
shrink_ratio: 0.4
thresh_min: 0.3
thresh_max: 0.7
- MakeShrinkMap:
shrink_ratio: 0.4
min_text_size: 8
- NormalizeImage:
scale: 1./255.
mean: [ 0.485, 0.456, 0.406 ]
std: [ 0.229, 0.224, 0.225 ]
order: 'hwc'
- ToCHWImage:
- keepKeys:
keep_keys: ['image','threshold_map','threshold_mask','shrink_map','shrink_mask'] # dataloader将按照此顺序返回list
loader:
shuffle: True
drop_last: False
batch_size: 16
num_workers: 6
EVAL:
dataset:
name: SimpleDataSet
data_dir: /home/zhoujun20/detection/
file_list:
- /home/zhoujun20/detection/test_icdar2015_label.txt
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- DetResizeForTest:
image_shape: [736,1280]
- NormalizeImage:
scale: 1./255.
mean: [ 0.485, 0.456, 0.406 ]
std: [ 0.229, 0.224, 0.225 ]
order: 'hwc'
- ToCHWImage:
- keepKeys:
keep_keys: ['image','shape','polys','ignore_tags']
loader:
shuffle: False
drop_last: False
batch_size: 1 # must be 1
num_workers: 6
\ No newline at end of file
Global:
algorithm: EAST
use_gpu: true
epoch_num: 100000
log_smooth_window: 20
print_batch_step: 5
save_model_dir: ./output/det_east/
save_epoch_step: 200
eval_batch_step: [5000, 5000]
train_batch_size_per_card: 16
test_batch_size_per_card: 16
image_shape: [3, 512, 512]
reader_yml: ./configs/det/det_east_icdar15_reader.yml
pretrain_weights: ./pretrain_models/MobileNetV3_large_x0_5_pretrained/
checkpoints:
save_res_path: ./output/det_east/predicts_east.txt
save_inference_dir:
Architecture:
function: ppocr.modeling.architectures.det_model,DetModel
Backbone:
function: ppocr.modeling.backbones.det_mobilenet_v3,MobileNetV3
scale: 0.5
model_name: large
Head:
function: ppocr.modeling.heads.det_east_head,EASTHead
model_name: small
Loss:
function: ppocr.modeling.losses.det_east_loss,EASTLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
PostProcess:
function: ppocr.postprocess.east_postprocess,EASTPostPocess
score_thresh: 0.8
cover_thresh: 0.1
nms_thresh: 0.2
Global:
algorithm: DB
use_gpu: true
epoch_num: 1200
log_smooth_window: 20
print_batch_step: 2
save_model_dir: ./output/det_db/
save_epoch_step: 200
eval_batch_step: [5000, 5000]
train_batch_size_per_card: 8
test_batch_size_per_card: 16
image_shape: [3, 640, 640]
reader_yml: ./configs/det/det_db_icdar15_reader.yml
pretrain_weights: ./pretrain_models/ResNet50_vd_ssld_pretrained/
save_res_path: ./output/det_db/predicts_db.txt
checkpoints:
save_model_dir: ./output/20201010/
save_epoch_step: 1200
# evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: 8
# if pretrained_model is saved in static mode, load_static_weights must set to True
load_static_weights: True
cal_metric_during_train: False
pretrained_model: /home/zhoujun20/pretrain_models/MobileNetV3_large_x0_5_pretrained
checkpoints: #./output/det_db_0.001_DiceLoss_256_pp_config_2.0b_4gpu/best_accuracy
save_inference_dir:
use_visualdl: True
infer_img: doc/imgs_en/img_10.jpg
save_res_path: ./output/det_db/predicts_db.txt
Architecture:
function: ppocr.modeling.architectures.det_model,DetModel
Optimizer:
name: Adam
beta1: 0.9
beta2: 0.999
learning_rate:
# name: Cosine
lr: 0.001
# warmup_epoch: 0
regularizer:
name: 'L2'
factor: 0
Backbone:
function: ppocr.modeling.backbones.det_resnet_vd,ResNet
Architecture:
type: det
algorithm: DB
Transform:
Backbone:
name: ResNet
layers: 50
Head:
function: ppocr.modeling.heads.det_db_head,DBHead
model_name: large
Neck:
name: FPN
out_channels: 256
Head:
name: DBHead
k: 50
inner_channels: 256
out_channels: 2
Loss:
function: ppocr.modeling.losses.det_db_loss,DBLoss
name: DBLoss
balance_loss: true
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
PostProcess:
function: ppocr.postprocess.db_postprocess,DBPostProcess
name: DBPostProcess
thresh: 0.3
box_thresh: 0.7
box_thresh: 0.6
max_candidates: 1000
unclip_ratio: 1.5
Metric:
name: DetMetric
main_indicator: hmean
TRAIN:
dataset:
name: SimpleDataSet
data_dir: /home/zhoujun20/detection/
file_list:
- /home/zhoujun20/detection/train_icdar2015_label.txt # dataset1
ratio_list: [1.0]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- IaaAugment:
augmenter_args:
- { 'type': Fliplr, 'args': { 'p': 0.5 } }
- { 'type': Affine, 'args': { 'rotate': [ -10,10 ] } }
- { 'type': Resize,'args': { 'size': [ 0.5,3 ] } }
- EastRandomCropData:
size: [ 640,640 ]
max_tries: 50
keep_ratio: true
- MakeBorderMap:
shrink_ratio: 0.4
thresh_min: 0.3
thresh_max: 0.7
- MakeShrinkMap:
shrink_ratio: 0.4
min_text_size: 8
- NormalizeImage:
scale: 1./255.
mean: [ 0.485, 0.456, 0.406 ]
std: [ 0.229, 0.224, 0.225 ]
order: 'hwc'
- ToCHWImage:
- keepKeys:
keep_keys: ['image','threshold_map','threshold_mask','shrink_map','shrink_mask'] # dataloader将按照此顺序返回list
loader:
shuffle: True
drop_last: False
batch_size: 16
num_workers: 6
EVAL:
dataset:
name: SimpleDataSet
data_dir: /home/zhoujun20/detection/
file_list:
- /home/zhoujun20/detection/test_icdar2015_label.txt
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- DetResizeForTest:
image_shape: [736,1280]
- NormalizeImage:
scale: 1./255.
mean: [ 0.485, 0.456, 0.406 ]
std: [ 0.229, 0.224, 0.225 ]
order: 'hwc'
- ToCHWImage:
- keepKeys:
keep_keys: ['image','shape','polys','ignore_tags']
loader:
shuffle: False
drop_last: False
batch_size: 1 # must be 1
num_workers: 6
\ No newline at end of file
Global:
algorithm: EAST
use_gpu: true
epoch_num: 100000
log_smooth_window: 20
print_batch_step: 5
save_model_dir: ./output/det_east/
save_epoch_step: 200
eval_batch_step: [5000, 5000]
train_batch_size_per_card: 8
test_batch_size_per_card: 16
image_shape: [3, 512, 512]
reader_yml: ./configs/det/det_east_icdar15_reader.yml
pretrain_weights: ./pretrain_models/ResNet50_vd_ssld_pretrained/
save_res_path: ./output/det_east/predicts_east.txt
checkpoints:
save_inference_dir:
Architecture:
function: ppocr.modeling.architectures.det_model,DetModel
Backbone:
function: ppocr.modeling.backbones.det_resnet_vd,ResNet
layers: 50
Head:
function: ppocr.modeling.heads.det_east_head,EASTHead
model_name: large
Loss:
function: ppocr.modeling.losses.det_east_loss,EASTLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
PostProcess:
function: ppocr.postprocess.east_postprocess,EASTPostPocess
score_thresh: 0.8
cover_thresh: 0.1
nms_thresh: 0.2
Global:
algorithm: SAST
use_gpu: true
epoch_num: 2000
log_smooth_window: 20
print_batch_step: 2
save_model_dir: ./output/det_sast/
save_epoch_step: 20
eval_batch_step: 5000
train_batch_size_per_card: 8
test_batch_size_per_card: 8
image_shape: [3, 512, 512]
reader_yml: ./configs/det/det_sast_icdar15_reader.yml
pretrain_weights: ./pretrain_models/ResNet50_vd_ssld_pretrained/
save_res_path: ./output/det_sast/predicts_sast.txt
checkpoints:
save_inference_dir:
Architecture:
function: ppocr.modeling.architectures.det_model,DetModel
Backbone:
function: ppocr.modeling.backbones.det_resnet_vd_sast,ResNet
layers: 50
Head:
function: ppocr.modeling.heads.det_sast_head,SASTHead
model_name: large
only_fpn_up: False
# with_cab: False
with_cab: True
Loss:
function: ppocr.modeling.losses.det_sast_loss,SASTLoss
Optimizer:
function: ppocr.optimizer,RMSProp
base_lr: 0.001
decay:
function: piecewise_decay
boundaries: [30000, 50000, 80000, 100000, 150000]
decay_rate: 0.3
PostProcess:
function: ppocr.postprocess.sast_postprocess,SASTPostProcess
score_thresh: 0.5
sample_pts_num: 2
nms_thresh: 0.2
expand_scale: 1.0
shrink_ratio_of_width: 0.3
\ No newline at end of file
Global:
algorithm: SAST
use_gpu: true
epoch_num: 2000
log_smooth_window: 20
print_batch_step: 2
save_model_dir: ./output/det_sast/
save_epoch_step: 20
eval_batch_step: 5000
train_batch_size_per_card: 8
test_batch_size_per_card: 1
image_shape: [3, 512, 512]
reader_yml: ./configs/det/det_sast_totaltext_reader.yml
pretrain_weights: ./pretrain_models/ResNet50_vd_ssld_pretrained/
save_res_path: ./output/det_sast/predicts_sast.txt
checkpoints:
save_inference_dir:
Architecture:
function: ppocr.modeling.architectures.det_model,DetModel
Backbone:
function: ppocr.modeling.backbones.det_resnet_vd_sast,ResNet
layers: 50
Head:
function: ppocr.modeling.heads.det_sast_head,SASTHead
model_name: large
only_fpn_up: False
# with_cab: False
with_cab: True
Loss:
function: ppocr.modeling.losses.det_sast_loss,SASTLoss
Optimizer:
function: ppocr.optimizer,RMSProp
base_lr: 0.001
decay:
function: piecewise_decay
boundaries: [30000, 50000, 80000, 100000, 150000]
decay_rate: 0.3
PostProcess:
function: ppocr.postprocess.sast_postprocess,SASTPostProcess
score_thresh: 0.5
sample_pts_num: 6
nms_thresh: 0.2
expand_scale: 1.2
shrink_ratio_of_width: 0.2
\ No newline at end of file
TrainReader:
reader_function: ppocr.data.det.dataset_traversal,TrainReader
process_function: ppocr.data.det.sast_process,SASTProcessTrain
num_workers: 8
img_set_dir: ./train_data/
label_file_path: [./train_data/icdar2013/train_label_json.txt, ./train_data/icdar2015/train_label_json.txt, ./train_data/icdar17_mlt_latin/train_label_json.txt, ./train_data/coco_text_icdar_4pts/train_label_json.txt]
data_ratio_list: [0.1, 0.45, 0.3, 0.15]
min_crop_side_ratio: 0.3
min_crop_size: 24
min_text_size: 4
max_text_size: 512
EvalReader:
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
process_function: ppocr.data.det.sast_process,SASTProcessTest
img_set_dir: ./train_data/icdar2015/text_localization/
label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
max_side_len: 1536
TestReader:
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
process_function: ppocr.data.det.sast_process,SASTProcessTest
infer_img: ./train_data/icdar2015/text_localization/ch4_test_images/img_11.jpg
max_side_len: 1536
TrainReader:
reader_function: ppocr.data.det.dataset_traversal,TrainReader
process_function: ppocr.data.det.sast_process,SASTProcessTrain
num_workers: 8
img_set_dir: ./train_data/
label_file_path: [./train_data/art_latin_icdar_14pt/train_no_tt_test/train_label_json.txt, ./train_data/total_text_icdar_14pt/train_label_json.txt]
data_ratio_list: [0.5, 0.5]
min_crop_side_ratio: 0.3
min_crop_size: 24
min_text_size: 4
max_text_size: 512
EvalReader:
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
process_function: ppocr.data.det.sast_process,SASTProcessTest
img_set_dir: ./train_data/
label_file_path: ./train_data/total_text_icdar_14pt/test_label_json.txt
max_side_len: 768
TestReader:
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
process_function: ppocr.data.det.sast_process,SASTProcessTest
infer_img: ./train_data/afs/total_text/Images/Test/img623.jpg
max_side_len: 768
TrainReader:
reader_function: ppocr.data.rec.dataset_traversal,LMDBReader
num_workers: 8
lmdb_sets_dir: ./train_data/data_lmdb_release/training/
EvalReader:
reader_function: ppocr.data.rec.dataset_traversal,LMDBReader
lmdb_sets_dir: ./train_data/data_lmdb_release/validation/
TestReader:
reader_function: ppocr.data.rec.dataset_traversal,LMDBReader
lmdb_sets_dir: ./train_data/data_lmdb_release/evaluation/
Global:
algorithm: CRNN
use_gpu: true
epoch_num: 3000
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec_CRNN
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 128
test_batch_size_per_card: 128
image_shape: [3, 32, 320]
max_text_length: 25
character_type: ch
character_dict_path: ./ppocr/utils/ppocr_keys_v1.txt
loss_type: ctc
distort: false
use_space_char: false
reader_yml: ./configs/rec/rec_chinese_reader.yml
pretrain_weights:
checkpoints:
save_inference_dir:
infer_img:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
Backbone:
function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
layers: 34
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: rnn
SeqRNN:
hidden_size: 256
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.0005
beta1: 0.9
beta2: 0.999
Global:
algorithm: CRNN
use_gpu: true
epoch_num: 3000
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec_CRNN
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 320]
max_text_length: 25
character_type: ch
character_dict_path: ./ppocr/utils/ppocr_keys_v1.txt
loss_type: ctc
distort: false
use_space_char: false
reader_yml: ./configs/rec/rec_chinese_reader.yml
pretrain_weights:
checkpoints:
save_inference_dir:
infer_img:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
Backbone:
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
scale: 0.5
model_name: small
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: rnn
SeqRNN:
hidden_size: 48
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.0005
beta1: 0.9
beta2: 0.999
TrainReader:
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
num_workers: 8
img_set_dir: ./train_data
label_file_path: ./train_data/rec_gt_train.txt
EvalReader:
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
img_set_dir: ./train_data
label_file_path: ./train_data/rec_gt_test.txt
TestReader:
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
TrainReader:
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
num_workers: 8
img_set_dir: ./train_data/ic15_data
label_file_path: ./train_data/ic15_data/rec_gt_train.txt
EvalReader:
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
img_set_dir: ./train_data/ic15_data
label_file_path: ./train_data/ic15_data/rec_gt_test.txt
TestReader:
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
Global:
algorithm: CRNN
use_gpu: true
epoch_num: 1000
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec_CRNN
save_epoch_step: 300
eval_batch_step: 500
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: ctc
distort: true
debug: false
reader_yml: ./configs/rec/rec_icdar15_reader.yml
pretrain_weights: ./pretrain_models/rec_mv3_none_bilstm_ctc/best_accuracy
checkpoints:
save_inference_dir:
infer_img:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
Backbone:
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
scale: 0.5
model_name: large
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: rnn
SeqRNN:
hidden_size: 96
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.0005
beta1: 0.9
beta2: 0.999
decay:
function: cosine_decay
step_each_epoch: 20
total_epoch: 1000
Global:
algorithm: CRNN
use_gpu: true
epoch_num: 72
use_gpu: false
epoch_num: 500
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output/rec_CRNN
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: ctc
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
checkpoints:
save_model_dir: ./output/rec/test/
save_epoch_step: 500
# evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: 127
# if pretrained_model is saved in static mode, load_static_weights must set to True
load_static_weights: True
cal_metric_during_train: True
pretrained_model:
checkpoints: #output/rec/rec_crnn/best_accuracy
save_inference_dir:
infer_img:
use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg
# for data or label process
max_text_length: 80
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
character_type: 'ch'
use_space_char: False
infer_mode: False
use_tps: False
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
Backbone:
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
scale: 0.5
model_name: large
Optimizer:
name: Adam
beta1: 0.9
beta2: 0.999
learning_rate:
name: Cosine
lr: 0.001
warmup_epoch: 4
regularizer:
name: 'L2'
factor: 0.00001
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: rnn
SeqRNN:
Architecture:
type: rec
algorithm: CRNN
Transform:
Backbone:
name: MobileNetV3
scale: 0.5
model_name: small
small_stride: [ 1, 2, 2, 2 ]
Neck:
name: SequenceEncoder
encoder_type: fc
hidden_size: 96
Head:
name: CTC
fc_decay: 0.00001
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
name: CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
PostProcess:
name: CTCLabelDecode
Metric:
name: RecMetric
main_indicator: acc
TRAIN:
dataset:
name: SimpleDataSet
data_dir: /home/zhoujun20/rec
file_list:
- /home/zhoujun20/rec/real_data.txt # dataset1
ratio_list: [ 0.4,0.6 ]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecAug:
- RecResizeImg:
image_shape: [ 3,32,320 ]
- keepKeys:
keep_keys: [ 'image','label','length' ] # dataloader将按照此顺序返回list
loader:
batch_size: 256
shuffle: True
drop_last: True
num_workers: 6
EVAL:
dataset:
name: SimpleDataSet
data_dir: /home/zhoujun20/rec
file_list:
- /home/zhoujun20/rec/label_val_all.txt
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [ 3,32,320 ]
- keepKeys:
keep_keys: [ 'image','label','length' ] # dataloader将按照此顺序返回list
loader:
shuffle: False
drop_last: False
batch_size: 256
num_workers: 6
Global:
use_gpu: true
epoch_num: 500
log_smooth_window: 20
print_batch_step: 1
save_model_dir: ./output/rec/test/
save_epoch_step: 500
# evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: 1016
# if pretrained_model is saved in static mode, load_static_weights must set to True
load_static_weights: True
cal_metric_during_train: True
pretrained_model:
checkpoints: #output/rec/rec_crnn/best_accuracy
save_inference_dir:
use_visualdl: True
infer_img: doc/imgs_words/ch/word_1.jpg
# for data or label process
max_text_length: 80
character_dict_path: /home/zhoujun20/rec/lmdb/dict.txt
character_type: 'ch'
use_space_char: True
infer_mode: False
use_tps: False
Optimizer:
name: Adam
beta1: 0.9
beta2: 0.999
learning_rate:
name: Cosine
lr: 0.0005
warmup_epoch: 1
regularizer:
name: 'L2'
factor: 0.00001
Architecture:
type: rec
algorithm: CRNN
Transform:
Backbone:
name: MobileNetV3
scale: 0.5
model_name: small
small_stride: [ 1, 2, 2, 2 ]
Neck:
name: SequenceEncoder
encoder_type: rnn
hidden_size: 48
Head:
name: CTC
fc_decay: 0.00001
Loss:
name: CTCLoss
PostProcess:
name: CTCLabelDecode
Metric:
name: RecMetric
main_indicator: acc
TRAIN:
dataset:
name: LMDBDateSet
file_list:
- /home/zhoujun20/rec/lmdb/train # dataset1
ratio_list: [ 0.4,0.6 ]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecAug:
- RecResizeImg:
image_shape: [ 3,32,320 ]
- keepKeys:
keep_keys: [ 'image','label','length' ] # dataloader将按照此顺序返回list
loader:
batch_size: 256
shuffle: True
drop_last: True
num_workers: 6
EVAL:
dataset:
name: LMDBDateSet
file_list:
- /home/zhoujun20/rec/lmdb/val
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [ 3,32,320 ]
- keepKeys:
keep_keys: [ 'image','label','length' ] # dataloader将按照此顺序返回list
loader:
shuffle: False
drop_last: False
batch_size: 256
num_workers: 6
Global:
algorithm: Rosetta
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output/rec_Rosetta
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: ctc
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
checkpoints:
save_inference_dir:
infer_img:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
Backbone:
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
scale: 0.5
model_name: large
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: reshape
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
Global:
algorithm: RARE
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output/rec_RARE
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: attention
tps: true
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
checkpoints:
save_inference_dir:
infer_img:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
TPS:
function: ppocr.modeling.stns.tps,TPS
num_fiducial: 20
loc_lr: 0.1
model_name: small
Backbone:
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
scale: 0.5
model_name: large
Head:
function: ppocr.modeling.heads.rec_attention_head,AttentionPredict
encoder_type: rnn
SeqRNN:
hidden_size: 96
Attention:
decoder_size: 96
word_vector_dim: 96
Loss:
function: ppocr.modeling.losses.rec_attention_loss,AttentionLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
Global:
algorithm: STARNet
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output/rec_STARNet
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: ctc
tps: true
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
checkpoints:
save_inference_dir:
infer_img:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
TPS:
function: ppocr.modeling.stns.tps,TPS
num_fiducial: 20
loc_lr: 0.1
model_name: small
Backbone:
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
scale: 0.5
model_name: large
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: rnn
SeqRNN:
hidden_size: 96
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
Global:
algorithm: CRNN
use_gpu: true
epoch_num: 72
use_gpu: false
epoch_num: 500
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output/rec_CRNN
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: ctc
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
checkpoints:
save_model_dir: ./output/rec/test/
save_epoch_step: 500
# evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: 127
# if pretrained_model is saved in static mode, load_static_weights must set to True
load_static_weights: True
cal_metric_during_train: True
pretrained_model:
checkpoints: #output/rec/rec_crnn/best_accuracy
save_inference_dir:
infer_img:
use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg
# for data or label process
max_text_length: 80
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
character_type: 'ch'
use_space_char: False
infer_mode: False
use_tps: False
Optimizer:
name: Adam
beta1: 0.9
beta2: 0.999
learning_rate:
name: Cosine
lr: 0.001
warmup_epoch: 4
regularizer:
name: 'L2'
factor: 0.00001
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
type: rec
algorithm: CRNN
Transform:
Backbone:
name: ResNet
layers: 200
Neck:
name: SequenceEncoder
encoder_type: fc
hidden_size: 96
Head:
name: CTC
fc_decay: 0.00001
Backbone:
function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
layers: 34
Loss:
name: CTCLoss
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: rnn
SeqRNN:
hidden_size: 256
PostProcess:
name: CTCLabelDecode
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Metric:
name: RecMetric
main_indicator: acc
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
TRAIN:
dataset:
name: SimpleDataSet
data_dir: /home/zhoujun20/rec
file_list:
- /home/zhoujun20/rec/real_data.txt # dataset1
ratio_list: [ 0.4,0.6 ]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecAug:
- RecResizeImg:
image_shape: [ 3,32,320 ]
- keepKeys:
keep_keys: [ 'image','label','length' ] # dataloader将按照此顺序返回list
loader:
batch_size: 256
shuffle: True
drop_last: True
num_workers: 6
EVAL:
dataset:
name: SimpleDataSet
data_dir: /home/zhoujun20/rec
file_list:
- /home/zhoujun20/rec/label_val_all.txt
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [ 3,32,320 ]
- keepKeys:
keep_keys: [ 'image','label','length' ] # dataloader将按照此顺序返回list
loader:
shuffle: False
drop_last: False
batch_size: 256
num_workers: 6
Global:
algorithm: Rosetta
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output/rec_Rosetta
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: ctc
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
checkpoints:
save_inference_dir:
infer_img:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
Backbone:
function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
layers: 34
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: reshape
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
Global:
algorithm: RARE
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output/rec_RARE
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: attention
tps: true
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
checkpoints:
save_inference_dir:
infer_img:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
TPS:
function: ppocr.modeling.stns.tps,TPS
num_fiducial: 20
loc_lr: 0.1
model_name: large
Backbone:
function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
layers: 34
Head:
function: ppocr.modeling.heads.rec_attention_head,AttentionPredict
encoder_type: rnn
SeqRNN:
hidden_size: 256
Attention:
decoder_size: 128
word_vector_dim: 128
Loss:
function: ppocr.modeling.losses.rec_attention_loss,AttentionLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
Global:
algorithm: STARNet
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output/rec_STARNet
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: ctc
tps: true
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
checkpoints:
save_inference_dir:
infer_img:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
TPS:
function: ppocr.modeling.stns.tps,TPS
num_fiducial: 20
loc_lr: 0.1
model_name: large
Backbone:
function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
layers: 34
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: rnn
SeqRNN:
hidden_size: 256
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
Global:
algorithm: SRN
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output/rec_pvam_withrotate
save_epoch_step: 1
eval_batch_step: 8000
train_batch_size_per_card: 64
test_batch_size_per_card: 1
image_shape: [1, 64, 256]
max_text_length: 25
character_type: en
loss_type: srn
num_heads: 8
average_window: 0.15
max_average_window: 15625
min_average_window: 10000
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
checkpoints:
save_inference_dir:
infer_img:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
Backbone:
function: ppocr.modeling.backbones.rec_resnet_fpn,ResNet
layers: 50
Head:
function: ppocr.modeling.heads.rec_srn_all_head,SRNPredict
encoder_type: rnn
num_encoder_TUs: 2
num_decoder_TUs: 4
hidden_dims: 512
SeqRNN:
hidden_size: 256
Loss:
function: ppocr.modeling.losses.rec_srn_loss,SRNLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.0001
beta1: 0.9
beta2: 0.999
......@@ -11,3 +11,114 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import sys
import numpy as np
import paddle
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
import copy
from paddle.io import DataLoader, DistributedBatchSampler, BatchSampler
import paddle.distributed as dist
from ppocr.data.imaug import transform, create_operators
__all__ = ['build_dataloader', 'transform', 'create_operators']
def build_dataset(config, global_config):
from ppocr.data.dataset import SimpleDataSet, LMDBDateSet
support_dict = ['SimpleDataSet', 'LMDBDateSet']
module_name = config.pop('name')
assert module_name in support_dict, Exception(
'DataSet only support {}'.format(support_dict))
dataset = eval(module_name)(config, global_config)
return dataset
def build_dataloader(config, device, distributed=False, global_config=None):
from ppocr.data.dataset import BatchBalancedDataLoader
config = copy.deepcopy(config)
dataset_config = config['dataset']
_dataset_list = []
file_list = dataset_config.pop('file_list')
if len(file_list) == 1:
ratio_list = [1.0]
else:
ratio_list = dataset_config.pop('ratio_list')
for file in file_list:
dataset_config['file_list'] = file
_dataset = build_dataset(dataset_config, global_config)
_dataset_list.append(_dataset)
data_loader = BatchBalancedDataLoader(_dataset_list, ratio_list,
distributed, device, config['loader'])
return data_loader, _dataset.info_dict
def test_loader():
import time
from tools.program import load_config, ArgsParser
FLAGS = ArgsParser().parse_args()
config = load_config(FLAGS.config)
place = paddle.CPUPlace()
paddle.disable_static(place)
import time
data_loader, _ = build_dataloader(
config['TRAIN'], place, global_config=config['Global'])
start = time.time()
print(len(data_loader))
for epoch in range(1):
print('epoch {} ****************'.format(epoch))
for i, batch in enumerate(data_loader):
if i > len(data_loader):
break
t = time.time() - start
start = time.time()
print('{}, batch : {} ,time {}'.format(i, len(batch[0]), t))
continue
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
import cv2
fig = plt.figure()
# # cv2.imwrite('img.jpg',batch[0].numpy()[0].transpose((1,2,0)))
# # cv2.imwrite('bmap.jpg',batch[1].numpy()[0])
# # cv2.imwrite('bmask.jpg',batch[2].numpy()[0])
# # cv2.imwrite('smap.jpg',batch[3].numpy()[0])
# # cv2.imwrite('smask.jpg',batch[4].numpy()[0])
plt.title('img')
plt.imshow(batch[0].numpy()[0].transpose((1, 2, 0)))
# plt.figure()
# plt.title('bmap')
# plt.imshow(batch[1].numpy()[0],cmap='Greys')
# plt.figure()
# plt.title('bmask')
# plt.imshow(batch[2].numpy()[0],cmap='Greys')
# plt.figure()
# plt.title('smap')
# plt.imshow(batch[3].numpy()[0],cmap='Greys')
# plt.figure()
# plt.title('smask')
# plt.imshow(batch[4].numpy()[0],cmap='Greys')
# plt.show()
# break
if __name__ == '__main__':
test_loader()
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import numpy as np
import os
import lmdb
import random
import signal
import paddle
from paddle.io import Dataset, DataLoader, DistributedBatchSampler, BatchSampler
from .imaug import transform, create_operators
from ppocr.utils.logging import get_logger
def term_mp(sig_num, frame):
""" kill all child processes
"""
pid = os.getpid()
pgid = os.getpgid(os.getpid())
print("main proc {} exit, kill process group " "{}".format(pid, pgid))
os.killpg(pgid, signal.SIGKILL)
signal.signal(signal.SIGINT, term_mp)
signal.signal(signal.SIGTERM, term_mp)
class ModeException(Exception):
"""
ModeException
"""
def __init__(self, message='', mode=''):
message += "\nOnly the following 3 modes are supported: " \
"train, valid, test. Given mode is {}".format(mode)
super(ModeException, self).__init__(message)
class SampleNumException(Exception):
"""
SampleNumException
"""
def __init__(self, message='', sample_num=0, batch_size=1):
message += "\nError: The number of the whole data ({}) " \
"is smaller than the batch_size ({}), and drop_last " \
"is turnning on, so nothing will feed in program, " \
"Terminated now. Please reset batch_size to a smaller " \
"number or feed more data!".format(sample_num, batch_size)
super(SampleNumException, self).__init__(message)
def get_file_list(file_list, data_dir, delimiter='\t'):
"""
read label list from file and shuffle the list
Args:
params(dict):
"""
if isinstance(file_list, str):
file_list = [file_list]
data_source_list = []
for file in file_list:
with open(file) as f:
full_lines = [line.strip() for line in f]
for line in full_lines:
try:
img_path, label = line.split(delimiter)
except:
logger = get_logger()
logger.warning('label error in {}'.format(line))
img_path = os.path.join(data_dir, img_path)
data = {'img_path': img_path, 'label': label}
data_source_list.append(data)
return data_source_list
class LMDBDateSet(Dataset):
def __init__(self, config, global_config):
super(LMDBDateSet, self).__init__()
self.data_list = self.load_lmdb_dataset(
config['file_list'], global_config['max_text_length'])
random.shuffle(self.data_list)
self.ops = create_operators(config['transforms'], global_config)
# for rec
character = ''
for op in self.ops:
if hasattr(op, 'character'):
character = getattr(op, 'character')
self.info_dict = {'character': character}
def load_lmdb_dataset(self, data_dir, max_text_length):
self.env = lmdb.open(
data_dir,
max_readers=32,
readonly=True,
lock=False,
readahead=False,
meminit=False)
if not self.env:
print('cannot create lmdb from %s' % (data_dir))
exit(0)
filtered_index_list = []
with self.env.begin(write=False) as txn:
nSamples = int(txn.get('num-samples'.encode()))
self.nSamples = nSamples
for index in range(self.nSamples):
index += 1 # lmdb starts with 1
label_key = 'label-%09d'.encode() % index
label = txn.get(label_key).decode('utf-8')
if len(label) > max_text_length:
# print(f'The length of the label is longer than max_length: length
# {len(label)}, {label} in dataset {self.root}')
continue
# By default, images containing characters which are not in opt.character are filtered.
# You can add [UNK] token to `opt.character` in utils.py instead of this filtering.
filtered_index_list.append(index)
return filtered_index_list
def print_lmdb_sets_info(self, lmdb_sets):
lmdb_info_strs = []
for dataset_idx in range(len(lmdb_sets)):
tmp_str = " %s:%d," % (lmdb_sets[dataset_idx]['dirpath'],
lmdb_sets[dataset_idx]['num_samples'])
lmdb_info_strs.append(tmp_str)
lmdb_info_strs = ''.join(lmdb_info_strs)
logger = get_logger()
logger.info("DataSummary:" + lmdb_info_strs)
return
def __getitem__(self, idx):
idx = self.data_list[idx]
with self.env.begin(write=False) as txn:
label_key = 'label-%09d'.encode() % idx
label = txn.get(label_key)
if label is not None:
label = label.decode('utf-8')
img_key = 'image-%09d'.encode() % idx
imgbuf = txn.get(img_key)
data = {'image': imgbuf, 'label': label}
outs = transform(data, self.ops)
else:
outs = None
if outs is None:
return self.__getitem__(np.random.randint(self.__len__()))
return outs
def __len__(self):
return len(self.data_list)
class SimpleDataSet(Dataset):
def __init__(self, config, global_config):
super(SimpleDataSet, self).__init__()
delimiter = config.get('delimiter', '\t')
self.data_list = get_file_list(config['file_list'], config['data_dir'],
delimiter)
random.shuffle(self.data_list)
self.ops = create_operators(config['transforms'], global_config)
# for rec
character = ''
for op in self.ops:
if hasattr(op, 'character'):
character = getattr(op, 'character')
self.info_dict = {'character': character}
def __getitem__(self, idx):
data = copy.deepcopy(self.data_list[idx])
with open(data['img_path'], 'rb') as f:
img = f.read()
data['image'] = img
outs = transform(data, self.ops)
if outs is None:
return self.__getitem__(np.random.randint(self.__len__()))
return outs
def __len__(self):
return len(self.data_list)
class BatchBalancedDataLoader(object):
def __init__(self,
dataset_list: list,
ratio_list: list,
distributed,
device,
loader_args: dict):
"""
对datasetlist里的dataset按照ratio_list里对应的比例组合,似的每个batch里的数据按按照比例采样的
:param dataset_list: 数据集列表
:param ratio_list: 比例列表
:param loader_args: dataloader的配置
"""
assert sum(ratio_list) == 1 and len(dataset_list) == len(ratio_list)
self.dataset_len = 0
self.data_loader_list = []
self.dataloader_iter_list = []
all_batch_size = loader_args.pop('batch_size')
batch_size_list = list(
map(int, [max(1.0, all_batch_size * x) for x in ratio_list]))
remain_num = all_batch_size - sum(batch_size_list)
batch_size_list[np.argmax(ratio_list)] += remain_num
for _dataset, _batch_size in zip(dataset_list, batch_size_list):
if distributed:
batch_sampler_class = DistributedBatchSampler
else:
batch_sampler_class = BatchSampler
batch_sampler = batch_sampler_class(
dataset=_dataset,
batch_size=_batch_size,
shuffle=loader_args['shuffle'],
drop_last=loader_args['drop_last'], )
_data_loader = DataLoader(
dataset=_dataset,
batch_sampler=batch_sampler,
places=device,
num_workers=loader_args['num_workers'],
return_list=True, )
self.data_loader_list.append(_data_loader)
self.dataloader_iter_list.append(iter(_data_loader))
self.dataset_len += len(_dataset)
def __iter__(self):
return self
def __len__(self):
return min([len(x) for x in self.data_loader_list])
def __next__(self):
batch = []
for i, data_loader_iter in enumerate(self.dataloader_iter_list):
try:
_batch_i = next(data_loader_iter)
batch.append(_batch_i)
except StopIteration:
self.dataloader_iter_list[i] = iter(self.data_loader_list[i])
_batch_i = next(self.dataloader_iter_list[i])
batch.append(_batch_i)
except ValueError:
pass
if len(batch) > 0:
batch_list = []
batch_item_size = len(batch[0])
for i in range(batch_item_size):
cur_item_list = [batch_i[i] for batch_i in batch]
batch_list.append(paddle.concat(cur_item_list, axis=0))
else:
batch_list = batch[0]
return batch_list
def fill_batch(batch):
"""
2020.09.08: The current paddle version only supports returning data with the same length.
Therefore, fill in the batches with inconsistent lengths.
this method is currently only useful for text detection
"""
keys = list(range(len(batch[0])))
v_max_len_dict = {}
for k in keys:
v_max_len_dict[k] = max([len(item[k]) for item in batch])
for item in batch:
length = []
for k in keys:
v = item[k]
length.append(len(v))
assert isinstance(v, np.ndarray)
if len(v) == v_max_len_dict[k]:
continue
try:
tmp_shape = [v_max_len_dict[k] - len(v)] + list(v[0].shape)
except:
a = 1
tmp_array = np.zeros(tmp_shape, dtype=v[0].dtype)
new_array = np.concatenate([v, tmp_array])
item[k] = new_array
item.append(length)
return batch
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import random
import cv2
import math
import imgaug
import imgaug.augmenters as iaa
def AugmentData(data):
img = data['image']
shape = img.shape
aug = iaa.Sequential(
[iaa.Fliplr(0.5), iaa.Affine(rotate=(-10, 10)), iaa.Resize(
(0.5, 3))]).to_deterministic()
def may_augment_annotation(aug, data, shape):
if aug is None:
return data
line_polys = []
for poly in data['polys']:
new_poly = may_augment_poly(aug, shape, poly)
line_polys.append(new_poly)
data['polys'] = np.array(line_polys)
return data
def may_augment_poly(aug, img_shape, poly):
keypoints = [imgaug.Keypoint(p[0], p[1]) for p in poly]
keypoints = aug.augment_keypoints(
[imgaug.KeypointsOnImage(
keypoints, shape=img_shape)])[0].keypoints
poly = [(p.x, p.y) for p in keypoints]
return poly
img_aug = aug.augment_image(img)
data['image'] = img_aug
data = may_augment_annotation(aug, data, shape)
return data
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import sys
import math
import random
import functools
import numpy as np
import cv2
import string
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from ppocr.utils.utility import create_module
from ppocr.utils.utility import get_image_file_list
import time
class TrainReader(object):
def __init__(self, params):
self.num_workers = params['num_workers']
self.label_file_path = params['label_file_path']
print(self.label_file_path)
self.use_mul_data = False
if isinstance(self.label_file_path, list):
self.use_mul_data = True
self.data_ratio_list = params['data_ratio_list']
self.batch_size = params['train_batch_size_per_card']
assert 'process_function' in params,\
"absence process_function in Reader"
self.process = create_module(params['process_function'])(params)
def __call__(self, process_id):
def sample_iter_reader():
with open(self.label_file_path, "rb") as fin:
label_infor_list = fin.readlines()
img_num = len(label_infor_list)
img_id_list = list(range(img_num))
random.shuffle(img_id_list)
if sys.platform == "win32" and self.num_workers != 1:
print("multiprocess is not fully compatible with Windows."
"num_workers will be 1.")
self.num_workers = 1
for img_id in range(process_id, img_num, self.num_workers):
label_infor = label_infor_list[img_id_list[img_id]]
outs = self.process(label_infor)
if outs is None:
continue
yield outs
def sample_iter_reader_mul():
batch_size = 1000
data_source_list = self.label_file_path
batch_size_list = list(map(int, [max(1.0, batch_size * x) for x in self.data_ratio_list]))
print(self.data_ratio_list, batch_size_list)
data_filename_list, data_size_list, fetch_record_list = [], [], []
for data_source in data_source_list:
image_files = open(data_source, "rb").readlines()
random.shuffle(image_files)
data_filename_list.append(image_files)
data_size_list.append(len(image_files))
fetch_record_list.append(0)
image_batch = []
# get a batch of img_fns and poly_fns
for i in range(0, len(batch_size_list)):
bs = batch_size_list[i]
ds = data_size_list[i]
image_names = data_filename_list[i]
fetch_record = fetch_record_list[i]
data_path = data_source_list[i]
for j in range(fetch_record, fetch_record + bs):
index = j % ds
image_batch.append(image_names[index])
if (fetch_record + bs) > ds:
fetch_record_list[i] = 0
random.shuffle(data_filename_list[i])
else:
fetch_record_list[i] = fetch_record + bs
if sys.platform == "win32":
print("multiprocess is not fully compatible with Windows."
"num_workers will be 1.")
self.num_workers = 1
for label_infor in image_batch:
outs = self.process(label_infor)
if outs is None:
continue
yield outs
def batch_iter_reader():
batch_outs = []
if self.use_mul_data:
print("Sample date from multiple datasets!")
for outs in sample_iter_reader_mul():
batch_outs.append(outs)
if len(batch_outs) == self.batch_size:
yield batch_outs
batch_outs = []
else:
for outs in sample_iter_reader():
batch_outs.append(outs)
if len(batch_outs) == self.batch_size:
yield batch_outs
batch_outs = []
return batch_iter_reader
class EvalTestReader(object):
def __init__(self, params):
self.params = params
assert 'process_function' in params,\
"absence process_function in EvalTestReader"
def __call__(self, mode):
process_function = create_module(self.params['process_function'])(
self.params)
batch_size = self.params['test_batch_size_per_card']
img_list = []
if mode != "test":
img_set_dir = self.params['img_set_dir']
img_name_list_path = self.params['label_file_path']
with open(img_name_list_path, "rb") as fin:
lines = fin.readlines()
for line in lines:
img_name = line.decode().strip("\n").split("\t")[0]
img_path = os.path.join(img_set_dir, img_name)
img_list.append(img_path)
else:
img_path = self.params['infer_img']
img_list = get_image_file_list(img_path)
def batch_iter_reader():
batch_outs = []
for img_path in img_list:
img = cv2.imread(img_path)
if img is None:
logger.info("{} does not exist!".format(img_path))
continue
elif len(list(img.shape)) == 2 or img.shape[2] == 1:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
outs = process_function(img)
outs.append(img_path)
batch_outs.append(outs)
if len(batch_outs) == batch_size:
yield batch_outs
batch_outs = []
if len(batch_outs) != 0:
yield batch_outs
return batch_iter_reader
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import math
import cv2
import numpy as np
import json
import sys
from ppocr.utils.utility import initial_logger, check_and_read_gif
logger = initial_logger()
from .data_augment import AugmentData
from .random_crop_data import RandomCropData
from .make_shrink_map import MakeShrinkMap
from .make_border_map import MakeBorderMap
class DBProcessTrain(object):
"""
DB pre-process for Train mode
"""
def __init__(self, params):
self.img_set_dir = params['img_set_dir']
self.image_shape = params['image_shape']
def order_points_clockwise(self, pts):
rect = np.zeros((4, 2), dtype="float32")
s = pts.sum(axis=1)
rect[0] = pts[np.argmin(s)]
rect[2] = pts[np.argmax(s)]
diff = np.diff(pts, axis=1)
rect[1] = pts[np.argmin(diff)]
rect[3] = pts[np.argmax(diff)]
return rect
def make_data_dict(self, imgvalue, entry):
boxes = []
texts = []
ignores = []
for rect in entry:
points = rect['points']
transcription = rect['transcription']
try:
box = self.order_points_clockwise(
np.array(points).reshape(-1, 2))
if cv2.contourArea(box) > 0:
boxes.append(box)
texts.append(transcription)
ignores.append(transcription in ['*', '###'])
except:
print('load label failed!')
data = {
'image': imgvalue,
'shape': [imgvalue.shape[0], imgvalue.shape[1]],
'polys': np.array(boxes),
'texts': texts,
'ignore_tags': ignores,
}
return data
def NormalizeImage(self, data):
im = data['image']
img_mean = [0.485, 0.456, 0.406]
img_std = [0.229, 0.224, 0.225]
im = im.astype(np.float32, copy=False)
im = im / 255
im -= img_mean
im /= img_std
channel_swap = (2, 0, 1)
im = im.transpose(channel_swap)
data['image'] = im
return data
def FilterKeys(self, data):
filter_keys = ['polys', 'texts', 'ignore_tags', 'shape']
for key in filter_keys:
if key in data:
del data[key]
return data
def convert_label_infor(self, label_infor):
label_infor = label_infor.decode()
label_infor = label_infor.encode('utf-8').decode('utf-8-sig')
substr = label_infor.strip("\n").split("\t")
img_path = self.img_set_dir + substr[0]
label = json.loads(substr[1])
return img_path, label
def __call__(self, label_infor):
img_path, gt_label = self.convert_label_infor(label_infor)
imgvalue, flag = check_and_read_gif(img_path)
if not flag:
imgvalue = cv2.imread(img_path)
if imgvalue is None:
logger.info("{} does not exist!".format(img_path))
return None
if len(list(imgvalue.shape)) == 2 or imgvalue.shape[2] == 1:
imgvalue = cv2.cvtColor(imgvalue, cv2.COLOR_GRAY2BGR)
data = self.make_data_dict(imgvalue, gt_label)
data = AugmentData(data)
data = RandomCropData(data, self.image_shape[1:])
data = MakeShrinkMap(data)
data = MakeBorderMap(data)
data = self.NormalizeImage(data)
data = self.FilterKeys(data)
return data['image'], data['shrink_map'], data['shrink_mask'], data[
'threshold_map'], data['threshold_mask']
class DBProcessTest(object):
"""
DB pre-process for Test mode
"""
def __init__(self, params):
super(DBProcessTest, self).__init__()
self.resize_type = 0
if 'test_image_shape' in params:
self.image_shape = params['test_image_shape']
# print(self.image_shape)
self.resize_type = 1
if 'max_side_len' in params:
self.max_side_len = params['max_side_len']
else:
self.max_side_len = 2400
def resize_image_type0(self, im):
"""
resize image to a size multiple of 32 which is required by the network
args:
img(array): array with shape [h, w, c]
return(tuple):
img, (ratio_h, ratio_w)
"""
max_side_len = self.max_side_len
h, w, _ = im.shape
resize_w = w
resize_h = h
# limit the max side
if max(resize_h, resize_w) > max_side_len:
if resize_h > resize_w:
ratio = float(max_side_len) / resize_h
else:
ratio = float(max_side_len) / resize_w
else:
ratio = 1.
resize_h = int(resize_h * ratio)
resize_w = int(resize_w * ratio)
if resize_h % 32 == 0:
resize_h = resize_h
elif resize_h // 32 <= 1:
resize_h = 32
else:
resize_h = (resize_h // 32 - 1) * 32
if resize_w % 32 == 0:
resize_w = resize_w
elif resize_w // 32 <= 1:
resize_w = 32
else:
resize_w = (resize_w // 32 - 1) * 32
try:
if int(resize_w) <= 0 or int(resize_h) <= 0:
return None, (None, None)
im = cv2.resize(im, (int(resize_w), int(resize_h)))
except:
print(im.shape, resize_w, resize_h)
sys.exit(0)
ratio_h = resize_h / float(h)
ratio_w = resize_w / float(w)
return im, (ratio_h, ratio_w)
def resize_image_type1(self, im):
resize_h, resize_w = self.image_shape
ori_h, ori_w = im.shape[:2] # (h, w, c)
im = cv2.resize(im, (int(resize_w), int(resize_h)))
ratio_h = float(resize_h) / ori_h
ratio_w = float(resize_w) / ori_w
return im, (ratio_h, ratio_w)
def normalize(self, im):
img_mean = [0.485, 0.456, 0.406]
img_std = [0.229, 0.224, 0.225]
im = im.astype(np.float32, copy=False)
im = im / 255
im[:, :, 0] -= img_mean[0]
im[:, :, 1] -= img_mean[1]
im[:, :, 2] -= img_mean[2]
im[:, :, 0] /= img_std[0]
im[:, :, 1] /= img_std[1]
im[:, :, 2] /= img_std[2]
channel_swap = (2, 0, 1)
im = im.transpose(channel_swap)
return im
def __call__(self, im):
if self.resize_type == 0:
im, (ratio_h, ratio_w) = self.resize_image_type0(im)
else:
im, (ratio_h, ratio_w) = self.resize_image_type1(im)
im = self.normalize(im)
im = im[np.newaxis, :]
return [im, (ratio_h, ratio_w)]
此差异已折叠。
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import cv2
np.seterr(divide='ignore', invalid='ignore')
import pyclipper
from shapely.geometry import Polygon
import sys
import warnings
warnings.simplefilter("ignore")
def draw_border_map(polygon, canvas, mask, shrink_ratio):
polygon = np.array(polygon)
assert polygon.ndim == 2
assert polygon.shape[1] == 2
polygon_shape = Polygon(polygon)
if polygon_shape.area <= 0:
return
distance = polygon_shape.area * (
1 - np.power(shrink_ratio, 2)) / polygon_shape.length
subject = [tuple(l) for l in polygon]
padding = pyclipper.PyclipperOffset()
padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
padded_polygon = np.array(padding.Execute(distance)[0])
cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
xmin = padded_polygon[:, 0].min()
xmax = padded_polygon[:, 0].max()
ymin = padded_polygon[:, 1].min()
ymax = padded_polygon[:, 1].max()
width = xmax - xmin + 1
height = ymax - ymin + 1
polygon[:, 0] = polygon[:, 0] - xmin
polygon[:, 1] = polygon[:, 1] - ymin
xs = np.broadcast_to(
np.linspace(
0, width - 1, num=width).reshape(1, width), (height, width))
ys = np.broadcast_to(
np.linspace(
0, height - 1, num=height).reshape(height, 1), (height, width))
distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
for i in range(polygon.shape[0]):
j = (i + 1) % polygon.shape[0]
absolute_distance = _distance(xs, ys, polygon[i], polygon[j])
distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
distance_map = distance_map.min(axis=0)
xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
1 - distance_map[ymin_valid - ymin:ymax_valid - ymax + height,
xmin_valid - xmin:xmax_valid - xmax + width],
canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1])
def _distance(xs, ys, point_1, point_2):
'''
compute the distance from point to a line
ys: coordinates in the first axis
xs: coordinates in the second axis
point_1, point_2: (x, y), the end of the line
'''
height, width = xs.shape[:2]
square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[1])
square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[1])
square_distance = np.square(point_1[0] - point_2[0]) + np.square(point_1[
1] - point_2[1])
cosin = (square_distance - square_distance_1 - square_distance_2) / (
2 * np.sqrt(square_distance_1 * square_distance_2))
square_sin = 1 - np.square(cosin)
square_sin = np.nan_to_num(square_sin)
result = np.sqrt(square_distance_1 * square_distance_2 * square_sin /
square_distance)
result[cosin <
0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[cosin <
0]
# self.extend_line(point_1, point_2, result)
return result
def extend_line(point_1, point_2, result, shrink_ratio):
ex_point_1 = (
int(
round(point_1[0] + (point_1[0] - point_2[0]) * (1 + shrink_ratio))),
int(
round(point_1[1] + (point_1[1] - point_2[1]) * (1 + shrink_ratio))))
cv2.line(
result,
tuple(ex_point_1),
tuple(point_1),
4096.0,
1,
lineType=cv2.LINE_AA,
shift=0)
ex_point_2 = (
int(
round(point_2[0] + (point_2[0] - point_1[0]) * (1 + shrink_ratio))),
int(
round(point_2[1] + (point_2[1] - point_1[1]) * (1 + shrink_ratio))))
cv2.line(
result,
tuple(ex_point_2),
tuple(point_2),
4096.0,
1,
lineType=cv2.LINE_AA,
shift=0)
return ex_point_1, ex_point_2
def MakeBorderMap(data):
shrink_ratio = 0.4
thresh_min = 0.3
thresh_max = 0.7
im = data['image']
text_polys = data['polys']
ignore_tags = data['ignore_tags']
canvas = np.zeros(im.shape[:2], dtype=np.float32)
mask = np.zeros(im.shape[:2], dtype=np.float32)
for i in range(len(text_polys)):
if ignore_tags[i]:
continue
draw_border_map(
text_polys[i], canvas, mask=mask, shrink_ratio=shrink_ratio)
canvas = canvas * (thresh_max - thresh_min) + thresh_min
data['threshold_map'] = canvas
data['threshold_mask'] = mask
return data
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import cv2
from shapely.geometry import Polygon
import pyclipper
def validate_polygons(polygons, ignore_tags, h, w):
'''
polygons (numpy.array, required): of shape (num_instances, num_points, 2)
'''
if len(polygons) == 0:
return polygons, ignore_tags
assert len(polygons) == len(ignore_tags)
for polygon in polygons:
polygon[:, 0] = np.clip(polygon[:, 0], 0, w - 1)
polygon[:, 1] = np.clip(polygon[:, 1], 0, h - 1)
for i in range(len(polygons)):
area = polygon_area(polygons[i])
if abs(area) < 1:
ignore_tags[i] = True
if area > 0:
polygons[i] = polygons[i][::-1, :]
return polygons, ignore_tags
def polygon_area(polygon):
edge = 0
for i in range(polygon.shape[0]):
next_index = (i + 1) % polygon.shape[0]
edge += (polygon[next_index, 0] - polygon[i, 0]) * (
polygon[next_index, 1] - polygon[i, 1])
return edge / 2.
def MakeShrinkMap(data):
min_text_size = 8
shrink_ratio = 0.4
image = data['image']
text_polys = data['polys']
ignore_tags = data['ignore_tags']
h, w = image.shape[:2]
text_polys, ignore_tags = validate_polygons(text_polys, ignore_tags, h, w)
gt = np.zeros((h, w), dtype=np.float32)
# gt = np.zeros((1, h, w), dtype=np.float32)
mask = np.ones((h, w), dtype=np.float32)
for i in range(len(text_polys)):
polygon = text_polys[i]
height = max(polygon[:, 1]) - min(polygon[:, 1])
width = max(polygon[:, 0]) - min(polygon[:, 0])
# height = min(np.linalg.norm(polygon[0] - polygon[3]),
# np.linalg.norm(polygon[1] - polygon[2]))
# width = min(np.linalg.norm(polygon[0] - polygon[1]),
# np.linalg.norm(polygon[2] - polygon[3]))
if ignore_tags[i] or min(height, width) < min_text_size:
cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0)
ignore_tags[i] = True
else:
polygon_shape = Polygon(polygon)
distance = polygon_shape.area * (
1 - np.power(shrink_ratio, 2)) / polygon_shape.length
subject = [tuple(l) for l in text_polys[i]]
padding = pyclipper.PyclipperOffset()
padding.AddPath(subject, pyclipper.JT_ROUND,
pyclipper.ET_CLOSEDPOLYGON)
shrinked = padding.Execute(-distance)
if shrinked == []:
cv2.fillPoly(mask,
polygon.astype(np.int32)[np.newaxis, :, :], 0)
ignore_tags[i] = True
continue
shrinked = np.array(shrinked[0]).reshape(-1, 2)
cv2.fillPoly(gt, [shrinked.astype(np.int32)], 1)
# cv2.fillPoly(gt[0], [shrinked.astype(np.int32)], 1)
data['shrink_map'] = gt
data['shrink_mask'] = mask
return data
此差异已折叠。
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from .iaa_augment import IaaAugment
from .make_border_map import MakeBorderMap
from .make_shrink_map import MakeShrinkMap
from .random_crop_data import EastRandomCropData, PSERandomCrop
from .rec_img_aug import RecAug, RecResizeImg
from .operators import *
from .label_ops import *
def transform(data, ops=None):
""" transform """
if ops is None:
ops = []
for op in ops:
data = op(data)
if data is None:
return None
return data
def create_operators(op_param_list, global_config=None):
"""
create operators based on the config
Args:
params(list): a dict list, used to create some operators
"""
assert isinstance(op_param_list, list), ('operator config should be a list')
ops = []
for operator in op_param_list:
assert isinstance(operator,
dict) and len(operator) == 1, "yaml format error"
op_name = list(operator)[0]
param = {} if operator[op_name] is None else operator[op_name]
if global_config is not None:
param.update(global_config)
op = eval(op_name)(**param)
ops.append(op)
return ops
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import imgaug
import imgaug.augmenters as iaa
class AugmenterBuilder(object):
def __init__(self):
pass
def build(self, args, root=True):
if args is None or len(args) == 0:
return None
elif isinstance(args, list):
if root:
sequence = [self.build(value, root=False) for value in args]
return iaa.Sequential(sequence)
else:
return getattr(iaa, args[0])(
*[self.to_tuple_if_list(a) for a in args[1:]])
elif isinstance(args, dict):
cls = getattr(iaa, args['type'])
return cls(**{
k: self.to_tuple_if_list(v)
for k, v in args['args'].items()
})
else:
raise RuntimeError('unknown augmenter arg: ' + str(args))
def to_tuple_if_list(self, obj):
if isinstance(obj, list):
return tuple(obj)
return obj
class IaaAugment():
def __init__(self, augmenter_args=None, **kwargs):
if augmenter_args is None:
augmenter_args = [{
'type': 'Fliplr',
'args': {
'p': 0.5
}
}, {
'type': 'Affine',
'args': {
'rotate': [-10, 10]
}
}, {
'type': 'Resize',
'args': {
'size': [0.5, 3]
}
}]
self.augmenter = AugmenterBuilder().build(augmenter_args)
def __call__(self, data):
image = data['image']
shape = image.shape
if self.augmenter:
aug = self.augmenter.to_deterministic()
data['image'] = aug.augment_image(image)
data = self.may_augment_annotation(aug, data, shape)
return data
def may_augment_annotation(self, aug, data, shape):
if aug is None:
return data
line_polys = []
for poly in data['polys']:
new_poly = self.may_augment_poly(aug, shape, poly)
line_polys.append(new_poly)
data['polys'] = np.array(line_polys)
return data
def may_augment_poly(self, aug, img_shape, poly):
keypoints = [imgaug.Keypoint(p[0], p[1]) for p in poly]
keypoints = aug.augment_keypoints(
[imgaug.KeypointsOnImage(
keypoints, shape=img_shape)])[0].keypoints
poly = [(p.x, p.y) for p in keypoints]
return poly
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
from ppocr.utils.logging import get_logger
class DetLabelEncode(object):
def __init__(self, **kwargs):
pass
def __call__(self, data):
import json
label = data['label']
label = json.loads(label)
nBox = len(label)
boxes, txts, txt_tags = [], [], []
for bno in range(0, nBox):
box = label[bno]['points']
txt = label[bno]['transcription']
boxes.append(box)
txts.append(txt)
if txt in ['*', '###']:
txt_tags.append(True)
else:
txt_tags.append(False)
boxes = np.array(boxes, dtype=np.float32)
txt_tags = np.array(txt_tags, dtype=np.bool)
data['polys'] = boxes
data['texts'] = txts
data['ignore_tags'] = txt_tags
return data
def order_points_clockwise(self, pts):
rect = np.zeros((4, 2), dtype="float32")
s = pts.sum(axis=1)
rect[0] = pts[np.argmin(s)]
rect[2] = pts[np.argmax(s)]
diff = np.diff(pts, axis=1)
rect[1] = pts[np.argmin(diff)]
rect[3] = pts[np.argmax(diff)]
return rect
class BaseRecLabelEncode(object):
""" Convert between text-label and text-index """
def __init__(self,
max_text_length,
character_dict_path=None,
character_type='ch',
use_space_char=False):
support_character_type = ['ch', 'en', 'en_sensitive']
assert character_type in support_character_type, "Only {} are supported now but get {}".format(
support_character_type, self.character_str)
self.max_text_len = max_text_length
if character_type == "en":
self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
dict_character = list(self.character_str)
elif character_type == "ch":
self.character_str = ""
assert character_dict_path is not None, "character_dict_path should not be None when character_type is ch"
with open(character_dict_path, "rb") as fin:
lines = fin.readlines()
for line in lines:
line = line.decode('utf-8').strip("\n").strip("\r\n")
self.character_str += line
if use_space_char:
self.character_str += " "
dict_character = list(self.character_str)
elif character_type == "en_sensitive":
# same with ASTER setting (use 94 char).
import string
self.character_str = string.printable[:-6]
dict_character = list(self.character_str)
self.character_type = character_type
dict_character = self.add_special_char(dict_character)
self.dict = {}
for i, char in enumerate(dict_character):
self.dict[char] = i
self.character = dict_character
def add_special_char(self, dict_character):
return dict_character
def encode(self, text):
"""convert text-label into text-index.
input:
text: text labels of each image. [batch_size]
output:
text: concatenated text index for CTCLoss.
[sum(text_lengths)] = [text_index_0 + text_index_1 + ... + text_index_(n - 1)]
length: length of each text. [batch_size]
"""
if len(text) > self.max_text_len:
return None
if self.character_type == "en":
text = text.lower()
text_list = []
for char in text:
if char not in self.dict:
# logger = get_logger()
# logger.warning('{} is not in dict'.format(char))
continue
text_list.append(self.dict[char])
if len(text_list) == 0:
return None
return text_list
def get_ignored_tokens(self):
return [0] # for ctc blank
class CTCLabelEncode(BaseRecLabelEncode):
""" Convert between text-label and text-index """
def __init__(self,
max_text_length,
character_dict_path=None,
character_type='ch',
use_space_char=False,
**kwargs):
super(CTCLabelEncode,
self).__init__(max_text_length, character_dict_path,
character_type, use_space_char)
def __call__(self, data):
text = data['label']
text = self.encode(text)
if text is None:
return None
data['length'] = np.array(len(text))
text = text + [0] * (self.max_text_len - len(text))
data['label'] = np.array(text)
return data
def add_special_char(self, dict_character):
dict_character = ['blank'] + dict_character
return dict_character
class AttnLabelEncode(BaseRecLabelEncode):
""" Convert between text-label and text-index """
def __init__(self,
max_text_length,
character_dict_path=None,
character_type='ch',
use_space_char=False,
**kwargs):
super(AttnLabelEncode,
self).__init__(max_text_length, character_dict_path,
character_type, use_space_char)
self.beg_str = "sos"
self.end_str = "eos"
def add_special_char(self, dict_character):
dict_character = [self.beg_str, self.end_str] + dict_character
return dict_character
def __call__(self, text):
text = self.encode(text)
return text
def get_ignored_tokens(self):
beg_idx = self.get_beg_end_flag_idx("beg")
end_idx = self.get_beg_end_flag_idx("end")
return [beg_idx, end_idx]
def get_beg_end_flag_idx(self, beg_or_end):
if beg_or_end == "beg":
idx = np.array(self.dict[self.beg_str])
elif beg_or_end == "end":
idx = np.array(self.dict[self.end_str])
else:
assert False, "Unsupport type %s in get_beg_end_flag_idx" \
% beg_or_end
return idx
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import cv2
np.seterr(divide='ignore', invalid='ignore')
import pyclipper
from shapely.geometry import Polygon
import sys
import warnings
warnings.simplefilter("ignore")
__all__ = ['MakeBorderMap']
class MakeBorderMap(object):
def __init__(self,
shrink_ratio=0.4,
thresh_min=0.3,
thresh_max=0.7,
**kwargs):
self.shrink_ratio = shrink_ratio
self.thresh_min = thresh_min
self.thresh_max = thresh_max
def __call__(self, data: dict) -> dict:
img = data['image']
text_polys = data['polys']
ignore_tags = data['ignore_tags']
canvas = np.zeros(img.shape[:2], dtype=np.float32)
mask = np.zeros(img.shape[:2], dtype=np.float32)
for i in range(len(text_polys)):
if ignore_tags[i]:
continue
self.draw_border_map(text_polys[i], canvas, mask=mask)
canvas = canvas * (self.thresh_max - self.thresh_min) + self.thresh_min
data['threshold_map'] = canvas
data['threshold_mask'] = mask
return data
def draw_border_map(self, polygon, canvas, mask):
polygon = np.array(polygon)
assert polygon.ndim == 2
assert polygon.shape[1] == 2
polygon_shape = Polygon(polygon)
if polygon_shape.area <= 0:
return
distance = polygon_shape.area * (
1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
subject = [tuple(l) for l in polygon]
padding = pyclipper.PyclipperOffset()
padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
padded_polygon = np.array(padding.Execute(distance)[0])
cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
xmin = padded_polygon[:, 0].min()
xmax = padded_polygon[:, 0].max()
ymin = padded_polygon[:, 1].min()
ymax = padded_polygon[:, 1].max()
width = xmax - xmin + 1
height = ymax - ymin + 1
polygon[:, 0] = polygon[:, 0] - xmin
polygon[:, 1] = polygon[:, 1] - ymin
xs = np.broadcast_to(
np.linspace(
0, width - 1, num=width).reshape(1, width), (height, width))
ys = np.broadcast_to(
np.linspace(
0, height - 1, num=height).reshape(height, 1), (height, width))
distance_map = np.zeros(
(polygon.shape[0], height, width), dtype=np.float32)
for i in range(polygon.shape[0]):
j = (i + 1) % polygon.shape[0]
absolute_distance = self._distance(xs, ys, polygon[i], polygon[j])
distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
distance_map = distance_map.min(axis=0)
xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
1 - distance_map[ymin_valid - ymin:ymax_valid - ymax + height,
xmin_valid - xmin:xmax_valid - xmax + width],
canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1])
def _distance(self, xs, ys, point_1, point_2):
'''
compute the distance from point to a line
ys: coordinates in the first axis
xs: coordinates in the second axis
point_1, point_2: (x, y), the end of the line
'''
height, width = xs.shape[:2]
square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[
1])
square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[
1])
square_distance = np.square(point_1[0] - point_2[0]) + np.square(
point_1[1] - point_2[1])
cosin = (square_distance - square_distance_1 - square_distance_2) / (
2 * np.sqrt(square_distance_1 * square_distance_2))
square_sin = 1 - np.square(cosin)
square_sin = np.nan_to_num(square_sin)
result = np.sqrt(square_distance_1 * square_distance_2 * square_sin /
square_distance)
result[cosin <
0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[cosin
< 0]
# self.extend_line(point_1, point_2, result)
return result
def extend_line(self, point_1, point_2, result, shrink_ratio):
ex_point_1 = (int(
round(point_1[0] + (point_1[0] - point_2[0]) * (1 + shrink_ratio))),
int(
round(point_1[1] + (point_1[1] - point_2[1]) * (
1 + shrink_ratio))))
cv2.line(
result,
tuple(ex_point_1),
tuple(point_1),
4096.0,
1,
lineType=cv2.LINE_AA,
shift=0)
ex_point_2 = (int(
round(point_2[0] + (point_2[0] - point_1[0]) * (1 + shrink_ratio))),
int(
round(point_2[1] + (point_2[1] - point_1[1]) * (
1 + shrink_ratio))))
cv2.line(
result,
tuple(ex_point_2),
tuple(point_2),
4096.0,
1,
lineType=cv2.LINE_AA,
shift=0)
return ex_point_1, ex_point_2
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import cv2
from shapely.geometry import Polygon
import pyclipper
__all__ = ['MakeShrinkMap']
class MakeShrinkMap(object):
r'''
Making binary mask from detection data with ICDAR format.
Typically following the process of class `MakeICDARData`.
'''
def __init__(self, min_text_size=8, shrink_ratio=0.4, **kwargs):
self.min_text_size = min_text_size
self.shrink_ratio = shrink_ratio
def __call__(self, data):
image = data['image']
text_polys = data['polys']
ignore_tags = data['ignore_tags']
h, w = image.shape[:2]
text_polys, ignore_tags = self.validate_polygons(text_polys,
ignore_tags, h, w)
gt = np.zeros((h, w), dtype=np.float32)
# gt = np.zeros((1, h, w), dtype=np.float32)
mask = np.ones((h, w), dtype=np.float32)
for i in range(len(text_polys)):
polygon = text_polys[i]
height = max(polygon[:, 1]) - min(polygon[:, 1])
width = max(polygon[:, 0]) - min(polygon[:, 0])
if ignore_tags[i] or min(height, width) < self.min_text_size:
cv2.fillPoly(mask,
polygon.astype(np.int32)[np.newaxis, :, :], 0)
ignore_tags[i] = True
else:
polygon_shape = Polygon(polygon)
distance = polygon_shape.area * (
1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
subject = [tuple(l) for l in text_polys[i]]
padding = pyclipper.PyclipperOffset()
padding.AddPath(subject, pyclipper.JT_ROUND,
pyclipper.ET_CLOSEDPOLYGON)
shrinked = padding.Execute(-distance)
if shrinked == []:
cv2.fillPoly(mask,
polygon.astype(np.int32)[np.newaxis, :, :], 0)
ignore_tags[i] = True
continue
shrinked = np.array(shrinked[0]).reshape(-1, 2)
cv2.fillPoly(gt, [shrinked.astype(np.int32)], 1)
# cv2.fillPoly(gt[0], [shrinked.astype(np.int32)], 1)
data['shrink_map'] = gt
data['shrink_mask'] = mask
return data
def validate_polygons(self, polygons, ignore_tags, h, w):
'''
polygons (numpy.array, required): of shape (num_instances, num_points, 2)
'''
if len(polygons) == 0:
return polygons, ignore_tags
assert len(polygons) == len(ignore_tags)
for polygon in polygons:
polygon[:, 0] = np.clip(polygon[:, 0], 0, w - 1)
polygon[:, 1] = np.clip(polygon[:, 1], 0, h - 1)
for i in range(len(polygons)):
area = self.polygon_area(polygons[i])
if abs(area) < 1:
ignore_tags[i] = True
if area > 0:
polygons[i] = polygons[i][::-1, :]
return polygons, ignore_tags
def polygon_area(self, polygon):
# return cv2.contourArea(polygon.astype(np.float32))
edge = 0
for i in range(polygon.shape[0]):
next_index = (i + 1) % polygon.shape[0]
edge += (polygon[next_index, 0] - polygon[i, 0]) * (
polygon[next_index, 1] - polygon[i, 1])
return edge / 2.
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import sys
import six
import cv2
import numpy as np
class DecodeImage(object):
""" decode image """
def __init__(self, img_mode='RGB', channel_first=False, **kwargs):
self.img_mode = img_mode
self.channel_first = channel_first
def __call__(self, data):
img = data['image']
if six.PY2:
assert type(img) is str and len(
img) > 0, "invalid input 'img' in DecodeImage"
else:
assert type(img) is bytes and len(
img) > 0, "invalid input 'img' in DecodeImage"
img = np.frombuffer(img, dtype='uint8')
img = cv2.imdecode(img, 1)
if self.img_mode == 'GRAY':
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
elif self.img_mode == 'RGB':
assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape)
img = img[:, :, ::-1]
if self.channel_first:
img = img.transpose((2, 0, 1))
data['image'] = img
return data
class NormalizeImage(object):
""" normalize image such as substract mean, divide std
"""
def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs):
if isinstance(scale, str):
scale = eval(scale)
self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
mean = mean if mean is not None else [0.485, 0.456, 0.406]
std = std if std is not None else [0.229, 0.224, 0.225]
shape = (3, 1, 1) if order == 'chw' else (1, 1, 3)
self.mean = np.array(mean).reshape(shape).astype('float32')
self.std = np.array(std).reshape(shape).astype('float32')
def __call__(self, data):
img = data['image']
from PIL import Image
if isinstance(img, Image.Image):
img = np.array(img)
assert isinstance(img,
np.ndarray), "invalid input 'img' in NormalizeImage"
data['image'] = (
img.astype('float32') * self.scale - self.mean) / self.std
return data
class ToCHWImage(object):
""" convert hwc image to chw image
"""
def __init__(self, **kwargs):
pass
def __call__(self, data):
img = data['image']
from PIL import Image
if isinstance(img, Image.Image):
img = np.array(img)
data['image'] = img.transpose((2, 0, 1))
return data
class keepKeys(object):
def __init__(self, keep_keys, **kwargs):
self.keep_keys = keep_keys
def __call__(self, data):
data_list = []
for key in self.keep_keys:
data_list.append(data[key])
return data_list
class DetResizeForTest(object):
def __init__(self, **kwargs):
super(DetResizeForTest, self).__init__()
self.resize_type = 0
if 'image_shape' in kwargs:
self.image_shape = kwargs['image_shape']
self.resize_type = 1
if 'limit_side_len' in kwargs:
self.limit_side_len = kwargs['limit_side_len']
self.limit_type = kwargs.get('limit_type', 'min')
else:
self.limit_side_len = 736
self.limit_type = 'min'
def __call__(self, data):
img = data['image']
if self.resize_type == 0:
img, shape = self.resize_image_type0(img)
else:
img, shape = self.resize_image_type1(img)
data['image'] = img
data['shape'] = shape
return data
def resize_image_type1(self, img):
resize_h, resize_w = self.image_shape
ori_h, ori_w = img.shape[:2] # (h, w, c)
img = cv2.resize(img, (int(resize_w), int(resize_h)))
return img, np.array([ori_h, ori_w])
def resize_image_type0(self, img):
"""
resize image to a size multiple of 32 which is required by the network
args:
img(array): array with shape [h, w, c]
return(tuple):
img, (ratio_h, ratio_w)
"""
limit_side_len = self.limit_side_len
h, w, _ = img.shape
# limit the max side
if self.limit_type == 'max':
if max(h, w) > limit_side_len:
if h > w:
ratio = float(limit_side_len) / h
else:
ratio = float(limit_side_len) / w
else:
ratio = 1.
else:
if min(h, w) < limit_side_len:
if h < w:
ratio = float(limit_side_len) / h
else:
ratio = float(limit_side_len) / w
else:
ratio = 1.
resize_h = int(h * ratio)
resize_w = int(w * ratio)
resize_h = int(round(resize_h / 32) * 32)
resize_w = int(round(resize_w / 32) * 32)
try:
if int(resize_w) <= 0 or int(resize_h) <= 0:
return None, (None, None)
img = cv2.resize(img, (int(resize_w), int(resize_h)))
except:
print(img.shape, resize_w, resize_h)
sys.exit(0)
return img, np.array([h, w])
......@@ -108,13 +108,20 @@ def crop_area(im, text_polys, min_crop_side_ratio, max_tries):
return 0, 0, w, h
def RandomCropData(data, size):
max_tries = 10
min_crop_side_ratio = 0.1
require_original_image = False
keep_ratio = True
im = data['image']
class EastRandomCropData(object):
def __init__(self,
size=(640, 640),
max_tries=10,
min_crop_side_ratio=0.1,
keep_ratio=True,
**kwargs):
self.size = size
self.max_tries = max_tries
self.min_crop_side_ratio = min_crop_side_ratio
self.keep_ratio = keep_ratio
def __call__(self, data):
img = data['image']
text_polys = data['polys']
ignore_tags = data['ignore_tags']
texts = data['texts']
......@@ -122,22 +129,24 @@ def RandomCropData(data, size):
text_polys[i] for i, tag in enumerate(ignore_tags) if not tag
]
# 计算crop区域
crop_x, crop_y, crop_w, crop_h = crop_area(im, all_care_polys,
min_crop_side_ratio, max_tries)
crop_x, crop_y, crop_w, crop_h = crop_area(
img, all_care_polys, self.min_crop_side_ratio, self.max_tries)
# crop 图片 保持比例填充
scale_w = size[0] / crop_w
scale_h = size[1] / crop_h
scale_w = self.size[0] / crop_w
scale_h = self.size[1] / crop_h
scale = min(scale_w, scale_h)
h = int(crop_h * scale)
w = int(crop_w * scale)
if keep_ratio:
padimg = np.zeros((size[1], size[0], im.shape[2]), im.dtype)
if self.keep_ratio:
padimg = np.zeros((self.size[1], self.size[0], img.shape[2]),
img.dtype)
padimg[:h, :w] = cv2.resize(
im[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], (w, h))
img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], (w, h))
img = padimg
else:
img = cv2.resize(im[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w],
tuple(size))
img = cv2.resize(
img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w],
tuple(self.size))
# crop 文本框
text_polys_crop = []
ignore_tags_crop = []
......@@ -153,3 +162,49 @@ def RandomCropData(data, size):
data['ignore_tags'] = ignore_tags_crop
data['texts'] = texts_crop
return data
class PSERandomCrop(object):
def __init__(self, size, **kwargs):
self.size = size
def __call__(self, data):
imgs = data['imgs']
h, w = imgs[0].shape[0:2]
th, tw = self.size
if w == tw and h == th:
return imgs
# label中存在文本实例,并且按照概率进行裁剪,使用threshold_label_map控制
if np.max(imgs[2]) > 0 and random.random() > 3 / 8:
# 文本实例的左上角点
tl = np.min(np.where(imgs[2] > 0), axis=1) - self.size
tl[tl < 0] = 0
# 文本实例的右下角点
br = np.max(np.where(imgs[2] > 0), axis=1) - self.size
br[br < 0] = 0
# 保证选到右下角点时,有足够的距离进行crop
br[0] = min(br[0], h - th)
br[1] = min(br[1], w - tw)
for _ in range(50000):
i = random.randint(tl[0], br[0])
j = random.randint(tl[1], br[1])
# 保证shrink_label_map有文本
if imgs[1][i:i + th, j:j + tw].sum() <= 0:
continue
else:
break
else:
i = random.randint(0, h - th)
j = random.randint(0, w - tw)
# return i, j, th, tw
for idx in range(len(imgs)):
if len(imgs[idx].shape) == 3:
imgs[idx] = imgs[idx][i:i + th, j:j + tw, :]
else:
imgs[idx] = imgs[idx][i:i + th, j:j + tw]
data['imgs'] = imgs
return data
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,3 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .augment import tia_perspective, tia_distort, tia_stretch
__all__ = ['tia_distort', 'tia_stretch', 'tia_perspective']
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from .warp_mls import WarpMLS
def tia_distort(src, segment=4):
img_h, img_w = src.shape[:2]
cut = img_w // segment
thresh = cut // 3
src_pts = list()
dst_pts = list()
src_pts.append([0, 0])
src_pts.append([img_w, 0])
src_pts.append([img_w, img_h])
src_pts.append([0, img_h])
dst_pts.append([np.random.randint(thresh), np.random.randint(thresh)])
dst_pts.append(
[img_w - np.random.randint(thresh), np.random.randint(thresh)])
dst_pts.append(
[img_w - np.random.randint(thresh), img_h - np.random.randint(thresh)])
dst_pts.append(
[np.random.randint(thresh), img_h - np.random.randint(thresh)])
half_thresh = thresh * 0.5
for cut_idx in np.arange(1, segment, 1):
src_pts.append([cut * cut_idx, 0])
src_pts.append([cut * cut_idx, img_h])
dst_pts.append([
cut * cut_idx + np.random.randint(thresh) - half_thresh,
np.random.randint(thresh) - half_thresh
])
dst_pts.append([
cut * cut_idx + np.random.randint(thresh) - half_thresh,
img_h + np.random.randint(thresh) - half_thresh
])
trans = WarpMLS(src, src_pts, dst_pts, img_w, img_h)
dst = trans.generate()
return dst
def tia_stretch(src, segment=4):
img_h, img_w = src.shape[:2]
cut = img_w // segment
thresh = cut * 4 // 5
src_pts = list()
dst_pts = list()
src_pts.append([0, 0])
src_pts.append([img_w, 0])
src_pts.append([img_w, img_h])
src_pts.append([0, img_h])
dst_pts.append([0, 0])
dst_pts.append([img_w, 0])
dst_pts.append([img_w, img_h])
dst_pts.append([0, img_h])
half_thresh = thresh * 0.5
for cut_idx in np.arange(1, segment, 1):
move = np.random.randint(thresh) - half_thresh
src_pts.append([cut * cut_idx, 0])
src_pts.append([cut * cut_idx, img_h])
dst_pts.append([cut * cut_idx + move, 0])
dst_pts.append([cut * cut_idx + move, img_h])
trans = WarpMLS(src, src_pts, dst_pts, img_w, img_h)
dst = trans.generate()
return dst
def tia_perspective(src):
img_h, img_w = src.shape[:2]
thresh = img_h // 2
src_pts = list()
dst_pts = list()
src_pts.append([0, 0])
src_pts.append([img_w, 0])
src_pts.append([img_w, img_h])
src_pts.append([0, img_h])
dst_pts.append([0, np.random.randint(thresh)])
dst_pts.append([img_w, np.random.randint(thresh)])
dst_pts.append([img_w, img_h - np.random.randint(thresh)])
dst_pts.append([0, img_h - np.random.randint(thresh)])
trans = WarpMLS(src, src_pts, dst_pts, img_w, img_h)
dst = trans.generate()
return dst
\ No newline at end of file
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import Levenshtein
class RecMetric(object):
def __init__(self, main_indicator='acc', **kwargs):
self.main_indicator = main_indicator
self.reset()
def __call__(self, pred_label, *args, **kwargs):
preds, labels = pred_label
correct_num = 0
all_num = 0
norm_edit_dis = 0.0
for (pred, pred_conf), (target, _) in zip(preds, labels):
norm_edit_dis += Levenshtein.distance(pred, target) / max(
len(pred), len(target))
if pred == target:
correct_num += 1
all_num += 1
# if all_num < 10 and kwargs.get('show_str', False):
# print('{} -> {}'.format(pred, target))
self.correct_num += correct_num
self.all_num += all_num
self.norm_edit_dis += norm_edit_dis
return {
'acc': correct_num / all_num,
'norm_edit_dis': 1 - norm_edit_dis / all_num
}
def get_metric(self):
"""
return metircs {
'acc': 0,
'norm_edit_dis': 0,
}
"""
acc = self.correct_num / self.all_num
norm_edit_dis = 1 - self.norm_edit_dis / self.all_num
self.reset()
return {'acc': acc, 'norm_edit_dis': norm_edit_dis}
def reset(self):
self.correct_num = 0
self.all_num = 0
self.norm_edit_dis = 0
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册