diff --git a/PPOCRLabel/README.md b/PPOCRLabel/README.md index 2d6e7f98bb2c2dc4d1c696628e45f4649bf84c1c..9e5b3245b0cfb56d300155a94f64d38edcdbb599 100644 --- a/PPOCRLabel/README.md +++ b/PPOCRLabel/README.md @@ -34,10 +34,10 @@ PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, w pip3 install --upgrade pip # If you have cuda9 or cuda10 installed on your machine, please run the following command to install -python3 -m pip install paddlepaddle-gpu==2.0.0 -i https://mirror.baidu.com/pypi/simple +python3 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple # If you only have cpu on your machine, please run the following command to install -python3 -m pip install paddlepaddle==2.0.0 -i https://mirror.baidu.com/pypi/simple +python3 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple ``` For more software version requirements, please refer to the instructions in [Installation Document](https://www.paddlepaddle.org.cn/install/quick) for operation. diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md index ecc2ab600eaf6bcfe71923f7fc6a9de82fa54ba7..7f9351dfe185be2417162f2c786f5eec0b58816a 100644 --- a/PPOCRLabel/README_ch.md +++ b/PPOCRLabel/README_ch.md @@ -37,11 +37,11 @@ PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置P pip3 install --upgrade pip 如果您的机器安装的是CUDA9或CUDA10,请运行以下命令安装 -python3 -m pip install paddlepaddle-gpu==2.0.0 -i https://mirror.baidu.com/pypi/simple +python3 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple 如果您的机器是CPU,请运行以下命令安装 -python3 -m pip install paddlepaddle==2.0.0 -i https://mirror.baidu.com/pypi/simple +python3 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple ``` 更多的版本需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 diff --git a/PTDN/common_func.sh b/PTDN/common_func.sh new file mode 100644 index 0000000000000000000000000000000000000000..3f0fa66b77ff50b23b1e83dea506580f549f8ecf --- /dev/null +++ b/PTDN/common_func.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +function func_parser_key(){ + strs=$1 + IFS=":" + array=(${strs}) + tmp=${array[0]} + echo ${tmp} +} + +function func_parser_value(){ + strs=$1 + IFS=":" + array=(${strs}) + tmp=${array[1]} + echo ${tmp} +} + +function func_set_params(){ + key=$1 + value=$2 + if [ ${key}x = "null"x ];then + echo " " + elif [[ ${value} = "null" ]] || [[ ${value} = " " ]] || [ ${#value} -le 0 ];then + echo " " + else + echo "${key}=${value}" + fi +} + +function func_parser_params(){ + strs=$1 + IFS=":" + array=(${strs}) + key=${array[0]} + tmp=${array[1]} + IFS="|" + res="" + for _params in ${tmp[*]}; do + IFS="=" + array=(${_params}) + mode=${array[0]} + value=${array[1]} + if [[ ${mode} = ${MODE} ]]; then + IFS="|" + #echo $(func_set_params "${mode}" "${value}") + echo $value + break + fi + IFS="|" + done + echo ${res} +} + +function status_check(){ + last_status=$1 # the exit code + run_command=$2 + run_log=$3 + if [ $last_status -eq 0 ]; then + echo -e "\033[33m Run successfully with command - ${run_command}! \033[0m" | tee -a ${run_log} + else + echo -e "\033[33m Run failed with command - ${run_command}! 
\033[0m" | tee -a ${run_log} + fi +} + diff --git a/PTDN/compare_results.py b/PTDN/compare_results.py new file mode 100644 index 0000000000000000000000000000000000000000..35af38809fe7d564707d0d538f7d0159cb6edfbd --- /dev/null +++ b/PTDN/compare_results.py @@ -0,0 +1,138 @@ +import numpy as np +import os +import subprocess +import json +import argparse +import glob + + +def init_args(): + parser = argparse.ArgumentParser() + # params for testing assert allclose + parser.add_argument("--atol", type=float, default=1e-3) + parser.add_argument("--rtol", type=float, default=1e-3) + parser.add_argument("--gt_file", type=str, default="") + parser.add_argument("--log_file", type=str, default="") + parser.add_argument("--precision", type=str, default="fp32") + return parser + + +def parse_args(): + parser = init_args() + return parser.parse_args() + + +def run_shell_command(cmd): + p = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + out, err = p.communicate() + + if p.returncode == 0: + return out.decode('utf-8') + else: + return None + +def parser_results_from_log_by_name(log_path, names_list): + if not os.path.exists(log_path): + raise ValueError("The log file {} does not exists!".format(log_path)) + + if names_list is None or len(names_list) < 1: + return [] + + parser_results = {} + for name in names_list: + cmd = "grep {} {}".format(name, log_path) + outs = run_shell_command(cmd) + outs = outs.split("\n")[0] + result = outs.split("{}".format(name))[-1] + try: + result = json.loads(result) + except: + result = np.array([int(r) for r in result.split()]).reshape(-1, 4) + parser_results[name] = result + return parser_results + +def load_gt_from_file(gt_file): + if not os.path.exists(gt_file): + raise ValueError("The log file {} does not exists!".format(gt_file)) + with open(gt_file, 'r') as f: + data = f.readlines() + f.close() + parser_gt = {} + for line in data: + image_name, result = line.strip("\n").split("\t") + image_name = image_name.split('/')[-1] + try: + result = json.loads(result) + except: + result = np.array([int(r) for r in result.split()]).reshape(-1, 4) + parser_gt[image_name] = result + return parser_gt + + +def load_gt_from_txts(gt_file): + gt_list = glob.glob(gt_file) + gt_collection = {} + for gt_f in gt_list: + gt_dict = load_gt_from_file(gt_f) + basename = os.path.basename(gt_f) + if "fp32" in basename: + gt_collection["fp32"] = [gt_dict, gt_f] + elif "fp16" in basename: + gt_collection["fp16"] = [gt_dict, gt_f] + elif "int8" in basename: + gt_collection["int8"] = [gt_dict, gt_f] + else: + continue + return gt_collection + + +def collect_predict_from_logs(log_path, key_list): + log_list = glob.glob(log_path) + pred_collection = {} + for log_f in log_list: + pred_dict = parser_results_from_log_by_name(log_f, key_list) + key = os.path.basename(log_f) + pred_collection[key] = pred_dict + + return pred_collection + + +def testing_assert_allclose(dict_x, dict_y, atol=1e-7, rtol=1e-7): + for k in dict_x: + np.testing.assert_allclose( + np.array(dict_x[k]), np.array(dict_y[k]), atol=atol, rtol=rtol) + + +if __name__ == "__main__": + # Usage: + # python3.7 tests/compare_results.py --gt_file=./tests/results/*.txt --log_file=./tests/output/infer_*.log + + args = parse_args() + + gt_collection = load_gt_from_txts(args.gt_file) + key_list = gt_collection["fp32"][0].keys() + + pred_collection = collect_predict_from_logs(args.log_file, key_list) + for filename in pred_collection.keys(): + if "fp32" in filename: + gt_dict, gt_filename = 
gt_collection["fp32"] + elif "fp16" in filename: + gt_dict, gt_filename = gt_collection["fp16"] + elif "int8" in filename: + gt_dict, gt_filename = gt_collection["int8"] + else: + continue + pred_dict = pred_collection[filename] + + try: + testing_assert_allclose( + gt_dict, pred_dict, atol=args.atol, rtol=args.rtol) + print( + "Assert allclose passed! The results of {} and {} are consistent!". + format(filename, gt_filename)) + except Exception as E: + print(E) + raise ValueError( + "The results of {} and the results of {} are inconsistent!". + format(filename, gt_filename)) diff --git a/PTDN/configs/det_mv3_db.yml b/PTDN/configs/det_mv3_db.yml new file mode 100644 index 0000000000000000000000000000000000000000..5eada6d53dd3364238bdfc6a3c40515ca0726688 --- /dev/null +++ b/PTDN/configs/det_mv3_db.yml @@ -0,0 +1,126 @@ +Global: + use_gpu: false + epoch_num: 5 + log_smooth_window: 20 + print_batch_step: 1 + save_model_dir: ./output/db_mv3/ + save_epoch_step: 1200 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 400] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_db/predicts_db.txt + +Architecture: + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: False + Neck: + name: DBFPN + out_channels: 256 + Head: + name: DBHead + k: 50 + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +Optimizer: + name: Adam #Momentum + #momentum: 0.9 + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - Resize: + size: [640, 640] + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 + num_workers: 0 + use_shared_memory: False + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + image_shape: [736, 1280] + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 0 + use_shared_memory: False diff --git a/PTDN/configs/det_r50_vd_db.yml b/PTDN/configs/det_r50_vd_db.yml new file mode 100644 index 0000000000000000000000000000000000000000..f512de808141a0a0e815f9477de80b893ae3c946 --- /dev/null +++ b/PTDN/configs/det_r50_vd_db.yml @@ -0,0 +1,124 @@ +Global: + use_gpu: false + epoch_num: 5 + log_smooth_window: 20 + print_batch_step: 1 + save_model_dir: ./output/db_mv3/ + save_epoch_step: 1200 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 400] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_db/predicts_db.txt + +Architecture: + model_type: det + algorithm: DB + Transform: + Backbone: + name: ResNet #MobileNetV3 + layers: 50 + Neck: + name: DBFPN + out_channels: 256 + Head: + name: DBHead + k: 50 + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 #5 + beta: 10 #10 + ohem_ratio: 3 + +Optimizer: + name: Adam #Momentum + #momentum: 0.9 + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - Resize: + # size: [640, 640] + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 + num_workers: 0 + use_shared_memory: False + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + image_shape: [736, 1280] + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 0 + use_shared_memory: False diff --git a/PTDN/configs/ppocr_det_mobile_params.txt b/PTDN/configs/ppocr_det_mobile_params.txt new file mode 100644 index 0000000000000000000000000000000000000000..63a78fb39f05552651fe02832e6e2622f5cba155 --- /dev/null +++ b/PTDN/configs/ppocr_det_mobile_params.txt @@ -0,0 +1,100 @@ +===========================train_params=========================== +model_name:ocr_det +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:null +Global.epoch_num:lite_train_infer=1|whole_train_infer=300 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_infer=2|whole_train_infer=4 +Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/ +null:null +## +trainer:norm_train|pact_train|fpgm_train +norm_train:tools/train.py -c tests/configs/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained +pact_train:deploy/slim/quantization/quant.py -c tests/configs/det_mv3_db.yml -o +fpgm_train:deploy/slim/prune/sensitivity_anal.py -c tests/configs/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/det_mv3_db_v2.0_train/best_accuracy +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.pretrained_model: +norm_export:tools/export_model.py -c tests/configs/det_mv3_db.yml -o +quant_export:deploy/slim/quantization/export_model.py -c tests/configs/det_mv3_db.yml -o +fpgm_export:deploy/slim/prune/export_prune_model.py -c tests/configs/det_mv3_db.yml -o +distill_export:null +export1:null +export2:null +## +train_model:./inference/ch_ppocr_mobile_v2.0_det_train/best_accuracy +infer_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o +infer_quant:False +inference:tools/infer/predict_det.py +--use_gpu:True|False +--enable_mkldnn:True|False +--cpu_threads:1|6 +--rec_batch_num:1 +--use_tensorrt:False|True +--precision:fp32|fp16|int8 +--det_model_dir: +--image_dir:./inference/ch_det_data_50/all-sum-510/ +null:null +--benchmark:True +null:null +===========================cpp_infer_params=========================== +use_opencv:True +infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/ +infer_quant:False +inference:./deploy/cpp_infer/build/ppocr det +--use_gpu:True|False +--enable_mkldnn:True|False +--cpu_threads:1|6 +--rec_batch_num:1 +--use_tensorrt:False|True +--precision:fp32|fp16 +--det_model_dir: +--image_dir:./inference/ch_det_data_50/all-sum-510/ +null:null +--benchmark:True +===========================serving_params=========================== +model_name:ocr_det +python:python3.7 +trans_model:-m paddle_serving_client.convert +--dirname:./inference/ch_ppocr_mobile_v2.0_det_infer/ +--model_filename:inference.pdmodel +--params_filename:inference.pdiparams +--serving_server:./deploy/pdserving/ppocr_det_mobile_2.0_serving/ +--serving_client:./deploy/pdserving/ppocr_det_mobile_2.0_client/ +serving_dir:./deploy/pdserving +web_service:web_service_det.py --config=config.yml --opt op.det.concurrency=1 +op.det.local_service_conf.devices:null|0 
+op.det.local_service_conf.use_mkldnn:True|False +op.det.local_service_conf.thread_num:1|6 +op.det.local_service_conf.use_trt:False|True +op.det.local_service_conf.precision:fp32|fp16|int8 +pipline:pipeline_http_client.py --image_dir=../../doc/imgs +===========================kl_quant_params=========================== +infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/ +infer_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o +infer_quant:True +inference:tools/infer/predict_det.py +--use_gpu:True|False +--enable_mkldnn:True|False +--cpu_threads:1|6 +--rec_batch_num:1 +--use_tensorrt:False|True +--precision:int8 +--det_model_dir: +--image_dir:./inference/ch_det_data_50/all-sum-510/ +null:null +--benchmark:True +null:null +null:null diff --git a/PTDN/configs/ppocr_det_server_params.txt b/PTDN/configs/ppocr_det_server_params.txt new file mode 100644 index 0000000000000000000000000000000000000000..bba4ef44f769ed16671ead55a0eba6ee986aaaaa --- /dev/null +++ b/PTDN/configs/ppocr_det_server_params.txt @@ -0,0 +1,83 @@ +===========================train_params=========================== +model_name:ocr_server_det +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:null +Global.epoch_num:lite_train_infer=2|whole_train_infer=300 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_infer=2|whole_train_infer=4 +Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/ +null:null +## +trainer:norm_train|pact_train|fpgm_export +norm_train:tools/train.py -c tests/configs/det_r50_vd_db.yml -o +quant_export:deploy/slim/quantization/export_model.py -c tests/configs/det_r50_vd_db.yml -o +fpgm_export:deploy/slim/prune/export_prune_model.py -c tests/configs/det_r50_vd_db.yml -o +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c tests/configs/det_r50_vd_db.yml -o +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.pretrained_model: +norm_export:tools/export_model.py -c tests/configs/det_r50_vd_db.yml -o +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +## +train_model:./inference/ch_ppocr_server_v2.0_det_train/best_accuracy +infer_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml -o +infer_quant:False +inference:tools/infer/predict_det.py +--use_gpu:True|False +--enable_mkldnn:True|False +--cpu_threads:1|6 +--rec_batch_num:1 +--use_tensorrt:False|True +--precision:fp32|fp16|int8 +--det_model_dir: +--image_dir:./inference/ch_det_data_50/all-sum-510/ +--save_log_path:null +--benchmark:True +null:null +===========================cpp_infer_params=========================== +use_opencv:True +infer_model:./inference/ch_ppocr_server_v2.0_det_infer/ +infer_quant:False +inference:./deploy/cpp_infer/build/ppocr det +--use_gpu:True|False +--enable_mkldnn:True|False +--cpu_threads:1|6 +--rec_batch_num:1 +--use_tensorrt:False|True +--precision:fp32|fp16 +--det_model_dir: +--image_dir:./inference/ch_det_data_50/all-sum-510/ +null:null +--benchmark:True +===========================serving_params=========================== +model_name:ocr_det_server +python:python3.7 +trans_model:-m paddle_serving_client.convert +--dirname:./inference/ch_ppocr_server_v2.0_det_infer/ +--model_filename:inference.pdmodel +--params_filename:inference.pdiparams 
+--serving_server:./deploy/pdserving/ppocr_det_mobile_2.0_serving/ +--serving_client:./deploy/pdserving/ppocr_det_mobile_2.0_client/ +serving_dir:./deploy/pdserving +web_service:web_service_det.py --config=config.yml --opt op.det.concurrency=1 +op.det.local_service_conf.devices:null|0 +op.det.local_service_conf.use_mkldnn:True|False +op.det.local_service_conf.thread_num:1|6 +op.det.local_service_conf.use_trt:False|True +op.det.local_service_conf.precision:fp32|fp16|int8 +pipline:pipeline_http_client.py --image_dir=../../doc/imgs diff --git a/tests/ocr_rec_params.txt b/PTDN/configs/ppocr_rec_mobile_params.txt similarity index 53% rename from tests/ocr_rec_params.txt rename to PTDN/configs/ppocr_rec_mobile_params.txt index 71d12f90b3bda128c3f6047c6740911dac417954..f3f3a54e14e042693d28559e487852a079f77bdd 100644 --- a/tests/ocr_rec_params.txt +++ b/PTDN/configs/ppocr_rec_mobile_params.txt @@ -1,7 +1,7 @@ ===========================train_params=========================== model_name:ocr_rec python:python3.7 -gpu_list:0|2,3 +gpu_list:0|0,1 Global.use_gpu:True|True Global.auto_cast:null Global.epoch_num:lite_train_infer=2|whole_train_infer=300 @@ -9,7 +9,7 @@ Global.save_model_dir:./output/ Train.loader.batch_size_per_card:lite_train_infer=128|whole_train_infer=128 Global.pretrained_model:null train_model_name:latest -train_infer_img_dir:./train_data/ic15_data/train +train_infer_img_dir:./inference/rec_inference null:null ## trainer:norm_train|pact_train @@ -41,7 +41,7 @@ inference:tools/infer/predict_rec.py --use_gpu:True|False --enable_mkldnn:True|False --cpu_threads:1|6 ---rec_batch_num:1 +--rec_batch_num:1|6 --use_tensorrt:True|False --precision:fp32|fp16|int8 --rec_model_dir: @@ -49,3 +49,35 @@ inference:tools/infer/predict_rec.py --save_log_path:./test/output/ --benchmark:True null:null +===========================cpp_infer_params=========================== +use_opencv:True +infer_model:./inference/ch_ppocr_mobile_v2.0_rec_infer/ +infer_quant:False +inference:./deploy/cpp_infer/build/ppocr rec +--use_gpu:True|False +--enable_mkldnn:True|False +--cpu_threads:1|6 +--rec_batch_num:1 +--use_tensorrt:False|True +--precision:fp32|fp16 +--rec_model_dir: +--image_dir:./inference/rec_inference/ +null:null +--benchmark:True +===========================serving_params=========================== +model_name:ocr_rec +python:python3.7 +trans_model:-m paddle_serving_client.convert +--dirname:./inference/ch_ppocr_mobile_v2.0_rec_infer/ +--model_filename:inference.pdmodel +--params_filename:inference.pdiparams +--serving_server:./deploy/pdserving/ppocr_rec_mobile_2.0_serving/ +--serving_client:./deploy/pdserving/ppocr_rec_mobile_2.0_client/ +serving_dir:./deploy/pdserving +web_service:web_service_rec.py --config=config.yml --opt op.rec.concurrency=1 +op.rec.local_service_conf.devices:null|0 +op.rec.local_service_conf.use_mkldnn:True|False +op.rec.local_service_conf.thread_num:1|6 +op.rec.local_service_conf.use_trt:False|True +op.rec.local_service_conf.precision:fp32|fp16|int8 +pipline:pipeline_http_client.py --image_dir=../../doc/imgs_words_en diff --git a/PTDN/configs/ppocr_rec_server_params.txt b/PTDN/configs/ppocr_rec_server_params.txt new file mode 100644 index 0000000000000000000000000000000000000000..77961e8e651e0d770dae64860cc129aa2d50dcf2 --- /dev/null +++ b/PTDN/configs/ppocr_rec_server_params.txt @@ -0,0 +1,83 @@ +===========================train_params=========================== +model_name:ocr_server_rec +python:python3.7 +gpu_list:0|0,1 +Global.use_gpu:True|True +Global.auto_cast:null 
+Global.epoch_num:lite_train_infer=2|whole_train_infer=300 +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:lite_train_infer=128|whole_train_infer=128 +Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./inference/rec_inference +null:null +## +trainer:norm_train|pact_train +norm_train:tools/train.py -c tests/configs/rec_icdar15_r34_train.yml -o +pact_train:deploy/slim/quantization/quant.py -c tests/configs/rec_icdar15_r34_train.yml -o +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c tests/configs/rec_icdar15_r34_train.yml -o +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.pretrained_model: +norm_export:tools/export_model.py -c tests/configs/rec_icdar15_r34_train.yml -o +quant_export:deploy/slim/quantization/export_model.py -c tests/configs/rec_icdar15_r34_train.yml -o +fpgm_export:null +distill_export:null +export1:null +export2:null +## +infer_model:./inference/ch_ppocr_server_v2.0_rec_infer/ +infer_export:null +infer_quant:False +inference:tools/infer/predict_rec.py +--use_gpu:True|False +--enable_mkldnn:True|False +--cpu_threads:1|6 +--rec_batch_num:1|6 +--use_tensorrt:True|False +--precision:fp32|fp16|int8 +--rec_model_dir: +--image_dir:./inference/rec_inference +--save_log_path:./test/output/ +--benchmark:True +null:null +===========================cpp_infer_params=========================== +use_opencv:True +infer_model:./inference/ch_ppocr_server_v2.0_rec_infer/ +infer_quant:False +inference:./deploy/cpp_infer/build/ppocr rec +--use_gpu:True|False +--enable_mkldnn:True|False +--cpu_threads:1|6 +--rec_batch_num:1 +--use_tensorrt:False|True +--precision:fp32|fp16 +--rec_model_dir: +--image_dir:./inference/rec_inference/ +null:null +--benchmark:True +===========================serving_params=========================== +model_name:ocr_server_rec +python:python3.7 +trans_model:-m paddle_serving_client.convert +--dirname:./inference/ch_ppocr_server_v2.0_rec_infer/ +--model_filename:inference.pdmodel +--params_filename:inference.pdiparams +--serving_server:./deploy/pdserving/ppocr_rec_mobile_2.0_serving/ +--serving_client:./deploy/pdserving/ppocr_rec_mobile_2.0_client/ +serving_dir:./deploy/pdserving +web_service:web_service_rec.py --config=config.yml --opt op.rec.concurrency=1 +op.rec.local_service_conf.devices:null|0 +op.rec.local_service_conf.use_mkldnn:True|False +op.rec.local_service_conf.thread_num:1|6 +op.rec.local_service_conf.use_trt:False|True +op.rec.local_service_conf.precision:fp32|fp16|int8 +pipline:pipeline_http_client.py --image_dir=../../doc/imgs_words_en diff --git a/PTDN/configs/ppocr_sys_mobile_params.txt b/PTDN/configs/ppocr_sys_mobile_params.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb6e0960a77c946e7d452c1026368682be0c4579 --- /dev/null +++ b/PTDN/configs/ppocr_sys_mobile_params.txt @@ -0,0 +1,67 @@ +===========================train_params=========================== +model_name:ocr_system_mobile +python:python3.7 +gpu_list:null +Global.use_gpu:null +Global.auto_cast:null +Global.epoch_num:null +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:null +Global.pretrained_model:null +train_model_name:null +train_infer_img_dir:null +null:null +## +trainer: +norm_train:null +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## 
+===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.pretrained_model: +norm_export:null +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +## +infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/ +infer_export:null +infer_quant:False +inference:tools/infer/predict_system.py +--use_gpu:True +--enable_mkldnn:True|False +--cpu_threads:1|6 +--rec_batch_num:1 +--use_tensorrt:False|True +--precision:fp32|fp16|int8 +--det_model_dir: +--image_dir:./inference/ch_det_data_50/all-sum-510/ +--save_log_path:null +--benchmark:True +--rec_model_dir:./inference/ch_ppocr_mobile_v2.0_rec_infer/ +===========================cpp_infer_params=========================== +use_opencv:True +infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/ +infer_quant:False +inference:./deploy/cpp_infer/build/ppocr system +--use_gpu:True|False +--enable_mkldnn:True|False +--cpu_threads:1|6 +--rec_batch_num:1 +--use_tensorrt:False|True +--precision:fp32|fp16 +--det_model_dir: +--image_dir:./inference/ch_det_data_50/all-sum-510/ +--rec_model_dir:./inference/ch_ppocr_mobile_v2.0_rec_infer/ +--benchmark:True + diff --git a/PTDN/configs/ppocr_sys_server_params.txt b/PTDN/configs/ppocr_sys_server_params.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c49f7ddf43dbca2562bb206d92e5aeb84e703aa --- /dev/null +++ b/PTDN/configs/ppocr_sys_server_params.txt @@ -0,0 +1,66 @@ +===========================train_params=========================== +model_name:ocr_system_server +python:python3.7 +gpu_list:null +Global.use_gpu:null +Global.auto_cast:null +Global.epoch_num:null +Global.save_model_dir:./output/ +Train.loader.batch_size_per_card:null +Global.pretrained_model:null +train_model_name:null +train_infer_img_dir:null +null:null +## +trainer: +norm_train:null +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +Global.save_inference_dir:./output/ +Global.pretrained_model: +norm_export:null +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +## +infer_model:./inference/ch_ppocr_server_v2.0_det_infer/ +infer_export:null +infer_quant:False +inference:tools/infer/predict_system.py +--use_gpu:True +--enable_mkldnn:True|False +--cpu_threads:1|6 +--rec_batch_num:1 +--use_tensorrt:False|True +--precision:fp32|fp16|int8 +--det_model_dir: +--image_dir:./inference/ch_det_data_50/all-sum-510/ +--save_log_path:null +--benchmark:True +--rec_model_dir:./inference/ch_ppocr_server_v2.0_rec_infer/ +===========================cpp_infer_params=========================== +use_opencv:True +infer_model:./inference/ch_ppocr_server_v2.0_det_infer/ +infer_quant:False +inference:./deploy/cpp_infer/build/ppocr system +--use_gpu:True|False +--enable_mkldnn:True|False +--cpu_threads:1|6 +--rec_batch_num:1 +--use_tensorrt:False|True +--precision:fp32|fp16 +--det_model_dir: +--image_dir:./inference/ch_det_data_50/all-sum-510/ +--rec_model_dir:./inference/ch_ppocr_server_v2.0_rec_infer/ +--benchmark:True \ No newline at end of file diff --git a/PTDN/configs/rec_icdar15_r34_train.yml b/PTDN/configs/rec_icdar15_r34_train.yml new file mode 100644 index 0000000000000000000000000000000000000000..5825c3e9622728e050941a34a055514b2c184659 --- 
/dev/null +++ b/PTDN/configs/rec_icdar15_r34_train.yml @@ -0,0 +1,99 @@ +Global: + use_gpu: true + epoch_num: 72 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/ic15/ + save_epoch_step: 3 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: ./ + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: ppocr/utils/en_dict.txt + character_type: EN + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_ic15.txt + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.0005 + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + name: ResNet + layers: 34 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 256 + Head: + name: CTCHead + fc_decay: 0 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ic15_data/ + label_file_list: ["./train_data/ic15_data/rec_gt_train.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + use_shared_memory: False + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ic15_data + label_file_list: ["./train_data/ic15_data/rec_gt_test.txt"] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - CTCLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 4 + use_shared_memory: False diff --git a/PTDN/docs/compare_cpp_right.png b/PTDN/docs/compare_cpp_right.png new file mode 100644 index 0000000000000000000000000000000000000000..f9d0ba8ef8007ebc95ebffe2d593ff9e90066343 Binary files /dev/null and b/PTDN/docs/compare_cpp_right.png differ diff --git a/PTDN/docs/compare_cpp_wrong.png b/PTDN/docs/compare_cpp_wrong.png new file mode 100644 index 0000000000000000000000000000000000000000..621d446bbbe9ba10c3069ef5e59c463b714d42ad Binary files /dev/null and b/PTDN/docs/compare_cpp_wrong.png differ diff --git a/PTDN/docs/compare_right.png b/PTDN/docs/compare_right.png new file mode 100644 index 0000000000000000000000000000000000000000..3d74ef1cd5c5506b759886b5cfa541acac50f493 Binary files /dev/null and b/PTDN/docs/compare_right.png differ diff --git a/PTDN/docs/compare_wrong.png b/PTDN/docs/compare_wrong.png new file mode 100644 index 0000000000000000000000000000000000000000..26ad576d2f341072be81b99af154a2499d1ba05f Binary files /dev/null and b/PTDN/docs/compare_wrong.png differ diff --git a/PTDN/docs/guide.png b/PTDN/docs/guide.png new file mode 100644 index 0000000000000000000000000000000000000000..319ac819daff38ed77e84cdff2b122e8bc4a8e5f Binary files /dev/null and b/PTDN/docs/guide.png differ diff --git a/PTDN/docs/install.md b/PTDN/docs/install.md new file mode 100644 index 
0000000000000000000000000000000000000000..28b92426fa04da79ce63381fffa9f52a0f42813f --- /dev/null +++ b/PTDN/docs/install.md @@ -0,0 +1,48 @@ + +## 环境配置 + +本教程适用于PTDN目录下基础功能测试的运行环境搭建。 + +推荐环境: +- CUDA 10.1 +- CUDNN 7.6 +- TensorRT 6.1.0.5 / 7.1 + + +推荐docker镜像安装,按照如下命令创建镜像,当前目录映射到镜像中的`/paddle`目录下 +``` +nvidia-docker run --name paddle -it -v $PWD:/paddle paddlepaddle/paddle:latest-dev-cuda10.1-cudnn7-gcc82 /bin/bash +cd /paddle + +# 安装带TRT的paddle +pip3.7 install https://paddle-wheel.bj.bcebos.com/with-trt/2.1.3/linux-gpu-cuda10.1-cudnn7-mkl-gcc8.2-trt6-avx/paddlepaddle_gpu-2.1.3.post101-cp37-cp37m-linux_x86_64.whl + +# 安装AutoLog +git clone https://github.com/LDOUBLEV/AutoLog +cd AutoLog +pip3.7 install -r requirements.txt +python3.7 setup.py bdist_wheel +pip3.7 install ./dist/auto_log-1.0.0-py3-none-any.whl + + +# 下载OCR代码 +cd ../ +git clone https://github.com/PaddlePaddle/PaddleOCR + +``` + +安装PaddleOCR依赖: +``` +cd PaddleOCR +pip3.7 install -r requirements.txt +``` + +## FAQ : +Q. You are using Paddle compiled with TensorRT, but TensorRT dynamic library is not found. Ignore this if TensorRT is not needed. + +A. 问题一般是当前安装paddle版本带TRT,但是本地环境找不到TensorRT的预测库,需要下载TensorRT库,解压后设置环境变量LD_LIBRARY_PATH; +如: +``` +export LD_LIBRARY_PATH=/usr/local/python3.7.0/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/paddle/package/TensorRT-6.0.1.5/lib +``` +或者问题是下载的TensorRT版本和当前paddle中编译的TRT版本不匹配,需要下载版本相符的TRT。 diff --git a/PTDN/docs/test.png b/PTDN/docs/test.png new file mode 100644 index 0000000000000000000000000000000000000000..f99f23d7050eb61879cf317c0d7728ef14531b08 Binary files /dev/null and b/PTDN/docs/test.png differ diff --git a/PTDN/docs/test_inference_cpp.md b/PTDN/docs/test_inference_cpp.md new file mode 100644 index 0000000000000000000000000000000000000000..140860cb506513cbaa0fdc621848568d90e8ef5c --- /dev/null +++ b/PTDN/docs/test_inference_cpp.md @@ -0,0 +1,60 @@ +# C++预测功能测试 + +C++预测功能测试的主程序为`test_inference_cpp.sh`,可以测试基于C++预测库的模型推理功能。 + +## 1. 测试结论汇总 + +基于训练是否使用量化,进行本测试的模型可以分为`正常模型`和`量化模型`,这两类模型对应的C++预测功能汇总如下: + +| 模型类型 |device | batchsize | tensorrt | mkldnn | cpu多线程 | +| ---- | ---- | ---- | :----: | :----: | :----: | +| 正常模型 | GPU | 1/6 | fp32/fp16 | - | - | +| 正常模型 | CPU | 1/6 | - | fp32 | 支持 | +| 量化模型 | GPU | 1/6 | int8 | - | - | +| 量化模型 | CPU | 1/6 | - | int8 | 支持 | + +## 2. 测试流程 +### 2.1 功能测试 +先运行`prepare.sh`准备数据和模型,然后运行`test_inference_cpp.sh`进行测试,最终在```tests/output```目录下生成`cpp_infer_*.log`后缀的日志文件。 + +```shell +bash tests/prepare.sh ./tests/configs/ppocr_det_mobile_params.txt "cpp_infer" + +# 用法1: +bash tests/test_inference_cpp.sh ./tests/configs/ppocr_det_mobile_params.txt +# 用法2: 指定GPU卡预测,第三个传入参数为GPU卡号 +bash tests/test_inference_cpp.sh ./tests/configs/ppocr_det_mobile_params.txt '1' +``` + + +### 2.2 精度测试 + +使用compare_results.py脚本比较模型预测的结果是否符合预期,主要步骤包括: +- 提取日志中的预测坐标; +- 从本地文件中提取保存好的坐标结果; +- 比较上述两个结果是否符合精度预期,误差大于设置阈值时会报错。 + +#### 使用方式 +运行命令: +```shell +python3.7 tests/compare_results.py --gt_file=./tests/results/cpp_*.txt --log_file=./tests/output/cpp_*.log --atol=1e-3 --rtol=1e-3 +``` + +参数介绍: +- gt_file: 指向事先保存好的预测结果路径,支持*.txt 结尾,会自动索引*.txt格式的文件,文件默认保存在tests/result/ 文件夹下 +- log_file: 指向运行tests/test.sh 脚本的infer模式保存的预测日志,预测日志中打印的有预测结果,比如:文本框,预测文本,类别等等,同样支持infer_*.log格式传入 +- atol: 设置的绝对误差 +- rtol: 设置的相对误差 + +#### 运行结果 + +正常运行效果如下图: + + +出现不一致结果时的运行输出: + + + +## 3. 
更多教程 + +本文档为功能测试用,更详细的c++预测使用教程请参考:[服务器端C++预测](https://github.com/PaddlePaddle/PaddleOCR/tree/dygraph/deploy/cpp_infer) diff --git a/PTDN/docs/test_train_inference_python.md b/PTDN/docs/test_train_inference_python.md new file mode 100644 index 0000000000000000000000000000000000000000..8c468ffd34fcd7d949331c9097c7993ca7a1e391 --- /dev/null +++ b/PTDN/docs/test_train_inference_python.md @@ -0,0 +1,119 @@ +# 基础训练预测功能测试 + +基础训练预测功能测试的主程序为`test_train_inference_python.sh`,可以测试基于Python的模型训练、评估、推理等基本功能,包括裁剪、量化、蒸馏。 + +## 1. 测试结论汇总 + +- 训练相关: + +| 算法名称 | 模型名称 | 单机单卡 | 单机多卡 | 多机多卡 | 模型压缩(单机多卡) | +| :---- | :---- | :---- | :---- | :---- | :---- | +| DB | ch_ppocr_mobile_v2.0_det| 正常训练
混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练:FPGM裁剪、PACT量化 <br> 离线量化(无需训练) |
+| DB | ch_ppocr_server_v2.0_det| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练:FPGM裁剪、PACT量化 <br> 离线量化(无需训练) |
+| CRNN | ch_ppocr_mobile_v2.0_rec| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练:PACT量化 <br> 离线量化(无需训练) |
+| CRNN | ch_ppocr_server_v2.0_rec| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练:PACT量化 <br> 离线量化(无需训练) |
+|PP-OCR| ch_ppocr_mobile_v2.0| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | - |
+|PP-OCR| ch_ppocr_server_v2.0| 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | - |
+|PP-OCRv2| ch_PP-OCRv2 | 正常训练 <br> 混合精度 | 正常训练 <br> 混合精度 | 正常训练 <br>
混合精度 | - | + + +- 预测相关:基于训练是否使用量化,可以将训练产出的模型可以分为`正常模型`和`量化模型`,这两类模型对应的预测功能汇总如下, + +| 模型类型 |device | batchsize | tensorrt | mkldnn | cpu多线程 | +| ---- | ---- | ---- | :----: | :----: | :----: | +| 正常模型 | GPU | 1/6 | fp32/fp16 | - | - | +| 正常模型 | CPU | 1/6 | - | fp32 | 支持 | +| 量化模型 | GPU | 1/6 | int8 | - | - | +| 量化模型 | CPU | 1/6 | - | int8 | 支持 | + + +## 2. 测试流程 +### 2.1 安装依赖 +- 安装PaddlePaddle >= 2.0 +- 安装PaddleOCR依赖 + ``` + pip3 install -r ../requirements.txt + ``` +- 安装autolog(规范化日志输出工具) + ``` + git clone https://github.com/LDOUBLEV/AutoLog + cd AutoLog + pip3 install -r requirements.txt + python3 setup.py bdist_wheel + pip3 install ./dist/auto_log-1.0.0-py3-none-any.whl + cd ../ + ``` + + +### 2.2 功能测试 +先运行`prepare.sh`准备数据和模型,然后运行`test_train_inference_python.sh`进行测试,最终在```tests/output```目录下生成`python_infer_*.log`格式的日志文件。 + + +`test_train_inference_python.sh`包含5种运行模式,每种模式的运行数据不同,分别用于测试速度和精度,分别是: + +- 模式1:lite_train_infer,使用少量数据训练,用于快速验证训练到预测的走通流程,不验证精度和速度; +```shell +bash tests/prepare.sh ./tests/configs/ppocr_det_mobile_params.txt 'lite_train_infer' +bash tests/test_train_inference_python.sh ./tests/configs/ppocr_det_mobile_params.txt 'lite_train_infer' +``` + +- 模式2:whole_infer,使用少量数据训练,一定量数据预测,用于验证训练后的模型执行预测,预测速度是否合理; +```shell +bash tests/prepare.sh ./tests/configs/ppocr_det_mobile_params.txt 'whole_infer' +bash tests/test_train_inference_python.sh ./tests/configs/ppocr_det_mobile_params.txt 'whole_infer' +``` + +- 模式3:infer,不训练,全量数据预测,走通开源模型评估、动转静,检查inference model预测时间和精度; +```shell +bash tests/prepare.sh ./tests/configs/ppocr_det_mobile_params.txt 'infer' +# 用法1: +bash tests/test_train_inference_python.sh ./tests/configs/ppocr_det_mobile_params.txt 'infer' +# 用法2: 指定GPU卡预测,第三个传入参数为GPU卡号 +bash tests/test_train_inference_python.sh ./tests/configs/ppocr_det_mobile_params.txt 'infer' '1' +``` + +- 模式4:whole_train_infer,CE: 全量数据训练,全量数据预测,验证模型训练精度,预测精度,预测速度; +```shell +bash tests/prepare.sh ./tests/configs/ppocr_det_mobile_params.txt 'whole_train_infer' +bash tests/test_train_inference_python.sh ./tests/configs/ppocr_det_mobile_params.txt 'whole_train_infer' +``` + +- 模式5:klquant_infer,测试离线量化; +```shell +bash tests/prepare.sh ./tests/configs/ppocr_det_mobile_params.txt 'klquant_infer' +bash tests/test_train_inference_python.sh tests/configs/ppocr_det_mobile_params.txt 'klquant_infer' +``` + + +### 2.3 精度测试 + +使用compare_results.py脚本比较模型预测的结果是否符合预期,主要步骤包括: +- 提取日志中的预测坐标; +- 从本地文件中提取保存好的坐标结果; +- 比较上述两个结果是否符合精度预期,误差大于设置阈值时会报错。 + +#### 使用方式 +运行命令: +```shell +python3.7 tests/compare_results.py --gt_file=./tests/results/python_*.txt --log_file=./tests/output/python_*.log --atol=1e-3 --rtol=1e-3 +``` + +参数介绍: +- gt_file: 指向事先保存好的预测结果路径,支持*.txt 结尾,会自动索引*.txt格式的文件,文件默认保存在tests/result/ 文件夹下 +- log_file: 指向运行tests/test.sh 脚本的infer模式保存的预测日志,预测日志中打印的有预测结果,比如:文本框,预测文本,类别等等,同样支持infer_*.log格式传入 +- atol: 设置的绝对误差 +- rtol: 设置的相对误差 + +#### 运行结果 + +正常运行效果如下图: + + +出现不一致结果时的运行输出: + + + +## 3. 
更多教程 +本文档为功能测试用,更丰富的训练预测使用教程请参考: +[模型训练](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/training.md) +[基于Python预测引擎推理](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/inference.md) diff --git a/PTDN/prepare.sh b/PTDN/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..d842f4f573d0b1bd697bdad9b67a765ebcf6da6c --- /dev/null +++ b/PTDN/prepare.sh @@ -0,0 +1,138 @@ +#!/bin/bash +FILENAME=$1 + +# MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer', 'infer', +# 'cpp_infer', 'serving_infer', 'klquant_infer'] + +MODE=$2 + +dataline=$(cat ${FILENAME}) + +# parser params +IFS=$'\n' +lines=(${dataline}) +function func_parser_key(){ + strs=$1 + IFS=":" + array=(${strs}) + tmp=${array[0]} + echo ${tmp} +} +function func_parser_value(){ + strs=$1 + IFS=":" + array=(${strs}) + tmp=${array[1]} + echo ${tmp} +} +IFS=$'\n' +# The training params +model_name=$(func_parser_value "${lines[1]}") + +trainer_list=$(func_parser_value "${lines[14]}") + +# MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer'] +MODE=$2 + +if [ ${MODE} = "lite_train_infer" ];then + # pretrain lite train data + wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams + wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar + cd ./pretrain_models/ && tar xf det_mv3_db_v2.0_train.tar && cd ../ + rm -rf ./train_data/icdar2015 + rm -rf ./train_data/ic15_data + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015_lite.tar + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ic15_data.tar # todo change to bcebos + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/rec_inference.tar + wget -nc -P ./deploy/slim/prune https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/sen.pickle + + cd ./train_data/ && tar xf icdar2015_lite.tar && tar xf ic15_data.tar + ln -s ./icdar2015_lite ./icdar2015 + cd ../ + cd ./inference && tar xf rec_inference.tar && cd ../ +elif [ ${MODE} = "whole_train_infer" ];then + wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams + rm -rf ./train_data/icdar2015 + rm -rf ./train_data/ic15_data + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015.tar + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ic15_data.tar + cd ./train_data/ && tar xf icdar2015.tar && tar xf ic15_data.tar && cd ../ +elif [ ${MODE} = "whole_infer" ];then + wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams + rm -rf ./train_data/icdar2015 + rm -rf ./train_data/ic15_data + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015_infer.tar + wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ic15_data.tar + cd ./train_data/ && tar xf icdar2015_infer.tar && tar xf ic15_data.tar + ln -s ./icdar2015_infer ./icdar2015 + cd ../ +elif [ ${MODE} = "infer" ];then + if [ ${model_name} = "ocr_det" ]; then + eval_model_name="ch_ppocr_mobile_v2.0_det_train" + rm -rf ./train_data/icdar2015 + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar + cd 
./inference && tar xf ${eval_model_name}.tar && tar xf ch_det_data_50.tar && cd ../ + elif [ ${model_name} = "ocr_server_det" ]; then + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar + cd ./inference && tar xf ch_ppocr_server_v2.0_det_train.tar && tar xf ch_det_data_50.tar && cd ../ + elif [ ${model_name} = "ocr_system_mobile" ]; then + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar + cd ./inference && tar xf ch_ppocr_mobile_v2.0_det_infer.tar && tar xf ch_ppocr_mobile_v2.0_rec_infer.tar && tar xf ch_det_data_50.tar && cd ../ + elif [ ${model_name} = "ocr_system_server" ]; then + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar + cd ./inference && tar xf ch_ppocr_server_v2.0_det_infer.tar && tar xf ch_ppocr_server_v2.0_rec_infer.tar && tar xf ch_det_data_50.tar && cd ../ + elif [ ${model_name} = "ocr_rec" ]; then + rm -rf ./train_data/ic15_data + eval_model_name="ch_ppocr_mobile_v2.0_rec_infer" + wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/rec_inference.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar + cd ./inference && tar xf ${eval_model_name}.tar && tar xf rec_inference.tar && cd ../ + elif [ ${model_name} = "ocr_server_rec" ]; then + rm -rf ./train_data/ic15_data + eval_model_name="ch_ppocr_server_v2.0_rec_infer" + wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/rec_inference.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar + cd ./inference && tar xf ${eval_model_name}.tar && tar xf rec_inference.tar && cd ../ + fi +elif [ ${MODE} = "klquant_infer" ];then + if [ ${model_name} = "ocr_det" ]; then + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar + cd ./inference && tar xf ch_ppocr_mobile_v2.0_det_infer.tar && tar xf ch_det_data_50.tar && cd ../ + fi +elif [ ${MODE} = "cpp_infer" ];then + if [ ${model_name} = "ocr_det" ]; then + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar + cd ./inference && tar xf ch_ppocr_mobile_v2.0_det_infer.tar && tar xf ch_det_data_50.tar && cd ../ + elif [ ${model_name} = "ocr_rec" ]; then + wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/rec_inference.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar + cd ./inference && tar xf ch_ppocr_mobile_v2.0_rec_infer.tar && tar xf rec_inference.tar && cd ../ + elif [ ${model_name} = "ocr_system" ]; then + wget -nc -P ./inference 
https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar + cd ./inference && tar xf ch_ppocr_mobile_v2.0_det_infer.tar && tar xf ch_ppocr_mobile_v2.0_rec_infer.tar && tar xf ch_det_data_50.tar && cd ../ + fi +fi + +if [ ${MODE} = "serving_infer" ];then + # prepare serving env + python_name=$(func_parser_value "${lines[2]}") + wget https://paddle-serving.bj.bcebos.com/chain/paddle_serving_server_gpu-0.0.0.post101-py3-none-any.whl + ${python_name} -m pip install install paddle_serving_server_gpu-0.0.0.post101-py3-none-any.whl + ${python_name} -m pip install paddle_serving_client==0.6.1 + ${python_name} -m pip install paddle-serving-app==0.6.3 + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar + wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar + cd ./inference && tar xf ch_ppocr_mobile_v2.0_det_infer.tar && tar xf ch_ppocr_mobile_v2.0_rec_infer.tar && tar xf ch_ppocr_server_v2.0_rec_infer.tar && tar xf ch_ppocr_server_v2.0_det_infer.tar && cd ../ +fi diff --git a/PTDN/readme.md b/PTDN/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..71e888a2fe05a0a6d700b40250dd80d5f6d041e0 --- /dev/null +++ b/PTDN/readme.md @@ -0,0 +1,110 @@ + +# 推理部署导航 + +## 1. 简介 + +飞桨除了基本的模型训练和预测,还提供了支持多端多平台的高性能推理部署工具。本文档提供了PaddleOCR中所有模型的推理部署导航PTDN(Paddle Train Deploy Navigation),方便用户查阅每种模型的推理部署打通情况,并可以进行一键测试。 + +
+<img src="docs/guide.png"> +
+ +## 2. 汇总信息 + +打通情况汇总如下,已填写的部分表示可以使用本工具进行一键测试,未填写的表示正在支持中。 + +**字段说明:** +- 基础训练预测:包括模型训练、Paddle Inference Python预测。 +- 更多训练方式:包括多机多卡、混合精度。 +- 模型压缩:包括裁剪、离线/在线量化、蒸馏。 +- 其他预测部署:包括Paddle Inference C++预测、Paddle Serving部署、Paddle-Lite部署等。 + +更详细的mkldnn、Tensorrt等预测加速相关功能的支持情况可以查看各测试工具的[更多教程](#more)。 + +| 算法论文 | 模型名称 | 模型类型 | 基础
训练预测 | 更多 <br> 训练方式 | 模型压缩 | 其他预测部署 |
+ | :--- | :--- | :----: | :--------: | :---- | :---- | :---- |
+| DB |ch_ppocr_mobile_v2.0_det | 检测 | 支持 | 多机多卡 <br> 混合精度 | FPGM裁剪 <br> 离线量化| Paddle Inference: C++ <br> Paddle Serving: Python, C++ <br> Paddle-Lite: <br> (1) ARM CPU(C++) |
+| DB |ch_ppocr_server_v2.0_det | 检测 | 支持 | 多机多卡 <br> 混合精度 | FPGM裁剪 <br> 离线量化| Paddle Inference: C++ <br> Paddle Serving: Python, C++ <br> Paddle-Lite: <br> (1) ARM CPU(C++) |
+| DB |ch_PP-OCRv2_det | 检测 |
+| CRNN |ch_ppocr_mobile_v2.0_rec | 识别 | 支持 | 多机多卡 <br> 混合精度 | PACT量化 <br> 离线量化| Paddle Inference: C++ <br> Paddle Serving: Python, C++ <br> Paddle-Lite: <br> (1) ARM CPU(C++) |
+| CRNN |ch_ppocr_server_v2.0_rec | 识别 | 支持 | 多机多卡 <br> 混合精度 | PACT量化 <br> 离线量化| Paddle Inference: C++ <br> Paddle Serving: Python, C++ <br> Paddle-Lite: <br> (1) ARM CPU(C++) |
+| CRNN |ch_PP-OCRv2_rec | 识别 |
+| PP-OCR |ch_ppocr_mobile_v2.0 | 检测+识别 | 支持 | 多机多卡 <br> 混合精度 | - | Paddle Inference: C++ <br> Paddle Serving: Python, C++ <br> Paddle-Lite: <br> (1) ARM CPU(C++) |
+| PP-OCR |ch_ppocr_server_v2.0 | 检测+识别 | 支持 | 多机多卡 <br> 混合精度 | - | Paddle Inference: C++ <br> Paddle Serving: Python, C++ <br> Paddle-Lite: <br>
(1) ARM CPU(C++) | +|PP-OCRv2|ch_PP-OCRv2 | 检测+识别 | +| DB |det_mv3_db_v2.0 | 检测 | +| DB |det_r50_vd_db_v2.0 | 检测 | +| EAST |det_mv3_east_v2.0 | 检测 | +| EAST |det_r50_vd_east_v2.0 | 检测 | +| PSENet |det_mv3_pse_v2.0 | 检测 | +| PSENet |det_r50_vd_pse_v2.0 | 检测 | +| SAST |det_r50_vd_sast_totaltext_v2.0 | 检测 | +| Rosetta|rec_mv3_none_none_ctc_v2.0 | 识别 | +| Rosetta|rec_r34_vd_none_none_ctc_v2.0 | 识别 | +| CRNN |rec_mv3_none_bilstm_ctc_v2.0 | 识别 | +| CRNN |rec_r34_vd_none_bilstm_ctc_v2.0| 识别 | +| StarNet|rec_mv3_tps_bilstm_ctc_v2.0 | 识别 | +| StarNet|rec_r34_vd_tps_bilstm_ctc_v2.0 | 识别 | +| RARE |rec_mv3_tps_bilstm_att_v2.0 | 识别 | +| RARE |rec_r34_vd_tps_bilstm_att_v2.0 | 识别 | +| SRN |rec_r50fpn_vd_none_srn | 识别 | +| NRTR |rec_mtb_nrtr | 识别 | +| SAR |rec_r31_sar | 识别 | +| PGNet |rec_r34_vd_none_none_ctc_v2.0 | 端到端| + + + +## 3. 一键测试工具使用 +### 目录介绍 + +```shell +PTDN/ +├── configs/ # 配置文件目录 + ├── det_mv3_db.yml # 测试mobile版ppocr检测模型训练的yml文件 + ├── det_r50_vd_db.yml # 测试server版ppocr检测模型训练的yml文件 + ├── rec_icdar15_r34_train.yml # 测试server版ppocr识别模型训练的yml文件 + ├── ppocr_sys_mobile_params.txt # 测试mobile版ppocr检测+识别模型串联的参数配置文件 + ├── ppocr_det_mobile_params.txt # 测试mobile版ppocr检测模型的参数配置文件 + ├── ppocr_rec_mobile_params.txt # 测试mobile版ppocr识别模型的参数配置文件 + ├── ppocr_sys_server_params.txt # 测试server版ppocr检测+识别模型串联的参数配置文件 + ├── ppocr_det_server_params.txt # 测试server版ppocr检测模型的参数配置文件 + ├── ppocr_rec_server_params.txt # 测试server版ppocr识别模型的参数配置文件 + ├── ... +├── results/ # 预先保存的预测结果,用于和实际预测结果进行精读比对 + ├── python_ppocr_det_mobile_results_fp32.txt # 预存的mobile版ppocr检测模型python预测fp32精度的结果 + ├── python_ppocr_det_mobile_results_fp16.txt # 预存的mobile版ppocr检测模型python预测fp16精度的结果 + ├── cpp_ppocr_det_mobile_results_fp32.txt # 预存的mobile版ppocr检测模型c++预测的fp32精度的结果 + ├── cpp_ppocr_det_mobile_results_fp16.txt # 预存的mobile版ppocr检测模型c++预测的fp16精度的结果 + ├── ... +├── prepare.sh # 完成test_*.sh运行所需要的数据和模型下载 +├── test_train_inference_python.sh # 测试python训练预测的主程序 +├── test_inference_cpp.sh # 测试c++预测的主程序 +├── test_serving.sh # 测试serving部署预测的主程序 +├── test_lite.sh # 测试lite部署预测的主程序 +├── compare_results.py # 用于对比log中的预测结果与results中的预存结果精度误差是否在限定范围内 +└── readme.md # 使用文档 +``` + +### 测试流程 +使用本工具,可以测试不同功能的支持情况,以及预测结果是否对齐,测试流程如下: +
+<img src="docs/test.png"> +
+ +1. 运行prepare.sh准备测试所需数据和模型; +2. 运行要测试的功能对应的测试脚本`test_*.sh`,产出log,由log可以看到不同配置是否运行成功; +3. 用`compare_results.py`对比log中的预测结果和预存在results目录下的结果,判断预测精度是否符合预期(在误差范围内)。 + +其中,有4个测试主程序,功能如下: +- `test_train_inference_python.sh`:测试基于Python的模型训练、评估、推理等基本功能,包括裁剪、量化、蒸馏。 +- `test_inference_cpp.sh`:测试基于C++的模型推理。 +- `test_serving.sh`:测试基于Paddle Serving的服务化部署功能。 +- `test_lite.sh`:测试基于Paddle-Lite的端侧预测部署功能。 + + +#### 更多教程 +各功能测试中涉及混合精度、裁剪、量化等训练相关,及mkldnn、Tensorrt等多种预测相关参数配置,请点击下方相应链接了解更多细节和使用教程: +[test_train_inference_python 使用](docs/test_train_inference_python.md) +[test_inference_cpp 使用](docs/test_inference_cpp.md) +[test_serving 使用](docs/test_serving.md) +[test_lite 使用](docs/test_lite.md) diff --git a/PTDN/results/cpp_ppocr_det_mobile_results_fp16.txt b/PTDN/results/cpp_ppocr_det_mobile_results_fp16.txt new file mode 100644 index 0000000000000000000000000000000000000000..34cde2526d2c719a473bec36b9801f56c954561c --- /dev/null +++ b/PTDN/results/cpp_ppocr_det_mobile_results_fp16.txt @@ -0,0 +1,50 @@ +../../inference/ch_det_data_50/all-sum-510/00008790.jpg 208 404 282 404 282 421 208 421 58 396 107 396 107 413 58 413 197 387 296 387 296 403 197 403 161 389 174 389 174 402 161 402 34 378 134 378 134 394 34 394 323 377 329 377 329 382 323 382 199 370 292 370 292 383 199 383 216 309 274 309 274 325 216 325 161 304 173 304 173 315 161 315 370 301 437 301 437 317 370 317 30 301 135 300 135 316 30 317 221 291 270 291 270 308 221 308 58 224 106 224 106 238 58 238 216 222 274 222 274 239 216 239 161 217 174 217 174 229 161 229 33 205 133 205 133 221 33 221 221 204 270 204 270 221 221 221 73 145 385 145 385 162 73 162 52 119 119 119 119 135 52 135 72 50 296 50 296 66 72 66 54 15 118 15 118 32 54 32 +../../inference/ch_det_data_50/all-sum-510/00018946.jpg 439 327 476 327 476 341 439 341 85 284 142 284 142 308 85 308 300 278 380 278 380 299 300 299 195 262 287 275 284 299 192 286 196 196 454 218 452 244 194 222 343 182 376 182 376 193 343 193 198 162 341 169 340 195 197 188 176 130 381 145 380 165 175 150 176 100 417 118 415 148 174 130 +../../inference/ch_det_data_50/all-sum-510/00034387.jpg 263 459 741 459 741 485 263 485 346 415 421 415 421 444 346 444 544 418 568 418 568 442 544 442 684 415 712 415 712 444 684 444 173 413 228 413 228 444 173 444 872 412 910 412 910 447 872 447 55 415 76 415 76 443 55 443 855 371 927 371 927 401 855 401 347 371 420 371 420 400 347 400 672 370 725 370 725 402 672 402 537 371 571 371 571 401 537 401 136 364 230 367 229 403 135 400 55 370 76 370 76 399 55 399 856 328 927 328 927 358 856 358 350 328 420 328 420 358 350 358 672 326 725 326 725 358 672 358 539 327 571 327 571 359 539 359 170 326 229 323 231 357 171 359 56 328 76 328 76 358 56 358 297 326 316 326 316 334 297 334 854 284 927 284 927 314 854 314 672 284 725 284 725 315 672 315 344 284 431 282 432 315 345 317 537 283 570 283 570 314 537 314 170 281 228 281 228 315 170 315 55 285 75 285 75 314 55 314 856 241 927 241 927 270 856 270 346 240 464 240 464 271 346 271 154 241 228 241 228 271 154 271 672 240 726 240 726 271 672 271 530 240 573 240 573 272 530 272 55 241 76 241 76 270 55 270 854 196 927 198 926 228 853 225 672 197 728 197 728 228 672 228 342 199 439 194 441 224 344 230 175 196 229 196 229 226 175 226 55 199 75 199 75 228 55 228 526 193 578 193 578 228 526 228 347 154 420 154 420 182 347 182 853 153 927 153 927 181 853 181 175 153 228 153 228 184 175 184 668 152 725 152 725 182 668 182 536 153 572 153 572 183 536 183 55 155 76 155 76 183 55 183 347 109 420 109 420 138 347 138 172 109 229 109 229 140 172 140 544 111 
565 111 565 138 544 138 51 110 77 110 77 140 51 140 639 105 729 105 729 141 639 141 815 101 929 109 927 141 813 133 812 65 953 65 953 93 812 93 305 64 447 66 447 94 305 92 671 65 725 65 725 95 671 95 173 64 229 66 228 96 172 94 37 64 91 66 90 98 36 96 527 63 581 63 581 95 527 95 333 18 671 18 671 45 333 45 +../../inference/ch_det_data_50/all-sum-510/00037951.jpg 432 973 552 977 552 994 432 991 431 931 554 931 554 970 431 970 29 520 101 520 101 546 29 546 29 441 146 441 146 465 29 465 233 333 328 331 328 356 233 358 121 250 439 250 439 287 121 287 180 205 380 205 380 229 180 229 255 104 323 121 307 184 239 166 35 57 147 57 147 82 35 82 +../../inference/ch_det_data_50/all-sum-510/00044782.jpg 222 214 247 214 247 230 222 230 162 214 183 214 183 231 162 231 122 190 216 190 216 203 122 203 90 82 252 82 252 100 90 100 70 61 279 61 279 78 70 78 103 14 244 14 244 46 103 46 +../../inference/ch_det_data_50/all-sum-510/00067516.jpg 139 806 596 807 596 824 139 823 46 782 699 782 699 800 46 800 577 749 669 749 669 766 577 766 353 748 397 748 397 769 353 769 220 749 261 749 261 767 220 767 475 748 502 748 502 769 475 769 68 746 134 749 133 766 67 763 574 680 670 680 670 700 574 700 474 680 519 680 519 701 474 701 352 680 397 680 397 701 352 701 68 679 134 682 133 700 67 697 219 678 245 681 242 702 216 698 575 614 669 614 669 633 575 633 66 612 135 614 135 633 66 631 474 613 501 613 501 633 474 633 353 613 379 613 379 634 353 634 219 612 245 612 245 633 219 633 576 546 669 546 669 566 576 566 474 545 519 545 519 566 474 566 351 544 381 544 381 567 351 567 219 545 245 545 245 566 219 566 67 541 134 544 133 565 66 562 67 477 134 480 133 501 66 498 584 479 666 479 666 499 584 499 474 478 519 478 519 500 474 500 352 478 397 478 397 500 352 500 218 477 246 477 246 502 218 502 579 424 666 427 665 451 578 448 344 428 410 428 410 449 344 449 66 425 151 427 151 451 66 449 473 427 515 427 515 450 473 450 218 427 259 427 259 450 218 450 282 396 479 397 479 420 282 419 83 316 667 316 667 335 83 335 64 277 666 277 666 292 64 292 456 209 585 209 585 226 456 226 311 208 373 208 373 227 311 227 163 208 227 208 227 227 163 227 504 150 541 150 541 168 504 168 264 47 485 47 485 69 264 69 +../../inference/ch_det_data_50/all-sum-510/00088568.jpg 57 443 119 443 119 456 57 456 309 413 744 413 744 430 309 430 309 375 737 375 737 392 309 392 415 337 559 337 559 351 415 351 307 322 674 321 674 338 307 339 275 292 348 294 348 313 275 311 52 285 210 285 210 301 52 301 273 262 421 262 421 279 273 279 55 262 249 262 249 279 55 279 669 247 697 247 697 262 669 262 601 247 629 247 629 262 601 262 531 247 559 247 559 262 531 262 461 247 489 247 489 262 461 262 277 247 310 247 310 261 277 261 55 240 142 240 142 254 55 254 276 230 400 230 400 246 276 246 741 227 749 237 741 246 732 237 665 230 701 230 701 245 665 245 598 230 631 230 631 245 598 245 527 230 563 230 563 245 527 245 458 230 493 230 493 245 458 245 52 213 212 215 212 233 52 231 732 214 747 214 747 227 732 227 662 212 706 212 706 230 662 230 594 213 638 213 638 227 594 227 522 213 570 213 570 227 522 227 453 213 497 213 497 227 453 227 278 213 352 213 352 227 278 227 734 198 748 198 748 210 734 210 667 196 702 196 702 210 667 210 599 196 633 196 633 211 599 211 527 196 564 196 564 210 527 210 459 196 493 196 493 210 459 210 276 194 418 195 418 212 276 211 54 190 241 190 241 207 54 207 664 179 705 179 705 194 664 194 278 178 352 180 352 195 278 193 733 179 747 179 747 194 733 194 596 178 635 178 635 193 596 193 523 177 567 177 567 195 523 195 456 178 495 178 495 193 456 193 55 
170 142 170 142 184 55 184 733 164 748 164 748 176 733 176 664 162 705 162 705 176 664 176 597 162 635 162 635 176 597 176 525 162 566 162 566 176 525 176 456 162 494 162 494 176 456 176 277 160 399 160 399 176 277 176 54 146 149 146 149 161 54 161 452 145 497 145 497 160 452 160 729 144 748 144 748 162 729 162 662 143 706 143 706 161 662 161 595 144 636 144 636 159 595 159 521 143 566 141 567 159 522 161 277 143 310 143 310 159 277 159 275 120 430 120 430 140 275 140 50 119 234 120 234 140 50 139 402 90 703 90 703 107 402 107 46 78 282 78 282 98 46 98 324 67 745 68 745 86 324 85 667 47 744 47 744 64 667 64 295 47 435 47 435 63 295 63 64 30 232 27 233 65 65 68 +../../inference/ch_det_data_50/all-sum-510/00091741.jpg 46 335 87 335 87 360 46 360 98 209 258 209 258 232 98 232 101 189 258 190 258 206 101 205 87 99 268 97 269 184 88 186 92 45 266 53 263 117 89 109 89 10 258 12 258 38 89 36 +../../inference/ch_det_data_50/all-sum-510/00105313.jpg 289 261 407 261 407 277 289 277 152 260 265 260 265 276 152 276 10 257 74 259 74 276 10 274 32 230 134 230 134 245 32 245 34 215 218 215 218 228 34 228 32 199 148 199 148 214 32 214 31 181 217 182 217 199 31 198 34 169 107 169 107 182 34 182 34 153 126 153 126 166 34 166 33 136 144 137 144 150 33 149 34 122 177 122 177 135 34 135 32 104 178 104 178 120 32 120 32 91 102 91 102 104 32 104 33 75 121 75 121 88 33 88 32 60 121 60 121 73 32 73 34 44 121 44 121 57 34 57 31 28 144 28 144 43 31 43 177 20 415 15 416 51 178 56 24 10 152 10 152 26 24 26 +../../inference/ch_det_data_50/all-sum-510/00134770.jpg 386 645 457 645 457 658 386 658 406 618 486 616 486 634 406 636 111 533 272 530 272 550 111 553 110 501 445 496 445 516 110 521 110 469 445 465 445 485 110 489 110 438 446 433 446 453 110 458 109 407 445 403 445 423 109 427 151 375 443 372 443 392 151 395 183 336 371 334 371 358 183 360 73 96 517 101 516 220 72 215 +../../inference/ch_det_data_50/all-sum-510/00145943.jpg 390 243 751 274 735 454 375 423 88 90 302 90 302 121 88 121 43 40 329 37 329 78 43 81 +../../inference/ch_det_data_50/all-sum-510/00147605.jpg 800 613 878 613 878 627 800 627 514 605 786 604 786 629 514 630 116 521 226 521 226 561 116 561 252 522 309 522 309 558 252 558 713 500 902 503 902 539 713 536 254 501 296 501 296 519 254 519 345 479 475 479 475 517 345 517 251 483 296 483 296 501 251 501 350 456 447 456 447 471 350 471 143 442 203 442 203 469 143 469 727 370 880 370 880 422 727 422 526 369 684 369 684 421 526 421 140 367 490 367 490 423 140 423 742 313 872 313 872 338 742 338 798 155 888 155 888 192 798 192 272 140 457 140 457 161 272 161 737 114 895 118 894 158 736 155 107 110 206 110 206 131 107 131 268 92 464 94 464 134 268 131 +../../inference/ch_det_data_50/all-sum-510/00150341.jpg 98 640 300 640 300 664 98 664 113 615 289 615 289 633 113 633 82 591 320 590 320 611 82 612 30 563 315 561 315 582 30 584 30 513 169 513 169 531 30 531 32 488 111 488 111 506 32 506 357 458 465 461 464 486 356 483 26 458 271 459 271 483 26 482 338 438 423 442 422 461 337 457 64 437 145 437 145 455 64 455 205 414 293 414 293 436 205 436 318 407 442 411 441 439 317 435 42 404 176 407 176 435 42 432 28 381 137 381 137 405 28 405 +../../inference/ch_det_data_50/all-sum-510/00150669.jpg 647 698 683 698 683 718 647 718 515 684 551 684 551 721 515 721 650 687 680 687 680 702 650 702 920 673 938 673 938 686 920 686 518 670 548 670 548 690 518 690 785 670 808 670 808 688 785 688 590 670 608 670 608 688 590 688 732 665 745 679 732 692 718 679 652 668 680 668 680 689 652 689 271 665 423 665 423 690 271 690 45 666 
110 666 110 688 45 688 130 664 205 664 205 690 130 690 781 628 812 628 812 663 781 663 643 626 687 626 687 666 643 666 514 627 550 627 550 665 514 665 654 617 673 617 673 629 654 629 521 617 541 617 541 629 521 629 858 617 868 617 868 628 858 628 727 617 736 617 736 628 727 628 920 614 940 614 940 631 920 631 785 614 807 614 807 631 785 631 371 603 421 603 421 620 371 620 83 600 216 603 216 624 83 620 46 602 72 602 72 623 46 623 780 569 817 573 813 610 776 606 922 559 936 559 936 575 922 575 856 559 869 559 869 575 856 575 61 552 411 552 411 569 61 569 61 531 117 533 117 547 61 545 859 527 868 527 868 539 859 539 923 525 936 525 936 542 923 542 787 524 807 524 807 540 787 540 526 526 536 526 536 536 526 536 261 511 396 511 396 528 261 528 120 512 246 512 246 526 120 526 47 512 120 512 120 527 47 527 753 491 829 491 829 508 753 508 636 491 712 491 712 508 636 508 517 491 593 491 593 508 517 508 84 448 125 448 125 463 84 463 221 448 238 448 238 462 221 462 682 444 869 444 869 461 682 461 561 444 667 444 667 461 561 461 489 445 545 445 545 459 489 459 183 437 209 437 209 459 183 459 52 429 73 437 64 464 42 456 222 430 278 430 278 445 222 445 86 430 145 430 145 445 86 445 505 382 617 381 617 398 505 399 701 380 758 380 758 398 701 398 307 371 365 371 365 386 307 386 90 371 168 371 168 386 90 386 686 334 821 334 821 352 686 352 496 333 659 333 659 350 496 350 207 314 245 314 245 333 207 333 497 287 642 287 642 304 497 304 670 286 804 286 804 304 670 304 668 239 817 239 817 257 668 257 495 239 644 239 644 257 495 257 668 193 816 193 816 209 668 209 496 192 644 192 644 208 496 208 668 144 816 144 816 161 668 161 497 144 646 144 646 161 497 161 488 102 546 102 546 121 488 121 845 21 900 21 900 43 845 43 25 18 702 18 702 39 25 39 896 10 997 14 996 46 895 42 +../../inference/ch_det_data_50/all-sum-510/00152568.jpg 2 250 285 252 285 281 2 279 195 231 255 231 255 241 195 241 198 158 282 164 277 230 193 224 177 148 251 148 251 161 177 161 +../../inference/ch_det_data_50/all-sum-510/00155628.jpg 147 898 506 901 506 925 147 922 519 892 562 894 561 912 518 910 59 884 83 884 83 895 59 895 148 877 505 881 505 902 148 897 523 833 641 837 640 858 522 854 68 832 187 834 187 855 68 853 245 554 468 554 468 570 245 570 307 506 405 508 405 526 307 523 243 481 460 483 460 504 243 502 250 420 460 422 460 454 250 452 193 377 518 379 518 410 193 408 473 194 625 194 625 212 473 212 70 127 643 129 643 163 70 161 478 39 599 35 602 101 481 105 67 23 136 14 140 44 71 54 +../../inference/ch_det_data_50/all-sum-510/00173364.jpg 7 176 59 176 59 201 7 201 135 118 196 118 196 135 135 135 38 75 87 75 87 105 38 105 249 19 313 19 313 38 249 38 19 15 105 15 105 40 19 40 +../../inference/ch_det_data_50/all-sum-510/00175503.jpg 39 256 503 252 504 362 40 366 49 198 351 175 357 253 55 276 +../../inference/ch_det_data_50/all-sum-510/00193218.jpg 282 373 411 373 411 389 282 389 170 373 223 373 223 390 170 390 108 373 162 373 162 390 108 390 276 357 358 357 358 371 276 371 169 357 222 357 222 371 169 371 106 356 175 356 175 373 106 373 408 356 493 356 493 370 408 370 24 185 64 185 64 203 24 203 500 184 558 184 558 201 500 201 379 185 421 183 422 200 380 202 283 184 311 184 311 202 283 202 173 185 197 185 197 201 173 201 498 163 544 163 544 177 498 177 379 162 412 162 412 177 379 177 261 161 303 161 303 178 261 178 174 161 231 161 231 178 174 178 24 161 80 161 80 178 24 178 385 139 489 139 489 155 385 155 26 137 133 137 133 153 26 153 442 115 538 117 538 134 442 132 345 117 406 117 406 131 345 131 259 117 303 117 303 131 259 131 28 112 229 
114 229 132 28 130 130 90 395 93 395 110 130 107 560 81 585 81 585 109 560 109 +../../inference/ch_det_data_50/all-sum-510/00195033.jpg 221 302 240 302 240 309 221 309 487 262 534 264 533 282 486 280 125 249 194 249 194 285 125 285 336 248 364 248 364 268 336 268 317 221 381 223 381 240 317 238 431 224 450 224 450 236 431 236 360 202 539 202 539 218 360 218 87 199 148 201 148 218 87 216 371 181 450 181 450 195 371 195 327 180 354 180 354 194 327 194 94 178 241 178 241 195 94 195 431 159 559 159 559 175 431 175 128 148 289 149 289 166 128 165 35 145 75 148 74 163 34 160 487 146 501 146 501 153 487 153 100 143 122 143 122 154 100 154 370 127 505 126 505 140 370 141 98 125 194 125 194 139 98 139 320 125 338 125 338 136 320 136 35 121 78 121 78 135 35 135 322 104 338 104 338 116 322 116 371 101 503 101 503 117 371 117 348 103 362 103 362 115 348 115 37 101 81 101 81 114 37 114 97 98 207 99 207 116 97 115 305 89 317 89 317 97 305 97 346 86 364 86 364 97 346 97 319 85 342 85 342 100 319 100 357 82 515 80 515 96 357 98 40 81 90 81 90 94 40 94 92 77 242 78 242 95 92 94 312 65 394 65 394 79 312 79 240 64 290 64 290 78 240 78 183 52 222 52 222 66 183 66 468 47 547 47 547 61 468 61 422 34 438 34 438 55 422 55 464 29 551 29 551 43 464 43 206 19 330 21 330 42 206 40 +../../inference/ch_det_data_50/all-sum-510/00208502.jpg 556 535 630 535 630 569 556 569 204 537 284 537 284 552 204 552 142 512 191 512 191 526 142 526 248 511 309 511 309 525 248 525 41 499 118 499 118 520 41 520 465 490 558 490 558 510 465 510 666 489 680 493 677 505 662 501 724 490 739 490 739 503 724 503 40 450 118 448 118 469 40 471 173 448 237 448 237 465 173 465 93 403 121 403 121 424 93 424 38 403 63 403 63 424 38 424 214 392 232 405 220 422 203 409 39 357 58 357 58 375 39 375 92 355 121 355 121 375 92 375 187 339 248 337 249 363 188 365 458 319 551 317 551 338 458 340 457 271 553 271 553 292 457 292 562 271 737 267 737 288 562 292 516 225 548 225 548 245 516 245 620 185 675 185 675 202 620 202 456 130 550 128 550 149 456 151 571 104 789 98 789 121 571 127 121 46 291 46 291 99 121 99 536 36 710 36 710 92 536 92 +../../inference/ch_det_data_50/all-sum-510/00224225.jpg 135 426 157 426 157 449 135 449 199 402 480 408 479 461 198 455 200 225 474 225 474 394 200 394 130 264 174 264 174 281 130 281 343 205 458 205 458 232 343 232 197 186 349 194 346 242 194 234 7 41 160 39 161 115 8 117 +../../inference/ch_det_data_50/all-sum-510/00227746.jpg 142 230 210 230 210 240 142 240 71 230 130 230 130 240 71 240 215 228 386 228 386 240 215 240 290 208 347 208 347 224 290 224 142 179 165 181 162 209 139 208 172 179 250 179 250 195 172 195 171 152 347 152 347 167 171 167 143 110 279 112 279 135 143 132 202 53 387 53 387 69 202 69 141 47 193 47 193 64 141 64 +../../inference/ch_det_data_50/all-sum-510/00229605.jpg 742 528 882 528 882 545 742 545 232 497 590 496 590 524 232 525 5 496 229 496 229 524 5 524 733 494 884 497 884 522 733 519 605 493 718 488 719 517 606 522 2 242 865 227 866 291 3 305 477 26 884 26 884 77 477 77 +../../inference/ch_det_data_50/all-sum-510/00233011.jpg 61 225 293 225 293 243 61 243 11 218 43 218 43 252 11 252 60 177 120 177 120 196 60 196 11 169 44 169 44 204 11 204 59 127 149 129 149 148 59 146 11 123 45 123 45 156 11 156 124 86 239 86 239 104 124 104 147 49 218 49 218 67 147 67 257 44 354 47 353 71 256 68 8 47 54 47 54 69 8 69 275 10 346 10 346 32 275 32 26 9 75 9 75 32 26 32 +../../inference/ch_det_data_50/all-sum-510/00233625.jpg 370 395 635 397 635 445 370 443 67 210 935 204 936 325 68 331 
+../../inference/ch_det_data_50/all-sum-510/00233634.jpg 213 637 264 637 264 706 213 706 522 634 572 634 572 697 522 697 641 522 684 522 684 570 641 570 95 514 155 514 155 592 95 592 754 394 762 394 762 403 754 403 677 362 730 360 733 432 679 433 53 360 109 360 109 436 53 436 77 207 157 207 157 282 77 282 642 204 695 204 695 274 642 274 208 88 262 85 266 165 212 168 362 47 428 44 432 117 366 120 +../../inference/ch_det_data_50/all-sum-510/00234400.jpg 156 419 739 419 739 439 156 439 157 393 653 393 653 412 157 412 38 390 129 390 129 413 38 413 156 339 307 342 307 365 156 362 36 342 125 342 125 363 36 363 519 293 705 293 705 316 519 316 393 290 485 288 485 316 393 318 156 291 271 291 271 315 156 315 35 291 127 291 127 315 35 315 155 242 360 242 360 269 155 269 34 242 83 242 83 270 34 270 27 150 159 150 159 177 27 177 280 96 507 96 507 113 280 113 313 44 477 47 476 90 312 87 516 50 664 52 664 68 516 67 485 17 708 15 708 45 485 47 +../../inference/ch_det_data_50/all-sum-510/00234883.jpg 64 122 318 117 319 193 65 197 71 118 122 118 122 132 71 132 381 62 506 61 506 75 381 76 57 26 368 26 368 116 57 116 385 26 503 23 503 47 385 50 +../../inference/ch_det_data_50/all-sum-510/test_add_0.jpg 311 521 391 521 391 534 311 534 277 499 426 499 426 516 277 516 259 445 438 445 438 461 259 461 210 426 487 426 487 443 210 443 244 385 460 385 460 411 244 411 220 327 476 327 476 373 220 373 205 204 494 208 493 279 204 275 264 163 423 165 423 198 264 196 15 17 203 15 203 45 15 47 +../../inference/ch_det_data_50/all-sum-510/test_add_1.png +../../inference/ch_det_data_50/all-sum-510/test_add_10.png 155 123 187 123 187 174 155 174 160 105 184 105 184 131 160 131 116 45 155 44 158 176 119 176 63 30 102 31 99 172 60 171 +../../inference/ch_det_data_50/all-sum-510/test_add_11.jpg 1388 755 1486 755 1486 794 1388 794 1011 752 1210 752 1210 802 1011 802 681 752 879 752 879 801 681 801 355 750 568 745 570 796 356 801 76 748 266 743 268 796 78 801 600 645 1155 645 1155 706 600 706 600 562 1151 553 1151 614 600 622 596 478 1070 470 1070 529 596 537 595 390 1095 385 1095 444 595 448 600 303 1061 303 1061 362 600 362 353 180 1521 180 1521 265 353 265 59 40 261 40 261 91 59 91 1303 39 1495 39 1495 90 1303 90 971 37 1173 32 1175 83 973 88 668 37 864 32 866 83 670 88 361 32 561 32 561 88 361 88 +../../inference/ch_det_data_50/all-sum-510/test_add_12.jpg 9 590 140 592 140 615 9 613 107 520 908 524 908 571 107 566 632 448 905 445 905 481 632 484 110 445 468 447 468 487 110 485 580 303 682 301 683 351 581 353 368 257 568 262 565 361 364 355 61 83 856 85 856 164 61 162 +../../inference/ch_det_data_50/all-sum-510/test_add_13.jpg 68 94 117 97 116 115 67 112 +../../inference/ch_det_data_50/all-sum-510/test_add_14.jpg 28 94 238 92 238 130 28 132 27 50 241 48 241 88 27 90 +../../inference/ch_det_data_50/all-sum-510/test_add_15.jpg 140 251 354 251 354 268 140 268 203 212 407 217 407 234 203 229 104 210 194 212 194 229 104 227 153 155 287 159 287 175 153 172 143 134 307 140 307 157 143 150 106 136 147 136 147 149 106 149 106 101 278 107 277 126 105 119 106 70 247 77 246 97 105 90 106 37 211 40 210 64 105 61 +../../inference/ch_det_data_50/all-sum-510/test_add_16.jpg 380 740 750 740 750 780 380 780 360 700 472 700 472 728 360 728 1550 698 1580 698 1580 750 1550 750 1256 694 1444 694 1444 722 1256 722 1242 659 1452 659 1452 690 1242 690 384 643 672 643 672 682 384 682 1226 623 1474 621 1474 655 1226 657 356 599 582 599 582 631 356 631 1198 587 1496 587 1496 619 1198 619 1164 553 1534 553 1534 585 1164 585 378 549 642 549 642 589 378 589 
354 500 520 500 520 540 354 540 772 258 1128 258 1128 303 772 303 372 208 508 208 508 303 372 303 774 208 1092 214 1092 260 774 254 +../../inference/ch_det_data_50/all-sum-510/test_add_17.jpg 319 255 394 257 394 271 319 269 306 236 407 238 407 257 306 255 306 221 413 226 412 243 305 237 93 134 387 140 386 210 92 204 69 92 401 100 401 127 69 118 66 74 225 77 225 95 66 92 64 58 227 60 227 77 64 75 +../../inference/ch_det_data_50/all-sum-510/test_add_18.jpg 153 908 616 914 616 935 153 930 464 786 718 788 718 816 464 813 552 750 666 755 665 792 551 788 117 538 190 538 190 572 117 572 115 472 676 484 675 530 114 518 119 427 670 439 670 471 119 459 119 374 676 379 676 411 119 406 555 261 677 262 677 280 555 279 164 258 336 258 336 275 164 275 342 194 457 196 457 221 342 219 307 172 490 172 490 190 307 190 252 125 540 129 540 171 252 168 345 90 488 92 488 110 345 108 283 40 569 48 567 84 282 76 235 30 268 30 268 64 235 64 +../../inference/ch_det_data_50/all-sum-510/test_add_19.jpg 22 293 44 293 44 304 22 304 62 291 106 291 106 305 62 305 61 279 107 279 107 291 61 291 218 278 247 278 247 292 218 292 176 278 210 278 210 291 176 291 141 275 166 275 166 307 141 307 7 266 20 266 20 278 7 278 219 264 245 264 245 279 219 279 60 263 133 263 133 279 60 279 22 264 49 264 49 279 22 279 218 251 250 251 250 266 218 266 63 251 133 251 133 264 63 264 22 250 45 250 45 265 22 265 7 251 20 251 20 263 7 263 8 240 18 240 18 249 8 249 61 236 115 236 115 252 61 252 23 234 49 237 47 253 21 250 210 235 246 235 246 252 210 252 143 236 166 236 166 252 143 252 493 224 533 224 533 241 493 241 334 224 355 224 355 239 334 239 287 224 315 224 315 239 287 239 61 224 114 224 114 238 61 238 7 226 18 226 18 235 7 235 219 223 250 223 250 237 219 237 141 224 167 221 169 235 143 238 23 223 49 223 49 239 23 239 494 212 526 212 526 225 494 225 418 211 439 211 439 226 418 226 335 211 400 211 400 224 335 224 291 211 322 211 322 224 291 224 220 211 251 211 251 224 220 224 144 212 167 212 167 223 144 223 60 211 115 209 115 222 60 224 24 210 50 210 50 224 24 224 336 197 384 197 384 211 336 211 63 198 89 198 89 209 63 209 492 195 542 195 542 213 492 213 443 201 456 194 464 207 451 215 219 195 257 195 257 213 219 213 177 196 207 196 207 210 177 210 144 197 158 197 158 210 144 210 23 196 44 196 44 212 23 212 416 193 440 193 440 213 416 213 63 185 134 185 134 197 63 197 335 184 400 184 400 197 335 197 455 180 466 191 456 201 444 190 289 187 309 180 315 194 295 202 219 183 256 183 256 197 219 197 140 183 160 183 160 198 140 198 493 182 519 182 519 197 493 197 426 178 441 191 426 204 412 190 32 177 46 189 32 202 19 189 176 180 193 180 193 197 176 197 335 170 402 170 402 186 335 186 491 169 521 169 521 186 491 186 426 163 441 176 426 191 412 179 292 170 315 170 315 186 292 186 219 170 252 170 252 185 219 185 177 171 189 171 189 185 177 185 62 170 127 168 127 182 62 184 454 167 464 177 455 186 445 176 142 169 164 169 164 185 142 185 492 158 525 158 525 172 492 172 399 159 436 159 436 169 399 169 334 157 403 157 403 170 334 170 295 157 327 157 327 171 295 171 219 156 253 156 253 170 219 170 143 156 164 156 164 171 143 171 60 157 127 155 127 169 60 171 491 142 543 142 543 158 491 158 449 143 480 143 480 157 449 157 334 142 441 142 441 157 334 157 294 143 328 143 328 157 294 157 219 143 254 143 254 157 219 157 61 143 105 143 105 156 61 156 142 141 164 141 164 157 142 157 17 150 31 136 45 149 30 162 285 133 293 133 293 141 285 141 177 132 193 132 193 145 177 145 335 130 389 130 389 143 335 143 491 129 528 129 528 143 491 143 449 129 479 129 479 143 449 
143 417 130 437 130 437 142 417 142 291 129 323 129 323 143 291 143 217 130 256 128 257 143 218 145 61 129 97 129 97 143 61 143 143 128 161 128 161 145 143 145 29 123 45 132 34 149 18 139 492 117 537 117 537 130 492 130 335 117 389 117 389 130 335 130 218 118 256 118 256 128 218 128 450 116 480 116 480 130 450 130 417 116 440 116 440 131 417 131 177 116 210 116 210 130 177 130 143 116 164 116 164 131 143 131 60 115 90 115 90 132 60 132 17 121 32 110 45 124 29 136 490 105 527 105 527 115 490 115 448 105 479 105 479 115 448 115 419 106 436 106 436 114 419 114 292 105 321 105 321 116 292 116 218 105 244 105 244 115 218 115 175 105 205 105 205 115 175 115 143 105 163 105 163 116 143 116 334 104 373 104 373 115 334 115 61 104 88 104 88 115 61 115 483 89 523 89 523 99 483 99 330 87 381 87 381 100 330 100 274 87 336 87 336 100 274 100 213 87 248 87 248 100 213 100 5 85 103 85 103 101 5 101 414 64 464 64 464 78 414 78 287 64 335 64 335 78 287 78 155 62 208 62 208 79 155 79 414 47 525 48 525 64 414 63 287 48 377 48 377 64 287 64 157 48 270 48 270 63 157 63 415 34 483 34 483 48 415 48 287 33 338 33 338 50 287 50 26 34 45 34 45 52 26 52 155 32 207 32 207 49 155 49 55 32 115 31 116 51 56 53 411 2 529 2 529 19 411 19 144 2 346 0 346 17 144 19 +../../inference/ch_det_data_50/all-sum-510/test_add_2.jpg 251 404 535 404 535 430 251 430 302 339 483 339 483 385 302 385 302 303 482 303 482 326 302 326 573 217 693 217 693 240 573 240 331 216 455 214 455 240 331 242 108 212 182 214 181 244 107 242 313 98 672 99 672 121 313 120 311 60 585 61 585 87 311 86 +../../inference/ch_det_data_50/all-sum-510/test_add_20.jpg 30 345 607 345 607 372 30 372 216 292 512 292 512 323 216 323 472 270 527 270 527 287 472 287 216 266 292 266 292 287 216 287 218 238 486 238 486 265 218 265 220 215 305 215 305 236 220 236 399 190 419 190 419 207 399 207 221 185 343 185 343 209 221 209 220 160 289 160 289 182 220 182 374 120 477 122 477 147 374 145 221 122 367 120 367 145 221 147 217 80 354 82 354 117 217 115 439 33 607 33 607 60 439 60 67 15 400 15 400 46 67 46 +../../inference/ch_det_data_50/all-sum-510/test_add_3.jpg 168 326 339 324 339 341 168 343 169 286 309 288 309 314 169 312 169 219 324 219 324 235 169 235 339 219 451 216 451 232 339 235 168 200 373 200 373 216 168 216 168 180 418 180 418 197 168 197 169 147 417 147 417 165 169 165 170 117 419 117 419 141 170 141 325 62 480 62 480 93 325 93 170 62 310 59 311 91 171 94 +../../inference/ch_det_data_50/all-sum-510/test_add_4.png +../../inference/ch_det_data_50/all-sum-510/test_add_5.png 47 162 109 162 109 176 47 176 51 119 170 119 170 136 51 136 49 100 166 100 166 119 49 119 51 83 166 83 166 102 51 102 50 66 169 66 169 85 50 85 49 47 149 46 149 68 49 69 5 9 81 9 81 43 5 43 +../../inference/ch_det_data_50/all-sum-510/test_add_6.jpg 122 222 220 226 219 253 121 249 160 176 185 180 182 200 157 196 +../../inference/ch_det_data_50/all-sum-510/test_add_7.jpg 47 937 175 933 176 964 48 967 224 870 632 873 632 955 224 952 53 743 640 743 640 793 53 793 148 673 546 676 546 723 148 720 71 502 636 502 636 604 71 604 54 264 660 274 657 446 51 436 59 173 534 173 534 241 59 241 502 173 646 173 646 239 502 239 +../../inference/ch_det_data_50/all-sum-510/test_add_8.jpg 249 584 455 578 456 608 250 614 106 531 458 524 458 561 107 568 334 492 385 492 385 509 334 509 26 306 356 296 357 321 27 331 21 258 447 250 447 275 21 283 77 208 447 204 447 226 77 230 158 20 322 28 319 82 155 74 +../../inference/ch_det_data_50/all-sum-510/test_add_9.png 264 684 486 684 486 697 264 697 194 666 556 666 556 682 194 
682 152 595 600 595 600 608 152 608 211 577 542 577 542 590 211 590 131 558 616 558 616 571 131 571 84 540 665 540 665 553 84 553 95 521 654 521 654 536 95 536 361 448 390 448 390 461 361 461 236 375 515 375 515 391 236 391 174 353 575 353 575 369 174 369 342 279 409 281 409 298 342 296 254 203 493 203 493 220 254 220 diff --git a/PTDN/results/cpp_ppocr_det_mobile_results_fp32.txt b/PTDN/results/cpp_ppocr_det_mobile_results_fp32.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb33ce1becd834b4d3a0948f448e2cba6fd54769 --- /dev/null +++ b/PTDN/results/cpp_ppocr_det_mobile_results_fp32.txt @@ -0,0 +1,50 @@ +../../inference/ch_det_data_50/all-sum-510/00008790.jpg 208 404 282 404 282 421 208 421 58 396 107 396 107 413 58 413 197 387 296 387 296 403 197 403 161 389 174 389 174 402 161 402 34 378 134 378 134 394 34 394 323 377 329 377 329 382 323 382 199 370 292 370 292 383 199 383 216 309 274 309 274 325 216 325 161 304 173 304 173 315 161 315 370 301 437 301 437 317 370 317 30 301 135 300 135 316 30 317 221 291 270 291 270 308 221 308 58 224 106 224 106 238 58 238 216 222 274 222 274 239 216 239 161 217 174 217 174 229 161 229 33 205 133 205 133 221 33 221 221 204 270 204 270 221 221 221 73 145 385 145 385 162 73 162 52 119 119 119 119 135 52 135 72 50 296 50 296 66 72 66 54 15 118 15 118 32 54 32 +../../inference/ch_det_data_50/all-sum-510/00018946.jpg 439 327 476 327 476 341 439 341 85 284 142 284 142 308 85 308 300 278 380 278 380 299 300 299 195 262 287 275 284 299 192 286 196 196 454 218 452 244 194 222 343 182 376 182 376 193 343 193 198 162 341 169 340 195 197 188 176 130 381 145 380 165 175 150 176 100 417 118 415 148 174 130 +../../inference/ch_det_data_50/all-sum-510/00034387.jpg 263 459 741 459 741 485 263 485 346 415 421 415 421 444 346 444 544 418 568 418 568 442 544 442 684 415 712 415 712 444 684 444 173 413 228 413 228 444 173 444 872 412 910 412 910 447 872 447 55 415 76 415 76 443 55 443 855 371 927 371 927 401 855 401 347 371 420 371 420 400 347 400 672 370 725 370 725 402 672 402 537 371 571 371 571 401 537 401 136 364 230 367 229 403 135 400 55 370 76 370 76 399 55 399 856 328 927 328 927 358 856 358 350 328 420 328 420 358 350 358 672 326 725 326 725 358 672 358 539 327 571 327 571 359 539 359 170 326 229 323 231 357 171 359 56 328 76 328 76 358 56 358 297 326 316 326 316 334 297 334 854 284 927 284 927 314 854 314 672 284 725 284 725 315 672 315 344 284 431 282 432 315 345 317 537 283 570 283 570 314 537 314 170 281 228 281 228 315 170 315 55 285 75 285 75 314 55 314 856 241 927 241 927 270 856 270 346 240 464 240 464 271 346 271 154 241 228 241 228 271 154 271 672 240 726 240 726 271 672 271 530 240 573 240 573 272 530 272 55 241 76 241 76 270 55 270 854 196 927 198 926 228 853 225 672 197 728 197 728 228 672 228 342 199 439 194 441 224 344 230 175 196 229 196 229 226 175 226 55 199 75 199 75 228 55 228 526 193 578 193 578 228 526 228 347 154 420 154 420 182 347 182 853 153 927 153 927 181 853 181 175 153 228 153 228 184 175 184 668 152 725 152 725 182 668 182 536 153 572 153 572 183 536 183 55 155 76 155 76 183 55 183 347 109 420 109 420 138 347 138 172 109 229 109 229 140 172 140 544 111 565 111 565 138 544 138 51 110 77 110 77 140 51 140 639 105 730 105 730 141 639 141 815 101 929 109 927 141 813 133 812 65 953 65 953 93 812 93 305 64 447 66 447 94 305 92 671 65 725 65 725 95 671 95 173 64 229 66 228 96 172 94 37 64 91 66 90 98 36 96 527 63 581 63 581 95 527 95 333 18 671 18 671 45 333 45 +../../inference/ch_det_data_50/all-sum-510/00037951.jpg 
432 973 552 977 552 994 432 991 431 931 554 931 554 970 431 970 29 520 101 520 101 546 29 546 29 441 146 441 146 465 29 465 233 333 328 331 328 356 233 358 121 250 439 250 439 287 121 287 180 205 380 205 380 229 180 229 257 103 323 121 305 184 239 165 35 57 147 57 147 82 35 82 +../../inference/ch_det_data_50/all-sum-510/00044782.jpg 222 214 247 214 247 230 222 230 162 214 183 214 183 231 162 231 122 190 216 190 216 203 122 203 90 82 252 82 252 100 90 100 70 61 279 61 279 78 70 78 103 14 244 14 244 46 103 46 +../../inference/ch_det_data_50/all-sum-510/00067516.jpg 139 806 596 807 596 824 139 823 46 782 699 782 699 800 46 800 577 749 669 749 669 766 577 766 353 748 397 748 397 769 353 769 220 749 261 749 261 767 220 767 475 748 502 748 502 769 475 769 68 746 134 749 133 766 67 763 574 680 670 680 670 700 574 700 474 680 519 680 519 701 474 701 352 680 397 680 397 701 352 701 68 679 134 682 133 700 67 697 219 678 245 681 242 702 216 698 575 614 669 614 669 633 575 633 68 611 134 614 133 633 67 630 474 613 501 613 501 633 474 633 353 613 379 613 379 634 353 634 219 612 245 612 245 633 219 633 576 546 669 546 669 566 576 566 474 545 519 545 519 566 474 566 351 544 381 544 381 567 351 567 219 545 245 545 245 566 219 566 67 541 134 544 133 565 66 562 67 477 134 480 133 501 66 498 584 479 666 479 666 499 584 499 474 478 519 478 519 500 474 500 352 478 397 478 397 500 352 500 218 477 246 477 246 502 218 502 579 424 666 427 665 451 578 448 345 428 411 428 411 449 345 449 66 425 151 427 151 451 66 449 473 427 515 427 515 450 473 450 218 427 259 427 259 450 218 450 282 396 479 397 479 420 282 419 83 316 667 316 667 335 83 335 64 277 666 277 666 292 64 292 456 209 585 209 585 226 456 226 311 208 373 208 373 227 311 227 163 208 227 208 227 227 163 227 504 150 541 150 541 168 504 168 264 47 485 47 485 69 264 69 +../../inference/ch_det_data_50/all-sum-510/00088568.jpg 57 443 119 443 119 456 57 456 309 413 744 413 744 430 309 430 309 375 737 375 737 392 309 392 415 337 559 337 559 351 415 351 307 322 674 321 674 338 307 339 275 292 349 294 349 313 275 311 52 285 210 285 210 301 52 301 273 262 420 262 420 279 273 279 55 262 249 262 249 279 55 279 669 247 697 247 697 262 669 262 601 247 629 247 629 262 601 262 531 247 559 247 559 262 531 262 461 247 489 247 489 262 461 262 277 247 310 247 310 261 277 261 55 240 142 240 142 254 55 254 276 230 400 230 400 246 276 246 741 227 749 237 741 246 732 237 665 230 701 230 701 245 665 245 598 230 631 230 631 245 598 245 527 230 563 230 563 245 527 245 458 230 493 230 493 245 458 245 52 213 212 215 212 233 52 231 732 214 747 214 747 227 732 227 662 212 706 212 706 230 662 230 594 213 638 213 638 227 594 227 522 213 570 213 570 227 522 227 453 213 497 213 497 227 453 227 278 213 352 213 352 227 278 227 734 198 748 198 748 210 734 210 667 196 702 196 702 210 667 210 599 196 633 196 633 211 599 211 527 196 564 196 564 210 527 210 459 196 493 196 493 210 459 210 276 194 418 195 418 212 276 211 54 190 241 190 241 207 54 207 664 179 705 179 705 194 664 194 278 178 352 180 352 195 278 193 733 179 747 179 747 194 733 194 596 178 635 178 635 193 596 193 523 177 567 177 567 195 523 195 456 178 495 178 495 193 456 193 55 170 142 170 142 184 55 184 733 164 748 164 748 176 733 176 664 162 705 162 705 176 664 176 597 162 635 162 635 176 597 176 525 162 566 162 566 176 525 176 456 162 494 162 494 176 456 176 277 160 399 160 399 176 277 176 54 146 149 146 149 161 54 161 452 145 497 145 497 160 452 160 729 144 748 144 748 162 729 162 662 143 706 143 706 161 662 161 595 144 636 144 636 
159 595 159 521 143 566 141 567 159 522 161 277 143 310 143 310 159 277 159 275 120 430 120 430 140 275 140 50 119 234 120 234 140 50 139 402 90 703 90 703 107 402 107 46 78 282 78 282 98 46 98 324 67 745 68 745 86 324 85 666 47 743 47 743 64 666 64 295 47 435 47 435 63 295 63 64 30 232 27 233 65 65 68 +../../inference/ch_det_data_50/all-sum-510/00091741.jpg 46 335 87 335 87 360 46 360 98 209 258 209 258 232 98 232 101 189 258 190 258 206 101 205 87 99 268 97 269 184 88 186 92 45 266 53 263 117 89 109 89 10 258 12 258 38 89 36 +../../inference/ch_det_data_50/all-sum-510/00105313.jpg 289 261 407 261 407 277 289 277 152 260 265 260 265 276 152 276 10 257 74 259 74 276 10 274 32 230 134 230 134 245 32 245 34 215 218 215 218 228 34 228 32 199 148 199 148 214 32 214 31 181 217 182 217 199 31 198 34 169 107 169 107 182 34 182 34 153 126 153 126 166 34 166 33 136 144 137 144 150 33 149 34 122 177 122 177 135 34 135 32 104 178 104 178 120 32 120 32 91 102 91 102 104 32 104 33 75 121 75 121 88 33 88 32 60 121 60 121 73 32 73 34 44 121 44 121 57 34 57 31 28 144 28 144 43 31 43 177 20 415 15 416 51 178 56 24 10 152 10 152 26 24 26 +../../inference/ch_det_data_50/all-sum-510/00134770.jpg 386 645 457 645 457 658 386 658 406 618 486 617 486 635 406 636 111 533 272 530 272 550 111 553 110 501 445 496 445 516 110 521 110 469 445 465 445 485 110 489 110 438 446 433 446 453 110 458 109 407 445 403 445 423 109 427 151 375 443 372 443 392 151 395 183 336 371 334 371 358 183 360 224 307 272 308 272 318 224 317 73 96 517 101 516 220 72 215 +../../inference/ch_det_data_50/all-sum-510/00145943.jpg 390 243 751 274 735 454 375 423 88 90 302 90 302 121 88 121 43 40 329 37 329 78 43 81 +../../inference/ch_det_data_50/all-sum-510/00147605.jpg 514 605 786 604 786 629 514 630 116 521 226 521 226 561 116 561 252 522 309 522 309 558 252 558 713 500 902 503 902 539 713 536 254 501 296 501 296 519 254 519 345 479 475 479 475 517 345 517 251 483 296 483 296 501 251 501 350 456 447 456 447 471 350 471 143 442 203 442 203 469 143 469 727 370 880 370 880 422 727 422 526 369 684 369 684 421 526 421 140 367 490 367 490 423 140 423 742 313 872 313 872 338 742 338 798 155 888 155 888 192 798 192 272 140 457 140 457 161 272 161 737 114 895 118 894 158 736 155 107 110 206 110 206 131 107 131 268 92 464 94 464 134 268 131 +../../inference/ch_det_data_50/all-sum-510/00150341.jpg 99 643 300 643 300 664 99 664 113 615 289 615 289 633 113 633 82 591 320 590 320 611 82 612 30 563 315 561 315 582 30 584 30 513 169 513 169 531 30 531 32 488 111 488 111 506 32 506 357 458 465 461 464 486 356 483 26 458 271 459 271 483 26 482 338 438 423 442 422 461 337 457 64 437 145 437 145 455 64 455 205 414 293 414 293 436 205 436 318 407 442 411 441 439 317 435 42 404 176 407 176 435 42 432 28 381 137 381 137 405 28 405 +../../inference/ch_det_data_50/all-sum-510/00150669.jpg 647 698 683 698 683 718 647 718 515 684 551 684 551 721 515 721 650 687 680 687 680 702 650 702 920 673 938 673 938 686 920 686 518 670 548 670 548 690 518 690 785 670 808 670 808 688 785 688 590 670 608 670 608 688 590 688 732 665 745 679 732 692 718 679 652 668 680 668 680 689 652 689 271 665 423 665 423 690 271 690 130 664 205 664 205 690 130 690 44 664 111 664 111 689 44 689 781 628 812 628 812 663 781 663 643 626 687 626 687 666 643 666 514 627 550 627 550 665 514 665 654 617 673 617 673 629 654 629 858 617 868 617 868 628 858 628 727 617 736 617 736 628 727 628 920 614 940 614 940 631 920 631 785 614 807 614 807 631 785 631 371 603 421 603 421 620 371 620 83 600 216 603 216 624 
83 620 46 602 72 602 72 624 46 624 780 569 817 573 813 610 776 606 922 559 936 559 936 575 922 575 856 559 869 559 869 575 856 575 61 552 411 552 411 569 61 569 61 531 117 533 117 547 61 545 859 527 868 527 868 539 859 539 923 525 936 525 936 542 923 542 787 524 807 524 807 540 787 540 526 526 536 526 536 536 526 536 261 511 396 511 396 528 261 528 120 512 246 512 246 526 120 526 47 512 120 512 120 527 47 527 753 491 829 491 829 508 753 508 636 491 712 491 712 508 636 508 517 491 593 491 593 508 517 508 84 448 125 448 125 463 84 463 221 448 238 448 238 462 221 462 682 444 869 444 869 461 682 461 561 444 667 444 667 461 561 461 489 445 545 445 545 459 489 459 183 437 209 437 209 459 183 459 52 429 73 437 64 464 42 456 222 430 278 430 278 445 222 445 86 430 145 430 145 445 86 445 505 382 617 381 617 398 505 399 701 380 758 380 758 398 701 398 307 371 365 371 365 386 307 386 90 371 168 371 168 386 90 386 686 334 821 334 821 352 686 352 496 333 659 333 659 350 496 350 207 314 245 314 245 333 207 333 497 287 642 287 642 304 497 304 670 286 804 286 804 304 670 304 668 239 817 239 817 257 668 257 495 239 644 239 644 257 495 257 668 193 816 193 816 209 668 209 496 193 644 193 644 209 496 209 668 144 816 144 816 161 668 161 497 144 646 144 646 161 497 161 488 102 546 102 546 121 488 121 845 21 900 21 900 43 845 43 25 18 702 18 702 39 25 39 896 10 997 14 996 46 895 42 +../../inference/ch_det_data_50/all-sum-510/00152568.jpg 2 250 285 252 285 281 2 279 195 231 255 231 255 241 195 241 198 158 282 164 277 230 193 224 177 148 251 148 251 161 177 161 +../../inference/ch_det_data_50/all-sum-510/00155628.jpg 147 898 506 901 506 925 147 922 519 892 562 894 561 912 518 910 59 884 83 884 83 895 59 895 148 877 505 881 505 902 148 897 523 833 641 837 640 858 522 854 68 832 187 834 187 855 68 853 245 554 468 554 468 570 245 570 307 506 405 508 405 526 307 523 243 481 460 483 460 504 243 502 250 420 460 422 460 454 250 452 193 377 518 379 518 410 193 408 473 194 625 194 625 212 473 212 70 127 643 129 643 163 70 161 478 39 599 35 602 101 481 105 67 23 136 14 140 44 71 54 +../../inference/ch_det_data_50/all-sum-510/00173364.jpg 7 176 58 176 58 201 7 201 135 118 196 118 196 135 135 135 38 75 87 75 87 105 38 105 249 19 313 19 313 38 249 38 19 15 105 15 105 40 19 40 +../../inference/ch_det_data_50/all-sum-510/00175503.jpg 39 256 503 252 504 362 40 366 49 198 351 175 357 253 55 276 +../../inference/ch_det_data_50/all-sum-510/00193218.jpg 282 373 411 373 411 389 282 389 170 373 223 373 223 390 170 390 108 373 162 373 162 390 108 390 276 357 358 357 358 371 276 371 169 357 222 357 222 371 169 371 106 356 175 356 175 373 106 373 408 356 493 356 493 370 408 370 24 185 64 185 64 203 24 203 500 184 558 184 558 201 500 201 379 185 421 183 422 200 380 202 283 184 311 184 311 202 283 202 173 185 197 185 197 201 173 201 498 163 544 163 544 177 498 177 379 162 412 162 412 177 379 177 261 161 303 161 303 178 261 178 174 161 231 161 231 178 174 178 24 161 80 161 80 178 24 178 385 139 489 139 489 155 385 155 26 137 133 137 133 153 26 153 442 115 538 117 538 134 442 132 345 117 406 117 406 131 345 131 259 117 303 117 303 131 259 131 28 112 229 114 229 132 28 130 130 90 395 93 395 110 130 107 560 81 585 81 585 109 560 109 +../../inference/ch_det_data_50/all-sum-510/00195033.jpg 221 302 240 302 240 309 221 309 487 262 534 264 533 282 486 280 125 249 194 249 194 285 125 285 336 248 364 248 364 268 336 268 317 221 381 223 381 240 317 238 431 224 450 224 450 236 431 236 360 202 539 202 539 218 360 218 87 199 148 201 148 218 87 216 371 181 
450 181 450 195 371 195 327 180 354 180 354 194 327 194 94 178 241 178 241 195 94 195 431 159 559 159 559 175 431 175 128 148 289 149 289 166 128 165 35 145 75 148 74 163 34 160 487 146 501 146 501 153 487 153 100 143 122 143 122 154 100 154 370 127 505 126 505 140 370 141 98 125 194 125 194 139 98 139 320 125 338 125 338 136 320 136 35 121 78 121 78 135 35 135 322 104 338 104 338 116 322 116 371 101 503 101 503 117 371 117 348 103 362 103 362 115 348 115 37 101 81 101 81 114 37 114 97 98 207 99 207 116 97 115 305 89 317 89 317 97 305 97 346 86 364 86 364 97 346 97 319 85 342 85 342 100 319 100 357 82 515 80 515 96 357 98 40 81 90 81 90 94 40 94 92 77 242 78 242 95 92 94 312 65 394 65 394 79 312 79 240 64 290 64 290 78 240 78 183 52 222 52 222 66 183 66 468 47 547 47 547 61 468 61 422 34 438 34 438 55 422 55 464 29 551 29 551 43 464 43 206 19 330 21 330 42 206 40 +../../inference/ch_det_data_50/all-sum-510/00208502.jpg 556 535 630 535 630 569 556 569 204 537 284 537 284 552 204 552 142 512 191 512 191 526 142 526 248 511 309 511 309 525 248 525 41 499 118 499 118 520 41 520 465 490 558 490 558 510 465 510 666 489 680 493 677 505 662 501 724 490 739 490 739 503 724 503 40 450 118 448 118 469 40 471 173 448 237 448 237 465 173 465 93 403 121 403 121 424 93 424 38 403 63 403 63 424 38 424 214 392 232 405 220 422 203 409 39 357 58 357 58 375 39 375 92 355 121 355 121 375 92 375 187 339 248 337 249 363 188 365 458 319 551 317 551 338 458 340 457 271 553 271 553 292 457 292 562 271 737 267 737 288 562 292 516 225 548 225 548 245 516 245 620 185 675 185 675 202 620 202 456 130 550 128 550 149 456 151 571 104 789 98 789 121 571 127 121 46 291 46 291 99 121 99 536 36 710 36 710 92 536 92 +../../inference/ch_det_data_50/all-sum-510/00224225.jpg 135 426 157 426 157 449 135 449 199 402 480 408 479 461 198 455 200 225 474 225 474 394 200 394 130 264 174 264 174 281 130 281 343 205 458 205 458 232 343 232 197 186 349 194 346 242 194 234 7 41 160 39 161 115 8 117 +../../inference/ch_det_data_50/all-sum-510/00227746.jpg 142 230 210 230 210 240 142 240 72 230 130 230 130 240 72 240 215 228 386 228 386 240 215 240 290 208 347 208 347 224 290 224 142 179 165 181 162 209 139 208 171 152 347 152 347 167 171 167 143 110 279 112 279 135 143 132 202 53 387 53 387 69 202 69 141 47 193 47 193 64 141 64 +../../inference/ch_det_data_50/all-sum-510/00229605.jpg 742 528 882 528 882 545 742 545 232 497 590 496 590 524 232 525 5 496 229 496 229 524 5 524 734 494 884 497 884 522 734 519 605 493 718 488 719 517 606 522 2 242 865 227 866 291 3 305 477 26 884 26 884 77 477 77 +../../inference/ch_det_data_50/all-sum-510/00233011.jpg 61 225 293 225 293 243 61 243 11 218 43 218 43 252 11 252 60 177 120 177 120 196 60 196 11 169 44 169 44 204 11 204 59 127 149 129 149 148 59 146 11 123 45 123 45 156 11 156 124 87 239 87 239 105 124 105 147 49 218 49 218 67 147 67 257 44 354 47 353 71 256 68 8 47 54 47 54 69 8 69 275 10 346 10 346 32 275 32 26 9 75 9 75 32 26 32 +../../inference/ch_det_data_50/all-sum-510/00233625.jpg 370 395 635 397 635 445 370 443 67 210 935 204 936 325 68 331 +../../inference/ch_det_data_50/all-sum-510/00233634.jpg 213 637 264 637 264 706 213 706 522 634 572 634 572 697 522 697 641 522 684 522 684 570 641 570 95 514 155 514 155 592 95 592 754 394 762 394 762 403 754 403 677 362 730 360 733 432 679 433 53 360 109 360 109 436 53 436 77 207 157 207 157 282 77 282 642 204 695 204 695 274 642 274 208 88 262 85 266 165 212 168 362 47 428 44 432 117 366 120 +../../inference/ch_det_data_50/all-sum-510/00234400.jpg 156 
419 739 419 739 439 156 439 157 393 653 393 653 412 157 412 38 390 129 390 129 413 38 413 156 339 307 342 307 365 156 362 36 342 125 342 125 363 36 363 519 293 705 293 705 316 519 316 393 290 485 288 485 316 393 318 156 291 271 291 271 315 156 315 35 291 127 291 127 315 35 315 155 242 360 242 360 269 155 269 34 242 83 242 83 270 34 270 27 150 159 150 159 177 27 177 280 96 507 96 507 113 280 113 313 44 477 47 476 90 312 87 516 50 664 52 664 68 516 67 485 17 708 15 708 45 485 47 +../../inference/ch_det_data_50/all-sum-510/00234883.jpg 64 122 318 117 319 193 65 197 71 118 122 118 122 132 71 132 381 62 506 61 506 75 381 76 54 25 369 23 369 116 54 118 385 26 503 23 503 47 385 50 +../../inference/ch_det_data_50/all-sum-510/test_add_0.jpg 311 521 391 521 391 534 311 534 277 499 426 499 426 516 277 516 259 445 438 445 438 461 259 461 210 426 487 426 487 443 210 443 244 385 460 385 460 411 244 411 220 327 476 327 476 373 220 373 205 204 494 208 493 279 204 275 264 163 423 165 423 198 264 196 15 17 203 15 203 45 15 47 +../../inference/ch_det_data_50/all-sum-510/test_add_1.png +../../inference/ch_det_data_50/all-sum-510/test_add_10.png 155 123 187 123 187 174 155 174 160 105 184 105 184 131 160 131 116 45 155 44 158 176 119 176 63 30 102 31 99 172 60 171 +../../inference/ch_det_data_50/all-sum-510/test_add_11.jpg 1388 755 1486 755 1486 794 1388 794 1011 752 1210 752 1210 802 1011 802 681 752 879 752 879 801 681 801 355 750 568 745 570 796 356 801 76 748 266 743 268 796 78 801 600 645 1155 645 1155 706 600 706 600 562 1151 553 1151 614 600 622 596 478 1070 470 1070 529 596 537 595 390 1095 385 1095 444 595 448 600 303 1061 303 1061 362 600 362 353 180 1521 180 1521 265 353 265 59 40 261 40 261 91 59 91 1303 39 1495 39 1495 90 1303 90 971 37 1173 32 1175 83 973 88 668 37 864 32 866 83 670 88 361 32 561 32 561 88 361 88 +../../inference/ch_det_data_50/all-sum-510/test_add_12.jpg 9 590 140 592 140 615 9 613 107 520 908 524 908 571 107 566 632 448 905 445 905 481 632 484 110 445 468 447 468 487 110 485 580 303 682 301 683 351 581 353 368 257 568 262 565 361 364 355 61 83 856 85 856 164 61 162 +../../inference/ch_det_data_50/all-sum-510/test_add_13.jpg 68 93 118 96 116 116 66 113 +../../inference/ch_det_data_50/all-sum-510/test_add_14.jpg 28 94 238 92 238 130 28 132 27 50 241 48 241 88 27 90 +../../inference/ch_det_data_50/all-sum-510/test_add_15.jpg 140 251 354 251 354 268 140 268 203 212 407 217 407 234 203 229 104 210 194 212 194 229 104 227 153 155 287 159 287 175 153 172 143 134 307 140 307 157 143 150 106 136 147 136 147 149 106 149 106 101 278 107 277 126 105 119 106 70 247 77 246 97 105 90 106 37 211 40 210 64 105 61 +../../inference/ch_det_data_50/all-sum-510/test_add_16.jpg 380 740 750 740 750 780 380 780 360 700 472 700 472 728 360 728 1550 698 1580 698 1580 750 1550 750 1256 694 1444 694 1444 722 1256 722 1242 659 1452 659 1452 690 1242 690 384 643 672 643 672 682 384 682 1226 623 1474 621 1474 655 1226 657 356 599 582 599 582 631 356 631 1198 587 1496 587 1496 619 1198 619 1164 553 1534 553 1534 585 1164 585 378 549 642 549 642 589 378 589 354 500 520 500 520 540 354 540 772 258 1128 258 1128 303 772 303 372 208 508 208 508 303 372 303 774 208 1092 214 1092 260 774 254 +../../inference/ch_det_data_50/all-sum-510/test_add_17.jpg 319 255 394 257 394 271 319 269 306 236 407 238 407 257 306 255 306 221 413 226 412 243 305 237 93 135 387 140 386 209 92 204 69 92 401 100 401 127 69 118 66 74 225 77 225 95 66 92 64 58 227 60 227 77 64 75 +../../inference/ch_det_data_50/all-sum-510/test_add_18.jpg 153 
908 616 914 616 935 153 930 464 786 718 788 718 816 464 813 552 750 666 755 665 792 551 788 117 538 190 538 190 572 117 572 115 472 676 484 675 530 114 518 119 427 670 439 670 471 119 459 119 374 676 380 676 411 119 405 555 261 677 262 677 280 555 279 164 258 336 258 336 275 164 275 342 194 457 196 457 221 342 219 307 172 490 172 490 190 307 190 252 125 540 129 540 171 252 168 345 90 488 92 488 110 345 108 283 40 569 48 567 84 282 76 235 30 268 30 268 64 235 64 +../../inference/ch_det_data_50/all-sum-510/test_add_19.jpg 22 293 44 293 44 304 22 304 62 291 106 291 106 305 62 305 61 279 107 279 107 291 61 291 218 278 247 278 247 292 218 292 176 278 210 278 210 291 176 291 141 275 166 275 166 307 141 307 7 266 20 266 20 278 7 278 219 264 245 264 245 279 219 279 60 263 133 263 133 279 60 279 22 264 49 264 49 279 22 279 218 251 250 251 250 266 218 266 63 251 133 251 133 264 63 264 22 250 45 250 45 265 22 265 7 251 20 251 20 263 7 263 8 240 18 240 18 249 8 249 61 236 115 236 115 252 61 252 23 234 49 237 47 253 21 250 210 235 246 235 246 252 210 252 143 236 166 236 166 252 143 252 493 224 533 224 533 241 493 241 334 224 355 224 355 239 334 239 287 224 315 224 315 239 287 239 61 224 114 224 114 238 61 238 7 226 18 226 18 235 7 235 219 223 250 223 250 237 219 237 141 224 167 221 169 235 143 238 23 223 49 223 49 239 23 239 494 212 526 212 526 225 494 225 418 211 439 211 439 226 418 226 335 211 400 211 400 224 335 224 291 211 322 211 322 224 291 224 220 211 251 211 251 224 220 224 144 212 167 212 167 223 144 223 60 211 115 209 115 222 60 224 24 210 50 210 50 224 24 224 336 197 384 197 384 211 336 211 63 198 89 198 89 209 63 209 492 195 542 195 542 213 492 213 219 195 257 195 257 213 219 213 177 196 207 196 207 210 177 210 144 197 158 197 158 210 144 210 23 196 44 196 44 212 23 212 416 193 440 193 440 213 416 213 63 185 134 185 134 197 63 197 335 184 400 184 400 197 335 197 455 180 466 191 456 201 444 190 289 187 309 180 315 194 295 202 219 183 256 183 256 197 219 197 140 183 160 183 160 198 140 198 493 182 519 182 519 197 493 197 426 178 441 191 426 204 412 190 32 177 46 189 32 202 19 189 176 180 193 180 193 197 176 197 335 170 402 170 402 186 335 186 491 169 521 169 521 186 491 186 426 163 441 176 426 191 412 179 292 170 315 170 315 186 292 186 219 170 252 170 252 185 219 185 177 171 189 171 189 185 177 185 62 170 127 168 127 182 62 184 454 167 464 177 455 186 445 176 142 169 164 169 164 185 142 185 492 158 525 158 525 172 492 172 399 159 436 159 436 169 399 169 334 157 403 157 403 170 334 170 295 157 327 157 327 171 295 171 219 156 253 156 253 170 219 170 143 156 164 156 164 171 143 171 60 157 127 155 127 169 60 171 491 142 543 142 543 158 491 158 449 143 480 143 480 157 449 157 334 142 441 142 441 157 334 157 294 143 328 143 328 157 294 157 219 143 254 143 254 157 219 157 61 143 105 143 105 156 61 156 142 141 164 141 164 157 142 157 17 150 31 136 45 149 30 162 285 133 293 133 293 141 285 141 177 132 193 132 193 145 177 145 335 130 389 130 389 143 335 143 491 129 528 129 528 143 491 143 449 129 479 129 479 143 449 143 291 129 323 129 323 143 291 143 217 130 256 128 257 143 218 145 61 129 97 129 97 143 61 143 416 128 439 128 439 143 416 143 143 128 161 128 161 145 143 145 29 123 45 132 34 149 18 139 492 117 537 117 537 130 492 130 335 117 389 117 389 130 335 130 218 118 256 118 256 128 218 128 450 116 480 116 480 130 450 130 417 116 440 116 440 131 417 131 177 116 210 116 210 130 177 130 143 116 164 116 164 131 143 131 60 115 90 115 90 132 60 132 17 121 32 110 45 124 29 136 490 105 527 105 527 115 
490 115 448 105 479 105 479 115 448 115 419 106 436 106 436 114 419 114 292 105 321 105 321 116 292 116 218 105 244 105 244 115 218 115 175 105 205 105 205 115 175 115 143 105 163 105 163 116 143 116 334 104 373 104 373 115 334 115 61 104 88 104 88 115 61 115 483 89 523 89 523 99 483 99 330 87 381 87 381 100 330 100 274 87 336 87 336 100 274 100 213 87 248 87 248 100 213 100 5 85 103 85 103 101 5 101 414 64 464 64 464 78 414 78 287 64 335 64 335 78 287 78 155 62 208 62 208 79 155 79 414 47 525 48 525 64 414 63 287 48 377 48 377 64 287 64 157 48 270 48 270 63 157 63 415 34 483 34 483 48 415 48 287 33 338 33 338 50 287 50 26 34 45 34 45 52 26 52 155 32 207 32 207 49 155 49 55 32 115 31 116 51 56 53 411 2 529 2 529 19 411 19 144 2 346 0 346 17 144 19 +../../inference/ch_det_data_50/all-sum-510/test_add_2.jpg 251 404 535 404 535 430 251 430 302 339 483 339 483 385 302 385 302 303 482 303 482 326 302 326 573 217 693 217 693 240 573 240 331 216 455 214 455 240 331 242 108 212 182 214 181 244 107 242 313 98 672 99 672 121 313 120 311 60 585 61 585 87 311 86 +../../inference/ch_det_data_50/all-sum-510/test_add_20.jpg 30 345 607 345 607 372 30 372 216 292 512 292 512 323 216 323 472 270 527 270 527 287 472 287 216 266 292 266 292 287 216 287 218 238 486 238 486 265 218 265 220 215 305 215 305 236 220 236 221 185 343 185 343 209 221 209 220 160 289 160 289 182 220 182 374 120 477 122 477 147 374 145 221 122 367 120 367 145 221 147 217 80 354 82 354 117 217 115 439 33 607 33 607 60 439 60 67 15 400 15 400 46 67 46 +../../inference/ch_det_data_50/all-sum-510/test_add_3.jpg 168 326 339 324 339 341 168 343 169 286 309 288 309 314 169 312 169 219 324 219 324 235 169 235 339 219 451 216 451 232 339 235 168 200 373 200 373 216 168 216 168 180 418 180 418 197 168 197 169 147 417 147 417 165 169 165 170 117 419 117 419 141 170 141 325 62 480 62 480 93 325 93 170 62 310 59 311 91 171 94 +../../inference/ch_det_data_50/all-sum-510/test_add_4.png +../../inference/ch_det_data_50/all-sum-510/test_add_5.png 47 162 109 162 109 176 47 176 51 119 170 119 170 136 51 136 49 100 166 100 166 119 49 119 51 83 166 83 166 102 51 102 50 66 169 66 169 85 50 85 49 47 149 46 149 68 49 69 5 9 81 9 81 43 5 43 +../../inference/ch_det_data_50/all-sum-510/test_add_6.jpg 122 222 220 226 219 253 121 249 160 176 185 180 182 200 157 196 +../../inference/ch_det_data_50/all-sum-510/test_add_7.jpg 47 937 175 933 176 964 48 967 224 870 632 873 632 955 224 952 53 743 640 743 640 793 53 793 148 673 546 676 546 723 148 720 71 502 636 502 636 604 71 604 54 264 660 274 657 446 51 436 59 173 534 173 534 241 59 241 502 173 646 173 646 239 502 239 +../../inference/ch_det_data_50/all-sum-510/test_add_8.jpg 249 584 455 578 456 608 250 614 106 531 458 524 458 561 107 568 334 492 385 492 385 509 334 509 26 306 356 296 357 321 27 331 21 258 447 250 447 275 21 283 77 208 447 204 447 226 77 230 158 20 322 28 319 82 155 74 +../../inference/ch_det_data_50/all-sum-510/test_add_9.png 264 684 486 684 486 697 264 697 194 666 556 666 556 682 194 682 152 595 600 595 600 608 152 608 211 577 543 577 543 590 211 590 131 559 617 558 617 572 131 573 84 540 665 540 665 553 84 553 95 521 654 521 654 536 95 536 361 448 390 448 390 461 361 461 236 375 515 375 515 391 236 391 174 353 575 353 575 369 174 369 342 279 409 281 409 298 342 296 254 203 493 203 493 220 254 220 diff --git a/PTDN/results/python_ppocr_det_mobile_results_fp16.txt b/PTDN/results/python_ppocr_det_mobile_results_fp16.txt new file mode 100644 index 
0000000000000000000000000000000000000000..191bdaf7807dad9129eb965f4ac81dadc9572af6 --- /dev/null +++ b/PTDN/results/python_ppocr_det_mobile_results_fp16.txt @@ -0,0 +1,49 @@ +00008790.jpg [[[209, 406], [280, 406], [280, 419], [209, 419]], [[60, 398], [105, 398], [105, 411], [60, 411]], [[198, 389], [291, 389], [291, 402], [198, 402]], [[162, 391], [173, 391], [173, 401], [162, 401]], [[35, 380], [133, 380], [133, 393], [35, 393]], [[199, 371], [292, 371], [292, 384], [199, 384]], [[218, 310], [272, 310], [272, 324], [218, 324]], [[162, 305], [172, 305], [172, 314], [162, 314]], [[371, 302], [436, 302], [436, 316], [371, 316]], [[31, 302], [134, 301], [134, 315], [31, 316]], [[223, 292], [269, 292], [269, 306], [223, 306]], [[60, 225], [104, 225], [104, 236], [60, 236]], [[218, 223], [272, 223], [272, 237], [218, 237]], [[162, 219], [173, 219], [173, 227], [162, 227]], [[33, 207], [131, 207], [131, 220], [33, 220]], [[223, 206], [269, 206], [269, 220], [223, 220]], [[74, 146], [383, 146], [383, 159], [74, 159]], [[54, 120], [117, 120], [117, 134], [54, 134]], [[74, 51], [296, 51], [296, 65], [74, 65]], [[56, 18], [116, 18], [116, 32], [56, 32]]] +00018946.jpg [[[441, 328], [474, 328], [474, 339], [441, 339]], [[86, 284], [141, 286], [140, 307], [85, 305]], [[302, 279], [377, 279], [377, 297], [302, 297]], [[197, 265], [281, 274], [279, 293], [195, 284]], [[198, 197], [452, 219], [450, 242], [196, 220]], [[343, 182], [376, 182], [376, 192], [343, 192]], [[199, 164], [340, 171], [339, 192], [198, 185]], [[177, 101], [415, 118], [413, 145], [175, 128]]] +00034387.jpg [[[265, 460], [740, 460], [740, 484], [265, 484]], [[348, 417], [420, 417], [420, 443], [348, 443]], [[545, 418], [568, 418], [568, 442], [545, 442]], [[685, 417], [710, 417], [710, 443], [685, 443]], [[175, 415], [226, 415], [226, 443], [175, 443]], [[874, 414], [908, 414], [908, 446], [874, 446]], [[56, 417], [74, 417], [74, 442], [56, 442]], [[856, 373], [925, 373], [925, 400], [856, 400]], [[348, 372], [418, 372], [418, 397], [348, 397]], [[674, 372], [723, 372], [723, 401], [674, 401]], [[539, 373], [570, 373], [570, 400], [539, 400]], [[151, 365], [228, 369], [226, 402], [149, 398]], [[56, 372], [74, 372], [74, 397], [56, 397]], [[857, 329], [925, 329], [925, 355], [857, 355]], [[351, 330], [419, 330], [419, 356], [351, 356]], [[674, 328], [723, 328], [723, 356], [674, 356]], [[541, 329], [570, 329], [570, 357], [541, 357]], [[171, 327], [227, 324], [229, 355], [173, 358]], [[57, 330], [74, 330], [74, 356], [57, 356]], [[298, 327], [316, 327], [316, 334], [298, 334]], [[855, 286], [925, 286], [925, 312], [855, 312]], [[674, 286], [723, 286], [723, 313], [674, 313]], [[346, 286], [426, 283], [427, 313], [347, 316]], [[540, 285], [569, 285], [569, 312], [540, 312]], [[172, 282], [226, 282], [226, 313], [172, 313]], [[56, 287], [73, 287], [73, 312], [56, 312]], [[857, 242], [925, 242], [925, 268], [857, 268]], [[348, 242], [460, 242], [460, 268], [348, 268]], [[156, 242], [227, 242], [227, 269], [156, 269]], [[674, 241], [724, 241], [724, 269], [674, 269]], [[531, 241], [572, 241], [572, 270], [531, 270]], [[56, 242], [74, 242], [74, 268], [56, 268]], [[855, 197], [925, 200], [924, 226], [854, 224]], [[674, 198], [726, 198], [726, 226], [674, 226]], [[344, 200], [430, 195], [432, 224], [346, 230]], [[176, 197], [227, 197], [227, 225], [176, 225]], [[56, 200], [73, 200], [73, 226], [56, 226]], [[527, 194], [576, 194], [576, 226], [527, 226]], [[349, 155], [419, 155], [419, 181], [349, 181]], [[854, 154], [925, 154], [925, 180], 
[854, 180]], [[176, 154], [226, 154], [226, 183], [176, 183]], [[670, 153], [723, 153], [723, 181], [670, 181]], [[538, 154], [571, 154], [571, 182], [538, 182]], [[56, 156], [74, 156], [74, 182], [56, 182]], [[349, 111], [419, 111], [419, 137], [349, 137]], [[174, 111], [227, 111], [227, 139], [174, 139]], [[546, 113], [564, 113], [564, 137], [546, 137]], [[52, 112], [75, 112], [75, 139], [52, 139]], [[639, 108], [727, 105], [728, 138], [640, 141]], [[817, 103], [927, 110], [925, 139], [815, 132]], [[814, 68], [951, 68], [951, 92], [814, 92]], [[307, 66], [446, 68], [446, 93], [306, 90]], [[673, 67], [723, 67], [723, 93], [673, 93]], [[175, 65], [228, 68], [226, 95], [174, 92]], [[39, 65], [90, 68], [88, 97], [37, 94]], [[528, 65], [580, 65], [580, 94], [528, 94]], [[334, 20], [670, 20], [670, 43], [334, 43]]] +00037951.jpg [[[434, 976], [551, 978], [550, 993], [434, 991]], [[433, 932], [553, 932], [553, 969], [433, 969]], [[30, 522], [98, 522], [98, 545], [30, 545]], [[31, 443], [145, 443], [145, 464], [31, 464]], [[234, 335], [326, 332], [327, 354], [235, 356]], [[124, 252], [436, 252], [436, 284], [124, 284]], [[182, 206], [378, 206], [378, 227], [182, 227]], [[258, 106], [320, 123], [304, 181], [242, 163]], [[28, 65], [33, 65], [33, 71], [28, 71]], [[37, 58], [147, 58], [147, 80], [37, 80]]] +00044782.jpg [[[104, 218], [115, 218], [115, 227], [104, 227]], [[223, 216], [246, 216], [246, 228], [223, 228]], [[163, 216], [182, 216], [182, 229], [163, 229]], [[124, 191], [164, 191], [164, 202], [124, 202]], [[91, 84], [251, 84], [251, 98], [91, 98]], [[73, 63], [278, 63], [278, 78], [73, 78]], [[104, 15], [243, 15], [243, 44], [104, 44]]] +00067516.jpg [[[141, 808], [594, 809], [594, 822], [141, 821]], [[49, 784], [696, 784], [696, 798], [49, 798]], [[579, 751], [667, 751], [667, 764], [579, 764]], [[355, 750], [395, 750], [395, 767], [355, 767]], [[221, 751], [260, 751], [260, 765], [221, 765]], [[477, 750], [501, 750], [501, 768], [477, 768]], [[69, 748], [133, 751], [132, 765], [68, 761]], [[576, 682], [668, 682], [668, 699], [576, 699]], [[476, 682], [518, 682], [518, 700], [476, 700]], [[354, 682], [395, 682], [395, 700], [354, 700]], [[69, 681], [133, 684], [132, 699], [68, 695]], [[220, 679], [243, 682], [241, 700], [218, 697]], [[577, 615], [667, 615], [667, 632], [577, 632]], [[68, 612], [134, 615], [133, 632], [67, 629]], [[476, 614], [500, 614], [500, 633], [476, 633]], [[354, 613], [378, 613], [378, 634], [354, 634]], [[219, 612], [245, 612], [245, 633], [219, 633]], [[578, 547], [667, 547], [667, 564], [578, 564]], [[476, 546], [518, 546], [518, 565], [476, 565]], [[353, 545], [379, 545], [379, 566], [353, 566]], [[219, 545], [245, 545], [245, 566], [219, 566]], [[68, 542], [133, 546], [132, 563], [67, 560]], [[68, 478], [133, 482], [132, 499], [67, 496]], [[586, 481], [664, 481], [664, 497], [586, 497]], [[476, 480], [518, 480], [518, 498], [476, 498]], [[354, 480], [395, 480], [395, 498], [354, 498]], [[219, 479], [245, 479], [245, 500], [219, 500]], [[580, 425], [665, 429], [664, 449], [580, 446]], [[346, 429], [410, 429], [410, 447], [346, 447]], [[68, 426], [150, 429], [149, 449], [67, 447]], [[474, 427], [515, 427], [515, 449], [474, 449]], [[218, 427], [259, 427], [259, 449], [218, 449]], [[283, 398], [478, 399], [478, 419], [283, 418]], [[86, 318], [664, 318], [664, 332], [86, 332]], [[65, 279], [665, 279], [665, 292], [65, 292]], [[458, 210], [584, 210], [584, 224], [458, 224]], [[313, 209], [372, 209], [372, 226], [313, 226]], [[164, 209], [225, 209], [225, 226], 
[164, 226]], [[505, 151], [539, 151], [539, 166], [505, 166]], [[266, 48], [483, 48], [483, 68], [266, 68]]] +00088568.jpg [[[341, 446], [371, 446], [371, 453], [341, 453]], [[58, 445], [117, 445], [117, 455], [58, 455]], [[552, 433], [571, 433], [571, 440], [552, 440]], [[583, 431], [740, 431], [740, 442], [583, 442]], [[311, 415], [743, 415], [743, 428], [311, 428]], [[311, 377], [736, 377], [736, 390], [311, 390]], [[425, 340], [551, 340], [551, 350], [425, 350]], [[287, 324], [294, 332], [289, 337], [281, 330]], [[276, 294], [348, 296], [347, 311], [276, 309]], [[54, 288], [210, 288], [210, 301], [54, 301]], [[275, 265], [421, 265], [421, 278], [275, 278]], [[56, 264], [248, 264], [248, 277], [56, 277]], [[671, 248], [695, 248], [695, 261], [671, 261]], [[602, 248], [628, 248], [628, 261], [602, 261]], [[533, 248], [557, 248], [557, 261], [533, 261]], [[463, 248], [487, 248], [487, 261], [463, 261]], [[278, 248], [309, 248], [309, 260], [278, 260]], [[55, 240], [142, 240], [142, 254], [55, 254]], [[277, 231], [398, 231], [398, 244], [277, 244]], [[741, 228], [749, 237], [742, 245], [733, 236]], [[665, 230], [700, 230], [700, 244], [665, 244]], [[598, 230], [631, 230], [631, 244], [598, 244]], [[528, 230], [562, 230], [562, 244], [528, 244]], [[459, 230], [492, 230], [492, 244], [459, 244]], [[54, 215], [211, 217], [211, 231], [54, 229]], [[739, 211], [749, 221], [740, 229], [731, 220]], [[663, 214], [704, 214], [704, 228], [663, 228]], [[595, 215], [637, 215], [637, 226], [595, 226]], [[524, 215], [568, 215], [568, 226], [524, 226]], [[454, 215], [495, 215], [495, 226], [454, 226]], [[279, 215], [351, 215], [351, 226], [279, 226]], [[736, 199], [747, 199], [747, 208], [736, 208]], [[668, 197], [700, 197], [700, 208], [668, 208]], [[599, 196], [633, 196], [633, 210], [599, 210]], [[529, 197], [562, 197], [562, 208], [529, 208]], [[461, 197], [491, 197], [491, 208], [461, 208]], [[277, 195], [417, 196], [417, 211], [277, 209]], [[55, 192], [239, 192], [239, 205], [55, 205]], [[665, 181], [703, 181], [703, 192], [665, 192]], [[279, 180], [351, 181], [350, 192], [279, 191]], [[734, 180], [747, 180], [747, 193], [734, 193]], [[597, 180], [634, 180], [634, 191], [597, 191]], [[525, 179], [566, 179], [566, 193], [525, 193]], [[458, 180], [493, 180], [493, 191], [458, 191]], [[55, 170], [142, 170], [142, 184], [55, 184]], [[735, 165], [747, 165], [747, 175], [735, 175]], [[665, 163], [703, 163], [703, 175], [665, 175]], [[598, 163], [634, 163], [634, 175], [598, 175]], [[527, 163], [565, 163], [565, 175], [527, 175]], [[458, 163], [492, 163], [492, 175], [458, 175]], [[279, 162], [398, 162], [398, 176], [279, 176]], [[54, 146], [148, 146], [148, 159], [54, 159]], [[453, 147], [495, 147], [495, 158], [453, 158]], [[731, 143], [748, 146], [745, 161], [728, 158]], [[663, 145], [704, 145], [704, 159], [663, 159]], [[596, 146], [635, 146], [635, 157], [596, 157]], [[522, 145], [566, 142], [567, 157], [523, 159]], [[277, 144], [310, 144], [310, 158], [277, 158]], [[276, 121], [428, 121], [428, 139], [276, 139]], [[52, 120], [232, 121], [232, 139], [52, 138]], [[404, 91], [701, 91], [701, 106], [404, 106]], [[48, 79], [280, 79], [280, 97], [48, 97]], [[325, 69], [744, 70], [744, 84], [325, 83]], [[668, 48], [743, 48], [743, 63], [668, 63]], [[297, 48], [433, 48], [433, 62], [297, 62]]] +00091741.jpg [[[47, 336], [83, 336], [83, 358], [47, 358]], [[98, 211], [257, 209], [257, 229], [98, 231]], [[103, 190], [257, 191], [257, 205], [103, 204]], [[89, 101], [266, 99], [267, 181], [90, 184]], [[94, 48], 
[262, 55], [260, 114], [91, 107]], [[91, 12], [257, 14], [257, 37], [90, 35]]] +00105313.jpg [[[291, 262], [406, 262], [406, 275], [291, 275]], [[153, 262], [264, 262], [264, 274], [153, 274]], [[11, 258], [73, 261], [72, 274], [11, 272]], [[33, 231], [132, 231], [132, 244], [33, 244]], [[35, 217], [216, 217], [216, 227], [35, 227]], [[33, 200], [146, 200], [146, 213], [33, 213]], [[32, 183], [215, 184], [215, 197], [32, 196]], [[35, 170], [105, 170], [105, 181], [35, 181]], [[35, 155], [124, 155], [124, 164], [35, 164]], [[34, 137], [142, 138], [142, 149], [34, 148]], [[35, 123], [176, 123], [176, 133], [35, 133]], [[33, 106], [176, 106], [176, 119], [33, 119]], [[34, 92], [102, 92], [102, 102], [34, 102]], [[34, 77], [119, 77], [119, 87], [34, 87]], [[32, 60], [120, 60], [120, 73], [32, 73]], [[35, 46], [119, 46], [119, 55], [35, 55]], [[32, 29], [142, 29], [142, 42], [32, 42]], [[25, 12], [147, 12], [147, 24], [25, 24]]] +00134770.jpg [[[388, 646], [456, 646], [456, 656], [388, 656]], [[407, 620], [484, 619], [485, 633], [408, 634]], [[112, 534], [270, 531], [270, 549], [113, 551]], [[111, 502], [443, 497], [443, 514], [112, 519]], [[111, 471], [443, 467], [443, 484], [112, 488]], [[111, 439], [444, 434], [444, 451], [112, 457]], [[111, 409], [442, 405], [442, 421], [112, 425]], [[153, 376], [441, 373], [441, 390], [153, 394]], [[184, 338], [369, 336], [369, 356], [185, 358]], [[75, 98], [515, 104], [513, 218], [74, 212]]] +00145943.jpg [[[394, 248], [746, 279], [731, 449], [379, 418]], [[90, 92], [300, 92], [300, 119], [90, 119]], [[46, 41], [326, 39], [326, 75], [46, 77]]] +00147605.jpg [[[805, 616], [874, 616], [874, 627], [805, 627]], [[516, 607], [784, 605], [784, 628], [516, 629]], [[118, 522], [224, 522], [224, 560], [118, 560]], [[253, 524], [307, 524], [307, 557], [253, 557]], [[715, 501], [900, 505], [900, 538], [714, 534]], [[255, 502], [295, 502], [295, 517], [255, 517]], [[347, 481], [473, 481], [473, 515], [347, 515]], [[252, 484], [295, 484], [295, 499], [252, 499]], [[350, 456], [447, 456], [447, 470], [350, 470]], [[145, 444], [201, 444], [201, 467], [145, 467]], [[728, 371], [878, 371], [878, 420], [728, 420]], [[528, 369], [681, 369], [681, 418], [528, 418]], [[143, 369], [488, 369], [488, 420], [143, 420]], [[744, 315], [871, 315], [871, 336], [744, 336]], [[799, 157], [886, 154], [887, 188], [800, 191]], [[274, 142], [455, 142], [455, 160], [274, 160]], [[738, 116], [894, 119], [893, 157], [737, 153]], [[108, 112], [204, 112], [204, 130], [108, 130]], [[270, 92], [463, 96], [462, 132], [270, 129]]] +00150341.jpg [[[100, 644], [297, 644], [297, 661], [100, 661]], [[115, 617], [288, 617], [288, 631], [115, 631]], [[84, 593], [319, 592], [319, 609], [84, 610]], [[31, 565], [313, 562], [314, 580], [31, 582]], [[444, 560], [461, 560], [461, 569], [444, 569]], [[390, 557], [446, 557], [446, 572], [390, 572]], [[31, 515], [168, 515], [168, 529], [31, 529]], [[33, 490], [110, 490], [110, 504], [33, 504]], [[358, 459], [464, 463], [463, 485], [357, 481]], [[28, 459], [268, 460], [268, 481], [28, 480]], [[339, 439], [421, 444], [421, 460], [338, 455]], [[65, 439], [143, 439], [143, 453], [65, 453]], [[207, 416], [292, 416], [292, 434], [207, 434]], [[319, 408], [441, 413], [440, 438], [318, 433]], [[44, 405], [175, 409], [174, 434], [43, 430]], [[31, 383], [137, 383], [137, 404], [31, 404]]] +00150669.jpg [[[649, 700], [681, 700], [681, 716], [649, 716]], [[517, 685], [549, 685], [549, 720], [517, 720]], [[651, 688], [678, 688], [678, 701], [651, 701]], [[862, 687], [876, 
687], [876, 695], [862, 695]], [[922, 675], [938, 675], [938, 685], [922, 685]], [[785, 671], [807, 671], [807, 687], [785, 687]], [[592, 672], [606, 672], [606, 686], [592, 686]], [[722, 679], [732, 669], [742, 678], [731, 688]], [[651, 680], [667, 665], [681, 679], [666, 695]], [[273, 667], [422, 667], [422, 688], [273, 688]], [[47, 668], [108, 668], [108, 686], [47, 686]], [[136, 666], [203, 666], [203, 688], [136, 688]], [[782, 629], [810, 629], [810, 661], [782, 661]], [[645, 627], [685, 627], [685, 665], [645, 665]], [[516, 628], [548, 628], [548, 664], [516, 664]], [[655, 619], [672, 619], [672, 627], [655, 627]], [[598, 617], [605, 624], [599, 629], [592, 622]], [[523, 619], [540, 619], [540, 627], [523, 627]], [[858, 618], [868, 618], [868, 627], [858, 627]], [[727, 618], [735, 618], [735, 627], [727, 627]], [[919, 620], [932, 611], [942, 624], [929, 633]], [[786, 616], [805, 616], [805, 629], [786, 629]], [[373, 604], [420, 604], [420, 619], [373, 619]], [[85, 603], [215, 605], [214, 621], [84, 619]], [[48, 603], [71, 603], [71, 622], [48, 622]], [[788, 561], [806, 561], [806, 572], [788, 572]], [[923, 560], [935, 560], [935, 574], [923, 574]], [[856, 560], [869, 560], [869, 574], [856, 574]], [[62, 554], [410, 554], [410, 568], [62, 568]], [[63, 532], [116, 535], [115, 545], [62, 543]], [[859, 527], [868, 527], [868, 539], [859, 539]], [[925, 526], [934, 526], [934, 540], [925, 540]], [[794, 520], [807, 529], [798, 542], [785, 533]], [[526, 526], [535, 526], [535, 536], [526, 536]], [[262, 513], [395, 513], [395, 526], [262, 526]], [[122, 514], [245, 514], [245, 524], [122, 524]], [[49, 514], [119, 514], [119, 525], [49, 525]], [[755, 492], [828, 492], [828, 507], [755, 507]], [[638, 492], [710, 492], [710, 507], [638, 507]], [[519, 492], [592, 492], [592, 507], [519, 507]], [[85, 450], [123, 450], [123, 461], [85, 461]], [[220, 450], [236, 447], [238, 459], [223, 462]], [[683, 445], [868, 445], [868, 459], [683, 459]], [[562, 445], [666, 445], [666, 459], [562, 459]], [[491, 446], [544, 446], [544, 458], [491, 458]], [[183, 437], [208, 437], [208, 459], [183, 459]], [[52, 431], [72, 438], [64, 462], [44, 455]], [[224, 432], [276, 432], [276, 443], [224, 443]], [[88, 432], [144, 432], [144, 443], [88, 443]], [[506, 383], [616, 382], [616, 397], [506, 398]], [[702, 381], [758, 381], [758, 399], [702, 399]], [[308, 373], [364, 373], [364, 384], [308, 384]], [[92, 373], [167, 373], [167, 384], [92, 384]], [[688, 335], [820, 335], [820, 350], [688, 350]], [[498, 335], [657, 335], [657, 350], [498, 350]], [[208, 316], [244, 316], [244, 331], [208, 331]], [[499, 289], [641, 289], [641, 302], [499, 302]], [[671, 287], [801, 287], [801, 301], [671, 301]], [[670, 241], [816, 241], [816, 255], [670, 255]], [[497, 241], [643, 241], [643, 255], [497, 255]], [[670, 194], [815, 194], [815, 208], [670, 208]], [[498, 194], [643, 194], [643, 208], [498, 208]], [[670, 145], [815, 145], [815, 160], [670, 160]], [[499, 145], [645, 145], [645, 160], [499, 160]], [[489, 103], [546, 103], [546, 120], [489, 120]], [[56, 89], [95, 89], [95, 97], [56, 97]], [[845, 26], [887, 20], [889, 39], [848, 44]], [[26, 20], [700, 20], [700, 37], [26, 37]], [[898, 11], [996, 16], [995, 45], [896, 40]]] +00152568.jpg [[[3, 252], [284, 254], [284, 280], [3, 278]], [[196, 233], [254, 233], [254, 240], [196, 240]], [[49, 229], [90, 229], [90, 240], [49, 240]], [[200, 159], [281, 165], [276, 229], [195, 222]]] +00155628.jpg [[[149, 901], [503, 903], [503, 922], [149, 920]], [[520, 893], [561, 896], [560, 911], [519, 
908]], [[61, 885], [81, 885], [81, 894], [61, 894]], [[150, 878], [503, 882], [503, 900], [149, 896]], [[524, 834], [640, 839], [639, 856], [524, 852]], [[70, 834], [185, 835], [185, 853], [69, 852]], [[246, 555], [466, 555], [466, 569], [246, 569]], [[308, 507], [403, 509], [403, 524], [308, 522]], [[244, 482], [459, 484], [459, 502], [244, 500]], [[252, 422], [459, 424], [458, 452], [251, 450]], [[195, 378], [517, 380], [516, 408], [195, 406]], [[474, 194], [624, 196], [624, 210], [473, 208]], [[73, 129], [641, 131], [641, 160], [73, 158]], [[483, 41], [597, 37], [599, 98], [486, 102]], [[68, 25], [135, 16], [139, 43], [72, 52]]] +00173364.jpg [[[8, 178], [56, 178], [56, 200], [8, 200]], [[137, 120], [194, 120], [194, 133], [137, 133]], [[39, 76], [86, 76], [86, 105], [39, 105]], [[249, 20], [310, 20], [310, 36], [249, 36]], [[21, 16], [104, 16], [104, 39], [21, 39]]] +00175503.jpg [[[43, 260], [500, 255], [501, 358], [44, 363]], [[52, 200], [349, 178], [354, 251], [58, 273]]] +00193218.jpg [[[283, 375], [410, 375], [410, 388], [283, 388]], [[172, 375], [221, 375], [221, 389], [172, 389]], [[110, 375], [161, 375], [161, 389], [110, 389]], [[276, 358], [357, 358], [357, 371], [276, 371]], [[171, 359], [220, 359], [220, 370], [171, 370]], [[409, 357], [492, 357], [492, 370], [409, 370]], [[26, 187], [62, 187], [62, 202], [26, 202]], [[501, 185], [557, 185], [557, 199], [501, 199]], [[381, 187], [420, 185], [421, 199], [382, 201]], [[284, 186], [310, 186], [310, 201], [284, 201]], [[174, 186], [196, 186], [196, 201], [174, 201]], [[499, 165], [540, 165], [540, 176], [499, 176]], [[381, 164], [409, 164], [409, 177], [381, 177]], [[262, 163], [302, 163], [302, 177], [262, 177]], [[176, 163], [230, 163], [230, 177], [176, 177]], [[26, 163], [79, 163], [79, 177], [26, 177]], [[387, 140], [488, 140], [488, 153], [387, 153]], [[28, 139], [131, 139], [131, 152], [28, 152]], [[443, 117], [537, 119], [537, 133], [443, 132]], [[346, 119], [405, 119], [405, 130], [346, 130]], [[261, 119], [303, 119], [303, 130], [261, 130]], [[30, 113], [228, 116], [228, 131], [30, 129]], [[131, 91], [394, 94], [394, 109], [131, 105]], [[562, 82], [583, 82], [583, 107], [562, 107]]] +00195033.jpg [[[488, 263], [533, 265], [532, 280], [487, 278]], [[126, 250], [192, 250], [192, 283], [126, 283]], [[338, 249], [362, 249], [362, 266], [338, 266]], [[319, 222], [380, 225], [380, 238], [319, 236]], [[431, 224], [450, 224], [450, 235], [431, 235]], [[365, 203], [538, 203], [538, 216], [365, 216]], [[89, 200], [146, 203], [146, 217], [89, 214]], [[329, 201], [354, 201], [354, 212], [329, 212]], [[371, 181], [449, 181], [449, 194], [371, 194]], [[329, 181], [352, 181], [352, 192], [329, 192]], [[96, 179], [240, 179], [240, 193], [96, 193]], [[456, 162], [555, 162], [555, 175], [456, 175]], [[129, 150], [287, 151], [287, 165], [129, 164]], [[36, 145], [73, 149], [72, 163], [35, 159]], [[527, 146], [552, 146], [552, 155], [527, 155]], [[102, 145], [120, 145], [120, 153], [102, 153]], [[371, 129], [503, 128], [503, 139], [371, 140]], [[99, 126], [193, 126], [193, 139], [99, 139]], [[322, 127], [337, 127], [337, 135], [322, 135]], [[37, 123], [77, 123], [77, 134], [37, 134]], [[324, 106], [337, 106], [337, 115], [324, 115]], [[309, 107], [315, 107], [315, 112], [309, 112]], [[372, 103], [501, 103], [501, 116], [372, 116]], [[349, 105], [360, 105], [360, 114], [349, 114]], [[38, 103], [80, 103], [80, 113], [38, 113]], [[99, 100], [205, 101], [205, 115], [99, 114]], [[306, 90], [317, 90], [317, 97], [306, 97]], [[347, 88], [362, 
88], [362, 96], [347, 96]], [[321, 87], [340, 87], [340, 99], [321, 99]], [[358, 84], [513, 82], [513, 95], [358, 97]], [[41, 83], [89, 83], [89, 93], [41, 93]], [[94, 79], [241, 80], [241, 94], [94, 93]], [[313, 66], [394, 66], [394, 79], [313, 79]], [[242, 66], [288, 66], [288, 77], [242, 77]], [[185, 54], [220, 54], [220, 65], [185, 65]], [[469, 48], [547, 48], [547, 61], [469, 61]], [[423, 36], [436, 36], [436, 54], [423, 54]], [[465, 30], [551, 30], [551, 43], [465, 43]], [[207, 21], [329, 23], [328, 41], [207, 39]]] +00208502.jpg [[[247, 566], [282, 566], [282, 573], [247, 573]], [[558, 534], [629, 539], [627, 570], [556, 565]], [[205, 540], [284, 540], [284, 552], [205, 552]], [[143, 513], [189, 513], [189, 525], [143, 525]], [[249, 512], [307, 512], [307, 524], [249, 524]], [[44, 500], [118, 500], [118, 519], [44, 519]], [[467, 491], [556, 491], [556, 508], [467, 508]], [[667, 490], [678, 494], [675, 503], [664, 499]], [[788, 489], [794, 495], [789, 499], [783, 494]], [[726, 491], [737, 491], [737, 501], [726, 501]], [[42, 452], [117, 450], [117, 469], [42, 470]], [[175, 450], [236, 450], [236, 464], [175, 464]], [[614, 407], [638, 407], [638, 422], [614, 422]], [[95, 405], [119, 405], [119, 422], [95, 422]], [[49, 399], [64, 414], [50, 427], [36, 413]], [[209, 401], [226, 401], [226, 415], [209, 415]], [[40, 357], [58, 357], [58, 374], [40, 374]], [[94, 356], [119, 356], [119, 373], [94, 373]], [[188, 341], [246, 339], [247, 361], [189, 364]], [[459, 321], [549, 319], [549, 337], [460, 339]], [[459, 273], [551, 273], [551, 290], [459, 290]], [[563, 272], [735, 269], [735, 286], [564, 289]], [[517, 225], [547, 225], [547, 245], [517, 245]], [[459, 226], [480, 226], [480, 244], [459, 244]], [[621, 187], [673, 187], [673, 201], [621, 201]], [[457, 132], [548, 130], [548, 147], [458, 149]], [[572, 106], [787, 99], [787, 120], [573, 126]], [[122, 48], [290, 48], [290, 97], [122, 97]], [[539, 39], [708, 39], [708, 89], [539, 89]]] +00224225.jpg [[[134, 429], [153, 426], [157, 445], [138, 448]], [[202, 404], [478, 411], [476, 459], [201, 452]], [[205, 230], [469, 230], [469, 390], [205, 390]], [[131, 265], [172, 265], [172, 279], [131, 279]], [[345, 207], [456, 207], [456, 231], [345, 231]], [[199, 189], [346, 196], [344, 239], [197, 232]], [[10, 44], [157, 41], [158, 112], [11, 115]]] +00227746.jpg [[[190, 232], [258, 232], [258, 238], [190, 238]], [[160, 232], [183, 232], [183, 238], [160, 238]], [[123, 232], [150, 232], [150, 238], [123, 238]], [[290, 208], [345, 206], [346, 222], [291, 224]], [[172, 181], [249, 181], [249, 194], [172, 194]], [[143, 178], [165, 180], [162, 208], [140, 206]], [[142, 164], [157, 162], [160, 177], [145, 180]], [[173, 157], [203, 157], [203, 164], [173, 164]], [[200, 154], [347, 154], [347, 167], [200, 167]], [[144, 111], [277, 114], [277, 134], [144, 131]], [[201, 52], [387, 53], [386, 69], [201, 68]], [[141, 46], [192, 46], [192, 63], [141, 63]], [[40, 26], [61, 26], [61, 42], [40, 42]]] +00229605.jpg [[[743, 529], [881, 529], [881, 544], [743, 544]], [[236, 499], [589, 498], [589, 522], [236, 523]], [[6, 498], [227, 498], [227, 522], [6, 522]], [[735, 496], [883, 499], [883, 520], [734, 517]], [[606, 495], [716, 489], [718, 515], [608, 521]], [[4, 245], [863, 230], [864, 288], [5, 303]], [[478, 28], [883, 28], [883, 76], [478, 76]]] +00233011.jpg [[[63, 227], [291, 227], [291, 242], [63, 242]], [[12, 219], [41, 219], [41, 250], [12, 250]], [[61, 177], [119, 177], [119, 195], [61, 195]], [[11, 173], [40, 169], [44, 200], [14, 203]], [[61, 129], [147, 
131], [147, 147], [61, 144]], [[12, 124], [43, 124], [43, 154], [12, 154]], [[125, 89], [238, 89], [238, 103], [125, 103]], [[148, 51], [216, 51], [216, 65], [148, 65]], [[258, 46], [353, 50], [352, 69], [257, 65]], [[9, 49], [52, 49], [52, 68], [9, 68]], [[277, 12], [345, 12], [345, 31], [277, 31]], [[28, 11], [74, 11], [74, 31], [28, 31]]] +00233625.jpg [[[375, 397], [632, 399], [632, 443], [375, 440]], [[71, 214], [932, 207], [933, 321], [71, 328]]] +00233634.jpg [[[215, 639], [261, 639], [261, 703], [215, 703]], [[523, 635], [570, 635], [570, 695], [523, 695]], [[643, 523], [682, 523], [682, 568], [643, 568]], [[97, 516], [152, 516], [152, 589], [97, 589]], [[755, 395], [760, 395], [760, 401], [755, 401]], [[26, 395], [32, 395], [32, 400], [26, 400]], [[678, 364], [728, 362], [731, 430], [681, 432]], [[54, 361], [107, 361], [107, 434], [54, 434]], [[78, 208], [155, 208], [155, 280], [78, 280]], [[643, 205], [693, 205], [693, 272], [643, 272]], [[210, 88], [260, 86], [263, 164], [213, 166]], [[363, 48], [426, 45], [430, 115], [367, 118]]] +00234400.jpg [[[446, 421], [738, 421], [738, 438], [446, 438]], [[157, 421], [454, 421], [454, 438], [157, 438]], [[158, 394], [652, 394], [652, 411], [158, 411]], [[40, 391], [127, 391], [127, 412], [40, 412]], [[158, 342], [304, 345], [304, 363], [158, 360]], [[38, 344], [123, 344], [123, 362], [38, 362]], [[520, 295], [703, 295], [703, 314], [520, 314]], [[394, 292], [483, 290], [484, 314], [394, 317]], [[157, 293], [270, 293], [270, 313], [157, 313]], [[37, 293], [125, 293], [125, 313], [37, 313]], [[156, 243], [358, 243], [358, 267], [156, 267]], [[36, 243], [82, 243], [82, 269], [36, 269]], [[29, 152], [158, 152], [158, 175], [29, 175]], [[282, 98], [507, 98], [507, 111], [282, 111]], [[315, 46], [475, 50], [474, 88], [314, 85]], [[518, 51], [663, 53], [662, 67], [517, 65]], [[487, 19], [706, 17], [706, 43], [487, 45]]] +00234883.jpg [[[344, 145], [355, 145], [355, 153], [344, 153]], [[66, 125], [316, 120], [317, 190], [67, 195]], [[79, 138], [109, 141], [108, 152], [78, 148]], [[72, 120], [120, 120], [120, 130], [72, 130]], [[383, 63], [504, 62], [504, 74], [383, 75]], [[58, 29], [365, 26], [366, 112], [59, 115]], [[387, 28], [501, 26], [501, 45], [387, 47]]] +test_add_0.jpg [[[311, 521], [391, 521], [391, 534], [311, 534]], [[277, 500], [424, 500], [424, 514], [277, 514]], [[261, 446], [437, 446], [437, 459], [261, 459]], [[212, 428], [485, 428], [485, 441], [212, 441]], [[247, 388], [457, 388], [457, 409], [247, 409]], [[222, 328], [474, 328], [474, 372], [222, 372]], [[208, 207], [492, 211], [490, 277], [207, 272]], [[266, 164], [422, 166], [421, 197], [265, 195]], [[18, 20], [201, 18], [201, 43], [18, 45]]] +test_add_1.png [] +test_add_10.png [[[157, 124], [186, 124], [186, 172], [157, 172]], [[65, 117], [95, 117], [95, 168], [65, 168]], [[161, 106], [183, 106], [183, 127], [161, 127]], [[69, 100], [94, 100], [94, 128], [69, 128]], [[117, 46], [154, 45], [157, 174], [121, 175]], [[66, 34], [97, 34], [97, 112], [66, 112]]] +test_add_11.jpg [[[1525, 773], [1564, 756], [1575, 780], [1536, 798]], [[1390, 757], [1483, 757], [1483, 791], [1390, 791]], [[1013, 754], [1207, 754], [1207, 800], [1013, 800]], [[685, 755], [875, 755], [875, 796], [685, 796]], [[356, 753], [566, 747], [567, 793], [358, 798]], [[78, 751], [264, 745], [265, 793], [79, 798]], [[602, 647], [1152, 647], [1152, 703], [602, 703]], [[601, 564], [1148, 555], [1149, 611], [602, 620]], [[598, 480], [1066, 472], [1067, 526], [599, 535]], [[598, 393], [1090, 388], [1090, 439], [599, 
444]], [[603, 306], [1057, 306], [1057, 357], [603, 357]], [[357, 184], [1517, 184], [1517, 261], [357, 261]], [[60, 43], [257, 37], [259, 83], [61, 89]], [[1305, 41], [1492, 41], [1492, 87], [1305, 87]], [[973, 40], [1171, 34], [1172, 80], [974, 86]], [[670, 40], [862, 34], [864, 80], [671, 86]], [[363, 34], [558, 34], [558, 85], [363, 85]]] +test_add_12.jpg [[[11, 592], [136, 594], [135, 613], [11, 611]], [[109, 521], [907, 526], [907, 569], [109, 565]], [[635, 451], [902, 448], [903, 478], [635, 481]], [[112, 447], [466, 449], [466, 486], [112, 483]], [[582, 306], [680, 304], [681, 348], [583, 351]], [[369, 261], [565, 266], [563, 357], [367, 353]], [[64, 85], [853, 88], [853, 161], [64, 159]]] +test_add_13.jpg [[[68, 94], [117, 97], [116, 114], [67, 111]]] +test_add_14.jpg [[[30, 97], [235, 95], [236, 127], [31, 129]], [[30, 52], [239, 50], [239, 86], [30, 87]]] +test_add_15.jpg [[[141, 253], [353, 253], [353, 266], [141, 266]], [[205, 214], [406, 219], [406, 232], [204, 227]], [[106, 212], [193, 213], [193, 227], [106, 226]], [[154, 156], [286, 161], [286, 174], [154, 170]], [[148, 136], [305, 142], [305, 156], [147, 150]], [[108, 137], [144, 137], [144, 148], [108, 148]], [[108, 102], [275, 109], [275, 125], [107, 117]], [[107, 72], [245, 79], [245, 96], [106, 88]], [[107, 39], [209, 42], [209, 62], [106, 59]]] +test_add_16.jpg [[[398, 842], [408, 842], [408, 852], [398, 852]], [[382, 742], [746, 742], [746, 776], [382, 776]], [[362, 703], [468, 703], [468, 725], [362, 725]], [[1552, 701], [1576, 701], [1576, 746], [1552, 746]], [[1256, 695], [1442, 695], [1442, 721], [1256, 721]], [[1244, 661], [1448, 661], [1448, 687], [1244, 687]], [[386, 645], [668, 645], [668, 679], [386, 679]], [[1228, 625], [1470, 623], [1470, 651], [1228, 653]], [[360, 604], [580, 604], [580, 629], [360, 629]], [[1202, 592], [1494, 592], [1494, 617], [1202, 617]], [[1166, 556], [1530, 556], [1530, 582], [1166, 582]], [[380, 552], [638, 552], [638, 586], [380, 586]], [[356, 502], [516, 502], [516, 536], [356, 536]], [[774, 260], [1124, 260], [1124, 300], [774, 300]], [[374, 210], [504, 210], [504, 300], [374, 300]], [[776, 212], [1088, 217], [1088, 252], [776, 248]]] +test_add_17.jpg [[[321, 255], [393, 258], [392, 271], [320, 269]], [[307, 222], [411, 228], [411, 241], [306, 236]], [[96, 136], [385, 143], [384, 208], [94, 201]], [[72, 95], [399, 103], [398, 124], [71, 117]], [[68, 76], [224, 79], [223, 93], [67, 90]], [[66, 59], [226, 62], [225, 76], [65, 74]]] +test_add_18.jpg [[[466, 788], [715, 790], [715, 813], [466, 811]], [[553, 752], [665, 757], [663, 791], [552, 786]], [[119, 539], [189, 539], [189, 570], [119, 570]], [[116, 473], [674, 486], [673, 528], [115, 516]], [[121, 429], [669, 441], [668, 470], [121, 457]], [[121, 375], [674, 381], [674, 410], [121, 404]], [[556, 262], [675, 264], [675, 278], [556, 277]], [[164, 259], [334, 259], [334, 273], [164, 273]], [[344, 195], [456, 197], [455, 220], [343, 217]], [[309, 175], [490, 175], [490, 190], [309, 190]], [[255, 128], [537, 131], [537, 169], [254, 165]], [[347, 92], [486, 94], [486, 109], [346, 107]], [[285, 41], [567, 49], [566, 82], [284, 74]], [[236, 32], [266, 32], [266, 60], [236, 60]]] +test_add_19.jpg [[[24, 294], [42, 294], [42, 302], [24, 302]], [[64, 293], [105, 293], [105, 303], [64, 303]], [[145, 287], [163, 287], [163, 304], [145, 304]], [[63, 280], [106, 280], [106, 290], [63, 290]], [[9, 281], [26, 281], [26, 288], [9, 288]], [[220, 279], [245, 279], [245, 291], [220, 291]], [[177, 279], [208, 279], [208, 290], [177, 290]], [[23, 
279], [51, 279], [51, 290], [23, 290]], [[145, 278], [162, 278], [162, 292], [145, 292]], [[8, 267], [18, 267], [18, 276], [8, 276]], [[221, 265], [243, 265], [243, 277], [221, 277]], [[24, 265], [47, 265], [47, 277], [24, 277]], [[142, 263], [163, 263], [163, 279], [142, 279]], [[218, 252], [249, 252], [249, 265], [218, 265]], [[65, 253], [131, 253], [131, 263], [65, 263]], [[24, 252], [43, 252], [43, 264], [24, 264]], [[8, 253], [18, 253], [18, 262], [8, 262]], [[8, 240], [17, 240], [17, 249], [8, 249]], [[63, 237], [114, 237], [114, 251], [63, 251]], [[25, 236], [47, 239], [45, 251], [23, 249]], [[144, 234], [166, 237], [163, 253], [142, 249]], [[494, 226], [531, 226], [531, 239], [494, 239]], [[335, 226], [354, 226], [354, 237], [335, 237]], [[288, 226], [314, 226], [314, 237], [288, 237]], [[63, 226], [113, 226], [113, 236], [63, 236]], [[7, 227], [17, 227], [17, 234], [7, 234]], [[221, 225], [248, 225], [248, 235], [221, 235]], [[143, 225], [165, 222], [167, 234], [145, 237]], [[24, 224], [48, 224], [48, 238], [24, 238]], [[495, 213], [524, 213], [524, 224], [495, 224]], [[420, 212], [437, 212], [437, 225], [420, 225]], [[336, 212], [398, 212], [398, 223], [336, 223]], [[292, 212], [320, 212], [320, 223], [292, 223]], [[222, 212], [249, 212], [249, 223], [222, 223]], [[145, 212], [166, 212], [166, 223], [145, 223]], [[61, 211], [113, 209], [114, 222], [62, 224]], [[26, 211], [48, 211], [48, 223], [26, 223]], [[337, 199], [383, 199], [383, 209], [337, 209]], [[65, 200], [87, 200], [87, 207], [65, 207]], [[493, 197], [541, 197], [541, 211], [493, 211]], [[445, 202], [455, 196], [462, 206], [452, 212]], [[178, 198], [205, 198], [205, 208], [178, 208]], [[146, 199], [157, 199], [157, 208], [146, 208]], [[32, 194], [43, 204], [33, 214], [22, 203]], [[422, 193], [440, 201], [432, 215], [415, 207]], [[65, 186], [132, 186], [132, 196], [65, 196]], [[337, 185], [399, 185], [399, 196], [337, 196]], [[445, 190], [456, 182], [465, 191], [454, 200]], [[292, 188], [308, 182], [313, 193], [297, 200]], [[220, 183], [255, 183], [255, 197], [220, 197]], [[142, 184], [158, 184], [158, 197], [142, 197]], [[493, 182], [518, 182], [518, 197], [493, 197]], [[425, 180], [437, 191], [427, 202], [414, 190]], [[32, 179], [42, 189], [32, 199], [22, 189]], [[182, 179], [195, 185], [188, 198], [175, 192]], [[335, 172], [400, 169], [400, 183], [336, 185]], [[492, 170], [519, 170], [519, 185], [492, 185]], [[412, 177], [428, 164], [440, 178], [425, 190]], [[293, 171], [315, 171], [315, 185], [293, 185]], [[220, 170], [251, 170], [251, 184], [220, 184]], [[178, 172], [188, 172], [188, 183], [178, 183]], [[64, 172], [125, 170], [125, 181], [64, 182]], [[454, 168], [464, 176], [454, 185], [445, 176]], [[142, 172], [159, 168], [163, 180], [145, 185]], [[30, 165], [43, 174], [34, 186], [20, 177]], [[493, 160], [523, 160], [523, 170], [493, 170]], [[402, 161], [435, 161], [435, 168], [402, 168]], [[335, 159], [401, 159], [401, 169], [335, 169]], [[296, 159], [325, 159], [325, 170], [296, 170]], [[221, 158], [251, 158], [251, 169], [221, 169]], [[174, 161], [183, 156], [190, 167], [181, 172]], [[145, 158], [162, 158], [162, 170], [145, 170]], [[61, 158], [125, 157], [125, 168], [62, 169]], [[20, 161], [33, 154], [40, 167], [28, 174]], [[492, 143], [542, 143], [542, 157], [492, 157]], [[450, 144], [479, 144], [479, 157], [450, 157]], [[335, 143], [439, 143], [439, 156], [335, 156]], [[294, 143], [327, 143], [327, 157], [294, 157]], [[220, 143], [253, 143], [253, 157], [220, 157]], [[178, 145], [187, 145], [187, 156], [178, 
156]], [[63, 144], [104, 144], [104, 155], [63, 155]], [[144, 140], [164, 145], [160, 159], [141, 154]], [[31, 137], [44, 149], [31, 162], [17, 149]], [[286, 135], [291, 135], [291, 140], [286, 140]], [[177, 133], [193, 133], [193, 144], [177, 144]], [[336, 132], [388, 132], [388, 141], [336, 141]], [[492, 131], [525, 131], [525, 141], [492, 141]], [[450, 131], [477, 131], [477, 141], [450, 141]], [[292, 131], [321, 131], [321, 141], [292, 141]], [[218, 132], [255, 130], [256, 141], [219, 144]], [[63, 131], [95, 131], [95, 141], [63, 141]], [[417, 130], [437, 130], [437, 141], [417, 141]], [[145, 130], [159, 130], [159, 143], [145, 143]], [[30, 124], [43, 133], [32, 147], [19, 138]], [[493, 118], [535, 118], [535, 129], [493, 129]], [[336, 118], [388, 118], [388, 129], [336, 129]], [[218, 118], [255, 118], [255, 128], [218, 128]], [[451, 117], [478, 117], [478, 129], [451, 129]], [[418, 117], [438, 117], [438, 130], [418, 130]], [[177, 116], [209, 116], [209, 130], [177, 130]], [[145, 117], [162, 117], [162, 130], [145, 130]], [[62, 116], [88, 116], [88, 131], [62, 131]], [[19, 121], [33, 111], [43, 124], [29, 134]], [[491, 107], [523, 107], [523, 113], [491, 113]], [[449, 107], [477, 107], [477, 113], [449, 113]], [[420, 107], [436, 107], [436, 113], [420, 113]], [[295, 107], [319, 107], [319, 114], [295, 114]], [[220, 107], [242, 107], [242, 113], [220, 113]], [[176, 107], [203, 107], [203, 113], [176, 113]], [[145, 107], [161, 107], [161, 114], [145, 114]], [[334, 105], [372, 105], [372, 114], [334, 114]], [[63, 106], [86, 106], [86, 113], [63, 113]], [[483, 89], [522, 89], [522, 99], [483, 99]], [[331, 88], [380, 88], [380, 99], [331, 99]], [[276, 88], [325, 88], [325, 99], [276, 99]], [[214, 88], [246, 88], [246, 99], [214, 99]], [[411, 86], [474, 86], [474, 100], [411, 100]], [[6, 86], [102, 86], [102, 100], [6, 100]], [[415, 66], [461, 66], [461, 77], [415, 77]], [[288, 66], [333, 66], [333, 77], [288, 77]], [[157, 64], [206, 64], [206, 78], [157, 78]], [[416, 48], [523, 49], [523, 63], [415, 62]], [[288, 49], [375, 49], [375, 63], [288, 63]], [[159, 49], [269, 49], [269, 62], [159, 62]], [[24, 53], [36, 46], [45, 59], [33, 67]], [[416, 36], [481, 36], [481, 46], [416, 46]], [[25, 38], [39, 32], [46, 46], [33, 52]], [[157, 34], [205, 34], [205, 47], [157, 47]], [[412, 4], [527, 4], [527, 17], [412, 17]], [[146, 4], [345, 2], [345, 15], [146, 17]]] +test_add_20.jpg [[[31, 346], [605, 346], [605, 370], [31, 370]], [[217, 294], [510, 294], [510, 322], [217, 322]], [[473, 271], [525, 271], [525, 286], [473, 286]], [[220, 267], [287, 267], [287, 286], [220, 286]], [[219, 239], [484, 239], [484, 263], [219, 263]], [[221, 217], [303, 217], [303, 234], [221, 234]], [[402, 192], [417, 192], [417, 205], [402, 205]], [[222, 187], [341, 187], [341, 207], [222, 207]], [[221, 162], [287, 162], [287, 180], [221, 180]], [[375, 122], [475, 124], [475, 146], [375, 143]], [[222, 124], [356, 122], [356, 143], [222, 146]], [[218, 81], [352, 84], [352, 116], [218, 113]], [[440, 35], [605, 35], [605, 60], [440, 60]], [[72, 16], [398, 16], [398, 44], [72, 44]]] +test_add_3.jpg [[[169, 327], [337, 326], [337, 341], [169, 342]], [[170, 288], [307, 290], [307, 312], [170, 310]], [[171, 221], [323, 221], [323, 234], [171, 234]], [[340, 221], [449, 217], [449, 231], [341, 234]], [[169, 201], [372, 201], [372, 214], [169, 214]], [[170, 183], [418, 183], [418, 196], [170, 196]], [[170, 149], [416, 149], [416, 163], [170, 163]], [[171, 119], [418, 119], [418, 140], [171, 140]], [[326, 64], [478, 64], [478, 91], 
[326, 91]], [[173, 64], [306, 60], [306, 89], [174, 93]]] +test_add_4.png [] +test_add_5.png [[[48, 164], [108, 164], [108, 174], [48, 174]], [[52, 121], [169, 121], [169, 134], [52, 134]], [[50, 102], [165, 102], [165, 118], [50, 118]], [[52, 83], [164, 83], [164, 100], [52, 100]], [[51, 68], [166, 68], [166, 84], [51, 84]], [[51, 50], [145, 47], [145, 64], [52, 67]]] +test_add_6.jpg [[[123, 223], [219, 227], [218, 251], [122, 247]], [[172, 172], [186, 186], [172, 200], [158, 186]]] +test_add_7.jpg [[[48, 938], [174, 936], [174, 962], [48, 964]], [[227, 873], [629, 876], [628, 953], [226, 949]], [[56, 745], [638, 745], [638, 790], [56, 790]], [[150, 674], [545, 678], [544, 721], [150, 718]], [[73, 504], [633, 504], [633, 601], [73, 601]], [[59, 270], [655, 279], [652, 441], [56, 432]], [[513, 193], [553, 193], [553, 223], [513, 223]], [[61, 175], [532, 175], [532, 239], [61, 239]], [[533, 178], [642, 178], [642, 236], [533, 236]]] +test_add_8.jpg [[[251, 586], [454, 580], [454, 606], [252, 613]], [[107, 533], [457, 527], [457, 560], [108, 566]], [[336, 494], [384, 494], [384, 507], [336, 507]], [[27, 307], [355, 297], [356, 320], [28, 330]], [[22, 259], [445, 251], [445, 274], [23, 282]], [[78, 209], [445, 205], [445, 225], [78, 229]], [[160, 23], [319, 30], [317, 79], [158, 72]]] +test_add_9.png [[[266, 687], [486, 687], [486, 696], [266, 696]], [[196, 668], [554, 668], [554, 681], [196, 681]], [[154, 596], [597, 596], [597, 606], [154, 606]], [[215, 578], [541, 578], [541, 588], [215, 588]], [[134, 560], [615, 560], [615, 570], [134, 570]], [[85, 543], [665, 543], [665, 553], [85, 553]], [[96, 522], [653, 522], [653, 535], [96, 535]], [[362, 449], [389, 449], [389, 460], [362, 460]], [[238, 376], [513, 376], [513, 389], [238, 389]], [[177, 356], [574, 356], [574, 368], [177, 368]], [[344, 281], [408, 283], [407, 297], [343, 294]], [[257, 205], [493, 205], [493, 219], [257, 219]]] diff --git a/PTDN/results/python_ppocr_det_mobile_results_fp32.txt b/PTDN/results/python_ppocr_det_mobile_results_fp32.txt new file mode 100644 index 0000000000000000000000000000000000000000..e370b491bf0b45c896f00226e504d407ab156c2e --- /dev/null +++ b/PTDN/results/python_ppocr_det_mobile_results_fp32.txt @@ -0,0 +1,49 @@ +00008790.jpg [[[209, 406], [280, 406], [280, 419], [209, 419]], [[60, 398], [105, 398], [105, 411], [60, 411]], [[198, 389], [291, 389], [291, 402], [198, 402]], [[162, 391], [173, 391], [173, 401], [162, 401]], [[35, 380], [133, 380], [133, 393], [35, 393]], [[199, 371], [292, 371], [292, 384], [199, 384]], [[218, 310], [272, 310], [272, 324], [218, 324]], [[162, 305], [172, 305], [172, 314], [162, 314]], [[371, 302], [436, 302], [436, 316], [371, 316]], [[31, 302], [134, 301], [134, 315], [31, 316]], [[223, 292], [269, 292], [269, 306], [223, 306]], [[60, 225], [104, 225], [104, 236], [60, 236]], [[218, 223], [272, 223], [272, 237], [218, 237]], [[162, 219], [173, 219], [173, 227], [162, 227]], [[33, 207], [131, 207], [131, 220], [33, 220]], [[223, 206], [269, 206], [269, 220], [223, 220]], [[74, 146], [383, 146], [383, 159], [74, 159]], [[54, 120], [117, 120], [117, 134], [54, 134]], [[74, 51], [296, 51], [296, 65], [74, 65]], [[56, 17], [116, 17], [116, 31], [56, 31]]] +00018946.jpg [[[441, 328], [474, 328], [474, 339], [441, 339]], [[86, 284], [141, 286], [140, 307], [85, 305]], [[302, 279], [377, 279], [377, 297], [302, 297]], [[197, 265], [281, 274], [279, 293], [195, 284]], [[198, 197], [452, 219], [450, 242], [196, 220]], [[343, 182], [376, 182], [376, 192], [343, 192]], [[199, 164], 
[340, 171], [339, 192], [198, 185]], [[177, 101], [415, 118], [413, 145], [175, 128]]] +00034387.jpg [[[265, 460], [740, 460], [740, 484], [265, 484]], [[348, 417], [420, 417], [420, 443], [348, 443]], [[545, 418], [568, 418], [568, 442], [545, 442]], [[685, 417], [710, 417], [710, 443], [685, 443]], [[175, 415], [226, 415], [226, 443], [175, 443]], [[874, 414], [908, 414], [908, 446], [874, 446]], [[56, 417], [74, 417], [74, 442], [56, 442]], [[856, 373], [925, 373], [925, 400], [856, 400]], [[348, 372], [418, 372], [418, 397], [348, 397]], [[674, 372], [723, 372], [723, 401], [674, 401]], [[539, 373], [570, 373], [570, 400], [539, 400]], [[151, 365], [228, 369], [226, 402], [149, 398]], [[56, 372], [74, 372], [74, 397], [56, 397]], [[857, 329], [925, 329], [925, 355], [857, 355]], [[351, 330], [419, 330], [419, 356], [351, 356]], [[674, 328], [723, 328], [723, 356], [674, 356]], [[541, 329], [570, 329], [570, 357], [541, 357]], [[171, 327], [227, 324], [229, 355], [173, 358]], [[57, 330], [74, 330], [74, 356], [57, 356]], [[298, 327], [316, 327], [316, 334], [298, 334]], [[855, 286], [925, 286], [925, 312], [855, 312]], [[674, 286], [723, 286], [723, 313], [674, 313]], [[346, 286], [426, 283], [427, 313], [347, 316]], [[540, 285], [569, 285], [569, 312], [540, 312]], [[172, 282], [226, 282], [226, 313], [172, 313]], [[56, 287], [73, 287], [73, 312], [56, 312]], [[857, 242], [925, 242], [925, 268], [857, 268]], [[348, 242], [460, 242], [460, 268], [348, 268]], [[156, 242], [227, 242], [227, 269], [156, 269]], [[674, 241], [724, 241], [724, 269], [674, 269]], [[531, 241], [572, 241], [572, 270], [531, 270]], [[56, 242], [74, 242], [74, 268], [56, 268]], [[855, 197], [925, 200], [924, 226], [854, 224]], [[674, 198], [726, 198], [726, 226], [674, 226]], [[344, 200], [434, 195], [436, 223], [346, 228]], [[176, 197], [227, 197], [227, 225], [176, 225]], [[56, 200], [73, 200], [73, 226], [56, 226]], [[527, 194], [576, 194], [576, 226], [527, 226]], [[349, 155], [419, 155], [419, 181], [349, 181]], [[854, 154], [925, 154], [925, 180], [854, 180]], [[176, 154], [226, 154], [226, 183], [176, 183]], [[670, 153], [723, 153], [723, 181], [670, 181]], [[538, 154], [571, 154], [571, 182], [538, 182]], [[56, 156], [74, 156], [74, 182], [56, 182]], [[349, 111], [419, 111], [419, 137], [349, 137]], [[174, 111], [227, 111], [227, 139], [174, 139]], [[546, 113], [564, 113], [564, 137], [546, 137]], [[52, 112], [75, 112], [75, 139], [52, 139]], [[639, 108], [727, 105], [728, 138], [640, 141]], [[817, 103], [927, 110], [925, 139], [815, 132]], [[814, 68], [951, 68], [951, 92], [814, 92]], [[307, 66], [446, 68], [446, 93], [306, 90]], [[673, 67], [723, 67], [723, 93], [673, 93]], [[175, 65], [228, 68], [226, 95], [174, 92]], [[39, 65], [90, 68], [88, 97], [37, 94]], [[528, 65], [580, 65], [580, 94], [528, 94]], [[334, 20], [670, 20], [670, 43], [334, 43]]] +00037951.jpg [[[434, 976], [551, 978], [550, 993], [434, 991]], [[433, 932], [553, 932], [553, 969], [433, 969]], [[30, 522], [98, 522], [98, 545], [30, 545]], [[31, 443], [145, 443], [145, 464], [31, 464]], [[234, 335], [326, 332], [327, 354], [235, 356]], [[124, 252], [436, 252], [436, 284], [124, 284]], [[182, 206], [378, 206], [378, 227], [182, 227]], [[258, 106], [320, 123], [304, 181], [242, 163]], [[28, 65], [33, 65], [33, 71], [28, 71]], [[37, 58], [147, 58], [147, 80], [37, 80]]] +00044782.jpg [[[104, 218], [115, 218], [115, 227], [104, 227]], [[223, 216], [246, 216], [246, 228], [223, 228]], [[163, 216], [182, 216], [182, 229], [163, 229]], [[124, 
191], [164, 191], [164, 202], [124, 202]], [[91, 84], [251, 84], [251, 98], [91, 98]], [[73, 63], [278, 63], [278, 78], [73, 78]], [[104, 15], [243, 15], [243, 44], [104, 44]]] +00067516.jpg [[[141, 808], [594, 809], [594, 822], [141, 821]], [[49, 784], [695, 784], [695, 798], [49, 798]], [[579, 751], [667, 751], [667, 764], [579, 764]], [[355, 750], [395, 750], [395, 767], [355, 767]], [[221, 751], [260, 751], [260, 765], [221, 765]], [[477, 750], [501, 750], [501, 768], [477, 768]], [[69, 748], [133, 751], [132, 765], [68, 761]], [[576, 682], [668, 682], [668, 699], [576, 699]], [[476, 682], [518, 682], [518, 700], [476, 700]], [[354, 682], [395, 682], [395, 700], [354, 700]], [[69, 681], [133, 684], [132, 699], [68, 695]], [[220, 679], [243, 682], [241, 700], [218, 697]], [[577, 615], [667, 615], [667, 632], [577, 632]], [[68, 612], [134, 615], [133, 632], [67, 629]], [[476, 614], [500, 614], [500, 633], [476, 633]], [[354, 613], [378, 613], [378, 634], [354, 634]], [[219, 612], [245, 612], [245, 633], [219, 633]], [[578, 547], [667, 547], [667, 564], [578, 564]], [[476, 546], [518, 546], [518, 565], [476, 565]], [[353, 545], [379, 545], [379, 566], [353, 566]], [[219, 545], [245, 545], [245, 566], [219, 566]], [[68, 542], [133, 546], [132, 563], [67, 560]], [[68, 478], [133, 482], [132, 499], [67, 496]], [[586, 481], [664, 481], [664, 497], [586, 497]], [[476, 480], [518, 480], [518, 498], [476, 498]], [[354, 480], [395, 480], [395, 498], [354, 498]], [[219, 479], [245, 479], [245, 500], [219, 500]], [[580, 425], [665, 429], [664, 449], [580, 446]], [[346, 429], [410, 429], [410, 447], [346, 447]], [[68, 426], [150, 429], [149, 449], [67, 447]], [[474, 427], [515, 427], [515, 449], [474, 449]], [[218, 427], [259, 427], [259, 449], [218, 449]], [[283, 398], [478, 399], [478, 419], [283, 418]], [[86, 318], [664, 318], [664, 332], [86, 332]], [[65, 279], [665, 279], [665, 292], [65, 292]], [[458, 210], [584, 210], [584, 224], [458, 224]], [[312, 209], [371, 209], [371, 226], [312, 226]], [[164, 209], [225, 209], [225, 226], [164, 226]], [[505, 151], [539, 151], [539, 166], [505, 166]], [[266, 48], [483, 48], [483, 68], [266, 68]]] +00088568.jpg [[[341, 446], [371, 446], [371, 453], [341, 453]], [[58, 445], [117, 445], [117, 455], [58, 455]], [[552, 433], [571, 433], [571, 440], [552, 440]], [[583, 431], [740, 431], [740, 442], [583, 442]], [[311, 415], [743, 415], [743, 428], [311, 428]], [[310, 376], [735, 376], [735, 389], [310, 389]], [[425, 340], [551, 340], [551, 350], [425, 350]], [[287, 324], [294, 332], [289, 337], [281, 330]], [[276, 294], [348, 296], [347, 311], [276, 309]], [[54, 288], [210, 288], [210, 301], [54, 301]], [[275, 265], [421, 265], [421, 278], [275, 278]], [[56, 264], [248, 264], [248, 277], [56, 277]], [[671, 248], [695, 248], [695, 261], [671, 261]], [[602, 248], [628, 248], [628, 261], [602, 261]], [[533, 248], [557, 248], [557, 261], [533, 261]], [[463, 248], [487, 248], [487, 261], [463, 261]], [[278, 248], [309, 248], [309, 260], [278, 260]], [[55, 240], [142, 240], [142, 254], [55, 254]], [[277, 231], [398, 231], [398, 244], [277, 244]], [[741, 228], [749, 237], [742, 245], [733, 236]], [[665, 230], [700, 230], [700, 244], [665, 244]], [[598, 230], [631, 230], [631, 244], [598, 244]], [[528, 230], [562, 230], [562, 244], [528, 244]], [[459, 230], [492, 230], [492, 244], [459, 244]], [[54, 215], [211, 217], [211, 231], [54, 229]], [[739, 211], [749, 221], [740, 229], [731, 220]], [[663, 214], [704, 214], [704, 228], [663, 228]], [[595, 215], [637, 215], 
[637, 226], [595, 226]], [[524, 215], [569, 215], [569, 226], [524, 226]], [[454, 215], [495, 215], [495, 226], [454, 226]], [[279, 215], [351, 215], [351, 226], [279, 226]], [[736, 199], [747, 199], [747, 208], [736, 208]], [[668, 197], [700, 197], [700, 208], [668, 208]], [[599, 196], [633, 196], [633, 210], [599, 210]], [[529, 197], [562, 197], [562, 208], [529, 208]], [[461, 197], [491, 197], [491, 208], [461, 208]], [[277, 195], [417, 196], [417, 211], [277, 209]], [[55, 192], [239, 192], [239, 205], [55, 205]], [[665, 181], [703, 181], [703, 192], [665, 192]], [[279, 180], [351, 181], [350, 192], [279, 191]], [[734, 180], [747, 180], [747, 193], [734, 193]], [[597, 180], [634, 180], [634, 191], [597, 191]], [[525, 179], [566, 179], [566, 193], [525, 193]], [[458, 180], [493, 180], [493, 191], [458, 191]], [[55, 170], [142, 170], [142, 184], [55, 184]], [[735, 165], [747, 165], [747, 175], [735, 175]], [[665, 163], [703, 163], [703, 175], [665, 175]], [[598, 163], [634, 163], [634, 175], [598, 175]], [[527, 163], [565, 163], [565, 175], [527, 175]], [[458, 163], [492, 163], [492, 175], [458, 175]], [[279, 162], [398, 162], [398, 176], [279, 176]], [[54, 146], [148, 146], [148, 159], [54, 159]], [[453, 147], [495, 147], [495, 158], [453, 158]], [[731, 143], [748, 146], [745, 161], [728, 158]], [[663, 145], [704, 145], [704, 159], [663, 159]], [[596, 146], [635, 146], [635, 157], [596, 157]], [[522, 145], [566, 142], [567, 157], [523, 159]], [[277, 144], [310, 144], [310, 158], [277, 158]], [[276, 121], [428, 121], [428, 139], [276, 139]], [[52, 120], [232, 121], [232, 139], [52, 138]], [[404, 91], [701, 91], [701, 106], [404, 106]], [[48, 79], [280, 79], [280, 97], [48, 97]], [[325, 69], [744, 70], [744, 84], [325, 83]], [[668, 48], [743, 48], [743, 63], [668, 63]], [[297, 48], [433, 48], [433, 62], [297, 62]]] +00091741.jpg [[[47, 336], [83, 336], [83, 358], [47, 358]], [[99, 211], [257, 211], [257, 230], [99, 230]], [[103, 190], [257, 191], [257, 205], [103, 204]], [[89, 101], [266, 99], [267, 181], [90, 184]], [[94, 48], [262, 55], [260, 114], [91, 107]], [[91, 12], [257, 14], [257, 37], [90, 35]]] +00105313.jpg [[[291, 262], [406, 262], [406, 275], [291, 275]], [[153, 262], [264, 262], [264, 274], [153, 274]], [[11, 258], [73, 261], [72, 274], [11, 272]], [[33, 231], [132, 231], [132, 244], [33, 244]], [[35, 217], [216, 217], [216, 227], [35, 227]], [[32, 200], [145, 200], [145, 213], [32, 213]], [[32, 183], [215, 184], [215, 197], [32, 196]], [[35, 170], [105, 170], [105, 181], [35, 181]], [[35, 155], [124, 155], [124, 164], [35, 164]], [[34, 137], [142, 138], [142, 149], [34, 148]], [[35, 123], [176, 123], [176, 133], [35, 133]], [[33, 106], [176, 106], [176, 119], [33, 119]], [[34, 92], [102, 92], [102, 102], [34, 102]], [[34, 77], [119, 77], [119, 87], [34, 87]], [[32, 60], [120, 60], [120, 73], [32, 73]], [[35, 46], [119, 46], [119, 55], [35, 55]], [[32, 29], [142, 29], [142, 42], [32, 42]], [[25, 12], [147, 12], [147, 24], [25, 24]]] +00134770.jpg [[[388, 645], [456, 645], [456, 655], [388, 655]], [[407, 620], [484, 619], [485, 633], [408, 634]], [[112, 534], [270, 531], [270, 549], [113, 551]], [[111, 502], [443, 497], [443, 514], [112, 519]], [[111, 471], [443, 467], [443, 484], [112, 488]], [[111, 439], [444, 434], [444, 451], [112, 457]], [[111, 409], [442, 405], [442, 421], [112, 425]], [[153, 376], [441, 373], [441, 390], [153, 394]], [[184, 338], [369, 336], [369, 356], [185, 358]], [[75, 98], [515, 104], [513, 218], [74, 212]]] +00145943.jpg [[[394, 248], [746, 279], 
[731, 449], [379, 418]], [[90, 92], [300, 92], [300, 119], [90, 119]], [[46, 41], [326, 39], [326, 75], [46, 77]]] +00147605.jpg [[[804, 615], [874, 616], [874, 627], [804, 626]], [[516, 607], [784, 605], [784, 628], [516, 629]], [[118, 522], [224, 522], [224, 560], [118, 560]], [[253, 524], [307, 524], [307, 557], [253, 557]], [[715, 501], [900, 505], [900, 538], [714, 534]], [[255, 502], [295, 502], [295, 517], [255, 517]], [[347, 481], [473, 481], [473, 515], [347, 515]], [[252, 484], [295, 484], [295, 499], [252, 499]], [[350, 456], [447, 456], [447, 470], [350, 470]], [[145, 444], [201, 444], [201, 467], [145, 467]], [[728, 371], [878, 371], [878, 420], [728, 420]], [[528, 369], [681, 369], [681, 418], [528, 418]], [[142, 368], [486, 368], [486, 419], [142, 419]], [[744, 315], [871, 315], [871, 336], [744, 336]], [[799, 157], [886, 154], [887, 188], [800, 191]], [[274, 142], [455, 142], [455, 160], [274, 160]], [[738, 116], [894, 119], [893, 157], [737, 153]], [[108, 112], [204, 112], [204, 130], [108, 130]], [[270, 92], [463, 96], [462, 132], [270, 129]]] +00150341.jpg [[[100, 645], [298, 645], [298, 662], [100, 662]], [[115, 617], [288, 617], [288, 631], [115, 631]], [[84, 593], [319, 592], [319, 609], [84, 610]], [[31, 565], [313, 562], [314, 580], [31, 582]], [[444, 560], [461, 560], [461, 569], [444, 569]], [[390, 557], [446, 557], [446, 572], [390, 572]], [[31, 515], [168, 515], [168, 529], [31, 529]], [[33, 490], [110, 490], [110, 504], [33, 504]], [[358, 459], [464, 463], [463, 485], [357, 481]], [[28, 459], [268, 460], [268, 481], [28, 480]], [[339, 439], [421, 444], [421, 460], [338, 455]], [[64, 439], [143, 439], [143, 453], [64, 453]], [[207, 416], [292, 416], [292, 434], [207, 434]], [[319, 408], [441, 413], [440, 438], [318, 433]], [[44, 405], [175, 409], [174, 434], [43, 430]], [[31, 383], [137, 383], [137, 404], [31, 404]]] +00150669.jpg [[[649, 700], [681, 700], [681, 716], [649, 716]], [[517, 685], [549, 685], [549, 720], [517, 720]], [[651, 688], [678, 688], [678, 701], [651, 701]], [[862, 687], [876, 687], [876, 695], [862, 695]], [[922, 675], [938, 675], [938, 685], [922, 685]], [[785, 671], [807, 671], [807, 687], [785, 687]], [[592, 672], [606, 672], [606, 686], [592, 686]], [[722, 679], [732, 669], [742, 678], [731, 688]], [[651, 680], [667, 665], [681, 679], [666, 695]], [[273, 667], [422, 667], [422, 688], [273, 688]], [[136, 666], [203, 666], [203, 688], [136, 688]], [[46, 666], [109, 666], [109, 687], [46, 687]], [[782, 629], [810, 629], [810, 661], [782, 661]], [[645, 627], [685, 627], [685, 665], [645, 665]], [[516, 628], [548, 628], [548, 664], [516, 664]], [[655, 619], [672, 619], [672, 627], [655, 627]], [[598, 617], [605, 624], [599, 629], [592, 622]], [[523, 619], [540, 619], [540, 627], [523, 627]], [[858, 618], [868, 618], [868, 627], [858, 627]], [[727, 618], [735, 618], [735, 627], [727, 627]], [[918, 621], [932, 611], [942, 624], [928, 634]], [[786, 616], [805, 616], [805, 629], [786, 629]], [[373, 604], [420, 604], [420, 619], [373, 619]], [[85, 603], [215, 605], [214, 621], [84, 619]], [[48, 603], [71, 603], [71, 622], [48, 622]], [[788, 561], [806, 561], [806, 572], [788, 572]], [[923, 560], [935, 560], [935, 574], [923, 574]], [[856, 560], [869, 560], [869, 574], [856, 574]], [[62, 553], [409, 553], [409, 567], [62, 567]], [[63, 532], [116, 535], [115, 545], [62, 543]], [[859, 527], [868, 527], [868, 539], [859, 539]], [[925, 526], [934, 526], [934, 540], [925, 540]], [[794, 520], [807, 529], [798, 542], [785, 533]], [[526, 526], [535, 526], 
[535, 536], [526, 536]], [[262, 513], [395, 513], [395, 526], [262, 526]], [[122, 514], [245, 514], [245, 524], [122, 524]], [[49, 514], [119, 514], [119, 525], [49, 525]], [[755, 492], [828, 492], [828, 507], [755, 507]], [[638, 492], [710, 492], [710, 507], [638, 507]], [[519, 492], [592, 492], [592, 507], [519, 507]], [[85, 450], [123, 450], [123, 461], [85, 461]], [[220, 450], [236, 447], [238, 459], [223, 462]], [[682, 445], [867, 445], [867, 459], [682, 459]], [[562, 445], [666, 445], [666, 459], [562, 459]], [[491, 446], [544, 446], [544, 458], [491, 458]], [[183, 437], [208, 437], [208, 459], [183, 459]], [[52, 431], [72, 438], [64, 462], [44, 455]], [[224, 432], [276, 432], [276, 443], [224, 443]], [[88, 432], [144, 432], [144, 443], [88, 443]], [[506, 383], [616, 382], [616, 397], [506, 398]], [[702, 381], [758, 381], [758, 399], [702, 399]], [[308, 373], [364, 373], [364, 384], [308, 384]], [[91, 372], [166, 372], [166, 383], [91, 383]], [[688, 335], [820, 335], [820, 350], [688, 350]], [[498, 335], [657, 335], [657, 350], [498, 350]], [[208, 316], [244, 316], [244, 331], [208, 331]], [[499, 289], [641, 289], [641, 302], [499, 302]], [[671, 287], [801, 287], [801, 301], [671, 301]], [[670, 241], [816, 241], [816, 255], [670, 255]], [[497, 241], [643, 241], [643, 255], [497, 255]], [[669, 193], [814, 193], [814, 207], [669, 207]], [[498, 193], [643, 193], [643, 207], [498, 207]], [[670, 145], [815, 145], [815, 160], [670, 160]], [[499, 145], [645, 145], [645, 160], [499, 160]], [[489, 103], [546, 103], [546, 120], [489, 120]], [[56, 89], [95, 89], [95, 97], [56, 97]], [[845, 26], [887, 20], [889, 39], [848, 44]], [[26, 20], [700, 20], [700, 37], [26, 37]], [[898, 11], [996, 16], [995, 45], [896, 40]]] +00152568.jpg [[[3, 252], [284, 254], [284, 280], [3, 278]], [[196, 233], [254, 233], [254, 240], [196, 240]], [[49, 229], [90, 229], [90, 240], [49, 240]], [[200, 159], [281, 165], [276, 229], [195, 222]]] +00155628.jpg [[[149, 901], [503, 903], [503, 922], [149, 920]], [[520, 893], [561, 896], [560, 911], [519, 908]], [[61, 885], [81, 885], [81, 894], [61, 894]], [[150, 878], [503, 882], [503, 900], [149, 896]], [[524, 834], [640, 839], [639, 856], [524, 852]], [[70, 834], [185, 835], [185, 853], [69, 852]], [[246, 555], [466, 555], [466, 569], [246, 569]], [[308, 507], [403, 509], [403, 524], [308, 522]], [[244, 482], [459, 484], [459, 502], [244, 500]], [[252, 422], [459, 424], [458, 452], [251, 450]], [[195, 378], [517, 380], [516, 408], [195, 406]], [[474, 194], [624, 196], [624, 210], [473, 208]], [[73, 129], [641, 131], [641, 160], [73, 158]], [[483, 41], [597, 37], [599, 98], [486, 102]], [[68, 25], [135, 16], [139, 43], [72, 52]]] +00173364.jpg [[[8, 178], [57, 178], [57, 200], [8, 200]], [[137, 120], [194, 120], [194, 133], [137, 133]], [[39, 76], [86, 76], [86, 105], [39, 105]], [[250, 20], [311, 20], [311, 36], [250, 36]], [[21, 16], [104, 16], [104, 39], [21, 39]]] +00175503.jpg [[[43, 260], [500, 255], [501, 358], [44, 363]], [[52, 200], [349, 178], [354, 251], [58, 273]]] +00193218.jpg [[[283, 375], [410, 375], [410, 388], [283, 388]], [[172, 375], [221, 375], [221, 389], [172, 389]], [[110, 375], [161, 375], [161, 389], [110, 389]], [[276, 358], [357, 358], [357, 371], [276, 371]], [[171, 359], [220, 359], [220, 370], [171, 370]], [[409, 357], [492, 357], [492, 370], [409, 370]], [[26, 187], [62, 187], [62, 202], [26, 202]], [[501, 185], [557, 185], [557, 199], [501, 199]], [[381, 187], [420, 185], [421, 199], [382, 201]], [[284, 186], [310, 186], [310, 201], [284, 
201]], [[174, 186], [196, 186], [196, 201], [174, 201]], [[499, 165], [540, 165], [540, 176], [499, 176]], [[381, 164], [409, 164], [409, 177], [381, 177]], [[262, 163], [302, 163], [302, 177], [262, 177]], [[176, 163], [230, 163], [230, 177], [176, 177]], [[26, 163], [79, 163], [79, 177], [26, 177]], [[387, 140], [488, 140], [488, 153], [387, 153]], [[28, 139], [131, 139], [131, 152], [28, 152]], [[443, 117], [537, 119], [537, 133], [443, 132]], [[346, 119], [405, 119], [405, 130], [346, 130]], [[261, 119], [302, 119], [302, 130], [261, 130]], [[30, 113], [228, 116], [228, 131], [30, 129]], [[131, 91], [394, 94], [394, 109], [131, 105]], [[562, 82], [583, 82], [583, 107], [562, 107]]] +00195033.jpg [[[488, 263], [533, 265], [532, 280], [487, 278]], [[126, 250], [192, 250], [192, 283], [126, 283]], [[338, 249], [362, 249], [362, 266], [338, 266]], [[319, 222], [380, 225], [380, 238], [319, 236]], [[431, 224], [450, 224], [450, 235], [431, 235]], [[365, 203], [538, 203], [538, 216], [365, 216]], [[89, 200], [146, 203], [146, 217], [89, 214]], [[329, 201], [354, 201], [354, 212], [329, 212]], [[371, 181], [449, 181], [449, 194], [371, 194]], [[329, 181], [352, 181], [352, 192], [329, 192]], [[96, 179], [240, 179], [240, 193], [96, 193]], [[456, 161], [555, 161], [555, 174], [456, 174]], [[129, 150], [287, 151], [287, 165], [129, 164]], [[36, 145], [73, 149], [72, 163], [35, 159]], [[527, 146], [552, 146], [552, 155], [527, 155]], [[102, 145], [120, 145], [120, 153], [102, 153]], [[371, 129], [503, 128], [503, 139], [371, 140]], [[99, 126], [193, 126], [193, 139], [99, 139]], [[322, 127], [337, 127], [337, 135], [322, 135]], [[37, 123], [77, 123], [77, 134], [37, 134]], [[324, 106], [337, 106], [337, 115], [324, 115]], [[309, 107], [315, 107], [315, 112], [309, 112]], [[372, 103], [501, 103], [501, 116], [372, 116]], [[349, 105], [360, 105], [360, 114], [349, 114]], [[38, 103], [80, 103], [80, 113], [38, 113]], [[99, 100], [205, 101], [205, 115], [99, 114]], [[306, 90], [317, 90], [317, 97], [306, 97]], [[347, 88], [362, 88], [362, 96], [347, 96]], [[321, 87], [340, 87], [340, 99], [321, 99]], [[358, 84], [513, 82], [513, 95], [358, 97]], [[41, 83], [89, 83], [89, 93], [41, 93]], [[94, 79], [241, 80], [241, 94], [94, 93]], [[313, 66], [394, 66], [394, 79], [313, 79]], [[242, 66], [288, 66], [288, 77], [242, 77]], [[185, 54], [220, 54], [220, 65], [185, 65]], [[469, 48], [547, 48], [547, 61], [469, 61]], [[423, 36], [436, 36], [436, 54], [423, 54]], [[465, 30], [551, 30], [551, 43], [465, 43]], [[207, 21], [329, 23], [328, 41], [207, 39]]] +00208502.jpg [[[247, 566], [282, 566], [282, 573], [247, 573]], [[558, 534], [629, 539], [627, 570], [556, 565]], [[205, 540], [284, 540], [284, 552], [205, 552]], [[143, 513], [189, 513], [189, 525], [143, 525]], [[249, 512], [307, 512], [307, 524], [249, 524]], [[44, 500], [118, 500], [118, 519], [44, 519]], [[467, 491], [556, 491], [556, 508], [467, 508]], [[667, 490], [678, 494], [675, 503], [664, 499]], [[788, 489], [794, 495], [789, 499], [783, 494]], [[726, 491], [737, 491], [737, 501], [726, 501]], [[42, 452], [117, 450], [117, 469], [42, 470]], [[175, 450], [236, 450], [236, 464], [175, 464]], [[614, 407], [638, 407], [638, 422], [614, 422]], [[95, 405], [119, 405], [119, 422], [95, 422]], [[49, 399], [63, 413], [49, 427], [35, 414]], [[209, 401], [226, 401], [226, 415], [209, 415]], [[40, 357], [58, 357], [58, 374], [40, 374]], [[94, 356], [119, 356], [119, 373], [94, 373]], [[188, 341], [246, 339], [247, 361], [189, 364]], [[459, 321], [549, 
319], [549, 337], [460, 339]], [[459, 273], [551, 273], [551, 290], [459, 290]], [[563, 272], [735, 269], [735, 286], [564, 289]], [[517, 225], [547, 225], [547, 245], [517, 245]], [[459, 226], [480, 226], [480, 244], [459, 244]], [[621, 187], [673, 187], [673, 201], [621, 201]], [[457, 132], [548, 130], [548, 147], [458, 149]], [[572, 106], [787, 99], [787, 120], [573, 126]], [[122, 48], [290, 48], [290, 97], [122, 97]], [[539, 39], [708, 39], [708, 89], [539, 89]]] +00224225.jpg [[[134, 429], [153, 426], [157, 445], [138, 448]], [[202, 404], [478, 411], [476, 459], [201, 452]], [[205, 230], [469, 230], [469, 390], [205, 390]], [[131, 265], [172, 265], [172, 279], [131, 279]], [[344, 207], [455, 207], [455, 231], [344, 231]], [[199, 189], [346, 196], [344, 239], [197, 232]], [[10, 44], [157, 41], [158, 112], [11, 115]]] +00227746.jpg [[[190, 232], [258, 232], [258, 238], [190, 238]], [[160, 232], [183, 232], [183, 238], [160, 238]], [[123, 232], [150, 232], [150, 238], [123, 238]], [[290, 209], [345, 207], [346, 221], [291, 223]], [[172, 181], [249, 181], [249, 194], [172, 194]], [[143, 178], [165, 180], [162, 208], [140, 206]], [[142, 164], [157, 162], [160, 177], [145, 180]], [[173, 157], [203, 157], [203, 164], [173, 164]], [[200, 154], [347, 154], [347, 167], [200, 167]], [[144, 111], [277, 114], [277, 134], [144, 131]], [[201, 52], [387, 53], [386, 69], [201, 68]], [[139, 47], [191, 45], [192, 62], [140, 64]], [[40, 26], [61, 26], [61, 42], [40, 42]]] +00229605.jpg [[[743, 529], [881, 529], [881, 544], [743, 544]], [[236, 499], [589, 498], [589, 522], [236, 523]], [[6, 498], [227, 498], [227, 522], [6, 522]], [[736, 496], [883, 499], [883, 520], [735, 517]], [[606, 495], [716, 489], [718, 515], [608, 521]], [[4, 245], [863, 230], [864, 288], [5, 303]], [[478, 28], [883, 28], [883, 76], [478, 76]]] +00233011.jpg [[[63, 227], [291, 227], [291, 242], [63, 242]], [[12, 219], [41, 219], [41, 250], [12, 250]], [[61, 177], [119, 177], [119, 195], [61, 195]], [[11, 173], [40, 169], [44, 200], [14, 203]], [[61, 129], [147, 131], [147, 147], [61, 144]], [[12, 124], [43, 124], [43, 154], [12, 154]], [[125, 89], [238, 89], [238, 103], [125, 103]], [[148, 51], [216, 51], [216, 65], [148, 65]], [[258, 46], [353, 50], [352, 69], [257, 65]], [[9, 49], [52, 49], [52, 68], [9, 68]], [[277, 12], [345, 12], [345, 31], [277, 31]], [[28, 11], [73, 11], [73, 31], [28, 31]]] +00233625.jpg [[[375, 397], [632, 399], [632, 443], [375, 440]], [[71, 214], [932, 207], [933, 321], [71, 328]]] +00233634.jpg [[[215, 639], [261, 639], [261, 703], [215, 703]], [[523, 635], [570, 635], [570, 695], [523, 695]], [[643, 523], [682, 523], [682, 568], [643, 568]], [[97, 516], [152, 516], [152, 589], [97, 589]], [[755, 395], [760, 395], [760, 401], [755, 401]], [[26, 395], [32, 395], [32, 400], [26, 400]], [[678, 364], [728, 362], [731, 430], [681, 432]], [[54, 361], [107, 361], [107, 434], [54, 434]], [[78, 208], [155, 208], [155, 280], [78, 280]], [[643, 205], [693, 205], [693, 272], [643, 272]], [[210, 88], [260, 86], [263, 164], [213, 166]], [[363, 48], [426, 45], [430, 115], [367, 118]]] +00234400.jpg [[[446, 421], [738, 421], [738, 438], [446, 438]], [[157, 421], [454, 421], [454, 438], [157, 438]], [[158, 394], [652, 394], [652, 411], [158, 411]], [[40, 391], [127, 391], [127, 412], [40, 412]], [[158, 342], [304, 345], [304, 363], [158, 360]], [[38, 344], [123, 344], [123, 362], [38, 362]], [[520, 295], [703, 295], [703, 314], [520, 314]], [[394, 292], [483, 290], [484, 314], [394, 317]], [[157, 293], [270, 293], 
[270, 313], [157, 313]], [[37, 293], [125, 293], [125, 313], [37, 313]], [[156, 243], [358, 243], [358, 267], [156, 267]], [[36, 243], [82, 243], [82, 269], [36, 269]], [[28, 151], [157, 151], [157, 174], [28, 174]], [[282, 98], [507, 98], [507, 111], [282, 111]], [[315, 46], [475, 50], [474, 88], [314, 85]], [[518, 51], [663, 53], [662, 67], [517, 65]], [[487, 19], [706, 17], [706, 43], [487, 45]]] +00234883.jpg [[[66, 125], [316, 120], [317, 190], [67, 195]], [[79, 138], [109, 140], [108, 151], [78, 148]], [[72, 120], [120, 120], [120, 130], [72, 130]], [[383, 63], [504, 62], [504, 74], [383, 75]], [[58, 29], [365, 26], [366, 112], [59, 115]], [[387, 28], [501, 26], [501, 45], [387, 47]]] +test_add_0.jpg [[[311, 521], [391, 521], [391, 534], [311, 534]], [[277, 500], [424, 500], [424, 514], [277, 514]], [[261, 446], [437, 446], [437, 459], [261, 459]], [[212, 428], [485, 428], [485, 441], [212, 441]], [[247, 388], [457, 388], [457, 409], [247, 409]], [[222, 328], [474, 328], [474, 372], [222, 372]], [[208, 207], [492, 211], [490, 277], [207, 272]], [[266, 164], [422, 166], [421, 197], [265, 195]], [[18, 20], [201, 18], [201, 43], [18, 45]]] +test_add_1.png [] +test_add_10.png [[[157, 124], [186, 124], [186, 172], [157, 172]], [[65, 117], [95, 117], [95, 168], [65, 168]], [[161, 106], [183, 106], [183, 127], [161, 127]], [[69, 100], [94, 100], [94, 128], [69, 128]], [[117, 46], [154, 45], [157, 174], [121, 175]], [[66, 34], [97, 34], [97, 112], [66, 112]]] +test_add_11.jpg [[[1525, 773], [1564, 756], [1575, 780], [1536, 798]], [[1390, 757], [1483, 757], [1483, 791], [1390, 791]], [[1013, 754], [1207, 754], [1207, 800], [1013, 800]], [[685, 755], [875, 755], [875, 796], [685, 796]], [[356, 753], [566, 747], [567, 793], [358, 798]], [[78, 751], [264, 745], [265, 793], [79, 798]], [[602, 647], [1152, 647], [1152, 703], [602, 703]], [[601, 564], [1148, 555], [1149, 611], [602, 620]], [[598, 480], [1066, 472], [1067, 526], [599, 535]], [[598, 393], [1090, 388], [1090, 439], [599, 444]], [[603, 306], [1057, 306], [1057, 357], [603, 357]], [[357, 184], [1515, 184], [1515, 261], [357, 261]], [[60, 43], [257, 37], [259, 83], [61, 89]], [[1305, 41], [1492, 41], [1492, 87], [1305, 87]], [[973, 40], [1171, 34], [1172, 80], [974, 86]], [[670, 40], [862, 34], [864, 80], [671, 86]], [[363, 34], [558, 34], [558, 85], [363, 85]]] +test_add_12.jpg [[[11, 592], [136, 594], [135, 613], [11, 611]], [[109, 521], [907, 526], [907, 569], [109, 565]], [[635, 451], [902, 448], [903, 478], [635, 481]], [[112, 447], [466, 449], [466, 486], [112, 483]], [[582, 306], [680, 304], [681, 348], [583, 351]], [[369, 261], [565, 266], [563, 357], [367, 353]], [[64, 85], [853, 88], [853, 161], [64, 159]]] +test_add_13.jpg [[[68, 94], [117, 97], [116, 114], [67, 111]]] +test_add_14.jpg [[[30, 97], [235, 95], [236, 127], [31, 129]], [[30, 52], [239, 50], [239, 86], [30, 87]]] +test_add_15.jpg [[[141, 253], [353, 253], [353, 266], [141, 266]], [[205, 214], [406, 219], [406, 232], [204, 227]], [[106, 212], [193, 213], [193, 227], [106, 226]], [[154, 156], [286, 161], [286, 174], [154, 170]], [[148, 136], [305, 142], [305, 156], [147, 150]], [[108, 137], [145, 137], [145, 148], [108, 148]], [[108, 102], [275, 109], [275, 125], [107, 117]], [[107, 72], [245, 79], [245, 96], [106, 88]], [[107, 39], [209, 42], [209, 62], [106, 59]]] +test_add_16.jpg [[[398, 842], [408, 842], [408, 852], [398, 852]], [[382, 741], [746, 741], [746, 774], [382, 774]], [[362, 703], [468, 703], [468, 725], [362, 725]], [[1552, 701], [1576, 701], [1576, 
746], [1552, 746]], [[1256, 695], [1442, 695], [1442, 721], [1256, 721]], [[1244, 661], [1448, 661], [1448, 687], [1244, 687]], [[386, 645], [668, 645], [668, 679], [386, 679]], [[1226, 625], [1470, 623], [1470, 651], [1226, 653]], [[360, 604], [580, 604], [580, 629], [360, 629]], [[1202, 592], [1494, 592], [1494, 617], [1202, 617]], [[1166, 556], [1530, 556], [1530, 582], [1166, 582]], [[380, 552], [638, 552], [638, 586], [380, 586]], [[356, 502], [516, 502], [516, 536], [356, 536]], [[774, 260], [1124, 260], [1124, 300], [774, 300]], [[374, 210], [504, 210], [504, 300], [374, 300]], [[776, 212], [1088, 217], [1088, 252], [776, 248]]] +test_add_17.jpg [[[321, 255], [393, 258], [392, 271], [320, 269]], [[307, 222], [411, 228], [411, 241], [306, 236]], [[96, 137], [385, 143], [384, 206], [94, 201]], [[72, 95], [399, 103], [398, 124], [71, 117]], [[68, 76], [224, 79], [223, 93], [67, 90]], [[66, 59], [226, 62], [225, 76], [65, 74]]] +test_add_18.jpg [[[466, 788], [715, 790], [715, 813], [466, 811]], [[553, 752], [665, 757], [663, 791], [552, 786]], [[119, 539], [189, 539], [189, 570], [119, 570]], [[116, 473], [674, 486], [673, 528], [115, 516]], [[121, 429], [669, 441], [668, 470], [121, 457]], [[122, 376], [673, 383], [673, 409], [122, 402]], [[556, 262], [675, 264], [675, 278], [556, 277]], [[165, 259], [335, 259], [335, 273], [165, 273]], [[344, 195], [456, 197], [455, 220], [343, 217]], [[309, 175], [490, 175], [490, 190], [309, 190]], [[255, 128], [537, 131], [537, 169], [254, 165]], [[347, 92], [486, 94], [486, 109], [346, 107]], [[285, 41], [567, 49], [566, 82], [284, 74]], [[236, 32], [266, 32], [266, 60], [236, 60]]] +test_add_19.jpg [[[24, 294], [42, 294], [42, 302], [24, 302]], [[64, 293], [105, 293], [105, 303], [64, 303]], [[145, 287], [163, 287], [163, 304], [145, 304]], [[63, 280], [106, 280], [106, 290], [63, 290]], [[9, 281], [26, 281], [26, 288], [9, 288]], [[220, 279], [245, 279], [245, 291], [220, 291]], [[177, 279], [208, 279], [208, 290], [177, 290]], [[23, 279], [51, 279], [51, 290], [23, 290]], [[145, 278], [162, 278], [162, 292], [145, 292]], [[8, 267], [18, 267], [18, 276], [8, 276]], [[221, 265], [243, 265], [243, 277], [221, 277]], [[24, 265], [47, 265], [47, 277], [24, 277]], [[142, 263], [163, 263], [163, 279], [142, 279]], [[218, 252], [249, 252], [249, 265], [218, 265]], [[65, 253], [131, 253], [131, 263], [65, 263]], [[24, 252], [43, 252], [43, 264], [24, 264]], [[8, 253], [18, 253], [18, 262], [8, 262]], [[8, 240], [17, 240], [17, 249], [8, 249]], [[63, 237], [114, 237], [114, 251], [63, 251]], [[25, 236], [47, 239], [45, 251], [23, 249]], [[144, 234], [166, 237], [163, 253], [142, 249]], [[494, 226], [531, 226], [531, 239], [494, 239]], [[335, 226], [354, 226], [354, 237], [335, 237]], [[288, 226], [314, 226], [314, 237], [288, 237]], [[63, 226], [113, 226], [113, 236], [63, 236]], [[7, 227], [17, 227], [17, 234], [7, 234]], [[221, 225], [248, 225], [248, 235], [221, 235]], [[143, 225], [165, 222], [167, 234], [145, 237]], [[24, 224], [48, 224], [48, 238], [24, 238]], [[495, 213], [524, 213], [524, 224], [495, 224]], [[420, 212], [437, 212], [437, 225], [420, 225]], [[336, 212], [398, 212], [398, 223], [336, 223]], [[292, 212], [320, 212], [320, 223], [292, 223]], [[222, 212], [249, 212], [249, 223], [222, 223]], [[145, 212], [166, 212], [166, 223], [145, 223]], [[61, 211], [113, 209], [114, 222], [62, 224]], [[26, 211], [48, 211], [48, 223], [26, 223]], [[337, 199], [383, 199], [383, 209], [337, 209]], [[65, 200], [87, 200], [87, 207], [65, 207]], [[493, 
197], [541, 197], [541, 211], [493, 211]], [[445, 202], [455, 196], [462, 206], [452, 212]], [[178, 198], [205, 198], [205, 208], [178, 208]], [[146, 199], [157, 199], [157, 208], [146, 208]], [[32, 194], [43, 204], [33, 214], [22, 203]], [[422, 193], [440, 201], [432, 215], [415, 207]], [[64, 186], [131, 186], [131, 196], [64, 196]], [[337, 185], [399, 185], [399, 196], [337, 196]], [[445, 190], [456, 182], [465, 191], [454, 200]], [[292, 188], [308, 182], [313, 193], [297, 200]], [[220, 183], [255, 183], [255, 197], [220, 197]], [[142, 184], [158, 184], [158, 197], [142, 197]], [[493, 182], [518, 182], [518, 197], [493, 197]], [[425, 180], [437, 191], [427, 202], [414, 190]], [[32, 179], [42, 189], [32, 199], [22, 189]], [[182, 179], [196, 185], [188, 199], [175, 193]], [[335, 172], [400, 169], [400, 183], [336, 185]], [[492, 170], [519, 170], [519, 185], [492, 185]], [[412, 177], [428, 164], [440, 178], [425, 190]], [[293, 171], [315, 171], [315, 185], [293, 185]], [[220, 170], [251, 170], [251, 184], [220, 184]], [[178, 172], [188, 172], [188, 183], [178, 183]], [[64, 172], [125, 170], [125, 181], [64, 182]], [[454, 168], [464, 176], [454, 185], [445, 176]], [[145, 168], [163, 172], [159, 185], [142, 180]], [[30, 165], [43, 174], [34, 186], [20, 177]], [[493, 160], [523, 160], [523, 170], [493, 170]], [[402, 161], [435, 161], [435, 168], [402, 168]], [[335, 159], [401, 159], [401, 169], [335, 169]], [[296, 159], [325, 159], [325, 170], [296, 170]], [[221, 158], [251, 158], [251, 169], [221, 169]], [[174, 161], [183, 156], [190, 167], [181, 172]], [[145, 158], [162, 158], [162, 170], [145, 170]], [[61, 158], [125, 157], [125, 168], [62, 169]], [[20, 161], [33, 154], [40, 167], [28, 174]], [[492, 143], [542, 143], [542, 157], [492, 157]], [[450, 144], [479, 144], [479, 157], [450, 157]], [[335, 143], [439, 143], [439, 156], [335, 156]], [[294, 143], [327, 143], [327, 157], [294, 157]], [[220, 143], [253, 143], [253, 157], [220, 157]], [[178, 145], [187, 145], [187, 156], [178, 156]], [[63, 144], [104, 144], [104, 155], [63, 155]], [[144, 140], [164, 145], [160, 159], [141, 154]], [[31, 137], [44, 149], [31, 162], [17, 149]], [[286, 135], [291, 135], [291, 140], [286, 140]], [[177, 133], [193, 133], [193, 144], [177, 144]], [[336, 132], [388, 132], [388, 141], [336, 141]], [[492, 131], [525, 131], [525, 141], [492, 141]], [[450, 131], [477, 131], [477, 141], [450, 141]], [[292, 131], [321, 131], [321, 141], [292, 141]], [[218, 132], [255, 130], [256, 141], [219, 144]], [[63, 131], [95, 131], [95, 141], [63, 141]], [[417, 130], [437, 130], [437, 141], [417, 141]], [[145, 130], [159, 130], [159, 143], [145, 143]], [[30, 124], [43, 133], [32, 147], [19, 138]], [[493, 118], [535, 118], [535, 129], [493, 129]], [[335, 118], [387, 118], [387, 129], [335, 129]], [[218, 118], [255, 118], [255, 128], [218, 128]], [[451, 117], [478, 117], [478, 129], [451, 129]], [[418, 117], [438, 117], [438, 130], [418, 130]], [[177, 116], [209, 116], [209, 130], [177, 130]], [[145, 117], [162, 117], [162, 130], [145, 130]], [[62, 116], [88, 116], [88, 131], [62, 131]], [[19, 121], [33, 111], [43, 124], [29, 134]], [[491, 107], [523, 107], [523, 113], [491, 113]], [[449, 107], [477, 107], [477, 113], [449, 113]], [[420, 107], [436, 107], [436, 113], [420, 113]], [[295, 107], [319, 107], [319, 114], [295, 114]], [[220, 107], [242, 107], [242, 113], [220, 113]], [[176, 107], [203, 107], [203, 113], [176, 113]], [[145, 107], [161, 107], [161, 114], [145, 114]], [[334, 105], [372, 105], [372, 114], [334, 114]], [[63, 
106], [86, 106], [86, 113], [63, 113]], [[483, 89], [522, 89], [522, 99], [483, 99]], [[331, 88], [380, 88], [380, 99], [331, 99]], [[276, 88], [325, 88], [325, 99], [276, 99]], [[214, 88], [246, 88], [246, 99], [214, 99]], [[411, 86], [474, 86], [474, 100], [411, 100]], [[6, 86], [102, 86], [102, 100], [6, 100]], [[415, 66], [461, 66], [461, 77], [415, 77]], [[288, 66], [333, 66], [333, 77], [288, 77]], [[157, 64], [206, 64], [206, 78], [157, 78]], [[33, 61], [43, 66], [37, 77], [27, 72]], [[416, 48], [523, 49], [523, 63], [415, 62]], [[288, 49], [375, 49], [375, 63], [288, 63]], [[159, 49], [269, 49], [269, 62], [159, 62]], [[24, 53], [36, 46], [45, 59], [33, 67]], [[416, 36], [481, 36], [481, 46], [416, 46]], [[25, 38], [39, 32], [46, 45], [32, 52]], [[157, 34], [205, 34], [205, 47], [157, 47]], [[412, 4], [527, 4], [527, 17], [412, 17]], [[146, 4], [345, 2], [345, 15], [146, 17]]] +test_add_20.jpg [[[31, 346], [605, 346], [605, 370], [31, 370]], [[217, 294], [510, 294], [510, 322], [217, 322]], [[473, 271], [525, 271], [525, 286], [473, 286]], [[220, 267], [287, 267], [287, 286], [220, 286]], [[219, 239], [484, 239], [484, 263], [219, 263]], [[221, 217], [303, 217], [303, 234], [221, 234]], [[402, 192], [417, 192], [417, 205], [402, 205]], [[222, 187], [341, 187], [341, 207], [222, 207]], [[221, 162], [287, 162], [287, 180], [221, 180]], [[375, 122], [475, 124], [475, 146], [375, 143]], [[222, 124], [356, 122], [356, 143], [222, 146]], [[218, 81], [352, 84], [352, 116], [218, 113]], [[440, 34], [605, 34], [605, 59], [440, 59]], [[72, 16], [398, 16], [398, 44], [72, 44]]] +test_add_3.jpg [[[169, 327], [337, 326], [337, 341], [169, 342]], [[170, 288], [307, 290], [307, 312], [170, 310]], [[171, 221], [323, 221], [323, 234], [171, 234]], [[340, 221], [449, 217], [449, 231], [341, 234]], [[169, 201], [372, 201], [372, 214], [169, 214]], [[170, 183], [418, 183], [418, 196], [170, 196]], [[170, 149], [416, 149], [416, 163], [170, 163]], [[171, 119], [418, 119], [418, 140], [171, 140]], [[326, 64], [478, 64], [478, 91], [326, 91]], [[173, 64], [306, 60], [306, 89], [174, 93]]] +test_add_4.png [] +test_add_5.png [[[48, 164], [108, 164], [108, 174], [48, 174]], [[52, 121], [169, 121], [169, 134], [52, 134]], [[50, 102], [165, 102], [165, 118], [50, 118]], [[52, 83], [164, 83], [164, 100], [52, 100]], [[51, 68], [166, 68], [166, 84], [51, 84]], [[51, 50], [145, 47], [145, 64], [52, 67]]] +test_add_6.jpg [[[123, 223], [219, 227], [218, 251], [122, 247]], [[172, 172], [186, 186], [172, 200], [158, 186]]] +test_add_7.jpg [[[48, 938], [174, 936], [174, 962], [48, 964]], [[227, 873], [629, 876], [628, 953], [226, 949]], [[56, 744], [637, 744], [637, 789], [56, 789]], [[150, 674], [545, 678], [544, 721], [150, 718]], [[73, 504], [633, 504], [633, 601], [73, 601]], [[59, 270], [655, 279], [652, 441], [56, 432]], [[513, 193], [553, 193], [553, 223], [513, 223]], [[61, 175], [532, 175], [532, 239], [61, 239]], [[533, 178], [642, 178], [642, 236], [533, 236]]] +test_add_8.jpg [[[251, 586], [454, 580], [454, 606], [252, 613]], [[107, 533], [457, 527], [457, 560], [108, 566]], [[336, 494], [384, 494], [384, 507], [336, 507]], [[27, 307], [355, 297], [356, 320], [28, 330]], [[22, 259], [445, 251], [445, 274], [23, 282]], [[78, 209], [445, 205], [445, 225], [78, 229]], [[160, 23], [319, 30], [317, 79], [158, 72]]] +test_add_9.png [[[266, 687], [486, 687], [486, 696], [266, 696]], [[196, 668], [554, 668], [554, 681], [196, 681]], [[153, 596], [597, 596], [597, 606], [153, 606]], [[215, 578], [541, 578], [541, 
588], [215, 588]], [[85, 542], [664, 542], [664, 552], [85, 552]], [[96, 522], [653, 522], [653, 535], [96, 535]], [[362, 449], [389, 449], [389, 460], [362, 460]], [[238, 376], [513, 376], [513, 389], [238, 389]], [[177, 356], [574, 356], [574, 368], [177, 368]], [[344, 281], [408, 283], [407, 297], [343, 294]], [[256, 205], [492, 205], [492, 219], [256, 219]]] diff --git a/PTDN/test_inference_cpp.sh b/PTDN/test_inference_cpp.sh new file mode 100644 index 0000000000000000000000000000000000000000..124bdacb7dad04bdea07a62ba9c86b248be5a06d --- /dev/null +++ b/PTDN/test_inference_cpp.sh @@ -0,0 +1,208 @@ +#!/bin/bash +source tests/common_func.sh + +FILENAME=$1 +dataline=$(awk 'NR==52, NR==66{print}' $FILENAME) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# parser cpp inference model +use_opencv=$(func_parser_value "${lines[1]}") +cpp_infer_model_dir_list=$(func_parser_value "${lines[2]}") +cpp_infer_is_quant=$(func_parser_value "${lines[3]}") +# parser cpp inference +inference_cmd=$(func_parser_value "${lines[4]}") +cpp_use_gpu_key=$(func_parser_key "${lines[5]}") +cpp_use_gpu_list=$(func_parser_value "${lines[5]}") +cpp_use_mkldnn_key=$(func_parser_key "${lines[6]}") +cpp_use_mkldnn_list=$(func_parser_value "${lines[6]}") +cpp_cpu_threads_key=$(func_parser_key "${lines[7]}") +cpp_cpu_threads_list=$(func_parser_value "${lines[7]}") +cpp_batch_size_key=$(func_parser_key "${lines[8]}") +cpp_batch_size_list=$(func_parser_value "${lines[8]}") +cpp_use_trt_key=$(func_parser_key "${lines[9]}") +cpp_use_trt_list=$(func_parser_value "${lines[9]}") +cpp_precision_key=$(func_parser_key "${lines[10]}") +cpp_precision_list=$(func_parser_value "${lines[10]}") +cpp_infer_model_key=$(func_parser_key "${lines[11]}") +cpp_image_dir_key=$(func_parser_key "${lines[12]}") +cpp_infer_img_dir=$(func_parser_value "${lines[12]}") +cpp_infer_key1=$(func_parser_key "${lines[13]}") +cpp_infer_value1=$(func_parser_value "${lines[13]}") +cpp_benchmark_key=$(func_parser_key "${lines[14]}") +cpp_benchmark_value=$(func_parser_value "${lines[14]}") + + +LOG_PATH="./tests/output" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_cpp.log" + + +function func_cpp_inference(){ + IFS='|' + _script=$1 + _model_dir=$2 + _log_path=$3 + _img_dir=$4 + _flag_quant=$5 + # inference + for use_gpu in ${cpp_use_gpu_list[*]}; do + if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then + for use_mkldnn in ${cpp_use_mkldnn_list[*]}; do + if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then + continue + fi + for threads in ${cpp_cpu_threads_list[*]}; do + for batch_size in ${cpp_batch_size_list[*]}; do + precision="fp32" + if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then + precison="int8" + fi + _save_log_path="${_log_path}/cpp_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${cpp_image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${cpp_benchmark_key}" "${cpp_benchmark_value}") + set_batchsize=$(func_set_params "${cpp_batch_size_key}" "${batch_size}") + set_cpu_threads=$(func_set_params "${cpp_cpu_threads_key}" "${threads}") + set_model_dir=$(func_set_params "${cpp_infer_model_key}" "${_model_dir}") + set_infer_params1=$(func_set_params "${cpp_infer_key1}" "${cpp_infer_value1}") + command="${_script} ${cpp_use_gpu_key}=${use_gpu} ${cpp_use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} > 
${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" + done + done + done + elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then + for use_trt in ${cpp_use_trt_list[*]}; do + for precision in ${cpp_precision_list[*]}; do + if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then + continue + fi + if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then + continue + fi + if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then + continue + fi + for batch_size in ${cpp_batch_size_list[*]}; do + _save_log_path="${_log_path}/cpp_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${cpp_image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${cpp_benchmark_key}" "${cpp_benchmark_value}") + set_batchsize=$(func_set_params "${cpp_batch_size_key}" "${batch_size}") + set_tensorrt=$(func_set_params "${cpp_use_trt_key}" "${use_trt}") + set_precision=$(func_set_params "${cpp_precision_key}" "${precision}") + set_model_dir=$(func_set_params "${cpp_infer_model_key}" "${_model_dir}") + set_infer_params1=$(func_set_params "${cpp_infer_key1}" "${cpp_infer_value1}") + command="${_script} ${cpp_use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" + + done + done + done + else + echo "Does not support hardware other than CPU and GPU Currently!" + fi + done +} + + +cd deploy/cpp_infer +if [ ${use_opencv} = "True" ]; then + if [ -d "opencv-3.4.7/opencv3/" ] && [ $(md5sum opencv-3.4.7.tar.gz | awk -F ' ' '{print $1}') = "faa2b5950f8bee3f03118e600c74746a" ];then + echo "################### build opencv skipped ###################" + else + echo "################### build opencv ###################" + rm -rf opencv-3.4.7.tar.gz opencv-3.4.7/ + wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/opencv-3.4.7.tar.gz + tar -xf opencv-3.4.7.tar.gz + + cd opencv-3.4.7/ + install_path=$(pwd)/opencv3 + + rm -rf build + mkdir build + cd build + + cmake .. \ + -DCMAKE_INSTALL_PREFIX=${install_path} \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=OFF \ + -DWITH_IPP=OFF \ + -DBUILD_IPP_IW=OFF \ + -DWITH_LAPACK=OFF \ + -DWITH_EIGEN=OFF \ + -DCMAKE_INSTALL_LIBDIR=lib64 \ + -DWITH_ZLIB=ON \ + -DBUILD_ZLIB=ON \ + -DWITH_JPEG=ON \ + -DBUILD_JPEG=ON \ + -DWITH_PNG=ON \ + -DBUILD_PNG=ON \ + -DWITH_TIFF=ON \ + -DBUILD_TIFF=ON + + make -j + make install + cd ../ + echo "################### build opencv finished ###################" + fi +fi + + +echo "################### build PaddleOCR demo ####################" +if [ ${use_opencv} = "True" ]; then + OPENCV_DIR=$(pwd)/opencv-3.4.7/opencv3/ +else + OPENCV_DIR='' +fi +LIB_DIR=$(pwd)/Paddle/build/paddle_inference_install_dir/ +CUDA_LIB_DIR=$(dirname `find /usr -name libcudart.so`) +CUDNN_LIB_DIR=$(dirname `find /usr -name libcudnn.so`) + +BUILD_DIR=build +rm -rf ${BUILD_DIR} +mkdir ${BUILD_DIR} +cd ${BUILD_DIR} +cmake .. 
\ + -DPADDLE_LIB=${LIB_DIR} \ + -DWITH_MKL=ON \ + -DWITH_GPU=OFF \ + -DWITH_STATIC_LIB=OFF \ + -DWITH_TENSORRT=OFF \ + -DOPENCV_DIR=${OPENCV_DIR} \ + -DCUDNN_LIB=${CUDNN_LIB_DIR} \ + -DCUDA_LIB=${CUDA_LIB_DIR} \ + -DTENSORRT_DIR=${TENSORRT_DIR} \ + +make -j +cd ../../../ +echo "################### build PaddleOCR demo finished ###################" + + +# set cuda device +GPUID=$2 +if [ ${#GPUID} -le 0 ];then + env=" " +else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" +fi +set CUDA_VISIBLE_DEVICES +eval $env + + +echo "################### run test ###################" +export Count=0 +IFS="|" +infer_quant_flag=(${cpp_infer_is_quant}) +for infer_model in ${cpp_infer_model_dir_list[*]}; do + #run inference + is_quant=${infer_quant_flag[Count]} + func_cpp_inference "${inference_cmd}" "${infer_model}" "${LOG_PATH}" "${cpp_infer_img_dir}" ${is_quant} + Count=$(($Count + 1)) +done diff --git a/PTDN/test_serving.sh b/PTDN/test_serving.sh new file mode 100644 index 0000000000000000000000000000000000000000..ec79a46c9bf4b51c16b1c0ddfff41b772b13b0ae --- /dev/null +++ b/PTDN/test_serving.sh @@ -0,0 +1,135 @@ +#!/bin/bash +source tests/common_func.sh + +FILENAME=$1 +dataline=$(awk 'NR==67, NR==83{print}' $FILENAME) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# parser serving +model_name=$(func_parser_value "${lines[1]}") +python=$(func_parser_value "${lines[2]}") +trans_model_py=$(func_parser_value "${lines[3]}") +infer_model_dir_key=$(func_parser_key "${lines[4]}") +infer_model_dir_value=$(func_parser_value "${lines[4]}") +model_filename_key=$(func_parser_key "${lines[5]}") +model_filename_value=$(func_parser_value "${lines[5]}") +params_filename_key=$(func_parser_key "${lines[6]}") +params_filename_value=$(func_parser_value "${lines[6]}") +serving_server_key=$(func_parser_key "${lines[7]}") +serving_server_value=$(func_parser_value "${lines[7]}") +serving_client_key=$(func_parser_key "${lines[8]}") +serving_client_value=$(func_parser_value "${lines[8]}") +serving_dir_value=$(func_parser_value "${lines[9]}") +web_service_py=$(func_parser_value "${lines[10]}") +web_use_gpu_key=$(func_parser_key "${lines[11]}") +web_use_gpu_list=$(func_parser_value "${lines[11]}") +web_use_mkldnn_key=$(func_parser_key "${lines[12]}") +web_use_mkldnn_list=$(func_parser_value "${lines[12]}") +web_cpu_threads_key=$(func_parser_key "${lines[13]}") +web_cpu_threads_list=$(func_parser_value "${lines[13]}") +web_use_trt_key=$(func_parser_key "${lines[14]}") +web_use_trt_list=$(func_parser_value "${lines[14]}") +web_precision_key=$(func_parser_key "${lines[15]}") +web_precision_list=$(func_parser_value "${lines[15]}") +pipeline_py=$(func_parser_value "${lines[16]}") + +LOG_PATH="../../tests/output" +mkdir -p ./tests/output +status_log="${LOG_PATH}/results_serving.log" + +function func_serving(){ + IFS='|' + _python=$1 + _script=$2 + _model_dir=$3 + # pdserving + set_dirname=$(func_set_params "${infer_model_dir_key}" "${infer_model_dir_value}") + set_model_filename=$(func_set_params "${model_filename_key}" "${model_filename_value}") + set_params_filename=$(func_set_params "${params_filename_key}" "${params_filename_value}") + set_serving_server=$(func_set_params "${serving_server_key}" "${serving_server_value}") + set_serving_client=$(func_set_params "${serving_client_key}" "${serving_client_value}") + trans_model_cmd="${python} ${trans_model_py} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_serving_server} ${set_serving_client}" + eval $trans_model_cmd + cd ${serving_dir_value} + echo $PWD + unset 
https_proxy + unset http_proxy + for use_gpu in ${web_use_gpu_list[*]}; do + echo ${ues_gpu} + if [ ${use_gpu} = "null" ]; then + for use_mkldnn in ${web_use_mkldnn_list[*]}; do + if [ ${use_mkldnn} = "False" ]; then + continue + fi + for threads in ${web_cpu_threads_list[*]}; do + _save_log_path="${LOG_PATH}/server_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_batchsize_1.log" + set_cpu_threads=$(func_set_params "${web_cpu_threads_key}" "${threads}") + web_service_cmd="${python} ${web_service_py} ${web_use_gpu_key}=${use_gpu} ${web_use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} &" + eval $web_service_cmd + sleep 2s + pipeline_cmd="${python} ${pipeline_py} > ${_save_log_path} 2>&1 " + eval $pipeline_cmd + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${pipeline_cmd}" "${status_log}" + PID=$! + kill $PID + sleep 2s + ps ux | grep -E 'web_service|pipeline' | awk '{print $2}' | xargs kill -s 9 + done + done + elif [ ${use_gpu} = "0" ]; then + for use_trt in ${web_use_trt_list[*]}; do + for precision in ${web_precision_list[*]}; do + if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then + continue + fi + if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then + continue + fi + if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [[ ${_flag_quant} = "True" ]]; then + continue + fi + _save_log_path="${LOG_PATH}/server_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_1.log" + set_tensorrt=$(func_set_params "${web_use_trt_key}" "${use_trt}") + set_precision=$(func_set_params "${web_precision_key}" "${precision}") + web_service_cmd="${python} ${web_service_py} ${web_use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} & " + eval $web_service_cmd + sleep 2s + pipeline_cmd="${python} ${pipeline_py} > ${_save_log_path} 2>&1" + eval $pipeline_cmd + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${pipeline_cmd}" "${status_log}" + PID=$! + kill $PID + sleep 2s + ps ux | grep -E 'web_service|pipeline' | awk '{print $2}' | xargs kill -s 9 + done + done + else + echo "Does not support hardware other than CPU and GPU Currently!" 
+ fi + done +} + + +# set cuda device +GPUID=$2 +if [ ${#GPUID} -le 0 ];then + env=" " +else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" +fi +set CUDA_VISIBLE_DEVICES +eval $env + + +echo "################### run test ###################" + +export Count=0 +IFS="|" +func_serving "${web_service_cmd}" diff --git a/tests/test.sh b/PTDN/test_train_inference_python.sh similarity index 76% rename from tests/test.sh rename to PTDN/test_train_inference_python.sh index 9888e0faabb13b00acdf41ad154ba0a0e7ec2b63..28cc037801bb4c1f1bcc10a74855b8c146197f4d 100644 --- a/tests/test.sh +++ b/PTDN/test_train_inference_python.sh @@ -1,74 +1,16 @@ #!/bin/bash +source tests/common_func.sh + FILENAME=$1 -# MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer', 'infer'] +# MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer', 'infer', 'klquant_infer'] MODE=$2 -dataline=$(cat ${FILENAME}) +dataline=$(awk 'NR==1, NR==51{print}' $FILENAME) # parser params IFS=$'\n' lines=(${dataline}) -function func_parser_key(){ - strs=$1 - IFS=":" - array=(${strs}) - tmp=${array[0]} - echo ${tmp} -} -function func_parser_value(){ - strs=$1 - IFS=":" - array=(${strs}) - tmp=${array[1]} - echo ${tmp} -} -function func_set_params(){ - key=$1 - value=$2 - if [ ${key} = "null" ];then - echo " " - elif [[ ${value} = "null" ]] || [[ ${value} = " " ]] || [ ${#value} -le 0 ];then - echo " " - else - echo "${key}=${value}" - fi -} -function func_parser_params(){ - strs=$1 - IFS=":" - array=(${strs}) - key=${array[0]} - tmp=${array[1]} - IFS="|" - res="" - for _params in ${tmp[*]}; do - IFS="=" - array=(${_params}) - mode=${array[0]} - value=${array[1]} - if [[ ${mode} = ${MODE} ]]; then - IFS="|" - #echo $(func_set_params "${mode}" "${value}") - echo $value - break - fi - IFS="|" - done - echo ${res} -} -function status_check(){ - last_status=$1 # the exit code - run_command=$2 - run_log=$3 - if [ $last_status -eq 0 ]; then - echo -e "\033[33m Run successfully with command - ${run_command}! \033[0m" | tee -a ${run_log} - else - echo -e "\033[33m Run failed with command - ${run_command}! 
\033[0m" | tee -a ${run_log} - fi -} - -IFS=$'\n' # The training params model_name=$(func_parser_value "${lines[1]}") python=$(func_parser_value "${lines[2]}") @@ -145,9 +87,41 @@ benchmark_value=$(func_parser_value "${lines[49]}") infer_key1=$(func_parser_key "${lines[50]}") infer_value1=$(func_parser_value "${lines[50]}") +# parser klquant_infer +if [ ${MODE} = "klquant_infer" ]; then + dataline=$(awk 'NR==82, NR==98{print}' $FILENAME) + lines=(${dataline}) + # parser inference model + infer_model_dir_list=$(func_parser_value "${lines[1]}") + infer_export_list=$(func_parser_value "${lines[2]}") + infer_is_quant=$(func_parser_value "${lines[3]}") + # parser inference + inference_py=$(func_parser_value "${lines[4]}") + use_gpu_key=$(func_parser_key "${lines[5]}") + use_gpu_list=$(func_parser_value "${lines[5]}") + use_mkldnn_key=$(func_parser_key "${lines[6]}") + use_mkldnn_list=$(func_parser_value "${lines[6]}") + cpu_threads_key=$(func_parser_key "${lines[7]}") + cpu_threads_list=$(func_parser_value "${lines[7]}") + batch_size_key=$(func_parser_key "${lines[8]}") + batch_size_list=$(func_parser_value "${lines[8]}") + use_trt_key=$(func_parser_key "${lines[9]}") + use_trt_list=$(func_parser_value "${lines[9]}") + precision_key=$(func_parser_key "${lines[10]}") + precision_list=$(func_parser_value "${lines[10]}") + infer_model_key=$(func_parser_key "${lines[11]}") + image_dir_key=$(func_parser_key "${lines[12]}") + infer_img_dir=$(func_parser_value "${lines[12]}") + save_log_key=$(func_parser_key "${lines[13]}") + benchmark_key=$(func_parser_key "${lines[14]}") + benchmark_value=$(func_parser_value "${lines[14]}") + infer_key1=$(func_parser_key "${lines[15]}") + infer_value1=$(func_parser_value "${lines[15]}") +fi + LOG_PATH="./tests/output" mkdir -p ${LOG_PATH} -status_log="${LOG_PATH}/results.log" +status_log="${LOG_PATH}/results_python.log" function func_inference(){ @@ -167,18 +141,28 @@ function func_inference(){ fi for threads in ${cpu_threads_list[*]}; do for batch_size in ${batch_size_list[*]}; do - _save_log_path="${_log_path}/infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_batchsize_${batch_size}.log" - set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") - set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") - set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") - set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}") - set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}") - set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}") - command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} > ${_save_log_path} 2>&1 " - eval $command - last_status=${PIPESTATUS[0]} - eval "cat ${_save_log_path}" - status_check $last_status "${command}" "${status_log}" + for precision in ${precision_list[*]}; do + if [ ${use_mkldnn} = "False" ] && [ ${precision} = "fp16" ]; then + continue + fi # skip when enable fp16 but disable mkldnn + if [ ${_flag_quant} = "True" ] && [ ${precision} != "int8" ]; then + continue + fi # skip when quant model inference but precision is not int8 + set_precision=$(func_set_params "${precision_key}" "${precision}") + + _save_log_path="${_log_path}/python_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") + 
set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}") + set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_precision} ${set_infer_params1} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" + done done done done @@ -195,7 +179,7 @@ function func_inference(){ continue fi for batch_size in ${batch_size_list[*]}; do - _save_log_path="${_log_path}/infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log" + _save_log_path="${_log_path}/python_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log" set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") @@ -218,7 +202,7 @@ function func_inference(){ done } -if [ ${MODE} = "infer" ]; then +if [ ${MODE} = "infer" ] || [ ${MODE} = "klquant_infer" ]; then GPUID=$3 if [ ${#GPUID} -le 0 ];then env=" " @@ -237,21 +221,23 @@ if [ ${MODE} = "infer" ]; then save_infer_dir=$(dirname $infer_model) set_export_weight=$(func_set_params "${export_weight}" "${infer_model}") set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_dir}") - export_cmd="${python} ${norm_export} ${set_export_weight} ${set_save_infer_key}" + export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key}" + echo ${infer_run_exports[Count]} + echo $export_cmd eval $export_cmd status_export=$? - if [ ${status_export} = 0 ];then - status_check $status_export "${export_cmd}" "${status_log}" - fi + status_check $status_export "${export_cmd}" "${status_log}" else save_infer_dir=${infer_model} fi #run inference is_quant=${infer_quant_flag[Count]} + if [ ${MODE} = "klquant_infer" ]; then + is_quant="True" + fi func_inference "${python}" "${inference_py}" "${save_infer_dir}" "${LOG_PATH}" "${infer_img_dir}" ${is_quant} Count=$(($Count + 1)) done - else IFS="|" export Count=0 @@ -363,3 +349,4 @@ else done # done with: for autocast in ${autocast_list[*]}; do done # done with: for gpu in ${gpu_list[*]}; do fi # end if [ ${MODE} = "infer" ]; then + diff --git a/README.md b/README.md index 0072e481a54bc56f16fb172ad8cb9f35adf98d39..c19493b07b0e615876404689f4eaac0802dbda60 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,42 @@ English | [简体中文](README_ch.md) +

+------------------------------------------------------------------------------------------
+ ## Introduction + PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools that help users train better models and apply them into practice. -## Notice -PaddleOCR supports both dynamic graph and static graph programming paradigm -- Dynamic graph: dygraph branch (default), **supported by paddle 2.0.0 ([installation](./doc/doc_en/installation_en.md))** -- Static graph: develop branch **Recent updates** -- 2021.1.21 update more than 25+ multilingual recognition models [models list](./doc/doc_en/models_list_en.md), including:English, Chinese, German, French, Japanese,Spanish,Portuguese Russia Arabic and so on. Models for more languages will continue to be updated [Develop Plan](https://github.com/PaddlePaddle/PaddleOCR/issues/1048). -- 2020.12.15 update Data synthesis tool, i.e., [Style-Text](./StyleText/README.md),easy to synthesize a large number of images which are similar to the target scene image. -- 2020.11.25 Update a new data annotation tool, i.e., [PPOCRLabel](./PPOCRLabel/README.md), which is helpful to improve the labeling efficiency. Moreover, the labeling results can be used in training of the PP-OCR system directly. -- 2020.9.22 Update the PP-OCR technical article, https://arxiv.org/abs/2009.09941 + +- PaddleOCR R&D team would like to share the key points of PP-OCRv2, at 20:15 pm on September 8th, [Live Address](https://live.bilibili.com/21689802). +- 2021.9.7 release PaddleOCR v2.3, [PP-OCRv2](#PP-OCRv2) is proposed. The inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server in CPU device. The F-score of PP-OCRv2 is 7% higher than that of PP-OCR mobile. +- 2021.8.3 released PaddleOCR v2.2, add a new structured documents analysis toolkit, i.e., [PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README.md), support layout analysis and table recognition (One-key to export chart images to Excel files). +- 2021.4.8 release end-to-end text recognition algorithm [PGNet](https://www.aaai.org/AAAI21Papers/AAAI-2885.WangP.pdf) which is published in AAAI 2021. Find tutorial [here](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/pgnet_en.md);release multi language recognition [models](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/multi_languages_en.md), support more than 80 languages recognition; especically, the performance of [English recognition model](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/models_list_en.md#English) is Optimized. 
+ - [more](./doc/doc_en/update_en.md) ## Features -- PPOCR series of high-quality pre-trained models, comparable to commercial effects - - Ultra lightweight ppocr_mobile series models: detection (3.0M) + direction classifier (1.4M) + recognition (5.0M) = 9.4M - - General ppocr_server series models: detection (47.1M) + direction classifier (1.4M) + recognition (94.9M) = 143.4M +- PP-OCR series of high-quality pre-trained models, comparable to commercial effects + - Ultra lightweight PP-OCRv2 series models: detection (3.1M) + direction classifier (1.4M) + recognition 8.5M) = 13.0M + - Ultra lightweight PP-OCR mobile series models: detection (3.0M) + direction classifier (1.4M) + recognition (5.0M) = 9.4M + - General PP-OCR server series models: detection (47.1M) + direction classifier (1.4M) + recognition (94.9M) = 143.4M - Support Chinese, English, and digit recognition, vertical text recognition, and long text recognition - Support multi-language recognition: Korean, Japanese, German, French - Rich toolkits related to the OCR areas @@ -64,39 +82,45 @@ Mobile DEMO experience (based on EasyEdge and Paddle-Lite, supports iOS and Andr -## PP-OCR 2.0 series model list(Update on Dec 15) -**Note** : Compared with [models 1.1](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/models_list_en.md), which are trained with static graph programming paradigm, models 2.0 are the dynamic graph trained version and achieve close performance. +## PP-OCR Series Model List(Update on September 8th) | Model introduction | Model name | Recommended scene | Detection model | Direction classifier | Recognition model | | ------------------------------------------------------------ | ---------------------------- | ----------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | -| Chinese and English ultra-lightweight OCR model (9.4M) | ch_ppocr_mobile_v2.0_xx | Mobile & server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | -| Chinese and English general OCR model (143.4M) | ch_ppocr_server_v2.0_xx | Server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_traingit.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | +| Chinese and English ultra-lightweight PP-OCRv2 model(11.6M) | ch_PP-OCRv2_xx |Mobile&Server|[inference 
model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/ch/ch_PP-OCRv2_rec_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar)| +| Chinese and English ultra-lightweight PP-OCR model (9.4M) | ch_ppocr_mobile_v2.0_xx | Mobile & server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | +| Chinese and English general PP-OCR model (143.4M) | ch_ppocr_server_v2.0_xx | Server |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_traingit.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | -For more model downloads (including multiple languages), please refer to [PP-OCR v2.0 series model downloads](./doc/doc_en/models_list_en.md). +For more model downloads (including multiple languages), please refer to [PP-OCR series model downloads](./doc/doc_en/models_list_en.md). For a new language request, please refer to [Guideline for new language_requests](#language_requests). 
## Tutorials -- [Installation](./doc/doc_en/installation_en.md) +- [Environment Preparation](./doc/doc_en/environment_en.md) - [Quick Start](./doc/doc_en/quickstart_en.md) -- [Code Structure](./doc/doc_en/tree_en.md) -- Algorithm Introduction - - [Text Detection Algorithm](./doc/doc_en/algorithm_overview_en.md) - - [Text Recognition Algorithm](./doc/doc_en/algorithm_overview_en.md) - - [PP-OCR Pipeline](#PP-OCR-Pipeline) -- Model Training/Evaluation - - [Text Detection](./doc/doc_en/detection_en.md) - - [Text Recognition](./doc/doc_en/recognition_en.md) - - [Direction Classification](./doc/doc_en/angle_class_en.md) - - [Yml Configuration](./doc/doc_en/config_en.md) -- Inference and Deployment - - [Quick Inference Based on PIP](./doc/doc_en/whl_en.md) +- [PaddleOCR Overview and Installation](./doc/doc_en/paddleOCR_overview_en.md) +- PP-OCR Industry Landing: from Training to Deployment + - [PP-OCR Model and Configuration](./doc/doc_en/models_and_config_en.md) + - [PP-OCR Model Download](./doc/doc_en/models_list_en.md) + - [Yml Configuration](./doc/doc_en/config_en.md) + - [Python Inference for PP-OCR Model Library](./doc/doc_en/inference_ppocr_en.md) + - [PP-OCR Training](./doc/doc_en/training_en.md) + - [Text Detection](./doc/doc_en/detection_en.md) + - [Text Recognition](./doc/doc_en/recognition_en.md) + - [Text Direction Classification](./doc/doc_en/angle_class_en.md) + - [Yml Configuration](./doc/doc_en/config_en.md) + - Inference and Deployment + - [C++ Inference](./deploy/cpp_infer/readme_en.md) + - [Serving](./deploy/pdserving/README.md) + - [Mobile](./deploy/lite/readme_en.md) + - [Benchmark](./doc/doc_en/benchmark_en.md) +- [PP-Structure: Information Extraction](./ppstructure/README.md) + - [Layout Parser](./ppstructure/layout/README.md) + - [Table Recognition](./ppstructure/table/README.md) +- Academic Circles + - [Two-stage Algorithm](./doc/doc_en/algorithm_overview_en.md) + - [PGNet Algorithm](./doc/doc_en/algorithm_overview_en.md) - [Python Inference](./doc/doc_en/inference_en.md) - - [C++ Inference](./deploy/cpp_infer/readme_en.md) - - [Serving](./deploy/pdserving/README.md) - - [Mobile](./deploy/lite/readme_en.md) - - [Benchmark](./doc/doc_en/benchmark_en.md) - Data Annotation and Synthesis - [Semi-automatic Annotation Tool: PPOCRLabel](./PPOCRLabel/README.md) - [Data Synthesis Tool: Style-Text](./StyleText/README.md) @@ -114,17 +138,18 @@ For a new language request, please refer to [Guideline for new language_requests - [License](#LICENSE) - [Contribution](#CONTRIBUTION) + +## PP-OCRv2 Pipeline +
- +[1] PP-OCR is a practical ultra-lightweight OCR system. It is mainly composed of three parts: DB text detection, detection frame correction and CRNN text recognition. The system adopts 19 effective strategies from 8 aspects including backbone network selection and adjustment, prediction head design, data augmentation, learning rate transformation strategy, regularization parameter selection, pre-training model use, and automatic model tailoring and quantization to optimize and slim down the models of each module (as shown in the green box above). The final results are an ultra-lightweight Chinese and English OCR model with an overall size of 3.5M and a 2.8M English digital OCR model. For more details, please refer to the PP-OCR technical article (https://arxiv.org/abs/2009.09941). -## PP-OCR Pipeline +[2] On the basis of PP-OCR, PP-OCRv2 is further optimized in five aspects. The detection model adopts CML(Collaborative Mutual Learning) knowledge distillation strategy and CopyPaste data expansion strategy. The recognition model adopts LCNet lightweight backbone network, U-DML knowledge distillation strategy and enhanced CTC loss function improvement (as shown in the red box above), which further improves the inference speed and prediction effect. For more details, please refer to the technical report of PP-OCRv2 (arXiv link is coming soon). -
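For reference, a minimal sketch of driving the full PP-OCR pipeline (text detection, angle classification and text recognition) through the `paddleocr` pip package; the image path below is only illustrative, and the exact layout of the returned results can differ slightly between package versions:

```python
from paddleocr import PaddleOCR

# The first call downloads the default detection/classification/recognition models.
ocr = PaddleOCR(use_angle_cls=True, lang='ch')

# Run detection + angle classification + recognition on one image.
result = ocr.ocr('doc/imgs/11.jpg', cls=True)
for line in result:
    # Each entry holds the text box coordinates and the (text, confidence) pair.
    print(line)
```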
-PP-OCR is a practical ultra-lightweight OCR system. It is mainly composed of three parts: DB text detection[2], detection frame correction and CRNN text recognition[7]. The system adopts 19 effective strategies from 8 aspects including backbone network selection and adjustment, prediction head design, data augmentation, learning rate transformation strategy, regularization parameter selection, pre-training model use, and automatic model tailoring and quantization to optimize and slim down the models of each module. The final results are an ultra-lightweight Chinese and English OCR model with an overall size of 3.5M and a 2.8M English digital OCR model. For more details, please refer to the PP-OCR technical article (https://arxiv.org/abs/2009.09941). Besides, The implementation of the FPGM Pruner [8] and PACT quantization [9] is based on [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim). ## Visualization [more](./doc/doc_en/visualization_en.md) @@ -149,7 +174,7 @@ PP-OCR is a practical ultra-lightweight OCR system. It is mainly composed of thr -## Guideline for new language requests +## Guideline for New Language Requests If you want to request a new language support, a PR with 2 following files are needed: diff --git a/README_ch.md b/README_ch.md index aec248f81c9e9494b87119b4dd15b70bdbca98b8..7e088e30116a4dd636b044fcc55169972ef04eb6 100755 --- a/README_ch.md +++ b/README_ch.md @@ -1,33 +1,48 @@ [English](README.md) | 简体中文 +

+------------------------------------------------------------------------------------------
+ ## 简介 + PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力使用者训练出更好的模型,并应用落地。 -## 注意 -PaddleOCR同时支持动态图与静态图两种编程范式 -- 动态图版本:dygraph分支(默认),需将paddle版本升级至2.0.0([快速安装](./doc/doc_ch/installation.md)) -- 静态图版本:develop分支 **近期更新** -- 2021.4.8 release 2.1版本,新增AAAI 2021论文[端到端识别算法PGNet](./doc/doc_ch/pgnet.md)开源,[多语言模型](./doc/doc_ch/multi_languages.md)支持种类增加到80+。 -- 2021.2.1 [FAQ](./doc/doc_ch/FAQ.md)新增5个高频问题,总数162个,每周一都会更新,欢迎大家持续关注。 -- 2021.1.21 更新多语言识别模型,目前支持语种超过27种,包括中文简体、中文繁体、英文、法文、德文、韩文、日文、意大利文、西班牙文、葡萄牙文、俄罗斯文、阿拉伯文等,后续计划可以参考[多语言研发计划](https://github.com/PaddlePaddle/PaddleOCR/issues/1048) -- 2020.12.15 更新数据合成工具[Style-Text](./StyleText/README_ch.md),可以批量合成大量与目标场景类似的图像,在多个场景验证,效果明显提升。 -- 2020.11.25 更新半自动标注工具[PPOCRLabel](./PPOCRLabel/README_ch.md),辅助开发者高效完成标注任务,输出格式与PP-OCR训练任务完美衔接。 -- 2020.9.22 更新PP-OCR技术文章,https://arxiv.org/abs/2009.09941 -- [More](./doc/doc_ch/update.md) - +- PaddleOCR研发团队对最新发版内容技术深入解读,9月8日晚上20:15,[直播地址](https://live.bilibili.com/21689802)。 +- 2021.9.7 发布PaddleOCR v2.3,发布[PP-OCRv2](#PP-OCRv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。 +- 2021.8.3 发布PaddleOCR v2.2,新增文档结构分析[PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README_ch.md)工具包,支持版面分析与表格识别(含Excel导出)。 +- 2021.6.29 [FAQ](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_ch/FAQ.md)新增5个高频问题,总数248个,每周一都会更新,欢迎大家持续关注。 +- 2021.4.8 release 2.1版本,新增AAAI 2021论文[端到端识别算法PGNet](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_ch/pgnet.md)开源,[多语言模型](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_ch/multi_languages.md)支持种类增加到80+。 +- [More](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_ch/update.md) ## 特性 -- PPOCR系列高质量预训练模型,准确的识别效果 - - 超轻量ppocr_mobile移动端系列:检测(3.0M)+方向分类器(1.4M)+ 识别(5.0M)= 9.4M - - 通用ppocr_server系列:检测(47.1M)+方向分类器(1.4M)+ 识别(94.9M)= 143.4M +- PP-OCR系列高质量预训练模型,准确的识别效果 + - 超轻量PP-OCRv2系列:检测(3.1M)+ 方向分类器(1.4M)+ 识别(8.5M)= 13.0M + - 超轻量PP-OCR mobile移动端系列:检测(3.0M)+方向分类器(1.4M)+ 识别(5.0M)= 9.4M + - 通用PPOCR server系列:检测(47.1M)+方向分类器(1.4M)+ 识别(94.9M)= 143.4M - 支持中英文数字组合识别、竖排文本识别、长文本识别 - 支持多语言识别:韩语、日语、德语、法语 - 丰富易用的OCR相关工具组件 - 半自动数据标注工具PPOCRLabel:支持快速高效的数据标注 - 数据合成工具Style-Text:批量合成大量与目标场景类似的图像 + - 文档分析能力PP-Structure:版面分析与表格识别 - 支持用户自定义训练,提供丰富的预测推理部署方案 - 支持PIP快速安装使用 - 可运行于Linux、Windows、MacOS等多种系统 @@ -39,7 +54,7 @@ PaddleOCR同时支持动态图与静态图两种编程范式 -上图是通用ppocr_server模型效果展示,更多效果图请见[效果展示页面](./doc/doc_ch/visualization.md)。 +上图是通用PP-OCR server模型效果展示,更多效果图请见[效果展示页面](./doc/doc_ch/visualization.md)。 ## 欢迎加入PaddleOCR技术交流群 @@ -62,71 +77,79 @@ PaddleOCR同时支持动态图与静态图两种编程范式 - 代码体验:从[快速安装](./doc/doc_ch/quickstart.md) 开始 -## PP-OCR 2.0系列模型列表(更新中) -**说明** :2.0版模型和[1.1版模型](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/models_list.md)的主要区别在于动态图训练vs.静态图训练,模型性能上无明显差距。 +## PP-OCR系列模型列表(更新中) + | 模型简介 | 模型名称 |推荐场景 | 检测模型 | 方向分类器 | 识别模型 | | ------------ | --------------- | ----------------|---- | ---------- | -------- | -| 中英文超轻量OCR模型(9.4M) | ch_ppocr_mobile_v2.0_xx |移动端&服务器端|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | -| 中英文通用OCR模型(143.4M) 
|ch_ppocr_server_v2.0_xx|服务器端 |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | +| 中英文超轻量PP-OCRv2模型(13.0M) | ch_PP-OCRv2_xx |移动端&服务器端|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar)| +| 中英文超轻量PP-OCR mobile模型(9.4M) | ch_ppocr_mobile_v2.0_xx |移动端&服务器端|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | +| 中英文通用PP-OCR server模型(143.4M) |ch_ppocr_server_v2.0_xx|服务器端 |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | -更多模型下载(包括多语言),可以参考[PP-OCR v2.0 系列模型下载](./doc/doc_ch/models_list.md) +更多模型下载(包括多语言),可以参考[PP-OCR 系列模型下载](./doc/doc_ch/models_list.md) ## 文档教程 -- [快速安装](./doc/doc_ch/installation.md) -- [中文OCR模型快速使用](./doc/doc_ch/quickstart.md) -- [多语言OCR模型快速使用](./doc/doc_ch/multi_languages.md) -- [代码组织结构](./doc/doc_ch/tree.md) -- 算法介绍 - - [文本检测](./doc/doc_ch/algorithm_overview.md) - - [文本识别](./doc/doc_ch/algorithm_overview.md) - - [PP-OCR Pipeline](#PP-OCR) +- [运行环境准备](./doc/doc_ch/environment.md) +- [快速开始(中英文/多语言/文档分析)](./doc/doc_ch/quickstart.md) +- [PaddleOCR全景图与项目克隆](./doc/doc_ch/paddleOCR_overview.md) +- PP-OCR产业落地:从训练到部署 + - [PP-OCR模型与配置文件](./doc/doc_ch/models_and_config.md) + - [PP-OCR模型下载](./doc/doc_ch/models_list.md) + - [配置文件内容与生成](./doc/doc_ch/config.md) + - [PP-OCR模型库快速推理](./doc/doc_ch/inference_ppocr.md) + - [PP-OCR模型训练](./doc/doc_ch/training.md) + - [文本检测](./doc/doc_ch/detection.md) + - [文本识别](./doc/doc_ch/recognition.md) + - [文本方向分类器](./doc/doc_ch/angle_class.md) + - [配置文件内容与生成](./doc/doc_ch/config.md) + - PP-OCR模型推理部署 + - [基于C++预测引擎推理](./deploy/cpp_infer/readme.md) + - [服务化部署](./deploy/pdserving/README_CN.md) + - 
[端侧部署](./deploy/lite/readme.md) + - [Benchmark](./doc/doc_ch/benchmark.md) +- [PP-Structure信息提取](./ppstructure/README_ch.md) + - [版面分析](./ppstructure/layout/README_ch.md) + - [表格识别](./ppstructure/table/README_ch.md) +- 数据标注与合成 + - [半自动标注工具PPOCRLabel](./PPOCRLabel/README_ch.md) + - [数据合成工具Style-Text](./StyleText/README_ch.md) + - [其它数据标注工具](./doc/doc_ch/data_annotation.md) + - [其它数据合成工具](./doc/doc_ch/data_synthesis.md) +- OCR学术圈 + - [两阶段模型介绍与下载](./doc/doc_ch/algorithm_overview.md) - [端到端PGNet算法](./doc/doc_ch/pgnet.md) -- 模型训练/评估 - - [文本检测](./doc/doc_ch/detection.md) - - [文本识别](./doc/doc_ch/recognition.md) - - [方向分类器](./doc/doc_ch/angle_class.md) - - [yml参数配置文件介绍](./doc/doc_ch/config.md) -- 预测部署 - - [基于pip安装whl包快速推理](./doc/doc_ch/whl.md) - [基于Python脚本预测引擎推理](./doc/doc_ch/inference.md) - - [基于C++预测引擎推理](./deploy/cpp_infer/readme.md) - - [服务化部署](./deploy/pdserving/README_CN.md) - - [端侧部署](./deploy/lite/readme.md) - - [Benchmark](./doc/doc_ch/benchmark.md) - 数据集 - [通用中英文OCR数据集](./doc/doc_ch/datasets.md) - [手写中文OCR数据集](./doc/doc_ch/handwritten_datasets.md) - [垂类多语言OCR数据集](./doc/doc_ch/vertical_and_multilingual_datasets.md) -- 数据标注与合成 - - [半自动标注工具PPOCRLabel](./PPOCRLabel/README_ch.md) - - [数据合成工具Style-Text](./StyleText/README_ch.md) - - [其它数据标注工具](./doc/doc_ch/data_annotation.md) - - [其它数据合成工具](./doc/doc_ch/data_synthesis.md) - [效果展示](#效果展示) - FAQ - [【精选】OCR精选10个问题](./doc/doc_ch/FAQ.md) - - [【理论篇】OCR通用32个问题](./doc/doc_ch/FAQ.md) - - [【实战篇】PaddleOCR实战110个问题](./doc/doc_ch/FAQ.md) + - [【理论篇】OCR通用50个问题](./doc/doc_ch/FAQ.md) + - [【实战篇】PaddleOCR实战183个问题](./doc/doc_ch/FAQ.md) - [技术交流群](#欢迎加入PaddleOCR技术交流群) - [参考文献](./doc/doc_ch/reference.md) - [许可证书](#许可证书) - [贡献代码](#贡献代码) +- [代码组织结构](./doc/doc_ch/tree.md) + + - -## PP-OCR Pipeline +## PP-OCRv2 Pipeline
- +
-PP-OCR是一个实用的超轻量OCR系统。主要由DB文本检测[2]、检测框矫正和CRNN文本识别三部分组成[7]。该系统从骨干网络选择和调整、预测头部的设计、数据增强、学习率变换策略、正则化参数选择、预训练模型使用以及模型自动裁剪量化8个方面,采用19个有效策略,对各个模块的模型进行效果调优和瘦身,最终得到整体大小为3.5M的超轻量中英文OCR和2.8M的英文数字OCR。更多细节请参考PP-OCR技术方案 https://arxiv.org/abs/2009.09941 。其中FPGM裁剪器[8]和PACT量化[9]的实现可以参考[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)。 +[1] PP-OCR是一个实用的超轻量OCR系统。主要由DB文本检测、检测框矫正和CRNN文本识别三部分组成。该系统从骨干网络选择和调整、预测头部的设计、数据增强、学习率变换策略、正则化参数选择、预训练模型使用以及模型自动裁剪量化8个方面,采用19个有效策略,对各个模块的模型进行效果调优和瘦身(如绿框所示),最终得到整体大小为3.5M的超轻量中英文OCR和2.8M的英文数字OCR。更多细节请参考PP-OCR技术方案 https://arxiv.org/abs/2009.09941 + +[2] PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模型采用CML协同互学习知识蒸馏策略和CopyPaste数据增广策略;识别模型采用LCNet轻量级骨干网络、UDML 改进知识蒸馏策略和Enhanced CTC loss损失函数改进(如上图红框所示),进一步在推理速度和预测效果上取得明显提升。更多细节请参考PP-OCR技术方案(arxiv链接生成中)。 + ## 效果展示 [more](./doc/doc_ch/visualization.md) - 中文模型
- -
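As a quick check of the pipeline described above, here is a minimal sketch of driving the full detection, direction classification and recognition chain through the pip-installed `paddleocr` whl package. It assumes the package and its pretrained weights can be fetched on first use; the sample image path is only an illustration, and the exact structure of the returned results may differ between releases.

```python
# Minimal sketch: run the PP-OCR pipeline (DB text detection -> direction
# classification -> CRNN text recognition) via the paddleocr whl package.
# Pretrained models are downloaded automatically on first use; the image
# path below is just a sample and can be replaced with any local image.
from paddleocr import PaddleOCR

ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # det + cls + rec in one object
result = ocr.ocr("doc/imgs/11.jpg", cls=True)
for line in result:
    box, (text, score) = line  # quadrilateral points, recognized text, confidence
    print(box, text, score)
```

The C++, serving and lite deployment paths listed in the documentation tutorials wrap these same three stages.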
diff --git a/benchmark/analysis.py b/benchmark/analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..c4189b99d8ee082082a254718617a7e58bebe961 --- /dev/null +++ b/benchmark/analysis.py @@ -0,0 +1,273 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import argparse +import json +import os +import re +import traceback + + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--filename", type=str, help="The name of log which need to analysis.") + parser.add_argument( + "--log_with_profiler", type=str, help="The path of train log with profiler") + parser.add_argument( + "--profiler_path", type=str, help="The path of profiler timeline log.") + parser.add_argument( + "--keyword", type=str, help="Keyword to specify analysis data") + parser.add_argument( + "--separator", type=str, default=None, help="Separator of different field in log") + parser.add_argument( + '--position', type=int, default=None, help='The position of data field') + parser.add_argument( + '--range', type=str, default="", help='The range of data field to intercept') + parser.add_argument( + '--base_batch_size', type=int, help='base_batch size on gpu') + parser.add_argument( + '--skip_steps', type=int, default=0, help='The number of steps to be skipped') + parser.add_argument( + '--model_mode', type=int, default=-1, help='Analysis mode, default value is -1') + parser.add_argument( + '--ips_unit', type=str, default=None, help='IPS unit') + parser.add_argument( + '--model_name', type=str, default=0, help='training model_name, transformer_base') + parser.add_argument( + '--mission_name', type=str, default=0, help='training mission name') + parser.add_argument( + '--direction_id', type=int, default=0, help='training direction_id') + parser.add_argument( + '--run_mode', type=str, default="sp", help='multi process or single process') + parser.add_argument( + '--index', type=int, default=1, help='{1: speed, 2:mem, 3:profiler, 6:max_batch_size}') + parser.add_argument( + '--gpu_num', type=int, default=1, help='nums of training gpus') + args = parser.parse_args() + args.separator = None if args.separator == "None" else args.separator + return args + + +def _is_number(num): + pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$') + result = pattern.match(num) + if result: + return True + else: + return False + + +class TimeAnalyzer(object): + def __init__(self, filename, keyword=None, separator=None, position=None, range="-1"): + if filename is None: + raise Exception("Please specify the filename!") + + if keyword is None: + raise Exception("Please specify the keyword!") + + self.filename = filename + self.keyword = keyword + self.separator = separator + self.position = position + self.range = range + self.records = None + self._distil() + + def _distil(self): + self.records = [] + with open(self.filename, "r") as f_object: + lines = f_object.readlines() 
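+            # Scan the log once: keep only lines containing the keyword, take the
+            # field at --position (or, if no position is given, the token right after
+            # the keyword), trim it with --range, and collect the values as floats in
+            # self.records.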
+ for line in lines: + if self.keyword not in line: + continue + try: + result = None + + # Distil the string from a line. + line = line.strip() + line_words = line.split(self.separator) if self.separator else line.split() + if args.position: + result = line_words[self.position] + else: + # Distil the string following the keyword. + for i in range(len(line_words) - 1): + if line_words[i] == self.keyword: + result = line_words[i + 1] + break + + # Distil the result from the picked string. + if not self.range: + result = result[0:] + elif _is_number(self.range): + result = result[0: int(self.range)] + else: + result = result[int(self.range.split(":")[0]): int(self.range.split(":")[1])] + self.records.append(float(result)) + except Exception as exc: + print("line is: {}; separator={}; position={}".format(line, self.separator, self.position)) + + print("Extract {} records: separator={}; position={}".format(len(self.records), self.separator, self.position)) + + def _get_fps(self, mode, batch_size, gpu_num, avg_of_records, run_mode, unit=None): + if mode == -1 and run_mode == 'sp': + assert unit, "Please set the unit when mode is -1." + fps = gpu_num * avg_of_records + elif mode == -1 and run_mode == 'mp': + assert unit, "Please set the unit when mode is -1." + fps = gpu_num * avg_of_records #temporarily, not used now + print("------------this is mp") + elif mode == 0: + # s/step -> samples/s + fps = (batch_size * gpu_num) / avg_of_records + unit = "samples/s" + elif mode == 1: + # steps/s -> steps/s + fps = avg_of_records + unit = "steps/s" + elif mode == 2: + # s/step -> steps/s + fps = 1 / avg_of_records + unit = "steps/s" + elif mode == 3: + # steps/s -> samples/s + fps = batch_size * gpu_num * avg_of_records + unit = "samples/s" + elif mode == 4: + # s/epoch -> s/epoch + fps = avg_of_records + unit = "s/epoch" + else: + ValueError("Unsupported analysis mode.") + + return fps, unit + + def analysis(self, batch_size, gpu_num=1, skip_steps=0, mode=-1, run_mode='sp', unit=None): + if batch_size <= 0: + print("base_batch_size should larger than 0.") + return 0, '' + + if len(self.records) <= skip_steps: # to address the condition which item of log equals to skip_steps + print("no records") + return 0, '' + + sum_of_records = 0 + sum_of_records_skipped = 0 + skip_min = self.records[skip_steps] + skip_max = self.records[skip_steps] + + count = len(self.records) + for i in range(count): + sum_of_records += self.records[i] + if i >= skip_steps: + sum_of_records_skipped += self.records[i] + if self.records[i] < skip_min: + skip_min = self.records[i] + if self.records[i] > skip_max: + skip_max = self.records[i] + + avg_of_records = sum_of_records / float(count) + avg_of_records_skipped = sum_of_records_skipped / float(count - skip_steps) + + fps, fps_unit = self._get_fps(mode, batch_size, gpu_num, avg_of_records, run_mode, unit) + fps_skipped, _ = self._get_fps(mode, batch_size, gpu_num, avg_of_records_skipped, run_mode, unit) + if mode == -1: + print("average ips of %d steps, skip 0 step:" % count) + print("\tAvg: %.3f %s" % (avg_of_records, fps_unit)) + print("\tFPS: %.3f %s" % (fps, fps_unit)) + if skip_steps > 0: + print("average ips of %d steps, skip %d steps:" % (count, skip_steps)) + print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit)) + print("\tMin: %.3f %s" % (skip_min, fps_unit)) + print("\tMax: %.3f %s" % (skip_max, fps_unit)) + print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) + elif mode == 1 or mode == 3: + print("average latency of %d steps, skip 0 step:" % count) + 
print("\tAvg: %.3f steps/s" % avg_of_records) + print("\tFPS: %.3f %s" % (fps, fps_unit)) + if skip_steps > 0: + print("average latency of %d steps, skip %d steps:" % (count, skip_steps)) + print("\tAvg: %.3f steps/s" % avg_of_records_skipped) + print("\tMin: %.3f steps/s" % skip_min) + print("\tMax: %.3f steps/s" % skip_max) + print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) + elif mode == 0 or mode == 2: + print("average latency of %d steps, skip 0 step:" % count) + print("\tAvg: %.3f s/step" % avg_of_records) + print("\tFPS: %.3f %s" % (fps, fps_unit)) + if skip_steps > 0: + print("average latency of %d steps, skip %d steps:" % (count, skip_steps)) + print("\tAvg: %.3f s/step" % avg_of_records_skipped) + print("\tMin: %.3f s/step" % skip_min) + print("\tMax: %.3f s/step" % skip_max) + print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) + + return round(fps_skipped, 3), fps_unit + + +if __name__ == "__main__": + args = parse_args() + run_info = dict() + run_info["log_file"] = args.filename + run_info["model_name"] = args.model_name + run_info["mission_name"] = args.mission_name + run_info["direction_id"] = args.direction_id + run_info["run_mode"] = args.run_mode + run_info["index"] = args.index + run_info["gpu_num"] = args.gpu_num + run_info["FINAL_RESULT"] = 0 + run_info["JOB_FAIL_FLAG"] = 0 + + try: + if args.index == 1: + if args.gpu_num == 1: + run_info["log_with_profiler"] = args.log_with_profiler + run_info["profiler_path"] = args.profiler_path + analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator, args.position, args.range) + run_info["FINAL_RESULT"], run_info["UNIT"] = analyzer.analysis( + batch_size=args.base_batch_size, + gpu_num=args.gpu_num, + skip_steps=args.skip_steps, + mode=args.model_mode, + run_mode=args.run_mode, + unit=args.ips_unit) + try: + if int(os.getenv('job_fail_flag')) == 1 or int(run_info["FINAL_RESULT"]) == 0: + run_info["JOB_FAIL_FLAG"] = 1 + except: + pass + elif args.index == 3: + run_info["FINAL_RESULT"] = {} + records_fo_total = TimeAnalyzer(args.filename, 'Framework overhead', None, 3, '').records + records_fo_ratio = TimeAnalyzer(args.filename, 'Framework overhead', None, 5).records + records_ct_total = TimeAnalyzer(args.filename, 'Computation time', None, 3, '').records + records_gm_total = TimeAnalyzer(args.filename, 'GpuMemcpy Calls', None, 4, '').records + records_gm_ratio = TimeAnalyzer(args.filename, 'GpuMemcpy Calls', None, 6).records + records_gmas_total = TimeAnalyzer(args.filename, 'GpuMemcpyAsync Calls', None, 4, '').records + records_gms_total = TimeAnalyzer(args.filename, 'GpuMemcpySync Calls', None, 4, '').records + run_info["FINAL_RESULT"]["Framework_Total"] = records_fo_total[0] if records_fo_total else 0 + run_info["FINAL_RESULT"]["Framework_Ratio"] = records_fo_ratio[0] if records_fo_ratio else 0 + run_info["FINAL_RESULT"]["ComputationTime_Total"] = records_ct_total[0] if records_ct_total else 0 + run_info["FINAL_RESULT"]["GpuMemcpy_Total"] = records_gm_total[0] if records_gm_total else 0 + run_info["FINAL_RESULT"]["GpuMemcpy_Ratio"] = records_gm_ratio[0] if records_gm_ratio else 0 + run_info["FINAL_RESULT"]["GpuMemcpyAsync_Total"] = records_gmas_total[0] if records_gmas_total else 0 + run_info["FINAL_RESULT"]["GpuMemcpySync_Total"] = records_gms_total[0] if records_gms_total else 0 + else: + print("Not support!") + except Exception: + traceback.print_exc() + print("{}".format(json.dumps(run_info))) # it's required, for the log file path insert to the database + diff --git a/benchmark/readme.md b/benchmark/readme.md 
new file mode 100644 index 0000000000000000000000000000000000000000..7f7704cca5341d495dfbcdc66ddfd29fbea1e1df --- /dev/null +++ b/benchmark/readme.md @@ -0,0 +1,34 @@ + +# PaddleOCR DB/EAST 算法训练benchmark测试 + +PaddleOCR/benchmark目录下的文件用于获取并分析训练日志。 +训练采用icdar2015数据集,包括1000张训练图像和500张测试图像。模型配置采用resnet18_vd作为backbone,分别训练batch_size=8和batch_size=16的情况。 + +## 运行训练benchmark + +benchmark/run_det.sh 中包含了四个过程: +- 安装依赖 +- 下载数据 +- 执行训练 +- 日志分析获取IPS + +在执行训练部分,会执行单机单卡(默认0号卡)和单机多卡训练,并分别执行batch_size=8和batch_size=16的情况。所以执行完后,每种模型会得到4个日志文件。 + +run_det.sh 执行方式如下: + +``` +# cd PaddleOCR/ +bash benchmark/run_det.sh +``` + +以DB为例,将得到四个日志文件,如下: +``` +det_res18_db_v2.0_sp_bs16_fp32_1 +det_res18_db_v2.0_sp_bs8_fp32_1 +det_res18_db_v2.0_mp_bs16_fp32_1 +det_res18_db_v2.0_mp_bs8_fp32_1 +``` + + + + diff --git a/benchmark/run_benchmark_det.sh b/benchmark/run_benchmark_det.sh new file mode 100644 index 0000000000000000000000000000000000000000..26bcda5d20ba4e4d0498da28aafb93f29468169d --- /dev/null +++ b/benchmark/run_benchmark_det.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -xe +# 运行示例:CUDA_VISIBLE_DEVICES=0 bash run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} +# 参数说明 +function _set_params(){ + run_mode=${1:-"sp"} # 单卡sp|多卡mp + batch_size=${2:-"64"} + fp_item=${3:-"fp32"} # fp32|fp16 + max_iter=${4:-"500"} # 可选,如果需要修改代码提前中断 + model_name=${5:-"model_name"} + run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数 + +# 以下不用修改 + device=${CUDA_VISIBLE_DEVICES//,/ } + arr=(${device}) + num_gpu_devices=${#arr[*]} + log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices} +} +function _train(){ + echo "Train on ${num_gpu_devices} GPUs" + echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" + + train_cmd="-c configs/det/${model_name}.yml -o Train.loader.batch_size_per_card=${batch_size} Global.epoch_num=${max_iter} " + case ${run_mode} in + sp) + train_cmd="python3.7 tools/train.py "${train_cmd}"" + ;; + mp) + train_cmd="python3.7 -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py ${train_cmd}" + ;; + *) echo "choose run_mode(sp or mp)"; exit 1; + esac +# 以下不用修改 + timeout 15m ${train_cmd} > ${log_file} 2>&1 + if [ $?
-ne 0 ];then + echo -e "${model_name}, FAIL" + export job_fail_flag=1 + else + echo -e "${model_name}, SUCCESS" + export job_fail_flag=0 + fi + kill -9 `ps -ef|grep 'python3.7'|awk '{print $2}'` + + if [ $run_mode = "mp" -a -d mylog ]; then + rm ${log_file} + cp mylog/workerlog.0 ${log_file} + fi + + # run log analysis + analysis_cmd="python3.7 benchmark/analysis.py --filename ${log_file} --mission_name ${model_name} --run_mode ${run_mode} --direction_id 0 --keyword 'ips:' --base_batch_size ${batch_size} --skip_steps 1 --gpu_num ${num_gpu_devices} --index 1 --model_mode=-1 --ips_unit=samples/sec" + eval $analysis_cmd +} + +_set_params $@ +_train + diff --git a/benchmark/run_det.sh b/benchmark/run_det.sh new file mode 100644 index 0000000000000000000000000000000000000000..c507510c615a60177e07300976947b010dbae990 --- /dev/null +++ b/benchmark/run_det.sh @@ -0,0 +1,28 @@ +# 提供可稳定复现性能的脚本,默认在标准docker环境内py37执行: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 paddle=2.1.2 py=37 +# 执行目录: ./PaddleOCR +# 1 安装该模型需要的依赖 (如需开启优化策略请注明) +python3.7 -m pip install -r requirements.txt +# 2 拷贝该模型需要的数据、预训练模型 +wget -c -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015.tar && cd train_data && tar xf icdar2015.tar && cd ../ +wget -c -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_pretrained.pdparams +# 3 批量运行(如不方便批量,1,2需放到单个模型中) + +model_mode_list=(det_res18_db_v2.0 det_r50_vd_east) +fp_item_list=(fp32) +bs_list=(8 16) +for model_mode in ${model_mode_list[@]}; do + for fp_item in ${fp_item_list[@]}; do + for bs_item in ${bs_list[@]}; do + echo "index is speed, 1gpus, begin, ${model_mode}" + run_mode=sp + CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 10 ${model_mode} # (5min) + sleep 60 + echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_mode}" + run_mode=mp + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 10 ${model_mode} + sleep 60 + done + done +done + + diff --git a/configs/det/ch_ppocr_v2.1/ch_det_lite_train_cml_v2.1.yml b/configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml similarity index 94% rename from configs/det/ch_ppocr_v2.1/ch_det_lite_train_cml_v2.1.yml rename to configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml index dcf0e1f25f8076f8c29fe50413e567301ba644ce..ab484a44833a405513d7f2b4079a4da4c2e403c8 100644 --- a/configs/det/ch_ppocr_v2.1/ch_det_lite_train_cml_v2.1.yml +++ b/configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml @@ -8,7 +8,7 @@ Global: # evaluation is run every 5000 iterations after the 4000th iteration eval_batch_step: [3000, 2000] cal_metric_during_train: False - pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + pretrained_model: ./pretrain_models/ch_PP-OCRv2_det_distill_train/best_accuracy checkpoints: save_inference_dir: use_visualdl: False @@ -19,30 +19,26 @@ Architecture: name: DistillationModel algorithm: Distillation Models: - Student: - pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained - freeze_params: false + Teacher: + freeze_params: true return_all_feats: false model_type: det algorithm: DB + Transform: Backbone: - name: MobileNetV3 - scale: 0.5 - model_name: large - disable_se: True + name: ResNet + layers: 18 Neck: name: DBFPN - out_channels: 96 + out_channels: 256 Head: name: DBHead k: 50 - Student2: - pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + Student: freeze_params: false return_all_feats: false model_type: det algorithm: DB - Transform:
Backbone: name: MobileNetV3 scale: 0.5 @@ -54,23 +50,24 @@ Architecture: Head: name: DBHead k: 50 - Teacher: - pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy - freeze_params: true + Student2: + freeze_params: false return_all_feats: false model_type: det algorithm: DB Transform: Backbone: - name: ResNet - layers: 18 + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: True Neck: name: DBFPN - out_channels: 256 + out_channels: 96 Head: name: DBHead k: 50 - + Loss: name: CombinedLoss loss_config_list: @@ -144,6 +141,7 @@ Train: img_mode: BGR channel_first: False - DetLabelEncode: # Class handling label + - CopyPaste: - IaaAugment: augmenter_args: - { 'type': Fliplr, 'args': { 'p': 0.5 } } diff --git a/configs/det/ch_ppocr_v2.1/ch_det_lite_train_distill_v2.1.yml b/configs/det/ch_PP-OCRv2/ch_PP-OCR_det_distill.yml similarity index 98% rename from configs/det/ch_ppocr_v2.1/ch_det_lite_train_distill_v2.1.yml rename to configs/det/ch_PP-OCRv2/ch_PP-OCR_det_distill.yml index 1159d71bf94c330e26c3009b38c5c2b4a9c96f52..46daeeb86d004772a6fb964d602369dcd53b3a01 100644 --- a/configs/det/ch_ppocr_v2.1/ch_det_lite_train_distill_v2.1.yml +++ b/configs/det/ch_PP-OCRv2/ch_PP-OCR_det_distill.yml @@ -68,8 +68,7 @@ Loss: ohem_ratio: 3 - DistillationDBLoss: weight: 1.0 - model_name_list: ["Student", "Teacher"] - # key: maps + model_name_list: ["Student"] name: DBLoss balance_loss: true main_loss_type: DiceLoss @@ -116,6 +115,7 @@ Train: img_mode: BGR channel_first: False - DetLabelEncode: # Class handling label + - CopyPaste: - IaaAugment: augmenter_args: - { 'type': Fliplr, 'args': { 'p': 0.5 } } diff --git a/configs/det/ch_ppocr_v2.1/ch_det_lite_train_dml_v2.1.yml b/configs/det/ch_PP-OCRv2/ch_PP-OCR_det_dml.yml similarity index 99% rename from configs/det/ch_ppocr_v2.1/ch_det_lite_train_dml_v2.1.yml rename to configs/det/ch_PP-OCRv2/ch_PP-OCR_det_dml.yml index 7fe2d2e1a065b54d0e2479475f5f67ac5e38a166..bfbc3b6268cf521acb035be33ced9141046fc430 100644 --- a/configs/det/ch_ppocr_v2.1/ch_det_lite_train_dml_v2.1.yml +++ b/configs/det/ch_PP-OCRv2/ch_PP-OCR_det_dml.yml @@ -118,6 +118,7 @@ Train: img_mode: BGR channel_first: False - DetLabelEncode: # Class handling label + - CopyPaste: - IaaAugment: augmenter_args: - { 'type': Fliplr, 'args': { 'p': 0.5 } } diff --git a/configs/det/ch_PP-OCRv2/ch_PP-OCR_det_student.yml b/configs/det/ch_PP-OCRv2/ch_PP-OCR_det_student.yml new file mode 100644 index 0000000000000000000000000000000000000000..cca2a596ce73d7f66a14e5967e5926c5ee36295c --- /dev/null +++ b/configs/det/ch_PP-OCRv2/ch_PP-OCR_det_student.yml @@ -0,0 +1,132 @@ +Global: + use_gpu: true + epoch_num: 1200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/ch_db_mv3/ + save_epoch_step: 1200 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 400] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/student.pdparams + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_db/predicts_db.txt + +Architecture: + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: True + Neck: + name: DBFPN + out_channels: 96 + Head: + name: DBHead + k: 50 + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 + 
regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - IaaAugment: + augmenter_args: + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [-10, 10] } } + - { 'type': Resize, 'args': { 'size': [0.5, 3] } } + - EastRandomCropData: + size: [960, 960] + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: +# image_shape: [736, 1280] + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 diff --git a/configs/det/det_mv3_db.yml b/configs/det/det_mv3_db.yml index b69ed58cd6f64b82b419d46f4f91458d7d167d84..1fab509d12167f0cfa3bb77cf21173c68af55737 100644 --- a/configs/det/det_mv3_db.yml +++ b/configs/det/det_mv3_db.yml @@ -128,4 +128,4 @@ Eval: drop_last: False batch_size_per_card: 1 # must be 1 num_workers: 8 - use_shared_memory: False \ No newline at end of file + use_shared_memory: False diff --git a/configs/det/det_mv3_pse.yml b/configs/det/det_mv3_pse.yml new file mode 100644 index 0000000000000000000000000000000000000000..61ac24727acbd4f0b1eea15af08c0f9e71ce95a3 --- /dev/null +++ b/configs/det/det_mv3_pse.yml @@ -0,0 +1,135 @@ +Global: + use_gpu: true + epoch_num: 600 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/det_mv3_pse/ + save_epoch_step: 600 + # evaluation is run every 63 iterations + eval_batch_step: [ 0,63 ] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + checkpoints: #./output/det_r50_vd_pse_batch8_ColorJitter/best_accuracy + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_pse/predicts_pse.txt + +Architecture: + model_type: det + algorithm: PSE + Transform: null + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + Neck: + name: FPN + out_channels: 96 + Head: + name: PSEHead + hidden_dim: 96 + out_channels: 7 + +Loss: + name: PSELoss + alpha: 0.7 + ohem_ratio: 3 + kernel_sample_mask: pred + reduction: none + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Step + learning_rate: 
0.001 + step_size: 200 + gamma: 0.1 + regularizer: + name: 'L2' + factor: 0.0005 + +PostProcess: + name: PSEPostProcess + thresh: 0 + box_thresh: 0.85 + min_area: 16 + box_type: box # 'box' or 'poly' + scale: 1 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [ 1.0 ] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - ColorJitter: + brightness: 0.12549019607843137 + saturation: 0.5 + - IaaAugment: + augmenter_args: + - { 'type': Resize, 'args': { 'size': [ 0.5, 3 ] } } + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [ -10, 10 ] } } + - MakePseGt: + kernel_num: 7 + min_shrink_ratio: 0.4 + size: 640 + - RandomCropImgMask: + size: [ 640,640 ] + main_key: gt_text + crop_keys: [ 'image', 'gt_text', 'gt_kernels', 'mask' ] + - NormalizeImage: + scale: 1./255. + mean: [ 0.485, 0.456, 0.406 ] + std: [ 0.229, 0.224, 0.225 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'gt_text', 'gt_kernels', 'mask' ] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 16 + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + ratio_list: [ 1.0 ] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + limit_side_len: 736 + limit_type: min + - NormalizeImage: + scale: 1./255. 
+ mean: [ 0.485, 0.456, 0.406 ] + std: [ 0.229, 0.224, 0.225 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'shape', 'polys', 'ignore_tags' ] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 8 \ No newline at end of file diff --git a/configs/det/det_r50_vd_db.yml b/configs/det/det_r50_vd_db.yml index 42b3898ef463b8dba6338f37824ade0c93794212..ab67786ece2db9c082ad0484e9dd9a71a795c2d7 100644 --- a/configs/det/det_r50_vd_db.yml +++ b/configs/det/det_r50_vd_db.yml @@ -98,7 +98,7 @@ Train: shuffle: True drop_last: False batch_size_per_card: 16 - num_workers: 8 + num_workers: 4 Eval: dataset: @@ -125,4 +125,4 @@ Eval: shuffle: False drop_last: False batch_size_per_card: 1 # must be 1 - num_workers: 8 \ No newline at end of file + num_workers: 8 diff --git a/configs/det/det_r50_vd_east.yml b/configs/det/det_r50_vd_east.yml index 0253c5bd9940fa6c0ec7da2c6639c1bc060842ca..e84a5fa7a7af34bde5e0abc6fed2e01f6ce42e6b 100644 --- a/configs/det/det_r50_vd_east.yml +++ b/configs/det/det_r50_vd_east.yml @@ -8,7 +8,7 @@ Global: # evaluation is run every 5000 iterations after the 4000th iteration eval_batch_step: [4000, 5000] cal_metric_during_train: False - pretrained_model: ./pretrain_models/ResNet50_vd_pretrained/ + pretrained_model: ./pretrain_models/ResNet50_vd_pretrained checkpoints: save_inference_dir: use_visualdl: False diff --git a/configs/det/det_r50_vd_pse.yml b/configs/det/det_r50_vd_pse.yml new file mode 100644 index 0000000000000000000000000000000000000000..4629210747d3b61344cc47b11dcff01e6b738586 --- /dev/null +++ b/configs/det/det_r50_vd_pse.yml @@ -0,0 +1,134 @@ +Global: + use_gpu: true + epoch_num: 600 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/det_r50_vd_pse/ + save_epoch_step: 600 + # evaluation is run every 125 iterations + eval_batch_step: [ 0,125 ] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/ResNet50_vd_ssld_pretrained + checkpoints: #./output/det_r50_vd_pse_batch8_ColorJitter/best_accuracy + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_pse/predicts_pse.txt + +Architecture: + model_type: det + algorithm: PSE + Transform: + Backbone: + name: ResNet + layers: 50 + Neck: + name: FPN + out_channels: 256 + Head: + name: PSEHead + hidden_dim: 256 + out_channels: 7 + +Loss: + name: PSELoss + alpha: 0.7 + ohem_ratio: 3 + kernel_sample_mask: pred + reduction: none + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Step + learning_rate: 0.0001 + step_size: 200 + gamma: 0.1 + regularizer: + name: 'L2' + factor: 0.0005 + +PostProcess: + name: PSEPostProcess + thresh: 0 + box_thresh: 0.85 + min_area: 16 + box_type: box # 'box' or 'poly' + scale: 1 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [ 1.0 ] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - ColorJitter: + brightness: 0.12549019607843137 + saturation: 0.5 + - IaaAugment: + augmenter_args: + - { 'type': Resize, 'args': { 'size': [ 0.5, 3 ] } } + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [ -10, 10 ] } } + - MakePseGt: + kernel_num: 7 + min_shrink_ratio: 0.4 + size: 640 + - RandomCropImgMask: + size: [ 640,640 ] 
+ main_key: gt_text + crop_keys: [ 'image', 'gt_text', 'gt_kernels', 'mask' ] + - NormalizeImage: + scale: 1./255. + mean: [ 0.485, 0.456, 0.406 ] + std: [ 0.229, 0.224, 0.225 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'gt_text', 'gt_kernels', 'mask' ] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 8 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + ratio_list: [ 1.0 ] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + limit_side_len: 736 + limit_type: min + - NormalizeImage: + scale: 1./255. + mean: [ 0.485, 0.456, 0.406 ] + std: [ 0.229, 0.224, 0.225 ] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: [ 'image', 'shape', 'polys', 'ignore_tags' ] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 8 \ No newline at end of file diff --git a/configs/det/det_res18_db_v2.0.yml b/configs/det/det_res18_db_v2.0.yml new file mode 100644 index 0000000000000000000000000000000000000000..7b07ef99648956a70b5a71f1e61f09b592226f90 --- /dev/null +++ b/configs/det/det_res18_db_v2.0.yml @@ -0,0 +1,131 @@ +Global: + use_gpu: true + epoch_num: 1200 + log_smooth_window: 20 + print_batch_step: 2 + save_model_dir: ./output/ch_db_res18/ + save_epoch_step: 1200 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [3000, 2000] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/ResNet18_vd_pretrained + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_db/predicts_db.txt + +Architecture: + model_type: det + algorithm: DB + Transform: + Backbone: + name: ResNet + layers: 18 + disable_se: True + Neck: + name: DBFPN + out_channels: 256 + Head: + name: DBHead + k: 50 + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - IaaAugment: + augmenter_args: + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [-10, 10] } } + - { 'type': Resize, 'args': { 'size': [0.5, 3] } } + - EastRandomCropData: + size: [960, 960] + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 + num_workers: 4 + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: +# image_shape: [736, 1280] + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 2 diff --git a/configs/e2e/e2e_r50_vd_pg.yml b/configs/e2e/e2e_r50_vd_pg.yml index 4a6e19f4461c7236f3a9a5253437eff97fa72f67..c4c5226e796a42db723ce78ef65473e357c25dc6 100644 --- a/configs/e2e/e2e_r50_vd_pg.yml +++ b/configs/e2e/e2e_r50_vd_pg.yml @@ -94,7 +94,7 @@ Eval: label_file_list: [./train_data/total_text/test/test.txt] transforms: - DecodeImage: # load image - img_mode: RGB + img_mode: BGR channel_first: False - E2ELabelEncodeTest: - E2EResizeForTest: @@ -111,4 +111,4 @@ Eval: shuffle: False drop_last: False batch_size_per_card: 1 # must be 1 - num_workers: 2 \ No newline at end of file + num_workers: 2 diff --git a/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml b/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml new file mode 100644 index 0000000000000000000000000000000000000000..e2aa50106ff60aa61858a22ba6fdd03b8cd04d85 --- /dev/null +++ b/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml @@ -0,0 +1,110 @@ +Global: + debug: false + use_gpu: true + epoch_num: 800 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_mobile_pp-OCRv2 + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_mobile_pp-OCRv2.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Piecewise + decay_epochs : [700, 800] + values : [0.001, 0.0001] + warmup_epoch: 5 + regularizer: + name: L2 + factor: 2.0e-05 + + +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 64 + Head: + name: CTCHead + mid_channels: 96 + fc_decay: 0.00002 + +Loss: + name: CTCLoss + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: + - CTCLabelEncode: + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - 
DecodeImage: + img_mode: BGR + channel_first: false + - CTCLabelEncode: + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 8 diff --git a/configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml b/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml similarity index 95% rename from configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml rename to configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml index 27ba4fd70b9a7ee7d4d905b3948f6cbf2b7e9469..ab48b99791d00785d143cd933ccc31b3f69d0f8f 100644 --- a/configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml +++ b/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml @@ -4,7 +4,7 @@ Global: epoch_num: 800 log_smooth_window: 20 print_batch_step: 10 - save_model_dir: ./output/rec_chinese_lite_distillation_v2.1 + save_model_dir: ./output/rec_pp-OCRv2_distillation save_epoch_step: 3 eval_batch_step: [0, 2000] cal_metric_during_train: true @@ -14,12 +14,11 @@ Global: use_visualdl: false infer_img: doc/imgs_words/ch/word_1.jpg character_dict_path: ppocr/utils/ppocr_keys_v1.txt - character_type: ch max_text_length: 25 infer_mode: false use_space_char: true distributed: true - save_res_path: ./output/rec/predicts_chinese_lite_distillation_v2.1.txt + save_res_path: ./output/rec/predicts_pp-OCRv2_distillation.txt Optimizer: @@ -88,6 +87,7 @@ Loss: - DistillationDMLLoss: weight: 1.0 act: "softmax" + use_log: true model_name_pairs: - ["Student", "Teacher"] key: head_out diff --git a/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_enhanced_ctc_loss.yml b/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_enhanced_ctc_loss.yml new file mode 100644 index 0000000000000000000000000000000000000000..7161203035b2324c7afc56b2b0c743428558a098 --- /dev/null +++ b/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_enhanced_ctc_loss.yml @@ -0,0 +1,125 @@ +Global: + debug: false + use_gpu: true + epoch_num: 800 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_mobile_pp-OCRv2_enhanced_ctc_loss + save_epoch_step: 3 + eval_batch_step: [0, 2000] + cal_metric_during_train: true + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_mobile_pp-OCRv2_enhanced_ctc_loss.txt + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Piecewise + decay_epochs : [700, 800] + values : [0.001, 0.0001] + warmup_epoch: 5 + regularizer: + name: L2 + factor: 2.0e-05 + + +Architecture: + model_type: rec + algorithm: CRNN + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 64 + Head: + name: CTCHead + mid_channels: 96 + fc_decay: 0.00002 + return_feats: true + +Loss: + name: CombinedLoss + loss_config_list: + - CTCLoss: + use_focal_loss: false + weight: 1.0 + - CenterLoss: + weight: 0.05 + num_classes: 6625 + feat_dim: 96 + init_center: false + center_file_path: "./train_center.pkl" + # you can also try to add ace loss on your own dataset + # - ACELoss: + # weight: 0.1 + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - 
./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: + - CTCLabelEncode: + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: + - image + - label + - length + - label_ace + loader: + shuffle: true + batch_size_per_card: 128 + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - CTCLabelEncode: + - RecResizeImg: + image_shape: [3, 32, 320] + - KeepKeys: + keep_keys: + - image + - label + - length + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 8 diff --git a/configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml b/configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml index 717c16814bac2f6fca78aa63566df12bd8cbf67b..c76063d5cedc31985404ddfff5147e1e0c100d20 100644 --- a/configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml +++ b/configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml @@ -15,7 +15,6 @@ Global: infer_img: doc/imgs_words/ch/word_1.jpg # for data or label process character_dict_path: ppocr/utils/ppocr_keys_v1.txt - character_type: ch max_text_length: 25 infer_mode: False use_space_char: True diff --git a/configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml b/configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml index 660465f301047110db7001db7a32e687f2917b61..563ce110b865adabf320616227bdf8d2eb465c11 100644 --- a/configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml +++ b/configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml @@ -15,7 +15,6 @@ Global: infer_img: doc/imgs_words/ch/word_1.jpg # for data or label process character_dict_path: ppocr/utils/ppocr_keys_v1.txt - character_type: ch max_text_length: 25 infer_mode: False use_space_char: True diff --git a/configs/rec/multi_language/rec_arabic_lite_train.yml b/configs/rec/multi_language/rec_arabic_lite_train.yml index 6dcfd1b69988b09c7dfc05cdbacce9756ea1f7cb..a746260e0001e34b1f50fb066885091b3686cb4d 100644 --- a/configs/rec/multi_language/rec_arabic_lite_train.yml +++ b/configs/rec/multi_language/rec_arabic_lite_train.yml @@ -15,7 +15,6 @@ Global: use_visualdl: false infer_img: null character_dict_path: ppocr/utils/dict/arabic_dict.txt - character_type: arabic max_text_length: 25 infer_mode: false use_space_char: true diff --git a/configs/rec/multi_language/rec_cyrillic_lite_train.yml b/configs/rec/multi_language/rec_cyrillic_lite_train.yml index 52527c1dfb9a306429bbab9241c623581d546e45..98544f627111340b61abd210ea5b4d7979511a15 100644 --- a/configs/rec/multi_language/rec_cyrillic_lite_train.yml +++ b/configs/rec/multi_language/rec_cyrillic_lite_train.yml @@ -15,7 +15,6 @@ Global: use_visualdl: false infer_img: null character_dict_path: ppocr/utils/dict/cyrillic_dict.txt - character_type: cyrillic max_text_length: 25 infer_mode: false use_space_char: true diff --git a/configs/rec/multi_language/rec_devanagari_lite_train.yml b/configs/rec/multi_language/rec_devanagari_lite_train.yml index e1a7c829c3e6d3c3a57f1d501cdd80a560703ec7..518b9f19ccaccb6405f7e9cb4d783b441e8c7ae7 100644 --- a/configs/rec/multi_language/rec_devanagari_lite_train.yml +++ b/configs/rec/multi_language/rec_devanagari_lite_train.yml @@ -15,7 +15,6 @@ Global: use_visualdl: false infer_img: null character_dict_path: ppocr/utils/dict/devanagari_dict.txt - character_type: devanagari max_text_length: 25 infer_mode: false use_space_char: true diff --git 
a/configs/rec/multi_language/rec_en_number_lite_train.yml b/configs/rec/multi_language/rec_en_number_lite_train.yml index fff4dfcd905b406964bb07cf14017af22f40e91e..ff1fb8698163d00fae57e682059da47d2007505d 100644 --- a/configs/rec/multi_language/rec_en_number_lite_train.yml +++ b/configs/rec/multi_language/rec_en_number_lite_train.yml @@ -16,7 +16,6 @@ Global: infer_img: # for data or label process character_dict_path: ppocr/utils/en_dict.txt - character_type: EN max_text_length: 25 infer_mode: False use_space_char: True diff --git a/configs/rec/multi_language/rec_french_lite_train.yml b/configs/rec/multi_language/rec_french_lite_train.yml index 63378d38a0d31fc77c33173e0ed864f28c5c3a8b..217369d30bc3ac6e09c2a580facbd0395e0ce727 100644 --- a/configs/rec/multi_language/rec_french_lite_train.yml +++ b/configs/rec/multi_language/rec_french_lite_train.yml @@ -16,7 +16,6 @@ Global: infer_img: # for data or label process character_dict_path: ppocr/utils/dict/french_dict.txt - character_type: french max_text_length: 25 infer_mode: False use_space_char: False diff --git a/configs/rec/multi_language/rec_german_lite_train.yml b/configs/rec/multi_language/rec_german_lite_train.yml index 1651510c5e4597e82298135d2f6c64aa747cf961..67520f5fb668327fdbd0cddb68cb6a3d6d3d112e 100644 --- a/configs/rec/multi_language/rec_german_lite_train.yml +++ b/configs/rec/multi_language/rec_german_lite_train.yml @@ -16,7 +16,6 @@ Global: infer_img: # for data or label process character_dict_path: ppocr/utils/dict/german_dict.txt - character_type: german max_text_length: 25 infer_mode: False use_space_char: False diff --git a/configs/rec/multi_language/rec_japan_lite_train.yml b/configs/rec/multi_language/rec_japan_lite_train.yml index bb47584edbc70f68d8d2d89dced3ec9b12f0e1cb..448aff1ebd0b418191c622cee97346931a86929b 100644 --- a/configs/rec/multi_language/rec_japan_lite_train.yml +++ b/configs/rec/multi_language/rec_japan_lite_train.yml @@ -16,7 +16,6 @@ Global: infer_img: # for data or label process character_dict_path: ppocr/utils/dict/japan_dict.txt - character_type: japan max_text_length: 25 infer_mode: False use_space_char: False diff --git a/configs/rec/multi_language/rec_korean_lite_train.yml b/configs/rec/multi_language/rec_korean_lite_train.yml index 77f15524f78cd7f1c3dcf4988960e718422f5d89..8118119da8f15102ad4c8485b7e26b9436d65cda 100644 --- a/configs/rec/multi_language/rec_korean_lite_train.yml +++ b/configs/rec/multi_language/rec_korean_lite_train.yml @@ -16,7 +16,6 @@ Global: infer_img: # for data or label process character_dict_path: ppocr/utils/dict/korean_dict.txt - character_type: korean max_text_length: 25 infer_mode: False use_space_char: False diff --git a/configs/rec/multi_language/rec_latin_lite_train.yml b/configs/rec/multi_language/rec_latin_lite_train.yml index e71112b4b4f0afd3ceab9f10078bc5d518ee9e59..04fe6d1a49ea06341b2218123d2319a5962b934b 100644 --- a/configs/rec/multi_language/rec_latin_lite_train.yml +++ b/configs/rec/multi_language/rec_latin_lite_train.yml @@ -15,7 +15,6 @@ Global: use_visualdl: false infer_img: null character_dict_path: ppocr/utils/dict/latin_dict.txt - character_type: latin max_text_length: 25 infer_mode: false use_space_char: true diff --git a/configs/rec/rec_icdar15_train.yml b/configs/rec/rec_icdar15_train.yml index 500d2333f217008b2abf352b0ccd29a43ec24fd5..893f7382f8b82f3c2d5f10cdf10735645fd3a5ee 100644 --- a/configs/rec/rec_icdar15_train.yml +++ b/configs/rec/rec_icdar15_train.yml @@ -14,8 +14,7 @@ Global: use_visualdl: False infer_img: doc/imgs_words_en/word_10.png # 
for data or label process - character_dict_path: ppocr/utils/ic15_dict.txt - character_type: ch + character_dict_path: ppocr/utils/en_dict.txt max_text_length: 25 infer_mode: False use_space_char: False diff --git a/configs/rec/rec_mtb_nrtr.yml b/configs/rec/rec_mtb_nrtr.yml new file mode 100644 index 0000000000000000000000000000000000000000..04267500854310dc6d5df9318bb8c056c65cd5b5 --- /dev/null +++ b/configs/rec/rec_mtb_nrtr.yml @@ -0,0 +1,101 @@ +Global: + use_gpu: True + epoch_num: 21 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/nrtr/ + save_epoch_step: 1 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: ppocr/utils/EN_symbol_dict.txt + max_text_length: 25 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_nrtr.txt + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.99 + clip_norm: 5.0 + lr: + name: Cosine + learning_rate: 0.0005 + warmup_epoch: 2 + regularizer: + name: 'L2' + factor: 0. + +Architecture: + model_type: rec + algorithm: NRTR + in_channels: 1 + Transform: + Backbone: + name: MTB + cnn_num: 2 + Head: + name: Transformer + d_model: 512 + num_encoder_layers: 6 + beam_size: -1 # When Beam size is greater than 0, it means to use beam search when evaluation. + + +Loss: + name: NRTRLoss + smoothing: True + +PostProcess: + name: NRTRLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - NRTRLabelEncode: # Class handling label + - NRTRRecResizeImg: + image_shape: [100, 32] + resize_type: PIL # PIL or OpenCV + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 512 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/evaluation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - NRTRLabelEncode: # Class handling label + - NRTRRecResizeImg: + image_shape: [100, 32] + resize_type: PIL # PIL or OpenCV + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 1 + use_shared_memory: False diff --git a/configs/rec/rec_mv3_none_bilstm_ctc.yml b/configs/rec/rec_mv3_none_bilstm_ctc.yml index 9e0bd23edba053b44fc7241c0a587ced5cd1ac76..9a950923b0cd4292f3f4d70ae51abc60c59dc615 100644 --- a/configs/rec/rec_mv3_none_bilstm_ctc.yml +++ b/configs/rec/rec_mv3_none_bilstm_ctc.yml @@ -14,8 +14,7 @@ Global: use_visualdl: False infer_img: doc/imgs_words_en/word_10.png # for data or label process - character_dict_path: - character_type: en + character_dict_path: max_text_length: 25 infer_mode: False use_space_char: False diff --git a/configs/rec/rec_mv3_none_none_ctc.yml b/configs/rec/rec_mv3_none_none_ctc.yml index 904afe1134b565d6459cdcda4cbfa43ae4925b92..28f0252adb4b74f88f8c6203521adb66c851e6b0 100644 --- a/configs/rec/rec_mv3_none_none_ctc.yml +++ b/configs/rec/rec_mv3_none_none_ctc.yml @@ -15,7 +15,6 @@ Global: infer_img: doc/imgs_words_en/word_10.png # for data or label process character_dict_path: - 
character_type: en max_text_length: 25 infer_mode: False use_space_char: False diff --git a/configs/rec/rec_mv3_tps_bilstm_att.yml b/configs/rec/rec_mv3_tps_bilstm_att.yml index feaeb0545c687774938521e4c45c026207172f11..6c347e765fe04ca3e5330de6cabb9998855436c9 100644 --- a/configs/rec/rec_mv3_tps_bilstm_att.yml +++ b/configs/rec/rec_mv3_tps_bilstm_att.yml @@ -14,8 +14,7 @@ Global: use_visualdl: False infer_img: doc/imgs_words/ch/word_1.jpg # for data or label process - character_dict_path: - character_type: en + character_dict_path: max_text_length: 25 infer_mode: False use_space_char: False diff --git a/configs/rec/rec_mv3_tps_bilstm_ctc.yml b/configs/rec/rec_mv3_tps_bilstm_ctc.yml index 65ab23c42aff54ee548867e3482d7400603551ad..9d1ebbe4e2ce25d746ff9d6993bf820347a3558a 100644 --- a/configs/rec/rec_mv3_tps_bilstm_ctc.yml +++ b/configs/rec/rec_mv3_tps_bilstm_ctc.yml @@ -15,7 +15,6 @@ Global: infer_img: doc/imgs_words_en/word_10.png # for data or label process character_dict_path: - character_type: en max_text_length: 25 infer_mode: False use_space_char: False diff --git a/configs/rec/rec_r31_sar.yml b/configs/rec/rec_r31_sar.yml new file mode 100644 index 0000000000000000000000000000000000000000..65e7877b28da80e0730f551b07d60b8a8c0ac48e --- /dev/null +++ b/configs/rec/rec_r31_sar.yml @@ -0,0 +1,98 @@ +Global: + use_gpu: true + epoch_num: 5 + log_smooth_window: 20 + print_batch_step: 20 + save_model_dir: ./sar_rec + save_epoch_step: 1 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: + # for data or label process + character_dict_path: ppocr/utils/dict90.txt + max_text_length: 30 + infer_mode: False + use_space_char: False + rm_symbol: True + save_res_path: ./output/rec/predicts_sar.txt + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Piecewise + decay_epochs: [3, 4] + values: [0.001, 0.0001, 0.00001] + regularizer: + name: 'L2' + factor: 0 + +Architecture: + model_type: rec + algorithm: SAR + Transform: + Backbone: + name: ResNet31 + Head: + name: SARHead + +Loss: + name: SARLoss + +PostProcess: + name: SARLabelDecode + +Metric: + name: RecMetric + + +Train: + dataset: + name: SimpleDataSet + label_file_list: ['./train_data/train_list.txt'] + data_dir: ./train_data/ + ratio_list: 1.0 + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SARLabelEncode: # Class handling label + - SARRecResizeImg: + image_shape: [3, 48, 48, 160] # h:48 w:[48,160] + width_downsample_ratio: 0.25 + - KeepKeys: + keep_keys: ['image', 'label', 'valid_ratio'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 64 + drop_last: True + num_workers: 8 + use_shared_memory: False + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/evaluation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SARLabelEncode: # Class handling label + - SARRecResizeImg: + image_shape: [3, 48, 48, 160] + width_downsample_ratio: 0.25 + - KeepKeys: + keep_keys: ['image', 'label', 'valid_ratio'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 64 + num_workers: 4 + use_shared_memory: False + diff --git a/configs/rec/rec_r34_vd_none_bilstm_ctc.yml b/configs/rec/rec_r34_vd_none_bilstm_ctc.yml index 331bb36ed84b83dc62a0f9b15524457238dedc13..9fdb5e99acec4ab5b2c3ff4b29158a41c766844b 
100644 --- a/configs/rec/rec_r34_vd_none_bilstm_ctc.yml +++ b/configs/rec/rec_r34_vd_none_bilstm_ctc.yml @@ -14,8 +14,7 @@ Global: use_visualdl: False infer_img: doc/imgs_words_en/word_10.png # for data or label process - character_dict_path: - character_type: en + character_dict_path: max_text_length: 25 infer_mode: False use_space_char: False diff --git a/configs/rec/rec_r34_vd_none_none_ctc.yml b/configs/rec/rec_r34_vd_none_none_ctc.yml index 695a46958f669e4cb9508646080b45ac0767b8c9..0af2b2ff21938ce9b1750bd0fd8e27dabfd39998 100644 --- a/configs/rec/rec_r34_vd_none_none_ctc.yml +++ b/configs/rec/rec_r34_vd_none_none_ctc.yml @@ -15,7 +15,6 @@ Global: infer_img: doc/imgs_words_en/word_10.png # for data or label process character_dict_path: - character_type: en max_text_length: 25 infer_mode: False use_space_char: False diff --git a/configs/rec/rec_r34_vd_tps_bilstm_att.yml b/configs/rec/rec_r34_vd_tps_bilstm_att.yml index fdd3588c844ffd7ed61de73077ae2994f0ad498d..8919aae75720d1e2f786957dd44e2d5d6dcbb5af 100644 --- a/configs/rec/rec_r34_vd_tps_bilstm_att.yml +++ b/configs/rec/rec_r34_vd_tps_bilstm_att.yml @@ -14,8 +14,7 @@ Global: use_visualdl: False infer_img: doc/imgs_words/ch/word_1.jpg # for data or label process - character_dict_path: - character_type: en + character_dict_path: max_text_length: 25 infer_mode: False use_space_char: False diff --git a/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml b/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml index 67108a6eaca2dd6f239261f5184341e5ade00dc0..c21fe61fbe62bab940bdb5ec1fef7833f402cb6c 100644 --- a/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml +++ b/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml @@ -14,8 +14,7 @@ Global: use_visualdl: False infer_img: doc/imgs_words_en/word_10.png # for data or label process - character_dict_path: - character_type: en + character_dict_path: max_text_length: 25 infer_mode: False use_space_char: False diff --git a/configs/rec/rec_r50_fpn_srn.yml b/configs/rec/rec_r50_fpn_srn.yml index fa7b1ae4e5fed41d3aa3670d6672cca01b63c359..b685362dedbcd6022fa247fe1499017647fa1546 100644 --- a/configs/rec/rec_r50_fpn_srn.yml +++ b/configs/rec/rec_r50_fpn_srn.yml @@ -14,8 +14,7 @@ Global: use_visualdl: False infer_img: doc/imgs_words/ch/word_1.jpg # for data or label process - character_dict_path: - character_type: en + character_dict_path: max_text_length: 25 num_heads: 8 infer_mode: False diff --git a/configs/rec/rec_resnet_stn_bilstm_att.yml b/configs/rec/rec_resnet_stn_bilstm_att.yml new file mode 100644 index 0000000000000000000000000000000000000000..0f599258d46e2ce89a6b7deccf8287a2ec0f7e4e --- /dev/null +++ b/configs/rec/rec_resnet_stn_bilstm_att.yml @@ -0,0 +1,108 @@ +Global: + use_gpu: True + epoch_num: 400 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/seed + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words_en/word_10.png + # for data or label process + character_dict_path: ppocr/utils/EN_symbol_dict.txt + max_text_length: 100 + infer_mode: False + use_space_char: False + save_res_path: ./output/rec/predicts_seed.txt + + +Optimizer: + name: Adadelta + weight_deacy: 0.0 + momentum: 0.9 + lr: + name: Piecewise + decay_epochs: [4,5,8] + values: [1.0, 0.1, 0.01] + regularizer: + name: 'L2' + factor: 2.0e-05 + + +Architecture: + model_type: rec + algorithm: SEED + Transform: + name: STN_ON + 
tps_inputsize: [32, 64] + tps_outputsize: [32, 100] + num_control_points: 20 + tps_margins: [0.05,0.05] + stn_activation: none + Backbone: + name: ResNet_ASTER + Head: + name: AsterHead # AttentionHead + sDim: 512 + attDim: 512 + max_len_labels: 100 + +Loss: + name: AsterLoss + +PostProcess: + name: SEEDLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + is_filter: True + +Train: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/training/ + transforms: + - Fasttext: + path: "./cc.en.300.bin" + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SEEDLabelEncode: # Class handling label + - RecResizeImg: + character_type: en + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length', 'fast_label'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 6 + +Eval: + dataset: + name: LMDBDataSet + data_dir: ./train_data/data_lmdb_release/evaluation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - SEEDLabelEncode: # Class handling label + - RecResizeImg: + character_type: en + image_shape: [3, 64, 256] + padding: False + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: True + batch_size_per_card: 256 + num_workers: 4 diff --git a/deploy/cpp_infer/CMakeLists.txt b/deploy/cpp_infer/CMakeLists.txt index efb183c5b4ebb460832b7d353e8a019ee079d975..6d3ecb6ac2e9e6993814f077ca772d0d94f5d008 100644 --- a/deploy/cpp_infer/CMakeLists.txt +++ b/deploy/cpp_infer/CMakeLists.txt @@ -1,4 +1,5 @@ project(ppocr CXX C) +cmake_minimum_required(VERSION 3.14) option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." 
OFF) @@ -206,9 +207,12 @@ endif() set(DEPS ${DEPS} ${OpenCV_LIBS}) +include(FetchContent) +include(external-cmake/auto-log.cmake) +include_directories(${FETCHCONTENT_BASE_DIR}/extern_autolog-src) + AUX_SOURCE_DIRECTORY(./src SRCS) add_executable(${DEMO_NAME} ${SRCS}) - target_link_libraries(${DEMO_NAME} ${DEPS}) if (WIN32 AND WITH_MKL) diff --git a/deploy/cpp_infer/docs/vs2019_build_withgpu_config.png b/deploy/cpp_infer/docs/vs2019_build_withgpu_config.png new file mode 100644 index 0000000000000000000000000000000000000000..beff2884480790d97ef3577c77c0336fc04557ed Binary files /dev/null and b/deploy/cpp_infer/docs/vs2019_build_withgpu_config.png differ diff --git a/deploy/cpp_infer/docs/windows_vs2019_build.md b/deploy/cpp_infer/docs/windows_vs2019_build.md index e46f542a323dbe539b4a7f596e4587f7729a4420..24a1e55cd7e5728e9cd56da8a35a72892380d28b 100644 --- a/deploy/cpp_infer/docs/windows_vs2019_build.md +++ b/deploy/cpp_infer/docs/windows_vs2019_build.md @@ -5,20 +5,20 @@ PaddleOCR在Windows 平台下基于`Visual Studio 2019 Community` 进行了测 ## 前置条件 * Visual Studio 2019 -* CUDA 9.0 / CUDA 10.0,cudnn 7+ (仅在使用GPU版本的预测库时需要) +* CUDA 10.2,cudnn 7+ (仅在使用GPU版本的预测库时需要) * CMake 3.0+ 请确保系统已经安装好上述基本软件,我们使用的是`VS2019`的社区版。 **下面所有示例以工作目录为 `D:\projects`演示**。 -### Step1: 下载PaddlePaddle C++ 预测库 fluid_inference +### Step1: 下载PaddlePaddle C++ 预测库 paddle_inference PaddlePaddle C++ 预测库针对不同的`CPU`和`CUDA`版本提供了不同的预编译版本,请根据实际情况下载: [C++预测库下载列表](https://paddleinference.paddlepaddle.org.cn/user_guides/download_lib.html#windows) -解压后`D:\projects\fluid_inference`目录包含内容为: +解压后`D:\projects\paddle_inference`目录包含内容为: ``` -fluid_inference +paddle_inference ├── paddle # paddle核心库和头文件 | ├── third_party # 第三方依赖库和头文件 @@ -46,13 +46,13 @@ fluid_inference ![step2.2](https://paddleseg.bj.bcebos.com/inference/vs2019_step3.png) -3. 点击:`项目`->`cpp_inference_demo的CMake设置` +3. 点击:`项目`->`CMake设置` ![step3](https://paddleseg.bj.bcebos.com/inference/vs2019_step4.png) -4. 点击`浏览`,分别设置编译选项指定`CUDA`、`CUDNN_LIB`、`OpenCV`、`Paddle预测库`的路径 +4. 分别设置编译选项指定`CUDA`、`CUDNN_LIB`、`OpenCV`、`Paddle预测库`的路径 -三个编译参数的含义说明如下(带`*`表示仅在使用**GPU版本**预测库时指定, 其中CUDA库版本尽量对齐,**使用9.0、10.0版本,不使用9.2、10.1等版本CUDA库**): +三个编译参数的含义说明如下(带`*`表示仅在使用**GPU版本**预测库时指定, 其中CUDA库版本尽量对齐): | 参数名 | 含义 | | ---- | ---- | @@ -67,6 +67,11 @@ fluid_inference ![step4](https://paddleseg.bj.bcebos.com/inference/vs2019_step5.png) +下面给出with GPU的配置示例: +![step5](./vs2019_build_withgpu_config.png) +**注意:** + CMAKE_BACKWARDS的版本要根据平台安装cmake的版本进行设置。 + **设置完成后**, 点击上图中`保存并生成CMake缓存以加载变量`。 5. 
点击`生成`->`全部生成` @@ -74,24 +79,34 @@ fluid_inference ![step6](https://paddleseg.bj.bcebos.com/inference/vs2019_step6.png) -### Step4: 预测及可视化 +### Step4: 预测 -上述`Visual Studio 2019`编译产出的可执行文件在`out\build\x64-Release`目录下,打开`cmd`,并切换到该目录: +上述`Visual Studio 2019`编译产出的可执行文件在`out\build\x64-Release\Release`目录下,打开`cmd`,并切换到`D:\projects\PaddleOCR\deploy\cpp_infer\`: ``` -cd D:\projects\PaddleOCR\deploy\cpp_infer\out\build\x64-Release +cd D:\projects\PaddleOCR\deploy\cpp_infer ``` -可执行文件`ocr_system.exe`即为样例的预测程序,其主要使用方法如下 +可执行文件`ppocr.exe`即为样例的预测程序,其主要使用方法如下,更多使用方法可以参考[说明文档](../readme.md)`运行demo`部分。 ```shell -#预测图片 `D:\projects\PaddleOCR\doc\imgs\10.jpg` -.\ocr_system.exe D:\projects\PaddleOCR\deploy\cpp_infer\tools\config.txt D:\projects\PaddleOCR\doc\imgs\10.jpg +#识别中文图片 `D:\projects\PaddleOCR\doc\imgs_words\ch\` +.\out\build\x64-Release\Release\ppocr.exe rec --rec_model_dir=D:\projects\PaddleOCR\ch_ppocr_mobile_v2.0_rec_infer --image_dir=D:\projects\PaddleOCR\doc\imgs_words\ch\ + +#识别英文图片 `D:\projects\PaddleOCR\doc\imgs_words\en\` +.\out\build\x64-Release\Release\ppocr.exe rec --rec_model_dir=D:\projects\PaddleOCR\inference\rec_mv3crnn --image_dir=D:\projects\PaddleOCR\doc\imgs_words\en\ --char_list_file=D:\projects\PaddleOCR\ppocr\utils\dict\en_dict.txt ``` -第一个参数为配置文件路径,第二个参数为需要预测的图片路径。 + +其中,`rec_model_dir`用于指定文本识别inference模型的路径,`image_dir`为需要预测的图片路径或目录,`char_list_file`用于指定文本识别所用的字典文件。 -### 注意 +### FAQ * 在Windows下的终端中执行文件exe时,可能会发生乱码的现象,此时需要在终端中输入`CHCP 65001`,将终端的编码方式由GBK编码(默认)改为UTF-8编码,更加具体的解释可以参考这篇博客:[https://blog.csdn.net/qq_35038153/article/details/78430359](https://blog.csdn.net/qq_35038153/article/details/78430359)。 -* 编译时,如果报错`错误:C1083 无法打开包括文件:"dirent.h":No such file or directory`,可以参考该[文档](https://blog.csdn.net/Dora_blank/article/details/117740837#41_C1083_direnthNo_such_file_or_directory_54),新建`dirent.h`文件,并添加到`VC++`的包含目录中。 +* 编译时,如果报错`错误:C1083 无法打开包括文件:"dirent.h":No such file or directory`,可以参考该[文档](https://blog.csdn.net/Dora_blank/article/details/117740837#41_C1083_direnthNo_such_file_or_directory_54),新建`dirent.h`文件,并添加到`utility.cpp`的头文件引用中。同时修改`utility.cpp`70行:`lstat`改成`stat`。 + +* 编译时,如果报错`Autolog未定义`,新建`autolog.h`文件,内容为:[autolog.h](https://github.com/LDOUBLEV/AutoLog/blob/main/auto_log/autolog.h),并添加到`main.cpp`的头文件引用中,再次编译。 + +* 运行时,如果弹窗报错找不到`paddle_inference.dll`或者`openblas.dll`,在`D:\projects\paddle_inference`预测库内找到这两个文件,复制到`D:\projects\PaddleOCR\deploy\cpp_infer\out\build\x64-Release\Release`目录下。不用重新编译,再次运行即可。 + +* 运行时,弹窗报错提示`应用程序无法正常启动(0xc0000142)`,并且`cmd`窗口内提示`You are using Paddle compiled with TensorRT, but TensorRT dynamic library is not found.`,把TensorRT目录下的lib里面的所有dll文件复制到release目录下,再次运行即可。 diff --git a/deploy/cpp_infer/external-cmake/auto-log.cmake b/deploy/cpp_infer/external-cmake/auto-log.cmake new file mode 100644 index 0000000000000000000000000000000000000000..becbff0f45df51e5db541889ae1ffdacf2c4fc78 --- /dev/null +++ b/deploy/cpp_infer/external-cmake/auto-log.cmake @@ -0,0 +1,13 @@ +find_package(Git REQUIRED) +include(FetchContent) + +set(FETCHCONTENT_BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}/third-party") + +FetchContent_Declare( + extern_Autolog + PREFIX autolog + GIT_REPOSITORY https://github.com/LDOUBLEV/AutoLog.git + GIT_TAG main +) +FetchContent_MakeAvailable(extern_Autolog) + diff --git a/deploy/cpp_infer/readme.md b/deploy/cpp_infer/readme.md index 9bdd54669faec874e3cdad59f604882ab0bce010..f88d021d0a050aeecf859981cc2de1cee8f3a2c0 100644 --- a/deploy/cpp_infer/readme.md +++ b/deploy/cpp_infer/readme.md @@ -4,15 +4,32 @@ C++在性能计算上优于python,因此,在大多数CPU、GPU部署场景,多采用C++的部署方式,本节将介绍如何在Linux\Windows (CPU\GPU)环境下配置C++环境并完成
PaddleOCR模型部署。 +* [1. 准备环境](#1) + + [1.0 运行准备](#10) + + [1.1 编译opencv库](#11) + + [1.2 下载或者编译Paddle预测库](#12) + - [1.2.1 直接下载安装](#121) + - [1.2.2 预测库源码编译](#122) +* [2 开始运行](#2) + + [2.1 将模型导出为inference model](#21) + + [2.2 编译PaddleOCR C++预测demo](#22) + + [2.3 运行demo](#23) + + ## 1. 准备环境 -### 运行准备 + + +### 1.0 运行准备 + - Linux环境,推荐使用docker。 - Windows环境,目前支持基于`Visual Studio 2019 Community`进行编译。 * 该文档主要介绍基于Linux环境的PaddleOCR C++预测流程,如果需要在Windows下基于预测库进行C++预测,具体编译方法请参考[Windows下编译教程](./docs/windows_vs2019_build.md) + + ### 1.1 编译opencv库 * 首先需要从opencv官网上下载在Linux环境下源码编译的包,以opencv3.4.7为例,下载命令如下。 @@ -71,6 +88,8 @@ opencv3/ |-- share ``` + + ### 1.2 下载或者编译Paddle预测库 * 有2种方式获取Paddle预测库,下面进行详细介绍。 @@ -132,9 +151,12 @@ build/paddle_inference_install_dir/ 其中`paddle`就是C++预测所需的Paddle库,`version.txt`中包含当前预测库的版本信息。 + ## 2 开始运行 + + ### 2.1 将模型导出为inference model * 可以参考[模型预测章节](../../doc/doc_ch/inference.md),导出inference model,用于模型预测。模型导出之后,假设放在`inference`目录下,则目录结构如下。 @@ -149,6 +171,7 @@ inference/ | |--inference.pdmodel ``` + ### 2.2 编译PaddleOCR C++预测demo @@ -172,13 +195,14 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir * 编译完成之后,会在`build`文件夹下生成一个名为`ppocr`的可执行文件。 + -### 运行demo +### 2.3 运行demo 运行方式: ```shell ./build/ppocr [--param1] [--param2] [...] -``` +``` 其中,`mode`为必选参数,表示选择的功能,取值范围['det', 'rec', 'system'],分别表示调用检测、识别、检测识别串联(包括方向分类器)。具体命令如下: ##### 1. 只调用检测: @@ -258,6 +282,4 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir -### 2.3 注意 - -* 在使用Paddle预测库时,推荐使用2.0.0版本的预测库。 +**注意:在使用Paddle预测库时,推荐使用2.0.0版本的预测库。** diff --git a/deploy/cpp_infer/readme_en.md b/deploy/cpp_infer/readme_en.md index 039aecf1ba3d6c1c717bafbecdb117416a1acc32..48de51ae726e662f48d465b8489a494448dafac1 100644 --- a/deploy/cpp_infer/readme_en.md +++ b/deploy/cpp_infer/readme_en.md @@ -1,4 +1,4 @@ -# Server-side C++ inference +# Server-side C++ Inference This chapter introduces the C++ deployment method of the PaddleOCR model, and the corresponding python predictive deployment method refers to [document](../../doc/doc_ch/inference.md). C++ is better than python in terms of performance calculation. Therefore, in most CPU and GPU deployment scenarios, C++ deployment is mostly used. @@ -6,14 +6,14 @@ This section will introduce how to configure the C++ environment and complete it PaddleOCR model deployment. -## 1. Prepare the environment +## 1. Prepare the Environment ### Environment - Linux, docker is recommended. -### 1.1 Compile opencv +### 1.1 Compile OpenCV * First of all, you need to download the source code compiled package in the Linux environment from the opencv official website. Taking opencv3.4.7 as an example, the download command is as follows. @@ -73,7 +73,7 @@ opencv3/ |-- share ``` -### 1.2 Compile or download or the Paddle inference library +### 1.2 Compile or Download the Paddle Inference Library * There are 2 ways to obtain the Paddle inference library, described in detail below. @@ -136,7 +136,7 @@ build/paddle_inference_install_dir/ Among them, `paddle` is the Paddle library required for C++ prediction later, and `version.txt` contains the version information of the current inference library. -## 2. Compile and run the demo +## 2. Compile and Run the Demo ### 2.1 Export the inference model @@ -183,7 +183,7 @@ or the generated Paddle inference library path (`build/paddle_inference_install_ Execute the built executable file: ```shell ./build/ppocr [--param1] [--param2] [...]
-``` +``` Here, `mode` is a required parameter,and the value range is ['det', 'rec', 'system'], representing using detection only, using recognition only and using the end-to-end system respectively. Specifically, ##### 1. run det demo: diff --git a/deploy/cpp_infer/src/main.cpp b/deploy/cpp_infer/src/main.cpp index 5e5c851517d5efaa75f54b7a156563a4a42880d5..82a248416f086dd2b90e891a23774c294ed50ae3 100644 --- a/deploy/cpp_infer/src/main.cpp +++ b/deploy/cpp_infer/src/main.cpp @@ -35,15 +35,16 @@ #include #include +#include "auto_log/autolog.h" DEFINE_bool(use_gpu, false, "Infering with GPU or CPU."); DEFINE_int32(gpu_id, 0, "Device id of GPU to execute."); DEFINE_int32(gpu_mem, 4000, "GPU id when infering with GPU."); -DEFINE_int32(cpu_math_library_num_threads, 10, "Num of threads with CPU."); -DEFINE_bool(use_mkldnn, false, "Whether use mkldnn with CPU."); +DEFINE_int32(cpu_threads, 10, "Num of threads with CPU."); +DEFINE_bool(enable_mkldnn, false, "Whether use mkldnn with CPU."); DEFINE_bool(use_tensorrt, false, "Whether use tensorrt."); DEFINE_string(precision, "fp32", "Precision be one of fp32/fp16/int8"); -DEFINE_bool(benchmark, true, "Whether use benchmark."); +DEFINE_bool(benchmark, false, "Whether use benchmark."); DEFINE_string(save_log_path, "./log_output/", "Save benchmark log path."); // detection related DEFINE_string(image_dir, "", "Dir of input image."); @@ -60,6 +61,7 @@ DEFINE_string(cls_model_dir, "", "Path of cls inference model."); DEFINE_double(cls_thresh, 0.9, "Threshold of cls_thresh."); // recognition related DEFINE_string(rec_model_dir, "", "Path of rec inference model."); +DEFINE_int32(rec_batch_num, 1, "rec_batch_num."); DEFINE_string(char_list_file, "../../ppocr/utils/ppocr_keys_v1.txt", "Path of dictionary."); @@ -68,34 +70,6 @@ using namespace cv; using namespace PaddleOCR; -void PrintBenchmarkLog(std::string model_name, - int batch_size, - std::string input_shape, - std::vector time_info, - int img_num){ - LOG(INFO) << "----------------------- Config info -----------------------"; - LOG(INFO) << "runtime_device: " << (FLAGS_use_gpu ? "gpu" : "cpu"); - LOG(INFO) << "ir_optim: " << "True"; - LOG(INFO) << "enable_memory_optim: " << "True"; - LOG(INFO) << "enable_tensorrt: " << FLAGS_use_tensorrt; - LOG(INFO) << "enable_mkldnn: " << (FLAGS_use_mkldnn ? 
"True" : "False"); - LOG(INFO) << "cpu_math_library_num_threads: " << FLAGS_cpu_math_library_num_threads; - LOG(INFO) << "----------------------- Data info -----------------------"; - LOG(INFO) << "batch_size: " << batch_size; - LOG(INFO) << "input_shape: " << input_shape; - LOG(INFO) << "data_num: " << img_num; - LOG(INFO) << "----------------------- Model info -----------------------"; - LOG(INFO) << "model_name: " << model_name; - LOG(INFO) << "precision: " << FLAGS_precision; - LOG(INFO) << "----------------------- Perf info ------------------------"; - LOG(INFO) << "Total time spent(ms): " - << std::accumulate(time_info.begin(), time_info.end(), 0); - LOG(INFO) << "preprocess_time(ms): " << time_info[0] / img_num - << ", inference_time(ms): " << time_info[1] / img_num - << ", postprocess_time(ms): " << time_info[2] / img_num; -} - - static bool PathExists(const std::string& path){ #ifdef _WIN32 struct _stat buffer; @@ -110,14 +84,14 @@ static bool PathExists(const std::string& path){ int main_det(std::vector cv_all_img_names) { std::vector time_info = {0, 0, 0}; DBDetector det(FLAGS_det_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, - FLAGS_gpu_mem, FLAGS_cpu_math_library_num_threads, - FLAGS_use_mkldnn, FLAGS_max_side_len, FLAGS_det_db_thresh, + FLAGS_gpu_mem, FLAGS_cpu_threads, + FLAGS_enable_mkldnn, FLAGS_max_side_len, FLAGS_det_db_thresh, FLAGS_det_db_box_thresh, FLAGS_det_db_unclip_ratio, FLAGS_use_polygon_score, FLAGS_visualize, FLAGS_use_tensorrt, FLAGS_precision); for (int i = 0; i < cv_all_img_names.size(); ++i) { - LOG(INFO) << "The predict img: " << cv_all_img_names[i]; +// LOG(INFO) << "The predict img: " << cv_all_img_names[i]; cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR); if (!srcimg.data) { @@ -132,10 +106,30 @@ int main_det(std::vector cv_all_img_names) { time_info[0] += det_times[0]; time_info[1] += det_times[1]; time_info[2] += det_times[2]; + + if (FLAGS_benchmark) { + cout << cv_all_img_names[i] << '\t'; + for (int n = 0; n < boxes.size(); n++) { + for (int m = 0; m < boxes[n].size(); m++) { + cout << boxes[n][m][0] << ' ' << boxes[n][m][1] << ' '; + } + } + cout << endl; + } } if (FLAGS_benchmark) { - PrintBenchmarkLog("det", 1, "dynamic", time_info, cv_all_img_names.size()); + AutoLogger autolog("ocr_det", + FLAGS_use_gpu, + FLAGS_use_tensorrt, + FLAGS_enable_mkldnn, + FLAGS_cpu_threads, + 1, + "dynamic", + FLAGS_precision, + time_info, + cv_all_img_names.size()); + autolog.report(); } return 0; } @@ -143,9 +137,15 @@ int main_det(std::vector cv_all_img_names) { int main_rec(std::vector cv_all_img_names) { std::vector time_info = {0, 0, 0}; + + std::string char_list_file = FLAGS_char_list_file; + if (FLAGS_benchmark) + char_list_file = FLAGS_char_list_file.substr(6); + cout << "label file: " << char_list_file << endl; + CRNNRecognizer rec(FLAGS_rec_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, - FLAGS_gpu_mem, FLAGS_cpu_math_library_num_threads, - FLAGS_use_mkldnn, FLAGS_char_list_file, + FLAGS_gpu_mem, FLAGS_cpu_threads, + FLAGS_enable_mkldnn, char_list_file, FLAGS_use_tensorrt, FLAGS_precision); for (int i = 0; i < cv_all_img_names.size(); ++i) { @@ -164,19 +164,31 @@ int main_rec(std::vector cv_all_img_names) { time_info[1] += rec_times[1]; time_info[2] += rec_times[2]; } - + if (FLAGS_benchmark) { - PrintBenchmarkLog("rec", 1, "dynamic", time_info, cv_all_img_names.size()); + AutoLogger autolog("ocr_rec", + FLAGS_use_gpu, + FLAGS_use_tensorrt, + FLAGS_enable_mkldnn, + FLAGS_cpu_threads, + 1, + "dynamic", + FLAGS_precision, + time_info, + 
cv_all_img_names.size()); + autolog.report(); } - return 0; } int main_system(std::vector cv_all_img_names) { + std::vector time_info_det = {0, 0, 0}; + std::vector time_info_rec = {0, 0, 0}; + DBDetector det(FLAGS_det_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, - FLAGS_gpu_mem, FLAGS_cpu_math_library_num_threads, - FLAGS_use_mkldnn, FLAGS_max_side_len, FLAGS_det_db_thresh, + FLAGS_gpu_mem, FLAGS_cpu_threads, + FLAGS_enable_mkldnn, FLAGS_max_side_len, FLAGS_det_db_thresh, FLAGS_det_db_box_thresh, FLAGS_det_db_unclip_ratio, FLAGS_use_polygon_score, FLAGS_visualize, FLAGS_use_tensorrt, FLAGS_precision); @@ -184,22 +196,25 @@ int main_system(std::vector cv_all_img_names) { Classifier *cls = nullptr; if (FLAGS_use_angle_cls) { cls = new Classifier(FLAGS_cls_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, - FLAGS_gpu_mem, FLAGS_cpu_math_library_num_threads, - FLAGS_use_mkldnn, FLAGS_cls_thresh, + FLAGS_gpu_mem, FLAGS_cpu_threads, + FLAGS_enable_mkldnn, FLAGS_cls_thresh, FLAGS_use_tensorrt, FLAGS_precision); } + std::string char_list_file = FLAGS_char_list_file; + if (FLAGS_benchmark) + char_list_file = FLAGS_char_list_file.substr(6); + cout << "label file: " << char_list_file << endl; + CRNNRecognizer rec(FLAGS_rec_model_dir, FLAGS_use_gpu, FLAGS_gpu_id, - FLAGS_gpu_mem, FLAGS_cpu_math_library_num_threads, - FLAGS_use_mkldnn, FLAGS_char_list_file, + FLAGS_gpu_mem, FLAGS_cpu_threads, + FLAGS_enable_mkldnn, char_list_file, FLAGS_use_tensorrt, FLAGS_precision); - auto start = std::chrono::system_clock::now(); - for (int i = 0; i < cv_all_img_names.size(); ++i) { LOG(INFO) << "The predict img: " << cv_all_img_names[i]; - cv::Mat srcimg = cv::imread(FLAGS_image_dir, cv::IMREAD_COLOR); + cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR); if (!srcimg.data) { std::cerr << "[ERROR] image read failed! 
image path: " << cv_all_img_names[i] << endl; exit(1); @@ -209,7 +224,10 @@ int main_system(std::vector cv_all_img_names) { std::vector rec_times; det.Run(srcimg, boxes, &det_times); - + time_info_det[0] += det_times[0]; + time_info_det[1] += det_times[1]; + time_info_det[2] += det_times[2]; + cv::Mat crop_img; for (int j = 0; j < boxes.size(); j++) { crop_img = Utility::GetRotateCropImage(srcimg, boxes[j]); @@ -218,18 +236,36 @@ int main_system(std::vector cv_all_img_names) { crop_img = cls->Run(crop_img); } rec.Run(crop_img, &rec_times); + time_info_rec[0] += rec_times[0]; + time_info_rec[1] += rec_times[1]; + time_info_rec[2] += rec_times[2]; } - - auto end = std::chrono::system_clock::now(); - auto duration = - std::chrono::duration_cast(end - start); - std::cout << "Cost " - << double(duration.count()) * - std::chrono::microseconds::period::num / - std::chrono::microseconds::period::den - << "s" << std::endl; } - + if (FLAGS_benchmark) { + AutoLogger autolog_det("ocr_det", + FLAGS_use_gpu, + FLAGS_use_tensorrt, + FLAGS_enable_mkldnn, + FLAGS_cpu_threads, + 1, + "dynamic", + FLAGS_precision, + time_info_det, + cv_all_img_names.size()); + AutoLogger autolog_rec("ocr_rec", + FLAGS_use_gpu, + FLAGS_use_tensorrt, + FLAGS_enable_mkldnn, + FLAGS_cpu_threads, + 1, + "dynamic", + FLAGS_precision, + time_info_rec, + cv_all_img_names.size()); + autolog_det.report(); + std::cout << endl; + autolog_rec.report(); + } return 0; } diff --git a/deploy/cpp_infer/src/ocr_rec.cpp b/deploy/cpp_infer/src/ocr_rec.cpp index b64dcea5ae2a68485296c02cdb7689c60ea504f8..3739a66ad802fd108df16bbbbe8c8695963b7693 100644 --- a/deploy/cpp_infer/src/ocr_rec.cpp +++ b/deploy/cpp_infer/src/ocr_rec.cpp @@ -112,12 +112,16 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) { 1 << 20, 10, 3, precision, false, false); + std::map> min_input_shape = { - {"x", {1, 3, 32, 10}}}; + {"x", {1, 3, 32, 10}}, + {"lstm_0.tmp_0", {10, 1, 96}}}; std::map> max_input_shape = { - {"x", {1, 3, 32, 2000}}}; + {"x", {1, 3, 32, 2000}}, + {"lstm_0.tmp_0", {1000, 1, 96}}}; std::map> opt_input_shape = { - {"x", {1, 3, 32, 320}}}; + {"x", {1, 3, 32, 320}}, + {"lstm_0.tmp_0", {25, 1, 96}}}; config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, opt_input_shape); @@ -139,7 +143,7 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) { config.SwitchIrOptim(true); config.EnableMemoryOptim(); - config.DisableGlogInfo(); +// config.DisableGlogInfo(); this->predictor_ = CreatePredictor(config); } diff --git a/deploy/hubserving/ocr_det/params.py b/deploy/hubserving/ocr_det/params.py index bc75cc404e43e0a6e9242c2684d615b4575e5d8f..2587a297662cb34d22dbdfe191439e61066cda78 100755 --- a/deploy/hubserving/ocr_det/params.py +++ b/deploy/hubserving/ocr_det/params.py @@ -13,7 +13,7 @@ def read_params(): #params for text detector cfg.det_algorithm = "DB" - cfg.det_model_dir = "./inference/ch_ppocr_mobile_v2.0_det_infer/" + cfg.det_model_dir = "./inference/ch_PP-OCRv2_det_infer/" cfg.det_limit_side_len = 960 cfg.det_limit_type = 'max' diff --git a/deploy/hubserving/ocr_rec/params.py b/deploy/hubserving/ocr_rec/params.py index f8d29114357946c9b6264079fca2eb4b19dbefba..5e11c3cfee0c9387fce7f465f15f9424b7b04e9d 100644 --- a/deploy/hubserving/ocr_rec/params.py +++ b/deploy/hubserving/ocr_rec/params.py @@ -13,7 +13,7 @@ def read_params(): #params for text recognizer cfg.rec_algorithm = "CRNN" - cfg.rec_model_dir = "./inference/ch_ppocr_mobile_v2.0_rec_infer/" + cfg.rec_model_dir = "./inference/ch_PP-OCRv2_rec_infer/" cfg.rec_image_shape = 
"3, 32, 320" cfg.rec_char_type = 'ch' diff --git a/deploy/hubserving/ocr_system/params.py b/deploy/hubserving/ocr_system/params.py index bee53bfd346e6d4d91738a2e06a0b4dab8e2b0de..4698e8ce5d8f8c826fe04a85906189e729104ddb 100755 --- a/deploy/hubserving/ocr_system/params.py +++ b/deploy/hubserving/ocr_system/params.py @@ -13,7 +13,7 @@ def read_params(): #params for text detector cfg.det_algorithm = "DB" - cfg.det_model_dir = "./inference/ch_ppocr_mobile_v2.0_det_infer/" + cfg.det_model_dir = "./inference/ch_PP-OCRv2_det_infer/" cfg.det_limit_side_len = 960 cfg.det_limit_type = 'max' @@ -31,7 +31,7 @@ def read_params(): #params for text recognizer cfg.rec_algorithm = "CRNN" - cfg.rec_model_dir = "./inference/ch_ppocr_mobile_v2.0_rec_infer/" + cfg.rec_model_dir = "./inference/ch_PP-OCRv2_rec_infer/" cfg.rec_image_shape = "3, 32, 320" cfg.rec_char_type = 'ch' diff --git a/deploy/hubserving/readme.md b/deploy/hubserving/readme.md index 11b843fec1052c3ad401ca0b7d1cb602401af8f8..b52e3584c36173e4c607dbbd9679605c98de8a67 100755 --- a/deploy/hubserving/readme.md +++ b/deploy/hubserving/readme.md @@ -34,10 +34,10 @@ pip3 install paddlehub==2.1.0 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/sim ``` ### 2. 下载推理模型 -安装服务模块前,需要准备推理模型并放到正确路径。默认使用的是v2.0版的超轻量模型,默认模型路径为: +安装服务模块前,需要准备推理模型并放到正确路径。默认使用的是PP-OCRv2模型,默认模型路径为: ``` -检测模型:./inference/ch_ppocr_mobile_v2.0_det_infer/ -识别模型:./inference/ch_ppocr_mobile_v2.0_rec_infer/ +检测模型:./inference/ch_PP-OCRv2_det_infer/ +识别模型:./inference/ch_PP-OCRv2_rec_infer/ 方向分类器:./inference/ch_ppocr_mobile_v2.0_cls_infer/ ``` diff --git a/deploy/hubserving/readme_en.md b/deploy/hubserving/readme_en.md index 539ad722cae78b8315b87d35f9af6ab81140c5b3..3bbcf98cd8b78407613e6bdfb5d5ab8b0a25a084 100755 --- a/deploy/hubserving/readme_en.md +++ b/deploy/hubserving/readme_en.md @@ -35,10 +35,10 @@ pip3 install paddlehub==2.1.0 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/sim ``` ### 2. Download inference model -Before installing the service module, you need to prepare the inference model and put it in the correct path. By default, the ultra lightweight model of v2.0 is used, and the default model path is: +Before installing the service module, you need to prepare the inference model and put it in the correct path. By default, the PP-OCRv2 models are used, and the default model path is: ``` -detection model: ./inference/ch_ppocr_mobile_v2.0_det_infer/ -recognition model: ./inference/ch_ppocr_mobile_v2.0_rec_infer/ +detection model: ./inference/ch_PP-OCRv2_det_infer/ +recognition model: ./inference/ch_PP-OCRv2_rec_infer/ text direction classifier: ./inference/ch_ppocr_mobile_v2.0_cls_infer/ ``` diff --git a/deploy/pdserving/web_service_det.py b/deploy/pdserving/web_service_det.py new file mode 100644 index 0000000000000000000000000000000000000000..25ac2f37dbd3cdf05b3503abaab0c5651867fae9 --- /dev/null +++ b/deploy/pdserving/web_service_det.py @@ -0,0 +1,77 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from paddle_serving_server.web_service import WebService, Op + +import logging +import numpy as np +import cv2 +import base64 +# from paddle_serving_app.reader import OCRReader +from ocr_reader import OCRReader, DetResizeForTest +from paddle_serving_app.reader import Sequential, ResizeByFactor +from paddle_serving_app.reader import Div, Normalize, Transpose +from paddle_serving_app.reader import DBPostProcess, FilterBoxes, GetRotateCropImage, SortedBoxes + +_LOGGER = logging.getLogger() + + +class DetOp(Op): + def init_op(self): + self.det_preprocess = Sequential([ + DetResizeForTest(), Div(255), + Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), Transpose( + (2, 0, 1)) + ]) + self.filter_func = FilterBoxes(10, 10) + self.post_func = DBPostProcess({ + "thresh": 0.3, + "box_thresh": 0.5, + "max_candidates": 1000, + "unclip_ratio": 1.5, + "min_size": 3 + }) + + def preprocess(self, input_dicts, data_id, log_id): + (_, input_dict), = input_dicts.items() + data = base64.b64decode(input_dict["image"].encode('utf8')) + self.raw_im = data + data = np.fromstring(data, np.uint8) + # Note: class variables(self.var) can only be used in process op mode + im = cv2.imdecode(data, cv2.IMREAD_COLOR) + self.ori_h, self.ori_w, _ = im.shape + det_img = self.det_preprocess(im) + _, self.new_h, self.new_w = det_img.shape + return {"x": det_img[np.newaxis, :].copy()}, False, None, "" + + def postprocess(self, input_dicts, fetch_dict, log_id): + det_out = fetch_dict["save_infer_model/scale_0.tmp_1"] + ratio_list = [ + float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w + ] + dt_boxes_list = self.post_func(det_out, [ratio_list]) + dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w]) + out_dict = {"dt_boxes": str(dt_boxes)} + + return out_dict, None, "" + + +class OcrService(WebService): + def get_pipeline_response(self, read_op): + det_op = DetOp(name="det", input_ops=[read_op]) + return det_op + + +uci_service = OcrService(name="ocr") +uci_service.prepare_pipeline_config("config.yml") +uci_service.run_service() diff --git a/deploy/pdserving/web_service_rec.py b/deploy/pdserving/web_service_rec.py new file mode 100644 index 0000000000000000000000000000000000000000..6b3cf707f0f19034a0734fd27824feb4fb6cce20 --- /dev/null +++ b/deploy/pdserving/web_service_rec.py @@ -0,0 +1,86 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from paddle_serving_server.web_service import WebService, Op + +import logging +import numpy as np +import cv2 +import base64 +# from paddle_serving_app.reader import OCRReader +from ocr_reader import OCRReader, DetResizeForTest +from paddle_serving_app.reader import Sequential, ResizeByFactor +from paddle_serving_app.reader import Div, Normalize, Transpose + +_LOGGER = logging.getLogger() + + +class RecOp(Op): + def init_op(self): + self.ocr_reader = OCRReader( + char_dict_path="../../ppocr/utils/ppocr_keys_v1.txt") + + def preprocess(self, input_dicts, data_id, log_id): + (_, input_dict), = input_dicts.items() + raw_im = base64.b64decode(input_dict["image"].encode('utf8')) + data = np.fromstring(raw_im, np.uint8) + im = cv2.imdecode(data, cv2.IMREAD_COLOR) + feed_list = [] + max_wh_ratio = 0 + ## Many mini-batchs, the type of feed_data is list. + max_batch_size = 6 # len(dt_boxes) + + # If max_batch_size is 0, skipping predict stage + if max_batch_size == 0: + return {}, True, None, "" + boxes_size = max_batch_size + rem = boxes_size % max_batch_size + + h, w = im.shape[0:2] + wh_ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, wh_ratio) + _, w, h = self.ocr_reader.resize_norm_img(im, max_wh_ratio).shape + norm_img = self.ocr_reader.resize_norm_img(im, max_batch_size) + norm_img = norm_img[np.newaxis, :] + feed = {"x": norm_img.copy()} + feed_list.append(feed) + return feed_list, False, None, "" + + def postprocess(self, input_dicts, fetch_data, log_id): + res_list = [] + if isinstance(fetch_data, dict): + if len(fetch_data) > 0: + rec_batch_res = self.ocr_reader.postprocess( + fetch_data, with_score=True) + for res in rec_batch_res: + res_list.append(res[0]) + elif isinstance(fetch_data, list): + for one_batch in fetch_data: + one_batch_res = self.ocr_reader.postprocess( + one_batch, with_score=True) + for res in one_batch_res: + res_list.append(res[0]) + + res = {"res": str(res_list)} + return res, None, "" + + +class OcrService(WebService): + def get_pipeline_response(self, read_op): + rec_op = RecOp(name="rec", input_ops=[read_op]) + return rec_op + + +uci_service = OcrService(name="ocr") +uci_service.prepare_pipeline_config("config.yml") +uci_service.run_service() diff --git a/deploy/slim/prune/sensitivity_anal.py b/deploy/slim/prune/sensitivity_anal.py index bd2b96497221fd886c83b9401cc8ed2a1a201a50..0f0492af2f57eea9b9c1d13ec5ee1dad9fc2f1bc 100644 --- a/deploy/slim/prune/sensitivity_anal.py +++ b/deploy/slim/prune/sensitivity_anal.py @@ -75,7 +75,7 @@ def main(config, device, logger, vdl_writer): model = build_model(config['Architecture']) flops = paddle.flops(model, [1, 3, 640, 640]) - logger.info(f"FLOPs before pruning: {flops}") + logger.info("FLOPs before pruning: {}".format(flops)) from paddleslim.dygraph import FPGMFilterPruner model.train() @@ -106,33 +106,51 @@ def main(config, device, logger, vdl_writer): def eval_fn(): metric = program.eval(model, valid_dataloader, post_process_class, - eval_class) - logger.info(f"metric['hmean']: {metric['hmean']}") + eval_class, False) + logger.info("metric['hmean']: {}".format(metric['hmean'])) return metric['hmean'] - params_sensitive = pruner.sensitive( - eval_func=eval_fn, - sen_file="./sen.pickle", - skip_vars=[ - "conv2d_57.w_0", "conv2d_transpose_2.w_0", "conv2d_transpose_3.w_0" - ]) - - logger.info( - "The sensitivity analysis results of model parameters saved in sen.pickle" - ) - # calculate pruned params's ratio - params_sensitive = pruner._get_ratios_by_loss(params_sensitive, loss=0.02) - for key in 
params_sensitive.keys(): - logger.info(f"{key}, {params_sensitive[key]}") + run_sensitive_analysis = False + """ + run_sensitive_analysis=True: + Automatically compute the sensitivities of convolutions in a model. + The sensitivity of a convolution is the losses of accuracy on test dataset in + differenct pruned ratios. The sensitivities can be used to get a group of best + ratios with some condition. + + run_sensitive_analysis=False: + Set prune trim ratio to a fixed value, such as 10%. The larger the value, + the more convolution weights will be cropped. + + """ + + if run_sensitive_analysis: + params_sensitive = pruner.sensitive( + eval_func=eval_fn, + sen_file="./deploy/slim/prune/sen.pickle", + skip_vars=[ + "conv2d_57.w_0", "conv2d_transpose_2.w_0", + "conv2d_transpose_3.w_0" + ]) + logger.info( + "The sensitivity analysis results of model parameters saved in sen.pickle" + ) + # calculate pruned params's ratio + params_sensitive = pruner._get_ratios_by_loss( + params_sensitive, loss=0.02) + for key in params_sensitive.keys(): + logger.info("{}, {}".format(key, params_sensitive[key])) + else: + params_sensitive = {} + for param in model.parameters(): + if 'transpose' not in param.name and 'linear' not in param.name: + # set prune ratio as 10%. The larger the value, the more convolution weights will be cropped + params_sensitive[param.name] = 0.1 plan = pruner.prune_vars(params_sensitive, [0]) - for param in model.parameters(): - if ("weights" in param.name and "conv" in param.name) or ( - "w_0" in param.name and "conv2d" in param.name): - logger.info(f"{param.name}: {param.shape}") flops = paddle.flops(model, [1, 3, 640, 640]) - logger.info(f"FLOPs after pruning: {flops}") + logger.info("FLOPs after pruning: {}".format(flops)) # start train diff --git a/deploy/slim/quantization/quant_kl.py b/deploy/slim/quantization/quant_kl.py new file mode 100755 index 0000000000000000000000000000000000000000..d866784ae6a3c087215320ec95bd39fdd1e89418 --- /dev/null +++ b/deploy/slim/quantization/quant_kl.py @@ -0,0 +1,146 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '..', '..', '..'))) +sys.path.append( + os.path.abspath(os.path.join(__dir__, '..', '..', '..', 'tools'))) + +import yaml +import paddle +import paddle.distributed as dist + +paddle.seed(2) + +from ppocr.data import build_dataloader +from ppocr.modeling.architectures import build_model +from ppocr.losses import build_loss +from ppocr.optimizer import build_optimizer +from ppocr.postprocess import build_post_process +from ppocr.metrics import build_metric +from ppocr.utils.save_load import init_model +import tools.program as program +import paddleslim +from paddleslim.dygraph.quant import QAT +import numpy as np + +dist.get_world_size() + + +class PACT(paddle.nn.Layer): + def __init__(self): + super(PACT, self).__init__() + alpha_attr = paddle.ParamAttr( + name=self.full_name() + ".pact", + initializer=paddle.nn.initializer.Constant(value=20), + learning_rate=1.0, + regularizer=paddle.regularizer.L2Decay(2e-5)) + + self.alpha = self.create_parameter( + shape=[1], attr=alpha_attr, dtype='float32') + + def forward(self, x): + out_left = paddle.nn.functional.relu(x - self.alpha) + out_right = paddle.nn.functional.relu(-self.alpha - x) + x = x - out_left + out_right + return x + + +quant_config = { + # weight preprocess type, default is None and no preprocessing is performed. + 'weight_preprocess_type': None, + # activation preprocess type, default is None and no preprocessing is performed. + 'activation_preprocess_type': None, + # weight quantize type, default is 'channel_wise_abs_max' + 'weight_quantize_type': 'channel_wise_abs_max', + # activation quantize type, default is 'moving_average_abs_max' + 'activation_quantize_type': 'moving_average_abs_max', + # weight quantize bit num, default is 8 + 'weight_bits': 8, + # activation quantize bit num, default is 8 + 'activation_bits': 8, + # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8' + 'dtype': 'int8', + # window size for 'range_abs_max' quantization. 
default is 10000 + 'window_size': 10000, + # The decay coefficient of moving average, default is 0.9 + 'moving_rate': 0.9, + # for dygraph quantization, layers of type in quantizable_layer_type will be quantized + 'quantizable_layer_type': ['Conv2D', 'Linear'], +} + + +def sample_generator(loader): + def __reader__(): + for indx, data in enumerate(loader): + images = np.array(data[0]) + yield images + + return __reader__ + + +def main(config, device, logger, vdl_writer): + # init dist environment + if config['Global']['distributed']: + dist.init_parallel_env() + + global_config = config['Global'] + + # build dataloader + config['Train']['loader']['num_workers'] = 0 + train_dataloader = build_dataloader(config, 'Train', device, logger) + if config['Eval']: + config['Eval']['loader']['num_workers'] = 0 + valid_dataloader = build_dataloader(config, 'Eval', device, logger) + else: + valid_dataloader = None + + paddle.enable_static() + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + + if 'inference_model' in global_config.keys(): # , 'inference_model'): + inference_model_dir = global_config['inference_model'] + else: + inference_model_dir = os.path.dirname(global_config['pretrained_model']) + if not (os.path.exists(os.path.join(inference_model_dir, "inference.pdmodel")) and \ + os.path.exists(os.path.join(inference_model_dir, "inference.pdiparams")) ): + raise ValueError( + "Please set inference model dir in Global.inference_model or Global.pretrained_model for post-quantazition" + ) + + paddleslim.quant.quant_post_static( + executor=exe, + model_dir=inference_model_dir, + model_filename='inference.pdmodel', + params_filename='inference.pdiparams', + quantize_model_path=global_config['save_inference_dir'], + sample_generator=sample_generator(train_dataloader), + save_model_filename='inference.pdmodel', + save_params_filename='inference.pdiparams', + batch_size=1, + batch_nums=None) + + +if __name__ == '__main__': + config, device, logger, vdl_writer = program.preprocess(is_train=True) + main(config, device, logger, vdl_writer) diff --git a/doc/PaddleOCR_log.png b/doc/PaddleOCR_log.png new file mode 100644 index 0000000000000000000000000000000000000000..a2df52f8565b71e6eea29782febb7b4212980ee0 Binary files /dev/null and b/doc/PaddleOCR_log.png differ diff --git a/doc/datasets/ic15_location_download.png b/doc/datasets/ic15_location_download.png new file mode 100644 index 0000000000000000000000000000000000000000..7cb8540e5e51b77aa8b480069841fc51c0d907b7 Binary files /dev/null and b/doc/datasets/ic15_location_download.png differ diff --git a/doc/datasets/icdar_rec.png b/doc/datasets/icdar_rec.png new file mode 100644 index 0000000000000000000000000000000000000000..a840d6af5949251f5b13ed4356c7eaf1b172070a Binary files /dev/null and b/doc/datasets/icdar_rec.png differ diff --git a/doc/doc_ch/FAQ.md b/doc/doc_ch/FAQ.md index 2af9006ad19c4f89d4798b581c0ebfedef7984f7..f863c1b1ad79999187fafe9ffe06fb2fdaf7774b 100755 --- a/doc/doc_ch/FAQ.md +++ b/doc/doc_ch/FAQ.md @@ -9,38 +9,42 @@ ## PaddleOCR常见问题汇总(持续更新) -* [近期更新(2021.2.1)](#近期更新) +* [近期更新(2021.6.29)](#近期更新) * [【精选】OCR精选10个问题](#OCR精选10个问题) -* [【理论篇】OCR通用32个问题](#OCR通用问题) - * [基础知识7题](#基础知识) - * [数据集7题](#数据集2) - * [模型训练调优18题](#模型训练调优2) -* [【实战篇】PaddleOCR实战120个问题](#PaddleOCR实战问题) - * [使用咨询38题](#使用咨询) - * [数据集18题](#数据集3) - * [模型训练调优30题](#模型训练调优3) - * [预测部署34题](#预测部署3) - +* [【理论篇】OCR通用51个问题](#OCR通用问题) + * [基础知识16题](#基础知识) + * [数据集10题](#数据集2) + * [模型训练调优25题](#模型训练调优2) +* [【实战篇】PaddleOCR实战187个问题](#PaddleOCR实战问题) + * [使用咨询80题](#使用咨询) + * 
[数据集19题](#数据集3) + * [模型训练调优39题](#模型训练调优3) + * [预测部署49题](#预测部署3) -## 近期更新(2021.2.1) +## 近期更新(2021.6.29) -#### Q3.2.18: PaddleOCR动态图版本如何finetune? -**A**:finetune需要将配置文件里的 Global.load_static_weights设置为false,如果没有此字段可以手动添加,然后将模型地址放到Global.pretrained_model字段下即可。 +#### Q2.3.25: 图像正常识别出来的文字是OK的,旋转90度后识别出来的结果比较差,有什么方法可以优化? +A: 整图旋转90之后效果变差是有可能的,因为目前PPOCR默认输入的图片是正向的; 可以自己训练一个整图的方向分类器,放在预测的最前端(可以参照现有方向分类器的方式),或者可以基于规则做一些预处理,比如判断长宽等等。 +#### Q3.1.78: 在线demo支持阿拉伯语吗 +**A**: 在线demo目前只支持中英文, 多语言的都需要通过whl包自行处理 -#### Q3.3.29: 微调v1.1预训练的模型,可以直接用文字垂直排列和上下颠倒的图片吗?还是必须要水平排列的? -**A**:1.1和2.0的模型一样,微调时,垂直排列的文字需要逆时针旋转 90° 后加入训练,上下颠倒的需要旋转为水平的。 +#### Q3.1.79: 某个类别的样本比较少,通过增加训练的迭代次数或者是epoch,变相增加小样本的数目,这样能缓解这个问题么? +**A**: 尽量保证类别均衡, 某些类别样本少,可以通过补充合成数据的方式处理;实验证明训练集中出现频次较少的字符,识别效果会比较差,增加迭代次数不能改变样本量少的问题。 -#### Q3.3.30: 模型训练过程中如何得到 best_accuracy 模型? -**A**:配置文件里的eval_batch_step字段用来控制多少次iter进行一次eval,在eval完成后会自动生成 best_accuracy 模型,所以如果希望很快就能拿到best_accuracy模型,可以将eval_batch_step改小一点(例如,10)。 +#### Q3.1.80: 想把简历上的文字识别出来后,能够把关系一一对应起来,比如姓名和它后面的名字组成一对,籍贯、邮箱、学历等等都和各自的内容关联起来,这个应该如何处理,PPOCR目前支持吗? +**A**: 这样的需求在企业应用中确实比较常见,但往往都是个性化的需求,没有非常规整统一的处理方式。常见的处理方式有如下两种: +1. 对于单一版式、或者版式差异不大的应用场景,可以基于识别场景的一些先验信息,将识别内容进行配对; 比如运用表单结构信息:常见表单"姓名"关键字的后面,往往紧跟的就是名字信息 +2. 对于版式多样,或者无固定版式的场景, 需要借助于NLP中的NER技术,给识别内容中的某些字段,赋予key值 -#### Q3.4.33: 如何多进程运行paddleocr? -**A**:实例化多个paddleocr服务,然后将服务注册到注册中心,之后通过注册中心统一调度即可,关于注册中心,可以搜索eureka了解一下具体使用,其他的注册中心也行。 +由于这部分需求和业务场景强相关,难以用一个统一的模型去处理,目前PPOCR暂不支持。 如果需要用到NER技术,可以参照Paddle团队的另一个开源套件: https://github.com/PaddlePaddle/ERNIE, 其提供的预训练模型ERNIE, 可以帮助提升NER任务的准确率。 - -#### Q3.4.34: 2.0训练出来的模型,能否在1.1版本上进行部署? -**A**:这个是不建议的,2.0训练出来的模型建议使用dygraph分支里提供的部署代码。 +#### Q3.4.49: 同一个模型,c++部署和python部署方式,出来的结果不一致,如何定位? +**A**:有如下几个Debug经验: +1. 优先对一下几个阈值参数是否一致; +2. 排查一下c++代码和python代码的预处理和后处理方式是否一致; +3. 用python在模型输入输出各保存一下二进制文件,排除inference的差异性 ## 【精选】OCR精选10个问题 @@ -76,8 +80,7 @@ **A**:(1)在人眼确认可识别的条件下,对于背景有干扰的文字,首先要保证检测框足够准确,如果检测框不准确,需要考虑是否可以通过过滤颜色等方式对图像预处理并且增加更多相关的训练数据;在识别的部分,注意在训练数据中加入背景干扰类的扩增图像。 -(2)如果MobileNet模型不能满足需求,可以尝试ResNet系列大模型来获得更好的效果 -。 +(2)如果MobileNet模型不能满足需求,可以尝试ResNet系列大模型来获得更好的效果。 #### Q1.1.6:OCR领域常用的评估指标是什么? @@ -125,7 +128,7 @@ #### Q1.1.10:PaddleOCR中,对于模型预测加速,CPU加速的途径有哪些?基于TenorRT加速GPU对输入有什么要求? -**A**:(1)CPU可以使用mkldnn进行加速;对于python inference的话,可以把enable_mkldnn改为true,[参考代码](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/tools/infer/utility.py#L84),对于cpp inference的话,在配置文件里面配置use_mkldnn 1即可,[参考代码](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/deploy/cpp_infer/tools/config.txt#L6) +**A**:(1)CPU可以使用mkldnn进行加速;对于python inference的话,可以把enable_mkldnn改为true,[参考代码](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/tools/infer/utility.py#L99),对于cpp inference的话,在配置文件里面配置use_mkldnn 1即可,[参考代码](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/deploy/cpp_infer/tools/config.txt#L6) (2)GPU需要注意变长输入问题等,TRT6 之后才支持变长输入 @@ -161,6 +164,39 @@ **A**:处理字符的时候,把多字符的当作一个字就行,字典中每行是一个字。 +#### Q2.1.8: 端到端的场景文本识别方法大概分为几种? + +**A**:端到端的场景文本识别方法大概分为2种:基于二阶段的方法和基于字符级别的方法。基于两阶段的方法一般先检测文本块,然后提取文本块中的特征用于识别,例如ABCNet;基于字符级别方法直接进行字符检测与识别,直接输出单词的文本框,字符框以及对应的字符类别,例如CharNet。 + +#### Q2.1.9: 二阶段的端到端的场景文本识别方法的不足有哪些? + +**A**: 这类方法一般需要设计针对ROI提取特征的方法,而ROI操作一般比较耗时。 + +#### Q2.1.10: 基于字符级别的端到端的场景文本识别方法的不足有哪些? + +**A**: 这类方法一方面训练时需要加入字符级别的数据,一般使用合成数据,但是合成数据和真实数据有分布Gap。另一方面,现有工作大多数假设文本阅读方向,从上到下,从左到右,没有解决文本方向预测问题。 + +#### Q2.1.11: AAAI 2021最新的端到端场景文本识别PGNet算法有什么特点? + +**A**: PGNet不需要字符级别的标注,NMS操作以及ROI操作。同时提出预测文本行内的阅读顺序模块和基于图的修正模块来提升文本识别效果。该算法是百度自研,近期会在PaddleOCR开源。 + +#### Q2.1.12: PubTabNet 数据集关注的是什么问题? 
+ +**A**: PubTabNet是IBM提出的基于图片格式的表格识别数据集,包含 56.8 万张表格数据的图像,以及图像对应的 html 格式的注释。该数据集的发布推动了表格结构化算法的研发和落地应用。 + +#### Q2.1.13: PaddleOCR提供的文本识别算法包括哪些? +**A**: PaddleOCR主要提供五种文本识别算法,包括CRNN\StarNet\RARE\Rosetta和SRN, 其中CRNN\StarNet和Rosetta是基于ctc的文字识别算法,RARE是基于attention的文字识别算法;SRN为百度自研的文本识别算法,引入了语义信息,显著提升了准确率。 详情可参照如下页面: [文本识别算法](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.0/doc/doc_ch/algorithm_overview.md#%E6%96%87%E6%9C%AC%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95) + +#### Q2.1.14: 在识别模型中,为什么降采样残差结构的stride为(2, 1)? +**A**: stride为(2, 1),表示在图像y方向(高度方向)上stride为2,x方向(宽度方向)上为1。由于待识别的文本图像通常为长方形,这样只在高度方向做下采样,尽量保留宽度方向的序列信息,避免宽度方向下采样后丢失过多的文字信息。 + +#### Q2.1.15: 文本识别方法CRNN关键技术有哪些? +**A**: CRNN 关键技术包括三部分。(1)CNN提取图像卷积特征。(2)深层双向LSTM网络,在卷积特征的基础上继续提取文字序列特征。(3)Connectionist Temporal Classification(CTC),解决训练时字符无法对齐的问题。 + +#### Q2.1.16: 百度自研的SRN文本识别方法特点有哪些? +**A**: SRN文本识别方法特点主要有四个部分:(1)使用Transformer Units(TUs)模块加强图像卷积特征的表达能力。(2)提出Parallel Visual Attention Module(PVAM)模块挖掘特征之间的相互关系。(3)提出Global Semantic Reasoning Module(GSRM)模块挖掘识别结果语义相互关系。(4)提出Visual-Semantic Fusion Decoder(VSFD)模块有效融合PVAM提取的视觉特征和GSRM提取的语义特征。 + + ### 数据集 @@ -192,6 +228,16 @@ **A**:SRNet是借鉴GAN中图像到图像转换、风格迁移的想法合成文本数据。不同于通用GAN的方法只选择一个分支,SRNet将文本合成任务分解为三个简单的子模块,提升合成数据的效果。这三个子模块为不带背景的文本风格迁移模块、背景抽取模块和融合模块。PaddleOCR计划将在2020年12月中旬开源基于SRNet的实用模型。 +#### Q2.2.8: DBNet如果想使用多边形作为输入,数据标签格式应该如何设定? +**A**:如果想使用多边形作为DBNet的输入,数据标签也应该用多边形来表示。这样子可以更好得拟合弯曲文本。PPOCRLabel暂时只支持矩形框标注和四边形框标注。 + +#### Q2.2.9: 端到端算法PGNet使用的是什么类型的数据集呢? +**A**: PGNet目前可以使用四点标注数据集,也可以使用多点标注数据集(十四点),多点标注训练的效果要比四点的好,一种可以尝试的策略是先在四点数据集上训练,之后用多点数据集在此基础上继续训练。 + +#### Q2.2.10: 文档版面分析常用数据集有哪些? +**A**: 文档版面分析常用数据集常用数据集有PubLayNet、TableBank word、TableBank latex等。 + + ### 模型训练调优 @@ -254,7 +300,7 @@ **A**:建议可以先了解OCR方向的基础知识,大概了解基础的检测和识别模型算法。然后在Github上可以查看OCR方向相关的repo。目前来看,从内容的完备性来看,PaddleOCR的中英文双语教程文档是有明显优势的,在数据集、模型训练、预测部署文档详实,可以快速入手。而且还有微信用户群答疑,非常适合学习实践。项目地址:[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) -#### Q3.12:如何识别带空格的英文行文本图像? +#### Q2.3.12:如何识别带空格的英文行文本图像? **A**:空格识别可以考虑以下两种方案: @@ -286,6 +332,33 @@ **A**:SE模块是MobileNetV3网络一个重要模块,目的是估计特征图每个特征通道重要性,给特征图每个特征分配权重,提高网络的表达能力。但是,对于文本检测,输入网络的分辨率比较大,一般是640\*640,利用SE模块估计特征图每个特征通道重要性比较困难,网络提升能力有限,但是该模块又比较耗时,因此在PP-OCR系统中,文本检测的骨干网络没有使用SE模块。实验也表明,当去掉SE模块,超轻量模型大小可以减小40%,文本检测效果基本不受影响。详细可以参考PP-OCR技术文章,https://arxiv.org/abs/2009.09941. +#### Q2.3.19: 参照文档做实际项目时,是重新训练还是在官方训练的基础上进行训练?具体如何操作? +**A**: 基于官方提供的模型,进行finetune的话,收敛会更快一些。 具体操作上,以识别模型训练为例:如果修改了字符文件,可以设置pretraind_model为官方提供的预训练模型 + +#### Q2.3.20: 如何根据不同的硬件平台选用不同的backbone? +**A**:在不同的硬件上,不同的backbone的速度优势不同,可以根据不同平台的速度-精度图来确定backbone,这里可以参考[PaddleClas模型速度-精度图](https://github.com/PaddlePaddle/PaddleClas/tree/release/2.0/docs/zh_CN/models)。 + +#### Q2.3.21: 端到端算法PGNet是否支持中文识别,速度会很慢嘛? +**A**:目前开源的PGNet算法模型主要是用于检测英文数字,对于中文的识别需要自己训练,大家可以使用开源的端到端中文数据集,而对于复杂文本(弯曲文本)的识别,也可以自己构造一批数据集针对进行训练,对于推理速度,可以先将模型转换为inference再进行预测,速度应该会相当可观。 + +#### Q2.3.22: 目前知识蒸馏有哪些主要的实践思路? + +**A**:知识蒸馏即利用教师模型指导学生模型的训练,目前有3种主要的蒸馏思路: +1. 基于输出结果的蒸馏,即让学生模型学习教师模型的软标签(分类或者OCR识别等任务中)或者概率热度图(分割等任务中)。 +2. 基于特征图的蒸馏,即让学生模型学习教师模型中间层的特征图,拟合中间层的一些特征。 +3. 基于关系的蒸馏,针对不同的样本(假设个数为N),教师模型会有不同的输出,那么可以基于不同样本的输出,计算一个NxN的相关性矩阵,可以让学生模型去学习教师模型关于不同样本的相关性矩阵。 + +当然,知识蒸馏方法日新月异,也欢迎大家提出更多的总结与建议。 + +#### Q2.3.23: 文档版面分析常用方法有哪些? +**A**: 文档版面分析通常使用通用目标检测方法,包括Faster RCNN系列,YOLO系列等。面向产业实践,建议使用PaddleDetection中精度和效率出色的PP-YOLO v2目标检测方法进行训练。 + +#### Q2.3.24: 如何识别招牌或者广告图中的艺术字? 
+**A**: 招牌或者广告图中的艺术字是文本识别一个非常有挑战性的难题,因为艺术字中的单字和印刷体相比,变化非常大。如果需要识别的艺术字是在一个词典列表内,可以将改每个词典认为是一个待识别图像模板,通过通用图像检索识别系统解决识别问题。可以尝试使用PaddleClas的图像识别系统。 + +#### Q2.3.25: 图像正常识别出来的文字是OK的,旋转90度后识别出来的结果就比较差,有什么方法可以优化? +**A**: 整图旋转90之后效果变差是有可能的,因为目前PPOCR默认输入的图片是正向的; 可以自己训练一个整图的方向分类器,放在预测的最前端(可以参照现有方向分类器的方式),或者可以基于规则做一些预处理,比如判断长宽等等。 + ## 【实战篇】PaddleOCR实战问题 @@ -361,13 +434,13 @@ (2)inference模型下载时,如果没有安装wget,可直接点击模型链接或将链接地址复制到浏览器进行下载,并解压放置到相应目录。 #### Q3.1.17:PaddleOCR开源的超轻量模型和通用OCR模型的区别? -**A**:目前PaddleOCR开源了2个中文模型,分别是9.4M超轻量中文模型和通用中文OCR模型。两者对比信息如下: +**A**:目前PaddleOCR开源了2个中文模型,分别是8.6M超轻量中文模型和通用中文OCR模型。两者对比信息如下: - 相同点:两者使用相同的**算法**和**训练数据**; - 不同点:不同之处在于**骨干网络**和**通道参数**,超轻量模型使用MobileNetV3作为骨干网络,通用模型使用Resnet50_vd作为检测模型backbone,Resnet34_vd作为识别模型backbone,具体参数差异可对比两种模型训练的配置文件. |模型|骨干网络|检测训练配置|识别训练配置| |-|-|-|-| -|9.4M超轻量中文OCR模型|MobileNetV3+MobileNetV3|det_mv3_db.yml|rec_chinese_lite_train.yml| +|8.6M超轻量中文OCR模型|MobileNetV3+MobileNetV3|det_mv3_db.yml|rec_chinese_lite_train.yml| |通用中文OCR模型|Resnet50_vd+Resnet34_vd|det_r50_vd_db.yml|rec_chinese_common_train.yml| #### Q3.1.18:如何加入自己的检测算法? @@ -482,7 +555,239 @@ StyleText的用途主要是:提取style_image中的字体、背景等style信 **A**:Paddle版本问题,请安装2.0版本Paddle:pip install paddlepaddle==2.0.0。 +#### Q3.1.39: 字典中没有的字应该如何标注,是用空格代替还是直接忽略掉? + +**A**:可以直接按照图片内容标注,在编码的时候,会忽略掉字典中不存在的字符。 + +#### Q3.1.40: dygraph、release/2.0-rc1-0、release/2.0 这三个分支有什么区别? + +**A**:dygraph是动态图分支,并且适配Paddle-develop,当然目前在Paddle2.0上也可以运行,新特性我们会在这里更新。 +release/2.0-rc1-0是基于Paddle 2.0rc1的稳定版本,release/2.0是基于Paddle2.0的稳定版本,如果希望版本或者代 +码稳定的话,建议使用release/2.0分支,如果希望可以实时拿到一些最新特性,建议使用dygraph分支。 + +#### Q3.1.41: style-text 融合模块的输入是生成的前景图像以及背景特征权重吗? + +**A**:目前版本是直接输入两个图像进行融合的,没有用到feature_map,替换背景图片不会影响效果。 + +#### Q3.1.42: 训练识别任务的时候,在CPU上运行时,报错`The setting of Parameter-Server must has server_num or servers`。 + +**A**:这是训练任务启动方式不对造成的。 + +1. 在使用CPU或者单块GPU训练的时候,可以直接使用`python3 tools/train.py -c xxx.yml`的方式启动。 +2. 在使用多块GPU训练的时候,需要使用`distributed.launch`的方式启动,如`python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c xxx.yml`,这种方式需要安装NCCL库,如果没有的话会报错。 + +#### Q3.1.43:使用StyleText进行数据合成时,文本(TextInput)的长度远超StyleInput的长度,该怎么处理与合成呢? + +**A**:在使用StyleText进行数据合成的时候,建议StyleInput的长度长于TextInput的长度。有2种方法可以处理上述问题: + +1. 将StyleInput按列的方向进行复制与扩充,直到其超过TextInput的长度。 +2. 将TextInput进行裁剪,保证每段TextInput都稍短于StyleInput,分别合成之后,再拼接在一起。 + +实际使用中发现,使用第2种方法的效果在长文本合成的场景中的合成效果更好,StyleText中提供的也是第2种数据合成的逻辑。 + + +#### Q3.1.44: 文字识别训练,设置图像高度不等于32时报错 + +**A**:ctc decode的时候,输入需要是1维向量,因此降采样之后,建议特征图高度为1,ppocr中,特征图会降采样32倍,之后高度正好为1,所以有2种解决方案 +- 指定输入shape高度为32(推荐) +- 在backbone的mv3中添加更多的降采样模块,保证输出的特征图高度为1 + +#### Q3.1.45: 增大batch_size模型训练速度没有明显提升 + +**A**:如果batch_size打得太大,加速效果不明显的话,可以试一下增大初始化内存的值,运行代码前设置环境变量: +``` +export FLAGS_initial_cpu_memory_in_mb=2000 # 设置初始化内存约2G左右 +``` + +#### Q3.1.46: 动态图分支(dygraph,release/2.0),训练模型和推理模型效果不一致 + +**A**:当前问题表现为:使用训练完的模型直接测试结果较好,但是转换为inference model后,预测结果不一致;出现这个问题一般是两个原因: +1. 预处理函数设置的不一致 +2. 
后处理参数不一致 +repo中config.yml文件的前后处理参数和inference预测默认的超参数有不一致的地方,建议排查下训练模型预测和inference预测的前后处理, +参考[issue](https://github.com/PaddlePaddle/PaddleOCR/issues/2080)。 + +#### Q3.1.47: paddleocr package 报错 FatalError: `Process abort signal` is detected by the operating system + +**A**:首先,按照[安装文档](./installation.md)安装PaddleOCR的运行环境;另外,检查python环境,python3.6/3.8上可能会出现这个问题,建议用python3.7, +参考[issue](https://github.com/PaddlePaddle/PaddleOCR/issues/2069)。 + +#### Q3.1.48: 下载的识别模型解压后缺失文件,没有期望的inference.pdiparams, inference.pdmodel等文件 + +**A**:用解压软件解压可能会出现这个问题,建议二次解压下或者用命令行解压`tar xf ` + +#### Q3.1.49: 只想要识别票据中的部分片段,重新训练它的话,只需要训练文本检测模型就可以了吗?问文本识别,方向分类还是用原来的模型这样可以吗? + +**A**:可以的。PaddleOCR的检测、识别、方向分类器三个模型是独立的,在实际使用中可以优化和替换其中任何一个模型。 + +#### Q3.1.50: 为什么在checkpoints中load下载的预训练模型会报错? + +**A**: 这里有两个不同的概念: +- pretrained_model:指预训练模型,是已经训练完成的模型。这时会load预训练模型的参数,但并不会load学习率、优化器以及训练状态等。如果需要finetune,应该使用pretrained。 +- checkpoints:指之前训练的中间结果,例如前一次训练到了100个epoch,想接着训练。这时会load尝试所有信息,包括模型的参数,之前的状态等。 + +这里应该使用pretrained_model而不是checkpoints + +#### Q3.1.51: 如何用PaddleOCR识别视频中的文字? + +**A**: 目前PaddleOCR主要针对图像做处理,如果需要视频识别,可以先对视频抽帧,然后用PPOCR识别。 + +#### Q3.1.52: 相机采集的图像为四通道,应该如何处理? + +**A**: 有两种方式处理: +- 如果没有其他需要,可以在解码数据的时候指定模式为三通道,例如如果使用opencv,可以使用cv::imread(img_path, cv::IMREAD_COLOR)。 +- 如果其他模块需要处理四通道的图像,那也可以在输入PaddleOCR模块之前进行转换,例如使用cvCvtColor(&img,img3chan,CV_RGBA2RGB)。 + +#### Q3.1.53: 预测时提示图像过大,显存、内存溢出了,应该如何处理? +**A**: 可以按照这个PR的修改来缓解显存、内存占用 [#2230](https://github.com/PaddlePaddle/PaddleOCR/pull/2230) + +#### Q3.1.54: 用c++来部署,目前支持Paddle2.0的模型吗? +**A**: PPOCR 2.0的模型在arm上运行可以参照该PR [#1877](https://github.com/PaddlePaddle/PaddleOCR/pull/1877) + +#### Q3.1.55: 目前PaddleOCR有知识蒸馏的demo吗? +**A**: 目前我们还没有提供PaddleOCR知识蒸馏的相关demo,PaddleClas开源了一个效果还不错的方案,可以移步[SSLD知识蒸馏方案](https://github.com/PaddlePaddle/PaddleClas/blob/release%2F2.0/docs/zh_CN/advanced_tutorials/distillation/distillation.md), paper: https://arxiv.org/abs/2103.05959 关于PaddleOCR的蒸馏,我们也会在未来支持。 + +#### Q3.1.56: 在使用PPOCRLabel的时候,如何标注倾斜的文字? +**A**: 如果矩形框标注后空白冗余较多,可以尝试PPOCRLabel提供的四点标注,可以标注各种倾斜角度的文本。 + +#### Q3.1.57: 端到端算法PGNet提供了两种后处理方式,两者之间有什么区别呢? +**A**: 两种后处理的区别主要在于速度的推理,config中PostProcess有fast/slow两种模式,slow模式的后处理速度慢,精度相对较高,fast模式的后处理速度快,精度也在可接受的范围之内。建议使用速度快的后处理方式。 + +#### Q3.1.58: 使用PGNet进行eval报错? +**A**: 需要注意,我们目前在release/2.1更新了评测代码,目前支持A,B两种评测模式: +* A模式:该模式主要为了方便用户使用,与训练集一样的标注文件就可以正常进行eval操作, 代码中默认是A模式。 +* B模式:该模式主要为了保证我们的评测代码可以和Total Text官方的评测方式对齐,该模式下直接加载官方提供的mat文件进行eval。 + +#### Q3.1.59: 使用预训练模型进行预测,对于特定字符识别识别效果较差,怎么解决? +**A**: 由于我们所提供的识别模型是基于通用大规模数据集进行训练的,部分字符可能在训练集中包含较少,因此您可以构建特定场景的数据集,基于我们提供的预训练模型进行微调。建议用于微调的数据集中,每个字符出现的样本数量不低于300,但同时需要注意不同字符的数量均衡。具体可以参考:[微调](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/recognition.md#2-%E5%90%AF%E5%8A%A8%E8%AE%AD%E7%BB%83)。 + +#### Q3.1.60: PGNet有中文预训练模型吗? +**A**: 目前我们尚未提供针对中文的预训练模型,如有需要,可以尝试自己训练。具体需要修改的地方有: + 1. [config文件中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/configs/e2e/e2e_r50_vd_pg.yml#L23-L24),字典文件路径及语种设置; + 1. [网络结构中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/ppocr/modeling/heads/e2e_pg_head.py#L181),`out_channels`修改为字典中的字符数目+1(考虑到空格); + 1. [loss中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/ppocr/losses/e2e_pg_loss.py#L93),修改`37`为字典中的字符数目+1(考虑到空格); + +#### Q3.1.61: 用于PGNet的训练集,文本框的标注有要求吗? 
+**A**: PGNet支持多点标注,比如4点、8点、14点等。但需要注意的是,标注点尽可能分布均匀(相邻标注点间隔距离均匀一致),且label文件中的标注点需要从标注框的左上角开始,按标注点顺时针顺序依次编写,以上问题都可能对训练精度造成影响。 +我们提供的,基于Total Text数据集的PGNet预训练模型使用了14点标注方式。 + +#### Q3.1.62: 弯曲文本(如略微形变的文档图像)漏检问题 +**A**: db后处理中计算文本框平均得分时,是求rectangle区域的平均分数,容易造成弯曲文本漏检,已新增求polygon区域的平均分数,会更准确,但速度有所降低,可按需选择,在相关pr中可查看[可视化对比效果](https://github.com/PaddlePaddle/PaddleOCR/pull/2604)。该功能通过参数 [det_db_score_mode](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/tools/infer/utility.py#L51)进行选择,参数值可选[`fast`(默认)、`slow`],`fast`对应原始的rectangle方式,`slow`对应polygon方式。感谢用户[buptlihang](https://github.com/buptlihang)提[pr](https://github.com/PaddlePaddle/PaddleOCR/pull/2574)帮助解决该问题🌹。 + +#### Q3.1.63: 请问端到端的pgnet相比于DB+CRNN在准确率上有优势吗?或者是pgnet最擅长的场景是什么场景呢? +**A**: pgnet是端到端算法,检测识别一步到位,不用分开训练2个模型,也支持弯曲文本的识别,但是在中文上的效果还没有充分验证;db+crnn的验证更充分,应用相对成熟,常规非弯曲的文本都能解的不错。 + +#### Q3.1.64: config yml文件中的ratio_list参数的作用是什么? +**A**: 在动态图中,ratio_list在有多个数据源的情况下使用,ratio_list中的每个值是每个epoch从对应数据源采样数据的比例。如ratio_list=[0.3,0.2],label_file_list=['data1','data2'],代表每个epoch的训练数据包含data1 30%的数据,和data2里 20%的数据,ratio_list中数值的和不需要等于1。ratio_list和label_file_list的长度必须一致。 + +静态图检测数据采样的逻辑与动态图不同,但基本不影响训练精度。 + +在静态图中,使用 检测 dataloader读取数据时,会先设置每个epoch的数据量,比如这里设置为1000,ratio_list中的值表示在1000中的占比,比如ratio_list是[0.3, 0.7],则表示使用两个数据源,每个epoch从第一个数据源采样1000*0.3=300张图,从第二个数据源采样700张图。ratio_list的值的和也不需要等于1。 + +#### Q3.1.65: 支持动态图模型的android和ios demo什么时候上线?? +**A**: 支持动态图模型的android demo已经合入dygraph分支,欢迎试用(https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/deploy/android_demo/README.md); ios demo暂时未提供动态图模型版本,可以基于静态图版本(https://github.com/PaddlePaddle/PaddleOCR/blob/develop/deploy/ios_demo)自行改造。 + +#### Q3.1.66: iaa里面添加的数据增强方式,是每张图像训练都会做增强还是随机的?如何添加一个数据增强方法? + +**A**:iaa增强的训练配置参考:https://github.com/PaddlePaddle/PaddleOCR/blob/0ccc1720c252beb277b9e522a1b228eb6abffb8a/configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml#L82, +其中{ 'type': Fliplr, 'args': { 'p': 0.5 } } p是概率。新增数据增强,可以参考这个方法:https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.1/doc/doc_ch/add_new_algorithm.md#%E6%95%B0%E6%8D%AE%E5%8A%A0%E8%BD%BD%E5%92%8C%E5%A4%84%E7%90%86 + +#### Q3.1.67: PGNet训练中文弯曲数据集,可视化时弯曲文本无法显示。 + +**A**: 可能是因为安装的OpenCV里,cv2.putText不能显示中文的原因,可以尝试用Pillow来添加显示中文,需要改draw_e2e_res函数里面的代码,可以参考如下代码: +``` +box = box.astype(np.int32).reshape((-1, 1, 2)) +cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) + +from PIL import ImageFont, ImageDraw, Image +img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) +draw = ImageDraw.Draw(img) +fontStyle = ImageFont.truetype( +"font/msyh.ttc", 16, encoding="utf-8") +draw.text((int(box[0, 0, 0]), int(box[0, 0, 1])), text, (0, 255, 0), font=fontStyle) + +src_im= cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR) +``` +#### Q3.1.68: 用PGNet做进行端到端训练时,数据集标注的点的个数必须都是统一一样的吗? 能不能随意标点数,只要能够按顺时针从左上角开始标这样? + +**A**: 目前代码要求标注为统一的点数。 + +#### Q3.1.69: 怎么加速训练过程呢? + +**A**:OCR模型训练过程中一般包含大量的数据增广,这些数据增广是比较耗时的,因此可以离线生成大量增广后的图像,直接送入网络进行训练,机器资源充足的情况下,也可以使用分布式训练的方法,可以参考[分布式训练教程文档](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/distributed_training.md)。 + + +#### Q3.1.70: 文字识别模型模型的输出矩阵需要进行解码才能得到识别的文本。代码中实现为preds_idx = preds.argmax(axis=2),也就是最佳路径解码法。这是一种贪心算法,是每一个时间步只将最大概率的字符作为当前时间步的预测输出,但得到的结果不一定是最好的。为什么不使用beam search这种方式进行解码呢? + +**A**:实验发现,使用贪心的方法去做解码,识别精度影响不大,但是速度方面的优势比较明显,因此PaddleOCR中使用贪心算法去做识别的解码。 + +#### Q3.1.71: 遇到中英文识别模型不支持的字符,该如何对模型做微调? + +**A**:如果希望识别中英文识别模型中不支持的字符,需要更新识别的字典,并完成微调过程。比如说如果希望模型能够进一步识别罗马数字,可以按照以下步骤完成模型微调过程。 +1. 准备中英文识别数据以及罗马数字的识别数据,用于训练,同时保证罗马数字和中英文识别数字的效果; +2. 修改默认的字典文件,在后面添加罗马数字的字符; +3. 
下载PaddleOCR提供的预训练模型,配置预训练模型和数据的路径,开始训练。 + + +#### Q3.1.72: 文字识别主要有CRNN和Attention两种方式,但是在我们的说明文档中,CRNN有对应的论文,但是Attention没看到,这个具体在哪里呢? + +**A**:文字识别主要有CTC和Attention两种方式,基于CTC的算法有CRNN、Rosetta、StarNet,基于Attention的方法有RARE、其他的算法PaddleOCR里没有提供复现代码。论文的链接可以参考:[PaddleOCR文本识别算法教程文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/algorithm_overview.md#%E6%96%87%E6%9C%AC%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95) + + +#### Q3.1.73: 如何使用TensorRT加速PaddleOCR预测? + +**A**: 目前paddle的dygraph分支已经支持了python和C++ TensorRT预测的代码,python端inference预测时把参数[--use_tensorrt=True](https://github.com/PaddlePaddle/PaddleOCR/blob/3ec57e8df9263de6fa897e33d2d91bc5d0849ef3/tools/infer/utility.py#L37)即可, +C++TensorRT预测需要使用支持TRT的预测库并在编译时打开[-DWITH_TENSORRT=ON](https://github.com/PaddlePaddle/PaddleOCR/blob/3ec57e8df9263de6fa897e33d2d91bc5d0849ef3/deploy/cpp_infer/tools/build.sh#L15)。 +如果想修改其他分支代码支持TensorRT预测,可以参考[PR](https://github.com/PaddlePaddle/PaddleOCR/pull/2921)。 + +注:建议使用TensorRT大于等于6.1.0.5以上的版本。 + +#### Q3.1.74: ppocr检测效果不好,该如何优化? + +**A**: 具体问题具体分析: +1. 如果在你的场景上检测效果不可用,首选是在你的数据上做finetune训练; +2. 如果图像过大,文字过于密集,建议不要过度压缩图像,可以尝试修改检测预处理的[resize逻辑](https://github.com/PaddlePaddle/PaddleOCR/blob/3ec57e8df9263de6fa897e33d2d91bc5d0849ef3/tools/infer/predict_det.py#L42),防止图像被过度压缩; +3. 检测框大小过于紧贴文字或检测框过大,可以调整[db_unclip_ratio](https://github.com/PaddlePaddle/PaddleOCR/blob/3ec57e8df9263de6fa897e33d2d91bc5d0849ef3/tools/infer/utility.py#L51)这个参数,加大参数可以扩大检测框,减小参数可以减小检测框大小; +4. 检测框存在很多漏检问题,可以减小DB检测后处理的阈值参数[det_db_box_thresh](https://github.com/PaddlePaddle/PaddleOCR/blob/3ec57e8df9263de6fa897e33d2d91bc5d0849ef3/tools/infer/utility.py#L50),防止一些检测框被过滤掉,也可以尝试设置[det_db_score_mode](https://github.com/PaddlePaddle/PaddleOCR/blob/3ec57e8df9263de6fa897e33d2d91bc5d0849ef3/tools/infer/utility.py#L54)为'slow'; +5. 其他方法可以选择[use_dilation](https://github.com/PaddlePaddle/PaddleOCR/blob/3ec57e8df9263de6fa897e33d2d91bc5d0849ef3/tools/infer/utility.py#L53)为True,对检测输出的feature map做膨胀处理,一般情况下,会有效果改善; + +#### Q3.1.75: lite预测库和nb模型版本不匹配,该如何解决? + +**A**: 如果可以正常预测就不用管,如果这个问题导致无法正常预测,可以尝试使用同一个commit的Paddle Lite代码编译预测库和opt文件,可以参考[移动端部署教程](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.1/deploy/lite/readme.md)。 + +#### Q3.1.76: 'SystemError: (Fatal) Blocking queue is killed because the data reader raises an exception.' 遇到这个错如何处理? + +这个报错说明dataloader的时候报错了,如果是还未开始训练就报错,需要检查下数据和标签格式是不是对的,ppocr的数据标签格式为 +``` +" 图像文件名 json.dumps编码的图像标注信息" +ch4_test_images/img_61.jpg [{"transcription": "MASA", "points": [[310, 104], [416, 141], [418, 216], [312, 179]]}, {...}] +``` +提供的标注文件格式如上,中间用"\t"分隔,不是四个空格分隔。 + +如果是训练期间报错了,需要检查下是不是遇到了异常数据,或者是共享内存不足导致了这个问题,可以使用tools/train.py中的test_reader进行调试, +linux系统共享内存位于/dev/shm目录下,如果内存不足,可以清理/dev/shm目录, 另外,如果是使用docker,在创建镜像时,可通过设置参数--shm_size=8G 设置较大的共享内存。 + +#### Q3.1.77: 使用mkldnn加速预测时遇到 'Please compile with MKLDNN first to use MKLDNN' + +**A**: 报错提示当前环境没有mkldnn,建议检查下当前CPU是否支持mlkdnn(MAC上是无法用mkldnn);另外的可能是使用的预测库不支持mkldnn, +建议从[这里](https://paddle-inference.readthedocs.io/en/latest/user_guides/download_lib.html#linux)下载支持mlkdnn的CPU预测库。 + +#### Q3.1.78: 在线demo支持阿拉伯语吗 +**A**: 在线demo目前只支持中英文, 多语言的都需要通过whl包自行处理 + +#### Q3.1.79: 某个类别的样本比较少,通过增加训练的迭代次数或者是epoch,变相增加小样本的数目,这样能缓解这个问题么? +**A**: 尽量保证类别均衡, 某些类别样本少,可以通过补充合成数据的方式处理;实验证明训练集中出现频次较少的字符,识别效果会比较差,增加迭代次数不能改变样本量少的问题。 + +#### Q3.1.80: 想把简历上的文字识别出来后,能够把关系一一对应起来,比如姓名和它后面的名字组成一对,籍贯、邮箱、学历等等都和各自的内容关联起来,这个应该如何处理,PPOCR目前支持吗? +**A**: 这样的需求在企业应用中确实比较常见,但往往都是个性化的需求,没有非常规整统一的处理方式。常见的处理方式有如下两种: +1. 对于单一版式、或者版式差异不大的应用场景,可以基于识别场景的一些先验信息,将识别内容进行配对; 比如运用表单结构信息:常见表单"姓名"关键字的后面,往往紧跟的就是名字信息 +2. 
对于版式多样,或者无固定版式的场景, 需要借助于NLP中的NER技术,给识别内容中的某些字段,赋予key值 + +由于这部分需求和业务场景强相关,难以用一个统一的模型去处理,目前PPOCR暂不支持。 如果需要用到NER技术,可以参照Paddle团队的另一个开源套件: https://github.com/PaddlePaddle/ERNIE, 其提供的预训练模型ERNIE, 可以帮助提升NER任务的准确率。 + + + ### 数据集 #### Q3.2.1:如何制作PaddleOCR支持的数据格式 @@ -576,6 +881,10 @@ StyleText的用途主要是:提取style_image中的字体、背景等style信 #### Q3.2.18: PaddleOCR动态图版本如何finetune? **A**:finetune需要将配置文件里的 Global.load_static_weights设置为false,如果没有此字段可以手动添加,然后将模型地址放到Global.pretrained_model字段下即可。 +#### Q3.2.19: 如何合成手写中文数据集? +**A**: 手写数据集可以通过手写单字数据集合成得到。随机选取一定数量的单字图片和对应的label,将图片高度resize为随机的统一高度后拼接在一起,即可得到合成数据集。对于需要添加文字背景的情况,建议使用阈值化将单字图片的白色背景处理为透明背景,再与真实背景图进行合成。具体可以参考文档[手写数据集](https://github.com/PaddlePaddle/PaddleOCR/blob/a72d6f23be9979e0c103d911a9dca3e4613e8ccf/doc/doc_ch/handwritten_datasets.md)。 + + ### 模型训练调优 @@ -725,8 +1034,52 @@ ps -axu | grep train.py | awk '{print $2}' | xargs kill -9 **A**:1.1和2.0的模型一样,微调时,垂直排列的文字需要逆时针旋转 90°后加入训练,上下颠倒的需要旋转为水平的。 #### Q3.3.30: 模型训练过程中如何得到 best_accuracy 模型? + **A**:配置文件里的eval_batch_step字段用来控制多少次iter进行一次eval,在eval完成后会自动生成 best_accuracy 模型,所以如果希望很快就能拿到best_accuracy模型,可以将eval_batch_step改小一点,如改为[10,10],这样表示第10次迭代后,以后没隔10个迭代就进行一次模型的评估。 +#### Q3.3.31: Cosine学习率的更新策略是怎样的?训练过程中为什么会在一个值上停很久? + +**A**: Cosine学习率的说明可以参考[这里](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/optimizer/lr/CosineAnnealingDecay_cn.html#cosineannealingdecay) + +在PaddleOCR中,为了让学习率更加平缓,我们将其中的epoch调整成了iter。 +学习率的更新会和总的iter数量有关。当iter比较大时,会经过较多iter才能看出学习率的值有变化。 + +#### Q3.3.32: 之前的CosineWarmup方法为什么不见了? + +**A**: 我们对代码结构进行了调整,目前的Cosine可以覆盖原有的CosineWarmup的功能,只需要在配置文件中增加相应配置即可。 +例如下面的代码,可以设置warmup为2个epoch: +``` +lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 +``` + +#### Q3.3.33: 训练识别和检测时学习率要加上warmup,目的是什么? +**A**: Warmup机制先使学习率从一个较小的值逐步升到一个较大的值,而不是直接就使用较大的学习率,这样有助于模型的稳定收敛。在OCR检测和OCR识别中,一般会带来精度~0.5%的提升。 + +#### Q3.3.34: 表格识别中,如何提高单字的识别结果? +**A**: 首先需要确认一下检测模型有没有有效的检测出单个字符,如果没有的话,需要在训练集当中添加相应的单字数据集。 + +#### Q3.3.35: SRN训练不收敛(loss不降)或SRN训练acc一直为0。 +**A**: 如果loss下降不正常,需要确认没有修改yml文件中的image_shape,默认[1, 64, 256],代码中针对这个配置写死了,修改可能会造成无法收敛。如果确认参数无误,loss正常下降,可以多迭代一段时间观察下,开始acc为0是正常的。 + +#### Q3.3.36: 训练starnet网络,印章数据可以和非弯曲数据一起训练吗。 +**A**: 可以的,starnet里的tps模块会对印章图片进行校正,使其和非弯曲的图片一样。 + +#### Q3.3.37: 训练过程中,训练程序意外退出/挂起,应该如何解决? +**A**: 考虑内存,显存(使用GPU训练的话)是否不足,可在配置文件中,将训练和评估的batch size调小一些。需要注意,训练batch size调小时,学习率learning rate也要调小,一般可按等比例调整。 + +#### Q3.3.38: 训练程序启动后直到结束,看不到训练过程log? +**A**: 可以从以下三方面考虑: + 1. 检查训练进程是否正常退出、显存占用是否释放、是否有残留进程,如果确定是训练程序卡死,可以检查环境配置,遇到环境问题建议使用docker,可以参考说明文档[安装](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/installation.md)。 + 2. 检查数据集的数据量是否太小,可调小batch size从而增加一个epoch中的训练step数量,或在训练config文件中,将参数print_batch_step改为1,即每一个step打印一次log信息。 + 3. 如果使用私有数据集训练,可先用PaddleOCR提供/推荐的数据集进行训练,排查私有数据集是否存在问题。 + +#### Q3.3.39: 配置文件中的参数num workers是什么意思,应该如何设置? +**A**: 训练数据的读取需要硬盘IO,而硬盘IO速度远小于GPU运算速度,为了避免数据读取成为训练速度瓶颈,可以使用多进程读取数据,num workers表示数据读取的进程数量,0表示不使用多进程读取。在Linux系统下,多进程读取数据时,进程间通信需要基于共享内存,因此使用多进程读取数据时,建议设置共享内存不低于2GB,最好可以达到8GB,此时,num workers可以设置为CPU核心数。如果机器硬件配置较低,或训练进程卡死、dataloader报错,可以将num workers设置为0,即不使用多进程读取数据。 + + ### 预测部署 @@ -771,10 +1124,6 @@ ps -axu | grep train.py | awk '{print $2}' | xargs kill -9 **A**:在安卓APK上无法设置,没有暴露这个接口,如果使用的是PaddledOCR/deploy/lite/的demo,可以修改config.txt中的对应参数来设置 -#### Q3.4.9:PaddleOCR模型是否可以转换成ONNX模型? 
- -**A**:目前暂不支持转ONNX,相关工作在研发中。 - #### Q3.4.10:使用opt工具对检测模型转换时报错 can not found op arguments for node conv2_b_attr **A**:这个问题大概率是编译opt工具的Paddle-Lite不是develop分支,建议使用Paddle-Lite 的develop分支编译opt工具。 @@ -841,7 +1190,8 @@ ps -axu | grep train.py | awk '{print $2}' | xargs kill -9 **A**:使用EAST或SAST模型进行推理预测时,需要在命令中指定参数--det_algorithm="EAST" 或 --det_algorithm="SAST",使用DB时不用指定是因为该参数默认值是"DB":https://github.com/PaddlePaddle/PaddleOCR/blob/e7a708e9fdaf413ed7a14da8e4a7b4ac0b211e42/tools/infer/utility.py#L43 #### Q3.4.25: PaddleOCR模型Python端预测和C++预测结果不一致? -正常来说,python端预测和C++预测文本是一致的,如果预测结果差异较大, + +**A**:正常来说,python端预测和C++预测文本是一致的,如果预测结果差异较大, 建议首先排查diff出现在检测模型还是识别模型,或者尝试换其他模型是否有类似的问题。 其次,检查python端和C++端数据处理部分是否存在差异,建议保存环境,更新PaddleOCR代码再试下。 如果更新代码或者更新代码都没能解决,建议在PaddleOCR微信群里或者issue中抛出您的问题。 @@ -889,3 +1239,68 @@ Paddle2ONNX支持转换的[模型列表](https://github.com/PaddlePaddle/Paddle2 #### Q3.4.34: 2.0训练出来的模型,能否在1.1版本上进行部署? **A**:这个是不建议的,2.0训练出来的模型建议使用dygraph分支里提供的部署代码。 + +#### Q3.4.35: 怎么解决paddleOCR在T4卡上有越预测越慢的情况? +**A**: +1. T4 GPU没有主动散热,因此在测试的时候需要在每次infer之后需要sleep 30ms,否则机器容易因为过热而降频(inference速度会变慢),温度过高也有可能会导致宕机。 +2. T4在不使用的时候,也有可能会降频,因此在做benchmark的时候需要锁频,下面这两条命令可以进行锁频。 +``` +nvidia-smi -i 0 -pm ENABLED +nvidia-smi --lock-gpu-clocks=1590 -i 0 +``` + +#### Q3.4.36: DB有些框太贴文本了反而去掉了一些文本的边角影响识别,这个问题有什么办法可以缓解吗? + +**A**:可以把后处理的参数unclip_ratio适当调大一点。 + +#### Q3.4.37: 在windows上进行cpp inference的部署时,总是提示找不到`paddle_fluid.dll`和`opencv_world346.dll`, +**A**:有2种方法可以解决这个问题: + +1. 将paddle预测库和opencv库的地址添加到系统环境变量中。 +2. 将提示缺失的dll文件拷贝到编译产出的`ocr_system.exe`文件夹中。 + + +#### Q3.4.38:想在Mac上部署,从哪里下载预测库呢? + +**A**:Mac上的Paddle预测库可以从这里下载:[https://paddle-inference-lib.bj.bcebos.com/mac/2.0.0/cpu_avx_openblas/paddle_inference.tgz](https://paddle-inference-lib.bj.bcebos.com/mac/2.0.0/cpu_avx_openblas/paddle_inference.tgz) + + +#### Q3.4.39:内网环境如何进行服务化部署呢? + +**A**:仍然可以使用PaddleServing或者HubServing进行服务化部署,保证内网地址可以访问即可。 + +#### Q3.4.40: 使用hub_serving部署,延时较高,可能的原因是什么呀? + +**A**: 首先,测试的时候第一张图延时较高,可以多测试几张然后观察后几张图的速度;其次,如果是在cpu端部署serving端模型(如backbone为ResNet34),耗时较慢,建议在cpu端部署mobile(如backbone为MobileNetV3)模型。 + +#### Q3.4.41: PaddleOCR支持tensorrt推理吗? +**A**: 支持的,需要在编译的时候将CMakeLists.txt文件当中,将相关代码`option(WITH_TENSORRT "Compile demo with TensorRT." OFF)`的OFF改成ON。关于服务器端部署的更多设置,可以参考[飞桨官网](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/native_infer.html) + +#### Q3.4.42: 在使用PaddleLite进行预测部署时,启动预测后卡死/手机死机? +**A**: 请检查模型转换时所用PaddleLite的版本,和预测库的版本是否对齐。即PaddleLite版本为2.8,则预测库版本也要为2.8。 + +#### Q3.4.43: 预测时显存爆炸、内存泄漏问题? +**A**: 打开显存/内存优化开关`enable_memory_optim`可以解决该问题,相关代码已合入,[查看详情](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/tools/infer/utility.py#L153)。 + +#### Q3.4.44: 如何多进程预测? +**A**: 近期PaddleOCR新增了[多进程预测控制参数](https://github.com/PaddlePaddle/PaddleOCR/blob/a312647be716776c1aac33ff939ae358a39e8188/tools/infer/utility.py#L103),`use_mp`表示是否使用多进程,`total_process_num`表示在使用多进程时的进程数。具体使用方式请参考[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/inference.md#1-%E8%B6%85%E8%BD%BB%E9%87%8F%E4%B8%AD%E6%96%87ocr%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)。 + +#### Q3.4.45: win下C++部署中文识别乱码的解决方法 +**A**: win下编码格式不是utf8,而ppocr_keys_v1.txt的编码格式的utf8,将ppocr_keys_v1.txt 的编码从utf-8修改为 Ansi 编码格式就行了。 + +#### Q3.4.46: windows 3060显卡GPU模式启动 加载模型慢。 +**A**: 30系列的显卡需要使用cuda11。 + +#### Q3.4.47: 请教如何优化检测阶段时长? + +**A**: 预测单张图会慢一点,如果批量预测,第一张图比较慢,后面就快了,因为最开始一些初始化操作比较耗时。服务部署的话,访问一次后,后面再访问就不会初始化了,推理的话每次都需要初始化的。 + +#### Q3.4.48: paddle serving 本地启动调用失败,怎么判断是否正常工作? 
+ +**A**:没有打印出预测结果,说明启动失败。可以参考这篇文档重新配置下动态图的paddle serving:https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/deploy/pdserving/README_CN.md + +#### Q3.4.49: 同一个模型,c++部署和python部署方式,出来的结果不一致,如何定位? +**A**:有如下几个Debug经验: +1. 优先对一下几个阈值参数是否一致; +2. 排查一下c++代码和python代码的预处理和后处理方式是否一致; +3. 用python在模型输入输出各保存一下二进制文件,排除inference的差异性 diff --git a/doc/doc_ch/add_new_algorithm.md b/doc/doc_ch/add_new_algorithm.md index f66e26b4c13ae19460c44d80b85eb253c2accfde..79c29249dd7dd0b25ffa7625d11ed2378bfafec4 100644 --- a/doc/doc_ch/add_new_algorithm.md +++ b/doc/doc_ch/add_new_algorithm.md @@ -2,16 +2,18 @@ PaddleOCR将一个算法分解为以下几个部分,并对各部分进行模块化处理,方便快速组合出新的算法。 -* 数据加载和处理 -* 网络 -* 后处理 -* 损失函数 -* 指标评估 -* 优化器 +* [1. 数据加载和处理](#1) +* [2. 网络](#2) +* [3. 后处理](#3) +* [4. 损失函数](#4) +* [5. 指标评估](#5) +* [6. 优化器](#6) 下面将分别对每个部分进行介绍,并介绍如何在该部分里添加新算法所需模块。 -## 数据加载和处理 + + +## 1. 数据加载和处理 数据加载和处理由不同的模块(module)组成,其完成了图片的读取、数据增强和label的制作。这一部分在[ppocr/data](../../ppocr/data)下。 各个文件及文件夹作用说明如下: @@ -64,7 +66,9 @@ transforms: keep_keys: [ 'image', 'label' ] # dataloader will return list in this order ``` -## 网络 + + +## 2. 网络 网络部分完成了网络的组网操作,PaddleOCR将网络划分为四部分,这一部分在[ppocr/modeling](../../ppocr/modeling)下。 进入网络的数据将按照顺序(transforms->backbones-> necks->heads)依次通过这四个部分。 @@ -123,7 +127,9 @@ Architecture: args1: args1 ``` -## 后处理 + + +## 3. 后处理 后处理实现解码网络输出获得文本框或者识别到的文字。这一部分在[ppocr/postprocess](../../ppocr/postprocess)下。 PaddleOCR内置了DB,EAST,SAST,CRNN和Attention等算法相关的后处理模块,对于没有内置的组件可通过如下步骤添加: @@ -171,7 +177,9 @@ PostProcess: args2: args2 ``` -## 损失函数 + + +## 4. 损失函数 损失函数用于计算网络输出和label之间的距离。这一部分在[ppocr/losses](../../ppocr/losses)下。 PaddleOCR内置了DB,EAST,SAST,CRNN和Attention等算法相关的损失函数模块,对于没有内置的模块可通过如下步骤添加: @@ -208,7 +216,9 @@ Loss: args2: args2 ``` -## 指标评估 + + +## 5. 指标评估 指标评估用于计算网络在当前batch上的性能。这一部分在[ppocr/metrics](../../ppocr/metrics)下。 PaddleOCR内置了检测,分类和识别等算法相关的指标评估模块,对于没有内置的模块可通过如下步骤添加: @@ -262,7 +272,9 @@ Metric: main_indicator: acc ``` -## 优化器 + + +## 6. 
优化器 优化器用于训练网络。优化器内部还包含了网络正则化和学习率衰减模块。 这一部分在[ppocr/optimizer](../../ppocr/optimizer)下。 PaddleOCR内置了`Momentum`,`Adam` 和`RMSProp`等常用的优化器模块,`Linear`,`Cosine`,`Step`和`Piecewise`等常用的正则化模块与`L1Decay`和`L2Decay`等常用的学习率衰减模块。 diff --git a/doc/doc_ch/algorithm_overview.md b/doc/doc_ch/algorithm_overview.md index 19d7a69c7fb08a8e7fb36c3043aa211de19b9295..af883de86c798babe6ca1616710c0e13546e1045 100755 --- a/doc/doc_ch/algorithm_overview.md +++ b/doc/doc_ch/algorithm_overview.md @@ -9,11 +9,13 @@ ### 1.文本检测算法 PaddleOCR开源的文本检测算法列表: -- [x] DB([paper]( https://arxiv.org/abs/1911.08947)) [2](ppocr推荐) -- [x] EAST([paper](https://arxiv.org/abs/1704.03155))[1] -- [x] SAST([paper](https://arxiv.org/abs/1908.05498))[4] +- [x] DB([paper]( https://arxiv.org/abs/1911.08947))(ppocr推荐) +- [x] EAST([paper](https://arxiv.org/abs/1704.03155)) +- [x] SAST([paper](https://arxiv.org/abs/1908.05498)) +- [x] PSENet([paper](https://arxiv.org/abs/1903.12473v2)) 在ICDAR2015文本检测公开数据集上,算法效果如下: + |模型|骨干网络|precision|recall|Hmean|下载链接| | --- | --- | --- | --- | --- | --- | |EAST|ResNet50_vd|85.80%|86.71%|86.25%|[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| @@ -21,6 +23,8 @@ PaddleOCR开源的文本检测算法列表: |DB|ResNet50_vd|86.41%|78.72%|82.38%|[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar)| |DB|MobileNetV3|77.29%|73.08%|75.12%|[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar)| |SAST|ResNet50_vd|91.39%|83.77%|87.42%|[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)| +|PSE|ResNet50_vd|85.81%|79.53%|82.55%|[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_vd_pse_v2.0_train.tar)| +|PSE|MobileNetV3|82.20%|70.48%|75.89%|[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_mv3_pse_v2.0_train.tar)| 在Total-text文本检测公开数据集上,算法效果如下: @@ -39,13 +43,16 @@ PaddleOCR文本检测算法的训练和使用请参考文档教程中[模型训 ### 2.文本识别算法 PaddleOCR基于动态图开源的文本识别算法列表: -- [x] CRNN([paper](https://arxiv.org/abs/1507.05717))[7](ppocr推荐) -- [x] Rosetta([paper](https://arxiv.org/abs/1910.05085))[10] -- [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html))[11] -- [x] RARE([paper](https://arxiv.org/abs/1603.03915v1))[12] -- [x] SRN([paper](https://arxiv.org/abs/2003.12294))[5] +- [x] CRNN([paper](https://arxiv.org/abs/1507.05717))(ppocr推荐) +- [x] Rosetta([paper](https://arxiv.org/abs/1910.05085)) +- [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html)) +- [x] RARE([paper](https://arxiv.org/abs/1603.03915v1)) +- [x] SRN([paper](https://arxiv.org/abs/2003.12294)) +- [x] NRTR([paper](https://arxiv.org/abs/1806.00926v2)) +- [x] SAR([paper](https://arxiv.org/abs/1811.00751v2)) +- [x] SEED([paper](https://arxiv.org/pdf/2005.10977.pdf)) -参考[DTRB][3](https://arxiv.org/abs/1904.01906)文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: +参考[DTRB](https://arxiv.org/abs/1904.01906) 文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: |模型|骨干网络|Avg Accuracy|模型存储命名|下载链接| |---|---|---|---|---| @@ -58,6 +65,7 @@ PaddleOCR基于动态图开源的文本识别算法列表: |RARE|MobileNetV3|82.5%|rec_mv3_tps_bilstm_att |[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)| |RARE|Resnet34_vd|83.6%|rec_r34_vd_tps_bilstm_att |[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)| |SRN|Resnet50_vd_fpn| 88.52% | rec_r50fpn_vd_none_srn | 
[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar) | - - +|NRTR|NRTR_MTB| 84.3% | rec_mtb_nrtr | [下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar) | +|SAR|Resnet31| 87.2% | rec_r31_sar | [下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) | +|SEED| Aster_Resnet | 85.2% | rec_resnet_stn_bilstm_att | [下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar)| PaddleOCR文本识别算法的训练和使用请参考文档教程中[模型训练/评估中的文本识别部分](./recognition.md)。 diff --git a/doc/doc_ch/angle_class.md b/doc/doc_ch/angle_class.md index ad25a6661817623419af0c0c7a139dd4bfaeb08c..8142d7dcca62511811603303d6a4bfe2e914269b 100644 --- a/doc/doc_ch/angle_class.md +++ b/doc/doc_ch/angle_class.md @@ -1,6 +1,15 @@ -## 文字角度分类 -### 方法介绍 -文字角度分类主要用于图片非0度的场景下,在这种场景下需要对图片里检测到的文本行进行一个转正的操作。在PaddleOCR系统内, +# 文本方向分类器 + +- [1.方法介绍](#方法介绍) +- [2.数据准备](#数据准备) +- [3.启动训练](#启动训练) +- [4.训练](#训练) +- [5.评估](#评估) +- [6.预测](#预测) + + +## 1. 方法介绍 +文本方向分类器主要用于图片非0度的场景下,在这种场景下需要对图片里检测到的文本行进行一个转正的操作。在PaddleOCR系统内, 文字检测之后得到的文本行图片经过仿射变换之后送入识别模型,此时只需要对文字进行一个0和180度的角度分类,因此PaddleOCR内置的 文字角度分类器**只支持了0和180度的分类**。如果想支持更多角度,可以自己修改算法进行支持。 @@ -8,7 +17,8 @@ ![](../imgs_results/angle_class_example.jpg) -### 数据准备 + +## 2. 数据准备 请按如下步骤设置数据集: @@ -59,6 +69,8 @@ train/cls/train/word_002.jpg 180 |- word_003.jpg | ... ``` + +## 3. 启动训练 ### 启动训练 @@ -88,7 +100,8 @@ PaddleOCR提供了多种数据增强方式,如果您希望在训练时加入 *由于OpenCV的兼容性问题,扰动操作暂时只支持linux* -### 训练 + +## 4. 训练 PaddleOCR支持训练和评估交替进行, 可以在 `configs/cls/cls_mv3.yml` 中修改 `eval_batch_step` 设置评估频率,默认每1000个iter评估一次。训练过程中将会保存如下内容: ```bash @@ -106,7 +119,8 @@ PaddleOCR支持训练和评估交替进行, 可以在 `configs/cls/cls_mv3.yml` **注意,预测/评估时的配置文件请务必与训练一致。** -### 评估 + +## 5. 评估 评估数据集可以通过修改`configs/cls/cls_mv3.yml`文件里的`Eval.dataset.label_file_list` 字段设置。 @@ -116,7 +130,8 @@ export CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/cls/cls_mv3.yml -o Global.checkpoints={path/to/weights}/best_accuracy ``` -### 预测 + +## 6. 预测 * 训练引擎的预测 diff --git a/doc/doc_ch/benchmark.md b/doc/doc_ch/benchmark.md index 520a2fcea35ef4bc19ae448517fbfcba61ed60b0..39b9724abe04494c3e61f54597c64400a132e30b 100644 --- a/doc/doc_ch/benchmark.md +++ b/doc/doc_ch/benchmark.md @@ -12,40 +12,27 @@ ## 评估指标 说明: -- v1.0是未添加优化策略的DB+CRNN模型,v1.1是添加多种优化策略和方向分类器的PP-OCR模型。slim_v1.1是使用裁剪或量化的模型。 + - 检测输入图像的的长边尺寸是960。 -- 评估耗时阶段为图像输入到结果输出的完整阶段,包括了图像的预处理和后处理。 +- 评估耗时阶段为图像预测耗时,不包括图像的预处理和后处理。 - `Intel至强6148`为服务器端CPU型号,测试中使用Intel MKL-DNN 加速。 - `骁龙855`为移动端处理平台型号。 -不同预测模型大小和整体识别精度对比 +预测模型大小和整体识别精度对比 | 模型名称 | 整体模型
大小\(M\) | 检测模型
大小\(M\) | 方向分类器
模型大小\(M\) | 识别模型
大小\(M\) | 整体识别
F\-score | |:-:|:-:|:-:|:-:|:-:|:-:| -| ch\_ppocr\_mobile\_v1\.1 | 8\.1 | 2\.6 | 0\.9 | 4\.6 | 0\.5193 | -| ch\_ppocr\_server\_v1\.1 | 155\.1 | 47\.2 | 0\.9 | 107 | 0\.5414 | -| ch\_ppocr\_mobile\_v1\.0 | 8\.6 | 4\.1 | \- | 4\.5 | 0\.393 | -| ch\_ppocr\_server\_v1\.0 | 203\.8 | 98\.5 | \- | 105\.3 | 0\.4436 | - -不同预测模型在T4 GPU上预测速度对比,单位ms - -| 模型名称 | 整体 | 检测 | 方向分类器 | 识别 | -|:-:|:-:|:-:|:-:|:-:| -| ch\_ppocr\_mobile\_v1\.1 | 137 | 35 | 24 | 78 | -| ch\_ppocr\_server\_v1\.1 | 204 | 39 | 25 | 140 | -| ch\_ppocr\_mobile\_v1\.0 | 117 | 41 | \- | 76 | -| ch\_ppocr\_server\_v1\.0 | 199 | 52 | \- | 147 | +| PP-OCRv2 | 11\.6 | 3\.0 | 0\.9 | 8\.6 | 0\.5224 | +| PP-OCR mobile | 8\.1 | 2\.6 | 0\.9 | 4\.6 | 0\.503 | +| PP-OCR server | 155\.1 | 47\.2 | 0\.9 | 107 | 0\.570 | -不同预测模型在CPU上预测速度对比,单位ms -| 模型名称 | 整体 | 检测 | 方向分类器 | 识别 | -|:-:|:-:|:-:|:-:|:-:| -| ch\_ppocr\_mobile\_v1\.1 | 421 | 164 | 51 | 206 | -| ch\_ppocr\_mobile\_v1\.0 | 398 | 219 | \- | 179 | +预测模型在CPU和GPU上的速度对比,单位ms -裁剪量化模型和原始模型模型大小,整体识别精度和在SD 855上预测速度对比 +| 模型名称 | CPU | T4 GPU | +|:-:|:-:|:-:| +| PP-OCRv2 | 330 | 111 | +| PP-OCR mobile | 356 | 11 6| +| PP-OCR server | 1056 | 200 | -| 模型名称 | 整体模型
大小\(M\) | 检测模型
大小\(M\) | 方向分类器
模型大小\(M\) | 识别模型
大小\(M\) | 整体识别
F\-score | SD 855
\(ms\) | -|:-:|:-:|:-:|:-:|:-:|:-:|:-:| -| ch\_ppocr\_mobile\_v1\.1 | 8\.1 | 2\.6 | 0\.9 | 4\.6 | 0\.5193 | 306 | -| ch\_ppocr\_mobile\_slim\_v1\.1 | 3\.5 | 1\.4 | 0\.5 | 1\.6 | 0\.521 | 268 | +更多 PP-OCR 系列模型的预测指标可以参考[PP-OCR Benchmark](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_ch/benchmark.md) diff --git a/doc/doc_ch/config.md b/doc/doc_ch/config.md index 74cd238134d1999a6fbd96d0ad053d0304231a0b..dcd0318ed908375c896d7a6730cd72db4cc4b848 100644 --- a/doc/doc_ch/config.md +++ b/doc/doc_ch/config.md @@ -1,4 +1,12 @@ -## 可选参数列表 +# 配置文件内容与生成 + +* [1. 可选参数列表](#1) +* [2. 配置文件参数介绍](#2) +* [3. 多语言配置文件生成](#3) + + + +## 1. 可选参数列表 以下列表可以通过`--help`查看 @@ -7,8 +15,9 @@ | -c | ALL | 指定配置文件 | None | **配置模块说明请参考 参数介绍** | | -o | ALL | 设置配置文件里的参数内容 | None | 使用-o配置相较于-c选择的配置文件具有更高的优先级。例如:`-o Global.use_gpu=false` | + -## 配置文件参数介绍 +## 2. 配置文件参数介绍 以 `rec_chinese_lite_train_v2.0.yml ` 为例 ### Global @@ -28,10 +37,9 @@ | checkpoints | 加载模型参数路径 | None | 用于中断后加载参数继续训练 | | use_visualdl | 设置是否启用visualdl进行可视化log展示 | False | [教程地址](https://www.paddlepaddle.org.cn/paddle/visualdl) | | infer_img | 设置预测图像路径或文件夹路径 | ./infer_img | \| -| character_dict_path | 设置字典路径 | ./ppocr/utils/ppocr_keys_v1.txt | \ | +| character_dict_path | 设置字典路径 | ./ppocr/utils/ppocr_keys_v1.txt | 如果为空,则默认使用小写字母+数字作为字典 | | max_text_length | 设置文本最大长度 | 25 | \ | -| character_type | 设置字符类型 | ch | en/ch, en时将使用默认dict,ch时使用自定义dict| -| use_space_char | 设置是否识别空格 | True | 仅在 character_type=ch 时支持空格 | +| use_space_char | 设置是否识别空格 | True | | | label_list | 设置方向分类器支持的角度 | ['0','180'] | 仅在方向分类器中生效 | | save_res_path | 设置检测模型的结果保存地址 | ./output/det_db/predicts_db.txt | 仅在检测模型中生效 | @@ -52,7 +60,7 @@ ### Architecture ([ppocr/modeling](../../ppocr/modeling)) -在ppocr中,网络被划分为Transform,Backbone,Neck和Head四个阶段 +在PaddleOCR中,网络被划分为Transform,Backbone,Neck和Head四个阶段 | 字段 | 用途 | 默认值 | 备注 | | :---------------------: | :---------------------: | :--------------: | :--------------------: | @@ -121,3 +129,98 @@ | batch_size_per_card | 训练时单卡batch size | 256 | \ | | drop_last | 是否丢弃因数据集样本数不能被 batch_size 整除而产生的最后一个不完整的mini-batch | True | \ | | num_workers | 用于加载数据的子进程个数,若为0即为不开启子进程,在主进程中进行数据加载 | 8 | \ | + + + +## 3. 多语言配置文件生成 + +PaddleOCR目前已支持80种(除中文外)语种识别,`configs/rec/multi_languages` 路径下提供了一个多语言的配置文件模版: [rec_multi_language_lite_train.yml](../../configs/rec/multi_language/rec_multi_language_lite_train.yml)。 + +您有两种方式创建所需的配置文件: + +1. 通过脚本自动生成 + +[generate_multi_language_configs.py](../../configs/rec/multi_language/generate_multi_language_configs.py) 可以帮助您生成多语言模型的配置文件 + +- 以意大利语为例,如果您的数据是按如下格式准备的: + ``` + |-train_data + |- it_train.txt # 训练集标签 + |- it_val.txt # 验证集标签 + |- data + |- word_001.jpg + |- word_002.jpg + |- word_003.jpg + | ... + ``` + + 可以使用默认参数,生成配置文件: + + ```bash + # 该代码需要在指定目录运行 + cd PaddleOCR/configs/rec/multi_language/ + # 通过-l或者--language参数设置需要生成的语种的配置文件,该命令会将默认参数写入配置文件 + python3 generate_multi_language_configs.py -l it + ``` + +- 如果您的数据放置在其他位置,或希望使用自己的字典,可以通过指定相关参数来生成配置文件: + + ```bash + # -l或者--language字段是必须的 + # --train修改训练集,--val修改验证集,--data_dir修改数据集目录,--dict修改字典路径, -o修改对应默认参数 + cd PaddleOCR/configs/rec/multi_language/ + python3 generate_multi_language_configs.py -l it \ # 语种 + --train {path/of/train_label.txt} \ # 训练标签文件的路径 + --val {path/of/val_label.txt} \ # 验证集标签文件的路径 + --data_dir {train_data/path} \ # 训练数据的根目录 + --dict {path/of/dict} \ # 字典文件路径 + -o Global.use_gpu=False # 是否使用gpu + ... + + ``` + +意大利文由拉丁字母组成,因此执行完命令后会得到名为 rec_latin_lite_train.yml 的配置文件。 + +2. 
手动修改配置文件 + + 您也可以手动修改模版中的以下几个字段得到配置文件: + + ``` + Global: + use_gpu: True + epoch_num: 500 + ... + character_dict_path: {path/of/dict} # 字典文件所在路径 + + Train: + dataset: + name: SimpleDataSet + data_dir: train_data/ # 数据存放根目录 + label_file_list: ["./train_data/train_list.txt"] # 训练集label路径 + ... + + Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/ # 数据存放根目录 + label_file_list: ["./train_data/val_list.txt"] # 验证集label路径 + ... + + ``` + +目前PaddleOCR支持的多语言算法有: + +| 配置文件 | 算法名称 | backbone | trans | seq | pred | language | +| :--------: | :-------: | :-------: | :-------: | :-----: | :-----: | :-----: | +| rec_chinese_cht_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 中文繁体 | +| rec_en_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 英语(区分大小写) | +| rec_french_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 法语 | +| rec_ger_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 德语 | +| rec_japan_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 日语 | +| rec_korean_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 韩语 | +| rec_latin_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 拉丁字母 | +| rec_arabic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 阿拉伯字母 | +| rec_cyrillic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 斯拉夫字母 | +| rec_devanagari_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 梵文字母 | + +更多支持语种请参考: [多语言模型](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/multi_languages.md#%E8%AF%AD%E7%A7%8D%E7%BC%A9%E5%86%99) diff --git a/doc/doc_ch/detection.md b/doc/doc_ch/detection.md index 6fc85992c04123a10ad937f2694b513b50a37876..dc50e8388128017d608a4ff38471d24bcb143bf3 100644 --- a/doc/doc_ch/detection.md +++ b/doc/doc_ch/detection.md @@ -1,11 +1,34 @@ -# 文字检测 -本节以icdar2015数据集为例,介绍PaddleOCR中检测模型的训练、评估与测试。 +# 目录 +- [1. 文字检测](#1-----) + * [1.1 数据准备](#11-----) + * [1.2 下载预训练模型](#12--------) + * [1.3 启动训练](#13-----) + * [1.4 断点训练](#14-----) + * [1.5 更换Backbone 训练](#15---backbone---) + * [1.6 指标评估](#16-----) + * [1.7 测试检测效果](#17-------) + * [1.8 转inference模型测试](#18--inference----) +- [2. FAQ](#2-faq) -## 数据准备 + + +# 1. 文字检测 + +本节以icdar2015数据集为例,介绍PaddleOCR中检测模型训练、评估、测试的使用方式。 + + +## 1.1 数据准备 + +icdar2015 TextLocalization数据集是文本检测的数据集,包含1000张训练图像和500张测试图像。 icdar2015数据集可以从[官网](https://rrc.cvc.uab.es/?ch=4&com=downloads)下载到,首次下载需注册。 +注册完成登陆后,下载下图中红色框标出的部分,其中, `Training Set Images`下载的内容保存为`icdar_c4_train_imgs`文件夹下,`Test Set Images` 下载的内容保存为`ch4_test_images`文件夹下 + +

(图:icdar2015 官网下载页面,红色框标出需要下载的 Training Set Images 与 Test Set Images)

-将下载到的数据集解压到工作目录下,假设解压在 PaddleOCR/train_data/ 下。另外,PaddleOCR将零散的标注文件整理成单独的标注文件 +将下载到的数据集解压到工作目录下,假设解压在 PaddleOCR/train_data/下。另外,PaddleOCR将零散的标注文件整理成单独的标注文件 ,您可以通过wget的方式进行下载。 ```shell # 在PaddleOCR路径下 @@ -23,7 +46,7 @@ python gen_label.py --mode="det" --root_path="/path/to/icdar_c4_train_imgs/" \ --output_label="/path/to/train_icdar2015_label.txt" ``` -解压数据集和下载标注文件后,PaddleOCR/train_data/ 有两个文件夹和两个文件,分别是: +解压数据集和下载标注文件后,PaddleOCR/train_data/ 有两个文件夹和两个文件,按照如下方式组织icdar2015数据集: ``` /PaddleOCR/train_data/icdar2015/text_localization/ └─ icdar_c4_train_imgs/ icdar数据集的训练数据 @@ -42,11 +65,13 @@ json.dumps编码前的图像标注信息是包含多个字典的list,字典中 如果您想在其他数据集上训练,可以按照上述形式构建标注文件。 -## 快速启动训练 + +## 1.2 下载预训练模型 首先下载模型backbone的pretrain model,PaddleOCR的检测模型目前支持两种backbone,分别是MobileNetV3、ResNet_vd系列, -您可以根据需求使用[PaddleClas](https://github.com/PaddlePaddle/PaddleClas/tree/develop/ppcls/modeling/architectures)中的模型更换backbone, -对应的backbone预训练模型可以从[PaddleClas repo 主页中找到下载链接](https://github.com/PaddlePaddle/PaddleClas#mobile-series)。 +您可以根据需求使用[PaddleClas](https://github.com/PaddlePaddle/PaddleClas/tree/release/2.0/ppcls/modeling/architectures)中的模型更换backbone, +对应的backbone预训练模型可以从[PaddleClas repo 主页中找到下载链接](https://github.com/PaddlePaddle/PaddleClas/blob/release%2F2.0/README_cn.md#resnet%E5%8F%8A%E5%85%B6vd%E7%B3%BB%E5%88%97)。 + ```shell cd PaddleOCR/ # 根据backbone的不同选择下载对应的预训练模型 @@ -56,23 +81,23 @@ wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dyg wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet18_vd_pretrained.pdparams # 或,下载ResNet50_vd的预训练模型 wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_ssld_pretrained.pdparams - ``` -#### 启动训练 + +## 1.3 启动训练 *如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false* ```shell # 单机单卡训练 mv3_db 模型 python3 tools/train.py -c configs/det/det_mv3_db.yml \ - -o Global.pretrain_weights=./pretrain_models/MobileNetV3_large_x0_5_pretrained/ + -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained + # 单机多卡训练,通过 --gpus 参数设置使用的GPU ID python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/det_mv3_db.yml \ - -o Global.pretrain_weights=./pretrain_models/MobileNetV3_large_x0_5_pretrained/ + -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained ``` - 上述指令中,通过-c 选择训练使用configs/det/det_db_mv3.yml配置文件。 有关配置文件的详细解释,请参考[链接](./config.md)。 @@ -81,46 +106,122 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/ python3 tools/train.py -c configs/det/det_mv3_db.yml -o Optimizer.base_lr=0.0001 ``` -#### 断点训练 + +## 1.4 断点训练 如果训练程序中断,如果希望加载训练中断的模型从而恢复训练,可以通过指定Global.checkpoints指定要加载的模型路径: ```shell python3 tools/train.py -c configs/det/det_mv3_db.yml -o Global.checkpoints=./your/trained/model +``` + +**注意**:`Global.checkpoints`的优先级高于`Global.pretrained_model`的优先级,即同时指定两个参数时,优先加载`Global.checkpoints`指定的模型,如果`Global.checkpoints`指定的模型路径有误,会加载`Global.pretrained_model`指定的模型。 + + +## 1.5 更换Backbone 训练 + +PaddleOCR将网络划分为四部分,分别在[ppocr/modeling](../../ppocr/modeling)下。 进入网络的数据将按照顺序(transforms->backbones-> +necks->heads)依次通过这四个部分。 + +```bash +├── architectures # 网络的组网代码 +├── transforms # 网络的图像变换模块 +├── backbones # 网络的特征提取模块 +├── necks # 网络的特征增强模块 +└── heads # 网络的输出模块 +``` +如果要更换的Backbone 在PaddleOCR中有对应实现,直接修改配置yml文件中`Backbone`部分的参数即可。 + +如果要使用新的Backbone,更换backbones的例子如下: + +1. 在 [ppocr/modeling/backbones](../../ppocr/modeling/backbones) 文件夹下新建文件,如my_backbone.py。 +2. 
在 my_backbone.py 文件内添加相关代码,示例代码如下: + +```python +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +class MyBackbone(nn.Layer): + def __init__(self, *args, **kwargs): + super(MyBackbone, self).__init__() + # your init code + self.conv = nn.xxxx + + def forward(self, inputs): + # your network forward + y = self.conv(inputs) + return y ``` -**注意**:`Global.checkpoints`的优先级高于`Global.pretrain_weights`的优先级,即同时指定两个参数时,优先加载`Global.checkpoints`指定的模型,如果`Global.checkpoints`指定的模型路径有误,会加载`Global.pretrain_weights`指定的模型。 +3. 在 [ppocr/modeling/backbones/\__init\__.py](../../ppocr/modeling/backbones/__init__.py)文件内导入添加的`MyBackbone`模块,然后修改配置文件中Backbone进行配置即可使用,格式如下: + +```yaml +Backbone: +name: MyBackbone +args1: args1 +``` -## 指标评估 +**注意**:如果要更换网络的其他模块,可以参考[文档](./add_new_algorithm.md)。 -PaddleOCR计算三个OCR检测相关的指标,分别是:Precision、Recall、Hmean。 + +## 1.6 指标评估 -运行如下代码,根据配置文件`det_db_mv3.yml`中`save_res_path`指定的测试集检测结果文件,计算评估指标。 +PaddleOCR计算三个OCR检测相关的指标,分别是:Precision、Recall、Hmean(F-Score)。 -评估时设置后处理参数`box_thresh=0.5`,`unclip_ratio=1.5`,使用不同数据集、不同模型训练,可调整这两个参数进行优化 训练中模型参数默认保存在`Global.save_model_dir`目录下。在评估指标时,需要设置`Global.checkpoints`指向保存的参数文件。 + ```shell -python3 tools/eval.py -c configs/det/det_mv3_db.yml -o Global.checkpoints="{path/to/weights}/best_accuracy" PostProcess.box_thresh=0.5 PostProcess.unclip_ratio=1.5 +python3 tools/eval.py -c configs/det/det_mv3_db.yml -o Global.checkpoints="{path/to/weights}/best_accuracy" ``` - * 注:`box_thresh`、`unclip_ratio`是DB后处理所需要的参数,在评估EAST模型时不需要设置 -## 测试检测效果 + +## 1.7 测试检测效果 测试单张图像的检测效果 ```shell python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o Global.infer_img="./doc/imgs_en/img_10.jpg" Global.pretrained_model="./output/det_db/best_accuracy" ``` -测试DB模型时,调整后处理阈值, +测试DB模型时,调整后处理阈值 ```shell -python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o Global.infer_img="./doc/imgs_en/img_10.jpg" Global.pretrained_model="./output/det_db/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=1.5 +python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o Global.infer_img="./doc/imgs_en/img_10.jpg" Global.pretrained_model="./output/det_db/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=2.0 ``` - 测试文件夹下所有图像的检测效果 ```shell python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o Global.infer_img="./doc/imgs_en/" Global.pretrained_model="./output/det_db/best_accuracy" ``` + + +## 1.8 转inference模型测试 + +inference 模型(`paddle.jit.save`保存的模型) +一般是模型训练,把模型结构和模型参数保存在文件中的固化模型,多用于预测部署场景。 +训练过程中保存的模型是checkpoints模型,保存的只有模型的参数,多用于恢复训练等。 +与checkpoints模型相比,inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 + +检测模型转inference 模型方式: +```shell +# 加载配置文件`det_mv3_db.yml`,从`output/det_db`目录下加载`best_accuracy`模型,inference模型保存在`./output/det_db_inference`目录下 +python3 tools/export_model.py -c configs/det/det_mv3_db.yml -o Global.pretrained_model="./output/det_db/best_accuracy" Global.save_inference_dir="./output/det_db_inference/" +``` + +DB检测模型inference 模型预测: +```shell +python3 tools/infer/predict_det.py --det_algorithm="DB" --det_model_dir="./output/det_db_inference/" --image_dir="./doc/imgs/" --use_gpu=True +``` +如果是其他检测,比如EAST模型,det_algorithm参数需要修改为EAST,默认为DB算法: +```shell +python3 tools/infer/predict_det.py --det_algorithm="EAST" --det_model_dir="./output/det_db_inference/" --image_dir="./doc/imgs/" --use_gpu=True +``` + + +# 2. FAQ + +Q1: 训练模型转inference 模型之后预测效果不一致? 
+**A**:此类问题出现较多,问题多是trained model预测时候的预处理、后处理参数和inference model预测的时候的预处理、后处理参数不一致导致的。以det_mv3_db.yml配置文件训练的模型为例,训练模型、inference模型预测结果不一致问题解决方式如下: +- 检查[trained model预处理](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/configs/det/det_mv3_db.yml#L116),和[inference model的预测预处理](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/tools/infer/predict_det.py#L42)函数是否一致。算法在评估的时候,输入图像大小会影响精度,为了和论文保持一致,训练icdar15配置文件中将图像resize到[736, 1280],但是在inference model预测的时候只有一套默认参数,会考虑到预测速度问题,默认限制图像最长边为960做resize的。训练模型预处理和inference模型的预处理函数位于[ppocr/data/imaug/operators.py](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/ppocr/data/imaug/operators.py#L147) +- 检查[trained model后处理](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/configs/det/det_mv3_db.yml#L51),和[inference 后处理参数](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/tools/infer/utility.py#L50)是否一致。 diff --git a/doc/doc_ch/enhanced_ctc_loss.md b/doc/doc_ch/enhanced_ctc_loss.md new file mode 100644 index 0000000000000000000000000000000000000000..5525c7785f0a8fc642cebc82674400c2487558f9 --- /dev/null +++ b/doc/doc_ch/enhanced_ctc_loss.md @@ -0,0 +1,78 @@ +# Enhanced CTC Loss + +在OCR识别中, CRNN是一种在工业界广泛使用的文字识别算法。 在训练阶段,其采用CTCLoss来计算网络损失; 在推理阶段,其采用CTCDecode来获得解码结果。虽然CRNN算法在实际业务中被证明能够获得很好的识别效果, 然而用户对识别准确率的要求却是无止境的,如何进一步提升文字识别的准确率呢? 本文以CTCLoss为切人点,分别从难例挖掘、 多任务学习、 Metric Learning 3个不同的角度探索了CTCLoss的改进融合方案,提出了EnhancedCTCLoss,其包括如下3个组成部分: Focal-CTC Loss,A-CTC Loss, C-CTC Loss。 + +## 1. Focal-CTC Loss +Focal Loss 出自论文《Focal Loss for Dense Object Detection》, 该loss最先提出的时候主要是为了解决one-stage目标检测中正负样本比例严重失衡的问题。该损失函数降低了大量简单负样本在训练中所占的权重,也可理解为一种困难样本挖掘。 +其损失函数形式如下: +

FL(y') = -α · (1 - y')^γ · log(y')
+ +其中, y' 是经过激活函数的输出,取值在0-1之间。其在原始的交叉熵损失的基础上加了一个调制系数(1 – y’)^ γ和平衡因子α。 当α = 1,y=1时,其损失函数与交叉熵损失的对比如下图所示: +
(图:不同 γ 取值下 Focal Loss 与交叉熵损失的对比曲线)
+ +从上图可以看到, 当γ> 0时,调整系数(1-y’)^γ 赋予易分类样本损失一个更小的权重,使得网络更关注于困难的、错分的样本。 调整因子γ用于调节简单样本权重降低的速率,当γ为0时即为交叉熵损失函数,当γ增加时,调整因子的影响也会随之增大。实验发现γ为2是最优。平衡因子α用来平衡正负样本本身的比例不均,文中α取0.25。 + +对于经典的CTC算法,假设某个特征序列(f1, f2, ......ft), 经过CTC解码之后结果等于label的概率为y’, 则CTC解码结果不为label的概率即为(1-y’);不难发现, CTCLoss值和y’有如下关系: +
CTC_Loss = -ln(y'),即 y' = exp(-CTC_Loss)
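代入几个数值可以得到一个直观的示意(按上式计算):

```
CTC_Loss = 0.05  ->  y' = exp(-0.05) ≈ 0.95   (易样本)
CTC_Loss = 2.3   ->  y' = exp(-2.3)  ≈ 0.10   (难样本)
```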
+ +结合Focal Loss的思想,赋予困难样本较大的权重,简单样本较小的权重,可以使网络更加聚焦于对困难样本的挖掘,进一步提升识别的准确率,由此我们提出了Focal-CTC Loss; 其定义如下所示: +
Focal_CTC_Loss = α · (1 - y')^γ · CTC_Loss
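下面是一个仅作示意的计算草图(假设 `ctc_loss` 为按样本计算、尚未求均值的 CTC 损失张量;`focal_ctc_loss` 等函数名与变量名均为示例,并非 PaddleOCR 的实际接口,实际实现以下文链接的 rec_ctc_loss.py 为准):

```python
import paddle

def focal_ctc_loss(ctc_loss, alpha=1.0, gamma=2.0):
    # ctc_loss: 形状为 [N] 的逐样本 CTC 损失(标签序列的负对数似然)
    y_prime = paddle.exp(-ctc_loss)                    # y' = exp(-CTC_Loss)
    weight = alpha * paddle.pow(1.0 - y_prime, gamma)  # 难样本(y' 小)权重大,易样本权重小
    return paddle.mean(weight * ctc_loss)
```

这样,CTC 损失较大的难样本在总损失中所占比重更高,与上文的出发点一致。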
+ +实验中,γ取值为2, α= 1, 具体实现见: [rec_ctc_loss.py](../../ppocr/losses/rec_ctc_loss.py) + +## 2. A-CTC Loss +A-CTC Loss是CTC Loss + ACE Loss的简称。 其中ACE Loss出自论文< Aggregation Cross-Entropy for Sequence Recognition>. ACE Loss相比于CTCLoss,主要有如下两点优势: ++ ACE Loss能够解决2-D文本的识别问题; CTCLoss只能够处理1-D文本 ++ ACE Loss 在时间复杂度和空间复杂度上优于CTC loss + +前人总结的OCR识别算法的优劣如下图所示: +
(图:前人总结的 CTC、Attention、ACE 等识别方法在 2D 预测能力、内存占用、推理速度等方面的对比)
+ +虽然ACELoss确实如上图所说,可以处理2D预测,在内存占用及推理速度方面具备优势,但在实践过程中,我们发现单独使用ACE Loss, 识别效果并不如CTCLoss. 因此,我们尝试将CTCLoss和ACELoss进行结合,同时以CTCLoss为主,将ACELoss 定位为一个辅助监督loss。 这一尝试收到了效果,在我们内部的实验数据集上,相比单独使用CTCLoss,识别准确率可以提升1%左右。 +A_CTC Loss定义如下: +
A_CTC_Loss = CTC_Loss + λ · ACE_Loss
+ +实验中,λ = 0.1. ACE loss实现代码见: [ace_loss.py](../../ppocr/losses/ace_loss.py) + +## 3. C-CTC Loss +C-CTC Loss是CTC Loss + Center Loss的简称。 其中Center Loss出自论文 < A Discriminative Feature Learning Approach for Deep Face Recognition>. 最早用于人脸识别任务,用于增大类间距离,减小类内距离, 是Metric Learning领域一种较早的、也比较常用的一种算法。 +在中文OCR识别任务中,通过对badcase分析, 我们发现中文识别的一大难点是相似字符多,容易误识。 由此我们想到是否可以借鉴Metric Learing的想法, 增大相似字符的类间距,从而提高识别准确率。然而,MetricLearning主要用于图像识别领域,训练数据的标签为一个固定的值;而对于OCR识别来说,其本质上是一个序列识别任务,特征和label之间并不具有显式的对齐关系,因此两者如何结合依然是一个值得探索的方向。 +通过尝试Arcmargin, Cosmargin等方法, 我们最终发现Centerloss 有助于进一步提升识别的准确率。C_CTC Loss定义如下: +
C_CTC_Loss = CTC_Loss + λ · Center_Loss
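这一组合可以用如下草图示意(假设 `ctc_loss`、`center_loss` 均已按 batch 求出,函数与参数名仅为示例,实际实现以下文链接的 center_loss.py 为准):

```python
def c_ctc_loss(ctc_loss, center_loss, lam=0.25):
    # 以 CTC 为主损失,Center Loss 作为辅助约束,用于拉大形近字符的类间距离
    return ctc_loss + lam * center_loss
```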
+ +实验中,我们设置λ=0.25. center_loss实现代码见: [center_loss.py](../../ppocr/losses/center_loss.py) + +值得一提的是, 在C-CTC Loss中,选择随机初始化Center并不能够带来明显的提升. 我们的Center初始化方法如下: ++ 基于原始的CTCLoss, 训练得到一个网络N ++ 挑选出训练集中,识别完全正确的部分, 组成集合G ++ 将G中的每个样本送入网络,进行前向计算, 提取最后一个FC层的输入(即feature)及其经过argmax计算的结果(即index)之间的对应关系 ++ 将相同index的feature进行聚合,计算平均值,得到各自字符的初始center. + +以配置文件`configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml`为例, center提取命令如下所示: +``` +python tools/export_center.py -c configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml -o Global.pretrained_model: "./output/rec_mobile_pp-OCRv2/best_accuracy" +``` +运行完后,会在PaddleOCR主目录下生成`train_center.pkl`. + +## 4. 实验 +对于上述的三种方案,我们基于百度内部数据集进行了训练、评测,实验情况如下表所示: +|algorithm| Focal_CTC | A_CTC | C-CTC | +|:------| :------| ------: | :------: | +|gain| +0.3% | +0.7% | +1.7% | + +基于上述实验结论,我们在PP-OCRv2中,采用了C-CTC的策略。 值得一提的是,由于PP-OCRv2 处理的是6625个中文字符的识别任务,字符集比较大,形似字较多,所以在该任务上C-CTC 方案带来的提升较大。 但如果换做其他OCR识别任务,结论可能会有所不同。大家可以尝试Focal-CTC,A-CTC, C-CTC以及组合方案EnhancedCTC,相信会带来不同程度的提升效果。 +统一的融合方案见如下文件: [rec_enhanced_ctc_loss.py](../../ppocr/losses/rec_enhanced_ctc_loss.py) diff --git a/doc/doc_ch/environment.md b/doc/doc_ch/environment.md new file mode 100644 index 0000000000000000000000000000000000000000..3a266c4bb8fe5516f844bea9f0aa21359d51660e --- /dev/null +++ b/doc/doc_ch/environment.md @@ -0,0 +1,331 @@ +# 运行环境准备 + +Windows和Mac用户推荐使用Anaconda搭建Python环境,Linux用户建议使用docker搭建PyThon环境。 + +推荐环境: +- PaddlePaddle >= 2.0.0 (2.1.2) +- python3.7 +- CUDA10.1 / CUDA10.2 +- CUDNN 7.6 + +如果对于Python环境熟悉的用户可以直接跳到第2步安装PaddlePaddle。 + +* [1. Python环境搭建](#1) + + [1.1 Windows](#1.1) + + [1.2 Mac](#1.2) + + [1.3 Linux](#1.3) +* [2. 安装PaddlePaddle](#2) + + + +## 1. Python环境搭建 + + + +### 1.1 Windows + +#### 1.1.1 安装Anaconda + +- 说明:使用paddlepaddle需要先安装python环境,这里我们选择python集成环境Anaconda工具包 + - Anaconda是1个常用的python包管理程序 + - 安装完Anaconda后,可以安装python环境,以及numpy等所需的工具包环境。 +- Anaconda下载: + - 地址:https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/?C=M&O=D + - 大部分win10电脑均为64位操作系统,选择x86_64版本;若电脑为32位操作系统,则选择x86.exe + + anaconda download + - 下载完成后,双击安装程序进入图形界面 + - 默认安装位置为C盘,建议将安装位置更改到D盘: + + install config + - 勾选conda加入环境变量,忽略警告: + + add conda to path + +#### 1.1.2 打开终端并创建conda环境 + +- 打开Anaconda Prompt终端:左下角Windows Start Menu -> Anaconda3 -> Anaconda Prompt启动控制台 + + anaconda download + + +- 创建新的conda环境 + + ```shell + # 在命令行输入以下命令,创建名为paddle_env的环境 + # 此处为加速下载,使用清华源 + conda create --name paddle_env python=3.8 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ # 这是一行命令 + ``` + + 该命令会创建1个名为paddle_env、python版本为3.8的可执行环境,根据网络状态,需要花费一段时间 + + 之后命令行中会输出提示信息,输入y并回车继续安装 + + conda create + +- 激活刚创建的conda环境,在命令行中输入以下命令: + + ```shell + # 激活paddle_env环境 + conda activate paddle_env + # 查看当前python的位置 + where python + ``` + + create environment + + + + +以上anaconda环境和python环境安装完毕 + + + +### 1.2 Mac + +#### 1.2.1 安装Anaconda + +- 说明:使用paddlepaddle需要先安装python环境,这里我们选择python集成环境Anaconda工具包 + - Anaconda是1个常用的python包管理程序 + - 安装完Anaconda后,可以安装python环境,以及numpy等所需的工具包环境 +- Anaconda下载: + - 地址:https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/?C=M&O=D + + anaconda download + + - 选择最下方的`Anaconda3-2021.05-MacOSX-x86_64.pkg`下载 +- 下载完成后,双击.pkg文件进入图形界面 + - 按默认设置即可,安装需要花费一段时间 +- 建议安装vscode或pycharm等代码编辑器 + +#### 1.2.2 打开终端并创建conda环境 + +- 打开终端 + + - 同时按下command键和空格键,在聚焦搜索中输入"终端",双击进入终端 + +- **将conda加入环境变量** + + - 加入环境变量是为了让系统能识别conda命令 + + - 输入以下命令,在终端中打开`~/.bash_profile`: + + ```shell + vim ~/.bash_profile + ``` + + - 在`~/.bash_profile`中将conda添加为环境变量: + + ```shell + # 先按i进入编辑模式 + # 在第一行输入: + export PATH="~/opt/anaconda3/bin:$PATH" + # 
若安装时自定义了安装位置,则将~/opt/anaconda3/bin改为自定义的安装目录下的bin文件夹 + ``` + + ```shell + # 修改后的~/.bash_profile文件应如下(其中xxx为用户名): + export PATH="~/opt/anaconda3/bin:$PATH" + # >>> conda initialize >>> + # !! Contents within this block are managed by 'conda init' !! + __conda_setup="$('/Users/xxx/opt/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" + if [ $? -eq 0 ]; then + eval "$__conda_setup" + else + if [ -f "/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" ]; then + . "/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" + else + export PATH="/Users/xxx/opt/anaconda3/bin:$PATH" + fi + fi + unset __conda_setup + # <<< conda initialize <<< + ``` + + - 修改完成后,先按`esc`键退出编辑模式,再输入`:wq!`并回车,以保存退出 + + - 验证是否能识别conda命令: + + - 在终端中输入`source ~/.bash_profile`以更新环境变量 + - 再在终端输入`conda info --envs`,若能显示当前有base环境,则conda已加入环境变量 + +- 创建新的conda环境 + + ```shell + # 在命令行输入以下命令,创建名为paddle_env的环境 + # 此处为加速下载,使用清华源 + conda create --name paddle_env python=3.8 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ + ``` + + - 该命令会创建1个名为paddle_env、python版本为3.8的可执行环境,根据网络状态,需要花费一段时间 + + - 之后命令行中会输出提示信息,输入y并回车继续安装 + + - conda_create + +- 激活刚创建的conda环境,在命令行中输入以下命令: + + ```shell + # 激活paddle_env环境 + conda activate paddle_env + # 查看当前python的位置 + where python + ``` + + conda_actviate + +以上anaconda环境和python环境安装完毕 + + + +### 1.3 Linux + +Linux用户可选择Anaconda或Docker两种方式运行。如果你熟悉Docker且需要训练PaddleOCR模型,推荐使用Docker环境,PaddleOCR的开发流程均在Docker环境下运行。如果你不熟悉Docker,也可以使用Anaconda来运行项目。 + +#### 1.3.1 Anaconda环境配置 + +- 说明:使用paddlepaddle需要先安装python环境,这里我们选择python集成环境Anaconda工具包 + - Anaconda是1个常用的python包管理程序 + - 安装完Anaconda后,可以安装python环境,以及numpy等所需的工具包环境 + +- **下载Anaconda**: + + - 下载地址:https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/?C=M&O=D + + + - 选择适合您操作系统的版本 + - 可在终端输入`uname -m`查询系统所用的指令集 + +- 下载法1:本地下载,再将安装包传到linux服务器上 + +- 下载法2:直接使用linux命令行下载 + + ```shell + # 首先安装wget + sudo apt-get install wget # Ubuntu + sudo yum install wget # CentOS + ``` + + ```shell + # 然后使用wget从清华源上下载 + # 如要下载Anaconda3-2021.05-Linux-x86_64.sh,则下载命令如下: + wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2021.05-Linux-x86_64.sh + + # 若您要下载其他版本,需要将最后1个/后的文件名改成您希望下载的版本 + ``` + +- 安装Anaconda: + + - 在命令行输入`sh Anaconda3-2021.05-Linux-x86_64.sh` + - 若您下载的是其它版本,则将该命令的文件名替换为您下载的文件名 + - 按照安装提示安装即可 + - 查看许可时可输入q来退出 + +- **将conda加入环境变量** + + - 加入环境变量是为了让系统能识别conda命令,若您在安装时已将conda加入环境变量path,则可跳过本步 + + - 在终端中打开`~/.bashrc`: + + ```shell + # 在终端中输入以下命令: + vim ~/.bashrc + ``` + + - 在`~/.bashrc`中将conda添加为环境变量: + + ```shell + # 先按i进入编辑模式 + # 在第一行输入: + export PATH="~/anaconda3/bin:$PATH" + # 若安装时自定义了安装位置,则将~/anaconda3/bin改为自定义的安装目录下的bin文件夹 + ``` + + ```shell + # 修改后的~/.bash_profile文件应如下(其中xxx为用户名): + export PATH="~/opt/anaconda3/bin:$PATH" + # >>> conda initialize >>> + # !! Contents within this block are managed by 'conda init' !! + __conda_setup="$('/Users/xxx/opt/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" + if [ $? -eq 0 ]; then + eval "$__conda_setup" + else + if [ -f "/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" ]; then + . 
"/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" + else + export PATH="/Users/xxx/opt/anaconda3/bin:$PATH" + fi + fi + unset __conda_setup + # <<< conda initialize <<< + ``` + + - 修改完成后,先按`esc`键退出编辑模式,再输入`:wq!`并回车,以保存退出 + + - 验证是否能识别conda命令: + + - 在终端中输入`source ~/.bash_profile`以更新环境变量 + - 再在终端输入`conda info --envs`,若能显示当前有base环境,则conda已加入环境变量 + +- 创建新的conda环境 + + ```shell + # 在命令行输入以下命令,创建名为paddle_env的环境 + # 此处为加速下载,使用清华源 + conda create --name paddle_env python=3.8 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ + ``` + + - 该命令会创建1个名为paddle_env、python版本为3.8的可执行环境,根据网络状态,需要花费一段时间 + + - 之后命令行中会输出提示信息,输入y并回车继续安装 + + conda_create + +- 激活刚创建的conda环境,在命令行中输入以下命令: + + ```shell + # 激活paddle_env环境 + conda activate paddle_env + ``` + + +以上anaconda环境和python环境安装完毕 + +#### 1.3.2 Docker环境配置 + +**注意:第一次使用这个镜像,会自动下载该镜像,请耐心等待。您也可以访问[DockerHub](https://hub.docker.com/r/paddlepaddle/paddle/tags/)获取与您机器适配的镜像。** + +```bash +# 切换到工作目录下 +cd /home/Projects +# 首次运行需创建一个docker容器,再次运行时不需要运行当前命令 +# 创建一个名字为ppocr的docker容器,并将当前目录映射到容器的/paddle目录下 + +#如果您希望在CPU环境下使用docker,使用docker而不是nvidia-docker创建docker +sudo docker run --name ppocr -v $PWD:/paddle --network=host -it registry.baidubce.com/paddlepaddle/paddle:2.1.3-gpu-cuda10.2-cudnn7 /bin/bash + +#如果使用CUDA10,请运行以下命令创建容器,设置docker容器共享内存shm-size为64G,建议设置32G以上 +# 如果是CUDA11+CUDNN8,推荐使用镜像registry.baidubce.com/paddlepaddle/paddle:2.1.3-gpu-cuda11.2-cudnn8 +sudo nvidia-docker run --name ppocr -v $PWD:/paddle --shm-size=64G --network=host -it registry.baidubce.com/paddlepaddle/paddle:2.1.3-gpu-cuda10.2-cudnn7 /bin/bash + +# ctrl+P+Q可退出docker 容器,重新进入docker 容器使用如下命令 +sudo docker container exec -it ppocr /bin/bash +``` + + + +## 2. 安装PaddlePaddle + +- 如果您的机器安装的是CUDA9或CUDA10,请运行以下命令安装 + +```bash +python3 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple +``` + +- 如果您的机器是CPU,请运行以下命令安装 + +```bash +python3 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple +``` + +更多的版本需求,请参照[飞桨官网安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 diff --git a/doc/doc_ch/equation_a_ctc.png b/doc/doc_ch/equation_a_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..ae097610d37a88e76edefdbeb81df8403e94215f Binary files /dev/null and b/doc/doc_ch/equation_a_ctc.png differ diff --git a/doc/doc_ch/equation_c_ctc.png b/doc/doc_ch/equation_c_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..67207a9937481f4920af3cbafbe1bfe8d27ee5dc Binary files /dev/null and b/doc/doc_ch/equation_c_ctc.png differ diff --git a/doc/doc_ch/equation_ctcloss.png b/doc/doc_ch/equation_ctcloss.png new file mode 100644 index 0000000000000000000000000000000000000000..33ad92c9e4567d2a4a0c8fc3b2a0bf3fba5ea8f2 Binary files /dev/null and b/doc/doc_ch/equation_ctcloss.png differ diff --git a/doc/doc_ch/equation_focal_ctc.png b/doc/doc_ch/equation_focal_ctc.png new file mode 100644 index 0000000000000000000000000000000000000000..6ba1e8715d5876705ef429e48b5c94388fd41398 Binary files /dev/null and b/doc/doc_ch/equation_focal_ctc.png differ diff --git a/doc/doc_ch/focal_loss_formula.png b/doc/doc_ch/focal_loss_formula.png new file mode 100644 index 0000000000000000000000000000000000000000..971cebcd082cf5e19f9246f02216c0c14896bdc9 Binary files /dev/null and b/doc/doc_ch/focal_loss_formula.png differ diff --git a/doc/doc_ch/focal_loss_image.png b/doc/doc_ch/focal_loss_image.png new file mode 100644 index 0000000000000000000000000000000000000000..430550a732d4e2769151771bc85ae889dfc78fda Binary files /dev/null and 
b/doc/doc_ch/focal_loss_image.png differ diff --git a/doc/doc_ch/inference.md b/doc/doc_ch/inference.md index b9be1e4cb2d1b256a05b82ef5d6db49dfcb2f31f..4e0f1d131e2547f0d4a8bdf35c0f4a6f8bf2e7a3 100755 --- a/doc/doc_ch/inference.md +++ b/doc/doc_ch/inference.md @@ -273,7 +273,7 @@ python3 tools/export_model.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o G CRNN 文本识别模型推理,可以执行如下命令: ``` -python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/rec_crnn/" --rec_image_shape="3, 32, 100" --rec_char_type="en" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/rec_crnn/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt" ``` ![](../imgs_words_en/word_336.png) @@ -288,7 +288,7 @@ Predicts of ./doc/imgs_words_en/word_336.png:('super', 0.9999073) - 训练时采用的图像分辨率不同,训练上述模型采用的图像分辨率是[3,32,100],而中文模型训练时,为了保证长文本的识别效果,训练时采用的图像分辨率是[3, 32, 320]。预测推理程序默认的的形状参数是训练中文采用的图像分辨率,即[3, 32, 320]。因此,这里推理上述英文模型时,需要通过参数rec_image_shape设置识别图像的形状。 -- 字符列表,DTRB论文中实验只是针对26个小写英文本母和10个数字进行实验,总共36个字符。所有大小字符都转成了小写字符,不在上面列表的字符都忽略,认为是空格。因此这里没有输入字符字典,而是通过如下命令生成字典.因此在推理时需要设置参数rec_char_type,指定为英文"en"。 +- 字符列表,DTRB论文中实验只是针对26个小写英文本母和10个数字进行实验,总共36个字符。所有大小字符都转成了小写字符,不在上面列表的字符都忽略,认为是空格。因此这里没有输入字符字典,而是通过如下命令生成字典.因此在推理时需要设置参数rec_char_dict_path,指定为英文字典"./ppocr/utils/ic15_dict.txt"。 ``` self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" @@ -303,15 +303,15 @@ dict_character = list(self.character_str) python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" \ --rec_model_dir="./inference/srn/" \ --rec_image_shape="1, 64, 256" \ - --rec_char_type="en" \ + --rec_char_dict_path="./ppocr/utils/ic15_dict.txt" \ --rec_algorithm="SRN" ``` ### 4. 
自定义文本识别字典的推理 -如果训练时修改了文本的字典,在使用inference模型预测时,需要通过`--rec_char_dict_path`指定使用的字典路径,并且设置 `rec_char_type=ch` +如果训练时修改了文本的字典,在使用inference模型预测时,需要通过`--rec_char_dict_path`指定使用的字典路径 ``` -python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./your inference model" --rec_image_shape="3, 32, 100" --rec_char_type="ch" --rec_char_dict_path="your text dict path" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./your inference model" --rec_image_shape="3, 32, 100" --rec_char_dict_path="your text dict path" ``` @@ -320,7 +320,7 @@ python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png 需要通过 `--vis_font_path` 指定可视化的字体路径,`doc/fonts/` 路径下有默认提供的小语种字体,例如韩文识别: ``` -python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/korean/1.jpg" --rec_model_dir="./your inference model" --rec_char_type="korean" --rec_char_dict_path="ppocr/utils/dict/korean_dict.txt" --vis_font_path="doc/fonts/korean.ttf" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/korean/1.jpg" --rec_model_dir="./your inference model" --rec_char_dict_path="ppocr/utils/dict/korean_dict.txt" --vis_font_path="doc/fonts/korean.ttf" ``` ![](../imgs_words/korean/1.jpg) @@ -388,7 +388,7 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --de 下面给出基于EAST文本检测和STAR-Net文本识别执行命令: ``` -python3 tools/infer/predict_system.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_east/" --det_algorithm="EAST" --rec_model_dir="./inference/starnet/" --rec_image_shape="3, 32, 100" --rec_char_type="en" +python3 tools/infer/predict_system.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_east/" --det_algorithm="EAST" --rec_model_dir="./inference/starnet/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt" ``` 执行命令后,识别结果图像如下: diff --git a/doc/doc_ch/inference_ppocr.md b/doc/doc_ch/inference_ppocr.md new file mode 100644 index 0000000000000000000000000000000000000000..493a4c9868621b762895e1ee11f76ac250918453 --- /dev/null +++ b/doc/doc_ch/inference_ppocr.md @@ -0,0 +1,136 @@ +# PP-OCR模型库快速推理 + +本文介绍针对PP-OCR模型库的Python推理引擎使用方法,内容依次为文本检测、文本识别、方向分类器以及三者串联在CPU、GPU上的预测方法。 + + +- [1. 文本检测模型推理](#文本检测模型推理) + +- [2. 文本识别模型推理](#文本识别模型推理) + - [2.1 超轻量中文识别模型推理](#超轻量中文识别模型推理) + - [2.2 多语言模型的推理](#多语言模型的推理) + +- [3. 方向分类模型推理](#方向分类模型推理) + +- [4. 文本检测、方向分类和文字识别串联推理](#文本检测、方向分类和文字识别串联推理) + + + +## 1. 
文本检测模型推理 + +文本检测模型推理,默认使用DB模型的配置参数。超轻量中文检测模型推理,可以执行如下命令: + +``` +# 下载超轻量中文检测模型: +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tartar xf ch_ppocr_mobile_v2.0_det_infer.tarpython3 tools/infer/predict_det.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_ppocr_mobile_v2.0_det_infer/" +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: + +![](/Users/zhulingfeng01/OCR/PaddleOCR/doc/imgs_results/det_res_00018069.jpg) + +通过参数`limit_type`和`det_limit_side_len`来对图片的尺寸进行限制, +`limit_type`可选参数为[`max`, `min`], +`det_limit_size_len` 为正整数,一般设置为32 的倍数,比如960。 + +参数默认设置为`limit_type='max', det_limit_side_len=960`。表示网络输入图像的最长边不能超过960, +如果超过这个值,会对图像做等宽比的resize操作,确保最长边为`det_limit_side_len`。 +设置为`limit_type='min', det_limit_side_len=960` 则表示限制图像的最短边为960。 + +如果输入图片的分辨率比较大,而且想使用更大的分辨率预测,可以设置det_limit_side_len 为想要的值,比如1216: + +``` +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./inference/det_db/" --det_limit_type=max --det_limit_side_len=1216 +``` + +如果想使用CPU进行预测,执行命令如下 + +``` +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./inference/det_db/" --use_gpu=False +``` + + + + + +## 2. 文本识别模型推理 + + + +### 2.1 超轻量中文识别模型推理 + +超轻量中文识别模型推理,可以执行如下命令: + +``` +# 下载超轻量中文识别模型: +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar +tar xf ch_ppocr_mobile_v2.0_rec_infer.tar +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/ch/word_4.jpg" --rec_model_dir="ch_ppocr_mobile_v2.0_rec_infer" +``` + +![](/Users/zhulingfeng01/OCR/PaddleOCR/doc/imgs_words/ch/word_4.jpg) + +执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下: + +```bash +Predicts of ./doc/imgs_words/ch/word_4.jpg:('实力活力', 0.98458153) +``` + + + +### 2.2 多语言模型的推理 + +如果您需要预测的是其他语言模型,在使用inference模型预测时,需要通过`--rec_char_dict_path`指定使用的字典路径, 同时为了得到正确的可视化结果, +需要通过 `--vis_font_path` 指定可视化的字体路径,`doc/fonts/` 路径下有默认提供的小语种字体,例如韩文识别: + +``` +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/korean/1.jpg" --rec_model_dir="./your inference model" --rec_char_type="korean" --rec_char_dict_path="ppocr/utils/dict/korean_dict.txt" --vis_font_path="doc/fonts/korean.ttf" +``` + +![](/Users/zhulingfeng01/OCR/PaddleOCR/doc/imgs_words/korean/1.jpg) + +执行命令后,上图的预测结果为: + +``` text +Predicts of ./doc/imgs_words/korean/1.jpg:('바탕으로', 0.9948904) +``` + + + +## 3. 方向分类模型推理 + +方向分类模型推理,可以执行如下命令: + +``` +# 下载超轻量中文方向分类器模型: +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar xf ch_ppocr_mobile_v2.0_cls_infer.tar +python3 tools/infer/predict_cls.py --image_dir="./doc/imgs_words/ch/word_4.jpg" --cls_model_dir="ch_ppocr_mobile_v2.0_cls_infer" +``` + +![](/Users/zhulingfeng01/OCR/PaddleOCR/doc/imgs_words/ch/word_1.jpg) + +执行命令后,上面图像的预测结果(分类的方向和得分)会打印到屏幕上,示例如下: + +``` +Predicts of ./doc/imgs_words/ch/word_4.jpg:['0', 0.9999982] +``` + + + +## 4. 
文本检测、方向分类和文字识别串联推理 + +以超轻量中文OCR模型推理为例,在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径、参数`det_model_dir`,`cls_model_dir`和`rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。`use_mp`表示是否使用多进程。`total_process_num`表示在使用多进程时的进程数。可视化识别结果默认保存到 ./inference_results 文件夹里面。 + +```shell +# 使用方向分类器 +python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --cls_model_dir="./inference/cls/" --rec_model_dir="./inference/rec_crnn/" --use_angle_cls=true +# 不使用方向分类器 +python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --rec_model_dir="./inference/rec_crnn/" --use_angle_cls=false +# 使用多进程 +python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --rec_model_dir="./inference/rec_crnn/" --use_angle_cls=false --use_mp=True --total_process_num=6 +``` + +执行命令后,识别结果图像如下: + +![](/Users/zhulingfeng01/OCR/PaddleOCR/doc/imgs_results/system_res_00018069.jpg) + diff --git a/doc/doc_ch/knowledge_distillation.md b/doc/doc_ch/knowledge_distillation.md index b561f718491011e8dddcd44e66bfd6da62101ba6..b2772454d90ba40e5d65e035d083f8fcd79f69af 100644 --- a/doc/doc_ch/knowledge_distillation.md +++ b/doc/doc_ch/knowledge_distillation.md @@ -39,7 +39,7 @@ PaddleOCR中集成了知识蒸馏的算法,具体地,有以下几个主要 ### 2.1 识别配置文件解析 -配置文件在[rec_chinese_lite_train_distillation_v2.1.yml](../../configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml)。 +配置文件在[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)。 #### 2.1.1 模型结构 @@ -246,6 +246,39 @@ Metric: 关于`DistillationMetric`更加具体的实现可以参考: [distillation_metric.py](../../ppocr/metrics/distillation_metric.py#L24)。 +#### 2.1.5 蒸馏模型微调 + +对蒸馏得到的识别蒸馏进行微调有2种方式。 + +(1)基于知识蒸馏的微调:这种情况比较简单,下载预训练模型,在[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)中配置好预训练模型路径以及自己的数据路径,即可进行模型微调训练。 + +(2)微调时不使用知识蒸馏:这种情况,需要首先将预训练模型中的学生模型参数提取出来,具体步骤如下。 + +* 首先下载预训练模型并解压。 +```shell +# 下面预训练模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar +tar -xf ch_PP-OCRv2_rec_train.tar +``` + +* 然后使用python,对其中的学生模型参数进行提取 + +```python +import paddle +# 加载预训练模型 +all_params = paddle.load("ch_PP-OCRv2_rec_train/best_accuracy.pdparams") +# 查看权重参数的keys +print(all_params.keys()) +# 学生模型的权重提取 +s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." in key} +# 查看学生模型权重参数的keys +print(s_params.keys()) +# 保存 +paddle.save(s_params, "ch_PP-OCRv2_rec_train/student.pdparams") +``` + +转化完成之后,使用[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml),修改预训练模型的路径(为导出的`student.pdparams`模型路径)以及自己的数据路径,即可进行模型微调。 + ### 2.2 检测配置文件解析 * coming soon! diff --git a/doc/doc_ch/models_and_config.md b/doc/doc_ch/models_and_config.md new file mode 100644 index 0000000000000000000000000000000000000000..89afc89a99bed364fd2abe247946dfe9e552ae86 --- /dev/null +++ b/doc/doc_ch/models_and_config.md @@ -0,0 +1,47 @@ + +# PP-OCR模型与配置文件 +PP-OCR模型与配置文件一章主要补充一些OCR模型的基本概念、配置文件的内容与作用以便对模型后续的参数调整和训练中拥有更好的体验。 + +本章包含三个部分,首先在[PP-OCR模型下载](./models_list.md)中解释PP-OCR模型的类型概念,并提供所有模型的下载链接。然后在[配置文件内容与生成](./config.md)中详细说明调整PP-OCR模型所需的参数。最后的[模型库快速使用](./inference_ppocr.md)是对第一节PP-OCR模型库使用方法的介绍,可以通过Python推理引擎快速利用丰富的模型库模型获得测试结果。 + +------ + +下面我们首先了解一些OCR相关的基本概念: + +- [1. OCR 简要介绍](#1-ocr-----) + * [1.1 OCR 检测模型基本概念](#11-ocr---------) + * [1.2 OCR 识别模型基本概念](#12-ocr---------) + * [1.3 PP-OCR模型](#13-pp-ocr--) + + +## 1. 
 ### 2.2 检测配置文件解析
 
 * coming soon!
diff --git a/doc/doc_ch/models_and_config.md b/doc/doc_ch/models_and_config.md
new file mode 100644
index 0000000000000000000000000000000000000000..89afc89a99bed364fd2abe247946dfe9e552ae86
--- /dev/null
+++ b/doc/doc_ch/models_and_config.md
@@ -0,0 +1,47 @@
+
+# PP-OCR模型与配置文件
+PP-OCR模型与配置文件一章主要补充一些OCR模型的基本概念和配置文件的内容与作用,以便在后续的模型参数调整和训练中拥有更好的体验。
+
+本章包含三个部分,首先在[PP-OCR模型下载](./models_list.md)中解释PP-OCR模型的类型概念,并提供所有模型的下载链接。然后在[配置文件内容与生成](./config.md)中详细说明调整PP-OCR模型所需的参数。最后的[模型库快速使用](./inference_ppocr.md)是对第一节PP-OCR模型库使用方法的介绍,可以通过Python推理引擎快速利用丰富的模型库模型获得测试结果。
+
+------
+
+下面我们首先了解一些OCR相关的基本概念:
+
+- [1. OCR 简要介绍](#1-ocr-----)
+  * [1.1 OCR 检测模型基本概念](#11-ocr---------)
+  * [1.2 OCR 识别模型基本概念](#12-ocr---------)
+  * [1.3 PP-OCR模型](#13-pp-ocr--)
+
+
+## 1. OCR 简要介绍
+本节简要介绍OCR检测模型、识别模型的基本概念,并介绍PaddleOCR的PP-OCR模型。
+
+OCR(Optical Character Recognition,光学字符识别)目前是文字识别的统称,已不限于文档或书本文字识别,更包括识别自然场景下的文字,又可以称为STR(Scene Text Recognition)。
+
+OCR文字识别一般包括两个部分:文本检测和文本识别。文本检测首先利用检测算法检测到图像中的文本行,然后对检测到的文本行使用识别算法识别出具体文字。
+
+
+### 1.1 OCR 检测模型基本概念
+
+文本检测就是要定位图像中的文字区域,然后通常以边界框的形式将单词或文本行标记出来。传统的文字检测算法多是通过手工提取特征的方式,特点是速度快,简单场景效果好,但是面对自然场景,效果会大打折扣。当前多是采用深度学习方法来做。
+
+基于深度学习的文本检测算法可以大致分为以下几类:
+1. 基于目标检测的方法;一般是预测得到文本框后,通过NMS筛选得到最终文本框,多是四点文本框,对弯曲文本场景效果不理想。典型算法为EAST、TextBoxes等方法。
+2. 基于分割的方法;将文本行当成分割目标,然后通过分割结果构建外接文本框,可以处理弯曲文本,但对于文本交叉场景效果不理想。典型算法为DB、PSENet等方法。
+3. 混合目标检测和分割的方法。
+
+
+### 1.2 OCR 识别模型基本概念
+
+OCR识别算法的输入数据一般是文本行,背景信息不多,文字占据主要部分,识别算法目前可以分为两类:
+1. 基于CTC的方法;即识别算法的文字预测模块是基于CTC的,常用的算法组合为CNN+RNN+CTC。目前也有一些算法尝试在网络中加入transformer模块等等。
+2. 基于Attention的方法;即识别算法的文字预测模块是基于Attention的,常用算法组合是CNN+RNN+Attention。
+
+
+### 1.3 PP-OCR模型
+
+PaddleOCR 中集成了很多OCR算法,文本检测算法有DB、EAST、SAST等等,文本识别算法有CRNN、RARE、StarNet、Rosetta、SRN等算法。
+
+其中PaddleOCR针对中英文自然场景通用OCR,推出了PP-OCR系列模型,PP-OCR模型由DB+CRNN算法组成,利用海量中文数据训练加上模型调优方法,在中文场景上具备较高的文本检测识别能力。并且PaddleOCR推出了高精度超轻量PP-OCRv2模型,检测模型仅3M,识别模型仅8.5M,利用[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)的模型量化方法,可以在保持精度不降低的情况下,将检测模型压缩到0.8M,识别模型压缩到3M,更加适用于移动端部署场景。
+
diff --git a/doc/doc_ch/models_list.md b/doc/doc_ch/models_list.md
index 35713ae67f797618e043697eb93642208c3df865..5e78795bcda96cf005f24b97cfcc0a8580b2ae1e 100644
--- a/doc/doc_ch/models_list.md
+++ b/doc/doc_ch/models_list.md
@@ -1,8 +1,9 @@
-## OCR模型列表(V2.0,2021年1月20日更新)
+## OCR模型列表(V2.1,2021年9月6日更新)
 
 > **说明**
-> 1. 2.0版模型和[1.1版模型](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/models_list.md) 的主要区别在于动态图训练vs.静态图训练,模型性能上无明显差距。
-> 2. 本文档提供的是PPOCR自研模型列表,更多基于公开数据集的算法介绍与预训练模型可以参考:[算法概览文档](./algorithm_overview.md)。
+> 1. 2.1版模型相比2.0版模型,在模型精度上有进一步提升。
+> 2. 2.0版模型和[1.1版模型](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/models_list.md) 的主要区别在于动态图训练vs.静态图训练,模型性能上无明显差距。
+> 3. 
本文档提供的是PPOCR自研模型列表,更多基于公开数据集的算法介绍与预训练模型可以参考:[算法概览文档](./algorithm_overview.md)。 - [一、文本检测模型](#文本检测模型) @@ -32,6 +33,8 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | +|ch_PP-OCRv2_det_slim|slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml)| 3M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)| +|ch_PP-OCRv2_det|原始超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml)|3M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| |ch_ppocr_mobile_slim_v2.0_det|slim裁剪版超轻量模型,支持中英文、多语种文本检测|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)| 2.6M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar)| |ch_ppocr_mobile_v2.0_det|原始超轻量模型,支持中英文、多语种文本检测|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|3M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)| |ch_ppocr_server_v2.0_det|通用模型,支持中英文、多语种文本检测,比超轻量模型更大,但效果更好|[ch_det_res18_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml)|47M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar)| @@ -45,6 +48,8 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | +|ch_PP-OCRv2_rec_slim|slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | +|ch_PP-OCRv2_rec|原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)|8.5M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | |ch_ppocr_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型,支持中英文、数字识别|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) | |ch_ppocr_mobile_v2.0_rec|原始超轻量模型,支持中英文、数字识别|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)|5.2M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | |ch_ppocr_server_v2.0_rec|通用模型,支持中英文、数字识别|[rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml)|94.8M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) / 
[预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | @@ -62,46 +67,6 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 #### 3. 多语言识别模型(更多语言持续更新中...) -**说明:** 新增的多语言模型的配置文件通过代码方式生成,您可以通过`--help`参数查看当前PaddleOCR支持生成哪些多语言的配置文件: -```bash -# 该代码需要在指定目录运行 -cd {your/path/}PaddleOCR/configs/rec/multi_language/ -python3 generate_multi_language_configs.py --help -``` -下面以生成意大利语配置文件为例: -##### 1. 生成意大利语配置文件测试现有模型 - -如果您仅仅想用配置文件测试PaddleOCR提供的多语言模型可以通过下面命令生成默认的配置文件,使用PaddleOCR提供的小语种字典进行预测。 -```bash -# 该代码需要在指定目录运行 -cd {your/path/}PaddleOCR/configs/rec/multi_language/ -# 通过-l或者--language参数设置需要生成的语种的配置文件,该命令会将默认参数写入配置文件 -python3 generate_multi_language_configs.py -l it -``` -##### 2. 生成意大利语配置文件训练自己的数据 -如果您想训练自己的小语种模型,可以准备好训练集文件、验证集文件、字典文件和训练数据路径,这里假设准备的意大利语的训练集、验证集、字典和训练数据路径为: -- 训练集:{your/path/}PaddleOCR/train_data/train_list.txt -- 验证集:{your/path/}PaddleOCR/train_data/val_list.txt -- 使用PaddleOCR提供的默认字典:{your/path/}PaddleOCR/ppocr/utils/dict/it_dict.txt -- 训练数据路径:{your/path/}PaddleOCR/train_data - -使用以下命令生成配置文件: -```bash -# 该代码需要在指定目录运行 -cd {your/path/}PaddleOCR/configs/rec/multi_language/ -# -l或者--language字段是必须的 -# --train修改训练集,--val修改验证集,--data_dir修改数据集目录,-o修改对应默认参数 -# --dict命令改变字典路径,示例使用默认字典路径则该参数可不填 -python3 generate_multi_language_configs.py -l it \ ---train train_data/train_list.txt \ ---val train_data/val_list.txt \ ---data_dir train_data \ --o Global.use_gpu=False -``` - - -##### 3. 多语言模型与配置文件 - |模型名称|字典文件|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- |--- | --- | | french_mobile_v2.0_rec | ppocr/utils/dict/french_dict.txt |法文识别|[rec_french_lite_train.yml](../../configs/rec/multi_language/rec_french_lite_train.yml)|2.65M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_train.tar) | @@ -125,13 +90,16 @@ python3 generate_multi_language_configs.py -l it \ |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | -|ch_ppocr_mobile_slim_v2.0_cls|slim量化版模型|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)| 2.1M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) | -|ch_ppocr_mobile_v2.0_cls|原始模型|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)|1.38M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | +|ch_ppocr_mobile_slim_v2.0_cls|slim量化版模型,对检测到的文本行文字角度分类|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)| 2.1M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) | +|ch_ppocr_mobile_v2.0_cls|原始分类器模型,对检测到的文本行文字角度分类|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)|1.38M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | + ### 四、Paddle-Lite 模型 |模型版本|模型简介|模型大小|检测模型|文本方向分类模型|识别模型|Paddle-Lite版本| |---|---|---|---|---|---|---| -|V2.0|超轻量中文OCR 
移动端模型|7.8M|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_opt.nb)|v2.9| -|V2.0(slim)|超轻量中文OCR 移动端模型|3.3M|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_slim_opt.nb)|v2.9| +|PP-OCRv2|蒸馏版超轻量中文OCR移动端模型|11M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer_opt.nb)|v2.9| +|PP-OCRv2(slim)|蒸馏版超轻量中文OCR移动端模型|4.9M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_opt.nb)|v2.9| +|V2.0|ppocr_v2.0超轻量中文OCR移动端模型|7.8M|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_opt.nb)|v2.9| +|V2.0(slim)|ppocr_v2.0超轻量中文OCR移动端模型|3.3M|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_slim_opt.nb)|v2.9| diff --git a/doc/doc_ch/multi_languages.md b/doc/doc_ch/multi_languages.md index 1bebdb58fa025d5b6891fd1579285b5db956a2d4..af9ff82e357e5945bfddf10337d0af3cd04390a0 100644 --- a/doc/doc_ch/multi_languages.md +++ b/doc/doc_ch/multi_languages.md @@ -200,9 +200,9 @@ ppocr 支持使用自己的数据进行自定义训练或finetune, 其中识别 |英文|english|en| |乌克兰文|Ukranian|uk| |法文|french|fr| |白俄罗斯文|Belarusian|be| |德文|german|german| |泰卢固文|Telugu |te| -|日文|japan|japan| | |阿巴扎文|Abaza |abq| +|日文|japan|japan| | 阿巴扎文 | Abaza | abq | |韩文|korean|korean| |泰米尔文|Tamil |ta| -|中文繁体|chinese traditional |ch_tra| |南非荷兰文 |Afrikaans |af| +|中文繁体|chinese traditional |chinese_cht| |南非荷兰文 |Afrikaans |af| |意大利文| Italian |it| |阿塞拜疆文 |Azerbaijani |az| |西班牙文|Spanish |es| |波斯尼亚文|Bosnian|bs| |葡萄牙文| Portuguese|pt| |捷克文|Czech|cs| diff --git a/doc/doc_ch/paddleOCR_overview.md b/doc/doc_ch/paddleOCR_overview.md new file mode 100644 index 0000000000000000000000000000000000000000..f49c1ae302607ff6629da2462f91a36793b4db3a --- /dev/null +++ b/doc/doc_ch/paddleOCR_overview.md @@ -0,0 +1,33 @@ +# PaddleOCR全景图与项目克隆 + +## 1. PaddleOCR全景图 + +PaddleOCR包含丰富的文本检测、文本识别以及端到端算法。结合实际测试与产业经验,PaddleOCR选择DB和CRNN作为基础的检测和识别模型,经过一系列优化策略提出面向产业应用的PP-OCR模型。PP-OCR模型针对通用场景,根据不同语种形成了PP-OCR模型库。基于PP-OCR的能力,PaddleOCR针对文档场景任务发布PP-Structure工具库,包含版面分析和表格识别两大任务。为了打通产业落地的全流程,PaddleOCR提供了规模化的数据生产工具和多种预测部署工具,助力开发者快速落地。 + +
+ +
+ +## 2. 项目克隆 + +### **2.1 克隆PaddleOCR repo代码** + +``` +【推荐】git clone https://github.com/PaddlePaddle/PaddleOCR +``` + +如果因为网络问题无法pull成功,也可选择使用码云上的托管: + +``` +git clone https://gitee.com/paddlepaddle/PaddleOCR +``` + +注:码云托管代码可能无法实时同步本github项目更新,存在3~5天延时,请优先使用推荐方式。 + +### **2.2 安装第三方库** + +``` +cd PaddleOCR +pip3 install -r requirements.txt +``` + diff --git a/doc/doc_ch/quickstart.md b/doc/doc_ch/quickstart.md index d9460989336118bfde6cafb5cc2a7f1d0b6b8691..1896d7a137f0768c6b2a8e0c02b18ff61fbfd03c 100644 --- a/doc/doc_ch/quickstart.md +++ b/doc/doc_ch/quickstart.md @@ -1,100 +1,262 @@ +# PaddleOCR快速开始 -# 中文OCR模型快速使用 -## 1.环境配置 +- [PaddleOCR快速开始](#paddleocr) -请先参考[快速安装](./installation.md)配置PaddleOCR运行环境。 + + [1. 安装PaddleOCR whl包](#1) + * [2. 便捷使用](#2) + + [2.1 命令行使用](#21) + - [2.1.1 中英文模型](#211) + - [2.1.2 多语言模型](#212) + - [2.1.3 版面分析](#213) + + [2.2 Python脚本使用](#22) + - [2.2.1 中英文与多语言使用](#221) + - [2.2.2 版面分析](#222) -*注意:也可以通过 whl 包安装使用PaddleOCR,具体参考[Paddleocr Package使用说明](./whl.md)。* + -## 2.inference模型下载 +## 1. 安装PaddleOCR whl包 -* 移动端和服务器端的检测与识别模型如下,更多模型下载(包括多语言),可以参考[PP-OCR v2.0 系列模型下载](../doc_ch/models_list.md) +```bash +pip install "paddleocr>=2.0.1" # 推荐使用2.0.1+版本 +``` -| 模型简介 | 模型名称 |推荐场景 | 检测模型 | 方向分类器 | 识别模型 | -| ------------ | --------------- | ----------------|---- | ---------- | -------- | -| 中英文超轻量OCR模型(8.1M) | ch_ppocr_mobile_v2.0_xx |移动端&服务器端|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | -| 中英文通用OCR模型(143M) | ch_ppocr_server_v2.0_xx |服务器端 |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | +- 对于Windows环境用户: + 直接通过pip安装的shapely库可能出现`[winRrror 126] 找不到指定模块的问题`。建议从[这里](https://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely)下载shapely安装包完成安装, -* windows 环境下如果没有安装wget,下载模型时可将链接复制到浏览器中下载,并解压放置在相应目录下 +- 使用**版面分析**功能时,运行以下命令**安装 Layout-Parser** -复制上表中的检测和识别的`inference模型`下载地址,并解压 + ```bash + pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl + ``` -``` -mkdir inference && cd inference -# 下载检测模型并解压 -wget {url/of/detection/inference_model} && tar xf {name/of/detection/inference_model/package} -# 下载识别模型并解压 -wget {url/of/recognition/inference_model} && tar xf {name/of/recognition/inference_model/package} -# 下载方向分类器模型并解压 -wget {url/of/classification/inference_model} && tar xf {name/of/classification/inference_model/package} -cd .. -``` -以超轻量级模型为例: + +## 2. 
便捷使用 + +### 2.1 命令行使用 + +PaddleOCR提供了一系列测试图片,点击[这里](https://paddleocr.bj.bcebos.com/dygraph_v2.1/ppocr_img.zip)下载并解压,然后在终端中切换到相应目录 ``` -mkdir inference && cd inference -# 下载超轻量级中文OCR模型的检测模型并解压 -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar && tar xf ch_ppocr_mobile_v2.0_det_infer.tar -# 下载超轻量级中文OCR模型的识别模型并解压 -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar && tar xf ch_ppocr_mobile_v2.0_rec_infer.tar -# 下载超轻量级中文OCR模型的文本方向分类器模型并解压 -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar && tar xf ch_ppocr_mobile_v2.0_cls_infer.tar -cd .. +cd /path/to/ppocr_img ``` -解压完毕后应有如下文件结构: +如果不使用提供的测试图片,可以将下方`--image_dir`参数替换为相应的测试图片路径 + +#### 2.1.1 中英文模型 + +* 检测+方向分类器+识别全流程:设置方向分类器参数`--use_angle_cls true`后可对竖排文本进行识别。 + + ```bash + paddleocr --image_dir ./imgs/11.jpg --use_angle_cls true + ``` + + 结果是一个list,每个item包含了文本框,文字和识别置信度 + + ```bash + [[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] + [[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] + [[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]] + ...... + ``` + +- 单独使用检测:设置`--rec`为`false` + + ```bash + paddleocr --image_dir ./imgs/11.jpg --rec false + ``` + + 结果是一个list,每个item只包含文本框 + + ```bash + [[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]] + [[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]] + [[128.0, 397.0], [273.0, 397.0], [273.0, 414.0], [128.0, 414.0]] + ...... + ``` + +- 单独使用识别:设置`--det`为`false` + ```bash + paddleocr --image_dir ./imgs_words/ch/word_1.jpg --det false + ``` + + 结果是一个list,每个item只包含识别结果和识别置信度 + + ```bash + ['韩国小馆', 0.9907421] + ``` + + +如需使用2.0模型,请指定参数`--version PP-OCR`,paddleocr默认使用2.1模型(`--versioin PP-OCRv2`)。更多whl包使用可参考[whl包文档](./whl.md) + + + +#### 2.1.2 多语言模型 + +Paddleocr目前支持80个语种,可以通过修改`--lang`参数进行切换,对于英文模型,指定`--lang=en`。 + +``` bash +paddleocr --image_dir ./imgs_en/254.jpg --lang=en ``` -├── ch_ppocr_mobile_v2.0_cls_infer -│ ├── inference.pdiparams -│ ├── inference.pdiparams.info -│ └── inference.pdmodel -├── ch_ppocr_mobile_v2.0_det_infer -│ ├── inference.pdiparams -│ ├── inference.pdiparams.info -│ └── inference.pdmodel -├── ch_ppocr_mobile_v2.0_rec_infer - ├── inference.pdiparams - ├── inference.pdiparams.info - └── inference.pdmodel + +
+ + +
+ +结果是一个list,每个item包含了文本框,文字和识别置信度 + +```text +[('PHO CAPITAL', 0.95723116), [[66.0, 50.0], [327.0, 44.0], [327.0, 76.0], [67.0, 82.0]]] +[('107 State Street', 0.96311164), [[72.0, 90.0], [451.0, 84.0], [452.0, 116.0], [73.0, 121.0]]] +[('Montpelier Vermont', 0.97389287), [[69.0, 132.0], [501.0, 126.0], [501.0, 158.0], [70.0, 164.0]]] +[('8022256183', 0.99810505), [[71.0, 175.0], [363.0, 170.0], [364.0, 202.0], [72.0, 207.0]]] +[('REG 07-24-201706:59 PM', 0.93537045), [[73.0, 299.0], [653.0, 281.0], [654.0, 318.0], [74.0, 336.0]]] +[('045555', 0.99346405), [[509.0, 331.0], [651.0, 325.0], [652.0, 356.0], [511.0, 362.0]]] +[('CT1', 0.9988654), [[535.0, 367.0], [654.0, 367.0], [654.0, 406.0], [535.0, 406.0]]] +...... ``` -## 3.单张图像或者图像集合预测 +常用的多语言简写包括 -以下代码实现了文本检测、方向分类器和识别串联推理,在执行预测时,需要通过参数image_dir指定单张图像或者图像集合的路径、参数`det_model_dir`指定检测inference模型的路径、参数`rec_model_dir`指定识别inference模型的路径、参数`use_angle_cls`指定是否使用方向分类器、参数`cls_model_dir`指定方向分类器inference模型的路径、参数`use_space_char`指定是否预测空格字符。可视化识别结果默认保存到`./inference_results`文件夹里面。 +| 语种 | 缩写 | | 语种 | 缩写 | | 语种 | 缩写 | +| -------- | ----------- | ---- | -------- | ------ | ---- | -------- | ------ | +| 中文 | ch | | 法文 | fr | | 日文 | japan | +| 英文 | en | | 德文 | german | | 韩文 | korean | +| 繁体中文 | chinese_cht | | 意大利文 | it | | 俄罗斯文 | ru | -```bash +全部语种及其对应的缩写列表可查看[多语言模型教程](./multi_languages.md) + + +#### 2.1.3 版面分析 -# 预测image_dir指定的单张图像 -python3 tools/infer/predict_system.py --image_dir="./doc/imgs/11.jpg" --det_model_dir="./inference/ch_ppocr_mobile_v2.0_det_infer/" --rec_model_dir="./inference/ch_ppocr_mobile_v2.0_rec_infer/" --cls_model_dir="./inference/ch_ppocr_mobile_v2.0_cls_infer/" --use_angle_cls=True --use_space_char=True +版面分析是指对文档图片中的文字、标题、列表、图片和表格5类区域进行划分。对于前三类区域,直接使用OCR模型完成对应区域文字检测与识别,并将结果保存在txt中。对于表格类区域,经过表格结构化处理后,表格图片转换为相同表格样式的Excel文件。图片区域会被单独裁剪成图像。 -# 预测image_dir指定的图像集合 -python3 tools/infer/predict_system.py --image_dir="./doc/imgs/" --det_model_dir="./inference/ch_ppocr_mobile_v2.0_det_infer/" --rec_model_dir="./inference/ch_ppocr_mobile_v2.0_rec_infer/" --cls_model_dir="./inference/ch_ppocr_mobile_v2.0_cls_infer/" --use_angle_cls=True --use_space_char=True +使用PaddleOCR的版面分析功能,需要指定`--type=structure` -# 如果想使用CPU进行预测,需设置use_gpu参数为False -python3 tools/infer/predict_system.py --image_dir="./doc/imgs/11.jpg" --det_model_dir="./inference/ch_ppocr_mobile_v2.0_det_infer/" --rec_model_dir="./inference/ch_ppocr_mobile_v2.0_rec_infer/" --cls_model_dir="./inference/ch_ppocr_mobile_v2.0_cls_infer/" --use_angle_cls=True --use_space_char=True --use_gpu=False +```bash +paddleocr --image_dir=./table/1.png --type=structure ``` -- 通用中文OCR模型 +- **返回结果说明** + + PP-Structure的返回结果为一个dict组成的list,示例如下 + + ```shell + [{ 'type': 'Text', + 'bbox': [34, 432, 345, 462], + 'res': ([[36.0, 437.0, 341.0, 437.0, 341.0, 446.0, 36.0, 447.0], [41.0, 454.0, 125.0, 453.0, 125.0, 459.0, 41.0, 460.0]], + [('Tigure-6. The performance of CNN and IPT models using difforen', 0.90060663), ('Tent ', 0.465441)]) + } + ] + ``` + + 其中各个字段说明如下 + + | 字段 | 说明 | + | ---- | ------------------------------------------------------------ | + | type | 图片区域的类型 | + | bbox | 图片区域的在原图的坐标,分别[左上角x,左上角y,右下角x,右下角y] | + | res | 图片区域的OCR或表格识别结果。
表格: 表格的HTML字符串;
OCR: 一个包含各个单行文字的检测坐标和识别结果的元组 | + + 运行完成后,每张图片会在`output`字段指定的目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名为表格在图片里的坐标。 + + ``` + /output/table/1/ + └─ res.txt + └─ [454, 360, 824, 658].xlsx 表格识别结果 + └─ [16, 2, 828, 305].jpg 被裁剪出的图片区域 + └─ [17, 361, 404, 711].xlsx 表格识别结果 + ``` + +- **参数说明** + + | 字段 | 说明 | 默认值 | + | --------------- | ---------------------------------------- | -------------------------------------------- | + | output | excel和识别结果保存的地址 | ./output/table | + | table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 | + | table_model_dir | 表格结构模型 inference 模型地址 | None | + | table_char_type | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt | + + 大部分参数和paddleocr whl包保持一致,见 [whl包文档](./whl.md) + + + + +### 2.2 Python脚本使用 + +#### 2.2.1 中英文与多语言使用 -请按照上述步骤下载相应的模型,并且更新相关的参数,示例如下: +通过Python脚本使用PaddleOCR whl包,whl包会自动下载ppocr轻量级模型作为默认模型。 + +* 检测+方向分类器+识别全流程 + +```python +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr目前支持的多语言语种可以通过修改lang参数进行切换 +# 例如`ch`, `en`, `fr`, `german`, `korean`, `japan` +ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory +img_path = './imgs/11.jpg' +result = ocr.ocr(img_path, cls=True) +for line in result: + print(line) + +# 显示结果 +from PIL import Image + +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='./fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +结果是一个list,每个item包含了文本框,文字和识别置信度 ```bash -# 预测image_dir指定的单张图像 -python3 tools/infer/predict_system.py --image_dir="./doc/imgs/11.jpg" --det_model_dir="./inference/ch_ppocr_server_v2.0_det_infer/" --rec_model_dir="./inference/ch_ppocr_server_v2.0_rec_infer/" --cls_model_dir="./inference/ch_ppocr_mobile_v2.0_cls_infer/" --use_angle_cls=True --use_space_char=True +[[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] +[[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] +[[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]] +...... ``` -* 注意: - - 如果希望使用不支持空格的识别模型,在预测的时候需要注意:请将代码更新到最新版本,并添加参数 `--use_space_char=False`。 - - 如果不希望使用方向分类器,在预测的时候需要注意:请将代码更新到最新版本,并添加参数 `--use_angle_cls=False`。 +结果可视化 +
+ +
+ -更多的文本检测、识别串联推理使用方式请参考文档教程中[基于Python预测引擎推理](./inference.md)。 +#### 2.2.2 版面分析 -此外,文档教程中也提供了中文OCR模型的其他预测部署方式: -- [基于C++预测引擎推理](../../deploy/cpp_infer/readme.md) -- [服务部署](../../deploy/hubserving) -- [端侧部署(目前只支持静态图)](https://github.com/PaddlePaddle/PaddleOCR/tree/develop/deploy/lite) +```python +import os +import cv2 +from paddleocr import PPStructure,draw_structure_result,save_structure_res + +table_engine = PPStructure(show_log=True) + +save_folder = './output/table' +img_path = './table/paper-image.jpg' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) + +from PIL import Image + +font_path = './fonts/simfang.ttf' # PaddleOCR下提供字体包 +image = Image.open(img_path).convert('RGB') +im_show = draw_structure_result(image, result,font_path=font_path) +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` diff --git a/doc/doc_ch/rec_algo_compare.png b/doc/doc_ch/rec_algo_compare.png new file mode 100644 index 0000000000000000000000000000000000000000..2dde496c75f327ca1c0c9ccb0dbe6949215a4a1b Binary files /dev/null and b/doc/doc_ch/rec_algo_compare.png differ diff --git a/doc/doc_ch/recognition.md b/doc/doc_ch/recognition.md index 0ff0513a2b9a3e5e732e78bd8b4f42ab9f79094f..bb7d01712a85c92a02109e41814059e6c98c7cdc 100644 --- a/doc/doc_ch/recognition.md +++ b/doc/doc_ch/recognition.md @@ -1,30 +1,28 @@ -## 文字识别 +# 文字识别 +本文提供了PaddleOCR文本识别任务的全流程指南,包括数据准备、模型训练、调优、评估、预测,各个阶段的详细说明: - [1 数据准备](#数据准备) - [1.1 自定义数据集](#自定义数据集) - [1.2 数据下载](#数据下载) - [1.3 字典](#字典) - [1.4 支持空格](#支持空格) - - [2 启动训练](#启动训练) - [2.1 数据增强](#数据增强) - - [2.2 训练](#训练) - - [2.3 小语种](#小语种) - + - [2.2 通用模型训练](#通用模型训练) + - [2.3 多语言模型训练](#多语言模型训练) - [3 评估](#评估) - - [4 预测](#预测) - - [4.1 训练引擎预测](#训练引擎预测) +- [5 转Inference模型测试](#Inference) -### 1. 数据准备 +## 1. 
数据准备 PaddleOCR 支持两种数据格式: - - `lmdb` 用于训练以lmdb格式存储的数据集; - - `通用数据` 用于训练以文本文件存储的数据集: + - `lmdb` 用于训练以lmdb格式存储的数据集(LMDBDataSet); + - `通用数据` 用于训练以文本文件存储的数据集(SimpleDataSet); 训练数据的默认存储路径是 `PaddleOCR/train_data`,如果您的磁盘上已有数据集,只需创建软链接至数据集目录: @@ -36,7 +34,7 @@ mklink /d /train_data/dataset ``` -#### 1.1 自定义数据集 +### 1.1 自定义数据集 下面以通用数据集为例, 介绍如何准备数据集: * 训练集 @@ -82,13 +80,15 @@ train_data/rec/train/word_002.jpg 用科技让复杂的世界更简单 -1.2 数据下载 +### 1.2 数据下载 + +- ICDAR2015 -若您本地没有数据集,可以在官网下载 [icdar2015](http://rrc.cvc.uab.es/?ch=4&com=downloads) 数据,用于快速验证。也可以参考[DTRB](https://github.com/clovaai/deep-text-recognition-benchmark#download-lmdb-dataset-for-traininig-and-evaluation-from-here) ,下载 benchmark 所需的lmdb格式数据集。 +若您本地没有数据集,可以在官网下载 [ICDAR2015](http://rrc.cvc.uab.es/?ch=4&com=downloads) 数据,用于快速验证。也可以参考[DTRB](https://github.com/clovaai/deep-text-recognition-benchmark#download-lmdb-dataset-for-traininig-and-evaluation-from-here) ,下载 benchmark 所需的lmdb格式数据集。 -如果你使用的是icdar2015的公开数据集,PaddleOCR 提供了一份用于训练 icdar2015 数据集的标签文件,通过以下方式下载: +如果希望复现SAR的论文指标,需要下载[SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg), 提取码:627x。此外,真实数据集icdar2013, icdar2015, cocotext, IIIT5也作为训练数据的一部分。具体数据细节可以参考论文SAR。 -如果希望复现SRN的论文指标,需要下载离线[增广数据](https://pan.baidu.com/s/1-HSZ-ZVdqBF2HaBZ5pRAKA),提取码: y3ry。增广数据是由MJSynth和SynthText做旋转和扰动得到的。数据下载完成后请解压到 {your_path}/PaddleOCR/train_data/data_lmdb_release/training/ 路径下。 +如果你使用的是icdar2015的公开数据集,PaddleOCR 提供了一份用于训练 ICDAR2015 数据集的标签文件,通过以下方式下载: ``` # 训练集标签 @@ -97,15 +97,25 @@ wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_t wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_test.txt ``` -PaddleOCR 也提供了数据格式转换脚本,可以将官网 label 转换支持的数据格式。 数据转换工具在 `ppocr/utils/gen_label.py`, 这里以训练集为例: +PaddleOCR 也提供了数据格式转换脚本,可以将ICDAR官网 label 转换为PaddleOCR支持的数据格式。 数据转换工具在 `ppocr/utils/gen_label.py`, 这里以训练集为例: ``` # 将官网下载的标签文件转换为 rec_gt_label.txt python gen_label.py --mode="rec" --input_path="{path/of/origin/label}" --output_label="rec_gt_label.txt" ``` +数据样式格式如下,(a)为原始图片,(b)为每张图片对应的 Ground Truth 文本文件: +![](../datasets/icdar_rec.png) + +- 多语言数据集 + +多语言模型的训练数据集均为100w的合成数据,使用了开源合成工具 [text_renderer](https://github.com/Sanster/text_renderer) ,少量的字体可以通过下面两种方式下载。 +* [百度网盘](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA) 提取码:frgi +* [google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) + + -1.3 字典 +### 1.3 字典 最后需要提供一个字典({word_dict_name}.txt),使模型在训练时,可以将所有出现的字符映射为字典的索引。 @@ -149,16 +159,29 @@ PaddleOCR内置了一部分字典,可以按需使用。 - 自定义字典 如需自定义dic文件,请在 `configs/rec/rec_icdar15_train.yml` 中添加 `character_dict_path` 字段, 指向您的字典路径。 -并将 `character_type` 设置为 `ch`。 -1.4 添加空格类别 +### 1.4 添加空格类别 如果希望支持识别"空格"类别, 请将yml文件中的 `use_space_char` 字段设置为 `True`。 -### 2. 启动训练 +## 2. 
启动训练 + + +### 2.1 数据增强 + +PaddleOCR提供了多种数据增强方式,默认配置文件中已经添加了数据增广。 + +默认的扰动方式有:颜色空间转换(cvtColor)、模糊(blur)、抖动(jitter)、噪声(Gasuss noise)、随机切割(random crop)、透视(perspective)、颜色反转(reverse)、TIA数据增广。 + +训练过程中每种扰动方式以40%的概率被选择,具体代码实现请参考:[rec_img_aug.py](../../ppocr/data/imaug/rec_img_aug.py) + +*由于OpenCV的兼容性问题,扰动操作暂时只支持Linux* + + +### 2.2 通用模型训练 PaddleOCR提供了训练脚本、评估脚本和预测脚本,本节将以 CRNN 识别模型为例: @@ -178,23 +201,16 @@ tar -xf rec_mv3_none_bilstm_ctc_v2.0_train.tar && rm -rf rec_mv3_none_bilstm_ctc *如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false* ``` -# GPU训练 支持单卡,多卡训练,通过--gpus参数指定卡号 +# GPU训练 支持单卡,多卡训练 # 训练icdar15英文数据 训练日志会自动保存为 "{save_model_dir}" 下的train.log -python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_icdar15_train.yml -``` - -#### 2.1 数据增强 -PaddleOCR提供了多种数据增强方式,如果您希望在训练时加入扰动,请在配置文件中设置 `distort: true`。 +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_icdar15_train.yml -默认的扰动方式有:颜色空间转换(cvtColor)、模糊(blur)、抖动(jitter)、噪声(Gasuss noise)、随机切割(random crop)、透视(perspective)、颜色反转(reverse)。 - -训练过程中每种扰动方式以50%的概率被选择,具体代码实现请参考:[img_tools.py](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/ppocr/data/rec/img_tools.py) - -*由于OpenCV的兼容性问题,扰动操作暂时只支持Linux* +#多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_icdar15_train.yml +``` - -#### 2.2 训练 PaddleOCR支持训练和评估交替进行, 可以在 `configs/rec/rec_icdar15_train.yml` 中修改 `eval_batch_step` 设置评估频率,默认每500个iter评估一次。评估过程中默认将最佳acc模型,保存为 `output/rec_CRNN/best_accuracy` 。 @@ -215,6 +231,11 @@ PaddleOCR支持训练和评估交替进行, 可以在 `configs/rec/rec_icdar15_t | rec_mv3_tps_bilstm_att.yml | CRNN | Mobilenet_v3 | TPS | BiLSTM | att | | rec_r34_vd_tps_bilstm_att.yml | CRNN | Resnet34_vd | TPS | BiLSTM | att | | rec_r50fpn_vd_none_srn.yml | SRN | Resnet50_fpn_vd | None | rnn | srn | +| rec_mtb_nrtr.yml | NRTR | nrtr_mtb | None | transformer encoder | transformer decoder | +| rec_r31_sar.yml | SAR | ResNet31 | None | LSTM encoder | LSTM decoder | +| rec_resnet_stn_bilstm_att.yml | SEED | Aster_Resnet | STN | BiLSTM | att | + +*其中SEED模型需要额外加载FastText训练好的[语言模型](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz) 训练中文数据,推荐使用[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml),如您希望尝试其他算法在中文数据集上的效果,请参考下列说明修改配置文件: @@ -224,8 +245,6 @@ Global: ... # 添加自定义字典,如修改字典请将路径指向新字典 character_dict_path: ppocr/utils/ppocr_keys_v1.txt - # 修改字符类型 - character_type: ch ... # 识别空格 use_space_char: True @@ -282,105 +301,28 @@ Eval: ``` **注意,预测/评估时的配置文件请务必与训练一致。** - -#### 2.3 小语种 + +### 2.3 多语言模型训练 PaddleOCR目前已支持80种(除中文外)语种识别,`configs/rec/multi_languages` 路径下提供了一个多语言的配置文件模版: [rec_multi_language_lite_train.yml](../../configs/rec/multi_language/rec_multi_language_lite_train.yml)。 -您有两种方式创建所需的配置文件: - -1. 通过脚本自动生成 - -[generate_multi_language_configs.py](../../configs/rec/multi_language/generate_multi_language_configs.py) 可以帮助您生成多语言模型的配置文件 - -- 以意大利语为例,如果您的数据是按如下格式准备的: - ``` - |-train_data - |- it_train.txt # 训练集标签 - |- it_val.txt # 验证集标签 - |- data - |- word_001.jpg - |- word_002.jpg - |- word_003.jpg - | ... 
- ``` - - 可以使用默认参数,生成配置文件: - - ```bash - # 该代码需要在指定目录运行 - cd PaddleOCR/configs/rec/multi_language/ - # 通过-l或者--language参数设置需要生成的语种的配置文件,该命令会将默认参数写入配置文件 - python3 generate_multi_language_configs.py -l it - ``` - -- 如果您的数据放置在其他位置,或希望使用自己的字典,可以通过指定相关参数来生成配置文件: - - ```bash - # -l或者--language字段是必须的 - # --train修改训练集,--val修改验证集,--data_dir修改数据集目录,--dict修改字典路径, -o修改对应默认参数 - cd PaddleOCR/configs/rec/multi_language/ - python3 generate_multi_language_configs.py -l it \ # 语种 - --train {path/of/train_label.txt} \ # 训练标签文件的路径 - --val {path/of/val_label.txt} \ # 验证集标签文件的路径 - --data_dir {train_data/path} \ # 训练数据的根目录 - --dict {path/of/dict} \ # 字典文件路径 - -o Global.use_gpu=False # 是否使用gpu - ... - - ``` - -意大利文由拉丁字母组成,因此执行完命令后会得到名为 rec_latin_lite_train.yml 的配置文件。 - -2. 手动修改配置文件 - - 您也可以手动修改模版中的以下几个字段: - - ``` - Global: - use_gpu: True - epoch_num: 500 - ... - character_type: it # 需要识别的语种 - character_dict_path: {path/of/dict} # 字典文件所在路径 - - Train: - dataset: - name: SimpleDataSet - data_dir: train_data/ # 数据存放根目录 - label_file_list: ["./train_data/train_list.txt"] # 训练集label路径 - ... - - Eval: - dataset: - name: SimpleDataSet - data_dir: train_data/ # 数据存放根目录 - label_file_list: ["./train_data/val_list.txt"] # 验证集label路径 - ... - - ``` - -目前PaddleOCR支持的多语言算法有: - -| 配置文件 | 算法名称 | backbone | trans | seq | pred | language | character_type | -| :--------: | :-------: | :-------: | :-------: | :-----: | :-----: | :-----: | :-----: | -| rec_chinese_cht_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 中文繁体 | chinese_cht| -| rec_en_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 英语(区分大小写) | EN | -| rec_french_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 法语 | french | -| rec_ger_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 德语 | german | -| rec_japan_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 日语 | japan | -| rec_korean_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 韩语 | korean | -| rec_latin_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 拉丁字母 | latin | -| rec_arabic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 阿拉伯字母 | ar | -| rec_cyrillic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 斯拉夫字母 | cyrillic | -| rec_devanagari_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 梵文字母 | devanagari | +按语系划分,目前PaddleOCR支持的语种有: + +| 配置文件 | 算法名称 | backbone | trans | seq | pred | language | +| :--------: | :-------: | :-------: | :-------: | :-----: | :-----: | :-----: | +| rec_chinese_cht_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 中文繁体 | +| rec_en_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 英语(区分大小写) | +| rec_french_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 法语 | +| rec_ger_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 德语 | +| rec_japan_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 日语 | +| rec_korean_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 韩语 | +| rec_latin_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 拉丁字母 | +| rec_arabic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 阿拉伯字母 | +| rec_cyrillic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 斯拉夫字母 | +| rec_devanagari_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 梵文字母 | 更多支持语种请参考: 
[多语言模型](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/multi_languages.md#%E8%AF%AD%E7%A7%8D%E7%BC%A9%E5%86%99) -多语言模型训练方式与中文模型一致,训练数据集均为100w的合成数据,少量的字体可以通过下面两种方式下载。 -* [百度网盘](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA)。提取码:frgi。 -* [google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) - 如您希望在现有模型效果的基础上调优,请参考下列说明修改配置文件: 以 `rec_french_lite_train` 为例: @@ -416,7 +358,7 @@ Eval: ... ``` -### 3 评估 +## 3 评估 评估数据集可以通过 `configs/rec/rec_icdar15_train.yml` 修改Eval中的 `label_file_path` 设置。 @@ -426,14 +368,29 @@ python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec ``` -### 4 预测 - - -#### 4.1 训练引擎的预测 +## 4 预测 使用 PaddleOCR 训练好的模型,可以通过以下脚本进行快速预测。 -默认预测图片存储在 `infer_img` 里,通过 `-o Global.checkpoints` 指定权重: +默认预测图片存储在 `infer_img` 里,通过 `-o Global.checkpoints` 加载训练好的参数文件: + +根据配置文件中设置的的 `save_model_dir` 和 `save_epoch_step` 字段,会有以下几种参数被保存下来: + +``` +output/rec/ +├── best_accuracy.pdopt +├── best_accuracy.pdparams +├── best_accuracy.states +├── config.yml +├── iter_epoch_3.pdopt +├── iter_epoch_3.pdparams +├── iter_epoch_3.states +├── latest.pdopt +├── latest.pdparams +├── latest.states +└── train.log +``` +其中 best_accuracy.* 是评估集上的最优模型;iter_epoch_x.* 是以 `save_epoch_step` 为间隔保存下来的模型;latest.* 是最后一个epoch的模型。 ``` # 预测英文结果 @@ -469,3 +426,37 @@ python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v infer_img: doc/imgs_words/ch/word_1.jpg result: ('韩国小馆', 0.997218) ``` + + + +## 5. 转Inference模型测试 + +识别模型转inference模型与检测的方式相同,如下: + +``` +# -c 后面设置训练算法的yml配置文件 +# -o 配置可选参数 +# Global.pretrained_model 参数设置待转换的训练模型地址,不用添加文件后缀 .pdmodel,.pdopt或.pdparams。 +# Global.save_inference_dir参数设置转换的模型将保存的地址。 + +python3 tools/export_model.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model=./ch_lite/ch_ppocr_mobile_v2.0_rec_train/best_accuracy Global.save_inference_dir=./inference/rec_crnn/ +``` + +**注意:**如果您是在自己的数据集上训练的模型,并且调整了中文字符的字典文件,请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。 + +转换成功后,在目录下有三个文件: + +``` +/inference/rec_crnn/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +- 自定义模型推理 + + 如果训练时修改了文本的字典,在使用inference模型预测时,需要通过`--rec_char_dict_path`指定使用的字典路径,并且设置 `rec_char_type=ch` + + ``` + python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./your inference model" --rec_image_shape="3, 32, 100" --rec_char_type="ch" --rec_char_dict_path="your text dict path" + ``` diff --git a/doc/doc_ch/training.md b/doc/doc_ch/training.md new file mode 100644 index 0000000000000000000000000000000000000000..c6c7b87d9925197b36a246c651ab7179ff9d2e81 --- /dev/null +++ b/doc/doc_ch/training.md @@ -0,0 +1,137 @@ +# 模型训练 + +本文将介绍模型训练时需掌握的基本概念,和训练时的调优方法。 + +同时会简单介绍PaddleOCR模型训练数据的组成部分,以及如何在垂类场景中准备数据finetune模型。 + +- [1. 基本概念](#基本概念) + * [1.1 学习率](#学习率) + * [1.2 正则化](#正则化) + * [1.3 评估指标](#评估指标) +- [2. 数据与垂类场景](#数据与垂类场景) + * [2.1 训练数据](#训练数据) + * [2.2 垂类场景](#垂类场景) + * [2.3 自己构建数据集](#自己构建数据集) +* [3. 常见问题](#常见问题) + + +## 1. 基本概念 + +OCR(Optical Character Recognition,光学字符识别)是指对图像进行分析识别处理,获取文字和版面信息的过程,是典型的计算机视觉任务, +通常由文本检测和文本识别两个子任务构成。 + +模型调优时需要关注以下参数: + + +### 1.1 学习率 + +学习率是训练神经网络的重要超参数之一,它代表在每一次迭代中梯度向损失函数最优解移动的步长。 +在PaddleOCR中提供了多种学习率更新策略,可以通过配置文件修改,例如: + +``` +Optimizer: + ... 
+ lr: + name: Piecewise + decay_epochs : [700, 800] + values : [0.001, 0.0001] + warmup_epoch: 5 +``` + +Piecewise 代表分段常数衰减,在不同的学习阶段指定不同的学习率,在每段内学习率相同。 +warmup_epoch 代表在前5个epoch中,学习率将逐渐从0增加到base_lr。全部策略可以参考代码[learning_rate.py](../../ppocr/optimizer/learning_rate.py) 。 + + +### 1.2 正则化 + +正则化可以有效的避免算法过拟合,PaddleOCR中提供了L1、L2正则方法,L1 和 L2 正则化是最常用的正则化方法。L1 正则化向目标函数添加正则化项,以减少参数的绝对值总和;而 L2 正则化中,添加正则化项的目的在于减少参数平方的总和。配置方法如下: + +``` +Optimizer: + ... + regularizer: + name: L2 + factor: 2.0e-05 +``` + + +### 1.3 评估指标 + +(1)检测阶段:先按照检测框和标注框的IOU评估,IOU大于某个阈值判断为检测准确。这里检测框和标注框不同于一般的通用目标检测框,是采用多边形进行表示。检测准确率:正确的检测框个数在全部检测框的占比,主要是判断检测指标。检测召回率:正确的检测框个数在全部标注框的占比,主要是判断漏检的指标。 + +(2)识别阶段: 字符识别准确率,即正确识别的文本行占标注的文本行数量的比例,只有整行文本识别对才算正确识别。 + +(3)端到端统计: 端对端召回率:准确检测并正确识别文本行在全部标注文本行的占比; 端到端准确率:准确检测并正确识别文本行在 检测到的文本行数量 的占比; 准确检测的标准是检测框与标注框的IOU大于某个阈值,正确识别的的检测框中的文本与标注的文本相同。 + + + +## 2. 数据与垂类场景 + + +### 2.1 训练数据 +目前开源的模型,数据集和量级如下: + + - 检测: + - 英文数据集,ICDAR2015 + - 中文数据集,LSVT街景数据集训练数据3w张图片 + + - 识别: + - 英文数据集,MJSynth和SynthText合成数据,数据量上千万。 + - 中文数据集,LSVT街景数据集根据真值将图crop出来,并进行位置校准,总共30w张图像。此外基于LSVT的语料,合成数据500w。 + - 小语种数据集,使用不同语料和字体,分别生成了100w合成数据集,并使用ICDAR-MLT作为验证集。 + +其中,公开数据集都是开源的,用户可自行搜索下载,也可参考[中文数据集](./datasets.md),合成数据暂不开源,用户可使用开源合成工具自行合成,可参考的合成工具包括[text_renderer](https://github.com/Sanster/text_renderer) 、[SynthText](https://github.com/ankush-me/SynthText) 、[TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator) 等。 + + +### 2.2 垂类场景 + +PaddleOCR主要聚焦通用OCR,如果有垂类需求,您可以用PaddleOCR+垂类数据自己训练; +如果缺少带标注的数据,或者不想投入研发成本,建议直接调用开放的API,开放的API覆盖了目前比较常见的一些垂类。 + + +### 2.3 自己构建数据集 + +在构建数据集时有几个经验可供参考: + +(1) 训练集的数据量: + + a. 检测需要的数据相对较少,在PaddleOCR模型的基础上进行Fine-tune,一般需要500张可达到不错的效果。 + b. 识别分英文和中文,一般英文场景需要几十万数据可达到不错的效果,中文则需要几百万甚至更多。 + + +(2)当训练数据量少时,可以尝试以下三种方式获取更多的数据: + + a. 人工采集更多的训练数据,最直接也是最有效的方式。 + b. 基于PIL和opencv基本图像处理或者变换。例如PIL中ImageFont, Image, ImageDraw三个模块将文字写到背景中,opencv的旋转仿射变换,高斯滤波等。 + c. 利用数据生成算法合成数据,例如pix2pix或StyleText等算法。 + + + +## 3. 常见问题 + +**Q**:训练CRNN识别时,如何选择合适的网络输入shape? + + A:一般高度采用32,最长宽度的选择,有两种方法: + + (1)统计训练样本图像的宽高比分布。最大宽高比的选取考虑满足80%的训练样本。 + + (2)统计训练样本文字数目。最长字符数目的选取考虑满足80%的训练样本。然后中文字符长宽比近似认为是1,英文认为3:1,预估一个最长宽度。 + +**Q**:识别训练时,训练集精度已经到达90了,但验证集精度一直在70,涨不上去怎么办? 
+ + A:训练集精度90,测试集70多的话,应该是过拟合了,有两个可尝试的方法: + + (1)加入更多的增广方式或者调大增广prob的[概率](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppocr/data/imaug/rec_img_aug.py#L341),默认为0.4。 + + (2)调大系统的[l2 dcay值](https://github.com/PaddlePaddle/PaddleOCR/blob/a501603d54ff5513fc4fc760319472e59da25424/configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml#L47) + +**Q**: 识别模型训练时,loss能正常下降,但acc一直为0 + + A:识别模型训练初期acc为0是正常的,多训一段时间指标就上来了。 + + +*** +具体的训练教程可点击下方链接跳转: +- [文本检测模型训练](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/detection.md) +- [文本识别模型训练](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/recognition.md) +- [文本方向分类器训练](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/angle_class.md) \ No newline at end of file diff --git a/doc/doc_ch/update.md b/doc/doc_ch/update.md index 3fe8a0c9ace4be31882b22fe75b88f18848e1ad9..0852e240886b4ca736a830c8c44651ca35ec1f25 100644 --- a/doc/doc_ch/update.md +++ b/doc/doc_ch/update.md @@ -1,4 +1,8 @@ # 更新 +- 2021.9.7 发布PaddleOCR v2.3,发布[PP-OCRv2](#PP-OCRv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。 +- 2021.8.3 发布PaddleOCR v2.2,新增文档结构分析[PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README_ch.md)工具包,支持版面分析与表格识别(含Excel导出)。 +- 2021.6.29 [FAQ](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_ch/FAQ.md)新增5个高频问题,总数248个,每周一都会更新,欢迎大家持续关注。 +- 2021.4.8 release 2.1版本,新增AAAI 2021论文[端到端识别算法PGNet](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_ch/pgnet.md)开源,[多语言模型](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_ch/multi_languages.md)支持种类增加到80+。 - 2020.12.15 更新数据合成工具[Style-Text](../../StyleText/README_ch.md),可以批量合成大量与目标场景类似的图像,在多个场景验证,效果明显提升。 - 2020.12.07 [FAQ](../../doc/doc_ch/FAQ.md)新增5个高频问题,总数124个,并且计划以后每周一都会更新,欢迎大家持续关注。 - 2020.11.25 更新半自动标注工具[PPOCRLabel](../../PPOCRLabel/README_ch.md),辅助开发者高效完成标注任务,输出格式与PP-OCR训练任务完美衔接。 diff --git a/doc/doc_ch/visualization.md b/doc/doc_ch/visualization.md index f2ea2b09d9431ebd710f2d7ccac0bd73c50b558e..99d071ec22daccaa295b5087760c5fc0d45f9802 100644 --- a/doc/doc_ch/visualization.md +++ b/doc/doc_ch/visualization.md @@ -1,7 +1,13 @@ # 效果展示 + +## 超轻量PP-OCRv2效果展示 + + + + -## 通用ppocr_server_2.0 效果展示 +## 通用PP-OCR server 效果展示
@@ -10,8 +16,6 @@ - -
diff --git a/doc/doc_ch/whl.md b/doc/doc_ch/whl.md index 167ed7b2b8a13706dfe1533265b6d96560265511..ba5bbae6255382d0c7fa5be319946d6242b1a544 100644 --- a/doc/doc_ch/whl.md +++ b/doc/doc_ch/whl.md @@ -210,7 +210,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true ```bash [[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] [[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] -[[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]] +[[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]]µ ...... ``` diff --git a/doc/doc_en/algorithm_overview_en.md b/doc/doc_en/algorithm_overview_en.md index d70f99bb5c5b0bdcb7d39209dfc9a77c56918260..df8a4ce3ef5fbcadb7ebdfd8ddf2bdf59637783e 100755 --- a/doc/doc_en/algorithm_overview_en.md +++ b/doc/doc_en/algorithm_overview_en.md @@ -11,9 +11,10 @@ This tutorial lists the text detection algorithms and text recognition algorithm ### 1. Text Detection Algorithm PaddleOCR open source text detection algorithms list: -- [x] EAST([paper](https://arxiv.org/abs/1704.03155))[2] -- [x] DB([paper](https://arxiv.org/abs/1911.08947))[1] -- [x] SAST([paper](https://arxiv.org/abs/1908.05498))[4] +- [x] EAST([paper](https://arxiv.org/abs/1704.03155)) +- [x] DB([paper](https://arxiv.org/abs/1911.08947)) +- [x] SAST([paper](https://arxiv.org/abs/1908.05498)) +- [x] PSE([paper](https://arxiv.org/abs/1903.12473v2)) On the ICDAR2015 dataset, the text detection result is as follows: @@ -24,6 +25,8 @@ On the ICDAR2015 dataset, the text detection result is as follows: |DB|ResNet50_vd|86.41%|78.72%|82.38%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar)| |DB|MobileNetV3|77.29%|73.08%|75.12%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar)| |SAST|ResNet50_vd|91.39%|83.77%|87.42%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)| +|PSE|ResNet50_vd|85.81%|79.53%|82.55%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_vd_pse_v2.0_train.tar)| +|PSE|MobileNetV3|82.20%|70.48%|75.89%|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_mv3_pse_v2.0_train.tar)| On Total-Text dataset, the text detection result is as follows: @@ -41,11 +44,13 @@ For the training guide and use of PaddleOCR text detection algorithms, please re ### 2. 
Text Recognition Algorithm PaddleOCR open-source text recognition algorithms list: -- [x] CRNN([paper](https://arxiv.org/abs/1507.05717))[7] -- [x] Rosetta([paper](https://arxiv.org/abs/1910.05085))[10] -- [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html))[11] -- [x] RARE([paper](https://arxiv.org/abs/1603.03915v1))[12] -- [x] SRN([paper](https://arxiv.org/abs/2003.12294))[5] +- [x] CRNN([paper](https://arxiv.org/abs/1507.05717)) +- [x] Rosetta([paper](https://arxiv.org/abs/1910.05085)) +- [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html)) +- [x] RARE([paper](https://arxiv.org/abs/1603.03915v1)) +- [x] SRN([paper](https://arxiv.org/abs/2003.12294)) +- [x] NRTR([paper](https://arxiv.org/abs/1806.00926v2)) +- [x] SAR([paper](https://arxiv.org/abs/1811.00751v2)) Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow: @@ -60,5 +65,7 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r |RARE|MobileNetV3|82.5%|rec_mv3_tps_bilstm_att |[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)| |RARE|Resnet34_vd|83.6%|rec_r34_vd_tps_bilstm_att |[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)| |SRN|Resnet50_vd_fpn| 88.52% | rec_r50fpn_vd_none_srn |[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar)| +|NRTR|NRTR_MTB| 84.3% | rec_mtb_nrtr | [Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar) | +|SAR|Resnet31| 87.2% | rec_r31_sar | [Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) | Please refer to the document for training guide and use of PaddleOCR text recognition algorithms [Text recognition model training/evaluation/prediction](./recognition_en.md) diff --git a/doc/doc_en/angle_class_en.md b/doc/doc_en/angle_class_en.md index 0044d85ac0a43529c67746d25118bd80ee52be9a..46d91bee43de3af99659651b7f31cf1148e7b294 100644 --- a/doc/doc_en/angle_class_en.md +++ b/doc/doc_en/angle_class_en.md @@ -1,6 +1,14 @@ -## TEXT ANGLE CLASSIFICATION +# Text Direction Classification -### Method introduction +- [1. Method Introduction](#method-introduction) +- [2. Data Preparation](#data-preparation) +- [3. Training](#training) +- [4. Evaluation](#evaluation) +- [5. Prediction](#prediction) + + + +## 1. Method Introduction The angle classification is used in the scene where the image is not 0 degrees. In this scene, it is necessary to perform a correction operation on the text line detected in the picture. In the PaddleOCR system, The text line image obtained after text detection is sent to the recognition model after affine transformation. At this time, only a 0 and 180 degree angle classification of the text is required, so the built-in PaddleOCR text angle classifier **only supports 0 and 180 degree classification**. If you want to support more angles, you can modify the algorithm yourself to support. @@ -9,6 +17,9 @@ Example of 0 and 180 degree data samples: ![](../imgs_results/angle_class_example.jpg) ### DATA PREPARATION + +## 2. 
Data Preparation + Please organize the dataset as follows: The default storage path for training data is `PaddleOCR/train_data/cls`, if you already have a dataset on your disk, just create a soft link to the dataset directory: @@ -62,8 +73,8 @@ containing all images (test) and a cls_gt_test.txt. The structure of the test se |- word_003.jpg | ... ``` - -### TRAINING + +## 3. Training Write the prepared txt file and image folder path into the configuration file under the `Train/Eval.dataset.label_file_list` and `Train/Eval.dataset.data_dir` fields, the absolute path of the image consists of the `Train/Eval.dataset.data_dir` field and the image name recorded in the txt file. PaddleOCR provides training scripts, evaluation scripts, and prediction scripts. @@ -107,7 +118,8 @@ If the evaluation set is large, the test will be time-consuming. It is recommend **Note that the configuration file for prediction/evaluation must be consistent with the training.** -### EVALUATION + +## 4. Evaluation The evaluation dataset can be set by modifying the `Eval.dataset.label_file_list` field in the `configs/cls/cls_mv3.yml` file. @@ -116,6 +128,8 @@ export CUDA_VISIBLE_DEVICES=0 # GPU evaluation, Global.checkpoints is the weight to be tested python3 tools/eval.py -c configs/cls/cls_mv3.yml -o Global.checkpoints={path/to/weights}/best_accuracy ``` + +## 5. Prediction ### PREDICTION diff --git a/doc/doc_en/benchmark_en.md b/doc/doc_en/benchmark_en.md index 91b015941924add81f8b4f0d9d9ca13274348131..70b33aebd95cfa6e02122c6816cd3863d2b584ab 100755 --- a/doc/doc_en/benchmark_en.md +++ b/doc/doc_en/benchmark_en.md @@ -1,8 +1,8 @@ -# BENCHMARK +# Benchmark This document gives the performance of the series models for Chinese and English recognition. -## TEST DATA +## Test Data We collected 300 images for different real application scenarios to evaluate the overall OCR system, including contract samples, license plates, nameplates, train tickets, test sheets, forms, certificates, street view images, business cards, digital meter, etc. The following figure shows some images of the test set. @@ -10,10 +10,9 @@ We collected 300 images for different real application scenarios to evaluate the -## MEASUREMENT +## Measurement Explanation: -- v1.0 indicates DB+CRNN models without the strategies. v1.1 indicates the PP-OCR models with the strategies and the direction classify. slim_v1.1 indicates the PP-OCR models with prunner or quantization. - The long size of the input for the text detector is 960. @@ -27,30 +26,16 @@ Compares the model size and F-score: | Model Name | Model Size
of the
Whole System\(M\) | Model Size
of the Text
Detector\(M\) | Model Size
of the Direction
Classifier\(M\) | Model Size
of the Text
Recognizer \(M\) | F\-score | |:-:|:-:|:-:|:-:|:-:|:-:| -| ch\_ppocr\_mobile\_v1\.1 | 8\.1 | 2\.6 | 0\.9 | 4\.6 | 0\.5193 | -| ch\_ppocr\_server\_v1\.1 | 155\.1 | 47\.2 | 0\.9 | 107 | 0\.5414 | -| ch\_ppocr\_mobile\_v1\.0 | 8\.6 | 4\.1 | \- | 4\.5 | 0\.393 | -| ch\_ppocr\_server\_v1\.0 | 203\.8 | 98\.5 | \- | 105\.3 | 0\.4436 | +| PP-OCRv2 | 11\.6 | 3\.0 | 0\.9 | 8\.6 | 0\.5224 | +| PP-OCR mobile | 8\.1 | 2\.6 | 0\.9 | 4\.6 | 0\.503 | +| PP-OCR server | 155\.1 | 47\.2 | 0\.9 | 107 | 0\.570 | -Compares the time-consuming on T4 GPU (ms): +Compares the time-consuming on CPU and T4 GPU (ms): -| Model Name | Overall | Text Detector | Direction Classifier | Text Recognizer | -|:-:|:-:|:-:|:-:|:-:| -| ch\_ppocr\_mobile\_v1\.1 | 137 | 35 | 24 | 78 | -| ch\_ppocr\_server\_v1\.1 | 204 | 39 | 25 | 140 | -| ch\_ppocr\_mobile\_v1\.0 | 117 | 41 | \- | 76 | -| ch\_ppocr\_server\_v1\.0 | 199 | 52 | \- | 147 | +| Model Name | CPU | T4 GPU | +|:-:|:-:|:-:| +| PP-OCRv2 | 330 | 111 | +| PP-OCR mobile | 356 | 116| +| PP-OCR server | 1056 | 200 | -Compares the time-consuming on CPU (ms): - -| Model Name | Overall | Text Detector | Direction Classifier | Text Recognizer | -|:-:|:-:|:-:|:-:|:-:| -| ch\_ppocr\_mobile\_v1\.1 | 421 | 164 | 51 | 206 | -| ch\_ppocr\_mobile\_v1\.0 | 398 | 219 | \- | 179 | - -Compares the model size, F-score, the time-consuming on SD 855 of between the slim models and the original models: - -| Model Name | Model Size
of the
Whole System\(M\) | Model Size
of the Text
Detector\(M\) | Model Size
of the Direction
Classifier\(M\) | Model Size
of the Text
Recognizer \(M\) | F\-score | SD 855
\(ms\) | -|:-:|:-:|:-:|:-:|:-:|:-:|:-:| -| ch\_ppocr\_mobile\_v1\.1 | 8\.1 | 2\.6 | 0\.9 | 4\.6 | 0\.5193 | 306 | -| ch\_ppocr\_mobile\_slim\_v1\.1 | 3\.5 | 1\.4 | 0\.5 | 1\.6 | 0\.521 | 268 | +More indicators of PP-OCR series models can be referred to [PP-OCR Benchmark](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_en/benchmark_en.md) diff --git a/doc/doc_en/config_en.md b/doc/doc_en/config_en.md index 5e5847c4b298553b2d376b90196b61b7e0286efe..ce76da9b2f39532b387e3e45ca2ff497b0408635 100644 --- a/doc/doc_en/config_en.md +++ b/doc/doc_en/config_en.md @@ -1,4 +1,12 @@ -## Optional parameter list +# Configuration + +- [1. Optional Parameter List](#1-optional-parameter-list) +- [2. Intorduction to Global Parameters of Configuration File](#2-intorduction-to-global-parameters-of-configuration-file) +- [3. Multilingual Config File Generation](#3-multilingual-config-file-generation) + + + +## 1. Optional Parameter List The following list can be viewed through `--help` @@ -7,7 +15,9 @@ The following list can be viewed through `--help` | -c | ALL | Specify configuration file to use | None | **Please refer to the parameter introduction for configuration file usage** | | -o | ALL | set configuration options | None | Configuration using -o has higher priority than the configuration file selected with -c. E.g: -o Global.use_gpu=false | -## INTRODUCTION TO GLOBAL PARAMETERS OF CONFIGURATION FILE + + +## 2. Intorduction to Global Parameters of Configuration File Take rec_chinese_lite_train_v2.0.yml as an example ### Global @@ -27,9 +37,8 @@ Take rec_chinese_lite_train_v2.0.yml as an example | checkpoints | set model parameter path | None | Used to load parameters after interruption to continue training| | use_visualdl | Set whether to enable visualdl for visual log display | False | [Tutorial](https://www.paddlepaddle.org.cn/paddle/visualdl) | | infer_img | Set inference image path or folder path | ./infer_img | \| -| character_dict_path | Set dictionary path | ./ppocr/utils/ppocr_keys_v1.txt | \ | +| character_dict_path | Set dictionary path | ./ppocr/utils/ppocr_keys_v1.txt | If the character_dict_path is None, model can only recognize number and lower letters | | max_text_length | Set the maximum length of text | 25 | \ | -| character_type | Set character type | ch | en/ch, the default dict will be used for en, and the custom dict will be used for ch | | use_space_char | Set whether to recognize spaces | True | Only support in character_type=ch mode | | label_list | Set the angle supported by the direction classifier | ['0','180'] | Only valid in angle classifier model | | save_res_path | Set the save address of the test model results | ./output/det_db/predicts_db.txt | Only valid in the text detection model | @@ -51,7 +60,7 @@ Take rec_chinese_lite_train_v2.0.yml as an example ### Architecture ([ppocr/modeling](../../ppocr/modeling)) -In ppocr, the network is divided into four stages: Transform, Backbone, Neck and Head +In PaddleOCR, the network is divided into four stages: Transform, Backbone, Neck and Head | Parameter | Use | Defaults | Note | | :---------------------: | :---------------------: | :--------------: | :--------------------: | @@ -120,3 +129,108 @@ In ppocr, the network is divided into four stages: Transform, Backbone, Neck and | batch_size_per_card | Single card batch size during training | 256 | \ | | drop_last | Whether to discard the last incomplete mini-batch because the number of samples in the data set cannot be divisible by batch_size | True | \ | | 
num_workers | The number of sub-processes used to load data, if it is 0, the sub-process is not started, and the data is loaded in the main process | 8 | \ | + + + +## 3. Multilingual Config File Generation + +PaddleOCR currently supports 80 (except Chinese) language recognition. A multi-language configuration file template is +provided under the path `configs/rec/multi_languages`: [rec_multi_language_lite_train.yml](../../configs/rec/multi_language/rec_multi_language_lite_train.yml)。 + +There are two ways to create the required configuration file:: + +1. Automatically generated by script + +[generate_multi_language_configs.py](../../configs/rec/multi_language/generate_multi_language_configs.py) Can help you generate configuration files for multi-language models + +- Take Italian as an example, if your data is prepared in the following format: + ``` + |-train_data + |- it_train.txt # train_set label + |- it_val.txt # val_set label + |- data + |- word_001.jpg + |- word_002.jpg + |- word_003.jpg + | ... + ``` + + You can use the default parameters to generate a configuration file: + + ```bash + # The code needs to be run in the specified directory + cd PaddleOCR/configs/rec/multi_language/ + # Set the configuration file of the language to be generated through the -l or --language parameter. + # This command will write the default parameters into the configuration file + python3 generate_multi_language_configs.py -l it + ``` + +- If your data is placed in another location, or you want to use your own dictionary, you can generate the configuration file by specifying the relevant parameters: + + ```bash + # -l or --language field is required + # --train to modify the training set + # --val to modify the validation set + # --data_dir to modify the data set directory + # --dict to modify the dict path + # -o to modify the corresponding default parameters + cd PaddleOCR/configs/rec/multi_language/ + python3 generate_multi_language_configs.py -l it \ # language + --train {path/of/train_label.txt} \ # path of train_label + --val {path/of/val_label.txt} \ # path of val_label + --data_dir {train_data/path} \ # root directory of training data + --dict {path/of/dict} \ # path of dict + -o Global.use_gpu=False # whether to use gpu + ... + + ``` +Italian is made up of Latin letters, so after executing the command, you will get the rec_latin_lite_train.yml. + +2. Manually modify the configuration file + + You can also manually modify the following fields in the template: + + ``` + Global: + use_gpu: True + epoch_num: 500 + ... + character_dict_path: {path/of/dict} # path of dict + + Train: + dataset: + name: SimpleDataSet + data_dir: train_data/ # root directory of training data + label_file_list: ["./train_data/train_list.txt"] # train label path + ... + + Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/ # root directory of val data + label_file_list: ["./train_data/val_list.txt"] # val label path + ... 
+ + ``` + + +Currently, the multi-language algorithms supported by PaddleOCR are: + +| Configuration file | Algorithm name | backbone | trans | seq | pred | language | +| :--------: | :-------: | :-------: | :-------: | :-----: | :-----: | :-----: | +| rec_chinese_cht_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | chinese traditional | +| rec_en_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | English(Case sensitive) | +| rec_french_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | French | +| rec_ger_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | German | +| rec_japan_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Japanese | +| rec_korean_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Korean | +| rec_latin_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Latin | +| rec_arabic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | arabic | +| rec_cyrillic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | cyrillic | +| rec_devanagari_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | devanagari | + +For more supported languages, please refer to : [Multi-language model](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/multi_languages_en.md#4-support-languages-and-abbreviations) + +The multi-language model training method is the same as the Chinese model. The training data set is 100w synthetic data. A small amount of fonts and test data can be downloaded using the following two methods. +* [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA),Extraction code:frgi. +* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) diff --git a/doc/doc_en/detection_en.md b/doc/doc_en/detection_en.md index b736beb55d79db02bf4d4301a74c685537fce249..df96fd5336cd64049e8f5d9b898f60c55b82b7b4 100644 --- a/doc/doc_en/detection_en.md +++ b/doc/doc_en/detection_en.md @@ -1,9 +1,32 @@ -# TEXT DETECTION +# Text Detection This section uses the icdar2015 dataset as an example to introduce the training, evaluation, and testing of the detection model in PaddleOCR. -## DATA PREPARATION -The icdar2015 dataset can be obtained from [official website](https://rrc.cvc.uab.es/?ch=4&com=downloads). Registration is required for downloading. +- [1. Data and Weights Preparation](#1-data-and-weights-preparatio) + * [1.1 Data Preparation](#11-data-preparation) + * [1.2 Download Pretrained Model](#12-download-pretrained-model) +- [2. Training](#2-training) + * [2.1 Start Training](#21-start-training) + * [2.2 Load Trained Model and Continue Training](#22-load-trained-model-and-continue-training) + * [2.3 Training with New Backbone](#23-training-with-new-backbone) +- [3. Evaluation and Test](#3-evaluation-and-test) + * [3.1 Evaluation](#31-evaluation) + * [3.2 Test](#32-test) +- [4. Inference](#4-inference) +- [5. FAQ](#2-faq) + +## 1. Data and Weights Preparation + +### 1.1 Data Preparation + +The icdar2015 dataset contains train set which has 1000 images obtained with wearable cameras and test set which has 500 images obtained with wearable cameras. The icdar2015 can be obtained from [official website](https://rrc.cvc.uab.es/?ch=4&com=downloads). Registration is required for downloading. + + +After registering and logging in, download the part marked in the red box in the figure below. 
And the content downloaded via `Training Set Images` should be saved as the folder `icdar_c4_train_imgs`, while the content downloaded via `Test Set Images` should be saved as the folder `ch4_test_images`. + +

+ +

Decompress the downloaded dataset to the working directory, assuming it is decompressed under PaddleOCR/train_data/. In addition, PaddleOCR organizes many scattered annotation files into two separate annotation files for train and test respectively, which can be downloaded by wget: ```shell @@ -36,10 +59,11 @@ The `points` in the dictionary represent the coordinates (x, y) of the four poin If you want to train PaddleOCR on other datasets, please build the annotation file according to the above format. -## TRAINING +### 1.2 Download Pretrained Model + +First download the pretrained model. The detection model of PaddleOCR currently supports 3 backbones, namely MobileNetV3, ResNet18_vd and ResNet50_vd. You can use the model in [PaddleClas](https://github.com/PaddlePaddle/PaddleClas/tree/release/2.0/ppcls/modeling/architectures) to replace backbone according to your needs. +And the responding download link of backbone pretrain weights can be found in (https://github.com/PaddlePaddle/PaddleClas/blob/release%2F2.0/README_cn.md#resnet%E5%8F%8A%E5%85%B6vd%E7%B3%BB%E5%88%97). -First download the pretrained model. The detection model of PaddleOCR currently supports 3 backbones, namely MobileNetV3, ResNet18_vd and ResNet50_vd. You can use the model in [PaddleClas](https://github.com/PaddlePaddle/PaddleClas/tree/develop/ppcls/modeling/architectures) to replace backbone according to your needs. -And the responding download link of backbone pretrain weights can be found in [PaddleClas repo](https://github.com/PaddlePaddle/PaddleClas#mobile-series). ```shell cd PaddleOCR/ # Download the pre-trained model of MobileNetV3 @@ -49,11 +73,16 @@ wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dyg # or, download the pre-trained model of ResNet50_vd wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_ssld_pretrained.pdparams +``` + +## 2. Training + +### 2.1 Start Training -#### START TRAINING *If CPU version installed, please set the parameter `use_gpu` to `false` in the configuration.* ```shell -python3 tools/train.py -c configs/det/det_mv3_db.yml +python3 tools/train.py -c configs/det/det_mv3_db.yml \ + -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained ``` In the above instruction, use `-c` to select the training to use the `configs/det/det_db_mv3.yml` configuration file. @@ -62,16 +91,17 @@ For a detailed explanation of the configuration file, please refer to [config](. You can also use `-o` to change the training parameters without modifying the yml file. For example, adjust the training learning rate to 0.0001 ```shell # single GPU training -python3 tools/train.py -c configs/det/det_mv3_db.yml -o Optimizer.base_lr=0.0001 +python3 tools/train.py -c configs/det/det_mv3_db.yml -o \ + Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained \ + Optimizer.base_lr=0.0001 # multi-GPU training # Set the GPU ID used by the '--gpus' parameter. -python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/det_mv3_db.yml -o Optimizer.base_lr=0.0001 - +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained ``` -#### load trained model and continue training +### 2.2 Load Trained Model and Continue Training If you expect to load trained model and continue the training again, you can specify the parameter `Global.checkpoints` as the model path to be loaded. 
For example: @@ -79,12 +109,64 @@ For example: python3 tools/train.py -c configs/det/det_mv3_db.yml -o Global.checkpoints=./your/trained/model ``` -**Note**: The priority of `Global.checkpoints` is higher than that of `Global.pretrain_weights`, that is, when two parameters are specified at the same time, the model specified by `Global.checkpoints` will be loaded first. If the model path specified by `Global.checkpoints` is wrong, the one specified by `Global.pretrain_weights` will be loaded. +**Note**: The priority of `Global.checkpoints` is higher than that of `Global.pretrained_model`, that is, when two parameters are specified at the same time, the model specified by `Global.checkpoints` will be loaded first. If the model path specified by `Global.checkpoints` is wrong, the one specified by `Global.pretrained_model` will be loaded. + +### 2.3 Training with New Backbone + +The network part completes the construction of the network, and PaddleOCR divides the network into four parts, which are under [ppocr/modeling](../../ppocr/modeling). The data entering the network will pass through these four parts in sequence(transforms->backbones-> +necks->heads). + +```bash +├── architectures # Code for building network +├── transforms # Image Transformation Module +├── backbones # Feature extraction module +├── necks # Feature enhancement module +└── heads # Output module +``` + +If the Backbone to be replaced has a corresponding implementation in PaddleOCR, you can directly modify the parameters in the `Backbone` part of the configuration yml file. + +However, if you want to use a new Backbone, an example of replacing the backbones is as follows: + +1. Create a new file under the [ppocr/modeling/backbones](../../ppocr/modeling/backbones) folder, such as my_backbone.py. +2. Add code in the my_backbone.py file, the sample code is as follows: + +```python +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class MyBackbone(nn.Layer): + def __init__(self, *args, **kwargs): + super(MyBackbone, self).__init__() + # your init code + self.conv = nn.xxxx + + def forward(self, inputs): + # your network forward + y = self.conv(inputs) + return y +``` -## EVALUATION +3. Import the added module in the [ppocr/modeling/backbones/\__init\__.py](../../ppocr/modeling/backbones/__init__.py) file. -PaddleOCR calculates three indicators for evaluating performance of OCR detection task: Precision, Recall, and Hmean. +After adding the four-part modules of the network, you only need to configure them in the configuration file to use, such as: + +```yaml + Backbone: + name: MyBackbone + args1: args1 +``` + +**NOTE**: More details about replace Backbone and other mudule can be found in [doc](add_new_algorithm_en.md). + +## 3. Evaluation and Test + +### 3.1 Evaluation + +PaddleOCR calculates three indicators for evaluating performance of OCR detection task: Precision, Recall, and Hmean(F-Score). Run the following code to calculate the evaluation indicators. The result will be saved in the test result file specified by `save_res_path` in the configuration file `det_db_mv3.yml` @@ -95,10 +177,9 @@ The model parameters during training are saved in the `Global.save_model_dir` di python3 tools/eval.py -c configs/det/det_mv3_db.yml -o Global.checkpoints="{path/to/weights}/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=1.5 ``` +* Note: `box_thresh` and `unclip_ratio` are parameters required for DB post-processing, and not need to be set when evaluating the EAST and SAST model. 
-* Note: `box_thresh` and `unclip_ratio` are parameters required for DB post-processing, and not need to be set when evaluating the EAST model. - -## TEST +### 3.2 Test Test the detection result on a single image: ```shell @@ -107,7 +188,7 @@ python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o Global.infer_img="./ When testing the DB model, adjust the post-processing threshold: ```shell -python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o Global.infer_img="./doc/imgs_en/img_10.jpg" Global.pretrained_model="./output/det_db/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=1.5 +python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o Global.infer_img="./doc/imgs_en/img_10.jpg" Global.pretrained_model="./output/det_db/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=2.0 ``` @@ -115,3 +196,33 @@ Test the detection result on all images in the folder: ```shell python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o Global.infer_img="./doc/imgs_en/" Global.pretrained_model="./output/det_db/best_accuracy" ``` + +## 4. Inference + +The inference model (the model saved by `paddle.jit.save`) is generally a solidified model saved after the model training is completed, and is mostly used to give prediction in deployment. + +The model saved during the training process is the checkpoints model, which saves the parameters of the model and is mostly used to resume training. + +Compared with the checkpoints model, the inference model will additionally save the structural information of the model. Therefore, it is easier to deploy because the model structure and model parameters are already solidified in the inference model file, and is suitable for integration with actual systems. + +Firstly, we can convert DB trained model to inference model: +```shell +python3 tools/export_model.py -c configs/det/det_mv3_db.yml -o Global.pretrained_model="./output/det_db/best_accuracy" Global.save_inference_dir="./output/det_db_inference/" +``` + +The detection inference model prediction: +```shell +python3 tools/infer/predict_det.py --det_algorithm="DB" --det_model_dir="./output/det_db_inference/" --image_dir="./doc/imgs/" --use_gpu=True +``` + +If it is other detection algorithms, such as the EAST, the det_algorithm parameter needs to be modified to EAST, and the default is the DB algorithm: +```shell +python3 tools/infer/predict_det.py --det_algorithm="EAST" --det_model_dir="./output/det_db_inference/" --image_dir="./doc/imgs/" --use_gpu=True +``` + +## 5. FAQ + +Q1: The prediction results of trained model and inference model are inconsistent? +**A**: Most of the problems are caused by the inconsistency of the pre-processing and post-processing parameters during the prediction of the trained model and the pre-processing and post-processing parameters during the prediction of the inference model. Taking the model trained by the det_mv3_db.yml configuration file as an example, the solution to the problem of inconsistent prediction results between the training model and the inference model is as follows: +- Check whether the [trained model preprocessing](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/configs/det/det_mv3_db.yml#L116) is consistent with the prediction [preprocessing function of the inference model](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/tools/infer/predict_det.py#L42). When the algorithm is evaluated, the input image size will affect the accuracy. 
In order to be consistent with the paper, the image is resized to [736, 1280] in the training icdar15 configuration file, but there is only a set of default parameters when the inference model predicts, which will be considered To predict the speed problem, the longest side of the image is limited to 960 for resize by default. The preprocessing function of the training model preprocessing and the inference model is located in [ppocr/data/imaug/operators.py](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/ppocr/data/imaug/operators.py#L147) +- Check whether the [post-processing of the trained model](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/configs/det/det_mv3_db.yml#L51) is consistent with the [post-processing parameters of the inference](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/tools/infer/utility.py#L50). diff --git a/doc/doc_en/environment_en.md b/doc/doc_en/environment_en.md new file mode 100644 index 0000000000000000000000000000000000000000..9aad92cafb809a0eec519808b1c1755403b39318 --- /dev/null +++ b/doc/doc_en/environment_en.md @@ -0,0 +1,348 @@ +# Environment Preparation + +Recommended working environment: +- PaddlePaddle >= 2.0.0 (2.1.2) +- python3.7 +- CUDA10.1 / CUDA10.2 +- CUDNN 7.6 + +* [1. Python Environment Setup](#1) + + [1.1 Windows](#1.1) + + [1.2 Mac](#1.2) + + [1.3 Linux](#1.3) +* [2. Install PaddlePaddle 2.0](#2) + + + + +## 1. Python Environment Setup + + + +### 1.1 Windows + +#### 1.1.1 Install Anaconda + +- Note: To use paddlepaddle you need to install python environment first, here we choose python integrated environment Anaconda toolkit + + - Anaconda is a common python package manager + - After installing Anaconda, you can install the python environment, as well as numpy and other required toolkit environment. + +- Anaconda download. + + - Address: https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/?C=M&O=D + + - Most Win10 computers are 64-bit operating systems, choose x86_64 version; if the computer is a 32-bit operating system, choose x86.exe + + anaconda download + + - After the download is complete, double-click the installer to enter the graphical interface + + - The default installation location is C drive, it is recommended to change the installation location to D drive. + + install config + + - Check conda to add environment variables and ignore the warning that + + add conda to path + + +#### 1.1.2 Opening the terminal and creating the conda environment + +- Open Anaconda Prompt terminal: bottom left Windows Start Menu -> Anaconda3 -> Anaconda Prompt start console + + anaconda download + + +- Create a new conda environment + + ```shell + # Enter the following command at the command line to create an environment named paddle_env + # Here to speed up the download, use the Tsinghua source + conda create --name paddle_env python=3.8 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ # This is a one line command + ``` + + This command will create an executable environment named paddle_env with python version 3.8, which will take a while depending on the network status + + The command line will then output a prompt, type y and enter to continue the installation + + conda create + +- To activate the conda environment you just created, enter the following command at the command line. 
+ + ```shell + # Activate the paddle_env environment + conda activate paddle_env + # View the current location of python + where python + ``` + + create environment + +The above anaconda environment and python environment are installed + + + + + +### 1.2 Mac + +#### 1.2.1 Installing Anaconda + +- Note: To use paddlepaddle you need to install the python environment first, here we choose the python integrated environment Anaconda toolkit + + - Anaconda is a common python package manager + - After installing Anaconda, you can install the python environment, as well as numpy and other required toolkit environment + +- Anaconda download:. + + - Address: https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/?C=M&O=D + + anaconda download + + - Select `Anaconda3-2021.05-MacOSX-x86_64.pkg` at the bottom to download + +- After downloading, double click on the .pkg file to enter the graphical interface + + - Just follow the default settings, it will take a while to install + +- It is recommended to install a code editor such as vscode or pycharm + +#### 1.2.2 Open a terminal and create a conda environment + +- Open the terminal + + - Press command and spacebar at the same time, type "terminal" in the focus search, double click to enter terminal + +- **Add conda to the environment variables** + + - Environment variables are added so that the system can recognize the conda command + + - Open `~/.bash_profile` in the terminal by typing the following command. + + ```shell + vim ~/.bash_profile + ``` + + - Add conda as an environment variable in `~/.bash_profile`. + + ```shell + # Press i first to enter edit mode + # In the first line type. + export PATH="~/opt/anaconda3/bin:$PATH" + # If you customized the installation location during installation, change ~/opt/anaconda3/bin to the bin folder in the customized installation directory + ``` + + ```shell + # The modified ~/.bash_profile file should look like this (where xxx is the username) + export PATH="~/opt/anaconda3/bin:$PATH" + # >>> conda initialize >>> + # !!! Contents within this block are managed by 'conda init' !!! + __conda_setup="$('/Users/xxx/opt/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" + if [ $? -eq 0 ]; then + eval "$__conda_setup" + else + if [ -f "/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" ]; then + . "/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" + else + export PATH="/Users/xxx/opt/anaconda3/bin:$PATH" + fi + fi + unset __conda_setup + # <<< conda initialize <<< + ``` + + - When you are done, press `esc` to exit edit mode, then type `:wq!` and enter to save and exit + + - Verify that the conda command is recognized. 
+ + - Enter `source ~/.bash_profile` in the terminal to update the environment variables + - Enter `conda info --envs` in the terminal again, if it shows that there is a base environment, then conda has been added to the environment variables + +- Create a new conda environment + + ```shell + # Enter the following command at the command line to create an environment called paddle_env + # Here to speed up the download, use Tsinghua source + conda create --name paddle_env python=3.8 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ + ``` + + - This command will create an executable environment named paddle_env with python version 3.8, which will take a while depending on the network status + + - The command line will then output a prompt, type y and enter to continue the installation + + - conda_create + +- To activate the conda environment you just created, enter the following command at the command line. + + ```shell + # Activate the paddle_env environment + conda activate paddle_env + # View the current location of python + where python + ``` + + conda_actviate + +The above anaconda environment and python environment are installed + + + + + +### 1.3 Linux + +Linux users can choose to run either Anaconda or Docker. If you are familiar with Docker and need to train the PaddleOCR model, it is recommended to use the Docker environment, where the development process of PaddleOCR is run. If you are not familiar with Docker, you can also use Anaconda to run the project. + +#### 1.3.1 Anaconda environment configuration + +- Note: To use paddlepaddle you need to install the python environment first, here we choose the python integrated environment Anaconda toolkit + + - Anaconda is a common python package manager + - After installing Anaconda, you can install the python environment, as well as numpy and other required toolkit environment + +- **Download Anaconda**. + + - Download at: https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/?C=M&O=D + + + + + + - Select the appropriate version for your operating system + - Type `uname -m` in the terminal to check the command set used by your system + + - Download method 1: Download locally, then transfer the installation package to the linux server + + - Download method 2: Directly use linux command line to download + + ```shell + # First install wget + sudo apt-get install wget # Ubuntu + sudo yum install wget # CentOS + ``` + ```bash + # Then use wget to download from Tsinghua source + # If you want to download Anaconda3-2021.05-Linux-x86_64.sh, the download command is as follows + wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2021.05-Linux-x86_64.sh + # If you want to download another version, you need to change the file name after the last 1 / to the version you want to download + ``` + +- To install Anaconda. + + - Type `sh Anaconda3-2021.05-Linux-x86_64.sh` at the command line + - If you downloaded a different version, replace the file name of the command with the name of the file you downloaded + - Just follow the installation instructions + - You can exit by typing q when viewing the license + +- **Add conda to the environment variables** + + - If you have already added conda to the environment variable path during the installation, you can skip this step + + - Open `~/.bashrc` in a terminal. + + ```shell + # Enter the following command in the terminal. + vim ~/.bashrc + ``` + + - Add conda as an environment variable in `~/.bashrc`. 
+ + ```shell + # Press i first to enter edit mode # In the first line enter. + export PATH="~/anaconda3/bin:$PATH" + # If you customized the installation location during installation, change ~/anaconda3/bin to the bin folder in the customized installation directory + ``` + + ```shell + # The modified ~/.bash_profile file should look like this (where xxx is the username) + export PATH="~/opt/anaconda3/bin:$PATH" + # >>> conda initialize >>> + # !!! Contents within this block are managed by 'conda init' !!! + __conda_setup="$('/Users/xxx/opt/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" + if [ $? -eq 0 ]; then + eval "$__conda_setup" + else + if [ -f "/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" ]; then + . "/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" + else + export PATH="/Users/xxx/opt/anaconda3/bin:$PATH" + fi + fi + unset __conda_setup + # <<< conda initialize <<< + ``` + + - When you are done, press `esc` to exit edit mode, then type `:wq!` and enter to save and exit + + - Verify that the conda command is recognized. + + - Enter `source ~/.bash_profile` in the terminal to update the environment variables + - Enter `conda info --envs` in the terminal again, if it shows that there is a base environment, then conda has been added to the environment variables + +- Create a new conda environment + + ```shell + # Enter the following command at the command line to create an environment called paddle_env + # Here to speed up the download, use Tsinghua source + conda create --name paddle_env python=3.8 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ + ``` + + - This command will create an executable environment named paddle_env with python version 3.8, which will take a while depending on the network status + + - The command line will then output a prompt, type y and enter to continue the installation + + conda_create + +- To activate the conda environment you just created, enter the following command at the command line. + + ```shell + # Activate the paddle_env environment + conda activate paddle_env + ``` + +The above anaconda environment and python environment are installed + + +#### 1.3.2 Docker environment preparation + +**The first time you use this docker image, it will be downloaded automatically. Please be patient.** + +```bash +# Switch to the working directory +cd /home/Projects +# You need to create a docker container for the first run, and do not need to run the current command when you run it again +# Create a docker container named ppocr and map the current directory to the /paddle directory of the container + +# If using CPU, use docker instead of nvidia-docker to create docker +sudo docker run --name ppocr -v $PWD:/paddle --network=host -it registry.baidubce.com/paddlepaddle/paddle:2.1.3-gpu-cuda10.2-cudnn7 /bin/bash + +# If using GPU, use nvidia-docker to create docker +# docker image registry.baidubce.com/paddlepaddle/paddle:2.1.3-gpu-cuda11.2-cudnn8 is recommended for CUDA11.2 + CUDNN8. +sudo nvidia-docker run --name ppocr -v $PWD:/paddle --shm-size=64G --network=host -it registry.baidubce.com/paddlepaddle/paddle:2.1.3-gpu-cuda10.2-cudnn7 /bin/bash + +``` +You can also visit [DockerHub](https://hub.docker.com/r/paddlepaddle/paddle/tags/) to get the image that fits your machine. + +``` +# ctrl+P+Q to exit docker, to re-enter docker using the following command: +sudo docker container exec -it ppocr /bin/bash +``` + + + +## 2. 
Install PaddlePaddle 2.0 + +- If you have cuda9 or cuda10 installed on your machine, please run the following command to install + +```bash +python3 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple +``` + +- If you only have cpu on your machine, please run the following command to install + +```bash +python3 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple +``` + +For more software version requirements, please refer to the instructions in [Installation Document](https://www.paddlepaddle.org.cn/install/quick) for operation. diff --git a/doc/doc_en/inference_en.md b/doc/doc_en/inference_en.md index e30355fb8e29031bd4ce040a86ad0f57d18ce398..019ac4d0ac15aceed89286048d2c4d88a259e501 100755 --- a/doc/doc_en/inference_en.md +++ b/doc/doc_en/inference_en.md @@ -1,5 +1,5 @@ -# Reasoning based on Python prediction engine +# Inference Based on Python Prediction Engine The inference model (the model saved by `paddle.jit.save`) is generally a solidified model saved after the model training is completed, and is mostly used to give prediction in deployment. @@ -10,37 +10,36 @@ For more details, please refer to the document [Classification Framework](https: Next, we first introduce how to convert a trained model into an inference model, and then we will introduce text detection, text recognition, angle class, and the concatenation of them based on inference model. -- [CONVERT TRAINING MODEL TO INFERENCE MODEL](#CONVERT) - - [Convert detection model to inference model](#Convert_detection_model) - - [Convert recognition model to inference model](#Convert_recognition_model) - - [Convert angle classification model to inference model](#Convert_angle_class_model) +- [1. Convert Training Model to Inference Model](#CONVERT) + - [1.1 Convert Detection Model to Inference Model](#Convert_detection_model) + - [1.2 Convert Recognition Model to Inference Model](#Convert_recognition_model) + - [1.3 Convert Angle Classification Model to Inference Model](#Convert_angle_class_model) -- [TEXT DETECTION MODEL INFERENCE](#DETECTION_MODEL_INFERENCE) - - [1. LIGHTWEIGHT CHINESE DETECTION MODEL INFERENCE](#LIGHTWEIGHT_DETECTION) - - [2. DB TEXT DETECTION MODEL INFERENCE](#DB_DETECTION) - - [3. EAST TEXT DETECTION MODEL INFERENCE](#EAST_DETECTION) - - [4. SAST TEXT DETECTION MODEL INFERENCE](#SAST_DETECTION) - - [5. Multilingual model inference](#Multilingual model inference) +- [2. Text Detection Model Inference](#DETECTION_MODEL_INFERENCE) + - [2.1 Lightweight Chinese Detection Model Inference](#LIGHTWEIGHT_DETECTION) + - [2.2 DB Text Detection Model Inference](#DB_DETECTION) + - [2.3 East Text Detection Model Inference](#EAST_DETECTION) + - [2.4 Sast Text Detection Model Inference](#SAST_DETECTION) -- [TEXT RECOGNITION MODEL INFERENCE](#RECOGNITION_MODEL_INFERENCE) - - [1. LIGHTWEIGHT CHINESE MODEL](#LIGHTWEIGHT_RECOGNITION) - - [2. CTC-BASED TEXT RECOGNITION MODEL INFERENCE](#CTC-BASED_RECOGNITION) - - [3. SRN-BASED TEXT RECOGNITION MODEL INFERENCE](#SRN-BASED_RECOGNITION) - - [3. TEXT RECOGNITION MODEL INFERENCE USING CUSTOM CHARACTERS DICTIONARY](#USING_CUSTOM_CHARACTERS) - - [4. MULTILINGUAL MODEL INFERENCE](MULTILINGUAL_MODEL_INFERENCE) +- [3. 
Text Recognition Model Inference](#RECOGNITION_MODEL_INFERENCE) + - [3.1 Lightweight Chinese Text Recognition Model Reference](#LIGHTWEIGHT_RECOGNITION) + - [3.2 CTC-Based Text Recognition Model Inference](#CTC-BASED_RECOGNITION) + - [3.3 SRN-Based Text Recognition Model Inference](#SRN-BASED_RECOGNITION) + - [3.4 Text Recognition Model Inference Using Custom Characters Dictionary](#USING_CUSTOM_CHARACTERS) + - [3.5 Multilingual Model Inference](#MULTILINGUAL_MODEL_INFERENCE) -- [ANGLE CLASSIFICATION MODEL INFERENCE](#ANGLE_CLASS_MODEL_INFERENCE) - - [1. ANGLE CLASSIFICATION MODEL INFERENCE](#ANGLE_CLASS_MODEL_INFERENCE) +- [4. Angle Classification Model Inference](#ANGLE_CLASS_MODEL_INFERENCE) -- [TEXT DETECTION ANGLE CLASSIFICATION AND RECOGNITION INFERENCE CONCATENATION](#CONCATENATION) - - [1. LIGHTWEIGHT CHINESE MODEL](#LIGHTWEIGHT_CHINESE_MODEL) - - [2. OTHER MODELS](#OTHER_MODELS) +- [5. Text Detection Angle Classification And Recognition Inference Concatenation](#CONCATENATION) + - [5.1 Lightweight Chinese Model](#LIGHTWEIGHT_CHINESE_MODEL) + - [5.2 Other Models](#OTHER_MODELS) -## CONVERT TRAINING MODEL TO INFERENCE MODEL +## 1. Convert Training Model to Inference Model -### Convert detection model to inference model + +### 1.1 Convert Detection Model to Inference Model Download the lightweight Chinese detection model: ``` @@ -67,7 +66,7 @@ inference/det_db/ ``` -### Convert recognition model to inference model +### 1.2 Convert Recognition Model to Inference Model Download the lightweight Chinese recognition model: ``` @@ -95,7 +94,7 @@ inference/det_db/ ``` -### Convert angle classification model to inference model +### 1.3 Convert Angle Classification Model to Inference Model Download the angle classification model: ``` @@ -122,13 +121,13 @@ inference/det_db/ -## TEXT DETECTION MODEL INFERENCE +## 2. Text Detection Model Inference The following will introduce the lightweight Chinese detection model inference, DB text detection model inference and EAST text detection model inference. The default configuration is based on the inference setting of the DB text detection model. Because EAST and DB algorithms are very different, when inference, it is necessary to **adapt the EAST text detection algorithm by passing in corresponding parameters**. -### 1. LIGHTWEIGHT CHINESE DETECTION MODEL INFERENCE +### 2.1 Lightweight Chinese Detection Model Inference For lightweight Chinese detection model inference, you can execute the following commands: @@ -163,7 +162,7 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_di ``` -### 2. DB TEXT DETECTION MODEL INFERENCE +### 2.2 DB Text Detection Model Inference First, convert the model saved in the DB text detection training process into an inference model. Taking the model based on the Resnet50_vd backbone network and trained on the ICDAR2015 English dataset as an example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar)), you can use the following command to convert: @@ -184,7 +183,7 @@ The visualized text detection results are saved to the `./inference_results` fol **Note**: Since the ICDAR2015 dataset has only 1,000 training images, mainly for English scenes, the above model has very poor detection result on Chinese text images. -### 3. EAST TEXT DETECTION MODEL INFERENCE +### 2.3 EAST TEXT DETECTION MODEL INFERENCE First, convert the model saved in the EAST text detection training process into an inference model. 
Taking the model based on the Resnet50_vd backbone network and trained on the ICDAR2015 English dataset as an example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)), you can use the following command to convert: @@ -205,7 +204,7 @@ The visualized text detection results are saved to the `./inference_results` fol -### 4. SAST TEXT DETECTION MODEL INFERENCE +### 2.4 Sast Text Detection Model Inference #### (1). Quadrangle text detection model (ICDAR2015) First, convert the model saved in the SAST text detection training process into an inference model. Taking the model based on the Resnet50_vd backbone network and trained on the ICDAR2015 English dataset as an example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)), you can use the following command to convert: @@ -243,13 +242,13 @@ The visualized text detection results are saved to the `./inference_results` fol **Note**: SAST post-processing locality aware NMS has two versions: Python and C++. The speed of C++ version is obviously faster than that of Python version. Due to the compilation version problem of NMS of C++ version, C++ version NMS will be called only in Python 3.5 environment, and python version NMS will be called in other cases. -## TEXT RECOGNITION MODEL INFERENCE +## 3. Text Recognition Model Inference The following will introduce the lightweight Chinese recognition model inference, other CTC-based and Attention-based text recognition models inference. For Chinese text recognition, it is recommended to choose the recognition model based on CTC loss. In practice, it is also found that the result of the model based on Attention loss is not as good as the one based on CTC loss. In addition, if the characters dictionary is modified during training, make sure that you use the same characters set during inferencing. Please check below for details. -### 1. LIGHTWEIGHT CHINESE TEXT RECOGNITION MODEL REFERENCE +### 3.1 Lightweight Chinese Text Recognition Model Reference For lightweight Chinese recognition model inference, you can execute the following commands: @@ -269,7 +268,7 @@ Predicts of ./doc/imgs_words_en/word_10.png:('PAIN', 0.9897658) ``` -### 2. CTC-BASED TEXT RECOGNITION MODEL INFERENCE +### 3.2 CTC-Based Text Recognition Model Inference Taking CRNN as an example, we introduce the recognition model inference based on CTC loss. Rosetta and Star-Net are used in a similar way, No need to set the recognition algorithm parameter rec_algorithm. 
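+To make the CTC-based prediction more concrete, here is a stand-alone sketch of greedy CTC decoding: take the most likely label at every timestep, collapse repeated labels, and drop the blank. It is only an illustration written for this document; the function name `ctc_greedy_decode` and the toy data are made up, and it is not the decoder actually used by `tools/infer/predict_rec.py`.
+
+```python
+import numpy as np
+
+def ctc_greedy_decode(probs, charset, blank=0):
+    """probs: (T, C) per-timestep class probabilities; charset: index -> character."""
+    best_path = probs.argmax(axis=1)      # most likely label at every timestep
+    chars, prev = [], blank
+    for idx in best_path:
+        if idx != blank and idx != prev:  # drop blanks and collapse repeated labels
+            chars.append(charset[idx])
+        prev = idx
+    return "".join(chars)
+
+# toy example: 5 timesteps, 3 classes (index 0 is the CTC blank)
+probs = np.array([[0.1, 0.8, 0.1],
+                  [0.1, 0.8, 0.1],
+                  [0.9, 0.05, 0.05],
+                  [0.1, 0.1, 0.8],
+                  [0.1, 0.1, 0.8]])
+print(ctc_greedy_decode(probs, {1: "a", 2: "b"}))  # prints "ab"
+```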
@@ -282,7 +281,7 @@ python3 tools/export_model.py -c configs/det/rec_r34_vd_none_bilstm_ctc.yml -o G For CRNN text recognition model inference, execute the following commands: ``` -python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/starnet/" --rec_image_shape="3, 32, 100" --rec_char_type="en" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/starnet/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt" ``` ![](../imgs_words_en/word_336.png) @@ -292,6 +291,7 @@ After executing the command, the recognition result of the above image is as fol ```bash Predicts of ./doc/imgs_words_en/word_336.png:('super', 0.9999073) ``` + **Note**:Since the above model refers to [DTRB](https://arxiv.org/abs/1904.01906) text recognition training and evaluation process, it is different from the training of lightweight Chinese recognition model in two aspects: - The image resolution used in training is different: the image resolution used in training the above model is [3,32,100], while during our Chinese model training, in order to ensure the recognition effect of long text, the image resolution used in training is [3, 32, 320]. The default shape parameter of the inference stage is the image resolution used in training phase, that is [3, 32, 320]. Therefore, when running inference of the above English model here, you need to set the shape of the recognition image through the parameter `rec_image_shape`. @@ -304,7 +304,7 @@ dict_character = list(self.character_str) ``` -### 3. SRN-BASED TEXT RECOGNITION MODEL INFERENCE +### 3.3 SRN-Based Text Recognition Model Inference The recognition model based on SRN requires additional setting of the recognition algorithm parameter --rec_algorithm="SRN". At the same time, it is necessary to ensure that the predicted shape is consistent @@ -314,25 +314,26 @@ with the training, such as: --rec_image_shape="1, 64, 256" python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" \ --rec_model_dir="./inference/srn/" \ --rec_image_shape="1, 64, 256" \ - --rec_char_type="en" \ + --rec_char_dict_path="./ppocr/utils/ic15_dict.txt" \ --rec_algorithm="SRN" ``` -### 4. TEXT RECOGNITION MODEL INFERENCE USING CUSTOM CHARACTERS DICTIONARY +### 3.4 Text Recognition Model Inference Using Custom Characters Dictionary If the text dictionary is modified during training, when using the inference model to predict, you need to specify the dictionary path used by `--rec_char_dict_path`, and set `rec_char_type=ch` ``` -python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./your inference model" --rec_image_shape="3, 32, 100" --rec_char_type="ch" --rec_char_dict_path="your text dict path" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./your inference model" --rec_image_shape="3, 32, 100" --rec_char_dict_path="your text dict path" ``` -### 5. MULTILINGAUL MODEL INFERENCE + +### 3.5 Multilingual Model Inference If you need to predict other language models, when using inference model prediction, you need to specify the dictionary path used by `--rec_char_dict_path`. At the same time, in order to get the correct visualization results, You need to specify the visual font path through `--vis_font_path`. 
There are small language fonts provided by default under the `doc/fonts` path, such as Korean recognition: ``` -python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/korean/1.jpg" --rec_model_dir="./your inference model" --rec_char_type="korean" --rec_char_dict_path="ppocr/utils/dict/korean_dict.txt" --vis_font_path="doc/fonts/korean.ttf" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/korean/1.jpg" --rec_model_dir="./your inference model" --rec_char_dict_path="ppocr/utils/dict/korean_dict.txt" --vis_font_path="doc/fonts/korean.ttf" ``` ![](../imgs_words/korean/1.jpg) @@ -343,13 +344,7 @@ Predicts of ./doc/imgs_words/korean/1.jpg:('바탕으로', 0.9948904) ``` -## ANGLE CLASSIFICATION MODEL INFERENCE - -The following will introduce the angle classification model inference. - - - -### 1.ANGLE CLASSIFICATION MODEL INFERENCE +## 4. Angle Classification Model Inference For angle classification model inference, you can execute the following commands: @@ -371,10 +366,10 @@ After executing the command, the prediction results (classification angle and sc ``` -## TEXT DETECTION ANGLE CLASSIFICATION AND RECOGNITION INFERENCE CONCATENATION +## 5. Text Detection Angle Classification and Recognition Inference Concatenation -### 1. LIGHTWEIGHT CHINESE MODEL +### 5.1 Lightweight Chinese Model When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`, the parameter `det_model_dir` specifies the path to detect the inference model, the parameter `cls_model_dir` specifies the path to angle classification inference model and the parameter `rec_model_dir` specifies the path to identify the inference model. The parameter `use_angle_cls` is used to control whether to enable the angle classification model. The parameter `use_mp` specifies whether to use multi-process to infer `total_process_num` specifies process number when using multi-process. The parameter . The visualized recognition results are saved to the `./inference_results` folder by default. @@ -388,14 +383,14 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --de # use multi-process python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --rec_model_dir="./inference/rec_crnn/" --use_angle_cls=false --use_mp=True --total_process_num=6 ``` -``` + After executing the command, the recognition result image is as follows: ![](../imgs_results/system_res_00018069.jpg) -### 2. OTHER MODELS +### 5.2 Other Models If you want to try other detection algorithms or recognition algorithms, please refer to the above text detection model inference and text recognition model inference, update the corresponding configuration and model. 
@@ -404,7 +399,7 @@ If you want to try other detection algorithms or recognition algorithms, please The following command uses the combination of the EAST text detection and STAR-Net text recognition: ``` -python3 tools/infer/predict_system.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_east/" --det_algorithm="EAST" --rec_model_dir="./inference/starnet/" --rec_image_shape="3, 32, 100" --rec_char_type="en" +python3 tools/infer/predict_system.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_east/" --det_algorithm="EAST" --rec_model_dir="./inference/starnet/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt" ``` After executing the command, the recognition result image is as follows: diff --git a/doc/doc_en/inference_ppocr_en.md b/doc/doc_en/inference_ppocr_en.md new file mode 100755 index 0000000000000000000000000000000000000000..fa3b1c88713f01e8e411cf95d107b4b58dd7f4e1 --- /dev/null +++ b/doc/doc_en/inference_ppocr_en.md @@ -0,0 +1,135 @@ + +# Python Inference for PP-OCR Model Library + +This article introduces the use of the Python inference engine for the PP-OCR model library. The content is in order of text detection, text recognition, direction classifier and the prediction method of the three in series on the CPU and GPU. + + +- [Text Detection Model Inference](#DETECTION_MODEL_INFERENCE) + +- [Text Recognition Model Inference](#RECOGNITION_MODEL_INFERENCE) + - [1. Lightweight Chinese Recognition Model Inference](#LIGHTWEIGHT_RECOGNITION) + - [2. Multilingaul Model Inference](#MULTILINGUAL_MODEL_INFERENCE) + +- [Angle Classification Model Inference](#ANGLE_CLASS_MODEL_INFERENCE) + +- [Text Detection Angle Classification and Recognition Inference Concatenation](#CONCATENATION) + + + +## Text Detection Model Inference + +The default configuration is based on the inference setting of the DB text detection model. For lightweight Chinese detection model inference, you can execute the following commands: + +``` +# download DB text detection inference model +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar +tar xf ch_ppocr_mobile_v2.0_det_infer.tar +# predict +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" +``` + +The visual text detection results are saved to the ./inference_results folder by default, and the name of the result file is prefixed with'det_res'. Examples of results are as follows: + +![](../imgs_results/det_res_00018069.jpg) + +You can use the parameters `limit_type` and `det_limit_side_len` to limit the size of the input image, +The optional parameters of `limit_type` are [`max`, `min`], and +`det_limit_size_len` is a positive integer, generally set to a multiple of 32, such as 960. + +The default setting of the parameters is `limit_type='max', det_limit_side_len=960`. Indicates that the longest side of the network input image cannot exceed 960, +If this value is exceeded, the image will be resized with the same width ratio to ensure that the longest side is `det_limit_side_len`. +Set as `limit_type='min', det_limit_side_len=960`, it means that the shortest side of the image is limited to 960. 
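+As a rough illustration of the resizing rule described above, the sketch below (written for this document, not taken from `tools/infer/predict_det.py`; the helper name `limit_resize` is an assumption) shows how the two `limit_type` modes pick a scaling ratio and round the target size to a multiple of 32 while keeping the aspect ratio:
+
+```python
+import cv2  # assuming OpenCV is available for the resize itself
+
+def limit_resize(img, limit_type="max", limit_side_len=960):
+    h, w = img.shape[:2]
+    if limit_type == "max":
+        # shrink only when the longest side exceeds the limit
+        ratio = limit_side_len / max(h, w) if max(h, w) > limit_side_len else 1.0
+    else:
+        # "min": enlarge only when the shortest side is below the limit
+        ratio = limit_side_len / min(h, w) if min(h, w) < limit_side_len else 1.0
+    # round the target size to a multiple of 32, which the detection network generally expects
+    resize_h = max(int(round(h * ratio / 32)) * 32, 32)
+    resize_w = max(int(round(w * ratio / 32)) * 32, 32)
+    return cv2.resize(img, (resize_w, resize_h))
+```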
+ +If the resolution of the input picture is relatively large and you want to use a larger resolution prediction, you can set det_limit_side_len to the desired value, such as 1216: +``` +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./inference/det_db/" --det_limit_type=max --det_limit_side_len=1216 +``` + +If you want to use the CPU for prediction, execute the command as follows +``` +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./inference/det_db/" --use_gpu=False +``` + + + +## Text Recognition Model Inference + + + +### 1. Lightweight Chinese Recognition Model Inference + +For lightweight Chinese recognition model inference, you can execute the following commands: + +``` +# download CRNN text recognition inference model +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar +tar xf ch_ppocr_mobile_v2.0_rec_infer.tar +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_10.png" --rec_model_dir="ch_ppocr_mobile_v2.0_rec_infer" +``` + +![](../imgs_words_en/word_10.png) + +After executing the command, the prediction results (recognized text and score) of the above image will be printed on the screen. + +```bash +Predicts of ./doc/imgs_words_en/word_10.png:('PAIN', 0.9897658) +``` + + + +### 2. Multilingaul Model Inference +If you need to predict other language models, when using inference model prediction, you need to specify the dictionary path used by `--rec_char_dict_path`. At the same time, in order to get the correct visualization results, +You need to specify the visual font path through `--vis_font_path`. There are small language fonts provided by default under the `doc/fonts` path, such as Korean recognition: + +``` +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/korean/1.jpg" --rec_model_dir="./your inference model" --rec_char_type="korean" --rec_char_dict_path="ppocr/utils/dict/korean_dict.txt" --vis_font_path="doc/fonts/korean.ttf" +``` +![](../imgs_words/korean/1.jpg) + +After executing the command, the prediction result of the above figure is: + +``` text +Predicts of ./doc/imgs_words/korean/1.jpg:('바탕으로', 0.9948904) +``` + + + +## Angle Classification Model Inference + +For angle classification model inference, you can execute the following commands: + + +``` +# download text angle class inference model: +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar xf ch_ppocr_mobile_v2.0_cls_infer.tar +python3 tools/infer/predict_cls.py --image_dir="./doc/imgs_words_en/word_10.png" --cls_model_dir="ch_ppocr_mobile_v2.0_cls_infer" +``` +![](../imgs_words_en/word_10.png) + +After executing the command, the prediction results (classification angle and score) of the above image will be printed on the screen. + +``` + Predicts of ./doc/imgs_words_en/word_10.png:['0', 0.9999995] +``` + + +## Text Detection Angle Classification and Recognition Inference Concatenation + +When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`, the parameter `det_model_dir` specifies the path to detect the inference model, the parameter `cls_model_dir` specifies the path to angle classification inference model and the parameter `rec_model_dir` specifies the path to identify the inference model. The parameter `use_angle_cls` is used to control whether to enable the angle classification model. 
The parameter `use_mp` specifies whether to use multi-process inference, and `total_process_num` specifies the number of processes when multi-process is used. The visualized recognition results are saved to the `./inference_results` folder by default.
+
+```shell
+# use direction classifier
+python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --cls_model_dir="./inference/cls/" --rec_model_dir="./inference/rec_crnn/" --use_angle_cls=true
+
+# do not use direction classifier
+python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --rec_model_dir="./inference/rec_crnn/"
+
+# use multi-process
+python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --rec_model_dir="./inference/rec_crnn/" --use_angle_cls=false --use_mp=True --total_process_num=6
+```
+
+
+After executing the command, the recognition result image is as follows:
+
+![](../imgs_results/system_res_00018069.jpg)
diff --git a/doc/doc_en/models_and_config_en.md b/doc/doc_en/models_and_config_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..414d844d63d51a2b53feea035c1f735594d73fe0
--- /dev/null
+++ b/doc/doc_en/models_and_config_en.md
@@ -0,0 +1,46 @@
+# PP-OCR Model and Configuration
+This chapter introduces the basic concepts of OCR models and the content and role of the configuration files, so that you can have a better experience when tuning and training the PP-OCR models later.
+
+This chapter contains three parts. First, [PP-OCR Model Download](./models_list_en.md) explains the PP-OCR model types and provides download links for all models. Then, [Yml Configuration](./config_en.md) details the parameters needed to fine-tune the PP-OCR models. Finally, [Python Inference for PP-OCR Model Library](./inference_ppocr_en.md) introduces how to use the models in the PP-OCR model library to quickly obtain test results through the Python inference engine.
+
+------
+
+Let's first understand some basic concepts.
+
+- [Introduction about OCR](#introduction-about-ocr)
+  * [Basic Concepts of OCR Detection Model](#basic-concepts-of-ocr-detection-model)
+  * [Basic Concepts of OCR Recognition Model](#basic-concepts-of-ocr-recognition-model)
+  * [PP-OCR Model](#pp-ocr-model)
+
+
+## 1. Introduction about OCR
+
+This section briefly introduces the basic concepts of OCR detection and recognition models, and introduces PaddleOCR's PP-OCR model.
+
+OCR (Optical Character Recognition) is currently the general term for text recognition. It is not limited to document or book text recognition, but also includes recognizing text in natural scenes, which is also called STR (Scene Text Recognition).
+
+OCR generally includes two parts, text detection and text recognition. The text detection module first uses a detection algorithm to detect text lines in the image, and then the recognition algorithm identifies the specific text in each text line.
+
+
+### 1.1 Basic Concepts of OCR Detection Model
+
+Text detection can locate the text area in the image, and then usually mark the word or text line in the form of a bounding box.
Traditional text detection algorithms mostly extract features manually; they are fast and work well in simple scenes, but their performance drops sharply in natural scenes. Currently, deep learning methods are mostly used. + +Text detection algorithms based on deep learning can be roughly divided into the following categories: +1. Methods based on object detection. Generally, candidate text boxes are predicted and the final boxes are filtered through NMS. These are mostly four-point text boxes, which are not ideal for curved text scenes. Typical algorithms include EAST and Text Box. +2. Methods based on text segmentation. The text line is regarded as the segmentation target, and the enclosing text box is then constructed from the segmentation result. Such methods can handle curved text, but do not work well when text instances cross or overlap. Typical algorithms include DB and PSENet. +3. Hybrid methods that combine object detection and segmentation. + + +### 1.2 Basic concepts of OCR recognition model + +The input of an OCR recognition algorithm is generally a text-line image with little background information, in which the text occupies most of the image. Recognition algorithms can be divided into two categories: +1. CTC-based methods. The text prediction module is based on CTC, and the commonly used combination is CNN+RNN+CTC. Some algorithms also try to add Transformer modules to the network. +2. Attention-based methods. The text prediction module is based on attention, and the commonly used combination is CNN+RNN+Attention. + + +### 1.3 PP-OCR model + +PaddleOCR integrates many OCR algorithms: text detection algorithms include DB, EAST and SAST, and text recognition algorithms include CRNN, RARE, StarNet, Rosetta, SRN and others. + +Among them, PaddleOCR has released the PP-OCR series of models for general OCR in Chinese and English natural scenes. The PP-OCR model is composed of the DB and CRNN algorithms; trained on massive Chinese data with careful model tuning, it achieves strong text detection and recognition in Chinese scenes. PaddleOCR has also launched a high-precision and ultra-lightweight PP-OCRv2 model. The detection model is only 3M and the recognition model only 8.5M. Using [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)'s model quantization method, the detection model can be compressed to 0.8M without reducing the accuracy, and the recognition model to 3M, which makes it more suitable for mobile deployment scenarios. diff --git a/doc/doc_en/models_en.md b/doc/doc_en/models_en.md new file mode 100644 index 0000000000000000000000000000000000000000..37c4a174563abc68085a103e11e2ddb3bd954714 --- /dev/null +++ b/doc/doc_en/models_en.md @@ -0,0 +1,46 @@ +# PP-OCR Model Zoo +The PP-OCR model zoo section explains some basic concepts of the OCR models and how to quickly use the models in the PP-OCR model library. + +This section contains two parts. First, [PP-OCR Model Download](./models_list_en.md) explains the PP-OCR model types and provides download links for all models. Then, [Python Inference for PP-OCR Model Zoo](./inference_ppocr_en.md) introduces how to use the model zoo with the Python inference engine to quickly obtain test results. + +------ + +Let's first understand some basic concepts. + +- [Introduction about OCR](#introduction-about-ocr) + * [Basic Concepts of OCR Detection Model](#basic-concepts-of-ocr-detection-model) + * [Basic Concepts of OCR Recognition Model](#basic-concepts-of-ocr-recognition-model) + * [PP-OCR Model](#pp-ocr-model) + + +## 1. Introduction about OCR + +This section briefly introduces the basic concepts of OCR detection and recognition models, and introduces PaddleOCR's PP-OCR models. + +OCR (Optical Character Recognition) is currently the general term for text recognition. It is not limited to text in documents or books, but also includes recognizing text in natural scenes, which is also referred to as STR (Scene Text Recognition). + +OCR generally includes two parts: text detection and text recognition. The text detection module first uses detection algorithms to locate text lines in the image, and the recognition algorithm then identifies the specific text in each text line. + + +### 1.1 Basic Concepts of OCR Detection Model + +Text detection locates the text areas in an image, usually marking each word or text line with a bounding box. Traditional text detection algorithms mostly extract features manually; they are fast and work well in simple scenes, but their performance drops sharply in natural scenes. Currently, deep learning methods are mostly used. + +Text detection algorithms based on deep learning can be roughly divided into the following categories: +1. Methods based on object detection. Generally, candidate text boxes are predicted and the final boxes are filtered through NMS. These are mostly four-point text boxes, which are not ideal for curved text scenes. Typical algorithms include EAST and Text Box. +2. Methods based on text segmentation. The text line is regarded as the segmentation target, and the enclosing text box is then constructed from the segmentation result. Such methods can handle curved text, but do not work well when text instances cross or overlap. Typical algorithms include DB and PSENet. +3. Hybrid methods that combine object detection and segmentation. + + +### 1.2 Basic Concepts of OCR Recognition Model + +The input of an OCR recognition algorithm is generally a text-line image with little background information, in which the text occupies most of the image. Recognition algorithms can be divided into two categories: +1. CTC-based methods. The text prediction module is based on CTC, and the commonly used combination is CNN+RNN+CTC. Some algorithms also try to add Transformer modules to the network. +2. Attention-based methods. The text prediction module is based on attention, and the commonly used combination is CNN+RNN+Attention. + + +### 1.3 PP-OCR Model + +PaddleOCR integrates many OCR algorithms: text detection algorithms include DB, EAST and SAST, and text recognition algorithms include CRNN, RARE, StarNet, Rosetta, SRN and others. + +Among them, PaddleOCR has released the PP-OCR series of models for general OCR in Chinese and English natural scenes. The PP-OCR model is composed of the DB and CRNN algorithms; trained on massive Chinese data with careful model tuning, it achieves strong text detection and recognition in Chinese scenes. PaddleOCR has also launched a high-precision and ultra-lightweight PP-OCRv2 model. The detection model is only 3M and the recognition model only 8.5M. Using [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)'s model quantization method, the detection model can be compressed to 0.8M without reducing the accuracy, and the recognition model to 3M, which makes it more suitable for mobile deployment scenarios. diff --git a/doc/doc_en/models_list_en.md b/doc/doc_en/models_list_en.md index 9bee4aef5121b1964a9bdbdeeaad4e81dd9ff6d4..3b9b5518701f052079af1398a4fa3e3770eb12a1 100644 --- a/doc/doc_en/models_list_en.md +++ b/doc/doc_en/models_list_en.md @@ -1,7 +1,8 @@ -## OCR model list(V2.0, updated on 2021.1.20) +## OCR model list(V2.1, updated on 2021.9.6) > **Note** -> 1. Compared with [models 1.1](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/models_list_en.md), which are trained with static graph programming paradigm, models 2.0 are the dynamic graph trained version and achieve close performance. -> 2. All models in this tutorial are all ppocr-series models, for more introduction of algorithms and models based on public dataset, you can refer to [algorithm overview tutorial](./algorithm_overview_en.md). +> 1. Compared with model v2.0, the v2.1 detection model has an improvement in accuracy, and the v2.1 recognition model is optimized in both accuracy and CPU speed. +> 2. Compared with [models 1.1](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/models_list_en.md), which are trained with static graph programming paradigm, models 2.0 are the dynamic graph trained version and achieve close performance. +> 3. All models in this tutorial are ppocr-series models; for more introduction of algorithms and models trained on public datasets, please refer to the [algorithm overview tutorial](./algorithm_overview_en.md). - [1. Text Detection Model](#Detection) - [2. Text Recognition Model](#Recognition) @@ -28,6 +29,8 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download| | --- | --- | --- | --- | --- | +|ch_PP-OCRv2_det_slim|slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml)| 3M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)| +|ch_PP-OCRv2_det|Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| |ch_ppocr_mobile_slim_v2.0_det|Slim pruned lightweight model, supporting Chinese, English, multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|2.6M |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar)| |ch_ppocr_mobile_v2.0_det|Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)| |ch_ppocr_server_v2.0_det|General model, which is larger than the lightweight model, but achieved better performance|[ch_det_res18_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml)|47M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar)| @@ -40,6 +43,8 @@ Relationship of the above models is as follows. 
|model name|description|config|model size|download| | --- | --- | --- | --- | --- | +|ch_PP-OCRv2_rec_slim|Slim qunatization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | +|ch_PP-OCRv2_rec|Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)|8.5M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | |ch_ppocr_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) | |ch_ppocr_mobile_v2.0_rec|Original lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)|5.2M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | |ch_ppocr_server_v2.0_rec|General model, supporting Chinese, English and number recognition|[rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml)|94.8M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | @@ -58,45 +63,6 @@ Relationship of the above models is as follows. #### Multilingual Recognition Model(Updating...) -**Note:** The configuration file of the new multi language model is generated by code. You can use the `--help` parameter to check which multi language are supported by current PaddleOCR. - -```bash -# The code needs to run in the specified directory -cd {your/path/}PaddleOCR/configs/rec/multi_language/ -python3 generate_multi_language_configs.py --help -``` - -Take the Italian configuration file as an example: -##### 1.Generate Italian configuration file to test the model provided -you can generate the default configuration file through the following command, and use the default language dictionary provided by paddleocr for prediction. -```bash -# The code needs to run in the specified directory -cd {your/path/}PaddleOCR/configs/rec/multi_language/ -# Set the required language configuration file through -l or --language parameter -# This command will write the default parameter to the configuration file. -python3 generate_multi_language_configs.py -l it -``` -##### 2. 
Generate Italian configuration file to train your own data -If you want to train your own model, you can prepare the training set file, verification set file, dictionary file and training data path. Here we assume that the Italian training set, verification set, dictionary and training data path are: -- Training set:{your/path/}PaddleOCR/train_data/train_list.txt -- Validation set: {your/path/}PaddleOCR/train_data/val_list.txt -- Use the default dictionary provided by paddleocr:{your/path/}PaddleOCR/ppocr/utils/dict/it_dict.txt -- Training data path:{your/path/}PaddleOCR/train_data -```bash -# The code needs to run in the specified directory -cd {your/path/}PaddleOCR/configs/rec/multi_language/ -# The -l or --language parameter is required -# --train modify train_list path -# --val modify eval_list path -# --data_dir modify data dir -# -o modify default parameters -# --dict Change the dictionary path. The example uses the default dictionary path, so that this parameter can be empty. -python3 generate_multi_language_configs.py -l it \ ---train {path/to/train_list} \ ---val {path/to/val_list} \ ---data_dir {path/to/data_dir} \ --o Global.use_gpu=False -``` |model name| dict file | description|config|model size|download| | --- | --- | --- |--- | --- | --- | | french_mobile_v2.0_rec | ppocr/utils/dict/french_dict.txt | Lightweight model for French recognition|[rec_french_lite_train.yml](../../configs/rec/multi_language/rec_french_lite_train.yml)|2.65M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_train.tar) | @@ -120,12 +86,14 @@ For more supported languages, please refer to : [Multi-language model](./multi_l |model name|description|config|model size|download| | --- | --- | --- | --- | --- | -|ch_ppocr_mobile_slim_v2.0_cls|Slim quantized model|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)| 2.1M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_train.tar) | -|ch_ppocr_mobile_v2.0_cls|Original model|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)|1.38M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | +|ch_ppocr_mobile_slim_v2.0_cls|Slim quantized model for text angle classification|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)| 2.1M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_train.tar) | +|ch_ppocr_mobile_v2.0_cls|Original model for text angle classification|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)|1.38M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | ### 4. 
Paddle-Lite Model |Version|Introduction|Model size|Detection model|Text Direction model|Recognition model|Paddle-Lite branch| |---|---|---|---|---|---|---| -|V2.0|extra-lightweight chinese OCR optimized model|7.8M|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_opt.nb)|v2.9| -|V2.0(slim)|extra-lightweight chinese OCR optimized model|3.3M|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_slim_opt.nb)|v2.9| +|PP-OCRv2|extra-lightweight chinese OCR optimized model|11M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer_opt.nb)|v2.9| +|PP-OCRv2(slim)|extra-lightweight chinese OCR optimized model|4.9M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_opt.nb)|v2.9| +|V2.0|ppocr_v2.0 extra-lightweight chinese OCR optimized model|7.8M|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_opt.nb)|v2.9| +|V2.0(slim)|ppovr_v2.0 extra-lightweight chinese OCR optimized model|3.3M|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_slim_opt.nb)|v2.9| diff --git a/doc/doc_en/multi_languages_en.md b/doc/doc_en/multi_languages_en.md index 43650c6ddfdd8c27ab44d0495111a767aeac9ca8..545be5524f2c52c9799d3b013f1aac8baf1a379f 100644 --- a/doc/doc_en/multi_languages_en.md +++ b/doc/doc_en/multi_languages_en.md @@ -198,13 +198,13 @@ If necessary, you can read related documents: | Language | Abbreviation | | Language | Abbreviation | | --- | --- | --- | --- | --- | -|chinese and english|ch| |Arabic|ar| -|english|en| |Hindi|hi| -|french|fr| |Uyghur|ug| -|german|german| |Persian|fa| -|japan|japan| |Urdu|ur| -|korean|korean| | Serbian(latin) |rs_latin| -|chinese traditional |ch_tra| |Occitan |oc| +|Chinese & English|ch| |Arabic|ar| +|English|en| |Hindi|hi| +|French|fr| |Uyghur|ug| +|German|german| |Persian|fa| +|Japan|japan| |Urdu|ur| +|Korean|korean| | Serbian(latin) |rs_latin| +|Chinese Traditional |chinese_cht| |Occitan |oc| | Italian |it| |Marathi|mr| |Spanish |es| |Nepali|ne| | Portuguese|pt| |Serbian(cyrillic)|rs_cyrillic| diff --git a/doc/doc_en/paddleOCR_overview_en.md b/doc/doc_en/paddleOCR_overview_en.md new file mode 100644 index 
0000000000000000000000000000000000000000..073c3ec889b2f21e9e40f5f7d1d6dc719e3dcac9 --- /dev/null +++ b/doc/doc_en/paddleOCR_overview_en.md @@ -0,0 +1,39 @@ +# PaddleOCR Overview and Project Clone + +## 1. PaddleOCR Overview + +PaddleOCR contains rich text detection, text recognition and end-to-end algorithms. Based on real-world testing and industrial experience, PaddleOCR chooses DB and CRNN as the basic detection and recognition models and, after a series of optimization strategies, proposes the PP-OCR series of models for industrial applications. The PP-OCR models target general scenarios and form a model library organized by language. Building on the capabilities of PP-OCR, PaddleOCR also releases the PP-Structure toolkit for document scenarios, covering two major tasks: layout analysis and table recognition. To support the entire path to industrial deployment, PaddleOCR provides large-scale data production tools and a variety of prediction deployment tools that help developers quickly turn ideas into reality. + +
+ + + +## 2. Project Clone + +### **2.1 Clone PaddleOCR repo** + +``` +# Recommend +git clone https://github.com/PaddlePaddle/PaddleOCR + +# If you cannot pull successfully due to network problems, you can also choose to use the code hosting on the cloud: + +git clone https://gitee.com/paddlepaddle/PaddleOCR + +# Note: The cloud-hosting code may not be able to synchronize the update with this GitHub project in real time. There might be a delay of 3-5 days. Please give priority to the recommended method. +``` + +### **2.2 Install third-party libraries** + +``` +cd PaddleOCR +pip3 install -r requirements.txt +``` + +If you getting this error `OSError: [WinError 126] The specified module could not be found` when you install shapely on windows. + +Please try to download Shapely whl file using [http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely](http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely). + +Reference: [Solve shapely installation on windows](https://stackoverflow.com/questions/44398265/install-shapely-oserror-winerror-126-the-specified-module-could-not-be-found) \ No newline at end of file diff --git a/doc/doc_en/quickstart_en.md b/doc/doc_en/quickstart_en.md index a5c0881de30bfd4b76d30c7840b6585b5d7e2af9..0055d8f7a89d0d218d001ea94fd4c620de5d037f 100644 --- a/doc/doc_en/quickstart_en.md +++ b/doc/doc_en/quickstart_en.md @@ -1,103 +1,252 @@ -# Quick start of Chinese OCR model +# PaddleOCR Quick Start -## 1. Prepare for the environment +[PaddleOCR Quick Start](#paddleocr-quick-start) -Please refer to [quick installation](./installation_en.md) to configure the PaddleOCR operating environment. ++ [1. Install PaddleOCR Whl Package](#1-install-paddleocr-whl-package) +* [2. Easy-to-Use](#2-easy-to-use) + + [2.1 Use by Command Line](#21-use-by-command-line) + - [2.1.1 English and Chinese Model](#211-english-and-chinese-model) + - [2.1.2 Multi-language Model](#212-multi-language-model) + - [2.1.3 Layout Analysis](#213-layoutAnalysis) + + [2.2 Use by Code](#22-use-by-code) + - [2.2.1 Chinese & English Model and Multilingual Model](#221-chinese---english-model-and-multilingual-model) + - [2.2.2 Layout Analysis](#222-layoutAnalysis) -* Note: Support the use of PaddleOCR through whl package installation,pelease refer [PaddleOCR Package](./whl_en.md). -## 2.inference models -The detection and recognition models on the mobile and server sides are as follows. 
For more models (including multiple languages), please refer to [PP-OCR v2.0 series model list](../doc_ch/models_list.md) + -| Model introduction | Model name | Recommended scene | Detection model | Direction Classifier | Recognition model | -| ------------ | --------------- | ----------------|---- | ---------- | -------- | -| Ultra-lightweight Chinese OCR model (8.1M) | ch_ppocr_mobile_v2.0_xx |Mobile-side/Server-side|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [pretrained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [pretrained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [pretrained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | -| Universal Chinese OCR model (143M) | ch_ppocr_server_v2.0_xx |Server-side |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [pretrained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [pretrained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [pretrained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | +## 1. Install PaddleOCR Whl Package +```bash +pip install "paddleocr>=2.0.1" # Recommend to use version 2.0.1+ +``` -* If `wget` is not installed in the windows environment, you can copy the link to the browser to download when downloading the model, then uncompress it and place it in the corresponding directory. +- **For windows users:** If you getting this error `OSError: [WinError 126] The specified module could not be found` when you install shapely on windows. Please try to download Shapely whl file [here](http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely). -Copy the download address of the `inference model` for detection and recognition in the table above, and uncompress them. + Reference: [Solve shapely installation on windows](https://stackoverflow.com/questions/44398265/install-shapely-oserror-winerror-126-the-specified-module-could-not-be-found) -``` -mkdir inference && cd inference -# Download the detection model and unzip -wget {url/of/detection/inference_model} && tar xf {name/of/detection/inference_model/package} -# Download the recognition model and unzip -wget {url/of/recognition/inference_model} && tar xf {name/of/recognition/inference_model/package} -# Download the direction classifier model and unzip -wget {url/of/classification/inference_model} && tar xf {name/of/classification/inference_model/package} -cd .. -``` +- **For layout analysis users**, run the following command to install **Layout-Parser** -Take the ultra-lightweight model as an example: + ```bash + pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl + ``` + + +## 2. 
Easy-to-Use + + + +### 2.1 Use by Command Line + +PaddleOCR provides a series of test images, click [here](https://paddleocr.bj.bcebos.com/dygraph_v2.1/ppocr_img.zip) to download, and then switch to the corresponding directory in the terminal + +```bash +cd /path/to/ppocr_img ``` -mkdir inference && cd inference -# Download the detection model of the ultra-lightweight Chinese OCR model and uncompress it -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar && tar xf ch_ppocr_mobile_v2.0_det_infer.tar -# Download the recognition model of the ultra-lightweight Chinese OCR model and uncompress it -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar && tar xf ch_ppocr_mobile_v2.0_rec_infer.tar -# Download the angle classifier model of the ultra-lightweight Chinese OCR model and uncompress it -wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar && tar xf ch_ppocr_mobile_v2.0_cls_infer.tar -cd .. -``` -After decompression, the file structure should be as follows: +If you do not use the provided test image, you can replace the following `--image_dir` parameter with the corresponding test image path + + + +#### 2.1.1 Chinese and English Model + +* Detection, direction classification and recognition: set the direction classifier parameter`--use_angle_cls true` to recognize vertical text. + + ```bash + paddleocr --image_dir ./imgs_en/img_12.jpg --use_angle_cls true --lang en + ``` + + Output will be a list, each item contains bounding box, text and recognition confidence + + ```bash + [[[442.0, 173.0], [1169.0, 173.0], [1169.0, 225.0], [442.0, 225.0]], ['ACKNOWLEDGEMENTS', 0.99283075]] + [[[393.0, 340.0], [1207.0, 342.0], [1207.0, 389.0], [393.0, 387.0]], ['We would like to thank all the designers and', 0.9357758]] + [[[399.0, 398.0], [1204.0, 398.0], [1204.0, 433.0], [399.0, 433.0]], ['contributors whohave been involved in the', 0.9592447]] + ...... + ``` + +* Only detection: set `--rec` to `false` + + ```bash + paddleocr --image_dir ./imgs_en/img_12.jpg --rec false + ``` + + Output will be a list, each item only contains bounding box + ```bash + [[756.0, 812.0], [805.0, 812.0], [805.0, 830.0], [756.0, 830.0]] + [[820.0, 803.0], [1085.0, 801.0], [1085.0, 836.0], [820.0, 838.0]] + [[393.0, 801.0], [715.0, 805.0], [715.0, 839.0], [393.0, 836.0]] + ...... + ``` + +* Only recognition: set `--det` to `false` + + ```bash + paddleocr --image_dir ./imgs_words_en/word_10.png --det false --lang en + ``` + + Output will be a list, each item contains text and recognition confidence + + ```bash + ['PAIN', 0.990372] + ``` + +If you need to use the 2.0 model, please specify the parameter `--version PP-OCR`, paddleocr uses the 2.1 model by default(`--versioin PP-OCRv2`). More whl package usage can be found in [whl package](./whl_en.md) + + +#### 2.1.2 Multi-language Model + +Paddleocr currently supports 80 languages, which can be switched by modifying the `--lang` parameter. + +``` bash +paddleocr --image_dir ./doc/imgs_en/254.jpg --lang=en ``` -├── ch_ppocr_mobile_v2.0_cls_infer -│ ├── inference.pdiparams -│ ├── inference.pdiparams.info -│ └── inference.pdmodel -├── ch_ppocr_mobile_v2.0_det_infer -│ ├── inference.pdiparams -│ ├── inference.pdiparams.info -│ └── inference.pdmodel -├── ch_ppocr_mobile_v2.0_rec_infer - ├── inference.pdiparams - ├── inference.pdiparams.info - └── inference.pdmodel + +
+The result is a list, each item contains a text box, text and recognition confidence + +```text +[('PHO CAPITAL', 0.95723116), [[66.0, 50.0], [327.0, 44.0], [327.0, 76.0], [67.0, 82.0]]] +[('107 State Street', 0.96311164), [[72.0, 90.0], [451.0, 84.0], [452.0, 116.0], [73.0, 121.0]]] +[('Montpelier Vermont', 0.97389287), [[69.0, 132.0], [501.0, 126.0], [501.0, 158.0], [70.0, 164.0]]] +[('8022256183', 0.99810505), [[71.0, 175.0], [363.0, 170.0], [364.0, 202.0], [72.0, 207.0]]] +[('REG 07-24-201706:59 PM', 0.93537045), [[73.0, 299.0], [653.0, 281.0], [654.0, 318.0], [74.0, 336.0]]] +[('045555', 0.99346405), [[509.0, 331.0], [651.0, 325.0], [652.0, 356.0], [511.0, 362.0]]] +[('CT1', 0.9988654), [[535.0, 367.0], [654.0, 367.0], [654.0, 406.0], [535.0, 406.0]]] +...... ``` -## 3. Single image or image set prediction +Commonly used multilingual abbreviations include + +| Language | Abbreviation | | Language | Abbreviation | | Language | Abbreviation | +| ------------------- | ------------ | ---- | -------- | ------------ | ---- | -------- | ------------ | +| Chinese & English | ch | | French | fr | | Japanese | japan | +| English | en | | German | german | | Korean | korean | +| Chinese Traditional | chinese_cht | | Italian | it | | Russian | ru | -* The following code implements text detection、angle class and recognition process. When performing prediction, you need to specify the path of a single image or image set through the parameter `image_dir`, the parameter `det_model_dir` specifies the path to detect the inference model, the parameter `rec_model_dir` specifies the path to identify the inference model, the parameter `use_angle_cls` specifies whether to use the direction classifier, the parameter `cls_model_dir` specifies the path to identify the direction classifier model, the parameter `use_space_char` specifies whether to predict the space char. The visual results are saved to the `./inference_results` folder by default. +A list of all languages and their corresponding abbreviations can be found in [Multi-Language Model Tutorial](./multi_languages_en.md) + +#### 2.1.3 Layout Analysis +Layout analysis refers to the division of 5 types of areas of the document, including text, title, list, picture and table. For the first three types of regions, directly use the OCR model to complete the text detection and recognition of the corresponding regions, and save the results in txt. For the table area, after the table structuring process, the table picture is converted into an Excel file of the same table style. The picture area will be individually cropped into an image. + +To use the layout analysis function of PaddleOCR, you need to specify `--type=structure` ```bash +paddleocr --image_dir=../doc/table/1.png --type=structure +``` -# Predict a single image specified by image_dir -python3 tools/infer/predict_system.py --image_dir="./doc/imgs/11.jpg" --det_model_dir="./inference/ch_ppocr_mobile_v2.0_det_infer/" --rec_model_dir="./inference/ch_ppocr_mobile_v2.0_rec_infer/" --cls_model_dir="./inference/ch_ppocr_mobile_v2.0_cls_infer/" --use_angle_cls=True --use_space_char=True +- **Results Format** + + The returned results of PP-Structure is a list composed of a dict, an example is as follows + + ```shell + [ + { 'type': 'Text', + 'bbox': [34, 432, 345, 462], + 'res': ([[36.0, 437.0, 341.0, 437.0, 341.0, 446.0, 36.0, 447.0], [41.0, 454.0, 125.0, 453.0, 125.0, 459.0, 41.0, 460.0]], + [('Tigure-6. 
The performance of CNN and IPT models using difforen', 0.90060663), ('Tent ', 0.465441)]) + } + ] + ``` + + The description of each field in the dict is as follows + + | Parameter | Description | + | --------- | ------------------------------------------------------------ | + | type | Type of image area | + | bbox | The coordinates of the image area in the original image, in the order [left upper x, left upper y, right bottom x, right bottom y] | + | res | OCR or table recognition result of the image area. Table: HTML string of the table; 
OCR: A tuple containing the detection coordinates and recognition results of each single line of text | + +- **Parameter Description:** + + | Parameter | Description | Default value | + | --------------- | ------------------------------------------------------------ | -------------------------------------------- | + | output | The path where excel and recognition results are saved | ./output/table | + | table_max_len | The long side of the image is resized in table structure model | 488 | + | table_model_dir | inference model path of table structure model | None | + | table_char_type | dict path of table structure model | ../ppocr/utils/dict/table_structure_dict.txt | + + + +### 2.2 Use by Code + + +#### 2.2.1 Chinese & English Model and Multilingual Model + +* detection, angle classification and recognition: + +```python +from paddleocr import PaddleOCR,draw_ocr +# Paddleocr supports Chinese, English, French, German, Korean and Japanese. +# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan` +# to switch the language model in order. +ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to download and load model into memory +img_path = './imgs_en/img_12.jpg' +result = ocr.ocr(img_path, cls=True) +for line in result: + print(line) + + +# draw result +from PIL import Image +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='./fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` -# Predict imageset specified by image_dir -python3 tools/infer/predict_system.py --image_dir="./doc/imgs/" --det_model_dir="./inference/ch_ppocr_mobile_v2.0_det_infer/" --rec_model_dir="./inference/ch_ppocr_mobile_v2.0_rec_infer/" --cls_model_dir="./inference/ch_ppocr_mobile_v2.0_cls_infer/" --use_angle_cls=True --use_space_char=True +Output will be a list, each item contains bounding box, text and recognition confidence -# If you want to use the CPU for prediction, you need to set the use_gpu parameter to False -python3 tools/infer/predict_system.py --image_dir="./doc/imgs/11.jpg" --det_model_dir="./inference/ch_ppocr_mobile_v2.0_det_infer/" --rec_model_dir="./inference/ch_ppocr_mobile_v2.0_rec_infer/" --cls_model_dir="./inference/ch_ppocr_mobile_v2.0_cls_infer/" --use_angle_cls=True --use_space_char=True --use_gpu=False +```bash +[[[442.0, 173.0], [1169.0, 173.0], [1169.0, 225.0], [442.0, 225.0]], ['ACKNOWLEDGEMENTS', 0.99283075]] +[[[393.0, 340.0], [1207.0, 342.0], [1207.0, 389.0], [393.0, 387.0]], ['We would like to thank all the designers and', 0.9357758]] +[[[399.0, 398.0], [1204.0, 398.0], [1204.0, 433.0], [399.0, 433.0]], ['contributors whohave been involved in the', 0.9592447]] +...... ``` -- Universal Chinese OCR model +Visualization of results -Please follow the above steps to download the corresponding models and update the relevant parameters, The example is as follows. +
+ -``` -# Predict a single image specified by image_dir -python3 tools/infer/predict_system.py --image_dir="./doc/imgs/11.jpg" --det_model_dir="./inference/ch_ppocr_server_v2.0_det_infer/" --rec_model_dir="./inference/ch_ppocr_server_v2.0_rec_infer/" --cls_model_dir="./inference/ch_ppocr_mobile_v2.0_cls_infer/" --use_angle_cls=True --use_space_char=True -``` +#### 2.2.2 Layout Analysis + +```python +import os +import cv2 +from paddleocr import PPStructure,draw_structure_result,save_structure_res -* Note - - If you want to use the recognition model which does not support space char recognition, please update the source code to the latest version and add parameters `--use_space_char=False`. - - If you do not want to use direction classifier, please update the source code to the latest version and add parameters `--use_angle_cls=False`. +table_engine = PPStructure(show_log=True) +save_folder = './output/table' +img_path = './table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0]) -For more text detection and recognition tandem reasoning, please refer to the document tutorial -: [Inference with Python inference engine](./inference_en.md)。 +for line in result: + line.pop('img') + print(line) -In addition, the tutorial also provides other deployment methods for the Chinese OCR model: -- [Server-side C++ inference](../../deploy/cpp_infer/readme_en.md) -- [Service deployment](../../deploy/hubserving) -- [End-to-end deployment](https://github.com/PaddlePaddle/PaddleOCR/tree/develop/deploy/lite) +from PIL import Image + +font_path = './fonts/simfang.ttf' +image = Image.open(img_path).convert('RGB') +im_show = draw_structure_result(image, result,font_path=font_path) +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` diff --git a/doc/doc_en/recognition_en.md b/doc/doc_en/recognition_en.md index 634ec783aa5e1dd6c9202385cf2978d140ca44a1..51857ba16b7773ef38452fad6aa070f2117a9086 100644 --- a/doc/doc_en/recognition_en.md +++ b/doc/doc_en/recognition_en.md @@ -1,28 +1,28 @@ -## TEXT RECOGNITION +# Text Recognition -- [1 DATA PREPARATION](#DATA_PREPARATION) +- [1. Data Preparation](#DATA_PREPARATION) - [1.1 Costom Dataset](#Costom_Dataset) - [1.2 Dataset Download](#Dataset_download) - [1.3 Dictionary](#Dictionary) - [1.4 Add Space Category](#Add_space_category) -- [2 TRAINING](#TRAINING) +- [2. Training](#TRAINING) - [2.1 Data Augmentation](#Data_Augmentation) - - [2.2 Training](#Training) - - [2.3 Multi-language](#Multi_language) + - [2.2 General Training](#Training) + - [2.3 Multi-language Training](#Multi_language) -- [3 EVALUATION](#EVALUATION) +- [3. Evaluation](#EVALUATION) -- [4 PREDICTION](#PREDICTION) - - [4.1 Training engine prediction](#Training_engine_prediction) +- [4. Prediction](#PREDICTION) +- [5. Convert to Inference Model](#Inference) -### DATA PREPARATION +## 1. Data Preparation PaddleOCR supports two data formats: -- `LMDB` is used to train data sets stored in lmdb format; -- `general data` is used to train data sets stored in text files: +- `LMDB` is used to train data sets stored in lmdb format(LMDBDataSet); +- `general data` is used to train data sets stored in text files(SimpleDataSet): Please organize the dataset as follows: @@ -36,7 +36,7 @@ mklink /d /train_data/dataset ``` -#### 1.1 Costom dataset +### 1.1 Costom Dataset If you want to use your own data for training, please refer to the following to organize your data. 
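As an illustration only (the folder and file names below are assumptions, not part of the original tutorial), a custom recognition training set for the `SimpleDataSet` reader is typically just an image folder plus a tab-separated label file with one sample per line:

```
# assumed layout, rooted at ./train_data/ (illustrative names)
train_data/
└── rec/
    ├── rec_gt_train.txt      # label file
    └── train/
        ├── word_001.jpg
        └── word_002.jpg

# each line of rec_gt_train.txt is "<image path>\t<transcription>", separated by a real tab character
train/word_001.jpg	Genaxis Theatre
train/word_002.jpg	[06]
```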
@@ -84,11 +84,14 @@ Similar to the training set, the test set also needs to be provided a folder con ``` -#### 1.2 Dataset download +### 1.2 Dataset Download -If you do not have a dataset locally, you can download it on the official website [icdar2015](http://rrc.cvc.uab.es/?ch=4&com=downloads). Also refer to [DTRB](https://github.com/clovaai/deep-text-recognition-benchmark#download-lmdb-dataset-for-traininig-and-evaluation-from-here) ,download the lmdb format dataset required for benchmark +- ICDAR2015 -If you want to reproduce the paper indicators of SRN, you need to download offline [augmented data](https://pan.baidu.com/s/1-HSZ-ZVdqBF2HaBZ5pRAKA), extraction code: y3ry. The augmented data is obtained by rotation and perturbation of mjsynth and synthtext. Please unzip the data to {your_path}/PaddleOCR/train_data/data_lmdb_Release/training/path. +If you do not have a dataset locally, you can download it on the official website [icdar2015](http://rrc.cvc.uab.es/?ch=4&com=downloads). +Also refer to [DTRB](https://github.com/clovaai/deep-text-recognition-benchmark#download-lmdb-dataset-for-traininig-and-evaluation-from-here) ,download the lmdb format dataset required for benchmark + +If you want to reproduce the paper SAR, you need to download extra dataset [SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg), extraction code: 627x. Besides, icdar2013, icdar2015, cocotext, IIIT5k datasets are also used to train. For specific details, please refer to the paper SAR. PaddleOCR provides label files for training the icdar2015 dataset, which can be downloaded in the following ways: @@ -99,8 +102,28 @@ wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_t wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_test.txt ``` +PaddleOCR also provides a data format conversion script, which can convert ICDAR official website label to a data format +supported by PaddleOCR. The data conversion tool is in `ppocr/utils/gen_label.py`, here is the training set as an example: + +``` +# convert the official gt to rec_gt_label.txt +python gen_label.py --mode="rec" --input_path="{path/of/origin/label}" --output_label="rec_gt_label.txt" +``` + +The data format is as follows, (a) is the original picture, (b) is the Ground Truth text file corresponding to each picture: + +![](../datasets/icdar_rec.png) + + +- Multilingual dataset + +The multi-language model training method is the same as the Chinese model. The training data set is 100w synthetic data. A small amount of fonts and test data can be downloaded using the following two methods. +* [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA) ,Extraction code:frgi. +* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) + + -#### 1.3 Dictionary +### 1.3 Dictionary Finally, a dictionary ({word_dict_name}.txt) needs to be provided so that when the model is trained, all the characters that appear can be mapped to the dictionary index. @@ -138,21 +161,31 @@ The current multi-language model is still in the demo stage and will continue to If you like, you can submit the dictionary file to [dict](../../ppocr/utils/dict) and we will thank you in the Repo. -To customize the dict file, please modify the `character_dict_path` field in `configs/rec/rec_icdar15_train.yml` and set `character_type` to `ch`. +To customize the dict file, please modify the `character_dict_path` field in `configs/rec/rec_icdar15_train.yml` . 
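For illustration, a dictionary file is simply a plain-text file with one symbol per line; every line is treated as one label and the line index becomes the class id, so the file should not contain comments or stray blank lines. A minimal sketch (the symbols below are arbitrary examples):

```
0
1
2
a
b
c
```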
- Custom dictionary If you need to customize dic file, please add character_dict_path field in configs/rec/rec_icdar15_train.yml to point to your dictionary path. And set character_type to ch. -#### 1.4 Add space category +### 1.4 Add Space Category If you want to support the recognition of the `space` category, please set the `use_space_char` field in the yml file to `True`. -**Note: use_space_char only takes effect when character_type=ch** - -### 2 TRAINING +## 2.Training + + +### 2.1 Data Augmentation + +PaddleOCR provides a variety of data augmentation methods. All the augmentation methods are enabled by default. + +The default perturbation methods are: cvtColor, blur, jitter, Gasuss noise, random crop, perspective, color reverse, TIA augmentation. + +Each disturbance method is selected with a 40% probability during the training process. For specific code implementation, please refer to: [rec_img_aug.py](../../ppocr/data/imaug/rec_img_aug.py) + + +### 2.2 General Training PaddleOCR provides training scripts, evaluation scripts, and prediction scripts. In this section, the CRNN recognition model will be used as an example: @@ -170,21 +203,15 @@ tar -xf rec_mv3_none_bilstm_ctc_v2.0_train.tar && rm -rf rec_mv3_none_bilstm_ctc Start training: ``` -# GPU training Support single card and multi-card training, specify the card number through --gpus +# GPU training Support single card and multi-card training # Training icdar15 English data and The training log will be automatically saved as train.log under "{save_model_dir}" + +#specify the single card training(Long training time, not recommended) +python3 tools/train.py -c configs/rec/rec_icdar15_train.yml +#specify the card number through --gpus python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_icdar15_train.yml ``` - -#### 2.1 Data Augmentation -PaddleOCR provides a variety of data augmentation methods. If you want to add disturbance during training, please set `distort: true` in the configuration file. - -The default perturbation methods are: cvtColor, blur, jitter, Gasuss noise, random crop, perspective, color reverse. - -Each disturbance method is selected with a 50% probability during the training process. For specific code implementation, please refer to: [img_tools.py](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/ppocr/data/rec/img_tools.py) - - -#### 2.2 Training PaddleOCR supports alternating training and evaluation. You can modify `eval_batch_step` in `configs/rec/rec_icdar15_train.yml` to set the evaluation frequency. By default, it is evaluated every 500 iter and the best acc model is saved under `output/rec_CRNN/best_accuracy` during the evaluation process. @@ -207,6 +234,8 @@ If the evaluation set is large, the test will be time-consuming. It is recommend | rec_mv3_tps_bilstm_att.yml | CRNN | Mobilenet_v3 | TPS | BiLSTM | att | | rec_r34_vd_tps_bilstm_att.yml | CRNN | Resnet34_vd | TPS | BiLSTM | att | | rec_r50fpn_vd_none_srn.yml | SRN | Resnet50_fpn_vd | None | rnn | srn | +| rec_mtb_nrtr.yml | NRTR | nrtr_mtb | None | transformer encoder | transformer decoder | +| rec_r31_sar.yml | SAR | ResNet31 | None | LSTM encoder | LSTM decoder | For training Chinese data, it is recommended to use @@ -219,7 +248,6 @@ Global: # Add a custom dictionary, such as modify the dictionary, please point the path to the new dictionary character_dict_path: ppocr/utils/ppocr_keys_v1.txt # Modify character type - character_type: ch ... 
# Whether to recognize spaces use_space_char: True @@ -277,108 +305,25 @@ Eval: **Note that the configuration file for prediction/evaluation must be consistent with the training.** -#### 2.3 Multi-language - -PaddleOCR currently supports 80 (except Chinese) language recognition. A multi-language configuration file template is -provided under the path `configs/rec/multi_languages`: [rec_multi_language_lite_train.yml](../../configs/rec/multi_language/rec_multi_language_lite_train.yml)。 - -There are two ways to create the required configuration file:: - -1. Automatically generated by script - -[generate_multi_language_configs.py](../../configs/rec/multi_language/generate_multi_language_configs.py) Can help you generate configuration files for multi-language models - -- Take Italian as an example, if your data is prepared in the following format: - ``` - |-train_data - |- it_train.txt # train_set label - |- it_val.txt # val_set label - |- data - |- word_001.jpg - |- word_002.jpg - |- word_003.jpg - | ... - ``` - - You can use the default parameters to generate a configuration file: - - ```bash - # The code needs to be run in the specified directory - cd PaddleOCR/configs/rec/multi_language/ - # Set the configuration file of the language to be generated through the -l or --language parameter. - # This command will write the default parameters into the configuration file - python3 generate_multi_language_configs.py -l it - ``` - -- If your data is placed in another location, or you want to use your own dictionary, you can generate the configuration file by specifying the relevant parameters: - - ```bash - # -l or --language field is required - # --train to modify the training set - # --val to modify the validation set - # --data_dir to modify the data set directory - # --dict to modify the dict path - # -o to modify the corresponding default parameters - cd PaddleOCR/configs/rec/multi_language/ - python3 generate_multi_language_configs.py -l it \ # language - --train {path/of/train_label.txt} \ # path of train_label - --val {path/of/val_label.txt} \ # path of val_label - --data_dir {train_data/path} \ # root directory of training data - --dict {path/of/dict} \ # path of dict - -o Global.use_gpu=False # whether to use gpu - ... - - ``` -Italian is made up of Latin letters, so after executing the command, you will get the rec_latin_lite_train.yml. - -2. Manually modify the configuration file - - You can also manually modify the following fields in the template: - - ``` - Global: - use_gpu: True - epoch_num: 500 - ... - character_type: it # language - character_dict_path: {path/of/dict} # path of dict - - Train: - dataset: - name: SimpleDataSet - data_dir: train_data/ # root directory of training data - label_file_list: ["./train_data/train_list.txt"] # train label path - ... - - Eval: - dataset: - name: SimpleDataSet - data_dir: train_data/ # root directory of val data - label_file_list: ["./train_data/val_list.txt"] # val label path - ... 
- - ``` +### 2.3 Multi-language Training Currently, the multi-language algorithms supported by PaddleOCR are: -| Configuration file | Algorithm name | backbone | trans | seq | pred | language | character_type | -| :--------: | :-------: | :-------: | :-------: | :-----: | :-----: | :-----: | :-----: | -| rec_chinese_cht_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | chinese traditional | chinese_cht| -| rec_en_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | English(Case sensitive) | EN | -| rec_french_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | French | french | -| rec_ger_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | German | german | -| rec_japan_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Japanese | japan | -| rec_korean_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Korean | korean | -| rec_latin_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Latin | latin | -| rec_arabic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | arabic | ar | -| rec_cyrillic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | cyrillic | cyrillic | -| rec_devanagari_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | devanagari | devanagari | +| Configuration file | Algorithm name | backbone | trans | seq | pred | language | +| :--------: | :-------: | :-------: | :-------: | :-----: | :-----: | :-----: | +| rec_chinese_cht_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | chinese traditional | +| rec_en_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | English(Case sensitive) | +| rec_french_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | French | +| rec_ger_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | German | +| rec_japan_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Japanese | +| rec_korean_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Korean | +| rec_latin_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Latin | +| rec_arabic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | arabic | +| rec_cyrillic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | cyrillic | +| rec_devanagari_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | devanagari | For more supported languages, please refer to : [Multi-language model](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/multi_languages_en.md#4-support-languages-and-abbreviations) -The multi-language model training method is the same as the Chinese model. The training data set is 100w synthetic data. A small amount of fonts and test data can be downloaded using the following two methods. -* [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA),Extraction code:frgi. -* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) If you want to finetune on the basis of the existing model effect, please refer to the following instructions to modify the configuration file: @@ -417,7 +362,8 @@ Eval: ``` -### 3 EVALUATION + +## 3. Evalution The evaluation dataset can be set by modifying the `Eval.dataset.label_file_list` field in the `configs/rec/rec_icdar15_train.yml` file. 
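For example, a minimal `Eval` section might look like the sketch below (only the dataset-related keys are shown; the paths are placeholders that follow the ic15_data layout used earlier in this tutorial):

```
Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data/ic15_data
    label_file_list: ["./train_data/ic15_data/rec_gt_test.txt"]
```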
@@ -427,20 +373,39 @@ python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec ``` -### 4 PREDICTION +## 4. Prediction - -#### 4.1 Training engine prediction Using the model trained by paddleocr, you can quickly get prediction through the following script. -The default prediction picture is stored in `infer_img`, and the weight is specified via `-o Global.checkpoints`: +The default prediction picture is stored in `infer_img`, and the trained weight is specified via `-o Global.checkpoints`: + + +According to the `save_model_dir` and `save_epoch_step` fields set in the configuration file, the following parameters will be saved: + +``` +output/rec/ +├── best_accuracy.pdopt +├── best_accuracy.pdparams +├── best_accuracy.states +├── config.yml +├── iter_epoch_3.pdopt +├── iter_epoch_3.pdparams +├── iter_epoch_3.states +├── latest.pdopt +├── latest.pdparams +├── latest.states +└── train.log +``` + +Among them, best_accuracy.* is the best model on the evaluation set; iter_epoch_x.* is the model saved at intervals of `save_epoch_step`; latest.* is the model of the last epoch. ``` # Predict English results python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.load_static_weights=false Global.infer_img=doc/imgs_words/en/word_1.jpg ``` + Input image: ![](../imgs_words/en/word_1.png) @@ -469,3 +434,37 @@ Get the prediction result of the input image: infer_img: doc/imgs_words/ch/word_1.jpg result: ('韩国小馆', 0.997218) ``` + + + +## 5. Convert to Inference Model + +The recognition model is converted to the inference model in the same way as the detection, as follows: + +``` +# -c Set the training algorithm yml configuration file +# -o Set optional parameters +# Global.pretrained_model parameter Set the training model address to be converted without adding the file suffix .pdmodel, .pdopt or .pdparams. +# Global.save_inference_dir Set the address where the converted model will be saved. + +python3 tools/export_model.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model=./ch_lite/ch_ppocr_mobile_v2.0_rec_train/best_accuracy Global.save_inference_dir=./inference/rec_crnn/ +``` + +If you have a model trained on your own dataset with a different dictionary file, please make sure that you modify the `character_dict_path` in the configuration file to your dictionary file path. 
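If you prefer not to edit the yml file, the same setting can presumably be overridden on the command line through `-o` at export time. Note that the dictionary path below is a placeholder, and passing `Global.character_dict_path` this way is an assumption based on the generic `-o` override mechanism rather than an option documented in this tutorial:

```
# assumed example: export with a custom dictionary (./train_data/my_dict.txt is a placeholder path)
python3 tools/export_model.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model=./output/rec/best_accuracy Global.character_dict_path=./train_data/my_dict.txt Global.save_inference_dir=./inference/rec_crnn/
```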
+ +After the conversion is successful, there are three files in the model save directory: + +``` +inference/det_db/ + ├── inference.pdiparams # The parameter file of recognition inference model + ├── inference.pdiparams.info # The parameter information of recognition inference model, which can be ignored + └── inference.pdmodel # The program file of recognition model +``` + +- Text recognition model Inference using custom characters dictionary + + If the text dictionary is modified during training, when using the inference model to predict, you need to specify the dictionary path used by `--rec_char_dict_path`, and set `rec_char_type=ch` + + ``` + python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./your inference model" --rec_image_shape="3, 32, 100" --rec_char_type="ch" --rec_char_dict_path="your text dict path" + ``` diff --git a/doc/doc_en/training_en.md b/doc/doc_en/training_en.md new file mode 100644 index 0000000000000000000000000000000000000000..aa5500ac88fef97829b4f19c5421e36f18ae1812 --- /dev/null +++ b/doc/doc_en/training_en.md @@ -0,0 +1,155 @@ +# Model Training + +- [1.Yml Configuration ](#1-Yml-Configuration) +- [2. Basic Concepts](#1-basic-concepts) + * [2.1 Learning Rate](#11-learning-rate) + * [2.2 Regularization](#12-regularization) + * [2.3 Evaluation Indicators](#13-evaluation-indicators-) +- [3. Data and Vertical Scenes](#2-data-and-vertical-scenes) + * [3.1 Training Data](#21-training-data) + * [3.2 Vertical Scene](#22-vertical-scene) + * [3.3 Build Your Own Dataset](#23-build-your-own-data-set) +* [4. FAQ](#3-faq) + + +This article will introduce the basic concepts that need to be mastered during model training and the tuning methods during training. + +At the same time, it will briefly introduce the components of the PaddleOCR model training data and how to prepare the data finetune model in the vertical scene. + + + +## 1. Yml Configuration + +The PaddleOCR model uses configuration files to manage network training and evaluation parameters. In the configuration file, you can set the model, optimizer, loss function, and pre- and post-processing parameters of the model. PaddleOCR reads these parameters from the configuration file, and then builds a complete training process to complete the model training. When optimized, the configuration can be completed by modifying the parameters in the configuration file, which is simple to use and convenient to modify. + +For the complete configuration file description, please refer to [Configuration File](./config_en.md) + + +# 1. Basic concepts + +## 2. Basic Concepts + +The following parameters need to be paid attention to when tuning the model: + + +### 2.1 Learning Rate + +The learning rate is one of the important hyperparameters for training neural networks. It represents the step length of the gradient moving to the optimal solution of the loss function in each iteration. +A variety of learning rate update strategies are provided in PaddleOCR, which can be modified through configuration files, for example: + +``` +Optimizer: + ... + lr: + name: Piecewise + decay_epochs : [700, 800] + values : [0.001, 0.0001] + warmup_epoch: 5 +``` + +Piecewise stands for piecewise constant attenuation. Different learning rates are specified in different learning stages, +and the learning rate is the same in each stage. + +warmup_epoch means that in the first 5 epochs, the learning rate will gradually increase from 0 to base_lr. 
For all strategies, please refer to the code [learning_rate.py](../../ppocr/optimizer/learning_rate.py). + + +## 1.2 Regularization + +Regularization can effectively avoid algorithm overfitting. PaddleOCR provides L1 and L2 regularization methods. +L1 and L2 regularization are the most commonly used regularization methods. +L1 regularization adds a regularization term to the objective function to reduce the sum of absolute values of the parameters; +while in L2 regularization, the purpose of adding a regularization term is to reduce the sum of squared parameters. +The configuration method is as follows: + +``` +Optimizer: + ... + regularizer: + name: L2 + factor: 2.0e-05 +``` + +### 2.3 Evaluation Indicators + +(1) Detection stage: First, evaluate according to the IOU of the detection frame and the labeled frame. If the IOU is greater than a certain threshold, it is judged that the detection is accurate. Here, the detection frame and the label frame are different from the general general target detection frame, and they are represented by polygons. Detection accuracy: the percentage of the correct detection frame number in all detection frames is mainly used to judge the detection index. Detection recall rate: the percentage of correct detection frames in all marked frames, which is mainly an indicator of missed detection. + +(2) Recognition stage: Character recognition accuracy, that is, the ratio of correctly recognized text lines to the number of marked text lines. Only the entire line of text recognition pairs can be regarded as correct recognition. + +(3) End-to-end statistics: End-to-end recall rate: accurately detect and correctly identify the proportion of text lines in all labeled text lines; End-to-end accuracy rate: accurately detect and correctly identify the number of text lines in the detected text lines The standard for accurate detection is that the IOU of the detection box and the labeled box is greater than a certain threshold, and the text in the correctly identified detection box is the same as the labeled text. + + + +## 3. Data and Vertical Scenes + + + +### 3.1 Training Data + +The current open source models, data sets and magnitudes are as follows: + +- Detection: + - English data set, ICDAR2015 + - Chinese data set, LSVT street view data set training data 3w pictures + +- Identification: + - English data set, MJSynth and SynthText synthetic data, the data volume is tens of millions. + - Chinese data set, LSVT street view data set crops the image according to the truth value, and performs position calibration, a total of 30w images. In addition, based on the LSVT corpus, 500w of synthesized data. + - Small language data set, using different corpora and fonts, respectively generated 100w synthetic data set, and using ICDAR-MLT as the verification set. + +Among them, the public data sets are all open source, users can search and download by themselves, or refer to [Chinese data set](./datasets.md), synthetic data is not open source, users can use open source synthesis tools to synthesize by themselves. Synthesis tools include [text_renderer](https://github.com/Sanster/text_renderer), [SynthText](https://github.com/ankush-me/SynthText), [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator) etc. + + + +### 3.2 Vertical Scene + +PaddleOCR mainly focuses on general OCR. 
## 3. Data and Vertical Scenes

### 3.1 Training Data

The currently open-sourced models, datasets and their magnitudes are as follows:

- Detection:
  - English dataset: ICDAR2015
  - Chinese dataset: LSVT street view dataset, with 30,000 training images

- Recognition:
  - English datasets: MJSynth and SynthText synthetic data, on the order of tens of millions of samples.
  - Chinese dataset: LSVT street view dataset, with images cropped according to the ground truth and position-calibrated, 300,000 images in total; in addition, 5 million samples synthesized based on the LSVT corpus.
  - Minority-language datasets: 1 million synthetic samples generated per language using different corpora and fonts, with ICDAR-MLT as the validation set.

Among them, the public datasets are all open source; users can search for and download them by themselves, or refer to [Chinese datasets](./datasets.md). The synthetic data is not open source, but users can synthesize it themselves with open-source tools such as [text_renderer](https://github.com/Sanster/text_renderer), [SynthText](https://github.com/ankush-me/SynthText) and [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator).

### 3.2 Vertical Scene

PaddleOCR mainly focuses on general OCR. If you have vertical requirements, you can train your own model with PaddleOCR plus vertical data.
If labeled data is lacking, or you do not want to invest in research and development, it is recommended to directly call the open API, which covers some of the more common vertical categories.

### 3.3 Build Your Own Dataset

There are several experiences worth referring to when constructing a dataset:

(1) The amount of data in the training set:

  a. The data required for detection is relatively small. For fine-tuning based on a PaddleOCR model, 500 images are generally enough to achieve good results.

  b. Recognition is divided into English and Chinese. English scenarios generally require hundreds of thousands of images to achieve good results, while Chinese requires several million or more.

(2) When the amount of training data is small, you can try the following three ways to obtain more data:

  a. Manually collect more training data; this is the most direct and effective way.

  b. Apply basic image processing or transformations with PIL and OpenCV, for example writing text onto backgrounds with PIL's ImageFont, Image and ImageDraw modules, or using OpenCV's rotation, affine transformation, Gaussian filtering and so on.

  c. Use data generation algorithms to synthesize data, such as pix2pix-style algorithms.

## 4. FAQ

**Q**: How to choose a suitable network input shape when training CRNN recognition?

  A: The height is generally 32. For the longest width, there are two methods (a rough numerical sketch is given at the end of this FAQ section):

  (1) Compute the aspect-ratio distribution of the training images and choose the maximum aspect ratio that covers 80% of the training samples.

  (2) Count the number of characters in the training labels and choose the longest character count that covers 80% of the samples; then, treating the aspect ratio of Chinese characters as roughly 1 and that of English characters as roughly 3:1, estimate the longest width.

**Q**: During recognition training, the accuracy on the training set has reached 90, but the accuracy on the validation set stays at 70. What should I do?

  A: If the training-set accuracy is 90 while the test-set accuracy is only around 70, the model is most likely over-fitting. There are two methods to try:

  (1) Add more augmentation methods or increase the [augmentation probability](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppocr/data/imaug/rec_img_aug.py#L341); the default is 0.4.

  (2) Increase the [l2 decay value](https://github.com/PaddlePaddle/PaddleOCR/blob/a501603d54ff5513fc4fc760319472e59da25424/configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml#L47).

**Q**: When training the recognition model, the loss decreases normally, but acc is always 0.

  A: It is normal for acc to be 0 at the beginning of recognition model training; the indicator will come up after a longer training period.
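As a rough numerical illustration of the first FAQ answer above (a minimal sketch with made-up sample data; the 80th percentile follows the answer, while the mostly-Latin character-width assumption in method (2) is only for illustration):

```python
import numpy as np

# (image_width, image_height, label) triples; replace with your own training labels
samples = [
    (320, 48, "hello world"),
    (200, 32, "OCR"),
    (640, 40, "a much longer text line"),
    (180, 36, "短文本"),
    (500, 50, "mixed 中英 text"),
]
height = 32  # CRNN input height

# Method (1): take the aspect ratio that covers 80% of the images, then scale to height 32
ratios = [w / float(h) for w, h, _ in samples]
width_from_ratio = height * np.percentile(ratios, 80)

# Method (2): take the character count that covers 80% of the labels, then estimate the width,
# treating Chinese characters as roughly square and Latin characters as roughly 1/3 as wide
chars_80 = np.percentile([len(label) for _, _, label in samples], 80)
width_from_chars = chars_80 * (height / 3.0)  # assumes mostly Latin text

print(int(width_from_ratio), int(width_from_chars))
```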

***
Click the following links for detailed training tutorials:
- [text detection model training](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/detection.md)
- [text recognition model training](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/recognition.md)
- [text direction classification model training](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/doc/doc_ch/angle_class.md)
diff --git a/doc/doc_en/update_en.md b/doc/doc_en/update_en.md
index ca2ecb0535ce27bc7f98a476752a131f869761d5..660688c6d6991a4744dbc327d24e9c677afa0fc1 100644
--- a/doc/doc_en/update_en.md
+++ b/doc/doc_en/update_en.md
@@ -1,4 +1,9 @@
# RECENT UPDATES
+- 2021.9.7 released PaddleOCR v2.3, which proposes [PP-OCRv2](#PP-OCRv2). The CPU inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server, and its F-score is 7% higher than that of PP-OCR mobile.
+- 2021.8.3 released PaddleOCR v2.2, adding a new structured document analysis toolkit, i.e., [PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README.md), which supports layout analysis and table recognition (one-key export of table images to Excel files).
+- 2021.4.8 released the end-to-end text recognition algorithm [PGNet](https://www.aaai.org/AAAI21Papers/AAAI-2885.WangP.pdf), published in AAAI 2021 (tutorial [here](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/pgnet_en.md)); released multi-language recognition [models](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/multi_languages_en.md) supporting recognition of more than 80 languages; in particular, the performance of the [English recognition model](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/models_list_en.md#English) is optimized.
+
+- 2021.1.21 updated 25+ multilingual recognition models ([models list](./doc/doc_en/models_list_en.md)), including English, Chinese, German, French, Japanese, Spanish, Portuguese, Russian, Arabic and so on. Models for more languages will continue to be updated, see the [development plan](https://github.com/PaddlePaddle/PaddleOCR/issues/1048).
- 2020.12.15 update Data synthesis tool, i.e., [Style-Text](../../StyleText/README.md),easy to synthesize a large number of images which are similar to the target scene image.
- 2020.11.25 Update a new data annotation tool, i.e., [PPOCRLabel](../../PPOCRLabel/README.md), which is helpful to improve the labeling efficiency. Moreover, the labeling results can be used in training of the PP-OCR system directly.
- 2020.9.22 Update the PP-OCR technical article, https://arxiv.org/abs/2009.09941 diff --git a/doc/doc_en/visualization_en.md b/doc/doc_en/visualization_en.md index f9c455e5b3510a9f262c6bf59b8adfbaef3fa01d..71cfb043462f34f2b3bef594364d33f15e98d81e 100644 --- a/doc/doc_en/visualization_en.md +++ b/doc/doc_en/visualization_en.md @@ -1,5 +1,10 @@ # Visualization + +## PP-OCRv2 + + + ## ch_ppocr_server_2.0 diff --git a/doc/doc_en/whl_en.md b/doc/doc_en/whl_en.md index c8c8353accdf7f6ce179d3700547bfe9bd70c200..c2577e1e151e4675abab5139da099db9ad20fb4b 100644 --- a/doc/doc_en/whl_en.md +++ b/doc/doc_en/whl_en.md @@ -1,4 +1,4 @@ -# paddleocr package +# Paddleocr Package ## 1 Get started quickly ### 1.1 install package diff --git a/doc/imgs_results/PP-OCRv2/PP-OCRv2-pic001.jpg b/doc/imgs_results/PP-OCRv2/PP-OCRv2-pic001.jpg new file mode 100644 index 0000000000000000000000000000000000000000..45ffdb53aa431c8d25cc7219b2c0523690182ab6 Binary files /dev/null and b/doc/imgs_results/PP-OCRv2/PP-OCRv2-pic001.jpg differ diff --git a/doc/imgs_results/PP-OCRv2/PP-OCRv2-pic002.jpg b/doc/imgs_results/PP-OCRv2/PP-OCRv2-pic002.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7ac153aee0d703580971539b5cff95587c0e830e Binary files /dev/null and b/doc/imgs_results/PP-OCRv2/PP-OCRv2-pic002.jpg differ diff --git a/doc/imgs_results/PP-OCRv2/PP-OCRv2-pic003.jpg b/doc/imgs_results/PP-OCRv2/PP-OCRv2-pic003.jpg new file mode 100644 index 0000000000000000000000000000000000000000..781aade629651b5adf24fcc76b84a9674154b8b8 Binary files /dev/null and b/doc/imgs_results/PP-OCRv2/PP-OCRv2-pic003.jpg differ diff --git a/doc/install/linux/anaconda_download.png b/doc/install/linux/anaconda_download.png new file mode 100755 index 0000000000000000000000000000000000000000..6ab6db30899d8431874e52bbe97af242e638ed6c Binary files /dev/null and b/doc/install/linux/anaconda_download.png differ diff --git a/doc/install/linux/conda_create.png b/doc/install/linux/conda_create.png new file mode 100755 index 0000000000000000000000000000000000000000..533f592b7c1db78699d9166278e91332d3d8f258 Binary files /dev/null and b/doc/install/linux/conda_create.png differ diff --git a/doc/install/mac/anaconda_start.png b/doc/install/mac/anaconda_start.png new file mode 100755 index 0000000000000000000000000000000000000000..a860f5e56a76558a764d3d92055743832f4d5acb Binary files /dev/null and b/doc/install/mac/anaconda_start.png differ diff --git a/doc/install/mac/conda_activate.png b/doc/install/mac/conda_activate.png new file mode 100755 index 0000000000000000000000000000000000000000..a2e6074e912988218b62068476b9d5d22deb0d71 Binary files /dev/null and b/doc/install/mac/conda_activate.png differ diff --git a/doc/install/mac/conda_create.png b/doc/install/mac/conda_create.png new file mode 100755 index 0000000000000000000000000000000000000000..9ff10c241be39216ea8255826ea50844368f27e8 Binary files /dev/null and b/doc/install/mac/conda_create.png differ diff --git a/doc/install/windows/Anaconda_download.png b/doc/install/windows/Anaconda_download.png new file mode 100644 index 0000000000000000000000000000000000000000..83a03414934a12f7071389ef664b6fd5e7df956f Binary files /dev/null and b/doc/install/windows/Anaconda_download.png differ diff --git a/doc/install/windows/anaconda_install_env.png b/doc/install/windows/anaconda_install_env.png new file mode 100644 index 0000000000000000000000000000000000000000..7a22542712a3fa5d471f13d940806d483225c38f Binary files /dev/null and b/doc/install/windows/anaconda_install_env.png differ diff --git 
a/doc/install/windows/anaconda_install_folder.png b/doc/install/windows/anaconda_install_folder.png new file mode 100644 index 0000000000000000000000000000000000000000..e9fac29eaa92fc445d324a565e95c064a984f9bf Binary files /dev/null and b/doc/install/windows/anaconda_install_folder.png differ diff --git a/doc/install/windows/anaconda_prompt.png b/doc/install/windows/anaconda_prompt.png new file mode 100755 index 0000000000000000000000000000000000000000..1087610ae01f5c6181434e3dcc11189b138d419c Binary files /dev/null and b/doc/install/windows/anaconda_prompt.png differ diff --git a/doc/install/windows/conda_list_env.png b/doc/install/windows/conda_list_env.png new file mode 100644 index 0000000000000000000000000000000000000000..5ffa0037c5e62b75c7b452a4012b7015b03c3f5f Binary files /dev/null and b/doc/install/windows/conda_list_env.png differ diff --git a/doc/install/windows/conda_new_env.png b/doc/install/windows/conda_new_env.png new file mode 100644 index 0000000000000000000000000000000000000000..eed667ec3d4a4419cdfdd842fe57a1efca734c94 Binary files /dev/null and b/doc/install/windows/conda_new_env.png differ diff --git a/doc/joinus.PNG b/doc/joinus.PNG index 7a10f7aac3748062184085b68583c637d3963117..974a4bd008d7b103de044cf8b4dbf37f09a0d06b 100644 Binary files a/doc/joinus.PNG and b/doc/joinus.PNG differ diff --git a/doc/overview.png b/doc/overview.png new file mode 100644 index 0000000000000000000000000000000000000000..c5c4e09d6730bb0b1ca2c0b5442079ceb41ecdfa Binary files /dev/null and b/doc/overview.png differ diff --git a/doc/overview_en.png b/doc/overview_en.png new file mode 100644 index 0000000000000000000000000000000000000000..b44da4e9874d6a2162a8bb05ff1b479875bd65f3 Binary files /dev/null and b/doc/overview_en.png differ diff --git a/doc/ppocrv2_framework.jpg b/doc/ppocrv2_framework.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e5f1a2ef47601c3a9eaef43a6046a15ea0319e2b Binary files /dev/null and b/doc/ppocrv2_framework.jpg differ diff --git a/doc/table/1.png b/doc/table/1.png index 47df618ab1bef431a5dd94418c01be16b09d31aa..faff6e3178662407961fe074a9202015f755e2f8 100644 Binary files a/doc/table/1.png and b/doc/table/1.png differ diff --git a/doc/table/table.jpg b/doc/table/table.jpg index 3daa619e52dc2471df62ea7767be3bff350b623f..95fdf84d92908d4b21f49fb516601334867163b1 100644 Binary files a/doc/table/table.jpg and b/doc/table/table.jpg differ diff --git a/paddleocr.py b/paddleocr.py index c52737f55b61cd29c08367adb6d7e05c561e933e..a98efd34088701d5eb5602743cf75b7d5e80157f 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -33,104 +33,141 @@ from tools.infer.utility import draw_ocr, str2bool from ppstructure.utility import init_args, draw_structure_result from ppstructure.predict_system import OCRSystem, save_structure_res -__all__ = ['PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result', 'save_structure_res','download_with_progressbar'] - -model_urls = { - 'det': { - 'ch': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar', - 'en': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_ppocr_mobile_v2.0_det_infer.tar', - 'structure': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar' +__all__ = [ + 'PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result', + 'save_structure_res', 'download_with_progressbar' +] + +SUPPORT_DET_MODEL = ['DB'] +VERSION = '2.2.1' +SUPPORT_REC_MODEL = ['CRNN'] +BASE_DIR = os.path.expanduser("~/.paddleocr/") + 
+DEFAULT_MODEL_VERSION = '2.0' +MODEL_URLS = { + '2.1': { + 'det': { + 'ch': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar', + }, + }, + 'rec': { + 'ch': { + 'url': + 'https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar', + 'dict_path': './ppocr/utils/ppocr_keys_v1.txt' + } + } }, - 'rec': { - 'ch': { - 'url': - 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar', - 'dict_path': './ppocr/utils/ppocr_keys_v1.txt' + '2.0': { + 'det': { + 'ch': { + 'url': + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar', + }, + 'en': { + 'url': + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_ppocr_mobile_v2.0_det_infer.tar', + }, + 'structure': { + 'url': + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar' + } }, - 'en': { - 'url': + 'rec': { + 'ch': { + 'url': + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar', + 'dict_path': './ppocr/utils/ppocr_keys_v1.txt' + }, + 'en': { + 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar', - 'dict_path': './ppocr/utils/en_dict.txt' - }, - 'french': { - 'url': + 'dict_path': './ppocr/utils/en_dict.txt' + }, + 'french': { + 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_infer.tar', - 'dict_path': './ppocr/utils/dict/french_dict.txt' - }, - 'german': { - 'url': + 'dict_path': './ppocr/utils/dict/french_dict.txt' + }, + 'german': { + 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/german_mobile_v2.0_rec_infer.tar', - 'dict_path': './ppocr/utils/dict/german_dict.txt' - }, - 'korean': { - 'url': + 'dict_path': './ppocr/utils/dict/german_dict.txt' + }, + 'korean': { + 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_mobile_v2.0_rec_infer.tar', - 'dict_path': './ppocr/utils/dict/korean_dict.txt' - }, - 'japan': { - 'url': + 'dict_path': './ppocr/utils/dict/korean_dict.txt' + }, + 'japan': { + 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_mobile_v2.0_rec_infer.tar', - 'dict_path': './ppocr/utils/dict/japan_dict.txt' - }, - 'chinese_cht': { - 'url': + 'dict_path': './ppocr/utils/dict/japan_dict.txt' + }, + 'chinese_cht': { + 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_mobile_v2.0_rec_infer.tar', - 'dict_path': './ppocr/utils/dict/chinese_cht_dict.txt' - }, - 'ta': { - 'url': + 'dict_path': './ppocr/utils/dict/chinese_cht_dict.txt' + }, + 'ta': { + 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_mobile_v2.0_rec_infer.tar', - 'dict_path': './ppocr/utils/dict/ta_dict.txt' - }, - 'te': { - 'url': + 'dict_path': './ppocr/utils/dict/ta_dict.txt' + }, + 'te': { + 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_mobile_v2.0_rec_infer.tar', - 'dict_path': './ppocr/utils/dict/te_dict.txt' - }, - 'ka': { - 'url': + 'dict_path': './ppocr/utils/dict/te_dict.txt' + }, + 'ka': { + 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_mobile_v2.0_rec_infer.tar', - 'dict_path': './ppocr/utils/dict/ka_dict.txt' - }, - 'latin': { - 'url': + 'dict_path': './ppocr/utils/dict/ka_dict.txt' + }, + 'latin': { + 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_mobile_v2.0_rec_infer.tar', - 'dict_path': './ppocr/utils/dict/latin_dict.txt' - }, - 'arabic': { - 'url': + 'dict_path': 
'./ppocr/utils/dict/latin_dict.txt' + }, + 'arabic': { + 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_mobile_v2.0_rec_infer.tar', - 'dict_path': './ppocr/utils/dict/arabic_dict.txt' - }, - 'cyrillic': { - 'url': + 'dict_path': './ppocr/utils/dict/arabic_dict.txt' + }, + 'cyrillic': { + 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_mobile_v2.0_rec_infer.tar', - 'dict_path': './ppocr/utils/dict/cyrillic_dict.txt' - }, - 'devanagari': { - 'url': + 'dict_path': './ppocr/utils/dict/cyrillic_dict.txt' + }, + 'devanagari': { + 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_mobile_v2.0_rec_infer.tar', - 'dict_path': './ppocr/utils/dict/devanagari_dict.txt' + 'dict_path': './ppocr/utils/dict/devanagari_dict.txt' + }, + 'structure': { + 'url': + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar', + 'dict_path': 'ppocr/utils/dict/table_dict.txt' + } + }, + 'cls': { + 'ch': { + 'url': + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar', + } }, - 'structure': { - 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar', - 'dict_path': 'ppocr/utils/dict/table_dict.txt' + 'table': { + 'en': { + 'url': + 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar', + 'dict_path': 'ppocr/utils/dict/table_structure_dict.txt' + } } - }, - 'cls': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar', - 'table': { - 'url': 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar', - 'dict_path': 'ppocr/utils/dict/table_structure_dict.txt' } } -SUPPORT_DET_MODEL = ['DB'] -VERSION = '2.2' -SUPPORT_REC_MODEL = ['CRNN'] -BASE_DIR = os.path.expanduser("~/.paddleocr/") - def parse_args(mMain=True): import argparse @@ -140,6 +177,7 @@ def parse_args(mMain=True): parser.add_argument("--det", type=str2bool, default=True) parser.add_argument("--rec", type=str2bool, default=True) parser.add_argument("--type", type=str, default='ocr') + parser.add_argument("--version", type=str, default='2.1') for action in parser._actions: if action.dest in ['rec_char_dict_path', 'table_char_dict_path']: @@ -155,19 +193,19 @@ def parse_args(mMain=True): def parse_lang(lang): latin_lang = [ - 'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', - 'hr', 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', - 'mt', 'nl', 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', - 'sl', 'sq', 'sv', 'sw', 'tl', 'tr', 'uz', 'vi' + 'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr', + 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl', + 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv', + 'sw', 'tl', 'tr', 'uz', 'vi' ] arabic_lang = ['ar', 'fa', 'ug', 'ur'] cyrillic_lang = [ - 'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', - 'ava', 'dar', 'inh', 'che', 'lbe', 'lez', 'tab' + 'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava', + 'dar', 'inh', 'che', 'lbe', 'lez', 'tab' ] devanagari_lang = [ - 'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', - 'gom', 'sa', 'bgc' + 'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom', + 'sa', 'bgc' ] if lang in latin_lang: lang = "latin" @@ -177,9 +215,9 @@ def parse_lang(lang): lang = "cyrillic" elif lang in devanagari_lang: lang = 
"devanagari" - assert lang in model_urls[ + assert lang in MODEL_URLS[DEFAULT_MODEL_VERSION][ 'rec'], 'param lang must in {}, but got {}'.format( - model_urls['rec'].keys(), lang) + MODEL_URLS[DEFAULT_MODEL_VERSION]['rec'].keys(), lang) if lang == "ch": det_lang = "ch" elif lang == 'structure': @@ -189,6 +227,35 @@ def parse_lang(lang): return lang, det_lang +def get_model_config(version, model_type, lang): + if version not in MODEL_URLS: + logger.warning('version {} not in {}, use version {} instead'.format( + version, MODEL_URLS.keys(), DEFAULT_MODEL_VERSION)) + version = DEFAULT_MODEL_VERSION + if model_type not in MODEL_URLS[version]: + if model_type in MODEL_URLS[DEFAULT_MODEL_VERSION]: + logger.warning( + 'version {} not support {} models, use version {} instead'. + format(version, model_type, DEFAULT_MODEL_VERSION)) + version = DEFAULT_MODEL_VERSION + else: + logger.error('{} models is not support, we only support {}'.format( + model_type, MODEL_URLS[DEFAULT_MODEL_VERSION].keys())) + sys.exit(-1) + if lang not in MODEL_URLS[version][model_type]: + if lang in MODEL_URLS[DEFAULT_MODEL_VERSION][model_type]: + logger.warning('lang {} is not support in {}, use {} instead'. + format(lang, version, DEFAULT_MODEL_VERSION)) + version = DEFAULT_MODEL_VERSION + else: + logger.error( + 'lang {} is not support, we only support {} for {} models'. + format(lang, MODEL_URLS[DEFAULT_MODEL_VERSION][model_type].keys( + ), model_type)) + sys.exit(-1) + return MODEL_URLS[version][model_type][lang] + + class PaddleOCR(predict_system.TextSystem): def __init__(self, **kwargs): """ @@ -204,15 +271,21 @@ class PaddleOCR(predict_system.TextSystem): lang, det_lang = parse_lang(params.lang) # init model dir - params.det_model_dir, det_url = confirm_model_dir_url(params.det_model_dir, - os.path.join(BASE_DIR, VERSION, 'ocr', 'det', det_lang), - model_urls['det'][det_lang]) - params.rec_model_dir, rec_url = confirm_model_dir_url(params.rec_model_dir, - os.path.join(BASE_DIR, VERSION, 'ocr', 'rec', lang), - model_urls['rec'][lang]['url']) - params.cls_model_dir, cls_url = confirm_model_dir_url(params.cls_model_dir, - os.path.join(BASE_DIR, VERSION, 'ocr', 'cls'), - model_urls['cls']) + det_model_config = get_model_config(params.version, 'det', det_lang) + params.det_model_dir, det_url = confirm_model_dir_url( + params.det_model_dir, + os.path.join(BASE_DIR, VERSION, 'ocr', 'det', det_lang), + det_model_config['url']) + rec_model_config = get_model_config(params.version, 'rec', lang) + params.rec_model_dir, rec_url = confirm_model_dir_url( + params.rec_model_dir, + os.path.join(BASE_DIR, VERSION, 'ocr', 'rec', lang), + rec_model_config['url']) + cls_model_config = get_model_config(params.version, 'cls', 'ch') + params.cls_model_dir, cls_url = confirm_model_dir_url( + params.cls_model_dir, + os.path.join(BASE_DIR, VERSION, 'ocr', 'cls'), + cls_model_config['url']) # download model maybe_download(params.det_model_dir, det_url) maybe_download(params.rec_model_dir, rec_url) @@ -226,7 +299,8 @@ class PaddleOCR(predict_system.TextSystem): sys.exit(0) if params.rec_char_dict_path is None: - params.rec_char_dict_path = str(Path(__file__).parent / model_urls['rec'][lang]['dict_path']) + params.rec_char_dict_path = str( + Path(__file__).parent / rec_model_config['dict_path']) print(params) # init det_model and rec_model @@ -293,24 +367,32 @@ class PPStructure(OCRSystem): lang, det_lang = parse_lang(params.lang) # init model dir - params.det_model_dir, det_url = confirm_model_dir_url(params.det_model_dir, - 
os.path.join(BASE_DIR, VERSION, 'ocr', 'det', det_lang), - model_urls['det'][det_lang]) - params.rec_model_dir, rec_url = confirm_model_dir_url(params.rec_model_dir, - os.path.join(BASE_DIR, VERSION, 'ocr', 'rec', lang), - model_urls['rec'][lang]['url']) - params.table_model_dir, table_url = confirm_model_dir_url(params.table_model_dir, - os.path.join(BASE_DIR, VERSION, 'ocr', 'table'), - model_urls['table']['url']) + det_model_config = get_model_config(params.version, 'det', det_lang) + params.det_model_dir, det_url = confirm_model_dir_url( + params.det_model_dir, + os.path.join(BASE_DIR, VERSION, 'ocr', 'det', det_lang), + det_model_config['url']) + rec_model_config = get_model_config(params.version, 'rec', lang) + params.rec_model_dir, rec_url = confirm_model_dir_url( + params.rec_model_dir, + os.path.join(BASE_DIR, VERSION, 'ocr', 'rec', lang), + rec_model_config['url']) + table_model_config = get_model_config(params.version, 'table', 'en') + params.table_model_dir, table_url = confirm_model_dir_url( + params.table_model_dir, + os.path.join(BASE_DIR, VERSION, 'ocr', 'table'), + table_model_config['url']) # download model maybe_download(params.det_model_dir, det_url) maybe_download(params.rec_model_dir, rec_url) maybe_download(params.table_model_dir, table_url) if params.rec_char_dict_path is None: - params.rec_char_dict_path = str(Path(__file__).parent / model_urls['rec'][lang]['dict_path']) + params.rec_char_dict_path = str( + Path(__file__).parent / rec_model_config['dict_path']) if params.table_char_dict_path is None: - params.table_char_dict_path = str(Path(__file__).parent / model_urls['table']['dict_path']) + params.table_char_dict_path = str( + Path(__file__).parent / table_model_config['dict_path']) print(params) super().__init__(params) @@ -374,4 +456,3 @@ def main(): for item in result: item.pop('img') logger.info(item) - diff --git a/ppocr/data/__init__.py b/ppocr/data/__init__.py index e860c5a6986f495e6384d9df93c24795c04a0d5f..0bb3d506483a331fba48feafeff9ca2d439f3782 100644 --- a/ppocr/data/__init__.py +++ b/ppocr/data/__init__.py @@ -49,14 +49,12 @@ def term_mp(sig_num, frame): os.killpg(pgid, signal.SIGKILL) -signal.signal(signal.SIGINT, term_mp) -signal.signal(signal.SIGTERM, term_mp) - - def build_dataloader(config, mode, device, logger, seed=None): config = copy.deepcopy(config) - support_dict = ['SimpleDataSet', 'LMDBDataSet', 'PGDataSet', 'PubTabDataSet'] + support_dict = [ + 'SimpleDataSet', 'LMDBDataSet', 'PGDataSet', 'PubTabDataSet' + ] module_name = config[mode]['dataset']['name'] assert module_name in support_dict, Exception( 'DataSet only support {}'.format(support_dict)) @@ -96,4 +94,8 @@ def build_dataloader(config, mode, device, logger, seed=None): return_list=True, use_shared_memory=use_shared_memory) + # support exit using ctrl+c + signal.signal(signal.SIGINT, term_mp) + signal.signal(signal.SIGTERM, term_mp) + return data_loader diff --git a/ppocr/data/imaug/ColorJitter.py b/ppocr/data/imaug/ColorJitter.py new file mode 100644 index 0000000000000000000000000000000000000000..4b542abc8f9dc5af76529f9feb4bcb8b47b5f7d0 --- /dev/null +++ b/ppocr/data/imaug/ColorJitter.py @@ -0,0 +1,26 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddle.vision.transforms import ColorJitter as pp_ColorJitter + +__all__ = ['ColorJitter'] + +class ColorJitter(object): + def __init__(self, brightness=0, contrast=0, saturation=0, hue=0,**kwargs): + self.aug = pp_ColorJitter(brightness, contrast, saturation, hue) + + def __call__(self, data): + image = data['image'] + image = self.aug(image) + data['image'] = image + return data diff --git a/ppocr/data/imaug/__init__.py b/ppocr/data/imaug/__init__.py index 52194eb964f7a7fd159cc1a42b73d280f8ee5fb4..5aaa1cd71eb791efa94e6bd812f3ab76632c96c6 100644 --- a/ppocr/data/imaug/__init__.py +++ b/ppocr/data/imaug/__init__.py @@ -19,11 +19,13 @@ from __future__ import unicode_literals from .iaa_augment import IaaAugment from .make_border_map import MakeBorderMap from .make_shrink_map import MakeShrinkMap -from .random_crop_data import EastRandomCropData, PSERandomCrop +from .random_crop_data import EastRandomCropData, RandomCropImgMask +from .make_pse_gt import MakePseGt -from .rec_img_aug import RecAug, RecResizeImg, ClsResizeImg, SRNRecResizeImg +from .rec_img_aug import RecAug, RecResizeImg, ClsResizeImg, SRNRecResizeImg, NRTRRecResizeImg, SARRecResizeImg from .randaugment import RandAugment from .copy_paste import CopyPaste +from .ColorJitter import ColorJitter from .operators import * from .label_ops import * diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py index d222c4109c3723bc1adb71ee7c21a27a010f8f45..0a4fad621a9038e71a9d43eb4e12f78e7e92d73d 100644 --- a/ppocr/data/imaug/label_ops.py +++ b/ppocr/data/imaug/label_ops.py @@ -21,6 +21,8 @@ import numpy as np import string import json +from ppocr.utils.logging import get_logger + class ClsLabelEncode(object): def __init__(self, label_list, **kwargs): @@ -92,31 +94,23 @@ class BaseRecLabelEncode(object): def __init__(self, max_text_length, character_dict_path=None, - character_type='ch', use_space_char=False): - support_character_type = [ - 'ch', 'en', 'EN_symbol', 'french', 'german', 'japan', 'korean', - 'EN', 'it', 'xi', 'pu', 'ru', 'ar', 'ta', 'ug', 'fa', 'ur', 'rs', - 'oc', 'rsc', 'bg', 'uk', 'be', 'te', 'ka', 'chinese_cht', 'hi', - 'mr', 'ne', 'latin', 'arabic', 'cyrillic', 'devanagari' - ] - assert character_type in support_character_type, "Only {} are supported now but get {}".format( - support_character_type, character_type) self.max_text_len = max_text_length self.beg_str = "sos" self.end_str = "eos" - if character_type == "en": + self.lower = False + + if character_dict_path is None: + logger = get_logger() + logger.warning( + "The character_dict_path is None, model can only recognize number and lower letters" + ) self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" dict_character = list(self.character_str) - elif character_type == "EN_symbol": - # same with ASTER setting (use 94 char). 
- self.character_str = string.printable[:-6] - dict_character = list(self.character_str) - elif character_type in support_character_type: + self.lower = True + else: self.character_str = "" - assert character_dict_path is not None, "character_dict_path should not be None when character_type is {}".format( - character_type) with open(character_dict_path, "rb") as fin: lines = fin.readlines() for line in lines: @@ -125,7 +119,6 @@ class BaseRecLabelEncode(object): if use_space_char: self.character_str += " " dict_character = list(self.character_str) - self.character_type = character_type dict_character = self.add_special_char(dict_character) self.dict = {} for i, char in enumerate(dict_character): @@ -147,7 +140,7 @@ class BaseRecLabelEncode(object): """ if len(text) == 0 or len(text) > self.max_text_len: return None - if self.character_type == "en": + if self.lower: text = text.lower() text_list = [] for char in text: @@ -161,18 +154,47 @@ class BaseRecLabelEncode(object): return text_list +class NRTRLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + + super(NRTRLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len - 1: + return None + data['length'] = np.array(len(text)) + text.insert(0, 2) + text.append(3) + text = text + [0] * (self.max_text_len - len(text)) + data['label'] = np.array(text) + return data + + def add_special_char(self, dict_character): + dict_character = ['blank', '', '', ''] + dict_character + return dict_character + + class CTCLabelEncode(BaseRecLabelEncode): """ Convert between text-label and text-index """ def __init__(self, max_text_length, character_dict_path=None, - character_type='ch', use_space_char=False, **kwargs): - super(CTCLabelEncode, - self).__init__(max_text_length, character_dict_path, - character_type, use_space_char) + super(CTCLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) def __call__(self, data): text = data['label'] @@ -182,6 +204,11 @@ class CTCLabelEncode(BaseRecLabelEncode): data['length'] = np.array(len(text)) text = text + [0] * (self.max_text_len - len(text)) data['label'] = np.array(text) + + label = [0] * len(self.character) + for x in text: + label[x] += 1 + data['label_ace'] = np.array(label) return data def add_special_char(self, dict_character): @@ -193,12 +220,10 @@ class E2ELabelEncodeTest(BaseRecLabelEncode): def __init__(self, max_text_length, character_dict_path=None, - character_type='EN', use_space_char=False, **kwargs): - super(E2ELabelEncodeTest, - self).__init__(max_text_length, character_dict_path, - character_type, use_space_char) + super(E2ELabelEncodeTest, self).__init__( + max_text_length, character_dict_path, use_space_char) def __call__(self, data): import json @@ -267,12 +292,10 @@ class AttnLabelEncode(BaseRecLabelEncode): def __init__(self, max_text_length, character_dict_path=None, - character_type='ch', use_space_char=False, **kwargs): - super(AttnLabelEncode, - self).__init__(max_text_length, character_dict_path, - character_type, use_space_char) + super(AttnLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) def add_special_char(self, dict_character): self.beg_str = "sos" @@ -309,18 +332,46 @@ class 
AttnLabelEncode(BaseRecLabelEncode): return idx +class SEEDLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(SEEDLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def add_special_char(self, dict_character): + self.end_str = "eos" + dict_character = dict_character + [self.end_str] + return dict_character + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len: + return None + data['length'] = np.array(len(text)) + 1 # conclude eos + text = text + [len(self.character) - 1] * (self.max_text_len - len(text) + ) + data['label'] = np.array(text) + return data + + class SRNLabelEncode(BaseRecLabelEncode): """ Convert between text-label and text-index """ def __init__(self, max_text_length=25, character_dict_path=None, - character_type='en', use_space_char=False, **kwargs): - super(SRNLabelEncode, - self).__init__(max_text_length, character_dict_path, - character_type, use_space_char) + super(SRNLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) def add_special_char(self, dict_character): dict_character = dict_character + [self.beg_str, self.end_str] @@ -388,7 +439,6 @@ class TableLabelEncode(object): substr = lines[0].decode('utf-8').strip("\r\n").split("\t") character_num = int(substr[0]) elem_num = int(substr[1]) - for cno in range(1, 1 + character_num): character = lines[cno].decode('utf-8').strip("\r\n") list_character.append(character) @@ -521,3 +571,47 @@ class TableLabelEncode(object): assert False, "Unsupport type %s in char_or_elem" \ % char_or_elem return idx + + +class SARLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(SARLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def add_special_char(self, dict_character): + beg_end_str = "" + unknown_str = "" + padding_str = "" + dict_character = dict_character + [unknown_str] + self.unknown_idx = len(dict_character) - 1 + dict_character = dict_character + [beg_end_str] + self.start_idx = len(dict_character) - 1 + self.end_idx = len(dict_character) - 1 + dict_character = dict_character + [padding_str] + self.padding_idx = len(dict_character) - 1 + + return dict_character + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len - 1: + return None + data['length'] = np.array(len(text)) + target = [self.start_idx] + text + [self.end_idx] + padded_text = [self.padding_idx for _ in range(self.max_text_len)] + + padded_text[:len(target)] = target + data['label'] = np.array(padded_text) + return data + + def get_ignored_tokens(self): + return [self.padding_idx] diff --git a/ppocr/data/imaug/make_pse_gt.py b/ppocr/data/imaug/make_pse_gt.py new file mode 100644 index 0000000000000000000000000000000000000000..55abc8970784fd00843d2e91f259c58b65ae8579 --- /dev/null +++ b/ppocr/data/imaug/make_pse_gt.py @@ -0,0 +1,85 @@ +# -*- coding:utf-8 -*- + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import cv2 +import numpy as np +import pyclipper +from shapely.geometry 
import Polygon + +__all__ = ['MakePseGt'] + +class MakePseGt(object): + r''' + Making binary mask from detection data with ICDAR format. + Typically following the process of class `MakeICDARData`. + ''' + + def __init__(self, kernel_num=7, size=640, min_shrink_ratio=0.4, **kwargs): + self.kernel_num = kernel_num + self.min_shrink_ratio = min_shrink_ratio + self.size = size + + def __call__(self, data): + + image = data['image'] + text_polys = data['polys'] + ignore_tags = data['ignore_tags'] + + h, w, _ = image.shape + short_edge = min(h, w) + if short_edge < self.size: + # keep short_size >= self.size + scale = self.size / short_edge + image = cv2.resize(image, dsize=None, fx=scale, fy=scale) + text_polys *= scale + + gt_kernels = [] + for i in range(1,self.kernel_num+1): + # s1->sn, from big to small + rate = 1.0 - (1.0 - self.min_shrink_ratio) / (self.kernel_num - 1) * i + text_kernel, ignore_tags = self.generate_kernel(image.shape[0:2], rate, text_polys, ignore_tags) + gt_kernels.append(text_kernel) + + training_mask = np.ones(image.shape[0:2], dtype='uint8') + for i in range(text_polys.shape[0]): + if ignore_tags[i]: + cv2.fillPoly(training_mask, text_polys[i].astype(np.int32)[np.newaxis, :, :], 0) + + gt_kernels = np.array(gt_kernels) + gt_kernels[gt_kernels > 0] = 1 + + data['image'] = image + data['polys'] = text_polys + data['gt_kernels'] = gt_kernels[0:] + data['gt_text'] = gt_kernels[0] + data['mask'] = training_mask.astype('float32') + return data + + def generate_kernel(self, img_size, shrink_ratio, text_polys, ignore_tags=None): + h, w = img_size + text_kernel = np.zeros((h, w), dtype=np.float32) + for i, poly in enumerate(text_polys): + polygon = Polygon(poly) + distance = polygon.area * (1 - shrink_ratio * shrink_ratio) / (polygon.length + 1e-6) + subject = [tuple(l) for l in poly] + pco = pyclipper.PyclipperOffset() + pco.AddPath(subject, pyclipper.JT_ROUND, + pyclipper.ET_CLOSEDPOLYGON) + shrinked = np.array(pco.Execute(-distance)) + + if len(shrinked) == 0 or shrinked.size == 0: + if ignore_tags is not None: + ignore_tags[i] = True + continue + try: + shrinked = np.array(shrinked[0]).reshape(-1, 2) + except: + if ignore_tags is not None: + ignore_tags[i] = True + continue + cv2.fillPoly(text_kernel, [shrinked.astype(np.int32)], i + 1) + return text_kernel, ignore_tags diff --git a/ppocr/data/imaug/operators.py b/ppocr/data/imaug/operators.py index 2535b4420c503f2e9e9cc5a677ef70c4dd9c36be..87e3088d07a8c5a2eea5d4deff87c69a753e215b 100644 --- a/ppocr/data/imaug/operators.py +++ b/ppocr/data/imaug/operators.py @@ -23,6 +23,7 @@ import sys import six import cv2 import numpy as np +import fasttext class DecodeImage(object): @@ -57,6 +58,39 @@ class DecodeImage(object): return data +class NRTRDecodeImage(object): + """ decode image """ + + def __init__(self, img_mode='RGB', channel_first=False, **kwargs): + self.img_mode = img_mode + self.channel_first = channel_first + + def __call__(self, data): + img = data['image'] + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + img = np.frombuffer(img, dtype='uint8') + + img = cv2.imdecode(img, 1) + + if img is None: + return None + if self.img_mode == 'GRAY': + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif self.img_mode == 'RGB': + assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) + img = img[:, :, ::-1] + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + if 
self.channel_first: + img = img.transpose((2, 0, 1)) + data['image'] = img + return data + + class NormalizeImage(object): """ normalize image such as substract mean, divide std """ @@ -81,7 +115,7 @@ class NormalizeImage(object): assert isinstance(img, np.ndarray), "invalid input 'img' in NormalizeImage" data['image'] = ( - img.astype('float32') * self.scale - self.mean) / self.std + img.astype('float32') * self.scale - self.mean) / self.std return data @@ -101,6 +135,17 @@ class ToCHWImage(object): return data +class Fasttext(object): + def __init__(self, path="None", **kwargs): + self.fast_model = fasttext.load_model(path) + + def __call__(self, data): + label = data['label'] + fast_label = self.fast_model[label] + data['fast_label'] = fast_label + return data + + class KeepKeys(object): def __init__(self, keep_keys, **kwargs): self.keep_keys = keep_keys @@ -112,6 +157,34 @@ class KeepKeys(object): return data_list +class Resize(object): + def __init__(self, size=(640, 640), **kwargs): + self.size = size + + def resize_image(self, img): + resize_h, resize_w = self.size + ori_h, ori_w = img.shape[:2] # (h, w, c) + ratio_h = float(resize_h) / ori_h + ratio_w = float(resize_w) / ori_w + img = cv2.resize(img, (int(resize_w), int(resize_h))) + return img, [ratio_h, ratio_w] + + def __call__(self, data): + img = data['image'] + text_polys = data['polys'] + + img_resize, [ratio_h, ratio_w] = self.resize_image(img) + new_boxes = [] + for box in text_polys: + new_box = [] + for cord in box: + new_box.append([cord[0] * ratio_w, cord[1] * ratio_h]) + new_boxes.append(new_box) + data['image'] = img_resize + data['polys'] = np.array(new_boxes, dtype=np.float32) + return data + + class DetResizeForTest(object): def __init__(self, **kwargs): super(DetResizeForTest, self).__init__() @@ -183,7 +256,7 @@ class DetResizeForTest(object): else: ratio = 1. 
elif self.limit_type == 'resize_long': - ratio = float(limit_side_len) / max(h,w) + ratio = float(limit_side_len) / max(h, w) else: raise Exception('not support limit type, image ') resize_h = int(h * ratio) diff --git a/ppocr/data/imaug/random_crop_data.py b/ppocr/data/imaug/random_crop_data.py index 4d67cff61d6f340be6d80d8243c68909a94c4e88..7c1c25abb56a0cf7d4d59b8523962bd5d81c873a 100644 --- a/ppocr/data/imaug/random_crop_data.py +++ b/ppocr/data/imaug/random_crop_data.py @@ -164,47 +164,55 @@ class EastRandomCropData(object): return data -class PSERandomCrop(object): - def __init__(self, size, **kwargs): +class RandomCropImgMask(object): + def __init__(self, size, main_key, crop_keys, p=3 / 8, **kwargs): self.size = size + self.main_key = main_key + self.crop_keys = crop_keys + self.p = p def __call__(self, data): - imgs = data['imgs'] + image = data['image'] - h, w = imgs[0].shape[0:2] + h, w = image.shape[0:2] th, tw = self.size if w == tw and h == th: - return imgs + return data - # label中存在文本实例,并且按照概率进行裁剪,使用threshold_label_map控制 - if np.max(imgs[2]) > 0 and random.random() > 3 / 8: - # 文本实例的左上角点 - tl = np.min(np.where(imgs[2] > 0), axis=1) - self.size + mask = data[self.main_key] + if np.max(mask) > 0 and random.random() > self.p: + # make sure to crop the text region + tl = np.min(np.where(mask > 0), axis=1) - (th, tw) tl[tl < 0] = 0 - # 文本实例的右下角点 - br = np.max(np.where(imgs[2] > 0), axis=1) - self.size + br = np.max(np.where(mask > 0), axis=1) - (th, tw) br[br < 0] = 0 - # 保证选到右下角点时,有足够的距离进行crop + br[0] = min(br[0], h - th) br[1] = min(br[1], w - tw) - for _ in range(50000): - i = random.randint(tl[0], br[0]) - j = random.randint(tl[1], br[1]) - # 保证shrink_label_map有文本 - if imgs[1][i:i + th, j:j + tw].sum() <= 0: - continue - else: - break + i = random.randint(tl[0], br[0]) if tl[0] < br[0] else 0 + j = random.randint(tl[1], br[1]) if tl[1] < br[1] else 0 else: - i = random.randint(0, h - th) - j = random.randint(0, w - tw) + i = random.randint(0, h - th) if h - th > 0 else 0 + j = random.randint(0, w - tw) if w - tw > 0 else 0 # return i, j, th, tw - for idx in range(len(imgs)): - if len(imgs[idx].shape) == 3: - imgs[idx] = imgs[idx][i:i + th, j:j + tw, :] - else: - imgs[idx] = imgs[idx][i:i + th, j:j + tw] - data['imgs'] = imgs + for k in data: + if k in self.crop_keys: + if len(data[k].shape) == 3: + if np.argmin(data[k].shape) == 0: + img = data[k][:, i:i + th, j:j + tw] + if img.shape[1] != img.shape[2]: + a = 1 + elif np.argmin(data[k].shape) == 2: + img = data[k][i:i + th, j:j + tw, :] + if img.shape[1] != img.shape[0]: + a = 1 + else: + img = data[k] + else: + img = data[k][i:i + th, j:j + tw] + if img.shape[0] != img.shape[1]: + a = 1 + data[k] = img return data diff --git a/ppocr/data/imaug/rec_img_aug.py b/ppocr/data/imaug/rec_img_aug.py index 28e6bd0bce768c45dbc334c15ace601fd6403f5d..b4de6de95b09ced803375d9a3bb857194ef3e64b 100644 --- a/ppocr/data/imaug/rec_img_aug.py +++ b/ppocr/data/imaug/rec_img_aug.py @@ -16,7 +16,7 @@ import math import cv2 import numpy as np import random - +from PIL import Image from .text_image_aug import tia_perspective, tia_stretch, tia_distort @@ -43,22 +43,64 @@ class ClsResizeImg(object): return data +class NRTRRecResizeImg(object): + def __init__(self, image_shape, resize_type, padding=False, **kwargs): + self.image_shape = image_shape + self.resize_type = resize_type + self.padding = padding + + def __call__(self, data): + img = data['image'] + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + image_shape = self.image_shape + if 
self.padding: + imgC, imgH, imgW = image_shape + # todo: change to 0 and modified image shape + h = img.shape[0] + w = img.shape[1] + ratio = w / float(h) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = cv2.resize(img, (resized_w, imgH)) + norm_img = np.expand_dims(resized_image, -1) + norm_img = norm_img.transpose((2, 0, 1)) + resized_image = norm_img.astype(np.float32) / 128. - 1. + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + data['image'] = padding_im + return data + if self.resize_type == 'PIL': + image_pil = Image.fromarray(np.uint8(img)) + img = image_pil.resize(self.image_shape, Image.ANTIALIAS) + img = np.array(img) + if self.resize_type == 'OpenCV': + img = cv2.resize(img, self.image_shape) + norm_img = np.expand_dims(img, -1) + norm_img = norm_img.transpose((2, 0, 1)) + data['image'] = norm_img.astype(np.float32) / 128. - 1. + return data + + class RecResizeImg(object): def __init__(self, image_shape, infer_mode=False, - character_type='ch', + character_dict_path='./ppocr/utils/ppocr_keys_v1.txt', + padding=True, **kwargs): self.image_shape = image_shape self.infer_mode = infer_mode - self.character_type = character_type + self.character_dict_path = character_dict_path + self.padding = padding def __call__(self, data): img = data['image'] - if self.infer_mode and self.character_type == "ch": + if self.infer_mode and self.character_dict_path is not None: norm_img = resize_norm_img_chinese(img, self.image_shape) else: - norm_img = resize_norm_img(img, self.image_shape) + norm_img = resize_norm_img(img, self.image_shape, self.padding) data['image'] = norm_img return data @@ -83,16 +125,72 @@ class SRNRecResizeImg(object): return data -def resize_norm_img(img, image_shape): - imgC, imgH, imgW = image_shape +class SARRecResizeImg(object): + def __init__(self, image_shape, width_downsample_ratio=0.25, **kwargs): + self.image_shape = image_shape + self.width_downsample_ratio = width_downsample_ratio + + def __call__(self, data): + img = data['image'] + norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar( + img, self.image_shape, self.width_downsample_ratio) + data['image'] = norm_img + data['resized_shape'] = resize_shape + data['pad_shape'] = pad_shape + data['valid_ratio'] = valid_ratio + return data + + +def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25): + imgC, imgH, imgW_min, imgW_max = image_shape h = img.shape[0] w = img.shape[1] + valid_ratio = 1.0 + # make sure new_width is an integral multiple of width_divisor. 
+ width_divisor = int(1 / width_downsample_ratio) + # resize ratio = w / float(h) - if math.ceil(imgH * ratio) > imgW: + resize_w = math.ceil(imgH * ratio) + if resize_w % width_divisor != 0: + resize_w = round(resize_w / width_divisor) * width_divisor + if imgW_min is not None: + resize_w = max(imgW_min, resize_w) + if imgW_max is not None: + valid_ratio = min(1.0, 1.0 * resize_w / imgW_max) + resize_w = min(imgW_max, resize_w) + resized_image = cv2.resize(img, (resize_w, imgH)) + resized_image = resized_image.astype('float32') + # norm + if image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + resize_shape = resized_image.shape + padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32) + padding_im[:, :, 0:resize_w] = resized_image + pad_shape = padding_im.shape + + return padding_im, resize_shape, pad_shape, valid_ratio + + +def resize_norm_img(img, image_shape, padding=True): + imgC, imgH, imgW = image_shape + h = img.shape[0] + w = img.shape[1] + if not padding: + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) resized_w = imgW else: - resized_w = int(math.ceil(imgH * ratio)) - resized_image = cv2.resize(img, (resized_w, imgH)) + ratio = w / float(h) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = cv2.resize(img, (resized_w, imgH)) resized_image = resized_image.astype('float32') if image_shape[0] == 1: resized_image = resized_image / 255 diff --git a/ppocr/data/simple_dataset.py b/ppocr/data/simple_dataset.py index e9c3394cbe930d5169ae005e7582a2902e697b7e..6a33e1342506f26ccaa4a146f3f02fadfbd741a2 100644 --- a/ppocr/data/simple_dataset.py +++ b/ppocr/data/simple_dataset.py @@ -15,7 +15,6 @@ import numpy as np import os import random from paddle.io import Dataset - from .imaug import transform, create_operators diff --git a/ppocr/losses/__init__.py b/ppocr/losses/__init__.py index 025ae7ca5cc604eea59423ca7f523c37c1492e35..f3f4cd49332b605ec3a0e65e688d965fd91a5cdf 100755 --- a/ppocr/losses/__init__.py +++ b/ppocr/losses/__init__.py @@ -20,11 +20,15 @@ import paddle.nn as nn from .det_db_loss import DBLoss from .det_east_loss import EASTLoss from .det_sast_loss import SASTLoss +from .det_pse_loss import PSELoss # rec loss from .rec_ctc_loss import CTCLoss from .rec_att_loss import AttentionLoss from .rec_srn_loss import SRNLoss +from .rec_nrtr_loss import NRTRLoss +from .rec_sar_loss import SARLoss +from .rec_aster_loss import AsterLoss # cls loss from .cls_loss import ClsLoss @@ -41,10 +45,12 @@ from .combined_loss import CombinedLoss # table loss from .table_att_loss import TableAttentionLoss + def build_loss(config): support_dict = [ - 'DBLoss', 'EASTLoss', 'SASTLoss', 'CTCLoss', 'ClsLoss', 'AttentionLoss', - 'SRNLoss', 'PGLoss', 'CombinedLoss', 'TableAttentionLoss' + 'DBLoss', 'PSELoss', 'EASTLoss', 'SASTLoss', 'CTCLoss', 'ClsLoss', + 'AttentionLoss', 'SRNLoss', 'PGLoss', 'CombinedLoss', 'NRTRLoss', + 'TableAttentionLoss', 'SARLoss', 'AsterLoss' ] config = copy.deepcopy(config) module_name = config.pop('name') diff --git a/ppocr/losses/ace_loss.py b/ppocr/losses/ace_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..bf15f8e3a7b355bd9e8b69435a5dae01fc75a892 --- /dev/null +++ b/ppocr/losses/ace_loss.py @@ -0,0 +1,49 @@ +# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + + +class ACELoss(nn.Layer): + def __init__(self, **kwargs): + super().__init__() + self.loss_func = nn.CrossEntropyLoss( + weight=None, + ignore_index=0, + reduction='none', + soft_label=True, + axis=-1) + + def __call__(self, predicts, batch): + if isinstance(predicts, (list, tuple)): + predicts = predicts[-1] + + B, N = predicts.shape[:2] + div = paddle.to_tensor([N]).astype('float32') + + predicts = nn.functional.softmax(predicts, axis=-1) + aggregation_preds = paddle.sum(predicts, axis=1) + aggregation_preds = paddle.divide(aggregation_preds, div) + + length = batch[2].astype("float32") + batch = batch[3].astype("float32") + batch[:, 0] = paddle.subtract(div, length) + batch = paddle.divide(batch, div) + + loss = self.loss_func(aggregation_preds, batch) + return {"loss_ace": loss} diff --git a/ppocr/losses/basic_loss.py b/ppocr/losses/basic_loss.py index 8306523ac1a933f0c664fc0b4cf077659cccdee3..d2ef5e5ac9692eec5bc30774c4451eab7706705d 100644 --- a/ppocr/losses/basic_loss.py +++ b/ppocr/losses/basic_loss.py @@ -56,31 +56,34 @@ class CELoss(nn.Layer): class KLJSLoss(object): def __init__(self, mode='kl'): - assert mode in ['kl', 'js', 'KL', 'JS'], "mode can only be one of ['kl', 'js', 'KL', 'JS']" + assert mode in ['kl', 'js', 'KL', 'JS' + ], "mode can only be one of ['kl', 'js', 'KL', 'JS']" self.mode = mode def __call__(self, p1, p2, reduction="mean"): - loss = paddle.multiply(p2, paddle.log( (p2+1e-5)/(p1+1e-5) + 1e-5)) + loss = paddle.multiply(p2, paddle.log((p2 + 1e-5) / (p1 + 1e-5) + 1e-5)) if self.mode.lower() == "js": - loss += paddle.multiply(p1, paddle.log((p1+1e-5)/(p2+1e-5) + 1e-5)) + loss += paddle.multiply( + p1, paddle.log((p1 + 1e-5) / (p2 + 1e-5) + 1e-5)) loss *= 0.5 if reduction == "mean": - loss = paddle.mean(loss, axis=[1,2]) - elif reduction=="none" or reduction is None: - return loss + loss = paddle.mean(loss, axis=[1, 2]) + elif reduction == "none" or reduction is None: + return loss else: - loss = paddle.sum(loss, axis=[1,2]) + loss = paddle.sum(loss, axis=[1, 2]) + + return loss - return loss class DMLLoss(nn.Layer): """ DMLLoss """ - def __init__(self, act=None): + def __init__(self, act=None, use_log=False): super().__init__() if act is not None: assert act in ["softmax", "sigmoid"] @@ -90,20 +93,24 @@ class DMLLoss(nn.Layer): self.act = nn.Sigmoid() else: self.act = None - + + self.use_log = use_log + self.jskl_loss = KLJSLoss(mode="js") def forward(self, out1, out2): if self.act is not None: out1 = self.act(out1) out2 = self.act(out2) - if len(out1.shape) < 2: + if self.use_log: + # for recognition distillation, log is needed for feature map log_out1 = paddle.log(out1) log_out2 = paddle.log(out2) loss = (F.kl_div( log_out1, out2, reduction='batchmean') + F.kl_div( log_out2, out1, reduction='batchmean')) / 2.0 else: + # for detection 
distillation log is not needed loss = self.jskl_loss(out1, out2) return loss diff --git a/ppocr/losses/center_loss.py b/ppocr/losses/center_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..cbef4df965e2659c6aa63c0c69cd8798143df485 --- /dev/null +++ b/ppocr/losses/center_loss.py @@ -0,0 +1,89 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +import pickle + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class CenterLoss(nn.Layer): + """ + Reference: Wen et al. A Discriminative Feature Learning Approach for Deep Face Recognition. ECCV 2016. + """ + def __init__(self, + num_classes=6625, + feat_dim=96, + init_center=False, + center_file_path=None): + super().__init__() + self.num_classes = num_classes + self.feat_dim = feat_dim + self.centers = paddle.randn( + shape=[self.num_classes, self.feat_dim]).astype("float64") + + if init_center: + assert os.path.exists( + center_file_path + ), f"center path({center_file_path}) must exist when init_center is set as True." + with open(center_file_path, 'rb') as f: + char_dict = pickle.load(f) + for key in char_dict.keys(): + self.centers[key] = paddle.to_tensor(char_dict[key]) + + def __call__(self, predicts, batch): + assert isinstance(predicts, (list, tuple)) + features, predicts = predicts + + feats_reshape = paddle.reshape( + features, [-1, features.shape[-1]]).astype("float64") + label = paddle.argmax(predicts, axis=2) + label = paddle.reshape(label, [label.shape[0] * label.shape[1]]) + + batch_size = feats_reshape.shape[0] + + #calc l2 distance between feats and centers + square_feat = paddle.sum(paddle.square(feats_reshape), + axis=1, + keepdim=True) + square_feat = paddle.expand(square_feat, [batch_size, self.num_classes]) + + square_center = paddle.sum(paddle.square(self.centers), + axis=1, + keepdim=True) + square_center = paddle.expand( + square_center, [self.num_classes, batch_size]).astype("float64") + square_center = paddle.transpose(square_center, [1, 0]) + + distmat = paddle.add(square_feat, square_center) + feat_dot_center = paddle.matmul(feats_reshape, + paddle.transpose(self.centers, [1, 0])) + distmat = distmat - 2.0 * feat_dot_center + + #generate the mask + classes = paddle.arange(self.num_classes).astype("int64") + label = paddle.expand( + paddle.unsqueeze(label, 1), (batch_size, self.num_classes)) + mask = paddle.equal( + paddle.expand(classes, [batch_size, self.num_classes]), + label).astype("float64") + dist = paddle.multiply(distmat, mask) + + loss = paddle.sum(paddle.clip(dist, min=1e-12, max=1e+12)) / batch_size + return {'loss_center': loss} diff --git a/ppocr/losses/combined_loss.py b/ppocr/losses/combined_loss.py index 0d6fe968d0d7733200a4cfd21d779196cccaba03..72f706e37d6eb0c640cc30de80afe00bce82fd13 100644 --- a/ppocr/losses/combined_loss.py +++ b/ppocr/losses/combined_loss.py @@ -15,6 
+15,10 @@ import paddle import paddle.nn as nn +from .rec_ctc_loss import CTCLoss +from .center_loss import CenterLoss +from .ace_loss import ACELoss + from .distillation_loss import DistillationCTCLoss from .distillation_loss import DistillationDMLLoss from .distillation_loss import DistillationDistanceLoss, DistillationDBLoss, DistillationDilaDBLoss @@ -49,11 +53,15 @@ class CombinedLoss(nn.Layer): loss = loss_func(input, batch, **kargs) if isinstance(loss, paddle.Tensor): loss = {"loss_{}_{}".format(str(loss), idx): loss} + weight = self.loss_weight[idx] - for key in loss.keys(): - if key == "loss": - loss_all += loss[key] * weight - else: - loss_dict["{}_{}".format(key, idx)] = loss[key] + + loss = {key: loss[key] * weight for key in loss} + + if "loss" in loss: + loss_all += loss["loss"] + else: + loss_all += paddle.add_n(list(loss.values())) + loss_dict.update(loss) loss_dict["loss"] = loss_all return loss_dict diff --git a/ppocr/losses/det_basic_loss.py b/ppocr/losses/det_basic_loss.py index eba5526dd2bd1c0328130b50817172df437cc360..7017236c284e55710f242275a413d56d32158d34 100644 --- a/ppocr/losses/det_basic_loss.py +++ b/ppocr/losses/det_basic_loss.py @@ -75,12 +75,6 @@ class BalanceLoss(nn.Layer): mask (variable): masked maps. return: (variable) balanced loss """ - # if self.main_loss_type in ['DiceLoss']: - # # For the loss that returns to scalar value, perform ohem on the mask - # mask = ohem_batch(pred, gt, mask, self.negative_ratio) - # loss = self.loss(pred, gt, mask) - # return loss - positive = gt * mask negative = (1 - gt) * mask @@ -153,53 +147,4 @@ class BCELoss(nn.Layer): def forward(self, input, label, mask=None, weight=None, name=None): loss = F.binary_cross_entropy(input, label, reduction=self.reduction) - return loss - - -def ohem_single(score, gt_text, training_mask, ohem_ratio): - pos_num = (int)(np.sum(gt_text > 0.5)) - ( - int)(np.sum((gt_text > 0.5) & (training_mask <= 0.5))) - - if pos_num == 0: - # selected_mask = gt_text.copy() * 0 # may be not good - selected_mask = training_mask - selected_mask = selected_mask.reshape( - 1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32') - return selected_mask - - neg_num = (int)(np.sum(gt_text <= 0.5)) - neg_num = (int)(min(pos_num * ohem_ratio, neg_num)) - - if neg_num == 0: - selected_mask = training_mask - selected_mask = selected_mask.reshape( - 1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32') - return selected_mask - - neg_score = score[gt_text <= 0.5] - # 将负样本得分从高到低排序 - neg_score_sorted = np.sort(-neg_score) - threshold = -neg_score_sorted[neg_num - 1] - # 选出 得分高的 负样本 和正样本 的 mask - selected_mask = ((score >= threshold) | - (gt_text > 0.5)) & (training_mask > 0.5) - selected_mask = selected_mask.reshape( - 1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32') - return selected_mask - - -def ohem_batch(scores, gt_texts, training_masks, ohem_ratio): - scores = scores.numpy() - gt_texts = gt_texts.numpy() - training_masks = training_masks.numpy() - - selected_masks = [] - for i in range(scores.shape[0]): - selected_masks.append( - ohem_single(scores[i, :, :], gt_texts[i, :, :], training_masks[ - i, :, :], ohem_ratio)) - - selected_masks = np.concatenate(selected_masks, 0) - selected_masks = paddle.to_tensor(selected_masks) - - return selected_masks + return loss \ No newline at end of file diff --git a/ppocr/losses/det_pse_loss.py b/ppocr/losses/det_pse_loss.py new file mode 100644 index 
0000000000000000000000000000000000000000..78423091f841f29b1217f73f79beb26fe1575844 --- /dev/null +++ b/ppocr/losses/det_pse_loss.py @@ -0,0 +1,145 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn +from paddle.nn import functional as F +import numpy as np +from ppocr.utils.iou import iou + + +class PSELoss(nn.Layer): + def __init__(self, + alpha, + ohem_ratio=3, + kernel_sample_mask='pred', + reduction='sum', + eps=1e-6, + **kwargs): + """Implement PSE Loss. + """ + super(PSELoss, self).__init__() + assert reduction in ['sum', 'mean', 'none'] + self.alpha = alpha + self.ohem_ratio = ohem_ratio + self.kernel_sample_mask = kernel_sample_mask + self.reduction = reduction + self.eps = eps + + def forward(self, outputs, labels): + predicts = outputs['maps'] + predicts = F.interpolate(predicts, scale_factor=4) + + texts = predicts[:, 0, :, :] + kernels = predicts[:, 1:, :, :] + gt_texts, gt_kernels, training_masks = labels[1:] + + # text loss + selected_masks = self.ohem_batch(texts, gt_texts, training_masks) + + loss_text = self.dice_loss(texts, gt_texts, selected_masks) + iou_text = iou((texts > 0).astype('int64'), + gt_texts, + training_masks, + reduce=False) + losses = dict(loss_text=loss_text, iou_text=iou_text) + + # kernel loss + loss_kernels = [] + if self.kernel_sample_mask == 'gt': + selected_masks = gt_texts * training_masks + elif self.kernel_sample_mask == 'pred': + selected_masks = ( + F.sigmoid(texts) > 0.5).astype('float32') * training_masks + + for i in range(kernels.shape[1]): + kernel_i = kernels[:, i, :, :] + gt_kernel_i = gt_kernels[:, i, :, :] + loss_kernel_i = self.dice_loss(kernel_i, gt_kernel_i, + selected_masks) + loss_kernels.append(loss_kernel_i) + loss_kernels = paddle.mean(paddle.stack(loss_kernels, axis=1), axis=1) + iou_kernel = iou((kernels[:, -1, :, :] > 0).astype('int64'), + gt_kernels[:, -1, :, :], + training_masks * gt_texts, + reduce=False) + losses.update(dict(loss_kernels=loss_kernels, iou_kernel=iou_kernel)) + loss = self.alpha * loss_text + (1 - self.alpha) * loss_kernels + losses['loss'] = loss + if self.reduction == 'sum': + losses = {x: paddle.sum(v) for x, v in losses.items()} + elif self.reduction == 'mean': + losses = {x: paddle.mean(v) for x, v in losses.items()} + return losses + + def dice_loss(self, input, target, mask): + input = F.sigmoid(input) + + input = input.reshape([input.shape[0], -1]) + target = target.reshape([target.shape[0], -1]) + mask = mask.reshape([mask.shape[0], -1]) + + input = input * mask + target = target * mask + + a = paddle.sum(input * target, 1) + b = paddle.sum(input * input, 1) + self.eps + c = paddle.sum(target * target, 1) + self.eps + d = (2 * a) / (b + c) + return 1 - d + + def ohem_single(self, score, gt_text, training_mask, ohem_ratio=3): + pos_num = int(paddle.sum((gt_text > 0.5).astype('float32'))) - int( + paddle.sum( + paddle.logical_and((gt_text > 0.5), (training_mask <= 0.5)) + .astype('float32'))) + + if 
pos_num == 0: + selected_mask = training_mask + selected_mask = selected_mask.reshape( + [1, selected_mask.shape[0], selected_mask.shape[1]]).astype( + 'float32') + return selected_mask + + neg_num = int(paddle.sum((gt_text <= 0.5).astype('float32'))) + neg_num = int(min(pos_num * ohem_ratio, neg_num)) + + if neg_num == 0: + selected_mask = training_mask + selected_mask = selected_mask.view( + 1, selected_mask.shape[0], + selected_mask.shape[1]).astype('float32') + return selected_mask + + neg_score = paddle.masked_select(score, gt_text <= 0.5) + neg_score_sorted = paddle.sort(-neg_score) + threshold = -neg_score_sorted[neg_num - 1] + + selected_mask = paddle.logical_and( + paddle.logical_or((score >= threshold), (gt_text > 0.5)), + (training_mask > 0.5)) + selected_mask = selected_mask.reshape( + [1, selected_mask.shape[0], selected_mask.shape[1]]).astype( + 'float32') + return selected_mask + + def ohem_batch(self, scores, gt_texts, training_masks, ohem_ratio=3): + selected_masks = [] + for i in range(scores.shape[0]): + selected_masks.append( + self.ohem_single(scores[i, :, :], gt_texts[i, :, :], + training_masks[i, :, :], ohem_ratio)) + + selected_masks = paddle.concat(selected_masks, 0).astype('float32') + return selected_masks diff --git a/ppocr/losses/distillation_loss.py b/ppocr/losses/distillation_loss.py index 75f0a773152e52c98ada5c1907f1c8cc2f72d8f3..06aa7fa8458a5deece75f1393fe7300e8227d3ca 100644 --- a/ppocr/losses/distillation_loss.py +++ b/ppocr/losses/distillation_loss.py @@ -44,20 +44,22 @@ class DistillationDMLLoss(DMLLoss): def __init__(self, model_name_pairs=[], act=None, + use_log=False, key=None, maps_name=None, name="dml"): - super().__init__(act=act) + super().__init__(act=act, use_log=use_log) assert isinstance(model_name_pairs, list) self.key = key self.model_name_pairs = self._check_model_name_pairs(model_name_pairs) self.name = name self.maps_name = self._check_maps_name(maps_name) - + def _check_model_name_pairs(self, model_name_pairs): if not isinstance(model_name_pairs, list): return [] - elif isinstance(model_name_pairs[0], list) and isinstance(model_name_pairs[0][0], str): + elif isinstance(model_name_pairs[0], list) and isinstance( + model_name_pairs[0][0], str): return model_name_pairs else: return [model_name_pairs] @@ -110,11 +112,11 @@ class DistillationDMLLoss(DMLLoss): if isinstance(loss, dict): for key in loss: loss_dict["{}_{}_{}_{}_{}".format(key, pair[ - 0], pair[1], map_name, idx)] = loss[key] + 0], pair[1], self.maps_name, idx)] = loss[key] else: - loss_dict["{}_{}_{}".format(self.name, self.maps_name[_c], - idx)] = loss - + loss_dict["{}_{}_{}".format(self.name, self.maps_name[ + _c], idx)] = loss + loss_dict = _sum_loss(loss_dict) return loss_dict diff --git a/ppocr/losses/rec_aster_loss.py b/ppocr/losses/rec_aster_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..fbb99d29a638540b02649a8912051339c08b22dd --- /dev/null +++ b/ppocr/losses/rec_aster_loss.py @@ -0,0 +1,99 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class CosineEmbeddingLoss(nn.Layer): + def __init__(self, margin=0.): + super(CosineEmbeddingLoss, self).__init__() + self.margin = margin + self.epsilon = 1e-12 + + def forward(self, x1, x2, target): + similarity = paddle.fluid.layers.reduce_sum( + x1 * x2, dim=-1) / (paddle.norm( + x1, axis=-1) * paddle.norm( + x2, axis=-1) + self.epsilon) + one_list = paddle.full_like(target, fill_value=1) + out = paddle.fluid.layers.reduce_mean( + paddle.where( + paddle.equal(target, one_list), 1. - similarity, + paddle.maximum( + paddle.zeros_like(similarity), similarity - self.margin))) + + return out + + +class AsterLoss(nn.Layer): + def __init__(self, + weight=None, + size_average=True, + ignore_index=-100, + sequence_normalize=False, + sample_normalize=True, + **kwargs): + super(AsterLoss, self).__init__() + self.weight = weight + self.size_average = size_average + self.ignore_index = ignore_index + self.sequence_normalize = sequence_normalize + self.sample_normalize = sample_normalize + self.loss_sem = CosineEmbeddingLoss() + self.is_cosin_loss = True + self.loss_func_rec = nn.CrossEntropyLoss(weight=None, reduction='none') + + def forward(self, predicts, batch): + targets = batch[1].astype("int64") + label_lengths = batch[2].astype('int64') + sem_target = batch[3].astype('float32') + embedding_vectors = predicts['embedding_vectors'] + rec_pred = predicts['rec_pred'] + + if not self.is_cosin_loss: + sem_loss = paddle.sum(self.loss_sem(embedding_vectors, sem_target)) + else: + label_target = paddle.ones([embedding_vectors.shape[0]]) + sem_loss = paddle.sum( + self.loss_sem(embedding_vectors, sem_target, label_target)) + + # rec loss + batch_size, def_max_length = targets.shape[0], targets.shape[1] + + mask = paddle.zeros([batch_size, def_max_length]) + for i in range(batch_size): + mask[i, :label_lengths[i]] = 1 + mask = paddle.cast(mask, "float32") + max_length = max(label_lengths) + assert max_length == rec_pred.shape[1] + targets = targets[:, :max_length] + mask = mask[:, :max_length] + rec_pred = paddle.reshape(rec_pred, [-1, rec_pred.shape[2]]) + input = nn.functional.log_softmax(rec_pred, axis=1) + targets = paddle.reshape(targets, [-1, 1]) + mask = paddle.reshape(mask, [-1, 1]) + output = -paddle.index_sample(input, index=targets) * mask + output = paddle.sum(output) + if self.sequence_normalize: + output = output / paddle.sum(mask) + if self.sample_normalize: + output = output / batch_size + + loss = output + sem_loss * 0.1 + return {'loss': loss} diff --git a/ppocr/losses/rec_ctc_loss.py b/ppocr/losses/rec_ctc_loss.py index 6c0b56ff84db4ff23786fb781d461bf9fbc86ef2..063d68e30861e092e10fa3068e4b7f4755b6197f 100755 --- a/ppocr/losses/rec_ctc_loss.py +++ b/ppocr/losses/rec_ctc_loss.py @@ -21,16 +21,24 @@ from paddle import nn class CTCLoss(nn.Layer): - def __init__(self, **kwargs): + def __init__(self, use_focal_loss=False, **kwargs): super(CTCLoss, self).__init__() self.loss_func = nn.CTCLoss(blank=0, reduction='none') + self.use_focal_loss = use_focal_loss def forward(self, predicts, batch): + if isinstance(predicts, (list, tuple)): + predicts = predicts[-1] predicts = predicts.transpose((1, 0, 2)) N, B, _ = predicts.shape preds_lengths = paddle.to_tensor([N] * B, dtype='int64') labels = batch[1].astype("int32") label_lengths = 
batch[2].astype('int64') loss = self.loss_func(predicts, labels, preds_lengths, label_lengths) - loss = loss.mean() # sum + if self.use_focal_loss: + weight = paddle.exp(-loss) + weight = paddle.subtract(paddle.to_tensor([1.0]), weight) + weight = paddle.square(weight) + loss = paddle.multiply(loss, weight) + loss = loss.mean() return {'loss': loss} diff --git a/ppocr/losses/rec_enhanced_ctc_loss.py b/ppocr/losses/rec_enhanced_ctc_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b57be6468e2ec75811442e7449525267e7d9e82e --- /dev/null +++ b/ppocr/losses/rec_enhanced_ctc_loss.py @@ -0,0 +1,70 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +from .ace_loss import ACELoss +from .center_loss import CenterLoss +from .rec_ctc_loss import CTCLoss + + +class EnhancedCTCLoss(nn.Layer): + def __init__(self, + use_focal_loss=False, + use_ace_loss=False, + ace_loss_weight=0.1, + use_center_loss=False, + center_loss_weight=0.05, + num_classes=6625, + feat_dim=96, + init_center=False, + center_file_path=None, + **kwargs): + super(EnhancedCTCLoss, self).__init__() + self.ctc_loss_func = CTCLoss(use_focal_loss=use_focal_loss) + + self.use_ace_loss = False + if use_ace_loss: + self.use_ace_loss = use_ace_loss + self.ace_loss_func = ACELoss() + self.ace_loss_weight = ace_loss_weight + + self.use_center_loss = False + if use_center_loss: + self.use_center_loss = use_center_loss + self.center_loss_func = CenterLoss( + num_classes=num_classes, + feat_dim=feat_dim, + init_center=init_center, + center_file_path=center_file_path) + self.center_loss_weight = center_loss_weight + + def __call__(self, predicts, batch): + loss = self.ctc_loss_func(predicts, batch)["loss"] + + if self.use_center_loss: + center_loss = self.center_loss_func( + predicts, batch)["loss_center"] * self.center_loss_weight + loss = loss + center_loss + + if self.use_ace_loss: + ace_loss = self.ace_loss_func( + predicts, batch)["loss_ace"] * self.ace_loss_weight + loss = loss + ace_loss + + return {'enhanced_ctc_loss': loss} diff --git a/ppocr/losses/rec_nrtr_loss.py b/ppocr/losses/rec_nrtr_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..41714dd2a3ae15eeedc62521d97935f68271c598 --- /dev/null +++ b/ppocr/losses/rec_nrtr_loss.py @@ -0,0 +1,30 @@ +import paddle +from paddle import nn +import paddle.nn.functional as F + + +class NRTRLoss(nn.Layer): + def __init__(self, smoothing=True, **kwargs): + super(NRTRLoss, self).__init__() + self.loss_func = nn.CrossEntropyLoss(reduction='mean', ignore_index=0) + self.smoothing = smoothing + + def forward(self, pred, batch): + pred = pred.reshape([-1, pred.shape[2]]) + max_len = batch[2].max() + tgt = batch[1][:, 1:2 + max_len] + tgt = tgt.reshape([-1]) + if self.smoothing: + eps = 0.1 + n_class = pred.shape[1] + one_hot = F.one_hot(tgt, 
pred.shape[1]) + one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1) + log_prb = F.log_softmax(pred, axis=1) + non_pad_mask = paddle.not_equal( + tgt, paddle.zeros( + tgt.shape, dtype='int64')) + loss = -(one_hot * log_prb).sum(axis=1) + loss = loss.masked_select(non_pad_mask).mean() + else: + loss = self.loss_func(pred, tgt) + return {'loss': loss} diff --git a/ppocr/losses/rec_sar_loss.py b/ppocr/losses/rec_sar_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..c8bd8bb0ca395fa4658e57b8dcac52a3e94aadce --- /dev/null +++ b/ppocr/losses/rec_sar_loss.py @@ -0,0 +1,28 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class SARLoss(nn.Layer): + def __init__(self, **kwargs): + super(SARLoss, self).__init__() + self.loss_func = paddle.nn.loss.CrossEntropyLoss( + reduction="mean", ignore_index=92) + + def forward(self, predicts, batch): + predict = predicts[:, : + -1, :] # ignore last index of outputs to be in same seq_len with targets + label = batch[1].astype( + "int64")[:, 1:] # ignore first index of target in loss calculation + batch_size, num_steps, num_classes = predict.shape[0], predict.shape[ + 1], predict.shape[2] + assert len(label.shape) == len(list(predict.shape)) - 1, \ + "The target's shape and inputs's shape is [N, d] and [N, num_steps]" + + inputs = paddle.reshape(predict, [-1, num_classes]) + targets = paddle.reshape(label, [-1]) + loss = self.loss_func(inputs, targets) + return {'loss': loss} diff --git a/ppocr/metrics/eval_det_iou.py b/ppocr/metrics/eval_det_iou.py index 0e32b2d19281de9a18a1fe0343bd7e8237825b7b..bc05e7df7d1d21abfb9d9fbd224ecd7254d9f393 100644 --- a/ppocr/metrics/eval_det_iou.py +++ b/ppocr/metrics/eval_det_iou.py @@ -169,21 +169,10 @@ class DetectionIoUEvaluator(object): numGlobalCareDet += numDetCare perSampleMetrics = { - 'precision': precision, - 'recall': recall, - 'hmean': hmean, - 'pairs': pairs, - 'iouMat': [] if len(detPols) > 100 else iouMat.tolist(), - 'gtPolPoints': gtPolPoints, - 'detPolPoints': detPolPoints, 'gtCare': numGtCare, 'detCare': numDetCare, - 'gtDontCare': gtDontCarePolsNum, - 'detDontCare': detDontCarePolsNum, 'detMatched': detMatched, - 'evaluationLog': evaluationLog } - return perSampleMetrics def combine_results(self, results): diff --git a/ppocr/metrics/rec_metric.py b/ppocr/metrics/rec_metric.py index 66c084d771dece0e2974bc72a177b53f564a8f2e..db2f41c3a140ecebc42b71ee03f0ecb5cf50ca80 100644 --- a/ppocr/metrics/rec_metric.py +++ b/ppocr/metrics/rec_metric.py @@ -13,13 +13,20 @@ # limitations under the License. 
import Levenshtein +import string class RecMetric(object): - def __init__(self, main_indicator='acc', **kwargs): + def __init__(self, main_indicator='acc', is_filter=False, **kwargs): self.main_indicator = main_indicator + self.is_filter = is_filter self.reset() + def _normalize_text(self, text): + text = ''.join( + filter(lambda x: x in (string.digits + string.ascii_letters), text)) + return text.lower() + def __call__(self, pred_label, *args, **kwargs): preds, labels = pred_label correct_num = 0 @@ -28,6 +35,9 @@ class RecMetric(object): for (pred, pred_conf), (target, _) in zip(preds, labels): pred = pred.replace(" ", "") target = target.replace(" ", "") + if self.is_filter: + pred = self._normalize_text(pred) + target = self._normalize_text(target) norm_edit_dis += Levenshtein.distance(pred, target) / max( len(pred), len(target), 1) if pred == target: diff --git a/ppocr/modeling/architectures/base_model.py b/ppocr/modeling/architectures/base_model.py index dbd18070b36f7e99c62de94048ab53d1bedcebe0..c498d9862abcfc85eaf29ed1d949230a1dc1629c 100644 --- a/ppocr/modeling/architectures/base_model.py +++ b/ppocr/modeling/architectures/base_model.py @@ -14,7 +14,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function - from paddle import nn from ppocr.modeling.transforms import build_transform from ppocr.modeling.backbones import build_backbone diff --git a/ppocr/modeling/backbones/__init__.py b/ppocr/modeling/backbones/__init__.py index f4fe8c76be0835f55f402f35ad6a91a5ca116d88..169eb821f110d4a212068ebab4d46d636e241307 100755 --- a/ppocr/modeling/backbones/__init__.py +++ b/ppocr/modeling/backbones/__init__.py @@ -26,8 +26,12 @@ def build_backbone(config, model_type): from .rec_resnet_vd import ResNet from .rec_resnet_fpn import ResNetFPN from .rec_mv1_enhance import MobileNetV1Enhance + from .rec_nrtr_mtb import MTB + from .rec_resnet_31 import ResNet31 + from .rec_resnet_aster import ResNet_ASTER support_dict = [ - "MobileNetV1Enhance", "MobileNetV3", "ResNet", "ResNetFPN" + 'MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 'ResNetFPN', 'MTB', + "ResNet31", "ResNet_ASTER" ] elif model_type == "e2e": from .e2e_resnet_vd_pg import ResNet diff --git a/ppocr/modeling/backbones/rec_mv1_enhance.py b/ppocr/modeling/backbones/rec_mv1_enhance.py index fe874fac1af439bfb47ba9050a61f02db302e224..04a909b8ccafd8e62f9a7076c7dedf63ff745303 100644 --- a/ppocr/modeling/backbones/rec_mv1_enhance.py +++ b/ppocr/modeling/backbones/rec_mv1_enhance.py @@ -1,4 +1,4 @@ -# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,26 +16,17 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np -import paddle -from paddle import ParamAttr -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn import Conv2D, BatchNorm, Linear, Dropout -from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D -from paddle.nn.initializer import KaimingNormal import math import numpy as np import paddle -from paddle import ParamAttr, reshape, transpose, concat, split +from paddle import ParamAttr, reshape, transpose import paddle.nn as nn import paddle.nn.functional as F from paddle.nn import Conv2D, BatchNorm, Linear, Dropout from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D from paddle.nn.initializer import KaimingNormal -import math -from paddle.nn.functional import hardswish, hardsigmoid from paddle.regularizer import L2Decay +from paddle.nn.functional import hardswish, hardsigmoid class ConvBNLayer(nn.Layer): diff --git a/ppocr/modeling/backbones/rec_nrtr_mtb.py b/ppocr/modeling/backbones/rec_nrtr_mtb.py new file mode 100644 index 0000000000000000000000000000000000000000..22e02a6371c3ff8b28fd88b5cfa1087309d551f8 --- /dev/null +++ b/ppocr/modeling/backbones/rec_nrtr_mtb.py @@ -0,0 +1,48 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle import nn +import paddle + + +class MTB(nn.Layer): + def __init__(self, cnn_num, in_channels): + super(MTB, self).__init__() + self.block = nn.Sequential() + self.out_channels = in_channels + self.cnn_num = cnn_num + if self.cnn_num == 2: + for i in range(self.cnn_num): + self.block.add_sublayer( + 'conv_{}'.format(i), + nn.Conv2D( + in_channels=in_channels + if i == 0 else 32 * (2**(i - 1)), + out_channels=32 * (2**i), + kernel_size=3, + stride=2, + padding=1)) + self.block.add_sublayer('relu_{}'.format(i), nn.ReLU()) + self.block.add_sublayer('bn_{}'.format(i), + nn.BatchNorm2D(32 * (2**i))) + + def forward(self, images): + x = self.block(images) + if self.cnn_num == 2: + # (b, w, h, c) + x = paddle.transpose(x, [0, 3, 2, 1]) + x_shape = paddle.shape(x) + x = paddle.reshape( + x, [x_shape[0], x_shape[1], x_shape[2] * x_shape[3]]) + return x diff --git a/ppocr/modeling/backbones/rec_resnet_31.py b/ppocr/modeling/backbones/rec_resnet_31.py new file mode 100644 index 0000000000000000000000000000000000000000..f60729cdcced2af7626e5615ca323e32c99432ec --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnet_31.py @@ -0,0 +1,176 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +__all__ = ["ResNet31"] + + +def conv3x3(in_channel, out_channel, stride=1): + return nn.Conv2D( + in_channel, + out_channel, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False + ) + + +class BasicBlock(nn.Layer): + expansion = 1 + def __init__(self, in_channels, channels, stride=1, downsample=False): + super().__init__() + self.conv1 = conv3x3(in_channels, channels, stride) + self.bn1 = nn.BatchNorm2D(channels) + self.relu = nn.ReLU() + self.conv2 = conv3x3(channels, channels) + self.bn2 = nn.BatchNorm2D(channels) + self.downsample = downsample + if downsample: + self.downsample = nn.Sequential( + nn.Conv2D(in_channels, channels * self.expansion, 1, stride, bias_attr=False), + nn.BatchNorm2D(channels * self.expansion), + ) + else: + self.downsample = nn.Sequential() + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet31(nn.Layer): + ''' + Args: + in_channels (int): Number of channels of input image tensor. + layers (list[int]): List of BasicBlock number for each stage. + channels (list[int]): List of out_channels of Conv2d layer. + out_indices (None | Sequence[int]): Indices of output stages. + last_stage_pool (bool): If True, add `MaxPool2d` layer to last stage. 
+ ''' + def __init__(self, + in_channels=3, + layers=[1, 2, 5, 3], + channels=[64, 128, 256, 256, 512, 512, 512], + out_indices=None, + last_stage_pool=False): + super(ResNet31, self).__init__() + assert isinstance(in_channels, int) + assert isinstance(last_stage_pool, bool) + + self.out_indices = out_indices + self.last_stage_pool = last_stage_pool + + # conv 1 (Conv Conv) + self.conv1_1 = nn.Conv2D(in_channels, channels[0], kernel_size=3, stride=1, padding=1) + self.bn1_1 = nn.BatchNorm2D(channels[0]) + self.relu1_1 = nn.ReLU() + + self.conv1_2 = nn.Conv2D(channels[0], channels[1], kernel_size=3, stride=1, padding=1) + self.bn1_2 = nn.BatchNorm2D(channels[1]) + self.relu1_2 = nn.ReLU() + + # conv 2 (Max-pooling, Residual block, Conv) + self.pool2 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block2 = self._make_layer(channels[1], channels[2], layers[0]) + self.conv2 = nn.Conv2D(channels[2], channels[2], kernel_size=3, stride=1, padding=1) + self.bn2 = nn.BatchNorm2D(channels[2]) + self.relu2 = nn.ReLU() + + # conv 3 (Max-pooling, Residual block, Conv) + self.pool3 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block3 = self._make_layer(channels[2], channels[3], layers[1]) + self.conv3 = nn.Conv2D(channels[3], channels[3], kernel_size=3, stride=1, padding=1) + self.bn3 = nn.BatchNorm2D(channels[3]) + self.relu3 = nn.ReLU() + + # conv 4 (Max-pooling, Residual block, Conv) + self.pool4 = nn.MaxPool2D(kernel_size=(2, 1), stride=(2, 1), padding=0, ceil_mode=True) + self.block4 = self._make_layer(channels[3], channels[4], layers[2]) + self.conv4 = nn.Conv2D(channels[4], channels[4], kernel_size=3, stride=1, padding=1) + self.bn4 = nn.BatchNorm2D(channels[4]) + self.relu4 = nn.ReLU() + + # conv 5 ((Max-pooling), Residual block, Conv) + self.pool5 = None + if self.last_stage_pool: + self.pool5 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block5 = self._make_layer(channels[4], channels[5], layers[3]) + self.conv5 = nn.Conv2D(channels[5], channels[5], kernel_size=3, stride=1, padding=1) + self.bn5 = nn.BatchNorm2D(channels[5]) + self.relu5 = nn.ReLU() + + self.out_channels = channels[-1] + + def _make_layer(self, input_channels, output_channels, blocks): + layers = [] + for _ in range(blocks): + downsample = None + if input_channels != output_channels: + downsample = nn.Sequential( + nn.Conv2D( + input_channels, + output_channels, + kernel_size=1, + stride=1, + bias_attr=False), + nn.BatchNorm2D(output_channels), + ) + + layers.append(BasicBlock(input_channels, output_channels, downsample=downsample)) + input_channels = output_channels + return nn.Sequential(*layers) + + + def forward(self, x): + x = self.conv1_1(x) + x = self.bn1_1(x) + x = self.relu1_1(x) + + x = self.conv1_2(x) + x = self.bn1_2(x) + x = self.relu1_2(x) + + outs = [] + for i in range(4): + layer_index = i + 2 + pool_layer = getattr(self, f'pool{layer_index}') + block_layer = getattr(self, f'block{layer_index}') + conv_layer = getattr(self, f'conv{layer_index}') + bn_layer = getattr(self, f'bn{layer_index}') + relu_layer = getattr(self, f'relu{layer_index}') + + if pool_layer is not None: + x = pool_layer(x) + x = block_layer(x) + x = conv_layer(x) + x = bn_layer(x) + x= relu_layer(x) + + outs.append(x) + + if self.out_indices is not None: + return tuple([outs[i] for i in self.out_indices]) + + return x diff --git a/ppocr/modeling/backbones/rec_resnet_aster.py b/ppocr/modeling/backbones/rec_resnet_aster.py new file mode 100644 index 
0000000000000000000000000000000000000000..bdecaf46af98f9b967d9a339f82d4e938abdc6d9 --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnet_aster.py @@ -0,0 +1,140 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + +import sys +import math + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2D( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2D( + in_planes, out_planes, kernel_size=1, stride=stride, bias_attr=False) + + +def get_sinusoid_encoding(n_position, feat_dim, wave_length=10000): + # [n_position] + positions = paddle.arange(0, n_position) + # [feat_dim] + dim_range = paddle.arange(0, feat_dim) + dim_range = paddle.pow(wave_length, 2 * (dim_range // 2) / feat_dim) + # [n_position, feat_dim] + angles = paddle.unsqueeze( + positions, axis=1) / paddle.unsqueeze( + dim_range, axis=0) + angles = paddle.cast(angles, "float32") + angles[:, 0::2] = paddle.sin(angles[:, 0::2]) + angles[:, 1::2] = paddle.cos(angles[:, 1::2]) + return angles + + +class AsterBlock(nn.Layer): + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(AsterBlock, self).__init__() + self.conv1 = conv1x1(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2D(planes) + self.relu = nn.ReLU() + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2D(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + out += residual + out = self.relu(out) + return out + + +class ResNet_ASTER(nn.Layer): + """For aster or crnn""" + + def __init__(self, with_lstm=True, n_group=1, in_channels=3): + super(ResNet_ASTER, self).__init__() + self.with_lstm = with_lstm + self.n_group = n_group + + self.layer0 = nn.Sequential( + nn.Conv2D( + in_channels, + 32, + kernel_size=(3, 3), + stride=1, + padding=1, + bias_attr=False), + nn.BatchNorm2D(32), + nn.ReLU()) + + self.inplanes = 32 + self.layer1 = self._make_layer(32, 3, [2, 2]) # [16, 50] + self.layer2 = self._make_layer(64, 4, [2, 2]) # [8, 25] + self.layer3 = self._make_layer(128, 6, [2, 1]) # [4, 25] + self.layer4 = self._make_layer(256, 6, [2, 1]) # [2, 25] + self.layer5 = self._make_layer(512, 3, [2, 1]) # [1, 25] + + if with_lstm: + self.rnn = nn.LSTM(512, 256, direction="bidirect", num_layers=2) + self.out_channels = 2 * 256 + else: + self.out_channels = 512 + + def _make_layer(self, planes, blocks, stride): + downsample = None + if stride != [1, 1] or self.inplanes != planes: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes, stride), nn.BatchNorm2D(planes)) + + layers = [] + layers.append(AsterBlock(self.inplanes, planes, stride, 
downsample)) + self.inplanes = planes + for _ in range(1, blocks): + layers.append(AsterBlock(self.inplanes, planes)) + return nn.Sequential(*layers) + + def forward(self, x): + x0 = self.layer0(x) + x1 = self.layer1(x0) + x2 = self.layer2(x1) + x3 = self.layer3(x2) + x4 = self.layer4(x3) + x5 = self.layer5(x4) + + cnn_feat = x5.squeeze(2) # [N, c, w] + cnn_feat = paddle.transpose(cnn_feat, perm=[0, 2, 1]) + if self.with_lstm: + rnn_feat, _ = self.rnn(cnn_feat) + return rnn_feat + else: + return cnn_feat diff --git a/ppocr/modeling/heads/__init__.py b/ppocr/modeling/heads/__init__.py index 5096479415f504aa9f074d55bd9b2e4a31c730b4..fdadfed5e3fe30b6bd311a07d6ba36869f175488 100755 --- a/ppocr/modeling/heads/__init__.py +++ b/ppocr/modeling/heads/__init__.py @@ -20,18 +20,24 @@ def build_head(config): from .det_db_head import DBHead from .det_east_head import EASTHead from .det_sast_head import SASTHead + from .det_pse_head import PSEHead from .e2e_pg_head import PGHead # rec head from .rec_ctc_head import CTCHead from .rec_att_head import AttentionHead from .rec_srn_head import SRNHead + from .rec_nrtr_head import Transformer + from .rec_sar_head import SARHead + from .rec_aster_head import AsterHead # cls head from .cls_head import ClsHead support_dict = [ - 'DBHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead', 'AttentionHead', - 'SRNHead', 'PGHead', 'TableAttentionHead'] + 'DBHead', 'PSEHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead', + 'AttentionHead', 'SRNHead', 'PGHead', 'Transformer', + 'TableAttentionHead', 'SARHead', 'AsterHead' + ] #table head from .table_att_head import TableAttentionHead diff --git a/ppocr/modeling/heads/det_pse_head.py b/ppocr/modeling/heads/det_pse_head.py new file mode 100644 index 0000000000000000000000000000000000000000..db800f57a216ab437b724988ce692a9ac0c545d9 --- /dev/null +++ b/ppocr/modeling/heads/det_pse_head.py @@ -0,0 +1,35 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddle import nn + + +class PSEHead(nn.Layer): + def __init__(self, + in_channels, + hidden_dim=256, + out_channels=7, + **kwargs): + super(PSEHead, self).__init__() + self.conv1 = nn.Conv2D(in_channels, hidden_dim, kernel_size=3, stride=1, padding=1) + self.bn1 = nn.BatchNorm2D(hidden_dim) + self.relu1 = nn.ReLU() + + self.conv2 = nn.Conv2D(hidden_dim, out_channels, kernel_size=1, stride=1, padding=0) + + + def forward(self, x, **kwargs): + out = self.conv1(x) + out = self.relu1(self.bn1(out)) + out = self.conv2(out) + return {'maps': out} diff --git a/ppocr/modeling/heads/multiheadAttention.py b/ppocr/modeling/heads/multiheadAttention.py new file mode 100755 index 0000000000000000000000000000000000000000..900865ba1a8d80a108b3247ce1aff91c242860f2 --- /dev/null +++ b/ppocr/modeling/heads/multiheadAttention.py @@ -0,0 +1,163 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle.nn import Linear +from paddle.nn.initializer import XavierUniform as xavier_uniform_ +from paddle.nn.initializer import Constant as constant_ +from paddle.nn.initializer import XavierNormal as xavier_normal_ + +zeros_ = constant_(value=0.) +ones_ = constant_(value=1.) + + +class MultiheadAttention(nn.Layer): + """Allows the model to jointly attend to information + from different representation subspaces. + See reference: Attention Is All You Need + + .. math:: + \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O + \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) + + Args: + embed_dim: total dimension of the model + num_heads: parallel attention layers, or heads + + """ + + def __init__(self, + embed_dim, + num_heads, + dropout=0., + bias=True, + add_bias_kv=False, + add_zero_attn=False): + super(MultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + self.scaling = self.head_dim**-0.5 + self.out_proj = Linear(embed_dim, embed_dim, bias_attr=bias) + self._reset_parameters() + self.conv1 = paddle.nn.Conv2D( + in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1)) + self.conv2 = paddle.nn.Conv2D( + in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1)) + self.conv3 = paddle.nn.Conv2D( + in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1)) + + def _reset_parameters(self): + xavier_uniform_(self.out_proj.weight) + + def forward(self, + query, + key, + value, + key_padding_mask=None, + incremental_state=None, + attn_mask=None): + """ + Inputs of forward function + query: [target length, batch size, embed dim] + key: [sequence length, batch size, embed dim] + value: [sequence length, batch size, embed dim] + key_padding_mask: if True, mask padding based on batch size + incremental_state: if provided, previous time steps are cashed + need_weights: output attn_output_weights + static_kv: key and value are static + + Outputs of forward function + attn_output: [target length, batch size, embed dim] + attn_output_weights: [batch size, target length, sequence length] + """ + q_shape = paddle.shape(query) + src_shape = paddle.shape(key) + q = self._in_proj_q(query) + k = self._in_proj_k(key) + v = self._in_proj_v(value) + q *= self.scaling + q = paddle.transpose( + paddle.reshape( + q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]), + [1, 2, 0, 3]) + k = paddle.transpose( + paddle.reshape( + k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]), + [1, 2, 0, 3]) + v = paddle.transpose( + paddle.reshape( + v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]), + [1, 2, 0, 3]) + if key_padding_mask is not None: + assert key_padding_mask.shape[0] == q_shape[1] + assert 
key_padding_mask.shape[1] == src_shape[0] + attn_output_weights = paddle.matmul(q, + paddle.transpose(k, [0, 1, 3, 2])) + if attn_mask is not None: + attn_mask = paddle.unsqueeze(paddle.unsqueeze(attn_mask, 0), 0) + attn_output_weights += attn_mask + if key_padding_mask is not None: + attn_output_weights = paddle.reshape( + attn_output_weights, + [q_shape[1], self.num_heads, q_shape[0], src_shape[0]]) + key = paddle.unsqueeze(paddle.unsqueeze(key_padding_mask, 1), 2) + key = paddle.cast(key, 'float32') + y = paddle.full( + shape=paddle.shape(key), dtype='float32', fill_value='-inf') + y = paddle.where(key == 0., key, y) + attn_output_weights += y + attn_output_weights = F.softmax( + attn_output_weights.astype('float32'), + axis=-1, + dtype=paddle.float32 if attn_output_weights.dtype == paddle.float16 + else attn_output_weights.dtype) + attn_output_weights = F.dropout( + attn_output_weights, p=self.dropout, training=self.training) + + attn_output = paddle.matmul(attn_output_weights, v) + attn_output = paddle.reshape( + paddle.transpose(attn_output, [2, 0, 1, 3]), + [q_shape[0], q_shape[1], self.embed_dim]) + attn_output = self.out_proj(attn_output) + + return attn_output + + def _in_proj_q(self, query): + query = paddle.transpose(query, [1, 2, 0]) + query = paddle.unsqueeze(query, axis=2) + res = self.conv1(query) + res = paddle.squeeze(res, axis=2) + res = paddle.transpose(res, [2, 0, 1]) + return res + + def _in_proj_k(self, key): + key = paddle.transpose(key, [1, 2, 0]) + key = paddle.unsqueeze(key, axis=2) + res = self.conv2(key) + res = paddle.squeeze(res, axis=2) + res = paddle.transpose(res, [2, 0, 1]) + return res + + def _in_proj_v(self, value): + value = paddle.transpose(value, [1, 2, 0]) #(1, 2, 0) + value = paddle.unsqueeze(value, axis=2) + res = self.conv3(value) + res = paddle.squeeze(res, axis=2) + res = paddle.transpose(res, [2, 0, 1]) + return res diff --git a/ppocr/modeling/heads/rec_aster_head.py b/ppocr/modeling/heads/rec_aster_head.py new file mode 100644 index 0000000000000000000000000000000000000000..4961897b409020fe6cff72eb96f3257156fa33ac --- /dev/null +++ b/ppocr/modeling/heads/rec_aster_head.py @@ -0,0 +1,389 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +import paddle +from paddle import nn +from paddle.nn import functional as F + + +class AsterHead(nn.Layer): + def __init__(self, + in_channels, + out_channels, + sDim, + attDim, + max_len_labels, + time_step=25, + beam_width=5, + **kwargs): + super(AsterHead, self).__init__() + self.num_classes = out_channels + self.in_planes = in_channels + self.sDim = sDim + self.attDim = attDim + self.max_len_labels = max_len_labels + self.decoder = AttentionRecognitionHead(in_channels, out_channels, sDim, + attDim, max_len_labels) + self.time_step = time_step + self.embeder = Embedding(self.time_step, in_channels) + self.beam_width = beam_width + self.eos = self.num_classes - 1 + + def forward(self, x, targets=None, embed=None): + return_dict = {} + embedding_vectors = self.embeder(x) + + if self.training: + rec_targets, rec_lengths, _ = targets + rec_pred = self.decoder([x, rec_targets, rec_lengths], + embedding_vectors) + return_dict['rec_pred'] = rec_pred + return_dict['embedding_vectors'] = embedding_vectors + else: + rec_pred, rec_pred_scores = self.decoder.beam_search( + x, self.beam_width, self.eos, embedding_vectors) + return_dict['rec_pred'] = rec_pred + return_dict['rec_pred_scores'] = rec_pred_scores + return_dict['embedding_vectors'] = embedding_vectors + + return return_dict + + +class Embedding(nn.Layer): + def __init__(self, in_timestep, in_planes, mid_dim=4096, embed_dim=300): + super(Embedding, self).__init__() + self.in_timestep = in_timestep + self.in_planes = in_planes + self.embed_dim = embed_dim + self.mid_dim = mid_dim + self.eEmbed = nn.Linear( + in_timestep * in_planes, + self.embed_dim) # Embed encoder output to a word-embedding like + + def forward(self, x): + x = paddle.reshape(x, [paddle.shape(x)[0], -1]) + x = self.eEmbed(x) + return x + + +class AttentionRecognitionHead(nn.Layer): + """ + input: [b x 16 x 64 x in_planes] + output: probability sequence: [b x T x num_classes] + """ + + def __init__(self, in_channels, out_channels, sDim, attDim, max_len_labels): + super(AttentionRecognitionHead, self).__init__() + self.num_classes = out_channels # this is the output classes. So it includes the . + self.in_planes = in_channels + self.sDim = sDim + self.attDim = attDim + self.max_len_labels = max_len_labels + + self.decoder = DecoderUnit( + sDim=sDim, xDim=in_channels, yDim=self.num_classes, attDim=attDim) + + def forward(self, x, embed): + x, targets, lengths = x + batch_size = paddle.shape(x)[0] + # Decoder + state = self.decoder.get_initial_state(embed) + outputs = [] + for i in range(max(lengths)): + if i == 0: + y_prev = paddle.full( + shape=[batch_size], fill_value=self.num_classes) + else: + y_prev = targets[:, i - 1] + output, state = self.decoder(x, state, y_prev) + outputs.append(output) + outputs = paddle.concat([_.unsqueeze(1) for _ in outputs], 1) + return outputs + + # inference stage. 
+ def sample(self, x): + x, _, _ = x + batch_size = x.size(0) + # Decoder + state = paddle.zeros([1, batch_size, self.sDim]) + + predicted_ids, predicted_scores = [], [] + for i in range(self.max_len_labels): + if i == 0: + y_prev = paddle.full( + shape=[batch_size], fill_value=self.num_classes) + else: + y_prev = predicted + + output, state = self.decoder(x, state, y_prev) + output = F.softmax(output, axis=1) + score, predicted = output.max(1) + predicted_ids.append(predicted.unsqueeze(1)) + predicted_scores.append(score.unsqueeze(1)) + predicted_ids = paddle.concat([predicted_ids, 1]) + predicted_scores = paddle.concat([predicted_scores, 1]) + # return predicted_ids.squeeze(), predicted_scores.squeeze() + return predicted_ids, predicted_scores + + def beam_search(self, x, beam_width, eos, embed): + def _inflate(tensor, times, dim): + repeat_dims = [1] * tensor.dim() + repeat_dims[dim] = times + output = paddle.tile(tensor, repeat_dims) + return output + + # https://github.com/IBM/pytorch-seq2seq/blob/fede87655ddce6c94b38886089e05321dc9802af/seq2seq/models/TopKDecoder.py + batch_size, l, d = x.shape + x = paddle.tile( + paddle.transpose( + x.unsqueeze(1), perm=[1, 0, 2, 3]), [beam_width, 1, 1, 1]) + inflated_encoder_feats = paddle.reshape( + paddle.transpose( + x, perm=[1, 0, 2, 3]), [-1, l, d]) + + # Initialize the decoder + state = self.decoder.get_initial_state(embed, tile_times=beam_width) + + pos_index = paddle.reshape( + paddle.arange(batch_size) * beam_width, shape=[-1, 1]) + + # Initialize the scores + sequence_scores = paddle.full( + shape=[batch_size * beam_width, 1], fill_value=-float('Inf')) + index = [i * beam_width for i in range(0, batch_size)] + sequence_scores[index] = 0.0 + + # Initialize the input vector + y_prev = paddle.full( + shape=[batch_size * beam_width], fill_value=self.num_classes) + + # Store decisions for backtracking + stored_scores = list() + stored_predecessors = list() + stored_emitted_symbols = list() + + for i in range(self.max_len_labels): + output, state = self.decoder(inflated_encoder_feats, state, y_prev) + state = paddle.unsqueeze(state, axis=0) + log_softmax_output = paddle.nn.functional.log_softmax( + output, axis=1) + + sequence_scores = _inflate(sequence_scores, self.num_classes, 1) + sequence_scores += log_softmax_output + scores, candidates = paddle.topk( + paddle.reshape(sequence_scores, [batch_size, -1]), + beam_width, + axis=1) + + # Reshape input = (bk, 1) and sequence_scores = (bk, 1) + y_prev = paddle.reshape( + candidates % self.num_classes, shape=[batch_size * beam_width]) + sequence_scores = paddle.reshape( + scores, shape=[batch_size * beam_width, 1]) + + # Update fields for next timestep + pos_index = paddle.expand_as(pos_index, candidates) + predecessors = paddle.cast( + candidates / self.num_classes + pos_index, dtype='int64') + predecessors = paddle.reshape( + predecessors, shape=[batch_size * beam_width, 1]) + state = paddle.index_select( + state, index=predecessors.squeeze(), axis=1) + + # Update sequence socres and erase scores for symbol so that they aren't expanded + stored_scores.append(sequence_scores.clone()) + y_prev = paddle.reshape(y_prev, shape=[-1, 1]) + eos_prev = paddle.full_like(y_prev, fill_value=eos) + mask = eos_prev == y_prev + mask = paddle.nonzero(mask) + if mask.dim() > 0: + sequence_scores = sequence_scores.numpy() + mask = mask.numpy() + sequence_scores[mask] = -float('inf') + sequence_scores = paddle.to_tensor(sequence_scores) + + # Cache results for backtracking + 
stored_predecessors.append(predecessors) + y_prev = paddle.squeeze(y_prev) + stored_emitted_symbols.append(y_prev) + + # Do backtracking to return the optimal values + #====== backtrak ======# + # Initialize return variables given different types + p = list() + l = [[self.max_len_labels] * beam_width for _ in range(batch_size) + ] # Placeholder for lengths of top-k sequences + + # the last step output of the beams are not sorted + # thus they are sorted here + sorted_score, sorted_idx = paddle.topk( + paddle.reshape( + stored_scores[-1], shape=[batch_size, beam_width]), + beam_width) + + # initialize the sequence scores with the sorted last step beam scores + s = sorted_score.clone() + + batch_eos_found = [0] * batch_size # the number of EOS found + # in the backward loop below for each batch + t = self.max_len_labels - 1 + # initialize the back pointer with the sorted order of the last step beams. + # add pos_index for indexing variable with b*k as the first dimension. + t_predecessors = paddle.reshape( + sorted_idx + pos_index.expand_as(sorted_idx), + shape=[batch_size * beam_width]) + while t >= 0: + # Re-order the variables with the back pointer + current_symbol = paddle.index_select( + stored_emitted_symbols[t], index=t_predecessors, axis=0) + t_predecessors = paddle.index_select( + stored_predecessors[t].squeeze(), index=t_predecessors, axis=0) + eos_indices = stored_emitted_symbols[t] == eos + eos_indices = paddle.nonzero(eos_indices) + + if eos_indices.dim() > 0: + for i in range(eos_indices.shape[0] - 1, -1, -1): + # Indices of the EOS symbol for both variables + # with b*k as the first dimension, and b, k for + # the first two dimensions + idx = eos_indices[i] + b_idx = int(idx[0] / beam_width) + # The indices of the replacing position + # according to the replacement strategy noted above + res_k_idx = beam_width - (batch_eos_found[b_idx] % + beam_width) - 1 + batch_eos_found[b_idx] += 1 + res_idx = b_idx * beam_width + res_k_idx + + # Replace the old information in return variables + # with the new ended sequence information + t_predecessors[res_idx] = stored_predecessors[t][idx[0]] + current_symbol[res_idx] = stored_emitted_symbols[t][idx[0]] + s[b_idx, res_k_idx] = stored_scores[t][idx[0], 0] + l[b_idx][res_k_idx] = t + 1 + + # record the back tracked results + p.append(current_symbol) + t -= 1 + + # Sort and re-order again as the added ended sequences may change + # the order (very unlikely) + s, re_sorted_idx = s.topk(beam_width) + for b_idx in range(batch_size): + l[b_idx] = [ + l[b_idx][k_idx.item()] for k_idx in re_sorted_idx[b_idx, :] + ] + + re_sorted_idx = paddle.reshape( + re_sorted_idx + pos_index.expand_as(re_sorted_idx), + [batch_size * beam_width]) + + # Reverse the sequences and re-order at the same time + # It is reversed because the backtracking happens in reverse time order + p = [ + paddle.reshape( + paddle.index_select(step, re_sorted_idx, 0), + shape=[batch_size, beam_width, -1]) for step in reversed(p) + ] + p = paddle.concat(p, -1)[:, 0, :] + return p, paddle.ones_like(p) + + +class AttentionUnit(nn.Layer): + def __init__(self, sDim, xDim, attDim): + super(AttentionUnit, self).__init__() + + self.sDim = sDim + self.xDim = xDim + self.attDim = attDim + + self.sEmbed = nn.Linear(sDim, attDim) + self.xEmbed = nn.Linear(xDim, attDim) + self.wEmbed = nn.Linear(attDim, 1) + + def forward(self, x, sPrev): + batch_size, T, _ = x.shape # [b x T x xDim] + x = paddle.reshape(x, [-1, self.xDim]) # [(b x T) x xDim] + xProj = self.xEmbed(x) # [(b x T) x attDim] + xProj 
= paddle.reshape(xProj, [batch_size, T, -1]) # [b x T x attDim] + + sPrev = sPrev.squeeze(0) + sProj = self.sEmbed(sPrev) # [b x attDim] + sProj = paddle.unsqueeze(sProj, 1) # [b x 1 x attDim] + sProj = paddle.expand(sProj, + [batch_size, T, self.attDim]) # [b x T x attDim] + + sumTanh = paddle.tanh(sProj + xProj) + sumTanh = paddle.reshape(sumTanh, [-1, self.attDim]) + + vProj = self.wEmbed(sumTanh) # [(b x T) x 1] + vProj = paddle.reshape(vProj, [batch_size, T]) + alpha = F.softmax( + vProj, axis=1) # attention weights for each sample in the minibatch + return alpha + + +class DecoderUnit(nn.Layer): + def __init__(self, sDim, xDim, yDim, attDim): + super(DecoderUnit, self).__init__() + self.sDim = sDim + self.xDim = xDim + self.yDim = yDim + self.attDim = attDim + self.emdDim = attDim + + self.attention_unit = AttentionUnit(sDim, xDim, attDim) + self.tgt_embedding = nn.Embedding( + yDim + 1, self.emdDim, weight_attr=nn.initializer.Normal( + std=0.01)) # the last is used for + self.gru = nn.GRUCell(input_size=xDim + self.emdDim, hidden_size=sDim) + self.fc = nn.Linear( + sDim, + yDim, + weight_attr=nn.initializer.Normal(std=0.01), + bias_attr=nn.initializer.Constant(value=0)) + self.embed_fc = nn.Linear(300, self.sDim) + + def get_initial_state(self, embed, tile_times=1): + assert embed.shape[1] == 300 + state = self.embed_fc(embed) # N * sDim + if tile_times != 1: + state = state.unsqueeze(1) + trans_state = paddle.transpose(state, perm=[1, 0, 2]) + state = paddle.tile(trans_state, repeat_times=[tile_times, 1, 1]) + trans_state = paddle.transpose(state, perm=[1, 0, 2]) + state = paddle.reshape(trans_state, shape=[-1, self.sDim]) + state = state.unsqueeze(0) # 1 * N * sDim + return state + + def forward(self, x, sPrev, yPrev): + # x: feature sequence from the image decoder. 
+ batch_size, T, _ = x.shape + alpha = self.attention_unit(x, sPrev) + context = paddle.squeeze(paddle.matmul(alpha.unsqueeze(1), x), axis=1) + yPrev = paddle.cast(yPrev, dtype="int64") + yProj = self.tgt_embedding(yPrev) + + concat_context = paddle.concat([yProj, context], 1) + concat_context = paddle.squeeze(concat_context, 1) + sPrev = paddle.squeeze(sPrev, 0) + output, state = self.gru(concat_context, sPrev) + output = paddle.squeeze(output, axis=1) + output = self.fc(output) + return output, state \ No newline at end of file diff --git a/ppocr/modeling/heads/rec_ctc_head.py b/ppocr/modeling/heads/rec_ctc_head.py index 9c38d31fa0abcf39a583e5edcebfc8f336f41c46..35d33d5f56b3b378286565cbfa9755f43343b278 100755 --- a/ppocr/modeling/heads/rec_ctc_head.py +++ b/ppocr/modeling/heads/rec_ctc_head.py @@ -38,6 +38,7 @@ class CTCHead(nn.Layer): out_channels, fc_decay=0.0004, mid_channels=None, + return_feats=False, **kwargs): super(CTCHead, self).__init__() if mid_channels is None: @@ -66,14 +67,22 @@ class CTCHead(nn.Layer): bias_attr=bias_attr2) self.out_channels = out_channels self.mid_channels = mid_channels + self.return_feats = return_feats def forward(self, x, targets=None): if self.mid_channels is None: predicts = self.fc(x) else: - predicts = self.fc1(x) - predicts = self.fc2(predicts) - + x = self.fc1(x) + predicts = self.fc2(x) + + if self.return_feats: + result = (x, predicts) + else: + result = predicts + if not self.training: predicts = F.softmax(predicts, axis=2) - return predicts + result = predicts + + return result diff --git a/ppocr/modeling/heads/rec_nrtr_head.py b/ppocr/modeling/heads/rec_nrtr_head.py new file mode 100644 index 0000000000000000000000000000000000000000..38ba0c917840ea7d1e2a3c2bf0da32c2c35f2b40 --- /dev/null +++ b/ppocr/modeling/heads/rec_nrtr_head.py @@ -0,0 +1,826 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import copy +from paddle import nn +import paddle.nn.functional as F +from paddle.nn import LayerList +from paddle.nn.initializer import XavierNormal as xavier_uniform_ +from paddle.nn import Dropout, Linear, LayerNorm, Conv2D +import numpy as np +from ppocr.modeling.heads.multiheadAttention import MultiheadAttention +from paddle.nn.initializer import Constant as constant_ +from paddle.nn.initializer import XavierNormal as xavier_normal_ + +zeros_ = constant_(value=0.) +ones_ = constant_(value=1.) + + +class Transformer(nn.Layer): + """A transformer model. User is able to modify the attributes as needed. The architechture + is based on the paper "Attention Is All You Need". Ashish Vaswani, Noam Shazeer, + Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and + Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information + Processing Systems, pages 6000-6010. + + Args: + d_model: the number of expected features in the encoder/decoder inputs (default=512). 
+ nhead: the number of heads in the multiheadattention models (default=8). + num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6). + num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + custom_encoder: custom encoder (default=None). + custom_decoder: custom decoder (default=None). + + """ + + def __init__(self, + d_model=512, + nhead=8, + num_encoder_layers=6, + beam_size=0, + num_decoder_layers=6, + dim_feedforward=1024, + attention_dropout_rate=0.0, + residual_dropout_rate=0.1, + custom_encoder=None, + custom_decoder=None, + in_channels=0, + out_channels=0, + scale_embedding=True): + super(Transformer, self).__init__() + self.out_channels = out_channels + 1 + self.embedding = Embeddings( + d_model=d_model, + vocab=self.out_channels, + padding_idx=0, + scale_embedding=scale_embedding) + self.positional_encoding = PositionalEncoding( + dropout=residual_dropout_rate, + dim=d_model, ) + if custom_encoder is not None: + self.encoder = custom_encoder + else: + if num_encoder_layers > 0: + encoder_layer = TransformerEncoderLayer( + d_model, nhead, dim_feedforward, attention_dropout_rate, + residual_dropout_rate) + self.encoder = TransformerEncoder(encoder_layer, + num_encoder_layers) + else: + self.encoder = None + + if custom_decoder is not None: + self.decoder = custom_decoder + else: + decoder_layer = TransformerDecoderLayer( + d_model, nhead, dim_feedforward, attention_dropout_rate, + residual_dropout_rate) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers) + + self._reset_parameters() + self.beam_size = beam_size + self.d_model = d_model + self.nhead = nhead + self.tgt_word_prj = nn.Linear( + d_model, self.out_channels, bias_attr=False) + w0 = np.random.normal(0.0, d_model**-0.5, + (d_model, self.out_channels)).astype(np.float32) + self.tgt_word_prj.weight.set_value(w0) + self.apply(self._init_weights) + + def _init_weights(self, m): + + if isinstance(m, nn.Conv2D): + xavier_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def forward_train(self, src, tgt): + tgt = tgt[:, :-1] + + tgt_key_padding_mask = self.generate_padding_mask(tgt) + tgt = self.embedding(tgt).transpose([1, 0, 2]) + tgt = self.positional_encoding(tgt) + tgt_mask = self.generate_square_subsequent_mask(tgt.shape[0]) + + if self.encoder is not None: + src = self.positional_encoding(src.transpose([1, 0, 2])) + memory = self.encoder(src) + else: + memory = src.squeeze(2).transpose([2, 0, 1]) + output = self.decoder( + tgt, + memory, + tgt_mask=tgt_mask, + memory_mask=None, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=None) + output = output.transpose([1, 0, 2]) + logit = self.tgt_word_prj(output) + return logit + + def forward(self, src, targets=None): + """Take in and process masked source/target sequences. + Args: + src: the sequence to the encoder (required). + tgt: the sequence to the decoder (required). + Shape: + - src: :math:`(S, N, E)`. + - tgt: :math:`(T, N, E)`. 
+ Examples: + >>> output = transformer_model(src, tgt) + """ + + if self.training: + max_len = targets[1].max() + tgt = targets[0][:, :2 + max_len] + return self.forward_train(src, tgt) + else: + if self.beam_size > 0: + return self.forward_beam(src) + else: + return self.forward_test(src) + + def forward_test(self, src): + bs = paddle.shape(src)[0] + if self.encoder is not None: + src = self.positional_encoding(paddle.transpose(src, [1, 0, 2])) + memory = self.encoder(src) + else: + memory = paddle.transpose(paddle.squeeze(src, 2), [2, 0, 1]) + dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64) + dec_prob = paddle.full((bs, 1), 1., dtype=paddle.float32) + for len_dec_seq in range(1, 25): + dec_seq_embed = paddle.transpose(self.embedding(dec_seq), [1, 0, 2]) + dec_seq_embed = self.positional_encoding(dec_seq_embed) + tgt_mask = self.generate_square_subsequent_mask( + paddle.shape(dec_seq_embed)[0]) + output = self.decoder( + dec_seq_embed, + memory, + tgt_mask=tgt_mask, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None) + dec_output = paddle.transpose(output, [1, 0, 2]) + dec_output = dec_output[:, -1, :] + word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1) + preds_idx = paddle.argmax(word_prob, axis=1) + if paddle.equal_all( + preds_idx, + paddle.full( + paddle.shape(preds_idx), 3, dtype='int64')): + break + preds_prob = paddle.max(word_prob, axis=1) + dec_seq = paddle.concat( + [dec_seq, paddle.reshape(preds_idx, [-1, 1])], axis=1) + dec_prob = paddle.concat( + [dec_prob, paddle.reshape(preds_prob, [-1, 1])], axis=1) + return [dec_seq, dec_prob] + + def forward_beam(self, images): + ''' Translation work in one batch ''' + + def get_inst_idx_to_tensor_position_map(inst_idx_list): + ''' Indicate the position of an instance in a tensor. ''' + return { + inst_idx: tensor_position + for tensor_position, inst_idx in enumerate(inst_idx_list) + } + + def collect_active_part(beamed_tensor, curr_active_inst_idx, + n_prev_active_inst, n_bm): + ''' Collect tensor parts associated to active instances. ''' + + beamed_tensor_shape = paddle.shape(beamed_tensor) + n_curr_active_inst = len(curr_active_inst_idx) + new_shape = (n_curr_active_inst * n_bm, beamed_tensor_shape[1], + beamed_tensor_shape[2]) + + beamed_tensor = beamed_tensor.reshape([n_prev_active_inst, -1]) + beamed_tensor = beamed_tensor.index_select( + curr_active_inst_idx, axis=0) + beamed_tensor = beamed_tensor.reshape(new_shape) + + return beamed_tensor + + def collate_active_info(src_enc, inst_idx_to_position_map, + active_inst_idx_list): + # Sentences which are still active are collected, + # so the decoder will not run on completed sentences. 
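+            # inst_idx_to_position_map maps each still-active instance index to its
+            # row block in the beam-expanded encoder output, so finished instances
+            # can be dropped and the remaining rows re-gathered with index_select.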
+ + n_prev_active_inst = len(inst_idx_to_position_map) + active_inst_idx = [ + inst_idx_to_position_map[k] for k in active_inst_idx_list + ] + active_inst_idx = paddle.to_tensor(active_inst_idx, dtype='int64') + active_src_enc = collect_active_part( + src_enc.transpose([1, 0, 2]), active_inst_idx, + n_prev_active_inst, n_bm).transpose([1, 0, 2]) + active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map( + active_inst_idx_list) + return active_src_enc, active_inst_idx_to_position_map + + def beam_decode_step(inst_dec_beams, len_dec_seq, enc_output, + inst_idx_to_position_map, n_bm, + memory_key_padding_mask): + ''' Decode and update beam status, and then return active beam idx ''' + + def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq): + dec_partial_seq = [ + b.get_current_state() for b in inst_dec_beams if not b.done + ] + dec_partial_seq = paddle.stack(dec_partial_seq) + dec_partial_seq = dec_partial_seq.reshape([-1, len_dec_seq]) + return dec_partial_seq + + def predict_word(dec_seq, enc_output, n_active_inst, n_bm, + memory_key_padding_mask): + dec_seq = paddle.transpose(self.embedding(dec_seq), [1, 0, 2]) + dec_seq = self.positional_encoding(dec_seq) + tgt_mask = self.generate_square_subsequent_mask( + paddle.shape(dec_seq)[0]) + dec_output = self.decoder( + dec_seq, + enc_output, + tgt_mask=tgt_mask, + tgt_key_padding_mask=None, + memory_key_padding_mask=memory_key_padding_mask, ) + dec_output = paddle.transpose(dec_output, [1, 0, 2]) + dec_output = dec_output[:, + -1, :] # Pick the last step: (bh * bm) * d_h + word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1) + word_prob = paddle.reshape(word_prob, [n_active_inst, n_bm, -1]) + return word_prob + + def collect_active_inst_idx_list(inst_beams, word_prob, + inst_idx_to_position_map): + active_inst_idx_list = [] + for inst_idx, inst_position in inst_idx_to_position_map.items(): + is_inst_complete = inst_beams[inst_idx].advance(word_prob[ + inst_position]) + if not is_inst_complete: + active_inst_idx_list += [inst_idx] + + return active_inst_idx_list + + n_active_inst = len(inst_idx_to_position_map) + dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq) + word_prob = predict_word(dec_seq, enc_output, n_active_inst, n_bm, + None) + # Update the beam with predicted word prob information and collect incomplete instances + active_inst_idx_list = collect_active_inst_idx_list( + inst_dec_beams, word_prob, inst_idx_to_position_map) + return active_inst_idx_list + + def collect_hypothesis_and_scores(inst_dec_beams, n_best): + all_hyp, all_scores = [], [] + for inst_idx in range(len(inst_dec_beams)): + scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores() + all_scores += [scores[:n_best]] + hyps = [ + inst_dec_beams[inst_idx].get_hypothesis(i) + for i in tail_idxs[:n_best] + ] + all_hyp += [hyps] + return all_hyp, all_scores + + with paddle.no_grad(): + #-- Encode + if self.encoder is not None: + src = self.positional_encoding(images.transpose([1, 0, 2])) + src_enc = self.encoder(src) + else: + src_enc = images.squeeze(2).transpose([0, 2, 1]) + + n_bm = self.beam_size + src_shape = paddle.shape(src_enc) + inst_dec_beams = [Beam(n_bm) for _ in range(1)] + active_inst_idx_list = list(range(1)) + # Repeat data for beam search + src_enc = paddle.tile(src_enc, [1, n_bm, 1]) + inst_idx_to_position_map = get_inst_idx_to_tensor_position_map( + active_inst_idx_list) + # Decode + for len_dec_seq in range(1, 25): + src_enc_copy = src_enc.clone() + active_inst_idx_list = beam_decode_step( + inst_dec_beams, len_dec_seq, 
src_enc_copy, + inst_idx_to_position_map, n_bm, None) + if not active_inst_idx_list: + break # all instances have finished their path to + src_enc, inst_idx_to_position_map = collate_active_info( + src_enc_copy, inst_idx_to_position_map, + active_inst_idx_list) + batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams, + 1) + result_hyp = [] + hyp_scores = [] + for bs_hyp, score in zip(batch_hyp, batch_scores): + l = len(bs_hyp[0]) + bs_hyp_pad = bs_hyp[0] + [3] * (25 - l) + result_hyp.append(bs_hyp_pad) + score = float(score) / l + hyp_score = [score for _ in range(25)] + hyp_scores.append(hyp_score) + return [ + paddle.to_tensor( + np.array(result_hyp), dtype=paddle.int64), + paddle.to_tensor(hyp_scores) + ] + + def generate_square_subsequent_mask(self, sz): + """Generate a square mask for the sequence. The masked positions are filled with float('-inf'). + Unmasked positions are filled with float(0.0). + """ + mask = paddle.zeros([sz, sz], dtype='float32') + mask_inf = paddle.triu( + paddle.full( + shape=[sz, sz], dtype='float32', fill_value='-inf'), + diagonal=1) + mask = mask + mask_inf + return mask + + def generate_padding_mask(self, x): + padding_mask = paddle.equal(x, paddle.to_tensor(0, dtype=x.dtype)) + return padding_mask + + def _reset_parameters(self): + """Initiate parameters in the transformer model.""" + + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + + +class TransformerEncoder(nn.Layer): + """TransformerEncoder is a stack of N encoder layers + Args: + encoder_layer: an instance of the TransformerEncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). + norm: the layer normalization component (optional). + """ + + def __init__(self, encoder_layer, num_layers): + super(TransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + + def forward(self, src): + """Pass the input through the endocder layers in turn. + Args: + src: the sequnce to the encoder (required). + mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + """ + output = src + + for i in range(self.num_layers): + output = self.layers[i](output, + src_mask=None, + src_key_padding_mask=None) + + return output + + +class TransformerDecoder(nn.Layer): + """TransformerDecoder is a stack of N decoder layers + + Args: + decoder_layer: an instance of the TransformerDecoderLayer() class (required). + num_layers: the number of sub-decoder-layers in the decoder (required). + norm: the layer normalization component (optional). + + """ + + def __init__(self, decoder_layer, num_layers): + super(TransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None): + """Pass the inputs (and mask) through the decoder layer in turn. + + Args: + tgt: the sequence to the decoder (required). + memory: the sequnce from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). 
+ """ + output = tgt + for i in range(self.num_layers): + output = self.layers[i]( + output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + + return output + + +class TransformerEncoderLayer(nn.Layer): + """TransformerEncoderLayer is made up of self-attn and feedforward network. + This standard encoder layer is based on the paper "Attention Is All You Need". + Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, + Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in + Neural Information Processing Systems, pages 6000-6010. Users may modify or implement + in a different way during application. + + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + + """ + + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + attention_dropout_rate=0.0, + residual_dropout_rate=0.1): + super(TransformerEncoderLayer, self).__init__() + self.self_attn = MultiheadAttention( + d_model, nhead, dropout=attention_dropout_rate) + + self.conv1 = Conv2D( + in_channels=d_model, + out_channels=dim_feedforward, + kernel_size=(1, 1)) + self.conv2 = Conv2D( + in_channels=dim_feedforward, + out_channels=d_model, + kernel_size=(1, 1)) + + self.norm1 = LayerNorm(d_model) + self.norm2 = LayerNorm(d_model) + self.dropout1 = Dropout(residual_dropout_rate) + self.dropout2 = Dropout(residual_dropout_rate) + + def forward(self, src, src_mask=None, src_key_padding_mask=None): + """Pass the input through the endocder layer. + Args: + src: the sequnce to the encoder layer (required). + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + """ + src2 = self.self_attn( + src, + src, + src, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask) + src = src + self.dropout1(src2) + src = self.norm1(src) + + src = paddle.transpose(src, [1, 2, 0]) + src = paddle.unsqueeze(src, 2) + src2 = self.conv2(F.relu(self.conv1(src))) + src2 = paddle.squeeze(src2, 2) + src2 = paddle.transpose(src2, [2, 0, 1]) + src = paddle.squeeze(src, 2) + src = paddle.transpose(src, [2, 0, 1]) + + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + +class TransformerDecoderLayer(nn.Layer): + """TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network. + This standard decoder layer is based on the paper "Attention Is All You Need". + Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, + Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in + Neural Information Processing Systems, pages 6000-6010. Users may modify or implement + in a different way during application. + + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). 
+ + """ + + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + attention_dropout_rate=0.0, + residual_dropout_rate=0.1): + super(TransformerDecoderLayer, self).__init__() + self.self_attn = MultiheadAttention( + d_model, nhead, dropout=attention_dropout_rate) + self.multihead_attn = MultiheadAttention( + d_model, nhead, dropout=attention_dropout_rate) + + self.conv1 = Conv2D( + in_channels=d_model, + out_channels=dim_feedforward, + kernel_size=(1, 1)) + self.conv2 = Conv2D( + in_channels=dim_feedforward, + out_channels=d_model, + kernel_size=(1, 1)) + + self.norm1 = LayerNorm(d_model) + self.norm2 = LayerNorm(d_model) + self.norm3 = LayerNorm(d_model) + self.dropout1 = Dropout(residual_dropout_rate) + self.dropout2 = Dropout(residual_dropout_rate) + self.dropout3 = Dropout(residual_dropout_rate) + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None): + """Pass the inputs (and mask) through the decoder layer. + + Args: + tgt: the sequence to the decoder layer (required). + memory: the sequnce from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). + + """ + tgt2 = self.self_attn( + tgt, + tgt, + tgt, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn( + tgt, + memory, + memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + # default + tgt = paddle.transpose(tgt, [1, 2, 0]) + tgt = paddle.unsqueeze(tgt, 2) + tgt2 = self.conv2(F.relu(self.conv1(tgt))) + tgt2 = paddle.squeeze(tgt2, 2) + tgt2 = paddle.transpose(tgt2, [2, 0, 1]) + tgt = paddle.squeeze(tgt, 2) + tgt = paddle.transpose(tgt, [2, 0, 1]) + + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + +def _get_clones(module, N): + return LayerList([copy.deepcopy(module) for i in range(N)]) + + +class PositionalEncoding(nn.Layer): + """Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). 
+ Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, dropout, dim, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = paddle.zeros([max_len, dim]) + position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1) + div_term = paddle.exp( + paddle.arange(0, dim, 2).astype('float32') * + (-math.log(10000.0) / dim)) + pe[:, 0::2] = paddle.sin(position * div_term) + pe[:, 1::2] = paddle.cos(position * div_term) + pe = paddle.unsqueeze(pe, 0) + pe = paddle.transpose(pe, [1, 0, 2]) + self.register_buffer('pe', pe) + + def forward(self, x): + """Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + x = x + self.pe[:paddle.shape(x)[0], :] + return self.dropout(x) + + +class PositionalEncoding_2d(nn.Layer): + """Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). + Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, dropout, dim, max_len=5000): + super(PositionalEncoding_2d, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = paddle.zeros([max_len, dim]) + position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1) + div_term = paddle.exp( + paddle.arange(0, dim, 2).astype('float32') * + (-math.log(10000.0) / dim)) + pe[:, 0::2] = paddle.sin(position * div_term) + pe[:, 1::2] = paddle.cos(position * div_term) + pe = paddle.transpose(paddle.unsqueeze(pe, 0), [1, 0, 2]) + self.register_buffer('pe', pe) + + self.avg_pool_1 = nn.AdaptiveAvgPool2D((1, 1)) + self.linear1 = nn.Linear(dim, dim) + self.linear1.weight.data.fill_(1.) + self.avg_pool_2 = nn.AdaptiveAvgPool2D((1, 1)) + self.linear2 = nn.Linear(dim, dim) + self.linear2.weight.data.fill_(1.) + + def forward(self, x): + """Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). 
+ Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + w_pe = self.pe[:paddle.shape(x)[-1], :] + w1 = self.linear1(self.avg_pool_1(x).squeeze()).unsqueeze(0) + w_pe = w_pe * w1 + w_pe = paddle.transpose(w_pe, [1, 2, 0]) + w_pe = paddle.unsqueeze(w_pe, 2) + + h_pe = self.pe[:paddle.shape(x).shape[-2], :] + w2 = self.linear2(self.avg_pool_2(x).squeeze()).unsqueeze(0) + h_pe = h_pe * w2 + h_pe = paddle.transpose(h_pe, [1, 2, 0]) + h_pe = paddle.unsqueeze(h_pe, 3) + + x = x + w_pe + h_pe + x = paddle.transpose( + paddle.reshape(x, + [x.shape[0], x.shape[1], x.shape[2] * x.shape[3]]), + [2, 0, 1]) + + return self.dropout(x) + + +class Embeddings(nn.Layer): + def __init__(self, d_model, vocab, padding_idx, scale_embedding): + super(Embeddings, self).__init__() + self.embedding = nn.Embedding(vocab, d_model, padding_idx=padding_idx) + w0 = np.random.normal(0.0, d_model**-0.5, + (vocab, d_model)).astype(np.float32) + self.embedding.weight.set_value(w0) + self.d_model = d_model + self.scale_embedding = scale_embedding + + def forward(self, x): + if self.scale_embedding: + x = self.embedding(x) + return x * math.sqrt(self.d_model) + return self.embedding(x) + + +class Beam(): + ''' Beam search ''' + + def __init__(self, size, device=False): + + self.size = size + self._done = False + # The score for each translation on the beam. + self.scores = paddle.zeros((size, ), dtype=paddle.float32) + self.all_scores = [] + # The backpointers at each time-step. + self.prev_ks = [] + # The outputs at each time-step. + self.next_ys = [paddle.full((size, ), 0, dtype=paddle.int64)] + self.next_ys[0][0] = 2 + + def get_current_state(self): + "Get the outputs for the current timestep." + return self.get_tentative_hypothesis() + + def get_current_origin(self): + "Get the backpointers for the current timestep." + return self.prev_ks[-1] + + @property + def done(self): + return self._done + + def advance(self, word_prob): + "Update beam status and check if finished or not." + num_words = word_prob.shape[1] + + # Sum the previous scores. + if len(self.prev_ks) > 0: + beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob) + else: + beam_lk = word_prob[0] + + flat_beam_lk = beam_lk.reshape([-1]) + best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, + True) # 1st sort + self.all_scores.append(self.scores) + self.scores = best_scores + # bestScoresId is flattened as a (beam x word) array, + # so we need to calculate which word and beam each score came from + prev_k = best_scores_id // num_words + self.prev_ks.append(prev_k) + self.next_ys.append(best_scores_id - prev_k * num_words) + # End condition is when top-of-beam is EOS. + if self.next_ys[-1][0] == 3: + self._done = True + self.all_scores.append(self.scores) + + return self._done + + def sort_scores(self): + "Sort the scores." + return self.scores, paddle.to_tensor( + [i for i in range(int(self.scores.shape[0]))], dtype='int32') + + def get_the_best_score_and_idx(self): + "Get the score of the best in the beam." + scores, ids = self.sort_scores() + return scores[1], ids[1] + + def get_tentative_hypothesis(self): + "Get the decoded sequence for the current timestep." 
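+        # Hypotheses are rebuilt by walking prev_ks backwards (see get_hypothesis)
+        # and are prefixed with token 2, the start symbol seeded into next_ys[0];
+        # token 3 marks end-of-sequence and 0 is the embedding padding index.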
+ if len(self.next_ys) == 1: + dec_seq = self.next_ys[0].unsqueeze(1) + else: + _, keys = self.sort_scores() + hyps = [self.get_hypothesis(k) for k in keys] + hyps = [[2] + h for h in hyps] + dec_seq = paddle.to_tensor(hyps, dtype='int64') + return dec_seq + + def get_hypothesis(self, k): + """ Walk back to construct the full hypothesis. """ + hyp = [] + for j in range(len(self.prev_ks) - 1, -1, -1): + hyp.append(self.next_ys[j + 1][k]) + k = self.prev_ks[j][k] + return list(map(lambda x: x.item(), hyp[::-1])) diff --git a/ppocr/modeling/heads/rec_sar_head.py b/ppocr/modeling/heads/rec_sar_head.py new file mode 100644 index 0000000000000000000000000000000000000000..7107788d9ef3b49ac6d4dcd4a8133a9603ada19b --- /dev/null +++ b/ppocr/modeling/heads/rec_sar_head.py @@ -0,0 +1,384 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + + +class SAREncoder(nn.Layer): + """ + Args: + enc_bi_rnn (bool): If True, use bidirectional RNN in encoder. + enc_drop_rnn (float): Dropout probability of RNN layer in encoder. + enc_gru (bool): If True, use GRU, else LSTM in encoder. + d_model (int): Dim of channels from backbone. + d_enc (int): Dim of encoder RNN layer. + mask (bool): If True, mask padding in RNN sequence. + """ + + def __init__(self, + enc_bi_rnn=False, + enc_drop_rnn=0.1, + enc_gru=False, + d_model=512, + d_enc=512, + mask=True, + **kwargs): + super().__init__() + assert isinstance(enc_bi_rnn, bool) + assert isinstance(enc_drop_rnn, (int, float)) + assert 0 <= enc_drop_rnn < 1.0 + assert isinstance(enc_gru, bool) + assert isinstance(d_model, int) + assert isinstance(d_enc, int) + assert isinstance(mask, bool) + + self.enc_bi_rnn = enc_bi_rnn + self.enc_drop_rnn = enc_drop_rnn + self.mask = mask + + # LSTM Encoder + if enc_bi_rnn: + direction = 'bidirectional' + else: + direction = 'forward' + kwargs = dict( + input_size=d_model, + hidden_size=d_enc, + num_layers=2, + time_major=False, + dropout=enc_drop_rnn, + direction=direction) + if enc_gru: + self.rnn_encoder = nn.GRU(**kwargs) + else: + self.rnn_encoder = nn.LSTM(**kwargs) + + # global feature transformation + encoder_rnn_out_size = d_enc * (int(enc_bi_rnn) + 1) + self.linear = nn.Linear(encoder_rnn_out_size, encoder_rnn_out_size) + + def forward(self, feat, img_metas=None): + if img_metas is not None: + assert len(img_metas[0]) == feat.shape[0] + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + h_feat = feat.shape[2] # bsz c h w + feat_v = F.max_pool2d( + feat, kernel_size=(h_feat, 1), stride=1, padding=0) + feat_v = feat_v.squeeze(2) # bsz * C * W + feat_v = paddle.transpose(feat_v, perm=[0, 2, 1]) # bsz * W * C + holistic_feat = self.rnn_encoder(feat_v)[0] # bsz * T * C + + if valid_ratios is not None: + valid_hf = [] + T = holistic_feat.shape[1] + for i, valid_ratio in enumerate(valid_ratios): + valid_step = min(T, math.ceil(T * valid_ratio)) - 1 + valid_hf.append(holistic_feat[i, valid_step, :]) + valid_hf = paddle.stack(valid_hf, axis=0) + else: + valid_hf = holistic_feat[:, -1, :] # bsz * C + holistic_feat = self.linear(valid_hf) # bsz * C + + return holistic_feat + + +class BaseDecoder(nn.Layer): + def __init__(self, **kwargs): + super().__init__() + + def forward_train(self, feat, out_enc, targets, img_metas): + raise NotImplementedError + + def forward_test(self, feat, out_enc, img_metas): + raise 
NotImplementedError + + def forward(self, + feat, + out_enc, + label=None, + img_metas=None, + train_mode=True): + self.train_mode = train_mode + + if train_mode: + return self.forward_train(feat, out_enc, label, img_metas) + return self.forward_test(feat, out_enc, img_metas) + + +class ParallelSARDecoder(BaseDecoder): + """ + Args: + out_channels (int): Output class number. + enc_bi_rnn (bool): If True, use bidirectional RNN in encoder. + dec_bi_rnn (bool): If True, use bidirectional RNN in decoder. + dec_drop_rnn (float): Dropout of RNN layer in decoder. + dec_gru (bool): If True, use GRU, else LSTM in decoder. + d_model (int): Dim of channels from backbone. + d_enc (int): Dim of encoder RNN layer. + d_k (int): Dim of channels of attention module. + pred_dropout (float): Dropout probability of prediction layer. + max_seq_len (int): Maximum sequence length for decoding. + mask (bool): If True, mask padding in feature map. + start_idx (int): Index of start token. + padding_idx (int): Index of padding token. + pred_concat (bool): If True, concat glimpse feature from + attention with holistic feature and hidden state. + """ + + def __init__( + self, + out_channels, # 90 + unknown + start + padding + enc_bi_rnn=False, + dec_bi_rnn=False, + dec_drop_rnn=0.0, + dec_gru=False, + d_model=512, + d_enc=512, + d_k=64, + pred_dropout=0.1, + max_text_length=30, + mask=True, + pred_concat=True, + **kwargs): + super().__init__() + + self.num_classes = out_channels + self.enc_bi_rnn = enc_bi_rnn + self.d_k = d_k + self.start_idx = out_channels - 2 + self.padding_idx = out_channels - 1 + self.max_seq_len = max_text_length + self.mask = mask + self.pred_concat = pred_concat + + encoder_rnn_out_size = d_enc * (int(enc_bi_rnn) + 1) + decoder_rnn_out_size = encoder_rnn_out_size * (int(dec_bi_rnn) + 1) + + # 2D attention layer + self.conv1x1_1 = nn.Linear(decoder_rnn_out_size, d_k) + self.conv3x3_1 = nn.Conv2D( + d_model, d_k, kernel_size=3, stride=1, padding=1) + self.conv1x1_2 = nn.Linear(d_k, 1) + + # Decoder RNN layer + if dec_bi_rnn: + direction = 'bidirectional' + else: + direction = 'forward' + + kwargs = dict( + input_size=encoder_rnn_out_size, + hidden_size=encoder_rnn_out_size, + num_layers=2, + time_major=False, + dropout=dec_drop_rnn, + direction=direction) + if dec_gru: + self.rnn_decoder = nn.GRU(**kwargs) + else: + self.rnn_decoder = nn.LSTM(**kwargs) + + # Decoder input embedding + self.embedding = nn.Embedding( + self.num_classes, + encoder_rnn_out_size, + padding_idx=self.padding_idx) + + # Prediction layer + self.pred_dropout = nn.Dropout(pred_dropout) + pred_num_classes = self.num_classes - 1 + if pred_concat: + fc_in_channel = decoder_rnn_out_size + d_model + d_enc + else: + fc_in_channel = d_model + self.prediction = nn.Linear(fc_in_channel, pred_num_classes) + + def _2d_attention(self, + decoder_input, + feat, + holistic_feat, + valid_ratios=None): + + y = self.rnn_decoder(decoder_input)[0] + # y: bsz * (seq_len + 1) * hidden_size + + attn_query = self.conv1x1_1(y) # bsz * (seq_len + 1) * attn_size + bsz, seq_len, attn_size = attn_query.shape + attn_query = paddle.unsqueeze(attn_query, axis=[3, 4]) + # (bsz, seq_len + 1, attn_size, 1, 1) + + attn_key = self.conv3x3_1(feat) + # bsz * attn_size * h * w + attn_key = attn_key.unsqueeze(1) + # bsz * 1 * attn_size * h * w + + attn_weight = paddle.tanh(paddle.add(attn_key, attn_query)) + + # bsz * (seq_len + 1) * attn_size * h * w + attn_weight = paddle.transpose(attn_weight, perm=[0, 1, 3, 4, 2]) + # bsz * (seq_len + 1) * h * w * attn_size + 
attn_weight = self.conv1x1_2(attn_weight) + # bsz * (seq_len + 1) * h * w * 1 + bsz, T, h, w, c = attn_weight.shape + assert c == 1 + + if valid_ratios is not None: + # cal mask of attention weight + for i, valid_ratio in enumerate(valid_ratios): + valid_width = min(w, math.ceil(w * valid_ratio)) + if valid_width < w: + attn_weight[i, :, :, valid_width:, :] = float('-inf') + + attn_weight = paddle.reshape(attn_weight, [bsz, T, -1]) + attn_weight = F.softmax(attn_weight, axis=-1) + + attn_weight = paddle.reshape(attn_weight, [bsz, T, h, w, c]) + attn_weight = paddle.transpose(attn_weight, perm=[0, 1, 4, 2, 3]) + # attn_weight: bsz * T * c * h * w + # feat: bsz * c * h * w + attn_feat = paddle.sum(paddle.multiply(feat.unsqueeze(1), attn_weight), + (3, 4), + keepdim=False) + # bsz * (seq_len + 1) * C + + # Linear transformation + if self.pred_concat: + hf_c = holistic_feat.shape[-1] + holistic_feat = paddle.expand( + holistic_feat, shape=[bsz, seq_len, hf_c]) + y = self.prediction(paddle.concat((y, attn_feat, holistic_feat), 2)) + else: + y = self.prediction(attn_feat) + # bsz * (seq_len + 1) * num_classes + if self.train_mode: + y = self.pred_dropout(y) + + return y + + def forward_train(self, feat, out_enc, label, img_metas): + ''' + img_metas: [label, valid_ratio] + ''' + if img_metas is not None: + assert len(img_metas[0]) == feat.shape[0] + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + label = label.cuda() + lab_embedding = self.embedding(label) + # bsz * seq_len * emb_dim + out_enc = out_enc.unsqueeze(1) + # bsz * 1 * emb_dim + in_dec = paddle.concat((out_enc, lab_embedding), axis=1) + # bsz * (seq_len + 1) * C + out_dec = self._2d_attention( + in_dec, feat, out_enc, valid_ratios=valid_ratios) + # bsz * (seq_len + 1) * num_classes + + return out_dec[:, 1:, :] # bsz * seq_len * num_classes + + def forward_test(self, feat, out_enc, img_metas): + if img_metas is not None: + assert len(img_metas[0]) == feat.shape[0] + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + seq_len = self.max_seq_len + bsz = feat.shape[0] + start_token = paddle.full( + (bsz, ), fill_value=self.start_idx, dtype='int64') + # bsz + start_token = self.embedding(start_token) + # bsz * emb_dim + emb_dim = start_token.shape[1] + start_token = start_token.unsqueeze(1) + start_token = paddle.expand(start_token, shape=[bsz, seq_len, emb_dim]) + # bsz * seq_len * emb_dim + out_enc = out_enc.unsqueeze(1) + # bsz * 1 * emb_dim + decoder_input = paddle.concat((out_enc, start_token), axis=1) + # bsz * (seq_len + 1) * emb_dim + + outputs = [] + for i in range(1, seq_len + 1): + decoder_output = self._2d_attention( + decoder_input, feat, out_enc, valid_ratios=valid_ratios) + char_output = decoder_output[:, i, :] # bsz * num_classes + char_output = F.softmax(char_output, -1) + outputs.append(char_output) + max_idx = paddle.argmax(char_output, axis=1, keepdim=False) + char_embedding = self.embedding(max_idx) # bsz * emb_dim + if i < seq_len: + decoder_input[:, i + 1, :] = char_embedding + + outputs = paddle.stack(outputs, 1) # bsz * seq_len * num_classes + + return outputs + + +class SARHead(nn.Layer): + def __init__(self, + out_channels, + enc_bi_rnn=False, + enc_drop_rnn=0.1, + enc_gru=False, + dec_bi_rnn=False, + dec_drop_rnn=0.0, + dec_gru=False, + d_k=512, + pred_dropout=0.1, + max_text_length=30, + pred_concat=True, + **kwargs): + super(SARHead, self).__init__() + + # encoder module + self.encoder = SAREncoder( + 
enc_bi_rnn=enc_bi_rnn, enc_drop_rnn=enc_drop_rnn, enc_gru=enc_gru) + + # decoder module + self.decoder = ParallelSARDecoder( + out_channels=out_channels, + enc_bi_rnn=enc_bi_rnn, + dec_bi_rnn=dec_bi_rnn, + dec_drop_rnn=dec_drop_rnn, + dec_gru=dec_gru, + d_k=d_k, + pred_dropout=pred_dropout, + max_text_length=max_text_length, + pred_concat=pred_concat) + + def forward(self, feat, targets=None): + ''' + img_metas: [label, valid_ratio] + ''' + holistic_feat = self.encoder(feat, targets) # bsz c + + if self.training: + label = targets[0] # label + label = paddle.to_tensor(label, dtype='int64') + final_out = self.decoder( + feat, holistic_feat, label, img_metas=targets) + if not self.training: + final_out = self.decoder( + feat, + holistic_feat, + label=None, + img_metas=targets, + train_mode=False) + # (bsz, seq_len, num_classes) + + return final_out diff --git a/ppocr/modeling/necks/__init__.py b/ppocr/modeling/necks/__init__.py index e97c4f64bdc9acd6729d67a9c6ff7a7563f6c95e..5606a4c35f68021e7f151a7eae4a0da4d5b6b95e 100644 --- a/ppocr/modeling/necks/__init__.py +++ b/ppocr/modeling/necks/__init__.py @@ -22,7 +22,8 @@ def build_neck(config): from .rnn import SequenceEncoder from .pg_fpn import PGFPN from .table_fpn import TableFPN - support_dict = ['DBFPN', 'EASTFPN', 'SASTFPN', 'SequenceEncoder', 'PGFPN', 'TableFPN'] + from .fpn import FPN + support_dict = ['FPN','DBFPN', 'EASTFPN', 'SASTFPN', 'SequenceEncoder', 'PGFPN', 'TableFPN'] module_name = config.pop('name') assert module_name in support_dict, Exception('neck only support {}'.format( diff --git a/ppocr/modeling/necks/fpn.py b/ppocr/modeling/necks/fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..8728a5c9ded5b9c174fd34f088d8012961f65ec0 --- /dev/null +++ b/ppocr/modeling/necks/fpn.py @@ -0,0 +1,100 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.nn as nn +import paddle +import math +import paddle.nn.functional as F + +class Conv_BN_ReLU(nn.Layer): + def __init__(self, in_planes, out_planes, kernel_size=1, stride=1, padding=0): + super(Conv_BN_ReLU, self).__init__() + self.conv = nn.Conv2D(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, + bias_attr=False) + self.bn = nn.BatchNorm2D(out_planes, momentum=0.1) + self.relu = nn.ReLU() + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + m.weight = paddle.create_parameter(shape=m.weight.shape, dtype='float32', default_initializer=paddle.nn.initializer.Normal(0, math.sqrt(2. 
/ n))) + elif isinstance(m, nn.BatchNorm2D): + m.weight = paddle.create_parameter(shape=m.weight.shape, dtype='float32', default_initializer=paddle.nn.initializer.Constant(1.0)) + m.bias = paddle.create_parameter(shape=m.bias.shape, dtype='float32', default_initializer=paddle.nn.initializer.Constant(0.0)) + + def forward(self, x): + return self.relu(self.bn(self.conv(x))) + +class FPN(nn.Layer): + def __init__(self, in_channels, out_channels): + super(FPN, self).__init__() + + # Top layer + self.toplayer_ = Conv_BN_ReLU(in_channels[3], out_channels, kernel_size=1, stride=1, padding=0) + # Lateral layers + self.latlayer1_ = Conv_BN_ReLU(in_channels[2], out_channels, kernel_size=1, stride=1, padding=0) + + self.latlayer2_ = Conv_BN_ReLU(in_channels[1], out_channels, kernel_size=1, stride=1, padding=0) + + self.latlayer3_ = Conv_BN_ReLU(in_channels[0], out_channels, kernel_size=1, stride=1, padding=0) + + # Smooth layers + self.smooth1_ = Conv_BN_ReLU(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.smooth2_ = Conv_BN_ReLU(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.smooth3_ = Conv_BN_ReLU(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + + self.out_channels = out_channels * 4 + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + m.weight = paddle.create_parameter(shape=m.weight.shape, dtype='float32', + default_initializer=paddle.nn.initializer.Normal(0, + math.sqrt(2. / n))) + elif isinstance(m, nn.BatchNorm2D): + m.weight = paddle.create_parameter(shape=m.weight.shape, dtype='float32', + default_initializer=paddle.nn.initializer.Constant(1.0)) + m.bias = paddle.create_parameter(shape=m.bias.shape, dtype='float32', + default_initializer=paddle.nn.initializer.Constant(0.0)) + + def _upsample(self, x, scale=1): + return F.upsample(x, scale_factor=scale, mode='bilinear') + + def _upsample_add(self, x, y, scale=1): + return F.upsample(x, scale_factor=scale, mode='bilinear') + y + + def forward(self, x): + f2, f3, f4, f5 = x + p5 = self.toplayer_(f5) + + f4 = self.latlayer1_(f4) + p4 = self._upsample_add(p5, f4,2) + p4 = self.smooth1_(p4) + + f3 = self.latlayer2_(f3) + p3 = self._upsample_add(p4, f3,2) + p3 = self.smooth2_(p3) + + f2 = self.latlayer3_(f2) + p2 = self._upsample_add(p3, f2,2) + p2 = self.smooth3_(p2) + + p3 = self._upsample(p3, 2) + p4 = self._upsample(p4, 4) + p5 = self._upsample(p5, 8) + + fuse = paddle.concat([p2, p3, p4, p5], axis=1) + return fuse \ No newline at end of file diff --git a/ppocr/modeling/necks/rnn.py b/ppocr/modeling/necks/rnn.py index de87b3d9895168657f8c9722177c026b992c2966..86e649028f8fbb76cb5a1fd85381bd361277c6ee 100644 --- a/ppocr/modeling/necks/rnn.py +++ b/ppocr/modeling/necks/rnn.py @@ -51,7 +51,7 @@ class EncoderWithFC(nn.Layer): super(EncoderWithFC, self).__init__() self.out_channels = hidden_size weight_attr, bias_attr = get_para_bias_attr( - l2_decay=0.00001, k=in_channels, name='reduce_encoder_fea') + l2_decay=0.00001, k=in_channels) self.fc = nn.Linear( in_channels, hidden_size, diff --git a/ppocr/modeling/transforms/__init__.py b/ppocr/modeling/transforms/__init__.py index 78eaecccc55f77d6624aa0c5bdb839acc3462129..405ab3cc6c0380654f61e42e523ddc85839139b3 100755 --- a/ppocr/modeling/transforms/__init__.py +++ b/ppocr/modeling/transforms/__init__.py @@ -17,8 +17,9 @@ __all__ = ['build_transform'] def build_transform(config): from .tps import TPS + from .stn import STN_ON - support_dict = ['TPS'] + support_dict 
= ['TPS', 'STN_ON'] module_name = config.pop('name') assert module_name in support_dict, Exception( diff --git a/ppocr/modeling/transforms/stn.py b/ppocr/modeling/transforms/stn.py new file mode 100644 index 0000000000000000000000000000000000000000..215895f4c4c719f407f4998f7429d965e0529ddc --- /dev/null +++ b/ppocr/modeling/transforms/stn.py @@ -0,0 +1,132 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import numpy as np + +from .tps_spatial_transformer import TPSSpatialTransformer + + +def conv3x3_block(in_channels, out_channels, stride=1): + n = 3 * 3 * out_channels + w = math.sqrt(2. / n) + conv_layer = nn.Conv2D( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + weight_attr=nn.initializer.Normal( + mean=0.0, std=w), + bias_attr=nn.initializer.Constant(0)) + block = nn.Sequential(conv_layer, nn.BatchNorm2D(out_channels), nn.ReLU()) + return block + + +class STN(nn.Layer): + def __init__(self, in_channels, num_ctrlpoints, activation='none'): + super(STN, self).__init__() + self.in_channels = in_channels + self.num_ctrlpoints = num_ctrlpoints + self.activation = activation + self.stn_convnet = nn.Sequential( + conv3x3_block(in_channels, 32), #32x64 + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(32, 64), #16x32 + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(64, 128), # 8*16 + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(128, 256), # 4*8 + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(256, 256), # 2*4, + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(256, 256)) # 1*2 + self.stn_fc1 = nn.Sequential( + nn.Linear( + 2 * 256, + 512, + weight_attr=nn.initializer.Normal(0, 0.001), + bias_attr=nn.initializer.Constant(0)), + nn.BatchNorm1D(512), + nn.ReLU()) + fc2_bias = self.init_stn() + self.stn_fc2 = nn.Linear( + 512, + num_ctrlpoints * 2, + weight_attr=nn.initializer.Constant(0.0), + bias_attr=nn.initializer.Assign(fc2_bias)) + + def init_stn(self): + margin = 0.01 + sampling_num_per_side = int(self.num_ctrlpoints / 2) + ctrl_pts_x = np.linspace(margin, 1. - margin, sampling_num_per_side) + ctrl_pts_y_top = np.ones(sampling_num_per_side) * margin + ctrl_pts_y_bottom = np.ones(sampling_num_per_side) * (1 - margin) + ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1) + ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1) + ctrl_points = np.concatenate( + [ctrl_pts_top, ctrl_pts_bottom], axis=0).astype(np.float32) + if self.activation == 'none': + pass + elif self.activation == 'sigmoid': + ctrl_points = -np.log(1. / ctrl_points - 1.) 
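+        # stn_fc2 is created with zero weights and this vector as its bias, so the
+        # initial prediction is exactly this canonical control-point grid; the
+        # inverse-sigmoid above pre-compensates for the sigmoid applied in forward().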
+ ctrl_points = paddle.to_tensor(ctrl_points) + fc2_bias = paddle.reshape( + ctrl_points, shape=[ctrl_points.shape[0] * ctrl_points.shape[1]]) + return fc2_bias + + def forward(self, x): + x = self.stn_convnet(x) + batch_size, _, h, w = x.shape + x = paddle.reshape(x, shape=(batch_size, -1)) + img_feat = self.stn_fc1(x) + x = self.stn_fc2(0.1 * img_feat) + if self.activation == 'sigmoid': + x = F.sigmoid(x) + x = paddle.reshape(x, shape=[-1, self.num_ctrlpoints, 2]) + return img_feat, x + + +class STN_ON(nn.Layer): + def __init__(self, in_channels, tps_inputsize, tps_outputsize, + num_control_points, tps_margins, stn_activation): + super(STN_ON, self).__init__() + self.tps = TPSSpatialTransformer( + output_image_size=tuple(tps_outputsize), + num_control_points=num_control_points, + margins=tuple(tps_margins)) + self.stn_head = STN(in_channels=in_channels, + num_ctrlpoints=num_control_points, + activation=stn_activation) + self.tps_inputsize = tps_inputsize + self.out_channels = in_channels + + def forward(self, image): + stn_input = paddle.nn.functional.interpolate( + image, self.tps_inputsize, mode="bilinear", align_corners=True) + stn_img_feat, ctrl_points = self.stn_head(stn_input) + x, _ = self.tps(image, ctrl_points) + return x diff --git a/ppocr/modeling/transforms/tps.py b/ppocr/modeling/transforms/tps.py index dcce6246ac64b4b84229cbd69a4dc53c658b4c7b..6cd68555369dd1ddbd6ccf5236688a4b957b8525 100644 --- a/ppocr/modeling/transforms/tps.py +++ b/ppocr/modeling/transforms/tps.py @@ -231,7 +231,8 @@ class GridGenerator(nn.Layer): """ Return inv_delta_C which is needed to calculate T """ F = self.F hat_eye = paddle.eye(F, dtype='float64') # F x F - hat_C = paddle.norm(C.reshape([1, F, 2]) - C.reshape([F, 1, 2]), axis=2) + hat_eye + hat_C = paddle.norm( + C.reshape([1, F, 2]) - C.reshape([F, 1, 2]), axis=2) + hat_eye hat_C = (hat_C**2) * paddle.log(hat_C) delta_C = paddle.concat( # F+3 x F+3 [ diff --git a/ppocr/modeling/transforms/tps_spatial_transformer.py b/ppocr/modeling/transforms/tps_spatial_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..b510acb0d4012c9a4d90c7ca07cac895f0bf242e --- /dev/null +++ b/ppocr/modeling/transforms/tps_spatial_transformer.py @@ -0,0 +1,152 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
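
The file added below implements the thin-plate-spline warp used by STN_ON. Its kernel is the TPS radial basis phi(r) = r^2 * log(r), which `compute_partial_repr` (just below) evaluates in the equivalent form 0.5 * d^2 * log(d^2) on squared distances. A small editorial sanity check of that identity, not part of the patch:

```python
import numpy as np

def tps_phi(p, q):
    # 0.5 * d2 * log(d2) on the squared distance d2, as in compute_partial_repr;
    # algebraically equal to r^2 * log(r) with r the Euclidean distance.
    d2 = float(np.sum((np.asarray(p, dtype=np.float64) - np.asarray(q, dtype=np.float64)) ** 2))
    return 0.0 if d2 == 0.0 else 0.5 * d2 * np.log(d2)

r = 0.5  # distance between (0, 0) and (0.3, 0.4)
assert np.isclose(tps_phi([0.0, 0.0], [0.3, 0.4]), r * r * np.log(r))
```
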
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import numpy as np +import itertools + + +def grid_sample(input, grid, canvas=None): + input.stop_gradient = False + output = F.grid_sample(input, grid) + if canvas is None: + return output + else: + input_mask = paddle.ones(shape=input.shape) + output_mask = F.grid_sample(input_mask, grid) + padded_output = output * output_mask + canvas * (1 - output_mask) + return padded_output + + +# phi(x1, x2) = r^2 * log(r), where r = ||x1 - x2||_2 +def compute_partial_repr(input_points, control_points): + N = input_points.shape[0] + M = control_points.shape[0] + pairwise_diff = paddle.reshape( + input_points, shape=[N, 1, 2]) - paddle.reshape( + control_points, shape=[1, M, 2]) + # original implementation, very slow + # pairwise_dist = torch.sum(pairwise_diff ** 2, dim = 2) # square of distance + pairwise_diff_square = pairwise_diff * pairwise_diff + pairwise_dist = pairwise_diff_square[:, :, 0] + pairwise_diff_square[:, :, + 1] + repr_matrix = 0.5 * pairwise_dist * paddle.log(pairwise_dist) + # fix numerical error for 0 * log(0), substitute all nan with 0 + mask = repr_matrix != repr_matrix + repr_matrix[mask] = 0 + return repr_matrix + + +# output_ctrl_pts are specified, according to our task. +def build_output_control_points(num_control_points, margins): + margin_x, margin_y = margins + num_ctrl_pts_per_side = num_control_points // 2 + ctrl_pts_x = np.linspace(margin_x, 1.0 - margin_x, num_ctrl_pts_per_side) + ctrl_pts_y_top = np.ones(num_ctrl_pts_per_side) * margin_y + ctrl_pts_y_bottom = np.ones(num_ctrl_pts_per_side) * (1.0 - margin_y) + ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1) + ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1) + output_ctrl_pts_arr = np.concatenate( + [ctrl_pts_top, ctrl_pts_bottom], axis=0) + output_ctrl_pts = paddle.to_tensor(output_ctrl_pts_arr) + return output_ctrl_pts + + +class TPSSpatialTransformer(nn.Layer): + def __init__(self, + output_image_size=None, + num_control_points=None, + margins=None): + super(TPSSpatialTransformer, self).__init__() + self.output_image_size = output_image_size + self.num_control_points = num_control_points + self.margins = margins + + self.target_height, self.target_width = output_image_size + target_control_points = build_output_control_points(num_control_points, + margins) + N = num_control_points + + # create padded kernel matrix + forward_kernel = paddle.zeros(shape=[N + 3, N + 3]) + target_control_partial_repr = compute_partial_repr( + target_control_points, target_control_points) + target_control_partial_repr = paddle.cast(target_control_partial_repr, + forward_kernel.dtype) + forward_kernel[:N, :N] = target_control_partial_repr + forward_kernel[:N, -3] = 1 + forward_kernel[-3, :N] = 1 + target_control_points = paddle.cast(target_control_points, + forward_kernel.dtype) + forward_kernel[:N, -2:] = target_control_points + forward_kernel[-2:, :N] = paddle.transpose( + target_control_points, perm=[1, 0]) + # compute inverse matrix + inverse_kernel = paddle.inverse(forward_kernel) + + # create target cordinate matrix + HW = self.target_height * self.target_width + target_coordinate = list( + itertools.product( + range(self.target_height), range(self.target_width))) + target_coordinate = paddle.to_tensor(target_coordinate) # HW x 2 + Y, X = paddle.split( + target_coordinate, 
target_coordinate.shape[1], axis=1) + Y = Y / (self.target_height - 1) + X = X / (self.target_width - 1) + target_coordinate = paddle.concat( + [X, Y], axis=1) # convert from (y, x) to (x, y) + target_coordinate_partial_repr = compute_partial_repr( + target_coordinate, target_control_points) + target_coordinate_repr = paddle.concat( + [ + target_coordinate_partial_repr, paddle.ones(shape=[HW, 1]), + target_coordinate + ], + axis=1) + + # register precomputed matrices + self.inverse_kernel = inverse_kernel + self.padding_matrix = paddle.zeros(shape=[3, 2]) + self.target_coordinate_repr = target_coordinate_repr + self.target_control_points = target_control_points + + def forward(self, input, source_control_points): + assert source_control_points.ndimension() == 3 + assert source_control_points.shape[1] == self.num_control_points + assert source_control_points.shape[2] == 2 + batch_size = paddle.shape(source_control_points)[0] + + self.padding_matrix = paddle.expand( + self.padding_matrix, shape=[batch_size, 3, 2]) + Y = paddle.concat([source_control_points, self.padding_matrix], 1) + mapping_matrix = paddle.matmul(self.inverse_kernel, Y) + source_coordinate = paddle.matmul(self.target_coordinate_repr, + mapping_matrix) + + grid = paddle.reshape( + source_coordinate, + shape=[-1, self.target_height, self.target_width, 2]) + grid = paddle.clip(grid, 0, + 1) # the source_control_points may be out of [0, 1]. + # the input to grid_sample is normalized [-1, 1], but what we get is [0, 1] + grid = 2.0 * grid - 1.0 + output_maps = grid_sample(input, grid, canvas=None) + return output_maps, source_coordinate diff --git a/ppocr/optimizer/optimizer.py b/ppocr/optimizer/optimizer.py index 8215b92d8c8d05c2b3c2e95ac989bf4ea011310b..34098c0fad553f7d39f6b5341e4da70a263eeaea 100644 --- a/ppocr/optimizer/optimizer.py +++ b/ppocr/optimizer/optimizer.py @@ -127,3 +127,34 @@ class RMSProp(object): grad_clip=self.grad_clip, parameters=parameters) return opt + + +class Adadelta(object): + def __init__(self, + learning_rate=0.001, + epsilon=1e-08, + rho=0.95, + parameter_list=None, + weight_decay=None, + grad_clip=None, + name=None, + **kwargs): + self.learning_rate = learning_rate + self.epsilon = epsilon + self.rho = rho + self.parameter_list = parameter_list + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.name = name + + def __call__(self, parameters): + opt = optim.Adadelta( + learning_rate=self.learning_rate, + epsilon=self.epsilon, + rho=self.rho, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + name=self.name, + parameters=parameters) + return opt diff --git a/ppocr/postprocess/__init__.py b/ppocr/postprocess/__init__.py index 654ddf39d23590fbaf7f7b9b57f38cc86a1b6669..3a4ebf52a3bd91ffd509b113103dab900588b0bd 100644 --- a/ppocr/postprocess/__init__.py +++ b/ppocr/postprocess/__init__.py @@ -18,6 +18,7 @@ from __future__ import print_function from __future__ import unicode_literals import copy +import platform __all__ = ['build_post_process'] @@ -25,17 +26,22 @@ from .db_postprocess import DBPostProcess, DistillationDBPostProcess from .east_postprocess import EASTPostProcess from .sast_postprocess import SASTPostProcess from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, DistillationCTCLabelDecode, \ - TableLabelDecode + TableLabelDecode, NRTRLabelDecode, SARLabelDecode , SEEDLabelDecode from .cls_postprocess import ClsPostProcess from .pg_postprocess import PGPostProcess +if platform.system() != "Windows": + # 
pse is not support in Windows + from .pse_postprocess import PSEPostProcess + def build_post_process(config, global_config=None): support_dict = [ - 'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'CTCLabelDecode', - 'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode', 'PGPostProcess', - 'DistillationCTCLabelDecode', 'TableLabelDecode', - 'DistillationDBPostProcess' + 'DBPostProcess', 'PSEPostProcess', 'EASTPostProcess', 'SASTPostProcess', + 'CTCLabelDecode', 'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode', + 'PGPostProcess', 'DistillationCTCLabelDecode', 'TableLabelDecode', + 'DistillationDBPostProcess', 'NRTRLabelDecode', 'SARLabelDecode', + 'SEEDLabelDecode' ] config = copy.deepcopy(config) diff --git a/ppocr/postprocess/pse_postprocess/__init__.py b/ppocr/postprocess/pse_postprocess/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..680473bf4b1863ac695dc8173778e59bd4fdacf9 --- /dev/null +++ b/ppocr/postprocess/pse_postprocess/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pse_postprocess import PSEPostProcess \ No newline at end of file diff --git a/ppocr/postprocess/pse_postprocess/pse/README.md b/ppocr/postprocess/pse_postprocess/pse/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9c2d9eaeaa5f93550358ebdd4d9161330b78a86f --- /dev/null +++ b/ppocr/postprocess/pse_postprocess/pse/README.md @@ -0,0 +1,5 @@ +## 编译 +code from https://github.com/whai362/pan_pp.pytorch +```python +python3 setup.py build_ext --inplace +``` diff --git a/ppocr/postprocess/pse_postprocess/pse/__init__.py b/ppocr/postprocess/pse_postprocess/pse/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97b8d8aff0cf229a4e3ec1961638273bd201822a --- /dev/null +++ b/ppocr/postprocess/pse_postprocess/pse/__init__.py @@ -0,0 +1,23 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
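
The package `__init__.py` that follows compiles the Cython `pse` extension in place on first import by shelling out to `setup.py build_ext --inplace`, the same command given in the pse README above. A minimal editorial sketch (not part of the patch) of running that build once ahead of time, assuming the repository root as the working directory and that Cython, numpy and a C++ toolchain are available:

```python
import subprocess
import sys

# Pre-build the pse extension so the import-time compile below finds it already built;
# equivalent to: cd ppocr/postprocess/pse_postprocess/pse && python3 setup.py build_ext --inplace
subprocess.check_call(
    [sys.executable, "setup.py", "build_ext", "--inplace"],
    cwd="ppocr/postprocess/pse_postprocess/pse",
)
```
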
+import sys +import os +import subprocess + +python_path = sys.executable + +if subprocess.call('cd ppocr/postprocess/pse_postprocess/pse;{} setup.py build_ext --inplace;cd -'.format(python_path), shell=True) != 0: + raise RuntimeError('Cannot compile pse: {}'.format(os.path.dirname(os.path.realpath(__file__)))) + +from .pse import pse \ No newline at end of file diff --git a/ppocr/postprocess/pse_postprocess/pse/pse.pyx b/ppocr/postprocess/pse_postprocess/pse/pse.pyx new file mode 100644 index 0000000000000000000000000000000000000000..b2be49e9471865c11b840207f922258e67a554b6 --- /dev/null +++ b/ppocr/postprocess/pse_postprocess/pse/pse.pyx @@ -0,0 +1,70 @@ + +import numpy as np +import cv2 +cimport numpy as np +cimport cython +cimport libcpp +cimport libcpp.pair +cimport libcpp.queue +from libcpp.pair cimport * +from libcpp.queue cimport * + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef np.ndarray[np.int32_t, ndim=2] _pse(np.ndarray[np.uint8_t, ndim=3] kernels, + np.ndarray[np.int32_t, ndim=2] label, + int kernel_num, + int label_num, + float min_area=0): + cdef np.ndarray[np.int32_t, ndim=2] pred + pred = np.zeros((label.shape[0], label.shape[1]), dtype=np.int32) + + for label_idx in range(1, label_num): + if np.sum(label == label_idx) < min_area: + label[label == label_idx] = 0 + + cdef libcpp.queue.queue[libcpp.pair.pair[np.int16_t,np.int16_t]] que = \ + queue[libcpp.pair.pair[np.int16_t,np.int16_t]]() + cdef libcpp.queue.queue[libcpp.pair.pair[np.int16_t,np.int16_t]] nxt_que = \ + queue[libcpp.pair.pair[np.int16_t,np.int16_t]]() + cdef np.int16_t* dx = [-1, 1, 0, 0] + cdef np.int16_t* dy = [0, 0, -1, 1] + cdef np.int16_t tmpx, tmpy + + points = np.array(np.where(label > 0)).transpose((1, 0)) + for point_idx in range(points.shape[0]): + tmpx, tmpy = points[point_idx, 0], points[point_idx, 1] + que.push(pair[np.int16_t,np.int16_t](tmpx, tmpy)) + pred[tmpx, tmpy] = label[tmpx, tmpy] + + cdef libcpp.pair.pair[np.int16_t,np.int16_t] cur + cdef int cur_label + for kernel_idx in range(kernel_num - 1, -1, -1): + while not que.empty(): + cur = que.front() + que.pop() + cur_label = pred[cur.first, cur.second] + + is_edge = True + for j in range(4): + tmpx = cur.first + dx[j] + tmpy = cur.second + dy[j] + if tmpx < 0 or tmpx >= label.shape[0] or tmpy < 0 or tmpy >= label.shape[1]: + continue + if kernels[kernel_idx, tmpx, tmpy] == 0 or pred[tmpx, tmpy] > 0: + continue + + que.push(pair[np.int16_t,np.int16_t](tmpx, tmpy)) + pred[tmpx, tmpy] = cur_label + is_edge = False + if is_edge: + nxt_que.push(cur) + + que, nxt_que = nxt_que, que + + return pred + +def pse(kernels, min_area): + kernel_num = kernels.shape[0] + label_num, label = cv2.connectedComponents(kernels[-1], connectivity=4) + return _pse(kernels[:-1], label, kernel_num, label_num, min_area) \ No newline at end of file diff --git a/ppocr/postprocess/pse_postprocess/pse/setup.py b/ppocr/postprocess/pse_postprocess/pse/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..03746782af791938bff31c24e4a760f566c73b49 --- /dev/null +++ b/ppocr/postprocess/pse_postprocess/pse/setup.py @@ -0,0 +1,14 @@ +from distutils.core import setup, Extension +from Cython.Build import cythonize +import numpy + +setup(ext_modules=cythonize(Extension( + 'pse', + sources=['pse.pyx'], + language='c++', + include_dirs=[numpy.get_include()], + library_dirs=[], + libraries=[], + extra_compile_args=['-O3'], + extra_link_args=[] +))) diff --git a/ppocr/postprocess/pse_postprocess/pse_postprocess.py 
b/ppocr/postprocess/pse_postprocess/pse_postprocess.py new file mode 100755 index 0000000000000000000000000000000000000000..4b89d221d284602933ab3d4f21468fcae79ef310 --- /dev/null +++ b/ppocr/postprocess/pse_postprocess/pse_postprocess.py @@ -0,0 +1,112 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import cv2 +import paddle +from paddle.nn import functional as F + +from ppocr.postprocess.pse_postprocess.pse import pse + + +class PSEPostProcess(object): + """ + The post process for PSE. + """ + + def __init__(self, + thresh=0.5, + box_thresh=0.85, + min_area=16, + box_type='box', + scale=4, + **kwargs): + assert box_type in ['box', 'poly'], 'Only box and poly is supported' + self.thresh = thresh + self.box_thresh = box_thresh + self.min_area = min_area + self.box_type = box_type + self.scale = scale + + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + if not isinstance(pred, paddle.Tensor): + pred = paddle.to_tensor(pred) + pred = F.interpolate(pred, scale_factor=4 // self.scale, mode='bilinear') + + score = F.sigmoid(pred[:, 0, :, :]) + + kernels = (pred > self.thresh).astype('float32') + text_mask = kernels[:, 0, :, :] + kernels[:, 0:, :, :] = kernels[:, 0:, :, :] * text_mask + + score = score.numpy() + kernels = kernels.numpy().astype(np.uint8) + + boxes_batch = [] + for batch_index in range(pred.shape[0]): + boxes, scores = self.boxes_from_bitmap(score[batch_index], kernels[batch_index], shape_list[batch_index]) + + boxes_batch.append({'points': boxes, 'scores': scores}) + return boxes_batch + + def boxes_from_bitmap(self, score, kernels, shape): + label = pse(kernels, self.min_area) + return self.generate_box(score, label, shape) + + def generate_box(self, score, label, shape): + src_h, src_w, ratio_h, ratio_w = shape + label_num = np.max(label) + 1 + + boxes = [] + scores = [] + for i in range(1, label_num): + ind = label == i + points = np.array(np.where(ind)).transpose((1, 0))[:, ::-1] + + if points.shape[0] < self.min_area: + label[ind] = 0 + continue + + score_i = np.mean(score[ind]) + if score_i < self.box_thresh: + label[ind] = 0 + continue + + if self.box_type == 'box': + rect = cv2.minAreaRect(points) + bbox = cv2.boxPoints(rect) + elif self.box_type == 'poly': + box_height = np.max(points[:, 1]) + 10 + box_width = np.max(points[:, 0]) + 10 + + mask = np.zeros((box_height, box_width), np.uint8) + mask[points[:, 1], points[:, 0]] = 255 + + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + bbox = np.squeeze(contours[0], 1) + else: + raise NotImplementedError + + bbox[:, 0] = np.clip( + np.round(bbox[:, 0] / ratio_w), 0, src_w) + bbox[:, 1] = np.clip( + np.round(bbox[:, 1] / ratio_h), 0, src_h) + boxes.append(bbox) + scores.append(score_i) + return boxes, scores diff --git a/ppocr/postprocess/rec_postprocess.py 
b/ppocr/postprocess/rec_postprocess.py index 8ebe5b2741b77537b46b8057d9aa9c36dc99aeec..ef1a43fd0ee65f3e55a8f72dfd2f96c478da1a9a 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -15,38 +15,21 @@ import numpy as np import string import paddle from paddle.nn import functional as F +import re class BaseRecLabelDecode(object): """ Convert between text-label and text-index """ - def __init__(self, - character_dict_path=None, - character_type='ch', - use_space_char=False): - support_character_type = [ - 'ch', 'en', 'EN_symbol', 'french', 'german', 'japan', 'korean', - 'it', 'xi', 'pu', 'ru', 'ar', 'ta', 'ug', 'fa', 'ur', 'rs', 'oc', - 'rsc', 'bg', 'uk', 'be', 'te', 'ka', 'chinese_cht', 'hi', 'mr', - 'ne', 'EN', 'latin', 'arabic', 'cyrillic', 'devanagari' - ] - assert character_type in support_character_type, "Only {} are supported now but get {}".format( - support_character_type, character_type) - + def __init__(self, character_dict_path=None, use_space_char=False): self.beg_str = "sos" self.end_str = "eos" - if character_type == "en": + self.character_str = [] + if character_dict_path is None: self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" dict_character = list(self.character_str) - elif character_type == "EN_symbol": - # same with ASTER setting (use 94 char). - self.character_str = string.printable[:-6] - dict_character = list(self.character_str) - elif character_type in support_character_type: - self.character_str = [] - assert character_dict_path is not None, "character_dict_path should not be None when character_type is {}".format( - character_type) + else: with open(character_dict_path, "rb") as fin: lines = fin.readlines() for line in lines: @@ -56,9 +39,6 @@ class BaseRecLabelDecode(object): self.character_str.append(" ") dict_character = list(self.character_str) - else: - raise NotImplementedError - self.character_type = character_type dict_character = self.add_special_char(dict_character) self.dict = {} for i, char in enumerate(dict_character): @@ -101,15 +81,14 @@ class BaseRecLabelDecode(object): class CTCLabelDecode(BaseRecLabelDecode): """ Convert between text-label and text-index """ - def __init__(self, - character_dict_path=None, - character_type='ch', - use_space_char=False, + def __init__(self, character_dict_path=None, use_space_char=False, **kwargs): super(CTCLabelDecode, self).__init__(character_dict_path, - character_type, use_space_char) + use_space_char) def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, tuple): + preds = preds[-1] if isinstance(preds, paddle.Tensor): preds = preds.numpy() preds_idx = preds.argmax(axis=2) @@ -133,13 +112,12 @@ class DistillationCTCLabelDecode(CTCLabelDecode): def __init__(self, character_dict_path=None, - character_type='ch', use_space_char=False, model_name=["student"], key=None, **kwargs): - super(DistillationCTCLabelDecode, self).__init__( - character_dict_path, character_type, use_space_char) + super(DistillationCTCLabelDecode, self).__init__(character_dict_path, + use_space_char) if not isinstance(model_name, list): model_name = [model_name] self.model_name = model_name @@ -156,16 +134,77 @@ class DistillationCTCLabelDecode(CTCLabelDecode): return output +class NRTRLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=True, **kwargs): + super(NRTRLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, 
*args, **kwargs): + + if len(preds) == 2: + preds_id = preds[0] + preds_prob = preds[1] + if isinstance(preds_id, paddle.Tensor): + preds_id = preds_id.numpy() + if isinstance(preds_prob, paddle.Tensor): + preds_prob = preds_prob.numpy() + if preds_id[0][0] == 2: + preds_idx = preds_id[:, 1:] + preds_prob = preds_prob[:, 1:] + else: + preds_idx = preds_id + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + else: + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['blank', '', '', ''] + dict_character + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. """ + result_list = [] + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] == 3: # end + break + try: + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + except: + continue + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text.lower(), np.mean(conf_list))) + return result_list + + class AttnLabelDecode(BaseRecLabelDecode): """ Convert between text-label and text-index """ - def __init__(self, - character_dict_path=None, - character_type='ch', - use_space_char=False, + def __init__(self, character_dict_path=None, use_space_char=False, **kwargs): super(AttnLabelDecode, self).__init__(character_dict_path, - character_type, use_space_char) + use_space_char) def add_special_char(self, dict_character): self.beg_str = "sos" @@ -239,16 +278,91 @@ class AttnLabelDecode(BaseRecLabelDecode): return idx +class SEEDLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SEEDLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = dict_character + [self.end_str] + return dict_character + + def get_ignored_tokens(self): + end_idx = self.get_beg_end_flag_idx("eos") + return [end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "sos": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "eos": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" % beg_or_end + return idx + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + [end_idx] = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if int(text_index[batch_idx][idx]) == int(end_idx): + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list))) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + """ + text = self.decode(text) + if label is None: + return text + else: + label = self.decode(label, is_remove_duplicate=False) + return text, label + """ + preds_idx = preds["rec_pred"] + if isinstance(preds_idx, paddle.Tensor): + preds_idx = preds_idx.numpy() + if "rec_pred_scores" in preds: + preds_idx = preds["rec_pred"] + preds_prob = preds["rec_pred_scores"] + else: + preds_idx = preds["rec_pred"].argmax(axis=2) + preds_prob = preds["rec_pred"].max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + class SRNLabelDecode(BaseRecLabelDecode): """ Convert between text-label and text-index """ - def __init__(self, - character_dict_path=None, - character_type='en', - use_space_char=False, + def __init__(self, character_dict_path=None, use_space_char=False, **kwargs): super(SRNLabelDecode, self).__init__(character_dict_path, - character_type, use_space_char) + use_space_char) self.max_text_length = kwargs.get('max_text_length', 25) def __call__(self, preds, label=None, *args, **kwargs): @@ -324,10 +438,9 @@ class SRNLabelDecode(BaseRecLabelDecode): class TableLabelDecode(object): """ """ - def __init__(self, - character_dict_path, - **kwargs): - list_character, list_elem = self.load_char_elem_dict(character_dict_path) + def __init__(self, character_dict_path, **kwargs): + list_character, list_elem = self.load_char_elem_dict( + character_dict_path) list_character = self.add_special_char(list_character) list_elem = self.add_special_char(list_elem) self.dict_character = {} @@ -346,7 +459,8 @@ class TableLabelDecode(object): list_elem = [] with open(character_dict_path, "rb") as fin: lines = fin.readlines() - substr = lines[0].decode('utf-8').strip("\n").strip("\r\n").split("\t") + substr = lines[0].decode('utf-8').strip("\n").strip("\r\n").split( + "\t") character_num = int(substr[0]) elem_num = int(substr[1]) for cno in range(1, 1 + character_num): @@ -366,14 +480,14 @@ class TableLabelDecode(object): def __call__(self, preds): structure_probs = preds['structure_probs'] loc_preds = preds['loc_preds'] - if isinstance(structure_probs,paddle.Tensor): + if isinstance(structure_probs, paddle.Tensor): structure_probs = structure_probs.numpy() - if isinstance(loc_preds,paddle.Tensor): + if isinstance(loc_preds, paddle.Tensor): loc_preds = loc_preds.numpy() structure_idx = structure_probs.argmax(axis=2) structure_probs = structure_probs.max(axis=2) - structure_str, structure_pos, result_score_list, result_elem_idx_list = self.decode(structure_idx, - structure_probs, 'elem') + structure_str, structure_pos, result_score_list, result_elem_idx_list = self.decode( + structure_idx, structure_probs, 'elem') res_html_code_list = 
[] res_loc_list = [] batch_num = len(structure_str) @@ -388,8 +502,13 @@ class TableLabelDecode(object): res_loc = np.array(res_loc) res_html_code_list.append(res_html_code) res_loc_list.append(res_loc) - return {'res_html_code': res_html_code_list, 'res_loc': res_loc_list, 'res_score_list': result_score_list, - 'res_elem_idx_list': result_elem_idx_list,'structure_str_list':structure_str} + return { + 'res_html_code': res_html_code_list, + 'res_loc': res_loc_list, + 'res_score_list': result_score_list, + 'res_elem_idx_list': result_elem_idx_list, + 'structure_str_list': structure_str + } def decode(self, text_index, structure_probs, char_or_elem): """convert text-label into text-index. @@ -454,3 +573,79 @@ class TableLabelDecode(object): assert False, "Unsupport type %s in char_or_elem" \ % char_or_elem return idx + + +class SARLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SARLabelDecode, self).__init__(character_dict_path, + use_space_char) + + self.rm_symbol = kwargs.get('rm_symbol', False) + + def add_special_char(self, dict_character): + beg_end_str = "" + unknown_str = "" + padding_str = "" + dict_character = dict_character + [unknown_str] + self.unknown_idx = len(dict_character) - 1 + dict_character = dict_character + [beg_end_str] + self.start_idx = len(dict_character) - 1 + self.end_idx = len(dict_character) - 1 + dict_character = dict_character + [padding_str] + self.padding_idx = len(dict_character) - 1 + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. """ + result_list = [] + ignored_tokens = self.get_ignored_tokens() + + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(self.end_idx): + if text_prob is None and idx == 0: + continue + else: + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + if self.rm_symbol: + comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') + text = text.lower() + text = comp.sub('', text) + result_list.append((text, np.mean(conf_list))) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def get_ignored_tokens(self): + return [self.padding_idx] diff --git a/ppocr/utils/EN_symbol_dict.txt b/ppocr/utils/EN_symbol_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..1aef43d6b842731a54cbe682ccda5c2dbfa694d9 --- /dev/null +++ b/ppocr/utils/EN_symbol_dict.txt @@ -0,0 +1,94 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +! 
+" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +: +; +< += +> +? +@ +[ +\ +] +^ +_ +` +{ +| +} +~ \ No newline at end of file diff --git a/ppocr/utils/dict90.txt b/ppocr/utils/dict90.txt new file mode 100644 index 0000000000000000000000000000000000000000..a945ae9c526e4faa68852eb3fb47d078a2f3f6ce --- /dev/null +++ b/ppocr/utils/dict90.txt @@ -0,0 +1,90 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +: +; +< += +> +? +@ +[ +\ +] +_ +` +~ \ No newline at end of file diff --git a/ppocr/utils/iou.py b/ppocr/utils/iou.py new file mode 100644 index 0000000000000000000000000000000000000000..20529dee2d14083f3de4ac034668d004136c56e2 --- /dev/null +++ b/ppocr/utils/iou.py @@ -0,0 +1,48 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +EPS = 1e-6 + +def iou_single(a, b, mask, n_class): + valid = mask == 1 + a = a.masked_select(valid) + b = b.masked_select(valid) + miou = [] + for i in range(n_class): + if a.shape == [0] and a.shape==b.shape: + inter = paddle.to_tensor(0.0) + union = paddle.to_tensor(0.0) + else: + inter = ((a == i).logical_and(b == i)).astype('float32') + union = ((a == i).logical_or(b == i)).astype('float32') + miou.append(paddle.sum(inter) / (paddle.sum(union) + EPS)) + miou = sum(miou) / len(miou) + return miou + +def iou(a, b, mask, n_class=2, reduce=True): + batch_size = a.shape[0] + + a = a.reshape([batch_size, -1]) + b = b.reshape([batch_size, -1]) + mask = mask.reshape([batch_size, -1]) + + iou = paddle.zeros((batch_size,), dtype='float32') + for i in range(batch_size): + iou[i] = iou_single(a[i], b[i], mask[i], n_class) + + if reduce: + iou = paddle.mean(iou) + return iou \ No newline at end of file diff --git a/ppocr/utils/profiler.py b/ppocr/utils/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..c4e28bc6bea9ca912a0786d879a48ec0349e7698 --- /dev/null +++ b/ppocr/utils/profiler.py @@ -0,0 +1,110 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. 
+_profiler_options = None + + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. + ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. + ''' + if options_str is None: + return + + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + + if _profiler_step_id == _profiler_options['batch_range'][0]: + paddle.utils.profiler.start_profiler( + _profiler_options['state'], _profiler_options['tracer_option']) + elif _profiler_step_id == _profiler_options['batch_range'][1]: + paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], + _profiler_options['profile_path']) + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/ppocr/utils/save_load.py b/ppocr/utils/save_load.py index 3bb022ed98b140995b79ceea93d7f494d3f5930d..a7d24dd71a6e35ca619c2a3f90df3a202b8ad94b 100644 --- a/ppocr/utils/save_load.py +++ b/ppocr/utils/save_load.py @@ -108,14 +108,15 @@ def load_dygraph_params(config, model, logger, optimizer): for k1, k2 in zip(state_dict.keys(), params.keys()): if list(state_dict[k1].shape) == list(params[k2].shape): new_state_dict[k1] = params[k2] - else: - logger.info( - f"The shape of model params {k1} {state_dict[k1].shape} not matched with loaded params {k2} {params[k2].shape} !" 
- ) + else: + logger.info( + f"The shape of model params {k1} {state_dict[k1].shape} not matched with loaded params {k2} {params[k2].shape} !" + ) model.set_state_dict(new_state_dict) logger.info(f"loaded pretrained_model successful from {pm}") return {} + def load_pretrained_params(model, path): if path is None: return False @@ -138,6 +139,7 @@ def load_pretrained_params(model, path): print(f"load pretrain successful from {path}") return model + def save_model(model, optimizer, model_path, diff --git a/ppstructure/README.md b/ppstructure/README.md index 8e1642cc75cc52b179d0f8441a8da2fe86e78d7b..849c5c5667ff0532dfee35479715880192df0dc5 100644 --- a/ppstructure/README.md +++ b/ppstructure/README.md @@ -30,13 +30,13 @@ python3 -m pip install paddlepaddle-gpu==2.1.1 -i https://mirror.baidu.com/pypi/ # CPU python3 -m pip install paddlepaddle==2.1.1 -i https://mirror.baidu.com/pypi/simple -# For more,refer[Installation](https://www.paddlepaddle.org.cn/install/quick)。 ``` +For more,refer [Installation](https://www.paddlepaddle.org.cn/install/quick) . - **(2) Install Layout-Parser** ```bash -pip3 install -U premailer paddleocr https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl +pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl ``` ### 2.2 Install PaddleOCR(including PP-OCR and PP-Structure) @@ -124,8 +124,6 @@ Most of the parameters are consistent with the paddleocr whl package, see [doc o After running, each image will have a directory with the same name under the directory specified in the output field. Each table in the picture will be stored as an excel and figure area will be cropped and saved, the excel and image file name will be the coordinates of the table in the image. ## 4. PP-Structure Pipeline - -the process is as follows ![pipeline](../doc/table/pipeline_en.jpg) In PP-Structure, the image will be analyzed by layoutparser first. In the layout analysis, the area in the image will be classified, including **text, title, image, list and table** 5 categories. For the first 4 types of areas, directly use the PP-OCR to complete the text detection and recognition. The table area will be converted to an excel file of the same table style via Table OCR. 
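To make the pipeline description above concrete, here is a minimal, non-authoritative sketch of driving PP-Structure from Python through the paddleocr wheel referenced earlier in this README. The `PPStructure` class, the `save_structure_res` helper, and the sample image path are assumptions about the installed wheel and may need to be adapted to your version.

```python
# Hedged sketch: run layout analysis + OCR + table recognition on one image.
# `PPStructure` and `save_structure_res` are assumed to be exported by the
# paddleocr wheel; the image path is a placeholder.
import os
import cv2
from paddleocr import PPStructure, save_structure_res

table_engine = PPStructure(show_log=True)

img_path = 'doc/table/table.jpg'  # placeholder test image
img = cv2.imread(img_path)
result = table_engine(img)

# Persist per-region results (cropped figure areas, one excel file per table).
save_structure_res(result, './output/table',
                   os.path.splitext(os.path.basename(img_path))[0])

# Each region reports the layout class assigned by layout analysis plus its box.
for region in result:
    print(region['type'], region['bbox'])
```

The resulting `output` directory layout matches what the quick start above describes: one sub-directory per image, containing an excel file for each table and a crop for each figure region.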
@@ -180,10 +178,10 @@ OCR and table recognition model |model name|description|model size|download| | --- | --- | --- | --- | -|ch_ppocr_mobile_slim_v2.0_det|Slim pruned lightweight model, supporting Chinese, English, multilingual text detection|2.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar) | -|ch_ppocr_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting Chinese, English and number recognition|6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) | -|en_ppocr_mobile_v2.0_table_det|Text detection of English table scenes trained on PubLayNet dataset|4.7M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) | -|en_ppocr_mobile_v2.0_table_rec|Text recognition of English table scene trained on PubLayNet dataset|6.9M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) | -|en_ppocr_mobile_v2.0_table_structure|Table structure prediction of English table scene trained on PubLayNet dataset|18.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) | +|ch_ppocr_mobile_slim_v2.0_det|Slim pruned lightweight model, supporting Chinese, English, multilingual text detection|2.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar) | +|ch_ppocr_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting Chinese, English and number recognition|6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) | +|en_ppocr_mobile_v2.0_table_det|Text detection of English table scenes trained on PubLayNet dataset|4.7M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) | +|en_ppocr_mobile_v2.0_table_rec|Text recognition of English table scene trained on PubLayNet dataset|6.9M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) | +|en_ppocr_mobile_v2.0_table_structure|Table structure prediction of English table scene trained on PubLayNet dataset|18.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | If you need to use other models, you can download the model in [model_list](../doc/doc_en/models_list_en.md) or use your own trained model to configure it to the three fields of `det_model_dir`, `rec_model_dir`, `table_model_dir` . 
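When one of the models in the table above is replaced by a downloaded or self-trained one, the same three fields can also be set programmatically instead of on the command line. The following is a hedged sketch, assuming the wheel's keyword arguments mirror the `det_model_dir` / `rec_model_dir` / `table_model_dir` flags and that the inference archives from the table have been unpacked into `./inference/`.

```python
# Hedged sketch: point PP-Structure at explicit inference model directories.
# The keyword names are assumed to match the command-line flags of the same name.
from paddleocr import PPStructure

table_engine = PPStructure(
    det_model_dir='./inference/en_ppocr_mobile_v2.0_table_det_infer',
    rec_model_dir='./inference/en_ppocr_mobile_v2.0_table_rec_infer',
    table_model_dir='./inference/en_ppocr_mobile_v2.0_table_structure_infer',
    show_log=True)
# Afterwards, table_engine(img) behaves exactly as in the sketch above.
```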
diff --git a/ppstructure/README_ch.md b/ppstructure/README_ch.md index c8acac590039647cf52f47b16a99092ff68f2b6e..821a6c3e36361abefa4d754537fdbd694e844efe 100644 --- a/ppstructure/README_ch.md +++ b/ppstructure/README_ch.md @@ -30,13 +30,13 @@ python3 -m pip install paddlepaddle-gpu==2.1.1 -i https://mirror.baidu.com/pypi/ # CPU安装 python3 -m pip install paddlepaddle==2.1.1 -i https://mirror.baidu.com/pypi/simple -# 更多需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 ``` +更多需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 - **(2) 安装 Layout-Parser** ```bash -pip3 install -U premailer paddleocr https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl +pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl ``` ### 2.2 安装PaddleOCR(包含PP-OCR和PP-Structure) @@ -179,10 +179,10 @@ OCR和表格识别模型 |模型名称|模型简介|推理模型大小|下载地址| | --- | --- | --- | --- | -|ch_ppocr_mobile_slim_v2.0_det|slim裁剪版超轻量模型,支持中英文、多语种文本检测|2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar) | -|ch_ppocr_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型,支持中英文、数字识别|6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) | -|en_ppocr_mobile_v2.0_table_det|PubLayNet数据集训练的英文表格场景的文字检测|4.7M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) | -|en_ppocr_mobile_v2.0_table_rec|PubLayNet数据集训练的英文表格场景的文字识别|6.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) | -|en_ppocr_mobile_v2.0_table_structure|PubLayNet数据集训练的英文表格场景的表格结构预测|18.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) | +|ch_ppocr_mobile_slim_v2.0_det|slim裁剪版超轻量模型,支持中英文、多语种文本检测|2.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar) | +|ch_ppocr_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型,支持中英文、数字识别|6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) | +|en_ppocr_mobile_v2.0_table_det|PubLayNet数据集训练的英文表格场景的文字检测|4.7M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) | +|en_ppocr_mobile_v2.0_table_rec|PubLayNet数据集训练的英文表格场景的文字识别|6.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) | +|en_ppocr_mobile_v2.0_table_structure|PubLayNet数据集训练的英文表格场景的表格结构预测|18.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | 如需要使用其他模型,可以在 [model_list](../doc/doc_ch/models_list.md) 下载模型或者使用自己训练好的模型配置到`det_model_dir`,`rec_model_dir`,`table_model_dir`三个字段即可。 diff --git a/ppstructure/layout/train_layoutparser_model.md b/ppstructure/layout/train_layoutparser_model.md index 08f5ebbf1aa276e4a3ecf27af46442161afcda1f..58975d71606e45b2f68a7f68565459042ef32775 100644 --- a/ppstructure/layout/train_layoutparser_model.md +++ 
b/ppstructure/layout/train_layoutparser_model.md @@ -4,9 +4,9 @@ ​ [1.1 Requirements](#Requirements) -​ [1.2 Install PaddleDetection](#Install PaddleDetection) +​ [1.2 Install PaddleDetection](#Install_PaddleDetection) -[2. Data preparation](#Data preparation) +[2. Data preparation](#Data_reparation) [3. Configuration](#Configuration) @@ -16,7 +16,7 @@ [6. Deployment](#Deployment) -​ [6.1 Export model](#Export model) +​ [6.1 Export model](#Export_model) ​ [6.2 Inference](#Inference) @@ -35,7 +35,7 @@ - CUDA >= 10.1 - cuDNN >= 7.6 - + ### 1.2 Install PaddleDetection @@ -51,7 +51,7 @@ pip install -r requirements.txt For more installation tutorials, please refer to: [Install doc](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/docs/tutorials/INSTALL_cn.md) - + ## 2. Data preparation @@ -165,7 +165,7 @@ python tools/infer.py -c configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml --infer Use your trained model in Layout Parser - + ### 6.1 Export model diff --git a/ppstructure/table/README.md b/ppstructure/table/README.md index a8d10b79e507ab59ef2481982a33902e4a95e73e..67c4d8e26d5c615f4a930752005420ba1abcc834 100644 --- a/ppstructure/table/README.md +++ b/ppstructure/table/README.md @@ -41,7 +41,7 @@ wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_tab wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar cd .. # run -python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=../doc/table/table.jpg --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=ch --det_limit_side_len=736 --det_limit_type=min --output ../output/table +python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=../doc/table/table.jpg --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --output ../output/table ``` Note: The above model is trained on the PubLayNet dataset and only supports English scanning scenarios. If you need to identify other scenarios, you need to train the model yourself and replace the three fields `det_model_dir`, `rec_model_dir`, `table_model_dir`. diff --git a/ppstructure/table/README_ch.md b/ppstructure/table/README_ch.md index 2ded403c371984a447f94268d23ca1c6240cf432..2e90ad33423da347b5a51444f2be53ed2eb67a7a 100644 --- a/ppstructure/table/README_ch.md +++ b/ppstructure/table/README_ch.md @@ -1,6 +1,16 @@ # 表格识别 +* [1. 表格识别 pipeline](#1) +* [2. 性能](#2) +* [3. 使用](#3) + + [3.1 快速开始](#31) + + [3.2 训练](#32) + + [3.3 评估](#33) + + [3.4 预测](#34) + + ## 1. 表格识别 pipeline + 表格识别主要包含三个模型 1. 单行文本检测-DB 2. 单行文本识别-CRNN @@ -17,6 +27,8 @@ 3. 由单行文字的坐标、识别结果和单元格的坐标一起组合出单元格的识别结果。 4. 单元格的识别结果和表格结构一起构造表格的html字符串。 + + ## 2. 性能 我们在 PubTabNet[1] 评估数据集上对算法进行了评估,性能如下 @@ -26,8 +38,9 @@ | EDD[2] | 88.3 | | Ours | 93.32 | + ## 3. 
使用 - + ### 3.1 快速开始 ```python @@ -43,12 +56,12 @@ wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_tab wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar cd .. # 执行预测 -python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=../doc/table/table.jpg --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=ch --det_limit_side_len=736 --det_limit_type=min --output ../output/table +python3 table/predict_table.py --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=../doc/table/table.jpg --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --rec_char_type=EN --det_limit_side_len=736 --det_limit_type=min --output ../output/table ``` 运行完成后,每张图片的excel表格会保存到output字段指定的目录下 note: 上述模型是在 PubLayNet 数据集上训练的表格识别模型,仅支持英文扫描场景,如需识别其他场景需要自己训练模型后替换 `det_model_dir`,`rec_model_dir`,`table_model_dir`三个字段即可。 - + ### 3.2 训练 在这一章节中,我们仅介绍表格结构模型的训练,[文字检测](../../doc/doc_ch/detection.md)和[文字识别](../../doc/doc_ch/recognition.md)的模型训练请参考对应的文档。 @@ -75,7 +88,7 @@ python3 tools/train.py -c configs/table/table_mv3.yml -o Global.checkpoints=./yo **注意**:`Global.checkpoints`的优先级高于`Global.pretrain_weights`的优先级,即同时指定两个参数时,优先加载`Global.checkpoints`指定的模型,如果`Global.checkpoints`指定的模型路径有误,会加载`Global.pretrain_weights`指定的模型。 - + ### 3.3 评估 表格使用 [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) 作为模型的评估指标。在进行模型评估之前,需要将pipeline中的三个模型分别导出为inference模型(我们已经提供好),还需要准备评估的gt, gt示例如下: @@ -100,7 +113,7 @@ python3 table/eval_table.py --det_model_dir=path/to/det_model_dir --rec_model_di ```bash teds: 93.32 ``` - + ### 3.4 预测 ```python diff --git a/requirements.txt b/requirements.txt index 351d409092a1f387b720c3ff2d43889170f320a7..6758a59bad20f6ffa271766fc4d0df5ebf4c7a4b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ shapely -scikit-image==0.17.2 +scikit-image==0.18.3 imgaug==0.4.0 pyclipper lmdb @@ -7,4 +7,9 @@ tqdm numpy visualdl python-Levenshtein -opencv-contrib-python==4.4.0.46 \ No newline at end of file +opencv-contrib-python==4.4.0.46 +cython +lxml +premailer +openpyxl +fasttext==0.9.1 \ No newline at end of file diff --git a/tests/ocr_det_params.txt b/tests/ocr_det_params.txt deleted file mode 100644 index 6aff66c6aa8591c9f48c81cf857809f956a3cda2..0000000000000000000000000000000000000000 --- a/tests/ocr_det_params.txt +++ /dev/null @@ -1,52 +0,0 @@ -===========================train_params=========================== -model_name:ocr_det -python:python3.7 -gpu_list:0|0,1 -Global.use_gpu:True|True -Global.auto_cast:null -Global.epoch_num:lite_train_infer=2|whole_train_infer=300 -Global.save_model_dir:./output/ -Train.loader.batch_size_per_card:lite_train_infer=2|whole_train_infer=4 -Global.pretrained_model:null -train_model_name:latest -train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/ -null:null -## -trainer:norm_train|pact_train -norm_train:tools/train.py -c configs/det/det_mv3_db.yml -o 
Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained -pact_train:deploy/slim/quantization/quant.py -c configs/det/det_mv3_db.yml -o -fpgm_train:null -distill_train:null -null:null -null:null -## -===========================eval_params=========================== -eval:tools/eval.py -c configs/det/det_mv3_db.yml -o -null:null -## -===========================infer_params=========================== -Global.save_inference_dir:./output/ -Global.pretrained_model: -norm_export:tools/export_model.py -c configs/det/det_mv3_db.yml -o -quant_export:deploy/slim/quantization/export_model.py -c configs/det/det_mv3_db.yml -o -fpgm_export:deploy/slim/prune/export_prune_model.py -distill_export:null -export1:null -export2:null -## -infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/ -infer_export:null -infer_quant:False -inference:tools/infer/predict_det.py ---use_gpu:True|False ---enable_mkldnn:True|False ---cpu_threads:1|6 ---rec_batch_num:1 ---use_tensorrt:False|True ---precision:fp32|fp16|int8 ---det_model_dir: ---image_dir:./inference/ch_det_data_50/all-sum-510/ ---save_log_path:null ---benchmark:True -null:null - diff --git a/tests/prepare.sh b/tests/prepare.sh deleted file mode 100644 index 418e5661ad0f315bc60b8fda37742c115b395b7c..0000000000000000000000000000000000000000 --- a/tests/prepare.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash -FILENAME=$1 -# MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer', 'infer'] -MODE=$2 - -dataline=$(cat ${FILENAME}) - -# parser params -IFS=$'\n' -lines=(${dataline}) -function func_parser_key(){ - strs=$1 - IFS=":" - array=(${strs}) - tmp=${array[0]} - echo ${tmp} -} -function func_parser_value(){ - strs=$1 - IFS=":" - array=(${strs}) - tmp=${array[1]} - echo ${tmp} -} -IFS=$'\n' -# The training params -model_name=$(func_parser_value "${lines[1]}") - -trainer_list=$(func_parser_value "${lines[14]}") - -# MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer'] -MODE=$2 - -if [ ${MODE} = "lite_train_infer" ];then - # pretrain lite train data - wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams - rm -rf ./train_data/icdar2015 - rm -rf ./train_data/ic15_data - wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015_lite.tar - wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ic15_data.tar # todo change to bcebos - - cd ./train_data/ && tar xf icdar2015_lite.tar && tar xf ic15_data.tar - ln -s ./icdar2015_lite ./icdar2015 - cd ../ -elif [ ${MODE} = "whole_train_infer" ];then - wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams - rm -rf ./train_data/icdar2015 - rm -rf ./train_data/ic15_data - wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015.tar - wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ic15_data.tar - cd ./train_data/ && tar xf icdar2015.tar && tar xf ic15_data.tar && cd ../ -elif [ ${MODE} = "whole_infer" ];then - wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams - rm -rf ./train_data/icdar2015 - rm -rf ./train_data/ic15_data - wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015_infer.tar - wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ic15_data.tar - cd ./train_data/ && tar xf 
icdar2015_infer.tar && tar xf ic15_data.tar - ln -s ./icdar2015_infer ./icdar2015 - cd ../ -else - if [ ${model_name} = "ocr_det" ]; then - eval_model_name="ch_ppocr_mobile_v2.0_det_infer" - rm -rf ./train_data/icdar2015 - wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ch_det_data_50.tar - wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar - cd ./inference && tar xf ${eval_model_name}.tar && tar xf ch_det_data_50.tar && cd ../ - else - rm -rf ./train_data/ic15_data - eval_model_name="ch_ppocr_mobile_v2.0_rec_infer" - wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/ic15_data.tar - wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar - cd ./inference && tar xf ${eval_model_name}.tar && tar xf ic15_data.tar && cd ../ - fi -fi - diff --git a/tests/readme.md b/tests/readme.md deleted file mode 100644 index 1c5e0faee90cad9709b6e4d517cbf7830aa2bb8e..0000000000000000000000000000000000000000 --- a/tests/readme.md +++ /dev/null @@ -1,58 +0,0 @@ - -# 介绍 - -test.sh和params.txt文件配合使用,完成OCR轻量检测和识别模型从训练到预测的流程测试。 - -# 安装依赖 -- 安装PaddlePaddle >= 2.0 -- 安装PaddleOCR依赖 - ``` - pip3 install -r ../requirements.txt - ``` -- 安装autolog - ``` - git clone https://github.com/LDOUBLEV/AutoLog - cd AutoLog - pip3 install -r requirements.txt - python3 setup.py bdist_wheel - pip3 install ./dist/auto_log-1.0.0-py3-none-any.whl - cd ../ - ``` - -# 目录介绍 - -```bash -tests/ -├── ocr_det_params.txt # 测试OCR检测模型的参数配置文件 -├── ocr_rec_params.txt # 测试OCR识别模型的参数配置文件 -└── prepare.sh # 完成test.sh运行所需要的数据和模型下载 -└── test.sh # 根据 -``` - -# 使用方法 -test.sh包含四种运行模式,每种模式的运行数据不同,分别用于测试速度和精度,分别是: -- 模式1 lite_train_infer,使用少量数据训练,用于快速验证训练到预测的走通流程,不验证精度和速度; -``` -bash test/prepare.sh ./tests/ocr_det_params.txt 'lite_train_infer' -bash tests/test.sh ./tests/ocr_det_params.txt 'lite_train_infer' -``` -- 模式2 whole_infer,使用少量数据训练,一定量数据预测,用于验证训练后的模型执行预测,预测速度是否合理; -``` -bash tests/prepare.sh ./tests/ocr_det_params.txt 'whole_infer' -bash tests/test.sh ./tests/ocr_det_params.txt 'whole_infer' -``` - -- 模式3 infer 不训练,全量数据预测,走通开源模型评估、动转静,检查inference model预测时间和精度; -``` -bash tests/prepare.sh ./tests/ocr_det_params.txt 'infer' -用法1: -bash tests/test.sh ./tests/ocr_det_params.txt 'infer' -用法2: 指定GPU卡预测,第三个传入参数为GPU卡号 -bash tests/test.sh ./tests/ocr_det_params.txt 'infer' '1' -``` - -模式4: whole_train_infer , CE: 全量数据训练,全量数据预测,验证模型训练精度,预测精度,预测速度 -``` -bash tests/prepare.sh ./tests/ocr_det_params.txt 'whole_train_infer' -bash tests/test.sh ./tests/ocr_det_params.txt 'whole_train_infer' -``` diff --git a/tools/eval.py b/tools/eval.py index 0120baab0f34d5fadbbf4df20d92d6b62dd176a2..28247bc57450aaf067fcb405674098eacb990166 100755 --- a/tools/eval.py +++ b/tools/eval.py @@ -27,7 +27,7 @@ from ppocr.data import build_dataloader from ppocr.modeling.architectures import build_model from ppocr.postprocess import build_post_process from ppocr.metrics import build_metric -from ppocr.utils.save_load import init_model, load_pretrained_params +from ppocr.utils.save_load import init_model, load_dygraph_params from ppocr.utils.utility import print_dict import tools.program as program @@ -54,13 +54,13 @@ def main(): config['Architecture']["Head"]['out_channels'] = char_num model = build_model(config['Architecture']) - use_srn = config['Architecture']['algorithm'] == "SRN" + extra_input = config['Architecture']['algorithm'] in ["SRN", "SAR"] if "model_type" in config['Architecture'].keys(): model_type = 
config['Architecture']['model_type'] else: model_type = None - best_model_dict = init_model(config, model) + best_model_dict = load_dygraph_params(config, model, logger, None) if len(best_model_dict): logger.info('metric in ckpt ***************') for k, v in best_model_dict.items(): @@ -71,7 +71,7 @@ def main(): # start eval metric = program.eval(model, valid_dataloader, post_process_class, - eval_class, model_type, use_srn) + eval_class, model_type, extra_input) logger.info('metric eval ***************') for k, v in metric.items(): logger.info('{}:{}'.format(k, v)) diff --git a/tools/export_center.py b/tools/export_center.py new file mode 100644 index 0000000000000000000000000000000000000000..c46e8b9d58997b9b66c6ce81b2558ecd4cad0e81 --- /dev/null +++ b/tools/export_center.py @@ -0,0 +1,77 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import pickle + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) + +from ppocr.data import build_dataloader +from ppocr.modeling.architectures import build_model +from ppocr.postprocess import build_post_process +from ppocr.utils.save_load import init_model, load_dygraph_params +from ppocr.utils.utility import print_dict +import tools.program as program + + +def main(): + global_config = config['Global'] + # build dataloader + config['Eval']['dataset']['name'] = config['Train']['dataset']['name'] + config['Eval']['dataset']['data_dir'] = config['Train']['dataset'][ + 'data_dir'] + config['Eval']['dataset']['label_file_list'] = config['Train']['dataset'][ + 'label_file_list'] + eval_dataloader = build_dataloader(config, 'Eval', device, logger) + + # build post process + post_process_class = build_post_process(config['PostProcess'], + global_config) + + # build model + # for rec algorithm + if hasattr(post_process_class, 'character'): + char_num = len(getattr(post_process_class, 'character')) + config['Architecture']["Head"]['out_channels'] = char_num + + #set return_features = True + config['Architecture']["Head"]["return_feats"] = True + + model = build_model(config['Architecture']) + + best_model_dict = load_dygraph_params(config, model, logger, None) + if len(best_model_dict): + logger.info('metric in ckpt ***************') + for k, v in best_model_dict.items(): + logger.info('{}:{}'.format(k, v)) + + # get features from train data + char_center = program.get_center(model, eval_dataloader, post_process_class) + + #serialize to disk + with open("train_center.pkl", 'wb') as f: + pickle.dump(char_center, f) + return + + +if __name__ == '__main__': + config, device, logger, vdl_writer = program.preprocess() + main() diff --git a/tools/export_model.py b/tools/export_model.py index 785aca10e46200bda49bdff2b89ba00cafbe7a20..64a0d4036303716a632eb93c53f2478f32b42848 
100755 --- a/tools/export_model.py +++ b/tools/export_model.py @@ -49,6 +49,12 @@ def export_single_model(model, arch_config, save_path, logger): ] ] model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "SAR": + other_shape = [ + paddle.static.InputSpec( + shape=[None, 3, 48, 160], dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) else: infer_shape = [3, -1, -1] if arch_config["model_type"] == "rec": @@ -60,6 +66,8 @@ def export_single_model(model, arch_config, save_path, logger): "When there is tps in the network, variable length input is not supported, and the input size needs to be the same as during training" ) infer_shape[-1] = 100 + if arch_config["algorithm"] == "NRTR": + infer_shape = [1, 32, 100] elif arch_config["model_type"] == "table": infer_shape = [3, 488, 488] model = to_static( @@ -93,6 +101,9 @@ def main(): for key in config["Architecture"]["Models"]: config["Architecture"]["Models"][key]["Head"][ "out_channels"] = char_num + # just one final tensor needs to to exported for inference + config["Architecture"]["Models"][key][ + "return_all_feats"] = False else: # base rec model config["Architecture"]["Head"]["out_channels"] = char_num model = build_model(config["Architecture"]) diff --git a/tools/infer/predict_cls.py b/tools/infer/predict_cls.py index 53e50bd6d1d1a2bd07b9f1204b9f56594c669d13..1c68494861e60b4aaef541a4e247071944cf420c 100755 --- a/tools/infer/predict_cls.py +++ b/tools/infer/predict_cls.py @@ -131,14 +131,9 @@ def main(args): img_list.append(img) try: img_list, cls_res, predict_time = text_classifier(img_list) - except: + except Exception as E: logger.info(traceback.format_exc()) - logger.info( - "ERROR!!!! \n" - "Please read the FAQ:https://github.com/PaddlePaddle/PaddleOCR#faq \n" - "If your model has tps module: " - "TPS does not support variable shape.\n" - "Please set --rec_image_shape='3,32,100' and --rec_char_type='en' ") + logger.info(E) exit() for ino in range(len(img_list)): logger.info("Predicts of {}:{}".format(valid_image_file_list[ino], diff --git a/tools/infer/predict_det.py b/tools/infer/predict_det.py index 5c75e0c480eac6796d6d4b7075d1b38d254380fd..b24ad2bbb504caf1f262b4e47625348ce32d6fce 100755 --- a/tools/infer/predict_det.py +++ b/tools/infer/predict_det.py @@ -30,7 +30,7 @@ from ppocr.utils.logging import get_logger from ppocr.utils.utility import get_image_file_list, check_and_read_gif from ppocr.data import create_operators, transform from ppocr.postprocess import build_post_process - +import json logger = get_logger() @@ -89,6 +89,14 @@ class TextDetector(object): postprocess_params["sample_pts_num"] = 2 postprocess_params["expand_scale"] = 1.0 postprocess_params["shrink_ratio_of_width"] = 0.3 + elif self.det_algorithm == "PSE": + postprocess_params['name'] = 'PSEPostProcess' + postprocess_params["thresh"] = args.det_pse_thresh + postprocess_params["box_thresh"] = args.det_pse_box_thresh + postprocess_params["min_area"] = args.det_pse_min_area + postprocess_params["box_type"] = args.det_pse_box_type + postprocess_params["scale"] = args.det_pse_scale + self.det_pse_box_type = args.det_pse_box_type else: logger.info("unknown det_algorithm:{}".format(self.det_algorithm)) sys.exit(0) @@ -209,7 +217,7 @@ class TextDetector(object): preds['f_score'] = outputs[1] preds['f_tco'] = outputs[2] preds['f_tvo'] = outputs[3] - elif self.det_algorithm == 'DB': + elif self.det_algorithm in ['DB', 'PSE']: preds['maps'] = outputs[0] else: raise NotImplementedError @@ -217,7 +225,9 @@ class 
         #self.predictor.try_shrink_memory()
         post_result = self.postprocess_op(preds, shape_list)
         dt_boxes = post_result[0]['points']
-        if self.det_algorithm == "SAST" and self.det_sast_polygon:
+        if (self.det_algorithm == "SAST" and
+                self.det_sast_polygon) or (self.det_algorithm == "PSE" and
+                                           self.det_pse_box_type == 'poly'):
             dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape)
         else:
             dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
@@ -243,6 +253,7 @@ if __name__ == "__main__":
     if not os.path.exists(draw_img_save):
         os.makedirs(draw_img_save)
+    save_results = []
     for image_file in image_file_list:
         img, flag = check_and_read_gif(image_file)
         if not flag:
@@ -256,8 +267,11 @@ if __name__ == "__main__":
         if count > 0:
             total_time += elapse
         count += 1
-
-        logger.info("Predict time of {}: {}".format(image_file, elapse))
+        save_pred = os.path.basename(image_file) + "\t" + str(
+            json.dumps(np.array(dt_boxes).astype(np.int32).tolist())) + "\n"
+        save_results.append(save_pred)
+        logger.info(save_pred)
+        logger.info("The predict time of {}: {}".format(image_file, elapse))
         src_im = utility.draw_text_det_res(dt_boxes, image_file)
         img_name_pure = os.path.split(image_file)[-1]
         img_path = os.path.join(draw_img_save,
@@ -265,5 +279,8 @@ if __name__ == "__main__":
         cv2.imwrite(img_path, src_im)
         logger.info("The visualized image saved in {}".format(img_path))
 
+    with open(os.path.join(draw_img_save, "det_results.txt"), 'w') as f:
+        f.writelines(save_results)
+        f.close()
     if args.benchmark:
         text_detector.autolog.report()
diff --git a/tools/infer/predict_e2e.py b/tools/infer/predict_e2e.py
index cd6c2005a7cc77c356e3f004cd586a84676ea7fa..5029d6059346a00062418d8d1b6cb029b0110643 100755
--- a/tools/infer/predict_e2e.py
+++ b/tools/infer/predict_e2e.py
@@ -74,7 +74,7 @@ class TextE2E(object):
 
         self.preprocess_op = create_operators(pre_process_list)
         self.postprocess_op = build_post_process(postprocess_params)
-        self.predictor, self.input_tensor, self.output_tensors = utility.create_predictor(
+        self.predictor, self.input_tensor, self.output_tensors, _ = utility.create_predictor(
             args, 'e2e', logger)  # paddle.jit.load(args.det_model_dir)
         # self.predictor.eval()
diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py
index 97dfa5214628123d0c9b7edd7d94060a2bfd2a1e..936994a215d10d543537b29cb41bfa42b42590c7 100755
--- a/tools/infer/predict_rec.py
+++ b/tools/infer/predict_rec.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import os
 import sys
-
+from PIL import Image
 __dir__ = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(__dir__)
 sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
@@ -38,26 +38,34 @@ logger = get_logger()
 class TextRecognizer(object):
     def __init__(self, args):
         self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
-        self.character_type = args.rec_char_type
         self.rec_batch_num = args.rec_batch_num
         self.rec_algorithm = args.rec_algorithm
         postprocess_params = {
             'name': 'CTCLabelDecode',
-            "character_type": args.rec_char_type,
             "character_dict_path": args.rec_char_dict_path,
             "use_space_char": args.use_space_char
         }
         if self.rec_algorithm == "SRN":
             postprocess_params = {
                 'name': 'SRNLabelDecode',
-                "character_type": args.rec_char_type,
                 "character_dict_path": args.rec_char_dict_path,
                 "use_space_char": args.use_space_char
             }
         elif self.rec_algorithm == "RARE":
             postprocess_params = {
                 'name': 'AttnLabelDecode',
-                "character_type": args.rec_char_type,
+                "character_dict_path": args.rec_char_dict_path,
+                "use_space_char": args.use_space_char
+            }
+        elif self.rec_algorithm == 'NRTR':
+            postprocess_params = {
+                'name': 'NRTRLabelDecode',
+                "character_dict_path": args.rec_char_dict_path,
+                "use_space_char": args.use_space_char
+            }
+        elif self.rec_algorithm == "SAR":
+            postprocess_params = {
+                'name': 'SARLabelDecode',
                 "character_dict_path": args.rec_char_dict_path,
                 "use_space_char": args.use_space_char
             }
@@ -87,9 +95,19 @@ class TextRecognizer(object):
 
     def resize_norm_img(self, img, max_wh_ratio):
         imgC, imgH, imgW = self.rec_image_shape
+        if self.rec_algorithm == 'NRTR':
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+            # return padding_im
+            image_pil = Image.fromarray(np.uint8(img))
+            img = image_pil.resize([100, 32], Image.ANTIALIAS)
+            img = np.array(img)
+            norm_img = np.expand_dims(img, -1)
+            norm_img = norm_img.transpose((2, 0, 1))
+            return norm_img.astype(np.float32) / 128. - 1.
+
         assert imgC == img.shape[2]
-        if self.character_type == "ch":
-            imgW = int((32 * max_wh_ratio))
+        max_wh_ratio = max(max_wh_ratio, imgW / imgH)
+        imgW = int((32 * max_wh_ratio))
         h, w = img.shape[:2]
         ratio = w / float(h)
         if math.ceil(imgH * ratio) > imgW:
@@ -177,6 +195,41 @@ class TextRecognizer(object):
         return (norm_img, encoder_word_pos, gsrm_word_pos,
                 gsrm_slf_attn_bias1, gsrm_slf_attn_bias2)
 
+    def resize_norm_img_sar(self, img, image_shape,
+                            width_downsample_ratio=0.25):
+        imgC, imgH, imgW_min, imgW_max = image_shape
+        h = img.shape[0]
+        w = img.shape[1]
+        valid_ratio = 1.0
+        # make sure new_width is an integral multiple of width_divisor.
+        width_divisor = int(1 / width_downsample_ratio)
+        # resize
+        ratio = w / float(h)
+        resize_w = math.ceil(imgH * ratio)
+        if resize_w % width_divisor != 0:
+            resize_w = round(resize_w / width_divisor) * width_divisor
+        if imgW_min is not None:
+            resize_w = max(imgW_min, resize_w)
+        if imgW_max is not None:
+            valid_ratio = min(1.0, 1.0 * resize_w / imgW_max)
+            resize_w = min(imgW_max, resize_w)
+        resized_image = cv2.resize(img, (resize_w, imgH))
+        resized_image = resized_image.astype('float32')
+        # norm
+        if image_shape[0] == 1:
+            resized_image = resized_image / 255
+            resized_image = resized_image[np.newaxis, :]
+        else:
+            resized_image = resized_image.transpose((2, 0, 1)) / 255
+        resized_image -= 0.5
+        resized_image /= 0.5
+        resize_shape = resized_image.shape
+        padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32)
+        padding_im[:, :, 0:resize_w] = resized_image
+        pad_shape = padding_im.shape
+
+        return padding_im, resize_shape, pad_shape, valid_ratio
+
     def __call__(self, img_list):
         img_num = len(img_list)
         # Calculate the aspect ratio of all text bars
@@ -199,11 +252,19 @@ class TextRecognizer(object):
             wh_ratio = w * 1.0 / h
             max_wh_ratio = max(max_wh_ratio, wh_ratio)
             for ino in range(beg_img_no, end_img_no):
-                if self.rec_algorithm != "SRN":
+                if self.rec_algorithm != "SRN" and self.rec_algorithm != "SAR":
                     norm_img = self.resize_norm_img(img_list[indices[ino]],
                                                     max_wh_ratio)
                     norm_img = norm_img[np.newaxis, :]
                     norm_img_batch.append(norm_img)
+                elif self.rec_algorithm == "SAR":
+                    norm_img, _, _, valid_ratio = self.resize_norm_img_sar(
+                        img_list[indices[ino]], self.rec_image_shape)
+                    norm_img = norm_img[np.newaxis, :]
+                    valid_ratio = np.expand_dims(valid_ratio, axis=0)
+                    valid_ratios = []
+                    valid_ratios.append(valid_ratio)
+                    norm_img_batch.append(norm_img)
                 else:
                     norm_img = self.process_image_srn(
                         img_list[indices[ino]], self.rec_image_shape, 8, 25)
@@ -249,17 +310,38 @@ class TextRecognizer(object):
                 if self.benchmark:
                     self.autolog.times.stamp()
                 preds = {"predict": outputs[2]}
+            elif self.rec_algorithm == "SAR":
+                valid_ratios = np.concatenate(valid_ratios)
+                inputs = [
+                    norm_img_batch,
+                    valid_ratios,
+                ]
+                input_names = self.predictor.get_input_names()
+                for i in range(len(input_names)):
+                    input_tensor = self.predictor.get_input_handle(input_names[
+                        i])
+                    input_tensor.copy_from_cpu(inputs[i])
+                self.predictor.run()
+                outputs = []
+                for output_tensor in self.output_tensors:
+                    output = output_tensor.copy_to_cpu()
+                    outputs.append(output)
+                if self.benchmark:
+                    self.autolog.times.stamp()
+                preds = outputs[0]
             else:
                 self.input_tensor.copy_from_cpu(norm_img_batch)
                 self.predictor.run()
-                outputs = []
                 for output_tensor in self.output_tensors:
                     output = output_tensor.copy_to_cpu()
                     outputs.append(output)
                 if self.benchmark:
                     self.autolog.times.stamp()
-                preds = outputs[0]
+                if len(outputs) != 1:
+                    preds = outputs
+                else:
+                    preds = outputs[0]
             rec_result = self.postprocess_op(preds)
             for rno in range(len(rec_result)):
                 rec_res[indices[beg_img_no + rno]] = rec_result[rno]
@@ -278,7 +360,7 @@ def main(args):
     if args.warmup:
         img = np.random.uniform(0, 255, [32, 320, 3]).astype(np.uint8)
         for i in range(2):
-            res = text_recognizer([img])
+            res = text_recognizer([img] * int(args.rec_batch_num))
 
     for image_file in image_file_list:
         img, flag = check_and_read_gif(image_file)
diff --git a/tools/infer/predict_system.py b/tools/infer/predict_system.py
index eae0e27cd284ccce9f41f0c20b05dee09f46fc84..b5edd01589685a29a37dc20064b0d58e9d776fec 100755
--- a/tools/infer/predict_system.py
+++ b/tools/infer/predict_system.py
@@ -173,6 +173,9 @@ def main(args):
         logger.info("The predict total time is {}".format(time.time() - _st))
     logger.info("\nThe predict total time is {}".format(total_time))
 
+    if args.benchmark:
+        text_sys.text_detector.autolog.report()
+        text_sys.text_recognizer.autolog.report()
 
 if __name__ == "__main__":
diff --git a/tools/infer/utility.py b/tools/infer/utility.py
index 707328f28468db86c5061795d04713dc3b21a5cb..41a3c0f14b6378751a367a3709ad7943ee981a4e 100755
--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py
@@ -35,7 +35,7 @@ def init_args():
     parser.add_argument("--use_gpu", type=str2bool, default=True)
     parser.add_argument("--ir_optim", type=str2bool, default=True)
     parser.add_argument("--use_tensorrt", type=str2bool, default=False)
-    parser.add_argument("--min_subgraph_size", type=int, default=10)
+    parser.add_argument("--min_subgraph_size", type=int, default=15)
     parser.add_argument("--precision", type=str, default="fp32")
     parser.add_argument("--gpu_mem", type=int, default=500)
 
@@ -63,11 +63,17 @@ def init_args():
     parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2)
     parser.add_argument("--det_sast_polygon", type=str2bool, default=False)
 
+    # PSE params
+    parser.add_argument("--det_pse_thresh", type=float, default=0)
+    parser.add_argument("--det_pse_box_thresh", type=float, default=0.85)
+    parser.add_argument("--det_pse_min_area", type=float, default=16)
+    parser.add_argument("--det_pse_box_type", type=str, default='box')
+    parser.add_argument("--det_pse_scale", type=int, default=1)
+
     # params for text recognizer
     parser.add_argument("--rec_algorithm", type=str, default='CRNN')
     parser.add_argument("--rec_model_dir", type=str)
     parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320")
-    parser.add_argument("--rec_char_type", type=str, default='ch')
     parser.add_argument("--rec_batch_num", type=int, default=6)
     parser.add_argument("--max_text_length", type=int, default=25)
     parser.add_argument(
@@ -236,11 +242,11 @@ def create_predictor(args, mode, logger):
                 max_input_shape.update(max_pact_shape)
                 opt_input_shape.update(opt_pact_shape)
             elif mode == "rec":
-                min_input_shape = {"x": [args.rec_batch_num, 3, 32, 10]}
+                min_input_shape = {"x": [1, 3, 32, 10]}
                 max_input_shape = {"x": [args.rec_batch_num, 3, 32, 2000]}
                 opt_input_shape = {"x": [args.rec_batch_num, 3, 32, 320]}
             elif mode == "cls":
-                min_input_shape = {"x": [args.rec_batch_num, 3, 48, 10]}
+                min_input_shape = {"x": [1, 3, 48, 10]}
                 max_input_shape = {"x": [args.rec_batch_num, 3, 48, 2000]}
                 opt_input_shape = {"x": [args.rec_batch_num, 3, 48, 320]}
             else:
@@ -261,10 +267,11 @@ def create_predictor(args, mode, logger):
             # cache 10 different shapes for mkldnn to avoid memory leak
             config.set_mkldnn_cache_capacity(10)
             config.enable_mkldnn()
-
+            if args.precision == "fp16":
+                config.enable_mkldnn_bfloat16()
     # enable memory optim
     config.enable_memory_optim()
-    #config.disable_glog_info()
+    config.disable_glog_info()
 
     config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
     if mode == 'table':
diff --git a/tools/infer_det.py b/tools/infer_det.py
index a964cd28c934504ce79ea4873d3345295c1266e5..ce16da8dc5fffb3f5fdc633aeb00a386a2d60d4f 100755
--- a/tools/infer_det.py
+++ b/tools/infer_det.py
@@ -34,23 +34,21 @@ import paddle
 from ppocr.data import create_operators, transform
 from ppocr.modeling.architectures import build_model
 from ppocr.postprocess import build_post_process
-from ppocr.utils.save_load import init_model
+from ppocr.utils.save_load import init_model, load_dygraph_params
 from ppocr.utils.utility import get_image_file_list
 import tools.program as program
 
 
-def draw_det_res(dt_boxes, config, img, img_name):
+def draw_det_res(dt_boxes, config, img, img_name, save_path):
     if len(dt_boxes) > 0:
         import cv2
         src_im = img
         for box in dt_boxes:
             box = box.astype(np.int32).reshape((-1, 1, 2))
             cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2)
-        save_det_path = os.path.dirname(config['Global'][
-            'save_res_path']) + "/det_results/"
-        if not os.path.exists(save_det_path):
-            os.makedirs(save_det_path)
-        save_path = os.path.join(save_det_path, os.path.basename(img_name))
+        if not os.path.exists(save_path):
+            os.makedirs(save_path)
+        save_path = os.path.join(save_path, os.path.basename(img_name))
         cv2.imwrite(save_path, src_im)
         logger.info("The detected Image saved in {}".format(save_path))
 
@@ -61,8 +59,7 @@ def main():
 
     # build model
     model = build_model(config['Architecture'])
-    init_model(config, model)
-
+    _ = load_dygraph_params(config, model, logger, None)
     # build post process
     post_process_class = build_post_process(config['PostProcess'])
 
@@ -96,17 +93,41 @@ def main():
             images = paddle.to_tensor(images)
             preds = model(images)
             post_result = post_process_class(preds, shape_list)
-            boxes = post_result[0]['points']
-            # write result
+
+            src_img = cv2.imread(file)
+
             dt_boxes_json = []
-            for box in boxes:
-                tmp_json = {"transcription": ""}
-                tmp_json['points'] = box.tolist()
-                dt_boxes_json.append(tmp_json)
+            # parser boxes if post_result is dict
+            if isinstance(post_result, dict):
+                det_box_json = {}
+                for k in post_result.keys():
+                    boxes = post_result[k][0]['points']
+                    dt_boxes_list = []
+                    for box in boxes:
+                        tmp_json = {"transcription": ""}
+                        tmp_json['points'] = box.tolist()
+                        dt_boxes_list.append(tmp_json)
+                    det_box_json[k] = dt_boxes_list
+                    save_det_path = os.path.dirname(config['Global'][
+                        'save_res_path']) + "/det_results_{}/".format(k)
+                    draw_det_res(boxes, config, src_img, file, save_det_path)
+            else:
+                boxes = post_result[0]['points']
+                dt_boxes_json = []
+                # write result
+                for box in boxes:
+                    tmp_json = {"transcription": ""}
+                    tmp_json['points'] = box.tolist()
+                    dt_boxes_json.append(tmp_json)
+                save_det_path = os.path.dirname(config['Global'][
+                    'save_res_path']) + "/det_results/"
+                draw_det_res(boxes, config, src_img, file, save_det_path)
             otstr = file + "\t" + json.dumps(dt_boxes_json) + "\n"
             fout.write(otstr.encode())
-            src_img = cv2.imread(file)
-            draw_det_res(boxes, config, src_img, file)
+
+            save_det_path = os.path.dirname(config['Global'][
+                'save_res_path']) + "/det_results/"
+            draw_det_res(boxes, config, src_img, file, save_det_path)
     logger.info("success!")
diff --git a/tools/infer_rec.py b/tools/infer_rec.py
index 09f5a0c767b15c312cdfbe8ed695ea06bdc8cdc4..29d4b530dfcfb8a3201e12b38c9b9f186f34b627 100755
--- a/tools/infer_rec.py
+++ b/tools/infer_rec.py
@@ -74,6 +74,10 @@ def main():
                     'image', 'encoder_word_pos', 'gsrm_word_pos',
                     'gsrm_slf_attn_bias1', 'gsrm_slf_attn_bias2'
                 ]
+            elif config['Architecture']['algorithm'] == "SAR":
+                op[op_name]['keep_keys'] = [
+                    'image', 'valid_ratio'
+                ]
             else:
                 op[op_name]['keep_keys'] = ['image']
         transforms.append(op)
@@ -106,11 +110,16 @@ def main():
                 paddle.to_tensor(gsrm_slf_attn_bias1_list),
                 paddle.to_tensor(gsrm_slf_attn_bias2_list)
             ]
+        if config['Architecture']['algorithm'] == "SAR":
+            valid_ratio = np.expand_dims(batch[-1], axis=0)
+            img_metas = [paddle.to_tensor(valid_ratio)]
         images = np.expand_dims(batch[0], axis=0)
         images = paddle.to_tensor(images)
         if config['Architecture']['algorithm'] == "SRN":
             preds = model(images, others)
+        elif config['Architecture']['algorithm'] == "SAR":
+            preds = model(images, img_metas)
         else:
             preds = model(images)
         post_result = post_process_class(preds)
@@ -121,7 +130,7 @@ def main():
             if len(post_result[key][0]) >= 2:
                 rec_info[key] = {
                     "label": post_result[key][0][0],
-                    "score": post_result[key][0][1],
+                    "score": float(post_result[key][0][1]),
                 }
             info = json.dumps(rec_info)
         else:
diff --git a/tools/program.py b/tools/program.py
index 595fe4cb96c0379b1a33504e0ebdd85e70086340..798e6dff297ad1149942488cca1d5540f1924867 100755
--- a/tools/program.py
+++ b/tools/program.py
@@ -31,6 +31,7 @@ from ppocr.utils.stats import TrainingStats
 from ppocr.utils.save_load import save_model
 from ppocr.utils.utility import print_dict
 from ppocr.utils.logging import get_logger
+from ppocr.utils import profiler
 from ppocr.data import build_dataloader
 import numpy as np
 
@@ -42,6 +43,13 @@ class ArgsParser(ArgumentParser):
         self.add_argument("-c", "--config", help="configuration file to use")
         self.add_argument(
             "-o", "--opt", nargs='+', help="set configuration options")
+        self.add_argument(
+            '-p',
+            '--profiler_options',
+            type=str,
+            default=None,
+            help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
+        )
 
     def parse_args(self, argv=None):
         args = super(ArgsParser, self).parse_args(argv)
@@ -158,6 +166,7 @@ def train(config,
     epoch_num = config['Global']['epoch_num']
     print_batch_step = config['Global']['print_batch_step']
     eval_batch_step = config['Global']['eval_batch_step']
+    profiler_options = config['profiler_options']
 
     global_step = 0
     if 'global_step' in pre_best_model_dict:
@@ -186,10 +195,13 @@ def train(config,
     model.train()
 
     use_srn = config['Architecture']['algorithm'] == "SRN"
-    try:
+    extra_input = config['Architecture'][
+        'algorithm'] in ["SRN", "NRTR", "SAR", "SEED"]
+    try:
         model_type = config['Architecture']['model_type']
-    except:
+    except:
         model_type = None
+    algorithm = config['Architecture']['algorithm']
 
     if 'start_epoch' in best_model_dict:
         start_epoch = best_model_dict['start_epoch']
@@ -206,6 +218,7 @@ def train(config,
         max_iter = len(train_dataloader) - 1 if platform.system(
         ) == "Windows" else len(train_dataloader)
         for idx, batch in enumerate(train_dataloader):
+            profiler.add_profiler_step(profiler_options)
             train_reader_cost += time.time() - batch_start
             if idx >= max_iter:
                 break
@@ -213,7 +226,7 @@ def train(config,
             images = batch[0]
             if use_srn:
                 model_average = True
-            if use_srn or model_type == 'table':
+            if model_type == 'table' or extra_input:
                 preds = model(images, data=batch[1:])
             else:
                 preds = model(images)
@@ -277,7 +290,7 @@ def train(config,
                     post_process_class,
                     eval_class,
                     model_type,
-                    use_srn=use_srn)
+                    extra_input=extra_input)
                 cur_metric_str = 'cur metric, {}'.format(', '.join(
                     ['{}: {}'.format(k, v) for k, v in cur_metric.items()]))
                 logger.info(cur_metric_str)
@@ -348,8 +361,8 @@ def eval(model,
          valid_dataloader,
          post_process_class,
          eval_class,
-         model_type,
-         use_srn=False):
+         model_type=None,
+         extra_input=False):
     model.eval()
     with paddle.no_grad():
         total_frame = 0.0
@@ -362,7 +375,7 @@ def eval(model,
                 break
             images = batch[0]
             start = time.time()
-            if use_srn or model_type == 'table':
+            if model_type == 'table' or extra_input:
                 preds = model(images, data=batch[1:])
             else:
                 preds = model(images)
@@ -386,10 +399,76 @@ def eval(model,
     return metric
 
 
+def update_center(char_center, post_result, preds):
+    result, label = post_result
+    feats, logits = preds
+    logits = paddle.argmax(logits, axis=-1)
+    feats = feats.numpy()
+    logits = logits.numpy()
+
+    for idx_sample in range(len(label)):
+        if result[idx_sample][0] == label[idx_sample][0]:
+            feat = feats[idx_sample]
+            logit = logits[idx_sample]
+            for idx_time in range(len(logit)):
+                index = logit[idx_time]
+                if index in char_center.keys():
+                    char_center[index][0] = (
+                        char_center[index][0] * char_center[index][1] +
+                        feat[idx_time]) / (char_center[index][1] + 1)
+                    char_center[index][1] += 1
+                else:
+                    char_center[index] = [feat[idx_time], 1]
+    return char_center
+
+
+def get_center(model, eval_dataloader, post_process_class):
+    pbar = tqdm(total=len(eval_dataloader), desc='get center:')
+    max_iter = len(eval_dataloader) - 1 if platform.system(
+    ) == "Windows" else len(eval_dataloader)
+    char_center = dict()
+    for idx, batch in enumerate(eval_dataloader):
+        if idx >= max_iter:
+            break
+        images = batch[0]
+        start = time.time()
+        preds = model(images)
+
+        batch = [item.numpy() for item in batch]
+        # Obtain usable results from post-processing methods
+        total_time += time.time() - start
+        # Evaluate the results of the current batch
+        post_result = post_process_class(preds, batch[1])
+
+        # update char_center
+        char_center = update_center(char_center, post_result, preds)
+        pbar.update(1)
+
+    pbar.close()
+    for key in char_center.keys():
+        char_center[key] = char_center[key][0]
+    return char_center
+
+
 def preprocess(is_train=False):
     FLAGS = ArgsParser().parse_args()
+    profiler_options = FLAGS.profiler_options
     config = load_config(FLAGS.config)
     merge_config(FLAGS.opt)
+    profile_dic = {"profiler_options": FLAGS.profiler_options}
+    merge_config(profile_dic)
+
+    if is_train:
+        # save_config
+        save_model_dir = config['Global']['save_model_dir']
+        os.makedirs(save_model_dir, exist_ok=True)
+        with open(os.path.join(save_model_dir, 'config.yml'), 'w') as f:
+            yaml.dump(
+                dict(config), f, default_flow_style=False, sort_keys=False)
+        log_file = '{}/train.log'.format(save_model_dir)
+    else:
+        log_file = None
+    logger = get_logger(name='root', log_file=log_file)
 
     # check if set use_gpu=True in paddlepaddle cpu version
     use_gpu = config['Global']['use_gpu']
@@ -398,24 +477,20 @@ def preprocess(is_train=False):
     alg = config['Architecture']['algorithm']
     assert alg in [
         'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN',
-        'CLS', 'PGNet', 'Distillation', 'TableAttn'
+        'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE',
+        'SEED'
     ]
+    windows_not_support_list = ['PSE']
+    if platform.system() == "Windows" and alg in windows_not_support_list:
+        logger.warning('{} is not supported on Windows now'.format(
+            windows_not_support_list))
+        sys.exit()
 
     device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu'
     device = paddle.set_device(device)
 
     config['Global']['distributed'] = dist.get_world_size() != 1
-    if is_train:
-        # save_config
-        save_model_dir = config['Global']['save_model_dir']
-        os.makedirs(save_model_dir, exist_ok=True)
-        with open(os.path.join(save_model_dir, 'config.yml'), 'w') as f:
-            yaml.dump(
-                dict(config), f, default_flow_style=False, sort_keys=False)
-        log_file = '{}/train.log'.format(save_model_dir)
-    else:
-        log_file = None
-    logger = get_logger(name='root', log_file=log_file)
+
     if config['Global']['use_visualdl']:
         from visualdl import LogWriter
         save_model_dir = config['Global']['save_model_dir']