From 4d816b61499c9e312316b87ac6ff273e993262d9 Mon Sep 17 00:00:00 2001 From: tink2123 <y_tink@163.com> Date: Fri, 18 Sep 2020 18:54:15 +0800 Subject: [PATCH 1/3] make label for paddleocr --- train_data/gen_label.py | 63 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 train_data/gen_label.py diff --git a/train_data/gen_label.py b/train_data/gen_label.py new file mode 100644 index 00000000..ae0903b1 --- /dev/null +++ b/train_data/gen_label.py @@ -0,0 +1,63 @@ +import os +import argparse + + +def gen_rec_label(input_path, out_label): + out_file = open(out_label, 'w') + with open(input_path, 'r') as f: + for line in f.readlines(): + tmp = line.strip('\n').replace(" ", "").split(',') + img_path, label = tmp[0], tmp[1] + label = label.replace("\"", "") + out_file.write(img_path + '\t' + label + '\n') + out_file.close() + + +def gen_det_label(input_dir, out_label): + root_path = "" + if "training" in input_dir: + root_path = "icdar_c4_train_imgs/" + elif "test" in input_dir: + root_path = "ch4_test_images/" + out_file = open(out_label, 'w') + for label_file in os.listdir(input_dir): + img_path = root_path + label_file[3:-4] + ".jpg" + label = [] + with open(os.path.join(input_dir, label_file), 'r') as f: + for line in f.readlines(): + tmp = line.strip("\n\r").replace("\xef\xbb\xbf", "").split(',') + points = tmp[:-2] + s = [] + for i in range(0, len(points), 2): + b = points[i:i + 2] + s.append(b) + result = {"transcription": tmp[-1], "points": s} + label.append(result) + out_file.write(img_path + '\t' + str(label) + '\n') + out_file.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--mode', + type=str, + default="rec", + help='Generate rec_label or det_label, can be set rec or det') + parser.add_argument( + '--input_path', + type=str, + default=".", + help='Input_label or input path to be converted') + parser.add_argument( + '--output_label', + type=str, + 
default="out_label.txt", + help='Output file name') + + args = parser.parse_args() + if args.mode == "rec": + print("Generate rec label") + gen_rec_label(args.input_path, args.output_label) + elif args.mode == "det": + gen_det_label(args.input_path, args.output_label) -- GitLab From f9170fcfce4cdd03e9ed38a4adfb48516f378bfd Mon Sep 17 00:00:00 2001 From: tink2123 <y_tink@163.com> Date: Sat, 19 Sep 2020 14:40:13 +0800 Subject: [PATCH 2/3] polish gen_label --- doc/doc_ch/detection.md | 9 ++++++ doc/doc_ch/recognition.md | 7 +++++ train_data/gen_label.py | 60 +++++++++++++++++++-------------------- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/doc/doc_ch/detection.md b/doc/doc_ch/detection.md index 84c90d18..aa320d62 100644 --- a/doc/doc_ch/detection.md +++ b/doc/doc_ch/detection.md @@ -14,6 +14,15 @@ wget -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/train_icdar2015_l wget -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/test_icdar2015_label.txt ``` +PaddleOCR 也提供了数据格式转换脚本,可以将官网 label 转换支持的数据格式。 数据转换工具在 `train_data/gen_label.py`, 这里以训练集为例: + +``` +# 将官网下载的标签文件转换为 train_icdar2015_label.txt +python gen_label.py --mode="det" --root_path="icdar_c4_train_imgs/" \ + --input_path="ch4_training_localization_transcription_gt" \ + --output_label="train_icdar2015_label.txt" +``` + 解压数据集和下载标注文件后,PaddleOCR/train_data/ 有两个文件夹和两个文件,分别是: ``` /PaddleOCR/train_data/icdar2015/text_localization/ diff --git a/doc/doc_ch/recognition.md b/doc/doc_ch/recognition.md index c554b9f1..6d6034a6 100644 --- a/doc/doc_ch/recognition.md +++ b/doc/doc_ch/recognition.md @@ -44,6 +44,13 @@ wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_t wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_test.txt ``` +PaddleOCR 也提供了数据格式转换脚本,可以将官网 label 转换支持的数据格式。 数据转换工具在 
`train_data/gen_label.py`, 这里以训练集为例: + +``` +# 将官网下载的标签文件转换为 rec_gt_label.txt +python gen_label.py --mode="rec" --input_path="{path/of/origin/label}" --output_label="rec_gt_label.txt" +``` + 最终训练集应有如下文件结构: ``` |-train_data diff --git a/train_data/gen_label.py b/train_data/gen_label.py index ae0903b1..de0de2bf 100644 --- a/train_data/gen_label.py +++ b/train_data/gen_label.py @@ -3,38 +3,31 @@ import argparse def gen_rec_label(input_path, out_label): - out_file = open(out_label, 'w') - with open(input_path, 'r') as f: - for line in f.readlines(): - tmp = line.strip('\n').replace(" ", "").split(',') - img_path, label = tmp[0], tmp[1] - label = label.replace("\"", "") - out_file.write(img_path + '\t' + label + '\n') - out_file.close() + with open(out_label, 'w') as out_file: + with open(input_path, 'r') as f: + for line in f.readlines(): + tmp = line.strip('\n').replace(" ", "").split(',') + img_path, label = tmp[0], tmp[1] + label = label.replace("\"", "") + out_file.write(img_path + '\t' + label + '\n') -def gen_det_label(input_dir, out_label): - root_path = "" - if "training" in input_dir: - root_path = "icdar_c4_train_imgs/" - elif "test" in input_dir: - root_path = "ch4_test_images/" - out_file = open(out_label, 'w') - for label_file in os.listdir(input_dir): - img_path = root_path + label_file[3:-4] + ".jpg" - label = [] - with open(os.path.join(input_dir, label_file), 'r') as f: - for line in f.readlines(): - tmp = line.strip("\n\r").replace("\xef\xbb\xbf", "").split(',') - points = tmp[:-2] - s = [] - for i in range(0, len(points), 2): - b = points[i:i + 2] - s.append(b) - result = {"transcription": tmp[-1], "points": s} - label.append(result) - out_file.write(img_path + '\t' + str(label) + '\n') - out_file.close() +def gen_det_label(root_path, input_dir, out_label): + with open(out_label, 'w') as out_file: + for label_file in os.listdir(input_dir): + img_path = root_path + label_file[3:-4] + ".jpg" + label = [] + with 
open(os.path.join(input_dir, label_file), 'r') as f: + for line in f.readlines(): + tmp = line.strip("\n\r").replace("\xef\xbb\xbf", "").split(',') + points = tmp[:-2] + s = [] + for i in range(0, len(points), 2): + b = points[i:i + 2] + s.append(b) + result = {"transcription": tmp[-1], "points": s} + label.append(result) + out_file.write(img_path + '\t' + str(label) + '\n') if __name__ == "__main__": @@ -44,6 +37,11 @@ if __name__ == "__main__": type=str, default="rec", help='Generate rec_label or det_label, can be set rec or det') + parser.add_argument( + '--root_path', + type=str, + default=".", + help='The root directory of images.Only takes effect when mode=det ') parser.add_argument( '--input_path', type=str, @@ -60,4 +58,4 @@ if __name__ == "__main__": print("Generate rec label") gen_rec_label(args.input_path, args.output_label) elif args.mode == "det": - gen_det_label(args.input_path, args.output_label) + gen_det_label(args.root_path, args.input_path, args.output_label) -- GitLab From cf054cffc1ef0f9cb0f150d73c818b4736daaa6b Mon Sep 17 00:00:00 2001 From: tink2123 <y_tink@163.com> Date: Sat, 19 Sep 2020 14:41:43 +0800 Subject: [PATCH 3/3] add copyright --- train_data/gen_label.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/train_data/gen_label.py b/train_data/gen_label.py index de0de2bf..552f279f 100644 --- a/train_data/gen_label.py +++ b/train_data/gen_label.py @@ -1,3 +1,16 @@ +#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+#See the License for the specific language governing permissions and +#limitations under the License. import os import argparse -- GitLab