From 4d816b61499c9e312316b87ac6ff273e993262d9 Mon Sep 17 00:00:00 2001
From: tink2123 <y_tink@163.com>
Date: Fri, 18 Sep 2020 18:54:15 +0800
Subject: [PATCH 1/3] make label for paddleocr

---
 train_data/gen_label.py | 63 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 train_data/gen_label.py

diff --git a/train_data/gen_label.py b/train_data/gen_label.py
new file mode 100644
index 00000000..ae0903b1
--- /dev/null
+++ b/train_data/gen_label.py
@@ -0,0 +1,63 @@
+import os
+import argparse
+
+
+def gen_rec_label(input_path, out_label):
+    out_file = open(out_label, 'w')
+    with open(input_path, 'r') as f:
+        for line in f.readlines():
+            tmp = line.strip('\n').replace(" ", "").split(',')
+            img_path, label = tmp[0], tmp[1]
+            label = label.replace("\"", "")
+            out_file.write(img_path + '\t' + label + '\n')
+    out_file.close()
+
+
+def gen_det_label(input_dir, out_label):
+    root_path = ""
+    if "training" in input_dir:
+        root_path = "icdar_c4_train_imgs/"
+    elif "test" in input_dir:
+        root_path = "ch4_test_images/"
+    out_file = open(out_label, 'w')
+    for label_file in os.listdir(input_dir):
+        img_path = root_path + label_file[3:-4] + ".jpg"
+        label = []
+        with open(os.path.join(input_dir, label_file), 'r') as f:
+            for line in f.readlines():
+                tmp = line.strip("\n\r").replace("\xef\xbb\xbf", "").split(',')
+                points = tmp[:-2]
+                s = []
+                for i in range(0, len(points), 2):
+                    b = points[i:i + 2]
+                    s.append(b)
+                result = {"transcription": tmp[-1], "points": s}
+                label.append(result)
+        out_file.write(img_path + '\t' + str(label) + '\n')
+    out_file.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--mode',
+        type=str,
+        default="rec",
+        help='Generate rec_label or det_label, can be set rec or det')
+    parser.add_argument(
+        '--input_path',
+        type=str,
+        default=".",
+        help='Input_label or input path to be converted')
+    parser.add_argument(
+        '--output_label',
+        type=str,
+        default="out_label.txt",
+        help='Output file name')
+
+    args = parser.parse_args()
+    if args.mode == "rec":
+        print("Generate rec label")
+        gen_rec_label(args.input_path, args.output_label)
+    elif args.mode == "det":
+        gen_det_label(args.input_path, args.output_label)
-- 
GitLab


From f9170fcfce4cdd03e9ed38a4adfb48516f378bfd Mon Sep 17 00:00:00 2001
From: tink2123 <y_tink@163.com>
Date: Sat, 19 Sep 2020 14:40:13 +0800
Subject: [PATCH 2/3] polish gen_label

---
 doc/doc_ch/detection.md   |  9 ++++++
 doc/doc_ch/recognition.md |  7 +++++
 train_data/gen_label.py   | 60 +++++++++++++++++++--------------------
 3 files changed, 45 insertions(+), 31 deletions(-)

diff --git a/doc/doc_ch/detection.md b/doc/doc_ch/detection.md
index 84c90d18..aa320d62 100644
--- a/doc/doc_ch/detection.md
+++ b/doc/doc_ch/detection.md
@@ -14,6 +14,15 @@ wget -P ./train_data/  https://paddleocr.bj.bcebos.com/dataset/train_icdar2015_l
 wget -P ./train_data/  https://paddleocr.bj.bcebos.com/dataset/test_icdar2015_label.txt
 ```
 
+PaddleOCR 也提供了数据格式转换脚本,可以将官网 label 转换为 PaddleOCR 支持的数据格式。 数据转换工具在 `train_data/gen_label.py`, 这里以训练集为例:
+
+```
+# 将官网下载的标签文件转换为 train_icdar2015_label.txt 
+python gen_label.py --mode="det" --root_path="icdar_c4_train_imgs/"  \
+                    --input_path="ch4_training_localization_transcription_gt" \
+                    --output_label="train_icdar2015_label.txt"
+```
+
 解压数据集和下载标注文件后,PaddleOCR/train_data/ 有两个文件夹和两个文件,分别是:
 ```
 /PaddleOCR/train_data/icdar2015/text_localization/
diff --git a/doc/doc_ch/recognition.md b/doc/doc_ch/recognition.md
index c554b9f1..6d6034a6 100644
--- a/doc/doc_ch/recognition.md
+++ b/doc/doc_ch/recognition.md
@@ -44,6 +44,13 @@ wget -P ./train_data/ic15_data  https://paddleocr.bj.bcebos.com/dataset/rec_gt_t
 wget -P ./train_data/ic15_data  https://paddleocr.bj.bcebos.com/dataset/rec_gt_test.txt
 ```
 
+PaddleOCR 也提供了数据格式转换脚本,可以将官网 label 转换为 PaddleOCR 支持的数据格式。 数据转换工具在 `train_data/gen_label.py`, 这里以训练集为例:
+
+```
+# 将官网下载的标签文件转换为 rec_gt_label.txt
+python gen_label.py --mode="rec" --input_path="{path/of/origin/label}" --output_label="rec_gt_label.txt"
+```
+
 最终训练集应有如下文件结构:
 ```
 |-train_data
diff --git a/train_data/gen_label.py b/train_data/gen_label.py
index ae0903b1..de0de2bf 100644
--- a/train_data/gen_label.py
+++ b/train_data/gen_label.py
@@ -3,38 +3,31 @@ import argparse
 
 
 def gen_rec_label(input_path, out_label):
-    out_file = open(out_label, 'w')
-    with open(input_path, 'r') as f:
-        for line in f.readlines():
-            tmp = line.strip('\n').replace(" ", "").split(',')
-            img_path, label = tmp[0], tmp[1]
-            label = label.replace("\"", "")
-            out_file.write(img_path + '\t' + label + '\n')
-    out_file.close()
+    with open(out_label, 'w') as out_file:
+        with open(input_path, 'r') as f:
+            for line in f.readlines():
+                tmp = line.strip('\n').replace(" ", "").split(',')
+                img_path, label = tmp[0], tmp[1]
+                label = label.replace("\"", "")
+                out_file.write(img_path + '\t' + label + '\n')
 
 
-def gen_det_label(input_dir, out_label):
-    root_path = ""
-    if "training" in input_dir:
-        root_path = "icdar_c4_train_imgs/"
-    elif "test" in input_dir:
-        root_path = "ch4_test_images/"
-    out_file = open(out_label, 'w')
-    for label_file in os.listdir(input_dir):
-        img_path = root_path + label_file[3:-4] + ".jpg"
-        label = []
-        with open(os.path.join(input_dir, label_file), 'r') as f:
-            for line in f.readlines():
-                tmp = line.strip("\n\r").replace("\xef\xbb\xbf", "").split(',')
-                points = tmp[:-2]
-                s = []
-                for i in range(0, len(points), 2):
-                    b = points[i:i + 2]
-                    s.append(b)
-                result = {"transcription": tmp[-1], "points": s}
-                label.append(result)
-        out_file.write(img_path + '\t' + str(label) + '\n')
-    out_file.close()
+def gen_det_label(root_path, input_dir, out_label):
+    """Merge the per-image annotation files in input_dir into out_label.
+
+    Writes one line per file: root_path + name[3:-4] + ".jpg", a tab, boxes.
+    """
+    with open(out_label, 'w') as out_file:
+        for label_file in os.listdir(input_dir):
+            img_path = root_path + label_file[3:-4] + ".jpg"
+            label = []
+            with open(os.path.join(input_dir, label_file), 'r') as f:
+                for line in f.readlines():
+                    tmp = line.strip("\n\r").replace("\xef\xbb\xbf", "").split(',')
+                    points = tmp[:8]  # 8 coords (x1,y1..x4,y4 in ICDAR gt); [:-2] lost y4
+                    s = [points[i:i + 2] for i in range(0, len(points), 2)]
+                    label.append({"transcription": tmp[-1], "points": s})
+            out_file.write(img_path + '\t' + str(label) + '\n')
 
 
 if __name__ == "__main__":
@@ -44,6 +37,11 @@ if __name__ == "__main__":
         type=str,
         default="rec",
         help='Generate rec_label or det_label, can be set rec or det')
+    parser.add_argument(
+        '--root_path',
+        type=str,
+        default=".",
+        help='The root directory of images.Only takes effect when mode=det ')
     parser.add_argument(
         '--input_path',
         type=str,
@@ -60,4 +58,4 @@ if __name__ == "__main__":
         print("Generate rec label")
         gen_rec_label(args.input_path, args.output_label)
     elif args.mode == "det":
-        gen_det_label(args.input_path, args.output_label)
+        gen_det_label(args.root_path, args.input_path, args.output_label)
-- 
GitLab


From cf054cffc1ef0f9cb0f150d73c818b4736daaa6b Mon Sep 17 00:00:00 2001
From: tink2123 <y_tink@163.com>
Date: Sat, 19 Sep 2020 14:41:43 +0800
Subject: [PATCH 3/3] add copyright

---
 train_data/gen_label.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/train_data/gen_label.py b/train_data/gen_label.py
index de0de2bf..552f279f 100644
--- a/train_data/gen_label.py
+++ b/train_data/gen_label.py
@@ -1,3 +1,16 @@
+#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 import os
 import argparse
 
-- 
GitLab