Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
s920243400
PaddleOCR
提交
f9170fcf
P
PaddleOCR
项目概览
s920243400
/
PaddleOCR
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleOCR
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleOCR
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
f9170fcf
编写于
9月 19, 2020
作者:
T
tink2123
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
polish gen_label
上级
4d816b61
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
45 additions
and
31 deletions
+45
-31
doc/doc_ch/detection.md
doc/doc_ch/detection.md
+9
-0
doc/doc_ch/recognition.md
doc/doc_ch/recognition.md
+7
-0
train_data/gen_label.py
train_data/gen_label.py
+29
-31
未找到文件。
doc/doc_ch/detection.md
浏览文件 @
f9170fcf
...
@@ -14,6 +14,15 @@ wget -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/train_icdar2015_l
...
@@ -14,6 +14,15 @@ wget -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/train_icdar2015_l
wget
-P
./train_data/ https://paddleocr.bj.bcebos.com/dataset/test_icdar2015_label.txt
wget
-P
./train_data/ https://paddleocr.bj.bcebos.com/dataset/test_icdar2015_label.txt
```
```
PaddleOCR 也提供了数据格式转换脚本,可以将官网 label 转换为 PaddleOCR 支持的数据格式。数据转换工具在
`train_data/gen_label.py`
, 这里以训练集为例:
```
# 将官网下载的标签文件转换为 train_icdar2015_label.txt
python gen_label.py --mode="det" --root_path="icdar_c4_train_imgs/" \
--input_path="ch4_training_localization_transcription_gt" \
--output_label="train_icdar2015_label.txt"
```
解压数据集和下载标注文件后,PaddleOCR/train_data/ 有两个文件夹和两个文件,分别是:
解压数据集和下载标注文件后,PaddleOCR/train_data/ 有两个文件夹和两个文件,分别是:
```
```
/PaddleOCR/train_data/icdar2015/text_localization/
/PaddleOCR/train_data/icdar2015/text_localization/
...
...
doc/doc_ch/recognition.md
浏览文件 @
f9170fcf
...
@@ -44,6 +44,13 @@ wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_t
...
@@ -44,6 +44,13 @@ wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_t
wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_test.txt
wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_test.txt
```
```
PaddleOCR 也提供了数据格式转换脚本,可以将官网 label 转换为 PaddleOCR 支持的数据格式。数据转换工具在
`train_data/gen_label.py`
, 这里以训练集为例:
```
# 将官网下载的标签文件转换为 rec_gt_label.txt
python gen_label.py --mode="rec" --input_path="{path/of/origin/label}" --output_label="rec_gt_label.txt"
```
最终训练集应有如下文件结构:
最终训练集应有如下文件结构:
```
```
|-train_data
|-train_data
...
...
train_data/gen_label.py
浏览文件 @
f9170fcf
...
@@ -3,38 +3,31 @@ import argparse
...
@@ -3,38 +3,31 @@ import argparse
def
gen_rec_label
(
input_path
,
out_label
):
def
gen_rec_label
(
input_path
,
out_label
):
out_file
=
open
(
out_label
,
'w'
)
with
open
(
out_label
,
'w'
)
as
out_file
:
with
open
(
input_path
,
'r'
)
as
f
:
with
open
(
input_path
,
'r'
)
as
f
:
for
line
in
f
.
readlines
():
for
line
in
f
.
readlines
():
tmp
=
line
.
strip
(
'
\n
'
).
replace
(
" "
,
""
).
split
(
','
)
tmp
=
line
.
strip
(
'
\n
'
).
replace
(
" "
,
""
).
split
(
','
)
img_path
,
label
=
tmp
[
0
],
tmp
[
1
]
img_path
,
label
=
tmp
[
0
],
tmp
[
1
]
label
=
label
.
replace
(
"
\"
"
,
""
)
label
=
label
.
replace
(
"
\"
"
,
""
)
out_file
.
write
(
img_path
+
'
\t
'
+
label
+
'
\n
'
)
out_file
.
write
(
img_path
+
'
\t
'
+
label
+
'
\n
'
)
out_file
.
close
()
def
gen_det_label
(
input_dir
,
out_label
):
def
gen_det_label
(
root_path
,
input_dir
,
out_label
):
root_path
=
""
with
open
(
out_label
,
'w'
)
as
out_file
:
if
"training"
in
input_dir
:
for
label_file
in
os
.
listdir
(
input_dir
):
root_path
=
"icdar_c4_train_imgs/"
img_path
=
root_path
+
label_file
[
3
:
-
4
]
+
".jpg"
elif
"test"
in
input_dir
:
label
=
[]
root_path
=
"ch4_test_images/"
with
open
(
os
.
path
.
join
(
input_dir
,
label_file
),
'r'
)
as
f
:
out_file
=
open
(
out_label
,
'w'
)
for
line
in
f
.
readlines
():
for
label_file
in
os
.
listdir
(
input_dir
):
tmp
=
line
.
strip
(
"
\n\r
"
).
replace
(
"
\xef\xbb\xbf
"
,
""
).
split
(
','
)
img_path
=
root_path
+
label_file
[
3
:
-
4
]
+
".jpg"
points
=
tmp
[:
-
2
]
label
=
[]
s
=
[]
with
open
(
os
.
path
.
join
(
input_dir
,
label_file
),
'r'
)
as
f
:
for
i
in
range
(
0
,
len
(
points
),
2
):
for
line
in
f
.
readlines
():
b
=
points
[
i
:
i
+
2
]
tmp
=
line
.
strip
(
"
\n\r
"
).
replace
(
"
\xef\xbb\xbf
"
,
""
).
split
(
','
)
s
.
append
(
b
)
points
=
tmp
[:
-
2
]
result
=
{
"transcription"
:
tmp
[
-
1
],
"points"
:
s
}
s
=
[]
label
.
append
(
result
)
for
i
in
range
(
0
,
len
(
points
),
2
):
out_file
.
write
(
img_path
+
'
\t
'
+
str
(
label
)
+
'
\n
'
)
b
=
points
[
i
:
i
+
2
]
s
.
append
(
b
)
result
=
{
"transcription"
:
tmp
[
-
1
],
"points"
:
s
}
label
.
append
(
result
)
out_file
.
write
(
img_path
+
'
\t
'
+
str
(
label
)
+
'
\n
'
)
out_file
.
close
()
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
@@ -44,6 +37,11 @@ if __name__ == "__main__":
...
@@ -44,6 +37,11 @@ if __name__ == "__main__":
type
=
str
,
type
=
str
,
default
=
"rec"
,
default
=
"rec"
,
help
=
'Generate rec_label or det_label, can be set rec or det'
)
help
=
'Generate rec_label or det_label, can be set rec or det'
)
parser
.
add_argument
(
'--root_path'
,
type
=
str
,
default
=
"."
,
help
=
'The root directory of images.Only takes effect when mode=det '
)
parser
.
add_argument
(
parser
.
add_argument
(
'--input_path'
,
'--input_path'
,
type
=
str
,
type
=
str
,
...
@@ -60,4 +58,4 @@ if __name__ == "__main__":
...
@@ -60,4 +58,4 @@ if __name__ == "__main__":
print
(
"Generate rec label"
)
print
(
"Generate rec label"
)
gen_rec_label
(
args
.
input_path
,
args
.
output_label
)
gen_rec_label
(
args
.
input_path
,
args
.
output_label
)
elif
args
.
mode
==
"det"
:
elif
args
.
mode
==
"det"
:
gen_det_label
(
args
.
input_path
,
args
.
output_label
)
gen_det_label
(
args
.
root_path
,
args
.
input_path
,
args
.
output_label
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录