Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
a7660331a
tesseract
提交
ed48b2a8
T
tesseract
项目概览
a7660331a
/
tesseract
与 Fork 源项目一致
从无法访问的项目Fork
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
tesseract
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
ed48b2a8
编写于
11月 30, 2018
作者:
S
Stefan Weil
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Format new ALTO code with clang-format
Signed-off-by:
N
Stefan Weil
<
sw@weilnetz.de
>
上级
d7cee03a
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
217 additions
and
215 deletions
+217
-215
src/api/altorenderer.cpp
src/api/altorenderer.cpp
+217
-215
未找到文件。
src/api/altorenderer.cpp
浏览文件 @
ed48b2a8
...
...
@@ -13,240 +13,242 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "baseapi.h"
#include <memory>
#include "baseapi.h"
#include "renderer.h"
namespace
tesseract
{
///
/// Add coordinates to specified TextBlock, TextLine, or String bounding box
/// Add word confidence if adding to a String bounding box
///
static
void
AddBoxToAlto
(
const
ResultIterator
*
it
,
PageIteratorLevel
level
,
STRING
*
alto_str
)
{
int
left
,
top
,
right
,
bottom
;
it
->
BoundingBox
(
level
,
&
left
,
&
top
,
&
right
,
&
bottom
);
int
hpos
=
left
;
int
vpos
=
top
;
int
height
=
bottom
-
top
;
int
width
=
right
-
left
;
*
alto_str
+=
" HPOS=
\"
"
;
alto_str
->
add_str_int
(
""
,
hpos
);
*
alto_str
+=
"
\"
"
;
*
alto_str
+=
" VPOS=
\"
"
;
alto_str
->
add_str_int
(
""
,
vpos
);
*
alto_str
+=
"
\"
"
;
*
alto_str
+=
" WIDTH=
\"
"
;
alto_str
->
add_str_int
(
""
,
width
);
*
alto_str
+=
"
\"
"
;
*
alto_str
+=
" HEIGHT=
\"
"
;
alto_str
->
add_str_int
(
""
,
height
);
*
alto_str
+=
"
\"
"
;
if
(
level
==
RIL_WORD
)
{
int
wc
=
it
->
Confidence
(
RIL_WORD
);
*
alto_str
+=
" WC=
\"
0."
;
alto_str
->
add_str_int
(
""
,
wc
);
*
alto_str
+=
"
\"
"
;
}
if
(
level
!=
RIL_WORD
)
{
*
alto_str
+=
">"
;
}
///
/// Add coordinates to specified TextBlock, TextLine, or String bounding box
/// Add word confidence if adding to a String bounding box
///
static
void
AddBoxToAlto
(
const
ResultIterator
*
it
,
PageIteratorLevel
level
,
STRING
*
alto_str
)
{
int
left
,
top
,
right
,
bottom
;
it
->
BoundingBox
(
level
,
&
left
,
&
top
,
&
right
,
&
bottom
);
int
hpos
=
left
;
int
vpos
=
top
;
int
height
=
bottom
-
top
;
int
width
=
right
-
left
;
*
alto_str
+=
" HPOS=
\"
"
;
alto_str
->
add_str_int
(
""
,
hpos
);
*
alto_str
+=
"
\"
"
;
*
alto_str
+=
" VPOS=
\"
"
;
alto_str
->
add_str_int
(
""
,
vpos
);
*
alto_str
+=
"
\"
"
;
*
alto_str
+=
" WIDTH=
\"
"
;
alto_str
->
add_str_int
(
""
,
width
);
*
alto_str
+=
"
\"
"
;
*
alto_str
+=
" HEIGHT=
\"
"
;
alto_str
->
add_str_int
(
""
,
height
);
*
alto_str
+=
"
\"
"
;
if
(
level
==
RIL_WORD
)
{
int
wc
=
it
->
Confidence
(
RIL_WORD
);
*
alto_str
+=
" WC=
\"
0."
;
alto_str
->
add_str_int
(
""
,
wc
);
*
alto_str
+=
"
\"
"
;
}
if
(
level
!=
RIL_WORD
)
{
*
alto_str
+=
">"
;
}
}
///
/// Append an ID attribute of the form ` ID="<base>_<num1>"` to `alto_str`.
///
/// The identifier is formatted into a fixed 64-byte scratch buffer, so an
/// overly long `base` is truncated rather than overflowing.
///
static void AddIdToAlto(STRING* alto_str, const std::string base, int num1) {
  const size_t kIdBufferSize = 64;
  char id_text[kIdBufferSize];

  // Format "<base>_<num1>" and force termination on the last byte.
  snprintf(id_text, kIdBufferSize - 1, "%s_%d", base.c_str(), num1);
  id_text[kIdBufferSize - 1] = '\0';

  STRING& out = *alto_str;
  out += " ID=\"";
  out += id_text;
  out += "\"";
}
///
/// Append the ALTO XML for the beginning of the document
///
bool
TessAltoRenderer
::
BeginDocumentHandler
()
{
AppendString
(
"<?xml version=
\"
1.0
\"
encoding=
\"
UTF-8
\"
?>
\n
"
"<alto xmlns=
\"
http://www.loc.gov/standards/alto/ns-v3#
\"
"
"xmlns:xlink=
\"
http://www.w3.org/1999/xlink
\"
"
"xmlns:xsi=
\"
http://www.w3.org/2001/XMLSchema-instance
\"
"
"xsi:schemaLocation=
\"
http://www.loc.gov/standards/alto/ns-v3# "
"http://www.loc.gov/alto/v3/alto-3-0.xsd
\"
>
\n
"
"
\t
<Description>
\n
"
"
\t\t
<MeasurementUnit>pixel</MeasurementUnit>
\n
"
"
\t\t
<sourceImageInformation>
\n
"
"
\t\t\t
<fileName>"
);
AppendString
(
title
());
AppendString
(
"
\t\t\t
</fileName>
\n
"
"
\t\t
</sourceImageInformation>
\n
"
"
\t\t
<OCRProcessing ID=
\"
OCR_0
\"
>
\n
"
"
\t\t\t
<ocrProcessingStep>
\n
"
"
\t\t\t\t
<processingSoftware>
\n
"
"
\t\t\t\t\t
<softwareName>tesseract "
);
AppendString
(
TessBaseAPI
::
Version
());
AppendString
(
"</softwareName>
\n
"
"
\t\t\t\t
</processingSoftware>
\n
"
"
\t\t\t
</ocrProcessingStep>
\n
"
"
\t\t
</OCRProcessing>
\n
"
"
\t
</Description>
\n
"
"
\t
<Layout>
\n
"
);
return
true
;
}
///
/// Append the ALTO XML for the layout of the current image.
///
/// @param api  API instance asked for the ALTO text of image `imagenum()`
/// @return false when the API returned no text, true once it was appended
///
bool TessAltoRenderer::AddImageHandler(TessBaseAPI* api) {
  // Take ownership of the returned character array for the rest of the scope.
  const std::unique_ptr<const char[]> page_xml(api->GetAltoText(imagenum()));
  if (!page_xml) {
    return false;
  }

  AppendString(page_xml.get());
  return true;
}
///
/// Append the ALTO XML for the end of the document: the closing </Layout>
/// and </alto> tags.
///
/// @return always true
///
bool TessAltoRenderer::EndDocumentHandler() {
  AppendString(
      "\t</Layout>\n"
      "</alto>\n");
  return true;
}
///
/// Construct an ALTO renderer by forwarding `outputbase` and the literal
/// "xml" to the TessResultRenderer base class (presumably the output file
/// extension — confirm against TessResultRenderer).
///
TessAltoRenderer::TessAltoRenderer(const char* outputbase)
    : TessResultRenderer(outputbase, "xml") {
}
///
/// Make an XML-formatted string with ALTO markup from the internal
/// data structures.
///
/// Convenience overload: delegates to the two-argument overload with a null
/// monitor.
///
char* TessBaseAPI::GetAltoText(int page_number) {
  char* const alto_text = GetAltoText(nullptr, page_number);
  return alto_text;
}
///
/// Make an XML-formatted string with ALTO markup from the internal
/// data structures.
///
char
*
TessBaseAPI
::
GetAltoText
(
ETEXT_DESC
*
monitor
,
int
page_number
)
{
if
(
tesseract_
==
nullptr
||
(
page_res_
==
nullptr
&&
Recognize
(
monitor
)
<
0
))
return
nullptr
;
int
lcnt
=
0
,
bcnt
=
0
,
wcnt
=
0
;
int
page_id
=
page_number
;
STRING
alto_str
(
""
);
if
(
input_file_
==
nullptr
)
SetInputName
(
nullptr
);
#ifdef _WIN32
// convert input name from ANSI encoding to utf-8
int
str16_len
=
MultiByteToWideChar
(
CP_ACP
,
0
,
input_file_
->
string
(),
-
1
,
nullptr
,
0
);
wchar_t
*
uni16_str
=
new
WCHAR
[
str16_len
];
str16_len
=
MultiByteToWideChar
(
CP_ACP
,
0
,
input_file_
->
string
(),
-
1
,
uni16_str
,
str16_len
);
int
utf8_len
=
WideCharToMultiByte
(
CP_UTF8
,
0
,
uni16_str
,
str16_len
,
nullptr
,
0
,
nullptr
,
nullptr
);
char
*
utf8_str
=
new
char
[
utf8_len
];
WideCharToMultiByte
(
CP_UTF8
,
0
,
uni16_str
,
str16_len
,
utf8_str
,
utf8_len
,
nullptr
,
nullptr
);
*
input_file_
=
utf8_str
;
delete
[]
uni16_str
;
delete
[]
utf8_str
;
#endif
alto_str
+=
"
\t\t
<Page WIDTH=
\"
"
;
alto_str
.
add_str_int
(
""
,
rect_width_
);
alto_str
+=
"
\"
HEIGHT=
\"
"
;
alto_str
.
add_str_int
(
""
,
rect_height_
);
alto_str
+=
"
\"
PHYSICAL_IMG_NR=
\"
"
;
alto_str
.
add_str_int
(
""
,
rect_height_
);
alto_str
+=
"
\"
"
;
AddIdToAlto
(
&
alto_str
,
"page"
,
page_id
);
alto_str
+=
">
\n
"
;
alto_str
+=
(
"
\t\t\t
<PrintSpace HPOS=
\"
0
\"
"
"VPOS=
\"
0
\"
"
" WIDTH=
\"
"
);
alto_str
.
add_str_int
(
""
,
rect_width_
);
alto_str
+=
"
\"
HEIGHT=
\"
"
;
alto_str
.
add_str_int
(
""
,
rect_height_
);
alto_str
+=
"
\"
>
\n
"
;
ResultIterator
*
res_it
=
GetIterator
();
while
(
!
res_it
->
Empty
(
RIL_BLOCK
))
{
if
(
res_it
->
Empty
(
RIL_WORD
))
{
res_it
->
Next
(
RIL_WORD
);
continue
;
}
///
/// Add a unique ID to an ALTO element
///
static
void
AddIdToAlto
(
STRING
*
alto_str
,
const
std
::
string
base
,
int
num1
)
{
const
size_t
BUFSIZE
=
64
;
char
id_buffer
[
BUFSIZE
];
snprintf
(
id_buffer
,
BUFSIZE
-
1
,
"%s_%d"
,
base
.
c_str
(),
num1
);
id_buffer
[
BUFSIZE
-
1
]
=
'\0'
;
*
alto_str
+=
" ID=
\"
"
;
*
alto_str
+=
id_buffer
;
*
alto_str
+=
"
\"
"
;
if
(
res_it
->
IsAtBeginningOf
(
RIL_BLOCK
))
{
alto_str
+=
"
\t\t\t\t
<TextBlock "
;
AddIdToAlto
(
&
alto_str
,
"block"
,
bcnt
);
AddBoxToAlto
(
res_it
,
RIL_BLOCK
,
&
alto_str
);
alto_str
+=
"
\n
"
;
}
///
/// Append the ALTO XML for the beginning of the document
///
bool
TessAltoRenderer
::
BeginDocumentHandler
()
{
AppendString
(
"<?xml version=
\"
1.0
\"
encoding=
\"
UTF-8
\"
?>
\n
"
"<alto xmlns=
\"
http://www.loc.gov/standards/alto/ns-v3#
\"
xmlns:xlink=
\"
http://www.w3.org/1999/xlink
\"
xmlns:xsi=
\"
http://www.w3.org/2001/XMLSchema-instance
\"
xsi:schemaLocation=
\"
http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd
\"
>
\n
"
"
\t
<Description>
\n
"
"
\t\t
<MeasurementUnit>pixel</MeasurementUnit>
\n
"
"
\t\t
<sourceImageInformation>
\n
"
"
\t\t\t
<fileName>"
);
AppendString
(
title
());
AppendString
(
"
\t\t\t
</fileName>
\n
"
"
\t\t
</sourceImageInformation>
\n
"
"
\t\t
<OCRProcessing ID=
\"
OCR_0
\"
>
\n
"
"
\t\t\t
<ocrProcessingStep>
\n
"
"
\t\t\t\t
<processingSoftware>
\n
"
"
\t\t\t\t\t
<softwareName>tesseract "
);
AppendString
(
TessBaseAPI
::
Version
());
AppendString
(
"</softwareName>
\n
"
"
\t\t\t\t
</processingSoftware>
\n
"
"
\t\t\t
</ocrProcessingStep>
\n
"
"
\t\t
</OCRProcessing>
\n
"
"
\t
</Description>
\n
"
"
\t
<Layout>
\n
"
);
return
true
;
if
(
res_it
->
IsAtBeginningOf
(
RIL_TEXTLINE
))
{
alto_str
+=
"
\t\t\t\t\t
<TextLine "
;
AddIdToAlto
(
&
alto_str
,
"line"
,
lcnt
);
AddBoxToAlto
(
res_it
,
RIL_TEXTLINE
,
&
alto_str
);
alto_str
+=
"
\n
"
;
}
///
/// Append the ALTO XML for the layout of the image
///
bool
TessAltoRenderer
::
AddImageHandler
(
TessBaseAPI
*
api
)
{
const
std
::
unique_ptr
<
const
char
[]
>
hocr
(
api
->
GetAltoText
(
imagenum
()));
if
(
hocr
==
nullptr
)
return
false
;
alto_str
+=
"
\t\t\t\t\t\t
<String "
;
AddIdToAlto
(
&
alto_str
,
"string"
,
wcnt
);
AddBoxToAlto
(
res_it
,
RIL_WORD
,
&
alto_str
);
alto_str
+=
" CONTENT=
\"
"
;
AppendString
(
hocr
.
get
());
bool
last_word_in_line
=
res_it
->
IsAtFinalElement
(
RIL_TEXTLINE
,
RIL_WORD
);
bool
last_word_in_block
=
res_it
->
IsAtFinalElement
(
RIL_BLOCK
,
RIL_WORD
);
return
true
;
}
do
{
const
std
::
unique_ptr
<
const
char
[]
>
grapheme
(
res_it
->
GetUTF8Text
(
RIL_SYMBOL
));
if
(
grapheme
&&
grapheme
[
0
]
!=
0
)
{
alto_str
+=
HOcrEscape
(
grapheme
.
get
());
}
res_it
->
Next
(
RIL_SYMBOL
);
}
while
(
!
res_it
->
Empty
(
RIL_BLOCK
)
&&
!
res_it
->
IsAtBeginningOf
(
RIL_WORD
));
///
/// Append the ALTO XML for the end of the document
///
bool
TessAltoRenderer
::
EndDocumentHandler
()
{
AppendString
(
"
\t
</Layout>
\n
</alto>
\n
"
);
alto_str
+=
"
\"
/>
\n
"
;
return
true
;
}
wcnt
++
;
TessAltoRenderer
::
TessAltoRenderer
(
const
char
*
outputbase
)
:
TessResultRenderer
(
outputbase
,
"xml"
)
{
if
(
last_word_in_line
)
{
alto_str
+=
"
\t\t\t\t\t
</TextLine>
\n
"
;
lcnt
++
;
}
///
/// Make an XML-formatted string with ALTO markup from the internal
/// data structures.
///
char
*
TessBaseAPI
::
GetAltoText
(
int
page_number
)
{
return
GetAltoText
(
nullptr
,
page_number
);
if
(
last_word_in_block
)
{
alto_str
+=
"
\t\t\t\t
</TextBlock>
\n
"
;
bcnt
++
;
}
}
///
/// Make an XML-formatted string with ALTO markup from the internal
/// data structures.
///
char
*
TessBaseAPI
::
GetAltoText
(
ETEXT_DESC
*
monitor
,
int
page_number
)
{
if
(
tesseract_
==
nullptr
||
(
page_res_
==
nullptr
&&
Recognize
(
monitor
)
<
0
))
return
nullptr
;
int
lcnt
=
0
,
bcnt
=
0
,
wcnt
=
0
;
int
page_id
=
page_number
;
STRING
alto_str
(
""
);
if
(
input_file_
==
nullptr
)
SetInputName
(
nullptr
);
#ifdef _WIN32
// convert input name from ANSI encoding to utf-8
int
str16_len
=
MultiByteToWideChar
(
CP_ACP
,
0
,
input_file_
->
string
(),
-
1
,
nullptr
,
0
);
wchar_t
*
uni16_str
=
new
WCHAR
[
str16_len
];
str16_len
=
MultiByteToWideChar
(
CP_ACP
,
0
,
input_file_
->
string
(),
-
1
,
uni16_str
,
str16_len
);
int
utf8_len
=
WideCharToMultiByte
(
CP_UTF8
,
0
,
uni16_str
,
str16_len
,
nullptr
,
0
,
nullptr
,
nullptr
);
char
*
utf8_str
=
new
char
[
utf8_len
];
WideCharToMultiByte
(
CP_UTF8
,
0
,
uni16_str
,
str16_len
,
utf8_str
,
utf8_len
,
nullptr
,
nullptr
);
*
input_file_
=
utf8_str
;
delete
[]
uni16_str
;
delete
[]
utf8_str
;
#endif
alto_str
+=
"
\t\t
<Page WIDTH=
\"
"
;
alto_str
.
add_str_int
(
""
,
rect_width_
);
alto_str
+=
"
\"
HEIGHT=
\"
"
;
alto_str
.
add_str_int
(
""
,
rect_height_
);
alto_str
+=
"
\"
PHYSICAL_IMG_NR=
\"
"
;
alto_str
.
add_str_int
(
""
,
rect_height_
);
alto_str
+=
"
\"
"
;
AddIdToAlto
(
&
alto_str
,
"page"
,
page_id
);
alto_str
+=
">
\n
"
;
alto_str
+=
(
"
\t\t\t
<PrintSpace HPOS=
\"
0
\"
"
"VPOS=
\"
0
\"
"
" WIDTH=
\"
"
);
alto_str
.
add_str_int
(
""
,
rect_width_
);
alto_str
+=
"
\"
HEIGHT=
\"
"
;
alto_str
.
add_str_int
(
""
,
rect_height_
);
alto_str
+=
"
\"
>
\n
"
;
ResultIterator
*
res_it
=
GetIterator
();
while
(
!
res_it
->
Empty
(
RIL_BLOCK
))
{
if
(
res_it
->
Empty
(
RIL_WORD
))
{
res_it
->
Next
(
RIL_WORD
);
continue
;
}
if
(
res_it
->
IsAtBeginningOf
(
RIL_BLOCK
))
{
alto_str
+=
"
\t\t\t\t
<TextBlock "
;
AddIdToAlto
(
&
alto_str
,
"block"
,
bcnt
);
AddBoxToAlto
(
res_it
,
RIL_BLOCK
,
&
alto_str
);
alto_str
+=
"
\n
"
;
}
if
(
res_it
->
IsAtBeginningOf
(
RIL_TEXTLINE
))
{
alto_str
+=
"
\t\t\t\t\t
<TextLine "
;
AddIdToAlto
(
&
alto_str
,
"line"
,
lcnt
);
AddBoxToAlto
(
res_it
,
RIL_TEXTLINE
,
&
alto_str
);
alto_str
+=
"
\n
"
;
}
alto_str
+=
"
\t\t\t\t\t\t
<String "
;
AddIdToAlto
(
&
alto_str
,
"string"
,
wcnt
);
AddBoxToAlto
(
res_it
,
RIL_WORD
,
&
alto_str
);
alto_str
+=
" CONTENT=
\"
"
;
bool
last_word_in_line
=
res_it
->
IsAtFinalElement
(
RIL_TEXTLINE
,
RIL_WORD
);
bool
last_word_in_block
=
res_it
->
IsAtFinalElement
(
RIL_BLOCK
,
RIL_WORD
);
do
{
const
std
::
unique_ptr
<
const
char
[]
>
grapheme
(
res_it
->
GetUTF8Text
(
RIL_SYMBOL
));
if
(
grapheme
&&
grapheme
[
0
]
!=
0
)
{
alto_str
+=
HOcrEscape
(
grapheme
.
get
());
}
res_it
->
Next
(
RIL_SYMBOL
);
}
while
(
!
res_it
->
Empty
(
RIL_BLOCK
)
&&
!
res_it
->
IsAtBeginningOf
(
RIL_WORD
));
alto_str
+=
"
\"
/>
\n
"
;
wcnt
++
;
if
(
last_word_in_line
)
{
alto_str
+=
"
\t\t\t\t\t
</TextLine>
\n
"
;
lcnt
++
;
}
if
(
last_word_in_block
)
{
alto_str
+=
"
\t\t\t\t
</TextBlock>
\n
"
;
bcnt
++
;
}
}
alto_str
+=
"
\t\t\t
</PrintSpace>
\n
"
;
alto_str
+=
"
\t\t
</Page>
\n
"
;
char
*
ret
=
new
char
[
alto_str
.
length
()
+
1
];
strcpy
(
ret
,
alto_str
.
string
());
delete
res_it
;
return
ret
;
}
alto_str
+=
"
\t\t\t
</PrintSpace>
\n
"
;
alto_str
+=
"
\t\t
</Page>
\n
"
;
char
*
ret
=
new
char
[
alto_str
.
length
()
+
1
];
strcpy
(
ret
,
alto_str
.
string
());
delete
res_it
;
return
ret
;
}
}
// namespace tesseract
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录