Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
a7660331a
tesseract
提交
7f382df5
T
tesseract
项目概览
a7660331a
/
tesseract
与 Fork 源项目一致
从无法访问的项目Fork
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
tesseract
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
7f382df5
编写于
5月 11, 2017
作者:
R
Raf Schietekat
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fewer g++ -Wsign-compare warnings (cont.)
上级
c335508e
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
48 addition
and
38 deletion
+48
-38
training/boxchar.cpp
training/boxchar.cpp
+17
-17
training/commontraining.cpp
training/commontraining.cpp
+4
-1
training/normstrngs.cpp
training/normstrngs.cpp
+7
-0
training/stringrenderer.cpp
training/stringrenderer.cpp
+9
-7
training/text2image.cpp
training/text2image.cpp
+11
-13
未找到文件。
training/boxchar.cpp
浏览文件 @
7f382df5
...
...
@@ -51,7 +51,7 @@ void BoxChar::AddBox(int x, int y, int width, int height) {
/* static */
void
BoxChar
::
TranslateBoxes
(
int
xshift
,
int
yshift
,
std
::
vector
<
BoxChar
*>*
boxes
)
{
for
(
in
t
i
=
0
;
i
<
boxes
->
size
();
++
i
)
{
for
(
size_
t
i
=
0
;
i
<
boxes
->
size
();
++
i
)
{
BOX
*
box
=
(
*
boxes
)[
i
]
->
box_
;
if
(
box
!=
nullptr
)
{
box
->
x
+=
xshift
;
...
...
@@ -68,8 +68,8 @@ void BoxChar::PrepareToWrite(std::vector<BoxChar*>* boxes) {
bool
vertical_rules
=
MostlyVertical
(
*
boxes
);
InsertNewlines
(
rtl_rules
,
vertical_rules
,
boxes
);
InsertSpaces
(
rtl_rules
,
vertical_rules
,
boxes
);
for
(
int
i
=
0
;
i
<
boxes
->
size
();
++
i
)
{
if
((
*
boxes
)[
i
]
->
box_
==
nullptr
)
tprintf
(
"Null box at index %
d
\n
"
,
i
);
for
(
unsigned
int
i
=
0
;
i
<
boxes
->
size
();
++
i
)
{
if
((
*
boxes
)[
i
]
->
box_
==
nullptr
)
tprintf
(
"Null box at index %
u
\n
"
,
i
);
}
if
(
rtl_rules
)
{
ReorderRTLText
(
boxes
);
...
...
@@ -82,16 +82,16 @@ void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules,
std
::
vector
<
BoxChar
*>*
boxes
)
{
int
prev_i
=
-
1
;
int
max_shift
=
0
;
for
(
int
i
=
0
;
i
<
boxes
->
size
();
++
i
)
{
for
(
int
i
=
0
;
static_cast
<
unsigned
int
>
(
i
)
<
boxes
->
size
();
++
i
)
{
Box
*
box
=
(
*
boxes
)[
i
]
->
box_
;
if
(
box
==
nullptr
)
{
if
(
prev_i
<
0
||
prev_i
<
i
-
1
||
i
+
1
==
boxes
->
size
())
{
if
(
prev_i
<
0
||
prev_i
<
i
-
1
||
static_cast
<
unsigned
int
>
(
i
)
+
1
==
boxes
->
size
())
{
// Erase null boxes at the start of a line and after another null box.
do
{
delete
(
*
boxes
)[
i
];
boxes
->
erase
(
boxes
->
begin
()
+
i
);
--
i
;
}
while
(
i
>=
0
&&
i
+
1
==
boxes
->
size
()
&&
}
while
(
i
>=
0
&&
static_cast
<
unsigned
int
>
(
i
)
+
1
==
boxes
->
size
()
&&
(
*
boxes
)[
i
]
->
box_
==
nullptr
);
}
continue
;
...
...
@@ -146,7 +146,7 @@ void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules,
std
::
vector
<
BoxChar
*>*
boxes
)
{
// After InsertNewlines, any remaining null boxes are not newlines, and are
// singletons, so add a box to each remaining null box.
for
(
int
i
=
1
;
i
+
1
<
boxes
->
size
();
++
i
)
{
for
(
int
i
=
1
;
static_cast
<
unsigned
int
>
(
i
)
+
1
<
boxes
->
size
();
++
i
)
{
Box
*
box
=
(
*
boxes
)[
i
]
->
box_
;
if
(
box
==
nullptr
)
{
Box
*
prev
=
(
*
boxes
)[
i
-
1
]
->
box_
;
...
...
@@ -178,8 +178,8 @@ void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules,
}
// Left becomes the max right of all next boxes forward to the first
// space or newline.
for
(
in
t
j
=
i
+
2
;
j
<
boxes
->
size
()
&&
(
*
boxes
)[
j
]
->
box_
!=
nullptr
&&
(
*
boxes
)[
j
]
->
ch_
!=
"
\t
"
;
for
(
size_
t
j
=
i
+
2
;
j
<
boxes
->
size
()
&&
(
*
boxes
)[
j
]
->
box_
!=
nullptr
&&
(
*
boxes
)[
j
]
->
ch_
!=
"
\t
"
;
++
j
)
{
next
=
(
*
boxes
)[
j
]
->
box_
;
if
(
next
->
x
+
next
->
w
>
left
)
{
...
...
@@ -203,8 +203,8 @@ void BoxChar::ReorderRTLText(std::vector<BoxChar*>* boxes) {
// After adding newlines and spaces, this task is simply a matter of sorting
// by left each group of boxes between newlines.
BoxCharPtrSort
sorter
;
in
t
end
=
0
;
for
(
in
t
start
=
0
;
start
<
boxes
->
size
();
start
=
end
+
1
)
{
size_
t
end
=
0
;
for
(
size_
t
start
=
0
;
start
<
boxes
->
size
();
start
=
end
+
1
)
{
end
=
start
+
1
;
while
(
end
<
boxes
->
size
()
&&
(
*
boxes
)[
end
]
->
ch_
!=
"
\t
"
)
++
end
;
std
::
sort
(
boxes
->
begin
()
+
start
,
boxes
->
begin
()
+
end
,
sorter
);
...
...
@@ -215,13 +215,13 @@ void BoxChar::ReorderRTLText(std::vector<BoxChar*>* boxes) {
/* static */
bool
BoxChar
::
ContainsMostlyRTL
(
const
std
::
vector
<
BoxChar
*>&
boxes
)
{
int
num_rtl
=
0
,
num_ltr
=
0
;
for
(
int
i
=
0
;
i
<
boxes
.
size
();
++
i
)
{
for
(
unsigned
int
i
=
0
;
i
<
boxes
.
size
();
++
i
)
{
// Convert the unichar to UTF32 representation
GenericVector
<
char32
>
uni_vector
;
if
(
!
UNICHAR
::
UTF8ToUnicode
(
boxes
[
i
]
->
ch_
.
c_str
(),
&
uni_vector
))
{
tprintf
(
"Illegal utf8 in boxchar %
d
string:%s = "
,
i
,
tprintf
(
"Illegal utf8 in boxchar %
u
string:%s = "
,
i
,
boxes
[
i
]
->
ch_
.
c_str
());
for
(
in
t
c
=
0
;
c
<
boxes
[
i
]
->
ch_
.
size
();
++
c
)
{
for
(
size_
t
c
=
0
;
c
<
boxes
[
i
]
->
ch_
.
size
();
++
c
)
{
tprintf
(
" 0x%x"
,
boxes
[
i
]
->
ch_
[
c
]);
}
tprintf
(
"
\n
"
);
...
...
@@ -244,7 +244,7 @@ bool BoxChar::ContainsMostlyRTL(const std::vector<BoxChar*>& boxes) {
/* static */
bool
BoxChar
::
MostlyVertical
(
const
std
::
vector
<
BoxChar
*>&
boxes
)
{
inT64
total_dx
=
0
,
total_dy
=
0
;
for
(
in
t
i
=
1
;
i
<
boxes
.
size
();
++
i
)
{
for
(
size_
t
i
=
1
;
i
<
boxes
.
size
();
++
i
)
{
if
(
boxes
[
i
-
1
]
->
box_
!=
nullptr
&&
boxes
[
i
]
->
box_
!=
nullptr
&&
boxes
[
i
-
1
]
->
page_
==
boxes
[
i
]
->
page_
)
{
int
dx
=
boxes
[
i
]
->
box_
->
x
-
boxes
[
i
-
1
]
->
box_
->
x
;
...
...
@@ -263,7 +263,7 @@ bool BoxChar::MostlyVertical(const std::vector<BoxChar*>& boxes) {
/* static */
int
BoxChar
::
TotalByteLength
(
const
std
::
vector
<
BoxChar
*>&
boxes
)
{
int
total_length
=
0
;
for
(
in
t
i
=
0
;
i
<
boxes
.
size
();
++
i
)
total_length
+=
boxes
[
i
]
->
ch_
.
size
();
for
(
size_
t
i
=
0
;
i
<
boxes
.
size
();
++
i
)
total_length
+=
boxes
[
i
]
->
ch_
.
size
();
return
total_length
;
}
...
...
@@ -302,7 +302,7 @@ string BoxChar::GetTesseractBoxStr(int height,
const
std
::
vector
<
BoxChar
*>&
boxes
)
{
string
output
;
char
buffer
[
kMaxLineLength
];
for
(
in
t
i
=
0
;
i
<
boxes
.
size
();
++
i
)
{
for
(
size_
t
i
=
0
;
i
<
boxes
.
size
();
++
i
)
{
const
Box
*
box
=
boxes
[
i
]
->
box_
;
if
(
box
==
nullptr
)
{
tprintf
(
"Error: Call PrepareToWrite before WriteTesseractBoxFile!!
\n
"
);
...
...
training/commontraining.cpp
浏览文件 @
7f382df5
...
...
@@ -35,6 +35,7 @@
#include "tprintf.h"
#include "unicity_table.h"
#include <assert.h>
#include <math.h>
using
tesseract
::
CCUtil
;
...
...
@@ -368,7 +369,9 @@ void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs,
LABELEDLIST
char_sample
;
FEATURE_SET
feature_samples
;
CHAR_DESC
char_desc
;
int
feature_type
=
ShortNameToFeatureType
(
feature_defs
,
feature_name
);
int
ShortNameToFeatureType_res
=
ShortNameToFeatureType
(
feature_defs
,
feature_name
);
assert
(
0
<=
ShortNameToFeatureType_res
);
unsigned
int
feature_type
=
static_cast
<
unsigned
int
>
(
ShortNameToFeatureType_res
);
// Zero out the font_sample_count for all the classes.
LIST
it
=
*
training_samples
;
iterate
(
it
)
{
...
...
training/normstrngs.cpp
浏览文件 @
7f382df5
...
...
@@ -20,6 +20,7 @@
#include "normstrngs.h"
#include <assert.h>
#include "icuerrorcode.h"
#include "unichar.h"
#include "unicode/normalizer2.h" // From libicu
...
...
@@ -181,7 +182,13 @@ bool IsWhitespace(const char32 ch) {
}
bool
IsUTF8Whitespace
(
const
char
*
text
)
{
#if 0 // intent
return SpanUTF8Whitespace(text) == strlen(text);
#else
// avoiding g++ -Wsign-compare warning
const
int
res
=
SpanUTF8Whitespace
(
text
);
assert
(
0
<=
res
);
return
static_cast
<
unsigned
int
>
(
res
)
==
strlen
(
text
);
#endif
}
int
SpanUTF8Whitespace
(
const
char
*
text
)
{
...
...
training/stringrenderer.cpp
浏览文件 @
7f382df5
...
...
@@ -20,6 +20,7 @@
#include "stringrenderer.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <algorithm>
...
...
@@ -241,7 +242,7 @@ void StringRenderer::SetWordUnderlineAttributes(const string& page_text) {
PangoAttrList
*
attr_list
=
pango_layout_get_attributes
(
layout_
);
const
char
*
text
=
page_text
.
c_str
();
in
t
offset
=
0
;
size_
t
offset
=
0
;
TRand
rand
;
bool
started_underline
=
false
;
PangoAttribute
*
und_attr
=
nullptr
;
...
...
@@ -341,7 +342,7 @@ void StringRenderer::RotatePageBoxes(float rotation) {
void
StringRenderer
::
ClearBoxes
()
{
for
(
in
t
i
=
0
;
i
<
boxchars_
.
size
();
++
i
)
for
(
size_
t
i
=
0
;
i
<
boxchars_
.
size
();
++
i
)
delete
boxchars_
[
i
];
boxchars_
.
clear
();
boxaDestroy
(
&
page_boxes_
);
...
...
@@ -416,7 +417,7 @@ bool StringRenderer::GetClusterStrings(std::vector<string>* cluster_text) {
static
void
MergeBoxCharsToWords
(
std
::
vector
<
BoxChar
*>*
boxchars
)
{
std
::
vector
<
BoxChar
*>
result
;
bool
started_word
=
false
;
for
(
in
t
i
=
0
;
i
<
boxchars
->
size
();
++
i
)
{
for
(
size_
t
i
=
0
;
i
<
boxchars
->
size
();
++
i
)
{
if
(
boxchars
->
at
(
i
)
->
ch
()
==
" "
||
boxchars
->
at
(
i
)
->
box
()
==
nullptr
)
{
result
.
push_back
(
boxchars
->
at
(
i
));
boxchars
->
at
(
i
)
=
nullptr
;
...
...
@@ -480,7 +481,7 @@ void StringRenderer::ComputeClusterBoxes() {
// Sort the indices and create a map from start to end indices.
std
::
sort
(
cluster_start_indices
.
begin
(),
cluster_start_indices
.
end
());
std
::
map
<
int
,
int
>
cluster_start_to_end_index
;
for
(
int
i
=
0
;
i
<
cluster_start_indices
.
size
()
-
1
;
++
i
)
{
for
(
size_t
i
=
0
;
i
+
1
<
cluster_start_indices
.
size
()
;
++
i
)
{
cluster_start_to_end_index
[
cluster_start_indices
[
i
]]
=
cluster_start_indices
[
i
+
1
];
}
...
...
@@ -592,7 +593,7 @@ void StringRenderer::ComputeClusterBoxes() {
// Compute the page bounding box
Box
*
page_box
=
nullptr
;
Boxa
*
all_boxes
=
nullptr
;
for
(
in
t
i
=
0
;
i
<
page_boxchars
.
size
();
++
i
)
{
for
(
size_
t
i
=
0
;
i
<
page_boxchars
.
size
();
++
i
)
{
if
(
page_boxchars
[
i
]
->
box
()
==
nullptr
)
continue
;
if
(
all_boxes
==
nullptr
)
all_boxes
=
boxaCreate
(
0
);
boxaAddBox
(
all_boxes
,
page_boxchars
[
i
]
->
mutable_box
(),
L_CLONE
);
...
...
@@ -622,7 +623,7 @@ void StringRenderer::CorrectBoxPositionsToLayout(
int
StringRenderer
::
StripUnrenderableWords
(
string
*
utf8_text
)
const
{
string
output_text
;
const
char
*
text
=
utf8_text
->
c_str
();
in
t
offset
=
0
;
size_
t
offset
=
0
;
int
num_dropped
=
0
;
while
(
offset
<
utf8_text
->
length
())
{
int
space_len
=
SpanUTF8Whitespace
(
text
+
offset
);
...
...
@@ -866,7 +867,8 @@ int StringRenderer::RenderAllFontsToImage(double min_coverage,
tprintf
(
"Total chars = %d
\n
"
,
total_chars_
);
}
const
std
::
vector
<
string
>&
all_fonts
=
FontUtils
::
ListAvailableFonts
();
for
(
int
i
=
font_index_
;
i
<
all_fonts
.
size
();
++
i
)
{
assert
(
0
<=
font_index_
);
for
(
unsigned
int
i
=
static_cast
<
unsigned
int
>
(
font_index_
);
i
<
all_fonts
.
size
();
++
i
)
{
++
font_index_
;
int
raw_score
=
0
;
int
ok_chars
=
...
...
training/text2image.cpp
浏览文件 @
7f382df5
...
...
@@ -190,14 +190,12 @@ static bool IsWhitespaceBox(const BoxChar* boxchar) {
static
string
StringReplace
(
const
string
&
in
,
const
string
&
oldsub
,
const
string
&
newsub
)
{
string
out
;
int
start_pos
=
0
;
do
{
int
pos
=
in
.
find
(
oldsub
,
start_pos
);
if
(
pos
==
string
::
npos
)
break
;
size_t
start_pos
=
0
,
pos
;
while
((
pos
=
in
.
find
(
oldsub
,
start_pos
))
!=
string
::
npos
)
{
out
.
append
(
in
.
data
()
+
start_pos
,
pos
-
start_pos
);
out
.
append
(
newsub
.
data
(),
newsub
.
length
());
start_pos
=
pos
+
oldsub
.
length
();
}
while
(
true
);
}
out
.
append
(
in
.
data
()
+
start_pos
,
in
.
length
()
-
start_pos
);
return
out
;
}
...
...
@@ -239,7 +237,7 @@ void ExtractFontProperties(const string &utf8_text,
offset
-=
boxes
[
boxes
.
size
()
-
1
]
->
ch
().
size
();
}
for
(
in
t
b
=
0
;
b
<
boxes
.
size
();
b
+=
2
)
{
for
(
size_
t
b
=
0
;
b
<
boxes
.
size
();
b
+=
2
)
{
while
(
b
<
boxes
.
size
()
&&
IsWhitespaceBox
(
boxes
[
b
]))
++
b
;
if
(
b
+
1
>=
boxes
.
size
())
break
;
const
string
&
ch0
=
boxes
[
b
]
->
ch
();
...
...
@@ -422,8 +420,8 @@ int main(int argc, char** argv) {
if
(
FLAGS_list_available_fonts
)
{
const
std
::
vector
<
string
>&
all_fonts
=
FontUtils
::
ListAvailableFonts
();
for
(
int
i
=
0
;
i
<
all_fonts
.
size
();
++
i
)
{
printf
(
"%3
d
: %s
\n
"
,
i
,
all_fonts
[
i
].
c_str
());
for
(
unsigned
int
i
=
0
;
i
<
all_fonts
.
size
();
++
i
)
{
printf
(
"%3
u
: %s
\n
"
,
i
,
all_fonts
[
i
].
c_str
());
ASSERT_HOST_MSG
(
FontUtils
::
IsAvailableFont
(
all_fonts
[
i
].
c_str
()),
"Font %s is unrecognized.
\n
"
,
all_fonts
[
i
].
c_str
());
}
...
...
@@ -517,10 +515,10 @@ int main(int argc, char** argv) {
// Try to preserve behavior of old text2image by expanding inter-word
// spaces by a factor of 4.
const
string
kSeparator
=
FLAGS_render_ngrams
?
" "
:
" "
;
// Also restrict the number of charactes per line to try and avoid
// Also restrict the number of characte
r
s per line to try and avoid
// line-breaking in the middle of words like "-A", "R$" etc. which are
// otherwise allowed by the standard unicode line-breaking rules.
const
int
kCharsPerLine
=
(
FLAGS_ptsize
>
20
)
?
50
:
100
;
const
unsigned
int
kCharsPerLine
=
(
FLAGS_ptsize
>
20
)
?
50
:
100
;
string
rand_utf8
;
UNICHARSET
unicharset
;
if
(
FLAGS_render_ngrams
&&
!
FLAGS_unicharset_file
.
empty
()
&&
...
...
@@ -547,7 +545,7 @@ int main(int argc, char** argv) {
if
(
FLAGS_render_ngrams
)
std
::
random_shuffle
(
offsets
.
begin
(),
offsets
.
end
());
for
(
in
t
i
=
0
,
line
=
1
;
i
<
offsets
.
size
();
++
i
)
{
for
(
size_
t
i
=
0
,
line
=
1
;
i
<
offsets
.
size
();
++
i
)
{
const
char
*
curr_pos
=
str8
+
offsets
[
i
].
first
;
int
ngram_len
=
offsets
[
i
].
second
;
// Skip words that contain characters not in found in unicharset.
...
...
@@ -588,7 +586,7 @@ int main(int argc, char** argv) {
for
(
int
pass
=
0
;
pass
<
num_pass
;
++
pass
)
{
int
page_num
=
0
;
string
font_used
;
for
(
in
t
offset
=
0
;
offset
<
strlen
(
to_render_utf8
);
++
im
,
++
page_num
)
{
for
(
size_
t
offset
=
0
;
offset
<
strlen
(
to_render_utf8
);
++
im
,
++
page_num
)
{
tlog
(
1
,
"Starting page %d
\n
"
,
im
);
Pix
*
pix
=
nullptr
;
if
(
FLAGS_find_fonts
)
{
...
...
@@ -664,7 +662,7 @@ int main(int argc, char** argv) {
if
(
fp
==
nullptr
)
{
tprintf
(
"Failed to create output font list %s
\n
"
,
filename
.
c_str
());
}
else
{
for
(
in
t
i
=
0
;
i
<
font_names
.
size
();
++
i
)
{
for
(
size_
t
i
=
0
;
i
<
font_names
.
size
();
++
i
)
{
fprintf
(
fp
,
"%s
\n
"
,
font_names
[
i
].
c_str
());
}
fclose
(
fp
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录