Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
a7660331a
tesseract
提交
63c12a9e
T
tesseract
项目概览
a7660331a
/
tesseract
与 Fork 源项目一致
从无法访问的项目Fork
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
tesseract
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
63c12a9e
编写于
8月 08, 2021
作者:
S
Stefan Weil
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
unittest: Enable more code for tatweel_test without requiring Tensorflow
Signed-off-by:
N
Stefan Weil
<
sw@weilnetz.de
>
上级
c1180a8b
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
34 addition
and
13 deletion
+34
-13
Makefile.am
Makefile.am
+1
-3
unittest/include_gunit.h
unittest/include_gunit.h
+3
-2
unittest/syntaxnet/base.h
unittest/syntaxnet/base.h
+9
-0
unittest/tatweel_test.cc
unittest/tatweel_test.cc
+3
-7
unittest/util/utf8/unicodetext.cc
unittest/util/utf8/unicodetext.cc
+15
-0
unittest/util/utf8/unilib_utf8_utils.h
unittest/util/utf8/unilib_utf8_utils.h
+3
-1
未找到文件。
Makefile.am
浏览文件 @
63c12a9e
...
...
@@ -1156,9 +1156,9 @@ unittest_CPPFLAGS += $(pangocairo_CFLAGS)
endif
# ENABLE_TRAINING
unittest_CPPFLAGS
+=
-I
$(top_srcdir)
/src/viewer
unittest_CPPFLAGS
+=
-I
$(top_srcdir)
/src/wordrec
unittest_CPPFLAGS
+=
-I
$(top_srcdir)
/unittest
if
TENSORFLOW
unittest_CPPFLAGS
+=
-DINCLUDE_TENSORFLOW
unittest_CPPFLAGS
+=
-I
$(top_srcdir)
/unittest
unittest_CPPFLAGS
+=
-I
/usr/include/tensorflow
endif
# TENSORFLOW
...
...
@@ -1536,11 +1536,9 @@ tabvector_test_CPPFLAGS = $(unittest_CPPFLAGS)
tabvector_test_LDADD
=
$(TESS_LIBS)
tatweel_test_SOURCES
=
unittest/tatweel_test.cc
if
TENSORFLOW
tatweel_test_SOURCES
+=
unittest/third_party/utf/rune.c
tatweel_test_SOURCES
+=
unittest/util/utf8/unicodetext.cc
tatweel_test_SOURCES
+=
unittest/util/utf8/unilib.cc
endif
# TENSORFLOW
tatweel_test_CPPFLAGS
=
$(unittest_CPPFLAGS)
tatweel_test_LDADD
=
$(TRAINING_LIBS)
...
...
unittest/include_gunit.h
浏览文件 @
63c12a9e
...
...
@@ -18,11 +18,11 @@
#include "gtest/gtest.h"
#include "log.h" // for LOG
const
char
*
FLAGS_test_tmpdir
=
"./tmp"
;
static
const
char
*
FLAGS_test_tmpdir
=
"./tmp"
;
namespace
tesseract
{
void
trim
(
std
::
string
&
s
)
{
static
inline
void
trim
(
std
::
string
&
s
)
{
s
.
erase
(
s
.
begin
(),
std
::
find_if
(
s
.
begin
(),
s
.
end
(),
[](
unsigned
char
ch
)
{
return
!
std
::
isspace
(
ch
);
}));
...
...
@@ -77,6 +77,7 @@ public:
if (!(condition)) \
LOG(FATAL) << "Check failed: " #condition " "
# define CHECK_EQ(test, value) CHECK((test) == (value))
# define CHECK_GE(test, value) CHECK((test) >= (value))
# define CHECK_GT(test, value) CHECK((test) > (value))
# define CHECK_LT(test, value) CHECK((test) < (value))
# define CHECK_LE(test, value) CHECK((test) <= (value))
...
...
unittest/syntaxnet/base.h
浏览文件 @
63c12a9e
...
...
@@ -16,12 +16,15 @@ limitations under the License.
#ifndef SYNTAXNET_BASE_H_
#define SYNTAXNET_BASE_H_
#include <map>
#include <functional>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#ifdef INCLUDE_TENSORFLOW
#include "google/protobuf/util/message_differencer.h"
#include "tensorflow/core/lib/core/status.h"
...
...
@@ -31,11 +34,14 @@ limitations under the License.
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/protobuf.h"
#endif
using
std
::
map
;
using
std
::
pair
;
using
std
::
unordered_map
;
using
std
::
unordered_set
;
using
std
::
vector
;
#ifdef INCLUDE_TENSORFLOW
using
tensorflow
::
int16
;
using
tensorflow
::
int32
;
using
tensorflow
::
int64
;
...
...
@@ -47,10 +53,13 @@ using tensorflow::uint32;
using
tensorflow
::
uint64
;
using
tensorflow
::
uint8
;
using
tensorflow
::
protobuf
::
TextFormat
;
#endif
typedef
signed
int
char32
;
using
std
::
string
;
#ifdef INCLUDE_TENSORFLOW
using
tensorflow
::
StringPiece
;
#endif
// namespace syntaxnet
...
...
unittest/tatweel_test.cc
浏览文件 @
63c12a9e
...
...
@@ -19,9 +19,7 @@
#include "include_gunit.h"
#include "trie.h"
#include "unicharset.h"
#ifdef INCLUDE_TENSORFLOW
# include "util/utf8/unicodetext.h" // for UnicodeText
#endif
#include "util/utf8/unicodetext.h" // for UnicodeText
namespace
tesseract
{
...
...
@@ -42,10 +40,9 @@ protected:
}
TatweelTest
()
{
#ifdef INCLUDE_TENSORFLOW
std
::
string
filename
=
TestDataNameToPath
(
"ara.wordlist"
);
if
(
file_exists
(
filename
.
c_str
()))
{
std
::
string
wordlist
(
u8
"\u0640"
);
std
::
string
wordlist
(
"\u0640"
);
CHECK_OK
(
file
::
GetContents
(
filename
,
&
wordlist
,
file
::
Defaults
()));
// Put all the unicodes in the unicharset_.
UnicodeText
text
;
...
...
@@ -53,14 +50,13 @@ protected:
int
num_tatweel
=
0
;
for
(
auto
it
=
text
.
begin
();
it
!=
text
.
end
();
++
it
)
{
std
::
string
utf8
=
it
.
get_utf8_string
();
if
(
utf8
.
find
(
u8
"\u0640"
)
!=
std
::
string
::
npos
)
if
(
utf8
.
find
(
"\u0640"
)
!=
std
::
string
::
npos
)
++
num_tatweel
;
unicharset_
.
unichar_insert
(
utf8
.
c_str
());
}
LOG
(
INFO
)
<<
"Num tatweels in source data="
<<
num_tatweel
;
EXPECT_GT
(
num_tatweel
,
0
);
}
#endif
}
std
::
string
TestDataNameToPath
(
const
std
::
string
&
name
)
{
...
...
unittest/util/utf8/unicodetext.cc
浏览文件 @
63c12a9e
...
...
@@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "include_gunit.h"
#include "util/utf8/unicodetext.h"
#include <string.h> // for memcpy, NULL, memcmp, etc
...
...
@@ -172,10 +173,12 @@ void UnicodeText::Repr::append(const char *bytes, int byte_length) {
size_
+=
byte_length
;
}
#ifdef INCLUDE_TENSORFLOW
string
UnicodeText
::
Repr
::
DebugString
()
const
{
return
tensorflow
::
strings
::
Printf
(
"{Repr %p data=%p size=%d capacity=%d %s}"
,
this
,
data_
,
size_
,
capacity_
,
ours_
?
"Owned"
:
"Alias"
);
}
#endif
// *************** UnicodeText ******************
...
...
@@ -310,17 +313,24 @@ UnicodeText::const_iterator UnicodeText::UnsafeFind(const UnicodeText &look,
const_iterator
start_pos
)
const
{
// Due to the magic of the UTF8 encoding, searching for a sequence of
// letters is equivalent to substring search.
#ifdef INCLUDE_TENSORFLOW
StringPiece
searching
(
utf8_data
(),
utf8_length
());
StringPiece
look_piece
(
look
.
utf8_data
(),
look
.
utf8_length
());
#endif
LOG
(
FATAL
)
<<
"Not implemented"
;
#ifdef INCLUDE_TENSORFLOW
// StringPiece::size_type found =
// searching.find(look_piece, start_pos.utf8_data() - utf8_data());
StringPiece
::
size_type
found
=
StringPiece
::
npos
;
if
(
found
==
StringPiece
::
npos
)
return
end
();
return
const_iterator
(
utf8_data
()
+
found
);
#else
return
end
();
#endif
}
#ifdef INCLUDE_TENSORFLOW
bool
UnicodeText
::
HasReplacementChar
()
const
{
// Equivalent to:
// UnicodeText replacement_char;
...
...
@@ -332,6 +342,7 @@ bool UnicodeText::HasReplacementChar() const {
// return searching.find(looking_for) != StringPiece::npos;
return
false
;
}
#endif
// ----- other methods -----
...
...
@@ -371,10 +382,12 @@ bool operator==(const UnicodeText &lhs, const UnicodeText &rhs) {
return
memcmp
(
lhs
.
repr_
.
data_
,
rhs
.
repr_
.
data_
,
lhs
.
repr_
.
size_
)
==
0
;
}
#ifdef INCLUDE_TENSORFLOW
string
UnicodeText
::
DebugString
()
const
{
return
tensorflow
::
strings
::
Printf
(
"{UnicodeText %p chars=%d repr=%s}"
,
this
,
size
(),
repr_
.
DebugString
().
c_str
());
}
#endif
// ******************* UnicodeText::const_iterator *********************
...
...
@@ -479,6 +492,7 @@ UnicodeText::const_iterator UnicodeText::MakeIterator(const char *p) const {
return
const_iterator
(
p
);
}
#ifdef INCLUDE_TENSORFLOW
string
UnicodeText
::
const_iterator
::
DebugString
()
const
{
return
tensorflow
::
strings
::
Printf
(
"{iter %p}"
,
it_
);
}
...
...
@@ -492,3 +506,4 @@ string CodepointString(const UnicodeText &t) {
tensorflow
::
strings
::
Appendf
(
&
s
,
"%X "
,
*
it
++
);
return
s
;
}
#endif
unittest/util/utf8/unilib_utf8_utils.h
浏览文件 @
63c12a9e
...
...
@@ -29,13 +29,14 @@ namespace UniLib {
// (i.e., is not a surrogate codepoint). See also
// IsValidCodepoint(const char* src) in util/utf8/public/unilib.h.
inline
bool
IsValidCodepoint
(
char32
c
)
{
return
(
static_cast
<
uint32
>
(
c
)
<
0xD800
)
||
(
c
>=
0xE000
&&
c
<=
0x10FFFF
);
return
(
static_cast
<
uint32
_t
>
(
c
)
<
0xD800
)
||
(
c
>=
0xE000
&&
c
<=
0x10FFFF
);
}
// Returns true if 'str' is the start of a structurally valid UTF-8
// sequence and is not a surrogate codepoint. Returns false if str.empty()
// or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function
// will access 1-4 bytes of src, where n is UniLib::OneCharLen(src[0]).
#ifdef INCLUDE_TENSORFLOW
inline
bool
IsUTF8ValidCodepoint
(
StringPiece
str
)
{
char32
c
;
int
consumed
;
...
...
@@ -43,6 +44,7 @@ inline bool IsUTF8ValidCodepoint(StringPiece str) {
return
!
str
.
empty
()
&&
isvalidcharntorune
(
str
.
data
(),
str
.
size
(),
&
c
,
&
consumed
)
&&
IsValidCodepoint
(
c
);
}
#endif
// Returns the length (number of bytes) of the Unicode code point
// starting at src, based on inspecting just that one byte. This
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录