提交 63c12a9e 编写于 作者: Stefan Weil

unittest: Enable more code for tatweel_test without requiring Tensorflow

Signed-off-by: Stefan Weil <sw@weilnetz.de>
上级 c1180a8b
...@@ -1156,9 +1156,9 @@ unittest_CPPFLAGS += $(pangocairo_CFLAGS) ...@@ -1156,9 +1156,9 @@ unittest_CPPFLAGS += $(pangocairo_CFLAGS)
endif # ENABLE_TRAINING endif # ENABLE_TRAINING
unittest_CPPFLAGS += -I$(top_srcdir)/src/viewer unittest_CPPFLAGS += -I$(top_srcdir)/src/viewer
unittest_CPPFLAGS += -I$(top_srcdir)/src/wordrec unittest_CPPFLAGS += -I$(top_srcdir)/src/wordrec
unittest_CPPFLAGS += -I$(top_srcdir)/unittest
if TENSORFLOW if TENSORFLOW
unittest_CPPFLAGS += -DINCLUDE_TENSORFLOW unittest_CPPFLAGS += -DINCLUDE_TENSORFLOW
unittest_CPPFLAGS += -I$(top_srcdir)/unittest
unittest_CPPFLAGS += -I/usr/include/tensorflow unittest_CPPFLAGS += -I/usr/include/tensorflow
endif # TENSORFLOW endif # TENSORFLOW
...@@ -1536,11 +1536,9 @@ tabvector_test_CPPFLAGS = $(unittest_CPPFLAGS) ...@@ -1536,11 +1536,9 @@ tabvector_test_CPPFLAGS = $(unittest_CPPFLAGS)
tabvector_test_LDADD = $(TESS_LIBS) tabvector_test_LDADD = $(TESS_LIBS)
tatweel_test_SOURCES = unittest/tatweel_test.cc tatweel_test_SOURCES = unittest/tatweel_test.cc
if TENSORFLOW
tatweel_test_SOURCES += unittest/third_party/utf/rune.c tatweel_test_SOURCES += unittest/third_party/utf/rune.c
tatweel_test_SOURCES += unittest/util/utf8/unicodetext.cc tatweel_test_SOURCES += unittest/util/utf8/unicodetext.cc
tatweel_test_SOURCES += unittest/util/utf8/unilib.cc tatweel_test_SOURCES += unittest/util/utf8/unilib.cc
endif # TENSORFLOW
tatweel_test_CPPFLAGS = $(unittest_CPPFLAGS) tatweel_test_CPPFLAGS = $(unittest_CPPFLAGS)
tatweel_test_LDADD = $(TRAINING_LIBS) tatweel_test_LDADD = $(TRAINING_LIBS)
......
...@@ -18,11 +18,11 @@ ...@@ -18,11 +18,11 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "log.h" // for LOG #include "log.h" // for LOG
const char *FLAGS_test_tmpdir = "./tmp"; static const char *FLAGS_test_tmpdir = "./tmp";
namespace tesseract { namespace tesseract {
void trim(std::string &s) { static inline void trim(std::string &s) {
s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) { s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
return !std::isspace(ch); return !std::isspace(ch);
})); }));
...@@ -77,6 +77,7 @@ public: ...@@ -77,6 +77,7 @@ public:
if (!(condition)) \ if (!(condition)) \
LOG(FATAL) << "Check failed: " #condition " " LOG(FATAL) << "Check failed: " #condition " "
# define CHECK_EQ(test, value) CHECK((test) == (value)) # define CHECK_EQ(test, value) CHECK((test) == (value))
# define CHECK_GE(test, value) CHECK((test) >= (value))
# define CHECK_GT(test, value) CHECK((test) > (value)) # define CHECK_GT(test, value) CHECK((test) > (value))
# define CHECK_LT(test, value) CHECK((test) < (value)) # define CHECK_LT(test, value) CHECK((test) < (value))
# define CHECK_LE(test, value) CHECK((test) <= (value)) # define CHECK_LE(test, value) CHECK((test) <= (value))
......
...@@ -16,12 +16,15 @@ limitations under the License. ...@@ -16,12 +16,15 @@ limitations under the License.
#ifndef SYNTAXNET_BASE_H_ #ifndef SYNTAXNET_BASE_H_
#define SYNTAXNET_BASE_H_ #define SYNTAXNET_BASE_H_
#include <map>
#include <functional> #include <functional>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#ifdef INCLUDE_TENSORFLOW
#include "google/protobuf/util/message_differencer.h" #include "google/protobuf/util/message_differencer.h"
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status.h"
...@@ -31,11 +34,14 @@ limitations under the License. ...@@ -31,11 +34,14 @@ limitations under the License.
#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/protobuf.h"
#endif
using std::map; using std::map;
using std::pair; using std::pair;
using std::unordered_map; using std::unordered_map;
using std::unordered_set; using std::unordered_set;
using std::vector; using std::vector;
#ifdef INCLUDE_TENSORFLOW
using tensorflow::int16; using tensorflow::int16;
using tensorflow::int32; using tensorflow::int32;
using tensorflow::int64; using tensorflow::int64;
...@@ -47,10 +53,13 @@ using tensorflow::uint32; ...@@ -47,10 +53,13 @@ using tensorflow::uint32;
using tensorflow::uint64; using tensorflow::uint64;
using tensorflow::uint8; using tensorflow::uint8;
using tensorflow::protobuf::TextFormat; using tensorflow::protobuf::TextFormat;
#endif
typedef signed int char32; typedef signed int char32;
using std::string; using std::string;
#ifdef INCLUDE_TENSORFLOW
using tensorflow::StringPiece; using tensorflow::StringPiece;
#endif
// namespace syntaxnet // namespace syntaxnet
......
...@@ -19,9 +19,7 @@ ...@@ -19,9 +19,7 @@
#include "include_gunit.h" #include "include_gunit.h"
#include "trie.h" #include "trie.h"
#include "unicharset.h" #include "unicharset.h"
#ifdef INCLUDE_TENSORFLOW #include "util/utf8/unicodetext.h" // for UnicodeText
# include "util/utf8/unicodetext.h" // for UnicodeText
#endif
namespace tesseract { namespace tesseract {
...@@ -42,10 +40,9 @@ protected: ...@@ -42,10 +40,9 @@ protected:
} }
TatweelTest() { TatweelTest() {
#ifdef INCLUDE_TENSORFLOW
std::string filename = TestDataNameToPath("ara.wordlist"); std::string filename = TestDataNameToPath("ara.wordlist");
if (file_exists(filename.c_str())) { if (file_exists(filename.c_str())) {
std::string wordlist(u8"\u0640"); std::string wordlist("\u0640");
CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults())); CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults()));
// Put all the unicodes in the unicharset_. // Put all the unicodes in the unicharset_.
UnicodeText text; UnicodeText text;
...@@ -53,14 +50,13 @@ protected: ...@@ -53,14 +50,13 @@ protected:
int num_tatweel = 0; int num_tatweel = 0;
for (auto it = text.begin(); it != text.end(); ++it) { for (auto it = text.begin(); it != text.end(); ++it) {
std::string utf8 = it.get_utf8_string(); std::string utf8 = it.get_utf8_string();
if (utf8.find(u8"\u0640") != std::string::npos) if (utf8.find("\u0640") != std::string::npos)
++num_tatweel; ++num_tatweel;
unicharset_.unichar_insert(utf8.c_str()); unicharset_.unichar_insert(utf8.c_str());
} }
LOG(INFO) << "Num tatweels in source data=" << num_tatweel; LOG(INFO) << "Num tatweels in source data=" << num_tatweel;
EXPECT_GT(num_tatweel, 0); EXPECT_GT(num_tatweel, 0);
} }
#endif
} }
std::string TestDataNameToPath(const std::string &name) { std::string TestDataNameToPath(const std::string &name) {
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
* limitations under the License. * limitations under the License.
*/ */
#include "include_gunit.h"
#include "util/utf8/unicodetext.h" #include "util/utf8/unicodetext.h"
#include <string.h> // for memcpy, NULL, memcmp, etc #include <string.h> // for memcpy, NULL, memcmp, etc
...@@ -172,10 +173,12 @@ void UnicodeText::Repr::append(const char *bytes, int byte_length) { ...@@ -172,10 +173,12 @@ void UnicodeText::Repr::append(const char *bytes, int byte_length) {
size_ += byte_length; size_ += byte_length;
} }
#ifdef INCLUDE_TENSORFLOW
string UnicodeText::Repr::DebugString() const { string UnicodeText::Repr::DebugString() const {
return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", this, data_, size_, return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", this, data_, size_,
capacity_, ours_ ? "Owned" : "Alias"); capacity_, ours_ ? "Owned" : "Alias");
} }
#endif
// *************** UnicodeText ****************** // *************** UnicodeText ******************
...@@ -310,17 +313,24 @@ UnicodeText::const_iterator UnicodeText::UnsafeFind(const UnicodeText &look, ...@@ -310,17 +313,24 @@ UnicodeText::const_iterator UnicodeText::UnsafeFind(const UnicodeText &look,
const_iterator start_pos) const { const_iterator start_pos) const {
// Due to the magic of the UTF8 encoding, searching for a sequence of // Due to the magic of the UTF8 encoding, searching for a sequence of
// letters is equivalent to substring search. // letters is equivalent to substring search.
#ifdef INCLUDE_TENSORFLOW
StringPiece searching(utf8_data(), utf8_length()); StringPiece searching(utf8_data(), utf8_length());
StringPiece look_piece(look.utf8_data(), look.utf8_length()); StringPiece look_piece(look.utf8_data(), look.utf8_length());
#endif
LOG(FATAL) << "Not implemented"; LOG(FATAL) << "Not implemented";
#ifdef INCLUDE_TENSORFLOW
// StringPiece::size_type found = // StringPiece::size_type found =
// searching.find(look_piece, start_pos.utf8_data() - utf8_data()); // searching.find(look_piece, start_pos.utf8_data() - utf8_data());
StringPiece::size_type found = StringPiece::npos; StringPiece::size_type found = StringPiece::npos;
if (found == StringPiece::npos) if (found == StringPiece::npos)
return end(); return end();
return const_iterator(utf8_data() + found); return const_iterator(utf8_data() + found);
#else
return end();
#endif
} }
#ifdef INCLUDE_TENSORFLOW
bool UnicodeText::HasReplacementChar() const { bool UnicodeText::HasReplacementChar() const {
// Equivalent to: // Equivalent to:
// UnicodeText replacement_char; // UnicodeText replacement_char;
...@@ -332,6 +342,7 @@ bool UnicodeText::HasReplacementChar() const { ...@@ -332,6 +342,7 @@ bool UnicodeText::HasReplacementChar() const {
// return searching.find(looking_for) != StringPiece::npos; // return searching.find(looking_for) != StringPiece::npos;
return false; return false;
} }
#endif
// ----- other methods ----- // ----- other methods -----
...@@ -371,10 +382,12 @@ bool operator==(const UnicodeText &lhs, const UnicodeText &rhs) { ...@@ -371,10 +382,12 @@ bool operator==(const UnicodeText &lhs, const UnicodeText &rhs) {
return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0; return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
} }
#ifdef INCLUDE_TENSORFLOW
string UnicodeText::DebugString() const { string UnicodeText::DebugString() const {
return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", this, size(), return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", this, size(),
repr_.DebugString().c_str()); repr_.DebugString().c_str());
} }
#endif
// ******************* UnicodeText::const_iterator ********************* // ******************* UnicodeText::const_iterator *********************
...@@ -479,6 +492,7 @@ UnicodeText::const_iterator UnicodeText::MakeIterator(const char *p) const { ...@@ -479,6 +492,7 @@ UnicodeText::const_iterator UnicodeText::MakeIterator(const char *p) const {
return const_iterator(p); return const_iterator(p);
} }
#ifdef INCLUDE_TENSORFLOW
string UnicodeText::const_iterator::DebugString() const { string UnicodeText::const_iterator::DebugString() const {
return tensorflow::strings::Printf("{iter %p}", it_); return tensorflow::strings::Printf("{iter %p}", it_);
} }
...@@ -492,3 +506,4 @@ string CodepointString(const UnicodeText &t) { ...@@ -492,3 +506,4 @@ string CodepointString(const UnicodeText &t) {
tensorflow::strings::Appendf(&s, "%X ", *it++); tensorflow::strings::Appendf(&s, "%X ", *it++);
return s; return s;
} }
#endif
...@@ -29,13 +29,14 @@ namespace UniLib { ...@@ -29,13 +29,14 @@ namespace UniLib {
// (i.e., is not a surrogate codepoint). See also // (i.e., is not a surrogate codepoint). See also
// IsValidCodepoint(const char* src) in util/utf8/public/unilib.h. // IsValidCodepoint(const char* src) in util/utf8/public/unilib.h.
inline bool IsValidCodepoint(char32 c) { inline bool IsValidCodepoint(char32 c) {
return (static_cast<uint32>(c) < 0xD800) || (c >= 0xE000 && c <= 0x10FFFF); return (static_cast<uint32_t>(c) < 0xD800) || (c >= 0xE000 && c <= 0x10FFFF);
} }
// Returns true if 'str' is the start of a structurally valid UTF-8 // Returns true if 'str' is the start of a structurally valid UTF-8
// sequence and is not a surrogate codepoint. Returns false if str.empty() // sequence and is not a surrogate codepoint. Returns false if str.empty()
// or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function // or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function
// will access 1-4 bytes of src, where n is UniLib::OneCharLen(src[0]). // will access 1-4 bytes of src, where n is UniLib::OneCharLen(src[0]).
#ifdef INCLUDE_TENSORFLOW
inline bool IsUTF8ValidCodepoint(StringPiece str) { inline bool IsUTF8ValidCodepoint(StringPiece str) {
char32 c; char32 c;
int consumed; int consumed;
...@@ -43,6 +44,7 @@ inline bool IsUTF8ValidCodepoint(StringPiece str) { ...@@ -43,6 +44,7 @@ inline bool IsUTF8ValidCodepoint(StringPiece str) {
return !str.empty() && isvalidcharntorune(str.data(), str.size(), &c, &consumed) && return !str.empty() && isvalidcharntorune(str.data(), str.size(), &c, &consumed) &&
IsValidCodepoint(c); IsValidCodepoint(c);
} }
#endif
// Returns the length (number of bytes) of the Unicode code point // Returns the length (number of bytes) of the Unicode code point
// starting at src, based on inspecting just that one byte. This // starting at src, based on inspecting just that one byte. This
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册