// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#if defined(_WIN32)
#  include <io.h> // for _access
#else
#  include <unistd.h> // for access
#endif

#include <cstring> // for std::strstr
#include <locale>  // for std::locale
#include <string>  // for std::string

#include "dawg.h"
#include "include_gunit.h"
#include "trie.h"
#include "unicharset.h"
#include "util/utf8/unicodetext.h" // for UnicodeText

namespace tesseract {

// Replacement for std::filesystem::exists (C++-17).
//
// Returns true when the platform's access check reports that `filename`
// exists, and false otherwise. A null `filename` is reported as missing
// instead of being handed to the C runtime.
static bool file_exists(const char *filename) {
  if (filename == nullptr) {
    return false;
  }
#if defined(_WIN32)
  // Mode 0 asks _access to test for existence only.
  return _access(filename, 0) == 0;
#else
  return access(filename, F_OK) == 0;
#endif
}

S
Stefan Weil 已提交
35
class TatweelTest : public ::testing::Test {
E
Egor Pugin 已提交
36
protected:
37 38 39
  void SetUp() override {
    static std::locale system_locale("");
    std::locale::global(system_locale);
40 41
  }

S
Stefan Weil 已提交
42
  TatweelTest() {
43 44
    std::string filename = TestDataNameToPath("ara.wordlist");
    if (file_exists(filename.c_str())) {
45
      std::string wordlist("\u0640");
46 47 48 49 50 51 52
      CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults()));
      // Put all the unicodes in the unicharset_.
      UnicodeText text;
      text.PointToUTF8(wordlist.data(), wordlist.size());
      int num_tatweel = 0;
      for (auto it = text.begin(); it != text.end(); ++it) {
        std::string utf8 = it.get_utf8_string();
53
        if (utf8.find("\u0640") != std::string::npos)
E
Egor Pugin 已提交
54
          ++num_tatweel;
55 56 57 58
        unicharset_.unichar_insert(utf8.c_str());
      }
      LOG(INFO) << "Num tatweels in source data=" << num_tatweel;
      EXPECT_GT(num_tatweel, 0);
S
Stefan Weil 已提交
59 60 61
    }
  }

E
Egor Pugin 已提交
62
  std::string TestDataNameToPath(const std::string &name) {
63
    return file::JoinPath(TESTDATA_DIR, name);
S
Stefan Weil 已提交
64 65 66 67 68 69 70
  }
  UNICHARSET unicharset_;
};

TEST_F(TatweelTest, UnicharsetIgnoresTatweel) {
  // This test verifies that the unicharset ignores the Tatweel character.
  // The fixture only fills unicharset_ when the word list exists, so without
  // that file the loop below would pass without checking any real data.
  const std::string filename = TestDataNameToPath("ara.wordlist");
  if (!file_exists(filename.c_str())) {
    LOG(INFO) << "Skip test because of missing " << filename;
    GTEST_SKIP();
  } else {
    // U+0640 encoded as UTF-8; the cast covers compilers where u8 literals
    // are char8_t.
    const char *const tatweel = reinterpret_cast<const char *>(u8"\u0640");
    for (int i = 0; i < unicharset_.size(); ++i) {
      const char *utf8 = unicharset_.id_to_unichar(i);
      EXPECT_EQ(std::strstr(utf8, tatweel), nullptr);
    }
  }
}

TEST_F(TatweelTest, DictIgnoresTatweel) {
  // This test verifies that the dictionary ignores the Tatweel character.
  const std::string filename = TestDataNameToPath("ara.wordlist");
  if (!file_exists(filename.c_str())) {
    LOG(INFO) << "Skip test because of missing " << filename;
    GTEST_SKIP();
  } else {
    // Build the trie only once the word list is known to exist.
    tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "ara", SYSTEM_DAWG_PERM, unicharset_.size(), 0);
    // Load every word from the list into the trie ...
    EXPECT_TRUE(trie.read_and_add_word_list(filename.c_str(), unicharset_,
                                            tesseract::Trie::RRP_REVERSE_IF_HAS_RTL));
    // ... then run the same list through check_for_words and expect 0 back.
    EXPECT_EQ(0, trie.check_for_words(filename.c_str(), unicharset_, false));
  }
}

TEST_F(TatweelTest, UnicharsetLoadKeepsTatweel) {
  // This test verifies that a load of an existing unicharset keeps any
  // existing tatweel for backwards compatibility.
  const std::string filename = TestDataNameToPath("ara.unicharset");
  if (!file_exists(filename.c_str())) {
    LOG(INFO) << "Skip test because of missing " << filename;
    GTEST_SKIP();
  } else {
    EXPECT_TRUE(unicharset_.load_from_file(filename.c_str()));
    // U+0640 encoded as UTF-8; the cast covers compilers where u8 literals
    // are char8_t.
    const char *const tatweel = reinterpret_cast<const char *>(u8"\u0640");
    int num_tatweel = 0;
    for (int i = 0; i < unicharset_.size(); ++i) {
      const char *utf8 = unicharset_.id_to_unichar(i);
      if (std::strstr(utf8, tatweel) != nullptr) {
        ++num_tatweel;
      }
    }
    LOG(INFO) << "Num tatweels in unicharset=" << num_tatweel;
    // NOTE(review): 4 is presumably the number of tatweel-containing entries
    // in the checked-in ara.unicharset - confirm against the test data.
    EXPECT_EQ(num_tatweel, 4);
  }
}

} // namespace tesseract