Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
a7660331a
tesseract
提交
104fe793
T
tesseract
项目概览
a7660331a
/
tesseract
与 Fork 源项目一致
从无法访问的项目Fork
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
tesseract
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
104fe793
编写于
4月 25, 2018
作者:
E
Egor Pugin
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Move training to src.
上级
ca5c15e6
变更
65
展开全部
隐藏空白更改
内联
并排
Showing
65 changed file
with
1664 addition
and
1664 deletion
+1664
-1664
CMakeLists.txt
CMakeLists.txt
+1
-1
configure.ac
configure.ac
+1
-1
cppan.yml
cppan.yml
+43
-43
src/training/CMakeLists.txt
src/training/CMakeLists.txt
+0
-0
src/training/Makefile.am
src/training/Makefile.am
+0
-0
src/training/ambiguous_words.cpp
src/training/ambiguous_words.cpp
+0
-0
src/training/boxchar.cpp
src/training/boxchar.cpp
+0
-0
src/training/boxchar.h
src/training/boxchar.h
+0
-0
src/training/classifier_tester.cpp
src/training/classifier_tester.cpp
+0
-0
src/training/cntraining.cpp
src/training/cntraining.cpp
+0
-0
src/training/combine_lang_model.cpp
src/training/combine_lang_model.cpp
+0
-0
src/training/combine_tessdata.cpp
src/training/combine_tessdata.cpp
+0
-0
src/training/commandlineflags.cpp
src/training/commandlineflags.cpp
+0
-0
src/training/commandlineflags.h
src/training/commandlineflags.h
+0
-0
src/training/commontraining.cpp
src/training/commontraining.cpp
+0
-0
src/training/commontraining.h
src/training/commontraining.h
+0
-0
src/training/dawg2wordlist.cpp
src/training/dawg2wordlist.cpp
+0
-0
src/training/degradeimage.cpp
src/training/degradeimage.cpp
+310
-310
src/training/degradeimage.h
src/training/degradeimage.h
+61
-61
src/training/fileio.cpp
src/training/fileio.cpp
+0
-0
src/training/fileio.h
src/training/fileio.h
+0
-0
src/training/icuerrorcode.h
src/training/icuerrorcode.h
+66
-66
src/training/lang_model_helpers.cpp
src/training/lang_model_helpers.cpp
+0
-0
src/training/lang_model_helpers.h
src/training/lang_model_helpers.h
+0
-0
src/training/language-specific.sh
src/training/language-specific.sh
+0
-0
src/training/ligature_table.cpp
src/training/ligature_table.cpp
+0
-0
src/training/ligature_table.h
src/training/ligature_table.h
+0
-0
src/training/lstmeval.cpp
src/training/lstmeval.cpp
+0
-0
src/training/lstmtester.cpp
src/training/lstmtester.cpp
+0
-0
src/training/lstmtester.h
src/training/lstmtester.h
+0
-0
src/training/lstmtraining.cpp
src/training/lstmtraining.cpp
+0
-0
src/training/merge_unicharsets.cpp
src/training/merge_unicharsets.cpp
+0
-0
src/training/mergenf.cpp
src/training/mergenf.cpp
+353
-353
src/training/mergenf.h
src/training/mergenf.h
+103
-103
src/training/mftraining.cpp
src/training/mftraining.cpp
+0
-0
src/training/normstrngs.cpp
src/training/normstrngs.cpp
+0
-0
src/training/normstrngs.h
src/training/normstrngs.h
+0
-0
src/training/pango_font_info.cpp
src/training/pango_font_info.cpp
+0
-0
src/training/pango_font_info.h
src/training/pango_font_info.h
+0
-0
src/training/set_unicharset_properties.cpp
src/training/set_unicharset_properties.cpp
+0
-0
src/training/shapeclustering.cpp
src/training/shapeclustering.cpp
+0
-0
src/training/stringrenderer.cpp
src/training/stringrenderer.cpp
+0
-0
src/training/stringrenderer.h
src/training/stringrenderer.h
+0
-0
src/training/tessopt.cpp
src/training/tessopt.cpp
+0
-0
src/training/tessopt.h
src/training/tessopt.h
+0
-0
src/training/tesstrain.sh
src/training/tesstrain.sh
+0
-0
src/training/tesstrain_utils.sh
src/training/tesstrain_utils.sh
+0
-0
src/training/text2image.cpp
src/training/text2image.cpp
+0
-0
src/training/tlog.cpp
src/training/tlog.cpp
+23
-23
src/training/tlog.h
src/training/tlog.h
+41
-41
src/training/unicharset_extractor.cpp
src/training/unicharset_extractor.cpp
+0
-0
src/training/unicharset_training_utils.cpp
src/training/unicharset_training_utils.cpp
+0
-0
src/training/unicharset_training_utils.h
src/training/unicharset_training_utils.h
+0
-0
src/training/util.h
src/training/util.h
+0
-0
src/training/validate_grapheme.cpp
src/training/validate_grapheme.cpp
+0
-0
src/training/validate_grapheme.h
src/training/validate_grapheme.h
+35
-35
src/training/validate_indic.cpp
src/training/validate_indic.cpp
+0
-0
src/training/validate_indic.h
src/training/validate_indic.h
+44
-44
src/training/validate_khmer.cpp
src/training/validate_khmer.cpp
+106
-106
src/training/validate_khmer.h
src/training/validate_khmer.h
+27
-27
src/training/validate_myanmar.cpp
src/training/validate_myanmar.cpp
+160
-160
src/training/validate_myanmar.h
src/training/validate_myanmar.h
+47
-47
src/training/validator.cpp
src/training/validator.cpp
+0
-0
src/training/validator.h
src/training/validator.h
+243
-243
src/training/wordlist2dawg.cpp
src/training/wordlist2dawg.cpp
+0
-0
未找到文件。
CMakeLists.txt
浏览文件 @
104fe793
...
...
@@ -308,7 +308,7 @@ if (BUILD_TESTS AND EXISTS ${PROJECT_SOURCE_DIR}/googletest/CMakeLists.txt)
endif
()
if
(
BUILD_TRAINING_TOOLS
)
add_subdirectory
(
training
)
add_subdirectory
(
src/
training
)
endif
()
get_target_property
(
tesseract_NAME libtesseract NAME
)
...
...
configure.ac
浏览文件 @
104fe793
...
...
@@ -502,7 +502,7 @@ AC_CONFIG_FILES([java/com/google/scrollview/Makefile])
AC_CONFIG_FILES([java/com/google/scrollview/events/Makefile])
AC_CONFIG_FILES([java/com/google/scrollview/ui/Makefile])
AC_CONFIG_FILES([doc/Makefile])
AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(training/Makefile)])
AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(
src/
training/Makefile)])
AC_OUTPUT
# Final message
...
...
cppan.yml
浏览文件 @
104fe793
...
...
@@ -172,7 +172,7 @@ projects:
tessopt
:
type
:
lib
static_only
:
true
files
:
training/tessopt.*
files
:
src/
training/tessopt.*
include_directories
:
training
dependencies
:
libtesseract
...
...
@@ -180,104 +180,104 @@ projects:
type
:
lib
static_only
:
true
files
:
-
training/commandlineflags.cpp
-
training/commandlineflags.h
-
training/commontraining.cpp
-
training/commontraining.h
-
src/
training/commandlineflags.cpp
-
src/
training/commandlineflags.h
-
src/
training/commontraining.cpp
-
src/
training/commontraining.h
include_directories
:
training
dependencies
:
-
tessopt
ambiguous_words
:
files
:
training/ambiguous_words.cpp
files
:
src/
training/ambiguous_words.cpp
dependencies
:
-
libtesseract
classifier_tester
:
files
:
training/classifier_tester.cpp
files
:
src/
training/classifier_tester.cpp
dependencies
:
common_training
combine_lang_model
:
files
:
training/combine_lang_model.cpp
files
:
src/
training/combine_lang_model.cpp
dependencies
:
unicharset_training
combine_tessdata
:
files
:
training/combine_tessdata.cpp
files
:
src/
training/combine_tessdata.cpp
dependencies
:
libtesseract
cntraining
:
files
:
training/cntraining.cpp
files
:
src/
training/cntraining.cpp
dependencies
:
common_training
dawg2wordlist
:
files
:
training/dawg2wordlist.cpp
files
:
src/
training/dawg2wordlist.cpp
dependencies
:
libtesseract
mftraining
:
files
:
-
training/mftraining.cpp
-
training/mergenf.*
-
src/
training/mftraining.cpp
-
src/
training/mergenf.*
dependencies
:
common_training
shapeclustering
:
files
:
training/shapeclustering.cpp
files
:
src/
training/shapeclustering.cpp
dependencies
:
common_training
unicharset_extractor
:
files
:
training/unicharset_extractor.cpp
files
:
src/
training/unicharset_extractor.cpp
dependencies
:
unicharset_training
wordlist2dawg
:
files
:
training/wordlist2dawg.cpp
files
:
src/
training/wordlist2dawg.cpp
dependencies
:
libtesseract
unicharset_training
:
type
:
lib
static_only
:
true
files
:
-
training/fileio.*
-
training/icuerrorcode.h
-
training/lang_model_helpers.*
-
training/lstmtester.*
-
training/normstrngs.*
-
training/unicharset_training_utils.*
-
training/validat.*
-
src/
training/fileio.*
-
src/
training/icuerrorcode.h
-
src/
training/lang_model_helpers.*
-
src/
training/lstmtester.*
-
src/
training/normstrngs.*
-
src/
training/unicharset_training_utils.*
-
src/
training/validat.*
include_directories
:
training
dependencies
:
-
common_training
-
pvt.cppan.demo.unicode.icu.i18n
lstmeval
:
files
:
training/lstmeval.cpp
files
:
src/
training/lstmeval.cpp
dependencies
:
unicharset_training
lstmtraining
:
files
:
training/lstmtraining.cpp
files
:
src/
training/lstmtraining.cpp
dependencies
:
unicharset_training
set_unicharset_properties
:
files
:
training/set_unicharset_properties.cpp
files
:
src/
training/set_unicharset_properties.cpp
dependencies
:
unicharset_training
text2image
:
files
:
-
training/text2image.cpp
-
training/boxchar.cpp
-
training/boxchar.h
-
training/degradeimage.cpp
-
training/degradeimage.h
-
training/ligature_table.cpp
-
training/ligature_table.h
-
training/normstrngs.cpp
-
training/normstrngs.h
-
training/pango_font_info.cpp
-
training/pango_font_info.h
-
training/stringrenderer.cpp
-
training/stringrenderer.h
-
training/tlog.cpp
-
training/tlog.h
-
training/util.h
-
training/icuerrorcode.h
-
src/
training/text2image.cpp
-
src/
training/boxchar.cpp
-
src/
training/boxchar.h
-
src/
training/degradeimage.cpp
-
src/
training/degradeimage.h
-
src/
training/ligature_table.cpp
-
src/
training/ligature_table.h
-
src/
training/normstrngs.cpp
-
src/
training/normstrngs.h
-
src/
training/pango_font_info.cpp
-
src/
training/pango_font_info.h
-
src/
training/stringrenderer.cpp
-
src/
training/stringrenderer.h
-
src/
training/tlog.cpp
-
src/
training/tlog.h
-
src/
training/util.h
-
src/
training/icuerrorcode.h
dependencies
:
-
unicharset_training
...
...
training/CMakeLists.txt
→
src/
training/CMakeLists.txt
浏览文件 @
104fe793
文件已移动
training/Makefile.am
→
src/
training/Makefile.am
浏览文件 @
104fe793
文件已移动
training/ambiguous_words.cpp
→
src/
training/ambiguous_words.cpp
浏览文件 @
104fe793
文件已移动
training/boxchar.cpp
→
src/
training/boxchar.cpp
浏览文件 @
104fe793
文件已移动
training/boxchar.h
→
src/
training/boxchar.h
浏览文件 @
104fe793
文件已移动
training/classifier_tester.cpp
→
src/
training/classifier_tester.cpp
浏览文件 @
104fe793
文件已移动
training/cntraining.cpp
→
src/
training/cntraining.cpp
浏览文件 @
104fe793
文件已移动
training/combine_lang_model.cpp
→
src/
training/combine_lang_model.cpp
浏览文件 @
104fe793
文件已移动
training/combine_tessdata.cpp
→
src/
training/combine_tessdata.cpp
浏览文件 @
104fe793
文件已移动
training/commandlineflags.cpp
→
src/
training/commandlineflags.cpp
浏览文件 @
104fe793
文件已移动
training/commandlineflags.h
→
src/
training/commandlineflags.h
浏览文件 @
104fe793
文件已移动
training/commontraining.cpp
→
src/
training/commontraining.cpp
浏览文件 @
104fe793
文件已移动
training/commontraining.h
→
src/
training/commontraining.h
浏览文件 @
104fe793
文件已移动
training/dawg2wordlist.cpp
→
src/
training/dawg2wordlist.cpp
浏览文件 @
104fe793
文件已移动
training/degradeimage.cpp
→
src/
training/degradeimage.cpp
浏览文件 @
104fe793
此差异已折叠。
点击以展开。
training/degradeimage.h
→
src/
training/degradeimage.h
浏览文件 @
104fe793
/**********************************************************************
* File: degradeimage.h
* Description: Function to degrade an image (usually of text) as if it
* has been printed and then scanned.
* Authors: Ray Smith
* Created: Tue Nov 19 2013
*
* (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_
#define TESSERACT_TRAINING_DEGRADEIMAGE_H_
#include "allheaders.h"
#include "genericvector.h"
#include "helpers.h" // For TRand.
#include "rect.h"
namespace tesseract {

// Simulates one print/copy/scan cycle on |input|.
//   exposure > 0  : darker, as if the copier was set to darken.
//   exposure < 0  : lighter.
//   exposure == 0 : not copied.
// When |rotation| is non-null, the clockwise rotation (in radians) is stored
// through it. |input| must be 8 bit grey (binary 0/255 data is acceptable).
// The input image is destroyed; a different image is returned.
struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer,
                         float* rotation);

// Builds a new Pix from |pix|, applying whichever distortions the bool flags
// select. A non-null |boxes| is resized/repositioned to follow any spatial
// distortion and is also scaled by the integer reduction factor
// |box_reduction| so that the boxes match what the network will output.
// Returns nullptr on error. The caller must pixDestroy the returned Pix.
Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
                         bool white_noise, bool smooth_noise, bool blur,
                         int box_reduction, TRand* randomizer,
                         GenericVector<TBOX>* boxes);

// Applies one and the same pseudo-random perspective distortion to every
// argument that is a non-null pointer. |width| and |height| are only needed
// when there is no pix; with a pix present they are taken from it.
void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
                                   Pix** pix, GenericVector<TBOX>* boxes);

// Computes the coefficients of a randomized projective transformation.
// The image transform needs the backward coefficients (|im_coeffs|) and the
// box transform needs the forward ones (|box_coeffs|).
// The return value is the incolor argument for pixProjective.
int ProjectiveCoeffs(int width, int height, TRand* randomizer,
                     float** im_coeffs, float** box_coeffs);

}  // namespace tesseract
#endif // TESSERACT_TRAINING_DEGRADEIMAGE_H_
/**********************************************************************
* File: degradeimage.h
* Description: Function to degrade an image (usually of text) as if it
* has been printed and then scanned.
* Authors: Ray Smith
* Created: Tue Nov 19 2013
*
* (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_
#define TESSERACT_TRAINING_DEGRADEIMAGE_H_
#include "allheaders.h"
#include "genericvector.h"
#include "helpers.h" // For TRand.
#include "rect.h"
namespace tesseract {

// Simulates one print/copy/scan cycle on |input|.
//   exposure > 0  : darker, as if the copier was set to darken.
//   exposure < 0  : lighter.
//   exposure == 0 : not copied.
// When |rotation| is non-null, the clockwise rotation (in radians) is stored
// through it. |input| must be 8 bit grey (binary 0/255 data is acceptable).
// The input image is destroyed; a different image is returned.
struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer,
                         float* rotation);

// Builds a new Pix from |pix|, applying whichever distortions the bool flags
// select. A non-null |boxes| is resized/repositioned to follow any spatial
// distortion and is also scaled by the integer reduction factor
// |box_reduction| so that the boxes match what the network will output.
// Returns nullptr on error. The caller must pixDestroy the returned Pix.
Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
                         bool white_noise, bool smooth_noise, bool blur,
                         int box_reduction, TRand* randomizer,
                         GenericVector<TBOX>* boxes);

// Applies one and the same pseudo-random perspective distortion to every
// argument that is a non-null pointer. |width| and |height| are only needed
// when there is no pix; with a pix present they are taken from it.
void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
                                   Pix** pix, GenericVector<TBOX>* boxes);

// Computes the coefficients of a randomized projective transformation.
// The image transform needs the backward coefficients (|im_coeffs|) and the
// box transform needs the forward ones (|box_coeffs|).
// The return value is the incolor argument for pixProjective.
int ProjectiveCoeffs(int width, int height, TRand* randomizer,
                     float** im_coeffs, float** box_coeffs);

}  // namespace tesseract
#endif // TESSERACT_TRAINING_DEGRADEIMAGE_H_
training/fileio.cpp
→
src/
training/fileio.cpp
浏览文件 @
104fe793
文件已移动
training/fileio.h
→
src/
training/fileio.h
浏览文件 @
104fe793
文件已移动
training/icuerrorcode.h
→
src/
training/icuerrorcode.h
浏览文件 @
104fe793
/**********************************************************************
* File: icuerrorcode.h
* Description: Wrapper class for UErrorCode, with conversion operators for
* direct use in ICU C and C++ APIs.
* Author: Fredrik Roubert
* Created: Thu July 4 2013
*
* Features:
* - The constructor initializes the internal UErrorCode to U_ZERO_ERROR,
* removing one common source of errors.
* - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking
* UErrorCode& (reference), via conversion operators.
* - Automatic checking for success when it goes out of scope. On failure,
* the destructor will log an error message and exit.
*
* Most of ICU will handle errors gracefully and provide sensible fallbacks.
* Using IcuErrorCode, it is therefore possible to write very compact code
* that does sensible things on failure and provides logging for debugging.
*
* Example:
* IcuErrorCode icuerrorcode;
* return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL;
*
* (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_
#define TESSERACT_CCUTIL_ICUERRORCODE_H_
#include "tprintf.h"
#include "unicode/errorcode.h" // From libicu
namespace tesseract {

// icu::ErrorCode wrapper that treats an unhandled ICU failure as fatal: if the
// object still reports a failure when it goes out of scope, the ICU error name
// is logged through tprintf() and the process exits.
class IcuErrorCode : public icu::ErrorCode {
 public:
  IcuErrorCode() {}

  // Checks the stored status on destruction and terminates on failure.
  virtual ~IcuErrorCode() {
    if (isFailure()) {
      handleFailure();
    }
  }

 protected:
  // Logs the ICU error name and exits the process with a non-zero status.
  virtual void handleFailure() const {
    tprintf("ICU ERROR: %s", errorName());
    // On POSIX systems only the low 8 bits of the exit status reach the
    // parent process, and several ICU error ranges start at multiples of
    // 0x100 (e.g. U_BAD_VARIABLE_DEFINITION == 0x10000). Handing such a code
    // straight to exit() would be reported as success, so use EXIT_FAILURE
    // whenever the low byte is zero and keep the ICU code otherwise.
    const int status = static_cast<int>(errorCode);
    exit((status & 0xFF) != 0 ? status : EXIT_FAILURE);
  }

 private:
  // Disallow implicit copying of object (declared, intentionally undefined).
  IcuErrorCode(const IcuErrorCode&);
  void operator=(const IcuErrorCode&);
};

}  // namespace tesseract
#endif // TESSERACT_CCUTIL_ICUERRORCODE_H_
/**********************************************************************
* File: icuerrorcode.h
* Description: Wrapper class for UErrorCode, with conversion operators for
* direct use in ICU C and C++ APIs.
* Author: Fredrik Roubert
* Created: Thu July 4 2013
*
* Features:
* - The constructor initializes the internal UErrorCode to U_ZERO_ERROR,
* removing one common source of errors.
* - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking
* UErrorCode& (reference), via conversion operators.
* - Automatic checking for success when it goes out of scope. On failure,
* the destructor will log an error message and exit.
*
* Most of ICU will handle errors gracefully and provide sensible fallbacks.
* Using IcuErrorCode, it is therefore possible to write very compact code
* that does sensible things on failure and provides logging for debugging.
*
* Example:
* IcuErrorCode icuerrorcode;
* return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL;
*
* (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_
#define TESSERACT_CCUTIL_ICUERRORCODE_H_
#include "tprintf.h"
#include "unicode/errorcode.h" // From libicu
namespace tesseract {

// icu::ErrorCode wrapper that treats an unhandled ICU failure as fatal: if the
// object still reports a failure when it goes out of scope, the ICU error name
// is logged through tprintf() and the process exits.
class IcuErrorCode : public icu::ErrorCode {
 public:
  IcuErrorCode() {}

  // Checks the stored status on destruction and terminates on failure.
  virtual ~IcuErrorCode() {
    if (isFailure()) {
      handleFailure();
    }
  }

 protected:
  // Logs the ICU error name and exits the process with a non-zero status.
  virtual void handleFailure() const {
    tprintf("ICU ERROR: %s", errorName());
    // On POSIX systems only the low 8 bits of the exit status reach the
    // parent process, and several ICU error ranges start at multiples of
    // 0x100 (e.g. U_BAD_VARIABLE_DEFINITION == 0x10000). Handing such a code
    // straight to exit() would be reported as success, so use EXIT_FAILURE
    // whenever the low byte is zero and keep the ICU code otherwise.
    const int status = static_cast<int>(errorCode);
    exit((status & 0xFF) != 0 ? status : EXIT_FAILURE);
  }

 private:
  // Disallow implicit copying of object (declared, intentionally undefined).
  IcuErrorCode(const IcuErrorCode&);
  void operator=(const IcuErrorCode&);
};

}  // namespace tesseract
#endif // TESSERACT_CCUTIL_ICUERRORCODE_H_
training/lang_model_helpers.cpp
→
src/
training/lang_model_helpers.cpp
浏览文件 @
104fe793
文件已移动
training/lang_model_helpers.h
→
src/
training/lang_model_helpers.h
浏览文件 @
104fe793
文件已移动
training/language-specific.sh
→
src/
training/language-specific.sh
100755 → 100644
浏览文件 @
104fe793
文件已移动
training/ligature_table.cpp
→
src/
training/ligature_table.cpp
浏览文件 @
104fe793
文件已移动
training/ligature_table.h
→
src/
training/ligature_table.h
浏览文件 @
104fe793
文件已移动
training/lstmeval.cpp
→
src/
training/lstmeval.cpp
浏览文件 @
104fe793
文件已移动
training/lstmtester.cpp
→
src/
training/lstmtester.cpp
浏览文件 @
104fe793
文件已移动
training/lstmtester.h
→
src/
training/lstmtester.h
浏览文件 @
104fe793
文件已移动
training/lstmtraining.cpp
→
src/
training/lstmtraining.cpp
浏览文件 @
104fe793
文件已移动
training/merge_unicharsets.cpp
→
src/
training/merge_unicharsets.cpp
浏览文件 @
104fe793
文件已移动
training/mergenf.cpp
→
src/
training/mergenf.cpp
浏览文件 @
104fe793
此差异已折叠。
点击以展开。
training/mergenf.h
→
src/
training/mergenf.h
浏览文件 @
104fe793
/******************************************************************************
** Filename: MergeNF.c
** Purpose: Program for merging similar nano-feature protos
** Author: Dan Johnson
** History: Wed Nov 21 09:55:23 1990, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
#ifndef TESSERACT_TRAINING_MERGENF_H_
#define TESSERACT_TRAINING_MERGENF_H_
/**----------------------------------------------------------------------------
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "protos.h"
#include "cluster.h"
#include "ocrfeatures.h"
#include "callcpp.h"
#include "picofeat.h"
// Numeric limits for proto matching.
// NOTE(review): their exact meaning is set by the code that uses them, which
// is not visible in this header — confirm against the implementation.
#define WORST_MATCH_ALLOWED (0.9)
#define WORST_EVIDENCE (1.0)
// Expands to a GetPicoFeatureLength() call at every use, so the value is
// computed at run time rather than being a compile-time constant.
#define MAX_LENGTH_MISMATCH (2.0 * GetPicoFeatureLength ())
// File-name suffixes (presumably for the proto and config output files —
// TODO confirm against the callers).
#define PROTO_SUFFIX ".mf.p"
#define CONFIG_SUFFIX ".cl"
// Integer value -1; presumably the "no proto found" result — TODO confirm.
#define NO_PROTO (-1)
// Array indices used by the CenterX / CenterY / LengthOf / OrientationOf
// accessor macros defined further down in this header.
#define XPOSITION 0
#define YPOSITION 1
#define MFLENGTH 2
#define ORIENTATION 3
// Four FLOAT32 bounds: minimum and maximum in X, then minimum and maximum in Y.
// Passed by pointer to ComputePaddedBoundingBox() and PointInside().
typedef struct {
  FLOAT32 MinX;
  FLOAT32 MaxX;
  FLOAT32 MinY;
  FLOAT32 MaxY;
} FRECT;
/**----------------------------------------------------------------------------
Public Macros
----------------------------------------------------------------------------**/
// Element accessors: each one indexes the array-like argument M with the
// matching XPOSITION / YPOSITION / MFLENGTH / ORIENTATION constant. M is
// parenthesised in the expansion, so an expression may be passed safely.
#define CenterX(M) ( (M)[XPOSITION] )
#define CenterY(M) ( (M)[YPOSITION] )
#define LengthOf(M) ( (M)[MFLENGTH] )
#define OrientationOf(M) ( (M)[ORIENTATION] )
/**----------------------------------------------------------------------------
Public Function Prototypes
----------------------------------------------------------------------------**/
// NOTE(review): only the signatures are visible here; the behaviour notes
// below are inferred from names and must be confirmed against the definitions.

// Takes two protos and returns a FLOAT32 (presumably a match score).
FLOAT32 CompareProtos(PROTO p1, PROTO p2);

// Takes two protos, a FLOAT32 weight for each, and a third proto
// (presumably the destination that receives the weighted merge).
void ComputeMergedProto(PROTO p1, PROTO p2, FLOAT32 w1, FLOAT32 w2,
                        PROTO MergedProto);

// Takes a class, an int array and a PROTOTYPE pointer and returns an int
// (presumably a proto index, or NO_PROTO when none qualifies — TODO confirm).
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[],
                             PROTOTYPE* Prototype);

// Takes a PROTO and a PROTOTYPE pointer (presumably fills New from Old).
void MakeNewFromOld(PROTO New, PROTOTYPE* Old);

// Takes a feature and a proto and returns a FLOAT32
// (presumably an evidence value — TODO confirm its range).
FLOAT32 SubfeatureEvidence(FEATURE Feature, PROTO Proto);
// Takes a double and returns a double (presumably maps a similarity value to
// an evidence value — confirm against the definition).
// The former `register` specifier on the parameter has been dropped: it was
// deprecated in C++11 and removed in C++17, and it never was part of the
// function's type, so callers and the definition are unaffected.
double EvidenceOf(double Similarity);
// NOTE(review): only the signatures are visible here; the behaviour notes
// below are inferred from names and must be confirmed against the definitions.

// Takes a feature and a proto and returns a BOOL8
// (presumably whether the pair passes a quick pre-match test).
BOOL8 DummyFastMatch(FEATURE Feature, PROTO Proto);

// Takes a proto, two FLOAT32 pad amounts and an FRECT pointer
// (presumably writes the padded bounds of Proto into *BoundingBox).
void ComputePaddedBoundingBox(PROTO Proto, FLOAT32 TangentPad,
                              FLOAT32 OrthogonalPad, FRECT* BoundingBox);

// Takes an FRECT pointer and a FLOAT32 X/Y pair and returns a BOOL8
// (presumably whether the point lies within *Rectangle).
BOOL8 PointInside(FRECT* Rectangle, FLOAT32 X, FLOAT32 Y);
#endif // TESSERACT_TRAINING_MERGENF_H_
/******************************************************************************
** Filename: MergeNF.c
** Purpose: Program for merging similar nano-feature protos
** Author: Dan Johnson
** History: Wed Nov 21 09:55:23 1990, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
#ifndef TESSERACT_TRAINING_MERGENF_H_
#define TESSERACT_TRAINING_MERGENF_H_
/**----------------------------------------------------------------------------
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "protos.h"
#include "cluster.h"
#include "ocrfeatures.h"
#include "callcpp.h"
#include "picofeat.h"
// Numeric limits for proto matching.
// NOTE(review): their exact meaning is set by the code that uses them, which
// is not visible in this header — confirm against the implementation.
#define WORST_MATCH_ALLOWED (0.9)
#define WORST_EVIDENCE (1.0)
// Expands to a GetPicoFeatureLength() call at every use, so the value is
// computed at run time rather than being a compile-time constant.
#define MAX_LENGTH_MISMATCH (2.0 * GetPicoFeatureLength ())
// File-name suffixes (presumably for the proto and config output files —
// TODO confirm against the callers).
#define PROTO_SUFFIX ".mf.p"
#define CONFIG_SUFFIX ".cl"
// Integer value -1; presumably the "no proto found" result — TODO confirm.
#define NO_PROTO (-1)
// Array indices used by the CenterX / CenterY / LengthOf / OrientationOf
// accessor macros defined further down in this header.
#define XPOSITION 0
#define YPOSITION 1
#define MFLENGTH 2
#define ORIENTATION 3
// Four FLOAT32 bounds: minimum and maximum in X, then minimum and maximum in Y.
// Passed by pointer to ComputePaddedBoundingBox() and PointInside().
typedef struct {
  FLOAT32 MinX;
  FLOAT32 MaxX;
  FLOAT32 MinY;
  FLOAT32 MaxY;
} FRECT;
/**----------------------------------------------------------------------------
Public Macros
----------------------------------------------------------------------------**/
// Element accessors: each one indexes the array-like argument M with the
// matching XPOSITION / YPOSITION / MFLENGTH / ORIENTATION constant. M is
// parenthesised in the expansion, so an expression may be passed safely.
#define CenterX(M) ( (M)[XPOSITION] )
#define CenterY(M) ( (M)[YPOSITION] )
#define LengthOf(M) ( (M)[MFLENGTH] )
#define OrientationOf(M) ( (M)[ORIENTATION] )
/**----------------------------------------------------------------------------
Public Function Prototypes
----------------------------------------------------------------------------**/
// NOTE(review): only the signatures are visible here; the behaviour notes
// below are inferred from names and must be confirmed against the definitions.

// Takes two protos and returns a FLOAT32 (presumably a match score).
FLOAT32 CompareProtos(PROTO p1, PROTO p2);

// Takes two protos, a FLOAT32 weight for each, and a third proto
// (presumably the destination that receives the weighted merge).
void ComputeMergedProto(PROTO p1, PROTO p2, FLOAT32 w1, FLOAT32 w2,
                        PROTO MergedProto);

// Takes a class, an int array and a PROTOTYPE pointer and returns an int
// (presumably a proto index, or NO_PROTO when none qualifies — TODO confirm).
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[],
                             PROTOTYPE* Prototype);

// Takes a PROTO and a PROTOTYPE pointer (presumably fills New from Old).
void MakeNewFromOld(PROTO New, PROTOTYPE* Old);

// Takes a feature and a proto and returns a FLOAT32
// (presumably an evidence value — TODO confirm its range).
FLOAT32 SubfeatureEvidence(FEATURE Feature, PROTO Proto);
// Takes a double and returns a double (presumably maps a similarity value to
// an evidence value — confirm against the definition).
// The former `register` specifier on the parameter has been dropped: it was
// deprecated in C++11 and removed in C++17, and it never was part of the
// function's type, so callers and the definition are unaffected.
double EvidenceOf(double Similarity);
// NOTE(review): only the signatures are visible here; the behaviour notes
// below are inferred from names and must be confirmed against the definitions.

// Takes a feature and a proto and returns a BOOL8
// (presumably whether the pair passes a quick pre-match test).
BOOL8 DummyFastMatch(FEATURE Feature, PROTO Proto);

// Takes a proto, two FLOAT32 pad amounts and an FRECT pointer
// (presumably writes the padded bounds of Proto into *BoundingBox).
void ComputePaddedBoundingBox(PROTO Proto, FLOAT32 TangentPad,
                              FLOAT32 OrthogonalPad, FRECT* BoundingBox);

// Takes an FRECT pointer and a FLOAT32 X/Y pair and returns a BOOL8
// (presumably whether the point lies within *Rectangle).
BOOL8 PointInside(FRECT* Rectangle, FLOAT32 X, FLOAT32 Y);
#endif // TESSERACT_TRAINING_MERGENF_H_
training/mftraining.cpp
→
src/
training/mftraining.cpp
浏览文件 @
104fe793
文件已移动
training/normstrngs.cpp
→
src/
training/normstrngs.cpp
浏览文件 @
104fe793
文件已移动
training/normstrngs.h
→
src/
training/normstrngs.h
浏览文件 @
104fe793
文件已移动
training/pango_font_info.cpp
→
src/
training/pango_font_info.cpp
浏览文件 @
104fe793
文件已移动
training/pango_font_info.h
→
src/
training/pango_font_info.h
浏览文件 @
104fe793
文件已移动
training/set_unicharset_properties.cpp
→
src/
training/set_unicharset_properties.cpp
浏览文件 @
104fe793
文件已移动
training/shapeclustering.cpp
→
src/
training/shapeclustering.cpp
浏览文件 @
104fe793
文件已移动
training/stringrenderer.cpp
→
src/
training/stringrenderer.cpp
浏览文件 @
104fe793
文件已移动
training/stringrenderer.h
→
src/
training/stringrenderer.h
浏览文件 @
104fe793
文件已移动
training/tessopt.cpp
→
src/
training/tessopt.cpp
浏览文件 @
104fe793
文件已移动
training/tessopt.h
→
src/
training/tessopt.h
浏览文件 @
104fe793
文件已移动
training/tesstrain.sh
→
src/
training/tesstrain.sh
100755 → 100644
浏览文件 @
104fe793
文件已移动
training/tesstrain_utils.sh
→
src/
training/tesstrain_utils.sh
100755 → 100644
浏览文件 @
104fe793
文件已移动
training/text2image.cpp
→
src/
training/text2image.cpp
浏览文件 @
104fe793
文件已移动
training/tlog.cpp
→
src/
training/tlog.cpp
浏览文件 @
104fe793
/**********************************************************************
* File: tlog.cpp
* Description: Variant of printf with logging level controllable by a
* commandline flag.
* Author: Ranjith Unnikrishnan
* Created: Wed Nov 20 2013
*
* (C) Copyright 2013, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "tlog.h"
// Defines the integer command-line flag tlog_level with the value 0 and the
// help text below. NOTE(review): 0 is presumably the default value, matching
// the "(default 0)" remark in tlog.h — confirm against INT_PARAM_FLAG.
INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output");
/**********************************************************************
* File: tlog.cpp
* Description: Variant of printf with logging level controllable by a
* commandline flag.
* Author: Ranjith Unnikrishnan
* Created: Wed Nov 20 2013
*
* (C) Copyright 2013, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "tlog.h"
// Defines the integer command-line flag tlog_level with the value 0 and the
// help text below. NOTE(review): 0 is presumably the default value, matching
// the "(default 0)" remark in tlog.h — confirm against INT_PARAM_FLAG.
INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output");
training/tlog.h
→
src/
training/tlog.h
浏览文件 @
104fe793
/**********************************************************************
* File: tlog.h
* Description: Variant of printf with logging level controllable by a
* commandline flag.
* Author: Ranjith Unnikrishnan
* Created: Wed Nov 20 2013
*
* (C) Copyright 2013, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_TLOG_H_
#define TESSERACT_TRAINING_TLOG_H_
#include "commandlineflags.h"
#include "errcode.h"
#include "tprintf.h"
// Declares the tlog_level flag so that FLAGS_tlog_level is visible to users of
// the logging macros in this header.
DECLARE_INT_PARAM_FLAG(tlog_level);
// Variant of tprintf guarded by the numeric logging level parameter
// FLAGS_tlog_level (default 0). Code using ParseCommandLineFlags() can control
// its value using the --tlog_level commandline argument. Otherwise it must be
// specified in a config file like other params.
// The message is printed only when FLAGS_tlog_level >= level.
// The level argument is parenthesized in the expansion so that an argument
// containing an operator of lower precedence than >= (for example a
// conditional expression) is compared as a whole against FLAGS_tlog_level.
// The expansion is kept as a plain brace block, as before, so existing call
// sites continue to compile unchanged.
#define tlog(level, ...)               \
  {                                    \
    if (FLAGS_tlog_level >= (level)) { \
      tprintf_internal(__VA_ARGS__);   \
    }                                  \
  }

// Evaluates to true when a tlog() call at the given level would print.
#define TLOG_IS_ON(level) (FLAGS_tlog_level >= (level))
#endif // TESSERACT_TRAINING_TLOG_H_
/**********************************************************************
* File: tlog.h
* Description: Variant of printf with logging level controllable by a
* commandline flag.
* Author: Ranjith Unnikrishnan
* Created: Wed Nov 20 2013
*
* (C) Copyright 2013, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_TLOG_H_
#define TESSERACT_TRAINING_TLOG_H_
#include "commandlineflags.h"
#include "errcode.h"
#include "tprintf.h"
// Declares the tlog_level flag so that FLAGS_tlog_level is visible to users of
// the logging macros in this header.
DECLARE_INT_PARAM_FLAG(tlog_level);
// Variant of tprintf guarded by the numeric logging level parameter
// FLAGS_tlog_level (default 0). Code using ParseCommandLineFlags() can control
// its value using the --tlog_level commandline argument. Otherwise it must be
// specified in a config file like other params.
// The message is printed only when FLAGS_tlog_level >= level.
// The level argument is parenthesized in the expansion so that an argument
// containing an operator of lower precedence than >= (for example a
// conditional expression) is compared as a whole against FLAGS_tlog_level.
// The expansion is kept as a plain brace block, as before, so existing call
// sites continue to compile unchanged.
#define tlog(level, ...)               \
  {                                    \
    if (FLAGS_tlog_level >= (level)) { \
      tprintf_internal(__VA_ARGS__);   \
    }                                  \
  }

// Evaluates to true when a tlog() call at the given level would print.
#define TLOG_IS_ON(level) (FLAGS_tlog_level >= (level))
#endif // TESSERACT_TRAINING_TLOG_H_
training/unicharset_extractor.cpp
→
src/
training/unicharset_extractor.cpp
浏览文件 @
104fe793
文件已移动
training/unicharset_training_utils.cpp
→
src/
training/unicharset_training_utils.cpp
浏览文件 @
104fe793
文件已移动
training/unicharset_training_utils.h
→
src/
training/unicharset_training_utils.h
浏览文件 @
104fe793
文件已移动
training/util.h
→
src/
training/util.h
浏览文件 @
104fe793
文件已移动
training/validate_grapheme.cpp
→
src/
training/validate_grapheme.cpp
浏览文件 @
104fe793
文件已移动
training/validate_grapheme.h
→
src/
training/validate_grapheme.h
浏览文件 @
104fe793
#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#include "validator.h"
namespace
tesseract
{
// Validator subclass for generic unicode text: validates it and segments it
// into grapheme clusters, including Latin with diacritics.
class ValidateGrapheme : public Validator {
 public:
  // Passes script and report_errors straight through to the Validator base.
  ValidateGrapheme(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateGrapheme() {}

 protected:
  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;
  // Maps the given Unicode ch to its CharClass.
  CharClass UnicodeToCharClass(char32 ch) const override;

 private:
  // Helper: true when the sequence prev_ch,ch is invalid.
  bool IsBadlyFormed(char32 prev_ch, char32 ch);
  // Helper: true when the sequence prev_ch,ch is an invalid Indic vowel.
  static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch);
  // Helper: true when the sequence prev_ch,ch is invalid Thai.
  static bool IsBadlyFormedThai(char32 prev_ch, char32 ch);
};
}
// namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#include "validator.h"
namespace
tesseract
{
// Validator subclass for generic unicode text: validates it and segments it
// into grapheme clusters, including Latin with diacritics.
class ValidateGrapheme : public Validator {
 public:
  // Passes script and report_errors straight through to the Validator base.
  ValidateGrapheme(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateGrapheme() {}

 protected:
  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;
  // Maps the given Unicode ch to its CharClass.
  CharClass UnicodeToCharClass(char32 ch) const override;

 private:
  // Helper: true when the sequence prev_ch,ch is invalid.
  bool IsBadlyFormed(char32 prev_ch, char32 ch);
  // Helper: true when the sequence prev_ch,ch is an invalid Indic vowel.
  static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch);
  // Helper: true when the sequence prev_ch,ch is invalid Thai.
  static bool IsBadlyFormedThai(char32 prev_ch, char32 ch);
};
}
// namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
training/validate_indic.cpp
→
src/
training/validate_indic.cpp
浏览文件 @
104fe793
文件已移动
training/validate_indic.h
→
src/
training/validate_indic.h
浏览文件 @
104fe793
#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_
#define TESSERACT_TRAINING_VALIDATE_INDIC_H_
#include "validator.h"
namespace
tesseract
{
// Subclass of Validator that validates and segments Indic scripts in the
// unicode range 0x900-0xdff (Devanagari-Sinhala).
class
ValidateIndic
:
public
Validator
{
public:
ValidateIndic
(
ViramaScript
script
,
bool
report_errors
)
:
Validator
(
script
,
report_errors
)
{}
~
ValidateIndic
()
{}
protected:
// Returns whether codes matches the pattern for an Indic Grapheme.
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
bool
ConsumeGraphemeIfValid
()
override
;
// Returns the CharClass corresponding to the given Unicode ch.
Validator
::
CharClass
UnicodeToCharClass
(
char32
ch
)
const
override
;
private:
// Helper consumes/copies a virama and any associated post-virama joiners.
bool
ConsumeViramaIfValid
(
IndicPair
joiner
,
bool
post_matra
);
// Helper consumes/copies a series of consonants separated by viramas while
// valid, but not any vowel or other modifiers.
bool
ConsumeConsonantHeadIfValid
();
// Helper consumes/copies a tail part of a consonant, comprising optional
// matra/piece, vowel modifier, vedic mark, terminating virama.
bool
ConsumeConsonantTailIfValid
();
// Helper consumes/copies a vowel and optional modifiers.
bool
ConsumeVowelIfValid
();
// Some special unicodes used only for Indic processing.
static
const
char32
kYayana
=
0xdba
;
// Sinhala Ya
static
const
char32
kRayana
=
0xdbb
;
// Sinhala Ra
};
}
// namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_
#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_
#define TESSERACT_TRAINING_VALIDATE_INDIC_H_
#include "validator.h"
namespace
tesseract
{
// Subclass of Validator that validates and segments Indic scripts in the
// unicode range 0x900-0xdff (Devanagari-Sinhala).
class
ValidateIndic
:
public
Validator
{
public:
ValidateIndic
(
ViramaScript
script
,
bool
report_errors
)
:
Validator
(
script
,
report_errors
)
{}
~
ValidateIndic
()
{}
protected:
// Returns whether codes matches the pattern for an Indic Grapheme.
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
bool
ConsumeGraphemeIfValid
()
override
;
// Returns the CharClass corresponding to the given Unicode ch.
Validator
::
CharClass
UnicodeToCharClass
(
char32
ch
)
const
override
;
private:
// Helper consumes/copies a virama and any associated post-virama joiners.
bool
ConsumeViramaIfValid
(
IndicPair
joiner
,
bool
post_matra
);
// Helper consumes/copies a series of consonants separated by viramas while
// valid, but not any vowel or other modifiers.
bool
ConsumeConsonantHeadIfValid
();
// Helper consumes/copies a tail part of a consonant, comprising optional
// matra/piece, vowel modifier, vedic mark, terminating virama.
bool
ConsumeConsonantTailIfValid
();
// Helper consumes/copies a vowel and optional modifiers.
bool
ConsumeVowelIfValid
();
// Some special unicodes used only for Indic processing.
static
const
char32
kYayana
=
0xdba
;
// Sinhala Ya
static
const
char32
kRayana
=
0xdbb
;
// Sinhala Ra
};
}
// namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_
training/validate_khmer.cpp
→
src/
training/validate_khmer.cpp
浏览文件 @
104fe793
#include "validate_khmer.h"
#include "errcode.h"
#include "tprintf.h"
namespace
tesseract
{
// Consumes one Khmer grapheme starting at codes_[codes_used_] and reports
// whether the codes match the Khmer pattern.
// The pattern is taken from the unicode standard:
// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
// Translated to the codes used by the CharClass enum:
// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
// The Consonant class here also includes independent vowels, as they are
// treated the same anyway.
// In the split grapheme mode, the only characters that get grouped are the
// HC and the {Z|z}M. The unicode chapter on Khmer only mentions the joiners in
// the BNF syntax, so their exact purpose is unclear.
// Returns false when no codes are left, when the grapheme does not start with
// a consonant, when a joiner is the last code, or when a joiner is not
// followed by a matra; the last three cases print a message if report_errors_
// is set. Returns true otherwise.
// NOTE(review): UseMultiCode() and CodeOnlyToOutput() appear to return true
// once the end of the input has been reached -- confirm against validator.h.
bool ValidateKhmer::ConsumeGraphemeIfValid() {
  const int total_codes = codes_.size();
  if (codes_used_ == total_codes) return false;
  // A code of class kOther is emitted on its own.
  if (codes_[codes_used_].first == CharClass::kOther) {
    UseMultiCode(1);
    return true;
  }
  // Everything else has to open with a consonant.
  if (codes_[codes_used_].first != CharClass::kConsonant) {
    if (report_errors_) {
      tprintf("Invalid start of Khmer syllable:0x%x\n",
              codes_[codes_used_].second);
    }
    return false;
  }
  if (UseMultiCode(1)) return true;
  // Optional robat or consonant shifter.
  if ((codes_[codes_used_].first == CharClass::kRobat ||
       codes_[codes_used_].first == CharClass::kNukta) &&
      UseMultiCode(1)) {
    return true;
  }
  // Any number of virama+consonant pairs, each optionally followed by a robat.
  while (codes_used_ + 1 < total_codes &&
         codes_[codes_used_].first == CharClass::kVirama &&
         codes_[codes_used_ + 1].first == CharClass::kConsonant) {
    ASSERT_HOST(!CodeOnlyToOutput());
    if (UseMultiCode(2)) return true;
    if (codes_[codes_used_].first == CharClass::kRobat && UseMultiCode(1)) {
      return true;
    }
  }
  // Optional joiner; it is only legal directly in front of a matra.
  int joiner_count = 0;
  const bool at_joiner = codes_[codes_used_].second == kZeroWidthJoiner ||
                         codes_[codes_used_].second == kZeroWidthNonJoiner;
  if (at_joiner) {
    if (CodeOnlyToOutput()) {
      if (report_errors_) {
        tprintf("Unterminated joiner: 0x%x\n", output_.back());
      }
      return false;
    }
    joiner_count = 1;
  }
  // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
  // own or as an addition to other matras.
  const bool at_matra = codes_[codes_used_].first == CharClass::kMatra ||
                        codes_[codes_used_].first == CharClass::kMatraPiece;
  if (at_matra) {
    // Length covers the matra plus the joiner in front of it, if any.
    if (UseMultiCode(joiner_count + 1)) return true;
  } else if (joiner_count != 0) {
    if (report_errors_) {
      tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
              output_.back(), codes_[codes_used_].second);
    }
    return false;
  }
  // A matra piece following something that was not itself a matra piece.
  if (codes_[codes_used_].first == CharClass::kMatraPiece &&
      codes_[codes_used_ - 1].first != CharClass::kMatraPiece &&
      UseMultiCode(1)) {
    return true;
  }
  // Optional vowel modifier.
  if (codes_[codes_used_].first == CharClass::kVowelModifier &&
      UseMultiCode(1)) {
    return true;
  }
  // Optional final virama+consonant pair.
  if (codes_used_ + 1 < total_codes &&
      codes_[codes_used_].first == CharClass::kVirama &&
      codes_[codes_used_ + 1].first == CharClass::kConsonant) {
    ASSERT_HOST(!CodeOnlyToOutput());
    UseMultiCode(2);
  }
  return true;
}
// Maps the given Unicode ch to its CharClass. Vedic accents and the two
// zero-width joiners are recognised first; everything else is classified by
// its offset from script_, and anything outside the code page is kOther.
Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
  // Offset from the start of the relevant unicode code block aka code page.
  const int offset = ch - static_cast<char32>(script_);
  // Anything in another code block is other.
  if (offset < 0 || offset >= kIndicCodePageSize) return CharClass::kOther;
  if (offset <= 0x33) return CharClass::kConsonant;
  if (offset <= 0x45) return CharClass::kMatra;
  // Individual offsets that have a class of their own.
  switch (offset) {
    case 0x46:
      return CharClass::kMatraPiece;
    case 0x49:
    case 0x4a:
      return CharClass::kNukta;
    case 0x4c:
      return CharClass::kRobat;
    case 0x52:
      return CharClass::kVirama;
    default:
      break;
  }
  // The remaining offsets up to 0x51 are vowel modifiers; beyond that, other.
  return offset <= 0x51 ? CharClass::kVowelModifier : CharClass::kOther;
}
}
// namespace tesseract
#include "validate_khmer.h"
#include "errcode.h"
#include "tprintf.h"
namespace
tesseract
{
// Consumes one Khmer grapheme starting at codes_[codes_used_] and reports
// whether the codes match the Khmer pattern.
// The pattern is taken from the unicode standard:
// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
// Translated to the codes used by the CharClass enum:
// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
// The Consonant class here also includes independent vowels, as they are
// treated the same anyway.
// In the split grapheme mode, the only characters that get grouped are the
// HC and the {Z|z}M. The unicode chapter on Khmer only mentions the joiners in
// the BNF syntax, so their exact purpose is unclear.
// Returns false when no codes are left, when the grapheme does not start with
// a consonant, when a joiner is the last code, or when a joiner is not
// followed by a matra; the last three cases print a message if report_errors_
// is set. Returns true otherwise.
// NOTE(review): UseMultiCode() and CodeOnlyToOutput() appear to return true
// once the end of the input has been reached -- confirm against validator.h.
bool ValidateKhmer::ConsumeGraphemeIfValid() {
  const int total_codes = codes_.size();
  if (codes_used_ == total_codes) return false;
  // A code of class kOther is emitted on its own.
  if (codes_[codes_used_].first == CharClass::kOther) {
    UseMultiCode(1);
    return true;
  }
  // Everything else has to open with a consonant.
  if (codes_[codes_used_].first != CharClass::kConsonant) {
    if (report_errors_) {
      tprintf("Invalid start of Khmer syllable:0x%x\n",
              codes_[codes_used_].second);
    }
    return false;
  }
  if (UseMultiCode(1)) return true;
  // Optional robat or consonant shifter.
  if ((codes_[codes_used_].first == CharClass::kRobat ||
       codes_[codes_used_].first == CharClass::kNukta) &&
      UseMultiCode(1)) {
    return true;
  }
  // Any number of virama+consonant pairs, each optionally followed by a robat.
  while (codes_used_ + 1 < total_codes &&
         codes_[codes_used_].first == CharClass::kVirama &&
         codes_[codes_used_ + 1].first == CharClass::kConsonant) {
    ASSERT_HOST(!CodeOnlyToOutput());
    if (UseMultiCode(2)) return true;
    if (codes_[codes_used_].first == CharClass::kRobat && UseMultiCode(1)) {
      return true;
    }
  }
  // Optional joiner; it is only legal directly in front of a matra.
  int joiner_count = 0;
  const bool at_joiner = codes_[codes_used_].second == kZeroWidthJoiner ||
                         codes_[codes_used_].second == kZeroWidthNonJoiner;
  if (at_joiner) {
    if (CodeOnlyToOutput()) {
      if (report_errors_) {
        tprintf("Unterminated joiner: 0x%x\n", output_.back());
      }
      return false;
    }
    joiner_count = 1;
  }
  // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
  // own or as an addition to other matras.
  const bool at_matra = codes_[codes_used_].first == CharClass::kMatra ||
                        codes_[codes_used_].first == CharClass::kMatraPiece;
  if (at_matra) {
    // Length covers the matra plus the joiner in front of it, if any.
    if (UseMultiCode(joiner_count + 1)) return true;
  } else if (joiner_count != 0) {
    if (report_errors_) {
      tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
              output_.back(), codes_[codes_used_].second);
    }
    return false;
  }
  // A matra piece following something that was not itself a matra piece.
  if (codes_[codes_used_].first == CharClass::kMatraPiece &&
      codes_[codes_used_ - 1].first != CharClass::kMatraPiece &&
      UseMultiCode(1)) {
    return true;
  }
  // Optional vowel modifier.
  if (codes_[codes_used_].first == CharClass::kVowelModifier &&
      UseMultiCode(1)) {
    return true;
  }
  // Optional final virama+consonant pair.
  if (codes_used_ + 1 < total_codes &&
      codes_[codes_used_].first == CharClass::kVirama &&
      codes_[codes_used_ + 1].first == CharClass::kConsonant) {
    ASSERT_HOST(!CodeOnlyToOutput());
    UseMultiCode(2);
  }
  return true;
}
// Maps the given Unicode ch to its CharClass. Vedic accents and the two
// zero-width joiners are recognised first; everything else is classified by
// its offset from script_, and anything outside the code page is kOther.
Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
  // Offset from the start of the relevant unicode code block aka code page.
  const int offset = ch - static_cast<char32>(script_);
  // Anything in another code block is other.
  if (offset < 0 || offset >= kIndicCodePageSize) return CharClass::kOther;
  if (offset <= 0x33) return CharClass::kConsonant;
  if (offset <= 0x45) return CharClass::kMatra;
  // Individual offsets that have a class of their own.
  switch (offset) {
    case 0x46:
      return CharClass::kMatraPiece;
    case 0x49:
    case 0x4a:
      return CharClass::kNukta;
    case 0x4c:
      return CharClass::kRobat;
    case 0x52:
      return CharClass::kVirama;
    default:
      break;
  }
  // The remaining offsets up to 0x51 are vowel modifiers; beyond that, other.
  return offset <= 0x51 ? CharClass::kVowelModifier : CharClass::kOther;
}
}
// namespace tesseract
training/validate_khmer.h
→
src/
training/validate_khmer.h
浏览文件 @
104fe793
#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_
#define TESSERACT_TRAINING_VALIDATE_KHMER_H_
#include "validator.h"
namespace
tesseract
{
// Validator subclass that validates and segments Khmer.
class ValidateKhmer : public Validator {
 public:
  // Passes script and report_errors straight through to the Validator base.
  ValidateKhmer(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateKhmer() {}

 protected:
  // Tests whether the codes match the pattern for a Khmer Grapheme.
  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;
  // Maps the given Unicode ch to its CharClass.
  CharClass UnicodeToCharClass(char32 ch) const override;
};
}
// namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_
#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_
#define TESSERACT_TRAINING_VALIDATE_KHMER_H_
#include "validator.h"
namespace
tesseract
{
// Validator subclass that validates and segments Khmer.
class ValidateKhmer : public Validator {
 public:
  // Passes script and report_errors straight through to the Validator base.
  ValidateKhmer(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateKhmer() {}

 protected:
  // Tests whether the codes match the pattern for a Khmer Grapheme.
  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;
  // Maps the given Unicode ch to its CharClass.
  CharClass UnicodeToCharClass(char32 ch) const override;
};
}
// namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_
training/validate_myanmar.cpp
→
src/
training/validate_myanmar.cpp
浏览文件 @
104fe793
#include "validate_myanmar.h"
#include "errcode.h"
#include "icuerrorcode.h"
#include "tprintf.h"
#include "unicode/uchar.h" // From libicu
#include "unicode/uscript.h" // From libicu
namespace
tesseract
{
// Consumes one Myanmar grapheme starting at codes_[codes_used_] and reports
// whether the codes match the Myanmar pattern.
// Taken directly from the unicode table 16-3.
// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
// Returns true when no codes are left. Returns false only when the code at
// the base position is not a Myanmar letter (printing a message if
// report_errors_ is set); every other path returns true.
// NOTE(review): UseMultiCode() and CodeOnlyToOutput() appear to return true
// once the end of the input has been reached -- confirm against validator.h.
bool ValidateMyanmar::ConsumeGraphemeIfValid() {
  const int total_codes = codes_.size();
  if (codes_used_ == total_codes) return true;
  // Other: digits and symbols are emitted on their own.
  if (IsMyanmarOther(codes_[codes_used_].second)) {
    UseMultiCode(1);
    return true;
  }
  // Kinzi: 0x1004 followed by asat and virama is consumed as one unit.
  const bool at_kinzi = codes_used_ + 2 < total_codes &&
                        codes_[codes_used_].second == 0x1004 &&
                        codes_[codes_used_ + 1].second == kMyanmarAsat &&
                        codes_[codes_used_ + 2].second == kMyanmarVirama;
  if (at_kinzi) {
    ASSERT_HOST(!CodeOnlyToOutput());
    ASSERT_HOST(!CodeOnlyToOutput());
    if (UseMultiCode(3)) return true;
  }
  // Base consonant/vowel. NOTE that since everything in Myanmar appears to be
  // optional, except the base, this is the only place where invalid input can
  // be detected and false returned.
  if (!IsMyanmarLetter(codes_[codes_used_].second)) {
    if (report_errors_) {
      tprintf("Invalid start of Myanmar syllable:0x%x\n",
              codes_[codes_used_].second);
    }
    return false;  // One of these is required.
  }
  if (UseMultiCode(1)) return true;
  // The optional signs are only looked at when the subscript step did not
  // already report the end of the input.
  if (!ConsumeSubscriptIfPresent()) ConsumeOptionalSignsIfPresent();
  // What we have consumed so far is a valid syllable.
  return true;
}
// Maps ch to kConsonant when it is a Myanmar letter and to kOther otherwise.
// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
// is little correspondence between the content of table 16-3 and the char
// classes of the Indic languages. (Experts may disagree and improve!)
// In unicode table 16-3 there is basically a long list of optional characters,
// which can be coded quite easily.
// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
// The table also allows sequences that still result in dotted circles!!
// So with a lot of guesswork the rest have been added in a reasonable place.
Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
  return IsMyanmarLetter(ch) ? CharClass::kConsonant : CharClass::kOther;
}
// Helper consumes/copies a virama and the subscript consonant after it, when
// both are present. It appears there can be only one subscript consonant.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
  const int total_codes = codes_.size();
  // Two codes are needed: the virama and the letter that follows it.
  if (codes_used_ + 1 >= total_codes) return false;
  if (codes_[codes_used_].second != kMyanmarVirama) return false;
  if (!IsMyanmarLetter(codes_[codes_used_ + 1].second)) return false;
  ASSERT_HOST(!CodeOnlyToOutput());
  return UseMultiCode(2);
}
// Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached.
bool
ValidateMyanmar
::
ConsumeOptionalSignsIfPresent
()
{
// The following characters are allowed, all optional, and in sequence.
// An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
const
std
::
vector
<
char32
>
kMedials
({
kMyanmarAsat
,
kMyanmarMedialYa
,
0x103c
,
0x103d
,
0x103e
,
0x105e
,
0x105f
,
0x1060
,
0x1081
,
0x1031
});
for
(
char32
ch
:
kMedials
)
{
if
(
codes_
[
codes_used_
].
second
==
ch
)
{
if
(
UseMultiCode
(
1
))
return
true
;
if
(
ch
==
kMyanmarMedialYa
&&
codes_
[
codes_used_
].
second
==
kMyanmarAsat
)
{
if
(
UseMultiCode
(
1
))
return
true
;
}
}
}
// Vowel sign i, ii, ai.
char32
ch
=
codes_
[
codes_used_
].
second
;
if
(
ch
==
0x102d
||
ch
==
0x102e
||
ch
==
0x1032
)
{
if
(
UseMultiCode
(
1
))
return
true
;
}
// Vowel sign u, uu, and extensions.
ch
=
codes_
[
codes_used_
].
second
;
if
(
ch
==
0x102f
||
ch
==
0x1030
||
(
0x1056
<=
ch
&&
ch
<=
0x1059
)
||
ch
==
0x1062
||
ch
==
0x1067
||
ch
==
0x1068
||
(
0x1071
<=
ch
&&
ch
<=
0x1074
)
||
(
0x1083
<=
ch
&&
ch
<=
0x1086
)
||
ch
==
0x109c
||
ch
==
0x109d
)
{
if
(
UseMultiCode
(
1
))
return
true
;
}
// Tall aa, aa with optional asat.
if
(
codes_
[
codes_used_
].
second
==
0x102b
||
codes_
[
codes_used_
].
second
==
0x102c
)
{
if
(
UseMultiCode
(
1
))
return
true
;
if
(
codes_
[
codes_used_
].
second
==
kMyanmarAsat
)
{
if
(
UseMultiCode
(
1
))
return
true
;
}
}
// The following characters are allowed, all optional, and in sequence.
const
std
::
vector
<
char32
>
kSigns
({
0x1036
,
0x1037
});
for
(
char32
ch
:
kSigns
)
{
if
(
codes_
[
codes_used_
].
second
==
ch
)
{
if
(
UseMultiCode
(
1
))
return
true
;
}
}
// Tone mark extensions.
ch
=
codes_
[
codes_used_
].
second
;
if
(
ch
==
0x1038
||
ch
==
kMyanmarAsat
||
ch
==
0x1063
||
ch
==
0x1064
||
(
0x1069
<=
ch
&&
ch
<=
0x106d
)
||
(
0x1087
<=
ch
&&
ch
<=
0x108d
)
||
ch
==
0x108f
||
ch
==
0x109a
||
ch
==
0x109b
||
(
0xaa7b
<=
ch
&&
ch
<=
0xaa7d
))
{
if
(
UseMultiCode
(
1
))
return
true
;
}
return
false
;
}
// Returns true if the unicode is a Myanmar "letter" including consonants
// and independent vowels. Although table 16-3 distinguishes between some
// base consonants and vowels, the extensions make no such distinction, so we
// put them all into a single bucket.
/* static */
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
  // Inclusive {first, last} code point ranges that count as letters; a single
  // code point is written as a range of length one.
  static const char32 kLetterRanges[][2] = {
      {0x1000, 0x102a}, {0x103f, 0x103f}, {0x1050, 0x1055}, {0x105a, 0x105d},
      {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106e, 0x1070}, {0x1075, 0x1080},
      {0x108e, 0x108e}, {0xa9e0, 0xa9ef}, {0xa9fa, 0xa9ff}, {0xaa60, 0xaa73},
      {0xaa7a, 0xaa7a}, {0xaa7e, 0xaa7f}};
  for (const auto& range : kLetterRanges) {
    if (range[0] <= ch && ch <= range[1]) return true;
  }
  return false;
}
// Returns true if ch is a Myanmar digit, punctuation mark or other symbol
// that does not take part in being a syllable.
// Any code point whose ICU script is not Myanmar also counts as "other",
// except for the zero-width joiner and non-joiner.
// Within the Myanmar script the following inclusive ranges are "other":
//   0x1040-0x104f  digits followed by punctuation and various signs,
//   0x1090-0x1099,
//   0x109e-0x109f  the two symbols at the end of the main block,
//   0xa9f0-0xa9f9,
//   0xaa74-0xaa79.
// 0x109c and 0x109d are deliberately not listed: they are vowel signs and are
// consumed as such by ConsumeOptionalSignsIfPresent(), so they must not be
// accepted as stand-alone symbols here.
// NOTE(review): the error state left in err by uscript_getScript() is not
// inspected.
/* static */
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
  IcuErrorCode err;
  UScriptCode script_code = uscript_getScript(ch, err);
  if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
      ch != Validator::kZeroWidthNonJoiner) {
    return true;
  }
  return (0x1040 <= ch && ch <= 0x104f) || (0x1090 <= ch && ch <= 0x1099) ||
         (0x109e <= ch && ch <= 0x109f) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
         (0xaa74 <= ch && ch <= 0xaa79);
}
}
// namespace tesseract
#include "validate_myanmar.h"
#include "errcode.h"
#include "icuerrorcode.h"
#include "tprintf.h"
#include "unicode/uchar.h" // From libicu
#include "unicode/uscript.h" // From libicu
namespace
tesseract
{
// Consumes one Myanmar grapheme starting at codes_[codes_used_] and reports
// whether the codes match the Myanmar pattern.
// Taken directly from the unicode table 16-3.
// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
// Returns true when no codes are left. Returns false only when the code at
// the base position is not a Myanmar letter (printing a message if
// report_errors_ is set); every other path returns true.
// NOTE(review): UseMultiCode() and CodeOnlyToOutput() appear to return true
// once the end of the input has been reached -- confirm against validator.h.
bool ValidateMyanmar::ConsumeGraphemeIfValid() {
  const int total_codes = codes_.size();
  if (codes_used_ == total_codes) return true;
  // Other: digits and symbols are emitted on their own.
  if (IsMyanmarOther(codes_[codes_used_].second)) {
    UseMultiCode(1);
    return true;
  }
  // Kinzi: 0x1004 followed by asat and virama is consumed as one unit.
  const bool at_kinzi = codes_used_ + 2 < total_codes &&
                        codes_[codes_used_].second == 0x1004 &&
                        codes_[codes_used_ + 1].second == kMyanmarAsat &&
                        codes_[codes_used_ + 2].second == kMyanmarVirama;
  if (at_kinzi) {
    ASSERT_HOST(!CodeOnlyToOutput());
    ASSERT_HOST(!CodeOnlyToOutput());
    if (UseMultiCode(3)) return true;
  }
  // Base consonant/vowel. NOTE that since everything in Myanmar appears to be
  // optional, except the base, this is the only place where invalid input can
  // be detected and false returned.
  if (!IsMyanmarLetter(codes_[codes_used_].second)) {
    if (report_errors_) {
      tprintf("Invalid start of Myanmar syllable:0x%x\n",
              codes_[codes_used_].second);
    }
    return false;  // One of these is required.
  }
  if (UseMultiCode(1)) return true;
  // The optional signs are only looked at when the subscript step did not
  // already report the end of the input.
  if (!ConsumeSubscriptIfPresent()) ConsumeOptionalSignsIfPresent();
  // What we have consumed so far is a valid syllable.
  return true;
}
// Maps ch to kConsonant when it is a Myanmar letter and to kOther otherwise.
// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
// is little correspondence between the content of table 16-3 and the char
// classes of the Indic languages. (Experts may disagree and improve!)
// In unicode table 16-3 there is basically a long list of optional characters,
// which can be coded quite easily.
// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
// The table also allows sequences that still result in dotted circles!!
// So with a lot of guesswork the rest have been added in a reasonable place.
Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
  return IsMyanmarLetter(ch) ? CharClass::kConsonant : CharClass::kOther;
}
// Helper consumes/copies a virama and the subscript consonant after it, when
// both are present. It appears there can be only one subscript consonant.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
  const int total_codes = codes_.size();
  // Two codes are needed: the virama and the letter that follows it.
  if (codes_used_ + 1 >= total_codes) return false;
  if (codes_[codes_used_].second != kMyanmarVirama) return false;
  if (!IsMyanmarLetter(codes_[codes_used_ + 1].second)) return false;
  ASSERT_HOST(!CodeOnlyToOutput());
  return UseMultiCode(2);
}
// Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached.
bool
ValidateMyanmar
::
ConsumeOptionalSignsIfPresent
()
{
// The following characters are allowed, all optional, and in sequence.
// An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
const
std
::
vector
<
char32
>
kMedials
({
kMyanmarAsat
,
kMyanmarMedialYa
,
0x103c
,
0x103d
,
0x103e
,
0x105e
,
0x105f
,
0x1060
,
0x1081
,
0x1031
});
for
(
char32
ch
:
kMedials
)
{
if
(
codes_
[
codes_used_
].
second
==
ch
)
{
if
(
UseMultiCode
(
1
))
return
true
;
if
(
ch
==
kMyanmarMedialYa
&&
codes_
[
codes_used_
].
second
==
kMyanmarAsat
)
{
if
(
UseMultiCode
(
1
))
return
true
;
}
}
}
// Vowel sign i, ii, ai.
char32
ch
=
codes_
[
codes_used_
].
second
;
if
(
ch
==
0x102d
||
ch
==
0x102e
||
ch
==
0x1032
)
{
if
(
UseMultiCode
(
1
))
return
true
;
}
// Vowel sign u, uu, and extensions.
ch
=
codes_
[
codes_used_
].
second
;
if
(
ch
==
0x102f
||
ch
==
0x1030
||
(
0x1056
<=
ch
&&
ch
<=
0x1059
)
||
ch
==
0x1062
||
ch
==
0x1067
||
ch
==
0x1068
||
(
0x1071
<=
ch
&&
ch
<=
0x1074
)
||
(
0x1083
<=
ch
&&
ch
<=
0x1086
)
||
ch
==
0x109c
||
ch
==
0x109d
)
{
if
(
UseMultiCode
(
1
))
return
true
;
}
// Tall aa, aa with optional asat.
if
(
codes_
[
codes_used_
].
second
==
0x102b
||
codes_
[
codes_used_
].
second
==
0x102c
)
{
if
(
UseMultiCode
(
1
))
return
true
;
if
(
codes_
[
codes_used_
].
second
==
kMyanmarAsat
)
{
if
(
UseMultiCode
(
1
))
return
true
;
}
}
// The following characters are allowed, all optional, and in sequence.
const
std
::
vector
<
char32
>
kSigns
({
0x1036
,
0x1037
});
for
(
char32
ch
:
kSigns
)
{
if
(
codes_
[
codes_used_
].
second
==
ch
)
{
if
(
UseMultiCode
(
1
))
return
true
;
}
}
// Tone mark extensions.
ch
=
codes_
[
codes_used_
].
second
;
if
(
ch
==
0x1038
||
ch
==
kMyanmarAsat
||
ch
==
0x1063
||
ch
==
0x1064
||
(
0x1069
<=
ch
&&
ch
<=
0x106d
)
||
(
0x1087
<=
ch
&&
ch
<=
0x108d
)
||
ch
==
0x108f
||
ch
==
0x109a
||
ch
==
0x109b
||
(
0xaa7b
<=
ch
&&
ch
<=
0xaa7d
))
{
if
(
UseMultiCode
(
1
))
return
true
;
}
return
false
;
}
// Returns true if the unicode is a Myanmar "letter" including consonants
// and independent vowels. Although table 16-3 distinguishes between some
// base consonants and vowels, the extensions make no such distinction, so we
// put them all into a single bucket.
/* static */
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
  // Inclusive {first, last} code point ranges that count as letters; a single
  // code point is written as a range of length one.
  static const char32 kLetterRanges[][2] = {
      {0x1000, 0x102a}, {0x103f, 0x103f}, {0x1050, 0x1055}, {0x105a, 0x105d},
      {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106e, 0x1070}, {0x1075, 0x1080},
      {0x108e, 0x108e}, {0xa9e0, 0xa9ef}, {0xa9fa, 0xa9ff}, {0xaa60, 0xaa73},
      {0xaa7a, 0xaa7a}, {0xaa7e, 0xaa7f}};
  for (const auto& range : kLetterRanges) {
    if (range[0] <= ch && ch <= range[1]) return true;
  }
  return false;
}
// Returns true if ch is a Myanmar digit, punctuation mark or other symbol
// that does not take part in being a syllable.
// Any code point whose ICU script is not Myanmar also counts as "other",
// except for the zero-width joiner and non-joiner.
// Within the Myanmar script the following inclusive ranges are "other":
//   0x1040-0x104f  digits followed by punctuation and various signs,
//   0x1090-0x1099,
//   0x109e-0x109f  the two symbols at the end of the main block,
//   0xa9f0-0xa9f9,
//   0xaa74-0xaa79.
// 0x109c and 0x109d are deliberately not listed: they are vowel signs and are
// consumed as such by ConsumeOptionalSignsIfPresent(), so they must not be
// accepted as stand-alone symbols here.
// NOTE(review): the error state left in err by uscript_getScript() is not
// inspected.
/* static */
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
  IcuErrorCode err;
  UScriptCode script_code = uscript_getScript(ch, err);
  if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
      ch != Validator::kZeroWidthNonJoiner) {
    return true;
  }
  return (0x1040 <= ch && ch <= 0x104f) || (0x1090 <= ch && ch <= 0x1099) ||
         (0x109e <= ch && ch <= 0x109f) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
         (0xaa74 <= ch && ch <= 0xaa79);
}
}
// namespace tesseract
training/validate_myanmar.h
→
src/
training/validate_myanmar.h
浏览文件 @
104fe793
#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#include "validator.h"
namespace
tesseract
{
// Validator specialization that validates and segments Myanmar text.
class ValidateMyanmar : public Validator {
 public:
  // Passes script and report_errors straight through to the Validator base.
  ValidateMyanmar(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateMyanmar() {}

 protected:
  // Checks whether the upcoming codes match the pattern for a Myanmar
  // Grapheme. On success the Grapheme in codes_[codes_used_++...] is consumed
  // and copied to parts_ and output_, and true is returned. On failure
  // codes_used_ is not incremented.
  bool ConsumeGraphemeIfValid() override;

  // Maps the Unicode code point ch to its CharClass.
  Validator::CharClass UnicodeToCharClass(char32 ch) const override;

 private:
  // Special unicodes that are only needed for Myanmar processing.
  static const char32 kMyanmarAsat = 0x103a;
  static const char32 kMyanmarMedialYa = 0x103b;

  // Helper that consumes/copies a virama together with any subscript
  // consonant. The result is true when the end of input is reached.
  bool ConsumeSubscriptIfPresent();

  // Helper that consumes/copies a run of optional signs. The result is true
  // when the end of input is reached.
  bool ConsumeOptionalSignsIfPresent();

  // Tells whether ch is a Myanmar "letter" (consonant or independent vowel).
  // Table 16-3 distinguishes some base consonants from vowels, but the
  // extensions do not, so all of them share a single bucket.
  static bool IsMyanmarLetter(char32 ch);

  // Tells whether ch is a Myanmar digit or other symbol that plays no part
  // in forming a syllable.
  static bool IsMyanmarOther(char32 ch);
};
}
// namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#include "validator.h"
namespace
tesseract
{
// Validator specialization that validates and segments Myanmar text.
class ValidateMyanmar : public Validator {
 public:
  // Passes script and report_errors straight through to the Validator base.
  ValidateMyanmar(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateMyanmar() {}

 protected:
  // Checks whether the upcoming codes match the pattern for a Myanmar
  // Grapheme. On success the Grapheme in codes_[codes_used_++...] is consumed
  // and copied to parts_ and output_, and true is returned. On failure
  // codes_used_ is not incremented.
  bool ConsumeGraphemeIfValid() override;

  // Maps the Unicode code point ch to its CharClass.
  Validator::CharClass UnicodeToCharClass(char32 ch) const override;

 private:
  // Special unicodes that are only needed for Myanmar processing.
  static const char32 kMyanmarAsat = 0x103a;
  static const char32 kMyanmarMedialYa = 0x103b;

  // Helper that consumes/copies a virama together with any subscript
  // consonant. The result is true when the end of input is reached.
  bool ConsumeSubscriptIfPresent();

  // Helper that consumes/copies a run of optional signs. The result is true
  // when the end of input is reached.
  bool ConsumeOptionalSignsIfPresent();

  // Tells whether ch is a Myanmar "letter" (consonant or independent vowel).
  // Table 16-3 distinguishes some base consonants from vowels, but the
  // extensions do not, so all of them share a single bucket.
  static bool IsMyanmarLetter(char32 ch);

  // Tells whether ch is a Myanmar digit or other symbol that plays no part
  // in forming a syllable.
  static bool IsMyanmarOther(char32 ch);
};
}
// namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
training/validator.cpp
→
src/
training/validator.cpp
浏览文件 @
104fe793
文件已移动
training/validator.h
→
src/
training/validator.h
浏览文件 @
104fe793
此差异已折叠。
点击以展开。
training/wordlist2dawg.cpp
→
src/
training/wordlist2dawg.cpp
浏览文件 @
104fe793
文件已移动
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录